Forráskód Böngészése

Add NUMA support from branch NUMA

Corentin Salingue 7 éve
szülő
commit
cbee1918f5
48 módosított fájl, 1763 hozzáadás és 742 törlés
  1. 19 2
      configure.ac
  2. 7 3
      doc/doxygen/chapters/210_check_list_performance.doxy
  3. 21 0
      doc/doxygen/chapters/501_environment_variables.doxy
  4. 8 0
      doc/doxygen/chapters/510_configure_options.doxy
  5. 5 1
      doc/doxygen/chapters/api/data_management.doxy
  6. 10 0
      doc/doxygen/chapters/api/workers.doxy
  7. 7 1
      examples/cpp/add_vectors_cpp11.cpp
  8. 1 0
      include/starpu_config.h.in
  9. 5 1
      include/starpu_data.h
  10. 2 0
      include/starpu_data_interfaces.h
  11. 14 7
      src/core/disk.c
  12. 4 0
      src/core/perfmodel/perfmodel.h
  13. 792 536
      src/core/perfmodel/perfmodel_bus.c
  14. 14 3
      src/core/simgrid.c
  15. 546 58
      src/core/topology.c
  16. 8 1
      src/core/topology.h
  17. 7 2
      src/core/workers.c
  18. 1 0
      src/core/workers.h
  19. 29 4
      src/datawizard/coherency.c
  20. 1 2
      src/datawizard/copy_driver.c
  21. 2 2
      src/datawizard/data_request.c
  22. 10 1
      src/datawizard/datawizard.c
  23. 1 1
      src/datawizard/datawizard.h
  24. 24 10
      src/datawizard/filters.c
  25. 12 4
      src/datawizard/interfaces/bcsr_interface.c
  26. 2 2
      src/datawizard/interfaces/block_interface.c
  27. 3 2
      src/datawizard/interfaces/coo_interface.c
  28. 2 2
      src/datawizard/interfaces/csr_interface.c
  29. 25 14
      src/datawizard/interfaces/data_interface.c
  30. 1 1
      src/datawizard/interfaces/data_interface.h
  31. 2 2
      src/datawizard/interfaces/matrix_interface.c
  32. 2 2
      src/datawizard/interfaces/variable_interface.c
  33. 2 2
      src/datawizard/interfaces/vector_interface.c
  34. 41 10
      src/datawizard/malloc.c
  35. 3 1
      src/datawizard/malloc.h
  36. 46 26
      src/datawizard/memalloc.c
  37. 1 1
      src/datawizard/memory_nodes.c
  38. 16 4
      src/datawizard/user_interactions.c
  39. 25 19
      src/drivers/cpu/driver_cpu.c
  40. 3 3
      src/drivers/cuda/driver_cuda.c
  41. 1 1
      src/drivers/cuda/driver_cuda.h
  42. 0 1
      src/drivers/mp_common/source_common.c
  43. 4 4
      src/drivers/mpi/driver_mpi_common.c
  44. 5 1
      src/util/openmp_runtime_support.c
  45. 2 0
      tests/datawizard/interfaces/test_interfaces.c
  46. 8 0
      tests/datawizard/nowhere.c
  47. 8 4
      tests/datawizard/specific_node.c
  48. 11 1
      tests/disk/mem_reclaim.c

+ 19 - 2
configure.ac

@@ -562,6 +562,23 @@ AM_CONDITIONAL([STARPU_USE_MIC], [test "x$enable_mic" = "xyes"])
 
 ###############################################################################
 
+###############################################################################
+#                                                                             #
+#                           NUMA memory nodes                                 #
+#                                                                             #
+###############################################################################
+
+AC_MSG_CHECKING(maximum number of NUMA nodes)
+AC_ARG_ENABLE(maxnumanodes, [AS_HELP_STRING([--enable-maxnumanodes=<number>],
+			[maximum number of NUMA nodes])],
+			nmaxnumanodes=$enableval, nmaxnumanodes=2)
+AC_MSG_RESULT($nmaxnumanodes)
+AC_DEFINE_UNQUOTED(STARPU_MAXNUMANODES, [$nmaxnumanodes],
+		[maximum number of NUMA nodes])
+
+
+###############################################################################
+
 AC_PATH_PROGS([STARPU_MS_LIB], [lib])
 AC_ARG_VAR([STARPU_MS_LIB], [Path to Microsoft's Visual Studio `lib' tool])
 AM_CONDITIONAL([STARPU_HAVE_MS_LIB], [test "x$STARPU_MS_LIB" != "x"])
@@ -2138,8 +2155,8 @@ if test x$maxnodes = x0 ; then
 	else
 		# We have one memory node shared by all CPU workers, one node per GPU
 		# and per MIC device
-		# we add nodes to use 3 memory disks
-		nodes=4
+		# we add nodes to use 2 memory disks
+		nodes=`expr $nmaxnumanodes + 2`
 		if test x$enable_cuda = xyes ; then
 			# we could have used nmaxcudadev + 1, but this would certainly give an
 			# odd number.

+ 7 - 3
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -197,9 +197,13 @@ structures of StarPU by describing the shape of your machine and/or your
 application at the configure step.
 
 To reduce the memory footprint of the data internal structures of StarPU, one
-can set the \ref enable-maxcpus "--enable-maxcpus", \ref enable-maxcudadev
-"--enable-maxcudadev", \ref enable-maxopencldev "--enable-maxopencldev" and
-\ref enable-maxnodes "--enable-maxnodes" configure parameters to give StarPU
+can set the
+\ref enable-maxcpus "--enable-maxcpus",
+\ref enable-maxnumanodes "--enable-maxnumanodes",
+\ref enable-maxcudadev "--enable-maxcudadev",
+\ref enable-maxopencldev "--enable-maxopencldev" and
+\ref enable-maxnodes "--enable-maxnodes"
+configure parameters to give StarPU
 the architecture of the machine it will run on, thus tuning the size of the
 structures to the machine.
 

+ 21 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -845,6 +845,14 @@ available to the application in the main CPU memory. Setting it enables allocati
 cache in main memory. Setting it to zero lets StarPU overflow memory.
 </dd>
 
+<dt>STARPU_LIMIT_CPU_NUMA_devid_MEM</dt>
+<dd>
+\anchor STARPU_LIMIT_CPU_NUMA_devid_MEM
+\addindex __env__STARPU_LIMIT_CPU_NUMA_devid_MEM
+This variable specifies the maximum number of megabytes that should be
+available to the application on the NUMA node with the OS identifier <c>devid</c>.
+</dd>
+
 <dt>STARPU_MINIMUM_AVAILABLE_MEM</dt>
 <dd>
 \anchor STARPU_MINIMUM_AVAILABLE_MEM
@@ -1110,6 +1118,19 @@ implements an advanced but centralized management of concurrent data
 accesses (see \ref ConcurrentDataAccess).
 </dd>
 
+<dt>STARPU_USE_NUMA</dt>
+<dd>
+\anchor STARPU_USE_NUMA 
+\addindex __env__STARPU_USE_NUMA
+When defined, NUMA nodes are taking into account by StarPU. Otherwise, memory
+is considered as only one node. This is experimental for now.
+
+When enabled, STARPU_MAIN_MEMORY is a pointer to the NUMA node associated to the
+first CPU worker if it exists, the NUMA node associated to the first GPU discovered otherwise.
+If StarPU doesn't find any NUMA node after these step, STARPU_MAIN_MEMORY is the first NUMA node
+discovered by StarPU.
+</dd>
+
 </dl>
 
 \section ConfiguringTheHypervisor Configuring The Hypervisor

+ 8 - 0
doc/doxygen/chapters/510_configure_options.doxy

@@ -130,6 +130,14 @@ Use at most <c>count</c> CPU cores.  This information is then
 available as the macro ::STARPU_MAXCPUS.
 </dd>
 
+<dt>--enable-maxnumanodes=<c>count</c></dt>
+<dd>
+\anchor enable-maxnumanodes
+\addindex __configure__--enable-maxnumanodes
+Use at most <c>count</c> NUMA nodes.  This information is then
+available as the macro ::STARPU_MAXNUMANODES.
+</dd>
+
 <dt>--disable-cpu</dt>
 <dd>
 \anchor disable-cpu

+ 5 - 1
doc/doxygen/chapters/api/data_management.doxy

@@ -2,7 +2,7 @@
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
- * Copyright (C) 2011, 2012 INRIA
+ * Copyright (C) 2011, 2012, 2017  INRIA
  * See the file version.doxy for copying conditions.
  */
 
@@ -104,6 +104,10 @@ data to StarPU, the specified memory node indicates where the piece of
 data initially resides (we also call this memory node the home node of
 a piece of data).
 
+In the case of NUMA systems, functions starpu_memory_nodes_numa_devid_to_id()
+and starpu_memory_nodes_numa_id_to_devid() can be used to convert from NUMA node
+numbers as seen by the Operating System and NUMA node numbers as seen by StarPU.
+
 \fn void starpu_data_register(starpu_data_handle_t *handleptr, int home_node, void *data_interface, struct starpu_data_interface_ops *ops)
 \ingroup API_Data_Management
 Register a piece of data into the handle located at the

+ 10 - 0
doc/doxygen/chapters/api/workers.doxy

@@ -250,6 +250,16 @@ Return the type of \p node as defined by
 this function should be used in the allocation function to determine
 on which device the memory needs to be allocated.
 
+\fn int starpu_memory_nodes_numa_id_to_devid(int osid)
+\ingroup API_Workers_Properties
+This function returns the identifier of the memory node associated to the NUMA
+node identified by \p osid by the Operating System.
+
+\fn int starpu_memory_nodes_numa_devid_to_id(unsigned id);
+\ingroup API_Workers_Properties
+This function returns the Operating System identifier of the memory node
+whose StarPU identifier is \p id.
+
 \fn char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
 \ingroup API_Workers_Properties
 Return worker \p type as a string.

+ 7 - 1
examples/cpp/add_vectors_cpp11.cpp

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009, 2010-2011, 2013-2015  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
- * Copyright (C) 2012 INRIA
+ * Copyright (C) 2012, 2017  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -78,6 +78,12 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+	if (starpu_memory_nodes_get_numa_count() > 1)
+	{
+		starpu_shutdown();
+		return 77;
+	}
+
 	// StarPU data registering
 	starpu_data_handle_t spu_vec_A;
 	starpu_data_handle_t spu_vec_B;

+ 1 - 0
include/starpu_config.h.in

@@ -88,6 +88,7 @@
 #undef STARPU_MAXNODES
 #undef STARPU_NMAXBUFS
 #undef STARPU_MAXCPUS
+#undef STARPU_MAXNUMANODES
 #undef STARPU_MAXCUDADEVS
 #undef STARPU_MAXOPENCLDEVS
 #undef STARPU_MAXMICDEVS

+ 5 - 1
include/starpu_data.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
- * Copyright (C) 2016  Inria
+ * Copyright (C) 2016, 2017  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -132,6 +132,10 @@ enum starpu_node_kind
 
 unsigned starpu_worker_get_memory_node(unsigned workerid);
 unsigned starpu_memory_nodes_get_count(void);
+int starpu_memory_nodes_get_numa_count(void);
+int starpu_memory_nodes_numa_id_to_devid(int osid);
+int starpu_memory_nodes_numa_devid_to_id(unsigned id);
+
 enum starpu_node_kind starpu_node_get_kind(unsigned node);
 
 void starpu_data_set_wt_mask(starpu_data_handle_t handle, uint32_t wt_mask);

+ 2 - 0
include/starpu_data_interfaces.h

@@ -483,6 +483,8 @@ size_t starpu_data_get_size(starpu_data_handle_t handle);
 
 starpu_data_handle_t starpu_data_lookup(const void *ptr);
 
+int starpu_data_get_home_node(starpu_data_handle_t handle);
+
 #ifdef __cplusplus
 }
 #endif

+ 14 - 7
src/core/disk.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2013  Corentin Salingue
  * Copyright (C) 2015, 2016, 2017  CNRS
+ * Copyright (C) 2017  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -77,16 +78,22 @@ int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, starpu_s
 {
 	STARPU_ASSERT_MSG(size < 0 || size >= STARPU_DISK_SIZE_MIN, "Minimum disk size is %d Bytes ! (Here %d) \n", (int) STARPU_DISK_SIZE_MIN, (int) size);
 	/* register disk */
-	unsigned memory_node = _starpu_memory_node_register(STARPU_DISK_RAM, 0);
+	unsigned disk_memnode = _starpu_memory_node_register(STARPU_DISK_RAM, 0);
 
-	_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
-	_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
+        /* Connect the disk memory node to all numa memory nodes */
+        int nb_numa_nodes = starpu_memory_nodes_get_numa_count();
+        int numa_node;
+        for (numa_node = 0; numa_node < nb_numa_nodes; numa_node++)
+        {
+                _starpu_register_bus(disk_memnode, numa_node);
+                _starpu_register_bus(numa_node, disk_memnode);
+        }
 
 	/* connect disk */
 	void *base = func->plug(parameter, size);
 
 	/* remember it */
-	int n STARPU_ATTRIBUTE_UNUSED = add_disk_in_list(memory_node,func,base);
+	int n STARPU_ATTRIBUTE_UNUSED = add_disk_in_list(disk_memnode, func, base);
 
 #ifdef STARPU_SIMGRID
 	char name[16];
@@ -96,13 +103,13 @@ int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, starpu_s
 	_starpu_simgrid_memory_node_set_host(memory_node, host);
 #endif
 
-	int ret = func->bandwidth(memory_node);
+	int ret = func->bandwidth(disk_memnode);
 	/* have a problem with the disk */
 	if (ret == 0)
 		return -ENOENT;
 	if (size >= 0)
-		_starpu_memory_manager_set_global_memory_size(memory_node, size);
-	return memory_node;
+		_starpu_memory_manager_set_global_memory_size(disk_memnode, size);
+	return disk_memnode;
 }
 
 void _starpu_disk_unregister(void)

+ 4 - 0
src/core/perfmodel/perfmodel.h

@@ -110,6 +110,10 @@ void _starpu_simgrid_get_platform_path(int version, char *path, size_t maxlen);
 
 void _starpu_perfmodel_realloc(struct starpu_perfmodel *model, int nb);
 
+#if defined(STARPU_HAVE_HWLOC)
+hwloc_topology_t _starpu_perfmodel_get_hwtopology();
+#endif
+
 #ifdef __cplusplus
 }
 #endif

A különbségek nem kerülnek megjelenítésre, a fájl túl nagy
+ 792 - 536
src/core/perfmodel/perfmodel_bus.c


+ 14 - 3
src/core/simgrid.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012-2017  Université de Bordeaux
- * Copyright (C) 2016  	    Inria
- * Copyright (C) 2016, 2017  	    CNRS
+ * Copyright (C) 2016, 2017  Inria
+ * Copyright (C) 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -1039,8 +1039,19 @@ void _starpu_simgrid_count_ngpus(void)
 			ngpus = 0;
 			for (src2 = 1; src2 < STARPU_MAXNODES; src2++)
 			{
-				if (starpu_bus_get_id(src2, STARPU_MAIN_RAM) == -1)
+				int numa;
+				int nnumas = starpu_memory_nodes_get_numa_count();
+				int found = 0;
+				for (numa = 0; numa < nnumas; numa++)
+					if (starpu_bus_get_id(src2, numa) != -1)
+					{
+						found = 1;
+						break;
+					}
+					
+				if (!found)
 					continue;
+
 				msg_host_t srchost2 = _starpu_simgrid_get_memnode_host(src2);
 				int routesize2;
 #ifdef HAVE_SG_HOST_ROUTE

+ 546 - 58
src/core/topology.c

@@ -30,6 +30,7 @@
 #include <drivers/mpi/driver_mpi_common.h>
 #include <drivers/mp_common/source_common.h>
 #include <drivers/opencl/driver_opencl.h>
+#include <drivers/opencl/driver_opencl_utils.h>
 #include <profiling/profiling.h>
 #include <datawizard/datastats.h>
 #include <datawizard/memory_nodes.h>
@@ -54,11 +55,23 @@
 #include <hwloc/cuda.h>
 #endif
 
+#if defined(STARPU_USE_OPENCL)
+#include <hwloc/opencl.h>
+#endif
+
 static unsigned topology_is_initialized = 0;
 static int nobind;
 
 /* For checking whether two workers share the same PU, indexed by PU number */
 static int cpu_worker[STARPU_MAXCPUS];
+static unsigned nb_numa_nodes = 0;
+static int numa_memory_nodes_to_hwloclogid[STARPU_MAXNUMANODES]; /* indexed by StarPU numa node to convert in hwloc logid */
+static int numa_memory_nodes_to_physicalid[STARPU_MAXNUMANODES]; /* indexed by StarPU numa node to convert in physical id */
+static unsigned numa_bus_id[STARPU_MAXNUMANODES*STARPU_MAXNUMANODES];
+static int _starpu_get_logical_numa_node_worker(unsigned workerid);
+
+#define STARPU_NUMA_UNINITIALIZED (-2)
+#define STARPU_NUMA_MAIN_RAM (-1)
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE)
 
@@ -87,6 +100,124 @@ static struct _starpu_worker_set mic_worker_set[STARPU_MAXMICDEVS];
 struct _starpu_worker_set mpi_worker_set[STARPU_MAXMPIDEVS];
 #endif
 
+int starpu_memory_nodes_get_numa_count(void)
+{
+	return nb_numa_nodes;
+}
+
+#if defined(STARPU_HAVE_HWLOC)
+static int numa_get_logical_id(hwloc_obj_t obj)
+{
+	STARPU_ASSERT(obj);
+	while (obj->type != HWLOC_OBJ_NODE)
+	{
+		obj = obj->parent;
+
+		/* If we don't find a "node" obj before the root, this means
+		 * hwloc does not know whether there are numa nodes or not, so
+		 * we should not use a per-node sampling in that case. */
+		if (!obj)
+			return STARPU_NUMA_MAIN_RAM;
+	}
+	return obj->logical_index;
+}
+
+static int numa_get_physical_id(hwloc_obj_t obj)
+{
+	STARPU_ASSERT(obj);
+	while (obj->type != HWLOC_OBJ_NODE)
+	{
+		obj = obj->parent;
+
+		/* If we don't find a "node" obj before the root, this means
+		 * hwloc does not know whether there are numa nodes or not, so
+		 * we should not use a per-node sampling in that case. */
+		if (!obj)
+			return STARPU_NUMA_MAIN_RAM;
+	}
+	return obj->os_index;
+}
+#endif
+
+static int _starpu_get_logical_numa_node_worker(unsigned workerid)
+{
+#if defined(STARPU_HAVE_HWLOC)
+	char * state;
+	if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
+	{
+		struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+		struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config() ;
+		struct _starpu_machine_topology *topology = &config->topology ;
+
+		hwloc_obj_t obj;
+		switch(worker->arch) 	
+		{
+			case STARPU_CPU_WORKER:
+				obj = hwloc_get_obj_by_type(topology->hwtopology, HWLOC_OBJ_PU, worker->bindid) ;
+				break;
+			default:
+				STARPU_ABORT();
+		}
+
+		return numa_get_logical_id(obj);
+	}
+	else		
+#endif 
+	{
+		(void) workerid; /* unused */
+		return STARPU_NUMA_MAIN_RAM;
+	}
+}
+
+static int _starpu_get_physical_numa_node_worker(unsigned workerid)
+{
+#if defined(STARPU_HAVE_HWLOC)
+	char * state;
+	if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
+	{
+		struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+		struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config() ;
+		struct _starpu_machine_topology *topology = &config->topology ;
+
+		hwloc_obj_t obj;
+		switch(worker->arch) 	
+		{
+			case STARPU_CPU_WORKER:
+				obj = hwloc_get_obj_by_type(topology->hwtopology, HWLOC_OBJ_PU, worker->bindid) ;
+				break;
+			default:
+				STARPU_ABORT();
+		}
+
+		return numa_get_physical_id(obj);
+	}
+	else		
+#endif 
+	{
+		(void) workerid; /* unused */
+		return STARPU_NUMA_MAIN_RAM;
+	}
+}
+
+static int _starpu_numa_get_logical_id_from_pu(int pu)
+{
+#if defined(STARPU_HAVE_HWLOC)
+	if (nb_numa_nodes > 1)
+	{
+		struct _starpu_machine_config *config = _starpu_get_machine_config();
+		struct _starpu_machine_topology *topology = &config->topology;
+
+		hwloc_obj_t obj = hwloc_get_obj_by_type(topology->hwtopology, HWLOC_OBJ_PU, pu);
+		return numa_get_logical_id(obj);
+	}
+	else
+#endif
+	{
+		return -1;
+	}
+}
+
+
 struct _starpu_worker *_starpu_get_worker_from_driver(struct starpu_driver *d)
 {
 	unsigned nworkers = starpu_worker_get_count();
@@ -846,6 +977,67 @@ _starpu_topology_get_nhwpu (struct _starpu_machine_config *config)
 	return config->topology.nhwpus;
 }
 
+unsigned _starpu_topology_get_nnumanodes(struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
+{
+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
+        _starpu_opencl_init();
+#endif
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+        _starpu_init_cuda();
+#endif
+        _starpu_init_topology(config);
+
+	int res;
+#if defined(STARPU_HAVE_HWLOC)
+	char * state;
+	if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
+	{
+		struct _starpu_machine_topology *topology = &config->topology ;
+		int nnumanodes = hwloc_get_nbobjs_by_type(topology->hwtopology, HWLOC_OBJ_NODE) ;
+		res = nnumanodes > 0 ? nnumanodes : 1 ;
+	}
+	else
+#endif 
+	{	
+		res = 1;
+	}
+
+	STARPU_ASSERT_MSG(res <= STARPU_MAXNUMANODES, "Number of NUMA nodes discovered is higher than maximum accepted ! Use configure option --enable-maxnumanodes=xxx to increase the maximum value of supported NUMA nodes.\n");
+	return res;
+}
+
+//TODO change this in an array
+int starpu_memory_nodes_numa_hwloclogid_to_id(int logid)
+{
+	unsigned n;
+	for (n = 0; n < nb_numa_nodes; n++)
+		if (numa_memory_nodes_to_hwloclogid[n] == logid)
+			return n;
+	return -1;
+}
+
+int starpu_memory_nodes_numa_id_to_hwloclogid(unsigned id)
+{
+	STARPU_ASSERT(id < STARPU_MAXNUMANODES);
+	return numa_memory_nodes_to_hwloclogid[id];
+}
+
+int starpu_memory_nodes_numa_devid_to_id(unsigned id)
+{
+	STARPU_ASSERT(id < STARPU_MAXNUMANODES);
+	return numa_memory_nodes_to_physicalid[id];
+}
+
+//TODO change this in an array
+int starpu_memory_nodes_numa_id_to_devid(int osid)
+{
+	unsigned n;
+	for (n = 0; n < nb_numa_nodes; n++)
+		if (numa_memory_nodes_to_physicalid[n] == osid)
+			return n;
+	return -1;
+}
+
 #ifdef STARPU_HAVE_HWLOC
 void _starpu_topology_filter(hwloc_topology_t topology)
 {
@@ -1751,35 +1943,294 @@ _starpu_bind_thread_on_cpus (
 #endif
 }
 
-static void
-_starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_config STARPU_ATTRIBUTE_UNUSED)
+static void _starpu_init_binding_cpu(struct _starpu_machine_config *config)
+{
+	unsigned worker;
+	for (worker = 0; worker < config->topology.nworkers; worker++)
+	{
+		struct _starpu_worker *workerarg = &config->workers[worker];
+
+		switch (workerarg->arch)
+		{
+			case STARPU_CPU_WORKER:
+			{
+				/* Dedicate a cpu core to that worker */
+				workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
+				break;
+			}
+			default:
+				/* Do nothing */
+				break;
+		}
+
+
+	}
+}
+
+//TODO : Check SIMGRID
+static void _starpu_init_numa_node(struct _starpu_machine_config *config)
 {
-	/* launch one thread per CPU */
-	unsigned ram_memory_node;
+	nb_numa_nodes = 0;
+
+	unsigned i;
+	for (i = 0; i < STARPU_MAXNUMANODES; i++)
+	{
+		numa_memory_nodes_to_hwloclogid[i] = STARPU_NUMA_UNINITIALIZED;
+		numa_memory_nodes_to_physicalid[i] = STARPU_NUMA_UNINITIALIZED;
+	}
 
-	/* note that even if the CPU cpu are not used, we always have a RAM
-	 * node */
-	/* TODO : support NUMA  ;) */
-	ram_memory_node = _starpu_memory_node_register(STARPU_CPU_RAM, 0);
-	STARPU_ASSERT(ram_memory_node == STARPU_MAIN_RAM);
 
+	char * state;
+	/* NUMA mode activated */
+	if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
+	{
+		/* Take all NUMA nodes used by CPU workers */
+		unsigned worker;
+		for (worker = 0; worker < config->topology.nworkers; worker++)
+		{
+			struct _starpu_worker *workerarg = &config->workers[worker];
+			if (workerarg->arch == STARPU_CPU_WORKER)
+			{
+				int numa_logical_id = _starpu_get_logical_numa_node_worker(worker);
+
+				/* Convert logical id to StarPU id to check if this NUMA node is already saved or not */
+				int numa_starpu_id = starpu_memory_nodes_numa_hwloclogid_to_id(numa_logical_id);
+
+				/* This shouldn't happen */
+				if (numa_starpu_id == -1 && nb_numa_nodes == STARPU_MAXNUMANODES)
+				{
+					_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
+					STARPU_ABORT();
+				}
+
+				if (numa_starpu_id == -1)
+				{
+					int devid = numa_logical_id == STARPU_NUMA_MAIN_RAM ? 0 : numa_logical_id;
+					int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, devid);
+					STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available)", memnode, STARPU_MAXNUMANODES);
+					numa_memory_nodes_to_hwloclogid[memnode] = numa_logical_id;
+					int numa_physical_id = _starpu_get_physical_numa_node_worker(worker);
+					numa_memory_nodes_to_physicalid[memnode] = numa_physical_id;
+					nb_numa_nodes++;
+#ifdef STARPU_SIMGRID
+					snprintf(name, sizeof(name), "RAM%d", memnode);
+					host = _starpu_simgrid_get_host_by_name(name);
+					STARPU_ASSERT(host);
+					_starpu_simgrid_memory_node_set_host(memnode, host);
+#endif
+				}
+			}
+		}
+
+		/* If we found NUMA nodes from CPU workers, it's good */
+		if (nb_numa_nodes != 0)
+			return;
+
+		_STARPU_DISP("No NUMA nodes found when checking CPU workers...\n");
+
+#if (defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)) && defined(STARPU_HAVE_HWLOC)
+		_STARPU_DISP("Take NUMA nodes attached to CUDA and OpenCL devices...\n");
+#endif
+
+#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_HWLOC)
+		for (i = 0; i < config->topology.ncudagpus; i++)
+		{
+			hwloc_obj_t obj = hwloc_cuda_get_device_osdev_by_index(config->topology.hwtopology, i);
+
+			/* Hwloc cannot recognize some devices */
+			if (!obj)
+				continue;
+
+			while (obj->type != HWLOC_OBJ_NODE)
+			{
+				obj = obj->parent;
+
+				/* If we don't find a "node" obj before the root, this means
+				 * hwloc does not know whether there are numa nodes or not, so
+				 * we should not use a per-node sampling in that case. */
+				if (!obj)
+					continue;
+			}
+			int numa_starpu_id = starpu_memory_nodes_numa_hwloclogid_to_id(obj->logical_index);
+
+			/* This shouldn't happen */
+			if (numa_starpu_id == -1 && nb_numa_nodes == STARPU_MAXNUMANODES)
+			{
+				_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
+				STARPU_ABORT();
+			}
+
+			if (numa_starpu_id == -1)
+			{
+				int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, obj->logical_index);
+				STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available)", memnode, STARPU_MAXNUMANODES);
+				numa_memory_nodes_to_hwloclogid[memnode] = obj->logical_index;
+				numa_memory_nodes_to_physicalid[memnode] = obj->os_index;
+				nb_numa_nodes++;
+#ifdef STARPU_SIMGRID
+				snprintf(name, sizeof(name), "RAM%d", memnode);
+				host = _starpu_simgrid_get_host_by_name(name);
+				STARPU_ASSERT(host);
+				_starpu_simgrid_memory_node_set_host(memnode, host);
+#endif
+			}
+		}	
+#endif
+#if defined(STARPU_USE_OPENCL) && defined(STARPU_HAVE_HWLOC)
+		if (config->topology.nopenclgpus > 0)
+		{
+			cl_int err;
+			cl_platform_id platform_id[_STARPU_OPENCL_PLATFORM_MAX];
+			cl_uint nb_platforms;
+			unsigned platform;
+			unsigned nb_opencl_devices = 0, num = 0;
+
+			err = clGetPlatformIDs(_STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
+			if (STARPU_UNLIKELY(err != CL_SUCCESS)) 
+				nb_platforms=0;
+
+			cl_device_type device_type = CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR;
+			if (starpu_get_env_number("STARPU_OPENCL_ON_CPUS") > 0)
+				device_type |= CL_DEVICE_TYPE_CPU;
+			if (starpu_get_env_number("STARPU_OPENCL_ONLY_ON_CPUS") > 0)
+				device_type = CL_DEVICE_TYPE_CPU;
+
+			for (platform = 0; platform < nb_platforms ; platform++)
+			{
+				err = clGetDeviceIDs(platform_id[platform], device_type, 0, NULL, &num);
+				if (err != CL_SUCCESS)
+					num = 0;
+				nb_opencl_devices += num;
+
+				for (i = 0; i < num; i++)
+				{
+					hwloc_obj_t obj = hwloc_opencl_get_device_osdev_by_index(config->topology.hwtopology, platform, i);
+
+					/* Hwloc cannot recognize some devices */
+					if (!obj)
+						continue;
+
+					while (obj->type != HWLOC_OBJ_NODE)
+					{
+						obj = obj->parent;
+
+						/* If we don't find a "node" obj before the root, this means
+						 * hwloc does not know whether there are numa nodes or not, so
+						 * we should not use a per-node sampling in that case. */
+						if (!obj)
+							continue;
+					}
+					int numa_starpu_id = starpu_memory_nodes_numa_hwloclogid_to_id(obj->logical_index);
+
+					/* This shouldn't happen */
+					if (numa_starpu_id == -1 && nb_numa_nodes == STARPU_MAXNUMANODES)
+					{
+						_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
+						STARPU_ABORT();
+					}
+
+					if (numa_starpu_id == -1)
+					{
+						int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, obj->logical_index);
+						STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available)", memnode, STARPU_MAXNUMANODES);
+						numa_memory_nodes_to_hwloclogid[memnode] = obj->logical_index;
+						numa_memory_nodes_to_physicalid[memnode] = obj->os_index;	
+						nb_numa_nodes++;
 #ifdef STARPU_SIMGRID
-	char name[16];
-	msg_host_t host = _starpu_simgrid_get_host_by_name("RAM");
-	STARPU_ASSERT(host);
-	_starpu_simgrid_memory_node_set_host(STARPU_MAIN_RAM, host);
+						snprintf(name, sizeof(name), "RAM%d", memnode);
+						host = _starpu_simgrid_get_host_by_name(name);
+						STARPU_ASSERT(host);
+						_starpu_simgrid_memory_node_set_host(memnode, host);
 #endif
+					}
+				}	
+			}
+		}
+#endif
+	}
+	
+#if (defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)) && defined(STARPU_HAVE_HWLOC)
+	//Found NUMA nodes from CUDA nodes
+	if (nb_numa_nodes != 0)
+		return;
+
+	/* In case, we do not find any NUMA nodes when checking NUMA nodes attached to GPUs, we take all of them */
+	_STARPU_DISP("No NUMA nodes found when checking GPUs devices...\n");
+#endif
+
+	_STARPU_DISP("Finally, take all NUMA nodes available... \n");
+
+	unsigned nnuma = _starpu_topology_get_nnumanodes(config);
+	if (nnuma > STARPU_MAXNUMANODES)
+	{
+		_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
+		nnuma = STARPU_MAXNUMANODES;		
+	}
+
+	unsigned numa;
+	for (numa = 0; numa < nnuma; numa++)
+	{
+#if defined(STARPU_HAVE_HWLOC)
+		if (nnuma > 1)
+		{
+			hwloc_obj_t obj = hwloc_get_obj_by_type(config->topology.hwtopology, HWLOC_OBJ_NUMANODE, numa);
+			unsigned numa_logical_id = obj->logical_index;
+			unsigned numa_physical_id = obj->os_index;
 
+			int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, 0);
+			STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available) \n", memnode, STARPU_MAXNUMANODES);
+
+			numa_memory_nodes_to_hwloclogid[memnode] = numa_logical_id;
+			numa_memory_nodes_to_physicalid[memnode] = numa_physical_id;
+			nb_numa_nodes++;								
+
+#ifdef STARPU_SIMGRID
+			snprintf(name, sizeof(name), "RAM%d", memnode);
+			host = _starpu_simgrid_get_host_by_name(name);
+			STARPU_ASSERT(host);
+			_starpu_simgrid_memory_node_set_host(memnode, host);
+#endif
+		}
+		else
+#endif /* defined(STARPU_HAVE_HWLOC) */
+		{
+
+			/* In this case, nnuma has only one node */
+			int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, 0);
+			STARPU_ASSERT_MSG(memnode == STARPU_MAIN_RAM, "Wrong Memory Node : %d (expected %d) \n", memnode, STARPU_MAIN_RAM);
+
+			numa_memory_nodes_to_hwloclogid[memnode] = STARPU_NUMA_MAIN_RAM;
+			numa_memory_nodes_to_physicalid[memnode] = STARPU_NUMA_MAIN_RAM;
+			nb_numa_nodes++;								
+#ifdef STARPU_SIMGRID
+			char name[16];
+			msg_host_t host = _starpu_simgrid_get_host_by_name("RAM");
+			STARPU_ASSERT(host);
+			_starpu_simgrid_memory_node_set_host(STARPU_MAIN_RAM, host);
+#endif
+		}
+
+	}	
+	
+	STARPU_ASSERT_MSG(nb_numa_nodes > 0, "No NUMA node found... We need at least one memory node !\n");	
+}
+
+static void _starpu_init_numa_bus()
+{
+	unsigned i, j;
+	for (i = 0; i < nb_numa_nodes; i++)
+		for (j = 0; j < nb_numa_nodes; j++)
+			if (i != j)
+				numa_bus_id[i*nb_numa_nodes+j] = _starpu_register_bus(i, j);
+}
+
+static void
+_starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config, int no_mp_config STARPU_ATTRIBUTE_UNUSED)
+{
 	/* We will store all the busid of the different (src, dst)
 	 * combinations in a matrix which we initialize here. */
 	_starpu_initialize_busid_matrix();
 
-	/* Each device is initialized,
-	 * giving it a memory node and a core bind id.
-	 */
-	/* TODO: STARPU_MAXNUMANODES */
-	unsigned numa_init[1] = { 1 };
-	unsigned numa_memory_nodes[1] = { ram_memory_node };
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 	unsigned cuda_init[STARPU_MAXCUDADEVS] = { };
 	unsigned cuda_memory_nodes[STARPU_MAXCUDADEVS];
@@ -1801,6 +2252,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 	unsigned mpi_memory_nodes[STARPU_MAXMPIDEVS];
 	unsigned mpi_bindid[STARPU_MAXMPIDEVS];
 #endif
+
 	unsigned bindid;
 
 	for (bindid = 0; bindid < config->nbindid; bindid++)
@@ -1810,6 +2262,13 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 		config->bindid_workers[bindid].nworkers = 0;
 	}
 
+	/* Init CPU binding before NUMA nodes, because we use it to discover NUMA nodes */
+	_starpu_init_binding_cpu(config);
+
+	/* Initialize NUMA nodes */
+	_starpu_init_numa_node(config);
+	_starpu_init_numa_bus();
+
 	unsigned worker;
 	for (worker = 0; worker < config->topology.nworkers; worker++)
 	{
@@ -1828,33 +2287,22 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 		{
 			case STARPU_CPU_WORKER:
 			{
-				/* TODO: NUMA */
-				int numaid = 0;
-				/* "dedicate" a cpu core to that worker */
-				if (numa_init[numaid])
-				{
-					memory_node = numa_memory_nodes[numaid];
-				}
-				else
-				{
-					numa_init[numaid] = 1;
-					memory_node = numa_memory_nodes[numaid] = _starpu_memory_node_register(STARPU_CPU_RAM, numaid);
-#ifdef STARPU_SIMGRID
-					snprintf(name, sizeof(name), "RAM%d", numaid);
-					host = _starpu_simgrid_get_host_by_name(name);
-					STARPU_ASSERT(host);
-					_starpu_simgrid_memory_node_set_host(memory_node, host);
-#endif
-				}
-				workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
+				int numa_logical_id = _starpu_get_logical_numa_node_worker(worker);
+				int numa_starpu_id =  starpu_memory_nodes_numa_hwloclogid_to_id(numa_logical_id);
+				if (numa_starpu_id >= STARPU_MAXNUMANODES)
+					numa_starpu_id = STARPU_MAIN_RAM;
+
+				workerarg->numa_memory_node = memory_node = numa_starpu_id;
+
 				_starpu_memory_node_add_nworkers(memory_node);
 
-                                _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
-				_starpu_worker_drives_memory_node(workerarg, memory_node);
+				_starpu_worker_drives_memory_node(workerarg, numa_starpu_id);
 				break;
 			}
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 			case STARPU_CUDA_WORKER:
+			{
+				unsigned numa;
 #ifndef STARPU_SIMGRID
 				if (may_bind_automatically[STARPU_CUDA_WORKER])
 				{
@@ -1884,8 +2332,11 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 						workerarg->bindid = cuda_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 					memory_node = cuda_memory_nodes[devid] = _starpu_memory_node_register(STARPU_CUDA_RAM, devid);
 
-					_starpu_cuda_bus_ids[0][devid+1] = _starpu_register_bus(STARPU_MAIN_RAM, memory_node);
-					_starpu_cuda_bus_ids[devid+1][0] = _starpu_register_bus(memory_node, STARPU_MAIN_RAM);
+					for (numa = 0; numa < nb_numa_nodes; numa++)
+					{
+						_starpu_cuda_bus_ids[numa][devid+STARPU_MAXNUMANODES] = _starpu_register_bus(numa, memory_node);
+						_starpu_cuda_bus_ids[devid+STARPU_MAXNUMANODES][numa] = _starpu_register_bus(memory_node, numa);
+					}
 #ifdef STARPU_SIMGRID
 					const char* cuda_memcpy_peer;
 					snprintf(name, sizeof(name), "CUDA%u", devid);
@@ -1912,8 +2363,8 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 							if (workerarg2->arch == STARPU_CUDA_WORKER)
 							{
 								unsigned memory_node2 = starpu_worker_get_memory_node(worker2);
-								_starpu_cuda_bus_ids[devid2][devid] = _starpu_register_bus(memory_node2, memory_node);
-								_starpu_cuda_bus_ids[devid][devid2] = _starpu_register_bus(memory_node, memory_node2);
+								_starpu_cuda_bus_ids[devid2+STARPU_MAXNUMANODES][devid+STARPU_MAXNUMANODES] = _starpu_register_bus(memory_node2, memory_node);
+								_starpu_cuda_bus_ids[devid+STARPU_MAXNUMANODES][devid2+STARPU_MAXNUMANODES] = _starpu_register_bus(memory_node, memory_node2);
 #ifndef STARPU_SIMGRID
 #if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX
 								{
@@ -1931,8 +2382,8 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 											_STARPU_DEBUG("CUDA%u and CUDA%u are linked through %s, along %u GPUs\n", devid, devid2, name, data->ngpus);
 										}
 #endif
-										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid2][devid], data->ngpus);
-										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid][devid2], data->ngpus);
+										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid2+STARPU_MAXNUMANODES][devid+STARPU_MAXNUMANODES], data->ngpus);
+										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid+STARPU_MAXNUMANODES][devid2+STARPU_MAXNUMANODES], data->ngpus);
 									}
 								}
 #endif
@@ -1943,13 +2394,19 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				}
 				_starpu_memory_node_add_nworkers(memory_node);
 
-                                _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
+				//This worker can manage transfers on NUMA nodes
+				for (numa = 0; numa < nb_numa_nodes; numa++)
+						_starpu_worker_drives_memory_node(&workerarg->set->workers[0], numa);
+
 				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
 				break;
+			}
 #endif
 
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
 		        case STARPU_OPENCL_WORKER:
+			{
+				unsigned numa;
 #ifndef STARPU_SIMGRID
 				if (may_bind_automatically[STARPU_OPENCL_WORKER])
 				{
@@ -1970,8 +2427,12 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 					opencl_init[devid] = 1;
 					workerarg->bindid = opencl_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 					memory_node = opencl_memory_nodes[devid] = _starpu_memory_node_register(STARPU_OPENCL_RAM, devid);
-					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
-					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
+
+					for (numa = 0; numa < nb_numa_nodes; numa++)
+					{
+						_starpu_register_bus(numa, memory_node);
+						_starpu_register_bus(memory_node, numa);
+					}
 #ifdef STARPU_SIMGRID
 					snprintf(name, sizeof(name), "OpenCL%u", devid);
 					host = _starpu_simgrid_get_host_by_name(name);
@@ -1981,13 +2442,19 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				}
 				_starpu_memory_node_add_nworkers(memory_node);
 
-                                _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
+				//This worker can manage transfers on NUMA nodes
+				for (numa = 0; numa < nb_numa_nodes; numa++)
+						_starpu_worker_drives_memory_node(workerarg, numa);
+
 				_starpu_worker_drives_memory_node(workerarg, memory_node);
 				break;
+			}
 #endif
 
 #ifdef STARPU_USE_MIC
 		        case STARPU_MIC_WORKER:
+			{
+				unsigned numa;
 				if (mic_init[devid])
 				{
 					memory_node = mic_memory_nodes[devid];
@@ -2004,21 +2471,30 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 					//}
 					mic_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 					memory_node = mic_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MIC_RAM, devid);
-					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
-					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
+
+					for (numa = 0; numa < nb_numa_nodes; numa++)
+					{
+						_starpu_register_bus(numa, memory_node);
+						_starpu_register_bus(memory_node, numa);
+					}
 
 				}
 				workerarg->bindid = mic_bindid[devid];
 				_starpu_memory_node_add_nworkers(memory_node);
 
-                                _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
+				//This worker can manage transfers on NUMA nodes
+				for (numa = 0; numa < nb_numa_nodes; numa++)
+						_starpu_worker_drives_memory_node(&workerarg->set->workers[0], numa);
+
 				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
 				break;
+			}
 #endif /* STARPU_USE_MIC */
 
 #ifdef STARPU_USE_SCC
 			case STARPU_SCC_WORKER:
 			{
+				unsigned numa;
 				/* Node 0 represents the SCC shared memory when we're on SCC. */
 				struct _starpu_memory_node_descr *descr = _starpu_memory_node_get_description();
 				descr->nodes[ram_memory_node] = STARPU_SCC_SHM;
@@ -2026,7 +2502,10 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				memory_node = ram_memory_node;
 				_starpu_memory_node_add_nworkers(memory_node);
 
-                                _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
+				//This worker can manage transfers on NUMA nodes
+				for (numa = 0; numa < nb_numa_nodes; numa++)
+						_starpu_worker_drives_memory_node(workerarg, numa);
+
 				_starpu_worker_drives_memory_node(workerarg, memory_node);
 			}
 				break;
@@ -2035,6 +2514,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 			case STARPU_MPI_MS_WORKER:
 			{
+				unsigned numa;
 				if (mpi_init[devid])
 				{
 					memory_node = mpi_memory_nodes[devid];
@@ -2044,11 +2524,18 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 					mpi_init[devid] = 1;
 					mpi_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 					memory_node = mpi_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MPI_MS_RAM, devid);
-					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
-					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
+		
+					for (numa = 0; numa < nb_numa_nodes; numa++)
+					{	
+						_starpu_register_bus(numa, memory_node);
+						_starpu_register_bus(memory_node, numa);
+					}
 
 				}
-                                _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
+				//This worker can manage transfers on NUMA nodes
+				for (numa = 0; numa < nb_numa_nodes; numa++)
+						_starpu_worker_drives_memory_node(&workerarg->set->workers[0], numa);
+
 				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
 #ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
                                 /* MPI driver thread can manage all slave memories if we disable the MPI multiple thread */
@@ -2154,7 +2641,7 @@ _starpu_build_topology (struct _starpu_machine_config *config, int no_mp_config)
 	_starpu_memory_nodes_init();
 	_starpu_datastats_init();
 
-	_starpu_init_workers_binding(config, no_mp_config);
+	_starpu_init_workers_binding_and_memory(config, no_mp_config);
 
 	config->cpus_nodeid = -1;
 	config->cuda_nodeid = -1;
@@ -2293,3 +2780,4 @@ starpu_topology_print (FILE *output)
 		fprintf(output, "\n");
 	}
 }
+

+ 8 - 1
src/core/topology.h

@@ -1,7 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2010, 2012, 2014-2016  Université de Bordeaux
+ * Copyright (C) 2009-2010, 2012, 2014-2017  Université de Bordeaux
  * Copyright (C) 2010, 2015, 2017  CNRS
+ * Copyright (C) 2017  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -51,6 +52,9 @@ unsigned _starpu_topology_get_nhwcpu(struct _starpu_machine_config *config);
 /* returns the number of logical cpus */
 unsigned _starpu_topology_get_nhwpu(struct _starpu_machine_config *config);
 
+/* returns the number of NUMA nodes */
+unsigned _starpu_topology_get_nnumanodes(struct _starpu_machine_config *config);
+
 #ifdef STARPU_HAVE_HWLOC
 /* Small convenient function to filter hwloc topology depending on HWLOC API version */
 void _starpu_topology_filter(hwloc_topology_t topology);
@@ -68,4 +72,7 @@ void _starpu_bind_thread_on_cpus(struct _starpu_machine_config *config STARPU_AT
 
 struct _starpu_worker *_starpu_get_worker_from_driver(struct starpu_driver *d);
 
+int starpu_memory_nodes_get_numa_count(void);
+int starpu_memory_nodes_numa_id_to_hwloclogid(unsigned id);
+	
 #endif // __TOPOLOGY_H__

+ 7 - 2
src/core/workers.c

@@ -1599,8 +1599,13 @@ void starpu_shutdown(void)
 
 	/* tell all workers to shutdown */
 	_starpu_kill_all_workers(&_starpu_config);
-
-	_starpu_free_all_automatically_allocated_buffers(STARPU_MAIN_RAM);
+	
+	unsigned i;
+	unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
+	for (i=0; i<nb_numa_nodes; i++)
+	{
+		_starpu_free_all_automatically_allocated_buffers(i);
+	}
 
 	{
 	     int stats = starpu_get_env_number("STARPU_STATS");

+ 1 - 0
src/core/workers.h

@@ -84,6 +84,7 @@ LIST_TYPE(_starpu_worker,
 	starpu_pthread_cond_t started_cond; /* indicate when the worker is ready */
 	starpu_pthread_cond_t ready_cond; /* indicate when the worker is ready */
 	unsigned memory_node; /* which memory node is the worker associated with ? */
+	unsigned numa_memory_node; /* which numa memory node is the worker associated with? (logical index) */
 	/* condition variable used for passive waiting operations on worker
 	 * STARPU_PTHREAD_COND_BROADCAST must be used instead of STARPU_PTHREAD_COND_SIGNAL,
 	 * since the condition is shared for multiple purpose */

+ 29 - 4
src/datawizard/coherency.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures. *
  * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
- * Copyright (C) 2014  INRIA
+ * Copyright (C) 2014, 2017  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -320,6 +320,29 @@ static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned
 	return 0;
 }
 
+/* Now, we use slowness/bandwidth to compare numa nodes, is it better to use latency ? */
+static unsigned chose_best_numa_between_src_and_dest(int src, int dst)
+{
+	double timing_best;
+	int best_numa = -1;
+	unsigned numa;
+	const unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
+	for(numa = 0; numa < nb_numa_nodes; numa++)
+	{
+		double actual = 1.0/starpu_transfer_bandwidth(src, numa) + 1.0/starpu_transfer_bandwidth(numa, dst);
+
+		/* Compare slowness : take the lowest */
+		if (best_numa < 0 || actual < timing_best)
+		{
+			best_numa = numa;
+			timing_best = actual;
+		}
+	}
+	STARPU_ASSERT(best_numa >= 0);
+	
+	return best_numa;
+}
+
 /* Determines the path of a request : each hop is defined by (src,dst) and the
  * node that handles the hop. The returned value indicates the number of hops,
  * and the max_len is the maximum number of hops (ie. the size of the
@@ -362,9 +385,11 @@ static int determine_request_path(starpu_data_handle_t handle,
 		STARPU_ASSERT(max_len >= 2);
 		STARPU_ASSERT(src_node >= 0);
 
+		unsigned numa = chose_best_numa_between_src_and_dest(src_node, dst_node);
+
 		/* GPU -> RAM */
 		src_nodes[0] = src_node;
-		dst_nodes[0] = STARPU_MAIN_RAM;
+		dst_nodes[0] = numa;
 
 		if (starpu_node_get_kind(src_node) == STARPU_DISK_RAM)
 			/* Disks don't have their own driver thread */
@@ -380,7 +405,7 @@ static int determine_request_path(starpu_data_handle_t handle,
 		}
 
 		/* RAM -> GPU */
-		src_nodes[1] = STARPU_MAIN_RAM;
+		src_nodes[1] = numa;
 		dst_nodes[1] = dst_node;
 
 		if (starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
@@ -573,7 +598,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		/* if the data is in write only mode (and not SCRATCH or REDUX), there is no need for a source, data will be initialized by the task itself */
 		if (mode & STARPU_W)
 			dst_replicate->initialized = 1;
-		if (requesting_node == STARPU_MAIN_RAM && !nwait)
+		if (starpu_node_get_kind(requesting_node) == STARPU_CPU_RAM && !nwait)
 		{
 			/* And this is the main RAM, really no need for a
 			 * request, just allocate */

+ 1 - 2
src/datawizard/copy_driver.c

@@ -864,10 +864,9 @@ void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_
                 _starpu_mpi_common_wait_event(async_channel);
                 break;
 #endif
-	case STARPU_MAIN_RAM:
+	case STARPU_CPU_RAM:
 		starpu_disk_wait_request(async_channel);
 		break;
-	case STARPU_CPU_RAM:
 	default:
 		STARPU_ABORT();
 	}

+ 2 - 2
src/datawizard/data_request.c

@@ -147,7 +147,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 	if (handling_node == -1)
 		handling_node = STARPU_MAIN_RAM;
 	r->handling_node = handling_node;
-	STARPU_ASSERT(handling_node == STARPU_MAIN_RAM || _starpu_memory_node_get_nworkers(handling_node));
+	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
 	r->completed = 0;
 	r->prefetch = is_prefetch;
 	r->prio = prio;
@@ -276,7 +276,7 @@ void _starpu_post_data_request(struct _starpu_data_request *r)
 	unsigned handling_node = r->handling_node;
 	/* We don't have a worker for disk nodes, these should have been posted to a main RAM node */
 	STARPU_ASSERT(starpu_node_get_kind(handling_node) != STARPU_DISK_RAM);
-	STARPU_ASSERT(handling_node == STARPU_MAIN_RAM || _starpu_memory_node_get_nworkers(handling_node));
+	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
 
 //	_STARPU_DEBUG("POST REQUEST\n");
 

+ 10 - 1
src/datawizard/datawizard.c

@@ -22,6 +22,7 @@
 #include <datawizard/memory_nodes.h>
 #include <core/workers.h>
 #include <core/progress_hook.h>
+#include <core/topology.h>
 #ifdef STARPU_SIMGRID
 #include <core/simgrid.h>
 #endif
@@ -71,8 +72,16 @@ int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
         unsigned memnode;
 
 	if (!worker)
+	{
 		/* Call from main application, only make RAM requests progress */
-		return ___starpu_datawizard_progress(STARPU_MAIN_RAM, may_alloc, push_requests);
+		int ret = 0;
+		int nnumas = starpu_memory_nodes_get_numa_count();
+		int numa;
+		for (numa = 0; numa < nnumas; numa++)
+			ret |=  ___starpu_datawizard_progress(numa, may_alloc, push_requests);
+
+		return ret;
+	}
 	if (worker->set)
 		/* Runing one of the workers of a worker set. The reference for
 		 * driving memory is its worker 0 (see registrations in topology.c) */

+ 1 - 1
src/datawizard/datawizard.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2009, 2010, 2014, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2013  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify

+ 24 - 10
src/datawizard/filters.c

@@ -188,14 +188,18 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		/* This is lazy allocation, allocate it now in main RAM, so as
 		 * to have somewhere to gather pieces later */
 		/* FIXME: mark as unevictable! */
-		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[STARPU_MAIN_RAM], 0);
+		int home_node = initial_handle->home_node;
+		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
+			home_node = STARPU_MAIN_RAM;
+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], 0);
 #ifdef STARPU_DEVEL
 #warning we should reclaim memory if allocation failed
 #endif
 		STARPU_ASSERT(!ret);
 	}
 
-	_starpu_data_unregister_ram_pointer(initial_handle);
+	for (node = 0; node < STARPU_MAXNODES; node++)
+		_starpu_data_unregister_ram_pointer(initial_handle, node);
 
 	if (nparts && !inherit_state)
 	{
@@ -324,10 +328,14 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		 * store it in the handle */
 		child->footprint = _starpu_compute_data_footprint(child);
 
-		void *ptr;
-		ptr = starpu_data_handle_to_pointer(child, STARPU_MAIN_RAM);
-		if (ptr != NULL)
-			_starpu_data_register_ram_pointer(child, ptr);
+		for (node = 0; node < STARPU_MAXNODES; node++)
+		{
+			if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
+				continue;
+			void *ptr = starpu_data_handle_to_pointer(child, node);
+			if (ptr != NULL)
+				_starpu_data_register_ram_pointer(child, ptr);
+		}
 
 		_STARPU_TRACE_HANDLE_DATA_REGISTER(child);
 	}
@@ -428,7 +436,8 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 			child_handle->unregister_hook(child_handle);
 		}
 
-		_starpu_data_unregister_ram_pointer(child_handle);
+		for (node = 0; node < STARPU_MAXNODES; node++)
+			_starpu_data_unregister_ram_pointer(child_handle, node);
 
 		if (child_handle->per_worker)
 		{
@@ -444,9 +453,14 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 		_starpu_memory_stats_free(child_handle);
 	}
 
-	ptr = starpu_data_handle_to_pointer(root_handle, STARPU_MAIN_RAM);
-	if (ptr != NULL)
-		_starpu_data_register_ram_pointer(root_handle, ptr);
+	for (node = 0; node < STARPU_MAXNODES; node++)
+	{
+		if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
+			continue;
+		ptr = starpu_data_handle_to_pointer(root_handle, node);
+		if (ptr != NULL)
+			_starpu_data_register_ram_pointer(root_handle, ptr);
+	}
 
 	/* the gathering_node should now have a valid copy of all the children.
 	 * For all nodes, if the node had all copies and none was locally

+ 12 - 4
src/datawizard/interfaces/bcsr_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -130,7 +130,7 @@ void starpu_bcsr_data_register(starpu_data_handle_t *handleptr, int home_node,
 		.elemsize = elemsize
 	};
 #ifndef STARPU_SIMGRID
-	if (home_node == STARPU_MAIN_RAM)
+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
 	{
 		STARPU_ASSERT_ACCESSIBLE(nzval);
 		STARPU_ASSERT_ACCESSIBLE(nzval + nnz*elemsize*r*c - 1);
@@ -260,9 +260,13 @@ uintptr_t starpu_bcsr_get_local_nzval(starpu_data_handle_t handle)
 
 uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 {
+	int node = handle->home_node;
+	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+
 	/* XXX 0 */
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 #ifdef STARPU_DEBUG
 	STARPU_ASSERT_MSG(data_interface->id == STARPU_BCSR_INTERFACE_ID, "Error. The given data is not a bcsr.");
@@ -273,9 +277,13 @@ uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 
 uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
 {
+	int node = handle->home_node;
+	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+
 	/* XXX 0 */
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 #ifdef STARPU_DEBUG
 	STARPU_ASSERT_MSG(data_interface->id == STARPU_BCSR_INTERFACE_ID, "Error. The given data is not a bcsr.");

+ 2 - 2
src/datawizard/interfaces/block_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -163,7 +163,7 @@ void starpu_block_data_register(starpu_data_handle_t *handleptr, int home_node,
 		.elemsize = elemsize
 	};
 #ifndef STARPU_SIMGRID
-	if (home_node == STARPU_MAIN_RAM)
+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
 	{
 		STARPU_ASSERT_ACCESSIBLE(ptr);
 		STARPU_ASSERT_ACCESSIBLE(ptr + (nz-1)*ldz*elemsize + (ny-1)*ldy*elemsize + nx*elemsize - 1);

+ 3 - 2
src/datawizard/interfaces/coo_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013-2016  Université Bordeaux
+ * Copyright (C) 2013-2017  Université Bordeaux
  * Copyright (C) 2012 INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -18,6 +18,7 @@
 #include <starpu.h>
 #include <common/fxt.h>
 #include <datawizard/memalloc.h>
+#include <datawizard/memory_nodes.h>
 
 static int
 copy_any_to_any(void *src_interface, unsigned src_node,
@@ -236,7 +237,7 @@ starpu_coo_data_register(starpu_data_handle_t *handleptr, int home_node,
 		.elemsize = elemsize,
 	};
 #ifndef STARPU_SIMGRID
-	if (home_node == STARPU_MAIN_RAM)
+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
 	{
 		STARPU_ASSERT_ACCESSIBLE(columns);
 		STARPU_ASSERT_ACCESSIBLE((uintptr_t) columns + n_values*sizeof(uint32_t) - 1);

+ 2 - 2
src/datawizard/interfaces/csr_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  *
@@ -112,7 +112,7 @@ void starpu_csr_data_register(starpu_data_handle_t *handleptr, int home_node,
 		.elemsize = elemsize
 	};
 #ifndef STARPU_SIMGRID
-	if (home_node == STARPU_MAIN_RAM)
+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
 	{
 		STARPU_ASSERT_ACCESSIBLE(nzval);
 		STARPU_ASSERT_ACCESSIBLE(nzval + nnz*elemsize - 1);

+ 25 - 14
src/datawizard/interfaces/data_interface.c

@@ -369,12 +369,14 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 	/* now the data is available ! */
 	_starpu_spin_unlock(&handle->header_lock);
 
-
-
-	ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
-	if (ptr != NULL)
+	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		_starpu_data_register_ram_pointer(handle, ptr);
+		if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
+			continue;
+
+		ptr = starpu_data_handle_to_pointer(handle, node);
+		if (ptr != NULL)
+			_starpu_data_register_ram_pointer(handle, ptr);
 	}
 }
 
@@ -521,13 +523,17 @@ struct starpu_data_interface_ops* starpu_data_get_interface_ops(starpu_data_hand
  * Stop monitoring a piece of data
  */
 
-void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle)
+void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle, unsigned node)
 {
-	const void *ram_ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
+	if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
+		return;
+
 #ifdef STARPU_OPENMP
 	if (handle->removed_from_context_hash)
 		return;
 #endif
+	const void *ram_ptr = starpu_data_handle_to_pointer(handle, node);
+
 	if (ram_ptr != NULL)
 	{
 		/* Remove the PTR -> HANDLE mapping.  If a mapping from PTR
@@ -757,7 +763,10 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 			_STARPU_DEBUG("Conversion needed\n");
 			void *buffers[1];
 			struct starpu_multiformat_interface *format_interface;
-			format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+			home_node = handle->home_node;
+			if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
+				home_node = STARPU_MAIN_RAM;
+			format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, home_node);
 			struct starpu_codelet *cl = NULL;
 			enum starpu_node_kind node_kind = starpu_node_get_kind(handle->mf_node);
 
@@ -850,16 +859,19 @@ retry_busy:
 
 	size_t size = _starpu_data_get_size(handle);
 
-	_starpu_data_unregister_ram_pointer(handle);
-
 	/* Destroy the data now */
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
 		struct _starpu_data_replicate *local = &handle->per_node[node];
+		if (local->allocated)
+		{
+			_starpu_data_unregister_ram_pointer(handle, node);
+
 		/* free the data copy in a lazy fashion */
-		if (local->allocated && local->automatically_allocated)
-			_starpu_request_mem_chunk_removal(handle, local, node, size);
+			if (local->automatically_allocated)
+				_starpu_request_mem_chunk_removal(handle, local, node, size);
+		}
 	}
 	if (handle->per_worker)
 	{
@@ -976,8 +988,7 @@ static void _starpu_data_invalidate(void *data)
 
 		if (local->mc && local->allocated && local->automatically_allocated)
 		{
-			if (node == STARPU_MAIN_RAM)
-				_starpu_data_unregister_ram_pointer(handle);
+			_starpu_data_unregister_ram_pointer(handle, node);
 
 			/* free the data copy in a lazy fashion */
 			_starpu_request_mem_chunk_removal(handle, local, node, size);

+ 1 - 1
src/datawizard/interfaces/data_interface.h

@@ -78,7 +78,7 @@ extern void _starpu_data_register_ram_pointer(starpu_data_handle_t handle,
 						void *ptr)
 	STARPU_ATTRIBUTE_INTERNAL;
 
-extern void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle)
+extern void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle, unsigned node)
 	STARPU_ATTRIBUTE_INTERNAL;
 
 #define _starpu_data_is_multiformat_handle(handle) handle->ops->is_multiformat

+ 2 - 2
src/datawizard/interfaces/matrix_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2016  Université de Bordeaux
+ * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -174,7 +174,7 @@ void starpu_matrix_data_register(starpu_data_handle_t *handleptr, int home_node,
                 .offset = 0
 	};
 #ifndef STARPU_SIMGRID
-	if (home_node == STARPU_MAIN_RAM)
+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
 	{
 		STARPU_ASSERT_ACCESSIBLE(ptr);
 		STARPU_ASSERT_ACCESSIBLE(ptr + (ny-1)*ld*elemsize + nx*elemsize - 1);

+ 2 - 2
src/datawizard/interfaces/variable_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2016  Université de Bordeaux
+ * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -113,7 +113,7 @@ void starpu_variable_data_register(starpu_data_handle_t *handleptr, int home_nod
 		.elemsize = elemsize
 	};
 #ifndef STARPU_SIMGRID
-	if (home_node == STARPU_MAIN_RAM)
+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
 	{
 		STARPU_ASSERT_ACCESSIBLE(ptr);
 		STARPU_ASSERT_ACCESSIBLE(ptr + elemsize - 1);

+ 2 - 2
src/datawizard/interfaces/vector_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  * Copyright (C) 2017  Inria
  *
@@ -122,7 +122,7 @@ void starpu_vector_data_register(starpu_data_handle_t *handleptr, int home_node,
                 .offset = 0
 	};
 #if (!defined(STARPU_SIMGRID) && !defined(STARPU_OPENMP))
-	if (home_node == STARPU_MAIN_RAM)
+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
 	{
 		STARPU_ASSERT_ACCESSIBLE(ptr);
 		STARPU_ASSERT_ACCESSIBLE(ptr + nx*elemsize - 1);

+ 41 - 10
src/datawizard/malloc.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009-2010, 2012-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
+ * Copyright (C) 2017  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -112,8 +113,14 @@ static struct starpu_codelet malloc_pinned_cl =
 };
 #endif
 
+/* Allocation in CPU RAM */
 int starpu_malloc_flags(void **A, size_t dim, int flags)
 {
+	return _starpu_malloc_flags_on_node(STARPU_MAIN_RAM, A, dim, flags);
+}
+
+int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags)
+{
 	int ret=0;
 
 	STARPU_ASSERT(A);
@@ -121,14 +128,14 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 	if (flags & STARPU_MALLOC_COUNT)
 	{
 		if (!(flags & STARPU_MALLOC_NORECLAIM))
-			while (starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags) != 0)
+			while (starpu_memory_allocate(dst_node, dim, flags) != 0)
 			{
 				size_t freed;
 				size_t reclaim = 2 * dim;
 				_STARPU_DEBUG("There is not enough memory left, we are going to reclaim %ld\n", (long)reclaim);
-				_STARPU_TRACE_START_MEMRECLAIM(STARPU_MAIN_RAM,0);
-				freed = _starpu_memory_reclaim_generic(STARPU_MAIN_RAM, 0, reclaim);
-				_STARPU_TRACE_END_MEMRECLAIM(STARPU_MAIN_RAM,0);
+				_STARPU_TRACE_START_MEMRECLAIM(dst_node,0);
+				freed = _starpu_memory_reclaim_generic(dst_node, 0, reclaim);
+				_STARPU_TRACE_END_MEMRECLAIM(dst_node,0);
 				if (freed < dim && !(flags & STARPU_MEMORY_WAIT))
 				{
 					// We could not reclaim enough memory
@@ -137,9 +144,9 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 				}
 			}
 		else if (flags & STARPU_MEMORY_WAIT)
-			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags);
+			starpu_memory_allocate(dst_node, dim, flags);
 		else
-			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags | STARPU_MEMORY_OVERFLOW);
+			starpu_memory_allocate(dst_node, dim, flags | STARPU_MEMORY_OVERFLOW);
 	}
 
 	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
@@ -298,6 +305,18 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 		_starpu_scc_allocate_shared_memory(A, dim);
 #endif
 	}
+#ifdef STARPU_HAVE_HWLOC
+	if (starpu_memory_nodes_get_numa_count() > 1) {
+		struct _starpu_machine_config *config = _starpu_get_machine_config();
+		hwloc_topology_t hwtopology = config->topology.hwtopology;
+		hwloc_obj_t numa_node_obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NODE, starpu_memory_nodes_numa_id_to_hwloclogid(dst_node));
+		hwloc_bitmap_t nodeset = numa_node_obj->nodeset;
+		*A = hwloc_alloc_membind_nodeset(hwtopology, dim, nodeset, HWLOC_MEMBIND_BIND | HWLOC_MEMBIND_NOCPUBIND, flags);
+		//fprintf(stderr, "Allocation %lu bytes on NUMA node %d [%p]\n", (unsigned long) dim, starpu_memnode_get_numaphysid(dst_node), *A);
+		if (!*A)
+			ret = -ENOMEM;
+	}
+#endif /* STARPU_HAVE_HWLOC */
 	else
 #ifdef STARPU_HAVE_POSIX_MEMALIGN
 	if (_malloc_align != sizeof(void*))
@@ -333,7 +352,7 @@ end:
 	}
 	else if (flags & STARPU_MALLOC_COUNT)
 	{
-		starpu_memory_deallocate(STARPU_MAIN_RAM, dim);
+		starpu_memory_deallocate(dst_node, dim);
 	}
 
 	return ret;
@@ -383,6 +402,11 @@ static struct starpu_codelet free_pinned_cl =
 
 int starpu_free_flags(void *A, size_t dim, int flags)
 {
+	return _starpu_free_flags_on_node(STARPU_MAIN_RAM, A, dim, flags);
+}
+
+int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags)
+{
 	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
 	{
 		if (_starpu_can_submit_cuda_task())
@@ -470,6 +494,13 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 		_starpu_scc_free_shared_memory(A);
 #endif
 	}
+#ifdef STARPU_HAVE_HWLOC
+	else if (starpu_memory_nodes_get_numa_count() > 1) {
+		struct _starpu_machine_config *config = _starpu_get_machine_config();
+		hwloc_topology_t hwtopology = config->topology.hwtopology;
+		hwloc_free(hwtopology, A, dim);
+	}
+#endif /* STARPU_HAVE_HWLOC */
 	else
 		free(A);
 
@@ -478,7 +509,7 @@ out:
 #endif
 	if (flags & STARPU_MALLOC_COUNT)
 	{
-		starpu_memory_deallocate(STARPU_MAIN_RAM, dim);
+		starpu_memory_deallocate(dst_node, dim);
 	}
 
 	return 0;
@@ -516,7 +547,7 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 	{
 		case STARPU_CPU_RAM:
 		{
-			starpu_malloc_flags((void**) &addr, size,
+			_starpu_malloc_flags_on_node(dst_node, (void**) &addr, size,			
 #if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 					/* without memcpy_peer, we can not
 					 * allocated pinned memory, since it
@@ -646,7 +677,7 @@ _starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int f
 	switch(kind)
 	{
 		case STARPU_CPU_RAM:
-			starpu_free_flags((void*)addr, size,
+			_starpu_free_flags_on_node(dst_node, (void*)addr, size,
 #if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 					flags & ~STARPU_MALLOC_PINNED
 #else

+ 3 - 1
src/datawizard/malloc.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013  Université de Bordeaux
+ * Copyright (C) 2013, 2017  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,4 +22,6 @@ void _starpu_malloc_shutdown(unsigned dst_node);
 
 void _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
 
+int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags);
+int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags);
 #endif

+ 46 - 26
src/datawizard/memalloc.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
- * Copyright (C) 2016  Inria
+ * Copyright (C) 2016, 2017  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,6 +21,7 @@
 #include <datawizard/memalloc.h>
 #include <datawizard/footprint.h>
 #include <core/disk.h>
+#include <core/topology.h>
 #include <starpu.h>
 #include <common/uthash.h>
 
@@ -382,8 +383,8 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 			data_interface = mc->chunk_interface;
 		STARPU_ASSERT(data_interface);
 
-		if (handle && node == STARPU_MAIN_RAM)
-			_starpu_data_unregister_ram_pointer(handle);
+		if (handle && (starpu_node_get_kind(node) == STARPU_CPU_RAM))
+			_starpu_data_unregister_ram_pointer(handle, node);
 
 		_STARPU_TRACE_START_FREE(node, mc->size);
 		mc->ops->free_data_on_node(data_interface, node);
@@ -443,8 +444,7 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 	struct _starpu_data_replicate *old_replicate = mc->replicate;
 	if (old_replicate)
 	{
-		if (node == STARPU_MAIN_RAM)
-			_starpu_data_unregister_ram_pointer(old_replicate->handle);
+		_starpu_data_unregister_ram_pointer(old_replicate->handle, node);
 		old_replicate->allocated = 0;
 		old_replicate->automatically_allocated = 0;
 		old_replicate->initialized = 0;
@@ -1486,11 +1486,11 @@ int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_
 	replicate->allocated = 1;
 	replicate->automatically_allocated = 1;
 
-	if (replicate->relaxed_coherency == 0 && dst_node == STARPU_MAIN_RAM)
+	if (replicate->relaxed_coherency == 0 && (starpu_node_get_kind(dst_node) == STARPU_CPU_RAM))
 	{
 		/* We are allocating the buffer in main memory, also register it
 		 * for the gcc plugin.  */
-		void *ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
+		void *ptr = starpu_data_handle_to_pointer(handle, dst_node);
 		if (ptr != NULL)
 		{
 			_starpu_data_register_ram_pointer(handle, ptr);
@@ -1617,7 +1617,7 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 	int target = -1;
 	unsigned nnodes = starpu_memory_nodes_get_count();
 	unsigned int i;
-	double time_disk = 0;
+	double time_disk = 0.0;
 
 	for (i = 0; i < nnodes; i++)
 	{
@@ -1628,13 +1628,17 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 			/* if we can write on the disk */
 			if (_starpu_get_disk_flag(i) != STARPU_DISK_NO_RECLAIM)
 			{
-				/* only time can change between disk <-> main_ram
-				 * and not between main_ram <-> worker if we compare diks*/
-				double time_tmp = starpu_transfer_predict(i, STARPU_MAIN_RAM, _starpu_data_get_size(handle));
-				if (target == -1 || time_disk > time_tmp)
+				unsigned numa;
+				unsigned nnumas = starpu_memory_nodes_get_numa_count();
+				for (numa = 0; numa < nnumas; numa++)
 				{
-					target = i;
-					time_disk = time_tmp;
+					/* TODO : check if starpu_transfer_predict(node, i,...) is the same */
+					double time_tmp = starpu_transfer_predict(node, numa, _starpu_data_get_size(handle)) + starpu_transfer_predict(i, numa, _starpu_data_get_size(handle));
+					if (target == -1 || time_disk > time_tmp)
+					{
+						target = i;
+						time_disk = time_tmp;
+					}
 				}
 			}
 		}
@@ -1642,6 +1646,9 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 	return target;
 }
 
+#ifdef STARPU_DEVEL
+#  warning TODO: better choose NUMA node
+#endif
 
 static unsigned
 choose_target(starpu_data_handle_t handle, unsigned node)
@@ -1650,14 +1657,20 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 	size_t size_handle = _starpu_data_get_size(handle);
 	if (handle->home_node != -1)
 		/* try to push on RAM if we can before to push on disk */
-		if(starpu_node_get_kind(handle->home_node) == STARPU_DISK_RAM && node != STARPU_MAIN_RAM)
+		if(starpu_node_get_kind(handle->home_node) == STARPU_DISK_RAM && (starpu_node_get_kind(node) != STARPU_CPU_RAM))
 		{
-			if (handle->per_node[STARPU_MAIN_RAM].allocated ||
-			    _starpu_memory_manager_test_allocate_size(STARPU_MAIN_RAM, size_handle) == 1)
+ 	                unsigned i;
+			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
+			for (i=0; i<nb_numa_nodes; i++)
 			{
-				target = STARPU_MAIN_RAM;
+				if (handle->per_node[i].allocated || 
+				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
+				{
+					target = i;
+					break;
+				}
 			}
-			else
+			if (target == -1)
 			{
 				target = get_better_disk_can_accept_size(handle, node);
 			}
@@ -1672,19 +1685,26 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 	{
 		/* handle->home_node == -1 */
 		/* no place for datas in RAM, we push on disk */
-		if (node == STARPU_MAIN_RAM)
+		if (starpu_node_get_kind(node) == STARPU_CPU_RAM)
 		{
 			target = get_better_disk_can_accept_size(handle, node);
-		}
+		} else {
 		/* node != 0 */
 		/* try to push data to RAM if we can before to push on disk*/
-		else if (handle->per_node[STARPU_MAIN_RAM].allocated ||
-			 _starpu_memory_manager_test_allocate_size(STARPU_MAIN_RAM, size_handle) == 1)
-		{
-			target = STARPU_MAIN_RAM;
+			unsigned i;
+			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
+			for (i=0; i<nb_numa_nodes; i++)
+			{
+				if (handle->per_node[i].allocated || 
+				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
+				{
+					target = i;
+					break;
+				}
+			}
 		}
 		/* no place in RAM */
-		else
+		if (target == -1)
 		{
 			target = get_better_disk_can_accept_size(handle, node);
 		}

+ 1 - 1
src/datawizard/memory_nodes.c

@@ -77,7 +77,7 @@ void _starpu_memory_node_get_name(unsigned node, char *name, int size)
 	switch (_starpu_descr.nodes[node])
 	{
 	case STARPU_CPU_RAM:
-		prefix = "RAM";
+		prefix = "NUMA";
 		break;
 	case STARPU_CUDA_RAM:
 		prefix = "CUDA";

+ 16 - 4
src/datawizard/user_interactions.c

@@ -257,13 +257,19 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, int node,
 int starpu_data_acquire_cb(starpu_data_handle_t handle,
 			   enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
 {
-	return starpu_data_acquire_on_node_cb(handle, STARPU_MAIN_RAM, mode, callback, arg);
+	int home_node = handle->home_node;
+	if (home_node < 0)
+		home_node = STARPU_MAIN_RAM;
+	return starpu_data_acquire_on_node_cb(handle, home_node, mode, callback, arg);
 }
 
 int starpu_data_acquire_cb_sequential_consistency(starpu_data_handle_t handle,
 						  enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
 {
-	return starpu_data_acquire_on_node_cb_sequential_consistency(handle, STARPU_MAIN_RAM, mode, callback, arg, sequential_consistency);
+	int home_node = handle->home_node;
+	if (home_node < 0)
+		home_node = STARPU_MAIN_RAM;
+	return starpu_data_acquire_on_node_cb_sequential_consistency(handle, home_node, mode, callback, arg, sequential_consistency);
 }
 
 
@@ -372,7 +378,10 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, int node, enum star
 
 int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
-	return starpu_data_acquire_on_node(handle, STARPU_MAIN_RAM, mode);
+	int home_node = handle->home_node;
+	if (home_node < 0)
+		home_node = STARPU_MAIN_RAM;
+	return starpu_data_acquire_on_node(handle, home_node, mode);
 }
 
 int starpu_data_acquire_on_node_try(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode)
@@ -445,7 +454,10 @@ void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
 
 void starpu_data_release(starpu_data_handle_t handle)
 {
-	starpu_data_release_on_node(handle, STARPU_MAIN_RAM);
+	int home_node = handle->home_node;
+	if (home_node < 0)
+		home_node = STARPU_MAIN_RAM;
+	starpu_data_release_on_node(handle, home_node);
 }
 
 static void _prefetch_data_on_node(void *arg)

+ 25 - 19
src/drivers/cpu/driver_cpu.c

@@ -153,35 +153,43 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 static size_t _starpu_cpu_get_global_mem_size(int nodeid STARPU_ATTRIBUTE_UNUSED, struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
 {
 	size_t global_mem;
-	starpu_ssize_t limit;
+	starpu_ssize_t limit = -1;
 
-	limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
-#ifdef STARPU_DEVEL
-#  warning TODO: take into account NUMA node and check STARPU_LIMIT_CPU_numanode_MEM
-#endif
+	char name[32];
 
 #if defined(STARPU_HAVE_HWLOC)
 	struct _starpu_machine_topology *topology = &config->topology;
 
-#if 0
-	/* Do not limit ourself to a single NUMA node yet, as we don't have real NUMA support for now */
-        int depth_node = hwloc_get_type_depth(topology->hwtopology, HWLOC_OBJ_NODE);
-
-	if (depth_node == HWLOC_TYPE_DEPTH_UNKNOWN)
-	     global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
+	int nnumas = starpu_memory_nodes_get_numa_count();
+	if (nnumas > 1)
+	{
+		int depth_node = hwloc_get_type_depth(topology->hwtopology, HWLOC_OBJ_NODE);
+
+		if (depth_node == HWLOC_TYPE_DEPTH_UNKNOWN)
+		     global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
+		else {
+		     hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, depth_node, nodeid);
+		     global_mem = obj->memory.local_memory;
+		     sprintf(name, "STARPU_LIMIT_CPU_NUMA_%d_MEM", obj->os_index);
+		     limit = starpu_get_env_number(name);
+		}
+	}
 	else
-	     global_mem = hwloc_get_obj_by_depth(topology->hwtopology, depth_node, nodeid)->memory.local_memory;
-#else
-	global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
-#endif
+	{
+		/* Do not limit ourself to a single NUMA node */
+		global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
+	}
 
 #else /* STARPU_HAVE_HWLOC */
 #ifdef STARPU_DEVEL
-#  warning use sysinfo when available to get global size
+#  warning TODO: use sysinfo when available to get global size
 #endif
 	global_mem = 0;
 #endif
 
+	if (limit == -1)
+		limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
+
 	if (limit < 0)
 		// No limit is defined, we return the global memory size
 		return global_mem;
@@ -198,9 +206,7 @@ int _starpu_cpu_driver_init(struct _starpu_worker *cpu_worker)
 	int devid = cpu_worker->devid;
 
 	_starpu_driver_start(cpu_worker, _STARPU_FUT_CPU_KEY, 1);
-	/* FIXME: when we have NUMA support, properly turn node number into NUMA node number */
-	_starpu_memory_manager_set_global_memory_size(cpu_worker->memory_node, _starpu_cpu_get_global_mem_size(cpu_worker->memory_node, cpu_worker->config));
-
+	_starpu_memory_manager_set_global_memory_size(cpu_worker->memory_node, _starpu_cpu_get_global_mem_size(cpu_worker->numa_memory_node, cpu_worker->config));
 	snprintf(cpu_worker->name, sizeof(cpu_worker->name), "CPU %d", devid);
 	snprintf(cpu_worker->short_name, sizeof(cpu_worker->short_name), "CPU %d", devid);
 	starpu_pthread_setname(cpu_worker->short_name);

+ 3 - 3
src/drivers/cuda/driver_cuda.c

@@ -53,7 +53,7 @@
 static int ncudagpus = -1;
 
 static size_t global_mem[STARPU_MAXCUDADEVS];
-int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+1][STARPU_MAXCUDADEVS+1];
+int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES][STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES];
 #ifdef STARPU_USE_CUDA
 static cudaStream_t streams[STARPU_NMAXWORKERS];
 static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
@@ -163,7 +163,7 @@ cudaStream_t starpu_cuda_get_local_in_transfer_stream()
 	int worker = starpu_worker_get_id_check();
 	int devid = starpu_worker_get_devid(worker);
 	cudaStream_t stream;
-
+	
 	stream = in_transfer_streams[devid];
 	STARPU_ASSERT(stream);
 	return stream;
@@ -323,7 +323,7 @@ static void init_device_context(unsigned devid, unsigned memnode)
 					{
 						_STARPU_DEBUG("Enabled GPU-Direct %d -> %d\n", worker->devid, devid);
 						/* direct copies are made from the destination, see link_supports_direct_transfers */
-						starpu_bus_set_direct(_starpu_cuda_bus_ids[worker->devid][devid], 1);
+						starpu_bus_set_direct(_starpu_cuda_bus_ids[worker->devid+STARPU_MAXNUMANODES][devid+STARPU_MAXNUMANODES], 1);
 					}
 				}
 			}

+ 1 - 1
src/drivers/cuda/driver_cuda.h

@@ -32,7 +32,7 @@ extern struct _starpu_driver_ops _starpu_driver_cuda_ops;
 
 void _starpu_cuda_init(void);
 unsigned _starpu_get_cuda_device_count(void);
-extern int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+1][STARPU_MAXCUDADEVS+1];
+extern int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES][STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES];
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 void _starpu_cuda_discover_devices (struct _starpu_machine_config *);

+ 0 - 1
src/drivers/mp_common/source_common.c

@@ -950,7 +950,6 @@ static void _starpu_src_common_worker_internal_work(struct _starpu_worker_set *
         starpu_pthread_wait_reset(&worker_set->workers[0].wait);
 #endif
 
-
 	/* Test if async transfers are completed */
 	for (i = 0; i < worker_set->nworkers; i++)
 	{

+ 4 - 4
src/drivers/mpi/driver_mpi_common.c

@@ -469,7 +469,7 @@ void _starpu_mpi_common_barrier(void)
 /* Compute bandwidth and latency between source and sink nodes
  * Source node has to have the entire set of times at the end
  */
-void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS], double latency_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS])
+void _starpu_mpi_common_measure_bandwidth_latency(double timing_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS], double latency_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS])
 {
         int ret;
         unsigned iter;
@@ -506,7 +506,7 @@ void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_M
                                         STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Bandwidth of MPI Master/Slave cannot be measured !");
                                 }
                                 end = starpu_timing_now();
-                                bandwidth_dtod[sender][receiver] = (NITER*SIZE_BANDWIDTH)/(end - start);
+                                timing_dtod[sender][receiver] = (end - start)/NITER/SIZE_BANDWIDTH;
 
                                 /* measure latency sender to receiver */
                                 start = starpu_timing_now();
@@ -546,14 +546,14 @@ void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_M
                 /* if we are the sender, we send the data */
                 if (sender == id_proc)
                 {
-                        MPI_Send(bandwidth_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, src_node_id, 42, MPI_COMM_WORLD);
+                        MPI_Send(timing_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, src_node_id, 42, MPI_COMM_WORLD);
                         MPI_Send(latency_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, src_node_id, 42, MPI_COMM_WORLD);
                 }
 
                 /* the master node receives the data */
                 if (src_node_id == id_proc)
                 {
-                        MPI_Recv(bandwidth_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                        MPI_Recv(timing_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                         MPI_Recv(latency_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                 }
 

+ 5 - 1
src/util/openmp_runtime_support.c

@@ -2415,8 +2415,12 @@ void starpu_omp_atomic_fallback_inline_end(void)
 
 void starpu_omp_vector_annotate(starpu_data_handle_t handle, uint32_t slice_base)
 {
+	/* FIXME Oli: rather iterate over all nodes? */
+	int node = starpu_data_get_home_node(handle);
+	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 	assert(vector_interface->id == STARPU_VECTOR_INTERFACE_ID);
 	vector_interface->slice_base = slice_base;
 }

+ 2 - 0
tests/datawizard/interfaces/test_interfaces.c

@@ -794,6 +794,8 @@ handle_to_pointer(void)
 	{
 		if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
 			continue;
+		if (!starpu_data_test_if_allocated_on_node(handle, node))
+			continue;
 
 		ptr = handle->ops->handle_to_pointer(handle, node);
 		if (starpu_data_lookup(ptr) != handle)

+ 8 - 0
tests/datawizard/nowhere.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2015-2016  Université de Bordeaux
+ * Copyright (C) 2017  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -84,6 +85,13 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+	if (starpu_memory_nodes_get_numa_count() > 1)
+	{
+		/* FIXME: assumes only one RAM node */
+		starpu_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+
 	starpu_variable_data_register(&handle_x, STARPU_MAIN_RAM, (uintptr_t)&x, sizeof(x));
 	starpu_variable_data_register(&handle_y, STARPU_MAIN_RAM, (uintptr_t)&y, sizeof(y));
 

+ 8 - 4
tests/datawizard/specific_node.c

@@ -34,10 +34,8 @@ starpu_data_handle_t data_handle;
 
 unsigned data;
 
-void specific_kernel(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+void specific_kernel(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 {
-	/* We do not protect this variable because it is only accessed when the
-	 * "data_handle" piece of data is accessed. */
 	unsigned *dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	STARPU_ASSERT(dataptr == &data);
@@ -55,6 +53,12 @@ static struct starpu_codelet specific_cl =
 	.nodes = {STARPU_MAIN_RAM},
 };
 
+void cpu_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	unsigned *dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[0]);
+	(*dataptr)++;
+}
+
 #ifdef STARPU_USE_CUDA
 void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_arg);
 #endif
@@ -64,7 +68,7 @@ void opencl_codelet_unsigned_inc(void *buffers[], void *args);
 
 static struct starpu_codelet cl =
 {
-	.cpu_funcs = {specific_kernel},
+	.cpu_funcs = {cpu_codelet_unsigned_inc},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_codelet_unsigned_inc},
 	.cuda_flags = {STARPU_CUDA_ASYNC},

+ 11 - 1
tests/disk/mem_reclaim.c

@@ -60,7 +60,16 @@ int main(int argc, char **argv)
 }
 #else
 
-const struct starpu_data_copy_methods my_vector_copy_data_methods_s;
+static int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
+
+/* We need a ram-to-ram copy for NUMA machine, use any_to_any for that */
+static int ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node) {
+	return any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+}
+
+const struct starpu_data_copy_methods my_vector_copy_data_methods_s = {
+	.ram_to_ram = ram_to_ram
+};
 struct starpu_data_interface_ops starpu_interface_my_vector_ops;
 
 void starpu_my_vector_data_register(starpu_data_handle_t *handleptr, int home_node,
@@ -218,6 +227,7 @@ int main(void)
 	setenv("STARPU_LIMIT_CPU_MEM", MEMSIZE_STR, 1);
 
 	/* Build an vector-like interface which doesn't have the any_to_any helper, to force making use of pack/unpack */
+	any_to_any = starpu_interface_vector_ops.copy_methods->any_to_any;
 	memcpy(&starpu_interface_my_vector_ops, &starpu_interface_vector_ops, sizeof(starpu_interface_my_vector_ops));
 	starpu_interface_my_vector_ops.copy_methods = &my_vector_copy_data_methods_s;