Browse Source

Add NUMA support from branch NUMA

Corentin Salingue 8 years ago
parent
commit
cbee1918f5
48 changed files with 1763 additions and 742 deletions
  1. 19 2
      configure.ac
  2. 7 3
      doc/doxygen/chapters/210_check_list_performance.doxy
  3. 21 0
      doc/doxygen/chapters/501_environment_variables.doxy
  4. 8 0
      doc/doxygen/chapters/510_configure_options.doxy
  5. 5 1
      doc/doxygen/chapters/api/data_management.doxy
  6. 10 0
      doc/doxygen/chapters/api/workers.doxy
  7. 7 1
      examples/cpp/add_vectors_cpp11.cpp
  8. 1 0
      include/starpu_config.h.in
  9. 5 1
      include/starpu_data.h
  10. 2 0
      include/starpu_data_interfaces.h
  11. 14 7
      src/core/disk.c
  12. 4 0
      src/core/perfmodel/perfmodel.h
  13. 792 536
      src/core/perfmodel/perfmodel_bus.c
  14. 14 3
      src/core/simgrid.c
  15. 546 58
      src/core/topology.c
  16. 8 1
      src/core/topology.h
  17. 7 2
      src/core/workers.c
  18. 1 0
      src/core/workers.h
  19. 29 4
      src/datawizard/coherency.c
  20. 1 2
      src/datawizard/copy_driver.c
  21. 2 2
      src/datawizard/data_request.c
  22. 10 1
      src/datawizard/datawizard.c
  23. 1 1
      src/datawizard/datawizard.h
  24. 24 10
      src/datawizard/filters.c
  25. 12 4
      src/datawizard/interfaces/bcsr_interface.c
  26. 2 2
      src/datawizard/interfaces/block_interface.c
  27. 3 2
      src/datawizard/interfaces/coo_interface.c
  28. 2 2
      src/datawizard/interfaces/csr_interface.c
  29. 25 14
      src/datawizard/interfaces/data_interface.c
  30. 1 1
      src/datawizard/interfaces/data_interface.h
  31. 2 2
      src/datawizard/interfaces/matrix_interface.c
  32. 2 2
      src/datawizard/interfaces/variable_interface.c
  33. 2 2
      src/datawizard/interfaces/vector_interface.c
  34. 41 10
      src/datawizard/malloc.c
  35. 3 1
      src/datawizard/malloc.h
  36. 46 26
      src/datawizard/memalloc.c
  37. 1 1
      src/datawizard/memory_nodes.c
  38. 16 4
      src/datawizard/user_interactions.c
  39. 25 19
      src/drivers/cpu/driver_cpu.c
  40. 3 3
      src/drivers/cuda/driver_cuda.c
  41. 1 1
      src/drivers/cuda/driver_cuda.h
  42. 0 1
      src/drivers/mp_common/source_common.c
  43. 4 4
      src/drivers/mpi/driver_mpi_common.c
  44. 5 1
      src/util/openmp_runtime_support.c
  45. 2 0
      tests/datawizard/interfaces/test_interfaces.c
  46. 8 0
      tests/datawizard/nowhere.c
  47. 8 4
      tests/datawizard/specific_node.c
  48. 11 1
      tests/disk/mem_reclaim.c

+ 19 - 2
configure.ac

@@ -562,6 +562,23 @@ AM_CONDITIONAL([STARPU_USE_MIC], [test "x$enable_mic" = "xyes"])
 
 
 ###############################################################################
 ###############################################################################
 
 
+###############################################################################
+#                                                                             #
+#                           NUMA memory nodes                                 #
+#                                                                             #
+###############################################################################
+
+AC_MSG_CHECKING(maximum number of NUMA nodes)
+AC_ARG_ENABLE(maxnumanodes, [AS_HELP_STRING([--enable-maxnumanodes=<number>],
+			[maximum number of NUMA nodes])],
+			nmaxnumanodes=$enableval, nmaxnumanodes=2)
+AC_MSG_RESULT($nmaxnumanodes)
+AC_DEFINE_UNQUOTED(STARPU_MAXNUMANODES, [$nmaxnumanodes],
+		[maximum number of NUMA nodes])
+
+
+###############################################################################
+
 AC_PATH_PROGS([STARPU_MS_LIB], [lib])
 AC_PATH_PROGS([STARPU_MS_LIB], [lib])
 AC_ARG_VAR([STARPU_MS_LIB], [Path to Microsoft's Visual Studio `lib' tool])
 AC_ARG_VAR([STARPU_MS_LIB], [Path to Microsoft's Visual Studio `lib' tool])
 AM_CONDITIONAL([STARPU_HAVE_MS_LIB], [test "x$STARPU_MS_LIB" != "x"])
 AM_CONDITIONAL([STARPU_HAVE_MS_LIB], [test "x$STARPU_MS_LIB" != "x"])
@@ -2138,8 +2155,8 @@ if test x$maxnodes = x0 ; then
 	else
 	else
 		# We have one memory node shared by all CPU workers, one node per GPU
 		# We have one memory node shared by all CPU workers, one node per GPU
 		# and per MIC device
 		# and per MIC device
-		# we add nodes to use 3 memory disks
-		nodes=4
+		# we add nodes to use 2 memory disks
+		nodes=`expr $nmaxnumanodes + 2`
 		if test x$enable_cuda = xyes ; then
 		if test x$enable_cuda = xyes ; then
 			# we could have used nmaxcudadev + 1, but this would certainly give an
 			# we could have used nmaxcudadev + 1, but this would certainly give an
 			# odd number.
 			# odd number.

+ 7 - 3
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -197,9 +197,13 @@ structures of StarPU by describing the shape of your machine and/or your
 application at the configure step.
 application at the configure step.
 
 
 To reduce the memory footprint of the data internal structures of StarPU, one
 To reduce the memory footprint of the data internal structures of StarPU, one
-can set the \ref enable-maxcpus "--enable-maxcpus", \ref enable-maxcudadev
-"--enable-maxcudadev", \ref enable-maxopencldev "--enable-maxopencldev" and
-\ref enable-maxnodes "--enable-maxnodes" configure parameters to give StarPU
+can set the
+\ref enable-maxcpus "--enable-maxcpus",
+\ref enable-maxnumanodes "--enable-maxnumanodes",
+\ref enable-maxcudadev "--enable-maxcudadev",
+\ref enable-maxopencldev "--enable-maxopencldev" and
+\ref enable-maxnodes "--enable-maxnodes"
+configure parameters to give StarPU
 the architecture of the machine it will run on, thus tuning the size of the
 the architecture of the machine it will run on, thus tuning the size of the
 structures to the machine.
 structures to the machine.
 
 

+ 21 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -845,6 +845,14 @@ available to the application in the main CPU memory. Setting it enables allocati
 cache in main memory. Setting it to zero lets StarPU overflow memory.
 cache in main memory. Setting it to zero lets StarPU overflow memory.
 </dd>
 </dd>
 
 
+<dt>STARPU_LIMIT_CPU_NUMA_devid_MEM</dt>
+<dd>
+\anchor STARPU_LIMIT_CPU_NUMA_devid_MEM
+\addindex __env__STARPU_LIMIT_CPU_NUMA_devid_MEM
+This variable specifies the maximum number of megabytes that should be
+available to the application on the NUMA node with the OS identifier <c>devid</c>.
+</dd>
+
 <dt>STARPU_MINIMUM_AVAILABLE_MEM</dt>
 <dt>STARPU_MINIMUM_AVAILABLE_MEM</dt>
 <dd>
 <dd>
 \anchor STARPU_MINIMUM_AVAILABLE_MEM
 \anchor STARPU_MINIMUM_AVAILABLE_MEM
@@ -1110,6 +1118,19 @@ implements an advanced but centralized management of concurrent data
 accesses (see \ref ConcurrentDataAccess).
 accesses (see \ref ConcurrentDataAccess).
 </dd>
 </dd>
 
 
+<dt>STARPU_USE_NUMA</dt>
+<dd>
+\anchor STARPU_USE_NUMA 
+\addindex __env__STARPU_USE_NUMA
+When defined, NUMA nodes are taking into account by StarPU. Otherwise, memory
+is considered as only one node. This is experimental for now.
+
+When enabled, STARPU_MAIN_MEMORY is a pointer to the NUMA node associated to the
+first CPU worker if it exists, the NUMA node associated to the first GPU discovered otherwise.
+If StarPU doesn't find any NUMA node after these step, STARPU_MAIN_MEMORY is the first NUMA node
+discovered by StarPU.
+</dd>
+
 </dl>
 </dl>
 
 
 \section ConfiguringTheHypervisor Configuring The Hypervisor
 \section ConfiguringTheHypervisor Configuring The Hypervisor

+ 8 - 0
doc/doxygen/chapters/510_configure_options.doxy

@@ -130,6 +130,14 @@ Use at most <c>count</c> CPU cores.  This information is then
 available as the macro ::STARPU_MAXCPUS.
 available as the macro ::STARPU_MAXCPUS.
 </dd>
 </dd>
 
 
+<dt>--enable-maxnumanodes=<c>count</c></dt>
+<dd>
+\anchor enable-maxnumanodes
+\addindex __configure__--enable-maxnumanodes
+Use at most <c>count</c> NUMA nodes.  This information is then
+available as the macro ::STARPU_MAXNUMANODES.
+</dd>
+
 <dt>--disable-cpu</dt>
 <dt>--disable-cpu</dt>
 <dd>
 <dd>
 \anchor disable-cpu
 \anchor disable-cpu

+ 5 - 1
doc/doxygen/chapters/api/data_management.doxy

@@ -2,7 +2,7 @@
  * This file is part of the StarPU Handbook.
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
- * Copyright (C) 2011, 2012 INRIA
+ * Copyright (C) 2011, 2012, 2017  INRIA
  * See the file version.doxy for copying conditions.
  * See the file version.doxy for copying conditions.
  */
  */
 
 
@@ -104,6 +104,10 @@ data to StarPU, the specified memory node indicates where the piece of
 data initially resides (we also call this memory node the home node of
 data initially resides (we also call this memory node the home node of
 a piece of data).
 a piece of data).
 
 
+In the case of NUMA systems, functions starpu_memory_nodes_numa_devid_to_id()
+and starpu_memory_nodes_numa_id_to_devid() can be used to convert from NUMA node
+numbers as seen by the Operating System and NUMA node numbers as seen by StarPU.
+
 \fn void starpu_data_register(starpu_data_handle_t *handleptr, int home_node, void *data_interface, struct starpu_data_interface_ops *ops)
 \fn void starpu_data_register(starpu_data_handle_t *handleptr, int home_node, void *data_interface, struct starpu_data_interface_ops *ops)
 \ingroup API_Data_Management
 \ingroup API_Data_Management
 Register a piece of data into the handle located at the
 Register a piece of data into the handle located at the

+ 10 - 0
doc/doxygen/chapters/api/workers.doxy

@@ -250,6 +250,16 @@ Return the type of \p node as defined by
 this function should be used in the allocation function to determine
 this function should be used in the allocation function to determine
 on which device the memory needs to be allocated.
 on which device the memory needs to be allocated.
 
 
+\fn int starpu_memory_nodes_numa_id_to_devid(int osid)
+\ingroup API_Workers_Properties
+This function returns the identifier of the memory node associated to the NUMA
+node identified by \p osid by the Operating System.
+
+\fn int starpu_memory_nodes_numa_devid_to_id(unsigned id);
+\ingroup API_Workers_Properties
+This function returns the Operating System identifier of the memory node
+whose StarPU identifier is \p id.
+
 \fn char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
 \fn char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
 \ingroup API_Workers_Properties
 \ingroup API_Workers_Properties
 Return worker \p type as a string.
 Return worker \p type as a string.

+ 7 - 1
examples/cpp/add_vectors_cpp11.cpp

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2009, 2010-2011, 2013-2015  Université de Bordeaux
  * Copyright (C) 2009, 2010-2011, 2013-2015  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
- * Copyright (C) 2012 INRIA
+ * Copyright (C) 2012, 2017  INRIA
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -78,6 +78,12 @@ int main(int argc, char **argv)
 		return 77;
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
+	if (starpu_memory_nodes_get_numa_count() > 1)
+	{
+		starpu_shutdown();
+		return 77;
+	}
+
 	// StarPU data registering
 	// StarPU data registering
 	starpu_data_handle_t spu_vec_A;
 	starpu_data_handle_t spu_vec_A;
 	starpu_data_handle_t spu_vec_B;
 	starpu_data_handle_t spu_vec_B;

+ 1 - 0
include/starpu_config.h.in

@@ -88,6 +88,7 @@
 #undef STARPU_MAXNODES
 #undef STARPU_MAXNODES
 #undef STARPU_NMAXBUFS
 #undef STARPU_NMAXBUFS
 #undef STARPU_MAXCPUS
 #undef STARPU_MAXCPUS
+#undef STARPU_MAXNUMANODES
 #undef STARPU_MAXCUDADEVS
 #undef STARPU_MAXCUDADEVS
 #undef STARPU_MAXOPENCLDEVS
 #undef STARPU_MAXOPENCLDEVS
 #undef STARPU_MAXMICDEVS
 #undef STARPU_MAXMICDEVS

+ 5 - 1
include/starpu_data.h

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
- * Copyright (C) 2016  Inria
+ * Copyright (C) 2016, 2017  Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -132,6 +132,10 @@ enum starpu_node_kind
 
 
 unsigned starpu_worker_get_memory_node(unsigned workerid);
 unsigned starpu_worker_get_memory_node(unsigned workerid);
 unsigned starpu_memory_nodes_get_count(void);
 unsigned starpu_memory_nodes_get_count(void);
+int starpu_memory_nodes_get_numa_count(void);
+int starpu_memory_nodes_numa_id_to_devid(int osid);
+int starpu_memory_nodes_numa_devid_to_id(unsigned id);
+
 enum starpu_node_kind starpu_node_get_kind(unsigned node);
 enum starpu_node_kind starpu_node_get_kind(unsigned node);
 
 
 void starpu_data_set_wt_mask(starpu_data_handle_t handle, uint32_t wt_mask);
 void starpu_data_set_wt_mask(starpu_data_handle_t handle, uint32_t wt_mask);

+ 2 - 0
include/starpu_data_interfaces.h

@@ -483,6 +483,8 @@ size_t starpu_data_get_size(starpu_data_handle_t handle);
 
 
 starpu_data_handle_t starpu_data_lookup(const void *ptr);
 starpu_data_handle_t starpu_data_lookup(const void *ptr);
 
 
+int starpu_data_get_home_node(starpu_data_handle_t handle);
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 14 - 7
src/core/disk.c

@@ -2,6 +2,7 @@
  *
  *
  * Copyright (C) 2013  Corentin Salingue
  * Copyright (C) 2013  Corentin Salingue
  * Copyright (C) 2015, 2016, 2017  CNRS
  * Copyright (C) 2015, 2016, 2017  CNRS
+ * Copyright (C) 2017  Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -77,16 +78,22 @@ int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, starpu_s
 {
 {
 	STARPU_ASSERT_MSG(size < 0 || size >= STARPU_DISK_SIZE_MIN, "Minimum disk size is %d Bytes ! (Here %d) \n", (int) STARPU_DISK_SIZE_MIN, (int) size);
 	STARPU_ASSERT_MSG(size < 0 || size >= STARPU_DISK_SIZE_MIN, "Minimum disk size is %d Bytes ! (Here %d) \n", (int) STARPU_DISK_SIZE_MIN, (int) size);
 	/* register disk */
 	/* register disk */
-	unsigned memory_node = _starpu_memory_node_register(STARPU_DISK_RAM, 0);
+	unsigned disk_memnode = _starpu_memory_node_register(STARPU_DISK_RAM, 0);
 
 
-	_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
-	_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
+        /* Connect the disk memory node to all numa memory nodes */
+        int nb_numa_nodes = starpu_memory_nodes_get_numa_count();
+        int numa_node;
+        for (numa_node = 0; numa_node < nb_numa_nodes; numa_node++)
+        {
+                _starpu_register_bus(disk_memnode, numa_node);
+                _starpu_register_bus(numa_node, disk_memnode);
+        }
 
 
 	/* connect disk */
 	/* connect disk */
 	void *base = func->plug(parameter, size);
 	void *base = func->plug(parameter, size);
 
 
 	/* remember it */
 	/* remember it */
-	int n STARPU_ATTRIBUTE_UNUSED = add_disk_in_list(memory_node,func,base);
+	int n STARPU_ATTRIBUTE_UNUSED = add_disk_in_list(disk_memnode, func, base);
 
 
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 	char name[16];
 	char name[16];
@@ -96,13 +103,13 @@ int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, starpu_s
 	_starpu_simgrid_memory_node_set_host(memory_node, host);
 	_starpu_simgrid_memory_node_set_host(memory_node, host);
 #endif
 #endif
 
 
-	int ret = func->bandwidth(memory_node);
+	int ret = func->bandwidth(disk_memnode);
 	/* have a problem with the disk */
 	/* have a problem with the disk */
 	if (ret == 0)
 	if (ret == 0)
 		return -ENOENT;
 		return -ENOENT;
 	if (size >= 0)
 	if (size >= 0)
-		_starpu_memory_manager_set_global_memory_size(memory_node, size);
-	return memory_node;
+		_starpu_memory_manager_set_global_memory_size(disk_memnode, size);
+	return disk_memnode;
 }
 }
 
 
 void _starpu_disk_unregister(void)
 void _starpu_disk_unregister(void)

+ 4 - 0
src/core/perfmodel/perfmodel.h

@@ -110,6 +110,10 @@ void _starpu_simgrid_get_platform_path(int version, char *path, size_t maxlen);
 
 
 void _starpu_perfmodel_realloc(struct starpu_perfmodel *model, int nb);
 void _starpu_perfmodel_realloc(struct starpu_perfmodel *model, int nb);
 
 
+#if defined(STARPU_HAVE_HWLOC)
+hwloc_topology_t _starpu_perfmodel_get_hwtopology();
+#endif
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

File diff suppressed because it is too large
+ 792 - 536
src/core/perfmodel/perfmodel_bus.c


+ 14 - 3
src/core/simgrid.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2012-2017  Université de Bordeaux
  * Copyright (C) 2012-2017  Université de Bordeaux
- * Copyright (C) 2016  	    Inria
- * Copyright (C) 2016, 2017  	    CNRS
+ * Copyright (C) 2016, 2017  Inria
+ * Copyright (C) 2016, 2017  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -1039,8 +1039,19 @@ void _starpu_simgrid_count_ngpus(void)
 			ngpus = 0;
 			ngpus = 0;
 			for (src2 = 1; src2 < STARPU_MAXNODES; src2++)
 			for (src2 = 1; src2 < STARPU_MAXNODES; src2++)
 			{
 			{
-				if (starpu_bus_get_id(src2, STARPU_MAIN_RAM) == -1)
+				int numa;
+				int nnumas = starpu_memory_nodes_get_numa_count();
+				int found = 0;
+				for (numa = 0; numa < nnumas; numa++)
+					if (starpu_bus_get_id(src2, numa) != -1)
+					{
+						found = 1;
+						break;
+					}
+					
+				if (!found)
 					continue;
 					continue;
+
 				msg_host_t srchost2 = _starpu_simgrid_get_memnode_host(src2);
 				msg_host_t srchost2 = _starpu_simgrid_get_memnode_host(src2);
 				int routesize2;
 				int routesize2;
 #ifdef HAVE_SG_HOST_ROUTE
 #ifdef HAVE_SG_HOST_ROUTE

+ 546 - 58
src/core/topology.c

@@ -30,6 +30,7 @@
 #include <drivers/mpi/driver_mpi_common.h>
 #include <drivers/mpi/driver_mpi_common.h>
 #include <drivers/mp_common/source_common.h>
 #include <drivers/mp_common/source_common.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
+#include <drivers/opencl/driver_opencl_utils.h>
 #include <profiling/profiling.h>
 #include <profiling/profiling.h>
 #include <datawizard/datastats.h>
 #include <datawizard/datastats.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/memory_nodes.h>
@@ -54,11 +55,23 @@
 #include <hwloc/cuda.h>
 #include <hwloc/cuda.h>
 #endif
 #endif
 
 
+#if defined(STARPU_USE_OPENCL)
+#include <hwloc/opencl.h>
+#endif
+
 static unsigned topology_is_initialized = 0;
 static unsigned topology_is_initialized = 0;
 static int nobind;
 static int nobind;
 
 
 /* For checking whether two workers share the same PU, indexed by PU number */
 /* For checking whether two workers share the same PU, indexed by PU number */
 static int cpu_worker[STARPU_MAXCPUS];
 static int cpu_worker[STARPU_MAXCPUS];
+static unsigned nb_numa_nodes = 0;
+static int numa_memory_nodes_to_hwloclogid[STARPU_MAXNUMANODES]; /* indexed by StarPU numa node to convert in hwloc logid */
+static int numa_memory_nodes_to_physicalid[STARPU_MAXNUMANODES]; /* indexed by StarPU numa node to convert in physical id */
+static unsigned numa_bus_id[STARPU_MAXNUMANODES*STARPU_MAXNUMANODES];
+static int _starpu_get_logical_numa_node_worker(unsigned workerid);
+
+#define STARPU_NUMA_UNINITIALIZED (-2)
+#define STARPU_NUMA_MAIN_RAM (-1)
 
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE)
 
 
@@ -87,6 +100,124 @@ static struct _starpu_worker_set mic_worker_set[STARPU_MAXMICDEVS];
 struct _starpu_worker_set mpi_worker_set[STARPU_MAXMPIDEVS];
 struct _starpu_worker_set mpi_worker_set[STARPU_MAXMPIDEVS];
 #endif
 #endif
 
 
+int starpu_memory_nodes_get_numa_count(void)
+{
+	return nb_numa_nodes;
+}
+
+#if defined(STARPU_HAVE_HWLOC)
+static int numa_get_logical_id(hwloc_obj_t obj)
+{
+	STARPU_ASSERT(obj);
+	while (obj->type != HWLOC_OBJ_NODE)
+	{
+		obj = obj->parent;
+
+		/* If we don't find a "node" obj before the root, this means
+		 * hwloc does not know whether there are numa nodes or not, so
+		 * we should not use a per-node sampling in that case. */
+		if (!obj)
+			return STARPU_NUMA_MAIN_RAM;
+	}
+	return obj->logical_index;
+}
+
+static int numa_get_physical_id(hwloc_obj_t obj)
+{
+	STARPU_ASSERT(obj);
+	while (obj->type != HWLOC_OBJ_NODE)
+	{
+		obj = obj->parent;
+
+		/* If we don't find a "node" obj before the root, this means
+		 * hwloc does not know whether there are numa nodes or not, so
+		 * we should not use a per-node sampling in that case. */
+		if (!obj)
+			return STARPU_NUMA_MAIN_RAM;
+	}
+	return obj->os_index;
+}
+#endif
+
+static int _starpu_get_logical_numa_node_worker(unsigned workerid)
+{
+#if defined(STARPU_HAVE_HWLOC)
+	char * state;
+	if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
+	{
+		struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+		struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config() ;
+		struct _starpu_machine_topology *topology = &config->topology ;
+
+		hwloc_obj_t obj;
+		switch(worker->arch) 	
+		{
+			case STARPU_CPU_WORKER:
+				obj = hwloc_get_obj_by_type(topology->hwtopology, HWLOC_OBJ_PU, worker->bindid) ;
+				break;
+			default:
+				STARPU_ABORT();
+		}
+
+		return numa_get_logical_id(obj);
+	}
+	else		
+#endif 
+	{
+		(void) workerid; /* unused */
+		return STARPU_NUMA_MAIN_RAM;
+	}
+}
+
+static int _starpu_get_physical_numa_node_worker(unsigned workerid)
+{
+#if defined(STARPU_HAVE_HWLOC)
+	char * state;
+	if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
+	{
+		struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+		struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config() ;
+		struct _starpu_machine_topology *topology = &config->topology ;
+
+		hwloc_obj_t obj;
+		switch(worker->arch) 	
+		{
+			case STARPU_CPU_WORKER:
+				obj = hwloc_get_obj_by_type(topology->hwtopology, HWLOC_OBJ_PU, worker->bindid) ;
+				break;
+			default:
+				STARPU_ABORT();
+		}
+
+		return numa_get_physical_id(obj);
+	}
+	else		
+#endif 
+	{
+		(void) workerid; /* unused */
+		return STARPU_NUMA_MAIN_RAM;
+	}
+}
+
+static int _starpu_numa_get_logical_id_from_pu(int pu)
+{
+#if defined(STARPU_HAVE_HWLOC)
+	if (nb_numa_nodes > 1)
+	{
+		struct _starpu_machine_config *config = _starpu_get_machine_config();
+		struct _starpu_machine_topology *topology = &config->topology;
+
+		hwloc_obj_t obj = hwloc_get_obj_by_type(topology->hwtopology, HWLOC_OBJ_PU, pu);
+		return numa_get_logical_id(obj);
+	}
+	else
+#endif
+	{
+		return -1;
+	}
+}
+
+
 struct _starpu_worker *_starpu_get_worker_from_driver(struct starpu_driver *d)
 struct _starpu_worker *_starpu_get_worker_from_driver(struct starpu_driver *d)
 {
 {
 	unsigned nworkers = starpu_worker_get_count();
 	unsigned nworkers = starpu_worker_get_count();
@@ -846,6 +977,67 @@ _starpu_topology_get_nhwpu (struct _starpu_machine_config *config)
 	return config->topology.nhwpus;
 	return config->topology.nhwpus;
 }
 }
 
 
+unsigned _starpu_topology_get_nnumanodes(struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
+{
+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
+        _starpu_opencl_init();
+#endif
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+        _starpu_init_cuda();
+#endif
+        _starpu_init_topology(config);
+
+	int res;
+#if defined(STARPU_HAVE_HWLOC)
+	char * state;
+	if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
+	{
+		struct _starpu_machine_topology *topology = &config->topology ;
+		int nnumanodes = hwloc_get_nbobjs_by_type(topology->hwtopology, HWLOC_OBJ_NODE) ;
+		res = nnumanodes > 0 ? nnumanodes : 1 ;
+	}
+	else
+#endif 
+	{	
+		res = 1;
+	}
+
+	STARPU_ASSERT_MSG(res <= STARPU_MAXNUMANODES, "Number of NUMA nodes discovered is higher than maximum accepted ! Use configure option --enable-maxnumanodes=xxx to increase the maximum value of supported NUMA nodes.\n");
+	return res;
+}
+
+//TODO change this in an array
+int starpu_memory_nodes_numa_hwloclogid_to_id(int logid)
+{
+	unsigned n;
+	for (n = 0; n < nb_numa_nodes; n++)
+		if (numa_memory_nodes_to_hwloclogid[n] == logid)
+			return n;
+	return -1;
+}
+
+int starpu_memory_nodes_numa_id_to_hwloclogid(unsigned id)
+{
+	STARPU_ASSERT(id < STARPU_MAXNUMANODES);
+	return numa_memory_nodes_to_hwloclogid[id];
+}
+
+int starpu_memory_nodes_numa_devid_to_id(unsigned id)
+{
+	STARPU_ASSERT(id < STARPU_MAXNUMANODES);
+	return numa_memory_nodes_to_physicalid[id];
+}
+
+//TODO change this in an array
+int starpu_memory_nodes_numa_id_to_devid(int osid)
+{
+	unsigned n;
+	for (n = 0; n < nb_numa_nodes; n++)
+		if (numa_memory_nodes_to_physicalid[n] == osid)
+			return n;
+	return -1;
+}
+
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
 void _starpu_topology_filter(hwloc_topology_t topology)
 void _starpu_topology_filter(hwloc_topology_t topology)
 {
 {
@@ -1751,35 +1943,294 @@ _starpu_bind_thread_on_cpus (
 #endif
 #endif
 }
 }
 
 
-static void
-_starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_config STARPU_ATTRIBUTE_UNUSED)
+static void _starpu_init_binding_cpu(struct _starpu_machine_config *config)
+{
+	unsigned worker;
+	for (worker = 0; worker < config->topology.nworkers; worker++)
+	{
+		struct _starpu_worker *workerarg = &config->workers[worker];
+
+		switch (workerarg->arch)
+		{
+			case STARPU_CPU_WORKER:
+			{
+				/* Dedicate a cpu core to that worker */
+				workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
+				break;
+			}
+			default:
+				/* Do nothing */
+				break;
+		}
+
+
+	}
+}
+
+//TODO : Check SIMGRID
+static void _starpu_init_numa_node(struct _starpu_machine_config *config)
 {
 {
-	/* launch one thread per CPU */
-	unsigned ram_memory_node;
+	nb_numa_nodes = 0;
+
+	unsigned i;
+	for (i = 0; i < STARPU_MAXNUMANODES; i++)
+	{
+		numa_memory_nodes_to_hwloclogid[i] = STARPU_NUMA_UNINITIALIZED;
+		numa_memory_nodes_to_physicalid[i] = STARPU_NUMA_UNINITIALIZED;
+	}
 
 
-	/* note that even if the CPU cpu are not used, we always have a RAM
-	 * node */
-	/* TODO : support NUMA  ;) */
-	ram_memory_node = _starpu_memory_node_register(STARPU_CPU_RAM, 0);
-	STARPU_ASSERT(ram_memory_node == STARPU_MAIN_RAM);
 
 
+	char * state;
+	/* NUMA mode activated */
+	if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
+	{
+		/* Take all NUMA nodes used by CPU workers */
+		unsigned worker;
+		for (worker = 0; worker < config->topology.nworkers; worker++)
+		{
+			struct _starpu_worker *workerarg = &config->workers[worker];
+			if (workerarg->arch == STARPU_CPU_WORKER)
+			{
+				int numa_logical_id = _starpu_get_logical_numa_node_worker(worker);
+
+				/* Convert logical id to StarPU id to check if this NUMA node is already saved or not */
+				int numa_starpu_id = starpu_memory_nodes_numa_hwloclogid_to_id(numa_logical_id);
+
+				/* This shouldn't happen */
+				if (numa_starpu_id == -1 && nb_numa_nodes == STARPU_MAXNUMANODES)
+				{
+					_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
+					STARPU_ABORT();
+				}
+
+				if (numa_starpu_id == -1)
+				{
+					int devid = numa_logical_id == STARPU_NUMA_MAIN_RAM ? 0 : numa_logical_id;
+					int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, devid);
+					STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available)", memnode, STARPU_MAXNUMANODES);
+					numa_memory_nodes_to_hwloclogid[memnode] = numa_logical_id;
+					int numa_physical_id = _starpu_get_physical_numa_node_worker(worker);
+					numa_memory_nodes_to_physicalid[memnode] = numa_physical_id;
+					nb_numa_nodes++;
+#ifdef STARPU_SIMGRID
+					snprintf(name, sizeof(name), "RAM%d", memnode);
+					host = _starpu_simgrid_get_host_by_name(name);
+					STARPU_ASSERT(host);
+					_starpu_simgrid_memory_node_set_host(memnode, host);
+#endif
+				}
+			}
+		}
+
+		/* If we found NUMA nodes from CPU workers, it's good */
+		if (nb_numa_nodes != 0)
+			return;
+
+		_STARPU_DISP("No NUMA nodes found when checking CPU workers...\n");
+
+#if (defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)) && defined(STARPU_HAVE_HWLOC)
+		_STARPU_DISP("Take NUMA nodes attached to CUDA and OpenCL devices...\n");
+#endif
+
+#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_HWLOC)
+		for (i = 0; i < config->topology.ncudagpus; i++)
+		{
+			hwloc_obj_t obj = hwloc_cuda_get_device_osdev_by_index(config->topology.hwtopology, i);
+
+			/* Hwloc cannot recognize some devices */
+			if (!obj)
+				continue;
+
+			while (obj->type != HWLOC_OBJ_NODE)
+			{
+				obj = obj->parent;
+
+				/* If we don't find a "node" obj before the root, this means
+				 * hwloc does not know whether there are numa nodes or not, so
+				 * we should not use a per-node sampling in that case. */
+				if (!obj)
+					continue;
+			}
+			int numa_starpu_id = starpu_memory_nodes_numa_hwloclogid_to_id(obj->logical_index);
+
+			/* This shouldn't happen */
+			if (numa_starpu_id == -1 && nb_numa_nodes == STARPU_MAXNUMANODES)
+			{
+				_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
+				STARPU_ABORT();
+			}
+
+			if (numa_starpu_id == -1)
+			{
+				int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, obj->logical_index);
+				STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available)", memnode, STARPU_MAXNUMANODES);
+				numa_memory_nodes_to_hwloclogid[memnode] = obj->logical_index;
+				numa_memory_nodes_to_physicalid[memnode] = obj->os_index;
+				nb_numa_nodes++;
+#ifdef STARPU_SIMGRID
+				snprintf(name, sizeof(name), "RAM%d", memnode);
+				host = _starpu_simgrid_get_host_by_name(name);
+				STARPU_ASSERT(host);
+				_starpu_simgrid_memory_node_set_host(memnode, host);
+#endif
+			}
+		}	
+#endif
+#if defined(STARPU_USE_OPENCL) && defined(STARPU_HAVE_HWLOC)
+		if (config->topology.nopenclgpus > 0)
+		{
+			cl_int err;
+			cl_platform_id platform_id[_STARPU_OPENCL_PLATFORM_MAX];
+			cl_uint nb_platforms;
+			unsigned platform;
+			unsigned nb_opencl_devices = 0, num = 0;
+
+			err = clGetPlatformIDs(_STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
+			if (STARPU_UNLIKELY(err != CL_SUCCESS)) 
+				nb_platforms=0;
+
+			cl_device_type device_type = CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR;
+			if (starpu_get_env_number("STARPU_OPENCL_ON_CPUS") > 0)
+				device_type |= CL_DEVICE_TYPE_CPU;
+			if (starpu_get_env_number("STARPU_OPENCL_ONLY_ON_CPUS") > 0)
+				device_type = CL_DEVICE_TYPE_CPU;
+
+			for (platform = 0; platform < nb_platforms ; platform++)
+			{
+				err = clGetDeviceIDs(platform_id[platform], device_type, 0, NULL, &num);
+				if (err != CL_SUCCESS)
+					num = 0;
+				nb_opencl_devices += num;
+
+				for (i = 0; i < num; i++)
+				{
+					hwloc_obj_t obj = hwloc_opencl_get_device_osdev_by_index(config->topology.hwtopology, platform, i);
+
+					/* Hwloc cannot recognize some devices */
+					if (!obj)
+						continue;
+
+					while (obj->type != HWLOC_OBJ_NODE)
+					{
+						obj = obj->parent;
+
+						/* If we don't find a "node" obj before the root, this means
+						 * hwloc does not know whether there are numa nodes or not, so
+						 * we should not use a per-node sampling in that case. */
+						if (!obj)
+							continue;
+					}
+					int numa_starpu_id = starpu_memory_nodes_numa_hwloclogid_to_id(obj->logical_index);
+
+					/* This shouldn't happen */
+					if (numa_starpu_id == -1 && nb_numa_nodes == STARPU_MAXNUMANODES)
+					{
+						_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
+						STARPU_ABORT();
+					}
+
+					if (numa_starpu_id == -1)
+					{
+						int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, obj->logical_index);
+						STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available)", memnode, STARPU_MAXNUMANODES);
+						numa_memory_nodes_to_hwloclogid[memnode] = obj->logical_index;
+						numa_memory_nodes_to_physicalid[memnode] = obj->os_index;	
+						nb_numa_nodes++;
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
-	char name[16];
-	msg_host_t host = _starpu_simgrid_get_host_by_name("RAM");
-	STARPU_ASSERT(host);
-	_starpu_simgrid_memory_node_set_host(STARPU_MAIN_RAM, host);
+						snprintf(name, sizeof(name), "RAM%d", memnode);
+						host = _starpu_simgrid_get_host_by_name(name);
+						STARPU_ASSERT(host);
+						_starpu_simgrid_memory_node_set_host(memnode, host);
 #endif
 #endif
+					}
+				}	
+			}
+		}
+#endif
+	}
+	
+#if (defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)) && defined(STARPU_HAVE_HWLOC)
+	//Found NUMA nodes from CUDA nodes
+	if (nb_numa_nodes != 0)
+		return;
+
+	/* In case, we do not find any NUMA nodes when checking NUMA nodes attached to GPUs, we take all of them */
+	_STARPU_DISP("No NUMA nodes found when checking GPUs devices...\n");
+#endif
+
+	_STARPU_DISP("Finally, take all NUMA nodes available... \n");
+
+	unsigned nnuma = _starpu_topology_get_nnumanodes(config);
+	if (nnuma > STARPU_MAXNUMANODES)
+	{
+		_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
+		nnuma = STARPU_MAXNUMANODES;		
+	}
+
+	unsigned numa;
+	for (numa = 0; numa < nnuma; numa++)
+	{
+#if defined(STARPU_HAVE_HWLOC)
+		if (nnuma > 1)
+		{
+			hwloc_obj_t obj = hwloc_get_obj_by_type(config->topology.hwtopology, HWLOC_OBJ_NUMANODE, numa);
+			unsigned numa_logical_id = obj->logical_index;
+			unsigned numa_physical_id = obj->os_index;
 
 
+			int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, 0);
+			STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available) \n", memnode, STARPU_MAXNUMANODES);
+
+			numa_memory_nodes_to_hwloclogid[memnode] = numa_logical_id;
+			numa_memory_nodes_to_physicalid[memnode] = numa_physical_id;
+			nb_numa_nodes++;								
+
+#ifdef STARPU_SIMGRID
+			snprintf(name, sizeof(name), "RAM%d", memnode);
+			host = _starpu_simgrid_get_host_by_name(name);
+			STARPU_ASSERT(host);
+			_starpu_simgrid_memory_node_set_host(memnode, host);
+#endif
+		}
+		else
+#endif /* defined(STARPU_HAVE_HWLOC) */
+		{
+
+			/* In this case, nnuma has only one node */
+			int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, 0);
+			STARPU_ASSERT_MSG(memnode == STARPU_MAIN_RAM, "Wrong Memory Node : %d (expected %d) \n", memnode, STARPU_MAIN_RAM);
+
+			numa_memory_nodes_to_hwloclogid[memnode] = STARPU_NUMA_MAIN_RAM;
+			numa_memory_nodes_to_physicalid[memnode] = STARPU_NUMA_MAIN_RAM;
+			nb_numa_nodes++;								
+#ifdef STARPU_SIMGRID
+			char name[16];
+			msg_host_t host = _starpu_simgrid_get_host_by_name("RAM");
+			STARPU_ASSERT(host);
+			_starpu_simgrid_memory_node_set_host(STARPU_MAIN_RAM, host);
+#endif
+		}
+
+	}	
+	
+	STARPU_ASSERT_MSG(nb_numa_nodes > 0, "No NUMA node found... We need at least one memory node !\n");	
+}
+
+static void _starpu_init_numa_bus()
+{
+	unsigned i, j;
+	for (i = 0; i < nb_numa_nodes; i++)
+		for (j = 0; j < nb_numa_nodes; j++)
+			if (i != j)
+				numa_bus_id[i*nb_numa_nodes+j] = _starpu_register_bus(i, j);
+}
+
+static void
+_starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config, int no_mp_config STARPU_ATTRIBUTE_UNUSED)
+{
 	/* We will store all the busid of the different (src, dst)
 	/* We will store all the busid of the different (src, dst)
 	 * combinations in a matrix which we initialize here. */
 	 * combinations in a matrix which we initialize here. */
 	_starpu_initialize_busid_matrix();
 	_starpu_initialize_busid_matrix();
 
 
-	/* Each device is initialized,
-	 * giving it a memory node and a core bind id.
-	 */
-	/* TODO: STARPU_MAXNUMANODES */
-	unsigned numa_init[1] = { 1 };
-	unsigned numa_memory_nodes[1] = { ram_memory_node };
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 	unsigned cuda_init[STARPU_MAXCUDADEVS] = { };
 	unsigned cuda_init[STARPU_MAXCUDADEVS] = { };
 	unsigned cuda_memory_nodes[STARPU_MAXCUDADEVS];
 	unsigned cuda_memory_nodes[STARPU_MAXCUDADEVS];
@@ -1801,6 +2252,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 	unsigned mpi_memory_nodes[STARPU_MAXMPIDEVS];
 	unsigned mpi_memory_nodes[STARPU_MAXMPIDEVS];
 	unsigned mpi_bindid[STARPU_MAXMPIDEVS];
 	unsigned mpi_bindid[STARPU_MAXMPIDEVS];
 #endif
 #endif
+
 	unsigned bindid;
 	unsigned bindid;
 
 
 	for (bindid = 0; bindid < config->nbindid; bindid++)
 	for (bindid = 0; bindid < config->nbindid; bindid++)
@@ -1810,6 +2262,13 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 		config->bindid_workers[bindid].nworkers = 0;
 		config->bindid_workers[bindid].nworkers = 0;
 	}
 	}
 
 
+	/* Init CPU binding before NUMA nodes, because we use it to discover NUMA nodes */
+	_starpu_init_binding_cpu(config);
+
+	/* Initialize NUMA nodes */
+	_starpu_init_numa_node(config);
+	_starpu_init_numa_bus();
+
 	unsigned worker;
 	unsigned worker;
 	for (worker = 0; worker < config->topology.nworkers; worker++)
 	for (worker = 0; worker < config->topology.nworkers; worker++)
 	{
 	{
@@ -1828,33 +2287,22 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 		{
 		{
 			case STARPU_CPU_WORKER:
 			case STARPU_CPU_WORKER:
 			{
 			{
-				/* TODO: NUMA */
-				int numaid = 0;
-				/* "dedicate" a cpu core to that worker */
-				if (numa_init[numaid])
-				{
-					memory_node = numa_memory_nodes[numaid];
-				}
-				else
-				{
-					numa_init[numaid] = 1;
-					memory_node = numa_memory_nodes[numaid] = _starpu_memory_node_register(STARPU_CPU_RAM, numaid);
-#ifdef STARPU_SIMGRID
-					snprintf(name, sizeof(name), "RAM%d", numaid);
-					host = _starpu_simgrid_get_host_by_name(name);
-					STARPU_ASSERT(host);
-					_starpu_simgrid_memory_node_set_host(memory_node, host);
-#endif
-				}
-				workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
+				int numa_logical_id = _starpu_get_logical_numa_node_worker(worker);
+				int numa_starpu_id =  starpu_memory_nodes_numa_hwloclogid_to_id(numa_logical_id);
+				if (numa_starpu_id >= STARPU_MAXNUMANODES)
+					numa_starpu_id = STARPU_MAIN_RAM;
+
+				workerarg->numa_memory_node = memory_node = numa_starpu_id;
+
 				_starpu_memory_node_add_nworkers(memory_node);
 				_starpu_memory_node_add_nworkers(memory_node);
 
 
-                                _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
-				_starpu_worker_drives_memory_node(workerarg, memory_node);
+				_starpu_worker_drives_memory_node(workerarg, numa_starpu_id);
 				break;
 				break;
 			}
 			}
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 			case STARPU_CUDA_WORKER:
 			case STARPU_CUDA_WORKER:
+			{
+				unsigned numa;
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
 				if (may_bind_automatically[STARPU_CUDA_WORKER])
 				if (may_bind_automatically[STARPU_CUDA_WORKER])
 				{
 				{
@@ -1884,8 +2332,11 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 						workerarg->bindid = cuda_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 						workerarg->bindid = cuda_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 					memory_node = cuda_memory_nodes[devid] = _starpu_memory_node_register(STARPU_CUDA_RAM, devid);
 					memory_node = cuda_memory_nodes[devid] = _starpu_memory_node_register(STARPU_CUDA_RAM, devid);
 
 
-					_starpu_cuda_bus_ids[0][devid+1] = _starpu_register_bus(STARPU_MAIN_RAM, memory_node);
-					_starpu_cuda_bus_ids[devid+1][0] = _starpu_register_bus(memory_node, STARPU_MAIN_RAM);
+					for (numa = 0; numa < nb_numa_nodes; numa++)
+					{
+						_starpu_cuda_bus_ids[numa][devid+STARPU_MAXNUMANODES] = _starpu_register_bus(numa, memory_node);
+						_starpu_cuda_bus_ids[devid+STARPU_MAXNUMANODES][numa] = _starpu_register_bus(memory_node, numa);
+					}
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 					const char* cuda_memcpy_peer;
 					const char* cuda_memcpy_peer;
 					snprintf(name, sizeof(name), "CUDA%u", devid);
 					snprintf(name, sizeof(name), "CUDA%u", devid);
@@ -1912,8 +2363,8 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 							if (workerarg2->arch == STARPU_CUDA_WORKER)
 							if (workerarg2->arch == STARPU_CUDA_WORKER)
 							{
 							{
 								unsigned memory_node2 = starpu_worker_get_memory_node(worker2);
 								unsigned memory_node2 = starpu_worker_get_memory_node(worker2);
-								_starpu_cuda_bus_ids[devid2][devid] = _starpu_register_bus(memory_node2, memory_node);
-								_starpu_cuda_bus_ids[devid][devid2] = _starpu_register_bus(memory_node, memory_node2);
+								_starpu_cuda_bus_ids[devid2+STARPU_MAXNUMANODES][devid+STARPU_MAXNUMANODES] = _starpu_register_bus(memory_node2, memory_node);
+								_starpu_cuda_bus_ids[devid+STARPU_MAXNUMANODES][devid2+STARPU_MAXNUMANODES] = _starpu_register_bus(memory_node, memory_node2);
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
 #if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX
 #if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX
 								{
 								{
@@ -1931,8 +2382,8 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 											_STARPU_DEBUG("CUDA%u and CUDA%u are linked through %s, along %u GPUs\n", devid, devid2, name, data->ngpus);
 											_STARPU_DEBUG("CUDA%u and CUDA%u are linked through %s, along %u GPUs\n", devid, devid2, name, data->ngpus);
 										}
 										}
 #endif
 #endif
-										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid2][devid], data->ngpus);
-										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid][devid2], data->ngpus);
+										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid2+STARPU_MAXNUMANODES][devid+STARPU_MAXNUMANODES], data->ngpus);
+										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid+STARPU_MAXNUMANODES][devid2+STARPU_MAXNUMANODES], data->ngpus);
 									}
 									}
 								}
 								}
 #endif
 #endif
@@ -1943,13 +2394,19 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				}
 				}
 				_starpu_memory_node_add_nworkers(memory_node);
 				_starpu_memory_node_add_nworkers(memory_node);
 
 
-                                _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
+				//This worker can manage transfers on NUMA nodes
+				for (numa = 0; numa < nb_numa_nodes; numa++)
+						_starpu_worker_drives_memory_node(&workerarg->set->workers[0], numa);
+
 				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
 				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
 				break;
 				break;
+			}
 #endif
 #endif
 
 
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
 		        case STARPU_OPENCL_WORKER:
 		        case STARPU_OPENCL_WORKER:
+			{
+				unsigned numa;
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
 				if (may_bind_automatically[STARPU_OPENCL_WORKER])
 				if (may_bind_automatically[STARPU_OPENCL_WORKER])
 				{
 				{
@@ -1970,8 +2427,12 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 					opencl_init[devid] = 1;
 					opencl_init[devid] = 1;
 					workerarg->bindid = opencl_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 					workerarg->bindid = opencl_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 					memory_node = opencl_memory_nodes[devid] = _starpu_memory_node_register(STARPU_OPENCL_RAM, devid);
 					memory_node = opencl_memory_nodes[devid] = _starpu_memory_node_register(STARPU_OPENCL_RAM, devid);
-					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
-					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
+
+					for (numa = 0; numa < nb_numa_nodes; numa++)
+					{
+						_starpu_register_bus(numa, memory_node);
+						_starpu_register_bus(memory_node, numa);
+					}
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 					snprintf(name, sizeof(name), "OpenCL%u", devid);
 					snprintf(name, sizeof(name), "OpenCL%u", devid);
 					host = _starpu_simgrid_get_host_by_name(name);
 					host = _starpu_simgrid_get_host_by_name(name);
@@ -1981,13 +2442,19 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				}
 				}
 				_starpu_memory_node_add_nworkers(memory_node);
 				_starpu_memory_node_add_nworkers(memory_node);
 
 
-                                _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
+				//This worker can manage transfers on NUMA nodes
+				for (numa = 0; numa < nb_numa_nodes; numa++)
+						_starpu_worker_drives_memory_node(workerarg, numa);
+
 				_starpu_worker_drives_memory_node(workerarg, memory_node);
 				_starpu_worker_drives_memory_node(workerarg, memory_node);
 				break;
 				break;
+			}
 #endif
 #endif
 
 
 #ifdef STARPU_USE_MIC
 #ifdef STARPU_USE_MIC
 		        case STARPU_MIC_WORKER:
 		        case STARPU_MIC_WORKER:
+			{
+				unsigned numa;
 				if (mic_init[devid])
 				if (mic_init[devid])
 				{
 				{
 					memory_node = mic_memory_nodes[devid];
 					memory_node = mic_memory_nodes[devid];
@@ -2004,21 +2471,30 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 					//}
 					//}
 					mic_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 					mic_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 					memory_node = mic_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MIC_RAM, devid);
 					memory_node = mic_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MIC_RAM, devid);
-					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
-					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
+
+					for (numa = 0; numa < nb_numa_nodes; numa++)
+					{
+						_starpu_register_bus(numa, memory_node);
+						_starpu_register_bus(memory_node, numa);
+					}
 
 
 				}
 				}
 				workerarg->bindid = mic_bindid[devid];
 				workerarg->bindid = mic_bindid[devid];
 				_starpu_memory_node_add_nworkers(memory_node);
 				_starpu_memory_node_add_nworkers(memory_node);
 
 
-                                _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
+				//This worker can manage transfers on NUMA nodes
+				for (numa = 0; numa < nb_numa_nodes; numa++)
+						_starpu_worker_drives_memory_node(&workerarg->set->workers[0], numa);
+
 				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
 				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
 				break;
 				break;
+			}
 #endif /* STARPU_USE_MIC */
 #endif /* STARPU_USE_MIC */
 
 
 #ifdef STARPU_USE_SCC
 #ifdef STARPU_USE_SCC
 			case STARPU_SCC_WORKER:
 			case STARPU_SCC_WORKER:
 			{
 			{
+				unsigned numa;
 				/* Node 0 represents the SCC shared memory when we're on SCC. */
 				/* Node 0 represents the SCC shared memory when we're on SCC. */
 				struct _starpu_memory_node_descr *descr = _starpu_memory_node_get_description();
 				struct _starpu_memory_node_descr *descr = _starpu_memory_node_get_description();
 				descr->nodes[ram_memory_node] = STARPU_SCC_SHM;
 				descr->nodes[ram_memory_node] = STARPU_SCC_SHM;
@@ -2026,7 +2502,10 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				memory_node = ram_memory_node;
 				memory_node = ram_memory_node;
 				_starpu_memory_node_add_nworkers(memory_node);
 				_starpu_memory_node_add_nworkers(memory_node);
 
 
-                                _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
+				//This worker can manage transfers on NUMA nodes
+				for (numa = 0; numa < nb_numa_nodes; numa++)
+						_starpu_worker_drives_memory_node(workerarg, numa);
+
 				_starpu_worker_drives_memory_node(workerarg, memory_node);
 				_starpu_worker_drives_memory_node(workerarg, memory_node);
 			}
 			}
 				break;
 				break;
@@ -2035,6 +2514,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 			case STARPU_MPI_MS_WORKER:
 			case STARPU_MPI_MS_WORKER:
 			{
 			{
+				unsigned numa;
 				if (mpi_init[devid])
 				if (mpi_init[devid])
 				{
 				{
 					memory_node = mpi_memory_nodes[devid];
 					memory_node = mpi_memory_nodes[devid];
@@ -2044,11 +2524,18 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 					mpi_init[devid] = 1;
 					mpi_init[devid] = 1;
 					mpi_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 					mpi_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 					memory_node = mpi_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MPI_MS_RAM, devid);
 					memory_node = mpi_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MPI_MS_RAM, devid);
-					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
-					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
+		
+					for (numa = 0; numa < nb_numa_nodes; numa++)
+					{	
+						_starpu_register_bus(numa, memory_node);
+						_starpu_register_bus(memory_node, numa);
+					}
 
 
 				}
 				}
-                                _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
+				//This worker can manage transfers on NUMA nodes
+				for (numa = 0; numa < nb_numa_nodes; numa++)
+						_starpu_worker_drives_memory_node(&workerarg->set->workers[0], numa);
+
 				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
 				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
 #ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
 #ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
                                 /* MPI driver thread can manage all slave memories if we disable the MPI multiple thread */
                                 /* MPI driver thread can manage all slave memories if we disable the MPI multiple thread */
@@ -2154,7 +2641,7 @@ _starpu_build_topology (struct _starpu_machine_config *config, int no_mp_config)
 	_starpu_memory_nodes_init();
 	_starpu_memory_nodes_init();
 	_starpu_datastats_init();
 	_starpu_datastats_init();
 
 
-	_starpu_init_workers_binding(config, no_mp_config);
+	_starpu_init_workers_binding_and_memory(config, no_mp_config);
 
 
 	config->cpus_nodeid = -1;
 	config->cpus_nodeid = -1;
 	config->cuda_nodeid = -1;
 	config->cuda_nodeid = -1;
@@ -2293,3 +2780,4 @@ starpu_topology_print (FILE *output)
 		fprintf(output, "\n");
 		fprintf(output, "\n");
 	}
 	}
 }
 }
+

+ 8 - 1
src/core/topology.h

@@ -1,7 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2010, 2012, 2014-2016  Université de Bordeaux
+ * Copyright (C) 2009-2010, 2012, 2014-2017  Université de Bordeaux
  * Copyright (C) 2010, 2015, 2017  CNRS
  * Copyright (C) 2010, 2015, 2017  CNRS
+ * Copyright (C) 2017  Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -51,6 +52,9 @@ unsigned _starpu_topology_get_nhwcpu(struct _starpu_machine_config *config);
 /* returns the number of logical cpus */
 /* returns the number of logical cpus */
 unsigned _starpu_topology_get_nhwpu(struct _starpu_machine_config *config);
 unsigned _starpu_topology_get_nhwpu(struct _starpu_machine_config *config);
 
 
+/* returns the number of NUMA nodes */
+unsigned _starpu_topology_get_nnumanodes(struct _starpu_machine_config *config);
+
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
 /* Small convenient function to filter hwloc topology depending on HWLOC API version */
 /* Small convenient function to filter hwloc topology depending on HWLOC API version */
 void _starpu_topology_filter(hwloc_topology_t topology);
 void _starpu_topology_filter(hwloc_topology_t topology);
@@ -68,4 +72,7 @@ void _starpu_bind_thread_on_cpus(struct _starpu_machine_config *config STARPU_AT
 
 
 struct _starpu_worker *_starpu_get_worker_from_driver(struct starpu_driver *d);
 struct _starpu_worker *_starpu_get_worker_from_driver(struct starpu_driver *d);
 
 
+int starpu_memory_nodes_get_numa_count(void);
+int starpu_memory_nodes_numa_id_to_hwloclogid(unsigned id);
+	
 #endif // __TOPOLOGY_H__
 #endif // __TOPOLOGY_H__

+ 7 - 2
src/core/workers.c

@@ -1599,8 +1599,13 @@ void starpu_shutdown(void)
 
 
 	/* tell all workers to shutdown */
 	/* tell all workers to shutdown */
 	_starpu_kill_all_workers(&_starpu_config);
 	_starpu_kill_all_workers(&_starpu_config);
-
-	_starpu_free_all_automatically_allocated_buffers(STARPU_MAIN_RAM);
+	
+	unsigned i;
+	unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
+	for (i=0; i<nb_numa_nodes; i++)
+	{
+		_starpu_free_all_automatically_allocated_buffers(i);
+	}
 
 
 	{
 	{
 	     int stats = starpu_get_env_number("STARPU_STATS");
 	     int stats = starpu_get_env_number("STARPU_STATS");

+ 1 - 0
src/core/workers.h

@@ -84,6 +84,7 @@ LIST_TYPE(_starpu_worker,
 	starpu_pthread_cond_t started_cond; /* indicate when the worker is ready */
 	starpu_pthread_cond_t started_cond; /* indicate when the worker is ready */
 	starpu_pthread_cond_t ready_cond; /* indicate when the worker is ready */
 	starpu_pthread_cond_t ready_cond; /* indicate when the worker is ready */
 	unsigned memory_node; /* which memory node is the worker associated with ? */
 	unsigned memory_node; /* which memory node is the worker associated with ? */
+	unsigned numa_memory_node; /* which numa memory node is the worker associated with? (logical index) */
 	/* condition variable used for passive waiting operations on worker
 	/* condition variable used for passive waiting operations on worker
 	 * STARPU_PTHREAD_COND_BROADCAST must be used instead of STARPU_PTHREAD_COND_SIGNAL,
 	 * STARPU_PTHREAD_COND_BROADCAST must be used instead of STARPU_PTHREAD_COND_SIGNAL,
 	 * since the condition is shared for multiple purpose */
 	 * since the condition is shared for multiple purpose */

+ 29 - 4
src/datawizard/coherency.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures. *
 /* StarPU --- Runtime system for heterogeneous multicore architectures. *
  * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
- * Copyright (C) 2014  INRIA
+ * Copyright (C) 2014, 2017  INRIA
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -320,6 +320,29 @@ static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned
 	return 0;
 	return 0;
 }
 }
 
 
+/* Now, we use slowness/bandwidth to compare numa nodes, is it better to use latency ? */
+static unsigned chose_best_numa_between_src_and_dest(int src, int dst)
+{
+	double timing_best;
+	int best_numa = -1;
+	unsigned numa;
+	const unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
+	for(numa = 0; numa < nb_numa_nodes; numa++)
+	{
+		double actual = 1.0/starpu_transfer_bandwidth(src, numa) + 1.0/starpu_transfer_bandwidth(numa, dst);
+
+		/* Compare slowness : take the lowest */
+		if (best_numa < 0 || actual < timing_best)
+		{
+			best_numa = numa;
+			timing_best = actual;
+		}
+	}
+	STARPU_ASSERT(best_numa >= 0);
+	
+	return best_numa;
+}
+
 /* Determines the path of a request : each hop is defined by (src,dst) and the
 /* Determines the path of a request : each hop is defined by (src,dst) and the
  * node that handles the hop. The returned value indicates the number of hops,
  * node that handles the hop. The returned value indicates the number of hops,
  * and the max_len is the maximum number of hops (ie. the size of the
  * and the max_len is the maximum number of hops (ie. the size of the
@@ -362,9 +385,11 @@ static int determine_request_path(starpu_data_handle_t handle,
 		STARPU_ASSERT(max_len >= 2);
 		STARPU_ASSERT(max_len >= 2);
 		STARPU_ASSERT(src_node >= 0);
 		STARPU_ASSERT(src_node >= 0);
 
 
+		unsigned numa = chose_best_numa_between_src_and_dest(src_node, dst_node);
+
 		/* GPU -> RAM */
 		/* GPU -> RAM */
 		src_nodes[0] = src_node;
 		src_nodes[0] = src_node;
-		dst_nodes[0] = STARPU_MAIN_RAM;
+		dst_nodes[0] = numa;
 
 
 		if (starpu_node_get_kind(src_node) == STARPU_DISK_RAM)
 		if (starpu_node_get_kind(src_node) == STARPU_DISK_RAM)
 			/* Disks don't have their own driver thread */
 			/* Disks don't have their own driver thread */
@@ -380,7 +405,7 @@ static int determine_request_path(starpu_data_handle_t handle,
 		}
 		}
 
 
 		/* RAM -> GPU */
 		/* RAM -> GPU */
-		src_nodes[1] = STARPU_MAIN_RAM;
+		src_nodes[1] = numa;
 		dst_nodes[1] = dst_node;
 		dst_nodes[1] = dst_node;
 
 
 		if (starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
 		if (starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
@@ -573,7 +598,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		/* if the data is in write only mode (and not SCRATCH or REDUX), there is no need for a source, data will be initialized by the task itself */
 		/* if the data is in write only mode (and not SCRATCH or REDUX), there is no need for a source, data will be initialized by the task itself */
 		if (mode & STARPU_W)
 		if (mode & STARPU_W)
 			dst_replicate->initialized = 1;
 			dst_replicate->initialized = 1;
-		if (requesting_node == STARPU_MAIN_RAM && !nwait)
+		if (starpu_node_get_kind(requesting_node) == STARPU_CPU_RAM && !nwait)
 		{
 		{
 			/* And this is the main RAM, really no need for a
 			/* And this is the main RAM, really no need for a
 			 * request, just allocate */
 			 * request, just allocate */

+ 1 - 2
src/datawizard/copy_driver.c

@@ -864,10 +864,9 @@ void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_
                 _starpu_mpi_common_wait_event(async_channel);
                 _starpu_mpi_common_wait_event(async_channel);
                 break;
                 break;
 #endif
 #endif
-	case STARPU_MAIN_RAM:
+	case STARPU_CPU_RAM:
 		starpu_disk_wait_request(async_channel);
 		starpu_disk_wait_request(async_channel);
 		break;
 		break;
-	case STARPU_CPU_RAM:
 	default:
 	default:
 		STARPU_ABORT();
 		STARPU_ABORT();
 	}
 	}

+ 2 - 2
src/datawizard/data_request.c

@@ -147,7 +147,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 	if (handling_node == -1)
 	if (handling_node == -1)
 		handling_node = STARPU_MAIN_RAM;
 		handling_node = STARPU_MAIN_RAM;
 	r->handling_node = handling_node;
 	r->handling_node = handling_node;
-	STARPU_ASSERT(handling_node == STARPU_MAIN_RAM || _starpu_memory_node_get_nworkers(handling_node));
+	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
 	r->completed = 0;
 	r->completed = 0;
 	r->prefetch = is_prefetch;
 	r->prefetch = is_prefetch;
 	r->prio = prio;
 	r->prio = prio;
@@ -276,7 +276,7 @@ void _starpu_post_data_request(struct _starpu_data_request *r)
 	unsigned handling_node = r->handling_node;
 	unsigned handling_node = r->handling_node;
 	/* We don't have a worker for disk nodes, these should have been posted to a main RAM node */
 	/* We don't have a worker for disk nodes, these should have been posted to a main RAM node */
 	STARPU_ASSERT(starpu_node_get_kind(handling_node) != STARPU_DISK_RAM);
 	STARPU_ASSERT(starpu_node_get_kind(handling_node) != STARPU_DISK_RAM);
-	STARPU_ASSERT(handling_node == STARPU_MAIN_RAM || _starpu_memory_node_get_nworkers(handling_node));
+	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
 
 
 //	_STARPU_DEBUG("POST REQUEST\n");
 //	_STARPU_DEBUG("POST REQUEST\n");
 
 

+ 10 - 1
src/datawizard/datawizard.c

@@ -22,6 +22,7 @@
 #include <datawizard/memory_nodes.h>
 #include <datawizard/memory_nodes.h>
 #include <core/workers.h>
 #include <core/workers.h>
 #include <core/progress_hook.h>
 #include <core/progress_hook.h>
+#include <core/topology.h>
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 #include <core/simgrid.h>
 #include <core/simgrid.h>
 #endif
 #endif
@@ -71,8 +72,16 @@ int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
         unsigned memnode;
         unsigned memnode;
 
 
 	if (!worker)
 	if (!worker)
+	{
 		/* Call from main application, only make RAM requests progress */
 		/* Call from main application, only make RAM requests progress */
-		return ___starpu_datawizard_progress(STARPU_MAIN_RAM, may_alloc, push_requests);
+		int ret = 0;
+		int nnumas = starpu_memory_nodes_get_numa_count();
+		int numa;
+		for (numa = 0; numa < nnumas; numa++)
+			ret |=  ___starpu_datawizard_progress(numa, may_alloc, push_requests);
+
+		return ret;
+	}
 	if (worker->set)
 	if (worker->set)
 		/* Runing one of the workers of a worker set. The reference for
 		/* Runing one of the workers of a worker set. The reference for
 		 * driving memory is its worker 0 (see registrations in topology.c) */
 		 * driving memory is its worker 0 (see registrations in topology.c) */

+ 1 - 1
src/datawizard/datawizard.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2009, 2010, 2014, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2013  CNRS
  * Copyright (C) 2010, 2013  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify

+ 24 - 10
src/datawizard/filters.c

@@ -188,14 +188,18 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		/* This is lazy allocation, allocate it now in main RAM, so as
 		/* This is lazy allocation, allocate it now in main RAM, so as
 		 * to have somewhere to gather pieces later */
 		 * to have somewhere to gather pieces later */
 		/* FIXME: mark as unevictable! */
 		/* FIXME: mark as unevictable! */
-		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[STARPU_MAIN_RAM], 0);
+		int home_node = initial_handle->home_node;
+		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
+			home_node = STARPU_MAIN_RAM;
+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], 0);
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
 #warning we should reclaim memory if allocation failed
 #warning we should reclaim memory if allocation failed
 #endif
 #endif
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
 	}
 	}
 
 
-	_starpu_data_unregister_ram_pointer(initial_handle);
+	for (node = 0; node < STARPU_MAXNODES; node++)
+		_starpu_data_unregister_ram_pointer(initial_handle, node);
 
 
 	if (nparts && !inherit_state)
 	if (nparts && !inherit_state)
 	{
 	{
@@ -324,10 +328,14 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		 * store it in the handle */
 		 * store it in the handle */
 		child->footprint = _starpu_compute_data_footprint(child);
 		child->footprint = _starpu_compute_data_footprint(child);
 
 
-		void *ptr;
-		ptr = starpu_data_handle_to_pointer(child, STARPU_MAIN_RAM);
-		if (ptr != NULL)
-			_starpu_data_register_ram_pointer(child, ptr);
+		for (node = 0; node < STARPU_MAXNODES; node++)
+		{
+			if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
+				continue;
+			void *ptr = starpu_data_handle_to_pointer(child, node);
+			if (ptr != NULL)
+				_starpu_data_register_ram_pointer(child, ptr);
+		}
 
 
 		_STARPU_TRACE_HANDLE_DATA_REGISTER(child);
 		_STARPU_TRACE_HANDLE_DATA_REGISTER(child);
 	}
 	}
@@ -428,7 +436,8 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 			child_handle->unregister_hook(child_handle);
 			child_handle->unregister_hook(child_handle);
 		}
 		}
 
 
-		_starpu_data_unregister_ram_pointer(child_handle);
+		for (node = 0; node < STARPU_MAXNODES; node++)
+			_starpu_data_unregister_ram_pointer(child_handle, node);
 
 
 		if (child_handle->per_worker)
 		if (child_handle->per_worker)
 		{
 		{
@@ -444,9 +453,14 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 		_starpu_memory_stats_free(child_handle);
 		_starpu_memory_stats_free(child_handle);
 	}
 	}
 
 
-	ptr = starpu_data_handle_to_pointer(root_handle, STARPU_MAIN_RAM);
-	if (ptr != NULL)
-		_starpu_data_register_ram_pointer(root_handle, ptr);
+	for (node = 0; node < STARPU_MAXNODES; node++)
+	{
+		if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
+			continue;
+		ptr = starpu_data_handle_to_pointer(root_handle, node);
+		if (ptr != NULL)
+			_starpu_data_register_ram_pointer(root_handle, ptr);
+	}
 
 
 	/* the gathering_node should now have a valid copy of all the children.
 	/* the gathering_node should now have a valid copy of all the children.
 	 * For all nodes, if the node had all copies and none was locally
 	 * For all nodes, if the node had all copies and none was locally

+ 12 - 4
src/datawizard/interfaces/bcsr_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -130,7 +130,7 @@ void starpu_bcsr_data_register(starpu_data_handle_t *handleptr, int home_node,
 		.elemsize = elemsize
 		.elemsize = elemsize
 	};
 	};
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
-	if (home_node == STARPU_MAIN_RAM)
+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
 	{
 	{
 		STARPU_ASSERT_ACCESSIBLE(nzval);
 		STARPU_ASSERT_ACCESSIBLE(nzval);
 		STARPU_ASSERT_ACCESSIBLE(nzval + nnz*elemsize*r*c - 1);
 		STARPU_ASSERT_ACCESSIBLE(nzval + nnz*elemsize*r*c - 1);
@@ -260,9 +260,13 @@ uintptr_t starpu_bcsr_get_local_nzval(starpu_data_handle_t handle)
 
 
 uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 {
 {
+	int node = handle->home_node;
+	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+
 	/* XXX 0 */
 	/* XXX 0 */
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 #ifdef STARPU_DEBUG
 #ifdef STARPU_DEBUG
 	STARPU_ASSERT_MSG(data_interface->id == STARPU_BCSR_INTERFACE_ID, "Error. The given data is not a bcsr.");
 	STARPU_ASSERT_MSG(data_interface->id == STARPU_BCSR_INTERFACE_ID, "Error. The given data is not a bcsr.");
@@ -273,9 +277,13 @@ uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 
 
 uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
 uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
 {
 {
+	int node = handle->home_node;
+	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+
 	/* XXX 0 */
 	/* XXX 0 */
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 #ifdef STARPU_DEBUG
 #ifdef STARPU_DEBUG
 	STARPU_ASSERT_MSG(data_interface->id == STARPU_BCSR_INTERFACE_ID, "Error. The given data is not a bcsr.");
 	STARPU_ASSERT_MSG(data_interface->id == STARPU_BCSR_INTERFACE_ID, "Error. The given data is not a bcsr.");

+ 2 - 2
src/datawizard/interfaces/block_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -163,7 +163,7 @@ void starpu_block_data_register(starpu_data_handle_t *handleptr, int home_node,
 		.elemsize = elemsize
 		.elemsize = elemsize
 	};
 	};
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
-	if (home_node == STARPU_MAIN_RAM)
+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
 	{
 	{
 		STARPU_ASSERT_ACCESSIBLE(ptr);
 		STARPU_ASSERT_ACCESSIBLE(ptr);
 		STARPU_ASSERT_ACCESSIBLE(ptr + (nz-1)*ldz*elemsize + (ny-1)*ldy*elemsize + nx*elemsize - 1);
 		STARPU_ASSERT_ACCESSIBLE(ptr + (nz-1)*ldz*elemsize + (ny-1)*ldy*elemsize + nx*elemsize - 1);

+ 3 - 2
src/datawizard/interfaces/coo_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2013-2016  Université Bordeaux
+ * Copyright (C) 2013-2017  Université Bordeaux
  * Copyright (C) 2012 INRIA
  * Copyright (C) 2012 INRIA
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -18,6 +18,7 @@
 #include <starpu.h>
 #include <starpu.h>
 #include <common/fxt.h>
 #include <common/fxt.h>
 #include <datawizard/memalloc.h>
 #include <datawizard/memalloc.h>
+#include <datawizard/memory_nodes.h>
 
 
 static int
 static int
 copy_any_to_any(void *src_interface, unsigned src_node,
 copy_any_to_any(void *src_interface, unsigned src_node,
@@ -236,7 +237,7 @@ starpu_coo_data_register(starpu_data_handle_t *handleptr, int home_node,
 		.elemsize = elemsize,
 		.elemsize = elemsize,
 	};
 	};
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
-	if (home_node == STARPU_MAIN_RAM)
+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
 	{
 	{
 		STARPU_ASSERT_ACCESSIBLE(columns);
 		STARPU_ASSERT_ACCESSIBLE(columns);
 		STARPU_ASSERT_ACCESSIBLE((uintptr_t) columns + n_values*sizeof(uint32_t) - 1);
 		STARPU_ASSERT_ACCESSIBLE((uintptr_t) columns + n_values*sizeof(uint32_t) - 1);

+ 2 - 2
src/datawizard/interfaces/csr_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  *
  *
@@ -112,7 +112,7 @@ void starpu_csr_data_register(starpu_data_handle_t *handleptr, int home_node,
 		.elemsize = elemsize
 		.elemsize = elemsize
 	};
 	};
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
-	if (home_node == STARPU_MAIN_RAM)
+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
 	{
 	{
 		STARPU_ASSERT_ACCESSIBLE(nzval);
 		STARPU_ASSERT_ACCESSIBLE(nzval);
 		STARPU_ASSERT_ACCESSIBLE(nzval + nnz*elemsize - 1);
 		STARPU_ASSERT_ACCESSIBLE(nzval + nnz*elemsize - 1);

+ 25 - 14
src/datawizard/interfaces/data_interface.c

@@ -369,12 +369,14 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 	/* now the data is available ! */
 	/* now the data is available ! */
 	_starpu_spin_unlock(&handle->header_lock);
 	_starpu_spin_unlock(&handle->header_lock);
 
 
-
-
-	ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
-	if (ptr != NULL)
+	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
 	{
-		_starpu_data_register_ram_pointer(handle, ptr);
+		if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
+			continue;
+
+		ptr = starpu_data_handle_to_pointer(handle, node);
+		if (ptr != NULL)
+			_starpu_data_register_ram_pointer(handle, ptr);
 	}
 	}
 }
 }
 
 
@@ -521,13 +523,17 @@ struct starpu_data_interface_ops* starpu_data_get_interface_ops(starpu_data_hand
  * Stop monitoring a piece of data
  * Stop monitoring a piece of data
  */
  */
 
 
-void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle)
+void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle, unsigned node)
 {
 {
-	const void *ram_ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
+	if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
+		return;
+
 #ifdef STARPU_OPENMP
 #ifdef STARPU_OPENMP
 	if (handle->removed_from_context_hash)
 	if (handle->removed_from_context_hash)
 		return;
 		return;
 #endif
 #endif
+	const void *ram_ptr = starpu_data_handle_to_pointer(handle, node);
+
 	if (ram_ptr != NULL)
 	if (ram_ptr != NULL)
 	{
 	{
 		/* Remove the PTR -> HANDLE mapping.  If a mapping from PTR
 		/* Remove the PTR -> HANDLE mapping.  If a mapping from PTR
@@ -757,7 +763,10 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 			_STARPU_DEBUG("Conversion needed\n");
 			_STARPU_DEBUG("Conversion needed\n");
 			void *buffers[1];
 			void *buffers[1];
 			struct starpu_multiformat_interface *format_interface;
 			struct starpu_multiformat_interface *format_interface;
-			format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+			home_node = handle->home_node;
+			if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
+				home_node = STARPU_MAIN_RAM;
+			format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, home_node);
 			struct starpu_codelet *cl = NULL;
 			struct starpu_codelet *cl = NULL;
 			enum starpu_node_kind node_kind = starpu_node_get_kind(handle->mf_node);
 			enum starpu_node_kind node_kind = starpu_node_get_kind(handle->mf_node);
 
 
@@ -850,16 +859,19 @@ retry_busy:
 
 
 	size_t size = _starpu_data_get_size(handle);
 	size_t size = _starpu_data_get_size(handle);
 
 
-	_starpu_data_unregister_ram_pointer(handle);
-
 	/* Destroy the data now */
 	/* Destroy the data now */
 	unsigned node;
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
 	{
 		struct _starpu_data_replicate *local = &handle->per_node[node];
 		struct _starpu_data_replicate *local = &handle->per_node[node];
+		if (local->allocated)
+		{
+			_starpu_data_unregister_ram_pointer(handle, node);
+
 		/* free the data copy in a lazy fashion */
 		/* free the data copy in a lazy fashion */
-		if (local->allocated && local->automatically_allocated)
-			_starpu_request_mem_chunk_removal(handle, local, node, size);
+			if (local->automatically_allocated)
+				_starpu_request_mem_chunk_removal(handle, local, node, size);
+		}
 	}
 	}
 	if (handle->per_worker)
 	if (handle->per_worker)
 	{
 	{
@@ -976,8 +988,7 @@ static void _starpu_data_invalidate(void *data)
 
 
 		if (local->mc && local->allocated && local->automatically_allocated)
 		if (local->mc && local->allocated && local->automatically_allocated)
 		{
 		{
-			if (node == STARPU_MAIN_RAM)
-				_starpu_data_unregister_ram_pointer(handle);
+			_starpu_data_unregister_ram_pointer(handle, node);
 
 
 			/* free the data copy in a lazy fashion */
 			/* free the data copy in a lazy fashion */
 			_starpu_request_mem_chunk_removal(handle, local, node, size);
 			_starpu_request_mem_chunk_removal(handle, local, node, size);

+ 1 - 1
src/datawizard/interfaces/data_interface.h

@@ -78,7 +78,7 @@ extern void _starpu_data_register_ram_pointer(starpu_data_handle_t handle,
 						void *ptr)
 						void *ptr)
 	STARPU_ATTRIBUTE_INTERNAL;
 	STARPU_ATTRIBUTE_INTERNAL;
 
 
-extern void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle)
+extern void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle, unsigned node)
 	STARPU_ATTRIBUTE_INTERNAL;
 	STARPU_ATTRIBUTE_INTERNAL;
 
 
 #define _starpu_data_is_multiformat_handle(handle) handle->ops->is_multiformat
 #define _starpu_data_is_multiformat_handle(handle) handle->ops->is_multiformat

+ 2 - 2
src/datawizard/interfaces/matrix_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2016  Université de Bordeaux
+ * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -174,7 +174,7 @@ void starpu_matrix_data_register(starpu_data_handle_t *handleptr, int home_node,
                 .offset = 0
                 .offset = 0
 	};
 	};
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
-	if (home_node == STARPU_MAIN_RAM)
+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
 	{
 	{
 		STARPU_ASSERT_ACCESSIBLE(ptr);
 		STARPU_ASSERT_ACCESSIBLE(ptr);
 		STARPU_ASSERT_ACCESSIBLE(ptr + (ny-1)*ld*elemsize + nx*elemsize - 1);
 		STARPU_ASSERT_ACCESSIBLE(ptr + (ny-1)*ld*elemsize + nx*elemsize - 1);

+ 2 - 2
src/datawizard/interfaces/variable_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2016  Université de Bordeaux
+ * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -113,7 +113,7 @@ void starpu_variable_data_register(starpu_data_handle_t *handleptr, int home_nod
 		.elemsize = elemsize
 		.elemsize = elemsize
 	};
 	};
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
-	if (home_node == STARPU_MAIN_RAM)
+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
 	{
 	{
 		STARPU_ASSERT_ACCESSIBLE(ptr);
 		STARPU_ASSERT_ACCESSIBLE(ptr);
 		STARPU_ASSERT_ACCESSIBLE(ptr + elemsize - 1);
 		STARPU_ASSERT_ACCESSIBLE(ptr + elemsize - 1);

+ 2 - 2
src/datawizard/interfaces/vector_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  * Copyright (C) 2017  Inria
  * Copyright (C) 2017  Inria
  *
  *
@@ -122,7 +122,7 @@ void starpu_vector_data_register(starpu_data_handle_t *handleptr, int home_node,
                 .offset = 0
                 .offset = 0
 	};
 	};
 #if (!defined(STARPU_SIMGRID) && !defined(STARPU_OPENMP))
 #if (!defined(STARPU_SIMGRID) && !defined(STARPU_OPENMP))
-	if (home_node == STARPU_MAIN_RAM)
+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
 	{
 	{
 		STARPU_ASSERT_ACCESSIBLE(ptr);
 		STARPU_ASSERT_ACCESSIBLE(ptr);
 		STARPU_ASSERT_ACCESSIBLE(ptr + nx*elemsize - 1);
 		STARPU_ASSERT_ACCESSIBLE(ptr + nx*elemsize - 1);

+ 41 - 10
src/datawizard/malloc.c

@@ -2,6 +2,7 @@
  *
  *
  * Copyright (C) 2009-2010, 2012-2017  Université de Bordeaux
  * Copyright (C) 2009-2010, 2012-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
+ * Copyright (C) 2017  Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -112,8 +113,14 @@ static struct starpu_codelet malloc_pinned_cl =
 };
 };
 #endif
 #endif
 
 
+/* Allocation in CPU RAM */
 int starpu_malloc_flags(void **A, size_t dim, int flags)
 int starpu_malloc_flags(void **A, size_t dim, int flags)
 {
 {
+	return _starpu_malloc_flags_on_node(STARPU_MAIN_RAM, A, dim, flags);
+}
+
+int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags)
+{
 	int ret=0;
 	int ret=0;
 
 
 	STARPU_ASSERT(A);
 	STARPU_ASSERT(A);
@@ -121,14 +128,14 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 	if (flags & STARPU_MALLOC_COUNT)
 	if (flags & STARPU_MALLOC_COUNT)
 	{
 	{
 		if (!(flags & STARPU_MALLOC_NORECLAIM))
 		if (!(flags & STARPU_MALLOC_NORECLAIM))
-			while (starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags) != 0)
+			while (starpu_memory_allocate(dst_node, dim, flags) != 0)
 			{
 			{
 				size_t freed;
 				size_t freed;
 				size_t reclaim = 2 * dim;
 				size_t reclaim = 2 * dim;
 				_STARPU_DEBUG("There is not enough memory left, we are going to reclaim %ld\n", (long)reclaim);
 				_STARPU_DEBUG("There is not enough memory left, we are going to reclaim %ld\n", (long)reclaim);
-				_STARPU_TRACE_START_MEMRECLAIM(STARPU_MAIN_RAM,0);
-				freed = _starpu_memory_reclaim_generic(STARPU_MAIN_RAM, 0, reclaim);
-				_STARPU_TRACE_END_MEMRECLAIM(STARPU_MAIN_RAM,0);
+				_STARPU_TRACE_START_MEMRECLAIM(dst_node,0);
+				freed = _starpu_memory_reclaim_generic(dst_node, 0, reclaim);
+				_STARPU_TRACE_END_MEMRECLAIM(dst_node,0);
 				if (freed < dim && !(flags & STARPU_MEMORY_WAIT))
 				if (freed < dim && !(flags & STARPU_MEMORY_WAIT))
 				{
 				{
 					// We could not reclaim enough memory
 					// We could not reclaim enough memory
@@ -137,9 +144,9 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 				}
 				}
 			}
 			}
 		else if (flags & STARPU_MEMORY_WAIT)
 		else if (flags & STARPU_MEMORY_WAIT)
-			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags);
+			starpu_memory_allocate(dst_node, dim, flags);
 		else
 		else
-			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags | STARPU_MEMORY_OVERFLOW);
+			starpu_memory_allocate(dst_node, dim, flags | STARPU_MEMORY_OVERFLOW);
 	}
 	}
 
 
 	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
 	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
@@ -298,6 +305,18 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 		_starpu_scc_allocate_shared_memory(A, dim);
 		_starpu_scc_allocate_shared_memory(A, dim);
 #endif
 #endif
 	}
 	}
+#ifdef STARPU_HAVE_HWLOC
+	if (starpu_memory_nodes_get_numa_count() > 1) {
+		struct _starpu_machine_config *config = _starpu_get_machine_config();
+		hwloc_topology_t hwtopology = config->topology.hwtopology;
+		hwloc_obj_t numa_node_obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NODE, starpu_memory_nodes_numa_id_to_hwloclogid(dst_node));
+		hwloc_bitmap_t nodeset = numa_node_obj->nodeset;
+		*A = hwloc_alloc_membind_nodeset(hwtopology, dim, nodeset, HWLOC_MEMBIND_BIND | HWLOC_MEMBIND_NOCPUBIND, flags);
+		//fprintf(stderr, "Allocation %lu bytes on NUMA node %d [%p]\n", (unsigned long) dim, starpu_memnode_get_numaphysid(dst_node), *A);
+		if (!*A)
+			ret = -ENOMEM;
+	}
+#endif /* STARPU_HAVE_HWLOC */
 	else
 	else
 #ifdef STARPU_HAVE_POSIX_MEMALIGN
 #ifdef STARPU_HAVE_POSIX_MEMALIGN
 	if (_malloc_align != sizeof(void*))
 	if (_malloc_align != sizeof(void*))
@@ -333,7 +352,7 @@ end:
 	}
 	}
 	else if (flags & STARPU_MALLOC_COUNT)
 	else if (flags & STARPU_MALLOC_COUNT)
 	{
 	{
-		starpu_memory_deallocate(STARPU_MAIN_RAM, dim);
+		starpu_memory_deallocate(dst_node, dim);
 	}
 	}
 
 
 	return ret;
 	return ret;
@@ -383,6 +402,11 @@ static struct starpu_codelet free_pinned_cl =
 
 
 int starpu_free_flags(void *A, size_t dim, int flags)
 int starpu_free_flags(void *A, size_t dim, int flags)
 {
 {
+	return _starpu_free_flags_on_node(STARPU_MAIN_RAM, A, dim, flags);
+}
+
+int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags)
+{
 	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
 	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
 	{
 	{
 		if (_starpu_can_submit_cuda_task())
 		if (_starpu_can_submit_cuda_task())
@@ -470,6 +494,13 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 		_starpu_scc_free_shared_memory(A);
 		_starpu_scc_free_shared_memory(A);
 #endif
 #endif
 	}
 	}
+#ifdef STARPU_HAVE_HWLOC
+	else if (starpu_memory_nodes_get_numa_count() > 1) {
+		struct _starpu_machine_config *config = _starpu_get_machine_config();
+		hwloc_topology_t hwtopology = config->topology.hwtopology;
+		hwloc_free(hwtopology, A, dim);
+	}
+#endif /* STARPU_HAVE_HWLOC */
 	else
 	else
 		free(A);
 		free(A);
 
 
@@ -478,7 +509,7 @@ out:
 #endif
 #endif
 	if (flags & STARPU_MALLOC_COUNT)
 	if (flags & STARPU_MALLOC_COUNT)
 	{
 	{
-		starpu_memory_deallocate(STARPU_MAIN_RAM, dim);
+		starpu_memory_deallocate(dst_node, dim);
 	}
 	}
 
 
 	return 0;
 	return 0;
@@ -516,7 +547,7 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 	{
 	{
 		case STARPU_CPU_RAM:
 		case STARPU_CPU_RAM:
 		{
 		{
-			starpu_malloc_flags((void**) &addr, size,
+			_starpu_malloc_flags_on_node(dst_node, (void**) &addr, size,			
 #if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 					/* without memcpy_peer, we can not
 					/* without memcpy_peer, we can not
 					 * allocated pinned memory, since it
 					 * allocated pinned memory, since it
@@ -646,7 +677,7 @@ _starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int f
 	switch(kind)
 	switch(kind)
 	{
 	{
 		case STARPU_CPU_RAM:
 		case STARPU_CPU_RAM:
-			starpu_free_flags((void*)addr, size,
+			_starpu_free_flags_on_node(dst_node, (void*)addr, size,
 #if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 					flags & ~STARPU_MALLOC_PINNED
 					flags & ~STARPU_MALLOC_PINNED
 #else
 #else

+ 3 - 1
src/datawizard/malloc.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2013  Université de Bordeaux
+ * Copyright (C) 2013, 2017  Université de Bordeaux
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,4 +22,6 @@ void _starpu_malloc_shutdown(unsigned dst_node);
 
 
 void _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
 void _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
 
 
+int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags);
+int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags);
 #endif
 #endif

+ 46 - 26
src/datawizard/memalloc.c

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
- * Copyright (C) 2016  Inria
+ * Copyright (C) 2016, 2017  Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,6 +21,7 @@
 #include <datawizard/memalloc.h>
 #include <datawizard/memalloc.h>
 #include <datawizard/footprint.h>
 #include <datawizard/footprint.h>
 #include <core/disk.h>
 #include <core/disk.h>
+#include <core/topology.h>
 #include <starpu.h>
 #include <starpu.h>
 #include <common/uthash.h>
 #include <common/uthash.h>
 
 
@@ -382,8 +383,8 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 			data_interface = mc->chunk_interface;
 			data_interface = mc->chunk_interface;
 		STARPU_ASSERT(data_interface);
 		STARPU_ASSERT(data_interface);
 
 
-		if (handle && node == STARPU_MAIN_RAM)
-			_starpu_data_unregister_ram_pointer(handle);
+		if (handle && (starpu_node_get_kind(node) == STARPU_CPU_RAM))
+			_starpu_data_unregister_ram_pointer(handle, node);
 
 
 		_STARPU_TRACE_START_FREE(node, mc->size);
 		_STARPU_TRACE_START_FREE(node, mc->size);
 		mc->ops->free_data_on_node(data_interface, node);
 		mc->ops->free_data_on_node(data_interface, node);
@@ -443,8 +444,7 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 	struct _starpu_data_replicate *old_replicate = mc->replicate;
 	struct _starpu_data_replicate *old_replicate = mc->replicate;
 	if (old_replicate)
 	if (old_replicate)
 	{
 	{
-		if (node == STARPU_MAIN_RAM)
-			_starpu_data_unregister_ram_pointer(old_replicate->handle);
+		_starpu_data_unregister_ram_pointer(old_replicate->handle, node);
 		old_replicate->allocated = 0;
 		old_replicate->allocated = 0;
 		old_replicate->automatically_allocated = 0;
 		old_replicate->automatically_allocated = 0;
 		old_replicate->initialized = 0;
 		old_replicate->initialized = 0;
@@ -1486,11 +1486,11 @@ int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_
 	replicate->allocated = 1;
 	replicate->allocated = 1;
 	replicate->automatically_allocated = 1;
 	replicate->automatically_allocated = 1;
 
 
-	if (replicate->relaxed_coherency == 0 && dst_node == STARPU_MAIN_RAM)
+	if (replicate->relaxed_coherency == 0 && (starpu_node_get_kind(dst_node) == STARPU_CPU_RAM))
 	{
 	{
 		/* We are allocating the buffer in main memory, also register it
 		/* We are allocating the buffer in main memory, also register it
 		 * for the gcc plugin.  */
 		 * for the gcc plugin.  */
-		void *ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
+		void *ptr = starpu_data_handle_to_pointer(handle, dst_node);
 		if (ptr != NULL)
 		if (ptr != NULL)
 		{
 		{
 			_starpu_data_register_ram_pointer(handle, ptr);
 			_starpu_data_register_ram_pointer(handle, ptr);
@@ -1617,7 +1617,7 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 	int target = -1;
 	int target = -1;
 	unsigned nnodes = starpu_memory_nodes_get_count();
 	unsigned nnodes = starpu_memory_nodes_get_count();
 	unsigned int i;
 	unsigned int i;
-	double time_disk = 0;
+	double time_disk = 0.0;
 
 
 	for (i = 0; i < nnodes; i++)
 	for (i = 0; i < nnodes; i++)
 	{
 	{
@@ -1628,13 +1628,17 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 			/* if we can write on the disk */
 			/* if we can write on the disk */
 			if (_starpu_get_disk_flag(i) != STARPU_DISK_NO_RECLAIM)
 			if (_starpu_get_disk_flag(i) != STARPU_DISK_NO_RECLAIM)
 			{
 			{
-				/* only time can change between disk <-> main_ram
-				 * and not between main_ram <-> worker if we compare diks*/
-				double time_tmp = starpu_transfer_predict(i, STARPU_MAIN_RAM, _starpu_data_get_size(handle));
-				if (target == -1 || time_disk > time_tmp)
+				unsigned numa;
+				unsigned nnumas = starpu_memory_nodes_get_numa_count();
+				for (numa = 0; numa < nnumas; numa++)
 				{
 				{
-					target = i;
-					time_disk = time_tmp;
+					/* TODO : check if starpu_transfer_predict(node, i,...) is the same */
+					double time_tmp = starpu_transfer_predict(node, numa, _starpu_data_get_size(handle)) + starpu_transfer_predict(i, numa, _starpu_data_get_size(handle));
+					if (target == -1 || time_disk > time_tmp)
+					{
+						target = i;
+						time_disk = time_tmp;
+					}
 				}
 				}
 			}
 			}
 		}
 		}
@@ -1642,6 +1646,9 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 	return target;
 	return target;
 }
 }
 
 
+#ifdef STARPU_DEVEL
+#  warning TODO: better choose NUMA node
+#endif
 
 
 static unsigned
 static unsigned
 choose_target(starpu_data_handle_t handle, unsigned node)
 choose_target(starpu_data_handle_t handle, unsigned node)
@@ -1650,14 +1657,20 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 	size_t size_handle = _starpu_data_get_size(handle);
 	size_t size_handle = _starpu_data_get_size(handle);
 	if (handle->home_node != -1)
 	if (handle->home_node != -1)
 		/* try to push on RAM if we can before to push on disk */
 		/* try to push on RAM if we can before to push on disk */
-		if(starpu_node_get_kind(handle->home_node) == STARPU_DISK_RAM && node != STARPU_MAIN_RAM)
+		if(starpu_node_get_kind(handle->home_node) == STARPU_DISK_RAM && (starpu_node_get_kind(node) != STARPU_CPU_RAM))
 		{
 		{
-			if (handle->per_node[STARPU_MAIN_RAM].allocated ||
-			    _starpu_memory_manager_test_allocate_size(STARPU_MAIN_RAM, size_handle) == 1)
+ 	                unsigned i;
+			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
+			for (i=0; i<nb_numa_nodes; i++)
 			{
 			{
-				target = STARPU_MAIN_RAM;
+				if (handle->per_node[i].allocated || 
+				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
+				{
+					target = i;
+					break;
+				}
 			}
 			}
-			else
+			if (target == -1)
 			{
 			{
 				target = get_better_disk_can_accept_size(handle, node);
 				target = get_better_disk_can_accept_size(handle, node);
 			}
 			}
@@ -1672,19 +1685,26 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 	{
 	{
 		/* handle->home_node == -1 */
 		/* handle->home_node == -1 */
 		/* no place for datas in RAM, we push on disk */
 		/* no place for datas in RAM, we push on disk */
-		if (node == STARPU_MAIN_RAM)
+		if (starpu_node_get_kind(node) == STARPU_CPU_RAM)
 		{
 		{
 			target = get_better_disk_can_accept_size(handle, node);
 			target = get_better_disk_can_accept_size(handle, node);
-		}
+		} else {
 		/* node != 0 */
 		/* node != 0 */
 		/* try to push data to RAM if we can before to push on disk*/
 		/* try to push data to RAM if we can before to push on disk*/
-		else if (handle->per_node[STARPU_MAIN_RAM].allocated ||
-			 _starpu_memory_manager_test_allocate_size(STARPU_MAIN_RAM, size_handle) == 1)
-		{
-			target = STARPU_MAIN_RAM;
+			unsigned i;
+			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
+			for (i=0; i<nb_numa_nodes; i++)
+			{
+				if (handle->per_node[i].allocated || 
+				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
+				{
+					target = i;
+					break;
+				}
+			}
 		}
 		}
 		/* no place in RAM */
 		/* no place in RAM */
-		else
+		if (target == -1)
 		{
 		{
 			target = get_better_disk_can_accept_size(handle, node);
 			target = get_better_disk_can_accept_size(handle, node);
 		}
 		}

+ 1 - 1
src/datawizard/memory_nodes.c

@@ -77,7 +77,7 @@ void _starpu_memory_node_get_name(unsigned node, char *name, int size)
 	switch (_starpu_descr.nodes[node])
 	switch (_starpu_descr.nodes[node])
 	{
 	{
 	case STARPU_CPU_RAM:
 	case STARPU_CPU_RAM:
-		prefix = "RAM";
+		prefix = "NUMA";
 		break;
 		break;
 	case STARPU_CUDA_RAM:
 	case STARPU_CUDA_RAM:
 		prefix = "CUDA";
 		prefix = "CUDA";

+ 16 - 4
src/datawizard/user_interactions.c

@@ -257,13 +257,19 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, int node,
 int starpu_data_acquire_cb(starpu_data_handle_t handle,
 int starpu_data_acquire_cb(starpu_data_handle_t handle,
 			   enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
 			   enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
 {
 {
-	return starpu_data_acquire_on_node_cb(handle, STARPU_MAIN_RAM, mode, callback, arg);
+	int home_node = handle->home_node;
+	if (home_node < 0)
+		home_node = STARPU_MAIN_RAM;
+	return starpu_data_acquire_on_node_cb(handle, home_node, mode, callback, arg);
 }
 }
 
 
 int starpu_data_acquire_cb_sequential_consistency(starpu_data_handle_t handle,
 int starpu_data_acquire_cb_sequential_consistency(starpu_data_handle_t handle,
 						  enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
 						  enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
 {
 {
-	return starpu_data_acquire_on_node_cb_sequential_consistency(handle, STARPU_MAIN_RAM, mode, callback, arg, sequential_consistency);
+	int home_node = handle->home_node;
+	if (home_node < 0)
+		home_node = STARPU_MAIN_RAM;
+	return starpu_data_acquire_on_node_cb_sequential_consistency(handle, home_node, mode, callback, arg, sequential_consistency);
 }
 }
 
 
 
 
@@ -372,7 +378,10 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, int node, enum star
 
 
 int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
 {
-	return starpu_data_acquire_on_node(handle, STARPU_MAIN_RAM, mode);
+	int home_node = handle->home_node;
+	if (home_node < 0)
+		home_node = STARPU_MAIN_RAM;
+	return starpu_data_acquire_on_node(handle, home_node, mode);
 }
 }
 
 
 int starpu_data_acquire_on_node_try(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode)
 int starpu_data_acquire_on_node_try(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode)
@@ -445,7 +454,10 @@ void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
 
 
 void starpu_data_release(starpu_data_handle_t handle)
 void starpu_data_release(starpu_data_handle_t handle)
 {
 {
-	starpu_data_release_on_node(handle, STARPU_MAIN_RAM);
+	int home_node = handle->home_node;
+	if (home_node < 0)
+		home_node = STARPU_MAIN_RAM;
+	starpu_data_release_on_node(handle, home_node);
 }
 }
 
 
 static void _prefetch_data_on_node(void *arg)
 static void _prefetch_data_on_node(void *arg)

+ 25 - 19
src/drivers/cpu/driver_cpu.c

@@ -153,35 +153,43 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 static size_t _starpu_cpu_get_global_mem_size(int nodeid STARPU_ATTRIBUTE_UNUSED, struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
 static size_t _starpu_cpu_get_global_mem_size(int nodeid STARPU_ATTRIBUTE_UNUSED, struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
 {
 {
 	size_t global_mem;
 	size_t global_mem;
-	starpu_ssize_t limit;
+	starpu_ssize_t limit = -1;
 
 
-	limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
-#ifdef STARPU_DEVEL
-#  warning TODO: take into account NUMA node and check STARPU_LIMIT_CPU_numanode_MEM
-#endif
+	char name[32];
 
 
 #if defined(STARPU_HAVE_HWLOC)
 #if defined(STARPU_HAVE_HWLOC)
 	struct _starpu_machine_topology *topology = &config->topology;
 	struct _starpu_machine_topology *topology = &config->topology;
 
 
-#if 0
-	/* Do not limit ourself to a single NUMA node yet, as we don't have real NUMA support for now */
-        int depth_node = hwloc_get_type_depth(topology->hwtopology, HWLOC_OBJ_NODE);
-
-	if (depth_node == HWLOC_TYPE_DEPTH_UNKNOWN)
-	     global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
+	int nnumas = starpu_memory_nodes_get_numa_count();
+	if (nnumas > 1)
+	{
+		int depth_node = hwloc_get_type_depth(topology->hwtopology, HWLOC_OBJ_NODE);
+
+		if (depth_node == HWLOC_TYPE_DEPTH_UNKNOWN)
+		     global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
+		else {
+		     hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, depth_node, nodeid);
+		     global_mem = obj->memory.local_memory;
+		     sprintf(name, "STARPU_LIMIT_CPU_NUMA_%d_MEM", obj->os_index);
+		     limit = starpu_get_env_number(name);
+		}
+	}
 	else
 	else
-	     global_mem = hwloc_get_obj_by_depth(topology->hwtopology, depth_node, nodeid)->memory.local_memory;
-#else
-	global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
-#endif
+	{
+		/* Do not limit ourself to a single NUMA node */
+		global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
+	}
 
 
 #else /* STARPU_HAVE_HWLOC */
 #else /* STARPU_HAVE_HWLOC */
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
-#  warning use sysinfo when available to get global size
+#  warning TODO: use sysinfo when available to get global size
 #endif
 #endif
 	global_mem = 0;
 	global_mem = 0;
 #endif
 #endif
 
 
+	if (limit == -1)
+		limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
+
 	if (limit < 0)
 	if (limit < 0)
 		// No limit is defined, we return the global memory size
 		// No limit is defined, we return the global memory size
 		return global_mem;
 		return global_mem;
@@ -198,9 +206,7 @@ int _starpu_cpu_driver_init(struct _starpu_worker *cpu_worker)
 	int devid = cpu_worker->devid;
 	int devid = cpu_worker->devid;
 
 
 	_starpu_driver_start(cpu_worker, _STARPU_FUT_CPU_KEY, 1);
 	_starpu_driver_start(cpu_worker, _STARPU_FUT_CPU_KEY, 1);
-	/* FIXME: when we have NUMA support, properly turn node number into NUMA node number */
-	_starpu_memory_manager_set_global_memory_size(cpu_worker->memory_node, _starpu_cpu_get_global_mem_size(cpu_worker->memory_node, cpu_worker->config));
-
+	_starpu_memory_manager_set_global_memory_size(cpu_worker->memory_node, _starpu_cpu_get_global_mem_size(cpu_worker->numa_memory_node, cpu_worker->config));
 	snprintf(cpu_worker->name, sizeof(cpu_worker->name), "CPU %d", devid);
 	snprintf(cpu_worker->name, sizeof(cpu_worker->name), "CPU %d", devid);
 	snprintf(cpu_worker->short_name, sizeof(cpu_worker->short_name), "CPU %d", devid);
 	snprintf(cpu_worker->short_name, sizeof(cpu_worker->short_name), "CPU %d", devid);
 	starpu_pthread_setname(cpu_worker->short_name);
 	starpu_pthread_setname(cpu_worker->short_name);

+ 3 - 3
src/drivers/cuda/driver_cuda.c

@@ -53,7 +53,7 @@
 static int ncudagpus = -1;
 static int ncudagpus = -1;
 
 
 static size_t global_mem[STARPU_MAXCUDADEVS];
 static size_t global_mem[STARPU_MAXCUDADEVS];
-int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+1][STARPU_MAXCUDADEVS+1];
+int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES][STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES];
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 static cudaStream_t streams[STARPU_NMAXWORKERS];
 static cudaStream_t streams[STARPU_NMAXWORKERS];
 static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
 static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
@@ -163,7 +163,7 @@ cudaStream_t starpu_cuda_get_local_in_transfer_stream()
 	int worker = starpu_worker_get_id_check();
 	int worker = starpu_worker_get_id_check();
 	int devid = starpu_worker_get_devid(worker);
 	int devid = starpu_worker_get_devid(worker);
 	cudaStream_t stream;
 	cudaStream_t stream;
-
+	
 	stream = in_transfer_streams[devid];
 	stream = in_transfer_streams[devid];
 	STARPU_ASSERT(stream);
 	STARPU_ASSERT(stream);
 	return stream;
 	return stream;
@@ -323,7 +323,7 @@ static void init_device_context(unsigned devid, unsigned memnode)
 					{
 					{
 						_STARPU_DEBUG("Enabled GPU-Direct %d -> %d\n", worker->devid, devid);
 						_STARPU_DEBUG("Enabled GPU-Direct %d -> %d\n", worker->devid, devid);
 						/* direct copies are made from the destination, see link_supports_direct_transfers */
 						/* direct copies are made from the destination, see link_supports_direct_transfers */
-						starpu_bus_set_direct(_starpu_cuda_bus_ids[worker->devid][devid], 1);
+						starpu_bus_set_direct(_starpu_cuda_bus_ids[worker->devid+STARPU_MAXNUMANODES][devid+STARPU_MAXNUMANODES], 1);
 					}
 					}
 				}
 				}
 			}
 			}

+ 1 - 1
src/drivers/cuda/driver_cuda.h

@@ -32,7 +32,7 @@ extern struct _starpu_driver_ops _starpu_driver_cuda_ops;
 
 
 void _starpu_cuda_init(void);
 void _starpu_cuda_init(void);
 unsigned _starpu_get_cuda_device_count(void);
 unsigned _starpu_get_cuda_device_count(void);
-extern int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+1][STARPU_MAXCUDADEVS+1];
+extern int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES][STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES];
 
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 void _starpu_cuda_discover_devices (struct _starpu_machine_config *);
 void _starpu_cuda_discover_devices (struct _starpu_machine_config *);

+ 0 - 1
src/drivers/mp_common/source_common.c

@@ -950,7 +950,6 @@ static void _starpu_src_common_worker_internal_work(struct _starpu_worker_set *
         starpu_pthread_wait_reset(&worker_set->workers[0].wait);
         starpu_pthread_wait_reset(&worker_set->workers[0].wait);
 #endif
 #endif
 
 
-
 	/* Test if async transfers are completed */
 	/* Test if async transfers are completed */
 	for (i = 0; i < worker_set->nworkers; i++)
 	for (i = 0; i < worker_set->nworkers; i++)
 	{
 	{

+ 4 - 4
src/drivers/mpi/driver_mpi_common.c

@@ -469,7 +469,7 @@ void _starpu_mpi_common_barrier(void)
 /* Compute bandwidth and latency between source and sink nodes
 /* Compute bandwidth and latency between source and sink nodes
  * Source node has to have the entire set of times at the end
  * Source node has to have the entire set of times at the end
  */
  */
-void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS], double latency_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS])
+void _starpu_mpi_common_measure_bandwidth_latency(double timing_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS], double latency_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS])
 {
 {
         int ret;
         int ret;
         unsigned iter;
         unsigned iter;
@@ -506,7 +506,7 @@ void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_M
                                         STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Bandwidth of MPI Master/Slave cannot be measured !");
                                         STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Bandwidth of MPI Master/Slave cannot be measured !");
                                 }
                                 }
                                 end = starpu_timing_now();
                                 end = starpu_timing_now();
-                                bandwidth_dtod[sender][receiver] = (NITER*SIZE_BANDWIDTH)/(end - start);
+                                timing_dtod[sender][receiver] = (end - start)/NITER/SIZE_BANDWIDTH;
 
 
                                 /* measure latency sender to receiver */
                                 /* measure latency sender to receiver */
                                 start = starpu_timing_now();
                                 start = starpu_timing_now();
@@ -546,14 +546,14 @@ void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_M
                 /* if we are the sender, we send the data */
                 /* if we are the sender, we send the data */
                 if (sender == id_proc)
                 if (sender == id_proc)
                 {
                 {
-                        MPI_Send(bandwidth_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, src_node_id, 42, MPI_COMM_WORLD);
+                        MPI_Send(timing_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, src_node_id, 42, MPI_COMM_WORLD);
                         MPI_Send(latency_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, src_node_id, 42, MPI_COMM_WORLD);
                         MPI_Send(latency_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, src_node_id, 42, MPI_COMM_WORLD);
                 }
                 }
 
 
                 /* the master node receives the data */
                 /* the master node receives the data */
                 if (src_node_id == id_proc)
                 if (src_node_id == id_proc)
                 {
                 {
-                        MPI_Recv(bandwidth_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                        MPI_Recv(timing_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                         MPI_Recv(latency_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                         MPI_Recv(latency_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                 }
                 }
 
 

+ 5 - 1
src/util/openmp_runtime_support.c

@@ -2415,8 +2415,12 @@ void starpu_omp_atomic_fallback_inline_end(void)
 
 
 void starpu_omp_vector_annotate(starpu_data_handle_t handle, uint32_t slice_base)
 void starpu_omp_vector_annotate(starpu_data_handle_t handle, uint32_t slice_base)
 {
 {
+	/* FIXME Oli: rather iterate over all nodes? */
+	int node = starpu_data_get_home_node(handle);
+	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 	assert(vector_interface->id == STARPU_VECTOR_INTERFACE_ID);
 	assert(vector_interface->id == STARPU_VECTOR_INTERFACE_ID);
 	vector_interface->slice_base = slice_base;
 	vector_interface->slice_base = slice_base;
 }
 }

+ 2 - 0
tests/datawizard/interfaces/test_interfaces.c

@@ -794,6 +794,8 @@ handle_to_pointer(void)
 	{
 	{
 		if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
 		if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
 			continue;
 			continue;
+		if (!starpu_data_test_if_allocated_on_node(handle, node))
+			continue;
 
 
 		ptr = handle->ops->handle_to_pointer(handle, node);
 		ptr = handle->ops->handle_to_pointer(handle, node);
 		if (starpu_data_lookup(ptr) != handle)
 		if (starpu_data_lookup(ptr) != handle)

+ 8 - 0
tests/datawizard/nowhere.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2015-2016  Université de Bordeaux
  * Copyright (C) 2015-2016  Université de Bordeaux
+ * Copyright (C) 2017  Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -84,6 +85,13 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
+	if (starpu_memory_nodes_get_numa_count() > 1)
+	{
+		/* FIXME: assumes only one RAM node */
+		starpu_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+
 	starpu_variable_data_register(&handle_x, STARPU_MAIN_RAM, (uintptr_t)&x, sizeof(x));
 	starpu_variable_data_register(&handle_x, STARPU_MAIN_RAM, (uintptr_t)&x, sizeof(x));
 	starpu_variable_data_register(&handle_y, STARPU_MAIN_RAM, (uintptr_t)&y, sizeof(y));
 	starpu_variable_data_register(&handle_y, STARPU_MAIN_RAM, (uintptr_t)&y, sizeof(y));
 
 

+ 8 - 4
tests/datawizard/specific_node.c

@@ -34,10 +34,8 @@ starpu_data_handle_t data_handle;
 
 
 unsigned data;
 unsigned data;
 
 
-void specific_kernel(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+void specific_kernel(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 {
-	/* We do not protect this variable because it is only accessed when the
-	 * "data_handle" piece of data is accessed. */
 	unsigned *dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[0]);
 
 
 	STARPU_ASSERT(dataptr == &data);
 	STARPU_ASSERT(dataptr == &data);
@@ -55,6 +53,12 @@ static struct starpu_codelet specific_cl =
 	.nodes = {STARPU_MAIN_RAM},
 	.nodes = {STARPU_MAIN_RAM},
 };
 };
 
 
+void cpu_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	unsigned *dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[0]);
+	(*dataptr)++;
+}
+
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_arg);
 void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_arg);
 #endif
 #endif
@@ -64,7 +68,7 @@ void opencl_codelet_unsigned_inc(void *buffers[], void *args);
 
 
 static struct starpu_codelet cl =
 static struct starpu_codelet cl =
 {
 {
-	.cpu_funcs = {specific_kernel},
+	.cpu_funcs = {cpu_codelet_unsigned_inc},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_codelet_unsigned_inc},
 	.cuda_funcs = {cuda_codelet_unsigned_inc},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.cuda_flags = {STARPU_CUDA_ASYNC},

+ 11 - 1
tests/disk/mem_reclaim.c

@@ -60,7 +60,16 @@ int main(int argc, char **argv)
 }
 }
 #else
 #else
 
 
-const struct starpu_data_copy_methods my_vector_copy_data_methods_s;
+static int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
+
+/* We need a ram-to-ram copy for NUMA machine, use any_to_any for that */
+static int ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node) {
+	return any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+}
+
+const struct starpu_data_copy_methods my_vector_copy_data_methods_s = {
+	.ram_to_ram = ram_to_ram
+};
 struct starpu_data_interface_ops starpu_interface_my_vector_ops;
 struct starpu_data_interface_ops starpu_interface_my_vector_ops;
 
 
 void starpu_my_vector_data_register(starpu_data_handle_t *handleptr, int home_node,
 void starpu_my_vector_data_register(starpu_data_handle_t *handleptr, int home_node,
@@ -218,6 +227,7 @@ int main(void)
 	setenv("STARPU_LIMIT_CPU_MEM", MEMSIZE_STR, 1);
 	setenv("STARPU_LIMIT_CPU_MEM", MEMSIZE_STR, 1);
 
 
 	/* Build an vector-like interface which doesn't have the any_to_any helper, to force making use of pack/unpack */
 	/* Build an vector-like interface which doesn't have the any_to_any helper, to force making use of pack/unpack */
+	any_to_any = starpu_interface_vector_ops.copy_methods->any_to_any;
 	memcpy(&starpu_interface_my_vector_ops, &starpu_interface_vector_ops, sizeof(starpu_interface_my_vector_ops));
 	memcpy(&starpu_interface_my_vector_ops, &starpu_interface_vector_ops, sizeof(starpu_interface_my_vector_ops));
 	starpu_interface_my_vector_ops.copy_methods = &my_vector_copy_data_methods_s;
 	starpu_interface_my_vector_ops.copy_methods = &my_vector_copy_data_methods_s;