瀏覽代碼

Add --enable-numa option to configure and some NUMA support in StarPU memory management

Jérôme Clet-Ortega 8 年之前
父節點
當前提交
51c024c820

+ 26 - 0
configure.ac

@@ -158,6 +158,32 @@ AM_CONDITIONAL([STARPU_USE_MIC], [test "x$enable_mic" = "xyes"])
 
 
 ###############################################################################
 ###############################################################################
 
 
+###############################################################################
+#                                                                             #
+#                           NUMA memory nodes                                 #
+#                                                                             #
+###############################################################################
+
+# --enable-numa: opt-in NUMA support, disabled by default.
+AC_ARG_ENABLE(numa, [AS_HELP_STRING([--enable-numa],
+	      [use NUMA node(s)])], [enable_numa=$enableval], [enable_numa=no])
+
+# Quote the expansion so that an empty or multi-word value cannot break test.
+if test "x$enable_numa" = "xyes" ; then
+	AC_DEFINE(STARPU_USE_NUMA, [1], [NUMA memory nodes support is enabled])
+fi
+
+AM_CONDITIONAL([STARPU_USE_NUMA], [test "x$enable_numa" = "xyes"])
+
+# --enable-maxnumanodes=<number>: compile-time bound on the number of NUMA
+# nodes, exported as STARPU_MAXNUMANODES (default 16).
+AC_MSG_CHECKING(maximum number of NUMA nodes)
+AC_ARG_ENABLE(maxnumanodes, [AS_HELP_STRING([--enable-maxnumanodes=<number>],
+			[maximum number of NUMA nodes])],
+			nmaxnumanodes=$enableval, nmaxnumanodes=16)
+# A bare --enable-maxnumanodes or --disable-maxnumanodes sets the value to
+# "yes" or "no", which cannot be used as an array size: keep the default.
+case "x$nmaxnumanodes" in
+	xyes|xno|x) nmaxnumanodes=16 ;;
+esac
+AC_MSG_RESULT($nmaxnumanodes)
+AC_DEFINE_UNQUOTED(STARPU_MAXNUMANODES, [$nmaxnumanodes],
+		[maximum number of NUMA nodes])
+
+
+###############################################################################
+
 AC_PATH_PROGS([STARPU_MS_LIB], [lib])
 AC_PATH_PROGS([STARPU_MS_LIB], [lib])
 AC_ARG_VAR([STARPU_MS_LIB], [Path to Microsoft's Visual Studio `lib' tool])
 AC_ARG_VAR([STARPU_MS_LIB], [Path to Microsoft's Visual Studio `lib' tool])
 AM_CONDITIONAL([STARPU_HAVE_MS_LIB], [test "x$STARPU_MS_LIB" != "x"])
 AM_CONDITIONAL([STARPU_HAVE_MS_LIB], [test "x$STARPU_MS_LIB" != "x"])

+ 1 - 0
include/starpu_config.h.in

@@ -87,6 +87,7 @@
 #undef STARPU_MAXNODES
 #undef STARPU_MAXNODES
 #undef STARPU_NMAXBUFS
 #undef STARPU_NMAXBUFS
 #undef STARPU_MAXCPUS
 #undef STARPU_MAXCPUS
+#undef STARPU_MAXNUMANODES
 #undef STARPU_MAXCUDADEVS
 #undef STARPU_MAXCUDADEVS
 #undef STARPU_MAXOPENCLDEVS
 #undef STARPU_MAXOPENCLDEVS
 #undef STARPU_MAXMICDEVS
 #undef STARPU_MAXMICDEVS

+ 93 - 1
src/core/topology.c

@@ -57,6 +57,12 @@ static int nobind;
 
 
 /* For checking whether two workers share the same PU, indexed by PU number */
 /* For checking whether two workers share the same PU, indexed by PU number */
 static int cpu_worker[STARPU_MAXCPUS];
 static int cpu_worker[STARPU_MAXCPUS];
+#ifdef STARPU_USE_NUMA
+static unsigned nb_numa_nodes = 0;
+static unsigned numa_memory_nodes[STARPU_MAXNUMANODES];
+#endif /* STARPU_USE_NUMA */
+
+
 
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
 
 
@@ -804,6 +810,18 @@ _starpu_topology_get_nhwpu (struct _starpu_machine_config *config)
 	return config->topology.nhwpus;
 	return config->topology.nhwpus;
 }
 }
 
 
+/* Return the number of NUMA nodes of the machine described by config.
+ *
+ * The count is read from the hwloc topology stored in config->topology.
+ * When hwloc reports a non-positive count, or when StarPU is built without
+ * NUMA or hwloc support, the machine is treated as a single node and 1 is
+ * returned. */
+unsigned _starpu_topology_get_nnumanodes(struct _starpu_machine_config *config)
+{
+#if defined(STARPU_USE_NUMA) && defined(STARPU_HAVE_HWLOC)
+	struct _starpu_machine_topology *topology = &config->topology;
+	int nnumanodes = hwloc_get_nbobjs_by_type(topology->hwtopology, HWLOC_OBJ_NODE);
+
+	return nnumanodes > 0 ? (unsigned) nnumanodes : 1;
+#else /* STARPU_USE_NUMA && STARPU_HAVE_HWLOC */
+	/* No topology information to query: config is deliberately unused */
+	(void) config;
+	return 1;
+#endif /* STARPU_USE_NUMA && STARPU_HAVE_HWLOC */
+}
+
+
 #ifdef STARPU_USE_MIC
 #ifdef STARPU_USE_MIC
 static void
 static void
 _starpu_init_mic_config (struct _starpu_machine_config *config,
 _starpu_init_mic_config (struct _starpu_machine_config *config,
@@ -1538,8 +1556,20 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 	 * giving it a memory node and a core bind id.
 	 * giving it a memory node and a core bind id.
 	 */
 	 */
 	/* TODO: STARPU_MAXNUMANODES */
 	/* TODO: STARPU_MAXNUMANODES */
+#ifdef STARPU_USE_NUMA
+	unsigned n;
+	unsigned numa_init[STARPU_MAXNUMANODES];
+	numa_init[0] = 1 ;
+	nb_numa_nodes = 1;
+	numa_memory_nodes[0] = ram_memory_node ;
+	for (n=1; n<STARPU_MAXNUMANODES; n++)
+	{
+		numa_init[n] = 0;
+	}	
+#else /* STARPU_USE_NUMA */
 	unsigned numa_init[1] = { 1 };
 	unsigned numa_init[1] = { 1 };
 	unsigned numa_memory_nodes[1] = { ram_memory_node };
 	unsigned numa_memory_nodes[1] = { ram_memory_node };
+#endif /* STARPU_USE_NUMA */
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 	unsigned cuda_init[STARPU_MAXCUDADEVS] = { };
 	unsigned cuda_init[STARPU_MAXCUDADEVS] = { };
 	unsigned cuda_memory_nodes[STARPU_MAXCUDADEVS];
 	unsigned cuda_memory_nodes[STARPU_MAXCUDADEVS];
@@ -1583,8 +1613,13 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 		{
 		{
 			case STARPU_CPU_WORKER:
 			case STARPU_CPU_WORKER:
 			{
 			{
+				workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
 				/* TODO: NUMA */
 				/* TODO: NUMA */
+#ifdef STARPU_USE_NUMA
+				int numaid = workerarg->numa_memory_node = _starpu_worker_numa_node(worker);
+#else /* STARPU_USE_NUMA */
 				int numaid = 0;
 				int numaid = 0;
+#endif /* STARPU_USE_NUMA */
 				/* "dedicate" a cpu core to that worker */
 				/* "dedicate" a cpu core to that worker */
 				if (numa_init[numaid])
 				if (numa_init[numaid])
 				{
 				{
@@ -1593,6 +1628,9 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				else
 				else
 				{
 				{
 					numa_init[numaid] = 1;
 					numa_init[numaid] = 1;
+#ifdef STARPU_USE_NUMA
+					nb_numa_nodes++;
+#endif /* STARPU_USE_NUMA */
 					memory_node = numa_memory_nodes[numaid] = _starpu_memory_node_register(STARPU_CPU_RAM, numaid);
 					memory_node = numa_memory_nodes[numaid] = _starpu_memory_node_register(STARPU_CPU_RAM, numaid);
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 					snprintf(name, sizeof(name), "RAM%d", numaid);
 					snprintf(name, sizeof(name), "RAM%d", numaid);
@@ -1601,12 +1639,17 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 					_starpu_simgrid_memory_node_set_host(memory_node, host);
 					_starpu_simgrid_memory_node_set_host(memory_node, host);
 #endif
 #endif
 				}
 				}
-				workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
+				//workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
 				_starpu_memory_node_add_nworkers(memory_node);
 				_starpu_memory_node_add_nworkers(memory_node);
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 				starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[memory_node]);
 				starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[memory_node]);
+#ifdef STARPU_USE_NUMA
+				if (_starpu_node_get_kind(memory_node) != STARPU_CPU_RAM)
+					starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
+#else /* STARPU_USE_NUMA */
 				if (memory_node != STARPU_MAIN_RAM)
 				if (memory_node != STARPU_MAIN_RAM)
 					starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
 					starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
+#endif /* STARPU_USE_NUMA */
 #endif
 #endif
 				break;
 				break;
 			}
 			}
@@ -1979,3 +2022,52 @@ starpu_topology_print (FILE *output)
 		fprintf(output, "\n");
 		fprintf(output, "\n");
 	}
 	}
 }
 }
+
+#ifdef STARPU_USE_NUMA
+/* Return the number of NUMA nodes counted so far in nb_numa_nodes. */
+int _starpu_get_nb_numa_nodes(void)
+{
+	/* nb_numa_nodes is unsigned: make the narrowing to int explicit */
+	return (int) nb_numa_nodes;
+}
+
+/* Translate a NUMA node id into the memory node recorded for it in
+ * numa_memory_nodes.  Returns -1 when numaid is not below nb_numa_nodes. */
+int _starpu_numaid_to_memnode(unsigned numaid)
+{
+	return (numaid >= nb_numa_nodes) ? -1 : (int) numa_memory_nodes[numaid];
+}
+
+/* Reverse lookup in numa_memory_nodes: return the NUMA node id whose recorded
+ * memory node is memnode.
+ *
+ * Returns 0 when no entry matches, so a miss cannot be told apart from a
+ * match on NUMA node 0. */
+int _starpu_memnode_to_numaid(unsigned memnode)
+{
+	/* Unsigned index: nb_numa_nodes is unsigned, so this avoids a
+	 * signed/unsigned comparison in the loop condition */
+	unsigned numaid;
+
+	for (numaid = 0; numaid < nb_numa_nodes; numaid++)
+	{
+		if (numa_memory_nodes[numaid] == memnode)
+			return (int) numaid;
+	}
+
+	return 0;
+}
+
+/* Return the logical index of the NUMA node hosting the PU on which worker
+ * workerid is bound.  The function reads worker->bindid, so the worker must
+ * have been given its bind id before this is called.
+ *
+ * Without hwloc, or when hwloc exposes no NUMA node object above that PU,
+ * the machine is treated as a single node and 0 is returned. */
+int _starpu_worker_numa_node(unsigned workerid)
+{
+#ifdef STARPU_HAVE_HWLOC
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	struct _starpu_machine_topology *topology = &config->topology;
+	hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->pu_depth, worker->bindid);
+	STARPU_ASSERT(obj);
+
+	/* Climb towards the root until a NUMA node object is met.  The test
+	 * has to be made on the object type: obj->depth is a level number in
+	 * the topology tree and is unrelated to the HWLOC_OBJ_NODE enumerator. */
+	while (obj && obj->type != HWLOC_OBJ_NODE)
+		obj = obj->parent;
+
+	/* Root passed without meeting a node object: hwloc does not know
+	 * about NUMA nodes here, so behave like the build without hwloc. */
+	if (!obj)
+		return 0;
+
+	return (int) obj->logical_index;
+#else /* STARPU_HAVE_HWLOC */
+	(void) workerid;
+	return 0;
+#endif /* STARPU_HAVE_HWLOC */
+}
+#endif /* STARPU_USE_NUMA */

+ 10 - 0
src/core/topology.h

@@ -51,6 +51,9 @@ unsigned _starpu_topology_get_nhwcpu(struct _starpu_machine_config *config);
 /* returns the number of logical cpus */
 /* returns the number of logical cpus */
 unsigned _starpu_topology_get_nhwpu(struct _starpu_machine_config *config);
 unsigned _starpu_topology_get_nhwpu(struct _starpu_machine_config *config);
 
 
+/* returns the number of NUMA nodes */
+unsigned _starpu_topology_get_nnumanodes(struct _starpu_machine_config *config);
+
 #define STARPU_NOWORKERID -1
 #define STARPU_NOWORKERID -1
 /* Bind the current thread on the CPU logically identified by "cpuid". The
 /* Bind the current thread on the CPU logically identified by "cpuid". The
  * logical ordering of the processors is either that of hwloc (if available),
  * logical ordering of the processors is either that of hwloc (if available),
@@ -63,4 +66,11 @@ void _starpu_bind_thread_on_cpus(struct _starpu_machine_config *config STARPU_AT
 
 
 void *_starpu_get_worker_from_driver(struct starpu_driver *d);
 void *_starpu_get_worker_from_driver(struct starpu_driver *d);
 
 
+#ifdef STARPU_USE_NUMA
+/* Memory node recorded for NUMA node numaid, or -1 if numaid is out of range */
+int _starpu_numaid_to_memnode(unsigned numaid);
+/* NUMA node id whose memory node is memnode; 0 when no NUMA node matches */
+int _starpu_memnode_to_numaid(unsigned memnode);
+/* Number of NUMA nodes counted by the topology code */
+int _starpu_get_nb_numa_nodes(void);
+/* Logical index of the hwloc object found by walking up from the PU the
+ * worker is bound to; 0 when StarPU is built without hwloc */
+int _starpu_worker_numa_node(unsigned workerid);
+#endif /* STARPU_USE_NUMA */
+	
 #endif // __TOPOLOGY_H__
 #endif // __TOPOLOGY_H__

+ 13 - 1
src/core/workers.c

@@ -1515,8 +1515,20 @@ void starpu_shutdown(void)
 
 
 	/* tell all workers to shutdown */
 	/* tell all workers to shutdown */
 	_starpu_kill_all_workers(&_starpu_config);
 	_starpu_kill_all_workers(&_starpu_config);
-
+	
+#ifdef STARPU_USE_NUMA
+	{
+		int i;
+		unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes();
+		for (i=0; i<nb_numa_nodes; i++)
+		{
+			unsigned id = _starpu_numaid_to_memnode(i);
+			_starpu_free_all_automatically_allocated_buffers(id);
+		}
+	}
+#else /* STARPU_USE_NUMA */
 	_starpu_free_all_automatically_allocated_buffers(STARPU_MAIN_RAM);
 	_starpu_free_all_automatically_allocated_buffers(STARPU_MAIN_RAM);
+#endif /* STARPU_USE_NUMA */
 
 
 	{
 	{
 	     int stats = starpu_get_env_number("STARPU_STATS");
 	     int stats = starpu_get_env_number("STARPU_STATS");

+ 3 - 0
src/core/workers.h

@@ -76,6 +76,9 @@ LIST_TYPE(_starpu_worker,
 	starpu_pthread_cond_t started_cond; /* indicate when the worker is ready */
 	starpu_pthread_cond_t started_cond; /* indicate when the worker is ready */
 	starpu_pthread_cond_t ready_cond; /* indicate when the worker is ready */
 	starpu_pthread_cond_t ready_cond; /* indicate when the worker is ready */
 	unsigned memory_node; /* which memory node is the worker associated with ? */
 	unsigned memory_node; /* which memory node is the worker associated with ? */
+#ifdef STARPU_USE_NUMA
+	unsigned numa_memory_node; /* which numa memory node is the worker associated with ? */
+#endif /* STARPU_USE_NUMA */
 	starpu_pthread_cond_t sched_cond; /* condition variable used when the worker waits for tasks. */
 	starpu_pthread_cond_t sched_cond; /* condition variable used when the worker waits for tasks. */
         starpu_pthread_mutex_t sched_mutex; /* mutex protecting sched_cond */
         starpu_pthread_mutex_t sched_mutex; /* mutex protecting sched_cond */
 	struct starpu_task_list local_tasks; /* this queue contains tasks that have been explicitely submitted to that queue */
 	struct starpu_task_list local_tasks; /* this queue contains tasks that have been explicitely submitted to that queue */

+ 4 - 0
src/datawizard/coherency.c

@@ -562,7 +562,11 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		/* if the data is in write only mode (and not SCRATCH or REDUX), there is no need for a source, data will be initialized by the task itself */
 		/* if the data is in write only mode (and not SCRATCH or REDUX), there is no need for a source, data will be initialized by the task itself */
 		if (mode & STARPU_W)
 		if (mode & STARPU_W)
 			dst_replicate->initialized = 1;
 			dst_replicate->initialized = 1;
+#ifdef STARPU_USE_NUMA
+		if (starpu_node_get_kind(requesting_node) == STARPU_CPU_RAM && !nwait)		
+#else /* STARPU_USE_NUMA */
 		if (requesting_node == STARPU_MAIN_RAM && !nwait)
 		if (requesting_node == STARPU_MAIN_RAM && !nwait)
+#endif /* STARPU_USE_NUMA */
 		{
 		{
 			/* And this is the main RAM, really no need for a
 			/* And this is the main RAM, really no need for a
 			 * request, just allocate */
 			 * request, just allocate */

+ 5 - 0
src/datawizard/copy_driver.c

@@ -737,9 +737,14 @@ void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_
 		break;
 		break;
 #endif
 #endif
 	case STARPU_MAIN_RAM:
 	case STARPU_MAIN_RAM:
+#ifdef STARPU_USE_NUMA
+	case STARPU_CPU_RAM:
+#endif /* STARPU_USE_NUMA */
 		starpu_disk_wait_request(async_channel);
 		starpu_disk_wait_request(async_channel);
 		break;
 		break;
+#ifndef STARPU_USE_NUMA
 	case STARPU_CPU_RAM:
 	case STARPU_CPU_RAM:
+#endif /* !STARPU_USE_NUMA */
 	default:
 	default:
 		STARPU_ABORT();
 		STARPU_ABORT();
 	}
 	}

+ 8 - 0
src/datawizard/data_request.c

@@ -155,7 +155,11 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 	if (handling_node == -1)
 	if (handling_node == -1)
 		handling_node = STARPU_MAIN_RAM;
 		handling_node = STARPU_MAIN_RAM;
 	r->handling_node = handling_node;
 	r->handling_node = handling_node;
+#ifdef STARPU_USE_NUMA
+	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
+#else /* STARPU_USE_NUMA */
 	STARPU_ASSERT(handling_node == STARPU_MAIN_RAM || _starpu_memory_node_get_nworkers(handling_node));
 	STARPU_ASSERT(handling_node == STARPU_MAIN_RAM || _starpu_memory_node_get_nworkers(handling_node));
+#endif /* STARPU_USE_NUMA */
 	r->completed = 0;
 	r->completed = 0;
 	r->prefetch = is_prefetch;
 	r->prefetch = is_prefetch;
 	r->prio = prio;
 	r->prio = prio;
@@ -283,7 +287,11 @@ void _starpu_post_data_request(struct _starpu_data_request *r, unsigned handling
 {
 {
 	/* We don't have a worker for disk nodes, these should have been posted to a main RAM node */
 	/* We don't have a worker for disk nodes, these should have been posted to a main RAM node */
 	STARPU_ASSERT(starpu_node_get_kind(handling_node) != STARPU_DISK_RAM);
 	STARPU_ASSERT(starpu_node_get_kind(handling_node) != STARPU_DISK_RAM);
+#ifdef STARPU_USE_NUMA
+	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
+#else /* STARPU_USE_NUMA */
 	STARPU_ASSERT(handling_node == STARPU_MAIN_RAM || _starpu_memory_node_get_nworkers(handling_node));
 	STARPU_ASSERT(handling_node == STARPU_MAIN_RAM || _starpu_memory_node_get_nworkers(handling_node));
+#endif /* STARPU_USE_NUMA */
 
 
 //	_STARPU_DEBUG("POST REQUEST\n");
 //	_STARPU_DEBUG("POST REQUEST\n");
 
 

+ 14 - 2
src/datawizard/filters.c

@@ -188,7 +188,13 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		/* This is lazy allocation, allocate it now in main RAM, so as
 		/* This is lazy allocation, allocate it now in main RAM, so as
 		 * to have somewhere to gather pieces later */
 		 * to have somewhere to gather pieces later */
 		/* FIXME: mark as unevictable! */
 		/* FIXME: mark as unevictable! */
-		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[STARPU_MAIN_RAM], 0);
+		int home_node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+		home_node = initial_handle->home_node;
+		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
+			home_node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], 0);
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
 #warning we should reclaim memory if allocation failed
 #warning we should reclaim memory if allocation failed
 #endif
 #endif
@@ -320,8 +326,14 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		 * store it in the handle */
 		 * store it in the handle */
 		child->footprint = _starpu_compute_data_footprint(child);
 		child->footprint = _starpu_compute_data_footprint(child);
 
 
+		int home_node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+		home_node = child->home_node;
+		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
+			home_node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
 		void *ptr;
 		void *ptr;
-		ptr = starpu_data_handle_to_pointer(child, STARPU_MAIN_RAM);
+		ptr = starpu_data_handle_to_pointer(child, home_node);
 		if (ptr != NULL)
 		if (ptr != NULL)
 			_starpu_data_register_ram_pointer(child, ptr);
 			_starpu_data_register_ram_pointer(child, ptr);
 	}
 	}

+ 64 - 8
src/datawizard/interfaces/bcsr_interface.c

@@ -155,48 +155,90 @@ static int bcsr_compare(void *data_interface_a, void *data_interface_b)
 /* offer an access to the data parameters */
 /* offer an access to the data parameters */
 uint32_t starpu_bcsr_get_nnz(starpu_data_handle_t handle)
 uint32_t starpu_bcsr_get_nnz(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+	
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return data_interface->nnz;
 	return data_interface->nnz;
 }
 }
 
 
 uint32_t starpu_bcsr_get_nrow(starpu_data_handle_t handle)
 uint32_t starpu_bcsr_get_nrow(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return data_interface->nrow;
 	return data_interface->nrow;
 }
 }
 
 
 uint32_t starpu_bcsr_get_firstentry(starpu_data_handle_t handle)
 uint32_t starpu_bcsr_get_firstentry(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return data_interface->firstentry;
 	return data_interface->firstentry;
 }
 }
 
 
 uint32_t starpu_bcsr_get_r(starpu_data_handle_t handle)
 uint32_t starpu_bcsr_get_r(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return data_interface->r;
 	return data_interface->r;
 }
 }
 
 
 uint32_t starpu_bcsr_get_c(starpu_data_handle_t handle)
 uint32_t starpu_bcsr_get_c(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return data_interface->c;
 	return data_interface->c;
 }
 }
 
 
 size_t starpu_bcsr_get_elemsize(starpu_data_handle_t handle)
 size_t starpu_bcsr_get_elemsize(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return data_interface->elemsize;
 	return data_interface->elemsize;
 }
 }
@@ -216,18 +258,32 @@ uintptr_t starpu_bcsr_get_local_nzval(starpu_data_handle_t handle)
 
 
 uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	/* XXX 0 */
 	/* XXX 0 */
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return data_interface->colind;
 	return data_interface->colind;
 }
 }
 
 
 uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
 uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	/* XXX 0 */
 	/* XXX 0 */
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return data_interface->rowptr;
 	return data_interface->rowptr;
 }
 }

+ 46 - 6
src/datawizard/interfaces/block_interface.c

@@ -216,7 +216,13 @@ static void display_block_interface(starpu_data_handle_t handle, FILE *f)
 {
 {
 	struct starpu_block_interface *block_interface;
 	struct starpu_block_interface *block_interface;
 
 
-	block_interface = (struct starpu_block_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+	block_interface = (struct starpu_block_interface *) starpu_data_get_interface_on_node(handle, node);
 
 
 	fprintf(f, "%u\t%u\t%u\t", block_interface->nx, block_interface->ny, block_interface->nz);
 	fprintf(f, "%u\t%u\t%u\t", block_interface->nx, block_interface->ny, block_interface->nz);
 }
 }
@@ -287,7 +293,13 @@ static size_t block_interface_get_size(starpu_data_handle_t handle)
 	size_t size;
 	size_t size;
 	struct starpu_block_interface *block_interface;
 	struct starpu_block_interface *block_interface;
 
 
-	block_interface = (struct starpu_block_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+	block_interface = (struct starpu_block_interface *) starpu_data_get_interface_on_node(handle, node);
 
 
 	size = block_interface->nx*block_interface->ny*block_interface->nz*block_interface->elemsize;
 	size = block_interface->nx*block_interface->ny*block_interface->nz*block_interface->elemsize;
 
 
@@ -297,24 +309,45 @@ static size_t block_interface_get_size(starpu_data_handle_t handle)
 /* offer an access to the data parameters */
 /* offer an access to the data parameters */
 uint32_t starpu_block_get_nx(starpu_data_handle_t handle)
 uint32_t starpu_block_get_nx(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return block_interface->nx;
 	return block_interface->nx;
 }
 }
 
 
 uint32_t starpu_block_get_ny(starpu_data_handle_t handle)
 uint32_t starpu_block_get_ny(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return block_interface->ny;
 	return block_interface->ny;
 }
 }
 
 
 uint32_t starpu_block_get_nz(starpu_data_handle_t handle)
 uint32_t starpu_block_get_nz(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return block_interface->nz;
 	return block_interface->nz;
 }
 }
@@ -360,8 +393,15 @@ uintptr_t starpu_block_get_local_ptr(starpu_data_handle_t handle)
 
 
 size_t starpu_block_get_elemsize(starpu_data_handle_t handle)
 size_t starpu_block_get_elemsize(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return block_interface->elemsize;
 	return block_interface->elemsize;
 }
 }

+ 24 - 3
src/datawizard/interfaces/coo_interface.c

@@ -150,9 +150,16 @@ free_coo_buffer_on_node(void *data_interface, unsigned node)
 static size_t
 static size_t
 coo_interface_get_size(starpu_data_handle_t handle)
 coo_interface_get_size(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_coo_interface *coo_interface;
 	struct starpu_coo_interface *coo_interface;
 	coo_interface = (struct starpu_coo_interface *)
 	coo_interface = (struct starpu_coo_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return coo_interface->nx * coo_interface->ny * coo_interface->elemsize;
 	return coo_interface->nx * coo_interface->ny * coo_interface->elemsize;
 }
 }
@@ -160,9 +167,16 @@ coo_interface_get_size(starpu_data_handle_t handle)
 static uint32_t
 static uint32_t
 coo_interface_footprint(starpu_data_handle_t handle)
 coo_interface_footprint(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_coo_interface *coo_interface;
 	struct starpu_coo_interface *coo_interface;
 	coo_interface = (struct starpu_coo_interface *)
 	coo_interface = (struct starpu_coo_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return starpu_hash_crc32c_be(coo_interface->nx * coo_interface->ny, 0);
 	return starpu_hash_crc32c_be(coo_interface->nx * coo_interface->ny, 0);
 }
 }
@@ -184,9 +198,16 @@ coo_compare(void *a, void *b)
 static void
 static void
 display_coo_interface(starpu_data_handle_t handle, FILE *f)
 display_coo_interface(starpu_data_handle_t handle, FILE *f)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_coo_interface *coo_interface;
 	struct starpu_coo_interface *coo_interface;
 	coo_interface = (struct starpu_coo_interface *)
 	coo_interface = (struct starpu_coo_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	fprintf(f, "%u\t%u", coo_interface->nx, coo_interface->ny);
 	fprintf(f, "%u\t%u", coo_interface->nx, coo_interface->ny);
 }
 }

+ 32 - 4
src/datawizard/interfaces/csr_interface.c

@@ -140,32 +140,60 @@ static int csr_compare(void *data_interface_a, void *data_interface_b)
 /* offer an access to the data parameters */
 /* offer an access to the data parameters */
 uint32_t starpu_csr_get_nnz(starpu_data_handle_t handle)
 uint32_t starpu_csr_get_nnz(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return csr_interface->nnz;
 	return csr_interface->nnz;
 }
 }
 
 
 uint32_t starpu_csr_get_nrow(starpu_data_handle_t handle)
 uint32_t starpu_csr_get_nrow(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return csr_interface->nrow;
 	return csr_interface->nrow;
 }
 }
 
 
 uint32_t starpu_csr_get_firstentry(starpu_data_handle_t handle)
 uint32_t starpu_csr_get_firstentry(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+	
 	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return csr_interface->firstentry;
 	return csr_interface->firstentry;
 }
 }
 
 
 size_t starpu_csr_get_elemsize(starpu_data_handle_t handle)
 size_t starpu_csr_get_elemsize(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return csr_interface->elemsize;
 	return csr_interface->elemsize;
 }
 }

+ 42 - 3
src/datawizard/interfaces/data_interface.c

@@ -408,6 +408,21 @@ _starpu_data_initialize_per_worker(starpu_data_handle_t handle)
 		/* duplicate  the content of the interface on node 0 */
 		/* duplicate  the content of the interface on node 0 */
 		memcpy(replicate->data_interface, handle->per_node[STARPU_MAIN_RAM].data_interface, interfacesize);
 		memcpy(replicate->data_interface, handle->per_node[STARPU_MAIN_RAM].data_interface, interfacesize);
 	}
 	}
+
+	/* now the data is available ! */
+	_starpu_spin_unlock(&handle->header_lock);
+
+	int handle_node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	handle_node = handle->home_node;
+	if (handle_node < 0 || (_starpu_node_get_kind(handle_node) != STARPU_CPU_RAM))
+		handle_node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+	ptr = starpu_data_handle_to_pointer(handle, handle_node);
+	if (ptr != NULL)
+	{
+		_starpu_data_register_ram_pointer(handle, ptr);
+	}
 }
 }
 
 
 void starpu_data_ptr_register(starpu_data_handle_t handle, unsigned node)
 void starpu_data_ptr_register(starpu_data_handle_t handle, unsigned node)
@@ -481,7 +496,14 @@ void starpu_data_register(starpu_data_handle_t *handleptr, int home_node,
 
 
 void starpu_data_register_same(starpu_data_handle_t *handledst, starpu_data_handle_t handlesrc)
 void starpu_data_register_same(starpu_data_handle_t *handledst, starpu_data_handle_t handlesrc)
 {
 {
-	void *local_interface = starpu_data_get_interface_on_node(handlesrc, STARPU_MAIN_RAM);
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handlesrc->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
+	void *local_interface = starpu_data_get_interface_on_node(handlesrc, node);
 	starpu_data_register(handledst, -1, local_interface, handlesrc->ops);
 	starpu_data_register(handledst, -1, local_interface, handlesrc->ops);
 }
 }
 
 
@@ -515,7 +537,14 @@ struct starpu_data_interface_ops* starpu_data_get_interface_ops(starpu_data_hand
 
 
 void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle)
 void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle)
 {
 {
-	const void *ram_ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
+	const void *ram_ptr = starpu_data_handle_to_pointer(handle, node);
 #ifdef STARPU_OPENMP
 #ifdef STARPU_OPENMP
 	if (handle->removed_from_context_hash)
 	if (handle->removed_from_context_hash)
 		return;
 		return;
@@ -751,7 +780,13 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 			_STARPU_DEBUG("Conversion needed\n");
 			_STARPU_DEBUG("Conversion needed\n");
 			void *buffers[1];
 			void *buffers[1];
 			struct starpu_multiformat_interface *format_interface;
 			struct starpu_multiformat_interface *format_interface;
-			format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+			home_node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+			home_node = handle->home_node;
+			if (home_node < 0 || (_starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
+				home_node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+			format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, home_node);
 			struct starpu_codelet *cl = NULL;
 			struct starpu_codelet *cl = NULL;
 			enum starpu_node_kind node_kind = starpu_node_get_kind(handle->mf_node);
 			enum starpu_node_kind node_kind = starpu_node_get_kind(handle->mf_node);
 
 
@@ -966,7 +1001,11 @@ static void _starpu_data_invalidate(void *data)
 
 
 		if (local->mc && local->allocated && local->automatically_allocated)
 		if (local->mc && local->allocated && local->automatically_allocated)
 		{
 		{
+#ifdef STARPU_USE_NUMA
+			if (_starpu_node_get_kind(node) == STARPU_CPU_RAM)
+#else /* STARPU_USE_NUMA */		
 			if (node == STARPU_MAIN_RAM)
 			if (node == STARPU_MAIN_RAM)
+#endif /* STARPU_USE_NUMA */		
 				_starpu_data_unregister_ram_pointer(handle);
 				_starpu_data_unregister_ram_pointer(handle);
 
 
 			/* free the data copy in a lazy fashion */
 			/* free the data copy in a lazy fashion */

+ 40 - 5
src/datawizard/interfaces/matrix_interface.c

@@ -217,8 +217,15 @@ static int matrix_compare(void *data_interface_a, void *data_interface_b)
 
 
 static void display_matrix_interface(starpu_data_handle_t handle, FILE *f)
 static void display_matrix_interface(starpu_data_handle_t handle, FILE *f)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	fprintf(f, "%u\t%u\t", matrix_interface->nx, matrix_interface->ny);
 	fprintf(f, "%u\t%u\t", matrix_interface->nx, matrix_interface->ny);
 }
 }
@@ -275,8 +282,15 @@ static int unpack_matrix_handle(starpu_data_handle_t handle, unsigned node, void
 
 
 static size_t matrix_interface_get_size(starpu_data_handle_t handle)
 static size_t matrix_interface_get_size(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	size_t size;
 	size_t size;
 	size = (size_t)matrix_interface->nx*matrix_interface->ny*matrix_interface->elemsize;
 	size = (size_t)matrix_interface->nx*matrix_interface->ny*matrix_interface->elemsize;
@@ -287,16 +301,30 @@ static size_t matrix_interface_get_size(starpu_data_handle_t handle)
 /* offer an access to the data parameters */
 /* offer an access to the data parameters */
 uint32_t starpu_matrix_get_nx(starpu_data_handle_t handle)
 uint32_t starpu_matrix_get_nx(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return matrix_interface->nx;
 	return matrix_interface->nx;
 }
 }
 
 
 uint32_t starpu_matrix_get_ny(starpu_data_handle_t handle)
 uint32_t starpu_matrix_get_ny(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return matrix_interface->ny;
 	return matrix_interface->ny;
 }
 }
@@ -329,8 +357,15 @@ uintptr_t starpu_matrix_get_local_ptr(starpu_data_handle_t handle)
 
 
 size_t starpu_matrix_get_elemsize(starpu_data_handle_t handle)
 size_t starpu_matrix_get_elemsize(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return matrix_interface->elemsize;
 	return matrix_interface->elemsize;
 }
 }

+ 22 - 3
src/datawizard/interfaces/multiformat_interface.c

@@ -242,8 +242,15 @@ static int multiformat_compare(void *data_interface_a, void *data_interface_b)
 static void display_multiformat_interface(starpu_data_handle_t handle, FILE *f)
 static void display_multiformat_interface(starpu_data_handle_t handle, FILE *f)
 {
 {
 	struct starpu_multiformat_interface *multiformat_interface;
 	struct starpu_multiformat_interface *multiformat_interface;
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	multiformat_interface = (struct starpu_multiformat_interface *)
 	multiformat_interface = (struct starpu_multiformat_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	fprintf(f, "%u\t", multiformat_interface->nx);
 	fprintf(f, "%u\t", multiformat_interface->nx);
 }
 }
@@ -253,7 +260,13 @@ static size_t multiformat_interface_get_size(starpu_data_handle_t handle)
 {
 {
 	size_t size;
 	size_t size;
 	struct starpu_multiformat_interface *multiformat_interface;
 	struct starpu_multiformat_interface *multiformat_interface;
-	multiformat_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+	multiformat_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, node);
 	size = multiformat_interface->nx * multiformat_interface->ops->cpu_elemsize;
 	size = multiformat_interface->nx * multiformat_interface->ops->cpu_elemsize;
 	return size;
 	return size;
 }
 }
@@ -261,7 +274,13 @@ static size_t multiformat_interface_get_size(starpu_data_handle_t handle)
 uint32_t starpu_multiformat_get_nx(starpu_data_handle_t handle)
 uint32_t starpu_multiformat_get_nx(starpu_data_handle_t handle)
 {
 {
 	struct starpu_multiformat_interface *multiformat_interface;
 	struct starpu_multiformat_interface *multiformat_interface;
-	multiformat_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+	multiformat_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, node);
 	return multiformat_interface->nx;
 	return multiformat_interface->nx;
 }
 }
 
 

+ 24 - 3
src/datawizard/interfaces/variable_interface.c

@@ -154,8 +154,15 @@ static int variable_compare(void *data_interface_a, void *data_interface_b)
 
 
 static void display_variable_interface(starpu_data_handle_t handle, FILE *f)
 static void display_variable_interface(starpu_data_handle_t handle, FILE *f)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *)
 	struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	fprintf(f, "%ld\t", (long)variable_interface->elemsize);
 	fprintf(f, "%ld\t", (long)variable_interface->elemsize);
 }
 }
@@ -193,8 +200,15 @@ static int unpack_variable_handle(starpu_data_handle_t handle, unsigned node, vo
 
 
 static size_t variable_interface_get_size(starpu_data_handle_t handle)
 static size_t variable_interface_get_size(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *)
 	struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return variable_interface->elemsize;
 	return variable_interface->elemsize;
 }
 }
@@ -211,7 +225,14 @@ uintptr_t starpu_variable_get_local_ptr(starpu_data_handle_t handle)
 
 
 size_t starpu_variable_get_elemsize(starpu_data_handle_t handle)
 size_t starpu_variable_get_elemsize(starpu_data_handle_t handle)
 {
 {
-	return STARPU_VARIABLE_GET_ELEMSIZE(starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM));
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
+	return STARPU_VARIABLE_GET_ELEMSIZE(starpu_data_get_interface_on_node(handle, node));
 }
 }
 
 
 /* memory allocation/deallocation primitives for the variable interface */
 /* memory allocation/deallocation primitives for the variable interface */

+ 32 - 4
src/datawizard/interfaces/vector_interface.c

@@ -162,8 +162,15 @@ static int vector_compare(void *data_interface_a, void *data_interface_b)
 
 
 static void display_vector_interface(starpu_data_handle_t handle, FILE *f)
 static void display_vector_interface(starpu_data_handle_t handle, FILE *f)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	fprintf(f, "%u\t", vector_interface->nx);
 	fprintf(f, "%u\t", vector_interface->nx);
 }
 }
@@ -201,9 +208,16 @@ static int unpack_vector_handle(starpu_data_handle_t handle, unsigned node, void
 
 
 static size_t vector_interface_get_size(starpu_data_handle_t handle)
 static size_t vector_interface_get_size(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	size_t size;
 	size_t size;
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	size = vector_interface->nx*vector_interface->elemsize;
 	size = vector_interface->nx*vector_interface->elemsize;
 
 
@@ -213,8 +227,15 @@ static size_t vector_interface_get_size(starpu_data_handle_t handle)
 /* offer an access to the data parameters */
 /* offer an access to the data parameters */
 uint32_t starpu_vector_get_nx(starpu_data_handle_t handle)
 uint32_t starpu_vector_get_nx(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return vector_interface->nx;
 	return vector_interface->nx;
 }
 }
@@ -234,8 +255,15 @@ uintptr_t starpu_vector_get_local_ptr(starpu_data_handle_t handle)
 
 
 size_t starpu_vector_get_elemsize(starpu_data_handle_t handle)
 size_t starpu_vector_get_elemsize(starpu_data_handle_t handle)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 
 
 	return vector_interface->elemsize;
 	return vector_interface->elemsize;
 }
 }

+ 8 - 1
src/datawizard/interfaces/void_interface.c

@@ -70,7 +70,14 @@ static void register_void_handle(starpu_data_handle_t handle STARPU_ATTRIBUTE_UN
 /* declare a new data with the void interface */
 /* declare a new data with the void interface */
 void starpu_void_data_register(starpu_data_handle_t *handleptr)
 void starpu_void_data_register(starpu_data_handle_t *handleptr)
 {
 {
-	starpu_data_register(handleptr, STARPU_MAIN_RAM, NULL, &starpu_interface_void_ops);
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = (*handleptr)->home_node;
+	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+
+	starpu_data_register(handleptr, node, NULL, &starpu_interface_void_ops);
 }
 }
 
 
 
 

+ 37 - 7
src/datawizard/malloc.c

@@ -107,23 +107,36 @@ static struct starpu_codelet malloc_pinned_cl =
 };
 };
 #endif
 #endif
 
 
+#ifdef STARPU_USE_NUMA
 int starpu_malloc_flags(void **A, size_t dim, int flags)
 int starpu_malloc_flags(void **A, size_t dim, int flags)
 {
 {
+	return _starpu_malloc_flags_on_node(STARPU_MAIN_RAM, A, dim, flags);
+}
+
+int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags)
+#else /* STARPU_USE_NUMA */
+int starpu_malloc_flags(void **A, size_t dim, int flags)
+#endif /* STARPU_USE_NUMA */
+{
 	int ret=0;
 	int ret=0;
 
 
+#ifndef STARPU_USE_NUMA
+	unsigned dst_node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
+	
 	STARPU_ASSERT(A);
 	STARPU_ASSERT(A);
 
 
 	if (flags & STARPU_MALLOC_COUNT)
 	if (flags & STARPU_MALLOC_COUNT)
 	{
 	{
 		if (!(flags & STARPU_MALLOC_NORECLAIM))
 		if (!(flags & STARPU_MALLOC_NORECLAIM))
-			while (starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags) != 0)
+			while (starpu_memory_allocate(dst_node, dim, flags) != 0)
 			{
 			{
 				size_t freed;
 				size_t freed;
 				size_t reclaim = 2 * dim;
 				size_t reclaim = 2 * dim;
 				_STARPU_DEBUG("There is not enough memory left, we are going to reclaim %ld\n", reclaim);
 				_STARPU_DEBUG("There is not enough memory left, we are going to reclaim %ld\n", reclaim);
-				_STARPU_TRACE_START_MEMRECLAIM(STARPU_MAIN_RAM,0);
-				freed = _starpu_memory_reclaim_generic(STARPU_MAIN_RAM, 0, reclaim);
-				_STARPU_TRACE_END_MEMRECLAIM(STARPU_MAIN_RAM,0);
+				_STARPU_TRACE_START_MEMRECLAIM(dst_node,0);
+				freed = _starpu_memory_reclaim_generic(dst_node, 0, reclaim);
+				_STARPU_TRACE_END_MEMRECLAIM(dst_node,0);
 				if (freed < dim && !(flags & STARPU_MEMORY_WAIT))
 				if (freed < dim && !(flags & STARPU_MEMORY_WAIT))
 				{
 				{
 					// We could not reclaim enough memory
 					// We could not reclaim enough memory
@@ -132,9 +145,9 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 				}
 				}
 			}
 			}
 		else if (flags & STARPU_MEMORY_WAIT)
 		else if (flags & STARPU_MEMORY_WAIT)
-			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags);
+			starpu_memory_allocate(dst_node, dim, flags);
 		else
 		else
-			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags | STARPU_MEMORY_OVERFLOW);
+			starpu_memory_allocate(dst_node, dim, flags | STARPU_MEMORY_OVERFLOW);
 	}
 	}
 
 
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
@@ -286,6 +299,18 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 #endif
 #endif
 	}
 	}
 	else
 	else
+#ifdef STARPU_USE_NUMA
+	{
+		struct _starpu_machine_config *config = _starpu_get_machine_config();
+		hwloc_topology_t hwtopology = config->topology.hwtopology;
+		hwloc_obj_t numa_node_obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NODE, _starpu_memnode_to_numaid(dst_node));
+		hwloc_bitmap_t nodeset = numa_node_obj->nodeset;
+		*A = hwloc_alloc_membind_nodeset(hwtopology, dim, nodeset, HWLOC_MEMBIND_BIND | HWLOC_MEMBIND_NOCPUBIND, flags);
+		fprintf(stderr, "Allocation %d bytes on NUMA node %d [%p]\n", dim, _starpu_memnode_to_numaid(dst_node), *A);
+		if (!*A)
+			ret = -ENOMEM;
+	}
+#else /* STARPU_USE_NUMA */
 #ifdef STARPU_HAVE_POSIX_MEMALIGN
 #ifdef STARPU_HAVE_POSIX_MEMALIGN
 	if (_malloc_align != sizeof(void*))
 	if (_malloc_align != sizeof(void*))
 	{
 	{
@@ -310,6 +335,7 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 			if (!*A)
 			if (!*A)
 				ret = -ENOMEM;
 				ret = -ENOMEM;
 		}
 		}
+#endif /* STARPU_USE_NUMA */
 
 
 #if defined(STARPU_SIMGRID) || defined(STARPU_USE_CUDA)
 #if defined(STARPU_SIMGRID) || defined(STARPU_USE_CUDA)
 end:
 end:
@@ -320,7 +346,7 @@ end:
 	}
 	}
 	else if (flags & STARPU_MALLOC_COUNT)
 	else if (flags & STARPU_MALLOC_COUNT)
 	{
 	{
-		starpu_memory_deallocate(STARPU_MAIN_RAM, dim);
+		starpu_memory_deallocate(dst_node, dim);
 	}
 	}
 
 
 	return ret;
 	return ret;
@@ -496,7 +522,11 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 	{
 	{
 		case STARPU_CPU_RAM:
 		case STARPU_CPU_RAM:
 		{
 		{
+#ifdef STARPU_USE_NUMA
+			_starpu_malloc_flags_on_node(dst_node, (void**) &addr, size,			
+#else /* STARPU_USE_NUMA */
 			starpu_malloc_flags((void**) &addr, size,
 			starpu_malloc_flags((void**) &addr, size,
+#endif /* STARPU_USE_NUMA */
 #if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 					/* without memcpy_peer, we can not
 					/* without memcpy_peer, we can not
 					 * allocated pinned memory, since it
 					 * allocated pinned memory, since it

+ 3 - 0
src/datawizard/malloc.h

@@ -22,4 +22,7 @@ void _starpu_malloc_shutdown(unsigned dst_node);
 
 
 void _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
 void _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
 
 
+#ifdef STARPU_USE_NUMA
+int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags);
+#endif /* STARPU_USE_NUMA */
 #endif
 #endif

+ 59 - 1
src/datawizard/memalloc.c

@@ -381,7 +381,11 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 			data_interface = mc->chunk_interface;
 			data_interface = mc->chunk_interface;
 		STARPU_ASSERT(data_interface);
 		STARPU_ASSERT(data_interface);
 
 
+#ifdef STARPU_USE_NUMA
+		if (handle && (starpu_node_get_kind(node) == STARPU_CPU_RAM))
+#else /* STARPU_USE_NUMA */
 		if (handle && node == STARPU_MAIN_RAM)
 		if (handle && node == STARPU_MAIN_RAM)
+#endif /* STARPU_USE_NUMA */
 			_starpu_data_unregister_ram_pointer(handle);
 			_starpu_data_unregister_ram_pointer(handle);
 
 
 		_STARPU_TRACE_START_FREE(node, mc->size);
 		_STARPU_TRACE_START_FREE(node, mc->size);
@@ -629,7 +633,14 @@ static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned no
 			 * away after writing it back to main memory */
 			 * away after writing it back to main memory */
 			_starpu_spin_unlock(&mc_lock[node]);
 			_starpu_spin_unlock(&mc_lock[node]);
 			_STARPU_TRACE_START_WRITEBACK(node);
 			_STARPU_TRACE_START_WRITEBACK(node);
+#ifdef STARPU_USE_NUMA
+			int home_node = old_data->home_node;
+			if (home_node < 0 || (_starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
+				home_node = STARPU_MAIN_RAM;
+			res = transfer_subtree_to_node(old_data, node, home_node);
+#else /* STARPU_USE_NUMA */
 			res = transfer_subtree_to_node(old_data, node, STARPU_MAIN_RAM);
 			res = transfer_subtree_to_node(old_data, node, STARPU_MAIN_RAM);
+#endif /* STARPU_USE_NUMA */
 			_STARPU_TRACE_END_WRITEBACK(node);
 			_STARPU_TRACE_END_WRITEBACK(node);
 			_starpu_spin_lock(&mc_lock[node]);
 			_starpu_spin_lock(&mc_lock[node]);
 
 
@@ -1436,11 +1447,15 @@ int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_
 	replicate->allocated = 1;
 	replicate->allocated = 1;
 	replicate->automatically_allocated = 1;
 	replicate->automatically_allocated = 1;
 
 
+#ifdef STARPU_USE_NUMA
+	if (replicate->relaxed_coherency == 0 && (starpu_node_get_kind(dst_node) == STARPU_CPU_RAM))
+#else /* STARPU_USE_NUMA */
 	if (replicate->relaxed_coherency == 0 && dst_node == STARPU_MAIN_RAM)
 	if (replicate->relaxed_coherency == 0 && dst_node == STARPU_MAIN_RAM)
+#endif /* STARPU_USE_NUMA */
 	{
 	{
 		/* We are allocating the buffer in main memory, also register it
 		/* We are allocating the buffer in main memory, also register it
 		 * for the gcc plugin.  */
 		 * for the gcc plugin.  */
-		void *ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
+		void *ptr = starpu_data_handle_to_pointer(handle, dst_node);
 		if (ptr != NULL)
 		if (ptr != NULL)
 		{
 		{
 			_starpu_data_register_ram_pointer(handle, ptr);
 			_starpu_data_register_ram_pointer(handle, ptr);
@@ -1601,14 +1616,34 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 	size_t size_handle = _starpu_data_get_size(handle);
 	size_t size_handle = _starpu_data_get_size(handle);
 	if (handle->home_node != -1)
 	if (handle->home_node != -1)
 		/* try to push on RAM if we can before to push on disk */
 		/* try to push on RAM if we can before to push on disk */
+#ifdef STARPU_USE_NUMA
+		if(starpu_node_get_kind(handle->home_node) == STARPU_DISK_RAM && (starpu_node_get_kind(node) != STARPU_CPU_RAM))
+#else /* STARPU_USE_NUMA */
 		if(starpu_node_get_kind(handle->home_node) == STARPU_DISK_RAM && node != STARPU_MAIN_RAM)
 		if(starpu_node_get_kind(handle->home_node) == STARPU_DISK_RAM && node != STARPU_MAIN_RAM)
+#endif /* STARPU_USE_NUMA */
 		{
 		{
+#ifdef STARPU_USE_NUMA
+ 	                int i;
+			unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes();
+			for (i=0; i<nb_numa_nodes; i++)
+			{
+				unsigned id = _starpu_numaid_to_memnode(i);
+				if (handle->per_node[id].allocated || 
+				    _starpu_memory_manager_test_allocate_size(id, size_handle) == 1)
+				{
+					target = id;
+					break;
+				}
+			}
+			if (target == -1)
+#else /* STARPU_USE_NUMA */
 			if (handle->per_node[STARPU_MAIN_RAM].allocated ||
 			if (handle->per_node[STARPU_MAIN_RAM].allocated ||
 			    _starpu_memory_manager_test_allocate_size(STARPU_MAIN_RAM, size_handle) == 1)
 			    _starpu_memory_manager_test_allocate_size(STARPU_MAIN_RAM, size_handle) == 1)
 			{
 			{
 				target = STARPU_MAIN_RAM;
 				target = STARPU_MAIN_RAM;
 			}
 			}
 			else
 			else
+#endif /* STARPU_USE_NUMA */
 			{
 			{
 				target = get_better_disk_can_accept_size(handle, node);
 				target = get_better_disk_can_accept_size(handle, node);
 			}
 			}
@@ -1623,12 +1658,34 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 	{
 	{
 		/* handle->home_node == -1 */
 		/* handle->home_node == -1 */
 		/* no place for datas in RAM, we push on disk */
 		/* no place for datas in RAM, we push on disk */
+#ifdef STARPU_USE_NUMA
+		if (starpu_node_get_kind(node) == STARPU_CPU_RAM)
+#else /* STARPU_USE_NUMA */
 		if (node == STARPU_MAIN_RAM)
 		if (node == STARPU_MAIN_RAM)
+#endif /* STARPU_USE_NUMA */
 		{
 		{
 			target = get_better_disk_can_accept_size(handle, node);
 			target = get_better_disk_can_accept_size(handle, node);
 		}
 		}
 		/* node != 0 */
 		/* node != 0 */
 		/* try to push data to RAM if we can before to push on disk*/
 		/* try to push data to RAM if we can before to push on disk*/
+#ifdef STARPU_USE_NUMA
+		else {
+			int i;
+			unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes();
+			for (i=0; i<nb_numa_nodes; i++)
+			{
+				unsigned id = _starpu_numaid_to_memnode(i);
+				if (handle->per_node[id].allocated || 
+				    _starpu_memory_manager_test_allocate_size(id, size_handle) == 1)
+				{
+					target = id;
+					break;
+				}
+			}
+		}
+		/* no place in RAM */
+		if (target == -1)
+#else /* STARPU_USE_NUMA */
 		else if (handle->per_node[STARPU_MAIN_RAM].allocated ||
 		else if (handle->per_node[STARPU_MAIN_RAM].allocated ||
 			 _starpu_memory_manager_test_allocate_size(STARPU_MAIN_RAM, size_handle) == 1)
 			 _starpu_memory_manager_test_allocate_size(STARPU_MAIN_RAM, size_handle) == 1)
 		{
 		{
@@ -1636,6 +1693,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 		}
 		}
 		/* no place in RAM */
 		/* no place in RAM */
 		else
 		else
+#endif /* STARPU_USE_NUMA */
 		{
 		{
 			target = get_better_disk_can_accept_size(handle, node);
 			target = get_better_disk_can_accept_size(handle, node);
 		}
 		}

+ 0 - 1
src/datawizard/memory_nodes.c

@@ -112,7 +112,6 @@ unsigned _starpu_memory_node_register(enum starpu_node_kind kind, int devid)
 	/* ATOMIC_ADD returns the new value ... */
 	/* ATOMIC_ADD returns the new value ... */
 	node = STARPU_ATOMIC_ADD(&_starpu_descr.nnodes, 1) - 1;
 	node = STARPU_ATOMIC_ADD(&_starpu_descr.nnodes, 1) - 1;
 	STARPU_ASSERT_MSG(node < STARPU_MAXNODES,"Too many nodes (%u) for maximum %u. Use configure option --enable-maxnodes=xxx to update the maximum number of nodes.", node, STARPU_MAXNODES);
 	STARPU_ASSERT_MSG(node < STARPU_MAXNODES,"Too many nodes (%u) for maximum %u. Use configure option --enable-maxnodes=xxx to update the maximum number of nodes.", node, STARPU_MAXNODES);
-
 	_starpu_descr.nodes[node] = kind;
 	_starpu_descr.nodes[node] = kind;
 	_STARPU_TRACE_NEW_MEM_NODE(node);
 	_STARPU_TRACE_NEW_MEM_NODE(node);
 
 

+ 24 - 0
src/datawizard/user_interactions.c

@@ -189,12 +189,24 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, int node,
 int starpu_data_acquire_cb(starpu_data_handle_t handle,
 int starpu_data_acquire_cb(starpu_data_handle_t handle,
 			   enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
 			   enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
 {
 {
+#ifdef STARPU_USE_NUMA
+	int home_node = handle->home_node;
+	if (home_node >= 0 && (starpu_node_get_kind(home_node) == STARPU_CPU_RAM))
+		return starpu_data_acquire_on_node_cb(handle, home_node, mode, callback, arg);
+	else
+#endif /* STARPU_USE_NUMA */
 	return starpu_data_acquire_on_node_cb(handle, STARPU_MAIN_RAM, mode, callback, arg);
 	return starpu_data_acquire_on_node_cb(handle, STARPU_MAIN_RAM, mode, callback, arg);
 }
 }
 
 
 int starpu_data_acquire_cb_sequential_consistency(starpu_data_handle_t handle,
 int starpu_data_acquire_cb_sequential_consistency(starpu_data_handle_t handle,
 						  enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
 						  enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
 {
 {
+#ifdef STARPU_USE_NUMA
+	int home_node = handle->home_node;
+	if (home_node >= 0 && (starpu_node_get_kind(home_node) == STARPU_CPU_RAM))
+	 	return starpu_data_acquire_on_node_cb_sequential_consistency(handle, home_node, mode, callback, arg, sequential_consistency);
+	else
+#endif /* STARPU_USE_NUMA */
 	return starpu_data_acquire_on_node_cb_sequential_consistency(handle, STARPU_MAIN_RAM, mode, callback, arg, sequential_consistency);
 	return starpu_data_acquire_on_node_cb_sequential_consistency(handle, STARPU_MAIN_RAM, mode, callback, arg, sequential_consistency);
 }
 }
 
 
@@ -326,6 +338,12 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, int node, enum star
 
 
 int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
 {
+#ifdef STARPU_USE_NUMA
+	int home_node = handle->home_node;
+	if (home_node >= 0 && (starpu_node_get_kind(home_node) == STARPU_CPU_RAM))
+	 	return starpu_data_acquire_on_node(handle, home_node, mode);
+	else
+#endif /* STARPU_USE_NUMA */
 	return starpu_data_acquire_on_node(handle, STARPU_MAIN_RAM, mode);
 	return starpu_data_acquire_on_node(handle, STARPU_MAIN_RAM, mode);
 }
 }
 
 
@@ -358,6 +376,12 @@ void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
 
 
 void starpu_data_release(starpu_data_handle_t handle)
 void starpu_data_release(starpu_data_handle_t handle)
 {
 {
+#ifdef STARPU_USE_NUMA
+	int home_node = handle->home_node; /* release on the handle's home node when that node is CPU RAM */
+	if (home_node >= 0 && (starpu_node_get_kind(home_node) == STARPU_CPU_RAM))
+		starpu_data_release_on_node(handle, home_node); /* plain call: a void function must not `return expr;` */
+	else /* otherwise the main-RAM release below is the else branch */
+#endif /* STARPU_USE_NUMA */
 	starpu_data_release_on_node(handle, STARPU_MAIN_RAM);
 	starpu_data_release_on_node(handle, STARPU_MAIN_RAM);
 }
 }
 
 

+ 33 - 6
src/drivers/cpu/driver_cpu.c

@@ -168,7 +168,18 @@ static size_t _starpu_cpu_get_global_mem_size(int nodeid STARPU_ATTRIBUTE_UNUSED
 	size_t global_mem;
 	size_t global_mem;
 	starpu_ssize_t limit;
 	starpu_ssize_t limit;
 
 
-	limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
+#ifdef STARPU_USE_NUMA
+	char name[40]; /* holds the 17-char prefix, any int value (up to 11 chars), "_MEM" and the NUL */
+
+	snprintf(name, sizeof(name), "STARPU_LIMIT_CPU_%d_MEM", nodeid); /* bounded write: cannot overrun name */
+	limit = starpu_get_env_number(name); /* node-specific variable is tried first */
+	if (limit == -1)
+	{
+		limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM"); /* node-specific lookup gave -1: use the node-independent variable */
+	}
+#else /* STARPU_USE_NUMA */
+	limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
+#endif /* STARPU_USE_NUMA */
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
 #  warning TODO: take into account NUMA node and check STARPU_LIMIT_CPU_numanode_MEM
 #  warning TODO: take into account NUMA node and check STARPU_LIMIT_CPU_numanode_MEM
 #endif
 #endif
@@ -176,17 +187,17 @@ static size_t _starpu_cpu_get_global_mem_size(int nodeid STARPU_ATTRIBUTE_UNUSED
 #if defined(STARPU_HAVE_HWLOC)
 #if defined(STARPU_HAVE_HWLOC)
 	struct _starpu_machine_topology *topology = &config->topology;
 	struct _starpu_machine_topology *topology = &config->topology;
 
 
-#if 0
-	/* Do not limit ourself to a single NUMA node yet, as we don't have real NUMA support for now */
+#ifdef STARPU_USE_NUMA
         int depth_node = hwloc_get_type_depth(topology->hwtopology, HWLOC_OBJ_NODE);
         int depth_node = hwloc_get_type_depth(topology->hwtopology, HWLOC_OBJ_NODE);
 
 
 	if (depth_node == HWLOC_TYPE_DEPTH_UNKNOWN)
 	if (depth_node == HWLOC_TYPE_DEPTH_UNKNOWN)
 	     global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
 	     global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
 	else
 	else
 	     global_mem = hwloc_get_obj_by_depth(topology->hwtopology, depth_node, nodeid)->memory.local_memory;
 	     global_mem = hwloc_get_obj_by_depth(topology->hwtopology, depth_node, nodeid)->memory.local_memory;
-#else
+#else /* STARPU_USE_NUMA */
+	/* Do not limit ourself to a single NUMA node yet, as we don't have real NUMA support for now */
 	global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
 	global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
-#endif
+#endif /* STARPU_USE_NUMA */
 
 
 #else /* STARPU_HAVE_HWLOC */
 #else /* STARPU_HAVE_HWLOC */
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
@@ -212,8 +223,11 @@ int _starpu_cpu_driver_init(struct _starpu_worker *cpu_worker)
 
 
 	_starpu_driver_start(cpu_worker, _STARPU_FUT_CPU_KEY, 1);
 	_starpu_driver_start(cpu_worker, _STARPU_FUT_CPU_KEY, 1);
 	/* FIXME: when we have NUMA support, properly turn node number into NUMA node number */
 	/* FIXME: when we have NUMA support, properly turn node number into NUMA node number */
+#ifdef STARPU_USE_NUMA
+	_starpu_memory_manager_set_global_memory_size(cpu_worker->memory_node, _starpu_cpu_get_global_mem_size(cpu_worker->numa_memory_node, cpu_worker->config));
+#else /* STARPU_USE_NUMA */
 	_starpu_memory_manager_set_global_memory_size(cpu_worker->memory_node, _starpu_cpu_get_global_mem_size(cpu_worker->memory_node, cpu_worker->config));
 	_starpu_memory_manager_set_global_memory_size(cpu_worker->memory_node, _starpu_cpu_get_global_mem_size(cpu_worker->memory_node, cpu_worker->config));
-
+#endif /* STARPU_USE_NUMA */
 	snprintf(cpu_worker->name, sizeof(cpu_worker->name), "CPU %d", devid);
 	snprintf(cpu_worker->name, sizeof(cpu_worker->name), "CPU %d", devid);
 	snprintf(cpu_worker->short_name, sizeof(cpu_worker->short_name), "CPU %d", devid);
 	snprintf(cpu_worker->short_name, sizeof(cpu_worker->short_name), "CPU %d", devid);
 	starpu_pthread_setname(cpu_worker->short_name);
 	starpu_pthread_setname(cpu_worker->short_name);
@@ -242,8 +256,21 @@ int _starpu_cpu_driver_run_once(struct _starpu_worker *cpu_worker)
 
 
 	_STARPU_TRACE_START_PROGRESS(memnode);
 	_STARPU_TRACE_START_PROGRESS(memnode);
 	res = __starpu_datawizard_progress(memnode, 1, 1);
 	res = __starpu_datawizard_progress(memnode, 1, 1);
+#ifdef STARPU_USE_NUMA
+	if (starpu_node_get_kind(memnode) != STARPU_CPU_RAM)
+	{
+		int i;
+		unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes();
+		for (i=0; i<nb_numa_nodes; i++)
+		{
+			unsigned id = _starpu_numaid_to_memnode(i);			
+			res |= __starpu_datawizard_progress(id, 1, 1);
+		}
+	}
+#else /* STARPU_USE_NUMA */
 	if (memnode != STARPU_MAIN_RAM)
 	if (memnode != STARPU_MAIN_RAM)
 		res |= __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 1);
 		res |= __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 1);
+#endif /* STARPU_USE_NUMA */
 	_STARPU_TRACE_END_PROGRESS(memnode);
 	_STARPU_TRACE_END_PROGRESS(memnode);
 
 
 	struct _starpu_job *j;
 	struct _starpu_job *j;

+ 19 - 1
src/drivers/cuda/driver_cuda.c

@@ -163,7 +163,7 @@ cudaStream_t starpu_cuda_get_local_in_transfer_stream()
 	int worker = starpu_worker_get_id_check();
 	int worker = starpu_worker_get_id_check();
 	int devid = starpu_worker_get_devid(worker);
 	int devid = starpu_worker_get_devid(worker);
 	cudaStream_t stream;
 	cudaStream_t stream;
-
+	
 	stream = in_transfer_streams[devid];
 	stream = in_transfer_streams[devid];
 	STARPU_ASSERT(stream);
 	STARPU_ASSERT(stream);
 	return stream;
 	return stream;
@@ -811,7 +811,16 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 	{
 	{
 		/* Nothing ready yet, no better thing to do than waiting */
 		/* Nothing ready yet, no better thing to do than waiting */
 		__starpu_datawizard_progress(memnode, 1, 0);
 		__starpu_datawizard_progress(memnode, 1, 0);
+#ifdef STARPU_USE_NUMA
+		unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes();
+		for (i=0; i<nb_numa_nodes; i++)
+		{
+			unsigned id = _starpu_numaid_to_memnode(i);
+			__starpu_datawizard_progress(id, 1, 0);
+		}
+#else /* STARPU_USE_NUMA */
 		__starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 0);
 		__starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 0);
+#endif /* STARPU_USE_NUMA */
 		return 0;
 		return 0;
 	}
 	}
 #endif
 #endif
@@ -819,7 +828,16 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 	/* Something done, make some progress */
 	/* Something done, make some progress */
 	res = !idle;
 	res = !idle;
 	res |= __starpu_datawizard_progress(memnode, 1, 1);
 	res |= __starpu_datawizard_progress(memnode, 1, 1);
+#ifdef STARPU_USE_NUMA
+	unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes();
+	for (i=0; i<nb_numa_nodes; i++)
+	{
+		unsigned id = _starpu_numaid_to_memnode(i);
+		res |= __starpu_datawizard_progress(id, 1, 1);
+	}
+#else /* STARPU_USE_NUMA */
 	res |= __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 1);
 	res |= __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 1);
+#endif /* STARPU_USE_NUMA */
 
 
 	/* And pull tasks */
 	/* And pull tasks */
 	res |= _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, memnode);
 	res |= _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, memnode);

+ 9 - 0
src/drivers/mp_common/source_common.c

@@ -693,7 +693,16 @@ void _starpu_src_common_worker(struct _starpu_worker_set * worker_set,
 
 
 		_STARPU_TRACE_START_PROGRESS(memnode);
 		_STARPU_TRACE_START_PROGRESS(memnode);
 		res |= __starpu_datawizard_progress(memnode, 1, 1);
 		res |= __starpu_datawizard_progress(memnode, 1, 1);
+#ifdef STARPU_USE_NUMA
+		unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes(); /* must be called: the bare name is the function's address, not the node count */
+		for (i=0; i<nb_numa_nodes; i++)
+		{
+			unsigned id = _starpu_numaid_to_memnode(i); /* NUMA index -> memory node id */
+			res |= __starpu_datawizard_progress(id, 1, 1);
+		}
+#else /* STARPU_USE_NUMA */
 		res |= __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 1);
 		res |= __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 1);
+#endif /* STARPU_USE_NUMA */
 		_STARPU_TRACE_END_PROGRESS(memnode);
 		_STARPU_TRACE_END_PROGRESS(memnode);
 
 
 		/* Handle messages which have been stored */
 		/* Handle messages which have been stored */

+ 20 - 0
src/drivers/opencl/driver_opencl.c

@@ -757,14 +757,34 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
 	{
 	{
 		/* Not ready yet, no better thing to do than waiting */
 		/* Not ready yet, no better thing to do than waiting */
 		__starpu_datawizard_progress(memnode, 1, 0);
 		__starpu_datawizard_progress(memnode, 1, 0);
+#ifdef STARPU_USE_NUMA
+		unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes();
+		int i;
+		for (i=0; i<nb_numa_nodes; i++)
+		{
+			unsigned id = _starpu_numaid_to_memnode(i);
+			__starpu_datawizard_progress(id, 1, 0);
+		}
+#else /* STARPU_USE_NUMA */
 		__starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 0);
 		__starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 0);
+#endif /* STARPU_USE_NUMA */
 		return 0;
 		return 0;
 	}
 	}
 #endif
 #endif
 
 
 	res = !idle;
 	res = !idle;
 	res |= __starpu_datawizard_progress(memnode, 1, 1);
 	res |= __starpu_datawizard_progress(memnode, 1, 1);
+#ifdef STARPU_USE_NUMA
+	unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes();
+	int i;
+	for (i=0; i<nb_numa_nodes; i++)
+	{
+		unsigned id = _starpu_numaid_to_memnode(i);
+		res |= __starpu_datawizard_progress(id, 1, 1);
+	}
+#else /* STARPU_USE_NUMA */
 	res |= __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 1);
 	res |= __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 1);
+#endif /* STARPU_USE_NUMA */
 
 
 	task = _starpu_get_worker_task(worker, workerid, memnode);
 	task = _starpu_get_worker_task(worker, workerid, memnode);
 
 

+ 7 - 1
src/util/openmp_runtime_support.c

@@ -2418,8 +2418,14 @@ void starpu_omp_atomic_fallback_inline_end(void)
 
 
 void starpu_omp_vector_annotate(starpu_data_handle_t handle, uint32_t slice_base)
 void starpu_omp_vector_annotate(starpu_data_handle_t handle, uint32_t slice_base)
 {
 {
+	int node = STARPU_MAIN_RAM;
+#ifdef STARPU_USE_NUMA
+	node = handle->home_node;
+	if (node < 0 || (_starpu_node_get_kind(node) != STARPU_CPU_RAM))
+		node = STARPU_MAIN_RAM;
+#endif /* STARPU_USE_NUMA */
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+		starpu_data_get_interface_on_node(handle, node);
 	assert(vector_interface->id == STARPU_VECTOR_INTERFACE_ID);
 	assert(vector_interface->id == STARPU_VECTOR_INTERFACE_ID);
 	vector_interface->slice_base = slice_base;
 	vector_interface->slice_base = slice_base;
 }
 }