Browse Source

more fixes

Samuel Thibault 8 years ago
parent
commit
35ad7ce8b1

+ 1 - 1
configure.ac

@@ -167,7 +167,7 @@ AM_CONDITIONAL([STARPU_USE_MIC], [test "x$enable_mic" = "xyes"])
 AC_MSG_CHECKING(maximum number of NUMA nodes)
 AC_MSG_CHECKING(maximum number of NUMA nodes)
 AC_ARG_ENABLE(maxnumanodes, [AS_HELP_STRING([--enable-maxnumanodes=<number>],
 AC_ARG_ENABLE(maxnumanodes, [AS_HELP_STRING([--enable-maxnumanodes=<number>],
 			[maximum number of NUMA nodes])],
 			[maximum number of NUMA nodes])],
-			nmaxnumanodes=$enableval, nmaxnumanodes=4)
+			nmaxnumanodes=$enableval, nmaxnumanodes=2)
 AC_MSG_RESULT($nmaxnumanodes)
 AC_MSG_RESULT($nmaxnumanodes)
 AC_DEFINE_UNQUOTED(STARPU_MAXNUMANODES, [$nmaxnumanodes],
 AC_DEFINE_UNQUOTED(STARPU_MAXNUMANODES, [$nmaxnumanodes],
 		[maximum number of NUMA nodes])
 		[maximum number of NUMA nodes])

+ 7 - 3
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -177,9 +177,13 @@ structures of StarPU by describing the shape of your machine and/or your
 application at the configure step.
 application at the configure step.
 
 
 To reduce the memory footprint of the data internal structures of StarPU, one
 To reduce the memory footprint of the data internal structures of StarPU, one
-can set the \ref enable-maxcpus "--enable-maxcpus", \ref enable-maxcudadev
-"--enable-maxcudadev", \ref enable-maxopencldev "--enable-maxopencldev" and
-\ref enable-maxnodes "--enable-maxnodes" configure parameters to give StarPU
+can set the
+\ref enable-maxcpus "--enable-maxcpus",
+\ref enable-maxnumanodes "--enable-maxnumanodes",
+\ref enable-maxcudadev "--enable-maxcudadev",
+\ref enable-maxopencldev "--enable-maxopencldev" and
+\ref enable-maxnodes "--enable-maxnodes"
+configure parameters to give StarPU
 the architecture of the machine it will run on, thus tuning the size of the
 the architecture of the machine it will run on, thus tuning the size of the
 structures to the machine.
 structures to the machine.
 
 

+ 8 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -767,6 +767,14 @@ available to the application on each CPU device. Setting it enables allocation
 cache in main memory
 cache in main memory
 </dd>
 </dd>
 
 
+<dt>STARPU_LIMIT_CPU_NUMA_devid_MEM</dt>
+<dd>
+\anchor STARPU_LIMIT_CPU_NUMA_devid_MEM
+\addindex __env__STARPU_LIMIT_CPU_NUMA_devid_MEM
+This variable specifies the maximum number of megabytes that should be
+available to the application on the NUMA node with the OS identifier <c>devid</c>.
+</dd>
+
 <dt>STARPU_MINIMUM_AVAILABLE_MEM</dt>
 <dt>STARPU_MINIMUM_AVAILABLE_MEM</dt>
 <dd>
 <dd>
 \anchor STARPU_MINIMUM_AVAILABLE_MEM
 \anchor STARPU_MINIMUM_AVAILABLE_MEM

+ 8 - 0
doc/doxygen/chapters/510_configure_options.doxy

@@ -129,6 +129,14 @@ Use at most <c>count</c> CPU cores.  This information is then
 available as the macro ::STARPU_MAXCPUS.
 available as the macro ::STARPU_MAXCPUS.
 </dd>
 </dd>
 
 
+<dt>--enable-maxnumanodes=<c>count</c></dt>
+<dd>
+\anchor enable-maxnumanodes
+\addindex __configure__--enable-maxnumanodes
+Use at most <c>count</c> NUMA nodes.  This information is then
+available as the macro ::STARPU_MAXNUMANODES.
+</dd>
+
 <dt>--disable-cpu</dt>
 <dt>--disable-cpu</dt>
 <dd>
 <dd>
 \anchor disable-cpu
 \anchor disable-cpu

+ 10 - 3
src/core/topology.c

@@ -1610,9 +1610,15 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 		{
 		{
 			case STARPU_CPU_WORKER:
 			case STARPU_CPU_WORKER:
 			{
 			{
+				/* "dedicate" a cpu core to that worker */
 				workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
 				workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
 				int numaid = workerarg->numa_memory_node = _starpu_worker_numa_node(worker);
 				int numaid = workerarg->numa_memory_node = _starpu_worker_numa_node(worker);
-				/* "dedicate" a cpu core to that worker */
+				if (!numa_init[numaid] && nb_numa_nodes == STARPU_MAXNUMANODES-1)
+				{
+					_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
+					numaid = STARPU_MAIN_RAM;
+				}
+
 				if (numa_init[numaid])
 				if (numa_init[numaid])
 				{
 				{
 					memory_node = numa_memory_nodes[numaid];
 					memory_node = numa_memory_nodes[numaid];
@@ -2034,14 +2040,15 @@ static int _starpu_worker_numa_node(unsigned workerid)
 	struct _starpu_machine_topology *topology = &config->topology ;
 	struct _starpu_machine_topology *topology = &config->topology ;
 	hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->pu_depth, worker->bindid) ;
 	hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->pu_depth, worker->bindid) ;
 	STARPU_ASSERT(obj) ;
 	STARPU_ASSERT(obj) ;
-	while (obj->depth != HWLOC_OBJ_NODE)
+	while (obj->type != HWLOC_OBJ_NODE)
 	{
 	{
 		obj = obj->parent;
 		obj = obj->parent;
 
 
 		/* If we don't find a "node" obj before the root, this means
 		/* If we don't find a "node" obj before the root, this means
 		 * hwloc does not know whether there are numa nodes or not, so
 		 * hwloc does not know whether there are numa nodes or not, so
 		 * we should not use a per-node sampling in that case. */
 		 * we should not use a per-node sampling in that case. */
-		STARPU_ASSERT(obj);
+		if (!obj)
+			return STARPU_MAIN_RAM;
 	}
 	}
 	STARPU_ASSERT(obj->depth == HWLOC_OBJ_NODE);
 	STARPU_ASSERT(obj->depth == HWLOC_OBJ_NODE);
 	return obj->logical_index;
 	return obj->logical_index;

+ 2 - 2
src/core/workers.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2016  Uppsala University
  * Copyright (C) 2016  Uppsala University
@@ -76,7 +76,7 @@ LIST_TYPE(_starpu_worker,
 	starpu_pthread_cond_t started_cond; /* indicate when the worker is ready */
 	starpu_pthread_cond_t started_cond; /* indicate when the worker is ready */
 	starpu_pthread_cond_t ready_cond; /* indicate when the worker is ready */
 	starpu_pthread_cond_t ready_cond; /* indicate when the worker is ready */
 	unsigned memory_node; /* which memory node is the worker associated with ? */
 	unsigned memory_node; /* which memory node is the worker associated with ? */
-	unsigned numa_memory_node; /* which numa memory node is the worker associated with ? */
+	unsigned numa_memory_node; /* which numa memory node is the worker associated with? (logical index) */
 	starpu_pthread_cond_t sched_cond; /* condition variable used when the worker waits for tasks. */
 	starpu_pthread_cond_t sched_cond; /* condition variable used when the worker waits for tasks. */
         starpu_pthread_mutex_t sched_mutex; /* mutex protecting sched_cond */
         starpu_pthread_mutex_t sched_mutex; /* mutex protecting sched_cond */
 	struct starpu_task_list local_tasks; /* this queue contains tasks that have been explicitly submitted to that queue */
 	struct starpu_task_list local_tasks; /* this queue contains tasks that have been explicitly submitted to that queue */

+ 2 - 1
src/datawizard/datawizard.c

@@ -70,7 +70,8 @@ void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)
 
 
 int __starpu_datawizard_progress_ram(unsigned may_alloc, unsigned push_requests)
 int __starpu_datawizard_progress_ram(unsigned may_alloc, unsigned push_requests)
 {
 {
-	int res = 0, i;
+	int res = 0;
+	unsigned i;
 	unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes();
 	unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes();
 	for (i=0; i<nb_numa_nodes; i++)
 	for (i=0; i<nb_numa_nodes; i++)
 	{
 	{

+ 2 - 8
src/datawizard/interfaces/bcsr_interface.c

@@ -216,12 +216,9 @@ uintptr_t starpu_bcsr_get_local_nzval(starpu_data_handle_t handle)
 
 
 uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 {
 {
-	int node = STARPU_MAIN_RAM;
-#ifdef STARPU_USE_NUMA
-	node = handle->home_node;
+	int node = handle->home_node;
 	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
 	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
 		node = STARPU_MAIN_RAM;
 		node = STARPU_MAIN_RAM;
-#endif /* STARPU_USE_NUMA */
 
 
 	/* XXX 0 */
 	/* XXX 0 */
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
@@ -232,12 +229,9 @@ uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 
 
 uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
 uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
 {
 {
-	int node = STARPU_MAIN_RAM;
-#ifdef STARPU_USE_NUMA
-	node = handle->home_node;
+	int node = handle->home_node;
 	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
 	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
 		node = STARPU_MAIN_RAM;
 		node = STARPU_MAIN_RAM;
-#endif /* STARPU_USE_NUMA */
 
 
 	/* XXX 0 */
 	/* XXX 0 */
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)

+ 1 - 7
src/datawizard/interfaces/data_interface.c

@@ -515,12 +515,9 @@ struct starpu_data_interface_ops* starpu_data_get_interface_ops(starpu_data_hand
 
 
 void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle)
 void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle)
 {
 {
-	int node = STARPU_MAIN_RAM;
-#ifdef STARPU_USE_NUMA
-	node = handle->home_node;
+	int node = handle->home_node;
 	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
 	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
 		node = STARPU_MAIN_RAM;
 		node = STARPU_MAIN_RAM;
-#endif /* STARPU_USE_NUMA */
 
 
 	const void *ram_ptr = starpu_data_handle_to_pointer(handle, node);
 	const void *ram_ptr = starpu_data_handle_to_pointer(handle, node);
 #ifdef STARPU_OPENMP
 #ifdef STARPU_OPENMP
@@ -758,12 +755,9 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 			_STARPU_DEBUG("Conversion needed\n");
 			_STARPU_DEBUG("Conversion needed\n");
 			void *buffers[1];
 			void *buffers[1];
 			struct starpu_multiformat_interface *format_interface;
 			struct starpu_multiformat_interface *format_interface;
-			home_node = STARPU_MAIN_RAM;
-#ifdef STARPU_USE_NUMA
 			home_node = handle->home_node;
 			home_node = handle->home_node;
 			if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
 			if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
 				home_node = STARPU_MAIN_RAM;
 				home_node = STARPU_MAIN_RAM;
-#endif /* STARPU_USE_NUMA */
 			format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, home_node);
 			format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, home_node);
 			struct starpu_codelet *cl = NULL;
 			struct starpu_codelet *cl = NULL;
 			enum starpu_node_kind node_kind = starpu_node_get_kind(handle->mf_node);
 			enum starpu_node_kind node_kind = starpu_node_get_kind(handle->mf_node);

+ 1 - 1
src/datawizard/malloc.c

@@ -298,7 +298,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 		hwloc_obj_t numa_node_obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NODE, _starpu_memnode_to_numaid(dst_node));
 		hwloc_obj_t numa_node_obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NODE, _starpu_memnode_to_numaid(dst_node));
 		hwloc_bitmap_t nodeset = numa_node_obj->nodeset;
 		hwloc_bitmap_t nodeset = numa_node_obj->nodeset;
 		*A = hwloc_alloc_membind_nodeset(hwtopology, dim, nodeset, HWLOC_MEMBIND_BIND | HWLOC_MEMBIND_NOCPUBIND, flags);
 		*A = hwloc_alloc_membind_nodeset(hwtopology, dim, nodeset, HWLOC_MEMBIND_BIND | HWLOC_MEMBIND_NOCPUBIND, flags);
-		fprintf(stderr, "Allocation %lu bytes on NUMA node %d [%p]\n", (unsigned long) dim, _starpu_memnode_to_numaid(dst_node), *A);
+		//fprintf(stderr, "Allocation %lu bytes on NUMA node %d [%p]\n", (unsigned long) dim, _starpu_memnode_to_numaid(dst_node), *A);
 		if (!*A)
 		if (!*A)
 			ret = -ENOMEM;
 			ret = -ENOMEM;
 	}
 	}

+ 5 - 2
src/datawizard/memalloc.c

@@ -1597,6 +1597,9 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 	return target;
 	return target;
 }
 }
 
 
+#ifdef STARPU_DEVEL
+#  warning TODO: better choose NUMA node
+#endif
 
 
 static unsigned
 static unsigned
 choose_target(starpu_data_handle_t handle, unsigned node)
 choose_target(starpu_data_handle_t handle, unsigned node)
@@ -1607,7 +1610,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 		/* try to push to RAM if we can before pushing to disk */
 		/* try to push to RAM if we can before pushing to disk */
 		if(starpu_node_get_kind(handle->home_node) == STARPU_DISK_RAM && (starpu_node_get_kind(node) != STARPU_CPU_RAM))
 		if(starpu_node_get_kind(handle->home_node) == STARPU_DISK_RAM && (starpu_node_get_kind(node) != STARPU_CPU_RAM))
 		{
 		{
- 	                int i;
+ 	                unsigned i;
 			unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes();
 			unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes();
 			for (i=0; i<nb_numa_nodes; i++)
 			for (i=0; i<nb_numa_nodes; i++)
 			{
 			{
@@ -1640,7 +1643,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 		} else {
 		} else {
 		/* node != 0 */
 		/* node != 0 */
 		/* try to push data to RAM if we can before to push on disk*/
 		/* try to push data to RAM if we can before pushing to disk */
 		/* try to push data to RAM if we can before pushing to disk */
+			unsigned i;
 			unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes();
 			unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes();
 			for (i=0; i<nb_numa_nodes; i++)
 			for (i=0; i<nb_numa_nodes; i++)
 			{
 			{

+ 11 - 11
src/drivers/cpu/driver_cpu.c

@@ -166,26 +166,23 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 static size_t _starpu_cpu_get_global_mem_size(int nodeid STARPU_ATTRIBUTE_UNUSED, struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
 static size_t _starpu_cpu_get_global_mem_size(int nodeid STARPU_ATTRIBUTE_UNUSED, struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
 {
 {
 	size_t global_mem;
 	size_t global_mem;
-	starpu_ssize_t limit;
+	starpu_ssize_t limit = -1;
 
 
 	char name[32];
 	char name[32];
 
 
-	/* FIXME: do we want logical or physical? */
-	sprintf(name, "STARPU_LIMIT_CPU_NUMA%d_MEM", nodeid);
-	limit = starpu_get_env_number(name);
-	if (limit == -1)
-		limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
-
 #if defined(STARPU_HAVE_HWLOC)
 #if defined(STARPU_HAVE_HWLOC)
 	struct _starpu_machine_topology *topology = &config->topology;
 	struct _starpu_machine_topology *topology = &config->topology;
-
 #ifdef STARPU_USE_NUMA
 #ifdef STARPU_USE_NUMA
         int depth_node = hwloc_get_type_depth(topology->hwtopology, HWLOC_OBJ_NODE);
         int depth_node = hwloc_get_type_depth(topology->hwtopology, HWLOC_OBJ_NODE);
 
 
 	if (depth_node == HWLOC_TYPE_DEPTH_UNKNOWN)
 	if (depth_node == HWLOC_TYPE_DEPTH_UNKNOWN)
 	     global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
 	     global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
-	else
-	     global_mem = hwloc_get_obj_by_depth(topology->hwtopology, depth_node, nodeid)->memory.local_memory;
+	else {
+	     hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, depth_node, nodeid);
+	     global_mem = obj->memory.local_memory;
+	     sprintf(name, "STARPU_LIMIT_CPU_NUMA_%d_MEM", obj->os_index);
+	     limit = starpu_get_env_number(name);
+	}
 #else /* STARPU_USE_NUMA */
 #else /* STARPU_USE_NUMA */
 	/* Do not limit ourself to a single NUMA node */
 	/* Do not limit ourselves to a single NUMA node */
 	/* Do not limit ourselves to a single NUMA node */
 	global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
@@ -193,11 +190,14 @@ static size_t _starpu_cpu_get_global_mem_size(int nodeid STARPU_ATTRIBUTE_UNUSED
 
 
 #else /* STARPU_HAVE_HWLOC */
 #else /* STARPU_HAVE_HWLOC */
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
-#  warning use sysinfo when available to get global size
+#  warning TODO: use sysinfo when available to get global size
 #endif
 #endif
 	global_mem = 0;
 	global_mem = 0;
 #endif
 #endif
 
 
+	if (limit == -1)
+		limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
+
 	if (limit < 0)
 	if (limit < 0)
 		// No limit is defined, we return the global memory size
 		// No limit is defined, we return the global memory size
 		return global_mem;
 		return global_mem;

+ 2 - 5
src/util/openmp_runtime_support.c

@@ -2418,13 +2418,10 @@ void starpu_omp_atomic_fallback_inline_end(void)
 
 
 void starpu_omp_vector_annotate(starpu_data_handle_t handle, uint32_t slice_base)
 void starpu_omp_vector_annotate(starpu_data_handle_t handle, uint32_t slice_base)
 {
 {
-	/* FIXME Oli? */
-	int node = STARPU_MAIN_RAM;
-#ifdef STARPU_USE_NUMA
-	node = handle->home_node;
+	/* FIXME Oli: rather iterate over all nodes? */
+	int node = handle->home_node;
 	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
 	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
 		node = STARPU_MAIN_RAM;
 		node = STARPU_MAIN_RAM;
-#endif /* STARPU_USE_NUMA */
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 		starpu_data_get_interface_on_node(handle, node);
 	assert(vector_interface->id == STARPU_VECTOR_INTERFACE_ID);
 	assert(vector_interface->id == STARPU_VECTOR_INTERFACE_ID);