Browse Source

more fixes

Samuel Thibault 8 years ago
parent
commit
35ad7ce8b1

+ 1 - 1
configure.ac

@@ -167,7 +167,7 @@ AM_CONDITIONAL([STARPU_USE_MIC], [test "x$enable_mic" = "xyes"])
 AC_MSG_CHECKING(maximum number of NUMA nodes)
 AC_ARG_ENABLE(maxnumanodes, [AS_HELP_STRING([--enable-maxnumanodes=<number>],
 			[maximum number of NUMA nodes])],
-			nmaxnumanodes=$enableval, nmaxnumanodes=4)
+			nmaxnumanodes=$enableval, nmaxnumanodes=2)
 AC_MSG_RESULT($nmaxnumanodes)
 AC_DEFINE_UNQUOTED(STARPU_MAXNUMANODES, [$nmaxnumanodes],
 		[maximum number of NUMA nodes])

+ 7 - 3
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -177,9 +177,13 @@ structures of StarPU by describing the shape of your machine and/or your
 application at the configure step.
 
 To reduce the memory footprint of the data internal structures of StarPU, one
-can set the \ref enable-maxcpus "--enable-maxcpus", \ref enable-maxcudadev
-"--enable-maxcudadev", \ref enable-maxopencldev "--enable-maxopencldev" and
-\ref enable-maxnodes "--enable-maxnodes" configure parameters to give StarPU
+can set the
+\ref enable-maxcpus "--enable-maxcpus",
+\ref enable-maxnumanodes "--enable-maxnumanodes",
+\ref enable-maxcudadev "--enable-maxcudadev",
+\ref enable-maxopencldev "--enable-maxopencldev" and
+\ref enable-maxnodes "--enable-maxnodes"
+configure parameters to give StarPU
 the architecture of the machine it will run on, thus tuning the size of the
 structures to the machine.
 

+ 8 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -767,6 +767,14 @@ available to the application on each CPU device. Setting it enables allocation
 cache in main memory
 </dd>
 
+<dt>STARPU_LIMIT_CPU_NUMA_devid_MEM</dt>
+<dd>
+\anchor STARPU_LIMIT_CPU_NUMA_devid_MEM
+\addindex __env__STARPU_LIMIT_CPU_NUMA_devid_MEM
+This variable specifies the maximum number of megabytes that should be available to the
+application on the NUMA node with the OS identifier <c>devid</c>. If unset, STARPU_LIMIT_CPU_MEM is used.
+</dd>
+
 <dt>STARPU_MINIMUM_AVAILABLE_MEM</dt>
 <dd>
 \anchor STARPU_MINIMUM_AVAILABLE_MEM

+ 8 - 0
doc/doxygen/chapters/510_configure_options.doxy

@@ -129,6 +129,14 @@ Use at most <c>count</c> CPU cores.  This information is then
 available as the macro ::STARPU_MAXCPUS.
 </dd>
 
+<dt>--enable-maxnumanodes=<c>count</c></dt>
+<dd>
+\anchor enable-maxnumanodes
+\addindex __configure__--enable-maxnumanodes
+Use at most <c>count</c> NUMA nodes.  This information is then
+available as the macro ::STARPU_MAXNUMANODES.
+</dd>
+
 <dt>--disable-cpu</dt>
 <dd>
 \anchor disable-cpu

+ 10 - 3
src/core/topology.c

@@ -1610,9 +1610,15 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 		{
 			case STARPU_CPU_WORKER:
 			{
+				/* "dedicate" a cpu core to that worker */
 				workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
 				int numaid = workerarg->numa_memory_node = _starpu_worker_numa_node(worker);
-				/* "dedicate" a cpu core to that worker */
+				if (!numa_init[numaid] && nb_numa_nodes == STARPU_MAXNUMANODES-1)
+				{
+					_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
+					numaid = STARPU_MAIN_RAM;
+				}
+
 				if (numa_init[numaid])
 				{
 					memory_node = numa_memory_nodes[numaid];
@@ -2034,14 +2040,15 @@ static int _starpu_worker_numa_node(unsigned workerid)
 	struct _starpu_machine_topology *topology = &config->topology ;
 	hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->pu_depth, worker->bindid) ;
 	STARPU_ASSERT(obj) ;
-	while (obj->depth != HWLOC_OBJ_NODE)
+	while (obj->type != HWLOC_OBJ_NODE)
 	{
 		obj = obj->parent;
 
 		/* If we don't find a "node" obj before the root, this means
 		 * hwloc does not know whether there are numa nodes or not, so
 		 * we should not use a per-node sampling in that case. */
-		STARPU_ASSERT(obj);
+		if (!obj)
+			return STARPU_MAIN_RAM;
 	}
 	STARPU_ASSERT(obj->depth == HWLOC_OBJ_NODE);
 	return obj->logical_index;

+ 2 - 2
src/core/workers.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2016  Uppsala University
@@ -76,7 +76,7 @@ LIST_TYPE(_starpu_worker,
 	starpu_pthread_cond_t started_cond; /* indicate when the worker is ready */
 	starpu_pthread_cond_t ready_cond; /* indicate when the worker is ready */
 	unsigned memory_node; /* which memory node is the worker associated with ? */
-	unsigned numa_memory_node; /* which numa memory node is the worker associated with ? */
+	unsigned numa_memory_node; /* which numa memory node is the worker associated with? (logical index) */
 	starpu_pthread_cond_t sched_cond; /* condition variable used when the worker waits for tasks. */
         starpu_pthread_mutex_t sched_mutex; /* mutex protecting sched_cond */
 	struct starpu_task_list local_tasks; /* this queue contains tasks that have been explicitely submitted to that queue */

+ 2 - 1
src/datawizard/datawizard.c

@@ -70,7 +70,8 @@ void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)
 
 int __starpu_datawizard_progress_ram(unsigned may_alloc, unsigned push_requests)
 {
-	int res = 0, i;
+	int res = 0;
+	unsigned i;
 	unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes();
 	for (i=0; i<nb_numa_nodes; i++)
 	{

+ 2 - 8
src/datawizard/interfaces/bcsr_interface.c

@@ -216,12 +216,9 @@ uintptr_t starpu_bcsr_get_local_nzval(starpu_data_handle_t handle)
 
 uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 {
-	int node = STARPU_MAIN_RAM;
-#ifdef STARPU_USE_NUMA
-	node = handle->home_node;
+	int node = handle->home_node;
 	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
 		node = STARPU_MAIN_RAM;
-#endif /* STARPU_USE_NUMA */
 
 	/* XXX 0 */
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
@@ -232,12 +229,9 @@ uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 
 uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
 {
-	int node = STARPU_MAIN_RAM;
-#ifdef STARPU_USE_NUMA
-	node = handle->home_node;
+	int node = handle->home_node;
 	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
 		node = STARPU_MAIN_RAM;
-#endif /* STARPU_USE_NUMA */
 
 	/* XXX 0 */
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)

+ 1 - 7
src/datawizard/interfaces/data_interface.c

@@ -515,12 +515,9 @@ struct starpu_data_interface_ops* starpu_data_get_interface_ops(starpu_data_hand
 
 void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle)
 {
-	int node = STARPU_MAIN_RAM;
-#ifdef STARPU_USE_NUMA
-	node = handle->home_node;
+	int node = handle->home_node;
 	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
 		node = STARPU_MAIN_RAM;
-#endif /* STARPU_USE_NUMA */
 
 	const void *ram_ptr = starpu_data_handle_to_pointer(handle, node);
 #ifdef STARPU_OPENMP
@@ -758,12 +755,9 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 			_STARPU_DEBUG("Conversion needed\n");
 			void *buffers[1];
 			struct starpu_multiformat_interface *format_interface;
-			home_node = STARPU_MAIN_RAM;
-#ifdef STARPU_USE_NUMA
 			home_node = handle->home_node;
 			if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
 				home_node = STARPU_MAIN_RAM;
-#endif /* STARPU_USE_NUMA */
 			format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, home_node);
 			struct starpu_codelet *cl = NULL;
 			enum starpu_node_kind node_kind = starpu_node_get_kind(handle->mf_node);

+ 1 - 1
src/datawizard/malloc.c

@@ -298,7 +298,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 		hwloc_obj_t numa_node_obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NODE, _starpu_memnode_to_numaid(dst_node));
 		hwloc_bitmap_t nodeset = numa_node_obj->nodeset;
 		*A = hwloc_alloc_membind_nodeset(hwtopology, dim, nodeset, HWLOC_MEMBIND_BIND | HWLOC_MEMBIND_NOCPUBIND, flags);
-		fprintf(stderr, "Allocation %lu bytes on NUMA node %d [%p]\n", (unsigned long) dim, _starpu_memnode_to_numaid(dst_node), *A);
+		//fprintf(stderr, "Allocation %lu bytes on NUMA node %d [%p]\n", (unsigned long) dim, _starpu_memnode_to_numaid(dst_node), *A);
 		if (!*A)
 			ret = -ENOMEM;
 	}

+ 5 - 2
src/datawizard/memalloc.c

@@ -1597,6 +1597,9 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 	return target;
 }
 
+#ifdef STARPU_DEVEL
+#  warning TODO: better choose NUMA node
+#endif
 
 static unsigned
 choose_target(starpu_data_handle_t handle, unsigned node)
@@ -1607,7 +1610,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 		/* try to push on RAM if we can before to push on disk */
 		if(starpu_node_get_kind(handle->home_node) == STARPU_DISK_RAM && (starpu_node_get_kind(node) != STARPU_CPU_RAM))
 		{
- 	                int i;
+ 	                unsigned i;
 			unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes();
 			for (i=0; i<nb_numa_nodes; i++)
 			{
@@ -1640,7 +1643,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 		} else {
 		/* node != 0 */
 		/* try to push data to RAM if we can before to push on disk*/
-			int i;
+			unsigned i;
 			unsigned nb_numa_nodes = _starpu_get_nb_numa_nodes();
 			for (i=0; i<nb_numa_nodes; i++)
 			{

+ 11 - 11
src/drivers/cpu/driver_cpu.c

@@ -166,26 +166,23 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 static size_t _starpu_cpu_get_global_mem_size(int nodeid STARPU_ATTRIBUTE_UNUSED, struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
 {
 	size_t global_mem;
-	starpu_ssize_t limit;
+	starpu_ssize_t limit = -1;
 
 	char name[32];
 
-	/* FIXME: do we want logical or physical? */
-	sprintf(name, "STARPU_LIMIT_CPU_NUMA%d_MEM", nodeid);
-	limit = starpu_get_env_number(name);
-	if (limit == -1)
-		limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
-
 #if defined(STARPU_HAVE_HWLOC)
 	struct _starpu_machine_topology *topology = &config->topology;
-
 #ifdef STARPU_USE_NUMA
         int depth_node = hwloc_get_type_depth(topology->hwtopology, HWLOC_OBJ_NODE);
 
 	if (depth_node == HWLOC_TYPE_DEPTH_UNKNOWN)
 	     global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
-	else
-	     global_mem = hwloc_get_obj_by_depth(topology->hwtopology, depth_node, nodeid)->memory.local_memory;
+	else {
+	     hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, depth_node, nodeid);
+	     global_mem = obj->memory.local_memory;
+	     sprintf(name, "STARPU_LIMIT_CPU_NUMA_%d_MEM", obj->os_index);
+	     limit = starpu_get_env_number(name);
+	}
 #else /* STARPU_USE_NUMA */
 	/* Do not limit ourself to a single NUMA node */
 	global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
@@ -193,11 +190,14 @@ static size_t _starpu_cpu_get_global_mem_size(int nodeid STARPU_ATTRIBUTE_UNUSED
 
 #else /* STARPU_HAVE_HWLOC */
 #ifdef STARPU_DEVEL
-#  warning use sysinfo when available to get global size
+#  warning TODO: use sysinfo when available to get global size
 #endif
 	global_mem = 0;
 #endif
 
+	if (limit == -1)
+		limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
+
 	if (limit < 0)
 		// No limit is defined, we return the global memory size
 		return global_mem;

+ 2 - 5
src/util/openmp_runtime_support.c

@@ -2418,13 +2418,10 @@ void starpu_omp_atomic_fallback_inline_end(void)
 
 void starpu_omp_vector_annotate(starpu_data_handle_t handle, uint32_t slice_base)
 {
-	/* FIXME Oli? */
-	int node = STARPU_MAIN_RAM;
-#ifdef STARPU_USE_NUMA
-	node = handle->home_node;
+	/* FIXME Oli: rather iterate over all nodes? */
+	int node = handle->home_node;
 	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
 		node = STARPU_MAIN_RAM;
-#endif /* STARPU_USE_NUMA */
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 	assert(vector_interface->id == STARPU_VECTOR_INTERFACE_ID);