Browse Source

make clusters safer, fix memleaks and fix a problem with 0 or 1 CPU

Terry Cojean 9 years ago
parent
commit
c553666440

+ 1 - 1
include/starpu_clusters_util.h

@@ -69,7 +69,7 @@ typedef struct starpu_cluster_machine
 
 struct starpu_cluster_machine* starpu_cluster_machine(hwloc_obj_type_t cluster_level, ...);
 int starpu_uncluster_machine(struct starpu_cluster_machine* clusters);
-void starpu_cluster_print(struct starpu_cluster_machine* clusters);
+int starpu_cluster_print(struct starpu_cluster_machine* clusters);
 
 /* Prologue functions */
 void starpu_openmp_prologue(void * sched_ctx_id);

+ 44 - 22
src/util/starpu_clusters_create.c

@@ -99,7 +99,11 @@ struct starpu_cluster_machine *starpu_cluster_machine(hwloc_obj_type_t cluster_l
 	struct starpu_cluster_machine *machine = malloc(sizeof(struct starpu_cluster_machine));
 
 	machine->params = malloc(sizeof(struct _starpu_cluster_parameters));
+	machine->id = STARPU_NMAX_SCHED_CTXS;
 	machine->groups = _starpu_cluster_group_list_new();
+	machine->nclusters = 0;
+	machine->ngroups = 0;
+	machine->topology = NULL;
 
 	_starpu_cluster_init_parameters(machine->params);
 	params = machine->params;
@@ -205,15 +209,24 @@ struct starpu_cluster_machine *starpu_cluster_machine(hwloc_obj_type_t cluster_l
 			break;
 	}
 
-	_starpu_cluster_machine(cluster_level, machine);
+	if (_starpu_cluster_machine(cluster_level, machine) == -ENODEV)
+	{
+		starpu_uncluster_machine(machine);
+		machine = NULL;
+	}
+
 	return machine;
 }
 
 int starpu_uncluster_machine(struct starpu_cluster_machine *machine)
 {
+	if (machine == NULL)
+		return -1;
 	struct _starpu_cluster_group *g, *tmp;
 	struct _starpu_cluster_group_list *group_list = machine->groups;
-	starpu_sched_ctx_delete(machine->id);
+
+	if (machine->id != STARPU_NMAX_SCHED_CTXS)
+		starpu_sched_ctx_delete(machine->id);
 	g = _starpu_cluster_group_list_begin(group_list);
 	while (g != _starpu_cluster_group_list_end(group_list))
 	{
@@ -221,7 +234,8 @@ int starpu_uncluster_machine(struct starpu_cluster_machine *machine)
 		g = _starpu_cluster_group_list_next(g);
 		_starpu_cluster_group_remove(group_list, tmp);
 	}
-	hwloc_topology_destroy(machine->topology);
+	if (machine->topology != NULL)
+		hwloc_topology_destroy(machine->topology);
 	free(machine->params);
 	free(machine);
 	starpu_sched_ctx_set_context(0);
@@ -229,8 +243,11 @@ int starpu_uncluster_machine(struct starpu_cluster_machine *machine)
 	return 0;
 }
 
-void starpu_cluster_print(struct starpu_cluster_machine *clusters)
+int starpu_cluster_print(struct starpu_cluster_machine *clusters)
 {
+	if (clusters == NULL)
+		return -1;
+
 	int cnt, w;
 	struct _starpu_cluster_group *group;
 	struct _starpu_cluster *cluster;
@@ -253,7 +270,7 @@ void starpu_cluster_print(struct starpu_cluster_machine *clusters)
 		}
 	}
 
-	return;
+	return 0;
 }
 
 void _starpu_cluster_create(struct _starpu_cluster *cluster)
@@ -283,6 +300,8 @@ void _starpu_cluster_group_create(struct _starpu_cluster_group *group)
 	     c != _starpu_cluster_list_end(group->clusters) ;
 	     c = _starpu_cluster_list_next(c))
 	{
+		if (c->ncores == 0)
+			continue;
 		_starpu_cluster_create(c);
 		if (!c->params->awake_workers)
 			_starpu_cluster_bind(c);
@@ -330,7 +349,7 @@ void _starpu_cluster_group_init(struct _starpu_cluster_group *group,
 void _starpu_cluster_init(struct _starpu_cluster *cluster,
 			  struct _starpu_cluster_group *father)
 {
-	cluster->id = 0;
+	cluster->id = STARPU_NMAX_SCHED_CTXS;
 	cluster->cpuset = hwloc_bitmap_alloc();
 	cluster->ncores = 0;
 	cluster->cores = NULL;
@@ -373,9 +392,12 @@ int _starpu_cluster_group_remove(struct _starpu_cluster_group_list *group_list,
 		c = _starpu_cluster_list_next(c);
 		_starpu_cluster_remove(cluster_list, tmp);
 	}
+	_starpu_cluster_list_delete(cluster_list);
+
 	free(group->params);
 	_starpu_cluster_group_list_erase(group_list, group);
 	_starpu_cluster_group_delete(group);
+	_starpu_cluster_group_list_delete(group_list);
 
 	return 0;
 }
@@ -421,16 +443,16 @@ int _starpu_cluster_analyze_parameters(struct _starpu_cluster_parameters *params
 	int nb_clusters = 1, j;
 	if (params->nb)
 	{
-		nb_clusters = params->nb;
+		nb_clusters = params->nb <= npus?params->nb : npus;
 	}
 	else if (params->min_nb && params->max_nb)
 	{
 		if (!params->keep_homogeneous)
 		{
 			if (params->prefere_min)
-				nb_clusters = params->min_nb;
+				nb_clusters = params->min_nb <= npus? params->min_nb : npus;
 			else
-				nb_clusters = params->max_nb;
+				nb_clusters = params->max_nb <= npus? params->max_nb : npus;
 		}
 		else
 		{
@@ -463,12 +485,14 @@ int _starpu_cluster_analyze_parameters(struct _starpu_cluster_parameters *params
 	return nb_clusters;
 }
 
-void _starpu_cluster_machine(hwloc_obj_type_t cluster_level,
+int _starpu_cluster_machine(hwloc_obj_type_t cluster_level,
 			     struct starpu_cluster_machine *machine)
 {
 	struct _starpu_cluster_group *g;
+	int ret = 0;
 
-	_starpu_cluster_topology(cluster_level, machine);
+	if ((ret = _starpu_cluster_topology(cluster_level, machine)))
+		return ret;
 
 	if (machine->params->sched_policy_struct != NULL)
 	{
@@ -503,16 +527,21 @@ void _starpu_cluster_machine(hwloc_obj_type_t cluster_level,
 	starpu_task_wait_for_all();
 	starpu_sched_ctx_set_context(&machine->id);
 
-	return;
+	return ret;
 }
 
-void _starpu_cluster_topology(hwloc_obj_type_t cluster_level,
+int _starpu_cluster_topology(hwloc_obj_type_t cluster_level,
 			      struct starpu_cluster_machine *machine)
 {
 	int w;
 	hwloc_topology_t topology;
 	hwloc_cpuset_t avail_cpus;
-	char *buf;
+
+	int nworkers = starpu_worker_get_count_by_type(STARPU_CPU_WORKER);
+	if (nworkers == 0)
+		return -ENODEV;
+	int *workers = (int*) malloc(sizeof(int) * nworkers);
+	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, workers, nworkers);
 
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	STARPU_ASSERT_MSG(config->topology.hwtopology != NULL, "STARPU_CLUSTER: You "
@@ -522,16 +551,11 @@ void _starpu_cluster_topology(hwloc_obj_type_t cluster_level,
 	avail_cpus = hwloc_bitmap_alloc();
 	hwloc_bitmap_zero(avail_cpus);
 
-	int nworkers = starpu_worker_get_count_by_type(STARPU_CPU_WORKER);
-	int *workers = (int*) malloc(sizeof(int) * nworkers);
-	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, workers, nworkers);
-
 	for (w = 0; w < nworkers ; w++)
 	{
 		struct _starpu_worker *worker_str = _starpu_get_worker_struct(workers[w]);
 		hwloc_bitmap_or(avail_cpus, avail_cpus, worker_str->hwloc_cpu_set);
 	}
-	hwloc_bitmap_list_asprintf(&buf, avail_cpus);
 
 	hwloc_topology_restrict(topology, avail_cpus, 0);
 	free(workers);
@@ -542,7 +566,7 @@ void _starpu_cluster_topology(hwloc_obj_type_t cluster_level,
 
 	hwloc_bitmap_free(avail_cpus);
 
-	return;
+	return 0;
 }
 
 void _starpu_cluster_group(hwloc_obj_type_t cluster_level,
@@ -587,7 +611,6 @@ void _starpu_cluster(struct _starpu_cluster_group *group)
 {
 	int i, avail_pus, npus, npreset=0;
 	struct _starpu_cluster *cluster;
-	char *buf;
 	npus = hwloc_get_nbobjs_inside_cpuset_by_type(group->father->topology,
 						      group->group_obj->cpuset,
 						      HWLOC_OBJ_PU);
@@ -647,7 +670,6 @@ void _starpu_cluster(struct _starpu_cluster_group *group)
 	cluster = _starpu_cluster_list_begin(group->clusters);
 	int count = 0;
 	static int starpu_cluster_warned = 0;
-	hwloc_bitmap_list_asprintf(&buf, group->group_obj->cpuset);
 
 	for (i=0 ; i<npus ; i++)
 	{

+ 2 - 2
src/util/starpu_clusters_create.h

@@ -68,9 +68,9 @@ LIST_TYPE(_starpu_cluster,
 
 
 /* Machine discovery and cluster creation main funcitons */
-void _starpu_cluster_machine(hwloc_obj_type_t cluster_level,
+int _starpu_cluster_machine(hwloc_obj_type_t cluster_level,
 			     struct starpu_cluster_machine* machine);
-void _starpu_cluster_topology(hwloc_obj_type_t cluster_level,
+int _starpu_cluster_topology(hwloc_obj_type_t cluster_level,
 			      struct starpu_cluster_machine* machine);
 void _starpu_cluster_group(hwloc_obj_type_t cluster_level,
 			   struct starpu_cluster_machine* machine);