Andra Hugo 12 éve
szülő
commit
36ea3c0758
2 módosított fájl, 135 hozzáadás és 526 törlés
  1. 2 2
      src/common/fxt.h
  2. 133 524
      src/sched_policies/detect_combined_workers.c

+ 2 - 2
src/common/fxt.h

@@ -205,10 +205,10 @@ do {									\
 	if (model_name)                                                 \
 	{								\
 		/* we include the symbol name */			\
-		_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, syscall(SYS_gettid), 1, model_name); \
+		_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, _starpu_gettid(), 1, model_name); \
 	}								\
 	else {                                                          \
-		FUT_DO_PROBE4(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, syscall(SYS_gettid), 0); \
+		FUT_DO_PROBE4(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, _starpu_gettid(), 0); \
 	}								\
 } while(0);
 

+ 133 - 524
src/sched_policies/detect_combined_workers.c

@@ -25,479 +25,76 @@
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>
 
-#if 0
-/* struct _starpu_tree
- * ==================
- * Purpose
- * =======
- * Structure representing a tree (which can be a sub-tree itself) whose root is an hwloc
- * object and storing every workers it contained in every sub-trees by recursion.
- *
- * Fields
- * ======
- * obj			A hwloc object which can be a root or a leaf, it may be a numa node, a cache memory or a CPU, etc...
- *
- * nb_workers		Number of CPU workers which can be found by recursion in all the sub-trees beneath this one
- 			or in this very object.
- *
- * workers		CPU-workers found by recursion in all the sub-trees and in this very one, represented as leaves in hwloc.
- */
-
-struct _starpu_tree
-{
-    hwloc_obj_t obj;
-    unsigned nb_workers;
-    int *workers;
-};
-
-/* gather_trees
- * ============
- * Purpose
- * =======
- * Gather all the workers of every source tree in one target tree.
- * We assume the target array of workers is big enough to contain all the workers.
- *
- * Arguments
- * =========
- * target_tree		(input, output)
- *			Pointer to the tree which will contain all the workers of every source.
- *
- * source_trees		(input)
- *			Array of trees we want to combine in a unique tree.
- *
- * nb_source_trees	(input)
- *			Number of trees we want to combine (size of the array).
- */
-
-static void gather_trees(struct _starpu_tree *target_tree, struct _starpu_tree *source_trees, unsigned nb_source_trees)
-{
-    unsigned tree_id, worker_id, index = 0;
-    for(tree_id = 0; tree_id < nb_source_trees; ++tree_id)
-	for(worker_id = 0; worker_id < source_trees[tree_id].nb_workers; ++worker_id)
-	    target_tree->workers[index++] = source_trees[tree_id].workers[worker_id];
-}
-
-/* assign_multiple_trees
- * ========================
- * Purpose
- * =======
- * Assign every tree which is large enough (greater than min_size) and merge small ones.
- * If there is no tree large enough to be assigned any more, we return.
- *
- * Return value
- * ============
- * The number of workers assigned during the function.
- *
- * Arguments
- * =========
- * trees		(input, output)
- *			In entry, array of trees to assign. In the end at most one tree still contains workers.
- *
- * nb_trees		(input)
- *			The number of trees (size of the array).
- *
- * min_size		(input)
- *			Minimum size of a combined worker.
- *
- * max_size		(input)
- *			Maximum size of a combined worker.
- */
-
-static unsigned assign_multiple_trees(struct _starpu_tree *trees, unsigned nb_trees, unsigned int min_size, unsigned int max_size)
+static void find_workers(hwloc_obj_t obj, int cpu_workers[STARPU_NMAXWORKERS], unsigned *n)
 {
-    unsigned short complete = 0;
-    unsigned tree_id, tree_id2, nb_workers_tree, nb_workers_tree2, worker_id, nb_workers_total = 0, nb_workers_assigned = 0;
-
-    for(tree_id = 0; tree_id < nb_trees; ++tree_id)
-	nb_workers_total += trees[tree_id].nb_workers;;
-
-    while(!complete)
-    {
-	complete = 1;
-
-	/* First we manage to assign every subtree large enough to be assigned alone */
-	for(tree_id = 0; tree_id < nb_trees; ++tree_id)
-	{
-	    if(trees[tree_id].nb_workers== 0) // An already assigned subtree
-		continue;
-
-	    nb_workers_tree = trees[tree_id].nb_workers;
-
-	    /* We shouldn't assign a small tree if we could assign the whole trees instead */
-	    if(nb_workers_tree >= min_size && nb_workers_total > max_size)
-	    {
-		int ret = starpu_combined_worker_assign_workerid(nb_workers_tree, trees[tree_id].workers);
-		STARPU_ASSERT(ret >= 0);
-		nb_workers_assigned += nb_workers_tree;
-		nb_workers_total -= nb_workers_tree;
-		trees[tree_id].nb_workers = 0;
-	    }
-	}
-
-	/* Then we merge too small subtrees into not too large ones
-	 * if we manage to merge some subtrees we turn the flag
-	 * complete to 0 thus we know he have to start again to assign
-	 * just merged subtrees */
-	for(tree_id = 0; tree_id < nb_trees; ++tree_id)
-	{
-	    if(trees[tree_id].nb_workers == 0) // An already assigned subtree
-		continue;
-
-	    nb_workers_tree = trees[tree_id].nb_workers;
-
-	    /* We go through the array to find another subtree we can merge with this one */
-	    for(tree_id2 = 0; tree_id2 < nb_trees; ++tree_id2)
-	    {
-		if(trees[tree_id2].nb_workers == 0 || tree_id == tree_id2) // An already assigned subtree or the same
-		    continue;
-
-		nb_workers_tree2 = trees[tree_id2].nb_workers;
-
-		/*  We can merge the two subtrees, let's do it */
-		if(nb_workers_tree + nb_workers_tree2 <= max_size)
+		if (!obj->userdata)
+				/* Not something we run something on, don't care */
+				return;
+		if (obj->userdata == (void*) -1)
 		{
-		    for(worker_id = 0; worker_id < nb_workers_tree2; ++worker_id)
-			trees[tree_id].workers[nb_workers_tree + worker_id] = trees[tree_id2].workers[worker_id];
-
-		    trees[tree_id].nb_workers += nb_workers_tree2;
-		    trees[tree_id2].nb_workers = 0;
-
-		    /* We just merged two subtrees, we need to restart again and try to assign it */
-		    complete = 0;
-		    break;
+				/* Intra node, recurse */
+				unsigned i;
+				for (i = 0; i < obj->arity; i++)
+						find_workers(obj->children[i], cpu_workers, n);
+				return;
+		}
+		
+		/* Got to a PU leaf */
+		struct _starpu_worker *worker = obj->userdata;
+		/* is it a CPU worker? */
+		if (worker->perf_arch == STARPU_CPU_DEFAULT)
+		{
+				_STARPU_DEBUG("worker %d is part of it\n", worker->workerid);
+				/* Add it to the combined worker */
+				cpu_workers[(*n)++] = worker->workerid;
 		}
-	    }
-
-	    if(!complete)
-		break;
-	}
-    }
-
-    return nb_workers_assigned;
-}
-
-/* find_and_assign_combinations_with_hwloc_recursive
- * =================================================
- * Purpose
- * =======
- * Go through the tree given as parameter and try to assign them. Workers it didn't succeed to
- * assign are given back to the calling function to be assigned using data from other subtrees if so.
- *
- * Return value
- * ============
- * The number of workers left to be assigned.
- *
- * Arguments
- * =========
- * tree			(input, output)
- *			Tree structure containing the root to process in entry.
- *			When the function returns it also contains the number of workers left
- *			to be assigned and these very workers in the array previously allocated.
- *
- * min_size		(input)
- *			Minimum size of a combined worker.
- *
- * max_size		(input)
- *			Maximum size of a combined worker.
- */
-
-static unsigned find_and_assign_combinations_with_hwloc_recursive(struct _starpu_tree *tree, unsigned int min_size, unsigned int max_size)
-{
-    unsigned subtree_id, nb_workers = 0;
-
-    hwloc_obj_t obj = tree->obj;
-    int *workers = tree->workers;
-
-    struct _starpu_machine_config *config = _starpu_get_machine_config();
-
-    /* Is this a leaf ? (eg. a PU for hwloc) */
-    if (!hwloc_compare_types(config->cpu_depth, obj->depth))
-    {
-	struct _starpu_worker *worker = obj->userdata;
-
-	/* If this is a CPU worker add it at the beginning
-	 * of the array , write 1 in the field nb_workers and
-	 * return the number of CPU workers found : 1 in this case. */
-	if (worker && worker->arch == STARPU_CPU_WORKER)
-	{
-	    workers[0] = worker->workerid;
-	    tree->nb_workers = 1;
-	    return 1;
-	}
-
-	tree->nb_workers = 0;
-	return 0;
-    }
-
-
-    /* If there is only one child, we go to the next level right away */
-    if (obj->arity == 1)
-    {
-	struct _starpu_tree subtree = *tree;
-	subtree.obj = obj->children[0];
-	nb_workers = find_and_assign_combinations_with_hwloc_recursive(&subtree, min_size, max_size);
-	tree->nb_workers = nb_workers;
-	return nb_workers;
-    }
-
-    /* We recursively go to the leaves of the tree to find subtrees which have the biggest number of
-     * CPU leaves that fits between min and max. */
-
-    /* We allocate an array of tree structures which will contain the current node's subtrees data */
-    struct _starpu_tree *subtrees = (struct _starpu_tree *) malloc(obj->arity * sizeof(struct _starpu_tree));
-
-    /* We allocate the array containing the workers of each subtree and initialize the fields left */
-    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
-    {
-	struct _starpu_tree *subtree = subtrees + subtree_id;
-
-	subtree->obj = obj->children[subtree_id];
-	subtree->nb_workers = 0;
-	subtree->workers = (int *) malloc(config->topology.nhwcpus * sizeof(int));
-    }
-
-    /* We recursively go through every subtree and get all the workers which are not assigned yet */
-    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
-	nb_workers += find_and_assign_combinations_with_hwloc_recursive(subtrees + subtree_id, min_size, max_size);
-
-    if(nb_workers > max_size)
-    {
-	/* We withdraw the number of workers just assigned from the total number of workers */
-	nb_workers -= assign_multiple_trees(subtrees, obj->arity, min_size, max_size);
-
-	/* Some workers are not assigned yet : we gather them in the array
-	 * which is returned to the father which will handle them later */
-	if(nb_workers)
-	    gather_trees(tree, subtrees, obj->arity);
-    }
-    else if(nb_workers < max_size)
-    {
-	gather_trees(tree, subtrees, obj->arity);
-    }
-    else // nb_workers == max_size
-    {
-	gather_trees(tree, subtrees, obj->arity);
-
-	unsigned sched_ctx_id = starpu_get_sched_ctx();
-	int i;
-	for(i = 0; i < nb_workers; i++)
-		if(!starpu_worker_belongs_to_sched_ctx(workers[i], sched_ctx_id))
-			return 0;
-	struct worker_collection* workers_coll = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
-	int newworkerid = starpu_combined_worker_assign_workerid(nb_workers, workers);
-	STARPU_ASSERT(newworkerid >= 0);
-	workers_coll->add(workers_coll, newworkerid);
-	nb_workers = 0;
-    }
-
-    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
-	free(subtrees[subtree_id].workers);
-    free(subtrees);
-
-    tree->nb_workers = nb_workers;
-    return nb_workers;
-}
-
-/* get_min_max_sizes
- * =================================================
- * Purpose
- * =======
- * First, try to get the value from the STARPU_MIN_WORKERSIZE and STARPU_MAX_WORKERSIZE
- * environment variables.
- * If both of them were not set, then we try do get some efficient values following the rule beneath :
- *
- * 				-->   exact 	-->  MIN_SIZE = S-1 <--> MAX_SIZE = S+1
- * S = square_root(nb_cpus)
- *				-->   decimal 	-->  MIN_SIZE = truncation(S) <--> MAX_SIZE = rounding_up(S)
- *
- * If only one of both was not set then we set it with a value relative to the other, for example :
- *
- *		 	MIN_SIZE = MAX_SIZE - 1 or MAX_SIZE = MIN_SIZE + 1
- *
- * Arguments
- * =========
- * min_size		(output)
- *			Pointer to the minimum size of a combined worker, whether set with
- *			value given by the user or processed from the number of cpus.
- *
- * max_size		(output)
- *			Pointer to the maximum size of a combined worker, whether set with
- *			value given by the user or processed from the number of cpus.
- *
- * topology		(input)
- *			Topology of the machine : used to know the number of cpus.
- */
-
-static void get_min_max_sizes(unsigned int *min_size, unsigned int *max_size, struct starpu_machine_topology *topology)
-{
-    int _min_size, _max_size;
-
-    _min_size = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
-    _max_size = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
-
-    /* If the user didn't set both the environment variables,
-     * we need to find a minimum and a maximum size ourselves */
-    if(_min_size <= -1 || _max_size <= -1)
-    {
-
-	int nb_cpus = topology->nhwcpus;
-	int sqrt_nb_cpus = (int)sqrt((double)nb_cpus);
-	int exact = (sqrt_nb_cpus * sqrt_nb_cpus == nb_cpus);
-
-	    if(_min_size == -1)
-	    {
-		if(_max_size > -1)
-		    _min_size = _max_size - 1;
-		else
-		    _min_size = exact ? sqrt_nb_cpus - 1 : sqrt_nb_cpus;
-	    }
-
-	if(_max_size == -1)
-	{
-	    if(_min_size > -1)
-		_max_size = _min_size + 1;
-	    else
-		_max_size = sqrt_nb_cpus + 1;
-	}
-    }
-
-    *min_size = _min_size;
-    *max_size = _max_size;
-
-    return;
-}
-
-/* find_and_assign_combinations_with_hwloc
- * =======================================
- * Purpose
- * =======
- * Launches find_and_assign_combinations_with_hwloc_recursive function on the root
- * of the hwloc tree to gather and assign combined cpu workers in an efficient manner.
- * When find_and_assign_combinations_with_hwloc_recursive returns, if there are still
- * some workers, we assign them no matter the number for there is no way to respect
- * the wanted sizes anymore.
- *
- * Arguments
- * =========
- * topology		(input)
- *			Topology of the machine : used to know the number of cpus and
- *			to get the hwloc tree.
- */
-
-static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers)
-{
-    struct _starpu_machine_config *config = _starpu_get_machine_config();
-    struct starpu_machine_topology *topology = &config->topology;
-
-    unsigned sched_ctx_id  = starpu_get_sched_ctx();
-    if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
-	    sched_ctx_id = 0; 
-
-    struct worker_collection* workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
-
-    unsigned nb_workers;
-    unsigned int min_size, max_size;
-
-    get_min_max_sizes(&min_size, &max_size, topology);
-
-    STARPU_ASSERT(min_size <= max_size);
-
-    struct _starpu_tree tree;
-
-    /* Of course we start from the root */
-    tree.obj = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0);
-    tree.nb_workers = 0;
-    tree.workers = (int *) malloc(topology->nhwcpus * sizeof(int));
-
-    /* We recursively go from the root to the leaves of the tree to find
-     * subtrees that only have CPUs as leaves. */
-    nb_workers = find_and_assign_combinations_with_hwloc_recursive(&tree, min_size, max_size);
-
-    /* There are still some workers left, since the only possibility is that
-     * the number of workers left is less than the minimum worker size we assign them all */
-    if(nb_workers > 0)
-    {
-	/* find_and_assign_combinations_with_hwloc_recursive shouldn't return
-	 * while there are enough workers to assign regarding the min_size value */
-	STARPU_ASSERT(nb_workers <= max_size);
-
-	int newworkerid = starpu_combined_worker_assign_workerid(nb_workers, tree.workers);
-	STARPU_ASSERT(newworkerid >= 0);
-	workers->add(workers, newworkerid);
-    }
-
-    free(tree.workers);
-}
-#endif
-
-static void find_workers(hwloc_obj_t obj, int cpu_workers[STARPU_NMAXWORKERS], unsigned *n)
-{
-    if (!obj->userdata)
-	/* Not something we run something on, don't care */
-	return;
-    if (obj->userdata == (void*) -1)
-    {
-	/* Intra node, recurse */
-	unsigned i;
-	for (i = 0; i < obj->arity; i++)
-	    find_workers(obj->children[i], cpu_workers, n);
-	return;
-    }
-
-    /* Got to a PU leaf */
-    struct _starpu_worker *worker = obj->userdata;
-    /* is it a CPU worker? */
-    if (worker->perf_arch == STARPU_CPU_DEFAULT)
-    {
-	_STARPU_DEBUG("worker %d is part of it\n", worker->workerid);
-	/* Add it to the combined worker */
-	cpu_workers[(*n)++] = worker->workerid;
-    }
 }
 
-static void synthesize_intermediate_workers(int *workerids, int nworkers, hwloc_obj_t *children, unsigned arity, unsigned n, unsigned synthesize_arity)
+static void synthesize_intermediate_workers(hwloc_obj_t *children, unsigned arity, unsigned n, unsigned synthesize_arity)
 {
-    unsigned nworkers, i, j;
-    unsigned chunk_size = (n + synthesize_arity-1) / synthesize_arity;
-    unsigned chunk_start;
-    int cpu_workers[STARPU_NMAXWORKERS];
-    int ret;
-
-    if (n <= synthesize_arity)
-	/* Not too many children, do not synthesize */
-	return;
+		unsigned nworkers, i, j;
+		unsigned chunk_size = (n + synthesize_arity-1) / synthesize_arity;
+		unsigned chunk_start;
+		int cpu_workers[STARPU_NMAXWORKERS];
+		int ret;
+		
+		if (n <= synthesize_arity)
+				/* Not too many children, do not synthesize */
+				return;
 
-    _STARPU_DEBUG("%u children > %u, synthesizing intermediate combined workers of size %u\n", n, synthesize_arity, chunk_size);
+		_STARPU_DEBUG("%u children > %u, synthesizing intermediate combined workers of size %u\n", n, synthesize_arity, chunk_size);
 
-    n = 0;
-    j = 0;
-    nworkers = 0;
-    chunk_start = 0;
-    for (i = 0 ; i < arity; i++)
-    {
-	if (children[i]->userdata) {
-	    n++;
-	    _STARPU_DEBUG("child %u\n", i);
-	    find_workers(children[i], cpu_workers, &nworkers);
-	    j++;
-	}
-	/* Completed a chunk, or last bit (but not if it's just 1 subobject) */
-	if (j == chunk_size || (i == arity-1 && j > 1)) {
-	    _STARPU_DEBUG("Adding it\n");
-	    ret = starpu_combined_worker_assign_workerid(nworkers, cpu_workers);
-	    STARPU_ASSERT(ret >= 0);
-	    /* Recurse there */
-	    synthesize_intermediate_workers(topology, children+chunk_start, i - chunk_start, n, synthesize_arity);
-	    /* And restart another one */
-	    n = 0;
-	    j = 0;
-	    nworkers = 0;
-	    chunk_start = i+1;
-	}
-    }
+		n = 0;
+		j = 0;
+		nworkers = 0;
+		chunk_start = 0;
+		for (i = 0 ; i < arity; i++)
+		{
+				if (children[i]->userdata) 
+				{
+						n++;
+						_STARPU_DEBUG("child %u\n", i);
+						find_workers(children[i], cpu_workers, &nworkers);
+						j++;
+				}
+				/* Completed a chunk, or last bit (but not if it's just 1 subobject) */
+				if (j == chunk_size || (i == arity-1 && j > 1)) 
+				{
+						_STARPU_DEBUG("Adding it\n");
+						ret = starpu_combined_worker_assign_workerid(nworkers, cpu_workers);
+						STARPU_ASSERT(ret >= 0);
+						/* Recurse there */
+						synthesize_intermediate_workers(children+chunk_start, i - chunk_start, n, synthesize_arity);
+						/* And restart another one */
+						n = 0;
+						j = 0;
+						nworkers = 0;
+						chunk_start = i+1;
+				}
+		}
 }
 
-static void find_and_assign_combinations(int *workerids, int nworkers, hwloc_obj_t obj, unsigned synthesize_arity)
+static void find_and_assign_combinations(hwloc_obj_t obj, unsigned synthesize_arity)
 {
     char name[64];
     unsigned i, n, nworkers;
@@ -505,65 +102,77 @@ static void find_and_assign_combinations(int *workerids, int nworkers, hwloc_obj
 
     int ret;
 
+
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	struct starpu_machine_topology *topology = &config->topology;
+
     hwloc_obj_snprintf(name, sizeof(name), topology->hwtopology, obj, "#", 0);
     _STARPU_DEBUG("Looking at %s\n", name);
 
     for (n = 0, i = 0; i < obj->arity; i++)
-	if (obj->children[i]->userdata)
-	    /* it has a CPU worker */
-	    n++;
-
+			if (obj->children[i]->userdata)
+					/* it has a CPU worker */
+					n++;
+	
     if (n == 1) {
-	/* If there is only one child, we go to the next level right away */
-	find_and_assign_combinations(topology, obj->children[0], synthesize_arity);
-	return;
+			/* If there is only one child, we go to the next level right away */
+			find_and_assign_combinations(obj->children[0], synthesize_arity);
+			return;
     }
-
+	
     /* Add this object */
     nworkers = 0;
     find_workers(obj, cpu_workers, &nworkers);
-
+	
     if (nworkers > 1)
     {
-	_STARPU_DEBUG("Adding it\n");
-	ret = starpu_combined_worker_assign_workerid(nworkers, cpu_workers);
-	STARPU_ASSERT(ret >= 0);
+			_STARPU_DEBUG("Adding it\n");
+			unsigned sched_ctx_id  = starpu_get_sched_ctx();
+			if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
+					sched_ctx_id = 0; 
+			
+			struct worker_collection* workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
+
+			int newworkerid = starpu_combined_worker_assign_workerid(nworkers, cpu_workers);
+			STARPU_ASSERT(newworkerid >= 0);
+			workers->add(workers, newworkerid);
     }
-
+	
     /* Add artificial intermediate objects recursively */
-    synthesize_intermediate_workers(topology, obj->children, obj->arity, n, synthesize_arity);
-
+    synthesize_intermediate_workers(obj->children, obj->arity, n, synthesize_arity);
+	
     /* And recurse */
     for (i = 0; i < obj->arity; i++)
-	if (obj->children[i]->userdata == (void*) -1)
-	    find_and_assign_combinations(topology, obj->children[i], synthesize_arity);
+			if (obj->children[i]->userdata == (void*) -1)
+					find_and_assign_combinations(obj->children[i], synthesize_arity);
 }
 
 static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers)
 {
-    unsigned i;
-    struct _starpu_machine_config *config = _starpu_get_machine_config();
-    int synthesize_arity = starpu_get_env_number("STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER");
-
-    if (synthesize_arity == -1)
-	synthesize_arity = 2;
-
-    /* First, mark nodes which contain CPU workers, simply by setting their userdata field */
-    for (i = 0; i < topology->nworkers; i++)
-    {
-	struct _starpu_worker *worker = &config->workers[i];
-	if (worker->perf_arch == STARPU_CPU_DEFAULT)
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	struct starpu_machine_topology *topology = &config->topology;
+	int synthesize_arity = starpu_get_env_number("STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER");
+	
+	if (synthesize_arity == -1)
+		synthesize_arity = 2;
+	
+	/* First, mark nodes which contain CPU workers, simply by setting their userdata field */
+	int i;
+	for (i = 0; i < nworkers; i++)
 	{
-	    hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->cpu_depth, worker->bindid);
-	    STARPU_ASSERT(obj->userdata == worker);
-	    obj = obj->parent;
-	    while (obj) {
-		obj->userdata = (void*) -1;
-		obj = obj->parent;
-	    }
+		struct _starpu_worker *worker = _starpu_get_worker_struct(workerids[i]);
+		if (worker->perf_arch == STARPU_CPU_DEFAULT)
+		{
+			hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->cpu_depth, worker->bindid);
+			STARPU_ASSERT(obj->userdata == worker);
+			obj = obj->parent;
+			while (obj) {
+				obj->userdata = (void*) -1;
+				obj = obj->parent;
+			}
+		}
 	}
-    }
-    find_and_assign_combinations(topology, hwloc_get_root_obj(topology->hwtopology), synthesize_arity);
+	find_and_assign_combinations(hwloc_get_root_obj(topology->hwtopology), synthesize_arity);
 }
 
 #else /* STARPU_HAVE_HWLOC */
@@ -573,44 +182,44 @@ static void find_and_assign_combinations_without_hwloc(int *workerids, int nwork
     unsigned sched_ctx_id  = starpu_get_sched_ctx();
     if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
 	    sched_ctx_id = 0; 
-
+	
     struct worker_collection* workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
-
-
+	
+	
     /* We put the id of all CPU workers in this array */
     int cpu_workers[STARPU_NMAXWORKERS];
     unsigned ncpus = 0;
-
+	
     struct _starpu_worker *worker;
     unsigned i;
     for (i = 0; i < nworkers; i++)
     {
 	    worker = _starpu_get_worker_struct(workerids[i]);
-	   
+		
 	    if (worker->perf_arch == STARPU_CPU_DEFAULT)
 		    cpu_workers[ncpus++] = i;
     }
-
+	
     unsigned size;
     for (size = 2; size <= ncpus; size *= 2)
     {
-	unsigned first_cpu;
-	for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
-	{
-	    if (first_cpu + size <= ncpus)
-	    {
-		int workerids[size];
-
-		for (i = 0; i < size; i++)
-		    workerids[i] = cpu_workers[first_cpu + i];
-
-		/* We register this combination */
-		int newworkerid;
-		newworkerid = starpu_combined_worker_assign_workerid(size, workerids);
-		STARPU_ASSERT(newworkerid >= 0);
-		workers->add(workers, newworkerid);
-	    }
-	}
+		unsigned first_cpu;
+		for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
+		{
+			if (first_cpu + size <= ncpus)
+			{
+				int found_workerids[size];
+				
+				for (i = 0; i < size; i++)
+					found_workerids[i] = cpu_workers[first_cpu + i];
+				
+				/* We register this combination */
+				int newworkerid;
+				newworkerid = starpu_combined_worker_assign_workerid(size, found_workerids);
+				STARPU_ASSERT(newworkerid >= 0);
+				workers->add(workers, newworkerid);
+			}
+		}
     }
 }