@@ -25,479 +25,76 @@
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>
 
-#if 0
-/* struct _starpu_tree
- * ==================
- * Purpose
- * =======
- * Structure representing a tree (which can be a sub-tree itself) whose root is an hwloc
- * object and storing every workers it contained in every sub-trees by recursion.
- *
- * Fields
- * ======
- * obj		A hwloc object which can be a root or a leaf, it may be a numa node, a cache memory or a CPU, etc...
- *
- * nb_workers	Number of CPU workers which can be found by recursion in all the sub-trees beneath this one
-		or in this very object.
- *
- * workers	CPU-workers found by recursion in all the sub-trees and in this very one, represented as leaves in hwloc.
- */
-
-struct _starpu_tree
-{
-	hwloc_obj_t obj;
-	unsigned nb_workers;
-	int *workers;
-};
-
-/* gather_trees
- * ============
- * Purpose
- * =======
- * Gather all the workers of every source tree in one target tree.
- * We assume the target array of workers is big enough to contain all the workers.
- *
- * Arguments
- * =========
- * target_tree (input, output)
- *	Pointer to the tree which will contain all the workers of every source.
- *
- * source_trees (input)
- *	Array of trees we want to combine in a unique tree.
- *
- * nb_source_trees (input)
- *	Number of trees we want to combine (size of the array).
- */
-
-static void gather_trees(struct _starpu_tree *target_tree, struct _starpu_tree *source_trees, unsigned nb_source_trees)
-{
-	unsigned tree_id, worker_id, index = 0;
-	for(tree_id = 0; tree_id < nb_source_trees; ++tree_id)
-		for(worker_id = 0; worker_id < source_trees[tree_id].nb_workers; ++worker_id)
-			target_tree->workers[index++] = source_trees[tree_id].workers[worker_id];
-}
-
-/* assign_multiple_trees
- * ========================
- * Purpose
- * =======
- * Assign every tree which is large enough (greater than min_size) and merge small ones.
- * If there is no tree large enough to be assigned any more, we return.
- *
- * Return value
- * ============
- * The number of workers assigned during the function.
- *
- * Arguments
- * =========
- * trees (input, output)
- *	In entry, array of trees to assign. In the end at most one tree still contains workers.
- *
- * nb_trees (input)
- *	The number of trees (size of the array).
- *
- * min_size (input)
- *	Minimum size of a combined worker.
- *
- * max_size (input)
- *	Maximum size of a combined worker.
- */
-
-static unsigned assign_multiple_trees(struct _starpu_tree *trees, unsigned nb_trees, unsigned int min_size, unsigned int max_size)
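+/* find_workers: recursively collect, below the hwloc object obj, the ids of
+ * the CPU workers into cpu_workers[], bumping *n for each worker found.
+ * Internal nodes containing CPU workers have their userdata set to
+ * (void*) -1; a PU leaf has userdata pointing to the worker bound to it. */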
+static void find_workers(hwloc_obj_t obj, int cpu_workers[STARPU_NMAXWORKERS], unsigned *n)
 {
-	unsigned short complete = 0;
-	unsigned tree_id, tree_id2, nb_workers_tree, nb_workers_tree2, worker_id, nb_workers_total = 0, nb_workers_assigned = 0;
-
-	for(tree_id = 0; tree_id < nb_trees; ++tree_id)
-		nb_workers_total += trees[tree_id].nb_workers;;
-
-	while(!complete)
-	{
-		complete = 1;
-
-		/* First we manage to assign every subtree large enough to be assigned alone */
-		for(tree_id = 0; tree_id < nb_trees; ++tree_id)
-		{
-			if(trees[tree_id].nb_workers== 0) // An already assigned subtree
-				continue;
-
-			nb_workers_tree = trees[tree_id].nb_workers;
-
-			/* We shouldn't assign a small tree if we could assign the whole trees instead */
-			if(nb_workers_tree >= min_size && nb_workers_total > max_size)
-			{
-				int ret = starpu_combined_worker_assign_workerid(nb_workers_tree, trees[tree_id].workers);
-				STARPU_ASSERT(ret >= 0);
-				nb_workers_assigned += nb_workers_tree;
-				nb_workers_total -= nb_workers_tree;
-				trees[tree_id].nb_workers = 0;
-			}
-		}
-
-		/* Then we merge too small subtrees into not too large ones
-		 * if we manage to merge some subtrees we turn the flag
-		 * complete to 0 thus we know he have to start again to assign
-		 * just merged subtrees */
-		for(tree_id = 0; tree_id < nb_trees; ++tree_id)
-		{
-			if(trees[tree_id].nb_workers == 0) // An already assigned subtree
-				continue;
-
-			nb_workers_tree = trees[tree_id].nb_workers;
-
-			/* We go through the array to find another subtree we can merge with this one */
-			for(tree_id2 = 0; tree_id2 < nb_trees; ++tree_id2)
-			{
-				if(trees[tree_id2].nb_workers == 0 || tree_id == tree_id2) // An already assigned subtree or the same
-					continue;
-
-				nb_workers_tree2 = trees[tree_id2].nb_workers;
-
-				/* We can merge the two subtrees, let's do it */
-				if(nb_workers_tree + nb_workers_tree2 <= max_size)
+	if (!obj->userdata)
+		/* Not something we run anything on, don't care */
+		return;
+	if (obj->userdata == (void*) -1)
 	{
-					for(worker_id = 0; worker_id < nb_workers_tree2; ++worker_id)
-						trees[tree_id].workers[nb_workers_tree + worker_id] = trees[tree_id2].workers[worker_id];
-
-					trees[tree_id].nb_workers += nb_workers_tree2;
-					trees[tree_id2].nb_workers = 0;
-
-					/* We just merged two subtrees, we need to restart again and try to assign it */
-					complete = 0;
-					break;
+		/* Intra node, recurse */
+		unsigned i;
+		for (i = 0; i < obj->arity; i++)
+			find_workers(obj->children[i], cpu_workers, n);
+		return;
+	}
+
+	/* Got to a PU leaf */
+	struct _starpu_worker *worker = obj->userdata;
+	/* is it a CPU worker? */
+	if (worker->perf_arch == STARPU_CPU_DEFAULT)
+	{
+		_STARPU_DEBUG("worker %d is part of it\n", worker->workerid);
+		/* Add it to the combined worker */
+		cpu_workers[(*n)++] = worker->workerid;
 	}
-			}
-
-			if(!complete)
-				break;
-		}
-	}
-
-	return nb_workers_assigned;
-}
-
-/* find_and_assign_combinations_with_hwloc_recursive
- * =================================================
- * Purpose
- * =======
- * Go through the tree given as parameter and try to assign them. Workers it didn't succeed to
- * assign are given back to the calling function to be assigned using data from other subtrees if so.
- *
- * Return value
- * ============
- * The number of workers left to be assigned.
- *
- * Arguments
- * =========
- * tree (input, output)
- *	Tree structure containing the root to process in entry.
- *	When the function returns it also contains the number of workers left
- *	to be assigned and these very workers in the array previously allocated.
- *
- * min_size (input)
- *	Minimum size of a combined worker.
- *
- * max_size (input)
- *	Maximum size of a combined worker.
- */
-
-static unsigned find_and_assign_combinations_with_hwloc_recursive(struct _starpu_tree *tree, unsigned int min_size, unsigned int max_size)
-{
-	unsigned subtree_id, nb_workers = 0;
-
-	hwloc_obj_t obj = tree->obj;
-	int *workers = tree->workers;
-
-	struct _starpu_machine_config *config = _starpu_get_machine_config();
-
-	/* Is this a leaf ? (eg. a PU for hwloc) */
-	if (!hwloc_compare_types(config->cpu_depth, obj->depth))
-	{
-		struct _starpu_worker *worker = obj->userdata;
-
-		/* If this is a CPU worker add it at the beginning
-		 * of the array , write 1 in the field nb_workers and
-		 * return the number of CPU workers found : 1 in this case. */
-		if (worker && worker->arch == STARPU_CPU_WORKER)
-		{
-			workers[0] = worker->workerid;
-			tree->nb_workers = 1;
-			return 1;
-		}
-
-		tree->nb_workers = 0;
-		return 0;
-	}
-
-
-	/* If there is only one child, we go to the next level right away */
-	if (obj->arity == 1)
-	{
-		struct _starpu_tree subtree = *tree;
-		subtree.obj = obj->children[0];
-		nb_workers = find_and_assign_combinations_with_hwloc_recursive(&subtree, min_size, max_size);
-		tree->nb_workers = nb_workers;
-		return nb_workers;
-	}
-
-	/* We recursively go to the leaves of the tree to find subtrees which have the biggest number of
-	 * CPU leaves that fits between min and max. */
-
-	/* We allocate an array of tree structures which will contain the current node's subtrees data */
-	struct _starpu_tree *subtrees = (struct _starpu_tree *) malloc(obj->arity * sizeof(struct _starpu_tree));
-
-	/* We allocate the array containing the workers of each subtree and initialize the fields left */
-	for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
-	{
-		struct _starpu_tree *subtree = subtrees + subtree_id;
-
-		subtree->obj = obj->children[subtree_id];
-		subtree->nb_workers = 0;
-		subtree->workers = (int *) malloc(config->topology.nhwcpus * sizeof(int));
-	}
-
-	/* We recursively go through every subtree and get all the workers which are not assigned yet */
-	for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
-		nb_workers += find_and_assign_combinations_with_hwloc_recursive(subtrees + subtree_id, min_size, max_size);
-
-	if(nb_workers > max_size)
-	{
-		/* We withdraw the number of workers just assigned from the total number of workers */
-		nb_workers -= assign_multiple_trees(subtrees, obj->arity, min_size, max_size);
-
-		/* Some workers are not assigned yet : we gather them in the array
-		 * which is returned to the father which will handle them later */
-		if(nb_workers)
-			gather_trees(tree, subtrees, obj->arity);
-	}
-	else if(nb_workers < max_size)
-	{
-		gather_trees(tree, subtrees, obj->arity);
-	}
-	else // nb_workers == max_size
-	{
-		gather_trees(tree, subtrees, obj->arity);
-
-		unsigned sched_ctx_id = starpu_get_sched_ctx();
-		int i;
-		for(i = 0; i < nb_workers; i++)
-			if(!starpu_worker_belongs_to_sched_ctx(workers[i], sched_ctx_id))
-				return 0;
-		struct worker_collection* workers_coll = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
-		int newworkerid = starpu_combined_worker_assign_workerid(nb_workers, workers);
-		STARPU_ASSERT(newworkerid >= 0);
-		workers_coll->add(workers_coll, newworkerid);
-		nb_workers = 0;
-	}
-
-	for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
-		free(subtrees[subtree_id].workers);
-	free(subtrees);
-
-	tree->nb_workers = nb_workers;
-	return nb_workers;
-}
-
-/* get_min_max_sizes
- * =================================================
- * Purpose
- * =======
- * First, try to get the value from the STARPU_MIN_WORKERSIZE and STARPU_MAX_WORKERSIZE
- * environment variables.
- * If both of them were not set, then we try do get some efficient values following the rule beneath :
- *
- *		  --> exact --> MIN_SIZE = S-1 <--> MAX_SIZE = S+1
- * S = square_root(nb_cpus)
- *		  --> decimal --> MIN_SIZE = truncation(S) <--> MAX_SIZE = rounding_up(S)
- *
- * If only one of both was not set then we set it with a value relative to the other, for example :
- *
- * MIN_SIZE = MAX_SIZE - 1 or MAX_SIZE = MIN_SIZE + 1
- *
- * Arguments
- * =========
- * min_size (output)
- *	Pointer to the minimum size of a combined worker, whether set with
- *	value given by the user or processed from the number of cpus.
- *
- * max_size (output)
- *	Pointer to the maximum size of a combined worker, whether set with
- *	value given by the user or processed from the number of cpus.
- *
- * topology (input)
- *	Topology of the machine : used to know the number of cpus.
- */
-
-static void get_min_max_sizes(unsigned int *min_size, unsigned int *max_size, struct starpu_machine_topology *topology)
-{
-	int _min_size, _max_size;
-
-	_min_size = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
-	_max_size = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
-
-	/* If the user didn't set both the environment variables,
-	 * we need to find a minimum and a maximum size ourselves */
-	if(_min_size <= -1 || _max_size <= -1)
-	{
-
-		int nb_cpus = topology->nhwcpus;
-		int sqrt_nb_cpus = (int)sqrt((double)nb_cpus);
-		int exact = (sqrt_nb_cpus * sqrt_nb_cpus == nb_cpus);
-
-		if(_min_size == -1)
-		{
-			if(_max_size > -1)
-				_min_size = _max_size - 1;
-			else
-				_min_size = exact ? sqrt_nb_cpus - 1 : sqrt_nb_cpus;
-		}
-
-		if(_max_size == -1)
-		{
-			if(_min_size > -1)
-				_max_size = _min_size + 1;
-			else
-				_max_size = sqrt_nb_cpus + 1;
-		}
-	}
-
-	*min_size = _min_size;
-	*max_size = _max_size;
-
-	return;
-}
-
-/* find_and_assign_combinations_with_hwloc
- * =======================================
- * Purpose
- * =======
- * Launches find_and_assign_combinations_with_hwloc_recursive function on the root
- * of the hwloc tree to gather and assign combined cpu workers in an efficient manner.
- * When find_and_assign_combinations_with_hwloc_recursive returns, if there are still
- * some workers, we assign them no matter the number for there is no way to respect
- * the wanted sizes anymore.
- *
- * Arguments
- * =========
- * topology (input)
- *	Topology of the machine : used to know the number of cpus and
- *	to get the hwloc tree.
- */
-
-static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers)
-{
-	struct _starpu_machine_config *config = _starpu_get_machine_config();
-	struct starpu_machine_topology *topology = &config->topology;
-
-	unsigned sched_ctx_id = starpu_get_sched_ctx();
-	if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
-		sched_ctx_id = 0;
-
-	struct worker_collection* workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
-
-	unsigned nb_workers;
-	unsigned int min_size, max_size;
-
-	get_min_max_sizes(&min_size, &max_size, topology);
-
-	STARPU_ASSERT(min_size <= max_size);
-
-	struct _starpu_tree tree;
-
-	/* Of course we start from the root */
-	tree.obj = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0);
-	tree.nb_workers = 0;
-	tree.workers = (int *) malloc(topology->nhwcpus * sizeof(int));
-
-	/* We recursively go from the root to the leaves of the tree to find
-	 * subtrees that only have CPUs as leaves. */
-	nb_workers = find_and_assign_combinations_with_hwloc_recursive(&tree, min_size, max_size);
-
-	/* There are still some workers left, since the only possibility is that
-	 * the number of workers left is less than the minimum worker size we assign them all */
-	if(nb_workers > 0)
-	{
-		/* find_and_assign_combinations_with_hwloc_recursive shouldn't return
-		 * while there are enough workers to assign regarding the min_size value */
-		STARPU_ASSERT(nb_workers <= max_size);
-
-		int newworkerid = starpu_combined_worker_assign_workerid(nb_workers, tree.workers);
-		STARPU_ASSERT(newworkerid >= 0);
-		workers->add(workers, newworkerid);
-	}
-
-	free(tree.workers);
-}
-#endif
-
-static void find_workers(hwloc_obj_t obj, int cpu_workers[STARPU_NMAXWORKERS], unsigned *n)
-{
-	if (!obj->userdata)
-		/* Not something we run something on, don't care */
-		return;
-	if (obj->userdata == (void*) -1)
-	{
-		/* Intra node, recurse */
-		unsigned i;
-		for (i = 0; i < obj->arity; i++)
-			find_workers(obj->children[i], cpu_workers, n);
-		return;
-	}
-
-	/* Got to a PU leaf */
-	struct _starpu_worker *worker = obj->userdata;
-	/* is it a CPU worker? */
-	if (worker->perf_arch == STARPU_CPU_DEFAULT)
-	{
-		_STARPU_DEBUG("worker %d is part of it\n", worker->workerid);
-		/* Add it to the combined worker */
-		cpu_workers[(*n)++] = worker->workerid;
-	}
 }
 
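+/* synthesize_intermediate_workers: when a node has more children holding CPU
+ * workers than synthesize_arity, group those children into chunks and
+ * register one intermediate combined worker per chunk, recursing into each
+ * chunk, so the set of combined workers keeps a bounded branching factor. */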
-static void synthesize_intermediate_workers(int *workerids, int nworkers, hwloc_obj_t *children, unsigned arity, unsigned n, unsigned synthesize_arity)
+static void synthesize_intermediate_workers(hwloc_obj_t *children, unsigned arity, unsigned n, unsigned synthesize_arity)
 {
-	unsigned nworkers, i, j;
-	unsigned chunk_size = (n + synthesize_arity-1) / synthesize_arity;
-	unsigned chunk_start;
-	int cpu_workers[STARPU_NMAXWORKERS];
-	int ret;
-
-	if (n <= synthesize_arity)
-		/* Not too many children, do not synthesize */
-		return;
+	unsigned nworkers, i, j;
+	unsigned chunk_size = (n + synthesize_arity-1) / synthesize_arity;
+	unsigned chunk_start;
+	int cpu_workers[STARPU_NMAXWORKERS];
+	int ret;
+
+	if (n <= synthesize_arity)
+		/* Not too many children, do not synthesize */
+		return;
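+
+	/* chunk_size is ceil(n / synthesize_arity): each chunk of children
+	 * becomes one intermediate combined worker below. */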
 
-	_STARPU_DEBUG("%u children > %u, synthesizing intermediate combined workers of size %u\n", n, synthesize_arity, chunk_size);
+	_STARPU_DEBUG("%u children > %u, synthesizing intermediate combined workers of size %u\n", n, synthesize_arity, chunk_size);
 
-	n = 0;
-	j = 0;
-	nworkers = 0;
-	chunk_start = 0;
-	for (i = 0 ; i < arity; i++)
-	{
-		if (children[i]->userdata) {
-			n++;
-			_STARPU_DEBUG("child %u\n", i);
-			find_workers(children[i], cpu_workers, &nworkers);
-			j++;
-		}
-		/* Completed a chunk, or last bit (but not if it's just 1 subobject) */
-		if (j == chunk_size || (i == arity-1 && j > 1)) {
-			_STARPU_DEBUG("Adding it\n");
-			ret = starpu_combined_worker_assign_workerid(nworkers, cpu_workers);
-			STARPU_ASSERT(ret >= 0);
-			/* Recurse there */
-			synthesize_intermediate_workers(topology, children+chunk_start, i - chunk_start, n, synthesize_arity);
-			/* And restart another one */
-			n = 0;
-			j = 0;
-			nworkers = 0;
-			chunk_start = i+1;
-		}
-	}
+	n = 0;
+	j = 0;
+	nworkers = 0;
+	chunk_start = 0;
+	for (i = 0 ; i < arity; i++)
+	{
+		if (children[i]->userdata)
+		{
+			n++;
+			_STARPU_DEBUG("child %u\n", i);
+			find_workers(children[i], cpu_workers, &nworkers);
+			j++;
+		}
+		/* Completed a chunk, or last bit (but not if it's just 1 subobject) */
+		if (j == chunk_size || (i == arity-1 && j > 1))
+		{
+			_STARPU_DEBUG("Adding it\n");
+			ret = starpu_combined_worker_assign_workerid(nworkers, cpu_workers);
+			STARPU_ASSERT(ret >= 0);
+			/* Recurse there */
+			synthesize_intermediate_workers(children+chunk_start, i - chunk_start, n, synthesize_arity);
+			/* And restart another one */
+			n = 0;
+			j = 0;
+			nworkers = 0;
+			chunk_start = i+1;
+		}
+	}
 }
 
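+/* find_and_assign_combinations: walk the hwloc tree; at each level with
+ * several children containing CPU workers, register one combined worker
+ * spanning all the CPU workers below this object, synthesize intermediate
+ * combined workers when there are many children, then recurse. */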
-static void find_and_assign_combinations(int *workerids, int nworkers, hwloc_obj_t obj, unsigned synthesize_arity)
+static void find_and_assign_combinations(hwloc_obj_t obj, unsigned synthesize_arity)
 {
 	char name[64];
 	unsigned i, n, nworkers;
@@ -505,65 +102,77 @@ static void find_and_assign_combinations(int *workerids, int nworkers, hwloc_obj
 
 	int ret;
 
+
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	struct starpu_machine_topology *topology = &config->topology;
+
 	hwloc_obj_snprintf(name, sizeof(name), topology->hwtopology, obj, "#", 0);
 	_STARPU_DEBUG("Looking at %s\n", name);
 
 	for (n = 0, i = 0; i < obj->arity; i++)
-		if (obj->children[i]->userdata)
-			/* it has a CPU worker */
-			n++;
-
+		if (obj->children[i]->userdata)
+			/* it has a CPU worker */
+			n++;
+
 	if (n == 1) {
-		/* If there is only one child, we go to the next level right away */
-		find_and_assign_combinations(topology, obj->children[0], synthesize_arity);
-		return;
+		/* If there is only one child, we go to the next level right away */
+		find_and_assign_combinations(obj->children[0], synthesize_arity);
+		return;
 	}
-
+
 	/* Add this object */
 	nworkers = 0;
 	find_workers(obj, cpu_workers, &nworkers);
-
+
 	if (nworkers > 1)
 	{
-		_STARPU_DEBUG("Adding it\n");
-		ret = starpu_combined_worker_assign_workerid(nworkers, cpu_workers);
-		STARPU_ASSERT(ret >= 0);
+		_STARPU_DEBUG("Adding it\n");
+		unsigned sched_ctx_id = starpu_get_sched_ctx();
+		if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
+			sched_ctx_id = 0;
+
+		struct worker_collection* workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
+
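+		/* Register the combination and publish it in the scheduling
+		 * context's worker collection. */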
+		int newworkerid = starpu_combined_worker_assign_workerid(nworkers, cpu_workers);
+		STARPU_ASSERT(newworkerid >= 0);
+		workers->add(workers, newworkerid);
 	}
-
+
 	/* Add artificial intermediate objects recursively */
-	synthesize_intermediate_workers(topology, obj->children, obj->arity, n, synthesize_arity);
-
+	synthesize_intermediate_workers(obj->children, obj->arity, n, synthesize_arity);
+
 	/* And recurse */
 	for (i = 0; i < obj->arity; i++)
-		if (obj->children[i]->userdata == (void*) -1)
-			find_and_assign_combinations(topology, obj->children[i], synthesize_arity);
+		if (obj->children[i]->userdata == (void*) -1)
+			find_and_assign_combinations(obj->children[i], synthesize_arity);
 }
 
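+/* Entry point for the hwloc case: mark every hwloc object that has a CPU
+ * worker underneath it (userdata == (void*) -1), then walk the tree from
+ * the root to build combined workers along the machine topology. */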
 static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers)
 {
-	unsigned i;
-	struct _starpu_machine_config *config = _starpu_get_machine_config();
-	int synthesize_arity = starpu_get_env_number("STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER");
-
-	if (synthesize_arity == -1)
-		synthesize_arity = 2;
-
-	/* First, mark nodes which contain CPU workers, simply by setting their userdata field */
-	for (i = 0; i < topology->nworkers; i++)
-	{
-		struct _starpu_worker *worker = &config->workers[i];
-		if (worker->perf_arch == STARPU_CPU_DEFAULT)
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	struct starpu_machine_topology *topology = &config->topology;
+	int synthesize_arity = starpu_get_env_number("STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER");
+
+	if (synthesize_arity == -1)
+		synthesize_arity = 2;
+
+	/* First, mark nodes which contain CPU workers, simply by setting their userdata field */
+	int i;
+	for (i = 0; i < nworkers; i++)
 	{
-		hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->cpu_depth, worker->bindid);
-		STARPU_ASSERT(obj->userdata == worker);
-		obj = obj->parent;
-		while (obj) {
-			obj->userdata = (void*) -1;
-			obj = obj->parent;
-		}
+		struct _starpu_worker *worker = _starpu_get_worker_struct(workerids[i]);
+		if (worker->perf_arch == STARPU_CPU_DEFAULT)
+		{
+			hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->cpu_depth, worker->bindid);
+			STARPU_ASSERT(obj->userdata == worker);
+			obj = obj->parent;
+			while (obj) {
+				obj->userdata = (void*) -1;
+				obj = obj->parent;
+			}
+		}
 	}
-	}
-	find_and_assign_combinations(topology, hwloc_get_root_obj(topology->hwtopology), synthesize_arity);
+	find_and_assign_combinations(hwloc_get_root_obj(topology->hwtopology), synthesize_arity);
 }
 
 #else /* STARPU_HAVE_HWLOC */
@@ -573,44 +182,44 @@ static void find_and_assign_combinations_without_hwloc(int *workerids, int nwork
 	unsigned sched_ctx_id = starpu_get_sched_ctx();
 	if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
 		sched_ctx_id = 0;
-
+
 	struct worker_collection* workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
-
-
+
+
 	/* We put the id of all CPU workers in this array */
 	int cpu_workers[STARPU_NMAXWORKERS];
 	unsigned ncpus = 0;
-
+
 	struct _starpu_worker *worker;
 	unsigned i;
 	for (i = 0; i < nworkers; i++)
 	{
 		worker = _starpu_get_worker_struct(workerids[i]);
-
+
 		if (worker->perf_arch == STARPU_CPU_DEFAULT)
 			cpu_workers[ncpus++] = i;
 	}
-
+
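+	/* Without topology information we simply combine neighbouring CPU
+	 * workers pairwise, then by fours, eights, etc. */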
 	unsigned size;
 	for (size = 2; size <= ncpus; size *= 2)
 	{
-		unsigned first_cpu;
-		for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
-		{
-			if (first_cpu + size <= ncpus)
-			{
-				int workerids[size];
-
-				for (i = 0; i < size; i++)
-					workerids[i] = cpu_workers[first_cpu + i];
-
-				/* We register this combination */
-				int newworkerid;
-				newworkerid = starpu_combined_worker_assign_workerid(size, workerids);
-				STARPU_ASSERT(newworkerid >= 0);
-				workers->add(workers, newworkerid);
-			}
-		}
+		unsigned first_cpu;
+		for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
+		{
+			if (first_cpu + size <= ncpus)
+			{
+				int found_workerids[size];
+
+				for (i = 0; i < size; i++)
+					found_workerids[i] = cpu_workers[first_cpu + i];
+
+				/* We register this combination */
+				int newworkerid;
+				newworkerid = starpu_combined_worker_assign_workerid(size, found_workerids);
+				STARPU_ASSERT(newworkerid >= 0);
+				workers->add(workers, newworkerid);
+			}
+		}
 	}
 }