/* detect_combined_workers.c */
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010-2012 Université de Bordeaux 1
  4. * Copyright (C) 2011, 2012 Centre National de la Recherche Scientifique
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include <common/config.h>
  18. #include <starpu.h>
  19. #include <common/utils.h>
  20. #include <core/workers.h>
  21. #include <math.h>
  22. #include <sched_policies/detect_combined_workers.h>
  23. #ifdef STARPU_HAVE_HWLOC
  24. #include <hwloc.h>
/* struct _starpu_tree
 * ===================
 * Purpose
 * =======
 * Structure representing a tree (which can be a sub-tree itself) whose root is
 * an hwloc object, and storing every CPU worker found by recursion in all the
 * sub-trees beneath it.
 *
 * Fields
 * ======
 * obj        hwloc object which can be a root or a leaf; it may be a NUMA
 *            node, a cache, a CPU, etc.
 *
 * nb_workers Number of CPU workers which can be found by recursion in all the
 *            sub-trees beneath this one, or in this very object.
 *
 * workers    Workerids of the CPU workers found by recursion in all the
 *            sub-trees and in this very one; CPUs appear as leaves in hwloc.
 */
struct _starpu_tree
{
	hwloc_obj_t obj;
	unsigned nb_workers;
	int *workers;
};
  47. /* gather_trees
  48. * ============
  49. * Purpose
  50. * =======
  51. * Gather all the workers of every source tree in one target tree.
  52. * We assume the target array of workers is big enough to contain all the workers.
  53. *
  54. * Arguments
  55. * =========
  56. * target_tree (input, output)
  57. * Pointer to the tree which will contain all the workers of every source.
  58. *
  59. * source_trees (input)
  60. * Array of trees we want to combine in a unique tree.
  61. *
  62. * nb_source_trees (input)
  63. * Number of trees we want to combine (size of the array).
  64. */
  65. static void gather_trees(struct _starpu_tree *target_tree, struct _starpu_tree *source_trees, unsigned nb_source_trees)
  66. {
  67. unsigned tree_id, worker_id, index = 0;
  68. for(tree_id = 0; tree_id < nb_source_trees; ++tree_id)
  69. for(worker_id = 0; worker_id < source_trees[tree_id].nb_workers; ++worker_id)
  70. target_tree->workers[index++] = source_trees[tree_id].workers[worker_id];
  71. }
  72. /* assign_multiple_trees
  73. * ========================
  74. * Purpose
  75. * =======
  76. * Assign every tree which is large enough (greater than min_size) and merge small ones.
  77. * If there is no tree large enough to be assigned any more, we return.
  78. *
  79. * Return value
  80. * ============
  81. * The number of workers assigned during the function.
  82. *
  83. * Arguments
  84. * =========
  85. * trees (input, output)
  86. * In entry, array of trees to assign. In the end at most one tree still contains workers.
  87. *
  88. * nb_trees (input)
  89. * The number of trees (size of the array).
  90. *
  91. * min_size (input)
  92. * Minimum size of a combined worker.
  93. *
  94. * max_size (input)
  95. * Maximum size of a combined worker.
  96. */
  97. static unsigned assign_multiple_trees(struct _starpu_tree *trees, unsigned nb_trees, unsigned int min_size, unsigned int max_size)
  98. {
  99. unsigned short complete = 0;
  100. unsigned tree_id, tree_id2, nb_workers_tree, nb_workers_tree2, worker_id, nb_workers_total = 0, nb_workers_assigned = 0;
  101. for(tree_id = 0; tree_id < nb_trees; ++tree_id)
  102. nb_workers_total += trees[tree_id].nb_workers;;
  103. while(!complete)
  104. {
  105. complete = 1;
  106. /* First we manage to assign every subtree large enough to be assigned alone */
  107. for(tree_id = 0; tree_id < nb_trees; ++tree_id)
  108. {
  109. if(trees[tree_id].nb_workers== 0) // An already assigned subtree
  110. continue;
  111. nb_workers_tree = trees[tree_id].nb_workers;
  112. /* We shouldn't assign a small tree if we could assign the whole trees instead */
  113. if(nb_workers_tree >= min_size && nb_workers_total > max_size)
  114. {
  115. int ret = starpu_combined_worker_assign_workerid(nb_workers_tree, trees[tree_id].workers);
  116. STARPU_ASSERT(ret >= 0);
  117. nb_workers_assigned += nb_workers_tree;
  118. nb_workers_total -= nb_workers_tree;
  119. trees[tree_id].nb_workers = 0;
  120. }
  121. }
  122. /* Then we merge too small subtrees into not too large ones
  123. * if we manage to merge some subtrees we turn the flag
  124. * complete to 0 thus we know he have to start again to assign
  125. * just merged subtrees */
  126. for(tree_id = 0; tree_id < nb_trees; ++tree_id)
  127. {
  128. if(trees[tree_id].nb_workers == 0) // An already assigned subtree
  129. continue;
  130. nb_workers_tree = trees[tree_id].nb_workers;
  131. /* We go through the array to find another subtree we can merge with this one */
  132. for(tree_id2 = 0; tree_id2 < nb_trees; ++tree_id2)
  133. {
  134. if(trees[tree_id2].nb_workers == 0 || tree_id == tree_id2) // An already assigned subtree or the same
  135. continue;
  136. nb_workers_tree2 = trees[tree_id2].nb_workers;
  137. /* We can merge the two subtrees, let's do it */
  138. if(nb_workers_tree + nb_workers_tree2 <= max_size)
  139. {
  140. for(worker_id = 0; worker_id < nb_workers_tree2; ++worker_id)
  141. trees[tree_id].workers[nb_workers_tree + worker_id] = trees[tree_id2].workers[worker_id];
  142. trees[tree_id].nb_workers += nb_workers_tree2;
  143. trees[tree_id2].nb_workers = 0;
  144. /* We just merged two subtrees, we need to restart again and try to assign it */
  145. complete = 0;
  146. break;
  147. }
  148. }
  149. if(!complete)
  150. break;
  151. }
  152. }
  153. return nb_workers_assigned;
  154. }
  155. /* find_and_assign_combinations_with_hwloc_recursive
  156. * =================================================
  157. * Purpose
  158. * =======
  159. * Go through the tree given as parameter and try to assign them. Workers it didn't succeed to
  160. * assign are given back to the calling function to be assigned using data from other subtrees if so.
  161. *
  162. * Return value
  163. * ============
  164. * The number of workers left to be assigned.
  165. *
  166. * Arguments
  167. * =========
  168. * tree (input, output)
  169. * Tree structure containing the root to process in entry.
  170. * When the function returns it also contains the number of workers left
  171. * to be assigned and these very workers in the array previously allocated.
  172. *
  173. * min_size (input)
  174. * Minimum size of a combined worker.
  175. *
  176. * max_size (input)
  177. * Maximum size of a combined worker.
  178. */
  179. static unsigned find_and_assign_combinations_with_hwloc_recursive(struct _starpu_tree *tree, unsigned int min_size, unsigned int max_size)
  180. {
  181. unsigned subtree_id, nb_workers = 0;
  182. hwloc_obj_t obj = tree->obj;
  183. int *workers = tree->workers;
  184. struct _starpu_machine_config *config = _starpu_get_machine_config();
  185. /* Is this a leaf ? (eg. a PU for hwloc) */
  186. if (!hwloc_compare_types(config->cpu_depth, obj->depth))
  187. {
  188. struct _starpu_worker *worker = obj->userdata;
  189. /* If this is a CPU worker add it at the beginning
  190. * of the array , write 1 in the field nb_workers and
  191. * return the number of CPU workers found : 1 in this case. */
  192. if (worker && worker->arch == STARPU_CPU_WORKER)
  193. {
  194. workers[0] = worker->workerid;
  195. tree->nb_workers = 1;
  196. return 1;
  197. }
  198. tree->nb_workers = 0;
  199. return 0;
  200. }
  201. /* If there is only one child, we go to the next level right away */
  202. if (obj->arity == 1)
  203. {
  204. struct _starpu_tree subtree = *tree;
  205. subtree.obj = obj->children[0];
  206. nb_workers = find_and_assign_combinations_with_hwloc_recursive(&subtree, min_size, max_size);
  207. tree->nb_workers = nb_workers;
  208. return nb_workers;
  209. }
  210. /* We recursively go to the leaves of the tree to find subtrees which have the biggest number of
  211. * CPU leaves that fits between min and max. */
  212. /* We allocate an array of tree structures which will contain the current node's subtrees data */
  213. struct _starpu_tree *subtrees = (struct _starpu_tree *) malloc(obj->arity * sizeof(struct _starpu_tree));
  214. /* We allocate the array containing the workers of each subtree and initialize the fields left */
  215. for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
  216. {
  217. struct _starpu_tree *subtree = subtrees + subtree_id;
  218. subtree->obj = obj->children[subtree_id];
  219. subtree->nb_workers = 0;
  220. subtree->workers = (int *) malloc(config->topology.nhwcpus * sizeof(int));
  221. }
  222. /* We recursively go through every subtree and get all the workers which are not assigned yet */
  223. for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
  224. nb_workers += find_and_assign_combinations_with_hwloc_recursive(subtrees + subtree_id, min_size, max_size);
  225. if(nb_workers > max_size)
  226. {
  227. /* We withdraw the number of workers just assigned from the total number of workers */
  228. nb_workers -= assign_multiple_trees(subtrees, obj->arity, min_size, max_size);
  229. /* Some workers are not assigned yet : we gather them in the array
  230. * which is returned to the father which will handle them later */
  231. if(nb_workers)
  232. gather_trees(tree, subtrees, obj->arity);
  233. }
  234. else if(nb_workers < max_size)
  235. {
  236. gather_trees(tree, subtrees, obj->arity);
  237. }
  238. else // nb_workers == max_size
  239. {
  240. gather_trees(tree, subtrees, obj->arity);
  241. int ret = starpu_combined_worker_assign_workerid(nb_workers, workers);
  242. STARPU_ASSERT(ret >= 0);
  243. nb_workers = 0;
  244. }
  245. for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
  246. free(subtrees[subtree_id].workers);
  247. free(subtrees);
  248. tree->nb_workers = nb_workers;
  249. return nb_workers;
  250. }
  251. /* get_min_max_sizes
  252. * =================================================
  253. * Purpose
  254. * =======
  255. * First, try to get the value from the STARPU_MIN_WORKERSIZE and STARPU_MAX_WORKERSIZE
  256. * environment variables.
  257. * If both of them were not set, then we try do get some efficient values following the rule beneath :
  258. *
  259. * --> exact --> MIN_SIZE = S-1 <--> MAX_SIZE = S+1
  260. * S = square_root(nb_cpus)
  261. * --> decimal --> MIN_SIZE = truncation(S) <--> MAX_SIZE = rounding_up(S)
  262. *
  263. * If only one of both was not set then we set it with a value relative to the other, for example :
  264. *
  265. * MIN_SIZE = MAX_SIZE - 1 or MAX_SIZE = MIN_SIZE + 1
  266. *
  267. * Arguments
  268. * =========
  269. * min_size (output)
  270. * Pointer to the minimum size of a combined worker, whether set with
  271. * value given by the user or processed from the number of cpus.
  272. *
  273. * max_size (output)
  274. * Pointer to the maximum size of a combined worker, whether set with
  275. * value given by the user or processed from the number of cpus.
  276. *
  277. * topology (input)
  278. * Topology of the machine : used to know the number of cpus.
  279. */
  280. static void get_min_max_sizes(unsigned int *min_size, unsigned int *max_size, struct starpu_machine_topology *topology)
  281. {
  282. int _min_size, _max_size;
  283. _min_size = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
  284. _max_size = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
  285. /* If the user didn't set both the environment variables,
  286. * we need to find a minimum and a maximum size ourselves */
  287. if(_min_size <= -1 || _max_size <= -1)
  288. {
  289. int nb_cpus = topology->nhwcpus;
  290. int sqrt_nb_cpus = (int)sqrt((double)nb_cpus);
  291. int exact = (sqrt_nb_cpus * sqrt_nb_cpus == nb_cpus);
  292. if(_min_size == -1)
  293. {
  294. if(_max_size > -1)
  295. _min_size = _max_size - 1;
  296. else
  297. _min_size = exact ? sqrt_nb_cpus - 1 : sqrt_nb_cpus;
  298. }
  299. if(_max_size == -1)
  300. {
  301. if(_min_size > -1)
  302. _max_size = _min_size + 1;
  303. else
  304. _max_size = sqrt_nb_cpus + 1;
  305. }
  306. }
  307. *min_size = _min_size;
  308. *max_size = _max_size;
  309. return;
  310. }
  311. /* find_and_assign_combinations_with_hwloc
  312. * =======================================
  313. * Purpose
  314. * =======
  315. * Launches find_and_assign_combinations_with_hwloc_recursive function on the root
  316. * of the hwloc tree to gather and assign combined cpu workers in an efficient manner.
  317. * When find_and_assign_combinations_with_hwloc_recursive returns, if there are still
  318. * some workers, we assign them no matter the number for there is no way to respect
  319. * the wanted sizes anymore.
  320. *
  321. * Arguments
  322. * =========
  323. * topology (input)
  324. * Topology of the machine : used to know the number of cpus and
  325. * to get the hwloc tree.
  326. */
  327. static void find_and_assign_combinations_with_hwloc(struct starpu_machine_topology *topology)
  328. {
  329. unsigned nb_workers;
  330. unsigned int min_size, max_size;
  331. get_min_max_sizes(&min_size, &max_size, topology);
  332. STARPU_ASSERT(min_size <= max_size);
  333. struct _starpu_tree tree;
  334. /* Of course we start from the root */
  335. tree.obj = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0);
  336. tree.nb_workers = 0;
  337. tree.workers = (int *) malloc(topology->nhwcpus * sizeof(int));
  338. /* We recursively go from the root to the leaves of the tree to find
  339. * subtrees that only have CPUs as leaves. */
  340. nb_workers = find_and_assign_combinations_with_hwloc_recursive(&tree, min_size, max_size);
  341. /* There are still some workers left, since the only possibility is that
  342. * the number of workers left is less than the minimum worker size we assign them all */
  343. if(nb_workers > 0)
  344. {
  345. /* find_and_assign_combinations_with_hwloc_recursive shouldn't return
  346. * while there are enough workers to assign regarding the min_size value */
  347. STARPU_ASSERT(nb_workers <= max_size);
  348. int ret = starpu_combined_worker_assign_workerid(nb_workers, tree.workers);
  349. STARPU_ASSERT(ret >= 0);
  350. }
  351. free(tree.workers);
  352. }
  353. #else /* STARPU_HAVE_HWLOC */
  354. static void find_and_assign_combinations_without_hwloc(struct starpu_machine_topology *topology)
  355. {
  356. struct _starpu_machine_config *config = _starpu_get_machine_config();
  357. /* We put the id of all CPU workers in this array */
  358. int cpu_workers[STARPU_NMAXWORKERS];
  359. unsigned ncpus = 0;
  360. unsigned i;
  361. for (i = 0; i < topology->nworkers; i++)
  362. {
  363. if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
  364. cpu_workers[ncpus++] = i;
  365. }
  366. unsigned size;
  367. for (size = 2; size <= ncpus; size *= 2)
  368. {
  369. unsigned first_cpu;
  370. for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
  371. {
  372. if (first_cpu + size <= ncpus)
  373. {
  374. int workerids[size];
  375. for (i = 0; i < size; i++)
  376. workerids[i] = cpu_workers[first_cpu + i];
  377. /* We register this combination */
  378. int ret;
  379. ret = starpu_combined_worker_assign_workerid(size, workerids);
  380. STARPU_ASSERT(ret >= 0);
  381. }
  382. }
  383. }
  384. }
  385. #endif /* STARPU_HAVE_HWLOC */
  386. static void combine_all_cpu_workers(struct starpu_machine_topology *topology)
  387. {
  388. struct _starpu_machine_config *config = _starpu_get_machine_config();
  389. int cpu_workers[STARPU_NMAXWORKERS];
  390. unsigned ncpus = 0;
  391. unsigned i;
  392. for (i = 0; i < topology->nworkers; i++)
  393. {
  394. if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
  395. cpu_workers[ncpus++] = i;
  396. }
  397. for (i = 1; i <= ncpus; i++)
  398. {
  399. int ret;
  400. ret = starpu_combined_worker_assign_workerid(i, cpu_workers);
  401. STARPU_ASSERT(ret >= 0);
  402. }
  403. }
  404. void _starpu_sched_find_worker_combinations(struct starpu_machine_topology *topology)
  405. {
  406. struct _starpu_machine_config *config = _starpu_get_machine_config();
  407. if (config->conf->single_combined_worker > 0)
  408. combine_all_cpu_workers(topology);
  409. else
  410. {
  411. #ifdef STARPU_HAVE_HWLOC
  412. find_and_assign_combinations_with_hwloc(topology);
  413. #else
  414. find_and_assign_combinations_without_hwloc(topology);
  415. #endif
  416. }
  417. }