  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010-2015 Inria
  4. * Copyright (C) 2016,2017 CNRS
  5. * Copyright (C) 2013,2014,2017 Université de Bordeaux
  6. *
  7. * StarPU is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as published by
  9. * the Free Software Foundation; either version 2.1 of the License, or (at
  10. * your option) any later version.
  11. *
  12. * StarPU is distributed in the hope that it will be useful, but
  13. * WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. *
  16. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. */
  18. #include "sc_hypervisor_policy.h"
  19. #include "sc_hypervisor_intern.h"
  20. #include <math.h>
  21. double sc_hypervisor_get_ctx_speed(struct sc_hypervisor_wrapper* sc_w)
  22. {
  23. struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
  24. double elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
  25. double sample = config->ispeed_ctx_sample;
  26. double total_elapsed_flops = sc_hypervisor_get_total_elapsed_flops_per_sched_ctx(sc_w);
  27. double total_flops = sc_w->total_flops;
  28. char *start_sample_prc_char = getenv("SC_HYPERVISOR_START_RESIZE");
  29. double start_sample_prc = start_sample_prc_char ? atof(start_sample_prc_char) : 0.0;
  30. double start_sample = start_sample_prc > 0.0 ? (start_sample_prc / 100) * total_flops : sample;
  31. double redim_sample = elapsed_flops == total_elapsed_flops ? (start_sample > 0.0 ? start_sample : sample) : sample;
  32. double curr_time = starpu_timing_now();
  33. double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
  34. unsigned can_compute_speed = 0;
  35. char *speed_sample_criteria = getenv("SC_HYPERVISOR_SAMPLE_CRITERIA");
  36. if(speed_sample_criteria && (strcmp(speed_sample_criteria, "time") == 0))
  37. can_compute_speed = elapsed_time > config->time_sample;
  38. else
  39. can_compute_speed = elapsed_flops >= redim_sample;
  40. if(can_compute_speed)
  41. {
  42. return (elapsed_flops/1000000000.0)/elapsed_time;/* in Gflops/s */
  43. }
  44. return -1.0;
  45. }
  46. double sc_hypervisor_get_speed_per_worker(struct sc_hypervisor_wrapper *sc_w, unsigned worker)
  47. {
  48. if(!starpu_sched_ctx_contains_worker(worker, sc_w->sched_ctx))
  49. return -1.0;
  50. double elapsed_flops = sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
  51. struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
  52. double sample = config->ispeed_w_sample[worker] / 1000000000.0; /*in gflops */
  53. double ctx_elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
  54. double ctx_sample = config->ispeed_ctx_sample;
  55. if(ctx_elapsed_flops > ctx_sample && elapsed_flops == 0.0)
  56. return 0.00000000000001;
  57. if( elapsed_flops > sample)
  58. {
  59. double curr_time = starpu_timing_now();
  60. double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
  61. elapsed_time -= sc_w->idle_time[worker];
  62. /* size_t elapsed_data_used = sc_w->elapsed_data[worker]; */
  63. /* enum starpu_worker_archtype arch = starpu_worker_get_type(worker); */
  64. /* if(arch == STARPU_CUDA_WORKER) */
  65. /* { */
  66. /* /\* unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, sc_w->sched_ctx); *\/ */
  67. /* /\* if(!worker_in_ctx) *\/ */
  68. /* /\* { *\/ */
  69. /* /\* double transfer_speed = starpu_transfer_bandwidth(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker)); *\/ */
  70. /* /\* elapsed_time += (elapsed_data_used / transfer_speed) / 1000000 ; *\/ */
  71. /* /\* } *\/ */
  72. /* double latency = starpu_transfer_latency(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker)); */
  73. /* // printf("%d/%d: latency %lf elapsed_time before %lf ntasks %d\n", worker, sc_w->sched_ctx, latency, elapsed_time, elapsed_tasks); */
  74. /* elapsed_time += (elapsed_tasks * latency)/1000000; */
  75. /* // printf("elapsed time after %lf \n", elapsed_time); */
  76. /* } */
  77. double vel = (elapsed_flops/elapsed_time);/* in Gflops/s */
  78. return vel;
  79. }
  80. return -1.0;
  81. }
  82. /* compute an average value of the cpu/cuda speed */
/* compute an average value of the cpu/cuda speed */
/* Returns the per-worker average speed (GFlop/s) of the workers of `arch`
   in the context, or -1.0 when the sample window has not elapsed or no
   flops were measured.  As a side effect, folds the measurement into
   sc_w->ref_speed[0] (CUDA) or sc_w->ref_speed[1] (other archs). */
double sc_hypervisor_get_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch)
{
	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
	double ctx_elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
	double ctx_sample = config->ispeed_ctx_sample;

	double curr_time = starpu_timing_now();
	double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */

	/* Decide whether enough of the sample window has elapsed: the criterion
	   is elapsed time or elapsed flops, per SC_HYPERVISOR_SAMPLE_CRITERIA. */
	unsigned can_compute_speed = 0;
	char *speed_sample_criteria = getenv("SC_HYPERVISOR_SAMPLE_CRITERIA");
	if(speed_sample_criteria && (strcmp(speed_sample_criteria, "time") == 0))
		can_compute_speed = elapsed_time > config->time_sample;
	else
		can_compute_speed = ctx_elapsed_flops > ctx_sample;

	if(can_compute_speed)
	{
		if(ctx_elapsed_flops == 0.0) return -1.0;

		struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
		int worker;

		struct starpu_sched_ctx_iterator it;
		workers->init_iterator(workers, &it);

		double speed = 0.0;
		unsigned nworkers = 0;
		double all_workers_flops = 0.0;
		double max_workers_idle_time = 0.0;
		/* Accumulate the flops executed by every worker of the requested
		   arch that is flagged for idle accounting. */
		while(workers->has_next(workers, &it))
		{
			worker = workers->get_next(workers, &it);
			enum starpu_worker_archtype req_arch = starpu_worker_get_type(worker);
			if(arch == req_arch && sc_w->compute_idle[worker])
			{
				/* A task is currently running on the worker: credit the
				   flops it is presumed to have completed so far, estimated
				   from the historical reference speed for this arch. */
				if(sc_w->exec_start_time[worker] != 0.0)
				{
					double current_exec_time = 0.0;
					/* The task started before this sample window: only
					   count the part of it inside the window. */
					if(sc_w->exec_start_time[worker] < sc_w->start_time)
						current_exec_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
					else
						current_exec_time = (curr_time - sc_w->exec_start_time[worker]) / 1000000.0; /* in seconds */
					double suppl_flops = current_exec_time * sc_hypervisor_get_ref_speed_per_worker_type(sc_w, req_arch);
					all_workers_flops += suppl_flops;
				}

				all_workers_flops += sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
				if(max_workers_idle_time < sc_w->idle_time[worker])
					max_workers_idle_time = sc_w->idle_time[worker]; /* in seconds */
				nworkers++;
			}
		}

		if(nworkers != 0 && all_workers_flops > 0.0001)
		{
			// elapsed_time -= max_workers_idle_time;
			speed = (all_workers_flops / elapsed_time) / nworkers;
		}
		else
			speed = -1.0;

		if(speed != -1.0)
		{
			/* if ref_speed started being corrupted bc of the old bad distribution
			   register only the last frame otherwise make the average with the speed
			   behavior of the application until now */
			if(arch == STARPU_CUDA_WORKER)
				sc_w->ref_speed[0] = (sc_w->ref_speed[0] > 0.1) ? ((sc_w->ref_speed[0] + speed ) / 2.0) : speed;
			else
				sc_w->ref_speed[1] = (sc_w->ref_speed[1] > 0.1) ? ((sc_w->ref_speed[1] + speed ) / 2.0) : speed;
		}
		return speed;
	}
	return -1.0;
}
  150. /* compute an average value of the cpu/cuda old speed */
  151. double sc_hypervisor_get_ref_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch)
  152. {
  153. if(arch == STARPU_CUDA_WORKER && sc_w->ref_speed[0] > 0.0)
  154. return sc_w->ref_speed[0];
  155. else
  156. if(arch == STARPU_CPU_WORKER && sc_w->ref_speed[1] > 0.0)
  157. return sc_w->ref_speed[1];
  158. return -1.0;
  159. }
  160. /* returns the speed necessary for the linear programs (either the monitored one either a default value) */
  161. double sc_hypervisor_get_speed(struct sc_hypervisor_wrapper *sc_w, enum starpu_worker_archtype arch)
  162. {
  163. /* monitored speed in the last frame */
  164. double speed = sc_hypervisor_get_speed_per_worker_type(sc_w, arch);
  165. if(speed == -1.0)
  166. {
  167. /* avg value of the monitored speed over the entier current execution */
  168. speed = sc_hypervisor_get_ref_speed_per_worker_type(sc_w, arch);
  169. }
  170. if(speed == -1.0)
  171. {
  172. /* a default value */
  173. speed = arch == STARPU_CPU_WORKER ? SC_HYPERVISOR_DEFAULT_CPU_SPEED : SC_HYPERVISOR_DEFAULT_CUDA_SPEED;
  174. }
  175. return speed;
  176. }
  177. double sc_hypervisor_get_avg_speed(enum starpu_worker_archtype arch)
  178. {
  179. double total_executed_flops = 0.0;
  180. double total_estimated_flops = 0.0;
  181. struct sc_hypervisor_wrapper *sc_w;
  182. double max_real_start_time = 0.0;
  183. int s;
  184. unsigned nworkers = starpu_worker_get_count_by_type(arch);
  185. unsigned *sched_ctxs;
  186. int nsched_ctxs;
  187. sc_hypervisor_get_ctxs_on_level(&sched_ctxs, &nsched_ctxs, 0, STARPU_NMAX_SCHED_CTXS);
  188. for(s = 0; s < nsched_ctxs; s++)
  189. {
  190. sc_w = sc_hypervisor_get_wrapper(sched_ctxs[s]);
  191. struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctxs[s]);
  192. int worker;
  193. struct starpu_sched_ctx_iterator it;
  194. workers->init_iterator(workers, &it);
  195. while(workers->has_next(workers, &it))
  196. {
  197. worker = workers->get_next(workers, &it);
  198. enum starpu_worker_archtype req_arch = starpu_worker_get_type(worker);
  199. if(arch == req_arch)
  200. {
  201. total_executed_flops += sc_w->total_elapsed_flops[worker] / 1000000000.0; /*in gflops */;
  202. }
  203. }
  204. total_estimated_flops += sc_w->total_flops / 1000000000.0; /*in gflops */
  205. if(max_real_start_time < sc_w->real_start_time)
  206. max_real_start_time = sc_w->real_start_time;
  207. }
  208. free(sched_ctxs);
  209. double speed = -1.0;
  210. #ifdef STARPU_SC_HYPERVISOR_DEBUG
  211. printf("total_exec_flops %lf total_estimated_flops %lf max_real_start_time %lf nworkers %u \n", total_executed_flops, total_estimated_flops, max_real_start_time, nworkers);
  212. #endif
  213. if(total_executed_flops > 0.5*total_estimated_flops)
  214. {
  215. double curr_time = starpu_timing_now();
  216. double time = (curr_time - max_real_start_time) / 1000000.0; /* in seconds */
  217. #ifdef STARPU_SC_HYPERVISOR_DEBUG
  218. printf("time = %lf\n", time);
  219. #endif
  220. speed = (total_executed_flops / time) / nworkers;
  221. }
  222. return speed;
  223. }
  224. void _consider_max_for_children(unsigned sched_ctx, unsigned consider_max)
  225. {
  226. struct sc_hypervisor_wrapper *sc_w = sc_hypervisor_get_wrapper(sched_ctx);
  227. sc_w->consider_max = consider_max;
  228. #ifdef STARPU_SC_HYPERVISOR_DEBUG
  229. printf("ctx %u consider max %d \n", sched_ctx, sc_w->consider_max);
  230. #endif
  231. int level = starpu_sched_ctx_get_hierarchy_level(sched_ctx);
  232. unsigned *sched_ctxs_child;
  233. int nsched_ctxs_child = 0;
  234. sc_hypervisor_get_ctxs_on_level(&sched_ctxs_child, &nsched_ctxs_child, level+1, sched_ctx);
  235. int s;
  236. for(s = 0; s < nsched_ctxs_child; s++)
  237. _consider_max_for_children(sched_ctxs_child[s], consider_max);
  238. free(sched_ctxs_child);
  239. }
  240. void sc_hypervisor_check_if_consider_max(struct types_of_workers *tw)
  241. {
  242. unsigned *sched_ctxs;
  243. int nsched_ctxs;
  244. sc_hypervisor_get_ctxs_on_level(&sched_ctxs, &nsched_ctxs, 0, STARPU_NMAX_SCHED_CTXS);
  245. int nw = tw->nw;
  246. double avg_speed_per_tw[nw];
  247. int w;
  248. for(w = 0; w < nw; w++)
  249. {
  250. avg_speed_per_tw[w] = sc_hypervisor_get_avg_speed(sc_hypervisor_get_arch_for_index(w, tw));
  251. if(avg_speed_per_tw[w] == -1.0)
  252. {
  253. free(sched_ctxs);
  254. return;
  255. }
  256. }
  257. int s;
  258. for(s = 0; s < nsched_ctxs; s++)
  259. {
  260. for(w = 0; w < nw; w++)
  261. {
  262. struct sc_hypervisor_wrapper *sc_w = sc_hypervisor_get_wrapper(sched_ctxs[s]);
  263. double speed = sc_hypervisor_get_speed(sc_w, sc_hypervisor_get_arch_for_index(w, tw));
  264. #ifdef STARPU_SC_HYPERVISOR_DEBUG
  265. printf("%u: speed %lf avg_speed %lf min %lf max %lf\n", sched_ctxs[s], speed, avg_speed_per_tw[w], (avg_speed_per_tw[w]*0.5), (avg_speed_per_tw[w]*1.5));
  266. #endif
  267. if(speed < avg_speed_per_tw[w]*0.5 || speed > avg_speed_per_tw[w]*1.5)
  268. _consider_max_for_children(sched_ctxs[s], 1);
  269. else
  270. _consider_max_for_children(sched_ctxs[s], 0);
  271. }
  272. }
  273. free(sched_ctxs);
  274. }