sc_hypervisor_monitoring.h 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2011,2013,2014 Inria
  4. * Copyright (C) 2015 Mathieu Lirzin
  5. * Copyright (C) 2013,2017,2019 CNRS
  6. *
  7. * StarPU is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as published by
  9. * the Free Software Foundation; either version 2.1 of the License, or (at
  10. * your option) any later version.
  11. *
  12. * StarPU is distributed in the hope that it will be useful, but
  13. * WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. *
  16. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. */
  18. #ifndef SC_HYPERVISOR_MONITORING_H
  19. #define SC_HYPERVISOR_MONITORING_H
  20. #include <sc_hypervisor.h>
  21. #ifdef __cplusplus
  22. extern "C"
  23. {
  24. #endif
  25. /**
  26. @ingroup API_SC_Hypervisor
  27. @{
  28. */
  29. /**
  30. Structure to check if the workers moved to another context are
  31. actually taken into account in that context.
  32. */
  33. struct sc_hypervisor_resize_ack
  34. {
  35. /**
  36. The context receiving the new workers
  37. */
  38. int receiver_sched_ctx;
  39. /**
  40. List of workers required to be moved
  41. */
  42. int *moved_workers;
  43. /**
  44. Number of workers required to be moved
  45. */
  46. int nmoved_workers;
  47. /**
  48. List of workers that actually got in the receiver ctx. If
  49. the value corresponding to a worker is 1, this worker got
  50. moved in the new context.
  51. */
  52. int *acked_workers;
  53. };
  54. /**
  55. Wrapper of the contexts available in StarPU which contains all
  56. information about a context obtained by incrementing the
  57. performance counters. it is attached to a sched_ctx storing
  58. monitoring information
  59. */
  60. struct sc_hypervisor_wrapper
  61. {
  62. /**
  63. the monitored context
  64. */
  65. unsigned sched_ctx;
  66. /**
  67. The corresponding resize configuration
  68. */
  69. struct sc_hypervisor_policy_config *config;
  70. /**
  71. the start time of the resizing sample of the workers of
  72. this context
  73. */
  74. double start_time_w[STARPU_NMAXWORKERS];
  75. /**
  76. The idle time counter of each worker of the context
  77. */
  78. double current_idle_time[STARPU_NMAXWORKERS];
  79. /**
  80. The time the workers were idle from the last resize
  81. */
  82. double idle_time[STARPU_NMAXWORKERS];
  83. /**
  84. The moment when the workers started being idle
  85. */
  86. double idle_start_time[STARPU_NMAXWORKERS];
  87. /**
  88. Time during which the worker executed tasks
  89. */
  90. double exec_time[STARPU_NMAXWORKERS];
  91. /**
  92. Time when the worker started executing a task
  93. */
  94. double exec_start_time[STARPU_NMAXWORKERS];
  95. /**
  96. List of workers that will leave the context (lazy resizing
  97. process)
  98. */
  99. int worker_to_be_removed[STARPU_NMAXWORKERS];
  100. /**
  101. Number of tasks pushed on each worker in this context
  102. */
  103. int pushed_tasks[STARPU_NMAXWORKERS];
  104. /**
  105. Number of tasks poped from each worker in this context
  106. */
  107. int poped_tasks[STARPU_NMAXWORKERS];
  108. /**
  109. The total number of flops to execute by the context
  110. */
  111. double total_flops;
  112. /**
  113. The number of flops executed by each workers of the context
  114. */
  115. double total_elapsed_flops[STARPU_NMAXWORKERS];
  116. /**
  117. number of flops executed since last resizing
  118. */
  119. double elapsed_flops[STARPU_NMAXWORKERS];
  120. /**
  121. Quantity of data (in bytes) used to execute tasks on each
  122. worker in this context
  123. */
  124. size_t elapsed_data[STARPU_NMAXWORKERS];
  125. /**
  126. Number of tasks executed on each worker in this context
  127. */
  128. int elapsed_tasks[STARPU_NMAXWORKERS];
  129. /**
  130. the average speed of the type of workers when they belonged
  131. to this context
  132. 0 - cuda 1 - cpu
  133. */
  134. double ref_speed[2];
  135. /**
  136. Number of flops submitted to this context
  137. */
  138. double submitted_flops;
  139. /**
  140. Number of flops that still have to be executed by the
  141. workers in this context
  142. */
  143. double remaining_flops;
  144. /**
  145. Start time of the resizing sample of this context
  146. */
  147. double start_time;
  148. /**
  149. First time a task was pushed to this context
  150. */
  151. double real_start_time;
  152. /**
  153. Start time for sample in which the hypervisor is not allowed to
  154. react bc too expensive */
  155. double hyp_react_start_time;
  156. /**
  157. Structure confirming the last resize finished and a new one
  158. can be done.
  159. Workers do not leave the current context until the receiver
  160. context does not ack the receive of these workers
  161. */
  162. struct sc_hypervisor_resize_ack resize_ack;
  163. /**
  164. Mutex needed to synchronize the acknowledgment of the
  165. workers into the receiver context
  166. */
  167. starpu_pthread_mutex_t mutex;
  168. /**
  169. Boolean indicating if the hypervisor can use the flops
  170. corresponding to the entire execution of the context
  171. */
  172. unsigned total_flops_available;
  173. /**
  174. boolean indicating that a context is being sized
  175. */
  176. unsigned to_be_sized;
  177. /**
  178. Boolean indicating if we add the idle of this worker to the
  179. idle of the context
  180. */
  181. unsigned compute_idle[STARPU_NMAXWORKERS];
  182. /**
  183. Boolean indicating if we add the entiere idle of this
  184. worker to the idle of the context or just half
  185. */
  186. unsigned compute_partial_idle[STARPU_NMAXWORKERS];
  187. /**
  188. consider the max in the lp
  189. */
  190. unsigned consider_max;
  191. };
  192. /**
  193. Return the wrapper of the given context
  194. @ingroup API_SC_Hypervisor
  195. */
  196. struct sc_hypervisor_wrapper *sc_hypervisor_get_wrapper(unsigned sched_ctx);
  197. /**
  198. Get the list of registered contexts
  199. @ingroup API_SC_Hypervisor
  200. */
  201. unsigned *sc_hypervisor_get_sched_ctxs();
  202. /**
  203. Get the number of registered contexts
  204. @ingroup API_SC_Hypervisor
  205. */
  206. int sc_hypervisor_get_nsched_ctxs();
  207. /**
  208. Get the number of workers of a certain architecture in a context
  209. */
  210. int sc_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_worker_archtype arch);
  211. /**
  212. Get the number of flops executed by a context since last resizing
  213. (reset to 0 when a resizing is done)
  214. @ingroup API_SC_Hypervisor
  215. */
  216. double sc_hypervisor_get_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper *sc_w);
  217. /**
  218. Get the number of flops executed by a context since the begining
  219. */
  220. double sc_hypervisor_get_total_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper* sc_w);
  221. /**
  222. Compute an average value of the cpu/cuda speed
  223. */
  224. double sc_hypervisorsc_hypervisor_get_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch);
  225. /**
  226. Compte the actual speed of all workers of a specific type of worker
  227. */
  228. double sc_hypervisor_get_speed(struct sc_hypervisor_wrapper *sc_w, enum starpu_worker_archtype arch);
  229. /** @} */
  230. #ifdef __cplusplus
  231. }
  232. #endif
  233. #endif