sc_hypervisor_monitoring.h 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2011-2020 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. * Copyright (C) 2015 Mathieu Lirzin
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #ifndef SC_HYPERVISOR_MONITORING_H
  18. #define SC_HYPERVISOR_MONITORING_H
  19. #include <sc_hypervisor.h>
  20. #ifdef __cplusplus
  21. extern "C"
  22. {
  23. #endif
  24. /**
  25. @ingroup API_SC_Hypervisor
  26. @{
  27. */
  28. /**
  29. Acknowledgment record used to check whether the workers moved to
  30. another context have actually been taken into account in that context.
  31. */
  32. struct sc_hypervisor_resize_ack
  33. {
  34. /**
  35. The context receiving the new workers
  36. */
  37. int receiver_sched_ctx;
  38. /**
  39. List of the workers that were requested to be moved
  40. */
  41. int *moved_workers;
  42. /**
  43. Number of workers that were requested to be moved
  44. */
  45. int nmoved_workers;
  46. /**
  47. List of acknowledgment flags for the workers that actually got
  48. into the receiver context. If the value corresponding to a worker
  49. is 1, this worker has been moved to the new context.
  50. */
  51. int *acked_workers;
  52. };
  53. /**
  54. Wrapper of the contexts available in StarPU which contains all
  55. information about a context obtained by incrementing the
  56. performance counters. It is attached to a sched_ctx and stores
  57. monitoring information.
  58. */
  59. struct sc_hypervisor_wrapper
  60. {
  61. /**
  62. The monitored context
  63. */
  64. unsigned sched_ctx;
  65. /**
  66. The corresponding resize configuration
  67. */
  68. struct sc_hypervisor_policy_config *config;
  69. /**
  70. The start time of the resizing sample of the workers of
  71. this context
  72. */
  73. double start_time_w[STARPU_NMAXWORKERS];
  74. /**
  75. The idle time counter of each worker of the context
  76. */
  77. double current_idle_time[STARPU_NMAXWORKERS];
  78. /**
  79. The time the workers have been idle since the last resize
  80. */
  81. double idle_time[STARPU_NMAXWORKERS];
  82. /**
  83. The moment when the workers started being idle
  84. */
  85. double idle_start_time[STARPU_NMAXWORKERS];
  86. /**
  87. Time during which the worker executed tasks
  88. */
  89. double exec_time[STARPU_NMAXWORKERS];
  90. /**
  91. Time when the worker started executing a task
  92. */
  93. double exec_start_time[STARPU_NMAXWORKERS];
  94. /**
  95. List of workers that will leave the context (lazy resizing
  96. process)
  97. */
  98. int worker_to_be_removed[STARPU_NMAXWORKERS];
  99. /**
  100. Number of tasks pushed on each worker in this context
  101. */
  102. int pushed_tasks[STARPU_NMAXWORKERS];
  103. /**
  104. Number of tasks popped from each worker in this context
  105. */
  106. int poped_tasks[STARPU_NMAXWORKERS];
  107. /**
  108. The total number of flops to be executed by the context
  109. */
  110. double total_flops;
  111. /**
  112. The number of flops executed by each worker of the context
  113. */
  114. double total_elapsed_flops[STARPU_NMAXWORKERS];
  115. /**
  116. Number of flops executed since the last resizing
  117. */
  118. double elapsed_flops[STARPU_NMAXWORKERS];
  119. /**
  120. Quantity of data (in bytes) used to execute tasks on each
  121. worker in this context
  122. */
  123. size_t elapsed_data[STARPU_NMAXWORKERS];
  124. /**
  125. Number of tasks executed on each worker in this context
  126. */
  127. int elapsed_tasks[STARPU_NMAXWORKERS];
  128. /**
  129. The average speed of each type of worker when it belonged
  130. to this context.
  131. Index 0 is for cuda workers, index 1 is for cpu workers.
  132. */
  133. double ref_speed[2];
  134. /**
  135. Number of flops submitted to this context
  136. */
  137. double submitted_flops;
  138. /**
  139. Number of flops that still have to be executed by the
  140. workers in this context
  141. */
  142. double remaining_flops;
  143. /**
  144. Start time of the resizing sample of this context
  145. */
  146. double start_time;
  147. /**
  148. First time a task was pushed to this context
  149. */
  150. double real_start_time;
  151. /**
  152. Start time of the sample during which the hypervisor is not allowed
  153. to react because reacting would be too expensive */
  154. double hyp_react_start_time;
  155. /**
  156. Structure confirming that the last resize has finished and that a
  157. new one can be done.
  158. Workers do not leave the current context until the receiver
  159. context has acknowledged the reception of these workers.
  160. */
  161. struct sc_hypervisor_resize_ack resize_ack;
  162. /**
  163. Mutex needed to synchronize the acknowledgment of the
  164. workers into the receiver context
  165. */
  166. starpu_pthread_mutex_t mutex;
  167. /**
  168. Boolean indicating if the hypervisor can use the flops
  169. corresponding to the entire execution of the context
  170. */
  171. unsigned total_flops_available;
  172. /**
  173. Boolean indicating that a context is being sized
  174. */
  175. unsigned to_be_sized;
  176. /**
  177. Boolean indicating if we add the idle time of this worker to the
  178. idle time of the context
  179. */
  180. unsigned compute_idle[STARPU_NMAXWORKERS];
  181. /**
  182. Boolean indicating if we add the entire idle time of this
  183. worker to the idle time of the context or just half of it
  184. */
  185. unsigned compute_partial_idle[STARPU_NMAXWORKERS];
  186. /**
  187. Consider the max in the lp (NOTE(review): "lp" presumably means the linear program used for resizing -- confirm)
  188. */
  189. unsigned consider_max;
  190. };
  191. /**
  192. Return the wrapper (struct sc_hypervisor_wrapper) of the context \p sched_ctx
  193. @ingroup API_SC_Hypervisor
  194. */
  195. struct sc_hypervisor_wrapper *sc_hypervisor_get_wrapper(unsigned sched_ctx);
  196. /**
  197. Get the list of registered contexts. The function takes no argument; it is declared with (void) so that C compilers check calls against a real prototype.
  198. @ingroup API_SC_Hypervisor
  199. */
  200. unsigned *sc_hypervisor_get_sched_ctxs(void);
  201. /**
  202. Get the number of registered contexts. The function takes no argument; it is declared with (void) so that C compilers check calls against a real prototype.
  203. @ingroup API_SC_Hypervisor
  204. */
  205. int sc_hypervisor_get_nsched_ctxs(void);
  206. /**
  207. Get the number of workers of the architecture \p arch in the context \p sched_ctx
  208. */
  209. int sc_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_worker_archtype arch);
  210. /**
  211. Get the number of flops executed by the context wrapped by \p sc_w since
  212. the last resizing (reset to 0 when a resizing is done)
  213. @ingroup API_SC_Hypervisor
  214. */
  215. double sc_hypervisor_get_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper *sc_w);
  216. /**
  217. Get the number of flops executed by the context wrapped by \p sc_w since the beginning
  218. */
  219. double sc_hypervisor_get_total_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper* sc_w);
  220. /**
  221. Compute an average value of the cpu/cuda speed. NOTE(review): the "sc_hypervisorsc_hypervisor_" prefix of this declaration looks accidentally doubled -- confirm against the definition and its callers before renaming.
  222. */
  223. double sc_hypervisorsc_hypervisor_get_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch);
  224. /**
  225. Compute the actual speed of all workers of a specific type of worker
  226. */
  227. double sc_hypervisor_get_speed(struct sc_hypervisor_wrapper *sc_w, enum starpu_worker_archtype arch);
  228. /** @} */
  229. #ifdef __cplusplus
  230. }
  231. #endif
  232. #endif