sc_hypervisor_monitoring.h 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  /* StarPU --- Runtime system for heterogeneous multicore architectures.
   *
   * Copyright (C) 2011,2013,2014 Inria
   * Copyright (C) 2015 Mathieu Lirzin
   * Copyright (C) 2013,2017 CNRS
   *
   * StarPU is free software; you can redistribute it and/or modify
   * it under the terms of the GNU Lesser General Public License as published by
   * the Free Software Foundation; either version 2.1 of the License, or (at
   * your option) any later version.
   *
   * StarPU is distributed in the hope that it will be useful, but
   * WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   *
   * See the GNU Lesser General Public License in COPYING.LGPL for more details.
   */
  18. #ifndef SC_HYPERVISOR_MONITORING_H
  19. #define SC_HYPERVISOR_MONITORING_H
  20. #include <sc_hypervisor.h>
  21. #ifdef __cplusplus
  22. extern "C"
  23. {
  24. #endif
  /*
   * Acknowledgement record of a resize: indicates when the moving of workers
   * was actually done (the moved workers can then be seen in the new context).
   *
   * NOTE(review): the two pointer members presumably refer to arrays of
   * nmoved_workers entries; ownership and exact layout are not visible from
   * this header -- confirm against the implementation.
   */
  struct sc_hypervisor_resize_ack
  {
      /* receiver context */
      int receiver_sched_ctx;

      /* list of workers required to be moved */
      int *moved_workers;

      /* number of workers required to be moved */
      int nmoved_workers;

      /* list of workers that actually got into the receiver context */
      int *acked_workers;
  };
  38. /* wrapper attached to a sched_ctx storing monitoring information */
  39. struct sc_hypervisor_wrapper
  40. {
  41. /* the sched_ctx it monitors */
  42. unsigned sched_ctx;
  43. /* user configuration meant to limit resizing */
  44. struct sc_hypervisor_policy_config *config;
  45. /* the start time of the resizing sample of the workers of this context*/
  46. double start_time_w[STARPU_NMAXWORKERS];
  47. /* idle time of workers in this context */
  48. double current_idle_time[STARPU_NMAXWORKERS];
  49. /* idle time from the last resize */
  50. double idle_time[STARPU_NMAXWORKERS];
  51. /* time when the idle started */
  52. double idle_start_time[STARPU_NMAXWORKERS];
  53. /* time during which the worker executed tasks */
  54. double exec_time[STARPU_NMAXWORKERS];
  55. /* time when the worker started executing a task */
  56. double exec_start_time[STARPU_NMAXWORKERS];
  57. /* list of workers that will leave this contexts (lazy resizing process) */
  58. int worker_to_be_removed[STARPU_NMAXWORKERS];
  59. /* number of tasks pushed on each worker in this ctx */
  60. int pushed_tasks[STARPU_NMAXWORKERS];
  61. /* number of tasks poped from each worker in this ctx */
  62. int poped_tasks[STARPU_NMAXWORKERS];
  63. /* number of flops the context has to execute */
  64. double total_flops;
  65. /* number of flops executed since the beginning until now */
  66. double total_elapsed_flops[STARPU_NMAXWORKERS];
  67. /* number of flops executed since last resizing */
  68. double elapsed_flops[STARPU_NMAXWORKERS];
  69. /* data quantity executed on each worker in this ctx */
  70. size_t elapsed_data[STARPU_NMAXWORKERS];
  71. /* nr of tasks executed on each worker in this ctx */
  72. int elapsed_tasks[STARPU_NMAXWORKERS];
  73. /* the average speed of the type of workers when they belonged to this context */
  74. /* 0 - cuda 1 - cpu */
  75. double ref_speed[2];
  76. /* number of flops submitted to this ctx */
  77. double submitted_flops;
  78. /* number of flops that still have to be executed in this ctx */
  79. double remaining_flops;
  80. /* the start time of the resizing sample of this context*/
  81. double start_time;
  82. /* the first time a task was pushed to this context*/
  83. double real_start_time;
  84. /* the start time for sample in which the hyp is not allowed to react
  85. bc too expensive */
  86. double hyp_react_start_time;
  87. /* the workers don't leave the current ctx until the receiver ctx
  88. doesn't ack the receive of these workers */
  89. struct sc_hypervisor_resize_ack resize_ack;
  90. /* mutex to protect the ack of workers */
  91. starpu_pthread_mutex_t mutex;
  92. /* boolean indicating if the resizing strategy can see the
  93. flops of all the execution or not */
  94. unsigned total_flops_available;
  95. /* boolean indicating that a context is being sized */
  96. unsigned to_be_sized;
  97. /* boolean indicating if we add the idle of this worker to
  98. the idle of the context */
  99. unsigned compute_idle[STARPU_NMAXWORKERS];
  100. /* boolean indicating if we add the entiere idle of this
  101. worker to the idle of the context or just half*/
  102. unsigned compute_partial_idle[STARPU_NMAXWORKERS];
  103. /* consider the max in the lp */
  104. unsigned consider_max;
  105. };
  /* Return the wrapper of the context sched_ctx, i.e. the structure that
     saves its monitoring information. */
  struct sc_hypervisor_wrapper *sc_hypervisor_get_wrapper(unsigned sched_ctx);
  /* Get the list of registered contexts.
     Declared with (void) so that this is a real prototype in C: an empty
     parameter list would leave the arguments unchecked. */
  unsigned *sc_hypervisor_get_sched_ctxs(void);
  /* Get the number of registered contexts.
     Declared with (void) so that this is a real prototype in C: an empty
     parameter list would leave the arguments unchecked. */
  int sc_hypervisor_get_nsched_ctxs(void);
  /* Get the number of workers of a certain architecture (arch) in the
     context sched_ctx. */
  int sc_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_worker_archtype arch);
  /* Get the number of flops executed by a context since the last resizing
     (reset to 0 when a resizing is done). */
  double sc_hypervisor_get_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper *sc_w);
  /* Get the number of flops executed by a context since the beginning. */
  double sc_hypervisor_get_total_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper *sc_w);
  /* Compute an average value of the cpu/cuda speed.
     NOTE(review): the identifier carries a doubled "sc_hypervisor" prefix,
     unlike every other declaration in this header. It is kept unchanged
     because the matching definition is not visible from here -- confirm
     which name the implementation actually exports before renaming. */
  double sc_hypervisorsc_hypervisor_get_speed_per_worker_type(struct sc_hypervisor_wrapper *sc_w, enum starpu_worker_archtype arch);
  /* Compute the actual speed of all workers of a specific type of worker
     (arch). */
  double sc_hypervisor_get_speed(struct sc_hypervisor_wrapper *sc_w, enum starpu_worker_archtype arch);
  122. #ifdef __cplusplus
  123. }
  124. #endif
  125. #endif