metrics.go 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271
  1. /*
  2. Copyright 2017 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package metrics
  14. import (
  15. "sync"
  16. "time"
  17. "github.com/prometheus/client_golang/prometheus"
  18. v1 "k8s.io/api/core/v1"
  19. "k8s.io/klog"
  20. metricutil "k8s.io/kubernetes/pkg/volume/util"
  21. )
  22. const (
  23. // Subsystem names.
  24. pvControllerSubsystem = "pv_collector"
  25. // Metric names.
  26. boundPVKey = "bound_pv_count"
  27. unboundPVKey = "unbound_pv_count"
  28. boundPVCKey = "bound_pvc_count"
  29. unboundPVCKey = "unbound_pvc_count"
  30. // Label names.
  31. namespaceLabel = "namespace"
  32. storageClassLabel = "storage_class"
  33. )
  34. var registerMetrics sync.Once
  35. // PVLister used to list persistent volumes.
  36. type PVLister interface {
  37. List() []interface{}
  38. }
  39. // PVCLister used to list persistent volume claims.
  40. type PVCLister interface {
  41. List() []interface{}
  42. }
  43. // Register all metrics for pv controller.
  44. func Register(pvLister PVLister, pvcLister PVCLister) {
  45. registerMetrics.Do(func() {
  46. prometheus.MustRegister(newPVAndPVCCountCollector(pvLister, pvcLister))
  47. prometheus.MustRegister(volumeOperationErrorsMetric)
  48. })
  49. }
  50. func newPVAndPVCCountCollector(pvLister PVLister, pvcLister PVCLister) *pvAndPVCCountCollector {
  51. return &pvAndPVCCountCollector{pvLister, pvcLister}
  52. }
  53. // Custom collector for current pod and container counts.
  54. type pvAndPVCCountCollector struct {
  55. // Cache for accessing information about PersistentVolumes.
  56. pvLister PVLister
  57. // Cache for accessing information about PersistentVolumeClaims.
  58. pvcLister PVCLister
  59. }
  60. var (
  61. boundPVCountDesc = prometheus.NewDesc(
  62. prometheus.BuildFQName("", pvControllerSubsystem, boundPVKey),
  63. "Gauge measuring number of persistent volume currently bound",
  64. []string{storageClassLabel}, nil)
  65. unboundPVCountDesc = prometheus.NewDesc(
  66. prometheus.BuildFQName("", pvControllerSubsystem, unboundPVKey),
  67. "Gauge measuring number of persistent volume currently unbound",
  68. []string{storageClassLabel}, nil)
  69. boundPVCCountDesc = prometheus.NewDesc(
  70. prometheus.BuildFQName("", pvControllerSubsystem, boundPVCKey),
  71. "Gauge measuring number of persistent volume claim currently bound",
  72. []string{namespaceLabel}, nil)
  73. unboundPVCCountDesc = prometheus.NewDesc(
  74. prometheus.BuildFQName("", pvControllerSubsystem, unboundPVCKey),
  75. "Gauge measuring number of persistent volume claim currently unbound",
  76. []string{namespaceLabel}, nil)
  77. volumeOperationErrorsMetric = prometheus.NewCounterVec(
  78. prometheus.CounterOpts{
  79. Name: "volume_operation_total_errors",
  80. Help: "Total volume operation erros",
  81. },
  82. []string{"plugin_name", "operation_name"})
  83. )
  84. func (collector *pvAndPVCCountCollector) Describe(ch chan<- *prometheus.Desc) {
  85. ch <- boundPVCountDesc
  86. ch <- unboundPVCountDesc
  87. ch <- boundPVCCountDesc
  88. ch <- unboundPVCCountDesc
  89. }
  90. func (collector *pvAndPVCCountCollector) Collect(ch chan<- prometheus.Metric) {
  91. collector.pvCollect(ch)
  92. collector.pvcCollect(ch)
  93. }
  94. func (collector *pvAndPVCCountCollector) pvCollect(ch chan<- prometheus.Metric) {
  95. boundNumberByStorageClass := make(map[string]int)
  96. unboundNumberByStorageClass := make(map[string]int)
  97. for _, pvObj := range collector.pvLister.List() {
  98. pv, ok := pvObj.(*v1.PersistentVolume)
  99. if !ok {
  100. continue
  101. }
  102. if pv.Status.Phase == v1.VolumeBound {
  103. boundNumberByStorageClass[pv.Spec.StorageClassName]++
  104. } else {
  105. unboundNumberByStorageClass[pv.Spec.StorageClassName]++
  106. }
  107. }
  108. for storageClassName, number := range boundNumberByStorageClass {
  109. metric, err := prometheus.NewConstMetric(
  110. boundPVCountDesc,
  111. prometheus.GaugeValue,
  112. float64(number),
  113. storageClassName)
  114. if err != nil {
  115. klog.Warningf("Create bound pv number metric failed: %v", err)
  116. continue
  117. }
  118. ch <- metric
  119. }
  120. for storageClassName, number := range unboundNumberByStorageClass {
  121. metric, err := prometheus.NewConstMetric(
  122. unboundPVCountDesc,
  123. prometheus.GaugeValue,
  124. float64(number),
  125. storageClassName)
  126. if err != nil {
  127. klog.Warningf("Create unbound pv number metric failed: %v", err)
  128. continue
  129. }
  130. ch <- metric
  131. }
  132. }
  133. func (collector *pvAndPVCCountCollector) pvcCollect(ch chan<- prometheus.Metric) {
  134. boundNumberByNamespace := make(map[string]int)
  135. unboundNumberByNamespace := make(map[string]int)
  136. for _, pvcObj := range collector.pvcLister.List() {
  137. pvc, ok := pvcObj.(*v1.PersistentVolumeClaim)
  138. if !ok {
  139. continue
  140. }
  141. if pvc.Status.Phase == v1.ClaimBound {
  142. boundNumberByNamespace[pvc.Namespace]++
  143. } else {
  144. unboundNumberByNamespace[pvc.Namespace]++
  145. }
  146. }
  147. for namespace, number := range boundNumberByNamespace {
  148. metric, err := prometheus.NewConstMetric(
  149. boundPVCCountDesc,
  150. prometheus.GaugeValue,
  151. float64(number),
  152. namespace)
  153. if err != nil {
  154. klog.Warningf("Create bound pvc number metric failed: %v", err)
  155. continue
  156. }
  157. ch <- metric
  158. }
  159. for namespace, number := range unboundNumberByNamespace {
  160. metric, err := prometheus.NewConstMetric(
  161. unboundPVCCountDesc,
  162. prometheus.GaugeValue,
  163. float64(number),
  164. namespace)
  165. if err != nil {
  166. klog.Warningf("Create unbound pvc number metric failed: %v", err)
  167. continue
  168. }
  169. ch <- metric
  170. }
  171. }
  172. // RecordVolumeOperationErrorMetric records error count into metric
  173. // volume_operation_total_errors for provisioning/deletion operations
  174. func RecordVolumeOperationErrorMetric(pluginName, opName string) {
  175. if pluginName == "" {
  176. pluginName = "N/A"
  177. }
  178. volumeOperationErrorsMetric.WithLabelValues(pluginName, opName).Inc()
  179. }
  180. // operationTimestamp stores the start time of an operation by a plugin
  181. type operationTimestamp struct {
  182. pluginName string
  183. operation string
  184. startTs time.Time
  185. }
  186. func newOperationTimestamp(pluginName, operationName string) *operationTimestamp {
  187. return &operationTimestamp{
  188. pluginName: pluginName,
  189. operation: operationName,
  190. startTs: time.Now(),
  191. }
  192. }
  193. // OperationStartTimeCache concurrent safe cache for operation start timestamps
  194. type OperationStartTimeCache struct {
  195. cache sync.Map // [string]operationTimestamp
  196. }
  197. // NewOperationStartTimeCache creates a operation timestamp cache
  198. func NewOperationStartTimeCache() OperationStartTimeCache {
  199. return OperationStartTimeCache{
  200. cache: sync.Map{}, //[string]operationTimestamp {}
  201. }
  202. }
  203. // AddIfNotExist returns directly if there exists an entry with the key. Otherwise, it
  204. // creates a new operation timestamp using operationName, pluginName, and current timestamp
  205. // and stores the operation timestamp with the key
  206. func (c *OperationStartTimeCache) AddIfNotExist(key, pluginName, operationName string) {
  207. ts := newOperationTimestamp(pluginName, operationName)
  208. c.cache.LoadOrStore(key, ts)
  209. }
  210. // Delete deletes a value for a key.
  211. func (c *OperationStartTimeCache) Delete(key string) {
  212. c.cache.Delete(key)
  213. }
  214. // Has returns a bool value indicates the existence of a key in the cache
  215. func (c *OperationStartTimeCache) Has(key string) bool {
  216. _, exists := c.cache.Load(key)
  217. return exists
  218. }
  219. // RecordMetric records either an error count metric or a latency metric if there
  220. // exists a start timestamp entry in the cache. For a successful operation, i.e.,
  221. // err == nil, the corresponding timestamp entry will be removed from cache
  222. func RecordMetric(key string, c *OperationStartTimeCache, err error) {
  223. obj, exists := c.cache.Load(key)
  224. if !exists {
  225. return
  226. }
  227. ts, ok := obj.(*operationTimestamp)
  228. if !ok {
  229. return
  230. }
  231. if err != nil {
  232. RecordVolumeOperationErrorMetric(ts.pluginName, ts.operation)
  233. } else {
  234. timeTaken := time.Since(ts.startTs).Seconds()
  235. metricutil.RecordOperationLatencyMetric(ts.pluginName, ts.operation, timeTaken)
  236. // end of this operation, remove the timestamp entry from cache
  237. c.Delete(key)
  238. }
  239. }