density_test.go

// +build linux

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2enode

import (
    "context"
    "fmt"
    "math"
    "sort"
    "strconv"
    "sync"
    "time"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/labels"
    "k8s.io/apimachinery/pkg/runtime"
    "k8s.io/apimachinery/pkg/watch"
    "k8s.io/client-go/tools/cache"
    kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
    kubeletstatsv1alpha1 "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
    kubemetrics "k8s.io/kubernetes/pkg/kubelet/metrics"
    "k8s.io/kubernetes/test/e2e/framework"
    e2ekubelet "k8s.io/kubernetes/test/e2e/framework/kubelet"
    e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
    imageutils "k8s.io/kubernetes/test/utils/image"

    "github.com/onsi/ginkgo"
    "github.com/onsi/gomega"
)

const (
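    // kubeletAddr points at the kubelet's local metrics endpoint (the default read-only port);
    // getPodStartLatency scrapes /metrics from it directly rather than through the API server proxy.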
    kubeletAddr = "localhost:10255"
)

var _ = framework.KubeDescribe("Density [Serial] [Slow]", func() {
    const (
        // The data collection of the resource collector and the standalone cadvisor is not
        // synchronized, so the resource collector may miss data or collect duplicated data.
        containerStatsPollingPeriod = 500 * time.Millisecond
    )

    var (
        rc *ResourceCollector
    )

    f := framework.NewDefaultFramework("density-test")

    ginkgo.BeforeEach(func() {
        // Start a standalone cadvisor pod using 'CreateSync'; the pod is running when the call returns.
        f.PodClient().CreateSync(getCadvisorPod())
        // The resource collector monitors fine-grained CPU/memory usage through the standalone
        // cadvisor, which runs with a 1s housekeeping interval.
        rc = NewResourceCollector(containerStatsPollingPeriod)
    })

    ginkgo.Context("create a batch of pods", func() {
        // TODO(coufon): the values are generous; set more precise limits with benchmark data
        // and add more tests.
        dTests := []densityTest{
            {
                podsNr:   10,
                interval: 0 * time.Millisecond,
                cpuLimits: e2ekubelet.ContainersCPUSummary{
                    kubeletstatsv1alpha1.SystemContainerKubelet: {0.50: 0.30, 0.95: 0.50},
                    kubeletstatsv1alpha1.SystemContainerRuntime: {0.50: 0.40, 0.95: 0.60},
                },
                memLimits: e2ekubelet.ResourceUsagePerContainer{
                    kubeletstatsv1alpha1.SystemContainerKubelet: &e2ekubelet.ContainerResourceUsage{MemoryRSSInBytes: 100 * 1024 * 1024},
                    kubeletstatsv1alpha1.SystemContainerRuntime: &e2ekubelet.ContainerResourceUsage{MemoryRSSInBytes: 500 * 1024 * 1024},
                },
                // percentile limits of single pod startup latency
                podStartupLimits: e2emetrics.LatencyMetric{
                    Perc50: 16 * time.Second,
                    Perc90: 18 * time.Second,
                    Perc99: 20 * time.Second,
                },
                // upper bound of the startup latency of a batch of pods
                podBatchStartupLimit: 25 * time.Second,
            },
        }

        for _, testArg := range dTests {
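            // Capture the loop variable so that the closure passed to ginkgo.It below sees
            // this iteration's value rather than the final one.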
            itArg := testArg
            desc := fmt.Sprintf("latency/resource should be within limit when create %d pods with %v interval", itArg.podsNr, itArg.interval)
            ginkgo.It(desc, func() {
                itArg.createMethod = "batch"
                testInfo := getTestNodeInfo(f, itArg.getTestName(), desc)
                batchLag, e2eLags := runDensityBatchTest(f, rc, itArg, testInfo, false)
                ginkgo.By("Verifying latency")
                logAndVerifyLatency(batchLag, e2eLags, itArg.podStartupLimits, itArg.podBatchStartupLimit, testInfo, true)
                ginkgo.By("Verifying resource")
                logAndVerifyResource(f, rc, itArg.cpuLimits, itArg.memLimits, testInfo, true)
            })
        }
    })

    ginkgo.Context("create a batch of pods", func() {
        dTests := []densityTest{
            {
                podsNr:   10,
                interval: 0 * time.Millisecond,
            },
            {
                podsNr:   35,
                interval: 0 * time.Millisecond,
            },
            {
                podsNr:   105,
                interval: 0 * time.Millisecond,
            },
            {
                podsNr:   10,
                interval: 100 * time.Millisecond,
            },
            {
                podsNr:   35,
                interval: 100 * time.Millisecond,
            },
            {
                podsNr:   105,
                interval: 100 * time.Millisecond,
            },
            {
                podsNr:   10,
                interval: 300 * time.Millisecond,
            },
            {
                podsNr:   35,
                interval: 300 * time.Millisecond,
            },
            {
                podsNr:   105,
                interval: 300 * time.Millisecond,
            },
        }
        for _, testArg := range dTests {
            itArg := testArg
            desc := fmt.Sprintf("latency/resource should be within limit when create %d pods with %v interval [Benchmark][NodeSpecialFeature:Benchmark]", itArg.podsNr, itArg.interval)
            ginkgo.It(desc, func() {
                itArg.createMethod = "batch"
                testInfo := getTestNodeInfo(f, itArg.getTestName(), desc)
                batchLag, e2eLags := runDensityBatchTest(f, rc, itArg, testInfo, true)
                ginkgo.By("Verifying latency")
                logAndVerifyLatency(batchLag, e2eLags, itArg.podStartupLimits, itArg.podBatchStartupLimit, testInfo, false)
                ginkgo.By("Verifying resource")
                logAndVerifyResource(f, rc, itArg.cpuLimits, itArg.memLimits, testInfo, false)
            })
        }
    })

    ginkgo.Context("create a batch of pods with higher API QPS", func() {
        dTests := []densityTest{
            {
                podsNr:      105,
                interval:    0 * time.Millisecond,
                APIQPSLimit: 60,
            },
            {
                podsNr:      105,
                interval:    100 * time.Millisecond,
                APIQPSLimit: 60,
            },
            {
                podsNr:      105,
                interval:    300 * time.Millisecond,
                APIQPSLimit: 60,
            },
        }
        for _, testArg := range dTests {
            itArg := testArg
            ginkgo.Context("", func() {
                desc := fmt.Sprintf("latency/resource should be within limit when create %d pods with %v interval (QPS %d) [Benchmark][NodeSpecialFeature:Benchmark]", itArg.podsNr, itArg.interval, itArg.APIQPSLimit)
                // The latency caused by the API QPS limit takes a large portion (up to ~33%) of the e2e latency.
                // It causes the kubelet's pod startup latency (and creation throughput) to be underestimated.
                // Here we raise the API QPS limit from the default 5 to 60 in order to test the real kubelet performance.
                // Note that this will cause higher resource usage.
                tempSetCurrentKubeletConfig(f, func(cfg *kubeletconfig.KubeletConfiguration) {
                    framework.Logf("Old QPS limit is: %d", cfg.KubeAPIQPS)
                    // Set the new API QPS limit.
                    cfg.KubeAPIQPS = int32(itArg.APIQPSLimit)
                })
                ginkgo.It(desc, func() {
                    itArg.createMethod = "batch"
                    testInfo := getTestNodeInfo(f, itArg.getTestName(), desc)
                    batchLag, e2eLags := runDensityBatchTest(f, rc, itArg, testInfo, true)
                    ginkgo.By("Verifying latency")
                    logAndVerifyLatency(batchLag, e2eLags, itArg.podStartupLimits, itArg.podBatchStartupLimit, testInfo, false)
                    ginkgo.By("Verifying resource")
                    logAndVerifyResource(f, rc, itArg.cpuLimits, itArg.memLimits, testInfo, false)
                })
            })
        }
    })

    ginkgo.Context("create a sequence of pods", func() {
        dTests := []densityTest{
            {
                podsNr:   10,
                bgPodsNr: 50,
                cpuLimits: e2ekubelet.ContainersCPUSummary{
                    kubeletstatsv1alpha1.SystemContainerKubelet: {0.50: 0.30, 0.95: 0.50},
                    kubeletstatsv1alpha1.SystemContainerRuntime: {0.50: 0.40, 0.95: 0.60},
                },
                memLimits: e2ekubelet.ResourceUsagePerContainer{
                    kubeletstatsv1alpha1.SystemContainerKubelet: &e2ekubelet.ContainerResourceUsage{MemoryRSSInBytes: 100 * 1024 * 1024},
                    kubeletstatsv1alpha1.SystemContainerRuntime: &e2ekubelet.ContainerResourceUsage{MemoryRSSInBytes: 500 * 1024 * 1024},
                },
                podStartupLimits: e2emetrics.LatencyMetric{
                    Perc50: 5000 * time.Millisecond,
                    Perc90: 9000 * time.Millisecond,
                    Perc99: 10000 * time.Millisecond,
                },
            },
        }
        for _, testArg := range dTests {
            itArg := testArg
            desc := fmt.Sprintf("latency/resource should be within limit when create %d pods with %d background pods", itArg.podsNr, itArg.bgPodsNr)
            ginkgo.It(desc, func() {
                itArg.createMethod = "sequence"
                testInfo := getTestNodeInfo(f, itArg.getTestName(), desc)
                batchlag, e2eLags := runDensitySeqTest(f, rc, itArg, testInfo)
                ginkgo.By("Verifying latency")
                logAndVerifyLatency(batchlag, e2eLags, itArg.podStartupLimits, itArg.podBatchStartupLimit, testInfo, true)
                ginkgo.By("Verifying resource")
                logAndVerifyResource(f, rc, itArg.cpuLimits, itArg.memLimits, testInfo, true)
            })
        }
    })

    ginkgo.Context("create a sequence of pods", func() {
        dTests := []densityTest{
            {
                podsNr:   10,
                bgPodsNr: 50,
            },
            {
                podsNr:   30,
                bgPodsNr: 50,
            },
            {
                podsNr:   50,
                bgPodsNr: 50,
            },
        }
        for _, testArg := range dTests {
            itArg := testArg
            desc := fmt.Sprintf("latency/resource should be within limit when create %d pods with %d background pods [Benchmark][NodeSpecialFeature:Benchmark]", itArg.podsNr, itArg.bgPodsNr)
            ginkgo.It(desc, func() {
                itArg.createMethod = "sequence"
                testInfo := getTestNodeInfo(f, itArg.getTestName(), desc)
                batchlag, e2eLags := runDensitySeqTest(f, rc, itArg, testInfo)
                ginkgo.By("Verifying latency")
                logAndVerifyLatency(batchlag, e2eLags, itArg.podStartupLimits, itArg.podBatchStartupLimit, testInfo, false)
                ginkgo.By("Verifying resource")
                logAndVerifyResource(f, rc, itArg.cpuLimits, itArg.memLimits, testInfo, false)
            })
        }
    })
})

// densityTest describes one density test case.
type densityTest struct {
    // number of test pods
    podsNr int
    // number of background pods
    bgPodsNr int
    // interval between pod creations (rate control)
    interval time.Duration
    // create pods in 'batch' or 'sequence'
    createMethod string
    // API QPS limit
    APIQPSLimit int
    // performance limits
    cpuLimits            e2ekubelet.ContainersCPUSummary
    memLimits            e2ekubelet.ResourceUsagePerContainer
    podStartupLimits     e2emetrics.LatencyMetric
    podBatchStartupLimit time.Duration
}
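
// getTestName builds a short identifier for the test case from its parameters; it is passed
// (via getTestNodeInfo) into the testInfo map used when reporting perf data. For example, a
// batch test with 10 pods, no background pods, a 0ms creation interval and the default API
// QPS of 5 yields "density_create_batch_10_0_0_5".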
func (dt *densityTest) getTestName() string {
    // The current default API QPS limit is 5.
    // TODO(coufon): is there any way to not hard-code this?
    APIQPSLimit := 5
    if dt.APIQPSLimit > 0 {
        APIQPSLimit = dt.APIQPSLimit
    }
    return fmt.Sprintf("density_create_%s_%d_%d_%d_%d", dt.createMethod, dt.podsNr, dt.bgPodsNr,
        dt.interval.Nanoseconds()/1000000, APIQPSLimit)
}

// runDensityBatchTest runs the density batch pod creation test.
func runDensityBatchTest(f *framework.Framework, rc *ResourceCollector, testArg densityTest, testInfo map[string]string,
    isLogTimeSeries bool) (time.Duration, []e2emetrics.PodLatencyData) {
    const (
        podType               = "density_test_pod"
        sleepBeforeCreatePods = 30 * time.Second
    )
    var (
        mutex      = &sync.Mutex{}
        watchTimes = make(map[string]metav1.Time)
        stopCh     = make(chan struct{})
    )
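    // watchTimes is written concurrently by the informer callbacks created in
    // newInformerWatchPod, so every write is guarded by mutex.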

    // Create the test pod data structures.
    pods := newTestPods(testArg.podsNr, true, imageutils.GetPauseImageName(), podType)

    // The controller watches for pod status changes.
    controller := newInformerWatchPod(f, mutex, watchTimes, podType)
    go controller.Run(stopCh)
    defer close(stopCh)

    // TODO(coufon): in the test we found that the kubelet may start while it is still busy with
    // something else; as a result, 'syncLoop' does not respond to pod creation immediately and
    // creating the first pod is delayed by around 5s. The node status is already 'ready', so
    // waiting for and re-checking node readiness does not help here.
    // Wait for a grace period to let 'syncLoop' become ready.
    time.Sleep(sleepBeforeCreatePods)

    rc.Start()

    ginkgo.By("Creating a batch of pods")
    // createBatchPodWithRateControl returns a map from pod name to creation timestamp.
    createTimes := createBatchPodWithRateControl(f, pods, testArg.interval)

    ginkgo.By("Waiting for all Pods to be observed by the watch...")
    gomega.Eventually(func() bool {
        return len(watchTimes) == testArg.podsNr
    }, 10*time.Minute, 10*time.Second).Should(gomega.BeTrue())
    if len(watchTimes) < testArg.podsNr {
        framework.Failf("Timeout reached waiting for all Pods to be observed by the watch.")
    }

    // Analyze results.
    var (
        firstCreate metav1.Time
        lastRunning metav1.Time
        init        = true
        e2eLags     = make([]e2emetrics.PodLatencyData, 0)
    )
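    // For every pod, the e2e startup latency is the time from its creation call to the moment
    // the watch first saw it running; firstCreate/lastRunning track the span of the whole batch.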
    for name, create := range createTimes {
        watch, ok := watchTimes[name]
        framework.ExpectEqual(ok, true)
        e2eLags = append(e2eLags,
            e2emetrics.PodLatencyData{Name: name, Latency: watch.Time.Sub(create.Time)})
        if !init {
            if firstCreate.Time.After(create.Time) {
                firstCreate = create
            }
            if lastRunning.Time.Before(watch.Time) {
                lastRunning = watch
            }
        } else {
            init = false
            firstCreate, lastRunning = create, watch
        }
    }
    sort.Sort(e2emetrics.LatencySlice(e2eLags))
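    // The batch startup lag is the wall-clock time from the earliest creation to the latest
    // observed 'running' transition.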
    batchLag := lastRunning.Time.Sub(firstCreate.Time)

    rc.Stop()
    deletePodsSync(f, pods)

    // Log time series data.
    if isLogTimeSeries {
        logDensityTimeSeries(rc, createTimes, watchTimes, testInfo)
    }
    // Log throughput data.
    logPodCreateThroughput(batchLag, e2eLags, testArg.podsNr, testInfo)

    deletePodsSync(f, []*v1.Pod{getCadvisorPod()})

    return batchLag, e2eLags
}

// runDensitySeqTest runs the density sequential pod creation test.
func runDensitySeqTest(f *framework.Framework, rc *ResourceCollector, testArg densityTest, testInfo map[string]string) (time.Duration, []e2emetrics.PodLatencyData) {
    const (
        podType               = "density_test_pod"
        sleepBeforeCreatePods = 30 * time.Second
    )
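    // Background pods are created first to put a steady load on the node; only the creation
    // latency of the test pods is measured.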
    bgPods := newTestPods(testArg.bgPodsNr, true, imageutils.GetPauseImageName(), "background_pod")
    testPods := newTestPods(testArg.podsNr, true, imageutils.GetPauseImageName(), podType)

    ginkgo.By("Creating a batch of background pods")
    // CreateBatch is synchronized; all pods are running when it returns.
    f.PodClient().CreateBatch(bgPods)

    time.Sleep(sleepBeforeCreatePods)

    rc.Start()
    // Create pods sequentially (back-to-back). e2eLags is returned already sorted.
    batchlag, e2eLags := createBatchPodSequential(f, testPods)
    rc.Stop()

    deletePodsSync(f, append(bgPods, testPods...))

    // Log throughput data.
    logPodCreateThroughput(batchlag, e2eLags, testArg.podsNr, testInfo)

    deletePodsSync(f, []*v1.Pod{getCadvisorPod()})

    return batchlag, e2eLags
}

// createBatchPodWithRateControl creates a batch of pods concurrently, using one goroutine for
// each creation; between creations it sleeps for an interval to control throughput.
func createBatchPodWithRateControl(f *framework.Framework, pods []*v1.Pod, interval time.Duration) map[string]metav1.Time {
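    // The creation timestamp of each pod is recorded just before its Create call is issued;
    // the informer built by newInformerWatchPod records the matching 'running' timestamp, and
    // the difference between the two is the pod's e2e startup latency.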
    createTimes := make(map[string]metav1.Time)
    for _, pod := range pods {
        createTimes[pod.ObjectMeta.Name] = metav1.Now()
        go f.PodClient().Create(pod)
        time.Sleep(interval)
    }
    return createTimes
}

// getPodStartLatency gets the Prometheus 'pod start latency' metric from the kubelet.
func getPodStartLatency(node string) (e2emetrics.KubeletLatencyMetrics, error) {
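    // The metric name is assembled from the kubelet metrics subsystem and the pod start
    // duration key; each sample of the summary carries a 'quantile' label, and its value is
    // interpreted as microseconds.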
    latencyMetrics := e2emetrics.KubeletLatencyMetrics{}
    ms, err := e2emetrics.GrabKubeletMetricsWithoutProxy(node, "/metrics")
    framework.ExpectNoError(err, "Failed to get kubelet metrics without proxy in node %s", node)
    for _, samples := range ms {
        for _, sample := range samples {
            if sample.Metric["__name__"] == kubemetrics.KubeletSubsystem+"_"+kubemetrics.PodStartDurationKey {
                quantile, _ := strconv.ParseFloat(string(sample.Metric["quantile"]), 64)
                latencyMetrics = append(latencyMetrics,
                    e2emetrics.KubeletLatencyMetric{
                        Quantile: quantile,
                        Method:   kubemetrics.PodStartDurationKey,
                        Latency:  time.Duration(int(sample.Value)) * time.Microsecond})
            }
        }
    }
    return latencyMetrics, nil
}

// newInformerWatchPod creates an informer to check whether all pods are running.
func newInformerWatchPod(f *framework.Framework, mutex *sync.Mutex, watchTimes map[string]metav1.Time, podType string) cache.Controller {
    ns := f.Namespace.Name
    checkPodRunning := func(p *v1.Pod) {
        mutex.Lock()
        defer mutex.Unlock()
        defer ginkgo.GinkgoRecover()
        if p.Status.Phase == v1.PodRunning {
            if _, found := watchTimes[p.Name]; !found {
                watchTimes[p.Name] = metav1.Now()
            }
        }
    }
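
    // The informer lists and watches only pods labeled with the given test pod type; both the
    // add and update handlers run checkPodRunning in a goroutine so that the informer loop is
    // not blocked while the shared watchTimes map is updated under the mutex.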
    _, controller := cache.NewInformer(
        &cache.ListWatch{
            ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
                options.LabelSelector = labels.SelectorFromSet(labels.Set{"type": podType}).String()
                obj, err := f.ClientSet.CoreV1().Pods(ns).List(context.TODO(), options)
                return runtime.Object(obj), err
            },
            WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
                options.LabelSelector = labels.SelectorFromSet(labels.Set{"type": podType}).String()
                return f.ClientSet.CoreV1().Pods(ns).Watch(context.TODO(), options)
            },
        },
        &v1.Pod{},
        0,
        cache.ResourceEventHandlerFuncs{
            AddFunc: func(obj interface{}) {
                p, ok := obj.(*v1.Pod)
                framework.ExpectEqual(ok, true)
                go checkPodRunning(p)
            },
            UpdateFunc: func(oldObj, newObj interface{}) {
                p, ok := newObj.(*v1.Pod)
                framework.ExpectEqual(ok, true)
                go checkPodRunning(p)
            },
        },
    )
    return controller
}

// createBatchPodSequential creates pods back-to-back in sequence.
func createBatchPodSequential(f *framework.Framework, pods []*v1.Pod) (time.Duration, []e2emetrics.PodLatencyData) {
    batchStartTime := metav1.Now()
    e2eLags := make([]e2emetrics.PodLatencyData, 0)
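    // Each pod is created with CreateSync, so its recorded latency includes the wait until the
    // pod is reported running.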
    for _, pod := range pods {
        create := metav1.Now()
        f.PodClient().CreateSync(pod)
        e2eLags = append(e2eLags,
            e2emetrics.PodLatencyData{Name: pod.Name, Latency: metav1.Now().Time.Sub(create.Time)})
    }
    batchLag := metav1.Now().Time.Sub(batchStartTime.Time)
    sort.Sort(e2emetrics.LatencySlice(e2eLags))
    return batchLag, e2eLags
}

// verifyLatencyWithinThreshold verifies whether the 50th, 90th and 99th percentiles of a
// latency metric are within the expected threshold.
func verifyLatencyWithinThreshold(threshold, actual e2emetrics.LatencyMetric, metricName string) error {
    if actual.Perc50 > threshold.Perc50 {
        return fmt.Errorf("too high %v latency 50th percentile: %v", metricName, actual.Perc50)
    }
    if actual.Perc90 > threshold.Perc90 {
        return fmt.Errorf("too high %v latency 90th percentile: %v", metricName, actual.Perc90)
    }
    if actual.Perc99 > threshold.Perc99 {
        return fmt.Errorf("too high %v latency 99th percentile: %v", metricName, actual.Perc99)
    }
    return nil
}

// extractLatencyMetrics returns the latency metrics for each percentile (50th, 90th, 99th and 100th).
func extractLatencyMetrics(latencies []e2emetrics.PodLatencyData) e2emetrics.LatencyMetric {
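    // The input is expected to be sorted in increasing order of latency (the callers sort it
    // before passing it in). For example, with 10 entries the 90th percentile index is
    // ceil(10*90/100)-1 == 8, i.e. the 9th smallest latency.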
    length := len(latencies)
    perc50 := latencies[int(math.Ceil(float64(length*50)/100))-1].Latency
    perc90 := latencies[int(math.Ceil(float64(length*90)/100))-1].Latency
    perc99 := latencies[int(math.Ceil(float64(length*99)/100))-1].Latency
    perc100 := latencies[length-1].Latency
    return e2emetrics.LatencyMetric{Perc50: perc50, Perc90: perc90, Perc99: perc99, Perc100: perc100}
}

// printLatencies logs the latencies in a readable format.
func printLatencies(latencies []e2emetrics.PodLatencyData, header string) {
    metrics := extractLatencyMetrics(latencies)
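    // Log the slowest 10% of the latencies (the input slice is sorted in increasing order).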
    framework.Logf("10%% %s: %v", header, latencies[(len(latencies)*9)/10:])
    framework.Logf("perc50: %v, perc90: %v, perc99: %v", metrics.Perc50, metrics.Perc90, metrics.Perc99)
}

// logAndVerifyLatency logs the pod creation latency and, if isVerify is set, verifies that it
// satisfies the given limits.
func logAndVerifyLatency(batchLag time.Duration, e2eLags []e2emetrics.PodLatencyData, podStartupLimits e2emetrics.LatencyMetric,
    podBatchStartupLimit time.Duration, testInfo map[string]string, isVerify bool) {
    printLatencies(e2eLags, "worst client e2e total latencies")

    // TODO(coufon): do not trust the 'kubelet' metrics since they are not reset!
    latencyMetrics, _ := getPodStartLatency(kubeletAddr)
    framework.Logf("Kubelet Prometheus metrics (not reset):\n%s", framework.PrettyPrintJSON(latencyMetrics))

    podStartupLatency := extractLatencyMetrics(e2eLags)

    // Log the latency perf data.
    logPerfData(getLatencyPerfData(podStartupLatency, testInfo), "latency")

    if isVerify {
        // Check whether the e2e pod startup time is acceptable.
        framework.ExpectNoError(verifyLatencyWithinThreshold(podStartupLimits, podStartupLatency, "pod startup"))
        // Check the batch pod creation latency.
        if podBatchStartupLimit > 0 {
            framework.ExpectEqual(batchLag <= podBatchStartupLimit, true, "Batch creation startup time %v exceeds limit %v",
                batchLag, podBatchStartupLimit)
        }
    }
}

// logPodCreateThroughput calculates and logs pod creation throughput.
func logPodCreateThroughput(batchLag time.Duration, e2eLags []e2emetrics.PodLatencyData, podsNr int, testInfo map[string]string) {
    logPerfData(getThroughputPerfData(batchLag, e2eLags, podsNr, testInfo), "throughput")
}