restart_test.go

// +build linux

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e_node

import (
	"fmt"
	"os/exec"
	"time"

	"k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/kubernetes/test/e2e/framework"
	e2elog "k8s.io/kubernetes/test/e2e/framework/log"
	testutils "k8s.io/kubernetes/test/utils"
	imageutils "k8s.io/kubernetes/test/utils/image"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)
// waitForPods polls the node every 10 seconds, for up to timeout, until at
// least pod_count pods are Running and Ready. It returns the pods that were
// Running and Ready at the last poll, even if the timeout was hit first.
func waitForPods(f *framework.Framework, pod_count int, timeout time.Duration) (runningPods []*v1.Pod) {
	for start := time.Now(); time.Since(start) < timeout; time.Sleep(10 * time.Second) {
		podList, err := f.PodClient().List(metav1.ListOptions{})
		if err != nil {
			e2elog.Logf("Failed to list pods on node: %v", err)
			continue
		}

		runningPods = []*v1.Pod{}
		for _, pod := range podList.Items {
			// Copy the loop variable before taking its address, so every entry
			// appended to runningPods points at a distinct pod rather than the
			// shared iteration variable.
			pod := pod
			if r, err := testutils.PodRunningReady(&pod); err != nil || !r {
				continue
			}
			runningPods = append(runningPods, &pod)
		}
		e2elog.Logf("Running pod count %d", len(runningPods))
		if len(runningPods) >= pod_count {
			break
		}
	}
	return runningPods
}

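// The Restart suite below saturates the node with pods, repeatedly kills the
// container runtime, and then checks that previously Running/Ready pods come
// back and that no containers terminate; terminated containers after the
// restarts would indicate that pod IPs are being leaked.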
var _ = framework.KubeDescribe("Restart [Serial] [Slow] [Disruptive] [NodeFeature:ContainerRuntimeRestart]", func() {
	const (
		// Saturate the node. It is not necessary that all of these pods enter
		// Running/Ready, because we don't know the number of cores in the test
		// node or the default limits applied (if any). It is essential that no
		// containers end up terminated. 100 was chosen because it is the
		// maximum number of pods per node.
		podCount            = 100
		podCreationInterval = 100 * time.Millisecond
		recoverTimeout      = 5 * time.Minute
		startTimeout        = 3 * time.Minute
		// restartCount is chosen so that, even with only minPods running, we
		// exhaust the default allocation of a /24.
		minPods      = 50
		restartCount = 6
	)
	f := framework.NewDefaultFramework("restart-test")
	Context("Container Runtime", func() {
		Context("Network", func() {
			It("should recover from ip leak", func() {
				pods := newTestPods(podCount, false, imageutils.GetPauseImageName(), "restart-container-runtime-test")
				By(fmt.Sprintf("Trying to create %d pods on node", len(pods)))
				createBatchPodWithRateControl(f, pods, podCreationInterval)
				defer deletePodsSync(f, pods)

				// Give the node some time to stabilize; assume that pods which enter
				// Running/Ready within startTimeout fit on the node and that the node
				// is now saturated.
				runningPods := waitForPods(f, podCount, startTimeout)
				if len(runningPods) < minPods {
					framework.Failf("Failed to start %d pods, cannot test that restarting container runtime doesn't leak IPs", minPods)
				}
				for i := 0; i < restartCount; i++ {
					By(fmt.Sprintf("Killing container runtime iteration %d", i))
					// Wait for container runtime to be running
					var pid int
					Eventually(func() error {
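						// getPidsForProcess is a helper defined elsewhere in the e2e_node
						// suite; it resolves candidate pids for the runtime from the
						// configured process name and pid file.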
						runtimePids, err := getPidsForProcess(framework.TestContext.ContainerRuntimeProcessName, framework.TestContext.ContainerRuntimePidFile)
						if err != nil {
							return err
						}
						if len(runtimePids) != 1 {
							return fmt.Errorf("unexpected container runtime pid list: %+v", runtimePids)
						}
						// Make sure the container runtime is actually running; the pid read
						// from the pid file may be stale.
						pid = runtimePids[0]
						if _, err := exec.Command("sudo", "ps", "-p", fmt.Sprintf("%d", pid)).CombinedOutput(); err != nil {
							return err
						}
						return nil
					}, 1*time.Minute, 2*time.Second).Should(BeNil())
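					// kill without an explicit signal sends SIGTERM; sudo is used because
					// the runtime process typically runs as root.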
					if stdout, err := exec.Command("sudo", "kill", fmt.Sprintf("%d", pid)).CombinedOutput(); err != nil {
						framework.Failf("Failed to kill container runtime (pid=%d): %v, stdout: %q", pid, err, string(stdout))
					}
					// Assume that the container runtime will be restarted by systemd/supervisord etc.
					time.Sleep(20 * time.Second)
				}
  106. By("Checking currently Running/Ready pods")
  107. postRestartRunningPods := waitForPods(f, len(runningPods), recoverTimeout)
  108. if len(postRestartRunningPods) == 0 {
  109. framework.Failf("Failed to start *any* pods after container runtime restart, this might indicate an IP leak")
  110. }
  111. By("Confirm no containers have terminated")
  112. for _, pod := range postRestartRunningPods {
  113. if c := testutils.TerminatedContainers(pod); len(c) != 0 {
  114. framework.Failf("Pod %q has failed containers %+v after container runtime restart, this might indicate an IP leak", pod.Name, c)
  115. }
  116. }
  117. By(fmt.Sprintf("Container runtime restart test passed with %d pods", len(postRestartRunningPods)))
  118. })
  119. })
  120. })
  121. })