job.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. /*
  2. Copyright 2017 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package apps
  14. import (
  15. "fmt"
  16. "time"
  17. "k8s.io/api/core/v1"
  18. "k8s.io/apimachinery/pkg/api/errors"
  19. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  20. batchinternal "k8s.io/kubernetes/pkg/apis/batch"
  21. "k8s.io/kubernetes/test/e2e/framework"
  22. jobutil "k8s.io/kubernetes/test/e2e/framework/job"
  23. "github.com/onsi/ginkgo"
  24. "github.com/onsi/gomega"
  25. )
// E2E tests for the batch Job controller: completion counting, container
// vs. pod restarts, activeDeadlineSeconds, deletion with GC, orphan
// adoption/release, and backoffLimit enforcement.
var _ = SIGDescribe("Job", func() {
	f := framework.NewDefaultFramework("job")
	parallelism := int32(2)
	completions := int32(4)
	backoffLimit := int32(6) // default value

	// Simplest case: all pods succeed promptly.
	ginkgo.It("should run a job to completion when tasks succeed", func() {
		ginkgo.By("Creating a job")
		job := jobutil.NewTestJob("succeed", "all-succeed", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job, err := jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring job reaches completions")
		err = jobutil.WaitForJobComplete(f.ClientSet, f.Namespace.Name, job.Name, completions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring pods for job exist")
		pods, err := jobutil.GetJobPods(f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to get pod list for job in namespace: %s", f.Namespace.Name)
		// One pod per completion, and every one of them must have succeeded.
		gomega.Expect(len(pods.Items)).To(gomega.Equal(int(completions)), "failed to ensure sufficient pod for job: got %d, want %d", len(pods.Items), completions)
		for _, pod := range pods.Items {
			gomega.Expect(pod.Status.Phase).To(gomega.Equal(v1.PodSucceeded), "failed to ensure pod status: pod %s status %s", pod.Name, pod.Status.Phase)
		}
	})

	// Pods sometimes fail, but eventually succeed.
	ginkgo.It("should run a job to completion when tasks sometimes fail and are locally restarted", func() {
		ginkgo.By("Creating a job")
		// One failure, then a success, local restarts.
		// We can't use the random failure approach used by the
		// non-local test below, because kubelet will throttle
		// frequently failing containers in a given pod, ramping
		// up to 5 minutes between restarts, making test timeouts
		// due to successive failures too likely with a reasonable
		// test timeout.
		job := jobutil.NewTestJob("failOnce", "fail-once-local", v1.RestartPolicyOnFailure, parallelism, completions, nil, backoffLimit)
		job, err := jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring job reaches completions")
		err = jobutil.WaitForJobComplete(f.ClientSet, f.Namespace.Name, job.Name, completions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
	})

	// Pods sometimes fail, but eventually succeed, after pod restarts.
	ginkgo.It("should run a job to completion when tasks sometimes fail and are not locally restarted", func() {
		ginkgo.By("Creating a job")
		// 50% chance of container success, no local restarts
		// (RestartPolicyNever, so each failure spawns a new pod).
		// Can't use the failOnce approach because that relies
		// on an emptyDir, which is not preserved across new pods.
		// Worst case analysis: 15 failures, each taking 1 minute to
		// run due to some slowness, 1 in 2^15 chance of happening,
		// causing test flake. Should be very rare.
		// With the introduction of backoff limit and high failure rate this
		// is hitting its timeout; the lowered completions count of 3 is a
		// reasonable value that should make this test less flaky, for now.
		job := jobutil.NewTestJob("randomlySucceedOrFail", "rand-non-local", v1.RestartPolicyNever, parallelism, 3, nil, 999)
		job, err := jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring job reaches completions")
		err = jobutil.WaitForJobComplete(f.ClientSet, f.Namespace.Name, job.Name, *job.Spec.Completions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
	})

	// A never-terminating job with activeDeadlineSeconds=1 must be marked
	// Failed with reason DeadlineExceeded.
	ginkgo.It("should exceed active deadline", func() {
		ginkgo.By("Creating a job")
		var activeDeadlineSeconds int64 = 1
		job := jobutil.NewTestJob("notTerminate", "exceed-active-deadline", v1.RestartPolicyNever, parallelism, completions, &activeDeadlineSeconds, backoffLimit)
		job, err := jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
		ginkgo.By("Ensuring job past active deadline")
		// Allow a 10s grace period beyond the deadline for the controller
		// to observe and report the failure.
		err = jobutil.WaitForJobFailure(f.ClientSet, f.Namespace.Name, job.Name, time.Duration(activeDeadlineSeconds+10)*time.Second, "DeadlineExceeded")
		framework.ExpectNoError(err, "failed to ensure job past active deadline in namespace: %s", f.Namespace.Name)
	})

	/*
		Release : v1.15
		Testname: Jobs, active pods, graceful termination
		Description: Create a job. Ensure the active pods reflect parallelism in the namespace and delete the job. Job MUST be deleted successfully.
	*/
	framework.ConformanceIt("should delete a job", func() {
		ginkgo.By("Creating a job")
		job := jobutil.NewTestJob("notTerminate", "foo", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job, err := jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring active pods == parallelism")
		err = jobutil.WaitForAllJobPodsRunning(f.ClientSet, f.Namespace.Name, job.Name, parallelism)
		framework.ExpectNoError(err, "failed to ensure active pods == parallelism in namespace: %s", f.Namespace.Name)

		ginkgo.By("delete a job")
		framework.ExpectNoError(framework.DeleteResourceAndWaitForGC(f.ClientSet, batchinternal.Kind("Job"), f.Namespace.Name, job.Name))

		ginkgo.By("Ensuring job was deleted")
		// The Get must fail with a NotFound error once GC has removed the job.
		_, err = jobutil.GetJob(f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectError(err, "failed to ensure job %s was deleted in namespace: %s", job.Name, f.Namespace.Name)
		gomega.Expect(errors.IsNotFound(err)).To(gomega.BeTrue())
	})

	// Orphaning a pod (clearing OwnerReferences) must cause the job to
	// re-adopt it; removing the pod's labels must cause the job to release it.
	ginkgo.It("should adopt matching orphans and release non-matching pods", func() {
		ginkgo.By("Creating a job")
		job := jobutil.NewTestJob("notTerminate", "adopt-release", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		// Replace job with the one returned from Create() so it has the UID.
		// Save Kind since it won't be populated in the returned job.
		kind := job.Kind
		job, err := jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
		job.Kind = kind

		ginkgo.By("Ensuring active pods == parallelism")
		err = jobutil.WaitForAllJobPodsRunning(f.ClientSet, f.Namespace.Name, job.Name, parallelism)
		framework.ExpectNoError(err, "failed to ensure active pods == parallelism in namespace: %s", f.Namespace.Name)

		ginkgo.By("Orphaning one of the Job's Pods")
		pods, err := jobutil.GetJobPods(f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to get PodList for job %s in namespace: %s", job.Name, f.Namespace.Name)
		gomega.Expect(pods.Items).To(gomega.HaveLen(int(parallelism)))
		pod := pods.Items[0]
		f.PodClient().Update(pod.Name, func(pod *v1.Pod) {
			pod.OwnerReferences = nil
		})

		ginkgo.By("Checking that the Job readopts the Pod")
		gomega.Expect(framework.WaitForPodCondition(f.ClientSet, pod.Namespace, pod.Name, "adopted", jobutil.JobTimeout,
			func(pod *v1.Pod) (bool, error) {
				controllerRef := metav1.GetControllerOf(pod)
				if controllerRef == nil {
					// Not adopted yet; keep polling.
					return false, nil
				}
				// Adopted by the wrong controller is a hard failure.
				if controllerRef.Kind != job.Kind || controllerRef.Name != job.Name || controllerRef.UID != job.UID {
					return false, fmt.Errorf("pod has wrong controllerRef: got %v, want %v", controllerRef, job)
				}
				return true, nil
			},
		)).To(gomega.Succeed(), "wait for pod %q to be readopted", pod.Name)

		ginkgo.By("Removing the labels from the Job's Pod")
		f.PodClient().Update(pod.Name, func(pod *v1.Pod) {
			pod.Labels = nil
		})

		ginkgo.By("Checking that the Job releases the Pod")
		gomega.Expect(framework.WaitForPodCondition(f.ClientSet, pod.Namespace, pod.Name, "released", jobutil.JobTimeout,
			func(pod *v1.Pod) (bool, error) {
				controllerRef := metav1.GetControllerOf(pod)
				if controllerRef != nil {
					// Still owned; keep polling until the ref is removed.
					return false, nil
				}
				return true, nil
			},
		)).To(gomega.Succeed(), "wait for pod %q to be released", pod.Name)
	})

	// An always-failing job with backoffLimit=1 must be marked Failed with
	// reason BackoffLimitExceeded after at least backoff+1 pod failures.
	ginkgo.It("should exceed backoffLimit", func() {
		ginkgo.By("Creating a job")
		backoff := 1
		job := jobutil.NewTestJob("fail", "backofflimit", v1.RestartPolicyNever, 1, 1, nil, int32(backoff))
		job, err := jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
		ginkgo.By("Ensuring job exceed backofflimit")

		err = jobutil.WaitForJobFailure(f.ClientSet, f.Namespace.Name, job.Name, jobutil.JobTimeout, "BackoffLimitExceeded")
		framework.ExpectNoError(err, "failed to ensure job exceed backofflimit in namespace: %s", f.Namespace.Name)

		ginkgo.By(fmt.Sprintf("Checking that %d pod created and status is failed", backoff+1))
		pods, err := jobutil.GetJobPods(f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to get PodList for job %s in namespace: %s", job.Name, f.Namespace.Name)
		// gomega.Expect(pods.Items).To(gomega.HaveLen(backoff + 1))
		// due to NumRequeus not being stable enough, especially with failed status
		// updates we need to allow more than backoff+1
		// TODO revert this back to above when https://github.com/kubernetes/kubernetes/issues/64787 gets fixed
		if len(pods.Items) < backoff+1 {
			framework.Failf("Not enough pod created expected at least %d, got %#v", backoff+1, pods.Items)
		}
		for _, pod := range pods.Items {
			gomega.Expect(pod.Status.Phase).To(gomega.Equal(v1.PodFailed))
		}
	})
})