taints.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470
  1. /*
  2. Copyright 2015 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package scheduling
  14. import (
  15. "context"
  16. "time"
  17. "k8s.io/api/core/v1"
  18. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  19. "k8s.io/apimachinery/pkg/labels"
  20. "k8s.io/apimachinery/pkg/runtime"
  21. "k8s.io/apimachinery/pkg/watch"
  22. clientset "k8s.io/client-go/kubernetes"
  23. "k8s.io/client-go/tools/cache"
  24. "k8s.io/kubernetes/test/e2e/framework"
  25. e2enode "k8s.io/kubernetes/test/e2e/framework/node"
  26. e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
  27. testutils "k8s.io/kubernetes/test/utils"
  28. imageutils "k8s.io/kubernetes/test/utils/image"
  29. "github.com/onsi/ginkgo"
  30. // ensure libs have a chance to initialize
  31. _ "github.com/stretchr/testify/assert"
  32. )
var (
	// pauseImage is the pause container image used by every test pod below;
	// it idles forever, making it a cheap placeholder for eviction tests.
	pauseImage = imageutils.GetE2EImage(imageutils.Pause)
)
  36. func getTestTaint() v1.Taint {
  37. now := metav1.Now()
  38. return v1.Taint{
  39. Key: "kubernetes.io/e2e-evict-taint-key",
  40. Value: "evictTaintVal",
  41. Effect: v1.TaintEffectNoExecute,
  42. TimeAdded: &now,
  43. }
  44. }
  45. // Create a default pod for this test, with argument saying if the Pod should have
  46. // toleration for Taits used in this test.
  47. func createPodForTaintsTest(hasToleration bool, tolerationSeconds int, podName, podLabel, ns string) *v1.Pod {
  48. grace := int64(1)
  49. if !hasToleration {
  50. return &v1.Pod{
  51. ObjectMeta: metav1.ObjectMeta{
  52. Name: podName,
  53. Namespace: ns,
  54. Labels: map[string]string{"group": podLabel},
  55. DeletionGracePeriodSeconds: &grace,
  56. },
  57. Spec: v1.PodSpec{
  58. Containers: []v1.Container{
  59. {
  60. Name: "pause",
  61. Image: pauseImage,
  62. },
  63. },
  64. },
  65. }
  66. }
  67. if tolerationSeconds <= 0 {
  68. return &v1.Pod{
  69. ObjectMeta: metav1.ObjectMeta{
  70. Name: podName,
  71. Namespace: ns,
  72. Labels: map[string]string{"group": podLabel},
  73. DeletionGracePeriodSeconds: &grace,
  74. // default - tolerate forever
  75. },
  76. Spec: v1.PodSpec{
  77. Containers: []v1.Container{
  78. {
  79. Name: "pause",
  80. Image: pauseImage,
  81. },
  82. },
  83. Tolerations: []v1.Toleration{{Key: "kubernetes.io/e2e-evict-taint-key", Value: "evictTaintVal", Effect: v1.TaintEffectNoExecute}},
  84. },
  85. }
  86. }
  87. ts := int64(tolerationSeconds)
  88. return &v1.Pod{
  89. ObjectMeta: metav1.ObjectMeta{
  90. Name: podName,
  91. Namespace: ns,
  92. Labels: map[string]string{"group": podLabel},
  93. DeletionGracePeriodSeconds: &grace,
  94. },
  95. Spec: v1.PodSpec{
  96. Containers: []v1.Container{
  97. {
  98. Name: "pause",
  99. Image: pauseImage,
  100. },
  101. },
  102. // default - tolerate forever
  103. Tolerations: []v1.Toleration{{Key: "kubernetes.io/e2e-evict-taint-key", Value: "evictTaintVal", Effect: v1.TaintEffectNoExecute, TolerationSeconds: &ts}},
  104. },
  105. }
  106. }
  107. // Creates and starts a controller (informer) that watches updates on a pod in given namespace with given name. It puts a new
  108. // struct into observedDeletion channel for every deletion it sees.
  109. func createTestController(cs clientset.Interface, observedDeletions chan string, stopCh chan struct{}, podLabel, ns string) {
  110. _, controller := cache.NewInformer(
  111. &cache.ListWatch{
  112. ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
  113. options.LabelSelector = labels.SelectorFromSet(labels.Set{"group": podLabel}).String()
  114. obj, err := cs.CoreV1().Pods(ns).List(context.TODO(), options)
  115. return runtime.Object(obj), err
  116. },
  117. WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
  118. options.LabelSelector = labels.SelectorFromSet(labels.Set{"group": podLabel}).String()
  119. return cs.CoreV1().Pods(ns).Watch(context.TODO(), options)
  120. },
  121. },
  122. &v1.Pod{},
  123. 0,
  124. cache.ResourceEventHandlerFuncs{
  125. DeleteFunc: func(oldObj interface{}) {
  126. if delPod, ok := oldObj.(*v1.Pod); ok {
  127. observedDeletions <- delPod.Name
  128. } else {
  129. observedDeletions <- ""
  130. }
  131. },
  132. },
  133. )
  134. framework.Logf("Starting informer...")
  135. go controller.Run(stopCh)
  136. }
const (
	// kubeletPodDeletionDelaySeconds is the slack allowed for the kubelet to
	// actually delete a pod after the taint manager marks it for eviction.
	kubeletPodDeletionDelaySeconds = 60
	// additionalWaitPerDeleteSeconds is extra wait added per expected deletion.
	additionalWaitPerDeleteSeconds = 5
)
  141. // Tests the behavior of NoExecuteTaintManager. Following scenarios are included:
  142. // - eviction of non-tolerating pods from a tainted node,
  143. // - lack of eviction of tolerating pods from a tainted node,
  144. // - delayed eviction of short-tolerating pod from a tainted node,
  145. // - lack of eviction of short-tolerating pod after taint removal.
  146. var _ = SIGDescribe("NoExecuteTaintManager Single Pod [Serial]", func() {
  147. var cs clientset.Interface
  148. var ns string
  149. f := framework.NewDefaultFramework("taint-single-pod")
  150. ginkgo.BeforeEach(func() {
  151. cs = f.ClientSet
  152. ns = f.Namespace.Name
  153. e2enode.WaitForTotalHealthy(cs, time.Minute)
  154. err := framework.CheckTestingNSDeletedExcept(cs, ns)
  155. framework.ExpectNoError(err)
  156. })
  157. // 1. Run a pod
  158. // 2. Taint the node running this pod with a no-execute taint
  159. // 3. See if pod will get evicted
  160. ginkgo.It("evicts pods from tainted nodes", func() {
  161. podName := "taint-eviction-1"
  162. pod := createPodForTaintsTest(false, 0, podName, podName, ns)
  163. observedDeletions := make(chan string, 100)
  164. stopCh := make(chan struct{})
  165. createTestController(cs, observedDeletions, stopCh, podName, ns)
  166. ginkgo.By("Starting pod...")
  167. nodeName, err := testutils.RunPodAndGetNodeName(cs, pod, 2*time.Minute)
  168. framework.ExpectNoError(err)
  169. framework.Logf("Pod is running on %v. Tainting Node", nodeName)
  170. ginkgo.By("Trying to apply a taint on the Node")
  171. testTaint := getTestTaint()
  172. framework.AddOrUpdateTaintOnNode(cs, nodeName, testTaint)
  173. framework.ExpectNodeHasTaint(cs, nodeName, &testTaint)
  174. defer framework.RemoveTaintOffNode(cs, nodeName, testTaint)
  175. // Wait a bit
  176. ginkgo.By("Waiting for Pod to be deleted")
  177. timeoutChannel := time.NewTimer(time.Duration(kubeletPodDeletionDelaySeconds+additionalWaitPerDeleteSeconds) * time.Second).C
  178. select {
  179. case <-timeoutChannel:
  180. framework.Failf("Failed to evict Pod")
  181. case <-observedDeletions:
  182. framework.Logf("Noticed Pod eviction. Test successful")
  183. }
  184. })
  185. // 1. Run a pod with toleration
  186. // 2. Taint the node running this pod with a no-execute taint
  187. // 3. See if pod won't get evicted
  188. ginkgo.It("doesn't evict pod with tolerations from tainted nodes", func() {
  189. podName := "taint-eviction-2"
  190. pod := createPodForTaintsTest(true, 0, podName, podName, ns)
  191. observedDeletions := make(chan string, 100)
  192. stopCh := make(chan struct{})
  193. createTestController(cs, observedDeletions, stopCh, podName, ns)
  194. ginkgo.By("Starting pod...")
  195. nodeName, err := testutils.RunPodAndGetNodeName(cs, pod, 2*time.Minute)
  196. framework.ExpectNoError(err)
  197. framework.Logf("Pod is running on %v. Tainting Node", nodeName)
  198. ginkgo.By("Trying to apply a taint on the Node")
  199. testTaint := getTestTaint()
  200. framework.AddOrUpdateTaintOnNode(cs, nodeName, testTaint)
  201. framework.ExpectNodeHasTaint(cs, nodeName, &testTaint)
  202. defer framework.RemoveTaintOffNode(cs, nodeName, testTaint)
  203. // Wait a bit
  204. ginkgo.By("Waiting for Pod to be deleted")
  205. timeoutChannel := time.NewTimer(time.Duration(kubeletPodDeletionDelaySeconds+additionalWaitPerDeleteSeconds) * time.Second).C
  206. select {
  207. case <-timeoutChannel:
  208. framework.Logf("Pod wasn't evicted. Test successful")
  209. case <-observedDeletions:
  210. framework.Failf("Pod was evicted despite toleration")
  211. }
  212. })
  213. // 1. Run a pod with a finite toleration
  214. // 2. Taint the node running this pod with a no-execute taint
  215. // 3. See if pod won't get evicted before toleration time runs out
  216. // 4. See if pod will get evicted after toleration time runs out
  217. ginkgo.It("eventually evict pod with finite tolerations from tainted nodes", func() {
  218. podName := "taint-eviction-3"
  219. pod := createPodForTaintsTest(true, kubeletPodDeletionDelaySeconds+2*additionalWaitPerDeleteSeconds, podName, podName, ns)
  220. observedDeletions := make(chan string, 100)
  221. stopCh := make(chan struct{})
  222. createTestController(cs, observedDeletions, stopCh, podName, ns)
  223. ginkgo.By("Starting pod...")
  224. nodeName, err := testutils.RunPodAndGetNodeName(cs, pod, 2*time.Minute)
  225. framework.ExpectNoError(err)
  226. framework.Logf("Pod is running on %v. Tainting Node", nodeName)
  227. ginkgo.By("Trying to apply a taint on the Node")
  228. testTaint := getTestTaint()
  229. framework.AddOrUpdateTaintOnNode(cs, nodeName, testTaint)
  230. framework.ExpectNodeHasTaint(cs, nodeName, &testTaint)
  231. defer framework.RemoveTaintOffNode(cs, nodeName, testTaint)
  232. // Wait a bit
  233. ginkgo.By("Waiting to see if a Pod won't be deleted")
  234. timeoutChannel := time.NewTimer(time.Duration(kubeletPodDeletionDelaySeconds+additionalWaitPerDeleteSeconds) * time.Second).C
  235. select {
  236. case <-timeoutChannel:
  237. framework.Logf("Pod wasn't evicted")
  238. case <-observedDeletions:
  239. framework.Failf("Pod was evicted despite toleration")
  240. return
  241. }
  242. ginkgo.By("Waiting for Pod to be deleted")
  243. timeoutChannel = time.NewTimer(time.Duration(kubeletPodDeletionDelaySeconds+additionalWaitPerDeleteSeconds) * time.Second).C
  244. select {
  245. case <-timeoutChannel:
  246. framework.Failf("Pod wasn't evicted")
  247. case <-observedDeletions:
  248. framework.Logf("Pod was evicted after toleration time run out. Test successful")
  249. return
  250. }
  251. })
  252. /*
  253. Release : v1.16
  254. Testname: Taint, Pod Eviction on taint removal
  255. Description: The Pod with toleration timeout scheduled on a tainted Node MUST not be
  256. evicted if the taint is removed before toleration time ends.
  257. */
  258. framework.ConformanceIt("removing taint cancels eviction [Disruptive]", func() {
  259. podName := "taint-eviction-4"
  260. pod := createPodForTaintsTest(true, 2*additionalWaitPerDeleteSeconds, podName, podName, ns)
  261. observedDeletions := make(chan string, 100)
  262. stopCh := make(chan struct{})
  263. createTestController(cs, observedDeletions, stopCh, podName, ns)
  264. // 1. Run a pod with short toleration
  265. ginkgo.By("Starting pod...")
  266. nodeName, err := testutils.RunPodAndGetNodeName(cs, pod, 2*time.Minute)
  267. framework.ExpectNoError(err)
  268. framework.Logf("Pod is running on %v. Tainting Node", nodeName)
  269. // 2. Taint the node running this pod with a no-execute taint
  270. ginkgo.By("Trying to apply a taint on the Node")
  271. testTaint := getTestTaint()
  272. framework.AddOrUpdateTaintOnNode(cs, nodeName, testTaint)
  273. framework.ExpectNodeHasTaint(cs, nodeName, &testTaint)
  274. taintRemoved := false
  275. defer func() {
  276. if !taintRemoved {
  277. framework.RemoveTaintOffNode(cs, nodeName, testTaint)
  278. }
  279. }()
  280. // 3. Wait some time
  281. ginkgo.By("Waiting short time to make sure Pod is queued for deletion")
  282. timeoutChannel := time.NewTimer(additionalWaitPerDeleteSeconds).C
  283. select {
  284. case <-timeoutChannel:
  285. framework.Logf("Pod wasn't evicted. Proceeding")
  286. case <-observedDeletions:
  287. framework.Failf("Pod was evicted despite toleration")
  288. return
  289. }
  290. // 4. Remove the taint
  291. framework.Logf("Removing taint from Node")
  292. framework.RemoveTaintOffNode(cs, nodeName, testTaint)
  293. taintRemoved = true
  294. // 5. See if Pod won't be evicted.
  295. ginkgo.By("Waiting some time to make sure that toleration time passed.")
  296. timeoutChannel = time.NewTimer(time.Duration(kubeletPodDeletionDelaySeconds+3*additionalWaitPerDeleteSeconds) * time.Second).C
  297. select {
  298. case <-timeoutChannel:
  299. framework.Logf("Pod wasn't evicted. Test successful")
  300. case <-observedDeletions:
  301. framework.Failf("Pod was evicted despite toleration")
  302. }
  303. })
  304. })
  305. var _ = SIGDescribe("NoExecuteTaintManager Multiple Pods [Serial]", func() {
  306. var cs clientset.Interface
  307. var ns string
  308. f := framework.NewDefaultFramework("taint-multiple-pods")
  309. ginkgo.BeforeEach(func() {
  310. cs = f.ClientSet
  311. ns = f.Namespace.Name
  312. e2enode.WaitForTotalHealthy(cs, time.Minute)
  313. err := framework.CheckTestingNSDeletedExcept(cs, ns)
  314. framework.ExpectNoError(err)
  315. })
  316. // 1. Run two pods; one with toleration, one without toleration
  317. // 2. Taint the nodes running those pods with a no-execute taint
  318. // 3. See if pod-without-toleration get evicted, and pod-with-toleration is kept
  319. ginkgo.It("only evicts pods without tolerations from tainted nodes", func() {
  320. podGroup := "taint-eviction-a"
  321. observedDeletions := make(chan string, 100)
  322. stopCh := make(chan struct{})
  323. createTestController(cs, observedDeletions, stopCh, podGroup, ns)
  324. pod1 := createPodForTaintsTest(false, 0, podGroup+"1", podGroup, ns)
  325. pod2 := createPodForTaintsTest(true, 0, podGroup+"2", podGroup, ns)
  326. ginkgo.By("Starting pods...")
  327. nodeName1, err := testutils.RunPodAndGetNodeName(cs, pod1, 2*time.Minute)
  328. framework.ExpectNoError(err)
  329. framework.Logf("Pod1 is running on %v. Tainting Node", nodeName1)
  330. nodeName2, err := testutils.RunPodAndGetNodeName(cs, pod2, 2*time.Minute)
  331. framework.ExpectNoError(err)
  332. framework.Logf("Pod2 is running on %v. Tainting Node", nodeName2)
  333. ginkgo.By("Trying to apply a taint on the Nodes")
  334. testTaint := getTestTaint()
  335. framework.AddOrUpdateTaintOnNode(cs, nodeName1, testTaint)
  336. framework.ExpectNodeHasTaint(cs, nodeName1, &testTaint)
  337. defer framework.RemoveTaintOffNode(cs, nodeName1, testTaint)
  338. if nodeName2 != nodeName1 {
  339. framework.AddOrUpdateTaintOnNode(cs, nodeName2, testTaint)
  340. framework.ExpectNodeHasTaint(cs, nodeName2, &testTaint)
  341. defer framework.RemoveTaintOffNode(cs, nodeName2, testTaint)
  342. }
  343. // Wait a bit
  344. ginkgo.By("Waiting for Pod1 to be deleted")
  345. timeoutChannel := time.NewTimer(time.Duration(kubeletPodDeletionDelaySeconds+additionalWaitPerDeleteSeconds) * time.Second).C
  346. var evicted int
  347. for {
  348. select {
  349. case <-timeoutChannel:
  350. if evicted == 0 {
  351. framework.Failf("Failed to evict Pod1.")
  352. } else if evicted == 2 {
  353. framework.Failf("Pod1 is evicted. But unexpected Pod2 also get evicted.")
  354. }
  355. return
  356. case podName := <-observedDeletions:
  357. evicted++
  358. if podName == podGroup+"1" {
  359. framework.Logf("Noticed Pod %q gets evicted.", podName)
  360. } else if podName == podGroup+"2" {
  361. framework.Failf("Unexepected Pod %q gets evicted.", podName)
  362. return
  363. }
  364. }
  365. }
  366. })
  367. /*
  368. Release : v1.16
  369. Testname: Pod Eviction, Toleration limits
  370. Description: In a multi-pods scenario with tolerationSeconds, the pods MUST be evicted as per
  371. the toleration time limit.
  372. */
  373. framework.ConformanceIt("evicts pods with minTolerationSeconds [Disruptive]", func() {
  374. podGroup := "taint-eviction-b"
  375. observedDeletions := make(chan string, 100)
  376. stopCh := make(chan struct{})
  377. createTestController(cs, observedDeletions, stopCh, podGroup, ns)
  378. // 1. Run two pods both with toleration; one with tolerationSeconds=5, the other with 25
  379. pod1 := createPodForTaintsTest(true, additionalWaitPerDeleteSeconds, podGroup+"1", podGroup, ns)
  380. pod2 := createPodForTaintsTest(true, 5*additionalWaitPerDeleteSeconds, podGroup+"2", podGroup, ns)
  381. ginkgo.By("Starting pods...")
  382. nodeName, err := testutils.RunPodAndGetNodeName(cs, pod1, 2*time.Minute)
  383. framework.ExpectNoError(err)
  384. node, err := cs.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
  385. framework.ExpectNoError(err)
  386. nodeHostNameLabel, ok := node.GetObjectMeta().GetLabels()["kubernetes.io/hostname"]
  387. if !ok {
  388. framework.Failf("error getting kubernetes.io/hostname label on node %s", nodeName)
  389. }
  390. framework.ExpectNoError(err)
  391. framework.Logf("Pod1 is running on %v. Tainting Node", nodeName)
  392. // ensure pod2 lands on the same node as pod1
  393. pod2.Spec.NodeSelector = map[string]string{"kubernetes.io/hostname": nodeHostNameLabel}
  394. _, err = testutils.RunPodAndGetNodeName(cs, pod2, 2*time.Minute)
  395. framework.ExpectNoError(err)
  396. // Wait for pods to be running state before eviction happens
  397. framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(cs, pod1))
  398. framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(cs, pod2))
  399. framework.Logf("Pod2 is running on %v. Tainting Node", nodeName)
  400. // 2. Taint the nodes running those pods with a no-execute taint
  401. ginkgo.By("Trying to apply a taint on the Node")
  402. testTaint := getTestTaint()
  403. framework.AddOrUpdateTaintOnNode(cs, nodeName, testTaint)
  404. framework.ExpectNodeHasTaint(cs, nodeName, &testTaint)
  405. defer framework.RemoveTaintOffNode(cs, nodeName, testTaint)
  406. // 3. Wait to see if both pods get evicted in between [5, 25] seconds
  407. ginkgo.By("Waiting for Pod1 and Pod2 to be deleted")
  408. timeoutChannel := time.NewTimer(time.Duration(kubeletPodDeletionDelaySeconds+3*additionalWaitPerDeleteSeconds) * time.Second).C
  409. var evicted int
  410. for evicted != 2 {
  411. select {
  412. case <-timeoutChannel:
  413. framework.Failf("Failed to evict all Pods. %d pod(s) is not evicted.", 2-evicted)
  414. return
  415. case podName := <-observedDeletions:
  416. framework.Logf("Noticed Pod %q gets evicted.", podName)
  417. evicted++
  418. }
  419. }
  420. })
  421. })