/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// This file tests preemption functionality of the scheduler.
package scheduler

import (
	"fmt"
	"testing"
	"time"

	"k8s.io/api/core/v1"
	policy "k8s.io/api/policy/v1beta1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/apimachinery/pkg/util/wait"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	clientset "k8s.io/client-go/kubernetes"
	featuregatetesting "k8s.io/component-base/featuregate/testing"
	"k8s.io/klog"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/features"
	_ "k8s.io/kubernetes/pkg/scheduler/algorithmprovider"
	testutils "k8s.io/kubernetes/test/utils"
)

var lowPriority, mediumPriority, highPriority = int32(100), int32(200), int32(300)

func waitForNominatedNodeNameWithTimeout(cs clientset.Interface, pod *v1.Pod, timeout time.Duration) error {
	if err := wait.Poll(100*time.Millisecond, timeout, func() (bool, error) {
		pod, err := cs.CoreV1().Pods(pod.Namespace).Get(pod.Name, metav1.GetOptions{})
		if err != nil {
			return false, err
		}
		if len(pod.Status.NominatedNodeName) > 0 {
			return true, nil
		}
		return false, err
	}); err != nil {
		return fmt.Errorf(".status.nominatedNodeName of Pod %v/%v did not get set: %v", pod.Namespace, pod.Name, err)
	}
	return nil
}
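
// waitForNominatedNodeName waits for .status.nominatedNodeName of the given pod
// to be set, using the default test timeout.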
func waitForNominatedNodeName(cs clientset.Interface, pod *v1.Pod) error {
	return waitForNominatedNodeNameWithTimeout(cs, pod, wait.ForeverTestTimeout)
}

// TestPreemption tests a few preemption scenarios.
func TestPreemption(t *testing.T) {
	// Enable PodPriority feature gate.
	defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.PodPriority, true)()
	// Initialize scheduler.
	context := initTest(t, "preemption")
	defer cleanupTest(t, context)
	cs := context.clientSet
	defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{
		v1.ResourceCPU:    *resource.NewMilliQuantity(100, resource.DecimalSI),
		v1.ResourceMemory: *resource.NewQuantity(100, resource.DecimalSI)},
	}
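	// Each case runs existingPods on a single small node, then creates the
	// preemptor "pod" and expects exactly the pods at preemptedPodIndexes to be evicted.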
	tests := []struct {
		description         string
		existingPods        []*v1.Pod
		pod                 *v1.Pod
		preemptedPodIndexes map[int]struct{}
	}{
		{
			description: "basic pod preemption",
			existingPods: []*v1.Pod{
				initPausePod(context.clientSet, &pausePodConfig{
					Name:      "victim-pod",
					Namespace: context.ns.Name,
					Priority:  &lowPriority,
					Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
						v1.ResourceCPU:    *resource.NewMilliQuantity(400, resource.DecimalSI),
						v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
					},
				}),
			},
			pod: initPausePod(cs, &pausePodConfig{
				Name:      "preemptor-pod",
				Namespace: context.ns.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
				},
			}),
			preemptedPodIndexes: map[int]struct{}{0: {}},
		},
		{
			description: "preemption is performed to satisfy anti-affinity",
			existingPods: []*v1.Pod{
				initPausePod(cs, &pausePodConfig{
					Name: "pod-0", Namespace: context.ns.Name,
					Priority:  &mediumPriority,
					Labels:    map[string]string{"pod": "p0"},
					Resources: defaultPodRes,
				}),
				initPausePod(cs, &pausePodConfig{
					Name: "pod-1", Namespace: context.ns.Name,
					Priority:  &lowPriority,
					Labels:    map[string]string{"pod": "p1"},
					Resources: defaultPodRes,
					Affinity: &v1.Affinity{
						PodAntiAffinity: &v1.PodAntiAffinity{
							RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
								{
									LabelSelector: &metav1.LabelSelector{
										MatchExpressions: []metav1.LabelSelectorRequirement{
											{
												Key:      "pod",
												Operator: metav1.LabelSelectorOpIn,
												Values:   []string{"preemptor"},
											},
										},
									},
									TopologyKey: "node",
								},
							},
						},
					},
				}),
			},
			// A higher priority pod with anti-affinity.
			pod: initPausePod(cs, &pausePodConfig{
				Name:      "preemptor-pod",
				Namespace: context.ns.Name,
				Priority:  &highPriority,
				Labels:    map[string]string{"pod": "preemptor"},
				Resources: defaultPodRes,
				Affinity: &v1.Affinity{
					PodAntiAffinity: &v1.PodAntiAffinity{
						RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
							{
								LabelSelector: &metav1.LabelSelector{
									MatchExpressions: []metav1.LabelSelectorRequirement{
										{
											Key:      "pod",
											Operator: metav1.LabelSelectorOpIn,
											Values:   []string{"p0"},
										},
									},
								},
								TopologyKey: "node",
							},
						},
					},
				},
			}),
			preemptedPodIndexes: map[int]struct{}{0: {}, 1: {}},
		},
		{
			// Similar to the previous case, except that pod-1 is high priority.
			description: "preemption is not performed when anti-affinity is not satisfied",
			existingPods: []*v1.Pod{
				initPausePod(cs, &pausePodConfig{
					Name: "pod-0", Namespace: context.ns.Name,
					Priority:  &mediumPriority,
					Labels:    map[string]string{"pod": "p0"},
					Resources: defaultPodRes,
				}),
				initPausePod(cs, &pausePodConfig{
					Name: "pod-1", Namespace: context.ns.Name,
					Priority:  &highPriority,
					Labels:    map[string]string{"pod": "p1"},
					Resources: defaultPodRes,
					Affinity: &v1.Affinity{
						PodAntiAffinity: &v1.PodAntiAffinity{
							RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
								{
									LabelSelector: &metav1.LabelSelector{
										MatchExpressions: []metav1.LabelSelectorRequirement{
											{
												Key:      "pod",
												Operator: metav1.LabelSelectorOpIn,
												Values:   []string{"preemptor"},
											},
										},
									},
									TopologyKey: "node",
								},
							},
						},
					},
				}),
			},
			// A higher priority pod with anti-affinity.
			pod: initPausePod(cs, &pausePodConfig{
				Name:      "preemptor-pod",
				Namespace: context.ns.Name,
				Priority:  &highPriority,
				Labels:    map[string]string{"pod": "preemptor"},
				Resources: defaultPodRes,
				Affinity: &v1.Affinity{
					PodAntiAffinity: &v1.PodAntiAffinity{
						RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
							{
								LabelSelector: &metav1.LabelSelector{
									MatchExpressions: []metav1.LabelSelectorRequirement{
										{
											Key:      "pod",
											Operator: metav1.LabelSelectorOpIn,
											Values:   []string{"p0"},
										},
									},
								},
								TopologyKey: "node",
							},
						},
					},
				},
			}),
			preemptedPodIndexes: map[int]struct{}{},
		},
	}
	// Create a node with some resources and a label.
	nodeRes := &v1.ResourceList{
		v1.ResourcePods:   *resource.NewQuantity(32, resource.DecimalSI),
		v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
		v1.ResourceMemory: *resource.NewQuantity(500, resource.DecimalSI),
	}
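	// The node is deliberately small (500m CPU, 500 units of memory) so that the
	// preemptor cannot fit unless at least one victim is evicted.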
	node, err := createNode(context.clientSet, "node1", nodeRes)
	if err != nil {
		t.Fatalf("Error creating nodes: %v", err)
	}
	nodeLabels := map[string]string{"node": node.Name}
	if err = testutils.AddLabelsToNode(context.clientSet, node.Name, nodeLabels); err != nil {
		t.Fatalf("Cannot add labels to node: %v", err)
	}
	if err = waitForNodeLabels(context.clientSet, node.Name, nodeLabels); err != nil {
		t.Fatalf("Adding labels to node didn't succeed: %v", err)
	}
	for _, test := range tests {
		pods := make([]*v1.Pod, len(test.existingPods))
		// Create and run existingPods.
		for i, p := range test.existingPods {
			pods[i], err = runPausePod(cs, p)
			if err != nil {
				t.Fatalf("Test [%v]: Error running pause pod: %v", test.description, err)
			}
		}
		// Create the preemptor "pod".
		preemptor, err := createPausePod(cs, test.pod)
		if err != nil {
			t.Errorf("Error while creating high priority pod: %v", err)
		}
		// Wait for preemption of pods and make sure the other ones are not preempted.
		for i, p := range pods {
			if _, found := test.preemptedPodIndexes[i]; found {
				if err = wait.Poll(time.Second, wait.ForeverTestTimeout, podIsGettingEvicted(cs, p.Namespace, p.Name)); err != nil {
					t.Errorf("Test [%v]: Pod %v/%v is not getting evicted.", test.description, p.Namespace, p.Name)
				}
			} else {
				if p.DeletionTimestamp != nil {
					t.Errorf("Test [%v]: Didn't expect pod %v to get preempted.", test.description, p.Name)
				}
			}
		}
		// Also check that the preemptor pod gets the nominated node name set.
		if len(test.preemptedPodIndexes) > 0 {
			if err := waitForNominatedNodeName(cs, preemptor); err != nil {
				t.Errorf("Test [%v]: NominatedNodeName annotation was not set for pod %v: %v", test.description, preemptor.Name, err)
			}
		}
		// Cleanup
		pods = append(pods, preemptor)
		cleanupPods(cs, t, pods)
	}
}

// TestDisablePreemption tests that the scheduler does not preempt pods when preemption is disabled.
func TestDisablePreemption(t *testing.T) {
	// Enable PodPriority feature gate.
	defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.PodPriority, true)()
	// Initialize scheduler, and disable preemption.
	context := initTestDisablePreemption(t, "disable-preemption")
	defer cleanupTest(t, context)
	cs := context.clientSet
	tests := []struct {
		description  string
		existingPods []*v1.Pod
		pod          *v1.Pod
	}{
		{
			description: "pod preemption will not happen",
			existingPods: []*v1.Pod{
				initPausePod(context.clientSet, &pausePodConfig{
					Name:      "victim-pod",
					Namespace: context.ns.Name,
					Priority:  &lowPriority,
					Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
						v1.ResourceCPU:    *resource.NewMilliQuantity(400, resource.DecimalSI),
						v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
					},
				}),
			},
			pod: initPausePod(cs, &pausePodConfig{
				Name:      "preemptor-pod",
				Namespace: context.ns.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
				},
			}),
		},
	}
	// Create a node with some resources.
	nodeRes := &v1.ResourceList{
		v1.ResourcePods:   *resource.NewQuantity(32, resource.DecimalSI),
		v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
		v1.ResourceMemory: *resource.NewQuantity(500, resource.DecimalSI),
	}
	_, err := createNode(context.clientSet, "node1", nodeRes)
	if err != nil {
		t.Fatalf("Error creating nodes: %v", err)
	}
	for _, test := range tests {
		pods := make([]*v1.Pod, len(test.existingPods))
		// Create and run existingPods.
		for i, p := range test.existingPods {
			pods[i], err = runPausePod(cs, p)
			if err != nil {
				t.Fatalf("Test [%v]: Error running pause pod: %v", test.description, err)
			}
		}
		// Create the preemptor "pod".
		preemptor, err := createPausePod(cs, test.pod)
		if err != nil {
			t.Errorf("Error while creating high priority pod: %v", err)
		}
		// Ensure the preemptor stays unschedulable.
		if err := waitForPodUnschedulable(cs, preemptor); err != nil {
			t.Errorf("Test [%v]: Preemptor %v should not become scheduled",
				test.description, preemptor.Name)
		}
		// Ensure the preemptor does not get nominated.
		if err := waitForNominatedNodeNameWithTimeout(cs, preemptor, 5*time.Second); err == nil {
			t.Errorf("Test [%v]: Preemptor %v should not be nominated",
				test.description, preemptor.Name)
		}
		// Cleanup
		pods = append(pods, preemptor)
		cleanupPods(cs, t, pods)
	}
}
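
// mkPriorityPodWithGrace returns a pause pod with the given name and priority,
// requesting 100m CPU and 100 units of memory, and with the given termination
// grace period (in seconds).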
func mkPriorityPodWithGrace(tc *testContext, name string, priority int32, grace int64) *v1.Pod {
	defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{
		v1.ResourceCPU:    *resource.NewMilliQuantity(100, resource.DecimalSI),
		v1.ResourceMemory: *resource.NewQuantity(100, resource.DecimalSI)},
	}
	pod := initPausePod(tc.clientSet, &pausePodConfig{
		Name:      name,
		Namespace: tc.ns.Name,
		Priority:  &priority,
		Labels:    map[string]string{"pod": name},
		Resources: defaultPodRes,
	})
	// Setting grace period to zero. Otherwise, we may never see the actual deletion
	// of the pods in integration tests.
	pod.Spec.TerminationGracePeriodSeconds = &grace
	return pod
}

// This test ensures that while the preempting pod is waiting for the victims to
// terminate, other pending lower priority pods are not scheduled into the space
// created by preemption while the higher priority pod is not scheduled yet.
func TestPreemptionStarvation(t *testing.T) {
	// Enable PodPriority feature gate.
	defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.PodPriority, true)()
	// Initialize scheduler.
	context := initTest(t, "preemption")
	defer cleanupTest(t, context)
	cs := context.clientSet
	tests := []struct {
		description        string
		numExistingPod     int
		numExpectedPending int
		preemptor          *v1.Pod
	}{
		{
			// This case ensures that while the preempting pod is waiting for the victims
			// to terminate, other lower priority pods are not scheduled into the space
			// created by preemption while the higher priority pod is not scheduled yet.
			description:        "starvation test: higher priority pod is scheduled before the lower priority ones",
			numExistingPod:     10,
			numExpectedPending: 5,
			preemptor: initPausePod(cs, &pausePodConfig{
				Name:      "preemptor-pod",
				Namespace: context.ns.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
				},
			}),
		},
	}
	// Create a node with some resources.
	nodeRes := &v1.ResourceList{
		v1.ResourcePods:   *resource.NewQuantity(32, resource.DecimalSI),
		v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
		v1.ResourceMemory: *resource.NewQuantity(500, resource.DecimalSI),
	}
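	// At 100m CPU per pause pod, the 500m node can only run five of the ten pods,
	// so the remaining five are expected to stay pending.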
	_, err := createNode(context.clientSet, "node1", nodeRes)
	if err != nil {
		t.Fatalf("Error creating nodes: %v", err)
	}
	for _, test := range tests {
		pendingPods := make([]*v1.Pod, test.numExpectedPending)
		numRunningPods := test.numExistingPod - test.numExpectedPending
		runningPods := make([]*v1.Pod, numRunningPods)
		// Create and run existingPods.
		for i := 0; i < numRunningPods; i++ {
			runningPods[i], err = createPausePod(cs, mkPriorityPodWithGrace(context, fmt.Sprintf("rpod-%v", i), mediumPriority, 0))
			if err != nil {
				t.Fatalf("Test [%v]: Error creating pause pod: %v", test.description, err)
			}
		}
		// Make sure that runningPods are all scheduled.
		for _, p := range runningPods {
			if err := waitForPodToSchedule(cs, p); err != nil {
				t.Fatalf("Pod %v/%v didn't get scheduled: %v", p.Namespace, p.Name, err)
			}
		}
		// Create pending pods.
		for i := 0; i < test.numExpectedPending; i++ {
			pendingPods[i], err = createPausePod(cs, mkPriorityPodWithGrace(context, fmt.Sprintf("ppod-%v", i), mediumPriority, 0))
			if err != nil {
				t.Fatalf("Test [%v]: Error creating pending pod: %v", test.description, err)
			}
		}
		// Make sure that all pending pods are being marked unschedulable.
		for _, p := range pendingPods {
			if err := wait.Poll(100*time.Millisecond, wait.ForeverTestTimeout,
				podUnschedulable(cs, p.Namespace, p.Name)); err != nil {
				t.Errorf("Pod %v/%v didn't get marked unschedulable: %v", p.Namespace, p.Name, err)
			}
		}
		// Create the preemptor.
		preemptor, err := createPausePod(cs, test.preemptor)
		if err != nil {
			t.Errorf("Error while creating the preempting pod: %v", err)
		}
		// Check that the preemptor pod gets the nominated node name set.
		if err := waitForNominatedNodeName(cs, preemptor); err != nil {
			t.Errorf("Test [%v]: NominatedNodeName annotation was not set for pod %v/%v: %v", test.description, preemptor.Namespace, preemptor.Name, err)
		}
		// Make sure that preemptor is scheduled after preemptions.
		if err := waitForPodToScheduleWithTimeout(cs, preemptor, 60*time.Second); err != nil {
			t.Errorf("Preemptor pod %v didn't get scheduled: %v", preemptor.Name, err)
		}
		// Cleanup
		klog.Info("Cleaning up all pods...")
		allPods := pendingPods
		allPods = append(allPods, runningPods...)
		allPods = append(allPods, preemptor)
		cleanupPods(cs, t, allPods)
	}
}

// TestPreemptionRaces tests that other scheduling events and operations do not
// race with the preemption process.
func TestPreemptionRaces(t *testing.T) {
	// Initialize scheduler.
	context := initTest(t, "preemption-race")
	defer cleanupTest(t, context)
	cs := context.clientSet
	tests := []struct {
		description       string
		numInitialPods    int // Pods created and executed before running preemptor
		numAdditionalPods int // Pods created after creating the preemptor
		numRepetitions    int // Repeat the tests to check races
		preemptor         *v1.Pod
	}{
		{
			// This case ensures that while the preempting pod is waiting for the victims
			// to terminate, other lower priority pods are not scheduled into the space
			// created by preemption while the higher priority pod is not scheduled yet.
			description:       "ensures that other pods are not scheduled while preemptor is being marked as nominated (issue #72124)",
			numInitialPods:    2,
			numAdditionalPods: 50,
			numRepetitions:    10,
			preemptor: initPausePod(cs, &pausePodConfig{
				Name:      "preemptor-pod",
				Namespace: context.ns.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(4900, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(4900, resource.DecimalSI)},
				},
			}),
		},
	}
	// Create a node with some resources.
	nodeRes := &v1.ResourceList{
		v1.ResourcePods:   *resource.NewQuantity(100, resource.DecimalSI),
		v1.ResourceCPU:    *resource.NewMilliQuantity(5000, resource.DecimalSI),
		v1.ResourceMemory: *resource.NewQuantity(5000, resource.DecimalSI),
	}
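	// The preemptor requests 4900m of the node's 5000m CPU, so essentially all
	// other pods on the node must be preempted before it can be scheduled.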
	_, err := createNode(context.clientSet, "node1", nodeRes)
	if err != nil {
		t.Fatalf("Error creating nodes: %v", err)
	}
	for _, test := range tests {
		if test.numRepetitions <= 0 {
			test.numRepetitions = 1
		}
		for n := 0; n < test.numRepetitions; n++ {
			initialPods := make([]*v1.Pod, test.numInitialPods)
			additionalPods := make([]*v1.Pod, test.numAdditionalPods)
			// Create and run existingPods.
			for i := 0; i < test.numInitialPods; i++ {
				initialPods[i], err = createPausePod(cs, mkPriorityPodWithGrace(context, fmt.Sprintf("rpod-%v", i), mediumPriority, 0))
				if err != nil {
					t.Fatalf("Test [%v]: Error creating pause pod: %v", test.description, err)
				}
			}
			// Make sure that the initial pods are all scheduled.
			for _, p := range initialPods {
				if err := waitForPodToSchedule(cs, p); err != nil {
					t.Fatalf("Pod %v/%v didn't get scheduled: %v", p.Namespace, p.Name, err)
				}
			}
			// Create the preemptor.
			klog.Info("Creating the preemptor pod...")
			preemptor, err := createPausePod(cs, test.preemptor)
			if err != nil {
				t.Errorf("Error while creating the preempting pod: %v", err)
			}
			klog.Info("Creating additional pods...")
			for i := 0; i < test.numAdditionalPods; i++ {
				additionalPods[i], err = createPausePod(cs, mkPriorityPodWithGrace(context, fmt.Sprintf("ppod-%v", i), mediumPriority, 0))
				if err != nil {
					t.Fatalf("Test [%v]: Error creating pending pod: %v", test.description, err)
				}
			}
			// Check that the preemptor pod gets the nominated node name set.
			if err := waitForNominatedNodeName(cs, preemptor); err != nil {
				t.Errorf("Test [%v]: NominatedNodeName annotation was not set for pod %v/%v: %v", test.description, preemptor.Namespace, preemptor.Name, err)
			}
			// Make sure that preemptor is scheduled after preemptions.
			if err := waitForPodToScheduleWithTimeout(cs, preemptor, 60*time.Second); err != nil {
				t.Errorf("Preemptor pod %v didn't get scheduled: %v", preemptor.Name, err)
			}
			klog.Info("Check that unschedulable pods still exist and were never scheduled...")
			for _, p := range additionalPods {
				pod, err := cs.CoreV1().Pods(p.Namespace).Get(p.Name, metav1.GetOptions{})
				if err != nil {
					t.Errorf("Error in getting Pod %v/%v info: %v", p.Namespace, p.Name, err)
				}
				if len(pod.Spec.NodeName) > 0 {
					t.Errorf("Pod %v/%v is already scheduled", p.Namespace, p.Name)
				}
				_, cond := podutil.GetPodCondition(&pod.Status, v1.PodScheduled)
				if cond != nil && cond.Status != v1.ConditionFalse {
					t.Errorf("Pod %v/%v is no longer unschedulable: %v", p.Namespace, p.Name, err)
				}
			}
			// Cleanup
			klog.Info("Cleaning up all pods...")
			allPods := additionalPods
			allPods = append(allPods, initialPods...)
			allPods = append(allPods, preemptor)
			cleanupPods(cs, t, allPods)
		}
	}
}

// TestNominatedNodeCleanUp checks that when there are nominated pods on a
// node and a higher priority pod is nominated to run on the node, the nominated
// node name of the lower priority pods is cleared.
// Test scenario:
// 1. Create a few low priority pods with long grace periods that fill up a node.
// 2. Create a medium priority pod that preempts some of those pods.
// 3. Check that the nominated node name of the medium priority pod is set.
// 4. Create a high priority pod that preempts some pods on that node.
// 5. Check that the nominated node name of the high priority pod is set and the
// nominated node name of the medium priority pod is cleared.
func TestNominatedNodeCleanUp(t *testing.T) {
	// Enable PodPriority feature gate.
	defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.PodPriority, true)()
	// Initialize scheduler.
	context := initTest(t, "preemption")
	defer cleanupTest(t, context)
	cs := context.clientSet
	defer cleanupPodsInNamespace(cs, t, context.ns.Name)
	// Create a node with some resources.
	nodeRes := &v1.ResourceList{
		v1.ResourcePods:   *resource.NewQuantity(32, resource.DecimalSI),
		v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
		v1.ResourceMemory: *resource.NewQuantity(500, resource.DecimalSI),
	}
	_, err := createNode(context.clientSet, "node1", nodeRes)
	if err != nil {
		t.Fatalf("Error creating nodes: %v", err)
	}
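	// Four 100m-CPU low priority pods leave only 100m free on the 500m node, so
	// both the medium (400m) and high (300m) priority pods below must preempt to fit.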
	// Step 1. Create a few low priority pods.
	lowPriPods := make([]*v1.Pod, 4)
	for i := 0; i < len(lowPriPods); i++ {
		lowPriPods[i], err = createPausePod(cs, mkPriorityPodWithGrace(context, fmt.Sprintf("lpod-%v", i), lowPriority, 60))
		if err != nil {
			t.Fatalf("Error creating pause pod: %v", err)
		}
	}
	// Make sure that the pods are all scheduled.
	for _, p := range lowPriPods {
		if err := waitForPodToSchedule(cs, p); err != nil {
			t.Fatalf("Pod %v/%v didn't get scheduled: %v", p.Namespace, p.Name, err)
		}
	}
	// Step 2. Create a medium priority pod.
	podConf := initPausePod(cs, &pausePodConfig{
		Name:      "medium-priority",
		Namespace: context.ns.Name,
		Priority:  &mediumPriority,
		Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
			v1.ResourceCPU:    *resource.NewMilliQuantity(400, resource.DecimalSI),
			v1.ResourceMemory: *resource.NewQuantity(400, resource.DecimalSI)},
		},
	})
	medPriPod, err := createPausePod(cs, podConf)
	if err != nil {
		t.Errorf("Error while creating the medium priority pod: %v", err)
	}
	// Step 3. Check that the nominated node name of the medium priority pod is set.
	if err := waitForNominatedNodeName(cs, medPriPod); err != nil {
		t.Errorf("NominatedNodeName annotation was not set for pod %v/%v: %v", medPriPod.Namespace, medPriPod.Name, err)
	}
	// Step 4. Create a high priority pod.
	podConf = initPausePod(cs, &pausePodConfig{
		Name:      "high-priority",
		Namespace: context.ns.Name,
		Priority:  &highPriority,
		Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
			v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
			v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
		},
	})
	highPriPod, err := createPausePod(cs, podConf)
	if err != nil {
		t.Errorf("Error while creating the high priority pod: %v", err)
	}
	// Step 5. Check that the nominated node name of the high priority pod is set.
	if err := waitForNominatedNodeName(cs, highPriPod); err != nil {
		t.Errorf("NominatedNodeName annotation was not set for pod %v/%v: %v", highPriPod.Namespace, highPriPod.Name, err)
	}
	// And the nominated node name of the medium priority pod is cleared.
	if err := wait.Poll(100*time.Millisecond, wait.ForeverTestTimeout, func() (bool, error) {
		pod, err := cs.CoreV1().Pods(medPriPod.Namespace).Get(medPriPod.Name, metav1.GetOptions{})
		if err != nil {
			t.Errorf("Error getting the medium priority pod info: %v", err)
		}
		if len(pod.Status.NominatedNodeName) == 0 {
			return true, nil
		}
		return false, err
	}); err != nil {
		t.Errorf("The nominated node name of the medium priority pod was not cleared: %v", err)
	}
}
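
// mkMinAvailablePDB returns a PodDisruptionBudget with the given minAvailable
// value and pod selector. (The uid parameter is accepted but not used here.)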
func mkMinAvailablePDB(name, namespace string, uid types.UID, minAvailable int, matchLabels map[string]string) *policy.PodDisruptionBudget {
	intMinAvailable := intstr.FromInt(minAvailable)
	return &policy.PodDisruptionBudget{
		ObjectMeta: metav1.ObjectMeta{
			Name:      name,
			Namespace: namespace,
		},
		Spec: policy.PodDisruptionBudgetSpec{
			MinAvailable: &intMinAvailable,
			Selector:     &metav1.LabelSelector{MatchLabels: matchLabels},
		},
	}
}
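
// addPodConditionReady marks the pod as Running and Ready so that the
// disruption controller counts it towards the PDB's healthy pods.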
func addPodConditionReady(pod *v1.Pod) {
	pod.Status = v1.PodStatus{
		Phase: v1.PodRunning,
		Conditions: []v1.PodCondition{
			{
				Type:   v1.PodReady,
				Status: v1.ConditionTrue,
			},
		},
	}
}

// TestPDBInPreemption tests PodDisruptionBudget support in preemption.
func TestPDBInPreemption(t *testing.T) {
	// Enable PodPriority feature gate.
	defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.PodPriority, true)()
	// Initialize scheduler.
	context := initTest(t, "preemption-pdb")
	defer cleanupTest(t, context)
	cs := context.clientSet
	initDisruptionController(t, context)
	defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{
		v1.ResourceCPU:    *resource.NewMilliQuantity(100, resource.DecimalSI),
		v1.ResourceMemory: *resource.NewQuantity(100, resource.DecimalSI)},
	}
	defaultNodeRes := &v1.ResourceList{
		v1.ResourcePods:   *resource.NewQuantity(32, resource.DecimalSI),
		v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
		v1.ResourceMemory: *resource.NewQuantity(500, resource.DecimalSI),
	}
	type nodeConfig struct {
		name string
		res  *v1.ResourceList
	}
	tests := []struct {
		description         string
		nodes               []*nodeConfig
		pdbs                []*policy.PodDisruptionBudget
		pdbPodNum           []int32
		existingPods        []*v1.Pod
		pod                 *v1.Pod
		preemptedPodIndexes map[int]struct{}
	}{
		{
			description: "A non-PDB violating pod is preempted despite its higher priority",
			nodes:       []*nodeConfig{{name: "node-1", res: defaultNodeRes}},
			pdbs: []*policy.PodDisruptionBudget{
				mkMinAvailablePDB("pdb-1", context.ns.Name, types.UID("pdb-1-uid"), 2, map[string]string{"foo": "bar"}),
			},
			pdbPodNum: []int32{2},
			existingPods: []*v1.Pod{
				initPausePod(context.clientSet, &pausePodConfig{
					Name:      "low-pod1",
					Namespace: context.ns.Name,
					Priority:  &lowPriority,
					Resources: defaultPodRes,
					Labels:    map[string]string{"foo": "bar"},
				}),
				initPausePod(context.clientSet, &pausePodConfig{
					Name:      "low-pod2",
					Namespace: context.ns.Name,
					Priority:  &lowPriority,
					Resources: defaultPodRes,
					Labels:    map[string]string{"foo": "bar"},
				}),
				initPausePod(context.clientSet, &pausePodConfig{
					Name:      "mid-pod3",
					Namespace: context.ns.Name,
					Priority:  &mediumPriority,
					Resources: defaultPodRes,
				}),
			},
			pod: initPausePod(cs, &pausePodConfig{
				Name:      "preemptor-pod",
				Namespace: context.ns.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
				},
			}),
			preemptedPodIndexes: map[int]struct{}{2: {}},
		},
		{
			description: "A node without any PDB violating pods is preferred for preemption",
			nodes: []*nodeConfig{
				{name: "node-1", res: defaultNodeRes},
				{name: "node-2", res: defaultNodeRes},
			},
			pdbs: []*policy.PodDisruptionBudget{
				mkMinAvailablePDB("pdb-1", context.ns.Name, types.UID("pdb-1-uid"), 2, map[string]string{"foo": "bar"}),
			},
			pdbPodNum: []int32{1},
			existingPods: []*v1.Pod{
				initPausePod(context.clientSet, &pausePodConfig{
					Name:      "low-pod1",
					Namespace: context.ns.Name,
					Priority:  &lowPriority,
					Resources: defaultPodRes,
					NodeName:  "node-1",
					Labels:    map[string]string{"foo": "bar"},
				}),
				initPausePod(context.clientSet, &pausePodConfig{
					Name:      "mid-pod2",
					Namespace: context.ns.Name,
					Priority:  &mediumPriority,
					NodeName:  "node-2",
					Resources: defaultPodRes,
				}),
			},
			pod: initPausePod(cs, &pausePodConfig{
				Name:      "preemptor-pod",
				Namespace: context.ns.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
				},
			}),
			preemptedPodIndexes: map[int]struct{}{1: {}},
		},
		{
			description: "A node with fewer PDB violating pods is preferred for preemption",
			nodes: []*nodeConfig{
				{name: "node-1", res: defaultNodeRes},
				{name: "node-2", res: defaultNodeRes},
				{name: "node-3", res: defaultNodeRes},
			},
			pdbs: []*policy.PodDisruptionBudget{
				mkMinAvailablePDB("pdb-1", context.ns.Name, types.UID("pdb-1-uid"), 2, map[string]string{"foo1": "bar"}),
				mkMinAvailablePDB("pdb-2", context.ns.Name, types.UID("pdb-2-uid"), 2, map[string]string{"foo2": "bar"}),
			},
			pdbPodNum: []int32{1, 5},
			existingPods: []*v1.Pod{
				initPausePod(context.clientSet, &pausePodConfig{
					Name:      "low-pod1",
					Namespace: context.ns.Name,
					Priority:  &lowPriority,
					Resources: defaultPodRes,
					NodeName:  "node-1",
					Labels:    map[string]string{"foo1": "bar"},
				}),
				initPausePod(context.clientSet, &pausePodConfig{
					Name:      "mid-pod1",
					Namespace: context.ns.Name,
					Priority:  &mediumPriority,
					Resources: defaultPodRes,
					NodeName:  "node-1",
				}),
				initPausePod(context.clientSet, &pausePodConfig{
					Name:      "low-pod2",
					Namespace: context.ns.Name,
					Priority:  &lowPriority,
					Resources: defaultPodRes,
					NodeName:  "node-2",
					Labels:    map[string]string{"foo2": "bar"},
				}),
				initPausePod(context.clientSet, &pausePodConfig{
					Name:      "mid-pod2",
					Namespace: context.ns.Name,
					Priority:  &mediumPriority,
					Resources: defaultPodRes,
					NodeName:  "node-2",
					Labels:    map[string]string{"foo2": "bar"},
				}),
				initPausePod(context.clientSet, &pausePodConfig{
					Name:      "low-pod4",
					Namespace: context.ns.Name,
					Priority:  &lowPriority,
					Resources: defaultPodRes,
					NodeName:  "node-3",
					Labels:    map[string]string{"foo2": "bar"},
				}),
				initPausePod(context.clientSet, &pausePodConfig{
					Name:      "low-pod5",
					Namespace: context.ns.Name,
					Priority:  &lowPriority,
					Resources: defaultPodRes,
					NodeName:  "node-3",
					Labels:    map[string]string{"foo2": "bar"},
				}),
				initPausePod(context.clientSet, &pausePodConfig{
					Name:      "low-pod6",
					Namespace: context.ns.Name,
					Priority:  &lowPriority,
					Resources: defaultPodRes,
					NodeName:  "node-3",
					Labels:    map[string]string{"foo2": "bar"},
				}),
			},
			pod: initPausePod(cs, &pausePodConfig{
				Name:      "preemptor-pod",
				Namespace: context.ns.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(400, resource.DecimalSI)},
				},
			}),
			// node-3 is chosen because preempting its pods does not violate its PDB,
			// and its victims have lower priority than those on node-2.
			preemptedPodIndexes: map[int]struct{}{4: {}, 5: {}, 6: {}},
		},
	}
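	// Each case below creates its own nodes, pods, and PDBs, and deletes all of
	// them (including the nodes) at the end of the iteration.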
	for _, test := range tests {
		t.Logf("================ Running test: %v\n", test.description)
		for _, nodeConf := range test.nodes {
			_, err := createNode(cs, nodeConf.name, nodeConf.res)
			if err != nil {
				t.Fatalf("Error creating node %v: %v", nodeConf.name, err)
			}
		}
		pods := make([]*v1.Pod, len(test.existingPods))
		var err error
		// Create and run existingPods.
		for i, p := range test.existingPods {
			if pods[i], err = runPausePod(cs, p); err != nil {
				t.Fatalf("Test [%v]: Error running pause pod: %v", test.description, err)
			}
			// Add pod condition ready so that PDB is updated.
			addPodConditionReady(p)
			if _, err := context.clientSet.CoreV1().Pods(context.ns.Name).UpdateStatus(p); err != nil {
				t.Fatal(err)
			}
		}
		// Wait for Pods to be stable in scheduler cache.
		if err := waitCachedPodsStable(context, test.existingPods); err != nil {
			t.Fatalf("Not all pods are stable in the cache: %v", err)
		}
		// Create PDBs.
		for _, pdb := range test.pdbs {
			_, err := context.clientSet.PolicyV1beta1().PodDisruptionBudgets(context.ns.Name).Create(pdb)
			if err != nil {
				t.Fatalf("Failed to create PDB: %v", err)
			}
		}
		// Wait for PDBs to become stable.
		if err := waitForPDBsStable(context, test.pdbs, test.pdbPodNum); err != nil {
			t.Fatalf("Not all pdbs are stable in the cache: %v", err)
		}
		// Create the preemptor "pod".
		preemptor, err := createPausePod(cs, test.pod)
		if err != nil {
			t.Errorf("Error while creating high priority pod: %v", err)
		}
		// Wait for preemption of pods and make sure the other ones are not preempted.
		for i, p := range pods {
			if _, found := test.preemptedPodIndexes[i]; found {
				if err = wait.Poll(time.Second, wait.ForeverTestTimeout, podIsGettingEvicted(cs, p.Namespace, p.Name)); err != nil {
					t.Errorf("Test [%v]: Pod %v/%v is not getting evicted.", test.description, p.Namespace, p.Name)
				}
			} else {
				if p.DeletionTimestamp != nil {
					t.Errorf("Test [%v]: Didn't expect pod %v/%v to get preempted.", test.description, p.Namespace, p.Name)
				}
			}
		}
		// Also check that the preemptor pod gets the nominated node name set.
		if len(test.preemptedPodIndexes) > 0 {
			if err := waitForNominatedNodeName(cs, preemptor); err != nil {
				t.Errorf("Test [%v]: NominatedNodeName annotation was not set for pod %v/%v: %v", test.description, preemptor.Namespace, preemptor.Name, err)
			}
		}
		// Cleanup
		pods = append(pods, preemptor)
		cleanupPods(cs, t, pods)
		cs.PolicyV1beta1().PodDisruptionBudgets(context.ns.Name).DeleteCollection(nil, metav1.ListOptions{})
		cs.CoreV1().Nodes().DeleteCollection(nil, metav1.ListOptions{})
	}
}