node_lifecycle_controller_test.go 104 KB


  1. /*
  2. Copyright 2017 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package nodelifecycle
  14. import (
  15. "strings"
  16. "testing"
  17. "time"
  18. apps "k8s.io/api/apps/v1"
  19. coordv1beta1 "k8s.io/api/coordination/v1beta1"
  20. "k8s.io/api/core/v1"
  21. apiequality "k8s.io/apimachinery/pkg/api/equality"
  22. "k8s.io/apimachinery/pkg/api/resource"
  23. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  24. "k8s.io/apimachinery/pkg/util/diff"
  25. utilfeature "k8s.io/apiserver/pkg/util/feature"
  26. "k8s.io/client-go/informers"
  27. appsinformers "k8s.io/client-go/informers/apps/v1"
  28. coordinformers "k8s.io/client-go/informers/coordination/v1beta1"
  29. coreinformers "k8s.io/client-go/informers/core/v1"
  30. clientset "k8s.io/client-go/kubernetes"
  31. "k8s.io/client-go/kubernetes/fake"
  32. testcore "k8s.io/client-go/testing"
  33. featuregatetesting "k8s.io/component-base/featuregate/testing"
  34. "k8s.io/kubernetes/pkg/controller"
  35. "k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler"
  36. "k8s.io/kubernetes/pkg/controller/testutil"
  37. nodeutil "k8s.io/kubernetes/pkg/controller/util/node"
  38. "k8s.io/kubernetes/pkg/features"
  39. kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
  40. schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
  41. "k8s.io/kubernetes/pkg/util/node"
  42. taintutils "k8s.io/kubernetes/pkg/util/taints"
  43. "k8s.io/utils/pointer"
  44. )
  // Shared tunables handed to newNodeLifecycleControllerFromClient by the
  // tests in this file.
  const (
  	// Grace/monitor periods passed as the controller's nodeMonitorGracePeriod,
  	// nodeStartupGracePeriod and nodeMonitorPeriod arguments respectively.
  	testNodeMonitorGracePeriod = 40 * time.Second
  	testNodeStartupGracePeriod = 60 * time.Second
  	testNodeMonitorPeriod      = 5 * time.Second

  	// Passed as both the primary and the secondary eviction limiter QPS.
  	testRateLimiterQPS float32 = 10000

  	// Passed as the largeClusterThreshold argument; left untyped so it can be
  	// used wherever an integer constant is accepted.
  	testLargeClusterThreshold = 20

  	// Passed as the unhealthyZoneThreshold argument.
  	testUnhealthyThreshold float32 = 0.55
  )
  // alwaysReady unconditionally reports true. The tests install it as the
  // controller's *InformerSynced functions so no real informer sync is needed.
  func alwaysReady() bool {
  	return true
  }
  // nodeLifecycleController wraps the Controller under test together with the
  // informers it was constructed from, so tests can write objects straight into
  // the informers' stores.
  //
  // Field order matters: newNodeLifecycleControllerFromClient builds this
  // struct with an unkeyed composite literal.
  type nodeLifecycleController struct {
  *Controller
  // leaseInformer backs syncLeaseStore.
  leaseInformer coordinformers.LeaseInformer
  // nodeInformer backs syncNodeStore.
  nodeInformer coreinformers.NodeInformer
  // daemonSetInformer lets tests add DaemonSets to the store and obtain a lister.
  daemonSetInformer appsinformers.DaemonSetInformer
  }
  60. // doEviction does the fake eviction and returns the status of eviction operation.
  61. func (nc *nodeLifecycleController) doEviction(fakeNodeHandler *testutil.FakeNodeHandler) bool {
  62. var podEvicted bool
  63. zones := testutil.GetZones(fakeNodeHandler)
  64. for _, zone := range zones {
  65. nc.zonePodEvictor[zone].Try(func(value scheduler.TimedValue) (bool, time.Duration) {
  66. uid, _ := value.UID.(string)
  67. nodeutil.DeletePods(fakeNodeHandler, nc.recorder, value.Value, uid, nc.daemonSetStore)
  68. return true, 0
  69. })
  70. }
  71. for _, action := range fakeNodeHandler.Actions() {
  72. if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" {
  73. podEvicted = true
  74. return podEvicted
  75. }
  76. }
  77. return podEvicted
  78. }
  79. func createNodeLease(nodeName string, renewTime metav1.MicroTime) *coordv1beta1.Lease {
  80. return &coordv1beta1.Lease{
  81. ObjectMeta: metav1.ObjectMeta{
  82. Name: nodeName,
  83. Namespace: v1.NamespaceNodeLease,
  84. },
  85. Spec: coordv1beta1.LeaseSpec{
  86. HolderIdentity: pointer.StringPtr(nodeName),
  87. RenewTime: &renewTime,
  88. },
  89. }
  90. }
  91. func (nc *nodeLifecycleController) syncLeaseStore(lease *coordv1beta1.Lease) error {
  92. if lease == nil {
  93. return nil
  94. }
  95. newElems := make([]interface{}, 0, 1)
  96. newElems = append(newElems, lease)
  97. return nc.leaseInformer.Informer().GetStore().Replace(newElems, "newRV")
  98. }
  99. func (nc *nodeLifecycleController) syncNodeStore(fakeNodeHandler *testutil.FakeNodeHandler) error {
  100. nodes, err := fakeNodeHandler.List(metav1.ListOptions{})
  101. if err != nil {
  102. return err
  103. }
  104. newElems := make([]interface{}, 0, len(nodes.Items))
  105. for i := range nodes.Items {
  106. newElems = append(newElems, &nodes.Items[i])
  107. }
  108. return nc.nodeInformer.Informer().GetStore().Replace(newElems, "newRV")
  109. }
  110. func newNodeLifecycleControllerFromClient(
  111. kubeClient clientset.Interface,
  112. podEvictionTimeout time.Duration,
  113. evictionLimiterQPS float32,
  114. secondaryEvictionLimiterQPS float32,
  115. largeClusterThreshold int32,
  116. unhealthyZoneThreshold float32,
  117. nodeMonitorGracePeriod time.Duration,
  118. nodeStartupGracePeriod time.Duration,
  119. nodeMonitorPeriod time.Duration,
  120. useTaints bool,
  121. ) (*nodeLifecycleController, error) {
  122. factory := informers.NewSharedInformerFactory(kubeClient, controller.NoResyncPeriodFunc())
  123. leaseInformer := factory.Coordination().V1beta1().Leases()
  124. nodeInformer := factory.Core().V1().Nodes()
  125. daemonSetInformer := factory.Apps().V1().DaemonSets()
  126. nc, err := NewNodeLifecycleController(
  127. leaseInformer,
  128. factory.Core().V1().Pods(),
  129. nodeInformer,
  130. daemonSetInformer,
  131. kubeClient,
  132. nodeMonitorPeriod,
  133. nodeStartupGracePeriod,
  134. nodeMonitorGracePeriod,
  135. podEvictionTimeout,
  136. evictionLimiterQPS,
  137. secondaryEvictionLimiterQPS,
  138. largeClusterThreshold,
  139. unhealthyZoneThreshold,
  140. useTaints,
  141. useTaints,
  142. useTaints,
  143. )
  144. if err != nil {
  145. return nil, err
  146. }
  147. nc.leaseInformerSynced = alwaysReady
  148. nc.podInformerSynced = alwaysReady
  149. nc.nodeInformerSynced = alwaysReady
  150. nc.daemonSetInformerSynced = alwaysReady
  151. return &nodeLifecycleController{nc, leaseInformer, nodeInformer, daemonSetInformer}, nil
  152. }
  153. func TestMonitorNodeHealthEvictPods(t *testing.T) {
  154. fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
  155. evictionTimeout := 10 * time.Minute
  156. labels := map[string]string{
  157. v1.LabelZoneRegion: "region1",
  158. v1.LabelZoneFailureDomain: "zone1",
  159. }
  160. // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady
  161. // we need second healthy node in tests. Because of how the tests are written we need to update
  162. // the status of this Node.
  163. healthyNodeNewStatus := v1.NodeStatus{
  164. Conditions: []v1.NodeCondition{
  165. {
  166. Type: v1.NodeReady,
  167. Status: v1.ConditionTrue,
  168. // Node status has just been updated, and is NotReady for 10min.
  169. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 9, 0, 0, time.UTC),
  170. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  171. },
  172. },
  173. }
  174. table := []struct {
  175. fakeNodeHandler *testutil.FakeNodeHandler
  176. daemonSets []apps.DaemonSet
  177. timeToPass time.Duration
  178. newNodeStatus v1.NodeStatus
  179. secondNodeNewStatus v1.NodeStatus
  180. expectedEvictPods bool
  181. description string
  182. }{
  183. // Node created recently, with no status (happens only at cluster startup).
  184. {
  185. fakeNodeHandler: &testutil.FakeNodeHandler{
  186. Existing: []*v1.Node{
  187. {
  188. ObjectMeta: metav1.ObjectMeta{
  189. Name: "node0",
  190. CreationTimestamp: fakeNow,
  191. Labels: map[string]string{
  192. v1.LabelZoneRegion: "region1",
  193. v1.LabelZoneFailureDomain: "zone1",
  194. },
  195. },
  196. },
  197. {
  198. ObjectMeta: metav1.ObjectMeta{
  199. Name: "node1",
  200. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  201. Labels: map[string]string{
  202. v1.LabelZoneRegion: "region1",
  203. v1.LabelZoneFailureDomain: "zone1",
  204. },
  205. },
  206. Status: v1.NodeStatus{
  207. Conditions: []v1.NodeCondition{
  208. {
  209. Type: v1.NodeReady,
  210. Status: v1.ConditionTrue,
  211. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  212. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  213. },
  214. },
  215. },
  216. },
  217. },
  218. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  219. },
  220. daemonSets: nil,
  221. timeToPass: 0,
  222. newNodeStatus: v1.NodeStatus{},
  223. secondNodeNewStatus: healthyNodeNewStatus,
  224. expectedEvictPods: false,
  225. description: "Node created recently, with no status.",
  226. },
  227. // Node created recently without FailureDomain labels which is added back later, with no status (happens only at cluster startup).
  228. {
  229. fakeNodeHandler: &testutil.FakeNodeHandler{
  230. Existing: []*v1.Node{
  231. {
  232. ObjectMeta: metav1.ObjectMeta{
  233. Name: "node0",
  234. CreationTimestamp: fakeNow,
  235. },
  236. },
  237. {
  238. ObjectMeta: metav1.ObjectMeta{
  239. Name: "node1",
  240. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  241. },
  242. Status: v1.NodeStatus{
  243. Conditions: []v1.NodeCondition{
  244. {
  245. Type: v1.NodeReady,
  246. Status: v1.ConditionTrue,
  247. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  248. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  249. },
  250. },
  251. },
  252. },
  253. },
  254. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  255. },
  256. daemonSets: nil,
  257. timeToPass: 0,
  258. newNodeStatus: v1.NodeStatus{},
  259. secondNodeNewStatus: healthyNodeNewStatus,
  260. expectedEvictPods: false,
  261. description: "Node created recently without FailureDomain labels which is added back later, with no status (happens only at cluster startup).",
  262. },
  263. // Node created long time ago, and kubelet posted NotReady for a short period of time.
  264. {
  265. fakeNodeHandler: &testutil.FakeNodeHandler{
  266. Existing: []*v1.Node{
  267. {
  268. ObjectMeta: metav1.ObjectMeta{
  269. Name: "node0",
  270. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  271. Labels: map[string]string{
  272. v1.LabelZoneRegion: "region1",
  273. v1.LabelZoneFailureDomain: "zone1",
  274. },
  275. },
  276. Status: v1.NodeStatus{
  277. Conditions: []v1.NodeCondition{
  278. {
  279. Type: v1.NodeReady,
  280. Status: v1.ConditionFalse,
  281. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  282. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  283. },
  284. },
  285. },
  286. },
  287. {
  288. ObjectMeta: metav1.ObjectMeta{
  289. Name: "node1",
  290. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  291. Labels: map[string]string{
  292. v1.LabelZoneRegion: "region1",
  293. v1.LabelZoneFailureDomain: "zone1",
  294. },
  295. },
  296. Status: v1.NodeStatus{
  297. Conditions: []v1.NodeCondition{
  298. {
  299. Type: v1.NodeReady,
  300. Status: v1.ConditionTrue,
  301. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  302. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  303. },
  304. },
  305. },
  306. },
  307. },
  308. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  309. },
  310. daemonSets: nil,
  311. timeToPass: evictionTimeout,
  312. newNodeStatus: v1.NodeStatus{
  313. Conditions: []v1.NodeCondition{
  314. {
  315. Type: v1.NodeReady,
  316. Status: v1.ConditionFalse,
  317. // Node status has just been updated, and is NotReady for 10min.
  318. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 9, 0, 0, time.UTC),
  319. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  320. },
  321. },
  322. },
  323. secondNodeNewStatus: healthyNodeNewStatus,
  324. expectedEvictPods: false,
  325. description: "Node created long time ago, and kubelet posted NotReady for a short period of time.",
  326. },
  327. // Pod is ds-managed, and kubelet posted NotReady for a long period of time.
  328. {
  329. fakeNodeHandler: &testutil.FakeNodeHandler{
  330. Existing: []*v1.Node{
  331. {
  332. ObjectMeta: metav1.ObjectMeta{
  333. Name: "node0",
  334. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  335. Labels: map[string]string{
  336. v1.LabelZoneRegion: "region1",
  337. v1.LabelZoneFailureDomain: "zone1",
  338. },
  339. },
  340. Status: v1.NodeStatus{
  341. Conditions: []v1.NodeCondition{
  342. {
  343. Type: v1.NodeReady,
  344. Status: v1.ConditionFalse,
  345. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  346. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  347. },
  348. },
  349. },
  350. },
  351. {
  352. ObjectMeta: metav1.ObjectMeta{
  353. Name: "node1",
  354. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  355. Labels: map[string]string{
  356. v1.LabelZoneRegion: "region1",
  357. v1.LabelZoneFailureDomain: "zone1",
  358. },
  359. },
  360. Status: v1.NodeStatus{
  361. Conditions: []v1.NodeCondition{
  362. {
  363. Type: v1.NodeReady,
  364. Status: v1.ConditionTrue,
  365. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  366. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  367. },
  368. },
  369. },
  370. },
  371. },
  372. Clientset: fake.NewSimpleClientset(
  373. &v1.PodList{
  374. Items: []v1.Pod{
  375. {
  376. ObjectMeta: metav1.ObjectMeta{
  377. Name: "pod0",
  378. Namespace: "default",
  379. Labels: map[string]string{"daemon": "yes"},
  380. },
  381. Spec: v1.PodSpec{
  382. NodeName: "node0",
  383. },
  384. },
  385. },
  386. },
  387. ),
  388. },
  389. daemonSets: []apps.DaemonSet{
  390. {
  391. ObjectMeta: metav1.ObjectMeta{
  392. Name: "ds0",
  393. Namespace: "default",
  394. },
  395. Spec: apps.DaemonSetSpec{
  396. Selector: &metav1.LabelSelector{
  397. MatchLabels: map[string]string{"daemon": "yes"},
  398. },
  399. },
  400. },
  401. },
  402. timeToPass: time.Hour,
  403. newNodeStatus: v1.NodeStatus{
  404. Conditions: []v1.NodeCondition{
  405. {
  406. Type: v1.NodeReady,
  407. Status: v1.ConditionFalse,
  408. // Node status has just been updated, and is NotReady for 1hr.
  409. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 59, 0, 0, time.UTC),
  410. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  411. },
  412. },
  413. },
  414. secondNodeNewStatus: healthyNodeNewStatus,
  415. expectedEvictPods: false,
  416. description: "Pod is ds-managed, and kubelet posted NotReady for a long period of time.",
  417. },
  418. // Node created long time ago, and kubelet posted NotReady for a long period of time.
  419. {
  420. fakeNodeHandler: &testutil.FakeNodeHandler{
  421. Existing: []*v1.Node{
  422. {
  423. ObjectMeta: metav1.ObjectMeta{
  424. Name: "node0",
  425. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  426. Labels: map[string]string{
  427. v1.LabelZoneRegion: "region1",
  428. v1.LabelZoneFailureDomain: "zone1",
  429. },
  430. },
  431. Status: v1.NodeStatus{
  432. Conditions: []v1.NodeCondition{
  433. {
  434. Type: v1.NodeReady,
  435. Status: v1.ConditionFalse,
  436. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  437. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  438. },
  439. },
  440. },
  441. },
  442. {
  443. ObjectMeta: metav1.ObjectMeta{
  444. Name: "node1",
  445. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  446. Labels: map[string]string{
  447. v1.LabelZoneRegion: "region1",
  448. v1.LabelZoneFailureDomain: "zone1",
  449. },
  450. },
  451. Status: v1.NodeStatus{
  452. Conditions: []v1.NodeCondition{
  453. {
  454. Type: v1.NodeReady,
  455. Status: v1.ConditionTrue,
  456. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  457. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  458. },
  459. },
  460. },
  461. },
  462. },
  463. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  464. },
  465. daemonSets: nil,
  466. timeToPass: time.Hour,
  467. newNodeStatus: v1.NodeStatus{
  468. Conditions: []v1.NodeCondition{
  469. {
  470. Type: v1.NodeReady,
  471. Status: v1.ConditionFalse,
  472. // Node status has just been updated, and is NotReady for 1hr.
  473. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 59, 0, 0, time.UTC),
  474. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  475. },
  476. },
  477. },
  478. secondNodeNewStatus: healthyNodeNewStatus,
  479. expectedEvictPods: true,
  480. description: "Node created long time ago, and kubelet posted NotReady for a long period of time.",
  481. },
  482. // Node created long time ago, node controller posted Unknown for a short period of time.
  483. {
  484. fakeNodeHandler: &testutil.FakeNodeHandler{
  485. Existing: []*v1.Node{
  486. {
  487. ObjectMeta: metav1.ObjectMeta{
  488. Name: "node0",
  489. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  490. Labels: map[string]string{
  491. v1.LabelZoneRegion: "region1",
  492. v1.LabelZoneFailureDomain: "zone1",
  493. },
  494. },
  495. Status: v1.NodeStatus{
  496. Conditions: []v1.NodeCondition{
  497. {
  498. Type: v1.NodeReady,
  499. Status: v1.ConditionUnknown,
  500. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  501. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  502. },
  503. },
  504. },
  505. },
  506. {
  507. ObjectMeta: metav1.ObjectMeta{
  508. Name: "node1",
  509. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  510. Labels: map[string]string{
  511. v1.LabelZoneRegion: "region1",
  512. v1.LabelZoneFailureDomain: "zone1",
  513. },
  514. },
  515. Status: v1.NodeStatus{
  516. Conditions: []v1.NodeCondition{
  517. {
  518. Type: v1.NodeReady,
  519. Status: v1.ConditionTrue,
  520. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  521. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  522. },
  523. },
  524. },
  525. },
  526. },
  527. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  528. },
  529. daemonSets: nil,
  530. timeToPass: evictionTimeout - testNodeMonitorGracePeriod,
  531. newNodeStatus: v1.NodeStatus{
  532. Conditions: []v1.NodeCondition{
  533. {
  534. Type: v1.NodeReady,
  535. Status: v1.ConditionUnknown,
  536. // Node status was updated by nodecontroller 10min ago
  537. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  538. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  539. },
  540. },
  541. },
  542. secondNodeNewStatus: healthyNodeNewStatus,
  543. expectedEvictPods: false,
  544. description: "Node created long time ago, node controller posted Unknown for a short period of time.",
  545. },
  546. // Node created long time ago, node controller posted Unknown for a long period of time.
  547. {
  548. fakeNodeHandler: &testutil.FakeNodeHandler{
  549. Existing: []*v1.Node{
  550. {
  551. ObjectMeta: metav1.ObjectMeta{
  552. Name: "node0",
  553. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  554. Labels: map[string]string{
  555. v1.LabelZoneRegion: "region1",
  556. v1.LabelZoneFailureDomain: "zone1",
  557. },
  558. },
  559. Status: v1.NodeStatus{
  560. Conditions: []v1.NodeCondition{
  561. {
  562. Type: v1.NodeReady,
  563. Status: v1.ConditionUnknown,
  564. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  565. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  566. },
  567. },
  568. },
  569. },
  570. {
  571. ObjectMeta: metav1.ObjectMeta{
  572. Name: "node1",
  573. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  574. Labels: map[string]string{
  575. v1.LabelZoneRegion: "region1",
  576. v1.LabelZoneFailureDomain: "zone1",
  577. },
  578. },
  579. Status: v1.NodeStatus{
  580. Conditions: []v1.NodeCondition{
  581. {
  582. Type: v1.NodeReady,
  583. Status: v1.ConditionTrue,
  584. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  585. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  586. },
  587. },
  588. },
  589. },
  590. },
  591. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  592. },
  593. daemonSets: nil,
  594. timeToPass: 60 * time.Minute,
  595. newNodeStatus: v1.NodeStatus{
  596. Conditions: []v1.NodeCondition{
  597. {
  598. Type: v1.NodeReady,
  599. Status: v1.ConditionUnknown,
  600. // Node status was updated by nodecontroller 1hr ago
  601. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  602. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  603. },
  604. },
  605. },
  606. secondNodeNewStatus: healthyNodeNewStatus,
  607. expectedEvictPods: true,
  608. description: "Node created long time ago, node controller posted Unknown for a long period of time.",
  609. },
  610. }
  611. for _, item := range table {
  612. nodeController, _ := newNodeLifecycleControllerFromClient(
  613. item.fakeNodeHandler,
  614. evictionTimeout,
  615. testRateLimiterQPS,
  616. testRateLimiterQPS,
  617. testLargeClusterThreshold,
  618. testUnhealthyThreshold,
  619. testNodeMonitorGracePeriod,
  620. testNodeStartupGracePeriod,
  621. testNodeMonitorPeriod,
  622. false)
  623. nodeController.now = func() metav1.Time { return fakeNow }
  624. nodeController.recorder = testutil.NewFakeRecorder()
  625. for _, ds := range item.daemonSets {
  626. nodeController.daemonSetInformer.Informer().GetStore().Add(&ds)
  627. }
  628. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  629. t.Errorf("unexpected error: %v", err)
  630. }
  631. if err := nodeController.monitorNodeHealth(); err != nil {
  632. t.Errorf("unexpected error: %v", err)
  633. }
  634. if item.timeToPass > 0 {
  635. nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} }
  636. item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus
  637. item.fakeNodeHandler.Existing[1].Status = item.secondNodeNewStatus
  638. }
  639. if len(item.fakeNodeHandler.Existing[0].Labels) == 0 && len(item.fakeNodeHandler.Existing[1].Labels) == 0 {
  640. item.fakeNodeHandler.Existing[0].Labels = labels
  641. item.fakeNodeHandler.Existing[1].Labels = labels
  642. }
  643. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  644. t.Errorf("unexpected error: %v", err)
  645. }
  646. if err := nodeController.monitorNodeHealth(); err != nil {
  647. t.Errorf("unexpected error: %v", err)
  648. }
  649. zones := testutil.GetZones(item.fakeNodeHandler)
  650. for _, zone := range zones {
  651. if _, ok := nodeController.zonePodEvictor[zone]; ok {
  652. nodeController.zonePodEvictor[zone].Try(func(value scheduler.TimedValue) (bool, time.Duration) {
  653. nodeUID, _ := value.UID.(string)
  654. nodeutil.DeletePods(item.fakeNodeHandler, nodeController.recorder, value.Value, nodeUID, nodeController.daemonSetInformer.Lister())
  655. return true, 0
  656. })
  657. } else {
  658. t.Fatalf("Zone %v was unitialized!", zone)
  659. }
  660. }
  661. podEvicted := false
  662. for _, action := range item.fakeNodeHandler.Actions() {
  663. if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" {
  664. podEvicted = true
  665. }
  666. }
  667. if item.expectedEvictPods != podEvicted {
  668. t.Errorf("expected pod eviction: %+v, got %+v for %+v", item.expectedEvictPods,
  669. podEvicted, item.description)
  670. }
  671. }
  672. }
  673. func TestPodStatusChange(t *testing.T) {
  674. fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
  675. evictionTimeout := 10 * time.Minute
  676. // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady
  677. // we need second healthy node in tests. Because of how the tests are written we need to update
  678. // the status of this Node.
  679. healthyNodeNewStatus := v1.NodeStatus{
  680. Conditions: []v1.NodeCondition{
  681. {
  682. Type: v1.NodeReady,
  683. Status: v1.ConditionTrue,
  684. // Node status has just been updated, and is NotReady for 10min.
  685. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 9, 0, 0, time.UTC),
  686. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  687. },
  688. },
  689. }
  690. // Node created long time ago, node controller posted Unknown for a long period of time.
  691. table := []struct {
  692. fakeNodeHandler *testutil.FakeNodeHandler
  693. timeToPass time.Duration
  694. newNodeStatus v1.NodeStatus
  695. secondNodeNewStatus v1.NodeStatus
  696. expectedPodUpdate bool
  697. expectedReason string
  698. description string
  699. }{
  700. {
  701. fakeNodeHandler: &testutil.FakeNodeHandler{
  702. Existing: []*v1.Node{
  703. {
  704. ObjectMeta: metav1.ObjectMeta{
  705. Name: "node0",
  706. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  707. Labels: map[string]string{
  708. v1.LabelZoneRegion: "region1",
  709. v1.LabelZoneFailureDomain: "zone1",
  710. },
  711. },
  712. Status: v1.NodeStatus{
  713. Conditions: []v1.NodeCondition{
  714. {
  715. Type: v1.NodeReady,
  716. Status: v1.ConditionUnknown,
  717. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  718. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  719. },
  720. },
  721. },
  722. },
  723. {
  724. ObjectMeta: metav1.ObjectMeta{
  725. Name: "node1",
  726. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  727. Labels: map[string]string{
  728. v1.LabelZoneRegion: "region1",
  729. v1.LabelZoneFailureDomain: "zone1",
  730. },
  731. },
  732. Status: v1.NodeStatus{
  733. Conditions: []v1.NodeCondition{
  734. {
  735. Type: v1.NodeReady,
  736. Status: v1.ConditionTrue,
  737. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  738. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  739. },
  740. },
  741. },
  742. },
  743. },
  744. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  745. },
  746. timeToPass: 60 * time.Minute,
  747. newNodeStatus: v1.NodeStatus{
  748. Conditions: []v1.NodeCondition{
  749. {
  750. Type: v1.NodeReady,
  751. Status: v1.ConditionUnknown,
  752. // Node status was updated by nodecontroller 1hr ago
  753. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  754. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  755. },
  756. },
  757. },
  758. secondNodeNewStatus: healthyNodeNewStatus,
  759. expectedPodUpdate: true,
  760. expectedReason: node.NodeUnreachablePodReason,
  761. description: "Node created long time ago, node controller posted Unknown for a " +
  762. "long period of time, the pod status must include reason for termination.",
  763. },
  764. }
  765. for _, item := range table {
  766. nodeController, _ := newNodeLifecycleControllerFromClient(
  767. item.fakeNodeHandler,
  768. evictionTimeout,
  769. testRateLimiterQPS,
  770. testRateLimiterQPS,
  771. testLargeClusterThreshold,
  772. testUnhealthyThreshold,
  773. testNodeMonitorGracePeriod,
  774. testNodeStartupGracePeriod,
  775. testNodeMonitorPeriod,
  776. false)
  777. nodeController.now = func() metav1.Time { return fakeNow }
  778. nodeController.recorder = testutil.NewFakeRecorder()
  779. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  780. t.Errorf("unexpected error: %v", err)
  781. }
  782. if err := nodeController.monitorNodeHealth(); err != nil {
  783. t.Errorf("unexpected error: %v", err)
  784. }
  785. if item.timeToPass > 0 {
  786. nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} }
  787. item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus
  788. item.fakeNodeHandler.Existing[1].Status = item.secondNodeNewStatus
  789. }
  790. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  791. t.Errorf("unexpected error: %v", err)
  792. }
  793. if err := nodeController.monitorNodeHealth(); err != nil {
  794. t.Errorf("unexpected error: %v", err)
  795. }
  796. zones := testutil.GetZones(item.fakeNodeHandler)
  797. for _, zone := range zones {
  798. nodeController.zonePodEvictor[zone].Try(func(value scheduler.TimedValue) (bool, time.Duration) {
  799. nodeUID, _ := value.UID.(string)
  800. nodeutil.DeletePods(item.fakeNodeHandler, nodeController.recorder, value.Value, nodeUID, nodeController.daemonSetStore)
  801. return true, 0
  802. })
  803. }
  804. podReasonUpdate := false
  805. for _, action := range item.fakeNodeHandler.Actions() {
  806. if action.GetVerb() == "update" && action.GetResource().Resource == "pods" {
  807. updateReason := action.(testcore.UpdateActionImpl).GetObject().(*v1.Pod).Status.Reason
  808. podReasonUpdate = true
  809. if updateReason != item.expectedReason {
  810. t.Errorf("expected pod status reason: %+v, got %+v for %+v", item.expectedReason, updateReason, item.description)
  811. }
  812. }
  813. }
  814. if podReasonUpdate != item.expectedPodUpdate {
  815. t.Errorf("expected pod update: %+v, got %+v for %+v", podReasonUpdate, item.expectedPodUpdate, item.description)
  816. }
  817. }
  818. }
  819. func TestMonitorNodeHealthEvictPodsWithDisruption(t *testing.T) {
  820. fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
  821. evictionTimeout := 10 * time.Minute
  822. timeToPass := 60 * time.Minute
  823. // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady
  824. // we need second healthy node in tests. Because of how the tests are written we need to update
  825. // the status of this Node.
  826. healthyNodeNewStatus := v1.NodeStatus{
  827. Conditions: []v1.NodeCondition{
  828. {
  829. Type: v1.NodeReady,
  830. Status: v1.ConditionTrue,
  831. LastHeartbeatTime: metav1.Date(2015, 1, 1, 13, 0, 0, 0, time.UTC),
  832. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  833. },
  834. },
  835. }
  836. unhealthyNodeNewStatus := v1.NodeStatus{
  837. Conditions: []v1.NodeCondition{
  838. {
  839. Type: v1.NodeReady,
  840. Status: v1.ConditionUnknown,
  841. // Node status was updated by nodecontroller 1hr ago
  842. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  843. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  844. },
  845. },
  846. }
  847. table := []struct {
  848. nodeList []*v1.Node
  849. podList []v1.Pod
  850. updatedNodeStatuses []v1.NodeStatus
  851. expectedInitialStates map[string]ZoneState
  852. expectedFollowingStates map[string]ZoneState
  853. expectedEvictPods bool
  854. description string
  855. }{
  856. // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes.
  857. // Only zone is down - eviction shouldn't take place
  858. {
  859. nodeList: []*v1.Node{
  860. {
  861. ObjectMeta: metav1.ObjectMeta{
  862. Name: "node0",
  863. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  864. Labels: map[string]string{
  865. v1.LabelZoneRegion: "region1",
  866. v1.LabelZoneFailureDomain: "zone1",
  867. },
  868. },
  869. Status: v1.NodeStatus{
  870. Conditions: []v1.NodeCondition{
  871. {
  872. Type: v1.NodeReady,
  873. Status: v1.ConditionUnknown,
  874. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  875. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  876. },
  877. },
  878. },
  879. },
  880. {
  881. ObjectMeta: metav1.ObjectMeta{
  882. Name: "node1",
  883. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  884. Labels: map[string]string{
  885. v1.LabelZoneRegion: "region1",
  886. v1.LabelZoneFailureDomain: "zone1",
  887. },
  888. },
  889. Status: v1.NodeStatus{
  890. Conditions: []v1.NodeCondition{
  891. {
  892. Type: v1.NodeReady,
  893. Status: v1.ConditionUnknown,
  894. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  895. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  896. },
  897. },
  898. },
  899. },
  900. },
  901. podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
  902. updatedNodeStatuses: []v1.NodeStatus{
  903. unhealthyNodeNewStatus,
  904. unhealthyNodeNewStatus,
  905. },
  906. expectedInitialStates: map[string]ZoneState{testutil.CreateZoneID("region1", "zone1"): stateFullDisruption},
  907. expectedFollowingStates: map[string]ZoneState{testutil.CreateZoneID("region1", "zone1"): stateFullDisruption},
  908. expectedEvictPods: false,
  909. description: "Network Disruption: Only zone is down - eviction shouldn't take place.",
  910. },
  911. // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes.
  912. // Both zones down - eviction shouldn't take place
  913. {
  914. nodeList: []*v1.Node{
  915. {
  916. ObjectMeta: metav1.ObjectMeta{
  917. Name: "node0",
  918. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  919. Labels: map[string]string{
  920. v1.LabelZoneRegion: "region1",
  921. v1.LabelZoneFailureDomain: "zone1",
  922. },
  923. },
  924. Status: v1.NodeStatus{
  925. Conditions: []v1.NodeCondition{
  926. {
  927. Type: v1.NodeReady,
  928. Status: v1.ConditionUnknown,
  929. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  930. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  931. },
  932. },
  933. },
  934. },
  935. {
  936. ObjectMeta: metav1.ObjectMeta{
  937. Name: "node1",
  938. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  939. Labels: map[string]string{
  940. v1.LabelZoneRegion: "region2",
  941. v1.LabelZoneFailureDomain: "zone2",
  942. },
  943. },
  944. Status: v1.NodeStatus{
  945. Conditions: []v1.NodeCondition{
  946. {
  947. Type: v1.NodeReady,
  948. Status: v1.ConditionUnknown,
  949. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  950. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  951. },
  952. },
  953. },
  954. },
  955. },
  956. podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
  957. updatedNodeStatuses: []v1.NodeStatus{
  958. unhealthyNodeNewStatus,
  959. unhealthyNodeNewStatus,
  960. },
  961. expectedInitialStates: map[string]ZoneState{
  962. testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
  963. testutil.CreateZoneID("region2", "zone2"): stateFullDisruption,
  964. },
  965. expectedFollowingStates: map[string]ZoneState{
  966. testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
  967. testutil.CreateZoneID("region2", "zone2"): stateFullDisruption,
  968. },
  969. expectedEvictPods: false,
  970. description: "Network Disruption: Both zones down - eviction shouldn't take place.",
  971. },
  972. // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes.
  973. // One zone is down - eviction should take place
  974. {
  975. nodeList: []*v1.Node{
  976. {
  977. ObjectMeta: metav1.ObjectMeta{
  978. Name: "node0",
  979. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  980. Labels: map[string]string{
  981. v1.LabelZoneRegion: "region1",
  982. v1.LabelZoneFailureDomain: "zone1",
  983. },
  984. },
  985. Status: v1.NodeStatus{
  986. Conditions: []v1.NodeCondition{
  987. {
  988. Type: v1.NodeReady,
  989. Status: v1.ConditionUnknown,
  990. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  991. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  992. },
  993. },
  994. },
  995. },
  996. {
  997. ObjectMeta: metav1.ObjectMeta{
  998. Name: "node1",
  999. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1000. Labels: map[string]string{
  1001. v1.LabelZoneRegion: "region1",
  1002. v1.LabelZoneFailureDomain: "zone2",
  1003. },
  1004. },
  1005. Status: v1.NodeStatus{
  1006. Conditions: []v1.NodeCondition{
  1007. {
  1008. Type: v1.NodeReady,
  1009. Status: v1.ConditionTrue,
  1010. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1011. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1012. },
  1013. },
  1014. },
  1015. },
  1016. },
  1017. podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
  1018. updatedNodeStatuses: []v1.NodeStatus{
  1019. unhealthyNodeNewStatus,
  1020. healthyNodeNewStatus,
  1021. },
  1022. expectedInitialStates: map[string]ZoneState{
  1023. testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
  1024. testutil.CreateZoneID("region1", "zone2"): stateNormal,
  1025. },
  1026. expectedFollowingStates: map[string]ZoneState{
  1027. testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
  1028. testutil.CreateZoneID("region1", "zone2"): stateNormal,
  1029. },
  1030. expectedEvictPods: true,
  1031. description: "Network Disruption: One zone is down - eviction should take place.",
  1032. },
  1033. // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period
  1034. // of time on the first Node, eviction should stop even though -master Node is healthy.
  1035. {
  1036. nodeList: []*v1.Node{
  1037. {
  1038. ObjectMeta: metav1.ObjectMeta{
  1039. Name: "node0",
  1040. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1041. Labels: map[string]string{
  1042. v1.LabelZoneRegion: "region1",
  1043. v1.LabelZoneFailureDomain: "zone1",
  1044. },
  1045. },
  1046. Status: v1.NodeStatus{
  1047. Conditions: []v1.NodeCondition{
  1048. {
  1049. Type: v1.NodeReady,
  1050. Status: v1.ConditionUnknown,
  1051. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1052. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1053. },
  1054. },
  1055. },
  1056. },
  1057. {
  1058. ObjectMeta: metav1.ObjectMeta{
  1059. Name: "node-master",
  1060. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1061. Labels: map[string]string{
  1062. v1.LabelZoneRegion: "region1",
  1063. v1.LabelZoneFailureDomain: "zone1",
  1064. },
  1065. },
  1066. Status: v1.NodeStatus{
  1067. Conditions: []v1.NodeCondition{
  1068. {
  1069. Type: v1.NodeReady,
  1070. Status: v1.ConditionTrue,
  1071. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1072. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1073. },
  1074. },
  1075. },
  1076. },
  1077. },
  1078. podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
  1079. updatedNodeStatuses: []v1.NodeStatus{
  1080. unhealthyNodeNewStatus,
  1081. healthyNodeNewStatus,
  1082. },
  1083. expectedInitialStates: map[string]ZoneState{
  1084. testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
  1085. },
  1086. expectedFollowingStates: map[string]ZoneState{
  1087. testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
  1088. },
  1089. expectedEvictPods: false,
  1090. description: "NetworkDisruption: eviction should stop, only -master Node is healthy",
  1091. },
  1092. // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes.
  1093. // Initially both zones down, one comes back - eviction should take place
  1094. {
  1095. nodeList: []*v1.Node{
  1096. {
  1097. ObjectMeta: metav1.ObjectMeta{
  1098. Name: "node0",
  1099. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1100. Labels: map[string]string{
  1101. v1.LabelZoneRegion: "region1",
  1102. v1.LabelZoneFailureDomain: "zone1",
  1103. },
  1104. },
  1105. Status: v1.NodeStatus{
  1106. Conditions: []v1.NodeCondition{
  1107. {
  1108. Type: v1.NodeReady,
  1109. Status: v1.ConditionUnknown,
  1110. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1111. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1112. },
  1113. },
  1114. },
  1115. },
  1116. {
  1117. ObjectMeta: metav1.ObjectMeta{
  1118. Name: "node1",
  1119. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1120. Labels: map[string]string{
  1121. v1.LabelZoneRegion: "region1",
  1122. v1.LabelZoneFailureDomain: "zone2",
  1123. },
  1124. },
  1125. Status: v1.NodeStatus{
  1126. Conditions: []v1.NodeCondition{
  1127. {
  1128. Type: v1.NodeReady,
  1129. Status: v1.ConditionUnknown,
  1130. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1131. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1132. },
  1133. },
  1134. },
  1135. },
  1136. },
  1137. podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
  1138. updatedNodeStatuses: []v1.NodeStatus{
  1139. unhealthyNodeNewStatus,
  1140. healthyNodeNewStatus,
  1141. },
  1142. expectedInitialStates: map[string]ZoneState{
  1143. testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
  1144. testutil.CreateZoneID("region1", "zone2"): stateFullDisruption,
  1145. },
  1146. expectedFollowingStates: map[string]ZoneState{
  1147. testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
  1148. testutil.CreateZoneID("region1", "zone2"): stateNormal,
  1149. },
  1150. expectedEvictPods: true,
  1151. description: "Initially both zones down, one comes back - eviction should take place",
  1152. },
  1153. // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes.
  1154. // Zone is partially disrupted - eviction should take place
  1155. {
  1156. nodeList: []*v1.Node{
  1157. {
  1158. ObjectMeta: metav1.ObjectMeta{
  1159. Name: "node0",
  1160. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1161. Labels: map[string]string{
  1162. v1.LabelZoneRegion: "region1",
  1163. v1.LabelZoneFailureDomain: "zone1",
  1164. },
  1165. },
  1166. Status: v1.NodeStatus{
  1167. Conditions: []v1.NodeCondition{
  1168. {
  1169. Type: v1.NodeReady,
  1170. Status: v1.ConditionUnknown,
  1171. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1172. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1173. },
  1174. },
  1175. },
  1176. },
  1177. {
  1178. ObjectMeta: metav1.ObjectMeta{
  1179. Name: "node1",
  1180. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1181. Labels: map[string]string{
  1182. v1.LabelZoneRegion: "region1",
  1183. v1.LabelZoneFailureDomain: "zone1",
  1184. },
  1185. },
  1186. Status: v1.NodeStatus{
  1187. Conditions: []v1.NodeCondition{
  1188. {
  1189. Type: v1.NodeReady,
  1190. Status: v1.ConditionUnknown,
  1191. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1192. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1193. },
  1194. },
  1195. },
  1196. },
  1197. {
  1198. ObjectMeta: metav1.ObjectMeta{
  1199. Name: "node2",
  1200. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1201. Labels: map[string]string{
  1202. v1.LabelZoneRegion: "region1",
  1203. v1.LabelZoneFailureDomain: "zone1",
  1204. },
  1205. },
  1206. Status: v1.NodeStatus{
  1207. Conditions: []v1.NodeCondition{
  1208. {
  1209. Type: v1.NodeReady,
  1210. Status: v1.ConditionUnknown,
  1211. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1212. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1213. },
  1214. },
  1215. },
  1216. },
  1217. {
  1218. ObjectMeta: metav1.ObjectMeta{
  1219. Name: "node3",
  1220. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1221. Labels: map[string]string{
  1222. v1.LabelZoneRegion: "region1",
  1223. v1.LabelZoneFailureDomain: "zone1",
  1224. },
  1225. },
  1226. Status: v1.NodeStatus{
  1227. Conditions: []v1.NodeCondition{
  1228. {
  1229. Type: v1.NodeReady,
  1230. Status: v1.ConditionTrue,
  1231. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1232. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1233. },
  1234. },
  1235. },
  1236. },
  1237. {
  1238. ObjectMeta: metav1.ObjectMeta{
  1239. Name: "node4",
  1240. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1241. Labels: map[string]string{
  1242. v1.LabelZoneRegion: "region1",
  1243. v1.LabelZoneFailureDomain: "zone1",
  1244. },
  1245. },
  1246. Status: v1.NodeStatus{
  1247. Conditions: []v1.NodeCondition{
  1248. {
  1249. Type: v1.NodeReady,
  1250. Status: v1.ConditionTrue,
  1251. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1252. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1253. },
  1254. },
  1255. },
  1256. },
  1257. },
  1258. podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
  1259. updatedNodeStatuses: []v1.NodeStatus{
  1260. unhealthyNodeNewStatus,
  1261. unhealthyNodeNewStatus,
  1262. unhealthyNodeNewStatus,
  1263. healthyNodeNewStatus,
  1264. healthyNodeNewStatus,
  1265. },
  1266. expectedInitialStates: map[string]ZoneState{
  1267. testutil.CreateZoneID("region1", "zone1"): statePartialDisruption,
  1268. },
  1269. expectedFollowingStates: map[string]ZoneState{
  1270. testutil.CreateZoneID("region1", "zone1"): statePartialDisruption,
  1271. },
  1272. expectedEvictPods: true,
  1273. description: "Zone is partially disrupted - eviction should take place.",
  1274. },
  1275. }
  1276. for _, item := range table {
  1277. fakeNodeHandler := &testutil.FakeNodeHandler{
  1278. Existing: item.nodeList,
  1279. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: item.podList}),
  1280. }
  1281. nodeController, _ := newNodeLifecycleControllerFromClient(
  1282. fakeNodeHandler,
  1283. evictionTimeout,
  1284. testRateLimiterQPS,
  1285. testRateLimiterQPS,
  1286. testLargeClusterThreshold,
  1287. testUnhealthyThreshold,
  1288. testNodeMonitorGracePeriod,
  1289. testNodeStartupGracePeriod,
  1290. testNodeMonitorPeriod,
  1291. false)
  1292. nodeController.now = func() metav1.Time { return fakeNow }
  1293. nodeController.enterPartialDisruptionFunc = func(nodeNum int) float32 {
  1294. return testRateLimiterQPS
  1295. }
  1296. nodeController.recorder = testutil.NewFakeRecorder()
  1297. nodeController.enterFullDisruptionFunc = func(nodeNum int) float32 {
  1298. return testRateLimiterQPS
  1299. }
  1300. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  1301. t.Errorf("unexpected error: %v", err)
  1302. }
  1303. if err := nodeController.monitorNodeHealth(); err != nil {
  1304. t.Errorf("%v: unexpected error: %v", item.description, err)
  1305. }
  1306. for zone, state := range item.expectedInitialStates {
  1307. if state != nodeController.zoneStates[zone] {
  1308. t.Errorf("%v: Unexpected zone state: %v: %v instead %v", item.description, zone, nodeController.zoneStates[zone], state)
  1309. }
  1310. }
  1311. nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(timeToPass)} }
  1312. for i := range item.updatedNodeStatuses {
  1313. fakeNodeHandler.Existing[i].Status = item.updatedNodeStatuses[i]
  1314. }
  1315. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  1316. t.Errorf("unexpected error: %v", err)
  1317. }
  1318. if err := nodeController.monitorNodeHealth(); err != nil {
  1319. t.Errorf("%v: unexpected error: %v", item.description, err)
  1320. }
  1321. for zone, state := range item.expectedFollowingStates {
  1322. if state != nodeController.zoneStates[zone] {
  1323. t.Errorf("%v: Unexpected zone state: %v: %v instead %v", item.description, zone, nodeController.zoneStates[zone], state)
  1324. }
  1325. }
  1326. var podEvicted bool
  1327. start := time.Now()
  1328. // Infinite loop, used for retrying in case ratelimiter fails to reload for Try function.
  1329. // this breaks when we have the status that we need for test case or when we don't see the
  1330. // intended result after 1 minute.
  1331. for {
  1332. podEvicted = nodeController.doEviction(fakeNodeHandler)
  1333. if podEvicted == item.expectedEvictPods || time.Since(start) > 1*time.Minute {
  1334. break
  1335. }
  1336. }
  1337. if item.expectedEvictPods != podEvicted {
  1338. t.Errorf("%v: expected pod eviction: %+v, got %+v", item.description, item.expectedEvictPods, podEvicted)
  1339. }
  1340. }
  1341. }
  1342. func TestMonitorNodeHealthUpdateStatus(t *testing.T) {
  1343. fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
  1344. table := []struct {
  1345. fakeNodeHandler *testutil.FakeNodeHandler
  1346. timeToPass time.Duration
  1347. newNodeStatus v1.NodeStatus
  1348. expectedRequestCount int
  1349. expectedNodes []*v1.Node
  1350. expectedPodStatusUpdate bool
  1351. }{
  1352. // Node created long time ago, without status:
  1353. // Expect Unknown status posted from node controller.
  1354. {
  1355. fakeNodeHandler: &testutil.FakeNodeHandler{
  1356. Existing: []*v1.Node{
  1357. {
  1358. ObjectMeta: metav1.ObjectMeta{
  1359. Name: "node0",
  1360. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1361. },
  1362. },
  1363. },
  1364. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1365. },
  1366. expectedRequestCount: 2, // List+Update
  1367. expectedNodes: []*v1.Node{
  1368. {
  1369. ObjectMeta: metav1.ObjectMeta{
  1370. Name: "node0",
  1371. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1372. },
  1373. Status: v1.NodeStatus{
  1374. Conditions: []v1.NodeCondition{
  1375. {
  1376. Type: v1.NodeReady,
  1377. Status: v1.ConditionUnknown,
  1378. Reason: "NodeStatusNeverUpdated",
  1379. Message: "Kubelet never posted node status.",
  1380. LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1381. LastTransitionTime: fakeNow,
  1382. },
  1383. {
  1384. Type: v1.NodeMemoryPressure,
  1385. Status: v1.ConditionUnknown,
  1386. Reason: "NodeStatusNeverUpdated",
  1387. Message: "Kubelet never posted node status.",
  1388. LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1389. LastTransitionTime: fakeNow,
  1390. },
  1391. {
  1392. Type: v1.NodeDiskPressure,
  1393. Status: v1.ConditionUnknown,
  1394. Reason: "NodeStatusNeverUpdated",
  1395. Message: "Kubelet never posted node status.",
  1396. LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1397. LastTransitionTime: fakeNow,
  1398. },
  1399. {
  1400. Type: v1.NodePIDPressure,
  1401. Status: v1.ConditionUnknown,
  1402. Reason: "NodeStatusNeverUpdated",
  1403. Message: "Kubelet never posted node status.",
  1404. LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1405. LastTransitionTime: fakeNow,
  1406. },
  1407. },
  1408. },
  1409. },
  1410. },
  1411. expectedPodStatusUpdate: false, // Pod was never scheduled
  1412. },
  1413. // Node created recently, without status.
  1414. // Expect no action from node controller (within startup grace period).
  1415. {
  1416. fakeNodeHandler: &testutil.FakeNodeHandler{
  1417. Existing: []*v1.Node{
  1418. {
  1419. ObjectMeta: metav1.ObjectMeta{
  1420. Name: "node0",
  1421. CreationTimestamp: fakeNow,
  1422. },
  1423. },
  1424. },
  1425. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1426. },
  1427. expectedRequestCount: 1, // List
  1428. expectedNodes: nil,
  1429. expectedPodStatusUpdate: false,
  1430. },
  1431. // Node created long time ago, with status updated by kubelet exceeds grace period.
  1432. // Expect Unknown status posted from node controller.
  1433. {
  1434. fakeNodeHandler: &testutil.FakeNodeHandler{
  1435. Existing: []*v1.Node{
  1436. {
  1437. ObjectMeta: metav1.ObjectMeta{
  1438. Name: "node0",
  1439. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1440. },
  1441. Status: v1.NodeStatus{
  1442. Conditions: []v1.NodeCondition{
  1443. {
  1444. Type: v1.NodeReady,
  1445. Status: v1.ConditionTrue,
  1446. // Node status hasn't been updated for 1hr.
  1447. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1448. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1449. },
  1450. },
  1451. Capacity: v1.ResourceList{
  1452. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  1453. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  1454. },
  1455. },
  1456. },
  1457. },
  1458. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1459. },
  1460. expectedRequestCount: 3, // (List+)List+Update
  1461. timeToPass: time.Hour,
  1462. newNodeStatus: v1.NodeStatus{
  1463. Conditions: []v1.NodeCondition{
  1464. {
  1465. Type: v1.NodeReady,
  1466. Status: v1.ConditionTrue,
  1467. // Node status hasn't been updated for 1hr.
  1468. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1469. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1470. },
  1471. },
  1472. Capacity: v1.ResourceList{
  1473. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  1474. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  1475. },
  1476. },
  1477. expectedNodes: []*v1.Node{
  1478. {
  1479. ObjectMeta: metav1.ObjectMeta{
  1480. Name: "node0",
  1481. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1482. },
  1483. Status: v1.NodeStatus{
  1484. Conditions: []v1.NodeCondition{
  1485. {
  1486. Type: v1.NodeReady,
  1487. Status: v1.ConditionUnknown,
  1488. Reason: "NodeStatusUnknown",
  1489. Message: "Kubelet stopped posting node status.",
  1490. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1491. LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)},
  1492. },
  1493. {
  1494. Type: v1.NodeMemoryPressure,
  1495. Status: v1.ConditionUnknown,
  1496. Reason: "NodeStatusNeverUpdated",
  1497. Message: "Kubelet never posted node status.",
  1498. LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), // should default to node creation time if condition was never updated
  1499. LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)},
  1500. },
  1501. {
  1502. Type: v1.NodeDiskPressure,
  1503. Status: v1.ConditionUnknown,
  1504. Reason: "NodeStatusNeverUpdated",
  1505. Message: "Kubelet never posted node status.",
  1506. LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), // should default to node creation time if condition was never updated
  1507. LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)},
  1508. },
  1509. {
  1510. Type: v1.NodePIDPressure,
  1511. Status: v1.ConditionUnknown,
  1512. Reason: "NodeStatusNeverUpdated",
  1513. Message: "Kubelet never posted node status.",
  1514. LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), // should default to node creation time if condition was never updated
  1515. LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)},
  1516. },
  1517. },
  1518. Capacity: v1.ResourceList{
  1519. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  1520. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  1521. },
  1522. },
  1523. },
  1524. },
  1525. expectedPodStatusUpdate: true,
  1526. },
  1527. // Node created long time ago, with status updated recently.
  1528. // Expect no action from node controller (within monitor grace period).
  1529. {
  1530. fakeNodeHandler: &testutil.FakeNodeHandler{
  1531. Existing: []*v1.Node{
  1532. {
  1533. ObjectMeta: metav1.ObjectMeta{
  1534. Name: "node0",
  1535. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1536. },
  1537. Status: v1.NodeStatus{
  1538. Conditions: []v1.NodeCondition{
  1539. {
  1540. Type: v1.NodeReady,
  1541. Status: v1.ConditionTrue,
  1542. // Node status has just been updated.
  1543. LastHeartbeatTime: fakeNow,
  1544. LastTransitionTime: fakeNow,
  1545. },
  1546. },
  1547. Capacity: v1.ResourceList{
  1548. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  1549. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  1550. },
  1551. },
  1552. },
  1553. },
  1554. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1555. },
  1556. expectedRequestCount: 1, // List
  1557. expectedNodes: nil,
  1558. expectedPodStatusUpdate: false,
  1559. },
  1560. }
  1561. for i, item := range table {
  1562. nodeController, _ := newNodeLifecycleControllerFromClient(
  1563. item.fakeNodeHandler,
  1564. 5*time.Minute,
  1565. testRateLimiterQPS,
  1566. testRateLimiterQPS,
  1567. testLargeClusterThreshold,
  1568. testUnhealthyThreshold,
  1569. testNodeMonitorGracePeriod,
  1570. testNodeStartupGracePeriod,
  1571. testNodeMonitorPeriod,
  1572. false)
  1573. nodeController.now = func() metav1.Time { return fakeNow }
  1574. nodeController.recorder = testutil.NewFakeRecorder()
  1575. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  1576. t.Errorf("unexpected error: %v", err)
  1577. }
  1578. if err := nodeController.monitorNodeHealth(); err != nil {
  1579. t.Errorf("unexpected error: %v", err)
  1580. }
  1581. if item.timeToPass > 0 {
  1582. nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} }
  1583. item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus
  1584. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  1585. t.Errorf("unexpected error: %v", err)
  1586. }
  1587. if err := nodeController.monitorNodeHealth(); err != nil {
  1588. t.Errorf("unexpected error: %v", err)
  1589. }
  1590. }
  1591. if item.expectedRequestCount != item.fakeNodeHandler.RequestCount {
  1592. t.Errorf("expected %v call, but got %v.", item.expectedRequestCount, item.fakeNodeHandler.RequestCount)
  1593. }
  1594. if len(item.fakeNodeHandler.UpdatedNodes) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, item.fakeNodeHandler.UpdatedNodes) {
  1595. t.Errorf("Case[%d] unexpected nodes: %s", i, diff.ObjectDiff(item.expectedNodes[0], item.fakeNodeHandler.UpdatedNodes[0]))
  1596. }
  1597. if len(item.fakeNodeHandler.UpdatedNodeStatuses) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, item.fakeNodeHandler.UpdatedNodeStatuses) {
  1598. t.Errorf("Case[%d] unexpected nodes: %s", i, diff.ObjectDiff(item.expectedNodes[0], item.fakeNodeHandler.UpdatedNodeStatuses[0]))
  1599. }
  1600. podStatusUpdated := false
  1601. for _, action := range item.fakeNodeHandler.Actions() {
  1602. if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" {
  1603. podStatusUpdated = true
  1604. }
  1605. }
  1606. if podStatusUpdated != item.expectedPodStatusUpdate {
  1607. t.Errorf("Case[%d] expect pod status updated to be %v, but got %v", i, item.expectedPodStatusUpdate, podStatusUpdated)
  1608. }
  1609. }
  1610. }
  1611. func TestMonitorNodeHealthUpdateNodeAndPodStatusWithLease(t *testing.T) {
  1612. defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NodeLease, true)()
  1613. nodeCreationTime := metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC)
  1614. fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
  1615. testcases := []struct {
  1616. description string
  1617. fakeNodeHandler *testutil.FakeNodeHandler
  1618. lease *coordv1beta1.Lease
  1619. timeToPass time.Duration
  1620. newNodeStatus v1.NodeStatus
  1621. newLease *coordv1beta1.Lease
  1622. expectedRequestCount int
  1623. expectedNodes []*v1.Node
  1624. expectedPodStatusUpdate bool
  1625. }{
  1626. // Node created recently, without status. Node lease is missing.
  1627. // Expect no action from node controller (within startup grace period).
  1628. {
  1629. description: "Node created recently, without status. Node lease is missing.",
  1630. fakeNodeHandler: &testutil.FakeNodeHandler{
  1631. Existing: []*v1.Node{
  1632. {
  1633. ObjectMeta: metav1.ObjectMeta{
  1634. Name: "node0",
  1635. CreationTimestamp: fakeNow,
  1636. },
  1637. },
  1638. },
  1639. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1640. },
  1641. expectedRequestCount: 1, // List
  1642. expectedNodes: nil,
  1643. expectedPodStatusUpdate: false,
  1644. },
  1645. // Node created recently, without status. Node lease is renewed recently.
  1646. // Expect no action from node controller (within startup grace period).
  1647. {
  1648. description: "Node created recently, without status. Node lease is renewed recently.",
  1649. fakeNodeHandler: &testutil.FakeNodeHandler{
  1650. Existing: []*v1.Node{
  1651. {
  1652. ObjectMeta: metav1.ObjectMeta{
  1653. Name: "node0",
  1654. CreationTimestamp: fakeNow,
  1655. },
  1656. },
  1657. },
  1658. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1659. },
  1660. lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)),
  1661. expectedRequestCount: 1, // List
  1662. expectedNodes: nil,
  1663. expectedPodStatusUpdate: false,
  1664. },
  1665. // Node created long time ago, without status. Node lease is missing.
  1666. // Expect Unknown status posted from node controller.
  1667. {
  1668. description: "Node created long time ago, without status. Node lease is missing.",
  1669. fakeNodeHandler: &testutil.FakeNodeHandler{
  1670. Existing: []*v1.Node{
  1671. {
  1672. ObjectMeta: metav1.ObjectMeta{
  1673. Name: "node0",
  1674. CreationTimestamp: nodeCreationTime,
  1675. },
  1676. },
  1677. },
  1678. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1679. },
  1680. expectedRequestCount: 2, // List+Update
  1681. expectedNodes: []*v1.Node{
  1682. {
  1683. ObjectMeta: metav1.ObjectMeta{
  1684. Name: "node0",
  1685. CreationTimestamp: nodeCreationTime,
  1686. },
  1687. Status: v1.NodeStatus{
  1688. Conditions: []v1.NodeCondition{
  1689. {
  1690. Type: v1.NodeReady,
  1691. Status: v1.ConditionUnknown,
  1692. Reason: "NodeStatusNeverUpdated",
  1693. Message: "Kubelet never posted node status.",
  1694. LastHeartbeatTime: nodeCreationTime,
  1695. LastTransitionTime: fakeNow,
  1696. },
  1697. {
  1698. Type: v1.NodeMemoryPressure,
  1699. Status: v1.ConditionUnknown,
  1700. Reason: "NodeStatusNeverUpdated",
  1701. Message: "Kubelet never posted node status.",
  1702. LastHeartbeatTime: nodeCreationTime,
  1703. LastTransitionTime: fakeNow,
  1704. },
  1705. {
  1706. Type: v1.NodeDiskPressure,
  1707. Status: v1.ConditionUnknown,
  1708. Reason: "NodeStatusNeverUpdated",
  1709. Message: "Kubelet never posted node status.",
  1710. LastHeartbeatTime: nodeCreationTime,
  1711. LastTransitionTime: fakeNow,
  1712. },
  1713. {
  1714. Type: v1.NodePIDPressure,
  1715. Status: v1.ConditionUnknown,
  1716. Reason: "NodeStatusNeverUpdated",
  1717. Message: "Kubelet never posted node status.",
  1718. LastHeartbeatTime: nodeCreationTime,
  1719. LastTransitionTime: fakeNow,
  1720. },
  1721. },
  1722. },
  1723. },
  1724. },
  1725. expectedPodStatusUpdate: false, // Pod was never scheduled because the node was never ready.
  1726. },
  1727. // Node created long time ago, without status. Node lease is renewed recently.
  1728. // Expect no action from node controller (within monitor grace period).
  1729. {
  1730. description: "Node created long time ago, without status. Node lease is renewed recently.",
  1731. fakeNodeHandler: &testutil.FakeNodeHandler{
  1732. Existing: []*v1.Node{
  1733. {
  1734. ObjectMeta: metav1.ObjectMeta{
  1735. Name: "node0",
  1736. CreationTimestamp: nodeCreationTime,
  1737. },
  1738. },
  1739. },
  1740. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1741. },
  1742. lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)),
  1743. timeToPass: time.Hour,
  1744. newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time.Add(time.Hour))), // Lease is renewed after 1 hour.
  1745. expectedRequestCount: 2, // List+List
  1746. expectedNodes: []*v1.Node{
  1747. {
  1748. ObjectMeta: metav1.ObjectMeta{
  1749. Name: "node0",
  1750. CreationTimestamp: nodeCreationTime,
  1751. },
  1752. },
  1753. },
  1754. expectedPodStatusUpdate: false,
  1755. },
  1756. // Node created long time ago, without status. Node lease is expired.
  1757. // Expect Unknown status posted from node controller.
  1758. {
  1759. description: "Node created long time ago, without status. Node lease is expired.",
  1760. fakeNodeHandler: &testutil.FakeNodeHandler{
  1761. Existing: []*v1.Node{
  1762. {
  1763. ObjectMeta: metav1.ObjectMeta{
  1764. Name: "node0",
  1765. CreationTimestamp: nodeCreationTime,
  1766. },
  1767. },
  1768. },
  1769. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1770. },
  1771. lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)),
  1772. timeToPass: time.Hour,
  1773. newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), // Lease is not renewed after 1 hour.
  1774. expectedRequestCount: 3, // List+List+Update
  1775. expectedNodes: []*v1.Node{
  1776. {
  1777. ObjectMeta: metav1.ObjectMeta{
  1778. Name: "node0",
  1779. CreationTimestamp: nodeCreationTime,
  1780. },
  1781. Status: v1.NodeStatus{
  1782. Conditions: []v1.NodeCondition{
  1783. {
  1784. Type: v1.NodeReady,
  1785. Status: v1.ConditionUnknown,
  1786. Reason: "NodeStatusNeverUpdated",
  1787. Message: "Kubelet never posted node status.",
  1788. LastHeartbeatTime: nodeCreationTime,
  1789. LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  1790. },
  1791. {
  1792. Type: v1.NodeMemoryPressure,
  1793. Status: v1.ConditionUnknown,
  1794. Reason: "NodeStatusNeverUpdated",
  1795. Message: "Kubelet never posted node status.",
  1796. LastHeartbeatTime: nodeCreationTime,
  1797. LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  1798. },
  1799. {
  1800. Type: v1.NodeDiskPressure,
  1801. Status: v1.ConditionUnknown,
  1802. Reason: "NodeStatusNeverUpdated",
  1803. Message: "Kubelet never posted node status.",
  1804. LastHeartbeatTime: nodeCreationTime,
  1805. LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  1806. },
  1807. {
  1808. Type: v1.NodePIDPressure,
  1809. Status: v1.ConditionUnknown,
  1810. Reason: "NodeStatusNeverUpdated",
  1811. Message: "Kubelet never posted node status.",
  1812. LastHeartbeatTime: nodeCreationTime,
  1813. LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  1814. },
  1815. },
  1816. },
  1817. },
  1818. },
  1819. expectedPodStatusUpdate: false,
  1820. },
  1821. // Node created long time ago, with status updated by kubelet exceeds grace period. Node lease is renewed.
  1822. // Expect no action from node controller (within monitor grace period).
  1823. {
  1824. description: "Node created long time ago, with status updated by kubelet exceeds grace period. Node lease is renewed.",
  1825. fakeNodeHandler: &testutil.FakeNodeHandler{
  1826. Existing: []*v1.Node{
  1827. {
  1828. ObjectMeta: metav1.ObjectMeta{
  1829. Name: "node0",
  1830. CreationTimestamp: nodeCreationTime,
  1831. },
  1832. Status: v1.NodeStatus{
  1833. Conditions: []v1.NodeCondition{
  1834. {
  1835. Type: v1.NodeReady,
  1836. Status: v1.ConditionTrue,
  1837. LastHeartbeatTime: fakeNow,
  1838. LastTransitionTime: fakeNow,
  1839. },
  1840. {
  1841. Type: v1.NodeDiskPressure,
  1842. Status: v1.ConditionFalse,
  1843. LastHeartbeatTime: fakeNow,
  1844. LastTransitionTime: fakeNow,
  1845. },
  1846. },
  1847. Capacity: v1.ResourceList{
  1848. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  1849. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  1850. },
  1851. },
  1852. },
  1853. },
  1854. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1855. },
  1856. lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)),
  1857. expectedRequestCount: 2, // List+List
  1858. timeToPass: time.Hour,
  1859. newNodeStatus: v1.NodeStatus{
  1860. // Node status hasn't been updated for 1 hour.
  1861. Conditions: []v1.NodeCondition{
  1862. {
  1863. Type: v1.NodeReady,
  1864. Status: v1.ConditionTrue,
  1865. LastHeartbeatTime: fakeNow,
  1866. LastTransitionTime: fakeNow,
  1867. },
  1868. {
  1869. Type: v1.NodeDiskPressure,
  1870. Status: v1.ConditionFalse,
  1871. LastHeartbeatTime: fakeNow,
  1872. LastTransitionTime: fakeNow,
  1873. },
  1874. },
  1875. Capacity: v1.ResourceList{
  1876. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  1877. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  1878. },
  1879. },
  1880. newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time.Add(time.Hour))), // Lease is renewed after 1 hour.
  1881. expectedNodes: []*v1.Node{
  1882. {
  1883. ObjectMeta: metav1.ObjectMeta{
  1884. Name: "node0",
  1885. CreationTimestamp: nodeCreationTime,
  1886. },
  1887. Status: v1.NodeStatus{
  1888. Conditions: []v1.NodeCondition{
  1889. {
  1890. Type: v1.NodeReady,
  1891. Status: v1.ConditionTrue,
  1892. LastHeartbeatTime: fakeNow,
  1893. LastTransitionTime: fakeNow,
  1894. },
  1895. {
  1896. Type: v1.NodeDiskPressure,
  1897. Status: v1.ConditionFalse,
  1898. LastHeartbeatTime: fakeNow,
  1899. LastTransitionTime: fakeNow,
  1900. },
  1901. },
  1902. Capacity: v1.ResourceList{
  1903. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  1904. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  1905. },
  1906. },
  1907. },
  1908. },
  1909. expectedPodStatusUpdate: false,
  1910. },
  1911. // Node created long time ago, with status updated by kubelet recently. Node lease is expired.
  1912. // Expect no action from node controller (within monitor grace period).
  1913. {
  1914. description: "Node created long time ago, with status updated by kubelet recently. Node lease is expired.",
  1915. fakeNodeHandler: &testutil.FakeNodeHandler{
  1916. Existing: []*v1.Node{
  1917. {
  1918. ObjectMeta: metav1.ObjectMeta{
  1919. Name: "node0",
  1920. CreationTimestamp: nodeCreationTime,
  1921. },
  1922. Status: v1.NodeStatus{
  1923. Conditions: []v1.NodeCondition{
  1924. {
  1925. Type: v1.NodeReady,
  1926. Status: v1.ConditionTrue,
  1927. LastHeartbeatTime: fakeNow,
  1928. LastTransitionTime: fakeNow,
  1929. },
  1930. {
  1931. Type: v1.NodeDiskPressure,
  1932. Status: v1.ConditionFalse,
  1933. LastHeartbeatTime: fakeNow,
  1934. LastTransitionTime: fakeNow,
  1935. },
  1936. },
  1937. Capacity: v1.ResourceList{
  1938. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  1939. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  1940. },
  1941. },
  1942. },
  1943. },
  1944. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1945. },
  1946. lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)),
  1947. expectedRequestCount: 2, // List+List
  1948. timeToPass: time.Hour,
  1949. newNodeStatus: v1.NodeStatus{
  1950. // Node status is updated after 1 hour.
  1951. Conditions: []v1.NodeCondition{
  1952. {
  1953. Type: v1.NodeReady,
  1954. Status: v1.ConditionTrue,
  1955. LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  1956. LastTransitionTime: fakeNow,
  1957. },
  1958. {
  1959. Type: v1.NodeDiskPressure,
  1960. Status: v1.ConditionFalse,
  1961. LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  1962. LastTransitionTime: fakeNow,
  1963. },
  1964. },
  1965. Capacity: v1.ResourceList{
  1966. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  1967. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  1968. },
  1969. },
  1970. newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), // Lease is not renewed after 1 hour.
  1971. expectedNodes: []*v1.Node{
  1972. {
  1973. ObjectMeta: metav1.ObjectMeta{
  1974. Name: "node0",
  1975. CreationTimestamp: nodeCreationTime,
  1976. },
  1977. Status: v1.NodeStatus{
  1978. Conditions: []v1.NodeCondition{
  1979. {
  1980. Type: v1.NodeReady,
  1981. Status: v1.ConditionTrue,
  1982. LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  1983. LastTransitionTime: fakeNow,
  1984. },
  1985. {
  1986. Type: v1.NodeDiskPressure,
  1987. Status: v1.ConditionFalse,
  1988. LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  1989. LastTransitionTime: fakeNow,
  1990. },
  1991. },
  1992. Capacity: v1.ResourceList{
  1993. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  1994. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  1995. },
  1996. },
  1997. },
  1998. },
  1999. expectedPodStatusUpdate: false,
  2000. },
  2001. // Node created long time ago, with status updated by kubelet exceeds grace period. Node lease is also expired.
  2002. // Expect Unknown status posted from node controller.
  2003. {
  2004. description: "Node created long time ago, with status updated by kubelet exceeds grace period. Node lease is also expired.",
  2005. fakeNodeHandler: &testutil.FakeNodeHandler{
  2006. Existing: []*v1.Node{
  2007. {
  2008. ObjectMeta: metav1.ObjectMeta{
  2009. Name: "node0",
  2010. CreationTimestamp: nodeCreationTime,
  2011. },
  2012. Status: v1.NodeStatus{
  2013. Conditions: []v1.NodeCondition{
  2014. {
  2015. Type: v1.NodeReady,
  2016. Status: v1.ConditionTrue,
  2017. LastHeartbeatTime: fakeNow,
  2018. LastTransitionTime: fakeNow,
  2019. },
  2020. },
  2021. Capacity: v1.ResourceList{
  2022. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  2023. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  2024. },
  2025. },
  2026. },
  2027. },
  2028. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2029. },
  2030. lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)),
  2031. expectedRequestCount: 3, // List+List+Update
  2032. timeToPass: time.Hour,
  2033. newNodeStatus: v1.NodeStatus{
  2034. // Node status hasn't been updated for 1 hour.
  2035. Conditions: []v1.NodeCondition{
  2036. {
  2037. Type: v1.NodeReady,
  2038. Status: v1.ConditionTrue,
  2039. LastHeartbeatTime: fakeNow,
  2040. LastTransitionTime: fakeNow,
  2041. },
  2042. },
  2043. Capacity: v1.ResourceList{
  2044. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  2045. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  2046. },
  2047. },
  2048. newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), // Lease is not renewed after 1 hour.
  2049. expectedNodes: []*v1.Node{
  2050. {
  2051. ObjectMeta: metav1.ObjectMeta{
  2052. Name: "node0",
  2053. CreationTimestamp: nodeCreationTime,
  2054. },
  2055. Status: v1.NodeStatus{
  2056. Conditions: []v1.NodeCondition{
  2057. {
  2058. Type: v1.NodeReady,
  2059. Status: v1.ConditionUnknown,
  2060. Reason: "NodeStatusUnknown",
  2061. Message: "Kubelet stopped posting node status.",
  2062. LastHeartbeatTime: fakeNow,
  2063. LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  2064. },
  2065. {
  2066. Type: v1.NodeMemoryPressure,
  2067. Status: v1.ConditionUnknown,
  2068. Reason: "NodeStatusNeverUpdated",
  2069. Message: "Kubelet never posted node status.",
  2070. LastHeartbeatTime: nodeCreationTime, // should default to node creation time if condition was never updated
  2071. LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  2072. },
  2073. {
  2074. Type: v1.NodeDiskPressure,
  2075. Status: v1.ConditionUnknown,
  2076. Reason: "NodeStatusNeverUpdated",
  2077. Message: "Kubelet never posted node status.",
  2078. LastHeartbeatTime: nodeCreationTime, // should default to node creation time if condition was never updated
  2079. LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  2080. },
  2081. {
  2082. Type: v1.NodePIDPressure,
  2083. Status: v1.ConditionUnknown,
  2084. Reason: "NodeStatusNeverUpdated",
  2085. Message: "Kubelet never posted node status.",
  2086. LastHeartbeatTime: nodeCreationTime, // should default to node creation time if condition was never updated
  2087. LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  2088. },
  2089. },
  2090. Capacity: v1.ResourceList{
  2091. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  2092. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  2093. },
  2094. },
  2095. },
  2096. },
  2097. expectedPodStatusUpdate: true,
  2098. },
  2099. }
  2100. for _, item := range testcases {
  2101. t.Run(item.description, func(t *testing.T) {
  2102. nodeController, _ := newNodeLifecycleControllerFromClient(
  2103. item.fakeNodeHandler,
  2104. 5*time.Minute,
  2105. testRateLimiterQPS,
  2106. testRateLimiterQPS,
  2107. testLargeClusterThreshold,
  2108. testUnhealthyThreshold,
  2109. testNodeMonitorGracePeriod,
  2110. testNodeStartupGracePeriod,
  2111. testNodeMonitorPeriod,
  2112. false)
  2113. nodeController.now = func() metav1.Time { return fakeNow }
  2114. nodeController.recorder = testutil.NewFakeRecorder()
  2115. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  2116. t.Fatalf("unexpected error: %v", err)
  2117. }
  2118. if err := nodeController.syncLeaseStore(item.lease); err != nil {
  2119. t.Fatalf("unexpected error: %v", err)
  2120. }
  2121. if err := nodeController.monitorNodeHealth(); err != nil {
  2122. t.Fatalf("unexpected error: %v", err)
  2123. }
  2124. if item.timeToPass > 0 {
  2125. nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} }
  2126. item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus
  2127. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  2128. t.Fatalf("unexpected error: %v", err)
  2129. }
  2130. if err := nodeController.syncLeaseStore(item.newLease); err != nil {
  2131. t.Fatalf("unexpected error: %v", err)
  2132. }
  2133. if err := nodeController.monitorNodeHealth(); err != nil {
  2134. t.Fatalf("unexpected error: %v", err)
  2135. }
  2136. }
  2137. if item.expectedRequestCount != item.fakeNodeHandler.RequestCount {
  2138. t.Errorf("expected %v call, but got %v.", item.expectedRequestCount, item.fakeNodeHandler.RequestCount)
  2139. }
  2140. if len(item.fakeNodeHandler.UpdatedNodes) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, item.fakeNodeHandler.UpdatedNodes) {
  2141. t.Errorf("unexpected nodes: %s", diff.ObjectDiff(item.expectedNodes[0], item.fakeNodeHandler.UpdatedNodes[0]))
  2142. }
  2143. if len(item.fakeNodeHandler.UpdatedNodeStatuses) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, item.fakeNodeHandler.UpdatedNodeStatuses) {
  2144. t.Errorf("unexpected nodes: %s", diff.ObjectDiff(item.expectedNodes[0], item.fakeNodeHandler.UpdatedNodeStatuses[0]))
  2145. }
  2146. podStatusUpdated := false
  2147. for _, action := range item.fakeNodeHandler.Actions() {
  2148. if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" {
  2149. podStatusUpdated = true
  2150. }
  2151. }
  2152. if podStatusUpdated != item.expectedPodStatusUpdate {
  2153. t.Errorf("expect pod status updated to be %v, but got %v", item.expectedPodStatusUpdate, podStatusUpdated)
  2154. }
  2155. })
  2156. }
  2157. }
  2158. func TestMonitorNodeHealthMarkPodsNotReady(t *testing.T) {
  2159. fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
  2160. table := []struct {
  2161. fakeNodeHandler *testutil.FakeNodeHandler
  2162. timeToPass time.Duration
  2163. newNodeStatus v1.NodeStatus
  2164. expectedPodStatusUpdate bool
  2165. }{
  2166. // Node created recently, without status.
  2167. // Expect no action from node controller (within startup grace period).
  2168. {
  2169. fakeNodeHandler: &testutil.FakeNodeHandler{
  2170. Existing: []*v1.Node{
  2171. {
  2172. ObjectMeta: metav1.ObjectMeta{
  2173. Name: "node0",
  2174. CreationTimestamp: fakeNow,
  2175. },
  2176. },
  2177. },
  2178. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2179. },
  2180. expectedPodStatusUpdate: false,
  2181. },
  2182. // Node created long time ago, with status updated recently.
  2183. // Expect no action from node controller (within monitor grace period).
  2184. {
  2185. fakeNodeHandler: &testutil.FakeNodeHandler{
  2186. Existing: []*v1.Node{
  2187. {
  2188. ObjectMeta: metav1.ObjectMeta{
  2189. Name: "node0",
  2190. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2191. },
  2192. Status: v1.NodeStatus{
  2193. Conditions: []v1.NodeCondition{
  2194. {
  2195. Type: v1.NodeReady,
  2196. Status: v1.ConditionTrue,
  2197. // Node status has just been updated.
  2198. LastHeartbeatTime: fakeNow,
  2199. LastTransitionTime: fakeNow,
  2200. },
  2201. },
  2202. Capacity: v1.ResourceList{
  2203. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  2204. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  2205. },
  2206. },
  2207. },
  2208. },
  2209. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2210. },
  2211. expectedPodStatusUpdate: false,
  2212. },
  2213. // Node created long time ago, with status updated by kubelet exceeds grace period.
  2214. // Expect pods status updated and Unknown node status posted from node controller
  2215. {
  2216. fakeNodeHandler: &testutil.FakeNodeHandler{
  2217. Existing: []*v1.Node{
  2218. {
  2219. ObjectMeta: metav1.ObjectMeta{
  2220. Name: "node0",
  2221. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2222. },
  2223. Status: v1.NodeStatus{
  2224. Conditions: []v1.NodeCondition{
  2225. {
  2226. Type: v1.NodeReady,
  2227. Status: v1.ConditionTrue,
  2228. // Node status hasn't been updated for 1hr.
  2229. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2230. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2231. },
  2232. },
  2233. Capacity: v1.ResourceList{
  2234. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  2235. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  2236. },
  2237. },
  2238. },
  2239. },
  2240. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2241. },
  2242. timeToPass: 1 * time.Minute,
  2243. newNodeStatus: v1.NodeStatus{
  2244. Conditions: []v1.NodeCondition{
  2245. {
  2246. Type: v1.NodeReady,
  2247. Status: v1.ConditionTrue,
  2248. // Node status hasn't been updated for 1hr.
  2249. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2250. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2251. },
  2252. },
  2253. Capacity: v1.ResourceList{
  2254. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  2255. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  2256. },
  2257. },
  2258. expectedPodStatusUpdate: true,
  2259. },
  2260. }
  2261. for i, item := range table {
  2262. nodeController, _ := newNodeLifecycleControllerFromClient(
  2263. item.fakeNodeHandler,
  2264. 5*time.Minute,
  2265. testRateLimiterQPS,
  2266. testRateLimiterQPS,
  2267. testLargeClusterThreshold,
  2268. testUnhealthyThreshold,
  2269. testNodeMonitorGracePeriod,
  2270. testNodeStartupGracePeriod,
  2271. testNodeMonitorPeriod,
  2272. false)
  2273. nodeController.now = func() metav1.Time { return fakeNow }
  2274. nodeController.recorder = testutil.NewFakeRecorder()
  2275. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  2276. t.Errorf("unexpected error: %v", err)
  2277. }
  2278. if err := nodeController.monitorNodeHealth(); err != nil {
  2279. t.Errorf("Case[%d] unexpected error: %v", i, err)
  2280. }
  2281. if item.timeToPass > 0 {
  2282. nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} }
  2283. item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus
  2284. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  2285. t.Errorf("unexpected error: %v", err)
  2286. }
  2287. if err := nodeController.monitorNodeHealth(); err != nil {
  2288. t.Errorf("Case[%d] unexpected error: %v", i, err)
  2289. }
  2290. }
  2291. podStatusUpdated := false
  2292. for _, action := range item.fakeNodeHandler.Actions() {
  2293. if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" {
  2294. podStatusUpdated = true
  2295. }
  2296. }
  2297. if podStatusUpdated != item.expectedPodStatusUpdate {
  2298. t.Errorf("Case[%d] expect pod status updated to be %v, but got %v", i, item.expectedPodStatusUpdate, podStatusUpdated)
  2299. }
  2300. }
  2301. }
  2302. // TestApplyNoExecuteTaints, ensures we just have a NoExecute taint applied to node.
  2303. // NodeController is just responsible for enqueuing the node to tainting queue from which taint manager picks up
  2304. // and evicts the pods on the node.
  2305. func TestApplyNoExecuteTaints(t *testing.T) {
  2306. fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
  2307. evictionTimeout := 10 * time.Minute
  2308. fakeNodeHandler := &testutil.FakeNodeHandler{
  2309. Existing: []*v1.Node{
  2310. // Unreachable Taint with effect 'NoExecute' should be applied to this node.
  2311. {
  2312. ObjectMeta: metav1.ObjectMeta{
  2313. Name: "node0",
  2314. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2315. Labels: map[string]string{
  2316. v1.LabelZoneRegion: "region1",
  2317. v1.LabelZoneFailureDomain: "zone1",
  2318. },
  2319. },
  2320. Status: v1.NodeStatus{
  2321. Conditions: []v1.NodeCondition{
  2322. {
  2323. Type: v1.NodeReady,
  2324. Status: v1.ConditionUnknown,
  2325. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2326. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2327. },
  2328. },
  2329. },
  2330. },
  2331. // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady
  2332. // we need second healthy node in tests.
  2333. {
  2334. ObjectMeta: metav1.ObjectMeta{
  2335. Name: "node1",
  2336. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2337. Labels: map[string]string{
  2338. v1.LabelZoneRegion: "region1",
  2339. v1.LabelZoneFailureDomain: "zone1",
  2340. },
  2341. },
  2342. Status: v1.NodeStatus{
  2343. Conditions: []v1.NodeCondition{
  2344. {
  2345. Type: v1.NodeReady,
  2346. Status: v1.ConditionTrue,
  2347. LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
  2348. LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
  2349. },
  2350. },
  2351. },
  2352. },
  2353. // NotReady Taint with NoExecute effect should be applied to this node.
  2354. {
  2355. ObjectMeta: metav1.ObjectMeta{
  2356. Name: "node2",
  2357. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2358. Labels: map[string]string{
  2359. v1.LabelZoneRegion: "region1",
  2360. v1.LabelZoneFailureDomain: "zone1",
  2361. },
  2362. },
  2363. Status: v1.NodeStatus{
  2364. Conditions: []v1.NodeCondition{
  2365. {
  2366. Type: v1.NodeReady,
  2367. Status: v1.ConditionFalse,
  2368. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2369. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2370. },
  2371. },
  2372. },
  2373. },
  2374. },
  2375. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2376. }
  2377. healthyNodeNewStatus := v1.NodeStatus{
  2378. Conditions: []v1.NodeCondition{
  2379. {
  2380. Type: v1.NodeReady,
  2381. Status: v1.ConditionTrue,
  2382. LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 10, 0, 0, time.UTC),
  2383. LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
  2384. },
  2385. },
  2386. }
  2387. originalTaint := UnreachableTaintTemplate
  2388. nodeController, _ := newNodeLifecycleControllerFromClient(
  2389. fakeNodeHandler,
  2390. evictionTimeout,
  2391. testRateLimiterQPS,
  2392. testRateLimiterQPS,
  2393. testLargeClusterThreshold,
  2394. testUnhealthyThreshold,
  2395. testNodeMonitorGracePeriod,
  2396. testNodeStartupGracePeriod,
  2397. testNodeMonitorPeriod,
  2398. true)
  2399. nodeController.now = func() metav1.Time { return fakeNow }
  2400. nodeController.recorder = testutil.NewFakeRecorder()
  2401. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  2402. t.Errorf("unexpected error: %v", err)
  2403. }
  2404. if err := nodeController.monitorNodeHealth(); err != nil {
  2405. t.Errorf("unexpected error: %v", err)
  2406. }
  2407. nodeController.doNoExecuteTaintingPass()
  2408. node0, err := fakeNodeHandler.Get("node0", metav1.GetOptions{})
  2409. if err != nil {
  2410. t.Errorf("Can't get current node0...")
  2411. return
  2412. }
  2413. if !taintutils.TaintExists(node0.Spec.Taints, UnreachableTaintTemplate) {
  2414. t.Errorf("Can't find taint %v in %v", originalTaint, node0.Spec.Taints)
  2415. }
  2416. node2, err := fakeNodeHandler.Get("node2", metav1.GetOptions{})
  2417. if err != nil {
  2418. t.Errorf("Can't get current node2...")
  2419. return
  2420. }
  2421. if !taintutils.TaintExists(node2.Spec.Taints, NotReadyTaintTemplate) {
  2422. t.Errorf("Can't find taint %v in %v", NotReadyTaintTemplate, node2.Spec.Taints)
  2423. }
  2424. // Make node3 healthy again.
  2425. node2.Status = healthyNodeNewStatus
  2426. _, err = fakeNodeHandler.UpdateStatus(node2)
  2427. if err != nil {
  2428. t.Errorf(err.Error())
  2429. return
  2430. }
  2431. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  2432. t.Errorf("unexpected error: %v", err)
  2433. }
  2434. if err := nodeController.monitorNodeHealth(); err != nil {
  2435. t.Errorf("unexpected error: %v", err)
  2436. }
  2437. nodeController.doNoExecuteTaintingPass()
  2438. node2, err = fakeNodeHandler.Get("node2", metav1.GetOptions{})
  2439. if err != nil {
  2440. t.Errorf("Can't get current node2...")
  2441. return
  2442. }
  2443. // We should not see any taint on the node(especially the Not-Ready taint with NoExecute effect).
  2444. if taintutils.TaintExists(node2.Spec.Taints, NotReadyTaintTemplate) || len(node2.Spec.Taints) > 0 {
  2445. t.Errorf("Found taint %v in %v, which should not be present", NotReadyTaintTemplate, node2.Spec.Taints)
  2446. }
  2447. }
  2448. func TestSwapUnreachableNotReadyTaints(t *testing.T) {
  2449. fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
  2450. evictionTimeout := 10 * time.Minute
  2451. fakeNodeHandler := &testutil.FakeNodeHandler{
  2452. Existing: []*v1.Node{
  2453. {
  2454. ObjectMeta: metav1.ObjectMeta{
  2455. Name: "node0",
  2456. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2457. Labels: map[string]string{
  2458. v1.LabelZoneRegion: "region1",
  2459. v1.LabelZoneFailureDomain: "zone1",
  2460. },
  2461. },
  2462. Status: v1.NodeStatus{
  2463. Conditions: []v1.NodeCondition{
  2464. {
  2465. Type: v1.NodeReady,
  2466. Status: v1.ConditionUnknown,
  2467. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2468. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2469. },
  2470. },
  2471. },
  2472. },
  2473. // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady
  2474. // we need second healthy node in tests. Because of how the tests are written we need to update
  2475. // the status of this Node.
  2476. {
  2477. ObjectMeta: metav1.ObjectMeta{
  2478. Name: "node1",
  2479. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2480. Labels: map[string]string{
  2481. v1.LabelZoneRegion: "region1",
  2482. v1.LabelZoneFailureDomain: "zone1",
  2483. },
  2484. },
  2485. Status: v1.NodeStatus{
  2486. Conditions: []v1.NodeCondition{
  2487. {
  2488. Type: v1.NodeReady,
  2489. Status: v1.ConditionTrue,
  2490. LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
  2491. LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
  2492. },
  2493. },
  2494. },
  2495. },
  2496. },
  2497. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2498. }
  2499. timeToPass := evictionTimeout
  2500. newNodeStatus := v1.NodeStatus{
  2501. Conditions: []v1.NodeCondition{
  2502. {
  2503. Type: v1.NodeReady,
  2504. Status: v1.ConditionFalse,
  2505. // Node status has just been updated, and is NotReady for 10min.
  2506. LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 9, 0, 0, time.UTC),
  2507. LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
  2508. },
  2509. },
  2510. }
  2511. healthyNodeNewStatus := v1.NodeStatus{
  2512. Conditions: []v1.NodeCondition{
  2513. {
  2514. Type: v1.NodeReady,
  2515. Status: v1.ConditionTrue,
  2516. LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 10, 0, 0, time.UTC),
  2517. LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
  2518. },
  2519. },
  2520. }
  2521. originalTaint := UnreachableTaintTemplate
  2522. updatedTaint := NotReadyTaintTemplate
  2523. nodeController, _ := newNodeLifecycleControllerFromClient(
  2524. fakeNodeHandler,
  2525. evictionTimeout,
  2526. testRateLimiterQPS,
  2527. testRateLimiterQPS,
  2528. testLargeClusterThreshold,
  2529. testUnhealthyThreshold,
  2530. testNodeMonitorGracePeriod,
  2531. testNodeStartupGracePeriod,
  2532. testNodeMonitorPeriod,
  2533. true)
  2534. nodeController.now = func() metav1.Time { return fakeNow }
  2535. nodeController.recorder = testutil.NewFakeRecorder()
  2536. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  2537. t.Errorf("unexpected error: %v", err)
  2538. }
  2539. if err := nodeController.monitorNodeHealth(); err != nil {
  2540. t.Errorf("unexpected error: %v", err)
  2541. }
  2542. nodeController.doNoExecuteTaintingPass()
  2543. node0, err := fakeNodeHandler.Get("node0", metav1.GetOptions{})
  2544. if err != nil {
  2545. t.Errorf("Can't get current node0...")
  2546. return
  2547. }
  2548. node1, err := fakeNodeHandler.Get("node1", metav1.GetOptions{})
  2549. if err != nil {
  2550. t.Errorf("Can't get current node1...")
  2551. return
  2552. }
  2553. if originalTaint != nil && !taintutils.TaintExists(node0.Spec.Taints, originalTaint) {
  2554. t.Errorf("Can't find taint %v in %v", originalTaint, node0.Spec.Taints)
  2555. }
  2556. nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(timeToPass)} }
  2557. node0.Status = newNodeStatus
  2558. node1.Status = healthyNodeNewStatus
  2559. _, err = fakeNodeHandler.UpdateStatus(node0)
  2560. if err != nil {
  2561. t.Errorf(err.Error())
  2562. return
  2563. }
  2564. _, err = fakeNodeHandler.UpdateStatus(node1)
  2565. if err != nil {
  2566. t.Errorf(err.Error())
  2567. return
  2568. }
  2569. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  2570. t.Errorf("unexpected error: %v", err)
  2571. }
  2572. if err := nodeController.monitorNodeHealth(); err != nil {
  2573. t.Errorf("unexpected error: %v", err)
  2574. }
  2575. nodeController.doNoExecuteTaintingPass()
  2576. node0, err = fakeNodeHandler.Get("node0", metav1.GetOptions{})
  2577. if err != nil {
  2578. t.Errorf("Can't get current node0...")
  2579. return
  2580. }
  2581. if updatedTaint != nil {
  2582. if !taintutils.TaintExists(node0.Spec.Taints, updatedTaint) {
  2583. t.Errorf("Can't find taint %v in %v", updatedTaint, node0.Spec.Taints)
  2584. }
  2585. }
  2586. }
  2587. func TestTaintsNodeByCondition(t *testing.T) {
  2588. fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
  2589. evictionTimeout := 10 * time.Minute
  2590. fakeNodeHandler := &testutil.FakeNodeHandler{
  2591. Existing: []*v1.Node{
  2592. {
  2593. ObjectMeta: metav1.ObjectMeta{
  2594. Name: "node0",
  2595. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2596. Labels: map[string]string{
  2597. v1.LabelZoneRegion: "region1",
  2598. v1.LabelZoneFailureDomain: "zone1",
  2599. },
  2600. },
  2601. Status: v1.NodeStatus{
  2602. Conditions: []v1.NodeCondition{
  2603. {
  2604. Type: v1.NodeReady,
  2605. Status: v1.ConditionTrue,
  2606. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2607. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2608. },
  2609. },
  2610. },
  2611. },
  2612. },
  2613. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2614. }
  2615. nodeController, _ := newNodeLifecycleControllerFromClient(
  2616. fakeNodeHandler,
  2617. evictionTimeout,
  2618. testRateLimiterQPS,
  2619. testRateLimiterQPS,
  2620. testLargeClusterThreshold,
  2621. testUnhealthyThreshold,
  2622. testNodeMonitorGracePeriod,
  2623. testNodeStartupGracePeriod,
  2624. testNodeMonitorPeriod,
  2625. true)
  2626. nodeController.now = func() metav1.Time { return fakeNow }
  2627. nodeController.recorder = testutil.NewFakeRecorder()
  2628. networkUnavailableTaint := &v1.Taint{
  2629. Key: schedulerapi.TaintNodeNetworkUnavailable,
  2630. Effect: v1.TaintEffectNoSchedule,
  2631. }
  2632. notReadyTaint := &v1.Taint{
  2633. Key: schedulerapi.TaintNodeNotReady,
  2634. Effect: v1.TaintEffectNoSchedule,
  2635. }
  2636. unreachableTaint := &v1.Taint{
  2637. Key: schedulerapi.TaintNodeUnreachable,
  2638. Effect: v1.TaintEffectNoSchedule,
  2639. }
  2640. tests := []struct {
  2641. Name string
  2642. Node *v1.Node
  2643. ExpectedTaints []*v1.Taint
  2644. }{
  2645. {
  2646. Name: "NetworkUnavailable is true",
  2647. Node: &v1.Node{
  2648. ObjectMeta: metav1.ObjectMeta{
  2649. Name: "node0",
  2650. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2651. Labels: map[string]string{
  2652. v1.LabelZoneRegion: "region1",
  2653. v1.LabelZoneFailureDomain: "zone1",
  2654. },
  2655. },
  2656. Status: v1.NodeStatus{
  2657. Conditions: []v1.NodeCondition{
  2658. {
  2659. Type: v1.NodeReady,
  2660. Status: v1.ConditionTrue,
  2661. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2662. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2663. },
  2664. {
  2665. Type: v1.NodeNetworkUnavailable,
  2666. Status: v1.ConditionTrue,
  2667. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2668. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2669. },
  2670. },
  2671. },
  2672. },
  2673. ExpectedTaints: []*v1.Taint{networkUnavailableTaint},
  2674. },
  2675. {
  2676. Name: "NetworkUnavailable is true",
  2677. Node: &v1.Node{
  2678. ObjectMeta: metav1.ObjectMeta{
  2679. Name: "node0",
  2680. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2681. Labels: map[string]string{
  2682. v1.LabelZoneRegion: "region1",
  2683. v1.LabelZoneFailureDomain: "zone1",
  2684. },
  2685. },
  2686. Status: v1.NodeStatus{
  2687. Conditions: []v1.NodeCondition{
  2688. {
  2689. Type: v1.NodeReady,
  2690. Status: v1.ConditionTrue,
  2691. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2692. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2693. },
  2694. {
  2695. Type: v1.NodeNetworkUnavailable,
  2696. Status: v1.ConditionTrue,
  2697. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2698. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2699. },
  2700. },
  2701. },
  2702. },
  2703. ExpectedTaints: []*v1.Taint{networkUnavailableTaint},
  2704. },
  2705. {
  2706. Name: "Ready is false",
  2707. Node: &v1.Node{
  2708. ObjectMeta: metav1.ObjectMeta{
  2709. Name: "node0",
  2710. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2711. Labels: map[string]string{
  2712. v1.LabelZoneRegion: "region1",
  2713. v1.LabelZoneFailureDomain: "zone1",
  2714. },
  2715. },
  2716. Status: v1.NodeStatus{
  2717. Conditions: []v1.NodeCondition{
  2718. {
  2719. Type: v1.NodeReady,
  2720. Status: v1.ConditionFalse,
  2721. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2722. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2723. },
  2724. },
  2725. },
  2726. },
  2727. ExpectedTaints: []*v1.Taint{notReadyTaint},
  2728. },
  2729. {
  2730. Name: "Ready is unknown",
  2731. Node: &v1.Node{
  2732. ObjectMeta: metav1.ObjectMeta{
  2733. Name: "node0",
  2734. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2735. Labels: map[string]string{
  2736. v1.LabelZoneRegion: "region1",
  2737. v1.LabelZoneFailureDomain: "zone1",
  2738. },
  2739. },
  2740. Status: v1.NodeStatus{
  2741. Conditions: []v1.NodeCondition{
  2742. {
  2743. Type: v1.NodeReady,
  2744. Status: v1.ConditionUnknown,
  2745. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2746. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2747. },
  2748. },
  2749. },
  2750. },
  2751. ExpectedTaints: []*v1.Taint{unreachableTaint},
  2752. },
  2753. }
  2754. for _, test := range tests {
  2755. fakeNodeHandler.Update(test.Node)
  2756. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  2757. t.Errorf("unexpected error: %v", err)
  2758. }
  2759. nodeController.doNoScheduleTaintingPass(test.Node.Name)
  2760. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  2761. t.Errorf("unexpected error: %v", err)
  2762. }
  2763. node0, err := nodeController.nodeLister.Get("node0")
  2764. if err != nil {
  2765. t.Errorf("Can't get current node0...")
  2766. return
  2767. }
  2768. if len(node0.Spec.Taints) != len(test.ExpectedTaints) {
  2769. t.Errorf("%s: Unexpected number of taints: expected %d, got %d",
  2770. test.Name, len(test.ExpectedTaints), len(node0.Spec.Taints))
  2771. }
  2772. for _, taint := range test.ExpectedTaints {
  2773. if !taintutils.TaintExists(node0.Spec.Taints, taint) {
  2774. t.Errorf("%s: Can't find taint %v in %v", test.Name, taint, node0.Spec.Taints)
  2775. }
  2776. }
  2777. }
  2778. }
  2779. func TestNodeEventGeneration(t *testing.T) {
  2780. fakeNow := metav1.Date(2016, 9, 10, 12, 0, 0, 0, time.UTC)
  2781. fakeNodeHandler := &testutil.FakeNodeHandler{
  2782. Existing: []*v1.Node{
  2783. {
  2784. ObjectMeta: metav1.ObjectMeta{
  2785. Name: "node0",
  2786. UID: "1234567890",
  2787. CreationTimestamp: metav1.Date(2015, 8, 10, 0, 0, 0, 0, time.UTC),
  2788. },
  2789. Status: v1.NodeStatus{
  2790. Conditions: []v1.NodeCondition{
  2791. {
  2792. Type: v1.NodeReady,
  2793. Status: v1.ConditionUnknown,
  2794. LastHeartbeatTime: metav1.Date(2015, 8, 10, 0, 0, 0, 0, time.UTC),
  2795. LastTransitionTime: metav1.Date(2015, 8, 10, 0, 0, 0, 0, time.UTC),
  2796. },
  2797. },
  2798. },
  2799. },
  2800. },
  2801. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2802. }
  2803. nodeController, _ := newNodeLifecycleControllerFromClient(
  2804. fakeNodeHandler,
  2805. 5*time.Minute,
  2806. testRateLimiterQPS,
  2807. testRateLimiterQPS,
  2808. testLargeClusterThreshold,
  2809. testUnhealthyThreshold,
  2810. testNodeMonitorGracePeriod,
  2811. testNodeStartupGracePeriod,
  2812. testNodeMonitorPeriod,
  2813. false)
  2814. nodeController.now = func() metav1.Time { return fakeNow }
  2815. fakeRecorder := testutil.NewFakeRecorder()
  2816. nodeController.recorder = fakeRecorder
  2817. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  2818. t.Errorf("unexpected error: %v", err)
  2819. }
  2820. if err := nodeController.monitorNodeHealth(); err != nil {
  2821. t.Errorf("unexpected error: %v", err)
  2822. }
  2823. if len(fakeRecorder.Events) != 1 {
  2824. t.Fatalf("unexpected events, got %v, expected %v: %+v", len(fakeRecorder.Events), 1, fakeRecorder.Events)
  2825. }
  2826. if fakeRecorder.Events[0].Reason != "RegisteredNode" {
  2827. var reasons []string
  2828. for _, event := range fakeRecorder.Events {
  2829. reasons = append(reasons, event.Reason)
  2830. }
  2831. t.Fatalf("unexpected events generation: %v", strings.Join(reasons, ","))
  2832. }
  2833. for _, event := range fakeRecorder.Events {
  2834. involvedObject := event.InvolvedObject
  2835. actualUID := string(involvedObject.UID)
  2836. if actualUID != "1234567890" {
  2837. t.Fatalf("unexpected event uid: %v", actualUID)
  2838. }
  2839. }
  2840. }
  2841. func TestReconcileNodeLabels(t *testing.T) {
  2842. fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
  2843. evictionTimeout := 10 * time.Minute
  2844. fakeNodeHandler := &testutil.FakeNodeHandler{
  2845. Existing: []*v1.Node{
  2846. {
  2847. ObjectMeta: metav1.ObjectMeta{
  2848. Name: "node0",
  2849. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2850. Labels: map[string]string{
  2851. v1.LabelZoneRegion: "region1",
  2852. v1.LabelZoneFailureDomain: "zone1",
  2853. },
  2854. },
  2855. Status: v1.NodeStatus{
  2856. Conditions: []v1.NodeCondition{
  2857. {
  2858. Type: v1.NodeReady,
  2859. Status: v1.ConditionTrue,
  2860. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2861. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2862. },
  2863. },
  2864. },
  2865. },
  2866. },
  2867. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2868. }
  2869. nodeController, _ := newNodeLifecycleControllerFromClient(
  2870. fakeNodeHandler,
  2871. evictionTimeout,
  2872. testRateLimiterQPS,
  2873. testRateLimiterQPS,
  2874. testLargeClusterThreshold,
  2875. testUnhealthyThreshold,
  2876. testNodeMonitorGracePeriod,
  2877. testNodeStartupGracePeriod,
  2878. testNodeMonitorPeriod,
  2879. true)
  2880. nodeController.now = func() metav1.Time { return fakeNow }
  2881. nodeController.recorder = testutil.NewFakeRecorder()
  2882. tests := []struct {
  2883. Name string
  2884. Node *v1.Node
  2885. ExpectedLabels map[string]string
  2886. }{
  2887. {
  2888. Name: "No-op if node has no labels",
  2889. Node: &v1.Node{
  2890. ObjectMeta: metav1.ObjectMeta{
  2891. Name: "node0",
  2892. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2893. },
  2894. },
  2895. ExpectedLabels: nil,
  2896. },
  2897. {
  2898. Name: "No-op if no target labels present",
  2899. Node: &v1.Node{
  2900. ObjectMeta: metav1.ObjectMeta{
  2901. Name: "node0",
  2902. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2903. Labels: map[string]string{
  2904. v1.LabelZoneRegion: "region1",
  2905. },
  2906. },
  2907. },
  2908. ExpectedLabels: map[string]string{
  2909. v1.LabelZoneRegion: "region1",
  2910. },
  2911. },
  2912. {
  2913. Name: "Create OS/arch stable labels when they don't exist",
  2914. Node: &v1.Node{
  2915. ObjectMeta: metav1.ObjectMeta{
  2916. Name: "node0",
  2917. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2918. Labels: map[string]string{
  2919. kubeletapis.LabelOS: "linux",
  2920. kubeletapis.LabelArch: "amd64",
  2921. },
  2922. },
  2923. },
  2924. ExpectedLabels: map[string]string{
  2925. kubeletapis.LabelOS: "linux",
  2926. kubeletapis.LabelArch: "amd64",
  2927. v1.LabelOSStable: "linux",
  2928. v1.LabelArchStable: "amd64",
  2929. },
  2930. },
  2931. {
  2932. Name: "Reconcile OS/arch stable labels to match beta labels",
  2933. Node: &v1.Node{
  2934. ObjectMeta: metav1.ObjectMeta{
  2935. Name: "node0",
  2936. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2937. Labels: map[string]string{
  2938. kubeletapis.LabelOS: "linux",
  2939. kubeletapis.LabelArch: "amd64",
  2940. v1.LabelOSStable: "windows",
  2941. v1.LabelArchStable: "arm",
  2942. },
  2943. },
  2944. },
  2945. ExpectedLabels: map[string]string{
  2946. kubeletapis.LabelOS: "linux",
  2947. kubeletapis.LabelArch: "amd64",
  2948. v1.LabelOSStable: "linux",
  2949. v1.LabelArchStable: "amd64",
  2950. },
  2951. },
  2952. }
  2953. for _, test := range tests {
  2954. fakeNodeHandler.Update(test.Node)
  2955. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  2956. t.Fatalf("unexpected error: %v", err)
  2957. }
  2958. nodeController.reconcileNodeLabels(test.Node.Name)
  2959. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  2960. t.Fatalf("unexpected error: %v", err)
  2961. }
  2962. node0, err := nodeController.nodeLister.Get("node0")
  2963. if err != nil {
  2964. t.Fatalf("Can't get current node0...")
  2965. }
  2966. if len(node0.Labels) != len(test.ExpectedLabels) {
  2967. t.Errorf("%s: Unexpected number of taints: expected %d, got %d",
  2968. test.Name, len(test.ExpectedLabels), len(node0.Labels))
  2969. }
  2970. for key, expectedValue := range test.ExpectedLabels {
  2971. actualValue, ok := node0.Labels[key]
  2972. if !ok {
  2973. t.Errorf("%s: Can't find label %v in %v", test.Name, key, node0.Labels)
  2974. }
  2975. if actualValue != expectedValue {
  2976. t.Errorf("%s: label %q: expected value %q, got value %q", test.Name, key, expectedValue, actualValue)
  2977. }
  2978. }
  2979. }
  2980. }