node_lifecycle_controller_test.go 123 KB


  1. /*
  2. Copyright 2017 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package nodelifecycle
  14. import (
  15. "context"
  16. "fmt"
  17. "strings"
  18. "testing"
  19. "time"
  20. apps "k8s.io/api/apps/v1"
  21. coordv1 "k8s.io/api/coordination/v1"
  22. v1 "k8s.io/api/core/v1"
  23. apiequality "k8s.io/apimachinery/pkg/api/equality"
  24. "k8s.io/apimachinery/pkg/api/resource"
  25. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  26. "k8s.io/apimachinery/pkg/fields"
  27. "k8s.io/apimachinery/pkg/labels"
  28. "k8s.io/apimachinery/pkg/runtime"
  29. "k8s.io/apimachinery/pkg/util/diff"
  30. utilfeature "k8s.io/apiserver/pkg/util/feature"
  31. "k8s.io/client-go/informers"
  32. appsinformers "k8s.io/client-go/informers/apps/v1"
  33. coordinformers "k8s.io/client-go/informers/coordination/v1"
  34. coreinformers "k8s.io/client-go/informers/core/v1"
  35. clientset "k8s.io/client-go/kubernetes"
  36. "k8s.io/client-go/kubernetes/fake"
  37. testcore "k8s.io/client-go/testing"
  38. featuregatetesting "k8s.io/component-base/featuregate/testing"
  39. "k8s.io/kubernetes/pkg/controller"
  40. "k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler"
  41. "k8s.io/kubernetes/pkg/controller/testutil"
  42. nodeutil "k8s.io/kubernetes/pkg/controller/util/node"
  43. "k8s.io/kubernetes/pkg/features"
  44. kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
  45. "k8s.io/kubernetes/pkg/util/node"
  46. taintutils "k8s.io/kubernetes/pkg/util/taints"
  47. "k8s.io/utils/pointer"
  48. )
  49. const (
  50. testNodeMonitorGracePeriod = 40 * time.Second
  51. testNodeStartupGracePeriod = 60 * time.Second
  52. testNodeMonitorPeriod = 5 * time.Second
  53. testRateLimiterQPS = float32(10000)
  54. testLargeClusterThreshold = 20
  55. testUnhealthyThreshold = float32(0.55)
  56. )
  57. func alwaysReady() bool { return true }
  58. func fakeGetPodsAssignedToNode(c *fake.Clientset) func(string) ([]*v1.Pod, error) {
  59. return func(nodeName string) ([]*v1.Pod, error) {
  60. selector := fields.SelectorFromSet(fields.Set{"spec.nodeName": nodeName})
  61. pods, err := c.CoreV1().Pods(v1.NamespaceAll).List(context.TODO(), metav1.ListOptions{
  62. FieldSelector: selector.String(),
  63. LabelSelector: labels.Everything().String(),
  64. })
  65. if err != nil {
  66. return nil, fmt.Errorf("failed to get Pods assigned to node %v", nodeName)
  67. }
  68. rPods := make([]*v1.Pod, len(pods.Items))
  69. for i := range pods.Items {
  70. rPods[i] = &pods.Items[i]
  71. }
  72. return rPods, nil
  73. }
  74. }
// nodeLifecycleController bundles the Controller under test with the informers
// it was constructed from, so tests can fill the informer stores directly.
//
// NOTE: newNodeLifecycleControllerFromClient builds this struct with a
// positional composite literal, so the field order must not change.
type nodeLifecycleController struct {
	*Controller
	// leaseInformer's store is replaced by syncLeaseStore.
	leaseInformer coordinformers.LeaseInformer
	// nodeInformer's store is replaced by syncNodeStore.
	nodeInformer coreinformers.NodeInformer
	// daemonSetInformer's store is populated by tests that need DaemonSets.
	daemonSetInformer appsinformers.DaemonSetInformer
}
  81. // doEviction does the fake eviction and returns the status of eviction operation.
  82. func (nc *nodeLifecycleController) doEviction(fakeNodeHandler *testutil.FakeNodeHandler) bool {
  83. nc.evictorLock.Lock()
  84. defer nc.evictorLock.Unlock()
  85. zones := testutil.GetZones(fakeNodeHandler)
  86. for _, zone := range zones {
  87. nc.zonePodEvictor[zone].Try(func(value scheduler.TimedValue) (bool, time.Duration) {
  88. uid, _ := value.UID.(string)
  89. pods, _ := nc.getPodsAssignedToNode(value.Value)
  90. nodeutil.DeletePods(fakeNodeHandler, pods, nc.recorder, value.Value, uid, nc.daemonSetStore)
  91. _ = nc.nodeEvictionMap.setStatus(value.Value, evicted)
  92. return true, 0
  93. })
  94. }
  95. for _, action := range fakeNodeHandler.Actions() {
  96. if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" {
  97. return true
  98. }
  99. }
  100. return false
  101. }
  102. func createNodeLease(nodeName string, renewTime metav1.MicroTime) *coordv1.Lease {
  103. return &coordv1.Lease{
  104. ObjectMeta: metav1.ObjectMeta{
  105. Name: nodeName,
  106. Namespace: v1.NamespaceNodeLease,
  107. },
  108. Spec: coordv1.LeaseSpec{
  109. HolderIdentity: pointer.StringPtr(nodeName),
  110. RenewTime: &renewTime,
  111. },
  112. }
  113. }
  114. func (nc *nodeLifecycleController) syncLeaseStore(lease *coordv1.Lease) error {
  115. if lease == nil {
  116. return nil
  117. }
  118. newElems := make([]interface{}, 0, 1)
  119. newElems = append(newElems, lease)
  120. return nc.leaseInformer.Informer().GetStore().Replace(newElems, "newRV")
  121. }
  122. func (nc *nodeLifecycleController) syncNodeStore(fakeNodeHandler *testutil.FakeNodeHandler) error {
  123. nodes, err := fakeNodeHandler.List(context.TODO(), metav1.ListOptions{})
  124. if err != nil {
  125. return err
  126. }
  127. newElems := make([]interface{}, 0, len(nodes.Items))
  128. for i := range nodes.Items {
  129. newElems = append(newElems, &nodes.Items[i])
  130. }
  131. return nc.nodeInformer.Informer().GetStore().Replace(newElems, "newRV")
  132. }
  133. func newNodeLifecycleControllerFromClient(
  134. kubeClient clientset.Interface,
  135. podEvictionTimeout time.Duration,
  136. evictionLimiterQPS float32,
  137. secondaryEvictionLimiterQPS float32,
  138. largeClusterThreshold int32,
  139. unhealthyZoneThreshold float32,
  140. nodeMonitorGracePeriod time.Duration,
  141. nodeStartupGracePeriod time.Duration,
  142. nodeMonitorPeriod time.Duration,
  143. useTaints bool,
  144. ) (*nodeLifecycleController, error) {
  145. factory := informers.NewSharedInformerFactory(kubeClient, controller.NoResyncPeriodFunc())
  146. leaseInformer := factory.Coordination().V1().Leases()
  147. nodeInformer := factory.Core().V1().Nodes()
  148. daemonSetInformer := factory.Apps().V1().DaemonSets()
  149. nc, err := NewNodeLifecycleController(
  150. leaseInformer,
  151. factory.Core().V1().Pods(),
  152. nodeInformer,
  153. daemonSetInformer,
  154. kubeClient,
  155. nodeMonitorPeriod,
  156. nodeStartupGracePeriod,
  157. nodeMonitorGracePeriod,
  158. podEvictionTimeout,
  159. evictionLimiterQPS,
  160. secondaryEvictionLimiterQPS,
  161. largeClusterThreshold,
  162. unhealthyZoneThreshold,
  163. useTaints,
  164. useTaints,
  165. )
  166. if err != nil {
  167. return nil, err
  168. }
  169. nc.leaseInformerSynced = alwaysReady
  170. nc.podInformerSynced = alwaysReady
  171. nc.nodeInformerSynced = alwaysReady
  172. nc.daemonSetInformerSynced = alwaysReady
  173. return &nodeLifecycleController{nc, leaseInformer, nodeInformer, daemonSetInformer}, nil
  174. }
  175. func TestMonitorNodeHealthEvictPods(t *testing.T) {
  176. fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
  177. evictionTimeout := 10 * time.Minute
  178. labels := map[string]string{
  179. v1.LabelZoneRegionStable: "region1",
  180. v1.LabelZoneFailureDomainStable: "zone1",
  181. v1.LabelZoneRegion: "region1",
  182. v1.LabelZoneFailureDomain: "zone1",
  183. }
  184. // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady
  185. // we need second healthy node in tests. Because of how the tests are written we need to update
  186. // the status of this Node.
  187. healthyNodeNewStatus := v1.NodeStatus{
  188. Conditions: []v1.NodeCondition{
  189. {
  190. Type: v1.NodeReady,
  191. Status: v1.ConditionTrue,
  192. // Node status has just been updated, and is NotReady for 10min.
  193. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 9, 0, 0, time.UTC),
  194. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  195. },
  196. },
  197. }
  198. table := []struct {
  199. fakeNodeHandler *testutil.FakeNodeHandler
  200. daemonSets []apps.DaemonSet
  201. timeToPass time.Duration
  202. newNodeStatus v1.NodeStatus
  203. secondNodeNewStatus v1.NodeStatus
  204. expectedEvictPods bool
  205. description string
  206. }{
  207. // Node created recently, with no status (happens only at cluster startup).
  208. {
  209. fakeNodeHandler: &testutil.FakeNodeHandler{
  210. Existing: []*v1.Node{
  211. {
  212. ObjectMeta: metav1.ObjectMeta{
  213. Name: "node0",
  214. CreationTimestamp: fakeNow,
  215. Labels: map[string]string{
  216. v1.LabelZoneRegionStable: "region1",
  217. v1.LabelZoneFailureDomainStable: "zone1",
  218. v1.LabelZoneRegion: "region1",
  219. v1.LabelZoneFailureDomain: "zone1",
  220. },
  221. },
  222. },
  223. {
  224. ObjectMeta: metav1.ObjectMeta{
  225. Name: "node1",
  226. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  227. Labels: map[string]string{
  228. v1.LabelZoneRegionStable: "region1",
  229. v1.LabelZoneFailureDomainStable: "zone1",
  230. v1.LabelZoneRegion: "region1",
  231. v1.LabelZoneFailureDomain: "zone1",
  232. },
  233. },
  234. Status: v1.NodeStatus{
  235. Conditions: []v1.NodeCondition{
  236. {
  237. Type: v1.NodeReady,
  238. Status: v1.ConditionTrue,
  239. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  240. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  241. },
  242. },
  243. },
  244. },
  245. },
  246. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  247. },
  248. daemonSets: nil,
  249. timeToPass: 0,
  250. newNodeStatus: v1.NodeStatus{},
  251. secondNodeNewStatus: healthyNodeNewStatus,
  252. expectedEvictPods: false,
  253. description: "Node created recently, with no status.",
  254. },
  255. // Node created recently without FailureDomain labels which is added back later, with no status (happens only at cluster startup).
  256. {
  257. fakeNodeHandler: &testutil.FakeNodeHandler{
  258. Existing: []*v1.Node{
  259. {
  260. ObjectMeta: metav1.ObjectMeta{
  261. Name: "node0",
  262. CreationTimestamp: fakeNow,
  263. },
  264. },
  265. {
  266. ObjectMeta: metav1.ObjectMeta{
  267. Name: "node1",
  268. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  269. },
  270. Status: v1.NodeStatus{
  271. Conditions: []v1.NodeCondition{
  272. {
  273. Type: v1.NodeReady,
  274. Status: v1.ConditionTrue,
  275. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  276. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  277. },
  278. },
  279. },
  280. },
  281. },
  282. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  283. },
  284. daemonSets: nil,
  285. timeToPass: 0,
  286. newNodeStatus: v1.NodeStatus{},
  287. secondNodeNewStatus: healthyNodeNewStatus,
  288. expectedEvictPods: false,
  289. description: "Node created recently without FailureDomain labels which is added back later, with no status (happens only at cluster startup).",
  290. },
  291. // Node created long time ago, and kubelet posted NotReady for a short period of time.
  292. {
  293. fakeNodeHandler: &testutil.FakeNodeHandler{
  294. Existing: []*v1.Node{
  295. {
  296. ObjectMeta: metav1.ObjectMeta{
  297. Name: "node0",
  298. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  299. Labels: map[string]string{
  300. v1.LabelZoneRegionStable: "region1",
  301. v1.LabelZoneFailureDomainStable: "zone1",
  302. v1.LabelZoneRegion: "region1",
  303. v1.LabelZoneFailureDomain: "zone1",
  304. },
  305. },
  306. Status: v1.NodeStatus{
  307. Conditions: []v1.NodeCondition{
  308. {
  309. Type: v1.NodeReady,
  310. Status: v1.ConditionFalse,
  311. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  312. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  313. },
  314. },
  315. },
  316. },
  317. {
  318. ObjectMeta: metav1.ObjectMeta{
  319. Name: "node1",
  320. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  321. Labels: map[string]string{
  322. v1.LabelZoneRegionStable: "region1",
  323. v1.LabelZoneFailureDomainStable: "zone1",
  324. v1.LabelZoneRegion: "region1",
  325. v1.LabelZoneFailureDomain: "zone1",
  326. },
  327. },
  328. Status: v1.NodeStatus{
  329. Conditions: []v1.NodeCondition{
  330. {
  331. Type: v1.NodeReady,
  332. Status: v1.ConditionTrue,
  333. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  334. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  335. },
  336. },
  337. },
  338. },
  339. },
  340. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  341. },
  342. daemonSets: nil,
  343. timeToPass: evictionTimeout,
  344. newNodeStatus: v1.NodeStatus{
  345. Conditions: []v1.NodeCondition{
  346. {
  347. Type: v1.NodeReady,
  348. Status: v1.ConditionFalse,
  349. // Node status has just been updated, and is NotReady for 10min.
  350. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 9, 0, 0, time.UTC),
  351. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  352. },
  353. },
  354. },
  355. secondNodeNewStatus: healthyNodeNewStatus,
  356. expectedEvictPods: false,
  357. description: "Node created long time ago, and kubelet posted NotReady for a short period of time.",
  358. },
  359. // Pod is ds-managed, and kubelet posted NotReady for a long period of time.
  360. {
  361. fakeNodeHandler: &testutil.FakeNodeHandler{
  362. Existing: []*v1.Node{
  363. {
  364. ObjectMeta: metav1.ObjectMeta{
  365. Name: "node0",
  366. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  367. Labels: map[string]string{
  368. v1.LabelZoneRegionStable: "region1",
  369. v1.LabelZoneFailureDomainStable: "zone1",
  370. v1.LabelZoneRegion: "region1",
  371. v1.LabelZoneFailureDomain: "zone1",
  372. },
  373. },
  374. Status: v1.NodeStatus{
  375. Conditions: []v1.NodeCondition{
  376. {
  377. Type: v1.NodeReady,
  378. Status: v1.ConditionFalse,
  379. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  380. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  381. },
  382. },
  383. },
  384. },
  385. {
  386. ObjectMeta: metav1.ObjectMeta{
  387. Name: "node1",
  388. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  389. Labels: map[string]string{
  390. v1.LabelZoneRegionStable: "region1",
  391. v1.LabelZoneFailureDomainStable: "zone1",
  392. v1.LabelZoneRegion: "region1",
  393. v1.LabelZoneFailureDomain: "zone1",
  394. },
  395. },
  396. Status: v1.NodeStatus{
  397. Conditions: []v1.NodeCondition{
  398. {
  399. Type: v1.NodeReady,
  400. Status: v1.ConditionTrue,
  401. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  402. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  403. },
  404. },
  405. },
  406. },
  407. },
  408. Clientset: fake.NewSimpleClientset(
  409. &v1.PodList{
  410. Items: []v1.Pod{
  411. {
  412. ObjectMeta: metav1.ObjectMeta{
  413. Name: "pod0",
  414. Namespace: "default",
  415. Labels: map[string]string{"daemon": "yes"},
  416. },
  417. Spec: v1.PodSpec{
  418. NodeName: "node0",
  419. },
  420. },
  421. },
  422. },
  423. ),
  424. },
  425. daemonSets: []apps.DaemonSet{
  426. {
  427. ObjectMeta: metav1.ObjectMeta{
  428. Name: "ds0",
  429. Namespace: "default",
  430. },
  431. Spec: apps.DaemonSetSpec{
  432. Selector: &metav1.LabelSelector{
  433. MatchLabels: map[string]string{"daemon": "yes"},
  434. },
  435. },
  436. },
  437. },
  438. timeToPass: time.Hour,
  439. newNodeStatus: v1.NodeStatus{
  440. Conditions: []v1.NodeCondition{
  441. {
  442. Type: v1.NodeReady,
  443. Status: v1.ConditionFalse,
  444. // Node status has just been updated, and is NotReady for 1hr.
  445. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 59, 0, 0, time.UTC),
  446. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  447. },
  448. },
  449. },
  450. secondNodeNewStatus: healthyNodeNewStatus,
  451. expectedEvictPods: false,
  452. description: "Pod is ds-managed, and kubelet posted NotReady for a long period of time.",
  453. },
  454. // Node created long time ago, and kubelet posted NotReady for a long period of time.
  455. {
  456. fakeNodeHandler: &testutil.FakeNodeHandler{
  457. Existing: []*v1.Node{
  458. {
  459. ObjectMeta: metav1.ObjectMeta{
  460. Name: "node0",
  461. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  462. Labels: map[string]string{
  463. v1.LabelZoneRegionStable: "region1",
  464. v1.LabelZoneFailureDomainStable: "zone1",
  465. v1.LabelZoneRegion: "region1",
  466. v1.LabelZoneFailureDomain: "zone1",
  467. },
  468. },
  469. Status: v1.NodeStatus{
  470. Conditions: []v1.NodeCondition{
  471. {
  472. Type: v1.NodeReady,
  473. Status: v1.ConditionFalse,
  474. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  475. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  476. },
  477. },
  478. },
  479. },
  480. {
  481. ObjectMeta: metav1.ObjectMeta{
  482. Name: "node1",
  483. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  484. Labels: map[string]string{
  485. v1.LabelZoneRegionStable: "region1",
  486. v1.LabelZoneFailureDomainStable: "zone1",
  487. v1.LabelZoneRegion: "region1",
  488. v1.LabelZoneFailureDomain: "zone1",
  489. },
  490. },
  491. Status: v1.NodeStatus{
  492. Conditions: []v1.NodeCondition{
  493. {
  494. Type: v1.NodeReady,
  495. Status: v1.ConditionTrue,
  496. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  497. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  498. },
  499. },
  500. },
  501. },
  502. },
  503. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  504. },
  505. daemonSets: nil,
  506. timeToPass: time.Hour,
  507. newNodeStatus: v1.NodeStatus{
  508. Conditions: []v1.NodeCondition{
  509. {
  510. Type: v1.NodeReady,
  511. Status: v1.ConditionFalse,
  512. // Node status has just been updated, and is NotReady for 1hr.
  513. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 59, 0, 0, time.UTC),
  514. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  515. },
  516. },
  517. },
  518. secondNodeNewStatus: healthyNodeNewStatus,
  519. expectedEvictPods: true,
  520. description: "Node created long time ago, and kubelet posted NotReady for a long period of time.",
  521. },
  522. // Node created long time ago, node controller posted Unknown for a short period of time.
  523. {
  524. fakeNodeHandler: &testutil.FakeNodeHandler{
  525. Existing: []*v1.Node{
  526. {
  527. ObjectMeta: metav1.ObjectMeta{
  528. Name: "node0",
  529. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  530. Labels: map[string]string{
  531. v1.LabelZoneRegionStable: "region1",
  532. v1.LabelZoneFailureDomainStable: "zone1",
  533. v1.LabelZoneRegion: "region1",
  534. v1.LabelZoneFailureDomain: "zone1",
  535. },
  536. },
  537. Status: v1.NodeStatus{
  538. Conditions: []v1.NodeCondition{
  539. {
  540. Type: v1.NodeReady,
  541. Status: v1.ConditionUnknown,
  542. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  543. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  544. },
  545. },
  546. },
  547. },
  548. {
  549. ObjectMeta: metav1.ObjectMeta{
  550. Name: "node1",
  551. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  552. Labels: map[string]string{
  553. v1.LabelZoneRegionStable: "region1",
  554. v1.LabelZoneFailureDomainStable: "zone1",
  555. v1.LabelZoneRegion: "region1",
  556. v1.LabelZoneFailureDomain: "zone1",
  557. },
  558. },
  559. Status: v1.NodeStatus{
  560. Conditions: []v1.NodeCondition{
  561. {
  562. Type: v1.NodeReady,
  563. Status: v1.ConditionTrue,
  564. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  565. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  566. },
  567. },
  568. },
  569. },
  570. },
  571. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  572. },
  573. daemonSets: nil,
  574. timeToPass: evictionTimeout - testNodeMonitorGracePeriod,
  575. newNodeStatus: v1.NodeStatus{
  576. Conditions: []v1.NodeCondition{
  577. {
  578. Type: v1.NodeReady,
  579. Status: v1.ConditionUnknown,
  580. // Node status was updated by nodecontroller 10min ago
  581. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  582. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  583. },
  584. },
  585. },
  586. secondNodeNewStatus: healthyNodeNewStatus,
  587. expectedEvictPods: false,
  588. description: "Node created long time ago, node controller posted Unknown for a short period of time.",
  589. },
  590. // Node created long time ago, node controller posted Unknown for a long period of time.
  591. {
  592. fakeNodeHandler: &testutil.FakeNodeHandler{
  593. Existing: []*v1.Node{
  594. {
  595. ObjectMeta: metav1.ObjectMeta{
  596. Name: "node0",
  597. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  598. Labels: map[string]string{
  599. v1.LabelZoneRegionStable: "region1",
  600. v1.LabelZoneFailureDomainStable: "zone1",
  601. v1.LabelZoneRegion: "region1",
  602. v1.LabelZoneFailureDomain: "zone1",
  603. },
  604. },
  605. Status: v1.NodeStatus{
  606. Conditions: []v1.NodeCondition{
  607. {
  608. Type: v1.NodeReady,
  609. Status: v1.ConditionUnknown,
  610. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  611. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  612. },
  613. },
  614. },
  615. },
  616. {
  617. ObjectMeta: metav1.ObjectMeta{
  618. Name: "node1",
  619. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  620. Labels: map[string]string{
  621. v1.LabelZoneRegionStable: "region1",
  622. v1.LabelZoneFailureDomainStable: "zone1",
  623. v1.LabelZoneRegion: "region1",
  624. v1.LabelZoneFailureDomain: "zone1",
  625. },
  626. },
  627. Status: v1.NodeStatus{
  628. Conditions: []v1.NodeCondition{
  629. {
  630. Type: v1.NodeReady,
  631. Status: v1.ConditionTrue,
  632. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  633. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  634. },
  635. },
  636. },
  637. },
  638. },
  639. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  640. },
  641. daemonSets: nil,
  642. timeToPass: 60 * time.Minute,
  643. newNodeStatus: v1.NodeStatus{
  644. Conditions: []v1.NodeCondition{
  645. {
  646. Type: v1.NodeReady,
  647. Status: v1.ConditionUnknown,
  648. // Node status was updated by nodecontroller 1hr ago
  649. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  650. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  651. },
  652. },
  653. },
  654. secondNodeNewStatus: healthyNodeNewStatus,
  655. expectedEvictPods: true,
  656. description: "Node created long time ago, node controller posted Unknown for a long period of time.",
  657. },
  658. }
  659. for _, item := range table {
  660. nodeController, _ := newNodeLifecycleControllerFromClient(
  661. item.fakeNodeHandler,
  662. evictionTimeout,
  663. testRateLimiterQPS,
  664. testRateLimiterQPS,
  665. testLargeClusterThreshold,
  666. testUnhealthyThreshold,
  667. testNodeMonitorGracePeriod,
  668. testNodeStartupGracePeriod,
  669. testNodeMonitorPeriod,
  670. false)
  671. nodeController.now = func() metav1.Time { return fakeNow }
  672. nodeController.recorder = testutil.NewFakeRecorder()
  673. nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset)
  674. for _, ds := range item.daemonSets {
  675. nodeController.daemonSetInformer.Informer().GetStore().Add(&ds)
  676. }
  677. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  678. t.Errorf("unexpected error: %v", err)
  679. }
  680. if err := nodeController.monitorNodeHealth(); err != nil {
  681. t.Errorf("unexpected error: %v", err)
  682. }
  683. if item.timeToPass > 0 {
  684. nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} }
  685. item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus
  686. item.fakeNodeHandler.Existing[1].Status = item.secondNodeNewStatus
  687. }
  688. if len(item.fakeNodeHandler.Existing[0].Labels) == 0 && len(item.fakeNodeHandler.Existing[1].Labels) == 0 {
  689. item.fakeNodeHandler.Existing[0].Labels = labels
  690. item.fakeNodeHandler.Existing[1].Labels = labels
  691. }
  692. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  693. t.Errorf("unexpected error: %v", err)
  694. }
  695. if err := nodeController.monitorNodeHealth(); err != nil {
  696. t.Errorf("unexpected error: %v", err)
  697. }
  698. zones := testutil.GetZones(item.fakeNodeHandler)
  699. for _, zone := range zones {
  700. if _, ok := nodeController.zonePodEvictor[zone]; ok {
  701. nodeController.zonePodEvictor[zone].Try(func(value scheduler.TimedValue) (bool, time.Duration) {
  702. nodeUID, _ := value.UID.(string)
  703. pods, err := nodeController.getPodsAssignedToNode(value.Value)
  704. if err != nil {
  705. t.Errorf("unexpected error: %v", err)
  706. }
  707. t.Logf("listed pods %d for node %v", len(pods), value.Value)
  708. nodeutil.DeletePods(item.fakeNodeHandler, pods, nodeController.recorder, value.Value, nodeUID, nodeController.daemonSetInformer.Lister())
  709. return true, 0
  710. })
  711. } else {
  712. t.Fatalf("Zone %v was unitialized!", zone)
  713. }
  714. }
  715. podEvicted := false
  716. for _, action := range item.fakeNodeHandler.Actions() {
  717. if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" {
  718. podEvicted = true
  719. }
  720. }
  721. if item.expectedEvictPods != podEvicted {
  722. t.Errorf("expected pod eviction: %+v, got %+v for %+v", item.expectedEvictPods,
  723. podEvicted, item.description)
  724. }
  725. }
  726. }
  727. func TestPodStatusChange(t *testing.T) {
  728. fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
  729. evictionTimeout := 10 * time.Minute
  730. // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady
  731. // we need second healthy node in tests. Because of how the tests are written we need to update
  732. // the status of this Node.
  733. healthyNodeNewStatus := v1.NodeStatus{
  734. Conditions: []v1.NodeCondition{
  735. {
  736. Type: v1.NodeReady,
  737. Status: v1.ConditionTrue,
  738. // Node status has just been updated, and is NotReady for 10min.
  739. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 9, 0, 0, time.UTC),
  740. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  741. },
  742. },
  743. }
  744. // Node created long time ago, node controller posted Unknown for a long period of time.
  745. table := []struct {
  746. fakeNodeHandler *testutil.FakeNodeHandler
  747. timeToPass time.Duration
  748. newNodeStatus v1.NodeStatus
  749. secondNodeNewStatus v1.NodeStatus
  750. expectedPodUpdate bool
  751. expectedReason string
  752. description string
  753. }{
  754. {
  755. fakeNodeHandler: &testutil.FakeNodeHandler{
  756. Existing: []*v1.Node{
  757. {
  758. ObjectMeta: metav1.ObjectMeta{
  759. Name: "node0",
  760. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  761. Labels: map[string]string{
  762. v1.LabelZoneRegionStable: "region1",
  763. v1.LabelZoneFailureDomainStable: "zone1",
  764. v1.LabelZoneRegion: "region1",
  765. v1.LabelZoneFailureDomain: "zone1",
  766. },
  767. },
  768. Status: v1.NodeStatus{
  769. Conditions: []v1.NodeCondition{
  770. {
  771. Type: v1.NodeReady,
  772. Status: v1.ConditionUnknown,
  773. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  774. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  775. },
  776. },
  777. },
  778. },
  779. {
  780. ObjectMeta: metav1.ObjectMeta{
  781. Name: "node1",
  782. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  783. Labels: map[string]string{
  784. v1.LabelZoneRegion: "region1",
  785. v1.LabelZoneFailureDomain: "zone1",
  786. },
  787. },
  788. Status: v1.NodeStatus{
  789. Conditions: []v1.NodeCondition{
  790. {
  791. Type: v1.NodeReady,
  792. Status: v1.ConditionTrue,
  793. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  794. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  795. },
  796. },
  797. },
  798. },
  799. },
  800. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  801. },
  802. timeToPass: 60 * time.Minute,
  803. newNodeStatus: v1.NodeStatus{
  804. Conditions: []v1.NodeCondition{
  805. {
  806. Type: v1.NodeReady,
  807. Status: v1.ConditionUnknown,
  808. // Node status was updated by nodecontroller 1hr ago
  809. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  810. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  811. },
  812. },
  813. },
  814. secondNodeNewStatus: healthyNodeNewStatus,
  815. expectedPodUpdate: true,
  816. expectedReason: node.NodeUnreachablePodReason,
  817. description: "Node created long time ago, node controller posted Unknown for a " +
  818. "long period of time, the pod status must include reason for termination.",
  819. },
  820. }
  821. for _, item := range table {
  822. nodeController, _ := newNodeLifecycleControllerFromClient(
  823. item.fakeNodeHandler,
  824. evictionTimeout,
  825. testRateLimiterQPS,
  826. testRateLimiterQPS,
  827. testLargeClusterThreshold,
  828. testUnhealthyThreshold,
  829. testNodeMonitorGracePeriod,
  830. testNodeStartupGracePeriod,
  831. testNodeMonitorPeriod,
  832. false)
  833. nodeController.now = func() metav1.Time { return fakeNow }
  834. nodeController.recorder = testutil.NewFakeRecorder()
  835. nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset)
  836. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  837. t.Errorf("unexpected error: %v", err)
  838. }
  839. if err := nodeController.monitorNodeHealth(); err != nil {
  840. t.Errorf("unexpected error: %v", err)
  841. }
  842. if item.timeToPass > 0 {
  843. nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} }
  844. item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus
  845. item.fakeNodeHandler.Existing[1].Status = item.secondNodeNewStatus
  846. }
  847. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  848. t.Errorf("unexpected error: %v", err)
  849. }
  850. if err := nodeController.monitorNodeHealth(); err != nil {
  851. t.Errorf("unexpected error: %v", err)
  852. }
  853. zones := testutil.GetZones(item.fakeNodeHandler)
  854. for _, zone := range zones {
  855. nodeController.zonePodEvictor[zone].Try(func(value scheduler.TimedValue) (bool, time.Duration) {
  856. nodeUID, _ := value.UID.(string)
  857. pods, err := nodeController.getPodsAssignedToNode(value.Value)
  858. if err != nil {
  859. t.Errorf("unexpected error: %v", err)
  860. }
  861. nodeutil.DeletePods(item.fakeNodeHandler, pods, nodeController.recorder, value.Value, nodeUID, nodeController.daemonSetStore)
  862. return true, 0
  863. })
  864. }
  865. podReasonUpdate := false
  866. for _, action := range item.fakeNodeHandler.Actions() {
  867. if action.GetVerb() == "update" && action.GetResource().Resource == "pods" {
  868. updateReason := action.(testcore.UpdateActionImpl).GetObject().(*v1.Pod).Status.Reason
  869. podReasonUpdate = true
  870. if updateReason != item.expectedReason {
  871. t.Errorf("expected pod status reason: %+v, got %+v for %+v", item.expectedReason, updateReason, item.description)
  872. }
  873. }
  874. }
  875. if podReasonUpdate != item.expectedPodUpdate {
  876. t.Errorf("expected pod update: %+v, got %+v for %+v", item.expectedPodUpdate, podReasonUpdate, item.description)
  877. }
  878. }
  879. }
  880. func TestMonitorNodeHealthEvictPodsWithDisruption(t *testing.T) {
  881. fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
  882. evictionTimeout := 10 * time.Minute
  883. timeToPass := 60 * time.Minute
  884. // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady
  885. // we need second healthy node in tests. Because of how the tests are written we need to update
  886. // the status of this Node.
  887. healthyNodeNewStatus := v1.NodeStatus{
  888. Conditions: []v1.NodeCondition{
  889. {
  890. Type: v1.NodeReady,
  891. Status: v1.ConditionTrue,
  892. LastHeartbeatTime: metav1.Date(2015, 1, 1, 13, 0, 0, 0, time.UTC),
  893. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  894. },
  895. },
  896. }
  897. unhealthyNodeNewStatus := v1.NodeStatus{
  898. Conditions: []v1.NodeCondition{
  899. {
  900. Type: v1.NodeReady,
  901. Status: v1.ConditionUnknown,
  902. // Node status was updated by nodecontroller 1hr ago
  903. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  904. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  905. },
  906. },
  907. }
  908. table := []struct {
  909. nodeList []*v1.Node
  910. podList []v1.Pod
  911. updatedNodeStatuses []v1.NodeStatus
  912. expectedInitialStates map[string]ZoneState
  913. expectedFollowingStates map[string]ZoneState
  914. expectedEvictPods bool
  915. description string
  916. }{
  917. // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes.
  918. // Only zone is down - eviction shouldn't take place
  919. {
  920. nodeList: []*v1.Node{
  921. {
  922. ObjectMeta: metav1.ObjectMeta{
  923. Name: "node0",
  924. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  925. Labels: map[string]string{
  926. v1.LabelZoneRegionStable: "region1",
  927. v1.LabelZoneFailureDomainStable: "zone1",
  928. v1.LabelZoneRegion: "region1",
  929. v1.LabelZoneFailureDomain: "zone1",
  930. },
  931. },
  932. Status: v1.NodeStatus{
  933. Conditions: []v1.NodeCondition{
  934. {
  935. Type: v1.NodeReady,
  936. Status: v1.ConditionUnknown,
  937. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  938. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  939. },
  940. },
  941. },
  942. },
  943. {
  944. ObjectMeta: metav1.ObjectMeta{
  945. Name: "node1",
  946. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  947. Labels: map[string]string{
  948. v1.LabelZoneRegionStable: "region1",
  949. v1.LabelZoneFailureDomainStable: "zone1",
  950. v1.LabelZoneRegion: "region1",
  951. v1.LabelZoneFailureDomain: "zone1",
  952. },
  953. },
  954. Status: v1.NodeStatus{
  955. Conditions: []v1.NodeCondition{
  956. {
  957. Type: v1.NodeReady,
  958. Status: v1.ConditionUnknown,
  959. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  960. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  961. },
  962. },
  963. },
  964. },
  965. },
  966. podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
  967. updatedNodeStatuses: []v1.NodeStatus{
  968. unhealthyNodeNewStatus,
  969. unhealthyNodeNewStatus,
  970. },
  971. expectedInitialStates: map[string]ZoneState{testutil.CreateZoneID("region1", "zone1"): stateFullDisruption},
  972. expectedFollowingStates: map[string]ZoneState{testutil.CreateZoneID("region1", "zone1"): stateFullDisruption},
  973. expectedEvictPods: false,
  974. description: "Network Disruption: Only zone is down - eviction shouldn't take place.",
  975. },
  976. // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes.
  977. // Both zones down - eviction shouldn't take place
  978. {
  979. nodeList: []*v1.Node{
  980. {
  981. ObjectMeta: metav1.ObjectMeta{
  982. Name: "node0",
  983. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  984. Labels: map[string]string{
  985. v1.LabelZoneRegionStable: "region1",
  986. v1.LabelZoneFailureDomainStable: "zone1",
  987. v1.LabelZoneRegion: "region1",
  988. v1.LabelZoneFailureDomain: "zone1",
  989. },
  990. },
  991. Status: v1.NodeStatus{
  992. Conditions: []v1.NodeCondition{
  993. {
  994. Type: v1.NodeReady,
  995. Status: v1.ConditionUnknown,
  996. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  997. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  998. },
  999. },
  1000. },
  1001. },
  1002. {
  1003. ObjectMeta: metav1.ObjectMeta{
  1004. Name: "node1",
  1005. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1006. Labels: map[string]string{
  1007. v1.LabelZoneRegionStable: "region2",
  1008. v1.LabelZoneFailureDomainStable: "zone2",
  1009. v1.LabelZoneRegion: "region2",
  1010. v1.LabelZoneFailureDomain: "zone2",
  1011. },
  1012. },
  1013. Status: v1.NodeStatus{
  1014. Conditions: []v1.NodeCondition{
  1015. {
  1016. Type: v1.NodeReady,
  1017. Status: v1.ConditionUnknown,
  1018. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1019. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1020. },
  1021. },
  1022. },
  1023. },
  1024. },
  1025. podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
  1026. updatedNodeStatuses: []v1.NodeStatus{
  1027. unhealthyNodeNewStatus,
  1028. unhealthyNodeNewStatus,
  1029. },
  1030. expectedInitialStates: map[string]ZoneState{
  1031. testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
  1032. testutil.CreateZoneID("region2", "zone2"): stateFullDisruption,
  1033. },
  1034. expectedFollowingStates: map[string]ZoneState{
  1035. testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
  1036. testutil.CreateZoneID("region2", "zone2"): stateFullDisruption,
  1037. },
  1038. expectedEvictPods: false,
  1039. description: "Network Disruption: Both zones down - eviction shouldn't take place.",
  1040. },
  1041. // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes.
  1042. // One zone is down - eviction should take place
  1043. {
  1044. nodeList: []*v1.Node{
  1045. {
  1046. ObjectMeta: metav1.ObjectMeta{
  1047. Name: "node0",
  1048. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1049. Labels: map[string]string{
  1050. v1.LabelZoneRegionStable: "region1",
  1051. v1.LabelZoneFailureDomainStable: "zone1",
  1052. v1.LabelZoneRegion: "region1",
  1053. v1.LabelZoneFailureDomain: "zone1",
  1054. },
  1055. },
  1056. Status: v1.NodeStatus{
  1057. Conditions: []v1.NodeCondition{
  1058. {
  1059. Type: v1.NodeReady,
  1060. Status: v1.ConditionUnknown,
  1061. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1062. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1063. },
  1064. },
  1065. },
  1066. },
  1067. {
  1068. ObjectMeta: metav1.ObjectMeta{
  1069. Name: "node1",
  1070. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1071. Labels: map[string]string{
  1072. v1.LabelZoneRegionStable: "region1",
  1073. v1.LabelZoneFailureDomainStable: "zone2",
  1074. v1.LabelZoneRegion: "region1",
  1075. v1.LabelZoneFailureDomain: "zone2",
  1076. },
  1077. },
  1078. Status: v1.NodeStatus{
  1079. Conditions: []v1.NodeCondition{
  1080. {
  1081. Type: v1.NodeReady,
  1082. Status: v1.ConditionTrue,
  1083. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1084. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1085. },
  1086. },
  1087. },
  1088. },
  1089. },
  1090. podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
  1091. updatedNodeStatuses: []v1.NodeStatus{
  1092. unhealthyNodeNewStatus,
  1093. healthyNodeNewStatus,
  1094. },
  1095. expectedInitialStates: map[string]ZoneState{
  1096. testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
  1097. testutil.CreateZoneID("region1", "zone2"): stateNormal,
  1098. },
  1099. expectedFollowingStates: map[string]ZoneState{
  1100. testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
  1101. testutil.CreateZoneID("region1", "zone2"): stateNormal,
  1102. },
  1103. expectedEvictPods: true,
  1104. description: "Network Disruption: One zone is down - eviction should take place.",
  1105. },
  1106. // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period
  1107. // of on first Node, eviction should stop even though -master Node is healthy.
  1108. {
  1109. nodeList: []*v1.Node{
  1110. {
  1111. ObjectMeta: metav1.ObjectMeta{
  1112. Name: "node0",
  1113. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1114. Labels: map[string]string{
  1115. v1.LabelZoneRegionStable: "region1",
  1116. v1.LabelZoneFailureDomainStable: "zone1",
  1117. v1.LabelZoneRegion: "region1",
  1118. v1.LabelZoneFailureDomain: "zone1",
  1119. },
  1120. },
  1121. Status: v1.NodeStatus{
  1122. Conditions: []v1.NodeCondition{
  1123. {
  1124. Type: v1.NodeReady,
  1125. Status: v1.ConditionUnknown,
  1126. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1127. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1128. },
  1129. },
  1130. },
  1131. },
  1132. {
  1133. ObjectMeta: metav1.ObjectMeta{
  1134. Name: "node-master",
  1135. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1136. Labels: map[string]string{
  1137. v1.LabelZoneRegionStable: "region1",
  1138. v1.LabelZoneFailureDomainStable: "zone1",
  1139. v1.LabelZoneRegion: "region1",
  1140. v1.LabelZoneFailureDomain: "zone1",
  1141. },
  1142. },
  1143. Status: v1.NodeStatus{
  1144. Conditions: []v1.NodeCondition{
  1145. {
  1146. Type: v1.NodeReady,
  1147. Status: v1.ConditionTrue,
  1148. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1149. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1150. },
  1151. },
  1152. },
  1153. },
  1154. },
  1155. podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
  1156. updatedNodeStatuses: []v1.NodeStatus{
  1157. unhealthyNodeNewStatus,
  1158. healthyNodeNewStatus,
  1159. },
  1160. expectedInitialStates: map[string]ZoneState{
  1161. testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
  1162. },
  1163. expectedFollowingStates: map[string]ZoneState{
  1164. testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
  1165. },
  1166. expectedEvictPods: false,
  1167. description: "NetworkDisruption: eviction should stop, only -master Node is healthy",
  1168. },
  1169. // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes.
  1170. // Initially both zones down, one comes back - eviction should take place
  1171. {
  1172. nodeList: []*v1.Node{
  1173. {
  1174. ObjectMeta: metav1.ObjectMeta{
  1175. Name: "node0",
  1176. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1177. Labels: map[string]string{
  1178. v1.LabelZoneRegionStable: "region1",
  1179. v1.LabelZoneFailureDomainStable: "zone1",
  1180. v1.LabelZoneRegion: "region1",
  1181. v1.LabelZoneFailureDomain: "zone1",
  1182. },
  1183. },
  1184. Status: v1.NodeStatus{
  1185. Conditions: []v1.NodeCondition{
  1186. {
  1187. Type: v1.NodeReady,
  1188. Status: v1.ConditionUnknown,
  1189. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1190. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1191. },
  1192. },
  1193. },
  1194. },
  1195. {
  1196. ObjectMeta: metav1.ObjectMeta{
  1197. Name: "node1",
  1198. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1199. Labels: map[string]string{
  1200. v1.LabelZoneRegionStable: "region1",
  1201. v1.LabelZoneFailureDomainStable: "zone2",
  1202. v1.LabelZoneRegion: "region1",
  1203. v1.LabelZoneFailureDomain: "zone2",
  1204. },
  1205. },
  1206. Status: v1.NodeStatus{
  1207. Conditions: []v1.NodeCondition{
  1208. {
  1209. Type: v1.NodeReady,
  1210. Status: v1.ConditionUnknown,
  1211. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1212. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1213. },
  1214. },
  1215. },
  1216. },
  1217. },
  1218. podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
  1219. updatedNodeStatuses: []v1.NodeStatus{
  1220. unhealthyNodeNewStatus,
  1221. healthyNodeNewStatus,
  1222. },
  1223. expectedInitialStates: map[string]ZoneState{
  1224. testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
  1225. testutil.CreateZoneID("region1", "zone2"): stateFullDisruption,
  1226. },
  1227. expectedFollowingStates: map[string]ZoneState{
  1228. testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
  1229. testutil.CreateZoneID("region1", "zone2"): stateNormal,
  1230. },
  1231. expectedEvictPods: true,
  1232. description: "Initially both zones down, one comes back - eviction should take place",
  1233. },
  1234. // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes.
  1235. // Zone is partially disrupted - eviction should take place
  1236. {
  1237. nodeList: []*v1.Node{
  1238. {
  1239. ObjectMeta: metav1.ObjectMeta{
  1240. Name: "node0",
  1241. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1242. Labels: map[string]string{
  1243. v1.LabelZoneRegionStable: "region1",
  1244. v1.LabelZoneFailureDomainStable: "zone1",
  1245. v1.LabelZoneRegion: "region1",
  1246. v1.LabelZoneFailureDomain: "zone1",
  1247. },
  1248. },
  1249. Status: v1.NodeStatus{
  1250. Conditions: []v1.NodeCondition{
  1251. {
  1252. Type: v1.NodeReady,
  1253. Status: v1.ConditionUnknown,
  1254. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1255. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1256. },
  1257. },
  1258. },
  1259. },
  1260. {
  1261. ObjectMeta: metav1.ObjectMeta{
  1262. Name: "node1",
  1263. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1264. Labels: map[string]string{
  1265. v1.LabelZoneRegionStable: "region1",
  1266. v1.LabelZoneFailureDomainStable: "zone1",
  1267. v1.LabelZoneRegion: "region1",
  1268. v1.LabelZoneFailureDomain: "zone1",
  1269. },
  1270. },
  1271. Status: v1.NodeStatus{
  1272. Conditions: []v1.NodeCondition{
  1273. {
  1274. Type: v1.NodeReady,
  1275. Status: v1.ConditionUnknown,
  1276. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1277. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1278. },
  1279. },
  1280. },
  1281. },
  1282. {
  1283. ObjectMeta: metav1.ObjectMeta{
  1284. Name: "node2",
  1285. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1286. Labels: map[string]string{
  1287. v1.LabelZoneRegionStable: "region1",
  1288. v1.LabelZoneFailureDomainStable: "zone1",
  1289. v1.LabelZoneRegion: "region1",
  1290. v1.LabelZoneFailureDomain: "zone1",
  1291. },
  1292. },
  1293. Status: v1.NodeStatus{
  1294. Conditions: []v1.NodeCondition{
  1295. {
  1296. Type: v1.NodeReady,
  1297. Status: v1.ConditionUnknown,
  1298. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1299. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1300. },
  1301. },
  1302. },
  1303. },
  1304. {
  1305. ObjectMeta: metav1.ObjectMeta{
  1306. Name: "node3",
  1307. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1308. Labels: map[string]string{
  1309. v1.LabelZoneRegionStable: "region1",
  1310. v1.LabelZoneFailureDomainStable: "zone1",
  1311. v1.LabelZoneRegion: "region1",
  1312. v1.LabelZoneFailureDomain: "zone1",
  1313. },
  1314. },
  1315. Status: v1.NodeStatus{
  1316. Conditions: []v1.NodeCondition{
  1317. {
  1318. Type: v1.NodeReady,
  1319. Status: v1.ConditionTrue,
  1320. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1321. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1322. },
  1323. },
  1324. },
  1325. },
  1326. {
  1327. ObjectMeta: metav1.ObjectMeta{
  1328. Name: "node4",
  1329. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1330. Labels: map[string]string{
  1331. v1.LabelZoneRegionStable: "region1",
  1332. v1.LabelZoneFailureDomainStable: "zone1",
  1333. v1.LabelZoneRegion: "region1",
  1334. v1.LabelZoneFailureDomain: "zone1",
  1335. },
  1336. },
  1337. Status: v1.NodeStatus{
  1338. Conditions: []v1.NodeCondition{
  1339. {
  1340. Type: v1.NodeReady,
  1341. Status: v1.ConditionTrue,
  1342. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1343. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1344. },
  1345. },
  1346. },
  1347. },
  1348. },
  1349. podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
  1350. updatedNodeStatuses: []v1.NodeStatus{
  1351. unhealthyNodeNewStatus,
  1352. unhealthyNodeNewStatus,
  1353. unhealthyNodeNewStatus,
  1354. healthyNodeNewStatus,
  1355. healthyNodeNewStatus,
  1356. },
  1357. expectedInitialStates: map[string]ZoneState{
  1358. testutil.CreateZoneID("region1", "zone1"): statePartialDisruption,
  1359. },
  1360. expectedFollowingStates: map[string]ZoneState{
  1361. testutil.CreateZoneID("region1", "zone1"): statePartialDisruption,
  1362. },
  1363. expectedEvictPods: true,
  1364. description: "Zone is partially disrupted - eviction should take place.",
  1365. },
  1366. }
  1367. for _, item := range table {
  1368. fakeNodeHandler := &testutil.FakeNodeHandler{
  1369. Existing: item.nodeList,
  1370. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: item.podList}),
  1371. }
  1372. nodeController, _ := newNodeLifecycleControllerFromClient(
  1373. fakeNodeHandler,
  1374. evictionTimeout,
  1375. testRateLimiterQPS,
  1376. testRateLimiterQPS,
  1377. testLargeClusterThreshold,
  1378. testUnhealthyThreshold,
  1379. testNodeMonitorGracePeriod,
  1380. testNodeStartupGracePeriod,
  1381. testNodeMonitorPeriod,
  1382. false)
  1383. nodeController.now = func() metav1.Time { return fakeNow }
  1384. nodeController.recorder = testutil.NewFakeRecorder()
  1385. nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset)
  1386. nodeController.enterPartialDisruptionFunc = func(nodeNum int) float32 {
  1387. return testRateLimiterQPS
  1388. }
  1389. nodeController.enterFullDisruptionFunc = func(nodeNum int) float32 {
  1390. return testRateLimiterQPS
  1391. }
  1392. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  1393. t.Errorf("unexpected error: %v", err)
  1394. }
  1395. if err := nodeController.monitorNodeHealth(); err != nil {
  1396. t.Errorf("%v: unexpected error: %v", item.description, err)
  1397. }
  1398. for zone, state := range item.expectedInitialStates {
  1399. if state != nodeController.zoneStates[zone] {
  1400. t.Errorf("%v: Unexpected zone state: %v: %v instead %v", item.description, zone, nodeController.zoneStates[zone], state)
  1401. }
  1402. }
  1403. nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(timeToPass)} }
  1404. for i := range item.updatedNodeStatuses {
  1405. fakeNodeHandler.Existing[i].Status = item.updatedNodeStatuses[i]
  1406. }
  1407. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  1408. t.Errorf("unexpected error: %v", err)
  1409. }
  1410. if err := nodeController.monitorNodeHealth(); err != nil {
  1411. t.Errorf("%v: unexpected error: %v", item.description, err)
  1412. }
  1413. for zone, state := range item.expectedFollowingStates {
  1414. if state != nodeController.zoneStates[zone] {
  1415. t.Errorf("%v: Unexpected zone state: %v: %v instead %v", item.description, zone, nodeController.zoneStates[zone], state)
  1416. }
  1417. }
  1418. var podEvicted bool
  1419. start := time.Now()
  1420. // Infinite loop, used for retrying in case ratelimiter fails to reload for Try function.
  1421. // this breaks when we have the status that we need for test case or when we don't see the
  1422. // intended result after 1 minute.
  1423. for {
  1424. podEvicted = nodeController.doEviction(fakeNodeHandler)
  1425. if podEvicted == item.expectedEvictPods || time.Since(start) > 1*time.Minute {
  1426. break
  1427. }
  1428. }
  1429. if item.expectedEvictPods != podEvicted {
  1430. t.Errorf("%v: expected pod eviction: %+v, got %+v", item.description, item.expectedEvictPods, podEvicted)
  1431. }
  1432. }
  1433. }
  1434. func TestMonitorNodeHealthUpdateStatus(t *testing.T) {
  1435. fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
  1436. table := []struct {
  1437. fakeNodeHandler *testutil.FakeNodeHandler
  1438. timeToPass time.Duration
  1439. newNodeStatus v1.NodeStatus
  1440. expectedRequestCount int
  1441. expectedNodes []*v1.Node
  1442. expectedPodStatusUpdate bool
  1443. }{
  1444. // Node created long time ago, without status:
  1445. // Expect Unknown status posted from node controller.
  1446. {
  1447. fakeNodeHandler: &testutil.FakeNodeHandler{
  1448. Existing: []*v1.Node{
  1449. {
  1450. ObjectMeta: metav1.ObjectMeta{
  1451. Name: "node0",
  1452. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1453. },
  1454. },
  1455. },
  1456. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1457. },
  1458. expectedRequestCount: 2, // List+Update
  1459. expectedNodes: []*v1.Node{
  1460. {
  1461. ObjectMeta: metav1.ObjectMeta{
  1462. Name: "node0",
  1463. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1464. },
  1465. Status: v1.NodeStatus{
  1466. Conditions: []v1.NodeCondition{
  1467. {
  1468. Type: v1.NodeReady,
  1469. Status: v1.ConditionUnknown,
  1470. Reason: "NodeStatusNeverUpdated",
  1471. Message: "Kubelet never posted node status.",
  1472. LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1473. LastTransitionTime: fakeNow,
  1474. },
  1475. {
  1476. Type: v1.NodeMemoryPressure,
  1477. Status: v1.ConditionUnknown,
  1478. Reason: "NodeStatusNeverUpdated",
  1479. Message: "Kubelet never posted node status.",
  1480. LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1481. LastTransitionTime: fakeNow,
  1482. },
  1483. {
  1484. Type: v1.NodeDiskPressure,
  1485. Status: v1.ConditionUnknown,
  1486. Reason: "NodeStatusNeverUpdated",
  1487. Message: "Kubelet never posted node status.",
  1488. LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1489. LastTransitionTime: fakeNow,
  1490. },
  1491. {
  1492. Type: v1.NodePIDPressure,
  1493. Status: v1.ConditionUnknown,
  1494. Reason: "NodeStatusNeverUpdated",
  1495. Message: "Kubelet never posted node status.",
  1496. LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1497. LastTransitionTime: fakeNow,
  1498. },
  1499. },
  1500. },
  1501. },
  1502. },
  1503. expectedPodStatusUpdate: false, // Pod was never scheduled
  1504. },
  1505. // Node created recently, without status.
  1506. // Expect no action from node controller (within startup grace period).
  1507. {
  1508. fakeNodeHandler: &testutil.FakeNodeHandler{
  1509. Existing: []*v1.Node{
  1510. {
  1511. ObjectMeta: metav1.ObjectMeta{
  1512. Name: "node0",
  1513. CreationTimestamp: fakeNow,
  1514. },
  1515. },
  1516. },
  1517. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1518. },
  1519. expectedRequestCount: 1, // List
  1520. expectedNodes: nil,
  1521. expectedPodStatusUpdate: false,
  1522. },
  1523. // Node created long time ago, with status updated by kubelet exceeds grace period.
  1524. // Expect Unknown status posted from node controller.
  1525. {
  1526. fakeNodeHandler: &testutil.FakeNodeHandler{
  1527. Existing: []*v1.Node{
  1528. {
  1529. ObjectMeta: metav1.ObjectMeta{
  1530. Name: "node0",
  1531. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1532. },
  1533. Status: v1.NodeStatus{
  1534. Conditions: []v1.NodeCondition{
  1535. {
  1536. Type: v1.NodeReady,
  1537. Status: v1.ConditionTrue,
  1538. // Node status hasn't been updated for 1hr.
  1539. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1540. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1541. },
  1542. },
  1543. Capacity: v1.ResourceList{
  1544. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  1545. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  1546. },
  1547. },
  1548. },
  1549. },
  1550. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1551. },
  1552. expectedRequestCount: 3, // (List+)List+Update
  1553. timeToPass: time.Hour,
  1554. newNodeStatus: v1.NodeStatus{
  1555. Conditions: []v1.NodeCondition{
  1556. {
  1557. Type: v1.NodeReady,
  1558. Status: v1.ConditionTrue,
  1559. // Node status hasn't been updated for 1hr.
  1560. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1561. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1562. },
  1563. },
  1564. Capacity: v1.ResourceList{
  1565. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  1566. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  1567. },
  1568. },
  1569. expectedNodes: []*v1.Node{
  1570. {
  1571. ObjectMeta: metav1.ObjectMeta{
  1572. Name: "node0",
  1573. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1574. },
  1575. Status: v1.NodeStatus{
  1576. Conditions: []v1.NodeCondition{
  1577. {
  1578. Type: v1.NodeReady,
  1579. Status: v1.ConditionUnknown,
  1580. Reason: "NodeStatusUnknown",
  1581. Message: "Kubelet stopped posting node status.",
  1582. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  1583. LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)},
  1584. },
  1585. {
  1586. Type: v1.NodeMemoryPressure,
  1587. Status: v1.ConditionUnknown,
  1588. Reason: "NodeStatusNeverUpdated",
  1589. Message: "Kubelet never posted node status.",
  1590. LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), // should default to node creation time if condition was never updated
  1591. LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)},
  1592. },
  1593. {
  1594. Type: v1.NodeDiskPressure,
  1595. Status: v1.ConditionUnknown,
  1596. Reason: "NodeStatusNeverUpdated",
  1597. Message: "Kubelet never posted node status.",
  1598. LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), // should default to node creation time if condition was never updated
  1599. LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)},
  1600. },
  1601. {
  1602. Type: v1.NodePIDPressure,
  1603. Status: v1.ConditionUnknown,
  1604. Reason: "NodeStatusNeverUpdated",
  1605. Message: "Kubelet never posted node status.",
  1606. LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), // should default to node creation time if condition was never updated
  1607. LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)},
  1608. },
  1609. },
  1610. Capacity: v1.ResourceList{
  1611. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  1612. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  1613. },
  1614. },
  1615. },
  1616. },
  1617. expectedPodStatusUpdate: true,
  1618. },
  1619. // Node created long time ago, with status updated recently.
  1620. // Expect no action from node controller (within monitor grace period).
  1621. {
  1622. fakeNodeHandler: &testutil.FakeNodeHandler{
  1623. Existing: []*v1.Node{
  1624. {
  1625. ObjectMeta: metav1.ObjectMeta{
  1626. Name: "node0",
  1627. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  1628. },
  1629. Status: v1.NodeStatus{
  1630. Conditions: []v1.NodeCondition{
  1631. {
  1632. Type: v1.NodeReady,
  1633. Status: v1.ConditionTrue,
  1634. // Node status has just been updated.
  1635. LastHeartbeatTime: fakeNow,
  1636. LastTransitionTime: fakeNow,
  1637. },
  1638. },
  1639. Capacity: v1.ResourceList{
  1640. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  1641. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  1642. },
  1643. },
  1644. },
  1645. },
  1646. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1647. },
  1648. expectedRequestCount: 1, // List
  1649. expectedNodes: nil,
  1650. expectedPodStatusUpdate: false,
  1651. },
  1652. }
  1653. for i, item := range table {
  1654. nodeController, _ := newNodeLifecycleControllerFromClient(
  1655. item.fakeNodeHandler,
  1656. 5*time.Minute,
  1657. testRateLimiterQPS,
  1658. testRateLimiterQPS,
  1659. testLargeClusterThreshold,
  1660. testUnhealthyThreshold,
  1661. testNodeMonitorGracePeriod,
  1662. testNodeStartupGracePeriod,
  1663. testNodeMonitorPeriod,
  1664. false)
  1665. nodeController.now = func() metav1.Time { return fakeNow }
  1666. nodeController.recorder = testutil.NewFakeRecorder()
  1667. nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset)
  1668. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  1669. t.Errorf("unexpected error: %v", err)
  1670. }
  1671. if err := nodeController.monitorNodeHealth(); err != nil {
  1672. t.Errorf("unexpected error: %v", err)
  1673. }
  1674. if item.timeToPass > 0 {
  1675. nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} }
  1676. item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus
  1677. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  1678. t.Errorf("unexpected error: %v", err)
  1679. }
  1680. if err := nodeController.monitorNodeHealth(); err != nil {
  1681. t.Errorf("unexpected error: %v", err)
  1682. }
  1683. }
  1684. if item.expectedRequestCount != item.fakeNodeHandler.RequestCount {
  1685. t.Errorf("expected %v call, but got %v.", item.expectedRequestCount, item.fakeNodeHandler.RequestCount)
  1686. }
  1687. if len(item.fakeNodeHandler.UpdatedNodes) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, item.fakeNodeHandler.UpdatedNodes) {
  1688. t.Errorf("Case[%d] unexpected nodes: %s", i, diff.ObjectDiff(item.expectedNodes[0], item.fakeNodeHandler.UpdatedNodes[0]))
  1689. }
  1690. if len(item.fakeNodeHandler.UpdatedNodeStatuses) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, item.fakeNodeHandler.UpdatedNodeStatuses) {
  1691. t.Errorf("Case[%d] unexpected nodes: %s", i, diff.ObjectDiff(item.expectedNodes[0], item.fakeNodeHandler.UpdatedNodeStatuses[0]))
  1692. }
  1693. podStatusUpdated := false
  1694. for _, action := range item.fakeNodeHandler.Actions() {
  1695. if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" {
  1696. podStatusUpdated = true
  1697. }
  1698. }
  1699. if podStatusUpdated != item.expectedPodStatusUpdate {
  1700. t.Errorf("Case[%d] expect pod status updated to be %v, but got %v", i, item.expectedPodStatusUpdate, podStatusUpdated)
  1701. }
  1702. }
  1703. }
  1704. func TestMonitorNodeHealthUpdateNodeAndPodStatusWithLease(t *testing.T) {
  1705. nodeCreationTime := metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC)
  1706. fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
  1707. testcases := []struct {
  1708. description string
  1709. fakeNodeHandler *testutil.FakeNodeHandler
  1710. lease *coordv1.Lease
  1711. timeToPass time.Duration
  1712. newNodeStatus v1.NodeStatus
  1713. newLease *coordv1.Lease
  1714. expectedRequestCount int
  1715. expectedNodes []*v1.Node
  1716. expectedPodStatusUpdate bool
  1717. }{
  1718. // Node created recently, without status. Node lease is missing.
  1719. // Expect no action from node controller (within startup grace period).
  1720. {
  1721. description: "Node created recently, without status. Node lease is missing.",
  1722. fakeNodeHandler: &testutil.FakeNodeHandler{
  1723. Existing: []*v1.Node{
  1724. {
  1725. ObjectMeta: metav1.ObjectMeta{
  1726. Name: "node0",
  1727. CreationTimestamp: fakeNow,
  1728. },
  1729. },
  1730. },
  1731. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1732. },
  1733. expectedRequestCount: 1, // List
  1734. expectedNodes: nil,
  1735. expectedPodStatusUpdate: false,
  1736. },
  1737. // Node created recently, without status. Node lease is renewed recently.
  1738. // Expect no action from node controller (within startup grace period).
  1739. {
  1740. description: "Node created recently, without status. Node lease is renewed recently.",
  1741. fakeNodeHandler: &testutil.FakeNodeHandler{
  1742. Existing: []*v1.Node{
  1743. {
  1744. ObjectMeta: metav1.ObjectMeta{
  1745. Name: "node0",
  1746. CreationTimestamp: fakeNow,
  1747. },
  1748. },
  1749. },
  1750. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1751. },
  1752. lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)),
  1753. expectedRequestCount: 1, // List
  1754. expectedNodes: nil,
  1755. expectedPodStatusUpdate: false,
  1756. },
  1757. // Node created long time ago, without status. Node lease is missing.
  1758. // Expect Unknown status posted from node controller.
  1759. {
  1760. description: "Node created long time ago, without status. Node lease is missing.",
  1761. fakeNodeHandler: &testutil.FakeNodeHandler{
  1762. Existing: []*v1.Node{
  1763. {
  1764. ObjectMeta: metav1.ObjectMeta{
  1765. Name: "node0",
  1766. CreationTimestamp: nodeCreationTime,
  1767. },
  1768. },
  1769. },
  1770. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1771. },
  1772. expectedRequestCount: 2, // List+Update
  1773. expectedNodes: []*v1.Node{
  1774. {
  1775. ObjectMeta: metav1.ObjectMeta{
  1776. Name: "node0",
  1777. CreationTimestamp: nodeCreationTime,
  1778. },
  1779. Status: v1.NodeStatus{
  1780. Conditions: []v1.NodeCondition{
  1781. {
  1782. Type: v1.NodeReady,
  1783. Status: v1.ConditionUnknown,
  1784. Reason: "NodeStatusNeverUpdated",
  1785. Message: "Kubelet never posted node status.",
  1786. LastHeartbeatTime: nodeCreationTime,
  1787. LastTransitionTime: fakeNow,
  1788. },
  1789. {
  1790. Type: v1.NodeMemoryPressure,
  1791. Status: v1.ConditionUnknown,
  1792. Reason: "NodeStatusNeverUpdated",
  1793. Message: "Kubelet never posted node status.",
  1794. LastHeartbeatTime: nodeCreationTime,
  1795. LastTransitionTime: fakeNow,
  1796. },
  1797. {
  1798. Type: v1.NodeDiskPressure,
  1799. Status: v1.ConditionUnknown,
  1800. Reason: "NodeStatusNeverUpdated",
  1801. Message: "Kubelet never posted node status.",
  1802. LastHeartbeatTime: nodeCreationTime,
  1803. LastTransitionTime: fakeNow,
  1804. },
  1805. {
  1806. Type: v1.NodePIDPressure,
  1807. Status: v1.ConditionUnknown,
  1808. Reason: "NodeStatusNeverUpdated",
  1809. Message: "Kubelet never posted node status.",
  1810. LastHeartbeatTime: nodeCreationTime,
  1811. LastTransitionTime: fakeNow,
  1812. },
  1813. },
  1814. },
  1815. },
  1816. },
  1817. expectedPodStatusUpdate: false, // Pod was never scheduled because the node was never ready.
  1818. },
  1819. // Node created long time ago, without status. Node lease is renewed recently.
  1820. // Expect no action from node controller (within monitor grace period).
  1821. {
  1822. description: "Node created long time ago, without status. Node lease is renewed recently.",
  1823. fakeNodeHandler: &testutil.FakeNodeHandler{
  1824. Existing: []*v1.Node{
  1825. {
  1826. ObjectMeta: metav1.ObjectMeta{
  1827. Name: "node0",
  1828. CreationTimestamp: nodeCreationTime,
  1829. },
  1830. },
  1831. },
  1832. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1833. },
  1834. lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)),
  1835. timeToPass: time.Hour,
  1836. newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time.Add(time.Hour))), // Lease is renewed after 1 hour.
  1837. expectedRequestCount: 2, // List+List
  1838. expectedNodes: []*v1.Node{
  1839. {
  1840. ObjectMeta: metav1.ObjectMeta{
  1841. Name: "node0",
  1842. CreationTimestamp: nodeCreationTime,
  1843. },
  1844. },
  1845. },
  1846. expectedPodStatusUpdate: false,
  1847. },
  1848. // Node created long time ago, without status. Node lease is expired.
  1849. // Expect Unknown status posted from node controller.
  1850. {
  1851. description: "Node created long time ago, without status. Node lease is expired.",
  1852. fakeNodeHandler: &testutil.FakeNodeHandler{
  1853. Existing: []*v1.Node{
  1854. {
  1855. ObjectMeta: metav1.ObjectMeta{
  1856. Name: "node0",
  1857. CreationTimestamp: nodeCreationTime,
  1858. },
  1859. },
  1860. },
  1861. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1862. },
  1863. lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)),
  1864. timeToPass: time.Hour,
  1865. newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), // Lease is not renewed after 1 hour.
  1866. expectedRequestCount: 3, // List+List+Update
  1867. expectedNodes: []*v1.Node{
  1868. {
  1869. ObjectMeta: metav1.ObjectMeta{
  1870. Name: "node0",
  1871. CreationTimestamp: nodeCreationTime,
  1872. },
  1873. Status: v1.NodeStatus{
  1874. Conditions: []v1.NodeCondition{
  1875. {
  1876. Type: v1.NodeReady,
  1877. Status: v1.ConditionUnknown,
  1878. Reason: "NodeStatusNeverUpdated",
  1879. Message: "Kubelet never posted node status.",
  1880. LastHeartbeatTime: nodeCreationTime,
  1881. LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  1882. },
  1883. {
  1884. Type: v1.NodeMemoryPressure,
  1885. Status: v1.ConditionUnknown,
  1886. Reason: "NodeStatusNeverUpdated",
  1887. Message: "Kubelet never posted node status.",
  1888. LastHeartbeatTime: nodeCreationTime,
  1889. LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  1890. },
  1891. {
  1892. Type: v1.NodeDiskPressure,
  1893. Status: v1.ConditionUnknown,
  1894. Reason: "NodeStatusNeverUpdated",
  1895. Message: "Kubelet never posted node status.",
  1896. LastHeartbeatTime: nodeCreationTime,
  1897. LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  1898. },
  1899. {
  1900. Type: v1.NodePIDPressure,
  1901. Status: v1.ConditionUnknown,
  1902. Reason: "NodeStatusNeverUpdated",
  1903. Message: "Kubelet never posted node status.",
  1904. LastHeartbeatTime: nodeCreationTime,
  1905. LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  1906. },
  1907. },
  1908. },
  1909. },
  1910. },
  1911. expectedPodStatusUpdate: false,
  1912. },
  1913. // Node created long time ago, with status updated by kubelet exceeds grace period. Node lease is renewed.
  1914. // Expect no action from node controller (within monitor grace period).
  1915. {
  1916. description: "Node created long time ago, with status updated by kubelet exceeds grace period. Node lease is renewed.",
  1917. fakeNodeHandler: &testutil.FakeNodeHandler{
  1918. Existing: []*v1.Node{
  1919. {
  1920. ObjectMeta: metav1.ObjectMeta{
  1921. Name: "node0",
  1922. CreationTimestamp: nodeCreationTime,
  1923. },
  1924. Status: v1.NodeStatus{
  1925. Conditions: []v1.NodeCondition{
  1926. {
  1927. Type: v1.NodeReady,
  1928. Status: v1.ConditionTrue,
  1929. LastHeartbeatTime: fakeNow,
  1930. LastTransitionTime: fakeNow,
  1931. },
  1932. {
  1933. Type: v1.NodeDiskPressure,
  1934. Status: v1.ConditionFalse,
  1935. LastHeartbeatTime: fakeNow,
  1936. LastTransitionTime: fakeNow,
  1937. },
  1938. },
  1939. Capacity: v1.ResourceList{
  1940. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  1941. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  1942. },
  1943. },
  1944. },
  1945. },
  1946. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  1947. },
  1948. lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)),
  1949. expectedRequestCount: 2, // List+List
  1950. timeToPass: time.Hour,
  1951. newNodeStatus: v1.NodeStatus{
  1952. // Node status hasn't been updated for 1 hour.
  1953. Conditions: []v1.NodeCondition{
  1954. {
  1955. Type: v1.NodeReady,
  1956. Status: v1.ConditionTrue,
  1957. LastHeartbeatTime: fakeNow,
  1958. LastTransitionTime: fakeNow,
  1959. },
  1960. {
  1961. Type: v1.NodeDiskPressure,
  1962. Status: v1.ConditionFalse,
  1963. LastHeartbeatTime: fakeNow,
  1964. LastTransitionTime: fakeNow,
  1965. },
  1966. },
  1967. Capacity: v1.ResourceList{
  1968. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  1969. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  1970. },
  1971. },
  1972. newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time.Add(time.Hour))), // Lease is renewed after 1 hour.
  1973. expectedNodes: []*v1.Node{
  1974. {
  1975. ObjectMeta: metav1.ObjectMeta{
  1976. Name: "node0",
  1977. CreationTimestamp: nodeCreationTime,
  1978. },
  1979. Status: v1.NodeStatus{
  1980. Conditions: []v1.NodeCondition{
  1981. {
  1982. Type: v1.NodeReady,
  1983. Status: v1.ConditionTrue,
  1984. LastHeartbeatTime: fakeNow,
  1985. LastTransitionTime: fakeNow,
  1986. },
  1987. {
  1988. Type: v1.NodeDiskPressure,
  1989. Status: v1.ConditionFalse,
  1990. LastHeartbeatTime: fakeNow,
  1991. LastTransitionTime: fakeNow,
  1992. },
  1993. },
  1994. Capacity: v1.ResourceList{
  1995. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  1996. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  1997. },
  1998. },
  1999. },
  2000. },
  2001. expectedPodStatusUpdate: false,
  2002. },
  2003. // Node created long time ago, with status updated by kubelet recently. Node lease is expired.
  2004. // Expect no action from node controller (within monitor grace period).
  2005. {
  2006. description: "Node created long time ago, with status updated by kubelet recently. Node lease is expired.",
  2007. fakeNodeHandler: &testutil.FakeNodeHandler{
  2008. Existing: []*v1.Node{
  2009. {
  2010. ObjectMeta: metav1.ObjectMeta{
  2011. Name: "node0",
  2012. CreationTimestamp: nodeCreationTime,
  2013. },
  2014. Status: v1.NodeStatus{
  2015. Conditions: []v1.NodeCondition{
  2016. {
  2017. Type: v1.NodeReady,
  2018. Status: v1.ConditionTrue,
  2019. LastHeartbeatTime: fakeNow,
  2020. LastTransitionTime: fakeNow,
  2021. },
  2022. {
  2023. Type: v1.NodeDiskPressure,
  2024. Status: v1.ConditionFalse,
  2025. LastHeartbeatTime: fakeNow,
  2026. LastTransitionTime: fakeNow,
  2027. },
  2028. },
  2029. Capacity: v1.ResourceList{
  2030. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  2031. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  2032. },
  2033. },
  2034. },
  2035. },
  2036. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2037. },
  2038. lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)),
  2039. expectedRequestCount: 2, // List+List
  2040. timeToPass: time.Hour,
  2041. newNodeStatus: v1.NodeStatus{
  2042. // Node status is updated after 1 hour.
  2043. Conditions: []v1.NodeCondition{
  2044. {
  2045. Type: v1.NodeReady,
  2046. Status: v1.ConditionTrue,
  2047. LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  2048. LastTransitionTime: fakeNow,
  2049. },
  2050. {
  2051. Type: v1.NodeDiskPressure,
  2052. Status: v1.ConditionFalse,
  2053. LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  2054. LastTransitionTime: fakeNow,
  2055. },
  2056. },
  2057. Capacity: v1.ResourceList{
  2058. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  2059. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  2060. },
  2061. },
  2062. newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), // Lease is not renewed after 1 hour.
  2063. expectedNodes: []*v1.Node{
  2064. {
  2065. ObjectMeta: metav1.ObjectMeta{
  2066. Name: "node0",
  2067. CreationTimestamp: nodeCreationTime,
  2068. },
  2069. Status: v1.NodeStatus{
  2070. Conditions: []v1.NodeCondition{
  2071. {
  2072. Type: v1.NodeReady,
  2073. Status: v1.ConditionTrue,
  2074. LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  2075. LastTransitionTime: fakeNow,
  2076. },
  2077. {
  2078. Type: v1.NodeDiskPressure,
  2079. Status: v1.ConditionFalse,
  2080. LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  2081. LastTransitionTime: fakeNow,
  2082. },
  2083. },
  2084. Capacity: v1.ResourceList{
  2085. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  2086. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  2087. },
  2088. },
  2089. },
  2090. },
  2091. expectedPodStatusUpdate: false,
  2092. },
  2093. // Node created long time ago, with status updated by kubelet exceeds grace period. Node lease is also expired.
  2094. // Expect Unknown status posted from node controller.
  2095. {
  2096. description: "Node created long time ago, with status updated by kubelet exceeds grace period. Node lease is also expired.",
  2097. fakeNodeHandler: &testutil.FakeNodeHandler{
  2098. Existing: []*v1.Node{
  2099. {
  2100. ObjectMeta: metav1.ObjectMeta{
  2101. Name: "node0",
  2102. CreationTimestamp: nodeCreationTime,
  2103. },
  2104. Status: v1.NodeStatus{
  2105. Conditions: []v1.NodeCondition{
  2106. {
  2107. Type: v1.NodeReady,
  2108. Status: v1.ConditionTrue,
  2109. LastHeartbeatTime: fakeNow,
  2110. LastTransitionTime: fakeNow,
  2111. },
  2112. },
  2113. Capacity: v1.ResourceList{
  2114. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  2115. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  2116. },
  2117. },
  2118. },
  2119. },
  2120. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2121. },
  2122. lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)),
  2123. expectedRequestCount: 3, // List+List+Update
  2124. timeToPass: time.Hour,
  2125. newNodeStatus: v1.NodeStatus{
  2126. // Node status hasn't been updated for 1 hour.
  2127. Conditions: []v1.NodeCondition{
  2128. {
  2129. Type: v1.NodeReady,
  2130. Status: v1.ConditionTrue,
  2131. LastHeartbeatTime: fakeNow,
  2132. LastTransitionTime: fakeNow,
  2133. },
  2134. },
  2135. Capacity: v1.ResourceList{
  2136. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  2137. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  2138. },
  2139. },
  2140. newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), // Lease is not renewed after 1 hour.
  2141. expectedNodes: []*v1.Node{
  2142. {
  2143. ObjectMeta: metav1.ObjectMeta{
  2144. Name: "node0",
  2145. CreationTimestamp: nodeCreationTime,
  2146. },
  2147. Status: v1.NodeStatus{
  2148. Conditions: []v1.NodeCondition{
  2149. {
  2150. Type: v1.NodeReady,
  2151. Status: v1.ConditionUnknown,
  2152. Reason: "NodeStatusUnknown",
  2153. Message: "Kubelet stopped posting node status.",
  2154. LastHeartbeatTime: fakeNow,
  2155. LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  2156. },
  2157. {
  2158. Type: v1.NodeMemoryPressure,
  2159. Status: v1.ConditionUnknown,
  2160. Reason: "NodeStatusNeverUpdated",
  2161. Message: "Kubelet never posted node status.",
  2162. LastHeartbeatTime: nodeCreationTime, // should default to node creation time if condition was never updated
  2163. LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  2164. },
  2165. {
  2166. Type: v1.NodeDiskPressure,
  2167. Status: v1.ConditionUnknown,
  2168. Reason: "NodeStatusNeverUpdated",
  2169. Message: "Kubelet never posted node status.",
  2170. LastHeartbeatTime: nodeCreationTime, // should default to node creation time if condition was never updated
  2171. LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  2172. },
  2173. {
  2174. Type: v1.NodePIDPressure,
  2175. Status: v1.ConditionUnknown,
  2176. Reason: "NodeStatusNeverUpdated",
  2177. Message: "Kubelet never posted node status.",
  2178. LastHeartbeatTime: nodeCreationTime, // should default to node creation time if condition was never updated
  2179. LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)},
  2180. },
  2181. },
  2182. Capacity: v1.ResourceList{
  2183. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  2184. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  2185. },
  2186. },
  2187. },
  2188. },
  2189. expectedPodStatusUpdate: true,
  2190. },
  2191. }
  2192. for _, item := range testcases {
  2193. t.Run(item.description, func(t *testing.T) {
  2194. nodeController, _ := newNodeLifecycleControllerFromClient(
  2195. item.fakeNodeHandler,
  2196. 5*time.Minute,
  2197. testRateLimiterQPS,
  2198. testRateLimiterQPS,
  2199. testLargeClusterThreshold,
  2200. testUnhealthyThreshold,
  2201. testNodeMonitorGracePeriod,
  2202. testNodeStartupGracePeriod,
  2203. testNodeMonitorPeriod,
  2204. false)
  2205. nodeController.now = func() metav1.Time { return fakeNow }
  2206. nodeController.recorder = testutil.NewFakeRecorder()
  2207. nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset)
  2208. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  2209. t.Fatalf("unexpected error: %v", err)
  2210. }
  2211. if err := nodeController.syncLeaseStore(item.lease); err != nil {
  2212. t.Fatalf("unexpected error: %v", err)
  2213. }
  2214. if err := nodeController.monitorNodeHealth(); err != nil {
  2215. t.Fatalf("unexpected error: %v", err)
  2216. }
  2217. if item.timeToPass > 0 {
  2218. nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} }
  2219. item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus
  2220. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  2221. t.Fatalf("unexpected error: %v", err)
  2222. }
  2223. if err := nodeController.syncLeaseStore(item.newLease); err != nil {
  2224. t.Fatalf("unexpected error: %v", err)
  2225. }
  2226. if err := nodeController.monitorNodeHealth(); err != nil {
  2227. t.Fatalf("unexpected error: %v", err)
  2228. }
  2229. }
  2230. if item.expectedRequestCount != item.fakeNodeHandler.RequestCount {
  2231. t.Errorf("expected %v call, but got %v.", item.expectedRequestCount, item.fakeNodeHandler.RequestCount)
  2232. }
  2233. if len(item.fakeNodeHandler.UpdatedNodes) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, item.fakeNodeHandler.UpdatedNodes) {
  2234. t.Errorf("unexpected nodes: %s", diff.ObjectDiff(item.expectedNodes[0], item.fakeNodeHandler.UpdatedNodes[0]))
  2235. }
  2236. if len(item.fakeNodeHandler.UpdatedNodeStatuses) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, item.fakeNodeHandler.UpdatedNodeStatuses) {
  2237. t.Errorf("unexpected nodes: %s", diff.ObjectDiff(item.expectedNodes[0], item.fakeNodeHandler.UpdatedNodeStatuses[0]))
  2238. }
  2239. podStatusUpdated := false
  2240. for _, action := range item.fakeNodeHandler.Actions() {
  2241. if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" {
  2242. podStatusUpdated = true
  2243. }
  2244. }
  2245. if podStatusUpdated != item.expectedPodStatusUpdate {
  2246. t.Errorf("expect pod status updated to be %v, but got %v", item.expectedPodStatusUpdate, podStatusUpdated)
  2247. }
  2248. })
  2249. }
  2250. }
  2251. func TestMonitorNodeHealthMarkPodsNotReady(t *testing.T) {
  2252. fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
  2253. table := []struct {
  2254. fakeNodeHandler *testutil.FakeNodeHandler
  2255. timeToPass time.Duration
  2256. newNodeStatus v1.NodeStatus
  2257. expectedPodStatusUpdate bool
  2258. }{
  2259. // Node created recently, without status.
  2260. // Expect no action from node controller (within startup grace period).
  2261. {
  2262. fakeNodeHandler: &testutil.FakeNodeHandler{
  2263. Existing: []*v1.Node{
  2264. {
  2265. ObjectMeta: metav1.ObjectMeta{
  2266. Name: "node0",
  2267. CreationTimestamp: fakeNow,
  2268. },
  2269. },
  2270. },
  2271. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2272. },
  2273. expectedPodStatusUpdate: false,
  2274. },
  2275. // Node created long time ago, with status updated recently.
  2276. // Expect no action from node controller (within monitor grace period).
  2277. {
  2278. fakeNodeHandler: &testutil.FakeNodeHandler{
  2279. Existing: []*v1.Node{
  2280. {
  2281. ObjectMeta: metav1.ObjectMeta{
  2282. Name: "node0",
  2283. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2284. },
  2285. Status: v1.NodeStatus{
  2286. Conditions: []v1.NodeCondition{
  2287. {
  2288. Type: v1.NodeReady,
  2289. Status: v1.ConditionTrue,
  2290. // Node status has just been updated.
  2291. LastHeartbeatTime: fakeNow,
  2292. LastTransitionTime: fakeNow,
  2293. },
  2294. },
  2295. Capacity: v1.ResourceList{
  2296. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  2297. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  2298. },
  2299. },
  2300. },
  2301. },
  2302. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2303. },
  2304. expectedPodStatusUpdate: false,
  2305. },
  2306. // Node created long time ago, with status updated by kubelet exceeds grace period.
  2307. // Expect pods status updated and Unknown node status posted from node controller
  2308. {
  2309. fakeNodeHandler: &testutil.FakeNodeHandler{
  2310. Existing: []*v1.Node{
  2311. {
  2312. ObjectMeta: metav1.ObjectMeta{
  2313. Name: "node0",
  2314. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2315. },
  2316. Status: v1.NodeStatus{
  2317. Conditions: []v1.NodeCondition{
  2318. {
  2319. Type: v1.NodeReady,
  2320. Status: v1.ConditionTrue,
  2321. // Node status hasn't been updated for 1hr.
  2322. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2323. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2324. },
  2325. },
  2326. Capacity: v1.ResourceList{
  2327. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  2328. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  2329. },
  2330. },
  2331. },
  2332. },
  2333. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2334. },
  2335. timeToPass: 1 * time.Minute,
  2336. newNodeStatus: v1.NodeStatus{
  2337. Conditions: []v1.NodeCondition{
  2338. {
  2339. Type: v1.NodeReady,
  2340. Status: v1.ConditionTrue,
  2341. // Node status hasn't been updated for 1hr.
  2342. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2343. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2344. },
  2345. },
  2346. Capacity: v1.ResourceList{
  2347. v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
  2348. v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
  2349. },
  2350. },
  2351. expectedPodStatusUpdate: true,
  2352. },
  2353. }
  2354. for i, item := range table {
  2355. nodeController, _ := newNodeLifecycleControllerFromClient(
  2356. item.fakeNodeHandler,
  2357. 5*time.Minute,
  2358. testRateLimiterQPS,
  2359. testRateLimiterQPS,
  2360. testLargeClusterThreshold,
  2361. testUnhealthyThreshold,
  2362. testNodeMonitorGracePeriod,
  2363. testNodeStartupGracePeriod,
  2364. testNodeMonitorPeriod,
  2365. false)
  2366. nodeController.now = func() metav1.Time { return fakeNow }
  2367. nodeController.recorder = testutil.NewFakeRecorder()
  2368. nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset)
  2369. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  2370. t.Errorf("unexpected error: %v", err)
  2371. }
  2372. if err := nodeController.monitorNodeHealth(); err != nil {
  2373. t.Errorf("Case[%d] unexpected error: %v", i, err)
  2374. }
  2375. if item.timeToPass > 0 {
  2376. nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} }
  2377. item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus
  2378. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  2379. t.Errorf("unexpected error: %v", err)
  2380. }
  2381. if err := nodeController.monitorNodeHealth(); err != nil {
  2382. t.Errorf("Case[%d] unexpected error: %v", i, err)
  2383. }
  2384. }
  2385. podStatusUpdated := false
  2386. for _, action := range item.fakeNodeHandler.Actions() {
  2387. if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" {
  2388. podStatusUpdated = true
  2389. }
  2390. }
  2391. if podStatusUpdated != item.expectedPodStatusUpdate {
  2392. t.Errorf("Case[%d] expect pod status updated to be %v, but got %v", i, item.expectedPodStatusUpdate, podStatusUpdated)
  2393. }
  2394. }
  2395. }
  2396. func TestMonitorNodeHealthMarkPodsNotReadyRetry(t *testing.T) {
  2397. type nodeIteration struct {
  2398. timeToPass time.Duration
  2399. newNodes []*v1.Node
  2400. }
  2401. timeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
  2402. timePlusTwoMinutes := metav1.Date(2015, 1, 1, 12, 0, 2, 0, time.UTC)
  2403. makeNodes := func(status v1.ConditionStatus, lastHeartbeatTime, lastTransitionTime metav1.Time) []*v1.Node {
  2404. return []*v1.Node{
  2405. {
  2406. ObjectMeta: metav1.ObjectMeta{
  2407. Name: "node0",
  2408. CreationTimestamp: timeNow,
  2409. },
  2410. Status: v1.NodeStatus{
  2411. Conditions: []v1.NodeCondition{
  2412. {
  2413. Type: v1.NodeReady,
  2414. Status: status,
  2415. LastHeartbeatTime: lastHeartbeatTime,
  2416. LastTransitionTime: lastTransitionTime,
  2417. },
  2418. },
  2419. },
  2420. },
  2421. }
  2422. }
  2423. table := []struct {
  2424. desc string
  2425. fakeNodeHandler *testutil.FakeNodeHandler
  2426. updateReactor func(action testcore.Action) (bool, runtime.Object, error)
  2427. fakeGetPodsAssignedToNode func(c *fake.Clientset) func(string) ([]*v1.Pod, error)
  2428. nodeIterations []nodeIteration
  2429. expectedPodStatusUpdates int
  2430. }{
  2431. // Node created long time ago, with status updated by kubelet exceeds grace period.
  2432. // First monitorNodeHealth check will update pod status to NotReady.
  2433. // Second monitorNodeHealth check will do no updates (no retry).
  2434. {
  2435. desc: "successful pod status update, no retry required",
  2436. fakeNodeHandler: &testutil.FakeNodeHandler{
  2437. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2438. },
  2439. fakeGetPodsAssignedToNode: fakeGetPodsAssignedToNode,
  2440. nodeIterations: []nodeIteration{
  2441. {
  2442. timeToPass: 0,
  2443. newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow),
  2444. },
  2445. {
  2446. timeToPass: 1 * time.Minute,
  2447. newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow),
  2448. },
  2449. {
  2450. timeToPass: 1 * time.Minute,
  2451. newNodes: makeNodes(v1.ConditionFalse, timePlusTwoMinutes, timePlusTwoMinutes),
  2452. },
  2453. },
  2454. expectedPodStatusUpdates: 1,
  2455. },
  2456. // Node created long time ago, with status updated by kubelet exceeds grace period.
  2457. // First monitorNodeHealth check will fail to update pod status to NotReady.
  2458. // Second monitorNodeHealth check will update pod status to NotReady (retry).
  2459. {
  2460. desc: "unsuccessful pod status update, retry required",
  2461. fakeNodeHandler: &testutil.FakeNodeHandler{
  2462. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2463. },
  2464. updateReactor: func() func(action testcore.Action) (bool, runtime.Object, error) {
  2465. i := 0
  2466. return func(action testcore.Action) (bool, runtime.Object, error) {
  2467. if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" {
  2468. i++
  2469. switch i {
  2470. case 1:
  2471. return true, nil, fmt.Errorf("fake error")
  2472. default:
  2473. return true, testutil.NewPod("pod0", "node0"), nil
  2474. }
  2475. }
  2476. return true, nil, fmt.Errorf("unsupported action")
  2477. }
  2478. }(),
  2479. fakeGetPodsAssignedToNode: fakeGetPodsAssignedToNode,
  2480. nodeIterations: []nodeIteration{
  2481. {
  2482. timeToPass: 0,
  2483. newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow),
  2484. },
  2485. {
  2486. timeToPass: 1 * time.Minute,
  2487. newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow),
  2488. },
  2489. {
  2490. timeToPass: 1 * time.Minute,
  2491. newNodes: makeNodes(v1.ConditionFalse, timePlusTwoMinutes, timePlusTwoMinutes),
  2492. },
  2493. },
  2494. expectedPodStatusUpdates: 2, // One failed and one retry.
  2495. },
  2496. // Node created long time ago, with status updated by kubelet exceeds grace period.
  2497. // First monitorNodeHealth check will fail to list pods.
  2498. // Second monitorNodeHealth check will update pod status to NotReady (retry).
  2499. {
  2500. desc: "unsuccessful pod list, retry required",
  2501. fakeNodeHandler: &testutil.FakeNodeHandler{
  2502. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2503. },
  2504. fakeGetPodsAssignedToNode: func(c *fake.Clientset) func(string) ([]*v1.Pod, error) {
  2505. i := 0
  2506. f := fakeGetPodsAssignedToNode(c)
  2507. return func(nodeName string) ([]*v1.Pod, error) {
  2508. i++
  2509. if i == 1 {
  2510. return nil, fmt.Errorf("fake error")
  2511. }
  2512. return f(nodeName)
  2513. }
  2514. },
  2515. nodeIterations: []nodeIteration{
  2516. {
  2517. timeToPass: 0,
  2518. newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow),
  2519. },
  2520. {
  2521. timeToPass: 1 * time.Minute,
  2522. newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow),
  2523. },
  2524. {
  2525. timeToPass: 1 * time.Minute,
  2526. newNodes: makeNodes(v1.ConditionFalse, timePlusTwoMinutes, timePlusTwoMinutes),
  2527. },
  2528. },
  2529. expectedPodStatusUpdates: 1,
  2530. },
  2531. }
  2532. for _, item := range table {
  2533. t.Run(item.desc, func(t *testing.T) {
  2534. nodeController, _ := newNodeLifecycleControllerFromClient(
  2535. item.fakeNodeHandler,
  2536. 5*time.Minute,
  2537. testRateLimiterQPS,
  2538. testRateLimiterQPS,
  2539. testLargeClusterThreshold,
  2540. testUnhealthyThreshold,
  2541. testNodeMonitorGracePeriod,
  2542. testNodeStartupGracePeriod,
  2543. testNodeMonitorPeriod,
  2544. false)
  2545. if item.updateReactor != nil {
  2546. item.fakeNodeHandler.Clientset.PrependReactor("update", "pods", item.updateReactor)
  2547. }
  2548. nodeController.now = func() metav1.Time { return timeNow }
  2549. nodeController.recorder = testutil.NewFakeRecorder()
  2550. nodeController.getPodsAssignedToNode = item.fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset)
  2551. for _, itertion := range item.nodeIterations {
  2552. nodeController.now = func() metav1.Time { return metav1.Time{Time: timeNow.Add(itertion.timeToPass)} }
  2553. item.fakeNodeHandler.Existing = itertion.newNodes
  2554. if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil {
  2555. t.Errorf("unexpected error: %v", err)
  2556. }
  2557. if err := nodeController.monitorNodeHealth(); err != nil {
  2558. t.Errorf("unexpected error: %v", err)
  2559. }
  2560. }
  2561. podStatusUpdates := 0
  2562. for _, action := range item.fakeNodeHandler.Actions() {
  2563. if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" {
  2564. podStatusUpdates++
  2565. }
  2566. }
  2567. if podStatusUpdates != item.expectedPodStatusUpdates {
  2568. t.Errorf("expect pod status updated to happen %d times, but got %d", item.expectedPodStatusUpdates, podStatusUpdates)
  2569. }
  2570. })
  2571. }
  2572. }
  2573. // TestApplyNoExecuteTaints, ensures we just have a NoExecute taint applied to node.
  2574. // NodeController is just responsible for enqueuing the node to tainting queue from which taint manager picks up
  2575. // and evicts the pods on the node.
  2576. func TestApplyNoExecuteTaints(t *testing.T) {
  2577. fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
  2578. evictionTimeout := 10 * time.Minute
  2579. fakeNodeHandler := &testutil.FakeNodeHandler{
  2580. Existing: []*v1.Node{
  2581. // Unreachable Taint with effect 'NoExecute' should be applied to this node.
  2582. {
  2583. ObjectMeta: metav1.ObjectMeta{
  2584. Name: "node0",
  2585. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2586. Labels: map[string]string{
  2587. v1.LabelZoneRegionStable: "region1",
  2588. v1.LabelZoneFailureDomainStable: "zone1",
  2589. v1.LabelZoneRegion: "region1",
  2590. v1.LabelZoneFailureDomain: "zone1",
  2591. },
  2592. },
  2593. Status: v1.NodeStatus{
  2594. Conditions: []v1.NodeCondition{
  2595. {
  2596. Type: v1.NodeReady,
  2597. Status: v1.ConditionUnknown,
  2598. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2599. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2600. },
  2601. },
  2602. },
  2603. },
  2604. // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady
  2605. // we need second healthy node in tests.
  2606. {
  2607. ObjectMeta: metav1.ObjectMeta{
  2608. Name: "node1",
  2609. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2610. Labels: map[string]string{
  2611. v1.LabelZoneRegionStable: "region1",
  2612. v1.LabelZoneFailureDomainStable: "zone1",
  2613. v1.LabelZoneRegion: "region1",
  2614. v1.LabelZoneFailureDomain: "zone1",
  2615. },
  2616. },
  2617. Status: v1.NodeStatus{
  2618. Conditions: []v1.NodeCondition{
  2619. {
  2620. Type: v1.NodeReady,
  2621. Status: v1.ConditionTrue,
  2622. LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
  2623. LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
  2624. },
  2625. },
  2626. },
  2627. },
  2628. // NotReady Taint with NoExecute effect should be applied to this node.
  2629. {
  2630. ObjectMeta: metav1.ObjectMeta{
  2631. Name: "node2",
  2632. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2633. Labels: map[string]string{
  2634. v1.LabelZoneRegionStable: "region1",
  2635. v1.LabelZoneFailureDomainStable: "zone1",
  2636. v1.LabelZoneRegion: "region1",
  2637. v1.LabelZoneFailureDomain: "zone1",
  2638. },
  2639. },
  2640. Status: v1.NodeStatus{
  2641. Conditions: []v1.NodeCondition{
  2642. {
  2643. Type: v1.NodeReady,
  2644. Status: v1.ConditionFalse,
  2645. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2646. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2647. },
  2648. },
  2649. },
  2650. },
  2651. },
  2652. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2653. }
  2654. healthyNodeNewStatus := v1.NodeStatus{
  2655. Conditions: []v1.NodeCondition{
  2656. {
  2657. Type: v1.NodeReady,
  2658. Status: v1.ConditionTrue,
  2659. LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 10, 0, 0, time.UTC),
  2660. LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
  2661. },
  2662. },
  2663. }
  2664. originalTaint := UnreachableTaintTemplate
  2665. nodeController, _ := newNodeLifecycleControllerFromClient(
  2666. fakeNodeHandler,
  2667. evictionTimeout,
  2668. testRateLimiterQPS,
  2669. testRateLimiterQPS,
  2670. testLargeClusterThreshold,
  2671. testUnhealthyThreshold,
  2672. testNodeMonitorGracePeriod,
  2673. testNodeStartupGracePeriod,
  2674. testNodeMonitorPeriod,
  2675. true)
  2676. nodeController.now = func() metav1.Time { return fakeNow }
  2677. nodeController.recorder = testutil.NewFakeRecorder()
  2678. nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset)
  2679. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  2680. t.Errorf("unexpected error: %v", err)
  2681. }
  2682. if err := nodeController.monitorNodeHealth(); err != nil {
  2683. t.Errorf("unexpected error: %v", err)
  2684. }
  2685. nodeController.doNoExecuteTaintingPass()
  2686. node0, err := fakeNodeHandler.Get(context.TODO(), "node0", metav1.GetOptions{})
  2687. if err != nil {
  2688. t.Errorf("Can't get current node0...")
  2689. return
  2690. }
  2691. if !taintutils.TaintExists(node0.Spec.Taints, UnreachableTaintTemplate) {
  2692. t.Errorf("Can't find taint %v in %v", originalTaint, node0.Spec.Taints)
  2693. }
  2694. node2, err := fakeNodeHandler.Get(context.TODO(), "node2", metav1.GetOptions{})
  2695. if err != nil {
  2696. t.Errorf("Can't get current node2...")
  2697. return
  2698. }
  2699. if !taintutils.TaintExists(node2.Spec.Taints, NotReadyTaintTemplate) {
  2700. t.Errorf("Can't find taint %v in %v", NotReadyTaintTemplate, node2.Spec.Taints)
  2701. }
  2702. // Make node3 healthy again.
  2703. node2.Status = healthyNodeNewStatus
  2704. _, err = fakeNodeHandler.UpdateStatus(context.TODO(), node2, metav1.UpdateOptions{})
  2705. if err != nil {
  2706. t.Errorf(err.Error())
  2707. return
  2708. }
  2709. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  2710. t.Errorf("unexpected error: %v", err)
  2711. }
  2712. if err := nodeController.monitorNodeHealth(); err != nil {
  2713. t.Errorf("unexpected error: %v", err)
  2714. }
  2715. nodeController.doNoExecuteTaintingPass()
  2716. node2, err = fakeNodeHandler.Get(context.TODO(), "node2", metav1.GetOptions{})
  2717. if err != nil {
  2718. t.Errorf("Can't get current node2...")
  2719. return
  2720. }
  2721. // We should not see any taint on the node(especially the Not-Ready taint with NoExecute effect).
  2722. if taintutils.TaintExists(node2.Spec.Taints, NotReadyTaintTemplate) || len(node2.Spec.Taints) > 0 {
  2723. t.Errorf("Found taint %v in %v, which should not be present", NotReadyTaintTemplate, node2.Spec.Taints)
  2724. }
  2725. }
  2726. func TestSwapUnreachableNotReadyTaints(t *testing.T) {
  2727. fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
  2728. evictionTimeout := 10 * time.Minute
  2729. fakeNodeHandler := &testutil.FakeNodeHandler{
  2730. Existing: []*v1.Node{
  2731. {
  2732. ObjectMeta: metav1.ObjectMeta{
  2733. Name: "node0",
  2734. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2735. Labels: map[string]string{
  2736. v1.LabelZoneRegionStable: "region1",
  2737. v1.LabelZoneFailureDomainStable: "zone1",
  2738. v1.LabelZoneRegion: "region1",
  2739. v1.LabelZoneFailureDomain: "zone1",
  2740. },
  2741. },
  2742. Status: v1.NodeStatus{
  2743. Conditions: []v1.NodeCondition{
  2744. {
  2745. Type: v1.NodeReady,
  2746. Status: v1.ConditionUnknown,
  2747. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2748. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2749. },
  2750. },
  2751. },
  2752. },
  2753. // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady
  2754. // we need second healthy node in tests. Because of how the tests are written we need to update
  2755. // the status of this Node.
  2756. {
  2757. ObjectMeta: metav1.ObjectMeta{
  2758. Name: "node1",
  2759. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2760. Labels: map[string]string{
  2761. v1.LabelZoneRegionStable: "region1",
  2762. v1.LabelZoneFailureDomainStable: "zone1",
  2763. v1.LabelZoneRegion: "region1",
  2764. v1.LabelZoneFailureDomain: "zone1",
  2765. },
  2766. },
  2767. Status: v1.NodeStatus{
  2768. Conditions: []v1.NodeCondition{
  2769. {
  2770. Type: v1.NodeReady,
  2771. Status: v1.ConditionTrue,
  2772. LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
  2773. LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
  2774. },
  2775. },
  2776. },
  2777. },
  2778. },
  2779. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2780. }
  2781. timeToPass := evictionTimeout
  2782. newNodeStatus := v1.NodeStatus{
  2783. Conditions: []v1.NodeCondition{
  2784. {
  2785. Type: v1.NodeReady,
  2786. Status: v1.ConditionFalse,
  2787. // Node status has just been updated, and is NotReady for 10min.
  2788. LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 9, 0, 0, time.UTC),
  2789. LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
  2790. },
  2791. },
  2792. }
  2793. healthyNodeNewStatus := v1.NodeStatus{
  2794. Conditions: []v1.NodeCondition{
  2795. {
  2796. Type: v1.NodeReady,
  2797. Status: v1.ConditionTrue,
  2798. LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 10, 0, 0, time.UTC),
  2799. LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
  2800. },
  2801. },
  2802. }
  2803. originalTaint := UnreachableTaintTemplate
  2804. updatedTaint := NotReadyTaintTemplate
  2805. nodeController, _ := newNodeLifecycleControllerFromClient(
  2806. fakeNodeHandler,
  2807. evictionTimeout,
  2808. testRateLimiterQPS,
  2809. testRateLimiterQPS,
  2810. testLargeClusterThreshold,
  2811. testUnhealthyThreshold,
  2812. testNodeMonitorGracePeriod,
  2813. testNodeStartupGracePeriod,
  2814. testNodeMonitorPeriod,
  2815. true)
  2816. nodeController.now = func() metav1.Time { return fakeNow }
  2817. nodeController.recorder = testutil.NewFakeRecorder()
  2818. nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset)
  2819. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  2820. t.Errorf("unexpected error: %v", err)
  2821. }
  2822. if err := nodeController.monitorNodeHealth(); err != nil {
  2823. t.Errorf("unexpected error: %v", err)
  2824. }
  2825. nodeController.doNoExecuteTaintingPass()
  2826. node0, err := fakeNodeHandler.Get(context.TODO(), "node0", metav1.GetOptions{})
  2827. if err != nil {
  2828. t.Errorf("Can't get current node0...")
  2829. return
  2830. }
  2831. node1, err := fakeNodeHandler.Get(context.TODO(), "node1", metav1.GetOptions{})
  2832. if err != nil {
  2833. t.Errorf("Can't get current node1...")
  2834. return
  2835. }
  2836. if originalTaint != nil && !taintutils.TaintExists(node0.Spec.Taints, originalTaint) {
  2837. t.Errorf("Can't find taint %v in %v", originalTaint, node0.Spec.Taints)
  2838. }
  2839. nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(timeToPass)} }
  2840. node0.Status = newNodeStatus
  2841. node1.Status = healthyNodeNewStatus
  2842. _, err = fakeNodeHandler.UpdateStatus(context.TODO(), node0, metav1.UpdateOptions{})
  2843. if err != nil {
  2844. t.Errorf(err.Error())
  2845. return
  2846. }
  2847. _, err = fakeNodeHandler.UpdateStatus(context.TODO(), node1, metav1.UpdateOptions{})
  2848. if err != nil {
  2849. t.Errorf(err.Error())
  2850. return
  2851. }
  2852. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  2853. t.Errorf("unexpected error: %v", err)
  2854. }
  2855. if err := nodeController.monitorNodeHealth(); err != nil {
  2856. t.Errorf("unexpected error: %v", err)
  2857. }
  2858. nodeController.doNoExecuteTaintingPass()
  2859. node0, err = fakeNodeHandler.Get(context.TODO(), "node0", metav1.GetOptions{})
  2860. if err != nil {
  2861. t.Errorf("Can't get current node0...")
  2862. return
  2863. }
  2864. if updatedTaint != nil {
  2865. if !taintutils.TaintExists(node0.Spec.Taints, updatedTaint) {
  2866. t.Errorf("Can't find taint %v in %v", updatedTaint, node0.Spec.Taints)
  2867. }
  2868. }
  2869. }
  2870. func TestTaintsNodeByCondition(t *testing.T) {
  2871. fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
  2872. evictionTimeout := 10 * time.Minute
  2873. fakeNodeHandler := &testutil.FakeNodeHandler{
  2874. Existing: []*v1.Node{
  2875. {
  2876. ObjectMeta: metav1.ObjectMeta{
  2877. Name: "node0",
  2878. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2879. Labels: map[string]string{
  2880. v1.LabelZoneRegionStable: "region1",
  2881. v1.LabelZoneFailureDomainStable: "zone1",
  2882. v1.LabelZoneRegion: "region1",
  2883. v1.LabelZoneFailureDomain: "zone1",
  2884. },
  2885. },
  2886. Status: v1.NodeStatus{
  2887. Conditions: []v1.NodeCondition{
  2888. {
  2889. Type: v1.NodeReady,
  2890. Status: v1.ConditionTrue,
  2891. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2892. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2893. },
  2894. },
  2895. },
  2896. },
  2897. },
  2898. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  2899. }
  2900. nodeController, _ := newNodeLifecycleControllerFromClient(
  2901. fakeNodeHandler,
  2902. evictionTimeout,
  2903. testRateLimiterQPS,
  2904. testRateLimiterQPS,
  2905. testLargeClusterThreshold,
  2906. testUnhealthyThreshold,
  2907. testNodeMonitorGracePeriod,
  2908. testNodeStartupGracePeriod,
  2909. testNodeMonitorPeriod,
  2910. true)
  2911. nodeController.now = func() metav1.Time { return fakeNow }
  2912. nodeController.recorder = testutil.NewFakeRecorder()
  2913. nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset)
  2914. networkUnavailableTaint := &v1.Taint{
  2915. Key: v1.TaintNodeNetworkUnavailable,
  2916. Effect: v1.TaintEffectNoSchedule,
  2917. }
  2918. notReadyTaint := &v1.Taint{
  2919. Key: v1.TaintNodeNotReady,
  2920. Effect: v1.TaintEffectNoSchedule,
  2921. }
  2922. unreachableTaint := &v1.Taint{
  2923. Key: v1.TaintNodeUnreachable,
  2924. Effect: v1.TaintEffectNoSchedule,
  2925. }
  2926. tests := []struct {
  2927. Name string
  2928. Node *v1.Node
  2929. ExpectedTaints []*v1.Taint
  2930. }{
  2931. {
  2932. Name: "NetworkUnavailable is true",
  2933. Node: &v1.Node{
  2934. ObjectMeta: metav1.ObjectMeta{
  2935. Name: "node0",
  2936. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2937. Labels: map[string]string{
  2938. v1.LabelZoneRegionStable: "region1",
  2939. v1.LabelZoneFailureDomainStable: "zone1",
  2940. v1.LabelZoneRegion: "region1",
  2941. v1.LabelZoneFailureDomain: "zone1",
  2942. },
  2943. },
  2944. Status: v1.NodeStatus{
  2945. Conditions: []v1.NodeCondition{
  2946. {
  2947. Type: v1.NodeReady,
  2948. Status: v1.ConditionTrue,
  2949. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2950. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2951. },
  2952. {
  2953. Type: v1.NodeNetworkUnavailable,
  2954. Status: v1.ConditionTrue,
  2955. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2956. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2957. },
  2958. },
  2959. },
  2960. },
  2961. ExpectedTaints: []*v1.Taint{networkUnavailableTaint},
  2962. },
  2963. {
  2964. Name: "NetworkUnavailable is true",
  2965. Node: &v1.Node{
  2966. ObjectMeta: metav1.ObjectMeta{
  2967. Name: "node0",
  2968. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  2969. Labels: map[string]string{
  2970. v1.LabelZoneRegionStable: "region1",
  2971. v1.LabelZoneFailureDomainStable: "zone1",
  2972. v1.LabelZoneRegion: "region1",
  2973. v1.LabelZoneFailureDomain: "zone1",
  2974. },
  2975. },
  2976. Status: v1.NodeStatus{
  2977. Conditions: []v1.NodeCondition{
  2978. {
  2979. Type: v1.NodeReady,
  2980. Status: v1.ConditionTrue,
  2981. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2982. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2983. },
  2984. {
  2985. Type: v1.NodeNetworkUnavailable,
  2986. Status: v1.ConditionTrue,
  2987. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2988. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  2989. },
  2990. },
  2991. },
  2992. },
  2993. ExpectedTaints: []*v1.Taint{networkUnavailableTaint},
  2994. },
  2995. {
  2996. Name: "Ready is false",
  2997. Node: &v1.Node{
  2998. ObjectMeta: metav1.ObjectMeta{
  2999. Name: "node0",
  3000. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  3001. Labels: map[string]string{
  3002. v1.LabelZoneRegionStable: "region1",
  3003. v1.LabelZoneFailureDomainStable: "zone1",
  3004. v1.LabelZoneRegion: "region1",
  3005. v1.LabelZoneFailureDomain: "zone1",
  3006. },
  3007. },
  3008. Status: v1.NodeStatus{
  3009. Conditions: []v1.NodeCondition{
  3010. {
  3011. Type: v1.NodeReady,
  3012. Status: v1.ConditionFalse,
  3013. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  3014. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  3015. },
  3016. },
  3017. },
  3018. },
  3019. ExpectedTaints: []*v1.Taint{notReadyTaint},
  3020. },
  3021. {
  3022. Name: "Ready is unknown",
  3023. Node: &v1.Node{
  3024. ObjectMeta: metav1.ObjectMeta{
  3025. Name: "node0",
  3026. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  3027. Labels: map[string]string{
  3028. v1.LabelZoneRegionStable: "region1",
  3029. v1.LabelZoneFailureDomainStable: "zone1",
  3030. v1.LabelZoneRegion: "region1",
  3031. v1.LabelZoneFailureDomain: "zone1",
  3032. },
  3033. },
  3034. Status: v1.NodeStatus{
  3035. Conditions: []v1.NodeCondition{
  3036. {
  3037. Type: v1.NodeReady,
  3038. Status: v1.ConditionUnknown,
  3039. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  3040. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  3041. },
  3042. },
  3043. },
  3044. },
  3045. ExpectedTaints: []*v1.Taint{unreachableTaint},
  3046. },
  3047. }
  3048. for _, test := range tests {
  3049. fakeNodeHandler.Update(context.TODO(), test.Node, metav1.UpdateOptions{})
  3050. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  3051. t.Errorf("unexpected error: %v", err)
  3052. }
  3053. nodeController.doNoScheduleTaintingPass(test.Node.Name)
  3054. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  3055. t.Errorf("unexpected error: %v", err)
  3056. }
  3057. node0, err := nodeController.nodeLister.Get("node0")
  3058. if err != nil {
  3059. t.Errorf("Can't get current node0...")
  3060. return
  3061. }
  3062. if len(node0.Spec.Taints) != len(test.ExpectedTaints) {
  3063. t.Errorf("%s: Unexpected number of taints: expected %d, got %d",
  3064. test.Name, len(test.ExpectedTaints), len(node0.Spec.Taints))
  3065. }
  3066. for _, taint := range test.ExpectedTaints {
  3067. if !taintutils.TaintExists(node0.Spec.Taints, taint) {
  3068. t.Errorf("%s: Can't find taint %v in %v", test.Name, taint, node0.Spec.Taints)
  3069. }
  3070. }
  3071. }
  3072. }
  3073. func TestNodeEventGeneration(t *testing.T) {
  3074. fakeNow := metav1.Date(2016, 9, 10, 12, 0, 0, 0, time.UTC)
  3075. fakeNodeHandler := &testutil.FakeNodeHandler{
  3076. Existing: []*v1.Node{
  3077. {
  3078. ObjectMeta: metav1.ObjectMeta{
  3079. Name: "node0",
  3080. UID: "1234567890",
  3081. CreationTimestamp: metav1.Date(2015, 8, 10, 0, 0, 0, 0, time.UTC),
  3082. },
  3083. Status: v1.NodeStatus{
  3084. Conditions: []v1.NodeCondition{
  3085. {
  3086. Type: v1.NodeReady,
  3087. Status: v1.ConditionUnknown,
  3088. LastHeartbeatTime: metav1.Date(2015, 8, 10, 0, 0, 0, 0, time.UTC),
  3089. LastTransitionTime: metav1.Date(2015, 8, 10, 0, 0, 0, 0, time.UTC),
  3090. },
  3091. },
  3092. },
  3093. },
  3094. },
  3095. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  3096. }
  3097. nodeController, _ := newNodeLifecycleControllerFromClient(
  3098. fakeNodeHandler,
  3099. 5*time.Minute,
  3100. testRateLimiterQPS,
  3101. testRateLimiterQPS,
  3102. testLargeClusterThreshold,
  3103. testUnhealthyThreshold,
  3104. testNodeMonitorGracePeriod,
  3105. testNodeStartupGracePeriod,
  3106. testNodeMonitorPeriod,
  3107. false)
  3108. nodeController.now = func() metav1.Time { return fakeNow }
  3109. fakeRecorder := testutil.NewFakeRecorder()
  3110. nodeController.recorder = fakeRecorder
  3111. nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset)
  3112. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  3113. t.Errorf("unexpected error: %v", err)
  3114. }
  3115. if err := nodeController.monitorNodeHealth(); err != nil {
  3116. t.Errorf("unexpected error: %v", err)
  3117. }
  3118. if len(fakeRecorder.Events) != 1 {
  3119. t.Fatalf("unexpected events, got %v, expected %v: %+v", len(fakeRecorder.Events), 1, fakeRecorder.Events)
  3120. }
  3121. if fakeRecorder.Events[0].Reason != "RegisteredNode" {
  3122. var reasons []string
  3123. for _, event := range fakeRecorder.Events {
  3124. reasons = append(reasons, event.Reason)
  3125. }
  3126. t.Fatalf("unexpected events generation: %v", strings.Join(reasons, ","))
  3127. }
  3128. for _, event := range fakeRecorder.Events {
  3129. involvedObject := event.InvolvedObject
  3130. actualUID := string(involvedObject.UID)
  3131. if actualUID != "1234567890" {
  3132. t.Fatalf("unexpected event uid: %v", actualUID)
  3133. }
  3134. }
  3135. }
  3136. func TestReconcileNodeLabels(t *testing.T) {
  3137. fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
  3138. evictionTimeout := 10 * time.Minute
  3139. fakeNodeHandler := &testutil.FakeNodeHandler{
  3140. Existing: []*v1.Node{
  3141. {
  3142. ObjectMeta: metav1.ObjectMeta{
  3143. Name: "node0",
  3144. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  3145. Labels: map[string]string{
  3146. v1.LabelZoneRegionStable: "region1",
  3147. v1.LabelZoneFailureDomainStable: "zone1",
  3148. v1.LabelZoneRegion: "region1",
  3149. v1.LabelZoneFailureDomain: "zone1",
  3150. },
  3151. },
  3152. Status: v1.NodeStatus{
  3153. Conditions: []v1.NodeCondition{
  3154. {
  3155. Type: v1.NodeReady,
  3156. Status: v1.ConditionTrue,
  3157. LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  3158. LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
  3159. },
  3160. },
  3161. },
  3162. },
  3163. },
  3164. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  3165. }
  3166. nodeController, _ := newNodeLifecycleControllerFromClient(
  3167. fakeNodeHandler,
  3168. evictionTimeout,
  3169. testRateLimiterQPS,
  3170. testRateLimiterQPS,
  3171. testLargeClusterThreshold,
  3172. testUnhealthyThreshold,
  3173. testNodeMonitorGracePeriod,
  3174. testNodeStartupGracePeriod,
  3175. testNodeMonitorPeriod,
  3176. true)
  3177. nodeController.now = func() metav1.Time { return fakeNow }
  3178. nodeController.recorder = testutil.NewFakeRecorder()
  3179. nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset)
  3180. tests := []struct {
  3181. Name string
  3182. Node *v1.Node
  3183. ExpectedLabels map[string]string
  3184. }{
  3185. {
  3186. Name: "No-op if node has no labels",
  3187. Node: &v1.Node{
  3188. ObjectMeta: metav1.ObjectMeta{
  3189. Name: "node0",
  3190. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  3191. },
  3192. },
  3193. ExpectedLabels: nil,
  3194. },
  3195. {
  3196. Name: "No-op if no target labels present",
  3197. Node: &v1.Node{
  3198. ObjectMeta: metav1.ObjectMeta{
  3199. Name: "node0",
  3200. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  3201. Labels: map[string]string{
  3202. v1.LabelZoneRegionStable: "region1",
  3203. },
  3204. },
  3205. },
  3206. ExpectedLabels: map[string]string{
  3207. v1.LabelZoneRegionStable: "region1",
  3208. },
  3209. },
  3210. {
  3211. Name: "Create OS/arch stable labels when they don't exist",
  3212. Node: &v1.Node{
  3213. ObjectMeta: metav1.ObjectMeta{
  3214. Name: "node0",
  3215. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  3216. Labels: map[string]string{
  3217. kubeletapis.LabelOS: "linux",
  3218. kubeletapis.LabelArch: "amd64",
  3219. },
  3220. },
  3221. },
  3222. ExpectedLabels: map[string]string{
  3223. kubeletapis.LabelOS: "linux",
  3224. kubeletapis.LabelArch: "amd64",
  3225. v1.LabelOSStable: "linux",
  3226. v1.LabelArchStable: "amd64",
  3227. },
  3228. },
  3229. {
  3230. Name: "Reconcile OS/arch stable labels to match beta labels",
  3231. Node: &v1.Node{
  3232. ObjectMeta: metav1.ObjectMeta{
  3233. Name: "node0",
  3234. CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
  3235. Labels: map[string]string{
  3236. kubeletapis.LabelOS: "linux",
  3237. kubeletapis.LabelArch: "amd64",
  3238. v1.LabelOSStable: "windows",
  3239. v1.LabelArchStable: "arm",
  3240. },
  3241. },
  3242. },
  3243. ExpectedLabels: map[string]string{
  3244. kubeletapis.LabelOS: "linux",
  3245. kubeletapis.LabelArch: "amd64",
  3246. v1.LabelOSStable: "linux",
  3247. v1.LabelArchStable: "amd64",
  3248. },
  3249. },
  3250. }
  3251. for _, test := range tests {
  3252. fakeNodeHandler.Update(context.TODO(), test.Node, metav1.UpdateOptions{})
  3253. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  3254. t.Fatalf("unexpected error: %v", err)
  3255. }
  3256. nodeController.reconcileNodeLabels(test.Node.Name)
  3257. if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
  3258. t.Fatalf("unexpected error: %v", err)
  3259. }
  3260. node0, err := nodeController.nodeLister.Get("node0")
  3261. if err != nil {
  3262. t.Fatalf("Can't get current node0...")
  3263. }
  3264. if len(node0.Labels) != len(test.ExpectedLabels) {
  3265. t.Errorf("%s: Unexpected number of taints: expected %d, got %d",
  3266. test.Name, len(test.ExpectedLabels), len(node0.Labels))
  3267. }
  3268. for key, expectedValue := range test.ExpectedLabels {
  3269. actualValue, ok := node0.Labels[key]
  3270. if !ok {
  3271. t.Errorf("%s: Can't find label %v in %v", test.Name, key, node0.Labels)
  3272. }
  3273. if actualValue != expectedValue {
  3274. t.Errorf("%s: label %q: expected value %q, got value %q", test.Name, key, expectedValue, actualValue)
  3275. }
  3276. }
  3277. }
  3278. }
  3279. func TestTryUpdateNodeHealth(t *testing.T) {
  3280. fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
  3281. fakeOld := metav1.Date(2016, 1, 1, 12, 0, 0, 0, time.UTC)
  3282. evictionTimeout := 10 * time.Minute
  3283. fakeNodeHandler := &testutil.FakeNodeHandler{
  3284. Existing: []*v1.Node{
  3285. {
  3286. ObjectMeta: metav1.ObjectMeta{
  3287. Name: "node0",
  3288. CreationTimestamp: fakeNow,
  3289. },
  3290. Status: v1.NodeStatus{
  3291. Conditions: []v1.NodeCondition{
  3292. {
  3293. Type: v1.NodeReady,
  3294. Status: v1.ConditionTrue,
  3295. LastHeartbeatTime: fakeNow,
  3296. LastTransitionTime: fakeNow,
  3297. },
  3298. },
  3299. },
  3300. },
  3301. },
  3302. Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
  3303. }
  3304. nodeController, _ := newNodeLifecycleControllerFromClient(
  3305. fakeNodeHandler,
  3306. evictionTimeout,
  3307. testRateLimiterQPS,
  3308. testRateLimiterQPS,
  3309. testLargeClusterThreshold,
  3310. testUnhealthyThreshold,
  3311. testNodeMonitorGracePeriod,
  3312. testNodeStartupGracePeriod,
  3313. testNodeMonitorPeriod,
  3314. true)
  3315. nodeController.now = func() metav1.Time { return fakeNow }
  3316. nodeController.recorder = testutil.NewFakeRecorder()
  3317. nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset)
  3318. getStatus := func(cond *v1.NodeCondition) *v1.ConditionStatus {
  3319. if cond == nil {
  3320. return nil
  3321. }
  3322. return &cond.Status
  3323. }
  3324. tests := []struct {
  3325. name string
  3326. node *v1.Node
  3327. }{
  3328. {
  3329. name: "Status true",
  3330. node: &v1.Node{
  3331. ObjectMeta: metav1.ObjectMeta{
  3332. Name: "node0",
  3333. CreationTimestamp: fakeNow,
  3334. },
  3335. Status: v1.NodeStatus{
  3336. Conditions: []v1.NodeCondition{
  3337. {
  3338. Type: v1.NodeReady,
  3339. Status: v1.ConditionTrue,
  3340. LastHeartbeatTime: fakeNow,
  3341. LastTransitionTime: fakeNow,
  3342. },
  3343. },
  3344. },
  3345. },
  3346. },
  3347. {
  3348. name: "Status false",
  3349. node: &v1.Node{
  3350. ObjectMeta: metav1.ObjectMeta{
  3351. Name: "node0",
  3352. CreationTimestamp: fakeNow,
  3353. },
  3354. Status: v1.NodeStatus{
  3355. Conditions: []v1.NodeCondition{
  3356. {
  3357. Type: v1.NodeReady,
  3358. Status: v1.ConditionFalse,
  3359. LastHeartbeatTime: fakeNow,
  3360. LastTransitionTime: fakeNow,
  3361. },
  3362. },
  3363. },
  3364. },
  3365. },
  3366. {
  3367. name: "Status unknown",
  3368. node: &v1.Node{
  3369. ObjectMeta: metav1.ObjectMeta{
  3370. Name: "node0",
  3371. CreationTimestamp: fakeNow,
  3372. },
  3373. Status: v1.NodeStatus{
  3374. Conditions: []v1.NodeCondition{
  3375. {
  3376. Type: v1.NodeReady,
  3377. Status: v1.ConditionUnknown,
  3378. LastHeartbeatTime: fakeNow,
  3379. LastTransitionTime: fakeNow,
  3380. },
  3381. },
  3382. },
  3383. },
  3384. },
  3385. {
  3386. name: "Status nil",
  3387. node: &v1.Node{
  3388. ObjectMeta: metav1.ObjectMeta{
  3389. Name: "node0",
  3390. CreationTimestamp: fakeNow,
  3391. },
  3392. Status: v1.NodeStatus{
  3393. Conditions: []v1.NodeCondition{},
  3394. },
  3395. },
  3396. },
  3397. {
  3398. name: "Status true - after grace period",
  3399. node: &v1.Node{
  3400. ObjectMeta: metav1.ObjectMeta{
  3401. Name: "node0",
  3402. CreationTimestamp: fakeOld,
  3403. },
  3404. Status: v1.NodeStatus{
  3405. Conditions: []v1.NodeCondition{
  3406. {
  3407. Type: v1.NodeReady,
  3408. Status: v1.ConditionTrue,
  3409. LastHeartbeatTime: fakeOld,
  3410. LastTransitionTime: fakeOld,
  3411. },
  3412. },
  3413. },
  3414. },
  3415. },
  3416. {
  3417. name: "Status false - after grace period",
  3418. node: &v1.Node{
  3419. ObjectMeta: metav1.ObjectMeta{
  3420. Name: "node0",
  3421. CreationTimestamp: fakeOld,
  3422. },
  3423. Status: v1.NodeStatus{
  3424. Conditions: []v1.NodeCondition{
  3425. {
  3426. Type: v1.NodeReady,
  3427. Status: v1.ConditionFalse,
  3428. LastHeartbeatTime: fakeOld,
  3429. LastTransitionTime: fakeOld,
  3430. },
  3431. },
  3432. },
  3433. },
  3434. },
  3435. {
  3436. name: "Status unknown - after grace period",
  3437. node: &v1.Node{
  3438. ObjectMeta: metav1.ObjectMeta{
  3439. Name: "node0",
  3440. CreationTimestamp: fakeOld,
  3441. },
  3442. Status: v1.NodeStatus{
  3443. Conditions: []v1.NodeCondition{
  3444. {
  3445. Type: v1.NodeReady,
  3446. Status: v1.ConditionUnknown,
  3447. LastHeartbeatTime: fakeOld,
  3448. LastTransitionTime: fakeOld,
  3449. },
  3450. },
  3451. },
  3452. },
  3453. },
  3454. {
  3455. name: "Status nil - after grace period",
  3456. node: &v1.Node{
  3457. ObjectMeta: metav1.ObjectMeta{
  3458. Name: "node0",
  3459. CreationTimestamp: fakeOld,
  3460. },
  3461. Status: v1.NodeStatus{
  3462. Conditions: []v1.NodeCondition{},
  3463. },
  3464. },
  3465. },
  3466. }
  3467. for _, test := range tests {
  3468. t.Run(test.name, func(t *testing.T) {
  3469. nodeController.nodeHealthMap.set(test.node.Name, &nodeHealthData{
  3470. status: &test.node.Status,
  3471. probeTimestamp: test.node.CreationTimestamp,
  3472. readyTransitionTimestamp: test.node.CreationTimestamp,
  3473. })
  3474. _, _, currentReadyCondition, err := nodeController.tryUpdateNodeHealth(test.node)
  3475. if err != nil {
  3476. t.Fatalf("unexpected error: %v", err)
  3477. }
  3478. _, savedReadyCondition := nodeutil.GetNodeCondition(nodeController.nodeHealthMap.getDeepCopy(test.node.Name).status, v1.NodeReady)
  3479. savedStatus := getStatus(savedReadyCondition)
  3480. currentStatus := getStatus(currentReadyCondition)
  3481. if !apiequality.Semantic.DeepEqual(currentStatus, savedStatus) {
  3482. t.Errorf("expected %v, got %v", savedStatus, currentStatus)
  3483. }
  3484. })
  3485. }
  3486. }
  3487. func Test_isNodeExcludedFromDisruptionChecks(t *testing.T) {
  3488. validNodeStatus := v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: "Test"}}}
  3489. tests := []struct {
  3490. name string
  3491. enableExclusion bool
  3492. enableLegacy bool
  3493. input *v1.Node
  3494. want bool
  3495. }{
  3496. {want: false, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{}}}},
  3497. {want: false, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Name: "master-abc"}}},
  3498. {want: false, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{labelNodeDisruptionExclusion: ""}}}},
  3499. {want: false, enableExclusion: true, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Name: "master-abc"}}},
  3500. {want: false, enableLegacy: true, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{labelNodeDisruptionExclusion: ""}}}},
  3501. {want: true, enableLegacy: true, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Name: "master-abc"}}},
  3502. {want: true, enableExclusion: true, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{labelNodeDisruptionExclusion: ""}}}},
  3503. }
  3504. for _, tt := range tests {
  3505. t.Run(tt.name, func(t *testing.T) {
  3506. defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NodeDisruptionExclusion, tt.enableExclusion)()
  3507. defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.LegacyNodeRoleBehavior, tt.enableLegacy)()
  3508. if result := isNodeExcludedFromDisruptionChecks(tt.input); result != tt.want {
  3509. t.Errorf("isNodeExcludedFromDisruptionChecks() = %v, want %v", result, tt.want)
  3510. }
  3511. })
  3512. }
  3513. }