node_lifecycle_controller.go

  1. /*
  2. Copyright 2017 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. // The Controller applies taints to nodes.
  14. // Tainted nodes should not be used for new workloads, and
  15. // some effort should be made to move existing workloads
  16. // off of tainted nodes.
  17. package nodelifecycle
  18. import (
  19. "context"
  20. "fmt"
  21. "strings"
  22. "sync"
  23. "time"
  24. "k8s.io/klog"
  25. coordv1 "k8s.io/api/coordination/v1"
  26. v1 "k8s.io/api/core/v1"
  27. apiequality "k8s.io/apimachinery/pkg/api/equality"
  28. apierrors "k8s.io/apimachinery/pkg/api/errors"
  29. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  30. "k8s.io/apimachinery/pkg/labels"
  31. utilruntime "k8s.io/apimachinery/pkg/util/runtime"
  32. "k8s.io/apimachinery/pkg/util/wait"
  33. utilfeature "k8s.io/apiserver/pkg/util/feature"
  34. appsv1informers "k8s.io/client-go/informers/apps/v1"
  35. coordinformers "k8s.io/client-go/informers/coordination/v1"
  36. coreinformers "k8s.io/client-go/informers/core/v1"
  37. clientset "k8s.io/client-go/kubernetes"
  38. "k8s.io/client-go/kubernetes/scheme"
  39. v1core "k8s.io/client-go/kubernetes/typed/core/v1"
  40. appsv1listers "k8s.io/client-go/listers/apps/v1"
  41. coordlisters "k8s.io/client-go/listers/coordination/v1"
  42. corelisters "k8s.io/client-go/listers/core/v1"
  43. "k8s.io/client-go/tools/cache"
  44. "k8s.io/client-go/tools/record"
  45. "k8s.io/client-go/util/flowcontrol"
  46. "k8s.io/client-go/util/workqueue"
  47. "k8s.io/component-base/metrics/prometheus/ratelimiter"
  48. "k8s.io/kubernetes/pkg/controller"
  49. "k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler"
  50. nodeutil "k8s.io/kubernetes/pkg/controller/util/node"
  51. kubefeatures "k8s.io/kubernetes/pkg/features"
  52. kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
  53. utilnode "k8s.io/kubernetes/pkg/util/node"
  54. taintutils "k8s.io/kubernetes/pkg/util/taints"
  55. )
  56. func init() {
  57. // Register prometheus metrics
  58. Register()
  59. }
  60. var (
  61. // UnreachableTaintTemplate is the taint for when a node becomes unreachable.
  62. UnreachableTaintTemplate = &v1.Taint{
  63. Key: v1.TaintNodeUnreachable,
  64. Effect: v1.TaintEffectNoExecute,
  65. }
  66. // NotReadyTaintTemplate is the taint for when a node is not ready for
  67. // executing pods
  68. NotReadyTaintTemplate = &v1.Taint{
  69. Key: v1.TaintNodeNotReady,
  70. Effect: v1.TaintEffectNoExecute,
  71. }
  72. // map {NodeConditionType: {ConditionStatus: TaintKey}}
  73. // represents which NodeConditionType under which ConditionStatus should be
  74. // tainted with which TaintKey
  75. // for certain NodeConditionType, there are multiple {ConditionStatus,TaintKey} pairs
  76. nodeConditionToTaintKeyStatusMap = map[v1.NodeConditionType]map[v1.ConditionStatus]string{
  77. v1.NodeReady: {
  78. v1.ConditionFalse: v1.TaintNodeNotReady,
  79. v1.ConditionUnknown: v1.TaintNodeUnreachable,
  80. },
  81. v1.NodeMemoryPressure: {
  82. v1.ConditionTrue: v1.TaintNodeMemoryPressure,
  83. },
  84. v1.NodeDiskPressure: {
  85. v1.ConditionTrue: v1.TaintNodeDiskPressure,
  86. },
  87. v1.NodeNetworkUnavailable: {
  88. v1.ConditionTrue: v1.TaintNodeNetworkUnavailable,
  89. },
  90. v1.NodePIDPressure: {
  91. v1.ConditionTrue: v1.TaintNodePIDPressure,
  92. },
  93. }
  94. taintKeyToNodeConditionMap = map[string]v1.NodeConditionType{
  95. v1.TaintNodeNotReady: v1.NodeReady,
  96. v1.TaintNodeUnreachable: v1.NodeReady,
  97. v1.TaintNodeNetworkUnavailable: v1.NodeNetworkUnavailable,
  98. v1.TaintNodeMemoryPressure: v1.NodeMemoryPressure,
  99. v1.TaintNodeDiskPressure: v1.NodeDiskPressure,
  100. v1.TaintNodePIDPressure: v1.NodePIDPressure,
  101. }
  102. )
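// For illustration (not exhaustive): with the maps above, a node reporting
// Ready=False is associated with the node.kubernetes.io/not-ready taint key, while
// Ready=Unknown maps to node.kubernetes.io/unreachable; pressure conditions such as
// MemoryPressure=True map to their corresponding node.kubernetes.io/* taint keys.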
  103. // ZoneState is the state of a given zone.
  104. type ZoneState string
  105. const (
  106. stateInitial = ZoneState("Initial")
  107. stateNormal = ZoneState("Normal")
  108. stateFullDisruption = ZoneState("FullDisruption")
  109. statePartialDisruption = ZoneState("PartialDisruption")
  110. )
  111. const (
  112. // The amount of time the node controller should sleep between retrying node health updates.
  113. retrySleepTime = 20 * time.Millisecond
  114. nodeNameKeyIndex = "spec.nodeName"
  115. // podUpdateWorkerSize assumes that in most cases pods will be handled by the monitorNodeHealth pass.
  116. // Pod update workers only handle pods whose cache entries are lagging behind; 4 workers should be enough.
  117. podUpdateWorkerSize = 4
  118. )
  119. // labelReconcileInfo lists Node labels to reconcile, and how to reconcile them.
  120. // primaryKey and secondaryKey are the keys of the labels to reconcile.
  121. // - If both keys exist but their values don't match, the value of the
  122. // primaryKey is used as the source of truth for reconciliation.
  123. // - If ensureSecondaryExists is true and the secondaryKey does not
  124. // exist, the secondaryKey is added with the value of the primaryKey.
  125. var labelReconcileInfo = []struct {
  126. primaryKey string
  127. secondaryKey string
  128. ensureSecondaryExists bool
  129. }{
  130. {
  131. // Reconcile the beta and the stable OS label using the beta label as
  132. // the source of truth.
  133. // TODO(#73084): switch to using the stable label as the source of
  134. // truth in v1.18.
  135. primaryKey: kubeletapis.LabelOS,
  136. secondaryKey: v1.LabelOSStable,
  137. ensureSecondaryExists: true,
  138. },
  139. {
  140. // Reconcile the beta and the stable arch label using the beta label as
  141. // the source of truth.
  142. // TODO(#73084): switch to using the stable label as the source of
  143. // truth in v1.18.
  144. primaryKey: kubeletapis.LabelArch,
  145. secondaryKey: v1.LabelArchStable,
  146. ensureSecondaryExists: true,
  147. },
  148. }
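// For example (illustrative): a node labeled only with the beta OS label
// beta.kubernetes.io/os=linux is reconciled so that the stable label
// kubernetes.io/os=linux is added (or overwritten to match) as well.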
  149. type nodeHealthData struct {
  150. probeTimestamp metav1.Time
  151. readyTransitionTimestamp metav1.Time
  152. status *v1.NodeStatus
  153. lease *coordv1.Lease
  154. }
  155. func (n *nodeHealthData) deepCopy() *nodeHealthData {
  156. if n == nil {
  157. return nil
  158. }
  159. return &nodeHealthData{
  160. probeTimestamp: n.probeTimestamp,
  161. readyTransitionTimestamp: n.readyTransitionTimestamp,
  162. status: n.status.DeepCopy(),
  163. lease: n.lease.DeepCopy(),
  164. }
  165. }
  166. type nodeHealthMap struct {
  167. lock sync.RWMutex
  168. nodeHealths map[string]*nodeHealthData
  169. }
  170. func newNodeHealthMap() *nodeHealthMap {
  171. return &nodeHealthMap{
  172. nodeHealths: make(map[string]*nodeHealthData),
  173. }
  174. }
  175. // getDeepCopy returns a copy of the node health data.
  176. // This prevents the data from being mutated after it is retrieved from the map.
  177. func (n *nodeHealthMap) getDeepCopy(name string) *nodeHealthData {
  178. n.lock.RLock()
  179. defer n.lock.RUnlock()
  180. return n.nodeHealths[name].deepCopy()
  181. }
  182. func (n *nodeHealthMap) set(name string, data *nodeHealthData) {
  183. n.lock.Lock()
  184. defer n.lock.Unlock()
  185. n.nodeHealths[name] = data
  186. }
  187. type podUpdateItem struct {
  188. namespace string
  189. name string
  190. }
  191. type evictionStatus int
  192. const (
  193. unmarked = iota
  194. toBeEvicted
  195. evicted
  196. )
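// The intended lifecycle (as used elsewhere in this file): a node starts as
// unmarked when it is registered, is set to toBeEvicted once its pods are queued
// for eviction, and is set to evicted after doEvictionPass deletes them.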
  197. // nodeEvictionMap stores evictionStatus data for each node.
  198. type nodeEvictionMap struct {
  199. lock sync.Mutex
  200. nodeEvictions map[string]evictionStatus
  201. }
  202. func newNodeEvictionMap() *nodeEvictionMap {
  203. return &nodeEvictionMap{
  204. nodeEvictions: make(map[string]evictionStatus),
  205. }
  206. }
  207. func (n *nodeEvictionMap) registerNode(nodeName string) {
  208. n.lock.Lock()
  209. defer n.lock.Unlock()
  210. n.nodeEvictions[nodeName] = unmarked
  211. }
  212. func (n *nodeEvictionMap) unregisterNode(nodeName string) {
  213. n.lock.Lock()
  214. defer n.lock.Unlock()
  215. delete(n.nodeEvictions, nodeName)
  216. }
  217. func (n *nodeEvictionMap) setStatus(nodeName string, status evictionStatus) bool {
  218. n.lock.Lock()
  219. defer n.lock.Unlock()
  220. if _, exists := n.nodeEvictions[nodeName]; !exists {
  221. return false
  222. }
  223. n.nodeEvictions[nodeName] = status
  224. return true
  225. }
  226. func (n *nodeEvictionMap) getStatus(nodeName string) (evictionStatus, bool) {
  227. n.lock.Lock()
  228. defer n.lock.Unlock()
  229. if _, exists := n.nodeEvictions[nodeName]; !exists {
  230. return unmarked, false
  231. }
  232. return n.nodeEvictions[nodeName], true
  233. }
  234. // Controller is the controller that manages the node lifecycle.
  235. type Controller struct {
  236. taintManager *scheduler.NoExecuteTaintManager
  237. podLister corelisters.PodLister
  238. podInformerSynced cache.InformerSynced
  239. kubeClient clientset.Interface
  240. // This timestamp is used instead of the LastProbeTime stored in the Condition. We do this
  241. // to avoid problems with time skew across the cluster.
  242. now func() metav1.Time
  243. enterPartialDisruptionFunc func(nodeNum int) float32
  244. enterFullDisruptionFunc func(nodeNum int) float32
  245. computeZoneStateFunc func(nodeConditions []*v1.NodeCondition) (int, ZoneState)
  246. knownNodeSet map[string]*v1.Node
  247. // per-Node map storing the last observed health together with the local time when it was observed.
  248. nodeHealthMap *nodeHealthMap
  249. // evictorLock protects zonePodEvictor and zoneNoExecuteTainter.
  250. // TODO(#83954): API calls shouldn't be executed under the lock.
  251. evictorLock sync.Mutex
  252. nodeEvictionMap *nodeEvictionMap
  253. // workers that evict pods from unresponsive nodes.
  254. zonePodEvictor map[string]*scheduler.RateLimitedTimedQueue
  255. // workers that are responsible for tainting nodes.
  256. zoneNoExecuteTainter map[string]*scheduler.RateLimitedTimedQueue
  257. nodesToRetry sync.Map
  258. zoneStates map[string]ZoneState
  259. daemonSetStore appsv1listers.DaemonSetLister
  260. daemonSetInformerSynced cache.InformerSynced
  261. leaseLister coordlisters.LeaseLister
  262. leaseInformerSynced cache.InformerSynced
  263. nodeLister corelisters.NodeLister
  264. nodeInformerSynced cache.InformerSynced
  265. getPodsAssignedToNode func(nodeName string) ([]*v1.Pod, error)
  266. recorder record.EventRecorder
  267. // Value controlling the Controller monitoring period, i.e. how often the Controller
  268. // checks the node health signal posted by the kubelet. This value should be lower than
  269. // nodeMonitorGracePeriod.
  270. // TODO: Change node health monitor to watch based.
  271. nodeMonitorPeriod time.Duration
  272. // When a node has just been created, e.g. during cluster bootstrap or node creation, we give
  273. // it a longer grace period.
  274. nodeStartupGracePeriod time.Duration
  275. // The Controller will not proactively sync node health, but will monitor the node
  276. // health signals updated by the kubelet. There are two kinds of node health
  277. // signals: NodeStatus and NodeLease. The NodeLease signal is generated only when
  278. // the NodeLease feature is enabled. If the Controller doesn't receive an update for this
  279. // amount of time, it will start posting "NodeReady==ConditionUnknown". The amount of
  280. // time before the Controller starts evicting pods is controlled via the flag
  281. // 'pod-eviction-timeout'.
  282. // Note: be cautious when changing this value; it must work with the
  283. // nodeStatusUpdateFrequency in the kubelet and the renewInterval in the NodeLease
  284. // controller. The node health signal update frequency is the minimum of the
  285. // two.
  286. // There are several constraints:
  287. // 1. nodeMonitorGracePeriod must be N times greater than the node health signal
  288. // update frequency, where N is the number of retries allowed for the kubelet to
  289. // post the node status/lease. It is pointless to make nodeMonitorGracePeriod
  290. // less than the node health signal update frequency, since there will
  291. // only be fresh values from the kubelet at the interval of the node health signal
  292. // update frequency. This value must also be less than podEvictionTimeout.
  293. // 2. nodeMonitorGracePeriod can't be too large either - a larger
  294. // value means it takes longer for the user to see up-to-date node health.
  295. nodeMonitorGracePeriod time.Duration
  296. podEvictionTimeout time.Duration
  297. evictionLimiterQPS float32
  298. secondaryEvictionLimiterQPS float32
  299. largeClusterThreshold int32
  300. unhealthyZoneThreshold float32
  301. // if set to true, the Controller will start a TaintManager that evicts Pods from
  302. // tainted nodes when the taints are not tolerated.
  303. runTaintManager bool
  304. // if set to true, the Controller will taint Nodes with the 'TaintNodeNotReady' and 'TaintNodeUnreachable'
  305. // taints instead of evicting Pods itself.
  306. useTaintBasedEvictions bool
  307. nodeUpdateQueue workqueue.Interface
  308. podUpdateQueue workqueue.RateLimitingInterface
  309. }
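// For a sense of scale (not defined here): the kube-controller-manager defaults at the
// time of writing are roughly node-monitor-period=5s, node-monitor-grace-period=40s,
// node-startup-grace-period=1m and pod-eviction-timeout=5m, which satisfy the
// constraints described on nodeMonitorGracePeriod above.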
  310. // NewNodeLifecycleController returns a new node lifecycle controller.
  311. func NewNodeLifecycleController(
  312. leaseInformer coordinformers.LeaseInformer,
  313. podInformer coreinformers.PodInformer,
  314. nodeInformer coreinformers.NodeInformer,
  315. daemonSetInformer appsv1informers.DaemonSetInformer,
  316. kubeClient clientset.Interface,
  317. nodeMonitorPeriod time.Duration,
  318. nodeStartupGracePeriod time.Duration,
  319. nodeMonitorGracePeriod time.Duration,
  320. podEvictionTimeout time.Duration,
  321. evictionLimiterQPS float32,
  322. secondaryEvictionLimiterQPS float32,
  323. largeClusterThreshold int32,
  324. unhealthyZoneThreshold float32,
  325. runTaintManager bool,
  326. useTaintBasedEvictions bool,
  327. ) (*Controller, error) {
  328. if kubeClient == nil {
  329. klog.Fatalf("kubeClient is nil when starting Controller")
  330. }
  331. eventBroadcaster := record.NewBroadcaster()
  332. recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "node-controller"})
  333. eventBroadcaster.StartLogging(klog.Infof)
  334. klog.Infof("Sending events to api server.")
  335. eventBroadcaster.StartRecordingToSink(
  336. &v1core.EventSinkImpl{
  337. Interface: v1core.New(kubeClient.CoreV1().RESTClient()).Events(""),
  338. })
  339. if kubeClient.CoreV1().RESTClient().GetRateLimiter() != nil {
  340. ratelimiter.RegisterMetricAndTrackRateLimiterUsage("node_lifecycle_controller", kubeClient.CoreV1().RESTClient().GetRateLimiter())
  341. }
  342. nc := &Controller{
  343. kubeClient: kubeClient,
  344. now: metav1.Now,
  345. knownNodeSet: make(map[string]*v1.Node),
  346. nodeHealthMap: newNodeHealthMap(),
  347. nodeEvictionMap: newNodeEvictionMap(),
  348. recorder: recorder,
  349. nodeMonitorPeriod: nodeMonitorPeriod,
  350. nodeStartupGracePeriod: nodeStartupGracePeriod,
  351. nodeMonitorGracePeriod: nodeMonitorGracePeriod,
  352. zonePodEvictor: make(map[string]*scheduler.RateLimitedTimedQueue),
  353. zoneNoExecuteTainter: make(map[string]*scheduler.RateLimitedTimedQueue),
  354. nodesToRetry: sync.Map{},
  355. zoneStates: make(map[string]ZoneState),
  356. podEvictionTimeout: podEvictionTimeout,
  357. evictionLimiterQPS: evictionLimiterQPS,
  358. secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS,
  359. largeClusterThreshold: largeClusterThreshold,
  360. unhealthyZoneThreshold: unhealthyZoneThreshold,
  361. runTaintManager: runTaintManager,
  362. useTaintBasedEvictions: useTaintBasedEvictions && runTaintManager,
  363. nodeUpdateQueue: workqueue.NewNamed("node_lifecycle_controller"),
  364. podUpdateQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "node_lifecycle_controller_pods"),
  365. }
  366. if useTaintBasedEvictions {
  367. klog.Infof("Controller is using taint based evictions.")
  368. }
  369. nc.enterPartialDisruptionFunc = nc.ReducedQPSFunc
  370. nc.enterFullDisruptionFunc = nc.HealthyQPSFunc
  371. nc.computeZoneStateFunc = nc.ComputeZoneState
  372. podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
  373. AddFunc: func(obj interface{}) {
  374. pod := obj.(*v1.Pod)
  375. nc.podUpdated(nil, pod)
  376. if nc.taintManager != nil {
  377. nc.taintManager.PodUpdated(nil, pod)
  378. }
  379. },
  380. UpdateFunc: func(prev, obj interface{}) {
  381. prevPod := prev.(*v1.Pod)
  382. newPod := obj.(*v1.Pod)
  383. nc.podUpdated(prevPod, newPod)
  384. if nc.taintManager != nil {
  385. nc.taintManager.PodUpdated(prevPod, newPod)
  386. }
  387. },
  388. DeleteFunc: func(obj interface{}) {
  389. pod, isPod := obj.(*v1.Pod)
  390. // We can get DeletedFinalStateUnknown instead of *v1.Pod here and we need to handle that correctly.
  391. if !isPod {
  392. deletedState, ok := obj.(cache.DeletedFinalStateUnknown)
  393. if !ok {
  394. klog.Errorf("Received unexpected object: %v", obj)
  395. return
  396. }
  397. pod, ok = deletedState.Obj.(*v1.Pod)
  398. if !ok {
  399. klog.Errorf("DeletedFinalStateUnknown contained non-Pod object: %v", deletedState.Obj)
  400. return
  401. }
  402. }
  403. nc.podUpdated(pod, nil)
  404. if nc.taintManager != nil {
  405. nc.taintManager.PodUpdated(pod, nil)
  406. }
  407. },
  408. })
  409. nc.podInformerSynced = podInformer.Informer().HasSynced
  410. podInformer.Informer().AddIndexers(cache.Indexers{
  411. nodeNameKeyIndex: func(obj interface{}) ([]string, error) {
  412. pod, ok := obj.(*v1.Pod)
  413. if !ok {
  414. return []string{}, nil
  415. }
  416. if len(pod.Spec.NodeName) == 0 {
  417. return []string{}, nil
  418. }
  419. return []string{pod.Spec.NodeName}, nil
  420. },
  421. })
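// The nodeName index above lets the controller look up only the pods bound to a
// given node from the informer cache, instead of listing and filtering every pod.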
  422. podIndexer := podInformer.Informer().GetIndexer()
  423. nc.getPodsAssignedToNode = func(nodeName string) ([]*v1.Pod, error) {
  424. objs, err := podIndexer.ByIndex(nodeNameKeyIndex, nodeName)
  425. if err != nil {
  426. return nil, err
  427. }
  428. pods := make([]*v1.Pod, 0, len(objs))
  429. for _, obj := range objs {
  430. pod, ok := obj.(*v1.Pod)
  431. if !ok {
  432. continue
  433. }
  434. pods = append(pods, pod)
  435. }
  436. return pods, nil
  437. }
  438. nc.podLister = podInformer.Lister()
  439. if nc.runTaintManager {
  440. podGetter := func(name, namespace string) (*v1.Pod, error) { return nc.podLister.Pods(namespace).Get(name) }
  441. nodeLister := nodeInformer.Lister()
  442. nodeGetter := func(name string) (*v1.Node, error) { return nodeLister.Get(name) }
  443. nc.taintManager = scheduler.NewNoExecuteTaintManager(kubeClient, podGetter, nodeGetter, nc.getPodsAssignedToNode)
  444. nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
  445. AddFunc: nodeutil.CreateAddNodeHandler(func(node *v1.Node) error {
  446. nc.taintManager.NodeUpdated(nil, node)
  447. return nil
  448. }),
  449. UpdateFunc: nodeutil.CreateUpdateNodeHandler(func(oldNode, newNode *v1.Node) error {
  450. nc.taintManager.NodeUpdated(oldNode, newNode)
  451. return nil
  452. }),
  453. DeleteFunc: nodeutil.CreateDeleteNodeHandler(func(node *v1.Node) error {
  454. nc.taintManager.NodeUpdated(node, nil)
  455. return nil
  456. }),
  457. })
  458. }
  459. klog.Infof("Controller will reconcile labels.")
  460. nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
  461. AddFunc: nodeutil.CreateAddNodeHandler(func(node *v1.Node) error {
  462. nc.nodeUpdateQueue.Add(node.Name)
  463. nc.nodeEvictionMap.registerNode(node.Name)
  464. return nil
  465. }),
  466. UpdateFunc: nodeutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error {
  467. nc.nodeUpdateQueue.Add(newNode.Name)
  468. return nil
  469. }),
  470. DeleteFunc: nodeutil.CreateDeleteNodeHandler(func(node *v1.Node) error {
  471. nc.nodesToRetry.Delete(node.Name)
  472. nc.nodeEvictionMap.unregisterNode(node.Name)
  473. return nil
  474. }),
  475. })
  476. nc.leaseLister = leaseInformer.Lister()
  477. nc.leaseInformerSynced = leaseInformer.Informer().HasSynced
  478. nc.nodeLister = nodeInformer.Lister()
  479. nc.nodeInformerSynced = nodeInformer.Informer().HasSynced
  480. nc.daemonSetStore = daemonSetInformer.Lister()
  481. nc.daemonSetInformerSynced = daemonSetInformer.Informer().HasSynced
  482. return nc, nil
  483. }
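// A minimal wiring sketch (hypothetical, shown only for illustration) of how the
// controller is typically constructed from a shared informer factory and run:
//
//   factory := informers.NewSharedInformerFactory(client, 0)
//   nc, err := NewNodeLifecycleController(
//       factory.Coordination().V1().Leases(),
//       factory.Core().V1().Pods(),
//       factory.Core().V1().Nodes(),
//       factory.Apps().V1().DaemonSets(),
//       client,
//       5*time.Second,  // nodeMonitorPeriod
//       time.Minute,    // nodeStartupGracePeriod
//       40*time.Second, // nodeMonitorGracePeriod
//       5*time.Minute,  // podEvictionTimeout
//       0.1, 0.01,      // evictionLimiterQPS, secondaryEvictionLimiterQPS
//       50,             // largeClusterThreshold
//       0.55,           // unhealthyZoneThreshold
//       true, true,     // runTaintManager, useTaintBasedEvictions
//   )
//   if err != nil { /* handle error */ }
//   factory.Start(stopCh)
//   go nc.Run(stopCh)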
  484. // Run starts an asynchronous loop that monitors the status of cluster nodes.
  485. func (nc *Controller) Run(stopCh <-chan struct{}) {
  486. defer utilruntime.HandleCrash()
  487. klog.Infof("Starting node controller")
  488. defer klog.Infof("Shutting down node controller")
  489. if !cache.WaitForNamedCacheSync("taint", stopCh, nc.leaseInformerSynced, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) {
  490. return
  491. }
  492. if nc.runTaintManager {
  493. go nc.taintManager.Run(stopCh)
  494. }
  495. // Close the update queues to clean up their worker goroutines.
  496. defer nc.nodeUpdateQueue.ShutDown()
  497. defer nc.podUpdateQueue.ShutDown()
  498. // Start workers to reconcile labels and/or update NoSchedule taint for nodes.
  499. for i := 0; i < scheduler.UpdateWorkerSize; i++ {
  500. // Thanks to the "workqueue" semantics, each worker only needs to pull items from the queue:
  501. // an item that is re-added while being processed is flagged and re-queued until "Done" is
  502. // called, so no more than one worker handles the same item at a time and
  503. // no event is missed.
  504. go wait.Until(nc.doNodeProcessingPassWorker, time.Second, stopCh)
  505. }
  506. for i := 0; i < podUpdateWorkerSize; i++ {
  507. go wait.Until(nc.doPodProcessingWorker, time.Second, stopCh)
  508. }
  509. if nc.useTaintBasedEvictions {
  510. // Handle taint-based evictions. Because we don't want dedicated logic in the TaintManager for NC-originated
  511. // taints, and we normally don't rate limit evictions caused by taints, we need to rate limit the adding of taints.
  512. go wait.Until(nc.doNoExecuteTaintingPass, scheduler.NodeEvictionPeriod, stopCh)
  513. } else {
  514. // Manage eviction of nodes:
  515. // when we delete pods from a node, if the node was not empty at the time, we
  516. // queue an eviction watcher. If we hit an error, we retry deletion.
  517. go wait.Until(nc.doEvictionPass, scheduler.NodeEvictionPeriod, stopCh)
  518. }
  519. // Incorporate the results of node health signal pushed from kubelet to master.
  520. go wait.Until(func() {
  521. if err := nc.monitorNodeHealth(); err != nil {
  522. klog.Errorf("Error monitoring node health: %v", err)
  523. }
  524. }, nc.nodeMonitorPeriod, stopCh)
  525. <-stopCh
  526. }
  527. func (nc *Controller) doNodeProcessingPassWorker() {
  528. for {
  529. obj, shutdown := nc.nodeUpdateQueue.Get()
  530. // "nodeUpdateQueue" will be shutdown when "stopCh" closed;
  531. // we do not need to re-check "stopCh" again.
  532. if shutdown {
  533. return
  534. }
  535. nodeName := obj.(string)
  536. if err := nc.doNoScheduleTaintingPass(nodeName); err != nil {
  537. klog.Errorf("Failed to taint NoSchedule on node <%s>, requeue it: %v", nodeName, err)
  538. // TODO(k82cn): Add nodeName back to the queue
  539. }
  540. // TODO: re-evaluate whether there are any labels that need to be
  541. // reconciled in 1.19. Remove this function if it's no longer necessary.
  542. if err := nc.reconcileNodeLabels(nodeName); err != nil {
  543. klog.Errorf("Failed to reconcile labels for node <%s>, requeue it: %v", nodeName, err)
  544. // TODO(yujuhong): Add nodeName back to the queue
  545. }
  546. nc.nodeUpdateQueue.Done(nodeName)
  547. }
  548. }
  549. func (nc *Controller) doNoScheduleTaintingPass(nodeName string) error {
  550. node, err := nc.nodeLister.Get(nodeName)
  551. if err != nil {
  552. // If node not found, just ignore it.
  553. if apierrors.IsNotFound(err) {
  554. return nil
  555. }
  556. return err
  557. }
  558. // Map node's condition to Taints.
  559. var taints []v1.Taint
  560. for _, condition := range node.Status.Conditions {
  561. if taintMap, found := nodeConditionToTaintKeyStatusMap[condition.Type]; found {
  562. if taintKey, found := taintMap[condition.Status]; found {
  563. taints = append(taints, v1.Taint{
  564. Key: taintKey,
  565. Effect: v1.TaintEffectNoSchedule,
  566. })
  567. }
  568. }
  569. }
  570. if node.Spec.Unschedulable {
  571. // If unschedulable, append related taint.
  572. taints = append(taints, v1.Taint{
  573. Key: v1.TaintNodeUnschedulable,
  574. Effect: v1.TaintEffectNoSchedule,
  575. })
  576. }
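// For example (illustrative): a node with Unschedulable=true and MemoryPressure=True
// ends up with the desired NoSchedule taints node.kubernetes.io/unschedulable and
// node.kubernetes.io/memory-pressure at this point.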
  577. // Get the existing relevant taints of the node.
  578. nodeTaints := taintutils.TaintSetFilter(node.Spec.Taints, func(t *v1.Taint) bool {
  579. // only NoSchedule taints are candidates to be compared with "taints" later
  580. if t.Effect != v1.TaintEffectNoSchedule {
  581. return false
  582. }
  583. // Find unschedulable taint of node.
  584. if t.Key == v1.TaintNodeUnschedulable {
  585. return true
  586. }
  587. // Find node condition taints of node.
  588. _, found := taintKeyToNodeConditionMap[t.Key]
  589. return found
  590. })
  591. taintsToAdd, taintsToDel := taintutils.TaintSetDiff(taints, nodeTaints)
  592. // If there is nothing to add or delete, return immediately.
  593. if len(taintsToAdd) == 0 && len(taintsToDel) == 0 {
  594. return nil
  595. }
  596. if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, taintsToAdd, taintsToDel, node) {
  597. return fmt.Errorf("failed to swap taints of node %+v", node)
  598. }
  599. return nil
  600. }
  601. func (nc *Controller) doNoExecuteTaintingPass() {
  602. nc.evictorLock.Lock()
  603. defer nc.evictorLock.Unlock()
  604. for k := range nc.zoneNoExecuteTainter {
  605. // Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded).
  606. nc.zoneNoExecuteTainter[k].Try(func(value scheduler.TimedValue) (bool, time.Duration) {
  607. node, err := nc.nodeLister.Get(value.Value)
  608. if apierrors.IsNotFound(err) {
  609. klog.Warningf("Node %v no longer present in nodeLister!", value.Value)
  610. return true, 0
  611. } else if err != nil {
  612. klog.Warningf("Failed to get Node %v from the nodeLister: %v", value.Value, err)
  613. // retry in 50 milliseconds
  614. return false, 50 * time.Millisecond
  615. }
  616. _, condition := nodeutil.GetNodeCondition(&node.Status, v1.NodeReady)
  617. // Because we want to mimic NodeStatus.Condition["Ready"] we make "unreachable" and "not ready" taints mutually exclusive.
  618. taintToAdd := v1.Taint{}
  619. oppositeTaint := v1.Taint{}
  620. switch condition.Status {
  621. case v1.ConditionFalse:
  622. taintToAdd = *NotReadyTaintTemplate
  623. oppositeTaint = *UnreachableTaintTemplate
  624. case v1.ConditionUnknown:
  625. taintToAdd = *UnreachableTaintTemplate
  626. oppositeTaint = *NotReadyTaintTemplate
  627. default:
  628. // It seems that the Node is ready again, so there's no need to taint it.
  629. klog.V(4).Infof("Node %v was in a taint queue, but it's ready now. Ignoring taint request.", value.Value)
  630. return true, 0
  631. }
  632. result := nodeutil.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{&oppositeTaint}, node)
  633. if result {
  634. // count the eviction in the per-zone evictionsNumber metric
  635. zone := utilnode.GetZoneKey(node)
  636. evictionsNumber.WithLabelValues(zone).Inc()
  637. }
  638. return result, 0
  639. })
  640. }
  641. }
  642. func (nc *Controller) doEvictionPass() {
  643. nc.evictorLock.Lock()
  644. defer nc.evictorLock.Unlock()
  645. for k := range nc.zonePodEvictor {
  646. // Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded).
  647. nc.zonePodEvictor[k].Try(func(value scheduler.TimedValue) (bool, time.Duration) {
  648. node, err := nc.nodeLister.Get(value.Value)
  649. if apierrors.IsNotFound(err) {
  650. klog.Warningf("Node %v no longer present in nodeLister!", value.Value)
  651. } else if err != nil {
  652. klog.Warningf("Failed to get Node %v from the nodeLister: %v", value.Value, err)
  653. }
  654. nodeUID, _ := value.UID.(string)
  655. pods, err := nc.getPodsAssignedToNode(value.Value)
  656. if err != nil {
  657. utilruntime.HandleError(fmt.Errorf("unable to list pods from node %q: %v", value.Value, err))
  658. return false, 0
  659. }
  660. remaining, err := nodeutil.DeletePods(nc.kubeClient, pods, nc.recorder, value.Value, nodeUID, nc.daemonSetStore)
  661. if err != nil {
  662. // We are not setting eviction status here.
  663. // New pods will be handled by zonePodEvictor retry
  664. // instead of immediate pod eviction.
  665. utilruntime.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err))
  666. return false, 0
  667. }
  668. if !nc.nodeEvictionMap.setStatus(value.Value, evicted) {
  669. klog.V(2).Infof("node %v was unregistered in the meantime - skipping setting status", value.Value)
  670. }
  671. if remaining {
  672. klog.Infof("Pods awaiting deletion due to Controller eviction")
  673. }
  674. if node != nil {
  675. zone := utilnode.GetZoneKey(node)
  676. evictionsNumber.WithLabelValues(zone).Inc()
  677. }
  678. return true, 0
  679. })
  680. }
  681. }
  682. // monitorNodeHealth verifies that node health is being constantly updated by the kubelet
  683. // and, if not, posts "NodeReady==ConditionUnknown".
  684. // For nodes that have been not ready or unreachable for a long period of time,
  685. // it taints them if the TaintBasedEvictions feature is enabled;
  686. // otherwise, it evicts their pods directly.
  687. func (nc *Controller) monitorNodeHealth() error {
  688. // We are listing nodes from the local cache as we can tolerate some small delays
  689. // compared to the state in etcd, and there is eventual consistency anyway.
  690. nodes, err := nc.nodeLister.List(labels.Everything())
  691. if err != nil {
  692. return err
  693. }
  694. added, deleted, newZoneRepresentatives := nc.classifyNodes(nodes)
  695. for i := range newZoneRepresentatives {
  696. nc.addPodEvictorForNewZone(newZoneRepresentatives[i])
  697. }
  698. for i := range added {
  699. klog.V(1).Infof("Controller observed a new Node: %#v", added[i].Name)
  700. nodeutil.RecordNodeEvent(nc.recorder, added[i].Name, string(added[i].UID), v1.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in Controller", added[i].Name))
  701. nc.knownNodeSet[added[i].Name] = added[i]
  702. nc.addPodEvictorForNewZone(added[i])
  703. if nc.useTaintBasedEvictions {
  704. nc.markNodeAsReachable(added[i])
  705. } else {
  706. nc.cancelPodEviction(added[i])
  707. }
  708. }
  709. for i := range deleted {
  710. klog.V(1).Infof("Controller observed a Node deletion: %v", deleted[i].Name)
  711. nodeutil.RecordNodeEvent(nc.recorder, deleted[i].Name, string(deleted[i].UID), v1.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from Controller", deleted[i].Name))
  712. delete(nc.knownNodeSet, deleted[i].Name)
  713. }
  714. zoneToNodeConditions := map[string][]*v1.NodeCondition{}
  715. for i := range nodes {
  716. var gracePeriod time.Duration
  717. var observedReadyCondition v1.NodeCondition
  718. var currentReadyCondition *v1.NodeCondition
  719. node := nodes[i].DeepCopy()
  720. if err := wait.PollImmediate(retrySleepTime, retrySleepTime*scheduler.NodeHealthUpdateRetry, func() (bool, error) {
  721. gracePeriod, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeHealth(node)
  722. if err == nil {
  723. return true, nil
  724. }
  725. name := node.Name
  726. node, err = nc.kubeClient.CoreV1().Nodes().Get(context.TODO(), name, metav1.GetOptions{})
  727. if err != nil {
  728. klog.Errorf("Failed while getting a Node to retry updating node health. Probably Node %s was deleted.", name)
  729. return false, err
  730. }
  731. return false, nil
  732. }); err != nil {
  733. klog.Errorf("Update health of Node '%v' from Controller error: %v. "+
  734. "Skipping - no pods will be evicted.", node.Name, err)
  735. continue
  736. }
  737. // Some nodes may be excluded from disruption checking
  738. if !isNodeExcludedFromDisruptionChecks(node) {
  739. zoneToNodeConditions[utilnode.GetZoneKey(node)] = append(zoneToNodeConditions[utilnode.GetZoneKey(node)], currentReadyCondition)
  740. }
  741. if currentReadyCondition != nil {
  742. pods, err := nc.getPodsAssignedToNode(node.Name)
  743. if err != nil {
  744. utilruntime.HandleError(fmt.Errorf("unable to list pods of node %v: %v", node.Name, err))
  745. if currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue {
  746. // If an error happened during the node status transition (Ready -> NotReady),
  747. // we need to mark the node for retry to force MarkPodsNotReady execution
  748. // in the next iteration.
  749. nc.nodesToRetry.Store(node.Name, struct{}{})
  750. }
  751. continue
  752. }
  753. if nc.useTaintBasedEvictions {
  754. nc.processTaintBaseEviction(node, &observedReadyCondition)
  755. } else {
  756. if err := nc.processNoTaintBaseEviction(node, &observedReadyCondition, gracePeriod, pods); err != nil {
  757. utilruntime.HandleError(fmt.Errorf("unable to evict all pods from node %v: %v; queuing for retry", node.Name, err))
  758. }
  759. }
  760. _, needsRetry := nc.nodesToRetry.Load(node.Name)
  761. switch {
  762. case currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue:
  763. // Report node event only once when status changed.
  764. nodeutil.RecordNodeStatusChange(nc.recorder, node, "NodeNotReady")
  765. fallthrough
  766. case needsRetry && observedReadyCondition.Status != v1.ConditionTrue:
  767. if err = nodeutil.MarkPodsNotReady(nc.kubeClient, pods, node.Name); err != nil {
  768. utilruntime.HandleError(fmt.Errorf("unable to mark all pods NotReady on node %v: %v; queuing for retry", node.Name, err))
  769. nc.nodesToRetry.Store(node.Name, struct{}{})
  770. continue
  771. }
  772. }
  773. }
  774. nc.nodesToRetry.Delete(node.Name)
  775. }
  776. nc.handleDisruption(zoneToNodeConditions, nodes)
  777. return nil
  778. }
  779. func (nc *Controller) processTaintBaseEviction(node *v1.Node, observedReadyCondition *v1.NodeCondition) {
  780. decisionTimestamp := nc.now()
  781. // Check eviction timeout against decisionTimestamp
  782. switch observedReadyCondition.Status {
  783. case v1.ConditionFalse:
  784. // We want to update the taint straight away if Node is already tainted with the UnreachableTaint
  785. if taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
  786. taintToAdd := *NotReadyTaintTemplate
  787. if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{UnreachableTaintTemplate}, node) {
  788. klog.Errorf("Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle.")
  789. }
  790. } else if nc.markNodeForTainting(node) {
  791. klog.V(2).Infof("Node %v is NotReady as of %v. Adding it to the Taint queue.",
  792. node.Name,
  793. decisionTimestamp,
  794. )
  795. }
  796. case v1.ConditionUnknown:
  797. // We want to update the taint straight away if Node is already tainted with the UnreachableTaint
  798. if taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
  799. taintToAdd := *UnreachableTaintTemplate
  800. if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{NotReadyTaintTemplate}, node) {
  801. klog.Errorf("Failed to instantly swap NotReadyTaint to UnreachableTaint. Will try again in the next cycle.")
  802. }
  803. } else if nc.markNodeForTainting(node) {
  804. klog.V(2).Infof("Node %v is unresponsive as of %v. Adding it to the Taint queue.",
  805. node.Name,
  806. decisionTimestamp,
  807. )
  808. }
  809. case v1.ConditionTrue:
  810. removed, err := nc.markNodeAsReachable(node)
  811. if err != nil {
  812. klog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name)
  813. }
  814. if removed {
  815. klog.V(2).Infof("Node %s is healthy again, removing all taints", node.Name)
  816. }
  817. }
  818. }
  819. func (nc *Controller) processNoTaintBaseEviction(node *v1.Node, observedReadyCondition *v1.NodeCondition, gracePeriod time.Duration, pods []*v1.Pod) error {
  820. decisionTimestamp := nc.now()
  821. nodeHealthData := nc.nodeHealthMap.getDeepCopy(node.Name)
  822. if nodeHealthData == nil {
  823. return fmt.Errorf("health data doesn't exist for node %q", node.Name)
  824. }
  825. // Check eviction timeout against decisionTimestamp
  826. switch observedReadyCondition.Status {
  827. case v1.ConditionFalse:
  828. if decisionTimestamp.After(nodeHealthData.readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
  829. enqueued, err := nc.evictPods(node, pods)
  830. if err != nil {
  831. return err
  832. }
  833. if enqueued {
  834. klog.V(2).Infof("Node is NotReady. Adding Pods on Node %s to eviction queue: %v is later than %v + %v",
  835. node.Name,
  836. decisionTimestamp,
  837. nodeHealthData.readyTransitionTimestamp,
  838. nc.podEvictionTimeout,
  839. )
  840. }
  841. }
  842. case v1.ConditionUnknown:
  843. if decisionTimestamp.After(nodeHealthData.probeTimestamp.Add(nc.podEvictionTimeout)) {
  844. enqueued, err := nc.evictPods(node, pods)
  845. if err != nil {
  846. return err
  847. }
  848. if enqueued {
  849. klog.V(2).Infof("Node is unresponsive. Adding Pods on Node %s to eviction queues: %v is later than %v + %v",
  850. node.Name,
  851. decisionTimestamp,
  852. nodeHealthData.readyTransitionTimestamp,
  853. nc.podEvictionTimeout-gracePeriod,
  854. )
  855. }
  856. }
  857. case v1.ConditionTrue:
  858. if nc.cancelPodEviction(node) {
  859. klog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
  860. }
  861. }
  862. return nil
  863. }
  864. // labelNodeDisruptionExclusion is a label on nodes that controls whether they are
  865. // excluded from being considered for disruption checks by the node controller.
  866. const labelNodeDisruptionExclusion = "node.kubernetes.io/exclude-disruption"
  867. func isNodeExcludedFromDisruptionChecks(node *v1.Node) bool {
  868. // DEPRECATED: will be removed in 1.19
  869. if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.LegacyNodeRoleBehavior) {
  870. if legacyIsMasterNode(node.Name) {
  871. return true
  872. }
  873. }
  874. if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeDisruptionExclusion) {
  875. if _, ok := node.Labels[labelNodeDisruptionExclusion]; ok {
  876. return true
  877. }
  878. }
  879. return false
  880. }
  881. // legacyIsMasterNode returns true if given node is a registered master according
  882. // to the logic historically used for this function. This code path is deprecated
  883. // and the node disruption exclusion label should be used in the future.
  884. // This code will not be allowed to update to use the node-role label, since
  885. // node-roles may not be used for feature enablement.
  886. // DEPRECATED: Will be removed in 1.19
  887. func legacyIsMasterNode(nodeName string) bool {
  888. // We are trying to capture the "master(-...)?$" regexp.
  889. // However, using regexp.MatchString() results in more than 35%
  890. // of all space allocations in the ControllerManager being spent in this function.
  891. // That's why we try to be a bit smarter.
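// For example (illustrative): "prod-master" and "prod-master-001" are treated as
// master nodes by this heuristic, while "prod-master-backup" is not.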
  892. if strings.HasSuffix(nodeName, "master") {
  893. return true
  894. }
  895. if len(nodeName) >= 10 {
  896. return strings.HasSuffix(nodeName[:len(nodeName)-3], "master-")
  897. }
  898. return false
  899. }
  900. // tryUpdateNodeHealth checks a given node's conditions and tries to update it. It returns the grace period to
  901. // which the node is entitled, the current and the last observed Ready Condition, and an error if one occurred.
  902. func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.NodeCondition, *v1.NodeCondition, error) {
  903. nodeHealth := nc.nodeHealthMap.getDeepCopy(node.Name)
  904. defer func() {
  905. nc.nodeHealthMap.set(node.Name, nodeHealth)
  906. }()
  907. var gracePeriod time.Duration
  908. var observedReadyCondition v1.NodeCondition
  909. _, currentReadyCondition := nodeutil.GetNodeCondition(&node.Status, v1.NodeReady)
  910. if currentReadyCondition == nil {
  911. // If the ready condition is nil, then the kubelet (or node controller) never posted the node status.
  912. // A fake ready condition is created, where LastHeartbeatTime and LastTransitionTime are set
  913. // to node.CreationTimestamp to avoid handling the corner case.
  914. observedReadyCondition = v1.NodeCondition{
  915. Type: v1.NodeReady,
  916. Status: v1.ConditionUnknown,
  917. LastHeartbeatTime: node.CreationTimestamp,
  918. LastTransitionTime: node.CreationTimestamp,
  919. }
  920. gracePeriod = nc.nodeStartupGracePeriod
  921. if nodeHealth != nil {
  922. nodeHealth.status = &node.Status
  923. } else {
  924. nodeHealth = &nodeHealthData{
  925. status: &node.Status,
  926. probeTimestamp: node.CreationTimestamp,
  927. readyTransitionTimestamp: node.CreationTimestamp,
  928. }
  929. }
  930. } else {
  931. // If ready condition is not nil, make a copy of it, since we may modify it in place later.
  932. observedReadyCondition = *currentReadyCondition
  933. gracePeriod = nc.nodeMonitorGracePeriod
  934. }
  935. // There are the following cases to check:
  936. // - both the saved and the new status have no Ready Condition set - we leave everything as it is,
  937. // - the saved status has no Ready Condition, but the current one does - the Controller was restarted with Node data already present in etcd,
  938. // - the saved status has a Ready Condition, but the current one does not - it's an error, but we fill it in because that's probably a good thing to do,
  939. // - both the saved and the current statuses have Ready Conditions with the same LastProbeTime - nothing happened on that Node, it may be
  940. // unresponsive, so we leave it as it is,
  941. // - both the saved and the current statuses have Ready Conditions with different LastProbeTimes but the same Ready Condition state -
  942. // everything's in order, no transition occurred, so we update only probeTimestamp,
  943. // - both the saved and the current statuses have Ready Conditions with different LastProbeTimes and different Ready Condition states -
  944. // the Ready Condition changed state since we last saw it, so we update both probeTimestamp and readyTransitionTimestamp.
  945. // TODO: things to consider:
  946. // - if 'LastProbeTime' has gone back in time, it's probably an error; currently we ignore it,
  947. // - currently the only correct Ready state transition outside of the Node Controller is the kubelet marking it ready; we don't check
  948. // whether that's the case, but it does not seem necessary.
  949. var savedCondition *v1.NodeCondition
  950. var savedLease *coordv1.Lease
  951. if nodeHealth != nil {
  952. _, savedCondition = nodeutil.GetNodeCondition(nodeHealth.status, v1.NodeReady)
  953. savedLease = nodeHealth.lease
  954. }
  955. if nodeHealth == nil {
  956. klog.Warningf("Missing timestamp for Node %s. Assuming now as a timestamp.", node.Name)
  957. nodeHealth = &nodeHealthData{
  958. status: &node.Status,
  959. probeTimestamp: nc.now(),
  960. readyTransitionTimestamp: nc.now(),
  961. }
  962. } else if savedCondition == nil && currentReadyCondition != nil {
  963. klog.V(1).Infof("Creating timestamp entry for newly observed Node %s", node.Name)
  964. nodeHealth = &nodeHealthData{
  965. status: &node.Status,
  966. probeTimestamp: nc.now(),
  967. readyTransitionTimestamp: nc.now(),
  968. }
  969. } else if savedCondition != nil && currentReadyCondition == nil {
  970. klog.Errorf("ReadyCondition was removed from Status of Node %s", node.Name)
  971. // TODO: figure out what to do in this case. For now we do the same thing as above.
  972. nodeHealth = &nodeHealthData{
  973. status: &node.Status,
  974. probeTimestamp: nc.now(),
  975. readyTransitionTimestamp: nc.now(),
  976. }
  977. } else if savedCondition != nil && currentReadyCondition != nil && savedCondition.LastHeartbeatTime != currentReadyCondition.LastHeartbeatTime {
  978. var transitionTime metav1.Time
  979. // If ReadyCondition changed since the last time we checked, we update the transition timestamp to "now",
  980. // otherwise we leave it as it is.
  981. if savedCondition.LastTransitionTime != currentReadyCondition.LastTransitionTime {
  982. klog.V(3).Infof("ReadyCondition for Node %s transitioned from %v to %v", node.Name, savedCondition, currentReadyCondition)
  983. transitionTime = nc.now()
  984. } else {
  985. transitionTime = nodeHealth.readyTransitionTimestamp
  986. }
  987. if klog.V(5) {
  988. klog.Infof("Node %s ReadyCondition updated. Updating timestamp: %+v vs %+v.", node.Name, nodeHealth.status, node.Status)
  989. } else {
  990. klog.V(3).Infof("Node %s ReadyCondition updated. Updating timestamp.", node.Name)
  991. }
  992. nodeHealth = &nodeHealthData{
  993. status: &node.Status,
  994. probeTimestamp: nc.now(),
  995. readyTransitionTimestamp: transitionTime,
  996. }
  997. }
  998. // Always update the probe time if node lease is renewed.
  999. // Note: If kubelet never posted the node status, but continues renewing the
  1000. // heartbeat leases, the node controller will assume the node is healthy and
  1001. // take no action.
  1002. observedLease, _ := nc.leaseLister.Leases(v1.NamespaceNodeLease).Get(node.Name)
  1003. if observedLease != nil && (savedLease == nil || savedLease.Spec.RenewTime.Before(observedLease.Spec.RenewTime)) {
  1004. nodeHealth.lease = observedLease
  1005. nodeHealth.probeTimestamp = nc.now()
  1006. }
  1007. if nc.now().After(nodeHealth.probeTimestamp.Add(gracePeriod)) {
  1008. // NodeReady condition or lease was last set longer ago than gracePeriod, so
  1009. // update it to Unknown (regardless of its current value) in the master.
  1010. nodeConditionTypes := []v1.NodeConditionType{
  1011. v1.NodeReady,
  1012. v1.NodeMemoryPressure,
  1013. v1.NodeDiskPressure,
  1014. v1.NodePIDPressure,
  1015. // We don't change 'NodeNetworkUnavailable' condition, as it's managed on a control plane level.
  1016. // v1.NodeNetworkUnavailable,
  1017. }
  1018. nowTimestamp := nc.now()
  1019. for _, nodeConditionType := range nodeConditionTypes {
  1020. _, currentCondition := nodeutil.GetNodeCondition(&node.Status, nodeConditionType)
  1021. if currentCondition == nil {
  1022. klog.V(2).Infof("Condition %v of node %v was never updated by kubelet", nodeConditionType, node.Name)
  1023. node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{
  1024. Type: nodeConditionType,
  1025. Status: v1.ConditionUnknown,
  1026. Reason: "NodeStatusNeverUpdated",
  1027. Message: "Kubelet never posted node status.",
  1028. LastHeartbeatTime: node.CreationTimestamp,
  1029. LastTransitionTime: nowTimestamp,
  1030. })
  1031. } else {
  1032. klog.V(2).Infof("node %v hasn't been updated for %+v. Last %v is: %+v",
  1033. node.Name, nc.now().Time.Sub(nodeHealth.probeTimestamp.Time), nodeConditionType, currentCondition)
  1034. if currentCondition.Status != v1.ConditionUnknown {
  1035. currentCondition.Status = v1.ConditionUnknown
  1036. currentCondition.Reason = "NodeStatusUnknown"
  1037. currentCondition.Message = "Kubelet stopped posting node status."
  1038. currentCondition.LastTransitionTime = nowTimestamp
  1039. }
  1040. }
  1041. }
  1042. // We need to update currentReadyCondition because its value may have changed.
  1043. _, currentReadyCondition = nodeutil.GetNodeCondition(&node.Status, v1.NodeReady)
  1044. if !apiequality.Semantic.DeepEqual(currentReadyCondition, &observedReadyCondition) {
  1045. if _, err := nc.kubeClient.CoreV1().Nodes().UpdateStatus(context.TODO(), node, metav1.UpdateOptions{}); err != nil {
  1046. klog.Errorf("Error updating node %s: %v", node.Name, err)
  1047. return gracePeriod, observedReadyCondition, currentReadyCondition, err
  1048. }
  1049. nodeHealth = &nodeHealthData{
  1050. status: &node.Status,
  1051. probeTimestamp: nodeHealth.probeTimestamp,
  1052. readyTransitionTimestamp: nc.now(),
  1053. lease: observedLease,
  1054. }
  1055. return gracePeriod, observedReadyCondition, currentReadyCondition, nil
  1056. }
  1057. }
  1058. return gracePeriod, observedReadyCondition, currentReadyCondition, nil
  1059. }
  1060. func (nc *Controller) handleDisruption(zoneToNodeConditions map[string][]*v1.NodeCondition, nodes []*v1.Node) {
  1061. newZoneStates := map[string]ZoneState{}
  1062. allAreFullyDisrupted := true
  1063. for k, v := range zoneToNodeConditions {
  1064. zoneSize.WithLabelValues(k).Set(float64(len(v)))
  1065. unhealthy, newState := nc.computeZoneStateFunc(v)
  1066. zoneHealth.WithLabelValues(k).Set(float64(100*(len(v)-unhealthy)) / float64(len(v)))
  1067. unhealthyNodes.WithLabelValues(k).Set(float64(unhealthy))
  1068. if newState != stateFullDisruption {
  1069. allAreFullyDisrupted = false
  1070. }
  1071. newZoneStates[k] = newState
  1072. if _, had := nc.zoneStates[k]; !had {
  1073. klog.Errorf("Setting initial state for unseen zone: %v", k)
  1074. nc.zoneStates[k] = stateInitial
  1075. }
  1076. }
  1077. allWasFullyDisrupted := true
  1078. for k, v := range nc.zoneStates {
  1079. if _, have := zoneToNodeConditions[k]; !have {
  1080. zoneSize.WithLabelValues(k).Set(0)
  1081. zoneHealth.WithLabelValues(k).Set(100)
  1082. unhealthyNodes.WithLabelValues(k).Set(0)
  1083. delete(nc.zoneStates, k)
  1084. continue
  1085. }
  1086. if v != stateFullDisruption {
  1087. allWasFullyDisrupted = false
  1088. break
  1089. }
  1090. }
  1091. // At least one node was responding in the previous pass or in the current pass. The semantics are as follows:
  1092. // - if the new state is "partialDisruption" we call a user-defined function that returns a new limiter to use,
  1093. // - if the new state is "normal" we resume normal operation (go back to default limiter settings),
  1094. // - if the new state is "fullDisruption" we restore the normal eviction rate,
  1095. // - unless all zones in the cluster are in "fullDisruption" - in that case we stop all evictions.
	if !allAreFullyDisrupted || !allWasFullyDisrupted {
		// We're switching to full disruption mode
		if allAreFullyDisrupted {
			klog.V(0).Info("Controller detected that all Nodes are not-Ready. Entering master disruption mode.")
			for i := range nodes {
				if nc.useTaintBasedEvictions {
					_, err := nc.markNodeAsReachable(nodes[i])
					if err != nil {
						klog.Errorf("Failed to remove taints from Node %v", nodes[i].Name)
					}
				} else {
					nc.cancelPodEviction(nodes[i])
				}
			}
			// We stop all evictions.
			for k := range nc.zoneStates {
				if nc.useTaintBasedEvictions {
					nc.zoneNoExecuteTainter[k].SwapLimiter(0)
				} else {
					nc.zonePodEvictor[k].SwapLimiter(0)
				}
			}
			for k := range nc.zoneStates {
				nc.zoneStates[k] = stateFullDisruption
			}
			// All rate limiters are updated, so we can return early here.
			return
		}
		// We're exiting full disruption mode
		if allWasFullyDisrupted {
			klog.V(0).Info("Controller detected that some Nodes are Ready. Exiting master disruption mode.")
			// When exiting disruption mode update probe timestamps on all Nodes.
			now := nc.now()
			for i := range nodes {
				v := nc.nodeHealthMap.getDeepCopy(nodes[i].Name)
				v.probeTimestamp = now
				v.readyTransitionTimestamp = now
				nc.nodeHealthMap.set(nodes[i].Name, v)
			}
			// We reset all rate limiters to settings appropriate for the given state.
			for k := range nc.zoneStates {
				nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newZoneStates[k])
				nc.zoneStates[k] = newZoneStates[k]
			}
			return
		}
		// We know that there's at least one zone that is not fully disrupted, so
		// we can use the default behavior for rate limiters.
		for k, v := range nc.zoneStates {
			newState := newZoneStates[k]
			if v == newState {
				continue
			}
			klog.V(0).Infof("Controller detected that zone %v is now in state %v.", k, newState)
			nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newState)
			nc.zoneStates[k] = newState
		}
	}
}
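
// Summary of the transitions above: when every zone newly reports fullDisruption, the
// controller untaints (or removes from the eviction queue) every node and forces all
// zone rate limiters to 0; when the cluster leaves that state, it first refreshes every
// node's probe timestamps so stale health data cannot immediately trigger evictions,
// and then restores per-zone limiters; otherwise only zones whose state actually
// changed get a new limiter via setLimiterInZone.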

func (nc *Controller) podUpdated(oldPod, newPod *v1.Pod) {
	if newPod == nil {
		return
	}
	if len(newPod.Spec.NodeName) != 0 && (oldPod == nil || newPod.Spec.NodeName != oldPod.Spec.NodeName) {
		podItem := podUpdateItem{newPod.Namespace, newPod.Name}
		nc.podUpdateQueue.Add(podItem)
	}
}

func (nc *Controller) doPodProcessingWorker() {
	for {
		obj, shutdown := nc.podUpdateQueue.Get()
		// "podUpdateQueue" will be shut down when "stopCh" is closed;
		// we do not need to re-check "stopCh" again.
		if shutdown {
			return
		}

		podItem := obj.(podUpdateItem)
		nc.processPod(podItem)
	}
}

// processPod processes events of pods being assigned to nodes. In particular:
// 1. for a NodeReady=true node, taint eviction for this pod will be cancelled
// 2. for a NodeReady=false or unknown node, taint eviction of the pod will happen and the pod will be marked as not ready
// 3. if the node doesn't exist in the cache, it will be skipped and handled later by doEvictionPass
func (nc *Controller) processPod(podItem podUpdateItem) {
	defer nc.podUpdateQueue.Done(podItem)
	pod, err := nc.podLister.Pods(podItem.namespace).Get(podItem.name)
	if err != nil {
		if apierrors.IsNotFound(err) {
			// If the pod was deleted, there is no need to requeue.
			return
		}
		klog.Warningf("Failed to read pod %v/%v: %v.", podItem.namespace, podItem.name, err)
		nc.podUpdateQueue.AddRateLimited(podItem)
		return
	}

	nodeName := pod.Spec.NodeName

	nodeHealth := nc.nodeHealthMap.getDeepCopy(nodeName)
	if nodeHealth == nil {
		// Node data is not gathered yet or the node has been removed in the meantime.
		// Pod will be handled by the doEvictionPass method.
		return
	}

	node, err := nc.nodeLister.Get(nodeName)
	if err != nil {
		klog.Warningf("Failed to read node %v: %v.", nodeName, err)
		nc.podUpdateQueue.AddRateLimited(podItem)
		return
	}

	_, currentReadyCondition := nodeutil.GetNodeCondition(nodeHealth.status, v1.NodeReady)
	if currentReadyCondition == nil {
		// Lack of a NodeReady condition may only happen after node addition (or if it was
		// maliciously deleted). In both cases, the pod will be handled correctly (evicted
		// if needed) during processing of the next node update event.
		return
	}

	pods := []*v1.Pod{pod}
	// In taint-based eviction mode, only node updates are processed by NodeLifecycleController.
	// Pods are processed by TaintManager.
	if !nc.useTaintBasedEvictions {
		if err := nc.processNoTaintBaseEviction(node, currentReadyCondition, nc.nodeMonitorGracePeriod, pods); err != nil {
			klog.Warningf("Unable to process pod %+v eviction from node %v: %v.", podItem, nodeName, err)
			nc.podUpdateQueue.AddRateLimited(podItem)
			return
		}
	}

	if currentReadyCondition.Status != v1.ConditionTrue {
		if err := nodeutil.MarkPodsNotReady(nc.kubeClient, pods, nodeName); err != nil {
			klog.Warningf("Unable to mark pod %+v NotReady on node %v: %v.", podItem, nodeName, err)
			nc.podUpdateQueue.AddRateLimited(podItem)
		}
	}
}
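
// Illustrative flow for the function above: a pod newly scheduled onto a node whose
// cached NodeReady condition is False or Unknown is marked NotReady via
// nodeutil.MarkPodsNotReady; when taint-based evictions are disabled it additionally
// goes through processNoTaintBaseEviction, while with taint-based evictions enabled
// the TaintManager is responsible for the actual eviction.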

func (nc *Controller) setLimiterInZone(zone string, zoneSize int, state ZoneState) {
	switch state {
	case stateNormal:
		if nc.useTaintBasedEvictions {
			nc.zoneNoExecuteTainter[zone].SwapLimiter(nc.evictionLimiterQPS)
		} else {
			nc.zonePodEvictor[zone].SwapLimiter(nc.evictionLimiterQPS)
		}
	case statePartialDisruption:
		if nc.useTaintBasedEvictions {
			nc.zoneNoExecuteTainter[zone].SwapLimiter(
				nc.enterPartialDisruptionFunc(zoneSize))
		} else {
			nc.zonePodEvictor[zone].SwapLimiter(
				nc.enterPartialDisruptionFunc(zoneSize))
		}
	case stateFullDisruption:
		if nc.useTaintBasedEvictions {
			nc.zoneNoExecuteTainter[zone].SwapLimiter(
				nc.enterFullDisruptionFunc(zoneSize))
		} else {
			nc.zonePodEvictor[zone].SwapLimiter(
				nc.enterFullDisruptionFunc(zoneSize))
		}
	}
}
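
// Illustrative mapping for setLimiterInZone, assuming the upstream wiring in which
// enterFullDisruptionFunc is HealthyQPSFunc and enterPartialDisruptionFunc is
// ReducedQPSFunc (both defined below): stateNormal keeps the limiter at
// nc.evictionLimiterQPS, statePartialDisruption swaps in ReducedQPSFunc(zoneSize), and
// stateFullDisruption swaps in HealthyQPSFunc(zoneSize); the "all zones down" case is
// handled separately by handleDisruption, which forces every limiter to 0.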

// classifyNodes classifies allNodes into three categories:
//   1. added: the nodes that are in 'allNodes', but not in 'knownNodeSet'
//   2. deleted: the nodes that are in 'knownNodeSet', but not in 'allNodes'
//   3. newZoneRepresentatives: the nodes that are in both 'knownNodeSet' and 'allNodes', but whose zone has no state yet
func (nc *Controller) classifyNodes(allNodes []*v1.Node) (added, deleted, newZoneRepresentatives []*v1.Node) {
	for i := range allNodes {
		if _, has := nc.knownNodeSet[allNodes[i].Name]; !has {
			added = append(added, allNodes[i])
		} else {
			// Currently, we only consider a new zone as updated.
			zone := utilnode.GetZoneKey(allNodes[i])
			if _, found := nc.zoneStates[zone]; !found {
				newZoneRepresentatives = append(newZoneRepresentatives, allNodes[i])
			}
		}
	}

	// If there's a difference between lengths of known Nodes and observed nodes
	// we must have removed some Node.
	if len(nc.knownNodeSet)+len(added) != len(allNodes) {
		knowSetCopy := map[string]*v1.Node{}
		for k, v := range nc.knownNodeSet {
			knowSetCopy[k] = v
		}
		for i := range allNodes {
			delete(knowSetCopy, allNodes[i].Name)
		}
		for i := range knowSetCopy {
			deleted = append(deleted, knowSetCopy[i])
		}
	}
	return
}
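
// Worked example for classifyNodes: with knownNodeSet = {"node-a", "node-b"} and
// allNodes = ["node-b", "node-c"], the function returns added = ["node-c"] and
// deleted = ["node-a"]; "node-b" additionally shows up in newZoneRepresentatives only
// if its zone has no entry in nc.zoneStates yet.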

// HealthyQPSFunc returns the default value for cluster eviction rate - we take
// nodeNum for consistency with ReducedQPSFunc.
func (nc *Controller) HealthyQPSFunc(nodeNum int) float32 {
	return nc.evictionLimiterQPS
}

// ReducedQPSFunc returns the QPS to use when the cluster is large: make
// evictions slower; if the cluster is small, stop evictions altogether.
func (nc *Controller) ReducedQPSFunc(nodeNum int) float32 {
	if int32(nodeNum) > nc.largeClusterThreshold {
		return nc.secondaryEvictionLimiterQPS
	}
	return 0
}
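
// Worked example for ReducedQPSFunc, assuming the kube-controller-manager defaults of
// --large-cluster-size-threshold=50 and --secondary-node-eviction-rate=0.01:
// ReducedQPSFunc(100) returns 0.01, while ReducedQPSFunc(40) returns 0, i.e. evictions
// are stopped entirely in zones at or below the large-cluster threshold.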

// addPodEvictorForNewZone checks if a new zone appeared, and if so adds a new evictor.
func (nc *Controller) addPodEvictorForNewZone(node *v1.Node) {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	zone := utilnode.GetZoneKey(node)
	if _, found := nc.zoneStates[zone]; !found {
		nc.zoneStates[zone] = stateInitial
		if !nc.useTaintBasedEvictions {
			nc.zonePodEvictor[zone] =
				scheduler.NewRateLimitedTimedQueue(
					flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, scheduler.EvictionRateLimiterBurst))
		} else {
			nc.zoneNoExecuteTainter[zone] =
				scheduler.NewRateLimitedTimedQueue(
					flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, scheduler.EvictionRateLimiterBurst))
		}
		// Init the metric for the new zone.
		klog.Infof("Initializing eviction metric for zone: %v", zone)
		evictionsNumber.WithLabelValues(zone).Add(0)
	}
}

// cancelPodEviction removes any queued evictions, typically because the node is available again. It
// returns true if an eviction was queued.
func (nc *Controller) cancelPodEviction(node *v1.Node) bool {
	zone := utilnode.GetZoneKey(node)
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	if !nc.nodeEvictionMap.setStatus(node.Name, unmarked) {
		klog.V(2).Infof("node %v was unregistered in the meantime - skipping setting status", node.Name)
	}
	wasDeleting := nc.zonePodEvictor[zone].Remove(node.Name)
	if wasDeleting {
		klog.V(2).Infof("Cancelling pod Eviction on Node: %v", node.Name)
		return true
	}
	return false
}

// evictPods:
// - adds node to evictor queue if the node is not marked as evicted.
//   Returns false if the node name was already enqueued.
// - deletes pods immediately if node is already marked as evicted.
//   Returns false, because the node wasn't added to the queue.
func (nc *Controller) evictPods(node *v1.Node, pods []*v1.Pod) (bool, error) {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	status, ok := nc.nodeEvictionMap.getStatus(node.Name)
	if ok && status == evicted {
		// Node eviction already happened for this node.
		// Handling immediate pod deletion.
		_, err := nodeutil.DeletePods(nc.kubeClient, pods, nc.recorder, node.Name, string(node.UID), nc.daemonSetStore)
		if err != nil {
			return false, fmt.Errorf("unable to delete pods from node %q: %v", node.Name, err)
		}
		return false, nil
	}
	if !nc.nodeEvictionMap.setStatus(node.Name, toBeEvicted) {
		klog.V(2).Infof("node %v was unregistered in the meantime - skipping setting status", node.Name)
	}
	return nc.zonePodEvictor[utilnode.GetZoneKey(node)].Add(node.Name, string(node.UID)), nil
}
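
// Illustrative behaviour of evictPods: the first call for a node that is not yet marked
// evicted enqueues it and returns (true, nil); a second call while the node is still
// queued returns (false, nil); once the node's eviction status is "evicted", later
// calls skip the queue, delete the supplied pods immediately, and return (false, nil)
// on success.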

func (nc *Controller) markNodeForTainting(node *v1.Node) bool {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	return nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Add(node.Name, string(node.UID))
}

func (nc *Controller) markNodeAsReachable(node *v1.Node) (bool, error) {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	err := controller.RemoveTaintOffNode(nc.kubeClient, node.Name, node, UnreachableTaintTemplate)
	if err != nil {
		klog.Errorf("Failed to remove taint from node %v: %v", node.Name, err)
		return false, err
	}
	err = controller.RemoveTaintOffNode(nc.kubeClient, node.Name, node, NotReadyTaintTemplate)
	if err != nil {
		klog.Errorf("Failed to remove taint from node %v: %v", node.Name, err)
		return false, err
	}
	return nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Remove(node.Name), nil
}

// ComputeZoneState returns the number of NotReady Nodes and the resulting ZoneState,
// based on the NodeReady conditions of all Nodes in a given zone. The zone is considered:
// - fullyDisrupted if there are no Ready Nodes,
// - partiallyDisrupted if at least nc.unhealthyZoneThreshold percent of Nodes are not Ready,
// - normal otherwise
func (nc *Controller) ComputeZoneState(nodeReadyConditions []*v1.NodeCondition) (int, ZoneState) {
	readyNodes := 0
	notReadyNodes := 0
	for i := range nodeReadyConditions {
		if nodeReadyConditions[i] != nil && nodeReadyConditions[i].Status == v1.ConditionTrue {
			readyNodes++
		} else {
			notReadyNodes++
		}
	}
	switch {
	case readyNodes == 0 && notReadyNodes > 0:
		return notReadyNodes, stateFullDisruption
	case notReadyNodes > 2 && float32(notReadyNodes)/float32(notReadyNodes+readyNodes) >= nc.unhealthyZoneThreshold:
		return notReadyNodes, statePartialDisruption
	default:
		return notReadyNodes, stateNormal
	}
}
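
// Worked example for ComputeZoneState, assuming the default --unhealthy-zone-threshold
// of 0.55 (the threshold is configurable, so these numbers are illustrative only):
//
//	conditions := []*v1.NodeCondition{
//		{Status: v1.ConditionTrue},  // Ready
//		{Status: v1.ConditionFalse}, // NotReady
//		{Status: v1.ConditionFalse}, // NotReady
//		{Status: v1.ConditionFalse}, // NotReady
//	}
//	notReady, state := nc.ComputeZoneState(conditions)
//	// notReady == 3, state == statePartialDisruption (3/4 >= 0.55 and notReady > 2)
//
// With all four conditions NotReady the same call yields stateFullDisruption, and with
// only two NotReady it yields stateNormal, because more than two NotReady Nodes are
// required for a partial disruption.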

// reconcileNodeLabels reconciles node labels.
func (nc *Controller) reconcileNodeLabels(nodeName string) error {
	node, err := nc.nodeLister.Get(nodeName)
	if err != nil {
		// If node not found, just ignore it.
		if apierrors.IsNotFound(err) {
			return nil
		}
		return err
	}

	if node.Labels == nil {
		// Nothing to reconcile.
		return nil
	}

	labelsToUpdate := map[string]string{}
	for _, r := range labelReconcileInfo {
		primaryValue, primaryExists := node.Labels[r.primaryKey]
		secondaryValue, secondaryExists := node.Labels[r.secondaryKey]

		if !primaryExists {
			// The primary label key does not exist. This should not happen
			// within our supported version skew range, when no external
			// components/factors are modifying the node object. Ignore this case.
			continue
		}
		if secondaryExists && primaryValue != secondaryValue {
			// The secondary label exists, but is not consistent with the primary
			// label. Need to reconcile.
			labelsToUpdate[r.secondaryKey] = primaryValue
		} else if !secondaryExists && r.ensureSecondaryExists {
			// Apply the secondary label based on the primary label.
			labelsToUpdate[r.secondaryKey] = primaryValue
		}
	}

	if len(labelsToUpdate) == 0 {
		return nil
	}
	if !nodeutil.AddOrUpdateLabelsOnNode(nc.kubeClient, labelsToUpdate, node) {
		return fmt.Errorf("failed update labels for node %+v", node)
	}
	return nil
}
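
// Illustrative example for reconcileNodeLabels, assuming labelReconcileInfo pairs the
// stable kubernetes.io/os and kubernetes.io/arch labels (primary) with their
// beta.kubernetes.io/* counterparts (secondary), as in upstream Kubernetes: a node
// labelled kubernetes.io/os=linux but missing beta.kubernetes.io/os gets
// beta.kubernetes.io/os=linux added, and a beta value that disagrees with the stable
// label is overwritten to match it.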