controller_utils.go 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213
  1. /*
  2. Copyright 2014 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package controller
  14. import (
  15. "context"
  16. "encoding/binary"
  17. "encoding/json"
  18. "fmt"
  19. "hash/fnv"
  20. "sync"
  21. "sync/atomic"
  22. "time"
  23. apps "k8s.io/api/apps/v1"
  24. v1 "k8s.io/api/core/v1"
  25. apierrors "k8s.io/apimachinery/pkg/api/errors"
  26. "k8s.io/apimachinery/pkg/api/meta"
  27. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  28. "k8s.io/apimachinery/pkg/labels"
  29. "k8s.io/apimachinery/pkg/runtime"
  30. "k8s.io/apimachinery/pkg/types"
  31. "k8s.io/apimachinery/pkg/util/clock"
  32. "k8s.io/apimachinery/pkg/util/rand"
  33. "k8s.io/apimachinery/pkg/util/sets"
  34. "k8s.io/apimachinery/pkg/util/strategicpatch"
  35. "k8s.io/apimachinery/pkg/util/wait"
  36. clientset "k8s.io/client-go/kubernetes"
  37. v1core "k8s.io/client-go/kubernetes/typed/core/v1"
  38. "k8s.io/client-go/tools/cache"
  39. "k8s.io/client-go/tools/record"
  40. clientretry "k8s.io/client-go/util/retry"
  41. podutil "k8s.io/kubernetes/pkg/api/v1/pod"
  42. _ "k8s.io/kubernetes/pkg/apis/core/install"
  43. "k8s.io/kubernetes/pkg/apis/core/validation"
  44. hashutil "k8s.io/kubernetes/pkg/util/hash"
  45. taintutils "k8s.io/kubernetes/pkg/util/taints"
  46. "k8s.io/utils/integer"
  47. "k8s.io/klog"
  48. )
  // ExpectationsTimeout is how long unmet expectations are honoured before
  // they are treated as expired. If a watch drops a delete event for a pod, a
  // dormant controller waiting on that event is woken up after this long
  // anyway; without the timeout a failure to update expectations could leave
  // the controller asleep forever. The value should reflect the expected
  // latency of watch events.
  //
  // Currently a controller can service (create *and* observe the watch events
  // for said creation) about 10 pods a second, so it takes about 1 min to
  // service 500 pods. Just creation is limited to 20qps, and watching happens
  // with ~10-30s latency/pod at the scale of 3000 pods over 100 nodes.
  const ExpectationsTimeout = 5 * time.Minute

  // SlowStartInitialBatchSize is the size of the initial batch when batching
  // pod creates. Each successive batch is twice the size of the previous one:
  // a value of 1 yields batches of 1, 2, 4, 8, ... and a value of 10 yields
  // 10, 20, 40, 80, ... A higher value means quota denials produce more doomed
  // API calls and associated event spam; a lower value means more API round
  // trip periods for large batches.
  //
  // Given a number of pods to start "N":
  // The number of doomed calls per sync once quota is exceeded is given by:
  // min(N,SlowStartInitialBatchSize)
  // The number of batches is given by:
  // 1+floor(log_2(ceil(N/SlowStartInitialBatchSize)))
  const SlowStartInitialBatchSize = 1
  77. var UpdateTaintBackoff = wait.Backoff{
  78. Steps: 5,
  79. Duration: 100 * time.Millisecond,
  80. Jitter: 1.0,
  81. }
  82. var UpdateLabelBackoff = wait.Backoff{
  83. Steps: 5,
  84. Duration: 100 * time.Millisecond,
  85. Jitter: 1.0,
  86. }
  87. var (
  88. KeyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc
  89. podPhaseToOrdinal = map[v1.PodPhase]int{v1.PodPending: 0, v1.PodUnknown: 1, v1.PodRunning: 2}
  90. )
  // ResyncPeriodFunc is a function that yields a resync period.
  type ResyncPeriodFunc func() time.Duration

  // NoResyncPeriodFunc returns a zero resync period, for the case where
  // resyncing is not needed.
  func NoResyncPeriodFunc() time.Duration {
  	var none time.Duration
  	return none
  }

  // StaticResyncPeriodFunc returns a ResyncPeriodFunc that always yields the
  // given resyncPeriod.
  func StaticResyncPeriodFunc(resyncPeriod time.Duration) ResyncPeriodFunc {
  	fixed := func() time.Duration { return resyncPeriod }
  	return fixed
  }
  102. // Expectations are a way for controllers to tell the controller manager what they expect. eg:
  103. // ControllerExpectations: {
  104. // controller1: expects 2 adds in 2 minutes
  105. // controller2: expects 2 dels in 2 minutes
  106. // controller3: expects -1 adds in 2 minutes => controller3's expectations have already been met
  107. // }
  108. //
  109. // Implementation:
  110. // ControlleeExpectation = pair of atomic counters to track controllee's creation/deletion
  111. // ControllerExpectationsStore = TTLStore + a ControlleeExpectation per controller
  112. //
  113. // * Once set expectations can only be lowered
  114. // * A controller isn't synced till its expectations are either fulfilled, or expire
  115. // * Controllers that don't set expectations will get woken up for every matching controllee
  116. // ExpKeyFunc to parse out the key from a ControlleeExpectation
  117. var ExpKeyFunc = func(obj interface{}) (string, error) {
  118. if e, ok := obj.(*ControlleeExpectations); ok {
  119. return e.key, nil
  120. }
  121. return "", fmt.Errorf("could not find key for obj %#v", obj)
  122. }
  123. // ControllerExpectationsInterface is an interface that allows users to set and wait on expectations.
  124. // Only abstracted out for testing.
  125. // Warning: if using KeyFunc it is not safe to use a single ControllerExpectationsInterface with different
  126. // types of controllers, because the keys might conflict across types.
  127. type ControllerExpectationsInterface interface {
  128. GetExpectations(controllerKey string) (*ControlleeExpectations, bool, error)
  129. SatisfiedExpectations(controllerKey string) bool
  130. DeleteExpectations(controllerKey string)
  131. SetExpectations(controllerKey string, add, del int) error
  132. ExpectCreations(controllerKey string, adds int) error
  133. ExpectDeletions(controllerKey string, dels int) error
  134. CreationObserved(controllerKey string)
  135. DeletionObserved(controllerKey string)
  136. RaiseExpectations(controllerKey string, add, del int)
  137. LowerExpectations(controllerKey string, add, del int)
  138. }
  139. // ControllerExpectations is a cache mapping controllers to what they expect to see before being woken up for a sync.
  140. type ControllerExpectations struct {
  141. cache.Store
  142. }
  143. // GetExpectations returns the ControlleeExpectations of the given controller.
  144. func (r *ControllerExpectations) GetExpectations(controllerKey string) (*ControlleeExpectations, bool, error) {
  145. exp, exists, err := r.GetByKey(controllerKey)
  146. if err == nil && exists {
  147. return exp.(*ControlleeExpectations), true, nil
  148. }
  149. return nil, false, err
  150. }
  151. // DeleteExpectations deletes the expectations of the given controller from the TTLStore.
  152. func (r *ControllerExpectations) DeleteExpectations(controllerKey string) {
  153. if exp, exists, err := r.GetByKey(controllerKey); err == nil && exists {
  154. if err := r.Delete(exp); err != nil {
  155. klog.V(2).Infof("Error deleting expectations for controller %v: %v", controllerKey, err)
  156. }
  157. }
  158. }
  159. // SatisfiedExpectations returns true if the required adds/dels for the given controller have been observed.
  160. // Add/del counts are established by the controller at sync time, and updated as controllees are observed by the controller
  161. // manager.
  162. func (r *ControllerExpectations) SatisfiedExpectations(controllerKey string) bool {
  163. if exp, exists, err := r.GetExpectations(controllerKey); exists {
  164. if exp.Fulfilled() {
  165. klog.V(4).Infof("Controller expectations fulfilled %#v", exp)
  166. return true
  167. } else if exp.isExpired() {
  168. klog.V(4).Infof("Controller expectations expired %#v", exp)
  169. return true
  170. } else {
  171. klog.V(4).Infof("Controller still waiting on expectations %#v", exp)
  172. return false
  173. }
  174. } else if err != nil {
  175. klog.V(2).Infof("Error encountered while checking expectations %#v, forcing sync", err)
  176. } else {
  177. // When a new controller is created, it doesn't have expectations.
  178. // When it doesn't see expected watch events for > TTL, the expectations expire.
  179. // - In this case it wakes up, creates/deletes controllees, and sets expectations again.
  180. // When it has satisfied expectations and no controllees need to be created/destroyed > TTL, the expectations expire.
  181. // - In this case it continues without setting expectations till it needs to create/delete controllees.
  182. klog.V(4).Infof("Controller %v either never recorded expectations, or the ttl expired.", controllerKey)
  183. }
  184. // Trigger a sync if we either encountered and error (which shouldn't happen since we're
  185. // getting from local store) or this controller hasn't established expectations.
  186. return true
  187. }
  188. // TODO: Extend ExpirationCache to support explicit expiration.
  189. // TODO: Make this possible to disable in tests.
  190. // TODO: Support injection of clock.
  191. func (exp *ControlleeExpectations) isExpired() bool {
  192. return clock.RealClock{}.Since(exp.timestamp) > ExpectationsTimeout
  193. }
  194. // SetExpectations registers new expectations for the given controller. Forgets existing expectations.
  195. func (r *ControllerExpectations) SetExpectations(controllerKey string, add, del int) error {
  196. exp := &ControlleeExpectations{add: int64(add), del: int64(del), key: controllerKey, timestamp: clock.RealClock{}.Now()}
  197. klog.V(4).Infof("Setting expectations %#v", exp)
  198. return r.Add(exp)
  199. }
  200. func (r *ControllerExpectations) ExpectCreations(controllerKey string, adds int) error {
  201. return r.SetExpectations(controllerKey, adds, 0)
  202. }
  203. func (r *ControllerExpectations) ExpectDeletions(controllerKey string, dels int) error {
  204. return r.SetExpectations(controllerKey, 0, dels)
  205. }
  206. // Decrements the expectation counts of the given controller.
  207. func (r *ControllerExpectations) LowerExpectations(controllerKey string, add, del int) {
  208. if exp, exists, err := r.GetExpectations(controllerKey); err == nil && exists {
  209. exp.Add(int64(-add), int64(-del))
  210. // The expectations might've been modified since the update on the previous line.
  211. klog.V(4).Infof("Lowered expectations %#v", exp)
  212. }
  213. }
  214. // Increments the expectation counts of the given controller.
  215. func (r *ControllerExpectations) RaiseExpectations(controllerKey string, add, del int) {
  216. if exp, exists, err := r.GetExpectations(controllerKey); err == nil && exists {
  217. exp.Add(int64(add), int64(del))
  218. // The expectations might've been modified since the update on the previous line.
  219. klog.V(4).Infof("Raised expectations %#v", exp)
  220. }
  221. }
  222. // CreationObserved atomically decrements the `add` expectation count of the given controller.
  223. func (r *ControllerExpectations) CreationObserved(controllerKey string) {
  224. r.LowerExpectations(controllerKey, 1, 0)
  225. }
  226. // DeletionObserved atomically decrements the `del` expectation count of the given controller.
  227. func (r *ControllerExpectations) DeletionObserved(controllerKey string) {
  228. r.LowerExpectations(controllerKey, 0, 1)
  229. }
  // Expectations are either fulfilled, or expire naturally.
  type Expectations interface {
  	Fulfilled() bool
  }

  // ControlleeExpectations tracks the outstanding creates and deletes of a
  // controller's controllees.
  type ControlleeExpectations struct {
  	// Important: add and del are accessed through sync/atomic, so they must
  	// stay at the top of the struct due to a bug on 32-bit platforms.
  	// See: https://golang.org/pkg/sync/atomic/ for more information
  	add int64
  	del int64
  	// key is the store key of the owning controller.
  	key string
  	// timestamp is compared against ExpectationsTimeout to detect expiry.
  	timestamp time.Time
  }

  // Add adjusts the add and del counters by the given (possibly negative)
  // deltas. Each counter is updated atomically, but the pair is not updated
  // as a single unit.
  func (e *ControlleeExpectations) Add(add, del int64) {
  	atomic.AddInt64(&e.add, add)
  	atomic.AddInt64(&e.del, del)
  }

  // Fulfilled returns true once neither counter is positive.
  func (e *ControlleeExpectations) Fulfilled() bool {
  	// TODO: think about why this line being atomic doesn't matter
  	if atomic.LoadInt64(&e.add) > 0 {
  		return false
  	}
  	return atomic.LoadInt64(&e.del) <= 0
  }

  // GetExpectations returns the current add and del counters, in that order.
  func (e *ControlleeExpectations) GetExpectations() (int64, int64) {
  	adds := atomic.LoadInt64(&e.add)
  	dels := atomic.LoadInt64(&e.del)
  	return adds, dels
  }
  257. // NewControllerExpectations returns a store for ControllerExpectations.
  258. func NewControllerExpectations() *ControllerExpectations {
  259. return &ControllerExpectations{cache.NewStore(ExpKeyFunc)}
  260. }
  261. // UIDSetKeyFunc to parse out the key from a UIDSet.
  262. var UIDSetKeyFunc = func(obj interface{}) (string, error) {
  263. if u, ok := obj.(*UIDSet); ok {
  264. return u.key, nil
  265. }
  266. return "", fmt.Errorf("could not find key for obj %#v", obj)
  267. }
  268. // UIDSet holds a key and a set of UIDs. Used by the
  269. // UIDTrackingControllerExpectations to remember which UID it has seen/still
  270. // waiting for.
  271. type UIDSet struct {
  272. sets.String
  273. key string
  274. }
  275. // UIDTrackingControllerExpectations tracks the UID of the pods it deletes.
  276. // This cache is needed over plain old expectations to safely handle graceful
  277. // deletion. The desired behavior is to treat an update that sets the
  278. // DeletionTimestamp on an object as a delete. To do so consistently, one needs
  279. // to remember the expected deletes so they aren't double counted.
  280. // TODO: Track creates as well (#22599)
  281. type UIDTrackingControllerExpectations struct {
  282. ControllerExpectationsInterface
  283. // TODO: There is a much nicer way to do this that involves a single store,
  284. // a lock per entry, and a ControlleeExpectationsInterface type.
  285. uidStoreLock sync.Mutex
  286. // Store used for the UIDs associated with any expectation tracked via the
  287. // ControllerExpectationsInterface.
  288. uidStore cache.Store
  289. }
  290. // GetUIDs is a convenience method to avoid exposing the set of expected uids.
  291. // The returned set is not thread safe, all modifications must be made holding
  292. // the uidStoreLock.
  293. func (u *UIDTrackingControllerExpectations) GetUIDs(controllerKey string) sets.String {
  294. if uid, exists, err := u.uidStore.GetByKey(controllerKey); err == nil && exists {
  295. return uid.(*UIDSet).String
  296. }
  297. return nil
  298. }
  299. // ExpectDeletions records expectations for the given deleteKeys, against the given controller.
  300. func (u *UIDTrackingControllerExpectations) ExpectDeletions(rcKey string, deletedKeys []string) error {
  301. expectedUIDs := sets.NewString()
  302. for _, k := range deletedKeys {
  303. expectedUIDs.Insert(k)
  304. }
  305. klog.V(4).Infof("Controller %v waiting on deletions for: %+v", rcKey, deletedKeys)
  306. u.uidStoreLock.Lock()
  307. defer u.uidStoreLock.Unlock()
  308. if existing := u.GetUIDs(rcKey); existing != nil && existing.Len() != 0 {
  309. klog.Errorf("Clobbering existing delete keys: %+v", existing)
  310. }
  311. if err := u.uidStore.Add(&UIDSet{expectedUIDs, rcKey}); err != nil {
  312. return err
  313. }
  314. return u.ControllerExpectationsInterface.ExpectDeletions(rcKey, expectedUIDs.Len())
  315. }
  316. // DeletionObserved records the given deleteKey as a deletion, for the given rc.
  317. func (u *UIDTrackingControllerExpectations) DeletionObserved(rcKey, deleteKey string) {
  318. u.uidStoreLock.Lock()
  319. defer u.uidStoreLock.Unlock()
  320. uids := u.GetUIDs(rcKey)
  321. if uids != nil && uids.Has(deleteKey) {
  322. klog.V(4).Infof("Controller %v received delete for pod %v", rcKey, deleteKey)
  323. u.ControllerExpectationsInterface.DeletionObserved(rcKey)
  324. uids.Delete(deleteKey)
  325. }
  326. }
  327. // DeleteExpectations deletes the UID set and invokes DeleteExpectations on the
  328. // underlying ControllerExpectationsInterface.
  329. func (u *UIDTrackingControllerExpectations) DeleteExpectations(rcKey string) {
  330. u.uidStoreLock.Lock()
  331. defer u.uidStoreLock.Unlock()
  332. u.ControllerExpectationsInterface.DeleteExpectations(rcKey)
  333. if uidExp, exists, err := u.uidStore.GetByKey(rcKey); err == nil && exists {
  334. if err := u.uidStore.Delete(uidExp); err != nil {
  335. klog.V(2).Infof("Error deleting uid expectations for controller %v: %v", rcKey, err)
  336. }
  337. }
  338. }
  339. // NewUIDTrackingControllerExpectations returns a wrapper around
  340. // ControllerExpectations that is aware of deleteKeys.
  341. func NewUIDTrackingControllerExpectations(ce ControllerExpectationsInterface) *UIDTrackingControllerExpectations {
  342. return &UIDTrackingControllerExpectations{ControllerExpectationsInterface: ce, uidStore: cache.NewStore(UIDSetKeyFunc)}
  343. }
  // Reasons for pod events
  const (
  	// SuccessfulCreatePodReason is added in an event when a pod for a
  	// replica set is successfully created.
  	SuccessfulCreatePodReason = "SuccessfulCreate"
  	// FailedCreatePodReason is added in an event and in a replica set
  	// condition when a pod for a replica set fails to be created.
  	FailedCreatePodReason = "FailedCreate"

  	// SuccessfulDeletePodReason is added in an event when a pod for a
  	// replica set is successfully deleted.
  	SuccessfulDeletePodReason = "SuccessfulDelete"
  	// FailedDeletePodReason is added in an event and in a replica set
  	// condition when a pod for a replica set fails to be deleted.
  	FailedDeletePodReason = "FailedDelete"
  )
  359. // RSControlInterface is an interface that knows how to add or delete
  360. // ReplicaSets, as well as increment or decrement them. It is used
  361. // by the deployment controller to ease testing of actions that it takes.
  362. type RSControlInterface interface {
  363. PatchReplicaSet(namespace, name string, data []byte) error
  364. }
  365. // RealRSControl is the default implementation of RSControllerInterface.
  366. type RealRSControl struct {
  367. KubeClient clientset.Interface
  368. Recorder record.EventRecorder
  369. }
  370. var _ RSControlInterface = &RealRSControl{}
  371. func (r RealRSControl) PatchReplicaSet(namespace, name string, data []byte) error {
  372. _, err := r.KubeClient.AppsV1().ReplicaSets(namespace).Patch(context.TODO(), name, types.StrategicMergePatchType, data, metav1.PatchOptions{})
  373. return err
  374. }
  375. // TODO: merge the controller revision interface in controller_history.go with this one
  376. // ControllerRevisionControlInterface is an interface that knows how to patch
  377. // ControllerRevisions, as well as increment or decrement them. It is used
  378. // by the daemonset controller to ease testing of actions that it takes.
  379. type ControllerRevisionControlInterface interface {
  380. PatchControllerRevision(namespace, name string, data []byte) error
  381. }
  382. // RealControllerRevisionControl is the default implementation of ControllerRevisionControlInterface.
  383. type RealControllerRevisionControl struct {
  384. KubeClient clientset.Interface
  385. }
  386. var _ ControllerRevisionControlInterface = &RealControllerRevisionControl{}
  387. func (r RealControllerRevisionControl) PatchControllerRevision(namespace, name string, data []byte) error {
  388. _, err := r.KubeClient.AppsV1().ControllerRevisions(namespace).Patch(context.TODO(), name, types.StrategicMergePatchType, data, metav1.PatchOptions{})
  389. return err
  390. }
  391. // PodControlInterface is an interface that knows how to add or delete pods
  392. // created as an interface to allow testing.
  393. type PodControlInterface interface {
  394. // CreatePods creates new pods according to the spec.
  395. CreatePods(namespace string, template *v1.PodTemplateSpec, object runtime.Object) error
  396. // CreatePodsOnNode creates a new pod according to the spec on the specified node,
  397. // and sets the ControllerRef.
  398. CreatePodsOnNode(nodeName, namespace string, template *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error
  399. // CreatePodsWithControllerRef creates new pods according to the spec, and sets object as the pod's controller.
  400. CreatePodsWithControllerRef(namespace string, template *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error
  401. // DeletePod deletes the pod identified by podID.
  402. DeletePod(namespace string, podID string, object runtime.Object) error
  403. // PatchPod patches the pod.
  404. PatchPod(namespace, name string, data []byte) error
  405. }
  406. // RealPodControl is the default implementation of PodControlInterface.
  407. type RealPodControl struct {
  408. KubeClient clientset.Interface
  409. Recorder record.EventRecorder
  410. }
  411. var _ PodControlInterface = &RealPodControl{}
  412. func getPodsLabelSet(template *v1.PodTemplateSpec) labels.Set {
  413. desiredLabels := make(labels.Set)
  414. for k, v := range template.Labels {
  415. desiredLabels[k] = v
  416. }
  417. return desiredLabels
  418. }
  419. func getPodsFinalizers(template *v1.PodTemplateSpec) []string {
  420. desiredFinalizers := make([]string, len(template.Finalizers))
  421. copy(desiredFinalizers, template.Finalizers)
  422. return desiredFinalizers
  423. }
  424. func getPodsAnnotationSet(template *v1.PodTemplateSpec) labels.Set {
  425. desiredAnnotations := make(labels.Set)
  426. for k, v := range template.Annotations {
  427. desiredAnnotations[k] = v
  428. }
  429. return desiredAnnotations
  430. }
  431. func getPodsPrefix(controllerName string) string {
  432. // use the dash (if the name isn't too long) to make the pod name a bit prettier
  433. prefix := fmt.Sprintf("%s-", controllerName)
  434. if len(validation.ValidatePodName(prefix, true)) != 0 {
  435. prefix = controllerName
  436. }
  437. return prefix
  438. }
  439. func validateControllerRef(controllerRef *metav1.OwnerReference) error {
  440. if controllerRef == nil {
  441. return fmt.Errorf("controllerRef is nil")
  442. }
  443. if len(controllerRef.APIVersion) == 0 {
  444. return fmt.Errorf("controllerRef has empty APIVersion")
  445. }
  446. if len(controllerRef.Kind) == 0 {
  447. return fmt.Errorf("controllerRef has empty Kind")
  448. }
  449. if controllerRef.Controller == nil || *controllerRef.Controller != true {
  450. return fmt.Errorf("controllerRef.Controller is not set to true")
  451. }
  452. if controllerRef.BlockOwnerDeletion == nil || *controllerRef.BlockOwnerDeletion != true {
  453. return fmt.Errorf("controllerRef.BlockOwnerDeletion is not set")
  454. }
  455. return nil
  456. }
  457. func (r RealPodControl) CreatePods(namespace string, template *v1.PodTemplateSpec, object runtime.Object) error {
  458. return r.createPods("", namespace, template, object, nil)
  459. }
  460. func (r RealPodControl) CreatePodsWithControllerRef(namespace string, template *v1.PodTemplateSpec, controllerObject runtime.Object, controllerRef *metav1.OwnerReference) error {
  461. if err := validateControllerRef(controllerRef); err != nil {
  462. return err
  463. }
  464. return r.createPods("", namespace, template, controllerObject, controllerRef)
  465. }
  466. func (r RealPodControl) CreatePodsOnNode(nodeName, namespace string, template *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error {
  467. if err := validateControllerRef(controllerRef); err != nil {
  468. return err
  469. }
  470. return r.createPods(nodeName, namespace, template, object, controllerRef)
  471. }
  472. func (r RealPodControl) PatchPod(namespace, name string, data []byte) error {
  473. _, err := r.KubeClient.CoreV1().Pods(namespace).Patch(context.TODO(), name, types.StrategicMergePatchType, data, metav1.PatchOptions{})
  474. return err
  475. }
  476. func GetPodFromTemplate(template *v1.PodTemplateSpec, parentObject runtime.Object, controllerRef *metav1.OwnerReference) (*v1.Pod, error) {
  477. desiredLabels := getPodsLabelSet(template)
  478. desiredFinalizers := getPodsFinalizers(template)
  479. desiredAnnotations := getPodsAnnotationSet(template)
  480. accessor, err := meta.Accessor(parentObject)
  481. if err != nil {
  482. return nil, fmt.Errorf("parentObject does not have ObjectMeta, %v", err)
  483. }
  484. prefix := getPodsPrefix(accessor.GetName())
  485. pod := &v1.Pod{
  486. ObjectMeta: metav1.ObjectMeta{
  487. Labels: desiredLabels,
  488. Annotations: desiredAnnotations,
  489. GenerateName: prefix,
  490. Finalizers: desiredFinalizers,
  491. },
  492. }
  493. if controllerRef != nil {
  494. pod.OwnerReferences = append(pod.OwnerReferences, *controllerRef)
  495. }
  496. pod.Spec = *template.Spec.DeepCopy()
  497. return pod, nil
  498. }
  499. func (r RealPodControl) createPods(nodeName, namespace string, template *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error {
  500. pod, err := GetPodFromTemplate(template, object, controllerRef)
  501. if err != nil {
  502. return err
  503. }
  504. if len(nodeName) != 0 {
  505. pod.Spec.NodeName = nodeName
  506. }
  507. if len(labels.Set(pod.Labels)) == 0 {
  508. return fmt.Errorf("unable to create pods, no labels")
  509. }
  510. newPod, err := r.KubeClient.CoreV1().Pods(namespace).Create(context.TODO(), pod, metav1.CreateOptions{})
  511. if err != nil {
  512. // only send an event if the namespace isn't terminating
  513. if !apierrors.HasStatusCause(err, v1.NamespaceTerminatingCause) {
  514. r.Recorder.Eventf(object, v1.EventTypeWarning, FailedCreatePodReason, "Error creating: %v", err)
  515. }
  516. return err
  517. }
  518. accessor, err := meta.Accessor(object)
  519. if err != nil {
  520. klog.Errorf("parentObject does not have ObjectMeta, %v", err)
  521. return nil
  522. }
  523. klog.V(4).Infof("Controller %v created pod %v", accessor.GetName(), newPod.Name)
  524. r.Recorder.Eventf(object, v1.EventTypeNormal, SuccessfulCreatePodReason, "Created pod: %v", newPod.Name)
  525. return nil
  526. }
  527. func (r RealPodControl) DeletePod(namespace string, podID string, object runtime.Object) error {
  528. accessor, err := meta.Accessor(object)
  529. if err != nil {
  530. return fmt.Errorf("object does not have ObjectMeta, %v", err)
  531. }
  532. klog.V(2).Infof("Controller %v deleting pod %v/%v", accessor.GetName(), namespace, podID)
  533. if err := r.KubeClient.CoreV1().Pods(namespace).Delete(context.TODO(), podID, nil); err != nil && !apierrors.IsNotFound(err) {
  534. r.Recorder.Eventf(object, v1.EventTypeWarning, FailedDeletePodReason, "Error deleting: %v", err)
  535. return fmt.Errorf("unable to delete pods: %v", err)
  536. }
  537. r.Recorder.Eventf(object, v1.EventTypeNormal, SuccessfulDeletePodReason, "Deleted pod: %v", podID)
  538. return nil
  539. }
  540. type FakePodControl struct {
  541. sync.Mutex
  542. Templates []v1.PodTemplateSpec
  543. ControllerRefs []metav1.OwnerReference
  544. DeletePodName []string
  545. Patches [][]byte
  546. Err error
  547. CreateLimit int
  548. CreateCallCount int
  549. }
  550. var _ PodControlInterface = &FakePodControl{}
  551. func (f *FakePodControl) PatchPod(namespace, name string, data []byte) error {
  552. f.Lock()
  553. defer f.Unlock()
  554. f.Patches = append(f.Patches, data)
  555. if f.Err != nil {
  556. return f.Err
  557. }
  558. return nil
  559. }
  560. func (f *FakePodControl) CreatePods(namespace string, spec *v1.PodTemplateSpec, object runtime.Object) error {
  561. f.Lock()
  562. defer f.Unlock()
  563. f.CreateCallCount++
  564. if f.CreateLimit != 0 && f.CreateCallCount > f.CreateLimit {
  565. return fmt.Errorf("not creating pod, limit %d already reached (create call %d)", f.CreateLimit, f.CreateCallCount)
  566. }
  567. f.Templates = append(f.Templates, *spec)
  568. if f.Err != nil {
  569. return f.Err
  570. }
  571. return nil
  572. }
  573. func (f *FakePodControl) CreatePodsWithControllerRef(namespace string, spec *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error {
  574. f.Lock()
  575. defer f.Unlock()
  576. f.CreateCallCount++
  577. if f.CreateLimit != 0 && f.CreateCallCount > f.CreateLimit {
  578. return fmt.Errorf("not creating pod, limit %d already reached (create call %d)", f.CreateLimit, f.CreateCallCount)
  579. }
  580. f.Templates = append(f.Templates, *spec)
  581. f.ControllerRefs = append(f.ControllerRefs, *controllerRef)
  582. if f.Err != nil {
  583. return f.Err
  584. }
  585. return nil
  586. }
  587. func (f *FakePodControl) CreatePodsOnNode(nodeName, namespace string, template *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error {
  588. f.Lock()
  589. defer f.Unlock()
  590. f.CreateCallCount++
  591. if f.CreateLimit != 0 && f.CreateCallCount > f.CreateLimit {
  592. return fmt.Errorf("not creating pod, limit %d already reached (create call %d)", f.CreateLimit, f.CreateCallCount)
  593. }
  594. f.Templates = append(f.Templates, *template)
  595. f.ControllerRefs = append(f.ControllerRefs, *controllerRef)
  596. if f.Err != nil {
  597. return f.Err
  598. }
  599. return nil
  600. }
  601. func (f *FakePodControl) DeletePod(namespace string, podID string, object runtime.Object) error {
  602. f.Lock()
  603. defer f.Unlock()
  604. f.DeletePodName = append(f.DeletePodName, podID)
  605. if f.Err != nil {
  606. return f.Err
  607. }
  608. return nil
  609. }
  610. func (f *FakePodControl) Clear() {
  611. f.Lock()
  612. defer f.Unlock()
  613. f.DeletePodName = []string{}
  614. f.Templates = []v1.PodTemplateSpec{}
  615. f.ControllerRefs = []metav1.OwnerReference{}
  616. f.Patches = [][]byte{}
  617. f.CreateLimit = 0
  618. f.CreateCallCount = 0
  619. }
  620. // ByLogging allows custom sorting of pods so the best one can be picked for getting its logs.
  621. type ByLogging []*v1.Pod
  622. func (s ByLogging) Len() int { return len(s) }
  623. func (s ByLogging) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
  624. func (s ByLogging) Less(i, j int) bool {
  625. // 1. assigned < unassigned
  626. if s[i].Spec.NodeName != s[j].Spec.NodeName && (len(s[i].Spec.NodeName) == 0 || len(s[j].Spec.NodeName) == 0) {
  627. return len(s[i].Spec.NodeName) > 0
  628. }
  629. // 2. PodRunning < PodUnknown < PodPending
  630. if s[i].Status.Phase != s[j].Status.Phase {
  631. return podPhaseToOrdinal[s[i].Status.Phase] > podPhaseToOrdinal[s[j].Status.Phase]
  632. }
  633. // 3. ready < not ready
  634. if podutil.IsPodReady(s[i]) != podutil.IsPodReady(s[j]) {
  635. return podutil.IsPodReady(s[i])
  636. }
  637. // TODO: take availability into account when we push minReadySeconds information from deployment into pods,
  638. // see https://github.com/kubernetes/kubernetes/issues/22065
  639. // 4. Been ready for more time < less time < empty time
  640. if podutil.IsPodReady(s[i]) && podutil.IsPodReady(s[j]) {
  641. readyTime1 := podReadyTime(s[i])
  642. readyTime2 := podReadyTime(s[j])
  643. if !readyTime1.Equal(readyTime2) {
  644. return afterOrZero(readyTime2, readyTime1)
  645. }
  646. }
  647. // 5. Pods with containers with higher restart counts < lower restart counts
  648. if maxContainerRestarts(s[i]) != maxContainerRestarts(s[j]) {
  649. return maxContainerRestarts(s[i]) > maxContainerRestarts(s[j])
  650. }
  651. // 6. older pods < newer pods < empty timestamp pods
  652. if !s[i].CreationTimestamp.Equal(&s[j].CreationTimestamp) {
  653. return afterOrZero(&s[j].CreationTimestamp, &s[i].CreationTimestamp)
  654. }
  655. return false
  656. }
  657. // ActivePods type allows custom sorting of pods so a controller can pick the best ones to delete.
  658. type ActivePods []*v1.Pod
  659. func (s ActivePods) Len() int { return len(s) }
  660. func (s ActivePods) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
  661. func (s ActivePods) Less(i, j int) bool {
  662. // 1. Unassigned < assigned
  663. // If only one of the pods is unassigned, the unassigned one is smaller
  664. if s[i].Spec.NodeName != s[j].Spec.NodeName && (len(s[i].Spec.NodeName) == 0 || len(s[j].Spec.NodeName) == 0) {
  665. return len(s[i].Spec.NodeName) == 0
  666. }
  667. // 2. PodPending < PodUnknown < PodRunning
  668. if podPhaseToOrdinal[s[i].Status.Phase] != podPhaseToOrdinal[s[j].Status.Phase] {
  669. return podPhaseToOrdinal[s[i].Status.Phase] < podPhaseToOrdinal[s[j].Status.Phase]
  670. }
  671. // 3. Not ready < ready
  672. // If only one of the pods is not ready, the not ready one is smaller
  673. if podutil.IsPodReady(s[i]) != podutil.IsPodReady(s[j]) {
  674. return !podutil.IsPodReady(s[i])
  675. }
  676. // TODO: take availability into account when we push minReadySeconds information from deployment into pods,
  677. // see https://github.com/kubernetes/kubernetes/issues/22065
  678. // 4. Been ready for empty time < less time < more time
  679. // If both pods are ready, the latest ready one is smaller
  680. if podutil.IsPodReady(s[i]) && podutil.IsPodReady(s[j]) {
  681. readyTime1 := podReadyTime(s[i])
  682. readyTime2 := podReadyTime(s[j])
  683. if !readyTime1.Equal(readyTime2) {
  684. return afterOrZero(readyTime1, readyTime2)
  685. }
  686. }
  687. // 5. Pods with containers with higher restart counts < lower restart counts
  688. if maxContainerRestarts(s[i]) != maxContainerRestarts(s[j]) {
  689. return maxContainerRestarts(s[i]) > maxContainerRestarts(s[j])
  690. }
  691. // 6. Empty creation time pods < newer pods < older pods
  692. if !s[i].CreationTimestamp.Equal(&s[j].CreationTimestamp) {
  693. return afterOrZero(&s[i].CreationTimestamp, &s[j].CreationTimestamp)
  694. }
  695. return false
  696. }
  697. // ActivePodsWithRanks is a sortable list of pods and a list of corresponding
  698. // ranks which will be considered during sorting. The two lists must have equal
  699. // length. After sorting, the pods will be ordered as follows, applying each
  700. // rule in turn until one matches:
  701. //
  702. // 1. If only one of the pods is assigned to a node, the pod that is not
  703. // assigned comes before the pod that is.
  704. // 2. If the pods' phases differ, a pending pod comes before a pod whose phase
  705. // is unknown, and a pod whose phase is unknown comes before a running pod.
  706. // 3. If exactly one of the pods is ready, the pod that is not ready comes
  707. // before the ready pod.
  708. // 4. If the pods' ranks differ, the pod with greater rank comes before the pod
  709. // with lower rank.
  710. // 5. If both pods are ready but have not been ready for the same amount of
  711. // time, the pod that has been ready for a shorter amount of time comes
  712. // before the pod that has been ready for longer.
  713. // 6. If one pod has a container that has restarted more than any container in
  714. // the other pod, the pod with the container with more restarts comes
  715. // before the other pod.
  716. // 7. If the pods' creation times differ, the pod that was created more recently
  717. // comes before the older pod.
  718. //
  719. // If none of these rules matches, the second pod comes before the first pod.
  720. //
  721. // The intention of this ordering is to put pods that should be preferred for
  722. // deletion first in the list.
  723. type ActivePodsWithRanks struct {
  724. // Pods is a list of pods.
  725. Pods []*v1.Pod
  726. // Rank is a ranking of pods. This ranking is used during sorting when
  727. // comparing two pods that are both scheduled, in the same phase, and
  728. // having the same ready status.
  729. Rank []int
  730. }
  731. func (s ActivePodsWithRanks) Len() int {
  732. return len(s.Pods)
  733. }
  734. func (s ActivePodsWithRanks) Swap(i, j int) {
  735. s.Pods[i], s.Pods[j] = s.Pods[j], s.Pods[i]
  736. s.Rank[i], s.Rank[j] = s.Rank[j], s.Rank[i]
  737. }
  738. // Less compares two pods with corresponding ranks and returns true if the first
  739. // one should be preferred for deletion.
  740. func (s ActivePodsWithRanks) Less(i, j int) bool {
  741. // 1. Unassigned < assigned
  742. // If only one of the pods is unassigned, the unassigned one is smaller
  743. if s.Pods[i].Spec.NodeName != s.Pods[j].Spec.NodeName && (len(s.Pods[i].Spec.NodeName) == 0 || len(s.Pods[j].Spec.NodeName) == 0) {
  744. return len(s.Pods[i].Spec.NodeName) == 0
  745. }
  746. // 2. PodPending < PodUnknown < PodRunning
  747. if podPhaseToOrdinal[s.Pods[i].Status.Phase] != podPhaseToOrdinal[s.Pods[j].Status.Phase] {
  748. return podPhaseToOrdinal[s.Pods[i].Status.Phase] < podPhaseToOrdinal[s.Pods[j].Status.Phase]
  749. }
  750. // 3. Not ready < ready
  751. // If only one of the pods is not ready, the not ready one is smaller
  752. if podutil.IsPodReady(s.Pods[i]) != podutil.IsPodReady(s.Pods[j]) {
  753. return !podutil.IsPodReady(s.Pods[i])
  754. }
  755. // 4. Doubled up < not doubled up
  756. // If one of the two pods is on the same node as one or more additional
  757. // ready pods that belong to the same replicaset, whichever pod has more
  758. // colocated ready pods is less
  759. if s.Rank[i] != s.Rank[j] {
  760. return s.Rank[i] > s.Rank[j]
  761. }
  762. // TODO: take availability into account when we push minReadySeconds information from deployment into pods,
  763. // see https://github.com/kubernetes/kubernetes/issues/22065
  764. // 5. Been ready for empty time < less time < more time
  765. // If both pods are ready, the latest ready one is smaller
  766. if podutil.IsPodReady(s.Pods[i]) && podutil.IsPodReady(s.Pods[j]) {
  767. readyTime1 := podReadyTime(s.Pods[i])
  768. readyTime2 := podReadyTime(s.Pods[j])
  769. if !readyTime1.Equal(readyTime2) {
  770. return afterOrZero(readyTime1, readyTime2)
  771. }
  772. }
  773. // 6. Pods with containers with higher restart counts < lower restart counts
  774. if maxContainerRestarts(s.Pods[i]) != maxContainerRestarts(s.Pods[j]) {
  775. return maxContainerRestarts(s.Pods[i]) > maxContainerRestarts(s.Pods[j])
  776. }
  777. // 7. Empty creation time pods < newer pods < older pods
  778. if !s.Pods[i].CreationTimestamp.Equal(&s.Pods[j].CreationTimestamp) {
  779. return afterOrZero(&s.Pods[i].CreationTimestamp, &s.Pods[j].CreationTimestamp)
  780. }
  781. return false
  782. }
  783. // afterOrZero checks if time t1 is after time t2; if one of them
  784. // is zero, the zero time is seen as after non-zero time.
  785. func afterOrZero(t1, t2 *metav1.Time) bool {
  786. if t1.Time.IsZero() || t2.Time.IsZero() {
  787. return t1.Time.IsZero()
  788. }
  789. return t1.After(t2.Time)
  790. }
  791. func podReadyTime(pod *v1.Pod) *metav1.Time {
  792. if podutil.IsPodReady(pod) {
  793. for _, c := range pod.Status.Conditions {
  794. // we only care about pod ready conditions
  795. if c.Type == v1.PodReady && c.Status == v1.ConditionTrue {
  796. return &c.LastTransitionTime
  797. }
  798. }
  799. }
  800. return &metav1.Time{}
  801. }
  802. func maxContainerRestarts(pod *v1.Pod) int {
  803. maxRestarts := 0
  804. for _, c := range pod.Status.ContainerStatuses {
  805. maxRestarts = integer.IntMax(maxRestarts, int(c.RestartCount))
  806. }
  807. return maxRestarts
  808. }
  809. // FilterActivePods returns pods that have not terminated.
  810. func FilterActivePods(pods []*v1.Pod) []*v1.Pod {
  811. var result []*v1.Pod
  812. for _, p := range pods {
  813. if IsPodActive(p) {
  814. result = append(result, p)
  815. } else {
  816. klog.V(4).Infof("Ignoring inactive pod %v/%v in state %v, deletion time %v",
  817. p.Namespace, p.Name, p.Status.Phase, p.DeletionTimestamp)
  818. }
  819. }
  820. return result
  821. }
  822. func IsPodActive(p *v1.Pod) bool {
  823. return v1.PodSucceeded != p.Status.Phase &&
  824. v1.PodFailed != p.Status.Phase &&
  825. p.DeletionTimestamp == nil
  826. }
  827. // FilterActiveReplicaSets returns replica sets that have (or at least ought to have) pods.
  828. func FilterActiveReplicaSets(replicaSets []*apps.ReplicaSet) []*apps.ReplicaSet {
  829. activeFilter := func(rs *apps.ReplicaSet) bool {
  830. return rs != nil && *(rs.Spec.Replicas) > 0
  831. }
  832. return FilterReplicaSets(replicaSets, activeFilter)
  833. }
  834. type filterRS func(rs *apps.ReplicaSet) bool
  835. // FilterReplicaSets returns replica sets that are filtered by filterFn (all returned ones should match filterFn).
  836. func FilterReplicaSets(RSes []*apps.ReplicaSet, filterFn filterRS) []*apps.ReplicaSet {
  837. var filtered []*apps.ReplicaSet
  838. for i := range RSes {
  839. if filterFn(RSes[i]) {
  840. filtered = append(filtered, RSes[i])
  841. }
  842. }
  843. return filtered
  844. }
  845. // PodKey returns a key unique to the given pod within a cluster.
  846. // It's used so we consistently use the same key scheme in this module.
  847. // It does exactly what cache.MetaNamespaceKeyFunc would have done
  848. // except there's not possibility for error since we know the exact type.
  849. func PodKey(pod *v1.Pod) string {
  850. return fmt.Sprintf("%v/%v", pod.Namespace, pod.Name)
  851. }
  852. // ControllersByCreationTimestamp sorts a list of ReplicationControllers by creation timestamp, using their names as a tie breaker.
  853. type ControllersByCreationTimestamp []*v1.ReplicationController
  854. func (o ControllersByCreationTimestamp) Len() int { return len(o) }
  855. func (o ControllersByCreationTimestamp) Swap(i, j int) { o[i], o[j] = o[j], o[i] }
  856. func (o ControllersByCreationTimestamp) Less(i, j int) bool {
  857. if o[i].CreationTimestamp.Equal(&o[j].CreationTimestamp) {
  858. return o[i].Name < o[j].Name
  859. }
  860. return o[i].CreationTimestamp.Before(&o[j].CreationTimestamp)
  861. }
  862. // ReplicaSetsByCreationTimestamp sorts a list of ReplicaSet by creation timestamp, using their names as a tie breaker.
  863. type ReplicaSetsByCreationTimestamp []*apps.ReplicaSet
  864. func (o ReplicaSetsByCreationTimestamp) Len() int { return len(o) }
  865. func (o ReplicaSetsByCreationTimestamp) Swap(i, j int) { o[i], o[j] = o[j], o[i] }
  866. func (o ReplicaSetsByCreationTimestamp) Less(i, j int) bool {
  867. if o[i].CreationTimestamp.Equal(&o[j].CreationTimestamp) {
  868. return o[i].Name < o[j].Name
  869. }
  870. return o[i].CreationTimestamp.Before(&o[j].CreationTimestamp)
  871. }
  872. // ReplicaSetsBySizeOlder sorts a list of ReplicaSet by size in descending order, using their creation timestamp or name as a tie breaker.
  873. // By using the creation timestamp, this sorts from old to new replica sets.
  874. type ReplicaSetsBySizeOlder []*apps.ReplicaSet
  875. func (o ReplicaSetsBySizeOlder) Len() int { return len(o) }
  876. func (o ReplicaSetsBySizeOlder) Swap(i, j int) { o[i], o[j] = o[j], o[i] }
  877. func (o ReplicaSetsBySizeOlder) Less(i, j int) bool {
  878. if *(o[i].Spec.Replicas) == *(o[j].Spec.Replicas) {
  879. return ReplicaSetsByCreationTimestamp(o).Less(i, j)
  880. }
  881. return *(o[i].Spec.Replicas) > *(o[j].Spec.Replicas)
  882. }
  883. // ReplicaSetsBySizeNewer sorts a list of ReplicaSet by size in descending order, using their creation timestamp or name as a tie breaker.
  884. // By using the creation timestamp, this sorts from new to old replica sets.
  885. type ReplicaSetsBySizeNewer []*apps.ReplicaSet
  886. func (o ReplicaSetsBySizeNewer) Len() int { return len(o) }
  887. func (o ReplicaSetsBySizeNewer) Swap(i, j int) { o[i], o[j] = o[j], o[i] }
  888. func (o ReplicaSetsBySizeNewer) Less(i, j int) bool {
  889. if *(o[i].Spec.Replicas) == *(o[j].Spec.Replicas) {
  890. return ReplicaSetsByCreationTimestamp(o).Less(j, i)
  891. }
  892. return *(o[i].Spec.Replicas) > *(o[j].Spec.Replicas)
  893. }
  894. // AddOrUpdateTaintOnNode add taints to the node. If taint was added into node, it'll issue API calls
  895. // to update nodes; otherwise, no API calls. Return error if any.
  896. func AddOrUpdateTaintOnNode(c clientset.Interface, nodeName string, taints ...*v1.Taint) error {
  897. if len(taints) == 0 {
  898. return nil
  899. }
  900. firstTry := true
  901. return clientretry.RetryOnConflict(UpdateTaintBackoff, func() error {
  902. var err error
  903. var oldNode *v1.Node
  904. // First we try getting node from the API server cache, as it's cheaper. If it fails
  905. // we get it from etcd to be sure to have fresh data.
  906. if firstTry {
  907. oldNode, err = c.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{ResourceVersion: "0"})
  908. firstTry = false
  909. } else {
  910. oldNode, err = c.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
  911. }
  912. if err != nil {
  913. return err
  914. }
  915. var newNode *v1.Node
  916. oldNodeCopy := oldNode
  917. updated := false
  918. for _, taint := range taints {
  919. curNewNode, ok, err := taintutils.AddOrUpdateTaint(oldNodeCopy, taint)
  920. if err != nil {
  921. return fmt.Errorf("failed to update taint of node")
  922. }
  923. updated = updated || ok
  924. newNode = curNewNode
  925. oldNodeCopy = curNewNode
  926. }
  927. if !updated {
  928. return nil
  929. }
  930. return PatchNodeTaints(c, nodeName, oldNode, newNode)
  931. })
  932. }
  933. // RemoveTaintOffNode is for cleaning up taints temporarily added to node,
  934. // won't fail if target taint doesn't exist or has been removed.
  935. // If passed a node it'll check if there's anything to be done, if taint is not present it won't issue
  936. // any API calls.
  937. func RemoveTaintOffNode(c clientset.Interface, nodeName string, node *v1.Node, taints ...*v1.Taint) error {
  938. if len(taints) == 0 {
  939. return nil
  940. }
  941. // Short circuit for limiting amount of API calls.
  942. if node != nil {
  943. match := false
  944. for _, taint := range taints {
  945. if taintutils.TaintExists(node.Spec.Taints, taint) {
  946. match = true
  947. break
  948. }
  949. }
  950. if !match {
  951. return nil
  952. }
  953. }
  954. firstTry := true
  955. return clientretry.RetryOnConflict(UpdateTaintBackoff, func() error {
  956. var err error
  957. var oldNode *v1.Node
  958. // First we try getting node from the API server cache, as it's cheaper. If it fails
  959. // we get it from etcd to be sure to have fresh data.
  960. if firstTry {
  961. oldNode, err = c.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{ResourceVersion: "0"})
  962. firstTry = false
  963. } else {
  964. oldNode, err = c.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
  965. }
  966. if err != nil {
  967. return err
  968. }
  969. var newNode *v1.Node
  970. oldNodeCopy := oldNode
  971. updated := false
  972. for _, taint := range taints {
  973. curNewNode, ok, err := taintutils.RemoveTaint(oldNodeCopy, taint)
  974. if err != nil {
  975. return fmt.Errorf("failed to remove taint of node")
  976. }
  977. updated = updated || ok
  978. newNode = curNewNode
  979. oldNodeCopy = curNewNode
  980. }
  981. if !updated {
  982. return nil
  983. }
  984. return PatchNodeTaints(c, nodeName, oldNode, newNode)
  985. })
  986. }
  987. // PatchNodeTaints patches node's taints.
  988. func PatchNodeTaints(c clientset.Interface, nodeName string, oldNode *v1.Node, newNode *v1.Node) error {
  989. oldData, err := json.Marshal(oldNode)
  990. if err != nil {
  991. return fmt.Errorf("failed to marshal old node %#v for node %q: %v", oldNode, nodeName, err)
  992. }
  993. newTaints := newNode.Spec.Taints
  994. newNodeClone := oldNode.DeepCopy()
  995. newNodeClone.Spec.Taints = newTaints
  996. newData, err := json.Marshal(newNodeClone)
  997. if err != nil {
  998. return fmt.Errorf("failed to marshal new node %#v for node %q: %v", newNodeClone, nodeName, err)
  999. }
  1000. patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldData, newData, v1.Node{})
  1001. if err != nil {
  1002. return fmt.Errorf("failed to create patch for node %q: %v", nodeName, err)
  1003. }
  1004. _, err = c.CoreV1().Nodes().Patch(context.TODO(), nodeName, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{})
  1005. return err
  1006. }
  1007. // ComputeHash returns a hash value calculated from pod template and
  1008. // a collisionCount to avoid hash collision. The hash will be safe encoded to
  1009. // avoid bad words.
  1010. func ComputeHash(template *v1.PodTemplateSpec, collisionCount *int32) string {
  1011. podTemplateSpecHasher := fnv.New32a()
  1012. hashutil.DeepHashObject(podTemplateSpecHasher, *template)
  1013. // Add collisionCount in the hash if it exists.
  1014. if collisionCount != nil {
  1015. collisionCountBytes := make([]byte, 8)
  1016. binary.LittleEndian.PutUint32(collisionCountBytes, uint32(*collisionCount))
  1017. podTemplateSpecHasher.Write(collisionCountBytes)
  1018. }
  1019. return rand.SafeEncodeString(fmt.Sprint(podTemplateSpecHasher.Sum32()))
  1020. }
  1021. func AddOrUpdateLabelsOnNode(kubeClient clientset.Interface, nodeName string, labelsToUpdate map[string]string) error {
  1022. firstTry := true
  1023. return clientretry.RetryOnConflict(UpdateLabelBackoff, func() error {
  1024. var err error
  1025. var node *v1.Node
  1026. // First we try getting node from the API server cache, as it's cheaper. If it fails
  1027. // we get it from etcd to be sure to have fresh data.
  1028. if firstTry {
  1029. node, err = kubeClient.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{ResourceVersion: "0"})
  1030. firstTry = false
  1031. } else {
  1032. node, err = kubeClient.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
  1033. }
  1034. if err != nil {
  1035. return err
  1036. }
  1037. // Make a copy of the node and update the labels.
  1038. newNode := node.DeepCopy()
  1039. if newNode.Labels == nil {
  1040. newNode.Labels = make(map[string]string)
  1041. }
  1042. for key, value := range labelsToUpdate {
  1043. newNode.Labels[key] = value
  1044. }
  1045. oldData, err := json.Marshal(node)
  1046. if err != nil {
  1047. return fmt.Errorf("failed to marshal the existing node %#v: %v", node, err)
  1048. }
  1049. newData, err := json.Marshal(newNode)
  1050. if err != nil {
  1051. return fmt.Errorf("failed to marshal the new node %#v: %v", newNode, err)
  1052. }
  1053. patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldData, newData, &v1.Node{})
  1054. if err != nil {
  1055. return fmt.Errorf("failed to create a two-way merge patch: %v", err)
  1056. }
  1057. if _, err := kubeClient.CoreV1().Nodes().Patch(context.TODO(), node.Name, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}); err != nil {
  1058. return fmt.Errorf("failed to patch the node: %v", err)
  1059. }
  1060. return nil
  1061. })
  1062. }
  1063. func getOrCreateServiceAccount(coreClient v1core.CoreV1Interface, namespace, name string) (*v1.ServiceAccount, error) {
  1064. sa, err := coreClient.ServiceAccounts(namespace).Get(context.TODO(), name, metav1.GetOptions{})
  1065. if err == nil {
  1066. return sa, nil
  1067. }
  1068. if !apierrors.IsNotFound(err) {
  1069. return nil, err
  1070. }
  1071. // Create the namespace if we can't verify it exists.
  1072. // Tolerate errors, since we don't know whether this component has namespace creation permissions.
  1073. if _, err := coreClient.Namespaces().Get(context.TODO(), namespace, metav1.GetOptions{}); apierrors.IsNotFound(err) {
  1074. if _, err = coreClient.Namespaces().Create(context.TODO(), &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}}, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) {
  1075. klog.Warningf("create non-exist namespace %s failed:%v", namespace, err)
  1076. }
  1077. }
  1078. // Create the service account
  1079. sa, err = coreClient.ServiceAccounts(namespace).Create(context.TODO(), &v1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Namespace: namespace, Name: name}}, metav1.CreateOptions{})
  1080. if apierrors.IsAlreadyExists(err) {
  1081. // If we're racing to init and someone else already created it, re-fetch
  1082. return coreClient.ServiceAccounts(namespace).Get(context.TODO(), name, metav1.GetOptions{})
  1083. }
  1084. return sa, err
  1085. }