stateful_set_control.go 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557
  1. /*
  2. Copyright 2016 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package statefulset
  14. import (
  15. "math"
  16. "sort"
  17. "k8s.io/klog"
  18. apps "k8s.io/api/apps/v1"
  19. "k8s.io/api/core/v1"
  20. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  21. "k8s.io/client-go/tools/record"
  22. "k8s.io/kubernetes/pkg/controller/history"
  23. )
  // StatefulSetControlInterface implements the control logic for updating StatefulSets and their children Pods. It is
  // implemented as an interface to allow for extensions that provide different semantics. Currently, there is only one
  // implementation.
  type StatefulSetControlInterface interface {
  	// UpdateStatefulSet implements the control logic for Pod creation, update, and deletion, and
  	// persistent volume creation, update, and deletion.
  	// If an implementation returns a non-nil error, the invocation will be retried using a rate-limited strategy.
  	// Implementors should sink any errors that they do not wish to trigger a retry, and they may feel free to
  	// exit exceptionally at any point provided they wish the update to be re-run at a later point in time.
  	UpdateStatefulSet(set *apps.StatefulSet, pods []*v1.Pod) error
  	// ListRevisions returns an array of the ControllerRevisions that represent the revisions of set. If the returned
  	// error is nil, the returned slice of ControllerRevisions is valid.
  	ListRevisions(set *apps.StatefulSet) ([]*apps.ControllerRevision, error)
  	// AdoptOrphanRevisions adopts any orphaned ControllerRevisions that match set's Selector. If all adoptions are
  	// successful the returned error is nil.
  	AdoptOrphanRevisions(set *apps.StatefulSet, revisions []*apps.ControllerRevision) error
  }
  40. // NewDefaultStatefulSetControl returns a new instance of the default implementation StatefulSetControlInterface that
  41. // implements the documented semantics for StatefulSets. podControl is the PodControlInterface used to create, update,
  42. // and delete Pods and to create PersistentVolumeClaims. statusUpdater is the StatefulSetStatusUpdaterInterface used
  43. // to update the status of StatefulSets. You should use an instance returned from NewRealStatefulPodControl() for any
  44. // scenario other than testing.
  45. func NewDefaultStatefulSetControl(
  46. podControl StatefulPodControlInterface,
  47. statusUpdater StatefulSetStatusUpdaterInterface,
  48. controllerHistory history.Interface,
  49. recorder record.EventRecorder) StatefulSetControlInterface {
  50. return &defaultStatefulSetControl{podControl, statusUpdater, controllerHistory, recorder}
  51. }
  // defaultStatefulSetControl is the default implementation of StatefulSetControlInterface.
  // NOTE: NewDefaultStatefulSetControl initializes this struct positionally, so the field order must not change.
  type defaultStatefulSetControl struct {
  	// podControl creates, updates, and deletes the Pods of a set.
  	podControl StatefulPodControlInterface
  	// statusUpdater persists a computed StatefulSetStatus.
  	statusUpdater StatefulSetStatusUpdaterInterface
  	// controllerHistory lists, adopts, creates, updates, and deletes ControllerRevisions.
  	controllerHistory history.Interface
  	// recorder emits events about the set (e.g. when a failed Pod is recreated).
  	recorder record.EventRecorder
  }
  58. // UpdateStatefulSet executes the core logic loop for a stateful set, applying the predictable and
  59. // consistent monotonic update strategy by default - scale up proceeds in ordinal order, no new pod
  60. // is created while any pod is unhealthy, and pods are terminated in descending order. The burst
  61. // strategy allows these constraints to be relaxed - pods will be created and deleted eagerly and
  62. // in no particular order. Clients using the burst strategy should be careful to ensure they
  63. // understand the consistency implications of having unpredictable numbers of pods available.
  64. func (ssc *defaultStatefulSetControl) UpdateStatefulSet(set *apps.StatefulSet, pods []*v1.Pod) error {
  65. // list all revisions and sort them
  66. revisions, err := ssc.ListRevisions(set)
  67. if err != nil {
  68. return err
  69. }
  70. history.SortControllerRevisions(revisions)
  71. // get the current, and update revisions
  72. currentRevision, updateRevision, collisionCount, err := ssc.getStatefulSetRevisions(set, revisions)
  73. if err != nil {
  74. return err
  75. }
  76. // perform the main update function and get the status
  77. status, err := ssc.updateStatefulSet(set, currentRevision, updateRevision, collisionCount, pods)
  78. if err != nil {
  79. return err
  80. }
  81. // update the set's status
  82. err = ssc.updateStatefulSetStatus(set, status)
  83. if err != nil {
  84. return err
  85. }
  86. klog.V(4).Infof("StatefulSet %s/%s pod status replicas=%d ready=%d current=%d updated=%d",
  87. set.Namespace,
  88. set.Name,
  89. status.Replicas,
  90. status.ReadyReplicas,
  91. status.CurrentReplicas,
  92. status.UpdatedReplicas)
  93. klog.V(4).Infof("StatefulSet %s/%s revisions current=%s update=%s",
  94. set.Namespace,
  95. set.Name,
  96. status.CurrentRevision,
  97. status.UpdateRevision)
  98. // maintain the set's revision history limit
  99. return ssc.truncateHistory(set, pods, revisions, currentRevision, updateRevision)
  100. }
  101. func (ssc *defaultStatefulSetControl) ListRevisions(set *apps.StatefulSet) ([]*apps.ControllerRevision, error) {
  102. selector, err := metav1.LabelSelectorAsSelector(set.Spec.Selector)
  103. if err != nil {
  104. return nil, err
  105. }
  106. return ssc.controllerHistory.ListControllerRevisions(set, selector)
  107. }
  108. func (ssc *defaultStatefulSetControl) AdoptOrphanRevisions(
  109. set *apps.StatefulSet,
  110. revisions []*apps.ControllerRevision) error {
  111. for i := range revisions {
  112. adopted, err := ssc.controllerHistory.AdoptControllerRevision(set, controllerKind, revisions[i])
  113. if err != nil {
  114. return err
  115. }
  116. revisions[i] = adopted
  117. }
  118. return nil
  119. }
  120. // truncateHistory truncates any non-live ControllerRevisions in revisions from set's history. The UpdateRevision and
  121. // CurrentRevision in set's Status are considered to be live. Any revisions associated with the Pods in pods are also
  122. // considered to be live. Non-live revisions are deleted, starting with the revision with the lowest Revision, until
  123. // only RevisionHistoryLimit revisions remain. If the returned error is nil the operation was successful. This method
  124. // expects that revisions is sorted when supplied.
  125. func (ssc *defaultStatefulSetControl) truncateHistory(
  126. set *apps.StatefulSet,
  127. pods []*v1.Pod,
  128. revisions []*apps.ControllerRevision,
  129. current *apps.ControllerRevision,
  130. update *apps.ControllerRevision) error {
  131. history := make([]*apps.ControllerRevision, 0, len(revisions))
  132. // mark all live revisions
  133. live := map[string]bool{current.Name: true, update.Name: true}
  134. for i := range pods {
  135. live[getPodRevision(pods[i])] = true
  136. }
  137. // collect live revisions and historic revisions
  138. for i := range revisions {
  139. if !live[revisions[i].Name] {
  140. history = append(history, revisions[i])
  141. }
  142. }
  143. historyLen := len(history)
  144. historyLimit := int(*set.Spec.RevisionHistoryLimit)
  145. if historyLen <= historyLimit {
  146. return nil
  147. }
  148. // delete any non-live history to maintain the revision limit.
  149. history = history[:(historyLen - historyLimit)]
  150. for i := 0; i < len(history); i++ {
  151. if err := ssc.controllerHistory.DeleteControllerRevision(history[i]); err != nil {
  152. return err
  153. }
  154. }
  155. return nil
  156. }
  157. // getStatefulSetRevisions returns the current and update ControllerRevisions for set. It also
  158. // returns a collision count that records the number of name collisions set saw when creating
  159. // new ControllerRevisions. This count is incremented on every name collision and is used in
  160. // building the ControllerRevision names for name collision avoidance. This method may create
  161. // a new revision, or modify the Revision of an existing revision if an update to set is detected.
  162. // This method expects that revisions is sorted when supplied.
  163. func (ssc *defaultStatefulSetControl) getStatefulSetRevisions(
  164. set *apps.StatefulSet,
  165. revisions []*apps.ControllerRevision) (*apps.ControllerRevision, *apps.ControllerRevision, int32, error) {
  166. var currentRevision, updateRevision *apps.ControllerRevision
  167. revisionCount := len(revisions)
  168. history.SortControllerRevisions(revisions)
  169. // Use a local copy of set.Status.CollisionCount to avoid modifying set.Status directly.
  170. // This copy is returned so the value gets carried over to set.Status in updateStatefulSet.
  171. var collisionCount int32
  172. if set.Status.CollisionCount != nil {
  173. collisionCount = *set.Status.CollisionCount
  174. }
  175. // create a new revision from the current set
  176. updateRevision, err := newRevision(set, nextRevision(revisions), &collisionCount)
  177. if err != nil {
  178. return nil, nil, collisionCount, err
  179. }
  180. // find any equivalent revisions
  181. equalRevisions := history.FindEqualRevisions(revisions, updateRevision)
  182. equalCount := len(equalRevisions)
  183. if equalCount > 0 && history.EqualRevision(revisions[revisionCount-1], equalRevisions[equalCount-1]) {
  184. // if the equivalent revision is immediately prior the update revision has not changed
  185. updateRevision = revisions[revisionCount-1]
  186. } else if equalCount > 0 {
  187. // if the equivalent revision is not immediately prior we will roll back by incrementing the
  188. // Revision of the equivalent revision
  189. updateRevision, err = ssc.controllerHistory.UpdateControllerRevision(
  190. equalRevisions[equalCount-1],
  191. updateRevision.Revision)
  192. if err != nil {
  193. return nil, nil, collisionCount, err
  194. }
  195. } else {
  196. //if there is no equivalent revision we create a new one
  197. updateRevision, err = ssc.controllerHistory.CreateControllerRevision(set, updateRevision, &collisionCount)
  198. if err != nil {
  199. return nil, nil, collisionCount, err
  200. }
  201. }
  202. // attempt to find the revision that corresponds to the current revision
  203. for i := range revisions {
  204. if revisions[i].Name == set.Status.CurrentRevision {
  205. currentRevision = revisions[i]
  206. }
  207. }
  208. // if the current revision is nil we initialize the history by setting it to the update revision
  209. if currentRevision == nil {
  210. currentRevision = updateRevision
  211. }
  212. return currentRevision, updateRevision, collisionCount, nil
  213. }
  214. // updateStatefulSet performs the update function for a StatefulSet. This method creates, updates, and deletes Pods in
  215. // the set in order to conform the system to the target state for the set. The target state always contains
  216. // set.Spec.Replicas Pods with a Ready Condition. If the UpdateStrategy.Type for the set is
  217. // RollingUpdateStatefulSetStrategyType then all Pods in the set must be at set.Status.CurrentRevision.
  218. // If the UpdateStrategy.Type for the set is OnDeleteStatefulSetStrategyType, the target state implies nothing about
  219. // the revisions of Pods in the set. If the UpdateStrategy.Type for the set is PartitionStatefulSetStrategyType, then
  220. // all Pods with ordinal less than UpdateStrategy.Partition.Ordinal must be at Status.CurrentRevision and all other
  221. // Pods must be at Status.UpdateRevision. If the returned error is nil, the returned StatefulSetStatus is valid and the
  222. // update must be recorded. If the error is not nil, the method should be retried until successful.
  223. func (ssc *defaultStatefulSetControl) updateStatefulSet(
  224. set *apps.StatefulSet,
  225. currentRevision *apps.ControllerRevision,
  226. updateRevision *apps.ControllerRevision,
  227. collisionCount int32,
  228. pods []*v1.Pod) (*apps.StatefulSetStatus, error) {
  229. // get the current and update revisions of the set.
  230. currentSet, err := ApplyRevision(set, currentRevision)
  231. if err != nil {
  232. return nil, err
  233. }
  234. updateSet, err := ApplyRevision(set, updateRevision)
  235. if err != nil {
  236. return nil, err
  237. }
  238. // set the generation, and revisions in the returned status
  239. status := apps.StatefulSetStatus{}
  240. status.ObservedGeneration = set.Generation
  241. status.CurrentRevision = currentRevision.Name
  242. status.UpdateRevision = updateRevision.Name
  243. status.CollisionCount = new(int32)
  244. *status.CollisionCount = collisionCount
  245. replicaCount := int(*set.Spec.Replicas)
  246. // slice that will contain all Pods such that 0 <= getOrdinal(pod) < set.Spec.Replicas
  247. replicas := make([]*v1.Pod, replicaCount)
  248. // slice that will contain all Pods such that set.Spec.Replicas <= getOrdinal(pod)
  249. condemned := make([]*v1.Pod, 0, len(pods))
  250. unhealthy := 0
  251. firstUnhealthyOrdinal := math.MaxInt32
  252. var firstUnhealthyPod *v1.Pod
  253. // First we partition pods into two lists valid replicas and condemned Pods
  254. for i := range pods {
  255. status.Replicas++
  256. // count the number of running and ready replicas
  257. if isRunningAndReady(pods[i]) {
  258. status.ReadyReplicas++
  259. }
  260. // count the number of current and update replicas
  261. if isCreated(pods[i]) && !isTerminating(pods[i]) {
  262. if getPodRevision(pods[i]) == currentRevision.Name {
  263. status.CurrentReplicas++
  264. }
  265. if getPodRevision(pods[i]) == updateRevision.Name {
  266. status.UpdatedReplicas++
  267. }
  268. }
  269. if ord := getOrdinal(pods[i]); 0 <= ord && ord < replicaCount {
  270. // if the ordinal of the pod is within the range of the current number of replicas,
  271. // insert it at the indirection of its ordinal
  272. replicas[ord] = pods[i]
  273. } else if ord >= replicaCount {
  274. // if the ordinal is greater than the number of replicas add it to the condemned list
  275. condemned = append(condemned, pods[i])
  276. }
  277. // If the ordinal could not be parsed (ord < 0), ignore the Pod.
  278. }
  279. // for any empty indices in the sequence [0,set.Spec.Replicas) create a new Pod at the correct revision
  280. for ord := 0; ord < replicaCount; ord++ {
  281. if replicas[ord] == nil {
  282. replicas[ord] = newVersionedStatefulSetPod(
  283. currentSet,
  284. updateSet,
  285. currentRevision.Name,
  286. updateRevision.Name, ord)
  287. }
  288. }
  289. // sort the condemned Pods by their ordinals
  290. sort.Sort(ascendingOrdinal(condemned))
  291. // find the first unhealthy Pod
  292. for i := range replicas {
  293. if !isHealthy(replicas[i]) {
  294. unhealthy++
  295. if ord := getOrdinal(replicas[i]); ord < firstUnhealthyOrdinal {
  296. firstUnhealthyOrdinal = ord
  297. firstUnhealthyPod = replicas[i]
  298. }
  299. }
  300. }
  301. for i := range condemned {
  302. if !isHealthy(condemned[i]) {
  303. unhealthy++
  304. if ord := getOrdinal(condemned[i]); ord < firstUnhealthyOrdinal {
  305. firstUnhealthyOrdinal = ord
  306. firstUnhealthyPod = condemned[i]
  307. }
  308. }
  309. }
  310. if unhealthy > 0 {
  311. klog.V(4).Infof("StatefulSet %s/%s has %d unhealthy Pods starting with %s",
  312. set.Namespace,
  313. set.Name,
  314. unhealthy,
  315. firstUnhealthyPod.Name)
  316. }
  317. // If the StatefulSet is being deleted, don't do anything other than updating
  318. // status.
  319. if set.DeletionTimestamp != nil {
  320. return &status, nil
  321. }
  322. monotonic := !allowsBurst(set)
  323. // Examine each replica with respect to its ordinal
  324. for i := range replicas {
  325. // delete and recreate failed pods
  326. if isFailed(replicas[i]) {
  327. ssc.recorder.Eventf(set, v1.EventTypeWarning, "RecreatingFailedPod",
  328. "StatefulSet %s/%s is recreating failed Pod %s",
  329. set.Namespace,
  330. set.Name,
  331. replicas[i].Name)
  332. if err := ssc.podControl.DeleteStatefulPod(set, replicas[i]); err != nil {
  333. return &status, err
  334. }
  335. if getPodRevision(replicas[i]) == currentRevision.Name {
  336. status.CurrentReplicas--
  337. }
  338. if getPodRevision(replicas[i]) == updateRevision.Name {
  339. status.UpdatedReplicas--
  340. }
  341. status.Replicas--
  342. replicas[i] = newVersionedStatefulSetPod(
  343. currentSet,
  344. updateSet,
  345. currentRevision.Name,
  346. updateRevision.Name,
  347. i)
  348. }
  349. // If we find a Pod that has not been created we create the Pod
  350. if !isCreated(replicas[i]) {
  351. if err := ssc.podControl.CreateStatefulPod(set, replicas[i]); err != nil {
  352. return &status, err
  353. }
  354. status.Replicas++
  355. if getPodRevision(replicas[i]) == currentRevision.Name {
  356. status.CurrentReplicas++
  357. }
  358. if getPodRevision(replicas[i]) == updateRevision.Name {
  359. status.UpdatedReplicas++
  360. }
  361. // if the set does not allow bursting, return immediately
  362. if monotonic {
  363. return &status, nil
  364. }
  365. // pod created, no more work possible for this round
  366. continue
  367. }
  368. // If we find a Pod that is currently terminating, we must wait until graceful deletion
  369. // completes before we continue to make progress.
  370. if isTerminating(replicas[i]) && monotonic {
  371. klog.V(4).Infof(
  372. "StatefulSet %s/%s is waiting for Pod %s to Terminate",
  373. set.Namespace,
  374. set.Name,
  375. replicas[i].Name)
  376. return &status, nil
  377. }
  378. // If we have a Pod that has been created but is not running and ready we can not make progress.
  379. // We must ensure that all for each Pod, when we create it, all of its predecessors, with respect to its
  380. // ordinal, are Running and Ready.
  381. if !isRunningAndReady(replicas[i]) && monotonic {
  382. klog.V(4).Infof(
  383. "StatefulSet %s/%s is waiting for Pod %s to be Running and Ready",
  384. set.Namespace,
  385. set.Name,
  386. replicas[i].Name)
  387. return &status, nil
  388. }
  389. // Enforce the StatefulSet invariants
  390. if identityMatches(set, replicas[i]) && storageMatches(set, replicas[i]) {
  391. continue
  392. }
  393. // Make a deep copy so we don't mutate the shared cache
  394. replica := replicas[i].DeepCopy()
  395. if err := ssc.podControl.UpdateStatefulPod(updateSet, replica); err != nil {
  396. return &status, err
  397. }
  398. }
  399. // At this point, all of the current Replicas are Running and Ready, we can consider termination.
  400. // We will wait for all predecessors to be Running and Ready prior to attempting a deletion.
  401. // We will terminate Pods in a monotonically decreasing order over [len(pods),set.Spec.Replicas).
  402. // Note that we do not resurrect Pods in this interval. Also note that scaling will take precedence over
  403. // updates.
  404. for target := len(condemned) - 1; target >= 0; target-- {
  405. // wait for terminating pods to expire
  406. if isTerminating(condemned[target]) {
  407. klog.V(4).Infof(
  408. "StatefulSet %s/%s is waiting for Pod %s to Terminate prior to scale down",
  409. set.Namespace,
  410. set.Name,
  411. condemned[target].Name)
  412. // block if we are in monotonic mode
  413. if monotonic {
  414. return &status, nil
  415. }
  416. continue
  417. }
  418. // if we are in monotonic mode and the condemned target is not the first unhealthy Pod block
  419. if !isRunningAndReady(condemned[target]) && monotonic && condemned[target] != firstUnhealthyPod {
  420. klog.V(4).Infof(
  421. "StatefulSet %s/%s is waiting for Pod %s to be Running and Ready prior to scale down",
  422. set.Namespace,
  423. set.Name,
  424. firstUnhealthyPod.Name)
  425. return &status, nil
  426. }
  427. klog.V(2).Infof("StatefulSet %s/%s terminating Pod %s for scale down",
  428. set.Namespace,
  429. set.Name,
  430. condemned[target].Name)
  431. if err := ssc.podControl.DeleteStatefulPod(set, condemned[target]); err != nil {
  432. return &status, err
  433. }
  434. if getPodRevision(condemned[target]) == currentRevision.Name {
  435. status.CurrentReplicas--
  436. }
  437. if getPodRevision(condemned[target]) == updateRevision.Name {
  438. status.UpdatedReplicas--
  439. }
  440. if monotonic {
  441. return &status, nil
  442. }
  443. }
  444. // for the OnDelete strategy we short circuit. Pods will be updated when they are manually deleted.
  445. if set.Spec.UpdateStrategy.Type == apps.OnDeleteStatefulSetStrategyType {
  446. return &status, nil
  447. }
  448. // we compute the minimum ordinal of the target sequence for a destructive update based on the strategy.
  449. updateMin := 0
  450. if set.Spec.UpdateStrategy.RollingUpdate != nil {
  451. updateMin = int(*set.Spec.UpdateStrategy.RollingUpdate.Partition)
  452. }
  453. // we terminate the Pod with the largest ordinal that does not match the update revision.
  454. for target := len(replicas) - 1; target >= updateMin; target-- {
  455. // delete the Pod if it is not already terminating and does not match the update revision.
  456. if getPodRevision(replicas[target]) != updateRevision.Name && !isTerminating(replicas[target]) {
  457. klog.V(2).Infof("StatefulSet %s/%s terminating Pod %s for update",
  458. set.Namespace,
  459. set.Name,
  460. replicas[target].Name)
  461. err := ssc.podControl.DeleteStatefulPod(set, replicas[target])
  462. status.CurrentReplicas--
  463. return &status, err
  464. }
  465. // wait for unhealthy Pods on update
  466. if !isHealthy(replicas[target]) {
  467. klog.V(4).Infof(
  468. "StatefulSet %s/%s is waiting for Pod %s to update",
  469. set.Namespace,
  470. set.Name,
  471. replicas[target].Name)
  472. return &status, nil
  473. }
  474. }
  475. return &status, nil
  476. }
  477. // updateStatefulSetStatus updates set's Status to be equal to status. If status indicates a complete update, it is
  478. // mutated to indicate completion. If status is semantically equivalent to set's Status no update is performed. If the
  479. // returned error is nil, the update is successful.
  480. func (ssc *defaultStatefulSetControl) updateStatefulSetStatus(
  481. set *apps.StatefulSet,
  482. status *apps.StatefulSetStatus) error {
  483. // complete any in progress rolling update if necessary
  484. completeRollingUpdate(set, status)
  485. // if the status is not inconsistent do not perform an update
  486. if !inconsistentStatus(set, status) {
  487. return nil
  488. }
  489. // copy set and update its status
  490. set = set.DeepCopy()
  491. if err := ssc.statusUpdater.UpdateStatefulSetStatus(set, status); err != nil {
  492. return err
  493. }
  494. return nil
  495. }
  496. var _ StatefulSetControlInterface = &defaultStatefulSetControl{}