stateful_set_control.go 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558
  1. /*
  2. Copyright 2016 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package statefulset
  14. import (
  15. "math"
  16. "sort"
  17. "k8s.io/klog"
  18. apps "k8s.io/api/apps/v1"
  19. "k8s.io/api/core/v1"
  20. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  21. "k8s.io/client-go/tools/record"
  22. "k8s.io/kubernetes/pkg/controller/history"
  23. )
// StatefulSetControlInterface implements the control logic for updating StatefulSets and their children Pods. It is
// implemented as an interface to allow for extensions that provide different semantics. Currently, there is only one
// implementation.
type StatefulSetControlInterface interface {
	// UpdateStatefulSet implements the control logic for Pod creation, update, and deletion, and
	// persistent volume creation, update, and deletion.
	// If an implementation returns a non-nil error, the invocation will be retried using a rate-limited strategy.
	// Implementors should sink any errors that they do not wish to trigger a retry, and they may feel free to
	// exit exceptionally at any point provided they wish the update to be re-run at a later point in time.
	UpdateStatefulSet(set *apps.StatefulSet, pods []*v1.Pod) error
	// ListRevisions returns a slice of the ControllerRevisions that represent the revisions of set. If the returned
	// error is nil, the returned slice of ControllerRevisions is valid.
	ListRevisions(set *apps.StatefulSet) ([]*apps.ControllerRevision, error)
	// AdoptOrphanRevisions adopts any orphaned ControllerRevisions that match set's Selector. If all adoptions are
	// successful the returned error is nil.
	AdoptOrphanRevisions(set *apps.StatefulSet, revisions []*apps.ControllerRevision) error
}
  40. // NewDefaultStatefulSetControl returns a new instance of the default implementation StatefulSetControlInterface that
  41. // implements the documented semantics for StatefulSets. podControl is the PodControlInterface used to create, update,
  42. // and delete Pods and to create PersistentVolumeClaims. statusUpdater is the StatefulSetStatusUpdaterInterface used
  43. // to update the status of StatefulSets. You should use an instance returned from NewRealStatefulPodControl() for any
  44. // scenario other than testing.
  45. func NewDefaultStatefulSetControl(
  46. podControl StatefulPodControlInterface,
  47. statusUpdater StatefulSetStatusUpdaterInterface,
  48. controllerHistory history.Interface,
  49. recorder record.EventRecorder) StatefulSetControlInterface {
  50. return &defaultStatefulSetControl{podControl, statusUpdater, controllerHistory, recorder}
  51. }
  52. type defaultStatefulSetControl struct {
  53. podControl StatefulPodControlInterface
  54. statusUpdater StatefulSetStatusUpdaterInterface
  55. controllerHistory history.Interface
  56. recorder record.EventRecorder
  57. }
  58. // UpdateStatefulSet executes the core logic loop for a stateful set, applying the predictable and
  59. // consistent monotonic update strategy by default - scale up proceeds in ordinal order, no new pod
  60. // is created while any pod is unhealthy, and pods are terminated in descending order. The burst
  61. // strategy allows these constraints to be relaxed - pods will be created and deleted eagerly and
  62. // in no particular order. Clients using the burst strategy should be careful to ensure they
  63. // understand the consistency implications of having unpredictable numbers of pods available.
  64. func (ssc *defaultStatefulSetControl) UpdateStatefulSet(set *apps.StatefulSet, pods []*v1.Pod) error {
  65. // list all revisions and sort them
  66. revisions, err := ssc.ListRevisions(set)
  67. if err != nil {
  68. return err
  69. }
  70. history.SortControllerRevisions(revisions)
  71. // get the current, and update revisions
  72. currentRevision, updateRevision, collisionCount, err := ssc.getStatefulSetRevisions(set, revisions)
  73. if err != nil {
  74. return err
  75. }
  76. // perform the main update function and get the status
  77. status, err := ssc.updateStatefulSet(set, currentRevision, updateRevision, collisionCount, pods)
  78. if err != nil {
  79. return err
  80. }
  81. // update the set's status
  82. err = ssc.updateStatefulSetStatus(set, status)
  83. if err != nil {
  84. return err
  85. }
  86. klog.V(4).Infof("StatefulSet %s/%s pod status replicas=%d ready=%d current=%d updated=%d",
  87. set.Namespace,
  88. set.Name,
  89. status.Replicas,
  90. status.ReadyReplicas,
  91. status.CurrentReplicas,
  92. status.UpdatedReplicas)
  93. klog.V(4).Infof("StatefulSet %s/%s revisions current=%s update=%s",
  94. set.Namespace,
  95. set.Name,
  96. status.CurrentRevision,
  97. status.UpdateRevision)
  98. // maintain the set's revision history limit
  99. return ssc.truncateHistory(set, pods, revisions, currentRevision, updateRevision)
  100. }
  101. func (ssc *defaultStatefulSetControl) ListRevisions(set *apps.StatefulSet) ([]*apps.ControllerRevision, error) {
  102. selector, err := metav1.LabelSelectorAsSelector(set.Spec.Selector)
  103. if err != nil {
  104. return nil, err
  105. }
  106. return ssc.controllerHistory.ListControllerRevisions(set, selector)
  107. }
  108. func (ssc *defaultStatefulSetControl) AdoptOrphanRevisions(
  109. set *apps.StatefulSet,
  110. revisions []*apps.ControllerRevision) error {
  111. for i := range revisions {
  112. adopted, err := ssc.controllerHistory.AdoptControllerRevision(set, controllerKind, revisions[i])
  113. if err != nil {
  114. return err
  115. }
  116. revisions[i] = adopted
  117. }
  118. return nil
  119. }
  120. // truncateHistory truncates any non-live ControllerRevisions in revisions from set's history. The UpdateRevision and
  121. // CurrentRevision in set's Status are considered to be live. Any revisions associated with the Pods in pods are also
  122. // considered to be live. Non-live revisions are deleted, starting with the revision with the lowest Revision, until
  123. // only RevisionHistoryLimit revisions remain. If the returned error is nil the operation was successful. This method
  124. // expects that revisions is sorted when supplied.
  125. func (ssc *defaultStatefulSetControl) truncateHistory(
  126. set *apps.StatefulSet,
  127. pods []*v1.Pod,
  128. revisions []*apps.ControllerRevision,
  129. current *apps.ControllerRevision,
  130. update *apps.ControllerRevision) error {
  131. history := make([]*apps.ControllerRevision, 0, len(revisions))
  132. // mark all live revisions
  133. live := map[string]bool{current.Name: true, update.Name: true}
  134. for i := range pods {
  135. live[getPodRevision(pods[i])] = true
  136. }
  137. // collect live revisions and historic revisions
  138. for i := range revisions {
  139. if !live[revisions[i].Name] {
  140. history = append(history, revisions[i])
  141. }
  142. }
  143. historyLen := len(history)
  144. historyLimit := int(*set.Spec.RevisionHistoryLimit)
  145. if historyLen <= historyLimit {
  146. return nil
  147. }
  148. // delete any non-live history to maintain the revision limit.
  149. history = history[:(historyLen - historyLimit)]
  150. for i := 0; i < len(history); i++ {
  151. if err := ssc.controllerHistory.DeleteControllerRevision(history[i]); err != nil {
  152. return err
  153. }
  154. }
  155. return nil
  156. }
  157. // getStatefulSetRevisions returns the current and update ControllerRevisions for set. It also
  158. // returns a collision count that records the number of name collisions set saw when creating
  159. // new ControllerRevisions. This count is incremented on every name collision and is used in
  160. // building the ControllerRevision names for name collision avoidance. This method may create
  161. // a new revision, or modify the Revision of an existing revision if an update to set is detected.
  162. // This method expects that revisions is sorted when supplied.
  163. func (ssc *defaultStatefulSetControl) getStatefulSetRevisions(
  164. set *apps.StatefulSet,
  165. revisions []*apps.ControllerRevision) (*apps.ControllerRevision, *apps.ControllerRevision, int32, error) {
  166. var currentRevision, updateRevision *apps.ControllerRevision
  167. revisionCount := len(revisions)
  168. history.SortControllerRevisions(revisions)
  169. // Use a local copy of set.Status.CollisionCount to avoid modifying set.Status directly.
  170. // This copy is returned so the value gets carried over to set.Status in updateStatefulSet.
  171. var collisionCount int32
  172. if set.Status.CollisionCount != nil {
  173. collisionCount = *set.Status.CollisionCount
  174. }
  175. // create a new revision from the current set
  176. updateRevision, err := newRevision(set, nextRevision(revisions), &collisionCount)
  177. if err != nil {
  178. return nil, nil, collisionCount, err
  179. }
  180. // find any equivalent revisions
  181. equalRevisions := history.FindEqualRevisions(revisions, updateRevision)
  182. equalCount := len(equalRevisions)
  183. if equalCount > 0 && history.EqualRevision(revisions[revisionCount-1], equalRevisions[equalCount-1]) {
  184. // if the equivalent revision is immediately prior the update revision has not changed
  185. updateRevision = revisions[revisionCount-1]
  186. } else if equalCount > 0 {
  187. // if the equivalent revision is not immediately prior we will roll back by incrementing the
  188. // Revision of the equivalent revision
  189. updateRevision, err = ssc.controllerHistory.UpdateControllerRevision(
  190. equalRevisions[equalCount-1],
  191. updateRevision.Revision)
  192. if err != nil {
  193. return nil, nil, collisionCount, err
  194. }
  195. } else {
  196. //if there is no equivalent revision we create a new one
  197. updateRevision, err = ssc.controllerHistory.CreateControllerRevision(set, updateRevision, &collisionCount)
  198. if err != nil {
  199. return nil, nil, collisionCount, err
  200. }
  201. }
  202. // attempt to find the revision that corresponds to the current revision
  203. for i := range revisions {
  204. if revisions[i].Name == set.Status.CurrentRevision {
  205. currentRevision = revisions[i]
  206. break
  207. }
  208. }
  209. // if the current revision is nil we initialize the history by setting it to the update revision
  210. if currentRevision == nil {
  211. currentRevision = updateRevision
  212. }
  213. return currentRevision, updateRevision, collisionCount, nil
  214. }
// updateStatefulSet performs the update function for a StatefulSet. This method creates, updates, and deletes Pods in
// the set in order to conform the system to the target state for the set. The target state always contains
// set.Spec.Replicas Pods with a Ready Condition.
//
// The work is done in phases, and in monotonic mode (the set does not allow bursting) the method returns after the
// first action it takes or the first Pod it has to wait for:
//
//  1. Tally the status counters and partition pods into replicas (ordinal in [0, set.Spec.Replicas)) and condemned
//     Pods (ordinal >= set.Spec.Replicas). Missing replica ordinals are filled with new, not yet created Pods.
//  2. If the set is being deleted, return the tallied status without touching any Pod.
//  3. For each replica in ordinal order: recreate failed Pods, create missing Pods, wait for terminating or
//     not-ready Pods (monotonic only), and update Pods whose identity or storage does not match the set.
//  4. Delete condemned Pods from the highest ordinal down.
//  5. Unless the UpdateStrategy.Type is OnDeleteStatefulSetStrategyType, delete the Pod with the largest ordinal
//     that is not at updateRevision and is not already terminating. Only ordinals greater than or equal to
//     UpdateStrategy.RollingUpdate.Partition (0 when RollingUpdate is nil) are considered, so Pods below the
//     partition are left at whatever revision they have.
//
// If the returned error is nil, the returned StatefulSetStatus is valid and the update must be recorded. If the error
// is not nil, the method should be retried until successful. A nil status is returned only when a revision cannot be
// applied to set; every later error path returns the partially computed status alongside the error.
func (ssc *defaultStatefulSetControl) updateStatefulSet(
	set *apps.StatefulSet,
	currentRevision *apps.ControllerRevision,
	updateRevision *apps.ControllerRevision,
	collisionCount int32,
	pods []*v1.Pod) (*apps.StatefulSetStatus, error) {
	// get the current and update revisions of the set.
	currentSet, err := ApplyRevision(set, currentRevision)
	if err != nil {
		return nil, err
	}
	updateSet, err := ApplyRevision(set, updateRevision)
	if err != nil {
		return nil, err
	}

	// set the generation, and revisions in the returned status
	status := apps.StatefulSetStatus{}
	status.ObservedGeneration = set.Generation
	status.CurrentRevision = currentRevision.Name
	status.UpdateRevision = updateRevision.Name
	status.CollisionCount = new(int32)
	*status.CollisionCount = collisionCount

	// NOTE(review): dereferences set.Spec.Replicas without a nil check; assumes it has been defaulted - confirm.
	replicaCount := int(*set.Spec.Replicas)
	// slice that will contain all Pods such that 0 <= getOrdinal(pod) < set.Spec.Replicas
	replicas := make([]*v1.Pod, replicaCount)
	// slice that will contain all Pods such that set.Spec.Replicas <= getOrdinal(pod)
	condemned := make([]*v1.Pod, 0, len(pods))
	unhealthy := 0
	firstUnhealthyOrdinal := math.MaxInt32
	var firstUnhealthyPod *v1.Pod

	// First we partition pods into two lists: valid replicas and condemned Pods
	for i := range pods {
		status.Replicas++

		// count the number of running and ready replicas
		if isRunningAndReady(pods[i]) {
			status.ReadyReplicas++
		}

		// count the number of current and update replicas
		if isCreated(pods[i]) && !isTerminating(pods[i]) {
			if getPodRevision(pods[i]) == currentRevision.Name {
				status.CurrentReplicas++
			}
			if getPodRevision(pods[i]) == updateRevision.Name {
				status.UpdatedReplicas++
			}
		}

		if ord := getOrdinal(pods[i]); 0 <= ord && ord < replicaCount {
			// if the ordinal of the pod is within the range of the current number of replicas,
			// insert it at the index given by its ordinal
			replicas[ord] = pods[i]

		} else if ord >= replicaCount {
			// if the ordinal is greater than or equal to the number of replicas add it to the condemned list
			condemned = append(condemned, pods[i])
		}
		// If the ordinal could not be parsed (ord < 0), ignore the Pod.
	}

	// for any empty indices in the sequence [0,set.Spec.Replicas) create a new Pod at the correct revision
	// (the Pod object is only constructed here; it is submitted later by the replica loop)
	for ord := 0; ord < replicaCount; ord++ {
		if replicas[ord] == nil {
			replicas[ord] = newVersionedStatefulSetPod(
				currentSet,
				updateSet,
				currentRevision.Name,
				updateRevision.Name, ord)
		}
	}

	// sort the condemned Pods by their ordinals
	sort.Sort(ascendingOrdinal(condemned))

	// find the first unhealthy Pod, i.e. the unhealthy Pod with the lowest ordinal across replicas and condemned
	for i := range replicas {
		if !isHealthy(replicas[i]) {
			unhealthy++
			if ord := getOrdinal(replicas[i]); ord < firstUnhealthyOrdinal {
				firstUnhealthyOrdinal = ord
				firstUnhealthyPod = replicas[i]
			}
		}
	}

	for i := range condemned {
		if !isHealthy(condemned[i]) {
			unhealthy++
			if ord := getOrdinal(condemned[i]); ord < firstUnhealthyOrdinal {
				firstUnhealthyOrdinal = ord
				firstUnhealthyPod = condemned[i]
			}
		}
	}

	if unhealthy > 0 {
		klog.V(4).Infof("StatefulSet %s/%s has %d unhealthy Pods starting with %s",
			set.Namespace,
			set.Name,
			unhealthy,
			firstUnhealthyPod.Name)
	}

	// If the StatefulSet is being deleted, don't do anything other than updating
	// status.
	if set.DeletionTimestamp != nil {
		return &status, nil
	}

	// monotonic is true when the set does not allow bursting: Pods are then handled strictly one at a time
	monotonic := !allowsBurst(set)

	// Examine each replica with respect to its ordinal
	for i := range replicas {
		// delete and recreate failed pods
		if isFailed(replicas[i]) {
			ssc.recorder.Eventf(set, v1.EventTypeWarning, "RecreatingFailedPod",
				"StatefulSet %s/%s is recreating failed Pod %s",
				set.Namespace,
				set.Name,
				replicas[i].Name)
			if err := ssc.podControl.DeleteStatefulPod(set, replicas[i]); err != nil {
				return &status, err
			}
			// the failed Pod is gone: take it back out of the counters tallied above
			if getPodRevision(replicas[i]) == currentRevision.Name {
				status.CurrentReplicas--
			}
			if getPodRevision(replicas[i]) == updateRevision.Name {
				status.UpdatedReplicas--
			}
			status.Replicas--
			// replace it with a fresh, not yet created Pod so that the creation branch below picks it up
			replicas[i] = newVersionedStatefulSetPod(
				currentSet,
				updateSet,
				currentRevision.Name,
				updateRevision.Name,
				i)
		}
		// If we find a Pod that has not been created we create the Pod
		if !isCreated(replicas[i]) {
			if err := ssc.podControl.CreateStatefulPod(set, replicas[i]); err != nil {
				return &status, err
			}
			status.Replicas++
			if getPodRevision(replicas[i]) == currentRevision.Name {
				status.CurrentReplicas++
			}
			if getPodRevision(replicas[i]) == updateRevision.Name {
				status.UpdatedReplicas++
			}

			// if the set does not allow bursting, return immediately
			if monotonic {
				return &status, nil
			}
			// pod created, no more work possible for this round
			continue
		}
		// If we find a Pod that is currently terminating, we must wait until graceful deletion
		// completes before we continue to make progress.
		if isTerminating(replicas[i]) && monotonic {
			klog.V(4).Infof(
				"StatefulSet %s/%s is waiting for Pod %s to Terminate",
				set.Namespace,
				set.Name,
				replicas[i].Name)
			return &status, nil
		}
		// If we have a Pod that has been created but is not running and ready we can not make progress.
		// We must ensure that, for each Pod, all of its predecessors with respect to its ordinal are
		// Running and Ready at the time we create it.
		if !isRunningAndReady(replicas[i]) && monotonic {
			klog.V(4).Infof(
				"StatefulSet %s/%s is waiting for Pod %s to be Running and Ready",
				set.Namespace,
				set.Name,
				replicas[i].Name)
			return &status, nil
		}
		// Enforce the StatefulSet invariants
		if identityMatches(set, replicas[i]) && storageMatches(set, replicas[i]) {
			continue
		}
		// Make a deep copy so we don't mutate the shared cache
		replica := replicas[i].DeepCopy()
		if err := ssc.podControl.UpdateStatefulPod(updateSet, replica); err != nil {
			return &status, err
		}
	}

	// At this point, in monotonic mode, all of the current Replicas are Running and Ready, so we can consider
	// termination. (With bursting allowed the loop above does not block on Pods that are not ready.)
	// We will wait for all predecessors to be Running and Ready prior to attempting a deletion.
	// We will terminate the condemned Pods in monotonically decreasing ordinal order.
	// Note that we do not resurrect Pods in this interval. Also note that scaling will take precedence over
	// updates.
	for target := len(condemned) - 1; target >= 0; target-- {
		// wait for terminating pods to expire
		if isTerminating(condemned[target]) {
			klog.V(4).Infof(
				"StatefulSet %s/%s is waiting for Pod %s to Terminate prior to scale down",
				set.Namespace,
				set.Name,
				condemned[target].Name)
			// block if we are in monotonic mode
			if monotonic {
				return &status, nil
			}
			continue
		}
		// if we are in monotonic mode and the condemned target is not the first unhealthy Pod block
		if !isRunningAndReady(condemned[target]) && monotonic && condemned[target] != firstUnhealthyPod {
			klog.V(4).Infof(
				"StatefulSet %s/%s is waiting for Pod %s to be Running and Ready prior to scale down",
				set.Namespace,
				set.Name,
				firstUnhealthyPod.Name)
			return &status, nil
		}
		klog.V(2).Infof("StatefulSet %s/%s terminating Pod %s for scale down",
			set.Namespace,
			set.Name,
			condemned[target].Name)

		if err := ssc.podControl.DeleteStatefulPod(set, condemned[target]); err != nil {
			return &status, err
		}
		// NOTE(review): unlike the failed-Pod path above, status.Replicas is not decremented here - confirm
		// that this is intended.
		if getPodRevision(condemned[target]) == currentRevision.Name {
			status.CurrentReplicas--
		}
		if getPodRevision(condemned[target]) == updateRevision.Name {
			status.UpdatedReplicas--
		}
		if monotonic {
			return &status, nil
		}
	}

	// for the OnDelete strategy we short circuit. Pods will be updated when they are manually deleted.
	if set.Spec.UpdateStrategy.Type == apps.OnDeleteStatefulSetStrategyType {
		return &status, nil
	}

	// we compute the minimum ordinal of the target sequence for a destructive update based on the strategy.
	// NOTE(review): RollingUpdate.Partition is dereferenced without a nil check; assumes it is defaulted
	// whenever RollingUpdate is set - confirm.
	updateMin := 0
	if set.Spec.UpdateStrategy.RollingUpdate != nil {
		updateMin = int(*set.Spec.UpdateStrategy.RollingUpdate.Partition)
	}
	// we terminate the Pod with the largest ordinal that does not match the update revision.
	for target := len(replicas) - 1; target >= updateMin; target-- {

		// delete the Pod if it is not already terminating and does not match the update revision.
		if getPodRevision(replicas[target]) != updateRevision.Name && !isTerminating(replicas[target]) {
			klog.V(2).Infof("StatefulSet %s/%s terminating Pod %s for update",
				set.Namespace,
				set.Name,
				replicas[target].Name)
			err := ssc.podControl.DeleteStatefulPod(set, replicas[target])
			// NOTE(review): CurrentReplicas is decremented whether or not the delete succeeded, and without
			// checking that the Pod was at currentRevision - confirm that this is intended.
			status.CurrentReplicas--
			return &status, err
		}

		// wait for unhealthy Pods on update
		if !isHealthy(replicas[target]) {
			klog.V(4).Infof(
				"StatefulSet %s/%s is waiting for Pod %s to update",
				set.Namespace,
				set.Name,
				replicas[target].Name)
			return &status, nil
		}

	}
	return &status, nil
}
  478. // updateStatefulSetStatus updates set's Status to be equal to status. If status indicates a complete update, it is
  479. // mutated to indicate completion. If status is semantically equivalent to set's Status no update is performed. If the
  480. // returned error is nil, the update is successful.
  481. func (ssc *defaultStatefulSetControl) updateStatefulSetStatus(
  482. set *apps.StatefulSet,
  483. status *apps.StatefulSetStatus) error {
  484. // complete any in progress rolling update if necessary
  485. completeRollingUpdate(set, status)
  486. // if the status is not inconsistent do not perform an update
  487. if !inconsistentStatus(set, status) {
  488. return nil
  489. }
  490. // copy set and update its status
  491. set = set.DeepCopy()
  492. if err := ssc.statusUpdater.UpdateStatefulSetStatus(set, status); err != nil {
  493. return err
  494. }
  495. return nil
  496. }
  497. var _ StatefulSetControlInterface = &defaultStatefulSetControl{}