controller.go 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. /*
  2. Copyright 2018 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package nodelease
  14. import (
  15. "time"
  16. coordv1beta1 "k8s.io/api/coordination/v1beta1"
  17. corev1 "k8s.io/api/core/v1"
  18. apierrors "k8s.io/apimachinery/pkg/api/errors"
  19. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  20. "k8s.io/apimachinery/pkg/util/clock"
  21. "k8s.io/apimachinery/pkg/util/wait"
  22. clientset "k8s.io/client-go/kubernetes"
  23. coordclientset "k8s.io/client-go/kubernetes/typed/coordination/v1beta1"
  24. "k8s.io/utils/pointer"
  25. "k8s.io/klog"
  26. )
  const (
  	// defaultRenewInterval is the period between lease renewals when nothing
  	// shortens it.
  	// TODO(mtaufen): 10s was the decision in the KEP, chosen to stay as close
  	// as possible to the current default behavior. Eventually the renewal
  	// period should be derived as a reasonable fraction of the lease duration.
  	defaultRenewInterval = 10 * time.Second

  	// maxUpdateRetries bounds the number of immediate, back-to-back renewal
  	// attempts the Kubelet makes before giving up until the next renewal
  	// interval, mirroring the approach used for node status retries.
  	maxUpdateRetries = 5

  	// maxBackoff is the upper bound on a single backoff sleep
  	// (e.g. in backoffEnsureLease).
  	maxBackoff = 7 * time.Second
  )
  // Controller is responsible for creating this Kubelet's lease and keeping it
  // renewed.
  type Controller interface {
  	// Run starts the controller; stopCh is the channel used to signal it
  	// to stop.
  	Run(stopCh <-chan struct{})
  }
  44. type controller struct {
  45. client clientset.Interface
  46. leaseClient coordclientset.LeaseInterface
  47. holderIdentity string
  48. leaseDurationSeconds int32
  49. renewInterval time.Duration
  50. clock clock.Clock
  51. onRepeatedHeartbeatFailure func()
  52. }
  53. // NewController constructs and returns a controller
  54. func NewController(clock clock.Clock, client clientset.Interface, holderIdentity string, leaseDurationSeconds int32, nodeStatusUpdateFrequency time.Duration, onRepeatedHeartbeatFailure func()) Controller {
  55. var leaseClient coordclientset.LeaseInterface
  56. if client != nil {
  57. leaseClient = client.CoordinationV1beta1().Leases(corev1.NamespaceNodeLease)
  58. }
  59. renewInterval := defaultRenewInterval
  60. // Users are able to decrease the timeout after which nodes are being
  61. // marked as "Ready: Unknown" by NodeLifecycleController to values
  62. // smaller than defaultRenewInterval. Until the knob to configure
  63. // lease renew interval is exposed to user, we temporarily decrease
  64. // renewInterval based on the NodeStatusUpdateFrequency.
  65. if renewInterval > nodeStatusUpdateFrequency {
  66. renewInterval = nodeStatusUpdateFrequency
  67. }
  68. return &controller{
  69. client: client,
  70. leaseClient: leaseClient,
  71. holderIdentity: holderIdentity,
  72. leaseDurationSeconds: leaseDurationSeconds,
  73. renewInterval: renewInterval,
  74. clock: clock,
  75. onRepeatedHeartbeatFailure: onRepeatedHeartbeatFailure,
  76. }
  77. }
  78. // Run runs the controller
  79. func (c *controller) Run(stopCh <-chan struct{}) {
  80. if c.leaseClient == nil {
  81. klog.Infof("node lease controller has nil lease client, will not claim or renew leases")
  82. return
  83. }
  84. wait.Until(c.sync, c.renewInterval, stopCh)
  85. }
  86. func (c *controller) sync() {
  87. lease, created := c.backoffEnsureLease()
  88. // we don't need to update the lease if we just created it
  89. if !created {
  90. c.retryUpdateLease(lease)
  91. }
  92. }
  93. // backoffEnsureLease attempts to create the lease if it does not exist,
  94. // and uses exponentially increasing waits to prevent overloading the API server
  95. // with retries. Returns the lease, and true if this call created the lease,
  96. // false otherwise.
  97. func (c *controller) backoffEnsureLease() (*coordv1beta1.Lease, bool) {
  98. var (
  99. lease *coordv1beta1.Lease
  100. created bool
  101. err error
  102. )
  103. sleep := 100 * time.Millisecond
  104. for {
  105. lease, created, err = c.ensureLease()
  106. if err == nil {
  107. break
  108. }
  109. sleep = minDuration(2*sleep, maxBackoff)
  110. klog.Errorf("failed to ensure node lease exists, will retry in %v, error: %v", sleep, err)
  111. // backoff wait
  112. c.clock.Sleep(sleep)
  113. }
  114. return lease, created
  115. }
  116. // ensureLease creates the lease if it does not exist. Returns the lease and
  117. // a bool (true if this call created the lease), or any error that occurs.
  118. func (c *controller) ensureLease() (*coordv1beta1.Lease, bool, error) {
  119. lease, err := c.leaseClient.Get(c.holderIdentity, metav1.GetOptions{})
  120. if apierrors.IsNotFound(err) {
  121. // lease does not exist, create it
  122. lease, err := c.leaseClient.Create(c.newLease(nil))
  123. if err != nil {
  124. return nil, false, err
  125. }
  126. return lease, true, nil
  127. } else if err != nil {
  128. // unexpected error getting lease
  129. return nil, false, err
  130. }
  131. // lease already existed
  132. return lease, false, nil
  133. }
  134. // retryUpdateLease attempts to update the lease for maxUpdateRetries,
  135. // call this once you're sure the lease has been created
  136. func (c *controller) retryUpdateLease(base *coordv1beta1.Lease) {
  137. for i := 0; i < maxUpdateRetries; i++ {
  138. _, err := c.leaseClient.Update(c.newLease(base))
  139. if err == nil {
  140. return
  141. }
  142. klog.Errorf("failed to update node lease, error: %v", err)
  143. if i > 0 && c.onRepeatedHeartbeatFailure != nil {
  144. c.onRepeatedHeartbeatFailure()
  145. }
  146. }
  147. klog.Errorf("failed %d attempts to update node lease, will retry after %v", maxUpdateRetries, c.renewInterval)
  148. }
  149. // newLease constructs a new lease if base is nil, or returns a copy of base
  150. // with desired state asserted on the copy.
  151. func (c *controller) newLease(base *coordv1beta1.Lease) *coordv1beta1.Lease {
  152. // Use the bare minimum set of fields; other fields exist for debugging/legacy,
  153. // but we don't need to make node heartbeats more complicated by using them.
  154. var lease *coordv1beta1.Lease
  155. if base == nil {
  156. lease = &coordv1beta1.Lease{
  157. ObjectMeta: metav1.ObjectMeta{
  158. Name: c.holderIdentity,
  159. Namespace: corev1.NamespaceNodeLease,
  160. },
  161. Spec: coordv1beta1.LeaseSpec{
  162. HolderIdentity: pointer.StringPtr(c.holderIdentity),
  163. LeaseDurationSeconds: pointer.Int32Ptr(c.leaseDurationSeconds),
  164. },
  165. }
  166. } else {
  167. lease = base.DeepCopy()
  168. }
  169. lease.Spec.RenewTime = &metav1.MicroTime{Time: c.clock.Now()}
  170. // Setting owner reference needs node's UID. Note that it is different from
  171. // kubelet.nodeRef.UID. When lease is initially created, it is possible that
  172. // the connection between master and node is not ready yet. So try to set
  173. // owner reference every time when renewing the lease, until successful.
  174. if lease.OwnerReferences == nil || len(lease.OwnerReferences) == 0 {
  175. if node, err := c.client.CoreV1().Nodes().Get(c.holderIdentity, metav1.GetOptions{}); err == nil {
  176. lease.OwnerReferences = []metav1.OwnerReference{
  177. {
  178. APIVersion: corev1.SchemeGroupVersion.WithKind("Node").Version,
  179. Kind: corev1.SchemeGroupVersion.WithKind("Node").Kind,
  180. Name: c.holderIdentity,
  181. UID: node.UID,
  182. },
  183. }
  184. } else {
  185. klog.Errorf("failed to get node %q when trying to set owner ref to the node lease: %v", c.holderIdentity, err)
  186. }
  187. }
  188. return lease
  189. }
  // minDuration returns the smaller of the two durations.
  func minDuration(a, b time.Duration) time.Duration {
  	if b <= a {
  		return b
  	}
  	return a
  }