reconciler.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. /*
  2. Copyright 2016 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
// Package reconciler implements interfaces that attempt to reconcile the
// desired state of the world with the actual state of the world by triggering
// actions.
  16. package reconciler
  17. import (
  18. "fmt"
  19. "strings"
  20. "time"
  21. "k8s.io/api/core/v1"
  22. "k8s.io/apimachinery/pkg/types"
  23. "k8s.io/apimachinery/pkg/util/wait"
  24. "k8s.io/client-go/tools/record"
  25. "k8s.io/klog"
  26. "k8s.io/kubernetes/pkg/controller/volume/attachdetach/cache"
  27. "k8s.io/kubernetes/pkg/controller/volume/attachdetach/metrics"
  28. "k8s.io/kubernetes/pkg/controller/volume/attachdetach/statusupdater"
  29. kevents "k8s.io/kubernetes/pkg/kubelet/events"
  30. "k8s.io/kubernetes/pkg/util/goroutinemap/exponentialbackoff"
  31. "k8s.io/kubernetes/pkg/volume"
  32. "k8s.io/kubernetes/pkg/volume/util/operationexecutor"
  33. )
  34. // Reconciler runs a periodic loop to reconcile the desired state of the world with
  35. // the actual state of the world by triggering attach detach operations.
  36. // Note: This is distinct from the Reconciler implemented by the kubelet volume
  37. // manager. This reconciles state for the attach/detach controller. That
  38. // reconciles state for the kubelet volume manager.
  39. type Reconciler interface {
  40. // Starts running the reconciliation loop which executes periodically, checks
  41. // if volumes that should be attached are attached and volumes that should
  42. // be detached are detached. If not, it will trigger attach/detach
  43. // operations to rectify.
  44. Run(stopCh <-chan struct{})
  45. }
  46. // NewReconciler returns a new instance of Reconciler that waits loopPeriod
  47. // between successive executions.
  48. // loopPeriod is the amount of time the reconciler loop waits between
  49. // successive executions.
  50. // maxWaitForUnmountDuration is the max amount of time the reconciler will wait
  51. // for the volume to be safely unmounted, after this it will detach the volume
  52. // anyway (to handle crashed/unavailable nodes). If during this time the volume
  53. // becomes used by a new pod, the detach request will be aborted and the timer
  54. // cleared.
  55. func NewReconciler(
  56. loopPeriod time.Duration,
  57. maxWaitForUnmountDuration time.Duration,
  58. syncDuration time.Duration,
  59. disableReconciliationSync bool,
  60. desiredStateOfWorld cache.DesiredStateOfWorld,
  61. actualStateOfWorld cache.ActualStateOfWorld,
  62. attacherDetacher operationexecutor.OperationExecutor,
  63. nodeStatusUpdater statusupdater.NodeStatusUpdater,
  64. recorder record.EventRecorder) Reconciler {
  65. return &reconciler{
  66. loopPeriod: loopPeriod,
  67. maxWaitForUnmountDuration: maxWaitForUnmountDuration,
  68. syncDuration: syncDuration,
  69. disableReconciliationSync: disableReconciliationSync,
  70. desiredStateOfWorld: desiredStateOfWorld,
  71. actualStateOfWorld: actualStateOfWorld,
  72. attacherDetacher: attacherDetacher,
  73. nodeStatusUpdater: nodeStatusUpdater,
  74. timeOfLastSync: time.Now(),
  75. recorder: recorder,
  76. }
  77. }
  78. type reconciler struct {
  79. loopPeriod time.Duration
  80. maxWaitForUnmountDuration time.Duration
  81. syncDuration time.Duration
  82. desiredStateOfWorld cache.DesiredStateOfWorld
  83. actualStateOfWorld cache.ActualStateOfWorld
  84. attacherDetacher operationexecutor.OperationExecutor
  85. nodeStatusUpdater statusupdater.NodeStatusUpdater
  86. timeOfLastSync time.Time
  87. disableReconciliationSync bool
  88. recorder record.EventRecorder
  89. }
  90. func (rc *reconciler) Run(stopCh <-chan struct{}) {
  91. wait.Until(rc.reconciliationLoopFunc(), rc.loopPeriod, stopCh)
  92. }
  93. // reconciliationLoopFunc this can be disabled via cli option disableReconciliation.
  94. // It periodically checks whether the attached volumes from actual state
  95. // are still attached to the node and update the status if they are not.
  96. func (rc *reconciler) reconciliationLoopFunc() func() {
  97. return func() {
  98. rc.reconcile()
  99. if rc.disableReconciliationSync {
  100. klog.V(5).Info("Skipping reconciling attached volumes still attached since it is disabled via the command line.")
  101. } else if rc.syncDuration < time.Second {
  102. klog.V(5).Info("Skipping reconciling attached volumes still attached since it is set to less than one second via the command line.")
  103. } else if time.Since(rc.timeOfLastSync) > rc.syncDuration {
  104. klog.V(5).Info("Starting reconciling attached volumes still attached")
  105. rc.sync()
  106. }
  107. }
  108. }
  109. func (rc *reconciler) sync() {
  110. defer rc.updateSyncTime()
  111. rc.syncStates()
  112. }
  113. func (rc *reconciler) updateSyncTime() {
  114. rc.timeOfLastSync = time.Now()
  115. }
  116. func (rc *reconciler) syncStates() {
  117. volumesPerNode := rc.actualStateOfWorld.GetAttachedVolumesPerNode()
  118. rc.attacherDetacher.VerifyVolumesAreAttached(volumesPerNode, rc.actualStateOfWorld)
  119. }
  120. // isMultiAttachForbidden checks if attaching this volume to multiple nodes is definitely not allowed/possible.
  121. // In its current form, this function can only reliably say for which volumes it's definitely forbidden. If it returns
  122. // false, it is not guaranteed that multi-attach is actually supported by the volume type and we must rely on the
  123. // attacher to fail fast in such cases.
  124. // Please see https://github.com/kubernetes/kubernetes/issues/40669 and https://github.com/kubernetes/kubernetes/pull/40148#discussion_r98055047
  125. func (rc *reconciler) isMultiAttachForbidden(volumeSpec *volume.Spec) bool {
  126. if volumeSpec.Volume != nil {
  127. // Check for volume types which are known to fail slow or cause trouble when trying to multi-attach
  128. if volumeSpec.Volume.AzureDisk != nil ||
  129. volumeSpec.Volume.Cinder != nil {
  130. return true
  131. }
  132. }
  133. // Only if this volume is a persistent volume, we have reliable information on whether it's allowed or not to
  134. // multi-attach. We trust in the individual volume implementations to not allow unsupported access modes
  135. if volumeSpec.PersistentVolume != nil {
  136. // Check for persistent volume types which do not fail when trying to multi-attach
  137. if len(volumeSpec.PersistentVolume.Spec.AccessModes) == 0 {
  138. // No access mode specified so we don't know for sure. Let the attacher fail if needed
  139. return false
  140. }
  141. // check if this volume is allowed to be attached to multiple PODs/nodes, if yes, return false
  142. for _, accessMode := range volumeSpec.PersistentVolume.Spec.AccessModes {
  143. if accessMode == v1.ReadWriteMany || accessMode == v1.ReadOnlyMany {
  144. return false
  145. }
  146. }
  147. return true
  148. }
  149. // we don't know if it's supported or not and let the attacher fail later in cases it's not supported
  150. return false
  151. }
  152. func (rc *reconciler) reconcile() {
  153. // Detaches are triggered before attaches so that volumes referenced by
  154. // pods that are rescheduled to a different node are detached first.
  155. // Ensure volumes that should be detached are detached.
  156. for _, attachedVolume := range rc.actualStateOfWorld.GetAttachedVolumes() {
  157. if !rc.desiredStateOfWorld.VolumeExists(
  158. attachedVolume.VolumeName, attachedVolume.NodeName) {
  159. // Don't even try to start an operation if there is already one running
  160. // This check must be done before we do any other checks, as otherwise the other checks
  161. // may pass while at the same time the volume leaves the pending state, resulting in
  162. // double detach attempts
  163. if rc.attacherDetacher.IsOperationPending(attachedVolume.VolumeName, "") {
  164. klog.V(10).Infof("Operation for volume %q is already running. Can't start detach for %q", attachedVolume.VolumeName, attachedVolume.NodeName)
  165. continue
  166. }
  167. // Set the detach request time
  168. elapsedTime, err := rc.actualStateOfWorld.SetDetachRequestTime(attachedVolume.VolumeName, attachedVolume.NodeName)
  169. if err != nil {
  170. klog.Errorf("Cannot trigger detach because it fails to set detach request time with error %v", err)
  171. continue
  172. }
  173. // Check whether timeout has reached the maximum waiting time
  174. timeout := elapsedTime > rc.maxWaitForUnmountDuration
  175. // Check whether volume is still mounted. Skip detach if it is still mounted unless timeout
  176. if attachedVolume.MountedByNode && !timeout {
  177. klog.V(5).Infof(attachedVolume.GenerateMsgDetailed("Cannot detach volume because it is still mounted", ""))
  178. continue
  179. }
  180. // Before triggering volume detach, mark volume as detached and update the node status
  181. // If it fails to update node status, skip detach volume
  182. err = rc.actualStateOfWorld.RemoveVolumeFromReportAsAttached(attachedVolume.VolumeName, attachedVolume.NodeName)
  183. if err != nil {
  184. klog.V(5).Infof("RemoveVolumeFromReportAsAttached failed while removing volume %q from node %q with: %v",
  185. attachedVolume.VolumeName,
  186. attachedVolume.NodeName,
  187. err)
  188. }
  189. // Update Node Status to indicate volume is no longer safe to mount.
  190. err = rc.nodeStatusUpdater.UpdateNodeStatuses()
  191. if err != nil {
  192. // Skip detaching this volume if unable to update node status
  193. klog.Errorf(attachedVolume.GenerateErrorDetailed("UpdateNodeStatuses failed while attempting to report volume as attached", err).Error())
  194. continue
  195. }
  196. // Trigger detach volume which requires verifying safe to detach step
  197. // If timeout is true, skip verifySafeToDetach check
  198. klog.V(5).Infof(attachedVolume.GenerateMsgDetailed("Starting attacherDetacher.DetachVolume", ""))
  199. verifySafeToDetach := !timeout
  200. err = rc.attacherDetacher.DetachVolume(attachedVolume.AttachedVolume, verifySafeToDetach, rc.actualStateOfWorld)
  201. if err == nil {
  202. if !timeout {
  203. klog.Infof(attachedVolume.GenerateMsgDetailed("attacherDetacher.DetachVolume started", ""))
  204. } else {
  205. metrics.RecordForcedDetachMetric()
  206. klog.Warningf(attachedVolume.GenerateMsgDetailed("attacherDetacher.DetachVolume started", fmt.Sprintf("This volume is not safe to detach, but maxWaitForUnmountDuration %v expired, force detaching", rc.maxWaitForUnmountDuration)))
  207. }
  208. }
  209. if err != nil && !exponentialbackoff.IsExponentialBackoff(err) {
  210. // Ignore exponentialbackoff.IsExponentialBackoff errors, they are expected.
  211. // Log all other errors.
  212. klog.Errorf(attachedVolume.GenerateErrorDetailed("attacherDetacher.DetachVolume failed to start", err).Error())
  213. }
  214. }
  215. }
  216. rc.attachDesiredVolumes()
  217. // Update Node Status
  218. err := rc.nodeStatusUpdater.UpdateNodeStatuses()
  219. if err != nil {
  220. klog.Warningf("UpdateNodeStatuses failed with: %v", err)
  221. }
  222. }
  223. func (rc *reconciler) attachDesiredVolumes() {
  224. // Ensure volumes that should be attached are attached.
  225. for _, volumeToAttach := range rc.desiredStateOfWorld.GetVolumesToAttach() {
  226. if rc.actualStateOfWorld.IsVolumeAttachedToNode(volumeToAttach.VolumeName, volumeToAttach.NodeName) {
  227. // Volume/Node exists, touch it to reset detachRequestedTime
  228. if klog.V(5) {
  229. klog.Infof(volumeToAttach.GenerateMsgDetailed("Volume attached--touching", ""))
  230. }
  231. rc.actualStateOfWorld.ResetDetachRequestTime(volumeToAttach.VolumeName, volumeToAttach.NodeName)
  232. continue
  233. }
  234. // Don't even try to start an operation if there is already one running
  235. if rc.attacherDetacher.IsOperationPending(volumeToAttach.VolumeName, "") {
  236. if klog.V(10) {
  237. klog.Infof("Operation for volume %q is already running. Can't start attach for %q", volumeToAttach.VolumeName, volumeToAttach.NodeName)
  238. }
  239. continue
  240. }
  241. if rc.isMultiAttachForbidden(volumeToAttach.VolumeSpec) {
  242. nodes := rc.actualStateOfWorld.GetNodesForAttachedVolume(volumeToAttach.VolumeName)
  243. if len(nodes) > 0 {
  244. if !volumeToAttach.MultiAttachErrorReported {
  245. rc.reportMultiAttachError(volumeToAttach, nodes)
  246. rc.desiredStateOfWorld.SetMultiAttachError(volumeToAttach.VolumeName, volumeToAttach.NodeName)
  247. }
  248. continue
  249. }
  250. }
  251. // Volume/Node doesn't exist, spawn a goroutine to attach it
  252. if klog.V(5) {
  253. klog.Infof(volumeToAttach.GenerateMsgDetailed("Starting attacherDetacher.AttachVolume", ""))
  254. }
  255. err := rc.attacherDetacher.AttachVolume(volumeToAttach.VolumeToAttach, rc.actualStateOfWorld)
  256. if err == nil {
  257. klog.Infof(volumeToAttach.GenerateMsgDetailed("attacherDetacher.AttachVolume started", ""))
  258. }
  259. if err != nil && !exponentialbackoff.IsExponentialBackoff(err) {
  260. // Ignore exponentialbackoff.IsExponentialBackoff errors, they are expected.
  261. // Log all other errors.
  262. klog.Errorf(volumeToAttach.GenerateErrorDetailed("attacherDetacher.AttachVolume failed to start", err).Error())
  263. }
  264. }
  265. }
  266. // reportMultiAttachError sends events and logs situation that a volume that
  267. // should be attached to a node is already attached to different node(s).
  268. func (rc *reconciler) reportMultiAttachError(volumeToAttach cache.VolumeToAttach, nodes []types.NodeName) {
  269. // Filter out the current node from list of nodes where the volume is
  270. // attached.
  271. // Some methods need []string, some other needs []NodeName, collect both.
  272. // In theory, these arrays should have always only one element - the
  273. // controller does not allow more than one attachment. But use array just
  274. // in case...
  275. otherNodes := []types.NodeName{}
  276. otherNodesStr := []string{}
  277. for _, node := range nodes {
  278. if node != volumeToAttach.NodeName {
  279. otherNodes = append(otherNodes, node)
  280. otherNodesStr = append(otherNodesStr, string(node))
  281. }
  282. }
  283. // Get list of pods that use the volume on the other nodes.
  284. pods := rc.desiredStateOfWorld.GetVolumePodsOnNodes(otherNodes, volumeToAttach.VolumeName)
  285. if len(pods) == 0 {
  286. // We did not find any pods that requests the volume. The pod must have been deleted already.
  287. simpleMsg, _ := volumeToAttach.GenerateMsg("Multi-Attach error", "Volume is already exclusively attached to one node and can't be attached to another")
  288. for _, pod := range volumeToAttach.ScheduledPods {
  289. rc.recorder.Eventf(pod, v1.EventTypeWarning, kevents.FailedAttachVolume, simpleMsg)
  290. }
  291. // Log detailed message to system admin
  292. nodeList := strings.Join(otherNodesStr, ", ")
  293. detailedMsg := volumeToAttach.GenerateMsgDetailed("Multi-Attach error", fmt.Sprintf("Volume is already exclusively attached to node %s and can't be attached to another", nodeList))
  294. klog.Warningf(detailedMsg)
  295. return
  296. }
  297. // There are pods that require the volume and run on another node. Typically
  298. // it's user error, e.g. a ReplicaSet uses a PVC and has >1 replicas. Let
  299. // the user know what pods are blocking the volume.
  300. for _, scheduledPod := range volumeToAttach.ScheduledPods {
  301. // Each scheduledPod must get a custom message. They can run in
  302. // different namespaces and user of a namespace should not see names of
  303. // pods in other namespaces.
  304. localPodNames := []string{} // Names of pods in scheduledPods's namespace
  305. otherPods := 0 // Count of pods in other namespaces
  306. for _, pod := range pods {
  307. if pod.Namespace == scheduledPod.Namespace {
  308. localPodNames = append(localPodNames, pod.Name)
  309. } else {
  310. otherPods++
  311. }
  312. }
  313. var msg string
  314. if len(localPodNames) > 0 {
  315. msg = fmt.Sprintf("Volume is already used by pod(s) %s", strings.Join(localPodNames, ", "))
  316. if otherPods > 0 {
  317. msg = fmt.Sprintf("%s and %d pod(s) in different namespaces", msg, otherPods)
  318. }
  319. } else {
  320. // No local pods, there are pods only in different namespaces.
  321. msg = fmt.Sprintf("Volume is already used by %d pod(s) in different namespaces", otherPods)
  322. }
  323. simpleMsg, _ := volumeToAttach.GenerateMsg("Multi-Attach error", msg)
  324. rc.recorder.Eventf(scheduledPod, v1.EventTypeWarning, kevents.FailedAttachVolume, simpleMsg)
  325. }
  326. // Log all pods for system admin
  327. podNames := []string{}
  328. for _, pod := range pods {
  329. podNames = append(podNames, pod.Namespace+"/"+pod.Name)
  330. }
  331. detailedMsg := volumeToAttach.GenerateMsgDetailed("Multi-Attach error", fmt.Sprintf("Volume is already used by pods %s on node %s", strings.Join(podNames, ", "), strings.Join(otherNodesStr, ", ")))
  332. klog.Warningf(detailedMsg)
  333. }