util.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383
  1. /*
  2. Copyright 2019 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package persistentvolume
  14. import (
  15. "fmt"
  16. v1 "k8s.io/api/core/v1"
  17. storage "k8s.io/api/storage/v1"
  18. apierrors "k8s.io/apimachinery/pkg/api/errors"
  19. "k8s.io/apimachinery/pkg/api/resource"
  20. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  21. "k8s.io/apimachinery/pkg/labels"
  22. utilfeature "k8s.io/apiserver/pkg/util/feature"
  23. "k8s.io/client-go/kubernetes/scheme"
  24. storagelisters "k8s.io/client-go/listers/storage/v1"
  25. "k8s.io/client-go/tools/reference"
  26. v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
  27. "k8s.io/kubernetes/pkg/features"
  28. volumeutil "k8s.io/kubernetes/pkg/volume/util"
  29. )
// Annotation keys and well-known names used when binding and provisioning
// PersistentVolumes and PersistentVolumeClaims.
const (
	// AnnBindCompleted annotation applies to PVCs. It indicates that the
	// lifecycle of the PVC has passed through the initial setup. This
	// information changes how we interpret some observations of the state of
	// the objects. The value of this annotation does not matter.
	AnnBindCompleted = "pv.kubernetes.io/bind-completed"

	// AnnBoundByController annotation applies to PVs and PVCs. It indicates
	// that the binding (PV->PVC or PVC->PV) was installed by the controller.
	// The absence of this annotation means the binding was done by the user
	// (i.e. pre-bound). The value of this annotation does not matter.
	// External PV binders must bind PVs the same way as the PV controller,
	// otherwise the PV controller may not handle them correctly.
	AnnBoundByController = "pv.kubernetes.io/bound-by-controller"

	// AnnSelectedNode annotation is added to a PVC that has been triggered by
	// the scheduler to be dynamically provisioned. Its value is the name of
	// the selected node.
	AnnSelectedNode = "volume.kubernetes.io/selected-node"

	// NotSupportedProvisioner is a special provisioner name which can be set
	// in a storage class to indicate that dynamic provisioning is not
	// supported by the storage.
	NotSupportedProvisioner = "kubernetes.io/no-provisioner"

	// AnnDynamicallyProvisioned annotation is added to a PV that has been
	// dynamically provisioned by Kubernetes. Its value is the name of the
	// volume plugin that created the volume. It serves both the user (to show
	// where a PV comes from) and Kubernetes (to recognize dynamically
	// provisioned PVs in its decisions).
	AnnDynamicallyProvisioned = "pv.kubernetes.io/provisioned-by"

	// AnnMigratedTo annotation is added to a PVC and PV that is supposed to be
	// dynamically provisioned/deleted by its corresponding CSI driver through
	// the CSIMigration feature flags. When this annotation is set, the
	// Kubernetes components will "stand down" and the external-provisioner
	// will act on the objects.
	AnnMigratedTo = "pv.kubernetes.io/migrated-to"

	// AnnStorageProvisioner annotation is added to a PVC that is supposed to
	// be dynamically provisioned. Its value is the name of the volume plugin
	// that is supposed to provision a volume for this PVC.
	AnnStorageProvisioner = "volume.beta.kubernetes.io/storage-provisioner"
)
  66. // IsDelayBindingProvisioning checks if claim provisioning with selected-node annotation
  67. func IsDelayBindingProvisioning(claim *v1.PersistentVolumeClaim) bool {
  68. // When feature VolumeScheduling enabled,
  69. // Scheduler signal to the PV controller to start dynamic
  70. // provisioning by setting the "AnnSelectedNode" annotation
  71. // in the PVC
  72. _, ok := claim.Annotations[AnnSelectedNode]
  73. return ok
  74. }
  75. // IsDelayBindingMode checks if claim is in delay binding mode.
  76. func IsDelayBindingMode(claim *v1.PersistentVolumeClaim, classLister storagelisters.StorageClassLister) (bool, error) {
  77. className := v1helper.GetPersistentVolumeClaimClass(claim)
  78. if className == "" {
  79. return false, nil
  80. }
  81. class, err := classLister.Get(className)
  82. if err != nil {
  83. if apierrors.IsNotFound(err) {
  84. return false, nil
  85. }
  86. return false, err
  87. }
  88. if class.VolumeBindingMode == nil {
  89. return false, fmt.Errorf("VolumeBindingMode not set for StorageClass %q", className)
  90. }
  91. return *class.VolumeBindingMode == storage.VolumeBindingWaitForFirstConsumer, nil
  92. }
  93. // GetBindVolumeToClaim returns a new volume which is bound to given claim. In
  94. // addition, it returns a bool which indicates whether we made modification on
  95. // original volume.
  96. func GetBindVolumeToClaim(volume *v1.PersistentVolume, claim *v1.PersistentVolumeClaim) (*v1.PersistentVolume, bool, error) {
  97. dirty := false
  98. // Check if the volume was already bound (either by user or by controller)
  99. shouldSetBoundByController := false
  100. if !IsVolumeBoundToClaim(volume, claim) {
  101. shouldSetBoundByController = true
  102. }
  103. // The volume from method args can be pointing to watcher cache. We must not
  104. // modify these, therefore create a copy.
  105. volumeClone := volume.DeepCopy()
  106. // Bind the volume to the claim if it is not bound yet
  107. if volume.Spec.ClaimRef == nil ||
  108. volume.Spec.ClaimRef.Name != claim.Name ||
  109. volume.Spec.ClaimRef.Namespace != claim.Namespace ||
  110. volume.Spec.ClaimRef.UID != claim.UID {
  111. claimRef, err := reference.GetReference(scheme.Scheme, claim)
  112. if err != nil {
  113. return nil, false, fmt.Errorf("Unexpected error getting claim reference: %v", err)
  114. }
  115. volumeClone.Spec.ClaimRef = claimRef
  116. dirty = true
  117. }
  118. // Set AnnBoundByController if it is not set yet
  119. if shouldSetBoundByController && !metav1.HasAnnotation(volumeClone.ObjectMeta, AnnBoundByController) {
  120. metav1.SetMetaDataAnnotation(&volumeClone.ObjectMeta, AnnBoundByController, "yes")
  121. dirty = true
  122. }
  123. return volumeClone, dirty, nil
  124. }
  125. // IsVolumeBoundToClaim returns true, if given volume is pre-bound or bound
  126. // to specific claim. Both claim.Name and claim.Namespace must be equal.
  127. // If claim.UID is present in volume.Spec.ClaimRef, it must be equal too.
  128. func IsVolumeBoundToClaim(volume *v1.PersistentVolume, claim *v1.PersistentVolumeClaim) bool {
  129. if volume.Spec.ClaimRef == nil {
  130. return false
  131. }
  132. if claim.Name != volume.Spec.ClaimRef.Name || claim.Namespace != volume.Spec.ClaimRef.Namespace {
  133. return false
  134. }
  135. if volume.Spec.ClaimRef.UID != "" && claim.UID != volume.Spec.ClaimRef.UID {
  136. return false
  137. }
  138. return true
  139. }
  140. // FindMatchingVolume goes through the list of volumes to find the best matching volume
  141. // for the claim.
  142. //
  143. // This function is used by both the PV controller and scheduler.
  144. //
  145. // delayBinding is true only in the PV controller path. When set, prebound PVs are still returned
  146. // as a match for the claim, but unbound PVs are skipped.
  147. //
  148. // node is set only in the scheduler path. When set, the PV node affinity is checked against
  149. // the node's labels.
  150. //
  151. // excludedVolumes is only used in the scheduler path, and is needed for evaluating multiple
  152. // unbound PVCs for a single Pod at one time. As each PVC finds a matching PV, the chosen
  153. // PV needs to be excluded from future matching.
  154. func FindMatchingVolume(
  155. claim *v1.PersistentVolumeClaim,
  156. volumes []*v1.PersistentVolume,
  157. node *v1.Node,
  158. excludedVolumes map[string]*v1.PersistentVolume,
  159. delayBinding bool) (*v1.PersistentVolume, error) {
  160. var smallestVolume *v1.PersistentVolume
  161. var smallestVolumeQty resource.Quantity
  162. requestedQty := claim.Spec.Resources.Requests[v1.ResourceName(v1.ResourceStorage)]
  163. requestedClass := v1helper.GetPersistentVolumeClaimClass(claim)
  164. var selector labels.Selector
  165. if claim.Spec.Selector != nil {
  166. internalSelector, err := metav1.LabelSelectorAsSelector(claim.Spec.Selector)
  167. if err != nil {
  168. // should be unreachable code due to validation
  169. return nil, fmt.Errorf("error creating internal label selector for claim: %v: %v", claimToClaimKey(claim), err)
  170. }
  171. selector = internalSelector
  172. }
  173. // Go through all available volumes with two goals:
  174. // - find a volume that is either pre-bound by user or dynamically
  175. // provisioned for this claim. Because of this we need to loop through
  176. // all volumes.
  177. // - find the smallest matching one if there is no volume pre-bound to
  178. // the claim.
  179. for _, volume := range volumes {
  180. if _, ok := excludedVolumes[volume.Name]; ok {
  181. // Skip volumes in the excluded list
  182. continue
  183. }
  184. volumeQty := volume.Spec.Capacity[v1.ResourceStorage]
  185. // filter out mismatching volumeModes
  186. if CheckVolumeModeMismatches(&claim.Spec, &volume.Spec) {
  187. continue
  188. }
  189. // check if PV's DeletionTimeStamp is set, if so, skip this volume.
  190. if utilfeature.DefaultFeatureGate.Enabled(features.StorageObjectInUseProtection) {
  191. if volume.ObjectMeta.DeletionTimestamp != nil {
  192. continue
  193. }
  194. }
  195. nodeAffinityValid := true
  196. if node != nil {
  197. // Scheduler path, check that the PV NodeAffinity
  198. // is satisfied by the node
  199. err := volumeutil.CheckNodeAffinity(volume, node.Labels)
  200. if err != nil {
  201. nodeAffinityValid = false
  202. }
  203. }
  204. if IsVolumeBoundToClaim(volume, claim) {
  205. // this claim and volume are pre-bound; return
  206. // the volume if the size request is satisfied,
  207. // otherwise continue searching for a match
  208. if volumeQty.Cmp(requestedQty) < 0 {
  209. continue
  210. }
  211. // If PV node affinity is invalid, return no match.
  212. // This means the prebound PV (and therefore PVC)
  213. // is not suitable for this node.
  214. if !nodeAffinityValid {
  215. return nil, nil
  216. }
  217. return volume, nil
  218. }
  219. if node == nil && delayBinding {
  220. // PV controller does not bind this claim.
  221. // Scheduler will handle binding unbound volumes
  222. // Scheduler path will have node != nil
  223. continue
  224. }
  225. // filter out:
  226. // - volumes in non-available phase
  227. // - volumes bound to another claim
  228. // - volumes whose labels don't match the claim's selector, if specified
  229. // - volumes in Class that is not requested
  230. // - volumes whose NodeAffinity does not match the node
  231. if volume.Status.Phase != v1.VolumeAvailable {
  232. // We ignore volumes in non-available phase, because volumes that
  233. // satisfies matching criteria will be updated to available, binding
  234. // them now has high chance of encountering unnecessary failures
  235. // due to API conflicts.
  236. continue
  237. } else if volume.Spec.ClaimRef != nil {
  238. continue
  239. } else if selector != nil && !selector.Matches(labels.Set(volume.Labels)) {
  240. continue
  241. }
  242. if v1helper.GetPersistentVolumeClass(volume) != requestedClass {
  243. continue
  244. }
  245. if !nodeAffinityValid {
  246. continue
  247. }
  248. if node != nil {
  249. // Scheduler path
  250. // Check that the access modes match
  251. if !CheckAccessModes(claim, volume) {
  252. continue
  253. }
  254. }
  255. if volumeQty.Cmp(requestedQty) >= 0 {
  256. if smallestVolume == nil || smallestVolumeQty.Cmp(volumeQty) > 0 {
  257. smallestVolume = volume
  258. smallestVolumeQty = volumeQty
  259. }
  260. }
  261. }
  262. if smallestVolume != nil {
  263. // Found a matching volume
  264. return smallestVolume, nil
  265. }
  266. return nil, nil
  267. }
  268. // CheckVolumeModeMismatches is a convenience method that checks volumeMode for PersistentVolume
  269. // and PersistentVolumeClaims
  270. func CheckVolumeModeMismatches(pvcSpec *v1.PersistentVolumeClaimSpec, pvSpec *v1.PersistentVolumeSpec) bool {
  271. if !utilfeature.DefaultFeatureGate.Enabled(features.BlockVolume) {
  272. if pvcSpec.VolumeMode != nil && *pvcSpec.VolumeMode == v1.PersistentVolumeBlock {
  273. // Block PVC does not match anything when the feature is off. We explicitly want
  274. // to prevent binding block PVC to filesystem PV.
  275. // The PVC should be ignored by PV controller.
  276. return true
  277. }
  278. if pvSpec.VolumeMode != nil && *pvSpec.VolumeMode == v1.PersistentVolumeBlock {
  279. // Block PV does not match anything when the feature is off. We explicitly want
  280. // to prevent binding block PV to filesystem PVC.
  281. // The PV should be ignored by PV controller.
  282. return true
  283. }
  284. // Both PV + PVC are not block.
  285. return false
  286. }
  287. // In HA upgrades, we cannot guarantee that the apiserver is on a version >= controller-manager.
  288. // So we default a nil volumeMode to filesystem
  289. requestedVolumeMode := v1.PersistentVolumeFilesystem
  290. if pvcSpec.VolumeMode != nil {
  291. requestedVolumeMode = *pvcSpec.VolumeMode
  292. }
  293. pvVolumeMode := v1.PersistentVolumeFilesystem
  294. if pvSpec.VolumeMode != nil {
  295. pvVolumeMode = *pvSpec.VolumeMode
  296. }
  297. return requestedVolumeMode != pvVolumeMode
  298. }
  299. // CheckAccessModes returns true if PV satisfies all the PVC's requested AccessModes
  300. func CheckAccessModes(claim *v1.PersistentVolumeClaim, volume *v1.PersistentVolume) bool {
  301. pvModesMap := map[v1.PersistentVolumeAccessMode]bool{}
  302. for _, mode := range volume.Spec.AccessModes {
  303. pvModesMap[mode] = true
  304. }
  305. for _, mode := range claim.Spec.AccessModes {
  306. _, ok := pvModesMap[mode]
  307. if !ok {
  308. return false
  309. }
  310. }
  311. return true
  312. }
  313. func claimToClaimKey(claim *v1.PersistentVolumeClaim) string {
  314. return fmt.Sprintf("%s/%s", claim.Namespace, claim.Name)
  315. }
  316. // GetVolumeNodeAffinity returns a VolumeNodeAffinity for given key and value.
  317. func GetVolumeNodeAffinity(key string, value string) *v1.VolumeNodeAffinity {
  318. return &v1.VolumeNodeAffinity{
  319. Required: &v1.NodeSelector{
  320. NodeSelectorTerms: []v1.NodeSelectorTerm{
  321. {
  322. MatchExpressions: []v1.NodeSelectorRequirement{
  323. {
  324. Key: key,
  325. Operator: v1.NodeSelectorOpIn,
  326. Values: []string{value},
  327. },
  328. },
  329. },
  330. },
  331. },
  332. }
  333. }