util.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354
  1. /*
  2. Copyright 2019 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package persistentvolume
  14. import (
  15. "fmt"
  16. "k8s.io/api/core/v1"
  17. storage "k8s.io/api/storage/v1"
  18. "k8s.io/apimachinery/pkg/api/resource"
  19. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  20. "k8s.io/apimachinery/pkg/labels"
  21. utilfeature "k8s.io/apiserver/pkg/util/feature"
  22. "k8s.io/client-go/kubernetes/scheme"
  23. storagelisters "k8s.io/client-go/listers/storage/v1"
  24. "k8s.io/client-go/tools/reference"
  25. v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
  26. "k8s.io/kubernetes/pkg/features"
  27. volumeutil "k8s.io/kubernetes/pkg/volume/util"
  28. )
  29. const (
  30. // AnnBindCompleted Annotation applies to PVCs. It indicates that the lifecycle
  31. // of the PVC has passed through the initial setup. This information changes how
  32. // we interpret some observations of the state of the objects. Value of this
  33. // Annotation does not matter.
  34. AnnBindCompleted = "pv.kubernetes.io/bind-completed"
  35. // AnnBoundByController annotation applies to PVs and PVCs. It indicates that
  36. // the binding (PV->PVC or PVC->PV) was installed by the controller. The
  37. // absence of this annotation means the binding was done by the user (i.e.
  38. // pre-bound). Value of this annotation does not matter.
  39. // External PV binders must bind PV the same way as PV controller, otherwise PV
  40. // controller may not handle it correctly.
  41. AnnBoundByController = "pv.kubernetes.io/bound-by-controller"
  42. // AnnSelectedNode annotation is added to a PVC that has been triggered by scheduler to
  43. // be dynamically provisioned. Its value is the name of the selected node.
  44. AnnSelectedNode = "volume.kubernetes.io/selected-node"
  45. // NotSupportedProvisioner is a special provisioner name which can be set
  46. // in storage class to indicate dynamic provisioning is not supported by
  47. // the storage.
  48. NotSupportedProvisioner = "kubernetes.io/no-provisioner"
  49. // AnnDynamicallyProvisioned annotation is added to a PV that has been dynamically provisioned by
  50. // Kubernetes. Its value is name of volume plugin that created the volume.
  51. // It serves both user (to show where a PV comes from) and Kubernetes (to
  52. // recognize dynamically provisioned PVs in its decisions).
  53. AnnDynamicallyProvisioned = "pv.kubernetes.io/provisioned-by"
  54. // AnnStorageProvisioner annotation is added to a PVC that is supposed to be dynamically
  55. // provisioned. Its value is name of volume plugin that is supposed to provision
  56. // a volume for this PVC.
  57. AnnStorageProvisioner = "volume.beta.kubernetes.io/storage-provisioner"
  58. )
  59. // IsDelayBindingMode checks if claim is in delay binding mode.
  60. func IsDelayBindingMode(claim *v1.PersistentVolumeClaim, classLister storagelisters.StorageClassLister) (bool, error) {
  61. className := v1helper.GetPersistentVolumeClaimClass(claim)
  62. if className == "" {
  63. return false, nil
  64. }
  65. class, err := classLister.Get(className)
  66. if err != nil {
  67. return false, nil
  68. }
  69. if class.VolumeBindingMode == nil {
  70. return false, fmt.Errorf("VolumeBindingMode not set for StorageClass %q", className)
  71. }
  72. return *class.VolumeBindingMode == storage.VolumeBindingWaitForFirstConsumer, nil
  73. }
  74. // GetBindVolumeToClaim returns a new volume which is bound to given claim. In
  75. // addition, it returns a bool which indicates whether we made modification on
  76. // original volume.
  77. func GetBindVolumeToClaim(volume *v1.PersistentVolume, claim *v1.PersistentVolumeClaim) (*v1.PersistentVolume, bool, error) {
  78. dirty := false
  79. // Check if the volume was already bound (either by user or by controller)
  80. shouldSetBoundByController := false
  81. if !IsVolumeBoundToClaim(volume, claim) {
  82. shouldSetBoundByController = true
  83. }
  84. // The volume from method args can be pointing to watcher cache. We must not
  85. // modify these, therefore create a copy.
  86. volumeClone := volume.DeepCopy()
  87. // Bind the volume to the claim if it is not bound yet
  88. if volume.Spec.ClaimRef == nil ||
  89. volume.Spec.ClaimRef.Name != claim.Name ||
  90. volume.Spec.ClaimRef.Namespace != claim.Namespace ||
  91. volume.Spec.ClaimRef.UID != claim.UID {
  92. claimRef, err := reference.GetReference(scheme.Scheme, claim)
  93. if err != nil {
  94. return nil, false, fmt.Errorf("Unexpected error getting claim reference: %v", err)
  95. }
  96. volumeClone.Spec.ClaimRef = claimRef
  97. dirty = true
  98. }
  99. // Set AnnBoundByController if it is not set yet
  100. if shouldSetBoundByController && !metav1.HasAnnotation(volumeClone.ObjectMeta, AnnBoundByController) {
  101. metav1.SetMetaDataAnnotation(&volumeClone.ObjectMeta, AnnBoundByController, "yes")
  102. dirty = true
  103. }
  104. return volumeClone, dirty, nil
  105. }
  106. // IsVolumeBoundToClaim returns true, if given volume is pre-bound or bound
  107. // to specific claim. Both claim.Name and claim.Namespace must be equal.
  108. // If claim.UID is present in volume.Spec.ClaimRef, it must be equal too.
  109. func IsVolumeBoundToClaim(volume *v1.PersistentVolume, claim *v1.PersistentVolumeClaim) bool {
  110. if volume.Spec.ClaimRef == nil {
  111. return false
  112. }
  113. if claim.Name != volume.Spec.ClaimRef.Name || claim.Namespace != volume.Spec.ClaimRef.Namespace {
  114. return false
  115. }
  116. if volume.Spec.ClaimRef.UID != "" && claim.UID != volume.Spec.ClaimRef.UID {
  117. return false
  118. }
  119. return true
  120. }
  121. // FindMatchingVolume goes through the list of volumes to find the best matching volume
  122. // for the claim.
  123. //
  124. // This function is used by both the PV controller and scheduler.
  125. //
  126. // delayBinding is true only in the PV controller path. When set, prebound PVs are still returned
  127. // as a match for the claim, but unbound PVs are skipped.
  128. //
  129. // node is set only in the scheduler path. When set, the PV node affinity is checked against
  130. // the node's labels.
  131. //
  132. // excludedVolumes is only used in the scheduler path, and is needed for evaluating multiple
  133. // unbound PVCs for a single Pod at one time. As each PVC finds a matching PV, the chosen
  134. // PV needs to be excluded from future matching.
  135. func FindMatchingVolume(
  136. claim *v1.PersistentVolumeClaim,
  137. volumes []*v1.PersistentVolume,
  138. node *v1.Node,
  139. excludedVolumes map[string]*v1.PersistentVolume,
  140. delayBinding bool) (*v1.PersistentVolume, error) {
  141. var smallestVolume *v1.PersistentVolume
  142. var smallestVolumeQty resource.Quantity
  143. requestedQty := claim.Spec.Resources.Requests[v1.ResourceName(v1.ResourceStorage)]
  144. requestedClass := v1helper.GetPersistentVolumeClaimClass(claim)
  145. var selector labels.Selector
  146. if claim.Spec.Selector != nil {
  147. internalSelector, err := metav1.LabelSelectorAsSelector(claim.Spec.Selector)
  148. if err != nil {
  149. // should be unreachable code due to validation
  150. return nil, fmt.Errorf("error creating internal label selector for claim: %v: %v", claimToClaimKey(claim), err)
  151. }
  152. selector = internalSelector
  153. }
  154. // Go through all available volumes with two goals:
  155. // - find a volume that is either pre-bound by user or dynamically
  156. // provisioned for this claim. Because of this we need to loop through
  157. // all volumes.
  158. // - find the smallest matching one if there is no volume pre-bound to
  159. // the claim.
  160. for _, volume := range volumes {
  161. if _, ok := excludedVolumes[volume.Name]; ok {
  162. // Skip volumes in the excluded list
  163. continue
  164. }
  165. volumeQty := volume.Spec.Capacity[v1.ResourceStorage]
  166. // check if volumeModes do not match (feature gate protected)
  167. isMismatch, err := CheckVolumeModeMismatches(&claim.Spec, &volume.Spec)
  168. if err != nil {
  169. return nil, fmt.Errorf("error checking if volumeMode was a mismatch: %v", err)
  170. }
  171. // filter out mismatching volumeModes
  172. if isMismatch {
  173. continue
  174. }
  175. // check if PV's DeletionTimeStamp is set, if so, skip this volume.
  176. if utilfeature.DefaultFeatureGate.Enabled(features.StorageObjectInUseProtection) {
  177. if volume.ObjectMeta.DeletionTimestamp != nil {
  178. continue
  179. }
  180. }
  181. nodeAffinityValid := true
  182. if node != nil {
  183. // Scheduler path, check that the PV NodeAffinity
  184. // is satisfied by the node
  185. err := volumeutil.CheckNodeAffinity(volume, node.Labels)
  186. if err != nil {
  187. nodeAffinityValid = false
  188. }
  189. }
  190. if IsVolumeBoundToClaim(volume, claim) {
  191. // this claim and volume are pre-bound; return
  192. // the volume if the size request is satisfied,
  193. // otherwise continue searching for a match
  194. if volumeQty.Cmp(requestedQty) < 0 {
  195. continue
  196. }
  197. // If PV node affinity is invalid, return no match.
  198. // This means the prebound PV (and therefore PVC)
  199. // is not suitable for this node.
  200. if !nodeAffinityValid {
  201. return nil, nil
  202. }
  203. return volume, nil
  204. }
  205. if node == nil && delayBinding {
  206. // PV controller does not bind this claim.
  207. // Scheduler will handle binding unbound volumes
  208. // Scheduler path will have node != nil
  209. continue
  210. }
  211. // filter out:
  212. // - volumes in non-available phase
  213. // - volumes bound to another claim
  214. // - volumes whose labels don't match the claim's selector, if specified
  215. // - volumes in Class that is not requested
  216. // - volumes whose NodeAffinity does not match the node
  217. if volume.Status.Phase != v1.VolumeAvailable {
  218. // We ignore volumes in non-available phase, because volumes that
  219. // satisfies matching criteria will be updated to available, binding
  220. // them now has high chance of encountering unnecessary failures
  221. // due to API conflicts.
  222. continue
  223. } else if volume.Spec.ClaimRef != nil {
  224. continue
  225. } else if selector != nil && !selector.Matches(labels.Set(volume.Labels)) {
  226. continue
  227. }
  228. if v1helper.GetPersistentVolumeClass(volume) != requestedClass {
  229. continue
  230. }
  231. if !nodeAffinityValid {
  232. continue
  233. }
  234. if node != nil {
  235. // Scheduler path
  236. // Check that the access modes match
  237. if !CheckAccessModes(claim, volume) {
  238. continue
  239. }
  240. }
  241. if volumeQty.Cmp(requestedQty) >= 0 {
  242. if smallestVolume == nil || smallestVolumeQty.Cmp(volumeQty) > 0 {
  243. smallestVolume = volume
  244. smallestVolumeQty = volumeQty
  245. }
  246. }
  247. }
  248. if smallestVolume != nil {
  249. // Found a matching volume
  250. return smallestVolume, nil
  251. }
  252. return nil, nil
  253. }
  254. // CheckVolumeModeMismatches is a convenience method that checks volumeMode for PersistentVolume
  255. // and PersistentVolumeClaims
  256. func CheckVolumeModeMismatches(pvcSpec *v1.PersistentVolumeClaimSpec, pvSpec *v1.PersistentVolumeSpec) (bool, error) {
  257. if !utilfeature.DefaultFeatureGate.Enabled(features.BlockVolume) {
  258. return false, nil
  259. }
  260. // In HA upgrades, we cannot guarantee that the apiserver is on a version >= controller-manager.
  261. // So we default a nil volumeMode to filesystem
  262. requestedVolumeMode := v1.PersistentVolumeFilesystem
  263. if pvcSpec.VolumeMode != nil {
  264. requestedVolumeMode = *pvcSpec.VolumeMode
  265. }
  266. pvVolumeMode := v1.PersistentVolumeFilesystem
  267. if pvSpec.VolumeMode != nil {
  268. pvVolumeMode = *pvSpec.VolumeMode
  269. }
  270. return requestedVolumeMode != pvVolumeMode, nil
  271. }
  272. // CheckAccessModes returns true if PV satisfies all the PVC's requested AccessModes
  273. func CheckAccessModes(claim *v1.PersistentVolumeClaim, volume *v1.PersistentVolume) bool {
  274. pvModesMap := map[v1.PersistentVolumeAccessMode]bool{}
  275. for _, mode := range volume.Spec.AccessModes {
  276. pvModesMap[mode] = true
  277. }
  278. for _, mode := range claim.Spec.AccessModes {
  279. _, ok := pvModesMap[mode]
  280. if !ok {
  281. return false
  282. }
  283. }
  284. return true
  285. }
  286. func claimToClaimKey(claim *v1.PersistentVolumeClaim) string {
  287. return fmt.Sprintf("%s/%s", claim.Namespace, claim.Name)
  288. }
  289. // GetVolumeNodeAffinity returns a VolumeNodeAffinity for given key and value.
  290. func GetVolumeNodeAffinity(key string, value string) *v1.VolumeNodeAffinity {
  291. return &v1.VolumeNodeAffinity{
  292. Required: &v1.NodeSelector{
  293. NodeSelectorTerms: []v1.NodeSelectorTerm{
  294. {
  295. MatchExpressions: []v1.NodeSelectorRequirement{
  296. {
  297. Key: key,
  298. Operator: v1.NodeSelectorOpIn,
  299. Values: []string{value},
  300. },
  301. },
  302. },
  303. },
  304. },
  305. }
  306. }