interpod_affinity.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package priorities

import (
	"context"
	"sync"
	"sync/atomic"

	v1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/util/workqueue"
	"k8s.io/klog"
	"k8s.io/kubernetes/pkg/scheduler/algorithm"
	"k8s.io/kubernetes/pkg/scheduler/algorithm/predicates"
	priorityutil "k8s.io/kubernetes/pkg/scheduler/algorithm/priorities/util"
	schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
	schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo"
)

// InterPodAffinity contains information to calculate inter pod affinity.
type InterPodAffinity struct {
	info                  predicates.NodeInfo
	nodeLister            algorithm.NodeLister
	podLister             algorithm.PodLister
	hardPodAffinityWeight int32
}

// NewInterPodAffinityPriority creates an InterPodAffinity.
func NewInterPodAffinityPriority(
	info predicates.NodeInfo,
	nodeLister algorithm.NodeLister,
	podLister algorithm.PodLister,
	hardPodAffinityWeight int32) PriorityFunction {
	interPodAffinity := &InterPodAffinity{
		info:                  info,
		nodeLister:            nodeLister,
		podLister:             podLister,
		hardPodAffinityWeight: hardPodAffinityWeight,
	}
	return interPodAffinity.CalculateInterPodAffinityPriority
}

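// Minimal usage sketch (the arguments below are illustrative placeholders, not
// values defined in this file):
//
//	prioritize := NewInterPodAffinityPriority(nodeInfo, nodeLister, podLister, 1)
//	hostPriorities, err := prioritize(pod, nodeNameToInfo, nodes)
//
// The returned PriorityFunction scores every candidate node in a single call.
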
type podAffinityPriorityMap struct {
	sync.Mutex

	// nodes contains all nodes that should be considered.
	nodes []*v1.Node
	// counts stores the mapping from node name to the so-far computed score of
	// the node.
	counts map[string]*int64
	// firstError is the first error that we encountered.
	firstError error
}

func newPodAffinityPriorityMap(nodes []*v1.Node) *podAffinityPriorityMap {
	return &podAffinityPriorityMap{
		nodes:  nodes,
		counts: make(map[string]*int64, len(nodes)),
	}
}

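// Note on the counts layout: each entry is a *int64 rather than a plain int64
// so that the parallel workers in CalculateInterPodAffinityPriority can bump a
// node's score with atomic.AddInt64 without taking the mutex; the map itself is
// only written while entries are pre-allocated, before any worker starts. The
// mutex above only protects firstError.
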
func (p *podAffinityPriorityMap) setError(err error) {
	p.Lock()
	defer p.Unlock()
	if p.firstError == nil {
		p.firstError = err
	}
}

func (p *podAffinityPriorityMap) processTerm(term *v1.PodAffinityTerm, podDefiningAffinityTerm, podToCheck *v1.Pod, fixedNode *v1.Node, weight int64) {
	namespaces := priorityutil.GetNamespacesFromPodAffinityTerm(podDefiningAffinityTerm, term)
	selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector)
	if err != nil {
		p.setError(err)
		return
	}
	match := priorityutil.PodMatchesTermsNamespaceAndSelector(podToCheck, namespaces, selector)
	if match {
		for _, node := range p.nodes {
			if priorityutil.NodesHaveSameTopologyKey(node, fixedNode, term.TopologyKey) {
				// Guard against nodes whose score entry was never pre-allocated
				// (possible when the incoming pod declares no affinity terms of
				// its own); such nodes are treated as zero-score anyway.
				if count := p.counts[node.Name]; count != nil {
					atomic.AddInt64(count, weight)
				}
			}
		}
	}
}

func (p *podAffinityPriorityMap) processTerms(terms []v1.WeightedPodAffinityTerm, podDefiningAffinityTerm, podToCheck *v1.Pod, fixedNode *v1.Node, multiplier int) {
	for i := range terms {
		term := &terms[i]
		p.processTerm(&term.PodAffinityTerm, podDefiningAffinityTerm, podToCheck, fixedNode, int64(term.Weight*int32(multiplier)))
	}
}

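// Illustrative example (the values are assumptions, not taken from this file):
// if a weighted affinity term uses topologyKey "failure-domain.beta.kubernetes.io/zone"
// with weight 5 and <podToCheck> matches its selector, processTerm adds 5 to the
// count of every candidate node in the same zone as <fixedNode>; passing a
// negative multiplier to processTerms turns the same bookkeeping into an
// anti-affinity penalty.
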
// CalculateInterPodAffinityPriority computes a sum by iterating through the elements of weightedPodAffinityTerm and adding
// "weight" to the sum if the corresponding PodAffinityTerm is satisfied for
// that node; the node(s) with the highest sum are the most preferred.
// Symmetry needs to be considered for preferredDuringSchedulingIgnoredDuringExecution from podAffinity & podAntiAffinity,
// as well as for the hard requirements from podAffinity.
func (ipa *InterPodAffinity) CalculateInterPodAffinityPriority(pod *v1.Pod, nodeNameToInfo map[string]*schedulernodeinfo.NodeInfo, nodes []*v1.Node) (schedulerapi.HostPriorityList, error) {
	affinity := pod.Spec.Affinity
	hasAffinityConstraints := affinity != nil && affinity.PodAffinity != nil
	hasAntiAffinityConstraints := affinity != nil && affinity.PodAntiAffinity != nil

	// priorityMap stores the mapping from node name to the so-far computed score
	// of the node.
	pm := newPodAffinityPriorityMap(nodes)
	allNodeNames := make([]string, 0, len(nodeNameToInfo))
	lazyInit := hasAffinityConstraints || hasAntiAffinityConstraints
	for name := range nodeNameToInfo {
		allNodeNames = append(allNodeNames, name)
		// Allocate a score entry if the pod has affinity defined, or if the
		// target node hosts pods with affinity.
		if lazyInit || len(nodeNameToInfo[name].PodsWithAffinity()) != 0 {
			pm.counts[name] = new(int64)
		}
	}

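	// Nodes without an entry in pm.counts can only ever have a zero score; they
	// are skipped below when computing the max/min and get a zero final score.
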
	// convert the topology key based weights to the node name based weights
	var maxCount, minCount int64

	processPod := func(existingPod *v1.Pod) error {
		existingPodNode, err := ipa.info.GetNodeInfo(existingPod.Spec.NodeName)
		if err != nil {
			if apierrors.IsNotFound(err) {
				klog.Errorf("Node not found, %v", existingPod.Spec.NodeName)
				return nil
			}
			return err
		}
		existingPodAffinity := existingPod.Spec.Affinity
		existingHasAffinityConstraints := existingPodAffinity != nil && existingPodAffinity.PodAffinity != nil
		existingHasAntiAffinityConstraints := existingPodAffinity != nil && existingPodAffinity.PodAntiAffinity != nil

		if hasAffinityConstraints {
			// For every soft pod affinity term of <pod>, if <existingPod> matches the term,
			// increment <pm.counts> for every node in the cluster with the same <term.TopologyKey>
			// value as that of <existingPod>'s node by the term's weight.
			terms := affinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution
			pm.processTerms(terms, pod, existingPod, existingPodNode, 1)
		}
		if hasAntiAffinityConstraints {
			// For every soft pod anti-affinity term of <pod>, if <existingPod> matches the term,
			// decrement <pm.counts> for every node in the cluster with the same <term.TopologyKey>
			// value as that of <existingPod>'s node by the term's weight.
			terms := affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution
			pm.processTerms(terms, pod, existingPod, existingPodNode, -1)
		}

		if existingHasAffinityConstraints {
			// For every hard pod affinity term of <existingPod>, if <pod> matches the term,
			// increment <pm.counts> for every node in the cluster with the same <term.TopologyKey>
			// value as that of <existingPod>'s node by the constant <ipa.hardPodAffinityWeight>.
			if ipa.hardPodAffinityWeight > 0 {
				terms := existingPodAffinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution
				// TODO: Uncomment this block when implement RequiredDuringSchedulingRequiredDuringExecution.
				//if len(existingPodAffinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
				//	terms = append(terms, existingPodAffinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
				//}
				for _, term := range terms {
					pm.processTerm(&term, existingPod, pod, existingPodNode, int64(ipa.hardPodAffinityWeight))
				}
			}
			// For every soft pod affinity term of <existingPod>, if <pod> matches the term,
			// increment <pm.counts> for every node in the cluster with the same <term.TopologyKey>
			// value as that of <existingPod>'s node by the term's weight.
			terms := existingPodAffinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution
			pm.processTerms(terms, existingPod, pod, existingPodNode, 1)
		}
		if existingHasAntiAffinityConstraints {
			// For every soft pod anti-affinity term of <existingPod>, if <pod> matches the term,
			// decrement <pm.counts> for every node in the cluster with the same <term.TopologyKey>
			// value as that of <existingPod>'s node by the term's weight.
			terms := existingPodAffinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution
			pm.processTerms(terms, existingPod, pod, existingPodNode, -1)
		}
		return nil
	}

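	// processNode scores the pods hosted on a single candidate node. When the
	// incoming pod declares no affinity/anti-affinity terms, only existing pods
	// that declare affinity (PodsWithAffinity) can influence the score, so the
	// remaining pods are skipped.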
	processNode := func(i int) {
		nodeInfo := nodeNameToInfo[allNodeNames[i]]
		if nodeInfo.Node() != nil {
			if hasAffinityConstraints || hasAntiAffinityConstraints {
				// We need to process all the pods.
				for _, existingPod := range nodeInfo.Pods() {
					if err := processPod(existingPod); err != nil {
						pm.setError(err)
					}
				}
			} else {
				// The pod doesn't have any constraints - we need to check only existing
				// ones that have some.
				for _, existingPod := range nodeInfo.PodsWithAffinity() {
					if err := processPod(existingPod); err != nil {
						pm.setError(err)
					}
				}
			}
		}
	}
	workqueue.ParallelizeUntil(context.TODO(), 16, len(allNodeNames), processNode)

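	// Surface the first error recorded by any worker; scoring is aborted rather
	// than returning a partial result.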
	if pm.firstError != nil {
		return nil, pm.firstError
	}

	// Determine the highest and lowest accumulated counts for normalization.
	for _, node := range nodes {
		if pm.counts[node.Name] == nil {
			continue
		}
		if *pm.counts[node.Name] > maxCount {
			maxCount = *pm.counts[node.Name]
		}
		if *pm.counts[node.Name] < minCount {
			minCount = *pm.counts[node.Name]
		}
	}

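	// Illustrative normalization example (assumed counts, not from this code):
	// with accumulated counts {A: 4, B: -2, C: 1} and MaxPriority = 10, minCount
	// is -2 and maxCount is 4, giving final scores A = 10*(4-(-2))/6 = 10,
	// B = 0, and C = 5.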
	// calculate final priority score for each node
	result := make(schedulerapi.HostPriorityList, 0, len(nodes))
	maxMinDiff := maxCount - minCount
	for _, node := range nodes {
		fScore := float64(0)
		if maxMinDiff > 0 && pm.counts[node.Name] != nil {
			fScore = float64(schedulerapi.MaxPriority) * (float64(*pm.counts[node.Name]-minCount) / float64(maxCount-minCount))
		}
		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: fScore})
		if klog.V(10) {
			klog.Infof("%v -> %v: InterPodAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore))
		}
	}
	return result, nil
}