123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110 |
- /*
- Copyright 2016 The Kubernetes Authors.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
- package priorities
- import (
- "fmt"
- "strings"
- v1 "k8s.io/api/core/v1"
- schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
- schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo"
- "k8s.io/kubernetes/pkg/util/parsers"
- )
// minThreshold and maxThreshold bound the image score range: a summed image score is clamped
// to [minThreshold, maxThreshold] before it is mapped to a node priority. They correspond to a
// reasonable size range for container images compressed and stored in registries; the 90th
// percentile of images on Docker Hub falls into this range.
const (
	// mb is 1024 * 1024, the multiplier used to express the thresholds below.
	mb int64 = 1 << 20
	// minThreshold is the summed score at or below which a node receives the lowest priority.
	minThreshold int64 = 23 * mb
	// maxThreshold is the summed score at or above which a node receives the highest priority.
	maxThreshold int64 = 1000 * mb
)
- // ImageLocalityPriorityMap is a priority function that favors nodes that already have requested pod container's images.
- // It will detect whether the requested images are present on a node, and then calculate a score ranging from 0 to 10
- // based on the total size of those images.
- // - If none of the images are present, this node will be given the lowest priority.
- // - If some of the images are present on a node, the larger their sizes' sum, the higher the node's priority.
- func ImageLocalityPriorityMap(pod *v1.Pod, meta interface{}, nodeInfo *schedulernodeinfo.NodeInfo) (schedulerapi.HostPriority, error) {
- node := nodeInfo.Node()
- if node == nil {
- return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
- }
- var score int
- if priorityMeta, ok := meta.(*priorityMetadata); ok {
- score = calculatePriority(sumImageScores(nodeInfo, pod.Spec.Containers, priorityMeta.totalNumNodes))
- } else {
- // if we are not able to parse priority meta data, skip this priority
- score = 0
- }
- return schedulerapi.HostPriority{
- Host: node.Name,
- Score: float64(score),
- }, nil
- }
- // calculatePriority returns the priority of a node. Given the sumScores of requested images on the node, the node's
- // priority is obtained by scaling the maximum priority value with a ratio proportional to the sumScores.
- func calculatePriority(sumScores int64) int {
- if sumScores < minThreshold {
- sumScores = minThreshold
- } else if sumScores > maxThreshold {
- sumScores = maxThreshold
- }
- return int(int64(schedulerapi.MaxPriority) * (sumScores - minThreshold) / (maxThreshold - minThreshold))
- }
- // sumImageScores returns the sum of image scores of all the containers that are already on the node.
- // Each image receives a raw score of its size, scaled by scaledImageScore. The raw scores are later used to calculate
- // the final score. Note that the init containers are not considered for it's rare for users to deploy huge init containers.
- func sumImageScores(nodeInfo *schedulernodeinfo.NodeInfo, containers []v1.Container, totalNumNodes int) int64 {
- var sum int64
- imageStates := nodeInfo.ImageStates()
- for _, container := range containers {
- if state, ok := imageStates[normalizedImageName(container.Image)]; ok {
- sum += scaledImageScore(state, totalNumNodes)
- }
- }
- return sum
- }
- // scaledImageScore returns an adaptively scaled score for the given state of an image.
- // The size of the image is used as the base score, scaled by a factor which considers how much nodes the image has "spread" to.
- // This heuristic aims to mitigate the undesirable "node heating problem", i.e., pods get assigned to the same or
- // a few nodes due to image locality.
- func scaledImageScore(imageState *schedulernodeinfo.ImageStateSummary, totalNumNodes int) int64 {
- spread := float64(imageState.NumNodes) / float64(totalNumNodes)
- return int64(float64(imageState.Size) * spread)
- }
- // normalizedImageName returns the CRI compliant name for a given image.
- // TODO: cover the corner cases of missed matches, e.g,
- // 1. Using Docker as runtime and docker.io/library/test:tag in pod spec, but only test:tag will present in node status
- // 2. Using the implicit registry, i.e., test:tag or library/test:tag in pod spec but only docker.io/library/test:tag
- // in node status; note that if users consistently use one registry format, this should not happen.
- func normalizedImageName(name string) string {
- if strings.LastIndex(name, ":") <= strings.LastIndex(name, "/") {
- name = name + ":" + parsers.DefaultImageTag
- }
- return name
- }
|