/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package eviction

import (
	"fmt"
	"sort"
	"strconv"
	"strings"
	"time"

	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/klog"
	"k8s.io/kubernetes/pkg/features"
	statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
	schedulerutils "k8s.io/kubernetes/pkg/scheduler/util"
	volumeutils "k8s.io/kubernetes/pkg/volume/util"
)
const (
	unsupportedEvictionSignal = "unsupported eviction signal %v"
	// Reason is the reason reported back in status.
	Reason = "Evicted"
	// nodeLowMessageFmt is the message for evictions due to resource pressure.
	nodeLowMessageFmt = "The node was low on resource: %v. "
	// nodeConditionMessageFmt is the message for evictions due to a node condition.
	nodeConditionMessageFmt = "The node had condition: %v. "
	// containerMessageFmt provides additional information for containers exceeding requests
	containerMessageFmt = "Container %s was using %s, which exceeds its request of %s. "
	// containerEphemeralStorageMessageFmt provides additional information for containers which have exceeded their ephemeral storage limit
	containerEphemeralStorageMessageFmt = "Container %s exceeded its local ephemeral storage limit %q. "
	// podEphemeralStorageMessageFmt provides additional information for pods which have exceeded their ephemeral storage limit
	podEphemeralStorageMessageFmt = "Pod ephemeral local storage usage exceeds the total limit of containers %s. "
	// emptyDirMessageFmt provides additional information for empty-dir volumes which have exceeded their size limit
	emptyDirMessageFmt = "Usage of EmptyDir volume %q exceeds the limit %q. "
	// resourceInodes, number. internal to this module, used to account for local disk inode consumption.
	resourceInodes v1.ResourceName = "inodes"
	// resourcePids, number. internal to this module, used to account for local pid consumption.
	resourcePids v1.ResourceName = "pids"
	// OffendingContainersKey is the key in eviction event annotations for the list of container names which exceeded their requests
	OffendingContainersKey = "offending_containers"
	// OffendingContainersUsageKey is the key in eviction event annotations for the list of usage of containers which exceeded their requests
	OffendingContainersUsageKey = "offending_containers_usage"
	// StarvedResourceKey is the key for the starved resource in eviction event annotations
	StarvedResourceKey = "starved_resource"
)
var (
	// signalToNodeCondition maps a signal to the node condition to report if the threshold is met.
	signalToNodeCondition map[evictionapi.Signal]v1.NodeConditionType
	// signalToResource maps a Signal to its associated Resource.
	signalToResource map[evictionapi.Signal]v1.ResourceName
)

func init() {
	// map eviction signals to node conditions
	signalToNodeCondition = map[evictionapi.Signal]v1.NodeConditionType{}
	signalToNodeCondition[evictionapi.SignalMemoryAvailable] = v1.NodeMemoryPressure
	signalToNodeCondition[evictionapi.SignalAllocatableMemoryAvailable] = v1.NodeMemoryPressure
	signalToNodeCondition[evictionapi.SignalImageFsAvailable] = v1.NodeDiskPressure
	signalToNodeCondition[evictionapi.SignalNodeFsAvailable] = v1.NodeDiskPressure
	signalToNodeCondition[evictionapi.SignalImageFsInodesFree] = v1.NodeDiskPressure
	signalToNodeCondition[evictionapi.SignalNodeFsInodesFree] = v1.NodeDiskPressure
	signalToNodeCondition[evictionapi.SignalPIDAvailable] = v1.NodePIDPressure

	// map signals to resources (and vice-versa)
	signalToResource = map[evictionapi.Signal]v1.ResourceName{}
	signalToResource[evictionapi.SignalMemoryAvailable] = v1.ResourceMemory
	signalToResource[evictionapi.SignalAllocatableMemoryAvailable] = v1.ResourceMemory
	signalToResource[evictionapi.SignalImageFsAvailable] = v1.ResourceEphemeralStorage
	signalToResource[evictionapi.SignalImageFsInodesFree] = resourceInodes
	signalToResource[evictionapi.SignalNodeFsAvailable] = v1.ResourceEphemeralStorage
	signalToResource[evictionapi.SignalNodeFsInodesFree] = resourceInodes
	signalToResource[evictionapi.SignalPIDAvailable] = resourcePids
}
// validSignal returns true if the signal is supported.
func validSignal(signal evictionapi.Signal) bool {
	_, found := signalToResource[signal]
	return found
}
// ParseThresholdConfig parses the flags for thresholds.
func ParseThresholdConfig(allocatableConfig []string, evictionHard, evictionSoft, evictionSoftGracePeriod, evictionMinimumReclaim map[string]string) ([]evictionapi.Threshold, error) {
	results := []evictionapi.Threshold{}
	hardThresholds, err := parseThresholdStatements(evictionHard)
	if err != nil {
		return nil, err
	}
	results = append(results, hardThresholds...)
	softThresholds, err := parseThresholdStatements(evictionSoft)
	if err != nil {
		return nil, err
	}
	gracePeriods, err := parseGracePeriods(evictionSoftGracePeriod)
	if err != nil {
		return nil, err
	}
	minReclaims, err := parseMinimumReclaims(evictionMinimumReclaim)
	if err != nil {
		return nil, err
	}
	for i := range softThresholds {
		signal := softThresholds[i].Signal
		period, found := gracePeriods[signal]
		if !found {
			return nil, fmt.Errorf("grace period must be specified for the soft eviction threshold %v", signal)
		}
		softThresholds[i].GracePeriod = period
	}
	results = append(results, softThresholds...)
	for i := range results {
		for signal, minReclaim := range minReclaims {
			if results[i].Signal == signal {
				results[i].MinReclaim = &minReclaim
				break
			}
		}
	}
	for _, key := range allocatableConfig {
		if key == kubetypes.NodeAllocatableEnforcementKey {
			results = addAllocatableThresholds(results)
			break
		}
	}
	return results, nil
}
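// For illustration, given hypothetical flag values such as
//
//	evictionHard = map[string]string{"memory.available": "100Mi"}
//	evictionSoft = map[string]string{"nodefs.available": "10%"}
//	evictionSoftGracePeriod = map[string]string{"nodefs.available": "1m"}
//
// ParseThresholdConfig returns two thresholds: a hard memory.available
// threshold of 100Mi (zero grace period) and a soft nodefs.available
// threshold of 10% that must hold for one minute before triggering.
// Omitting the grace period for a soft threshold is an error.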
func addAllocatableThresholds(thresholds []evictionapi.Threshold) []evictionapi.Threshold {
	additionalThresholds := []evictionapi.Threshold{}
	for _, threshold := range thresholds {
		if threshold.Signal == evictionapi.SignalMemoryAvailable && isHardEvictionThreshold(threshold) {
			// Copy the SignalMemoryAvailable to SignalAllocatableMemoryAvailable
			additionalThresholds = append(additionalThresholds, evictionapi.Threshold{
				Signal:     evictionapi.SignalAllocatableMemoryAvailable,
				Operator:   threshold.Operator,
				Value:      threshold.Value,
				MinReclaim: threshold.MinReclaim,
			})
		}
	}
	return append(thresholds, additionalThresholds...)
}
// parseThresholdStatements parses the input statements into a list of Threshold objects.
func parseThresholdStatements(statements map[string]string) ([]evictionapi.Threshold, error) {
	if len(statements) == 0 {
		return nil, nil
	}
	results := []evictionapi.Threshold{}
	for signal, val := range statements {
		result, err := parseThresholdStatement(evictionapi.Signal(signal), val)
		if err != nil {
			return nil, err
		}
		if result != nil {
			results = append(results, *result)
		}
	}
	return results, nil
}
// parseThresholdStatement parses a threshold statement and returns a threshold,
// or nil if the threshold should be ignored.
func parseThresholdStatement(signal evictionapi.Signal, val string) (*evictionapi.Threshold, error) {
	if !validSignal(signal) {
		return nil, fmt.Errorf(unsupportedEvictionSignal, signal)
	}
	operator := evictionapi.OpForSignal[signal]
	if strings.HasSuffix(val, "%") {
		// ignore 0% and 100%
		if val == "0%" || val == "100%" {
			return nil, nil
		}
		percentage, err := parsePercentage(val)
		if err != nil {
			return nil, err
		}
		if percentage < 0 {
			return nil, fmt.Errorf("eviction percentage threshold %v must be >= 0%%: %s", signal, val)
		}
		if percentage > 100 {
			return nil, fmt.Errorf("eviction percentage threshold %v must be <= 100%%: %s", signal, val)
		}
		return &evictionapi.Threshold{
			Signal:   signal,
			Operator: operator,
			Value: evictionapi.ThresholdValue{
				Percentage: percentage,
			},
		}, nil
	}
	quantity, err := resource.ParseQuantity(val)
	if err != nil {
		return nil, err
	}
	if quantity.Sign() < 0 || quantity.IsZero() {
		return nil, fmt.Errorf("eviction threshold %v must be positive: %s", signal, &quantity)
	}
	return &evictionapi.Threshold{
		Signal:   signal,
		Operator: operator,
		Value: evictionapi.ThresholdValue{
			Quantity: &quantity,
		},
	}, nil
}
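// For example, parseThresholdStatement(evictionapi.SignalMemoryAvailable, "10%")
// yields a threshold whose value is Percentage 0.1, while a quantity such as
// "1Gi" yields a Quantity threshold. "0%" and "100%" are deliberately ignored
// (nil threshold, nil error), since a threshold at either boundary is not a
// meaningful trigger.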
// parsePercentage parses a string representing a percentage value, e.g. "10%",
// and returns it as a fraction, e.g. 0.1.
func parsePercentage(input string) (float32, error) {
	value, err := strconv.ParseFloat(strings.TrimRight(input, "%"), 32)
	if err != nil {
		return 0, err
	}
	return float32(value) / 100, nil
}
// parseGracePeriods parses the grace period statements
func parseGracePeriods(statements map[string]string) (map[evictionapi.Signal]time.Duration, error) {
	if len(statements) == 0 {
		return nil, nil
	}
	results := map[evictionapi.Signal]time.Duration{}
	for signal, val := range statements {
		signal := evictionapi.Signal(signal)
		if !validSignal(signal) {
			return nil, fmt.Errorf(unsupportedEvictionSignal, signal)
		}
		gracePeriod, err := time.ParseDuration(val)
		if err != nil {
			return nil, err
		}
		if gracePeriod < 0 {
			return nil, fmt.Errorf("invalid eviction grace period specified: %v, must not be negative", val)
		}
		results[signal] = gracePeriod
	}
	return results, nil
}
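// For example, {"memory.available": "1m30s"} parses to a 90s grace period for
// the memory.available signal, while a negative duration such as "-30s" is
// rejected.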
// parseMinimumReclaims parses the minimum reclaim statements
func parseMinimumReclaims(statements map[string]string) (map[evictionapi.Signal]evictionapi.ThresholdValue, error) {
	if len(statements) == 0 {
		return nil, nil
	}
	results := map[evictionapi.Signal]evictionapi.ThresholdValue{}
	for signal, val := range statements {
		signal := evictionapi.Signal(signal)
		if !validSignal(signal) {
			return nil, fmt.Errorf(unsupportedEvictionSignal, signal)
		}
		if strings.HasSuffix(val, "%") {
			percentage, err := parsePercentage(val)
			if err != nil {
				return nil, err
			}
			if percentage <= 0 {
				return nil, fmt.Errorf("eviction percentage minimum reclaim %v must be positive: %s", signal, val)
			}
			results[signal] = evictionapi.ThresholdValue{
				Percentage: percentage,
			}
			continue
		}
		quantity, err := resource.ParseQuantity(val)
		if err != nil {
			return nil, err
		}
		if quantity.Sign() < 0 {
			return nil, fmt.Errorf("negative eviction minimum reclaim specified for %v", signal)
		}
		results[signal] = evictionapi.ThresholdValue{
			Quantity: &quantity,
		}
	}
	return results, nil
}
// diskUsage converts used bytes into a resource quantity.
func diskUsage(fsStats *statsapi.FsStats) *resource.Quantity {
	if fsStats == nil || fsStats.UsedBytes == nil {
		return &resource.Quantity{Format: resource.BinarySI}
	}
	usage := int64(*fsStats.UsedBytes)
	return resource.NewQuantity(usage, resource.BinarySI)
}

// inodeUsage converts inodes consumed into a resource quantity.
func inodeUsage(fsStats *statsapi.FsStats) *resource.Quantity {
	if fsStats == nil || fsStats.InodesUsed == nil {
		return &resource.Quantity{Format: resource.DecimalSI}
	}
	usage := int64(*fsStats.InodesUsed)
	return resource.NewQuantity(usage, resource.DecimalSI)
}

// memoryUsage converts working set into a resource quantity.
func memoryUsage(memStats *statsapi.MemoryStats) *resource.Quantity {
	if memStats == nil || memStats.WorkingSetBytes == nil {
		return &resource.Quantity{Format: resource.BinarySI}
	}
	usage := int64(*memStats.WorkingSetBytes)
	return resource.NewQuantity(usage, resource.BinarySI)
}
// localVolumeNames returns the set of volumes for the pod that are local
// TODO: summary API should report what volumes consume local storage rather than hard-code here.
func localVolumeNames(pod *v1.Pod) []string {
	result := []string{}
	for _, volume := range pod.Spec.Volumes {
		if volume.HostPath != nil ||
			(volume.EmptyDir != nil && volume.EmptyDir.Medium != v1.StorageMediumMemory) ||
			volume.ConfigMap != nil ||
			volume.GitRepo != nil {
			result = append(result, volume.Name)
		}
	}
	return result
}
// containerUsage aggregates container disk usage and inode consumption for the specified stats to measure.
func containerUsage(podStats statsapi.PodStats, statsToMeasure []fsStatsType) v1.ResourceList {
	disk := resource.Quantity{Format: resource.BinarySI}
	inodes := resource.Quantity{Format: resource.DecimalSI}
	for _, container := range podStats.Containers {
		if hasFsStatsType(statsToMeasure, fsStatsRoot) {
			disk.Add(*diskUsage(container.Rootfs))
			inodes.Add(*inodeUsage(container.Rootfs))
		}
		if hasFsStatsType(statsToMeasure, fsStatsLogs) {
			disk.Add(*diskUsage(container.Logs))
			inodes.Add(*inodeUsage(container.Logs))
		}
	}
	return v1.ResourceList{
		v1.ResourceEphemeralStorage: disk,
		resourceInodes:              inodes,
	}
}
// podLocalVolumeUsage aggregates pod local volumes disk usage and inode consumption for the specified stats to measure.
func podLocalVolumeUsage(volumeNames []string, podStats statsapi.PodStats) v1.ResourceList {
	disk := resource.Quantity{Format: resource.BinarySI}
	inodes := resource.Quantity{Format: resource.DecimalSI}
	for _, volumeName := range volumeNames {
		for _, volumeStats := range podStats.VolumeStats {
			if volumeStats.Name == volumeName {
				disk.Add(*diskUsage(&volumeStats.FsStats))
				inodes.Add(*inodeUsage(&volumeStats.FsStats))
				break
			}
		}
	}
	return v1.ResourceList{
		v1.ResourceEphemeralStorage: disk,
		resourceInodes:              inodes,
	}
}
// podDiskUsage aggregates pod disk usage and inode consumption for the specified stats to measure.
func podDiskUsage(podStats statsapi.PodStats, pod *v1.Pod, statsToMeasure []fsStatsType) (v1.ResourceList, error) {
	disk := resource.Quantity{Format: resource.BinarySI}
	inodes := resource.Quantity{Format: resource.DecimalSI}
	containerUsageList := containerUsage(podStats, statsToMeasure)
	disk.Add(containerUsageList[v1.ResourceEphemeralStorage])
	inodes.Add(containerUsageList[resourceInodes])
	if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) {
		volumeNames := localVolumeNames(pod)
		podLocalVolumeUsageList := podLocalVolumeUsage(volumeNames, podStats)
		disk.Add(podLocalVolumeUsageList[v1.ResourceEphemeralStorage])
		inodes.Add(podLocalVolumeUsageList[resourceInodes])
	}
	return v1.ResourceList{
		v1.ResourceEphemeralStorage: disk,
		resourceInodes:              inodes,
	}, nil
}
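// Roughly, for a pod measured with fsStatsRoot, fsStatsLogs, and
// fsStatsLocalVolumeSource, the reported ephemeral-storage usage is
//
//	sum(container rootfs) + sum(container logs) + sum(local volumes)
//
// with inode consumption aggregated the same way.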
// localEphemeralVolumeNames returns the set of ephemeral volumes for the pod that are local
func localEphemeralVolumeNames(pod *v1.Pod) []string {
	result := []string{}
	for _, volume := range pod.Spec.Volumes {
		if volumeutils.IsLocalEphemeralVolume(volume) {
			result = append(result, volume.Name)
		}
	}
	return result
}
// podLocalEphemeralStorageUsage aggregates pod local ephemeral storage usage and inode consumption for the specified stats to measure.
func podLocalEphemeralStorageUsage(podStats statsapi.PodStats, pod *v1.Pod, statsToMeasure []fsStatsType) (v1.ResourceList, error) {
	disk := resource.Quantity{Format: resource.BinarySI}
	inodes := resource.Quantity{Format: resource.DecimalSI}
	containerUsageList := containerUsage(podStats, statsToMeasure)
	disk.Add(containerUsageList[v1.ResourceEphemeralStorage])
	inodes.Add(containerUsageList[resourceInodes])
	if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) {
		volumeNames := localEphemeralVolumeNames(pod)
		podLocalVolumeUsageList := podLocalVolumeUsage(volumeNames, podStats)
		disk.Add(podLocalVolumeUsageList[v1.ResourceEphemeralStorage])
		inodes.Add(podLocalVolumeUsageList[resourceInodes])
	}
	return v1.ResourceList{
		v1.ResourceEphemeralStorage: disk,
		resourceInodes:              inodes,
	}, nil
}
// formatThreshold formats a threshold for logging.
func formatThreshold(threshold evictionapi.Threshold) string {
	return fmt.Sprintf("threshold(signal=%v, operator=%v, value=%v, gracePeriod=%v)", threshold.Signal, threshold.Operator, evictionapi.ThresholdValue(threshold.Value), threshold.GracePeriod)
}
// cachedStatsFunc returns a statsFunc based on the provided pod stats.
func cachedStatsFunc(podStats []statsapi.PodStats) statsFunc {
	uid2PodStats := map[string]statsapi.PodStats{}
	for i := range podStats {
		uid2PodStats[podStats[i].PodRef.UID] = podStats[i]
	}
	return func(pod *v1.Pod) (statsapi.PodStats, bool) {
		stats, found := uid2PodStats[string(pod.UID)]
		return stats, found
	}
}
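// A minimal usage sketch (summary is a *statsapi.Summary from the summary
// provider): build the lookup once, then query it per pod:
//
//	statsFunc := cachedStatsFunc(summary.Pods)
//	if podStats, ok := statsFunc(pod); ok {
//		// use podStats
//	}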
// cmpFunc compares p1 and p2 and returns:
//
//	-1 if p1 < p2
//	 0 if p1 == p2
//	+1 if p1 > p2
type cmpFunc func(p1, p2 *v1.Pod) int

// multiSorter implements the Sort interface, sorting the pods within.
type multiSorter struct {
	pods []*v1.Pod
	cmp  []cmpFunc
}
// Sort sorts the argument slice according to the cmp functions passed to orderedBy.
func (ms *multiSorter) Sort(pods []*v1.Pod) {
	ms.pods = pods
	sort.Sort(ms)
}

// orderedBy returns a multiSorter that sorts using the cmp functions, in order.
// Call its Sort method to sort the data.
func orderedBy(cmp ...cmpFunc) *multiSorter {
	return &multiSorter{
		cmp: cmp,
	}
}
// Len is part of sort.Interface.
func (ms *multiSorter) Len() int {
	return len(ms.pods)
}

// Swap is part of sort.Interface.
func (ms *multiSorter) Swap(i, j int) {
	ms.pods[i], ms.pods[j] = ms.pods[j], ms.pods[i]
}

// Less is part of sort.Interface.
func (ms *multiSorter) Less(i, j int) bool {
	p1, p2 := ms.pods[i], ms.pods[j]
	var k int
	for k = 0; k < len(ms.cmp)-1; k++ {
		cmpResult := ms.cmp[k](p1, p2)
		// p1 is less than p2
		if cmpResult < 0 {
			return true
		}
		// p1 is greater than p2
		if cmpResult > 0 {
			return false
		}
		// we don't know yet
	}
	// the last cmp func is the final decider
	return ms.cmp[k](p1, p2) < 0
}
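// For example, orderedBy(exceedMemoryRequests(stats), priority, memory(stats))
// consults priority only for pods that tie on whether they exceed their memory
// requests, and memory usage above request only for pods that also tie on
// priority.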
// priority compares pods by Priority, if the PodPriority feature is enabled;
// lower-priority pods sort first (and are therefore evicted first).
func priority(p1, p2 *v1.Pod) int {
	if !utilfeature.DefaultFeatureGate.Enabled(features.PodPriority) {
		// If priority is not enabled, all pods are equal.
		return 0
	}
	priority1 := schedulerutils.GetPodPriority(p1)
	priority2 := schedulerutils.GetPodPriority(p2)
	if priority1 == priority2 {
		return 0
	}
	if priority1 > priority2 {
		return 1
	}
	return -1
}
// exceedMemoryRequests compares whether or not pods' memory usage exceeds their requests
func exceedMemoryRequests(stats statsFunc) cmpFunc {
	return func(p1, p2 *v1.Pod) int {
		p1Stats, p1Found := stats(p1)
		p2Stats, p2Found := stats(p2)
		if !p1Found || !p2Found {
			// prioritize evicting the pod for which no stats were found
			return cmpBool(!p1Found, !p2Found)
		}
		p1Memory := memoryUsage(p1Stats.Memory)
		p2Memory := memoryUsage(p2Stats.Memory)
		p1ExceedsRequests := p1Memory.Cmp(podRequest(p1, v1.ResourceMemory)) == 1
		p2ExceedsRequests := p2Memory.Cmp(podRequest(p2, v1.ResourceMemory)) == 1
		// prioritize evicting the pod which exceeds its requests
		return cmpBool(p1ExceedsRequests, p2ExceedsRequests)
	}
}
// memory compares pods by largest consumer of memory relative to request.
func memory(stats statsFunc) cmpFunc {
	return func(p1, p2 *v1.Pod) int {
		p1Stats, p1Found := stats(p1)
		p2Stats, p2Found := stats(p2)
		if !p1Found || !p2Found {
			// prioritize evicting the pod for which no stats were found
			return cmpBool(!p1Found, !p2Found)
		}
		// adjust p1, p2 usage relative to the request (if any)
		p1Memory := memoryUsage(p1Stats.Memory)
		p1Request := podRequest(p1, v1.ResourceMemory)
		p1Memory.Sub(p1Request)
		p2Memory := memoryUsage(p2Stats.Memory)
		p2Request := podRequest(p2, v1.ResourceMemory)
		p2Memory.Sub(p2Request)
		// prioritize evicting the pod which has the larger consumption of memory
		return p2Memory.Cmp(*p1Memory)
	}
}
// podRequest returns the total resource request of a pod which is the
// max(max of init container requests, sum of container requests)
func podRequest(pod *v1.Pod, resourceName v1.ResourceName) resource.Quantity {
	containerValue := resource.Quantity{Format: resource.BinarySI}
	if resourceName == v1.ResourceEphemeralStorage && !utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
		// if the local storage capacity isolation feature gate is disabled, pods request 0 disk
		return containerValue
	}
	for i := range pod.Spec.Containers {
		switch resourceName {
		case v1.ResourceMemory:
			containerValue.Add(*pod.Spec.Containers[i].Resources.Requests.Memory())
		case v1.ResourceEphemeralStorage:
			containerValue.Add(*pod.Spec.Containers[i].Resources.Requests.StorageEphemeral())
		}
	}
	initValue := resource.Quantity{Format: resource.BinarySI}
	for i := range pod.Spec.InitContainers {
		switch resourceName {
		case v1.ResourceMemory:
			if initValue.Cmp(*pod.Spec.InitContainers[i].Resources.Requests.Memory()) < 0 {
				initValue = *pod.Spec.InitContainers[i].Resources.Requests.Memory()
			}
		case v1.ResourceEphemeralStorage:
			if initValue.Cmp(*pod.Spec.InitContainers[i].Resources.Requests.StorageEphemeral()) < 0 {
				initValue = *pod.Spec.InitContainers[i].Resources.Requests.StorageEphemeral()
			}
		}
	}
	if containerValue.Cmp(initValue) > 0 {
		return containerValue
	}
	return initValue
}
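// Worked example (hypothetical pod): two containers requesting 256Mi of
// memory each and one init container requesting 1Gi yield
// max(1Gi, 256Mi+256Mi) = 1Gi as the pod's effective memory request.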
// exceedDiskRequests compares whether or not pods' disk usage exceeds their requests
func exceedDiskRequests(stats statsFunc, fsStatsToMeasure []fsStatsType, diskResource v1.ResourceName) cmpFunc {
	return func(p1, p2 *v1.Pod) int {
		p1Stats, p1Found := stats(p1)
		p2Stats, p2Found := stats(p2)
		if !p1Found || !p2Found {
			// prioritize evicting the pod for which no stats were found
			return cmpBool(!p1Found, !p2Found)
		}
		p1Usage, p1Err := podDiskUsage(p1Stats, p1, fsStatsToMeasure)
		p2Usage, p2Err := podDiskUsage(p2Stats, p2, fsStatsToMeasure)
		if p1Err != nil || p2Err != nil {
			// prioritize evicting the pod which had an error getting stats
			return cmpBool(p1Err != nil, p2Err != nil)
		}
		p1Disk := p1Usage[diskResource]
		p2Disk := p2Usage[diskResource]
		p1ExceedsRequests := p1Disk.Cmp(podRequest(p1, diskResource)) == 1
		p2ExceedsRequests := p2Disk.Cmp(podRequest(p2, diskResource)) == 1
		// prioritize evicting the pod which exceeds its requests
		return cmpBool(p1ExceedsRequests, p2ExceedsRequests)
	}
}
// disk compares pods by largest consumer of disk relative to request for the specified disk resource.
func disk(stats statsFunc, fsStatsToMeasure []fsStatsType, diskResource v1.ResourceName) cmpFunc {
	return func(p1, p2 *v1.Pod) int {
		p1Stats, p1Found := stats(p1)
		p2Stats, p2Found := stats(p2)
		if !p1Found || !p2Found {
			// prioritize evicting the pod for which no stats were found
			return cmpBool(!p1Found, !p2Found)
		}
		p1Usage, p1Err := podDiskUsage(p1Stats, p1, fsStatsToMeasure)
		p2Usage, p2Err := podDiskUsage(p2Stats, p2, fsStatsToMeasure)
		if p1Err != nil || p2Err != nil {
			// prioritize evicting the pod which had an error getting stats
			return cmpBool(p1Err != nil, p2Err != nil)
		}
		// adjust p1, p2 usage relative to the request (if any); the request is
		// always the ephemeral-storage request, even when ranking by inodes,
		// since pods cannot request inodes directly
		p1Disk := p1Usage[diskResource]
		p2Disk := p2Usage[diskResource]
		p1Request := podRequest(p1, v1.ResourceEphemeralStorage)
		p1Disk.Sub(p1Request)
		p2Request := podRequest(p2, v1.ResourceEphemeralStorage)
		p2Disk.Sub(p2Request)
		// prioritize evicting the pod which has the larger consumption of disk
		return p2Disk.Cmp(p1Disk)
	}
}
// cmpBool compares booleans, placing true before false
func cmpBool(a, b bool) int {
	if a == b {
		return 0
	}
	if !b {
		return -1
	}
	return 1
}
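// For example, cmpBool(true, false) returns -1, so a pod for which a
// condition holds (e.g. "exceeds its requests") sorts ahead of one for which
// it does not, and is considered for eviction first.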
// rankMemoryPressure orders the input pods for eviction in response to memory pressure.
// It ranks by whether or not the pod's usage exceeds its requests, then by priority, and
// finally by memory usage above requests.
func rankMemoryPressure(pods []*v1.Pod, stats statsFunc) {
	orderedBy(exceedMemoryRequests(stats), priority, memory(stats)).Sort(pods)
}

// rankPIDPressure orders the input pods by priority in response to PID pressure.
func rankPIDPressure(pods []*v1.Pod, stats statsFunc) {
	orderedBy(priority).Sort(pods)
}

// rankDiskPressureFunc returns a rankFunc that measures the specified fs stats.
func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType, diskResource v1.ResourceName) rankFunc {
	return func(pods []*v1.Pod, stats statsFunc) {
		orderedBy(exceedDiskRequests(stats, fsStatsToMeasure, diskResource), priority, disk(stats, fsStatsToMeasure, diskResource)).Sort(pods)
	}
}
// byEvictionPriority implements sort.Interface for []evictionapi.Threshold.
type byEvictionPriority []evictionapi.Threshold

func (a byEvictionPriority) Len() int      { return len(a) }
func (a byEvictionPriority) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

// Less ranks memory before all other resources, and ranks thresholds with no resource to reclaim last
func (a byEvictionPriority) Less(i, j int) bool {
	_, jSignalHasResource := signalToResource[a[j].Signal]
	return a[i].Signal == evictionapi.SignalMemoryAvailable || a[i].Signal == evictionapi.SignalAllocatableMemoryAvailable || !jSignalHasResource
}
// makeSignalObservations derives observations using the specified summary provider.
func makeSignalObservations(summary *statsapi.Summary) (signalObservations, statsFunc) {
	// build the function to work against for pod stats
	statsFunc := cachedStatsFunc(summary.Pods)
	// build an evaluation context for current eviction signals
	result := signalObservations{}
	if memory := summary.Node.Memory; memory != nil && memory.AvailableBytes != nil && memory.WorkingSetBytes != nil {
		result[evictionapi.SignalMemoryAvailable] = signalObservation{
			available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI),
			capacity:  resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI),
			time:      memory.Time,
		}
	}
	if allocatableContainer, err := getSysContainer(summary.Node.SystemContainers, statsapi.SystemContainerPods); err != nil {
		klog.Errorf("eviction manager: failed to construct signal: %q error: %v", evictionapi.SignalAllocatableMemoryAvailable, err)
	} else {
		if memory := allocatableContainer.Memory; memory != nil && memory.AvailableBytes != nil && memory.WorkingSetBytes != nil {
			result[evictionapi.SignalAllocatableMemoryAvailable] = signalObservation{
				available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI),
				capacity:  resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI),
				time:      memory.Time,
			}
		}
	}
	if nodeFs := summary.Node.Fs; nodeFs != nil {
		if nodeFs.AvailableBytes != nil && nodeFs.CapacityBytes != nil {
			result[evictionapi.SignalNodeFsAvailable] = signalObservation{
				available: resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI),
				capacity:  resource.NewQuantity(int64(*nodeFs.CapacityBytes), resource.BinarySI),
				time:      nodeFs.Time,
			}
		}
		if nodeFs.InodesFree != nil && nodeFs.Inodes != nil {
			result[evictionapi.SignalNodeFsInodesFree] = signalObservation{
				available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.DecimalSI),
				capacity:  resource.NewQuantity(int64(*nodeFs.Inodes), resource.DecimalSI),
				time:      nodeFs.Time,
			}
		}
	}
	if summary.Node.Runtime != nil {
		if imageFs := summary.Node.Runtime.ImageFs; imageFs != nil {
			if imageFs.AvailableBytes != nil && imageFs.CapacityBytes != nil {
				result[evictionapi.SignalImageFsAvailable] = signalObservation{
					available: resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI),
					capacity:  resource.NewQuantity(int64(*imageFs.CapacityBytes), resource.BinarySI),
					time:      imageFs.Time,
				}
				if imageFs.InodesFree != nil && imageFs.Inodes != nil {
					result[evictionapi.SignalImageFsInodesFree] = signalObservation{
						available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.DecimalSI),
						capacity:  resource.NewQuantity(int64(*imageFs.Inodes), resource.DecimalSI),
						time:      imageFs.Time,
					}
				}
			}
		}
	}
	if rlimit := summary.Node.Rlimit; rlimit != nil {
		if rlimit.NumOfRunningProcesses != nil && rlimit.MaxPID != nil {
			available := int64(*rlimit.MaxPID) - int64(*rlimit.NumOfRunningProcesses)
			result[evictionapi.SignalPIDAvailable] = signalObservation{
				available: resource.NewQuantity(available, resource.BinarySI),
				capacity:  resource.NewQuantity(int64(*rlimit.MaxPID), resource.BinarySI),
				time:      rlimit.Time,
			}
		}
	}
	return result, statsFunc
}
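// Note the derived capacities: memory capacity is reconstructed as
// availableBytes + workingSetBytes, and PID availability is computed as
// MaxPID - NumOfRunningProcesses, with MaxPID as the capacity.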
func getSysContainer(sysContainers []statsapi.ContainerStats, name string) (*statsapi.ContainerStats, error) {
	for _, cont := range sysContainers {
		if cont.Name == name {
			return &cont, nil
		}
	}
	return nil, fmt.Errorf("system container %q not found in metrics", name)
}
// thresholdsMet returns the set of thresholds that were met independent of grace period
func thresholdsMet(thresholds []evictionapi.Threshold, observations signalObservations, enforceMinReclaim bool) []evictionapi.Threshold {
	results := []evictionapi.Threshold{}
	for i := range thresholds {
		threshold := thresholds[i]
		observed, found := observations[threshold.Signal]
		if !found {
			klog.Warningf("eviction manager: no observation found for eviction signal %v", threshold.Signal)
			continue
		}
		// determine if we have met the specified threshold
		thresholdMet := false
		quantity := evictionapi.GetThresholdQuantity(threshold.Value, observed.capacity)
		// if enforceMinReclaim is specified, we compare relative to value - minreclaim
		if enforceMinReclaim && threshold.MinReclaim != nil {
			quantity.Add(*evictionapi.GetThresholdQuantity(*threshold.MinReclaim, observed.capacity))
		}
		thresholdResult := quantity.Cmp(*observed.available)
		switch threshold.Operator {
		case evictionapi.OpLessThan:
			thresholdMet = thresholdResult > 0
		}
		if thresholdMet {
			results = append(results, threshold)
		}
	}
	return results
}
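// Worked example (hypothetical numbers): for a nodefs.available threshold of
// 10% on a 100Gi filesystem, the threshold quantity is 10Gi, so 5Gi available
// meets the threshold. With enforceMinReclaim and a 5Gi minimum reclaim, the
// comparison is instead against 10Gi+5Gi = 15Gi.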
// debugLogObservations logs the current signal observations at verbosity 3.
func debugLogObservations(logPrefix string, observations signalObservations) {
	if !klog.V(3) {
		return
	}
	for k, v := range observations {
		if !v.time.IsZero() {
			klog.Infof("eviction manager: %v: signal=%v, available: %v, capacity: %v, time: %v", logPrefix, k, v.available, v.capacity, v.time)
		} else {
			klog.Infof("eviction manager: %v: signal=%v, available: %v, capacity: %v", logPrefix, k, v.available, v.capacity)
		}
	}
}
// debugLogThresholdsWithObservation logs each threshold alongside its observation at verbosity 3.
func debugLogThresholdsWithObservation(logPrefix string, thresholds []evictionapi.Threshold, observations signalObservations) {
	if !klog.V(3) {
		return
	}
	for i := range thresholds {
		threshold := thresholds[i]
		observed, found := observations[threshold.Signal]
		if found {
			quantity := evictionapi.GetThresholdQuantity(threshold.Value, observed.capacity)
			klog.Infof("eviction manager: %v: threshold [signal=%v, quantity=%v] observed %v", logPrefix, threshold.Signal, quantity, observed.available)
		} else {
			klog.Infof("eviction manager: %v: threshold [signal=%v] had no observation", logPrefix, threshold.Signal)
		}
	}
}
// thresholdsUpdatedStats returns the subset of thresholds for which the
// associated stats have been updated since the last set of observations.
func thresholdsUpdatedStats(thresholds []evictionapi.Threshold, observations, lastObservations signalObservations) []evictionapi.Threshold {
	results := []evictionapi.Threshold{}
	for i := range thresholds {
		threshold := thresholds[i]
		observed, found := observations[threshold.Signal]
		if !found {
			klog.Warningf("eviction manager: no observation found for eviction signal %v", threshold.Signal)
			continue
		}
		last, found := lastObservations[threshold.Signal]
		if !found || observed.time.IsZero() || observed.time.After(last.time.Time) {
			results = append(results, threshold)
		}
	}
	return results
}
// thresholdsFirstObservedAt merges the input set of thresholds with the previous observation to determine when the active set of thresholds was initially met.
func thresholdsFirstObservedAt(thresholds []evictionapi.Threshold, lastObservedAt thresholdsObservedAt, now time.Time) thresholdsObservedAt {
	results := thresholdsObservedAt{}
	for i := range thresholds {
		observedAt, found := lastObservedAt[thresholds[i]]
		if !found {
			observedAt = now
		}
		results[thresholds[i]] = observedAt
	}
	return results
}
// thresholdsMetGracePeriod returns the set of thresholds that have satisfied their associated grace period
func thresholdsMetGracePeriod(observedAt thresholdsObservedAt, now time.Time) []evictionapi.Threshold {
	results := []evictionapi.Threshold{}
	for threshold, at := range observedAt {
		duration := now.Sub(at)
		if duration < threshold.GracePeriod {
			klog.V(2).Infof("eviction manager: eviction criteria not yet met for %v, duration: %v", formatThreshold(threshold), duration)
			continue
		}
		results = append(results, threshold)
	}
	return results
}
// nodeConditions returns the set of node conditions associated with the given thresholds
func nodeConditions(thresholds []evictionapi.Threshold) []v1.NodeConditionType {
	results := []v1.NodeConditionType{}
	for _, threshold := range thresholds {
		if nodeCondition, found := signalToNodeCondition[threshold.Signal]; found {
			if !hasNodeCondition(results, nodeCondition) {
				results = append(results, nodeCondition)
			}
		}
	}
	return results
}
// nodeConditionsLastObservedAt merges the input with the previous observation to determine when a condition was most recently met.
func nodeConditionsLastObservedAt(nodeConditions []v1.NodeConditionType, lastObservedAt nodeConditionsObservedAt, now time.Time) nodeConditionsObservedAt {
	results := nodeConditionsObservedAt{}
	// the input conditions were observed "now"
	for i := range nodeConditions {
		results[nodeConditions[i]] = now
	}
	// the conditions that were not observed now are merged in with their old time
	for key, value := range lastObservedAt {
		_, found := results[key]
		if !found {
			results[key] = value
		}
	}
	return results
}
// nodeConditionsObservedSince returns the set of conditions that have been observed within the specified period
func nodeConditionsObservedSince(observedAt nodeConditionsObservedAt, period time.Duration, now time.Time) []v1.NodeConditionType {
	results := []v1.NodeConditionType{}
	for nodeCondition, at := range observedAt {
		duration := now.Sub(at)
		if duration < period {
			results = append(results, nodeCondition)
		}
	}
	return results
}
// hasFsStatsType returns true if the fsStat is in the input list
func hasFsStatsType(inputs []fsStatsType, item fsStatsType) bool {
	for _, input := range inputs {
		if input == item {
			return true
		}
	}
	return false
}

// hasNodeCondition returns true if the node condition is in the input list
func hasNodeCondition(inputs []v1.NodeConditionType, item v1.NodeConditionType) bool {
	for _, input := range inputs {
		if input == item {
			return true
		}
	}
	return false
}
// mergeThresholds merges the two lists of thresholds, eliminating duplicates.
func mergeThresholds(inputsA []evictionapi.Threshold, inputsB []evictionapi.Threshold) []evictionapi.Threshold {
	results := inputsA
	for _, threshold := range inputsB {
		if !hasThreshold(results, threshold) {
			results = append(results, threshold)
		}
	}
	return results
}
// hasThreshold returns true if the threshold is in the input list
func hasThreshold(inputs []evictionapi.Threshold, item evictionapi.Threshold) bool {
	for _, input := range inputs {
		if input.GracePeriod == item.GracePeriod && input.Operator == item.Operator && input.Signal == item.Signal && compareThresholdValue(input.Value, item.Value) {
			return true
		}
	}
	return false
}

// compareThresholdValue returns true if the two thresholdValue objects are logically the same
func compareThresholdValue(a evictionapi.ThresholdValue, b evictionapi.ThresholdValue) bool {
	if a.Quantity != nil {
		if b.Quantity == nil {
			return false
		}
		return a.Quantity.Cmp(*b.Quantity) == 0
	}
	if b.Quantity != nil {
		return false
	}
	return a.Percentage == b.Percentage
}
// isHardEvictionThreshold returns true if eviction should immediately occur
func isHardEvictionThreshold(threshold evictionapi.Threshold) bool {
	return threshold.GracePeriod == time.Duration(0)
}

// isAllocatableEvictionThreshold returns true if the threshold is for allocatable memory
func isAllocatableEvictionThreshold(threshold evictionapi.Threshold) bool {
	return threshold.Signal == evictionapi.SignalAllocatableMemoryAvailable
}
// buildSignalToRankFunc returns ranking functions associated with resources
func buildSignalToRankFunc(withImageFs bool) map[evictionapi.Signal]rankFunc {
	signalToRankFunc := map[evictionapi.Signal]rankFunc{
		evictionapi.SignalMemoryAvailable:            rankMemoryPressure,
		evictionapi.SignalAllocatableMemoryAvailable: rankMemoryPressure,
		evictionapi.SignalPIDAvailable:               rankPIDPressure,
	}
	// usage of an imagefs is optional
	if withImageFs {
		// with an imagefs, nodefs pod rank func for eviction only includes logs and local volumes
		signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
		signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
		// with an imagefs, imagefs pod rank func for eviction only includes rootfs
		signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, v1.ResourceEphemeralStorage)
		signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceInodes)
	} else {
		// without an imagefs, nodefs pod rank func for eviction looks at all fs stats.
		// since imagefs and nodefs share a common device, they share common ranking functions.
		signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
		signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
		signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
		signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
	}
	return signalToRankFunc
}
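// For example, under nodefs pressure on a node with a dedicated image
// filesystem, pods are ranked only by their log and local-volume usage,
// since container writable layers live on the imagefs; without a dedicated
// imagefs both signals share one device and one ranking.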
// PodIsEvicted returns true if the reported pod status is due to an eviction.
func PodIsEvicted(podStatus v1.PodStatus) bool {
	return podStatus.Phase == v1.PodFailed && podStatus.Reason == Reason
}
// buildSignalToNodeReclaimFuncs returns reclaim functions associated with resources.
func buildSignalToNodeReclaimFuncs(imageGC ImageGC, containerGC ContainerGC, withImageFs bool) map[evictionapi.Signal]nodeReclaimFuncs {
	signalToReclaimFunc := map[evictionapi.Signal]nodeReclaimFuncs{}
	// usage of an imagefs is optional
	if withImageFs {
		// with an imagefs, nodefs pressure has no node-level reclaim functions; only pod eviction frees nodefs
		signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{}
		signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{}
		// with an imagefs, imagefs pressure should delete unused containers and images
		signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
		signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
	} else {
		// without an imagefs, nodefs pressure should delete unused containers and images.
		// since imagefs and nodefs share a common device, they share common reclaim functions.
		signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
		signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
		signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
		signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
	}
	return signalToReclaimFunc
}
// evictionMessage constructs a useful message about why an eviction occurred, and annotations to provide metadata about the eviction
func evictionMessage(resourceToReclaim v1.ResourceName, pod *v1.Pod, stats statsFunc) (message string, annotations map[string]string) {
	annotations = make(map[string]string)
	message = fmt.Sprintf(nodeLowMessageFmt, resourceToReclaim)
	containers := []string{}
	containerUsage := []string{}
	podStats, ok := stats(pod)
	if !ok {
		return
	}
	for _, containerStats := range podStats.Containers {
		for _, container := range pod.Spec.Containers {
			if container.Name == containerStats.Name {
				requests := container.Resources.Requests[resourceToReclaim]
				var usage *resource.Quantity
				switch resourceToReclaim {
				case v1.ResourceEphemeralStorage:
					if containerStats.Rootfs != nil && containerStats.Rootfs.UsedBytes != nil && containerStats.Logs != nil && containerStats.Logs.UsedBytes != nil {
						usage = resource.NewQuantity(int64(*containerStats.Rootfs.UsedBytes+*containerStats.Logs.UsedBytes), resource.BinarySI)
					}
				case v1.ResourceMemory:
					if containerStats.Memory != nil && containerStats.Memory.WorkingSetBytes != nil {
						usage = resource.NewQuantity(int64(*containerStats.Memory.WorkingSetBytes), resource.BinarySI)
					}
				}
				if usage != nil && usage.Cmp(requests) > 0 {
					message += fmt.Sprintf(containerMessageFmt, container.Name, usage.String(), requests.String())
					containers = append(containers, container.Name)
					containerUsage = append(containerUsage, usage.String())
				}
			}
		}
	}
	annotations[OffendingContainersKey] = strings.Join(containers, ",")
	annotations[OffendingContainersUsageKey] = strings.Join(containerUsage, ",")
	annotations[StarvedResourceKey] = string(resourceToReclaim)
	return
}
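// For a hypothetical pod whose container "app" uses 120Mi against a 100Mi
// memory request, the resulting message reads:
//
//	"The node was low on resource: memory. Container app was using 120Mi,
//	which exceeds its request of 100Mi. "
//
// with annotations offending_containers="app",
// offending_containers_usage="120Mi", and starved_resource="memory".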