123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273 |
- /*
- Copyright 2017 The Kubernetes Authors.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
- package preemption
- import (
- "fmt"
- "math"
- "k8s.io/api/core/v1"
- "k8s.io/client-go/tools/record"
- "k8s.io/klog"
- "k8s.io/kubernetes/pkg/api/v1/resource"
- v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
- "k8s.io/kubernetes/pkg/kubelet/events"
- "k8s.io/kubernetes/pkg/kubelet/eviction"
- "k8s.io/kubernetes/pkg/kubelet/lifecycle"
- "k8s.io/kubernetes/pkg/kubelet/metrics"
- kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
- "k8s.io/kubernetes/pkg/kubelet/util/format"
- )
const (
	// message is the human-readable text attached both to the status of a pod
	// evicted by this package and to the event recorded for that eviction.
	message = "Preempted in order to admit critical pod"
)
- // CriticalPodAdmissionHandler is an AdmissionFailureHandler that handles admission failure for Critical Pods.
- // If the ONLY admission failures are due to insufficient resources, then CriticalPodAdmissionHandler evicts pods
- // so that the critical pod can be admitted. For evictions, the CriticalPodAdmissionHandler evicts a set of pods that
- // frees up the required resource requests. The set of pods is designed to minimize impact, and is prioritized according to the ordering:
- // minimal impact for guaranteed pods > minimal impact for burstable pods > minimal impact for besteffort pods.
- // minimal impact is defined as follows: fewest pods evicted > fewest total requests of pods.
- // finding the fewest total requests of pods is considered besteffort.
- type CriticalPodAdmissionHandler struct {
- getPodsFunc eviction.ActivePodsFunc
- killPodFunc eviction.KillPodFunc
- recorder record.EventRecorder
- }
- var _ lifecycle.AdmissionFailureHandler = &CriticalPodAdmissionHandler{}
- func NewCriticalPodAdmissionHandler(getPodsFunc eviction.ActivePodsFunc, killPodFunc eviction.KillPodFunc, recorder record.EventRecorder) *CriticalPodAdmissionHandler {
- return &CriticalPodAdmissionHandler{
- getPodsFunc: getPodsFunc,
- killPodFunc: killPodFunc,
- recorder: recorder,
- }
- }
- // HandleAdmissionFailure gracefully handles admission rejection, and, in some cases,
- // to allow admission of the pod despite its previous failure.
- func (c *CriticalPodAdmissionHandler) HandleAdmissionFailure(admitPod *v1.Pod, failureReasons []lifecycle.PredicateFailureReason) ([]lifecycle.PredicateFailureReason, error) {
- if !kubetypes.IsCriticalPod(admitPod) {
- return failureReasons, nil
- }
- // InsufficientResourceError is not a reason to reject a critical pod.
- // Instead of rejecting, we free up resources to admit it, if no other reasons for rejection exist.
- nonResourceReasons := []lifecycle.PredicateFailureReason{}
- resourceReasons := []*admissionRequirement{}
- for _, reason := range failureReasons {
- if r, ok := reason.(*lifecycle.InsufficientResourceError); ok {
- resourceReasons = append(resourceReasons, &admissionRequirement{
- resourceName: r.ResourceName,
- quantity: r.GetInsufficientAmount(),
- })
- } else {
- nonResourceReasons = append(nonResourceReasons, reason)
- }
- }
- if len(nonResourceReasons) > 0 {
- // Return only reasons that are not resource related, since critical pods cannot fail admission for resource reasons.
- return nonResourceReasons, nil
- }
- err := c.evictPodsToFreeRequests(admitPod, admissionRequirementList(resourceReasons))
- // if no error is returned, preemption succeeded and the pod is safe to admit.
- return nil, err
- }
- // evictPodsToFreeRequests takes a list of insufficient resources, and attempts to free them by evicting pods
- // based on requests. For example, if the only insufficient resource is 200Mb of memory, this function could
- // evict a pod with request=250Mb.
- func (c *CriticalPodAdmissionHandler) evictPodsToFreeRequests(admitPod *v1.Pod, insufficientResources admissionRequirementList) error {
- podsToPreempt, err := getPodsToPreempt(admitPod, c.getPodsFunc(), insufficientResources)
- if err != nil {
- return fmt.Errorf("preemption: error finding a set of pods to preempt: %v", err)
- }
- klog.Infof("preemption: attempting to evict pods %v, in order to free up resources: %s", podsToPreempt, insufficientResources.toString())
- for _, pod := range podsToPreempt {
- status := v1.PodStatus{
- Phase: v1.PodFailed,
- Message: message,
- Reason: events.PreemptContainer,
- }
- // record that we are evicting the pod
- c.recorder.Eventf(pod, v1.EventTypeWarning, events.PreemptContainer, message)
- // this is a blocking call and should only return when the pod and its containers are killed.
- err := c.killPodFunc(pod, status, nil)
- if err != nil {
- klog.Warningf("preemption: pod %s failed to evict %v", format.Pod(pod), err)
- // In future syncPod loops, the kubelet will retry the pod deletion steps that it was stuck on.
- continue
- }
- if len(insufficientResources) > 0 {
- metrics.Preemptions.WithLabelValues(insufficientResources[0].resourceName.String()).Inc()
- } else {
- metrics.Preemptions.WithLabelValues("").Inc()
- }
- klog.Infof("preemption: pod %s evicted successfully", format.Pod(pod))
- }
- return nil
- }
- // getPodsToPreempt returns a list of pods that could be preempted to free requests >= requirements
- func getPodsToPreempt(pod *v1.Pod, pods []*v1.Pod, requirements admissionRequirementList) ([]*v1.Pod, error) {
- bestEffortPods, burstablePods, guaranteedPods := sortPodsByQOS(pod, pods)
- // make sure that pods exist to reclaim the requirements
- unableToMeetRequirements := requirements.subtract(append(append(bestEffortPods, burstablePods...), guaranteedPods...)...)
- if len(unableToMeetRequirements) > 0 {
- return nil, fmt.Errorf("no set of running pods found to reclaim resources: %v", unableToMeetRequirements.toString())
- }
- // find the guaranteed pods we would need to evict if we already evicted ALL burstable and besteffort pods.
- guaranteedToEvict, err := getPodsToPreemptByDistance(guaranteedPods, requirements.subtract(append(bestEffortPods, burstablePods...)...))
- if err != nil {
- return nil, err
- }
- // Find the burstable pods we would need to evict if we already evicted ALL besteffort pods, and the required guaranteed pods.
- burstableToEvict, err := getPodsToPreemptByDistance(burstablePods, requirements.subtract(append(bestEffortPods, guaranteedToEvict...)...))
- if err != nil {
- return nil, err
- }
- // Find the besteffort pods we would need to evict if we already evicted the required guaranteed and burstable pods.
- bestEffortToEvict, err := getPodsToPreemptByDistance(bestEffortPods, requirements.subtract(append(burstableToEvict, guaranteedToEvict...)...))
- if err != nil {
- return nil, err
- }
- return append(append(bestEffortToEvict, burstableToEvict...), guaranteedToEvict...), nil
- }
- // getPodsToPreemptByDistance finds the pods that have pod requests >= admission requirements.
- // Chooses pods that minimize "distance" to the requirements.
- // If more than one pod exists that fulfills the remaining requirements,
- // it chooses the pod that has the "smaller resource request"
- // This method, by repeatedly choosing the pod that fulfills as much of the requirements as possible,
- // attempts to minimize the number of pods returned.
- func getPodsToPreemptByDistance(pods []*v1.Pod, requirements admissionRequirementList) ([]*v1.Pod, error) {
- podsToEvict := []*v1.Pod{}
- // evict pods by shortest distance from remaining requirements, updating requirements every round.
- for len(requirements) > 0 {
- if len(pods) == 0 {
- return nil, fmt.Errorf("no set of running pods found to reclaim resources: %v", requirements.toString())
- }
- // all distances must be less than len(requirements), because the max distance for a single requirement is 1
- bestDistance := float64(len(requirements) + 1)
- bestPodIndex := 0
- // Find the pod with the smallest distance from requirements
- // Or, in the case of two equidistant pods, find the pod with "smaller" resource requests.
- for i, pod := range pods {
- dist := requirements.distance(pod)
- if dist < bestDistance || (bestDistance == dist && smallerResourceRequest(pod, pods[bestPodIndex])) {
- bestDistance = dist
- bestPodIndex = i
- }
- }
- // subtract the pod from requirements, and transfer the pod from input-pods to pods-to-evicted
- requirements = requirements.subtract(pods[bestPodIndex])
- podsToEvict = append(podsToEvict, pods[bestPodIndex])
- pods[bestPodIndex] = pods[len(pods)-1]
- pods = pods[:len(pods)-1]
- }
- return podsToEvict, nil
- }
- type admissionRequirement struct {
- resourceName v1.ResourceName
- quantity int64
- }
- type admissionRequirementList []*admissionRequirement
- // distance returns distance of the pods requests from the admissionRequirements.
- // The distance is measured by the fraction of the requirement satisfied by the pod,
- // so that each requirement is weighted equally, regardless of absolute magnitude.
- func (a admissionRequirementList) distance(pod *v1.Pod) float64 {
- dist := float64(0)
- for _, req := range a {
- remainingRequest := float64(req.quantity - resource.GetResourceRequest(pod, req.resourceName))
- if remainingRequest > 0 {
- dist += math.Pow(remainingRequest/float64(req.quantity), 2)
- }
- }
- return dist
- }
- // subtract returns a new admissionRequirementList containing remaining requirements if the provided pod
- // were to be preempted
- func (a admissionRequirementList) subtract(pods ...*v1.Pod) admissionRequirementList {
- newList := []*admissionRequirement{}
- for _, req := range a {
- newQuantity := req.quantity
- for _, pod := range pods {
- newQuantity -= resource.GetResourceRequest(pod, req.resourceName)
- if newQuantity <= 0 {
- break
- }
- }
- if newQuantity > 0 {
- newList = append(newList, &admissionRequirement{
- resourceName: req.resourceName,
- quantity: newQuantity,
- })
- }
- }
- return newList
- }
- func (a admissionRequirementList) toString() string {
- s := "["
- for _, req := range a {
- s += fmt.Sprintf("(res: %v, q: %d), ", req.resourceName, req.quantity)
- }
- return s + "]"
- }
- // sortPodsByQOS returns lists containing besteffort, burstable, and guaranteed pods that
- // can be preempted by preemptor pod.
- func sortPodsByQOS(preemptor *v1.Pod, pods []*v1.Pod) (bestEffort, burstable, guaranteed []*v1.Pod) {
- for _, pod := range pods {
- if kubetypes.Preemptable(preemptor, pod) {
- switch v1qos.GetPodQOS(pod) {
- case v1.PodQOSBestEffort:
- bestEffort = append(bestEffort, pod)
- case v1.PodQOSBurstable:
- burstable = append(burstable, pod)
- case v1.PodQOSGuaranteed:
- guaranteed = append(guaranteed, pod)
- default:
- }
- }
- }
- return
- }
- // smallerResourceRequest returns true if pod1 has a smaller request than pod2
- func smallerResourceRequest(pod1 *v1.Pod, pod2 *v1.Pod) bool {
- priorityList := []v1.ResourceName{
- v1.ResourceMemory,
- v1.ResourceCPU,
- }
- for _, res := range priorityList {
- req1 := resource.GetResourceRequest(pod1, res)
- req2 := resource.GetResourceRequest(pod2, res)
- if req1 < req2 {
- return true
- } else if req1 > req2 {
- return false
- }
- }
- return true
- }
|