kuberuntime_manager.go 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936
  1. /*
  2. Copyright 2016 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package kuberuntime
  14. import (
  15. "errors"
  16. "fmt"
  17. "os"
  18. "time"
  19. cadvisorapi "github.com/google/cadvisor/info/v1"
  20. "k8s.io/klog"
  21. v1 "k8s.io/api/core/v1"
  22. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  23. kubetypes "k8s.io/apimachinery/pkg/types"
  24. utilruntime "k8s.io/apimachinery/pkg/util/runtime"
  25. utilversion "k8s.io/apimachinery/pkg/util/version"
  26. "k8s.io/client-go/tools/record"
  27. ref "k8s.io/client-go/tools/reference"
  28. "k8s.io/client-go/util/flowcontrol"
  29. internalapi "k8s.io/cri-api/pkg/apis"
  30. runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
  31. "k8s.io/kubernetes/pkg/api/legacyscheme"
  32. "k8s.io/kubernetes/pkg/credentialprovider"
  33. "k8s.io/kubernetes/pkg/kubelet/cm"
  34. kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
  35. "k8s.io/kubernetes/pkg/kubelet/events"
  36. "k8s.io/kubernetes/pkg/kubelet/images"
  37. "k8s.io/kubernetes/pkg/kubelet/lifecycle"
  38. proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results"
  39. "k8s.io/kubernetes/pkg/kubelet/runtimeclass"
  40. "k8s.io/kubernetes/pkg/kubelet/types"
  41. "k8s.io/kubernetes/pkg/kubelet/util/cache"
  42. "k8s.io/kubernetes/pkg/kubelet/util/format"
  43. "k8s.io/kubernetes/pkg/kubelet/util/logreduction"
  44. )
  // Package-level tunables for the generic runtime manager.
  const (
  	// kubeRuntimeAPIVersion is the kubelet runtime API version; it is sent
  	// with every Version request and compared against the runtime's answer.
  	kubeRuntimeAPIVersion = "0.1.0"

  	// podLogsRootDirectory is the directory under which pod logs are stored.
  	podLogsRootDirectory = "/var/log/pods"

  	// minimumGracePeriodInSeconds is the smallest shutdown window granted to
  	// a container, so that unnecessary SIGKILLs are avoided.
  	minimumGracePeriodInSeconds = 2

  	// versionCacheTTL is how long a cached runtime version stays valid.
  	versionCacheTTL = time.Minute

  	// identicalErrorDelay is the minimum interval between two reports of the
  	// same error.
  	identicalErrorDelay = time.Minute
  )
  // ErrVersionNotSupported is the sentinel error returned when the runtime
  // reports an API version other than the one this manager supports.
  var ErrVersionNotSupported = errors.New("Runtime api version is not supported")
  61. // podStateProvider can determine if a pod is deleted ir terminated
  62. type podStateProvider interface {
  63. IsPodDeleted(kubetypes.UID) bool
  64. IsPodTerminated(kubetypes.UID) bool
  65. }
  66. type kubeGenericRuntimeManager struct {
  67. runtimeName string
  68. recorder record.EventRecorder
  69. osInterface kubecontainer.OSInterface
  70. containerRefManager *kubecontainer.RefManager
  71. // machineInfo contains the machine information.
  72. machineInfo *cadvisorapi.MachineInfo
  73. // Container GC manager
  74. containerGC *containerGC
  75. // Keyring for pulling images
  76. keyring credentialprovider.DockerKeyring
  77. // Runner of lifecycle events.
  78. runner kubecontainer.HandlerRunner
  79. // RuntimeHelper that wraps kubelet to generate runtime container options.
  80. runtimeHelper kubecontainer.RuntimeHelper
  81. // Health check results.
  82. livenessManager proberesults.Manager
  83. // If true, enforce container cpu limits with CFS quota support
  84. cpuCFSQuota bool
  85. // CPUCFSQuotaPeriod sets the CPU CFS quota period value, cpu.cfs_period_us, defaults to 100ms
  86. cpuCFSQuotaPeriod metav1.Duration
  87. // wrapped image puller.
  88. imagePuller images.ImageManager
  89. // gRPC service clients
  90. runtimeService internalapi.RuntimeService
  91. imageService internalapi.ImageManagerService
  92. // The version cache of runtime daemon.
  93. versionCache *cache.ObjectCache
  94. // The directory path for seccomp profiles.
  95. seccompProfileRoot string
  96. // Internal lifecycle event handlers for container resource management.
  97. internalLifecycle cm.InternalContainerLifecycle
  98. // A shim to legacy functions for backward compatibility.
  99. legacyLogProvider LegacyLogProvider
  100. // Manage RuntimeClass resources.
  101. runtimeClassManager *runtimeclass.Manager
  102. // Cache last per-container error message to reduce log spam
  103. logReduction *logreduction.LogReduction
  104. }
  105. // KubeGenericRuntime is a interface contains interfaces for container runtime and command.
  106. type KubeGenericRuntime interface {
  107. kubecontainer.Runtime
  108. kubecontainer.StreamingRuntime
  109. kubecontainer.ContainerCommandRunner
  110. }
  111. // LegacyLogProvider gives the ability to use unsupported docker log drivers (e.g. journald)
  112. type LegacyLogProvider interface {
  113. // Get the last few lines of the logs for a specific container.
  114. GetContainerLogTail(uid kubetypes.UID, name, namespace string, containerID kubecontainer.ContainerID) (string, error)
  115. }
  116. // NewKubeGenericRuntimeManager creates a new kubeGenericRuntimeManager
  117. func NewKubeGenericRuntimeManager(
  118. recorder record.EventRecorder,
  119. livenessManager proberesults.Manager,
  120. seccompProfileRoot string,
  121. containerRefManager *kubecontainer.RefManager,
  122. machineInfo *cadvisorapi.MachineInfo,
  123. podStateProvider podStateProvider,
  124. osInterface kubecontainer.OSInterface,
  125. runtimeHelper kubecontainer.RuntimeHelper,
  126. httpClient types.HttpGetter,
  127. imageBackOff *flowcontrol.Backoff,
  128. serializeImagePulls bool,
  129. imagePullQPS float32,
  130. imagePullBurst int,
  131. cpuCFSQuota bool,
  132. cpuCFSQuotaPeriod metav1.Duration,
  133. runtimeService internalapi.RuntimeService,
  134. imageService internalapi.ImageManagerService,
  135. internalLifecycle cm.InternalContainerLifecycle,
  136. legacyLogProvider LegacyLogProvider,
  137. runtimeClassManager *runtimeclass.Manager,
  138. ) (KubeGenericRuntime, error) {
  139. kubeRuntimeManager := &kubeGenericRuntimeManager{
  140. recorder: recorder,
  141. cpuCFSQuota: cpuCFSQuota,
  142. cpuCFSQuotaPeriod: cpuCFSQuotaPeriod,
  143. seccompProfileRoot: seccompProfileRoot,
  144. livenessManager: livenessManager,
  145. containerRefManager: containerRefManager,
  146. machineInfo: machineInfo,
  147. osInterface: osInterface,
  148. runtimeHelper: runtimeHelper,
  149. runtimeService: newInstrumentedRuntimeService(runtimeService),
  150. imageService: newInstrumentedImageManagerService(imageService),
  151. keyring: credentialprovider.NewDockerKeyring(),
  152. internalLifecycle: internalLifecycle,
  153. legacyLogProvider: legacyLogProvider,
  154. runtimeClassManager: runtimeClassManager,
  155. logReduction: logreduction.NewLogReduction(identicalErrorDelay),
  156. }
  157. typedVersion, err := kubeRuntimeManager.runtimeService.Version(kubeRuntimeAPIVersion)
  158. if err != nil {
  159. klog.Errorf("Get runtime version failed: %v", err)
  160. return nil, err
  161. }
  162. // Only matching kubeRuntimeAPIVersion is supported now
  163. // TODO: Runtime API machinery is under discussion at https://github.com/kubernetes/kubernetes/issues/28642
  164. if typedVersion.Version != kubeRuntimeAPIVersion {
  165. klog.Errorf("Runtime api version %s is not supported, only %s is supported now",
  166. typedVersion.Version,
  167. kubeRuntimeAPIVersion)
  168. return nil, ErrVersionNotSupported
  169. }
  170. kubeRuntimeManager.runtimeName = typedVersion.RuntimeName
  171. klog.Infof("Container runtime %s initialized, version: %s, apiVersion: %s",
  172. typedVersion.RuntimeName,
  173. typedVersion.RuntimeVersion,
  174. typedVersion.RuntimeApiVersion)
  175. // If the container logs directory does not exist, create it.
  176. // TODO: create podLogsRootDirectory at kubelet.go when kubelet is refactored to
  177. // new runtime interface
  178. if _, err := osInterface.Stat(podLogsRootDirectory); os.IsNotExist(err) {
  179. if err := osInterface.MkdirAll(podLogsRootDirectory, 0755); err != nil {
  180. klog.Errorf("Failed to create directory %q: %v", podLogsRootDirectory, err)
  181. }
  182. }
  183. kubeRuntimeManager.imagePuller = images.NewImageManager(
  184. kubecontainer.FilterEventRecorder(recorder),
  185. kubeRuntimeManager,
  186. imageBackOff,
  187. serializeImagePulls,
  188. imagePullQPS,
  189. imagePullBurst)
  190. kubeRuntimeManager.runner = lifecycle.NewHandlerRunner(httpClient, kubeRuntimeManager, kubeRuntimeManager)
  191. kubeRuntimeManager.containerGC = newContainerGC(runtimeService, podStateProvider, kubeRuntimeManager)
  192. kubeRuntimeManager.versionCache = cache.NewObjectCache(
  193. func() (interface{}, error) {
  194. return kubeRuntimeManager.getTypedVersion()
  195. },
  196. versionCacheTTL,
  197. )
  198. return kubeRuntimeManager, nil
  199. }
  200. // Type returns the type of the container runtime.
  201. func (m *kubeGenericRuntimeManager) Type() string {
  202. return m.runtimeName
  203. }
  204. func newRuntimeVersion(version string) (*utilversion.Version, error) {
  205. if ver, err := utilversion.ParseSemantic(version); err == nil {
  206. return ver, err
  207. }
  208. return utilversion.ParseGeneric(version)
  209. }
  210. func (m *kubeGenericRuntimeManager) getTypedVersion() (*runtimeapi.VersionResponse, error) {
  211. typedVersion, err := m.runtimeService.Version(kubeRuntimeAPIVersion)
  212. if err != nil {
  213. klog.Errorf("Get remote runtime typed version failed: %v", err)
  214. return nil, err
  215. }
  216. return typedVersion, nil
  217. }
  218. // Version returns the version information of the container runtime.
  219. func (m *kubeGenericRuntimeManager) Version() (kubecontainer.Version, error) {
  220. typedVersion, err := m.runtimeService.Version(kubeRuntimeAPIVersion)
  221. if err != nil {
  222. klog.Errorf("Get remote runtime version failed: %v", err)
  223. return nil, err
  224. }
  225. return newRuntimeVersion(typedVersion.RuntimeVersion)
  226. }
  227. // APIVersion returns the cached API version information of the container
  228. // runtime. Implementation is expected to update this cache periodically.
  229. // This may be different from the runtime engine's version.
  230. func (m *kubeGenericRuntimeManager) APIVersion() (kubecontainer.Version, error) {
  231. versionObject, err := m.versionCache.Get(m.machineInfo.MachineID)
  232. if err != nil {
  233. return nil, err
  234. }
  235. typedVersion := versionObject.(*runtimeapi.VersionResponse)
  236. return newRuntimeVersion(typedVersion.RuntimeApiVersion)
  237. }
  238. // Status returns the status of the runtime. An error is returned if the Status
  239. // function itself fails, nil otherwise.
  240. func (m *kubeGenericRuntimeManager) Status() (*kubecontainer.RuntimeStatus, error) {
  241. status, err := m.runtimeService.Status()
  242. if err != nil {
  243. return nil, err
  244. }
  245. return toKubeRuntimeStatus(status), nil
  246. }
  247. // GetPods returns a list of containers grouped by pods. The boolean parameter
  248. // specifies whether the runtime returns all containers including those already
  249. // exited and dead containers (used for garbage collection).
  250. func (m *kubeGenericRuntimeManager) GetPods(all bool) ([]*kubecontainer.Pod, error) {
  251. pods := make(map[kubetypes.UID]*kubecontainer.Pod)
  252. sandboxes, err := m.getKubeletSandboxes(all)
  253. if err != nil {
  254. return nil, err
  255. }
  256. for i := range sandboxes {
  257. s := sandboxes[i]
  258. if s.Metadata == nil {
  259. klog.V(4).Infof("Sandbox does not have metadata: %+v", s)
  260. continue
  261. }
  262. podUID := kubetypes.UID(s.Metadata.Uid)
  263. if _, ok := pods[podUID]; !ok {
  264. pods[podUID] = &kubecontainer.Pod{
  265. ID: podUID,
  266. Name: s.Metadata.Name,
  267. Namespace: s.Metadata.Namespace,
  268. }
  269. }
  270. p := pods[podUID]
  271. converted, err := m.sandboxToKubeContainer(s)
  272. if err != nil {
  273. klog.V(4).Infof("Convert %q sandbox %v of pod %q failed: %v", m.runtimeName, s, podUID, err)
  274. continue
  275. }
  276. p.Sandboxes = append(p.Sandboxes, converted)
  277. }
  278. containers, err := m.getKubeletContainers(all)
  279. if err != nil {
  280. return nil, err
  281. }
  282. for i := range containers {
  283. c := containers[i]
  284. if c.Metadata == nil {
  285. klog.V(4).Infof("Container does not have metadata: %+v", c)
  286. continue
  287. }
  288. labelledInfo := getContainerInfoFromLabels(c.Labels)
  289. pod, found := pods[labelledInfo.PodUID]
  290. if !found {
  291. pod = &kubecontainer.Pod{
  292. ID: labelledInfo.PodUID,
  293. Name: labelledInfo.PodName,
  294. Namespace: labelledInfo.PodNamespace,
  295. }
  296. pods[labelledInfo.PodUID] = pod
  297. }
  298. converted, err := m.toKubeContainer(c)
  299. if err != nil {
  300. klog.V(4).Infof("Convert %s container %v of pod %q failed: %v", m.runtimeName, c, labelledInfo.PodUID, err)
  301. continue
  302. }
  303. pod.Containers = append(pod.Containers, converted)
  304. }
  305. // Convert map to list.
  306. var result []*kubecontainer.Pod
  307. for _, pod := range pods {
  308. result = append(result, pod)
  309. }
  310. return result, nil
  311. }
  312. // containerToKillInfo contains necessary information to kill a container.
  313. type containerToKillInfo struct {
  314. // The spec of the container.
  315. container *v1.Container
  316. // The name of the container.
  317. name string
  318. // The message indicates why the container will be killed.
  319. message string
  320. }
  321. // podActions keeps information what to do for a pod.
  322. type podActions struct {
  323. // Stop all running (regular and init) containers and the sandbox for the pod.
  324. KillPod bool
  325. // Whether need to create a new sandbox. If needed to kill pod and create
  326. // a new pod sandbox, all init containers need to be purged (i.e., removed).
  327. CreateSandbox bool
  328. // The id of existing sandbox. It is used for starting containers in ContainersToStart.
  329. SandboxID string
  330. // The attempt number of creating sandboxes for the pod.
  331. Attempt uint32
  332. // The next init container to start.
  333. NextInitContainerToStart *v1.Container
  334. // ContainersToStart keeps a list of indexes for the containers to start,
  335. // where the index is the index of the specific container in the pod spec (
  336. // pod.Spec.Containers.
  337. ContainersToStart []int
  338. // ContainersToKill keeps a map of containers that need to be killed, note that
  339. // the key is the container ID of the container, while
  340. // the value contains necessary information to kill a container.
  341. ContainersToKill map[kubecontainer.ContainerID]containerToKillInfo
  342. }
  343. // podSandboxChanged checks whether the spec of the pod is changed and returns
  344. // (changed, new attempt, original sandboxID if exist).
  345. func (m *kubeGenericRuntimeManager) podSandboxChanged(pod *v1.Pod, podStatus *kubecontainer.PodStatus) (bool, uint32, string) {
  346. if len(podStatus.SandboxStatuses) == 0 {
  347. klog.V(2).Infof("No sandbox for pod %q can be found. Need to start a new one", format.Pod(pod))
  348. return true, 0, ""
  349. }
  350. readySandboxCount := 0
  351. for _, s := range podStatus.SandboxStatuses {
  352. if s.State == runtimeapi.PodSandboxState_SANDBOX_READY {
  353. readySandboxCount++
  354. }
  355. }
  356. // Needs to create a new sandbox when readySandboxCount > 1 or the ready sandbox is not the latest one.
  357. sandboxStatus := podStatus.SandboxStatuses[0]
  358. if readySandboxCount > 1 {
  359. klog.V(2).Infof("More than 1 sandboxes for pod %q are ready. Need to reconcile them", format.Pod(pod))
  360. return true, sandboxStatus.Metadata.Attempt + 1, sandboxStatus.Id
  361. }
  362. if sandboxStatus.State != runtimeapi.PodSandboxState_SANDBOX_READY {
  363. klog.V(2).Infof("No ready sandbox for pod %q can be found. Need to start a new one", format.Pod(pod))
  364. return true, sandboxStatus.Metadata.Attempt + 1, sandboxStatus.Id
  365. }
  366. // Needs to create a new sandbox when network namespace changed.
  367. if sandboxStatus.GetLinux().GetNamespaces().GetOptions().GetNetwork() != networkNamespaceForPod(pod) {
  368. klog.V(2).Infof("Sandbox for pod %q has changed. Need to start a new one", format.Pod(pod))
  369. return true, sandboxStatus.Metadata.Attempt + 1, ""
  370. }
  371. // Needs to create a new sandbox when the sandbox does not have an IP address.
  372. if !kubecontainer.IsHostNetworkPod(pod) && sandboxStatus.Network.Ip == "" {
  373. klog.V(2).Infof("Sandbox for pod %q has no IP address. Need to start a new one", format.Pod(pod))
  374. return true, sandboxStatus.Metadata.Attempt + 1, sandboxStatus.Id
  375. }
  376. return false, sandboxStatus.Metadata.Attempt, sandboxStatus.Id
  377. }
  378. func containerChanged(container *v1.Container, containerStatus *kubecontainer.ContainerStatus) (uint64, uint64, bool) {
  379. expectedHash := kubecontainer.HashContainer(container)
  380. return expectedHash, containerStatus.Hash, containerStatus.Hash != expectedHash
  381. }
  382. func shouldRestartOnFailure(pod *v1.Pod) bool {
  383. return pod.Spec.RestartPolicy != v1.RestartPolicyNever
  384. }
  385. func containerSucceeded(c *v1.Container, podStatus *kubecontainer.PodStatus) bool {
  386. cStatus := podStatus.FindContainerStatusByName(c.Name)
  387. if cStatus == nil || cStatus.State == kubecontainer.ContainerStateRunning {
  388. return false
  389. }
  390. return cStatus.ExitCode == 0
  391. }
  392. // computePodActions checks whether the pod spec has changed and returns the changes if true.
  393. func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *kubecontainer.PodStatus) podActions {
  394. klog.V(5).Infof("Syncing Pod %q: %+v", format.Pod(pod), pod)
  395. createPodSandbox, attempt, sandboxID := m.podSandboxChanged(pod, podStatus)
  396. changes := podActions{
  397. KillPod: createPodSandbox,
  398. CreateSandbox: createPodSandbox,
  399. SandboxID: sandboxID,
  400. Attempt: attempt,
  401. ContainersToStart: []int{},
  402. ContainersToKill: make(map[kubecontainer.ContainerID]containerToKillInfo),
  403. }
  404. // If we need to (re-)create the pod sandbox, everything will need to be
  405. // killed and recreated, and init containers should be purged.
  406. if createPodSandbox {
  407. if !shouldRestartOnFailure(pod) && attempt != 0 && len(podStatus.ContainerStatuses) != 0 {
  408. // Should not restart the pod, just return.
  409. // we should not create a sandbox for a pod if it is already done.
  410. // if all containers are done and should not be started, there is no need to create a new sandbox.
  411. // this stops confusing logs on pods whose containers all have exit codes, but we recreate a sandbox before terminating it.
  412. //
  413. // If ContainerStatuses is empty, we assume that we've never
  414. // successfully created any containers. In this case, we should
  415. // retry creating the sandbox.
  416. changes.CreateSandbox = false
  417. return changes
  418. }
  419. if len(pod.Spec.InitContainers) != 0 {
  420. // Pod has init containers, return the first one.
  421. changes.NextInitContainerToStart = &pod.Spec.InitContainers[0]
  422. return changes
  423. }
  424. // Start all containers by default but exclude the ones that succeeded if
  425. // RestartPolicy is OnFailure.
  426. for idx, c := range pod.Spec.Containers {
  427. if containerSucceeded(&c, podStatus) && pod.Spec.RestartPolicy == v1.RestartPolicyOnFailure {
  428. continue
  429. }
  430. changes.ContainersToStart = append(changes.ContainersToStart, idx)
  431. }
  432. return changes
  433. }
  434. // Check initialization progress.
  435. initLastStatus, next, done := findNextInitContainerToRun(pod, podStatus)
  436. if !done {
  437. if next != nil {
  438. initFailed := initLastStatus != nil && isInitContainerFailed(initLastStatus)
  439. if initFailed && !shouldRestartOnFailure(pod) {
  440. changes.KillPod = true
  441. } else {
  442. // Always try to stop containers in unknown state first.
  443. if initLastStatus != nil && initLastStatus.State == kubecontainer.ContainerStateUnknown {
  444. changes.ContainersToKill[initLastStatus.ID] = containerToKillInfo{
  445. name: next.Name,
  446. container: next,
  447. message: fmt.Sprintf("Init container is in %q state, try killing it before restart",
  448. initLastStatus.State),
  449. }
  450. }
  451. changes.NextInitContainerToStart = next
  452. }
  453. }
  454. // Initialization failed or still in progress. Skip inspecting non-init
  455. // containers.
  456. return changes
  457. }
  458. // Number of running containers to keep.
  459. keepCount := 0
  460. // check the status of containers.
  461. for idx, container := range pod.Spec.Containers {
  462. containerStatus := podStatus.FindContainerStatusByName(container.Name)
  463. // Call internal container post-stop lifecycle hook for any non-running container so that any
  464. // allocated cpus are released immediately. If the container is restarted, cpus will be re-allocated
  465. // to it.
  466. if containerStatus != nil && containerStatus.State != kubecontainer.ContainerStateRunning {
  467. if err := m.internalLifecycle.PostStopContainer(containerStatus.ID.ID); err != nil {
  468. klog.Errorf("internal container post-stop lifecycle hook failed for container %v in pod %v with error %v",
  469. container.Name, pod.Name, err)
  470. }
  471. }
  472. // If container does not exist, or is not running, check whether we
  473. // need to restart it.
  474. if containerStatus == nil || containerStatus.State != kubecontainer.ContainerStateRunning {
  475. if kubecontainer.ShouldContainerBeRestarted(&container, pod, podStatus) {
  476. message := fmt.Sprintf("Container %+v is dead, but RestartPolicy says that we should restart it.", container)
  477. klog.V(3).Infof(message)
  478. changes.ContainersToStart = append(changes.ContainersToStart, idx)
  479. if containerStatus != nil && containerStatus.State == kubecontainer.ContainerStateUnknown {
  480. // If container is in unknown state, we don't know whether it
  481. // is actually running or not, always try killing it before
  482. // restart to avoid having 2 running instances of the same container.
  483. changes.ContainersToKill[containerStatus.ID] = containerToKillInfo{
  484. name: containerStatus.Name,
  485. container: &pod.Spec.Containers[idx],
  486. message: fmt.Sprintf("Container is in %q state, try killing it before restart",
  487. containerStatus.State),
  488. }
  489. }
  490. }
  491. continue
  492. }
  493. // The container is running, but kill the container if any of the following condition is met.
  494. var message string
  495. restart := shouldRestartOnFailure(pod)
  496. if _, _, changed := containerChanged(&container, containerStatus); changed {
  497. message = fmt.Sprintf("Container %s definition changed", container.Name)
  498. // Restart regardless of the restart policy because the container
  499. // spec changed.
  500. restart = true
  501. } else if liveness, found := m.livenessManager.Get(containerStatus.ID); found && liveness == proberesults.Failure {
  502. // If the container failed the liveness probe, we should kill it.
  503. message = fmt.Sprintf("Container %s failed liveness probe", container.Name)
  504. } else {
  505. // Keep the container.
  506. keepCount++
  507. continue
  508. }
  509. // We need to kill the container, but if we also want to restart the
  510. // container afterwards, make the intent clear in the message. Also do
  511. // not kill the entire pod since we expect container to be running eventually.
  512. if restart {
  513. message = fmt.Sprintf("%s, will be restarted", message)
  514. changes.ContainersToStart = append(changes.ContainersToStart, idx)
  515. }
  516. changes.ContainersToKill[containerStatus.ID] = containerToKillInfo{
  517. name: containerStatus.Name,
  518. container: &pod.Spec.Containers[idx],
  519. message: message,
  520. }
  521. klog.V(2).Infof("Container %q (%q) of pod %s: %s", container.Name, containerStatus.ID, format.Pod(pod), message)
  522. }
  523. if keepCount == 0 && len(changes.ContainersToStart) == 0 {
  524. changes.KillPod = true
  525. }
  526. return changes
  527. }
  528. // SyncPod syncs the running pod into the desired pod by executing following steps:
  529. //
  530. // 1. Compute sandbox and container changes.
  531. // 2. Kill pod sandbox if necessary.
  532. // 3. Kill any containers that should not be running.
  533. // 4. Create sandbox if necessary.
  534. // 5. Create init containers.
  535. // 6. Create normal containers.
  536. func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, backOff *flowcontrol.Backoff) (result kubecontainer.PodSyncResult) {
  537. // Step 1: Compute sandbox and container changes.
  538. podContainerChanges := m.computePodActions(pod, podStatus)
  539. klog.V(3).Infof("computePodActions got %+v for pod %q", podContainerChanges, format.Pod(pod))
  540. if podContainerChanges.CreateSandbox {
  541. ref, err := ref.GetReference(legacyscheme.Scheme, pod)
  542. if err != nil {
  543. klog.Errorf("Couldn't make a ref to pod %q: '%v'", format.Pod(pod), err)
  544. }
  545. if podContainerChanges.SandboxID != "" {
  546. m.recorder.Eventf(ref, v1.EventTypeNormal, events.SandboxChanged, "Pod sandbox changed, it will be killed and re-created.")
  547. } else {
  548. klog.V(4).Infof("SyncPod received new pod %q, will create a sandbox for it", format.Pod(pod))
  549. }
  550. }
  551. // Step 2: Kill the pod if the sandbox has changed.
  552. if podContainerChanges.KillPod {
  553. if podContainerChanges.CreateSandbox {
  554. klog.V(4).Infof("Stopping PodSandbox for %q, will start new one", format.Pod(pod))
  555. } else {
  556. klog.V(4).Infof("Stopping PodSandbox for %q because all other containers are dead.", format.Pod(pod))
  557. }
  558. killResult := m.killPodWithSyncResult(pod, kubecontainer.ConvertPodStatusToRunningPod(m.runtimeName, podStatus), nil)
  559. result.AddPodSyncResult(killResult)
  560. if killResult.Error() != nil {
  561. klog.Errorf("killPodWithSyncResult failed: %v", killResult.Error())
  562. return
  563. }
  564. if podContainerChanges.CreateSandbox {
  565. m.purgeInitContainers(pod, podStatus)
  566. }
  567. } else {
  568. // Step 3: kill any running containers in this pod which are not to keep.
  569. for containerID, containerInfo := range podContainerChanges.ContainersToKill {
  570. klog.V(3).Infof("Killing unwanted container %q(id=%q) for pod %q", containerInfo.name, containerID, format.Pod(pod))
  571. killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, containerInfo.name)
  572. result.AddSyncResult(killContainerResult)
  573. if err := m.killContainer(pod, containerID, containerInfo.name, containerInfo.message, nil); err != nil {
  574. killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
  575. klog.Errorf("killContainer %q(id=%q) for pod %q failed: %v", containerInfo.name, containerID, format.Pod(pod), err)
  576. return
  577. }
  578. }
  579. }
  580. // Keep terminated init containers fairly aggressively controlled
  581. // This is an optimization because container removals are typically handled
  582. // by container garbage collector.
  583. m.pruneInitContainersBeforeStart(pod, podStatus)
  584. // We pass the value of the podIP down to generatePodSandboxConfig and
  585. // generateContainerConfig, which in turn passes it to various other
  586. // functions, in order to facilitate functionality that requires this
  587. // value (hosts file and downward API) and avoid races determining
  588. // the pod IP in cases where a container requires restart but the
  589. // podIP isn't in the status manager yet.
  590. //
  591. // We default to the IP in the passed-in pod status, and overwrite it if the
  592. // sandbox needs to be (re)started.
  593. podIP := ""
  594. if podStatus != nil {
  595. podIP = podStatus.IP
  596. }
  597. // Step 4: Create a sandbox for the pod if necessary.
  598. podSandboxID := podContainerChanges.SandboxID
  599. if podContainerChanges.CreateSandbox {
  600. var msg string
  601. var err error
  602. klog.V(4).Infof("Creating sandbox for pod %q", format.Pod(pod))
  603. createSandboxResult := kubecontainer.NewSyncResult(kubecontainer.CreatePodSandbox, format.Pod(pod))
  604. result.AddSyncResult(createSandboxResult)
  605. podSandboxID, msg, err = m.createPodSandbox(pod, podContainerChanges.Attempt)
  606. if err != nil {
  607. createSandboxResult.Fail(kubecontainer.ErrCreatePodSandbox, msg)
  608. klog.Errorf("createPodSandbox for pod %q failed: %v", format.Pod(pod), err)
  609. ref, referr := ref.GetReference(legacyscheme.Scheme, pod)
  610. if referr != nil {
  611. klog.Errorf("Couldn't make a ref to pod %q: '%v'", format.Pod(pod), referr)
  612. }
  613. m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedCreatePodSandBox, "Failed create pod sandbox: %v", err)
  614. return
  615. }
  616. klog.V(4).Infof("Created PodSandbox %q for pod %q", podSandboxID, format.Pod(pod))
  617. podSandboxStatus, err := m.runtimeService.PodSandboxStatus(podSandboxID)
  618. if err != nil {
  619. ref, referr := ref.GetReference(legacyscheme.Scheme, pod)
  620. if referr != nil {
  621. klog.Errorf("Couldn't make a ref to pod %q: '%v'", format.Pod(pod), referr)
  622. }
  623. m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedStatusPodSandBox, "Unable to get pod sandbox status: %v", err)
  624. klog.Errorf("Failed to get pod sandbox status: %v; Skipping pod %q", err, format.Pod(pod))
  625. result.Fail(err)
  626. return
  627. }
  628. // If we ever allow updating a pod from non-host-network to
  629. // host-network, we may use a stale IP.
  630. if !kubecontainer.IsHostNetworkPod(pod) {
  631. // Overwrite the podIP passed in the pod status, since we just started the pod sandbox.
  632. podIP = m.determinePodSandboxIP(pod.Namespace, pod.Name, podSandboxStatus)
  633. klog.V(4).Infof("Determined the ip %q for pod %q after sandbox changed", podIP, format.Pod(pod))
  634. }
  635. }
  636. // Get podSandboxConfig for containers to start.
  637. configPodSandboxResult := kubecontainer.NewSyncResult(kubecontainer.ConfigPodSandbox, podSandboxID)
  638. result.AddSyncResult(configPodSandboxResult)
  639. podSandboxConfig, err := m.generatePodSandboxConfig(pod, podContainerChanges.Attempt)
  640. if err != nil {
  641. message := fmt.Sprintf("GeneratePodSandboxConfig for pod %q failed: %v", format.Pod(pod), err)
  642. klog.Error(message)
  643. configPodSandboxResult.Fail(kubecontainer.ErrConfigPodSandbox, message)
  644. return
  645. }
  646. // Step 5: start the init container.
  647. if container := podContainerChanges.NextInitContainerToStart; container != nil {
  648. // Start the next init container.
  649. startContainerResult := kubecontainer.NewSyncResult(kubecontainer.StartContainer, container.Name)
  650. result.AddSyncResult(startContainerResult)
  651. isInBackOff, msg, err := m.doBackOff(pod, container, podStatus, backOff)
  652. if isInBackOff {
  653. startContainerResult.Fail(err, msg)
  654. klog.V(4).Infof("Backing Off restarting init container %+v in pod %v", container, format.Pod(pod))
  655. return
  656. }
  657. klog.V(4).Infof("Creating init container %+v in pod %v", container, format.Pod(pod))
  658. if msg, err := m.startContainer(podSandboxID, podSandboxConfig, container, pod, podStatus, pullSecrets, podIP); err != nil {
  659. startContainerResult.Fail(err, msg)
  660. utilruntime.HandleError(fmt.Errorf("init container start failed: %v: %s", err, msg))
  661. return
  662. }
  663. // Successfully started the container; clear the entry in the failure
  664. klog.V(4).Infof("Completed init container %q for pod %q", container.Name, format.Pod(pod))
  665. }
  666. // Step 6: start containers in podContainerChanges.ContainersToStart.
  667. for _, idx := range podContainerChanges.ContainersToStart {
  668. container := &pod.Spec.Containers[idx]
  669. startContainerResult := kubecontainer.NewSyncResult(kubecontainer.StartContainer, container.Name)
  670. result.AddSyncResult(startContainerResult)
  671. isInBackOff, msg, err := m.doBackOff(pod, container, podStatus, backOff)
  672. if isInBackOff {
  673. startContainerResult.Fail(err, msg)
  674. klog.V(4).Infof("Backing Off restarting container %+v in pod %v", container, format.Pod(pod))
  675. continue
  676. }
  677. klog.V(4).Infof("Creating container %+v in pod %v", container, format.Pod(pod))
  678. if msg, err := m.startContainer(podSandboxID, podSandboxConfig, container, pod, podStatus, pullSecrets, podIP); err != nil {
  679. startContainerResult.Fail(err, msg)
  680. // known errors that are logged in other places are logged at higher levels here to avoid
  681. // repetitive log spam
  682. switch {
  683. case err == images.ErrImagePullBackOff:
  684. klog.V(3).Infof("container start failed: %v: %s", err, msg)
  685. default:
  686. utilruntime.HandleError(fmt.Errorf("container start failed: %v: %s", err, msg))
  687. }
  688. continue
  689. }
  690. }
  691. return
  692. }
  693. // If a container is still in backoff, the function will return a brief backoff error and
  694. // a detailed error message.
  695. func (m *kubeGenericRuntimeManager) doBackOff(pod *v1.Pod, container *v1.Container, podStatus *kubecontainer.PodStatus, backOff *flowcontrol.Backoff) (bool, string, error) {
  696. var cStatus *kubecontainer.ContainerStatus
  697. for _, c := range podStatus.ContainerStatuses {
  698. if c.Name == container.Name && c.State == kubecontainer.ContainerStateExited {
  699. cStatus = c
  700. break
  701. }
  702. }
  703. if cStatus == nil {
  704. return false, "", nil
  705. }
  706. klog.V(3).Infof("checking backoff for container %q in pod %q", container.Name, format.Pod(pod))
  707. // Use the finished time of the latest exited container as the start point to calculate whether to do back-off.
  708. ts := cStatus.FinishedAt
  709. // backOff requires a unique key to identify the container.
  710. key := getStableKey(pod, container)
  711. if backOff.IsInBackOffSince(key, ts) {
  712. if ref, err := kubecontainer.GenerateContainerRef(pod, container); err == nil {
  713. m.recorder.Eventf(ref, v1.EventTypeWarning, events.BackOffStartContainer, "Back-off restarting failed container")
  714. }
  715. err := fmt.Errorf("Back-off %s restarting failed container=%s pod=%s", backOff.Get(key), container.Name, format.Pod(pod))
  716. klog.V(3).Infof("%s", err.Error())
  717. return true, err.Error(), kubecontainer.ErrCrashLoopBackOff
  718. }
  719. backOff.Next(key, ts)
  720. return false, "", nil
  721. }
  722. // KillPod kills all the containers of a pod. Pod may be nil, running pod must not be.
  723. // gracePeriodOverride if specified allows the caller to override the pod default grace period.
  724. // only hard kill paths are allowed to specify a gracePeriodOverride in the kubelet in order to not corrupt user data.
  725. // it is useful when doing SIGKILL for hard eviction scenarios, or max grace period during soft eviction scenarios.
  726. func (m *kubeGenericRuntimeManager) KillPod(pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) error {
  727. err := m.killPodWithSyncResult(pod, runningPod, gracePeriodOverride)
  728. return err.Error()
  729. }
  730. // killPodWithSyncResult kills a runningPod and returns SyncResult.
  731. // Note: The pod passed in could be *nil* when kubelet restarted.
  732. func (m *kubeGenericRuntimeManager) killPodWithSyncResult(pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) (result kubecontainer.PodSyncResult) {
  733. killContainerResults := m.killContainersWithSyncResult(pod, runningPod, gracePeriodOverride)
  734. for _, containerResult := range killContainerResults {
  735. result.AddSyncResult(containerResult)
  736. }
  737. // stop sandbox, the sandbox will be removed in GarbageCollect
  738. killSandboxResult := kubecontainer.NewSyncResult(kubecontainer.KillPodSandbox, runningPod.ID)
  739. result.AddSyncResult(killSandboxResult)
  740. // Stop all sandboxes belongs to same pod
  741. for _, podSandbox := range runningPod.Sandboxes {
  742. if err := m.runtimeService.StopPodSandbox(podSandbox.ID.ID); err != nil {
  743. killSandboxResult.Fail(kubecontainer.ErrKillPodSandbox, err.Error())
  744. klog.Errorf("Failed to stop sandbox %q", podSandbox.ID)
  745. }
  746. }
  747. return
  748. }
  749. // GetPodStatus retrieves the status of the pod, including the
  750. // information of all containers in the pod that are visible in Runtime.
  751. func (m *kubeGenericRuntimeManager) GetPodStatus(uid kubetypes.UID, name, namespace string) (*kubecontainer.PodStatus, error) {
  752. // Now we retain restart count of container as a container label. Each time a container
  753. // restarts, pod will read the restart count from the registered dead container, increment
  754. // it to get the new restart count, and then add a label with the new restart count on
  755. // the newly started container.
  756. // However, there are some limitations of this method:
  757. // 1. When all dead containers were garbage collected, the container status could
  758. // not get the historical value and would be *inaccurate*. Fortunately, the chance
  759. // is really slim.
  760. // 2. When working with old version containers which have no restart count label,
  761. // we can only assume their restart count is 0.
  762. // Anyhow, we only promised "best-effort" restart count reporting, we can just ignore
  763. // these limitations now.
  764. // TODO: move this comment to SyncPod.
  765. podSandboxIDs, err := m.getSandboxIDByPodUID(uid, nil)
  766. if err != nil {
  767. return nil, err
  768. }
  769. podFullName := format.Pod(&v1.Pod{
  770. ObjectMeta: metav1.ObjectMeta{
  771. Name: name,
  772. Namespace: namespace,
  773. UID: uid,
  774. },
  775. })
  776. klog.V(4).Infof("getSandboxIDByPodUID got sandbox IDs %q for pod %q", podSandboxIDs, podFullName)
  777. sandboxStatuses := make([]*runtimeapi.PodSandboxStatus, len(podSandboxIDs))
  778. podIP := ""
  779. for idx, podSandboxID := range podSandboxIDs {
  780. podSandboxStatus, err := m.runtimeService.PodSandboxStatus(podSandboxID)
  781. if err != nil {
  782. klog.Errorf("PodSandboxStatus of sandbox %q for pod %q error: %v", podSandboxID, podFullName, err)
  783. return nil, err
  784. }
  785. sandboxStatuses[idx] = podSandboxStatus
  786. // Only get pod IP from latest sandbox
  787. if idx == 0 && podSandboxStatus.State == runtimeapi.PodSandboxState_SANDBOX_READY {
  788. podIP = m.determinePodSandboxIP(namespace, name, podSandboxStatus)
  789. }
  790. }
  791. // Get statuses of all containers visible in the pod.
  792. containerStatuses, err := m.getPodContainerStatuses(uid, name, namespace)
  793. if err != nil {
  794. if m.logReduction.ShouldMessageBePrinted(err.Error(), podFullName) {
  795. klog.Errorf("getPodContainerStatuses for pod %q failed: %v", podFullName, err)
  796. }
  797. return nil, err
  798. }
  799. m.logReduction.ClearID(podFullName)
  800. return &kubecontainer.PodStatus{
  801. ID: uid,
  802. Name: name,
  803. Namespace: namespace,
  804. IP: podIP,
  805. SandboxStatuses: sandboxStatuses,
  806. ContainerStatuses: containerStatuses,
  807. }, nil
  808. }
  809. // GarbageCollect removes dead containers using the specified container gc policy.
  810. func (m *kubeGenericRuntimeManager) GarbageCollect(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error {
  811. return m.containerGC.GarbageCollect(gcPolicy, allSourcesReady, evictNonDeletedPods)
  812. }
  813. // UpdatePodCIDR is just a passthrough method to update the runtimeConfig of the shim
  814. // with the podCIDR supplied by the kubelet.
  815. func (m *kubeGenericRuntimeManager) UpdatePodCIDR(podCIDR string) error {
  816. // TODO(#35531): do we really want to write a method on this manager for each
  817. // field of the config?
  818. klog.Infof("updating runtime config through cri with podcidr %v", podCIDR)
  819. return m.runtimeService.UpdateRuntimeConfig(
  820. &runtimeapi.RuntimeConfig{
  821. NetworkConfig: &runtimeapi.NetworkConfig{
  822. PodCidr: podCIDR,
  823. },
  824. })
  825. }