kuberuntime_manager.go 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986
  1. /*
  2. Copyright 2016 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package kuberuntime
  14. import (
  15. "errors"
  16. "fmt"
  17. "os"
  18. goruntime "runtime"
  19. "time"
  20. cadvisorapi "github.com/google/cadvisor/info/v1"
  21. "k8s.io/klog"
  22. v1 "k8s.io/api/core/v1"
  23. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  24. kubetypes "k8s.io/apimachinery/pkg/types"
  25. utilruntime "k8s.io/apimachinery/pkg/util/runtime"
  26. utilversion "k8s.io/apimachinery/pkg/util/version"
  27. utilfeature "k8s.io/apiserver/pkg/util/feature"
  28. "k8s.io/client-go/tools/record"
  29. ref "k8s.io/client-go/tools/reference"
  30. "k8s.io/client-go/util/flowcontrol"
  31. internalapi "k8s.io/cri-api/pkg/apis"
  32. runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
  33. "k8s.io/kubernetes/pkg/api/legacyscheme"
  34. "k8s.io/kubernetes/pkg/credentialprovider"
  35. "k8s.io/kubernetes/pkg/features"
  36. "k8s.io/kubernetes/pkg/kubelet/cm"
  37. kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
  38. "k8s.io/kubernetes/pkg/kubelet/events"
  39. "k8s.io/kubernetes/pkg/kubelet/images"
  40. "k8s.io/kubernetes/pkg/kubelet/lifecycle"
  41. proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results"
  42. "k8s.io/kubernetes/pkg/kubelet/runtimeclass"
  43. "k8s.io/kubernetes/pkg/kubelet/types"
  44. "k8s.io/kubernetes/pkg/kubelet/util/cache"
  45. "k8s.io/kubernetes/pkg/kubelet/util/format"
  46. "k8s.io/kubernetes/pkg/kubelet/util/logreduction"
  47. )
  // Fixed values and tunables used by the generic runtime manager.
  const (
  	// kubeRuntimeAPIVersion is the api version of the kubelet runtime api.
  	kubeRuntimeAPIVersion = "0.1.0"

  	// podLogsRootDirectory is the root directory for pod logs.
  	podLogsRootDirectory = "/var/log/pods"

  	// minimumGracePeriodInSeconds is a minimal shutdown window, used to
  	// avoid unnecessary SIGKILLs.
  	minimumGracePeriodInSeconds = 2

  	// versionCacheTTL is the expiration time of the version cache.
  	versionCacheTTL = time.Second * 60

  	// identicalErrorDelay is how frequently identical errors are reported.
  	identicalErrorDelay = time.Minute
  )
  // ErrVersionNotSupported is returned when the api version of the runtime
  // interface is not supported.
  var ErrVersionNotSupported = errors.New("runtime api version is not supported")
  64. // podStateProvider can determine if a pod is deleted ir terminated
  65. type podStateProvider interface {
  66. IsPodDeleted(kubetypes.UID) bool
  67. IsPodTerminated(kubetypes.UID) bool
  68. }
  69. type kubeGenericRuntimeManager struct {
  70. runtimeName string
  71. recorder record.EventRecorder
  72. osInterface kubecontainer.OSInterface
  73. containerRefManager *kubecontainer.RefManager
  74. // machineInfo contains the machine information.
  75. machineInfo *cadvisorapi.MachineInfo
  76. // Container GC manager
  77. containerGC *containerGC
  78. // Keyring for pulling images
  79. keyring credentialprovider.DockerKeyring
  80. // Runner of lifecycle events.
  81. runner kubecontainer.HandlerRunner
  82. // RuntimeHelper that wraps kubelet to generate runtime container options.
  83. runtimeHelper kubecontainer.RuntimeHelper
  84. // Health check results.
  85. livenessManager proberesults.Manager
  86. startupManager proberesults.Manager
  87. // If true, enforce container cpu limits with CFS quota support
  88. cpuCFSQuota bool
  89. // CPUCFSQuotaPeriod sets the CPU CFS quota period value, cpu.cfs_period_us, defaults to 100ms
  90. cpuCFSQuotaPeriod metav1.Duration
  91. // wrapped image puller.
  92. imagePuller images.ImageManager
  93. // gRPC service clients
  94. runtimeService internalapi.RuntimeService
  95. imageService internalapi.ImageManagerService
  96. // The version cache of runtime daemon.
  97. versionCache *cache.ObjectCache
  98. // The directory path for seccomp profiles.
  99. seccompProfileRoot string
  100. // Internal lifecycle event handlers for container resource management.
  101. internalLifecycle cm.InternalContainerLifecycle
  102. // A shim to legacy functions for backward compatibility.
  103. legacyLogProvider LegacyLogProvider
  104. // Manage RuntimeClass resources.
  105. runtimeClassManager *runtimeclass.Manager
  106. // Cache last per-container error message to reduce log spam
  107. logReduction *logreduction.LogReduction
  108. }
  109. // KubeGenericRuntime is a interface contains interfaces for container runtime and command.
  110. type KubeGenericRuntime interface {
  111. kubecontainer.Runtime
  112. kubecontainer.StreamingRuntime
  113. kubecontainer.ContainerCommandRunner
  114. }
  115. // LegacyLogProvider gives the ability to use unsupported docker log drivers (e.g. journald)
  116. type LegacyLogProvider interface {
  117. // Get the last few lines of the logs for a specific container.
  118. GetContainerLogTail(uid kubetypes.UID, name, namespace string, containerID kubecontainer.ContainerID) (string, error)
  119. }
  120. // NewKubeGenericRuntimeManager creates a new kubeGenericRuntimeManager
  121. func NewKubeGenericRuntimeManager(
  122. recorder record.EventRecorder,
  123. livenessManager proberesults.Manager,
  124. startupManager proberesults.Manager,
  125. seccompProfileRoot string,
  126. containerRefManager *kubecontainer.RefManager,
  127. machineInfo *cadvisorapi.MachineInfo,
  128. podStateProvider podStateProvider,
  129. osInterface kubecontainer.OSInterface,
  130. runtimeHelper kubecontainer.RuntimeHelper,
  131. httpClient types.HTTPGetter,
  132. imageBackOff *flowcontrol.Backoff,
  133. serializeImagePulls bool,
  134. imagePullQPS float32,
  135. imagePullBurst int,
  136. cpuCFSQuota bool,
  137. cpuCFSQuotaPeriod metav1.Duration,
  138. runtimeService internalapi.RuntimeService,
  139. imageService internalapi.ImageManagerService,
  140. internalLifecycle cm.InternalContainerLifecycle,
  141. legacyLogProvider LegacyLogProvider,
  142. runtimeClassManager *runtimeclass.Manager,
  143. ) (KubeGenericRuntime, error) {
  144. kubeRuntimeManager := &kubeGenericRuntimeManager{
  145. recorder: recorder,
  146. cpuCFSQuota: cpuCFSQuota,
  147. cpuCFSQuotaPeriod: cpuCFSQuotaPeriod,
  148. seccompProfileRoot: seccompProfileRoot,
  149. livenessManager: livenessManager,
  150. startupManager: startupManager,
  151. containerRefManager: containerRefManager,
  152. machineInfo: machineInfo,
  153. osInterface: osInterface,
  154. runtimeHelper: runtimeHelper,
  155. runtimeService: newInstrumentedRuntimeService(runtimeService),
  156. imageService: newInstrumentedImageManagerService(imageService),
  157. keyring: credentialprovider.NewDockerKeyring(),
  158. internalLifecycle: internalLifecycle,
  159. legacyLogProvider: legacyLogProvider,
  160. runtimeClassManager: runtimeClassManager,
  161. logReduction: logreduction.NewLogReduction(identicalErrorDelay),
  162. }
  163. typedVersion, err := kubeRuntimeManager.getTypedVersion()
  164. if err != nil {
  165. klog.Errorf("Get runtime version failed: %v", err)
  166. return nil, err
  167. }
  168. // Only matching kubeRuntimeAPIVersion is supported now
  169. // TODO: Runtime API machinery is under discussion at https://github.com/kubernetes/kubernetes/issues/28642
  170. if typedVersion.Version != kubeRuntimeAPIVersion {
  171. klog.Errorf("Runtime api version %s is not supported, only %s is supported now",
  172. typedVersion.Version,
  173. kubeRuntimeAPIVersion)
  174. return nil, ErrVersionNotSupported
  175. }
  176. kubeRuntimeManager.runtimeName = typedVersion.RuntimeName
  177. klog.Infof("Container runtime %s initialized, version: %s, apiVersion: %s",
  178. typedVersion.RuntimeName,
  179. typedVersion.RuntimeVersion,
  180. typedVersion.RuntimeApiVersion)
  181. // If the container logs directory does not exist, create it.
  182. // TODO: create podLogsRootDirectory at kubelet.go when kubelet is refactored to
  183. // new runtime interface
  184. if _, err := osInterface.Stat(podLogsRootDirectory); os.IsNotExist(err) {
  185. if err := osInterface.MkdirAll(podLogsRootDirectory, 0755); err != nil {
  186. klog.Errorf("Failed to create directory %q: %v", podLogsRootDirectory, err)
  187. }
  188. }
  189. kubeRuntimeManager.imagePuller = images.NewImageManager(
  190. kubecontainer.FilterEventRecorder(recorder),
  191. kubeRuntimeManager,
  192. imageBackOff,
  193. serializeImagePulls,
  194. imagePullQPS,
  195. imagePullBurst)
  196. kubeRuntimeManager.runner = lifecycle.NewHandlerRunner(httpClient, kubeRuntimeManager, kubeRuntimeManager)
  197. kubeRuntimeManager.containerGC = newContainerGC(runtimeService, podStateProvider, kubeRuntimeManager)
  198. kubeRuntimeManager.versionCache = cache.NewObjectCache(
  199. func() (interface{}, error) {
  200. return kubeRuntimeManager.getTypedVersion()
  201. },
  202. versionCacheTTL,
  203. )
  204. return kubeRuntimeManager, nil
  205. }
  206. // Type returns the type of the container runtime.
  207. func (m *kubeGenericRuntimeManager) Type() string {
  208. return m.runtimeName
  209. }
  210. // SupportsSingleFileMapping returns whether the container runtime supports single file mappings or not.
  211. // It is supported on Windows only if the container runtime is containerd.
  212. func (m *kubeGenericRuntimeManager) SupportsSingleFileMapping() bool {
  213. switch goruntime.GOOS {
  214. case "windows":
  215. return m.Type() != types.DockerContainerRuntime
  216. default:
  217. return true
  218. }
  219. }
  220. func newRuntimeVersion(version string) (*utilversion.Version, error) {
  221. if ver, err := utilversion.ParseSemantic(version); err == nil {
  222. return ver, err
  223. }
  224. return utilversion.ParseGeneric(version)
  225. }
  226. func (m *kubeGenericRuntimeManager) getTypedVersion() (*runtimeapi.VersionResponse, error) {
  227. typedVersion, err := m.runtimeService.Version(kubeRuntimeAPIVersion)
  228. if err != nil {
  229. return nil, fmt.Errorf("get remote runtime typed version failed: %v", err)
  230. }
  231. return typedVersion, nil
  232. }
  233. // Version returns the version information of the container runtime.
  234. func (m *kubeGenericRuntimeManager) Version() (kubecontainer.Version, error) {
  235. typedVersion, err := m.getTypedVersion()
  236. if err != nil {
  237. return nil, err
  238. }
  239. return newRuntimeVersion(typedVersion.RuntimeVersion)
  240. }
  241. // APIVersion returns the cached API version information of the container
  242. // runtime. Implementation is expected to update this cache periodically.
  243. // This may be different from the runtime engine's version.
  244. func (m *kubeGenericRuntimeManager) APIVersion() (kubecontainer.Version, error) {
  245. versionObject, err := m.versionCache.Get(m.machineInfo.MachineID)
  246. if err != nil {
  247. return nil, err
  248. }
  249. typedVersion := versionObject.(*runtimeapi.VersionResponse)
  250. return newRuntimeVersion(typedVersion.RuntimeApiVersion)
  251. }
  252. // Status returns the status of the runtime. An error is returned if the Status
  253. // function itself fails, nil otherwise.
  254. func (m *kubeGenericRuntimeManager) Status() (*kubecontainer.RuntimeStatus, error) {
  255. status, err := m.runtimeService.Status()
  256. if err != nil {
  257. return nil, err
  258. }
  259. return toKubeRuntimeStatus(status), nil
  260. }
  261. // GetPods returns a list of containers grouped by pods. The boolean parameter
  262. // specifies whether the runtime returns all containers including those already
  263. // exited and dead containers (used for garbage collection).
  264. func (m *kubeGenericRuntimeManager) GetPods(all bool) ([]*kubecontainer.Pod, error) {
  265. pods := make(map[kubetypes.UID]*kubecontainer.Pod)
  266. sandboxes, err := m.getKubeletSandboxes(all)
  267. if err != nil {
  268. return nil, err
  269. }
  270. for i := range sandboxes {
  271. s := sandboxes[i]
  272. if s.Metadata == nil {
  273. klog.V(4).Infof("Sandbox does not have metadata: %+v", s)
  274. continue
  275. }
  276. podUID := kubetypes.UID(s.Metadata.Uid)
  277. if _, ok := pods[podUID]; !ok {
  278. pods[podUID] = &kubecontainer.Pod{
  279. ID: podUID,
  280. Name: s.Metadata.Name,
  281. Namespace: s.Metadata.Namespace,
  282. }
  283. }
  284. p := pods[podUID]
  285. converted, err := m.sandboxToKubeContainer(s)
  286. if err != nil {
  287. klog.V(4).Infof("Convert %q sandbox %v of pod %q failed: %v", m.runtimeName, s, podUID, err)
  288. continue
  289. }
  290. p.Sandboxes = append(p.Sandboxes, converted)
  291. }
  292. containers, err := m.getKubeletContainers(all)
  293. if err != nil {
  294. return nil, err
  295. }
  296. for i := range containers {
  297. c := containers[i]
  298. if c.Metadata == nil {
  299. klog.V(4).Infof("Container does not have metadata: %+v", c)
  300. continue
  301. }
  302. labelledInfo := getContainerInfoFromLabels(c.Labels)
  303. pod, found := pods[labelledInfo.PodUID]
  304. if !found {
  305. pod = &kubecontainer.Pod{
  306. ID: labelledInfo.PodUID,
  307. Name: labelledInfo.PodName,
  308. Namespace: labelledInfo.PodNamespace,
  309. }
  310. pods[labelledInfo.PodUID] = pod
  311. }
  312. converted, err := m.toKubeContainer(c)
  313. if err != nil {
  314. klog.V(4).Infof("Convert %s container %v of pod %q failed: %v", m.runtimeName, c, labelledInfo.PodUID, err)
  315. continue
  316. }
  317. pod.Containers = append(pod.Containers, converted)
  318. }
  319. // Convert map to list.
  320. var result []*kubecontainer.Pod
  321. for _, pod := range pods {
  322. result = append(result, pod)
  323. }
  324. return result, nil
  325. }
  326. // containerToKillInfo contains necessary information to kill a container.
  327. type containerToKillInfo struct {
  328. // The spec of the container.
  329. container *v1.Container
  330. // The name of the container.
  331. name string
  332. // The message indicates why the container will be killed.
  333. message string
  334. }
  335. // podActions keeps information what to do for a pod.
  336. type podActions struct {
  337. // Stop all running (regular, init and ephemeral) containers and the sandbox for the pod.
  338. KillPod bool
  339. // Whether need to create a new sandbox. If needed to kill pod and create
  340. // a new pod sandbox, all init containers need to be purged (i.e., removed).
  341. CreateSandbox bool
  342. // The id of existing sandbox. It is used for starting containers in ContainersToStart.
  343. SandboxID string
  344. // The attempt number of creating sandboxes for the pod.
  345. Attempt uint32
  346. // The next init container to start.
  347. NextInitContainerToStart *v1.Container
  348. // ContainersToStart keeps a list of indexes for the containers to start,
  349. // where the index is the index of the specific container in the pod spec (
  350. // pod.Spec.Containers.
  351. ContainersToStart []int
  352. // ContainersToKill keeps a map of containers that need to be killed, note that
  353. // the key is the container ID of the container, while
  354. // the value contains necessary information to kill a container.
  355. ContainersToKill map[kubecontainer.ContainerID]containerToKillInfo
  356. // EphemeralContainersToStart is a list of indexes for the ephemeral containers to start,
  357. // where the index is the index of the specific container in pod.Spec.EphemeralContainers.
  358. EphemeralContainersToStart []int
  359. }
  360. // podSandboxChanged checks whether the spec of the pod is changed and returns
  361. // (changed, new attempt, original sandboxID if exist).
  362. func (m *kubeGenericRuntimeManager) podSandboxChanged(pod *v1.Pod, podStatus *kubecontainer.PodStatus) (bool, uint32, string) {
  363. if len(podStatus.SandboxStatuses) == 0 {
  364. klog.V(2).Infof("No sandbox for pod %q can be found. Need to start a new one", format.Pod(pod))
  365. return true, 0, ""
  366. }
  367. readySandboxCount := 0
  368. for _, s := range podStatus.SandboxStatuses {
  369. if s.State == runtimeapi.PodSandboxState_SANDBOX_READY {
  370. readySandboxCount++
  371. }
  372. }
  373. // Needs to create a new sandbox when readySandboxCount > 1 or the ready sandbox is not the latest one.
  374. sandboxStatus := podStatus.SandboxStatuses[0]
  375. if readySandboxCount > 1 {
  376. klog.V(2).Infof("Multiple sandboxes are ready for Pod %q. Need to reconcile them", format.Pod(pod))
  377. return true, sandboxStatus.Metadata.Attempt + 1, sandboxStatus.Id
  378. }
  379. if sandboxStatus.State != runtimeapi.PodSandboxState_SANDBOX_READY {
  380. klog.V(2).Infof("No ready sandbox for pod %q can be found. Need to start a new one", format.Pod(pod))
  381. return true, sandboxStatus.Metadata.Attempt + 1, sandboxStatus.Id
  382. }
  383. // Needs to create a new sandbox when network namespace changed.
  384. if sandboxStatus.GetLinux().GetNamespaces().GetOptions().GetNetwork() != networkNamespaceForPod(pod) {
  385. klog.V(2).Infof("Sandbox for pod %q has changed. Need to start a new one", format.Pod(pod))
  386. return true, sandboxStatus.Metadata.Attempt + 1, ""
  387. }
  388. // Needs to create a new sandbox when the sandbox does not have an IP address.
  389. if !kubecontainer.IsHostNetworkPod(pod) && sandboxStatus.Network.Ip == "" {
  390. klog.V(2).Infof("Sandbox for pod %q has no IP address. Need to start a new one", format.Pod(pod))
  391. return true, sandboxStatus.Metadata.Attempt + 1, sandboxStatus.Id
  392. }
  393. return false, sandboxStatus.Metadata.Attempt, sandboxStatus.Id
  394. }
  395. func containerChanged(container *v1.Container, containerStatus *kubecontainer.ContainerStatus) (uint64, uint64, bool) {
  396. expectedHash := kubecontainer.HashContainer(container)
  397. return expectedHash, containerStatus.Hash, containerStatus.Hash != expectedHash
  398. }
  399. func shouldRestartOnFailure(pod *v1.Pod) bool {
  400. return pod.Spec.RestartPolicy != v1.RestartPolicyNever
  401. }
  402. func containerSucceeded(c *v1.Container, podStatus *kubecontainer.PodStatus) bool {
  403. cStatus := podStatus.FindContainerStatusByName(c.Name)
  404. if cStatus == nil || cStatus.State == kubecontainer.ContainerStateRunning {
  405. return false
  406. }
  407. return cStatus.ExitCode == 0
  408. }
  409. // computePodActions checks whether the pod spec has changed and returns the changes if true.
  410. func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *kubecontainer.PodStatus) podActions {
  411. klog.V(5).Infof("Syncing Pod %q: %+v", format.Pod(pod), pod)
  412. createPodSandbox, attempt, sandboxID := m.podSandboxChanged(pod, podStatus)
  413. changes := podActions{
  414. KillPod: createPodSandbox,
  415. CreateSandbox: createPodSandbox,
  416. SandboxID: sandboxID,
  417. Attempt: attempt,
  418. ContainersToStart: []int{},
  419. ContainersToKill: make(map[kubecontainer.ContainerID]containerToKillInfo),
  420. }
  421. // If we need to (re-)create the pod sandbox, everything will need to be
  422. // killed and recreated, and init containers should be purged.
  423. if createPodSandbox {
  424. if !shouldRestartOnFailure(pod) && attempt != 0 && len(podStatus.ContainerStatuses) != 0 {
  425. // Should not restart the pod, just return.
  426. // we should not create a sandbox for a pod if it is already done.
  427. // if all containers are done and should not be started, there is no need to create a new sandbox.
  428. // this stops confusing logs on pods whose containers all have exit codes, but we recreate a sandbox before terminating it.
  429. //
  430. // If ContainerStatuses is empty, we assume that we've never
  431. // successfully created any containers. In this case, we should
  432. // retry creating the sandbox.
  433. changes.CreateSandbox = false
  434. return changes
  435. }
  436. if len(pod.Spec.InitContainers) != 0 {
  437. // Pod has init containers, return the first one.
  438. changes.NextInitContainerToStart = &pod.Spec.InitContainers[0]
  439. return changes
  440. }
  441. // Start all containers by default but exclude the ones that succeeded if
  442. // RestartPolicy is OnFailure.
  443. for idx, c := range pod.Spec.Containers {
  444. if containerSucceeded(&c, podStatus) && pod.Spec.RestartPolicy == v1.RestartPolicyOnFailure {
  445. continue
  446. }
  447. changes.ContainersToStart = append(changes.ContainersToStart, idx)
  448. }
  449. return changes
  450. }
  451. // Ephemeral containers may be started even if initialization is not yet complete.
  452. if utilfeature.DefaultFeatureGate.Enabled(features.EphemeralContainers) {
  453. for i := range pod.Spec.EphemeralContainers {
  454. c := (*v1.Container)(&pod.Spec.EphemeralContainers[i].EphemeralContainerCommon)
  455. // Ephemeral Containers are never restarted
  456. if podStatus.FindContainerStatusByName(c.Name) == nil {
  457. changes.EphemeralContainersToStart = append(changes.EphemeralContainersToStart, i)
  458. }
  459. }
  460. }
  461. // Check initialization progress.
  462. initLastStatus, next, done := findNextInitContainerToRun(pod, podStatus)
  463. if !done {
  464. if next != nil {
  465. initFailed := initLastStatus != nil && isInitContainerFailed(initLastStatus)
  466. if initFailed && !shouldRestartOnFailure(pod) {
  467. changes.KillPod = true
  468. } else {
  469. // Always try to stop containers in unknown state first.
  470. if initLastStatus != nil && initLastStatus.State == kubecontainer.ContainerStateUnknown {
  471. changes.ContainersToKill[initLastStatus.ID] = containerToKillInfo{
  472. name: next.Name,
  473. container: next,
  474. message: fmt.Sprintf("Init container is in %q state, try killing it before restart",
  475. initLastStatus.State),
  476. }
  477. }
  478. changes.NextInitContainerToStart = next
  479. }
  480. }
  481. // Initialization failed or still in progress. Skip inspecting non-init
  482. // containers.
  483. return changes
  484. }
  485. // Number of running containers to keep.
  486. keepCount := 0
  487. // check the status of containers.
  488. for idx, container := range pod.Spec.Containers {
  489. containerStatus := podStatus.FindContainerStatusByName(container.Name)
  490. // Call internal container post-stop lifecycle hook for any non-running container so that any
  491. // allocated cpus are released immediately. If the container is restarted, cpus will be re-allocated
  492. // to it.
  493. if containerStatus != nil && containerStatus.State != kubecontainer.ContainerStateRunning {
  494. if err := m.internalLifecycle.PostStopContainer(containerStatus.ID.ID); err != nil {
  495. klog.Errorf("internal container post-stop lifecycle hook failed for container %v in pod %v with error %v",
  496. container.Name, pod.Name, err)
  497. }
  498. }
  499. // If container does not exist, or is not running, check whether we
  500. // need to restart it.
  501. if containerStatus == nil || containerStatus.State != kubecontainer.ContainerStateRunning {
  502. if kubecontainer.ShouldContainerBeRestarted(&container, pod, podStatus) {
  503. message := fmt.Sprintf("Container %+v is dead, but RestartPolicy says that we should restart it.", container)
  504. klog.V(3).Infof(message)
  505. changes.ContainersToStart = append(changes.ContainersToStart, idx)
  506. if containerStatus != nil && containerStatus.State == kubecontainer.ContainerStateUnknown {
  507. // If container is in unknown state, we don't know whether it
  508. // is actually running or not, always try killing it before
  509. // restart to avoid having 2 running instances of the same container.
  510. changes.ContainersToKill[containerStatus.ID] = containerToKillInfo{
  511. name: containerStatus.Name,
  512. container: &pod.Spec.Containers[idx],
  513. message: fmt.Sprintf("Container is in %q state, try killing it before restart",
  514. containerStatus.State),
  515. }
  516. }
  517. }
  518. continue
  519. }
  520. // The container is running, but kill the container if any of the following condition is met.
  521. var message string
  522. restart := shouldRestartOnFailure(pod)
  523. if _, _, changed := containerChanged(&container, containerStatus); changed {
  524. message = fmt.Sprintf("Container %s definition changed", container.Name)
  525. // Restart regardless of the restart policy because the container
  526. // spec changed.
  527. restart = true
  528. } else if liveness, found := m.livenessManager.Get(containerStatus.ID); found && liveness == proberesults.Failure {
  529. // If the container failed the liveness probe, we should kill it.
  530. message = fmt.Sprintf("Container %s failed liveness probe", container.Name)
  531. } else if startup, found := m.startupManager.Get(containerStatus.ID); found && startup == proberesults.Failure {
  532. // If the container failed the startup probe, we should kill it.
  533. message = fmt.Sprintf("Container %s failed startup probe", container.Name)
  534. } else {
  535. // Keep the container.
  536. keepCount++
  537. continue
  538. }
  539. // We need to kill the container, but if we also want to restart the
  540. // container afterwards, make the intent clear in the message. Also do
  541. // not kill the entire pod since we expect container to be running eventually.
  542. if restart {
  543. message = fmt.Sprintf("%s, will be restarted", message)
  544. changes.ContainersToStart = append(changes.ContainersToStart, idx)
  545. }
  546. changes.ContainersToKill[containerStatus.ID] = containerToKillInfo{
  547. name: containerStatus.Name,
  548. container: &pod.Spec.Containers[idx],
  549. message: message,
  550. }
  551. klog.V(2).Infof("Container %q (%q) of pod %s: %s", container.Name, containerStatus.ID, format.Pod(pod), message)
  552. }
  553. if keepCount == 0 && len(changes.ContainersToStart) == 0 {
  554. changes.KillPod = true
  555. }
  556. return changes
  557. }
  558. // SyncPod syncs the running pod into the desired pod by executing following steps:
  559. //
  560. // 1. Compute sandbox and container changes.
  561. // 2. Kill pod sandbox if necessary.
  562. // 3. Kill any containers that should not be running.
  563. // 4. Create sandbox if necessary.
  564. // 5. Create ephemeral containers.
  565. // 6. Create init containers.
  566. // 7. Create normal containers.
  567. func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, backOff *flowcontrol.Backoff) (result kubecontainer.PodSyncResult) {
  568. // Step 1: Compute sandbox and container changes.
  569. podContainerChanges := m.computePodActions(pod, podStatus)
  570. klog.V(3).Infof("computePodActions got %+v for pod %q", podContainerChanges, format.Pod(pod))
  571. if podContainerChanges.CreateSandbox {
  572. ref, err := ref.GetReference(legacyscheme.Scheme, pod)
  573. if err != nil {
  574. klog.Errorf("Couldn't make a ref to pod %q: '%v'", format.Pod(pod), err)
  575. }
  576. if podContainerChanges.SandboxID != "" {
  577. m.recorder.Eventf(ref, v1.EventTypeNormal, events.SandboxChanged, "Pod sandbox changed, it will be killed and re-created.")
  578. } else {
  579. klog.V(4).Infof("SyncPod received new pod %q, will create a sandbox for it", format.Pod(pod))
  580. }
  581. }
  582. // Step 2: Kill the pod if the sandbox has changed.
  583. if podContainerChanges.KillPod {
  584. if podContainerChanges.CreateSandbox {
  585. klog.V(4).Infof("Stopping PodSandbox for %q, will start new one", format.Pod(pod))
  586. } else {
  587. klog.V(4).Infof("Stopping PodSandbox for %q because all other containers are dead.", format.Pod(pod))
  588. }
  589. killResult := m.killPodWithSyncResult(pod, kubecontainer.ConvertPodStatusToRunningPod(m.runtimeName, podStatus), nil)
  590. result.AddPodSyncResult(killResult)
  591. if killResult.Error() != nil {
  592. klog.Errorf("killPodWithSyncResult failed: %v", killResult.Error())
  593. return
  594. }
  595. if podContainerChanges.CreateSandbox {
  596. m.purgeInitContainers(pod, podStatus)
  597. }
  598. } else {
  599. // Step 3: kill any running containers in this pod which are not to keep.
  600. for containerID, containerInfo := range podContainerChanges.ContainersToKill {
  601. klog.V(3).Infof("Killing unwanted container %q(id=%q) for pod %q", containerInfo.name, containerID, format.Pod(pod))
  602. killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, containerInfo.name)
  603. result.AddSyncResult(killContainerResult)
  604. if err := m.killContainer(pod, containerID, containerInfo.name, containerInfo.message, nil); err != nil {
  605. killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
  606. klog.Errorf("killContainer %q(id=%q) for pod %q failed: %v", containerInfo.name, containerID, format.Pod(pod), err)
  607. return
  608. }
  609. }
  610. }
  611. // Keep terminated init containers fairly aggressively controlled
  612. // This is an optimization because container removals are typically handled
  613. // by container garbage collector.
  614. m.pruneInitContainersBeforeStart(pod, podStatus)
  615. // We pass the value of the PRIMARY podIP and list of podIPs down to
  616. // generatePodSandboxConfig and generateContainerConfig, which in turn
  617. // passes it to various other functions, in order to facilitate functionality
  618. // that requires this value (hosts file and downward API) and avoid races determining
  619. // the pod IP in cases where a container requires restart but the
  620. // podIP isn't in the status manager yet. The list of podIPs is used to
  621. // generate the hosts file.
  622. //
  623. // We default to the IPs in the passed-in pod status, and overwrite them if the
  624. // sandbox needs to be (re)started.
  625. var podIPs []string
  626. if podStatus != nil {
  627. podIPs = podStatus.IPs
  628. }
  629. // Step 4: Create a sandbox for the pod if necessary.
  630. podSandboxID := podContainerChanges.SandboxID
  631. if podContainerChanges.CreateSandbox {
  632. var msg string
  633. var err error
  634. klog.V(4).Infof("Creating sandbox for pod %q", format.Pod(pod))
  635. createSandboxResult := kubecontainer.NewSyncResult(kubecontainer.CreatePodSandbox, format.Pod(pod))
  636. result.AddSyncResult(createSandboxResult)
  637. podSandboxID, msg, err = m.createPodSandbox(pod, podContainerChanges.Attempt)
  638. if err != nil {
  639. createSandboxResult.Fail(kubecontainer.ErrCreatePodSandbox, msg)
  640. klog.Errorf("createPodSandbox for pod %q failed: %v", format.Pod(pod), err)
  641. ref, referr := ref.GetReference(legacyscheme.Scheme, pod)
  642. if referr != nil {
  643. klog.Errorf("Couldn't make a ref to pod %q: '%v'", format.Pod(pod), referr)
  644. }
  645. m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedCreatePodSandBox, "Failed to create pod sandbox: %v", err)
  646. return
  647. }
  648. klog.V(4).Infof("Created PodSandbox %q for pod %q", podSandboxID, format.Pod(pod))
  649. podSandboxStatus, err := m.runtimeService.PodSandboxStatus(podSandboxID)
  650. if err != nil {
  651. ref, referr := ref.GetReference(legacyscheme.Scheme, pod)
  652. if referr != nil {
  653. klog.Errorf("Couldn't make a ref to pod %q: '%v'", format.Pod(pod), referr)
  654. }
  655. m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedStatusPodSandBox, "Unable to get pod sandbox status: %v", err)
  656. klog.Errorf("Failed to get pod sandbox status: %v; Skipping pod %q", err, format.Pod(pod))
  657. result.Fail(err)
  658. return
  659. }
  660. // If we ever allow updating a pod from non-host-network to
  661. // host-network, we may use a stale IP.
  662. if !kubecontainer.IsHostNetworkPod(pod) {
  663. // Overwrite the podIPs passed in the pod status, since we just started the pod sandbox.
  664. podIPs = m.determinePodSandboxIPs(pod.Namespace, pod.Name, podSandboxStatus)
  665. klog.V(4).Infof("Determined the ip %v for pod %q after sandbox changed", podIPs, format.Pod(pod))
  666. }
  667. }
  668. // the start containers routines depend on pod ip(as in primary pod ip)
  669. // instead of trying to figure out if we have 0 < len(podIPs)
  670. // everytime, we short circuit it here
  671. podIP := ""
  672. if len(podIPs) != 0 {
  673. podIP = podIPs[0]
  674. }
  675. // Get podSandboxConfig for containers to start.
  676. configPodSandboxResult := kubecontainer.NewSyncResult(kubecontainer.ConfigPodSandbox, podSandboxID)
  677. result.AddSyncResult(configPodSandboxResult)
  678. podSandboxConfig, err := m.generatePodSandboxConfig(pod, podContainerChanges.Attempt)
  679. if err != nil {
  680. message := fmt.Sprintf("GeneratePodSandboxConfig for pod %q failed: %v", format.Pod(pod), err)
  681. klog.Error(message)
  682. configPodSandboxResult.Fail(kubecontainer.ErrConfigPodSandbox, message)
  683. return
  684. }
  685. // Helper containing boilerplate common to starting all types of containers.
  686. // typeName is a label used to describe this type of container in log messages,
  687. // currently: "container", "init container" or "ephemeral container"
  688. start := func(typeName string, spec *startSpec) error {
  689. startContainerResult := kubecontainer.NewSyncResult(kubecontainer.StartContainer, spec.container.Name)
  690. result.AddSyncResult(startContainerResult)
  691. isInBackOff, msg, err := m.doBackOff(pod, spec.container, podStatus, backOff)
  692. if isInBackOff {
  693. startContainerResult.Fail(err, msg)
  694. klog.V(4).Infof("Backing Off restarting %v %+v in pod %v", typeName, spec.container, format.Pod(pod))
  695. return err
  696. }
  697. klog.V(4).Infof("Creating %v %+v in pod %v", typeName, spec.container, format.Pod(pod))
  698. // NOTE (aramase) podIPs are populated for single stack and dual stack clusters. Send only podIPs.
  699. if msg, err := m.startContainer(podSandboxID, podSandboxConfig, spec, pod, podStatus, pullSecrets, podIP, podIPs); err != nil {
  700. startContainerResult.Fail(err, msg)
  701. // known errors that are logged in other places are logged at higher levels here to avoid
  702. // repetitive log spam
  703. switch {
  704. case err == images.ErrImagePullBackOff:
  705. klog.V(3).Infof("%v start failed: %v: %s", typeName, err, msg)
  706. default:
  707. utilruntime.HandleError(fmt.Errorf("%v start failed: %v: %s", typeName, err, msg))
  708. }
  709. return err
  710. }
  711. return nil
  712. }
  713. // Step 5: start ephemeral containers
  714. // These are started "prior" to init containers to allow running ephemeral containers even when there
  715. // are errors starting an init container. In practice init containers will start first since ephemeral
  716. // containers cannot be specified on pod creation.
  717. if utilfeature.DefaultFeatureGate.Enabled(features.EphemeralContainers) {
  718. for _, idx := range podContainerChanges.EphemeralContainersToStart {
  719. start("ephemeral container", ephemeralContainerStartSpec(&pod.Spec.EphemeralContainers[idx]))
  720. }
  721. }
  722. // Step 6: start the init container.
  723. if container := podContainerChanges.NextInitContainerToStart; container != nil {
  724. // Start the next init container.
  725. if err := start("init container", containerStartSpec(container)); err != nil {
  726. return
  727. }
  728. // Successfully started the container; clear the entry in the failure
  729. klog.V(4).Infof("Completed init container %q for pod %q", container.Name, format.Pod(pod))
  730. }
  731. // Step 7: start containers in podContainerChanges.ContainersToStart.
  732. for _, idx := range podContainerChanges.ContainersToStart {
  733. start("container", containerStartSpec(&pod.Spec.Containers[idx]))
  734. }
  735. return
  736. }
  737. // If a container is still in backoff, the function will return a brief backoff error and
  738. // a detailed error message.
  739. func (m *kubeGenericRuntimeManager) doBackOff(pod *v1.Pod, container *v1.Container, podStatus *kubecontainer.PodStatus, backOff *flowcontrol.Backoff) (bool, string, error) {
  740. var cStatus *kubecontainer.ContainerStatus
  741. for _, c := range podStatus.ContainerStatuses {
  742. if c.Name == container.Name && c.State == kubecontainer.ContainerStateExited {
  743. cStatus = c
  744. break
  745. }
  746. }
  747. if cStatus == nil {
  748. return false, "", nil
  749. }
  750. klog.V(3).Infof("checking backoff for container %q in pod %q", container.Name, format.Pod(pod))
  751. // Use the finished time of the latest exited container as the start point to calculate whether to do back-off.
  752. ts := cStatus.FinishedAt
  753. // backOff requires a unique key to identify the container.
  754. key := getStableKey(pod, container)
  755. if backOff.IsInBackOffSince(key, ts) {
  756. if ref, err := kubecontainer.GenerateContainerRef(pod, container); err == nil {
  757. m.recorder.Eventf(ref, v1.EventTypeWarning, events.BackOffStartContainer, "Back-off restarting failed container")
  758. }
  759. err := fmt.Errorf("back-off %s restarting failed container=%s pod=%s", backOff.Get(key), container.Name, format.Pod(pod))
  760. klog.V(3).Infof("%s", err.Error())
  761. return true, err.Error(), kubecontainer.ErrCrashLoopBackOff
  762. }
  763. backOff.Next(key, ts)
  764. return false, "", nil
  765. }
  766. // KillPod kills all the containers of a pod. Pod may be nil, running pod must not be.
  767. // gracePeriodOverride if specified allows the caller to override the pod default grace period.
  768. // only hard kill paths are allowed to specify a gracePeriodOverride in the kubelet in order to not corrupt user data.
  769. // it is useful when doing SIGKILL for hard eviction scenarios, or max grace period during soft eviction scenarios.
  770. func (m *kubeGenericRuntimeManager) KillPod(pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) error {
  771. err := m.killPodWithSyncResult(pod, runningPod, gracePeriodOverride)
  772. return err.Error()
  773. }
  774. // killPodWithSyncResult kills a runningPod and returns SyncResult.
  775. // Note: The pod passed in could be *nil* when kubelet restarted.
  776. func (m *kubeGenericRuntimeManager) killPodWithSyncResult(pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) (result kubecontainer.PodSyncResult) {
  777. killContainerResults := m.killContainersWithSyncResult(pod, runningPod, gracePeriodOverride)
  778. for _, containerResult := range killContainerResults {
  779. result.AddSyncResult(containerResult)
  780. }
  781. // stop sandbox, the sandbox will be removed in GarbageCollect
  782. killSandboxResult := kubecontainer.NewSyncResult(kubecontainer.KillPodSandbox, runningPod.ID)
  783. result.AddSyncResult(killSandboxResult)
  784. // Stop all sandboxes belongs to same pod
  785. for _, podSandbox := range runningPod.Sandboxes {
  786. if err := m.runtimeService.StopPodSandbox(podSandbox.ID.ID); err != nil {
  787. killSandboxResult.Fail(kubecontainer.ErrKillPodSandbox, err.Error())
  788. klog.Errorf("Failed to stop sandbox %q", podSandbox.ID)
  789. }
  790. }
  791. return
  792. }
  793. // GetPodStatus retrieves the status of the pod, including the
  794. // information of all containers in the pod that are visible in Runtime.
  795. func (m *kubeGenericRuntimeManager) GetPodStatus(uid kubetypes.UID, name, namespace string) (*kubecontainer.PodStatus, error) {
  796. // Now we retain restart count of container as a container label. Each time a container
  797. // restarts, pod will read the restart count from the registered dead container, increment
  798. // it to get the new restart count, and then add a label with the new restart count on
  799. // the newly started container.
  800. // However, there are some limitations of this method:
  801. // 1. When all dead containers were garbage collected, the container status could
  802. // not get the historical value and would be *inaccurate*. Fortunately, the chance
  803. // is really slim.
  804. // 2. When working with old version containers which have no restart count label,
  805. // we can only assume their restart count is 0.
  806. // Anyhow, we only promised "best-effort" restart count reporting, we can just ignore
  807. // these limitations now.
  808. // TODO: move this comment to SyncPod.
  809. podSandboxIDs, err := m.getSandboxIDByPodUID(uid, nil)
  810. if err != nil {
  811. return nil, err
  812. }
  813. podFullName := format.Pod(&v1.Pod{
  814. ObjectMeta: metav1.ObjectMeta{
  815. Name: name,
  816. Namespace: namespace,
  817. UID: uid,
  818. },
  819. })
  820. klog.V(4).Infof("getSandboxIDByPodUID got sandbox IDs %q for pod %q", podSandboxIDs, podFullName)
  821. sandboxStatuses := make([]*runtimeapi.PodSandboxStatus, len(podSandboxIDs))
  822. podIPs := []string{}
  823. for idx, podSandboxID := range podSandboxIDs {
  824. podSandboxStatus, err := m.runtimeService.PodSandboxStatus(podSandboxID)
  825. if err != nil {
  826. klog.Errorf("PodSandboxStatus of sandbox %q for pod %q error: %v", podSandboxID, podFullName, err)
  827. return nil, err
  828. }
  829. sandboxStatuses[idx] = podSandboxStatus
  830. // Only get pod IP from latest sandbox
  831. if idx == 0 && podSandboxStatus.State == runtimeapi.PodSandboxState_SANDBOX_READY {
  832. podIPs = m.determinePodSandboxIPs(namespace, name, podSandboxStatus)
  833. }
  834. }
  835. // Get statuses of all containers visible in the pod.
  836. containerStatuses, err := m.getPodContainerStatuses(uid, name, namespace)
  837. if err != nil {
  838. if m.logReduction.ShouldMessageBePrinted(err.Error(), podFullName) {
  839. klog.Errorf("getPodContainerStatuses for pod %q failed: %v", podFullName, err)
  840. }
  841. return nil, err
  842. }
  843. m.logReduction.ClearID(podFullName)
  844. return &kubecontainer.PodStatus{
  845. ID: uid,
  846. Name: name,
  847. Namespace: namespace,
  848. IPs: podIPs,
  849. SandboxStatuses: sandboxStatuses,
  850. ContainerStatuses: containerStatuses,
  851. }, nil
  852. }
  853. // GarbageCollect removes dead containers using the specified container gc policy.
  854. func (m *kubeGenericRuntimeManager) GarbageCollect(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error {
  855. return m.containerGC.GarbageCollect(gcPolicy, allSourcesReady, evictNonDeletedPods)
  856. }
  857. // UpdatePodCIDR is just a passthrough method to update the runtimeConfig of the shim
  858. // with the podCIDR supplied by the kubelet.
  859. func (m *kubeGenericRuntimeManager) UpdatePodCIDR(podCIDR string) error {
  860. // TODO(#35531): do we really want to write a method on this manager for each
  861. // field of the config?
  862. klog.Infof("updating runtime config through cri with podcidr %v", podCIDR)
  863. return m.runtimeService.UpdateRuntimeConfig(
  864. &runtimeapi.RuntimeConfig{
  865. NetworkConfig: &runtimeapi.NetworkConfig{
  866. PodCidr: podCIDR,
  867. },
  868. })
  869. }