pod_container_manager_linux.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
  1. /*
  2. Copyright 2016 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package cm
  14. import (
  15. "fmt"
  16. "io/ioutil"
  17. "os"
  18. "path"
  19. "strings"
  20. v1 "k8s.io/api/core/v1"
  21. "k8s.io/apimachinery/pkg/types"
  22. utilerrors "k8s.io/apimachinery/pkg/util/errors"
  23. utilfeature "k8s.io/apiserver/pkg/util/feature"
  24. "k8s.io/klog"
  25. v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
  26. kubefeatures "k8s.io/kubernetes/pkg/features"
  27. )
  28. const (
  29. podCgroupNamePrefix = "pod"
  30. )
  31. // podContainerManagerImpl implements podContainerManager interface.
  32. // It is the general implementation which allows pod level container
  33. // management if qos Cgroup is enabled.
  34. type podContainerManagerImpl struct {
  35. // qosContainersInfo hold absolute paths of the top level qos containers
  36. qosContainersInfo QOSContainersInfo
  37. // Stores the mounted cgroup subsystems
  38. subsystems *CgroupSubsystems
  39. // cgroupManager is the cgroup Manager Object responsible for managing all
  40. // pod cgroups.
  41. cgroupManager CgroupManager
  42. // Maximum number of pids in a pod
  43. podPidsLimit int64
  44. // enforceCPULimits controls whether cfs quota is enforced or not
  45. enforceCPULimits bool
  46. // cpuCFSQuotaPeriod is the cfs period value, cfs_period_us, setting per
  47. // node for all containers in usec
  48. cpuCFSQuotaPeriod uint64
  49. }
  50. // Make sure that podContainerManagerImpl implements the PodContainerManager interface
  51. var _ PodContainerManager = &podContainerManagerImpl{}
  52. // applyLimits sets pod cgroup resource limits
  53. // It also updates the resource limits on top level qos containers.
  54. func (m *podContainerManagerImpl) applyLimits(pod *v1.Pod) error {
  55. // This function will house the logic for setting the resource parameters
  56. // on the pod container config and updating top level qos container configs
  57. return nil
  58. }
  59. // Exists checks if the pod's cgroup already exists
  60. func (m *podContainerManagerImpl) Exists(pod *v1.Pod) bool {
  61. podContainerName, _ := m.GetPodContainerName(pod)
  62. return m.cgroupManager.Exists(podContainerName)
  63. }
  64. // EnsureExists takes a pod as argument and makes sure that
  65. // pod cgroup exists if qos cgroup hierarchy flag is enabled.
  66. // If the pod level container doesn't already exist it is created.
  67. func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
  68. podContainerName, _ := m.GetPodContainerName(pod)
  69. // check if container already exist
  70. alreadyExists := m.Exists(pod)
  71. if !alreadyExists {
  72. // Create the pod container
  73. containerConfig := &CgroupConfig{
  74. Name: podContainerName,
  75. ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod),
  76. }
  77. if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && m.podPidsLimit > 0 {
  78. containerConfig.ResourceParameters.PidsLimit = &m.podPidsLimit
  79. }
  80. if err := m.cgroupManager.Create(containerConfig); err != nil {
  81. return fmt.Errorf("failed to create container for %v : %v", podContainerName, err)
  82. }
  83. }
  84. // Apply appropriate resource limits on the pod container
  85. // Top level qos containers limits are not updated
  86. // until we figure how to maintain the desired state in the kubelet.
  87. // Because maintaining the desired state is difficult without checkpointing.
  88. if err := m.applyLimits(pod); err != nil {
  89. return fmt.Errorf("failed to apply resource limits on container for %v : %v", podContainerName, err)
  90. }
  91. return nil
  92. }
  93. // GetPodContainerName returns the CgroupName identifier, and its literal cgroupfs form on the host.
  94. func (m *podContainerManagerImpl) GetPodContainerName(pod *v1.Pod) (CgroupName, string) {
  95. podQOS := v1qos.GetPodQOS(pod)
  96. // Get the parent QOS container name
  97. var parentContainer CgroupName
  98. switch podQOS {
  99. case v1.PodQOSGuaranteed:
  100. parentContainer = m.qosContainersInfo.Guaranteed
  101. case v1.PodQOSBurstable:
  102. parentContainer = m.qosContainersInfo.Burstable
  103. case v1.PodQOSBestEffort:
  104. parentContainer = m.qosContainersInfo.BestEffort
  105. }
  106. podContainer := GetPodCgroupNameSuffix(pod.UID)
  107. // Get the absolute path of the cgroup
  108. cgroupName := NewCgroupName(parentContainer, podContainer)
  109. // Get the literal cgroupfs name
  110. cgroupfsName := m.cgroupManager.Name(cgroupName)
  111. return cgroupName, cgroupfsName
  112. }
  113. // Kill one process ID
  114. func (m *podContainerManagerImpl) killOnePid(pid int) error {
  115. // os.FindProcess never returns an error on POSIX
  116. // https://go-review.googlesource.com/c/go/+/19093
  117. p, _ := os.FindProcess(pid)
  118. if err := p.Kill(); err != nil {
  119. // If the process already exited, that's fine.
  120. if strings.Contains(err.Error(), "process already finished") {
  121. // Hate parsing strings, but
  122. // vendor/github.com/opencontainers/runc/libcontainer/
  123. // also does this.
  124. klog.V(3).Infof("process with pid %v no longer exists", pid)
  125. return nil
  126. }
  127. return err
  128. }
  129. return nil
  130. }
  131. // Scan through the whole cgroup directory and kill all processes either
  132. // attached to the pod cgroup or to a container cgroup under the pod cgroup
  133. func (m *podContainerManagerImpl) tryKillingCgroupProcesses(podCgroup CgroupName) error {
  134. pidsToKill := m.cgroupManager.Pids(podCgroup)
  135. // No pids charged to the terminated pod cgroup return
  136. if len(pidsToKill) == 0 {
  137. return nil
  138. }
  139. var errlist []error
  140. // os.Kill often errors out,
  141. // We try killing all the pids multiple times
  142. removed := map[int]bool{}
  143. for i := 0; i < 5; i++ {
  144. if i != 0 {
  145. klog.V(3).Infof("Attempt %v failed to kill all unwanted process from cgroup: %v. Retyring", i, podCgroup)
  146. }
  147. errlist = []error{}
  148. for _, pid := range pidsToKill {
  149. if _, ok := removed[pid]; ok {
  150. continue
  151. }
  152. klog.V(3).Infof("Attempt to kill process with pid: %v from cgroup: %v", pid, podCgroup)
  153. if err := m.killOnePid(pid); err != nil {
  154. klog.V(3).Infof("failed to kill process with pid: %v from cgroup: %v", pid, podCgroup)
  155. errlist = append(errlist, err)
  156. } else {
  157. removed[pid] = true
  158. }
  159. }
  160. if len(errlist) == 0 {
  161. klog.V(3).Infof("successfully killed all unwanted processes from cgroup: %v", podCgroup)
  162. return nil
  163. }
  164. }
  165. return utilerrors.NewAggregate(errlist)
  166. }
  167. // Destroy destroys the pod container cgroup paths
  168. func (m *podContainerManagerImpl) Destroy(podCgroup CgroupName) error {
  169. // Try killing all the processes attached to the pod cgroup
  170. if err := m.tryKillingCgroupProcesses(podCgroup); err != nil {
  171. klog.V(3).Infof("failed to kill all the processes attached to the %v cgroups", podCgroup)
  172. return fmt.Errorf("failed to kill all the processes attached to the %v cgroups : %v", podCgroup, err)
  173. }
  174. // Now its safe to remove the pod's cgroup
  175. containerConfig := &CgroupConfig{
  176. Name: podCgroup,
  177. ResourceParameters: &ResourceConfig{},
  178. }
  179. if err := m.cgroupManager.Destroy(containerConfig); err != nil {
  180. return fmt.Errorf("failed to delete cgroup paths for %v : %v", podCgroup, err)
  181. }
  182. return nil
  183. }
  184. // ReduceCPULimits reduces the CPU CFS values to the minimum amount of shares.
  185. func (m *podContainerManagerImpl) ReduceCPULimits(podCgroup CgroupName) error {
  186. return m.cgroupManager.ReduceCPULimits(podCgroup)
  187. }
  188. // IsPodCgroup returns true if the literal cgroupfs name corresponds to a pod
  189. func (m *podContainerManagerImpl) IsPodCgroup(cgroupfs string) (bool, types.UID) {
  190. // convert the literal cgroupfs form to the driver specific value
  191. cgroupName := m.cgroupManager.CgroupName(cgroupfs)
  192. qosContainersList := [3]CgroupName{m.qosContainersInfo.BestEffort, m.qosContainersInfo.Burstable, m.qosContainersInfo.Guaranteed}
  193. basePath := ""
  194. for _, qosContainerName := range qosContainersList {
  195. // a pod cgroup is a direct child of a qos node, so check if its a match
  196. if len(cgroupName) == len(qosContainerName)+1 {
  197. basePath = cgroupName[len(qosContainerName)]
  198. }
  199. }
  200. if basePath == "" {
  201. return false, types.UID("")
  202. }
  203. if !strings.HasPrefix(basePath, podCgroupNamePrefix) {
  204. return false, types.UID("")
  205. }
  206. parts := strings.Split(basePath, podCgroupNamePrefix)
  207. if len(parts) != 2 {
  208. return false, types.UID("")
  209. }
  210. return true, types.UID(parts[1])
  211. }
  212. // GetAllPodsFromCgroups scans through all the subsystems of pod cgroups
  213. // Get list of pods whose cgroup still exist on the cgroup mounts
  214. func (m *podContainerManagerImpl) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
  215. // Map for storing all the found pods on the disk
  216. foundPods := make(map[types.UID]CgroupName)
  217. qosContainersList := [3]CgroupName{m.qosContainersInfo.BestEffort, m.qosContainersInfo.Burstable, m.qosContainersInfo.Guaranteed}
  218. // Scan through all the subsystem mounts
  219. // and through each QoS cgroup directory for each subsystem mount
  220. // If a pod cgroup exists in even a single subsystem mount
  221. // we will attempt to delete it
  222. for _, val := range m.subsystems.MountPoints {
  223. for _, qosContainerName := range qosContainersList {
  224. // get the subsystems QoS cgroup absolute name
  225. qcConversion := m.cgroupManager.Name(qosContainerName)
  226. qc := path.Join(val, qcConversion)
  227. dirInfo, err := ioutil.ReadDir(qc)
  228. if err != nil {
  229. if os.IsNotExist(err) {
  230. continue
  231. }
  232. return nil, fmt.Errorf("failed to read the cgroup directory %v : %v", qc, err)
  233. }
  234. for i := range dirInfo {
  235. // its not a directory, so continue on...
  236. if !dirInfo[i].IsDir() {
  237. continue
  238. }
  239. // convert the concrete cgroupfs name back to an internal identifier
  240. // this is needed to handle path conversion for systemd environments.
  241. // we pass the fully qualified path so decoding can work as expected
  242. // since systemd encodes the path in each segment.
  243. cgroupfsPath := path.Join(qcConversion, dirInfo[i].Name())
  244. internalPath := m.cgroupManager.CgroupName(cgroupfsPath)
  245. // we only care about base segment of the converted path since that
  246. // is what we are reading currently to know if it is a pod or not.
  247. basePath := internalPath[len(internalPath)-1]
  248. if !strings.Contains(basePath, podCgroupNamePrefix) {
  249. continue
  250. }
  251. // we then split the name on the pod prefix to determine the uid
  252. parts := strings.Split(basePath, podCgroupNamePrefix)
  253. // the uid is missing, so we log the unexpected cgroup not of form pod<uid>
  254. if len(parts) != 2 {
  255. klog.Errorf("pod cgroup manager ignoring unexpected cgroup %v because it is not a pod", cgroupfsPath)
  256. continue
  257. }
  258. podUID := parts[1]
  259. foundPods[types.UID(podUID)] = internalPath
  260. }
  261. }
  262. }
  263. return foundPods, nil
  264. }
  265. // podContainerManagerNoop implements podContainerManager interface.
  266. // It is a no-op implementation and basically does nothing
  267. // podContainerManagerNoop is used in case the QoS cgroup Hierarchy is not
  268. // enabled, so Exists() returns true always as the cgroupRoot
  269. // is expected to always exist.
  270. type podContainerManagerNoop struct {
  271. cgroupRoot CgroupName
  272. }
  273. // Make sure that podContainerManagerStub implements the PodContainerManager interface
  274. var _ PodContainerManager = &podContainerManagerNoop{}
  275. func (m *podContainerManagerNoop) Exists(_ *v1.Pod) bool {
  276. return true
  277. }
  278. func (m *podContainerManagerNoop) EnsureExists(_ *v1.Pod) error {
  279. return nil
  280. }
  281. func (m *podContainerManagerNoop) GetPodContainerName(_ *v1.Pod) (CgroupName, string) {
  282. return m.cgroupRoot, ""
  283. }
  284. func (m *podContainerManagerNoop) GetPodContainerNameForDriver(_ *v1.Pod) string {
  285. return ""
  286. }
  287. // Destroy destroys the pod container cgroup paths
  288. func (m *podContainerManagerNoop) Destroy(_ CgroupName) error {
  289. return nil
  290. }
  291. func (m *podContainerManagerNoop) ReduceCPULimits(_ CgroupName) error {
  292. return nil
  293. }
  294. func (m *podContainerManagerNoop) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
  295. return nil, nil
  296. }
  297. func (m *podContainerManagerNoop) IsPodCgroup(cgroupfs string) (bool, types.UID) {
  298. return false, types.UID("")
  299. }