pod_container_manager_linux.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. /*
  2. Copyright 2016 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package cm
  14. import (
  15. "fmt"
  16. "io/ioutil"
  17. "os"
  18. "path"
  19. "strings"
  20. "k8s.io/api/core/v1"
  21. "k8s.io/apimachinery/pkg/types"
  22. utilerrors "k8s.io/apimachinery/pkg/util/errors"
  23. utilfeature "k8s.io/apiserver/pkg/util/feature"
  24. "k8s.io/klog"
  25. v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
  26. kubefeatures "k8s.io/kubernetes/pkg/features"
  27. )
  // podCgroupNamePrefix is the literal prefix that starts the base name of a
  // pod level cgroup; code in this file parses names of the form pod<uid>.
  const podCgroupNamePrefix = "pod"
  31. // podContainerManagerImpl implements podContainerManager interface.
  32. // It is the general implementation which allows pod level container
  33. // management if qos Cgroup is enabled.
  34. type podContainerManagerImpl struct {
  35. // qosContainersInfo hold absolute paths of the top level qos containers
  36. qosContainersInfo QOSContainersInfo
  37. // Stores the mounted cgroup subsystems
  38. subsystems *CgroupSubsystems
  39. // cgroupManager is the cgroup Manager Object responsible for managing all
  40. // pod cgroups.
  41. cgroupManager CgroupManager
  42. // Maximum number of pids in a pod
  43. podPidsLimit int64
  44. // enforceCPULimits controls whether cfs quota is enforced or not
  45. enforceCPULimits bool
  46. // cpuCFSQuotaPeriod is the cfs period value, cfs_period_us, setting per
  47. // node for all containers in usec
  48. cpuCFSQuotaPeriod uint64
  49. }
  50. // Make sure that podContainerManagerImpl implements the PodContainerManager interface
  51. var _ PodContainerManager = &podContainerManagerImpl{}
  52. // applyLimits sets pod cgroup resource limits
  53. // It also updates the resource limits on top level qos containers.
  54. func (m *podContainerManagerImpl) applyLimits(pod *v1.Pod) error {
  55. // This function will house the logic for setting the resource parameters
  56. // on the pod container config and updating top level qos container configs
  57. return nil
  58. }
  59. // Exists checks if the pod's cgroup already exists
  60. func (m *podContainerManagerImpl) Exists(pod *v1.Pod) bool {
  61. podContainerName, _ := m.GetPodContainerName(pod)
  62. return m.cgroupManager.Exists(podContainerName)
  63. }
  64. // EnsureExists takes a pod as argument and makes sure that
  65. // pod cgroup exists if qos cgroup hierarchy flag is enabled.
  66. // If the pod level container doesn't already exist it is created.
  67. func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
  68. podContainerName, _ := m.GetPodContainerName(pod)
  69. // check if container already exist
  70. alreadyExists := m.Exists(pod)
  71. if !alreadyExists {
  72. // Create the pod container
  73. containerConfig := &CgroupConfig{
  74. Name: podContainerName,
  75. ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod),
  76. }
  77. if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && m.podPidsLimit > 0 {
  78. containerConfig.ResourceParameters.PidsLimit = &m.podPidsLimit
  79. }
  80. if err := m.cgroupManager.Create(containerConfig); err != nil {
  81. return fmt.Errorf("failed to create container for %v : %v", podContainerName, err)
  82. }
  83. }
  84. // Apply appropriate resource limits on the pod container
  85. // Top level qos containers limits are not updated
  86. // until we figure how to maintain the desired state in the kubelet.
  87. // Because maintaining the desired state is difficult without checkpointing.
  88. if err := m.applyLimits(pod); err != nil {
  89. return fmt.Errorf("failed to apply resource limits on container for %v : %v", podContainerName, err)
  90. }
  91. return nil
  92. }
  93. // GetPodContainerName returns the CgroupName identifier, and its literal cgroupfs form on the host.
  94. func (m *podContainerManagerImpl) GetPodContainerName(pod *v1.Pod) (CgroupName, string) {
  95. podQOS := v1qos.GetPodQOS(pod)
  96. // Get the parent QOS container name
  97. var parentContainer CgroupName
  98. switch podQOS {
  99. case v1.PodQOSGuaranteed:
  100. parentContainer = m.qosContainersInfo.Guaranteed
  101. case v1.PodQOSBurstable:
  102. parentContainer = m.qosContainersInfo.Burstable
  103. case v1.PodQOSBestEffort:
  104. parentContainer = m.qosContainersInfo.BestEffort
  105. }
  106. podContainer := GetPodCgroupNameSuffix(pod.UID)
  107. // Get the absolute path of the cgroup
  108. cgroupName := NewCgroupName(parentContainer, podContainer)
  109. // Get the literal cgroupfs name
  110. cgroupfsName := m.cgroupManager.Name(cgroupName)
  111. return cgroupName, cgroupfsName
  112. }
  113. // Kill one process ID
  114. func (m *podContainerManagerImpl) killOnePid(pid int) error {
  115. // os.FindProcess never returns an error on POSIX
  116. // https://go-review.googlesource.com/c/go/+/19093
  117. p, _ := os.FindProcess(pid)
  118. if err := p.Kill(); err != nil {
  119. // If the process already exited, that's fine.
  120. if strings.Contains(err.Error(), "process already finished") {
  121. // Hate parsing strings, but
  122. // vendor/github.com/opencontainers/runc/libcontainer/
  123. // also does this.
  124. klog.V(3).Infof("process with pid %v no longer exists", pid)
  125. return nil
  126. }
  127. return err
  128. }
  129. return nil
  130. }
  131. // Scan through the whole cgroup directory and kill all processes either
  132. // attached to the pod cgroup or to a container cgroup under the pod cgroup
  133. func (m *podContainerManagerImpl) tryKillingCgroupProcesses(podCgroup CgroupName) error {
  134. pidsToKill := m.cgroupManager.Pids(podCgroup)
  135. // No pids charged to the terminated pod cgroup return
  136. if len(pidsToKill) == 0 {
  137. return nil
  138. }
  139. var errlist []error
  140. // os.Kill often errors out,
  141. // We try killing all the pids multiple times
  142. for i := 0; i < 5; i++ {
  143. if i != 0 {
  144. klog.V(3).Infof("Attempt %v failed to kill all unwanted process. Retyring", i)
  145. }
  146. errlist = []error{}
  147. for _, pid := range pidsToKill {
  148. klog.V(3).Infof("Attempt to kill process with pid: %v", pid)
  149. if err := m.killOnePid(pid); err != nil {
  150. klog.V(3).Infof("failed to kill process with pid: %v", pid)
  151. errlist = append(errlist, err)
  152. }
  153. }
  154. if len(errlist) == 0 {
  155. klog.V(3).Infof("successfully killed all unwanted processes.")
  156. return nil
  157. }
  158. }
  159. return utilerrors.NewAggregate(errlist)
  160. }
  161. // Destroy destroys the pod container cgroup paths
  162. func (m *podContainerManagerImpl) Destroy(podCgroup CgroupName) error {
  163. // Try killing all the processes attached to the pod cgroup
  164. if err := m.tryKillingCgroupProcesses(podCgroup); err != nil {
  165. klog.V(3).Infof("failed to kill all the processes attached to the %v cgroups", podCgroup)
  166. return fmt.Errorf("failed to kill all the processes attached to the %v cgroups : %v", podCgroup, err)
  167. }
  168. // Now its safe to remove the pod's cgroup
  169. containerConfig := &CgroupConfig{
  170. Name: podCgroup,
  171. ResourceParameters: &ResourceConfig{},
  172. }
  173. if err := m.cgroupManager.Destroy(containerConfig); err != nil {
  174. return fmt.Errorf("failed to delete cgroup paths for %v : %v", podCgroup, err)
  175. }
  176. return nil
  177. }
  178. // ReduceCPULimits reduces the CPU CFS values to the minimum amount of shares.
  179. func (m *podContainerManagerImpl) ReduceCPULimits(podCgroup CgroupName) error {
  180. return m.cgroupManager.ReduceCPULimits(podCgroup)
  181. }
  182. // IsPodCgroup returns true if the literal cgroupfs name corresponds to a pod
  183. func (m *podContainerManagerImpl) IsPodCgroup(cgroupfs string) (bool, types.UID) {
  184. // convert the literal cgroupfs form to the driver specific value
  185. cgroupName := m.cgroupManager.CgroupName(cgroupfs)
  186. qosContainersList := [3]CgroupName{m.qosContainersInfo.BestEffort, m.qosContainersInfo.Burstable, m.qosContainersInfo.Guaranteed}
  187. basePath := ""
  188. for _, qosContainerName := range qosContainersList {
  189. // a pod cgroup is a direct child of a qos node, so check if its a match
  190. if len(cgroupName) == len(qosContainerName)+1 {
  191. basePath = cgroupName[len(qosContainerName)]
  192. }
  193. }
  194. if basePath == "" {
  195. return false, types.UID("")
  196. }
  197. if !strings.HasPrefix(basePath, podCgroupNamePrefix) {
  198. return false, types.UID("")
  199. }
  200. parts := strings.Split(basePath, podCgroupNamePrefix)
  201. if len(parts) != 2 {
  202. return false, types.UID("")
  203. }
  204. return true, types.UID(parts[1])
  205. }
  206. // GetAllPodsFromCgroups scans through all the subsystems of pod cgroups
  207. // Get list of pods whose cgroup still exist on the cgroup mounts
  208. func (m *podContainerManagerImpl) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
  209. // Map for storing all the found pods on the disk
  210. foundPods := make(map[types.UID]CgroupName)
  211. qosContainersList := [3]CgroupName{m.qosContainersInfo.BestEffort, m.qosContainersInfo.Burstable, m.qosContainersInfo.Guaranteed}
  212. // Scan through all the subsystem mounts
  213. // and through each QoS cgroup directory for each subsystem mount
  214. // If a pod cgroup exists in even a single subsystem mount
  215. // we will attempt to delete it
  216. for _, val := range m.subsystems.MountPoints {
  217. for _, qosContainerName := range qosContainersList {
  218. // get the subsystems QoS cgroup absolute name
  219. qcConversion := m.cgroupManager.Name(qosContainerName)
  220. qc := path.Join(val, qcConversion)
  221. dirInfo, err := ioutil.ReadDir(qc)
  222. if err != nil {
  223. if os.IsNotExist(err) {
  224. continue
  225. }
  226. return nil, fmt.Errorf("failed to read the cgroup directory %v : %v", qc, err)
  227. }
  228. for i := range dirInfo {
  229. // its not a directory, so continue on...
  230. if !dirInfo[i].IsDir() {
  231. continue
  232. }
  233. // convert the concrete cgroupfs name back to an internal identifier
  234. // this is needed to handle path conversion for systemd environments.
  235. // we pass the fully qualified path so decoding can work as expected
  236. // since systemd encodes the path in each segment.
  237. cgroupfsPath := path.Join(qcConversion, dirInfo[i].Name())
  238. internalPath := m.cgroupManager.CgroupName(cgroupfsPath)
  239. // we only care about base segment of the converted path since that
  240. // is what we are reading currently to know if it is a pod or not.
  241. basePath := internalPath[len(internalPath)-1]
  242. if !strings.Contains(basePath, podCgroupNamePrefix) {
  243. continue
  244. }
  245. // we then split the name on the pod prefix to determine the uid
  246. parts := strings.Split(basePath, podCgroupNamePrefix)
  247. // the uid is missing, so we log the unexpected cgroup not of form pod<uid>
  248. if len(parts) != 2 {
  249. klog.Errorf("pod cgroup manager ignoring unexpected cgroup %v because it is not a pod", cgroupfsPath)
  250. continue
  251. }
  252. podUID := parts[1]
  253. foundPods[types.UID(podUID)] = internalPath
  254. }
  255. }
  256. }
  257. return foundPods, nil
  258. }
  259. // podContainerManagerNoop implements podContainerManager interface.
  260. // It is a no-op implementation and basically does nothing
  261. // podContainerManagerNoop is used in case the QoS cgroup Hierarchy is not
  262. // enabled, so Exists() returns true always as the cgroupRoot
  263. // is expected to always exist.
  264. type podContainerManagerNoop struct {
  265. cgroupRoot CgroupName
  266. }
  267. // Make sure that podContainerManagerStub implements the PodContainerManager interface
  268. var _ PodContainerManager = &podContainerManagerNoop{}
  269. func (m *podContainerManagerNoop) Exists(_ *v1.Pod) bool {
  270. return true
  271. }
  272. func (m *podContainerManagerNoop) EnsureExists(_ *v1.Pod) error {
  273. return nil
  274. }
  275. func (m *podContainerManagerNoop) GetPodContainerName(_ *v1.Pod) (CgroupName, string) {
  276. return m.cgroupRoot, ""
  277. }
  278. func (m *podContainerManagerNoop) GetPodContainerNameForDriver(_ *v1.Pod) string {
  279. return ""
  280. }
  281. // Destroy destroys the pod container cgroup paths
  282. func (m *podContainerManagerNoop) Destroy(_ CgroupName) error {
  283. return nil
  284. }
  285. func (m *podContainerManagerNoop) ReduceCPULimits(_ CgroupName) error {
  286. return nil
  287. }
  288. func (m *podContainerManagerNoop) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
  289. return nil, nil
  290. }
  291. func (m *podContainerManagerNoop) IsPodCgroup(cgroupfs string) (bool, types.UID) {
  292. return false, types.UID("")
  293. }