qos_container_manager_linux.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cm

import (
	"fmt"
	"strings"
	"sync"
	"time"

	units "github.com/docker/go-units"
	cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/klog"
	"k8s.io/kubernetes/pkg/api/v1/resource"
	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
	kubefeatures "k8s.io/kubernetes/pkg/features"
)
const (
	// how often the qos cgroup manager will perform a periodic update
	// of the qos level cgroup resource constraints
	periodicQOSCgroupUpdateInterval = 1 * time.Minute
)
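
// QOSContainerManager manages the top-level QoS cgroups (Burstable and
// BestEffort) under the cgroup root and keeps their resource settings in
// sync with the currently active pods.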
type QOSContainerManager interface {
	Start(func() v1.ResourceList, ActivePodsFunc) error
	GetQOSContainersInfo() QOSContainersInfo
	UpdateCgroups() error
}
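
// qosContainerManagerImpl implements QOSContainerManager when QoS cgroups
// are enabled (CgroupsPerQOS); the embedded mutex guards cgroup updates.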
type qosContainerManagerImpl struct {
	sync.Mutex
	nodeInfo           *v1.Node
	qosContainersInfo  QOSContainersInfo
	subsystems         *CgroupSubsystems
	cgroupManager      CgroupManager
	activePods         ActivePodsFunc
	getNodeAllocatable func() v1.ResourceList
	cgroupRoot         CgroupName
	qosReserved        map[v1.ResourceName]int64
}
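
// NewQOSContainerManager returns a QOSContainerManager rooted at cgroupRoot.
// When QoS cgroups are disabled (nodeConfig.CgroupsPerQOS is false), a no-op
// implementation is returned instead. Illustrative usage (a sketch assuming
// the kubelet's container manager wiring; not part of the original file):
//
//	qosManager, err := NewQOSContainerManager(subsystems, cgroupRoot, nodeConfig, cgroupManager)
//	if err != nil {
//		return err
//	}
//	if err := qosManager.Start(getNodeAllocatable, activePods); err != nil {
//		return err
//	}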
func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot CgroupName, nodeConfig NodeConfig, cgroupManager CgroupManager) (QOSContainerManager, error) {
	if !nodeConfig.CgroupsPerQOS {
		return &qosContainerManagerNoop{
			cgroupRoot: cgroupRoot,
		}, nil
	}
	return &qosContainerManagerImpl{
		subsystems:    subsystems,
		cgroupManager: cgroupManager,
		cgroupRoot:    cgroupRoot,
		qosReserved:   nodeConfig.QOSReserved,
	}, nil
}
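
// GetQOSContainersInfo returns the names of the top-level QoS cgroups
// recorded by Start (Guaranteed maps to the cgroup root itself).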
func (m *qosContainerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
	return m.qosContainersInfo
}
func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceList, activePods ActivePodsFunc) error {
	cm := m.cgroupManager
	rootContainer := m.cgroupRoot
	if !cm.Exists(rootContainer) {
		return fmt.Errorf("root container %v doesn't exist", rootContainer)
	}

	// Top level QoS cgroups are created only for the Burstable and
	// BestEffort classes; Guaranteed pods live directly under the root.
	qosClasses := map[v1.PodQOSClass]CgroupName{
		v1.PodQOSBurstable:  NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBurstable))),
		v1.PodQOSBestEffort: NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBestEffort))),
	}
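
	// For illustration (not in the original file): with the default cgroup
	// root "kubepods" and the cgroupfs driver, these names resolve to
	// /kubepods/burstable and /kubepods/besteffort; the systemd driver
	// renders them as slices such as kubepods-burstable.slice.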
	// Create containers for both qos classes
	for qosClass, containerName := range qosClasses {
		resourceParameters := &ResourceConfig{}
		// the BestEffort QoS class has a statically configured minShares value
		if qosClass == v1.PodQOSBestEffort {
			minShares := uint64(MinShares)
			resourceParameters.CpuShares = &minShares
		}

		// containerConfig object stores the cgroup specifications
		containerConfig := &CgroupConfig{
			Name:               containerName,
			ResourceParameters: resourceParameters,
		}

		// for each enumerated huge page size, the qos tiers are unbounded
		m.setHugePagesUnbounded(containerConfig)

		// check if it exists
		if !cm.Exists(containerName) {
			if err := cm.Create(containerConfig); err != nil {
				return fmt.Errorf("failed to create top level %v QOS cgroup: %v", qosClass, err)
			}
		} else {
			// to ensure we actually have the right state, we update the config on startup
			if err := cm.Update(containerConfig); err != nil {
				return fmt.Errorf("failed to update top level %v QOS cgroup: %v", qosClass, err)
			}
		}
	}

	// Store the top level qos container names
	m.qosContainersInfo = QOSContainersInfo{
		Guaranteed: rootContainer,
		Burstable:  qosClasses[v1.PodQOSBurstable],
		BestEffort: qosClasses[v1.PodQOSBestEffort],
	}
	m.getNodeAllocatable = getNodeAllocatable
	m.activePods = activePods

	// update qos cgroup tiers on startup and at periodic intervals
	// to ensure desired state is in sync with actual state.
	go wait.Until(func() {
		err := m.UpdateCgroups()
		if err != nil {
			klog.Warningf("[ContainerManager] Failed to reserve QoS requests: %v", err)
		}
	}, periodicQOSCgroupUpdateInterval, wait.NeverStop)

	return nil
}
// setHugePagesUnbounded ensures hugetlb is effectively unbounded
func (m *qosContainerManagerImpl) setHugePagesUnbounded(cgroupConfig *CgroupConfig) error {
	hugePageLimit := map[int64]int64{}
	for _, pageSize := range cgroupfs.HugePageSizes {
		pageSizeBytes, err := units.RAMInBytes(pageSize)
		if err != nil {
			return err
		}
		// 1 << 62 is a limit so large it is effectively unbounded
		hugePageLimit[pageSizeBytes] = int64(1 << 62)
	}
	cgroupConfig.ResourceParameters.HugePageLimit = hugePageLimit
	return nil
}
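
// setHugePagesConfig applies the unbounded hugetlb settings to each of the
// given QoS cgroup configs. Note (illustrative, not in the original file):
// runc reports huge page sizes as strings such as "2MB" or "1GB", and
// units.RAMInBytes parses these with binary (1024-based) multipliers, so
// "2MB" yields 2097152 bytes.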
func (m *qosContainerManagerImpl) setHugePagesConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
	for _, v := range configs {
		if err := m.setHugePagesUnbounded(v); err != nil {
			return err
		}
	}
	return nil
}
// setCPUCgroupConfig assigns cpu shares to the QoS tiers: BestEffort is
// pinned to the minimum, while Burstable gets shares proportional to the
// summed CPU requests of all active Burstable pods.
func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
	pods := m.activePods()
	burstablePodCPURequest := int64(0)
	for i := range pods {
		pod := pods[i]
		qosClass := v1qos.GetPodQOS(pod)
		if qosClass != v1.PodQOSBurstable {
			// we only care about the burstable qos tier
			continue
		}
		req, _ := resource.PodRequestsAndLimits(pod)
		if request, found := req[v1.ResourceCPU]; found {
			burstablePodCPURequest += request.MilliValue()
		}
	}

	// make sure best effort is always 2 shares
	bestEffortCPUShares := uint64(MinShares)
	configs[v1.PodQOSBestEffort].ResourceParameters.CpuShares = &bestEffortCPUShares

	// set burstable shares based on the currently observed state
	burstableCPUShares := MilliCPUToShares(burstablePodCPURequest)
	configs[v1.PodQOSBurstable].ResourceParameters.CpuShares = &burstableCPUShares
	return nil
}
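
// Worked example (illustrative, not from the original file): MilliCPUToShares
// converts milli-CPUs to cpu shares as shares = milliCPU * 1024 / 1000, with
// a floor of MinShares (2). Two Burstable pods requesting 250m and 750m of
// CPU sum to 1000m, yielding 1024 shares for the burstable tier; with no
// Burstable pods the tier falls back to the 2-share minimum.
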
// setMemoryReserve sums the memory requests of all pods in a QoS class,
// calculates the QoS class memory limits, and sets those limits in the
// CgroupConfig for each QoS class.
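// Worked example (illustrative, not from the original file): with 10Gi of
// allocatable memory, 2Gi of Guaranteed requests, 4Gi of Burstable requests,
// and a 50% reserve, the Burstable limit is 10Gi - 2Gi*50% = 9Gi and the
// BestEffort limit is 9Gi - 4Gi*50% = 7Gi.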
func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
	qosMemoryRequests := map[v1.PodQOSClass]int64{
		v1.PodQOSGuaranteed: 0,
		v1.PodQOSBurstable:  0,
	}

	// Sum the memory requests of pods in each QoS class
	pods := m.activePods()
	for _, pod := range pods {
		podMemoryRequest := int64(0)
		qosClass := v1qos.GetPodQOS(pod)
		if qosClass == v1.PodQOSBestEffort {
			// limits are not set for Best Effort pods
			continue
		}
		req, _ := resource.PodRequestsAndLimits(pod)
		if request, found := req[v1.ResourceMemory]; found {
			podMemoryRequest += request.Value()
		}
		qosMemoryRequests[qosClass] += podMemoryRequest
	}

	resources := m.getNodeAllocatable()
	allocatableResource, ok := resources[v1.ResourceMemory]
	if !ok {
		klog.V(2).Infof("[Container Manager] Allocatable memory value could not be determined. Not setting QoS memory limits.")
		return
	}
	allocatable := allocatableResource.Value()
	if allocatable == 0 {
		klog.V(2).Infof("[Container Manager] Memory allocatable reported as 0, might be in standalone mode. Not setting QoS memory limits.")
		return
	}

	for qos, requests := range qosMemoryRequests {
		klog.V(2).Infof("[Container Manager] %s pod requests total %d bytes (reserve %d%%)", qos, requests, percentReserve)
	}

	// Calculate QoS memory limits
	burstableLimit := allocatable - (qosMemoryRequests[v1.PodQOSGuaranteed] * percentReserve / 100)
	bestEffortLimit := burstableLimit - (qosMemoryRequests[v1.PodQOSBurstable] * percentReserve / 100)
	configs[v1.PodQOSBurstable].ResourceParameters.Memory = &burstableLimit
	configs[v1.PodQOSBestEffort].ResourceParameters.Memory = &bestEffortLimit
}
// retrySetMemoryReserve checks for any QoS cgroups over the limit
// that was attempted to be set in the first Update() and adjusts
// their memory limit to the usage to prevent further growth.
func (m *qosContainerManagerImpl) retrySetMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
	// Unreclaimable memory usage may already exceed the desired limit.
	// Attempt to set the limit near the current usage to put pressure
	// on the cgroup and prevent further growth.
	for qos, config := range configs {
		stats, err := m.cgroupManager.GetResourceStats(config.Name)
		if err != nil {
			klog.V(2).Infof("[Container Manager] %v", err)
			return
		}
		usage := stats.MemoryStats.Usage

		// Because there is no good way to determine whether the original Update()
		// on the memory resource was successful, we determine failure of the
		// first attempt by checking if the usage is above the limit we attempted
		// to set. If it is, we assume the first attempt to set the limit failed
		// and try again, setting the limit to the usage. Otherwise we leave
		// the CgroupConfig as is.
		if configs[qos].ResourceParameters.Memory != nil && usage > *configs[qos].ResourceParameters.Memory {
			configs[qos].ResourceParameters.Memory = &usage
		}
	}
}
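
// UpdateCgroups recomputes the QoS-level cgroup settings (cpu shares, huge
// page limits and, when the QOSReserved feature gate is enabled, memory
// limits) and applies them. Note (illustrative, not from the original file):
// m.qosReserved is populated from the kubelet's --qos-reserved setting, e.g.
// "memory=50%".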
func (m *qosContainerManagerImpl) UpdateCgroups() error {
	m.Lock()
	defer m.Unlock()

	qosConfigs := map[v1.PodQOSClass]*CgroupConfig{
		v1.PodQOSBurstable: {
			Name:               m.qosContainersInfo.Burstable,
			ResourceParameters: &ResourceConfig{},
		},
		v1.PodQOSBestEffort: {
			Name:               m.qosContainersInfo.BestEffort,
			ResourceParameters: &ResourceConfig{},
		},
	}

	// update the qos level cgroup settings for cpu shares
	if err := m.setCPUCgroupConfig(qosConfigs); err != nil {
		return err
	}

	// update the qos level cgroup settings for huge pages (ensure they remain unbounded)
	if err := m.setHugePagesConfig(qosConfigs); err != nil {
		return err
	}

	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.QOSReserved) {
		for resourceName, percentReserve := range m.qosReserved {
			switch resourceName {
			case v1.ResourceMemory:
				m.setMemoryReserve(qosConfigs, percentReserve)
			}
		}

		updateSuccess := true
		for _, config := range qosConfigs {
			err := m.cgroupManager.Update(config)
			if err != nil {
				updateSuccess = false
			}
		}
		if updateSuccess {
			klog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration")
			return nil
		}

		// If the resource can adjust the ResourceConfig to increase likelihood of
		// success, call the adjustment function here. Otherwise, the Update() will
		// be called again with the same values.
		for resourceName, percentReserve := range m.qosReserved {
			switch resourceName {
			case v1.ResourceMemory:
				m.retrySetMemoryReserve(qosConfigs, percentReserve)
			}
		}
	}

	for _, config := range qosConfigs {
		err := m.cgroupManager.Update(config)
		if err != nil {
			klog.Errorf("[ContainerManager]: Failed to update QoS cgroup configuration")
			return err
		}
	}

	klog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration")
	return nil
}
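
// qosContainerManagerNoop is returned when QoS cgroups are disabled; it
// reports empty container info and performs no cgroup updates.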
type qosContainerManagerNoop struct {
	cgroupRoot CgroupName
}

var _ QOSContainerManager = &qosContainerManagerNoop{}

func (m *qosContainerManagerNoop) GetQOSContainersInfo() QOSContainersInfo {
	return QOSContainersInfo{}
}

func (m *qosContainerManagerNoop) Start(_ func() v1.ResourceList, _ ActivePodsFunc) error {
	return nil
}

func (m *qosContainerManagerNoop) UpdateCgroups() error {
	return nil
}