qos_container_manager_linux.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cm

import (
    "fmt"
    "strings"
    "sync"
    "time"

    "k8s.io/klog"

    "k8s.io/apimachinery/pkg/util/wait"

    units "github.com/docker/go-units"
    cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
    "k8s.io/api/core/v1"
    utilfeature "k8s.io/apiserver/pkg/util/feature"
    "k8s.io/kubernetes/pkg/api/v1/resource"
    v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
    kubefeatures "k8s.io/kubernetes/pkg/features"
)

const (
    // how often the qos cgroup manager will perform a periodic update
    // of the qos level cgroup resource constraints
    periodicQOSCgroupUpdateInterval = 1 * time.Minute
)

// QOSContainerManager manages the top level QoS cgroups (burstable and
// besteffort) and keeps their resource constraints in sync with the
// currently active pods.
type QOSContainerManager interface {
    Start(func() v1.ResourceList, ActivePodsFunc) error
    GetQOSContainersInfo() QOSContainersInfo
    UpdateCgroups() error
}

type qosContainerManagerImpl struct {
    sync.Mutex
    qosContainersInfo  QOSContainersInfo
    subsystems         *CgroupSubsystems
    cgroupManager      CgroupManager
    activePods         ActivePodsFunc
    getNodeAllocatable func() v1.ResourceList
    cgroupRoot         CgroupName
    qosReserved        map[v1.ResourceName]int64
}

// NewQOSContainerManager constructs a QOSContainerManager. When QoS cgroups
// are disabled (nodeConfig.CgroupsPerQOS is false), a no-op implementation
// is returned instead.
func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot CgroupName, nodeConfig NodeConfig, cgroupManager CgroupManager) (QOSContainerManager, error) {
    if !nodeConfig.CgroupsPerQOS {
        return &qosContainerManagerNoop{
            cgroupRoot: cgroupRoot,
        }, nil
    }

    return &qosContainerManagerImpl{
        subsystems:    subsystems,
        cgroupManager: cgroupManager,
        cgroupRoot:    cgroupRoot,
        qosReserved:   nodeConfig.QOSReserved,
    }, nil
}
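
// A minimal usage sketch, assuming the caller already holds the subsystems,
// cgroup manager, and node config that the surrounding container manager
// wires up (the variable names below are illustrative):
//
//     qosManager, err := NewQOSContainerManager(subsystems, cgroupRoot, nodeConfig, cgroupManager)
//     if err != nil {
//         return err
//     }
//     if err := qosManager.Start(getNodeAllocatable, activePods); err != nil {
//         return err
//     }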

func (m *qosContainerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
    return m.qosContainersInfo
}

func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceList, activePods ActivePodsFunc) error {
    cm := m.cgroupManager
    rootContainer := m.cgroupRoot
    if !cm.Exists(rootContainer) {
        return fmt.Errorf("root container %v doesn't exist", rootContainer)
    }

    // Top level QoS cgroups are created only for the Burstable and
    // Best Effort classes; Guaranteed pods live directly under the root.
    qosClasses := map[v1.PodQOSClass]CgroupName{
        v1.PodQOSBurstable:  NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBurstable))),
        v1.PodQOSBestEffort: NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBestEffort))),
    }

    // Create containers for both qos classes
    for qosClass, containerName := range qosClasses {
        resourceParameters := &ResourceConfig{}
        // the BestEffort QoS class has a statically configured minShares value
        if qosClass == v1.PodQOSBestEffort {
            minShares := uint64(MinShares)
            resourceParameters.CpuShares = &minShares
        }

        // containerConfig object stores the cgroup specifications
        containerConfig := &CgroupConfig{
            Name:               containerName,
            ResourceParameters: resourceParameters,
        }

        // for each enumerated huge page size, the qos tiers are unbounded
        if err := m.setHugePagesUnbounded(containerConfig); err != nil {
            return fmt.Errorf("failed to set huge pages unbounded for %v QOS cgroup : %v", qosClass, err)
        }

        // check if it exists
        if !cm.Exists(containerName) {
            if err := cm.Create(containerConfig); err != nil {
                return fmt.Errorf("failed to create top level %v QOS cgroup : %v", qosClass, err)
            }
        } else {
            // to ensure we actually have the right state, we update the config on startup
            if err := cm.Update(containerConfig); err != nil {
                return fmt.Errorf("failed to update top level %v QOS cgroup : %v", qosClass, err)
            }
        }
    }

    // Store the top level qos container names
    m.qosContainersInfo = QOSContainersInfo{
        Guaranteed: rootContainer,
        Burstable:  qosClasses[v1.PodQOSBurstable],
        BestEffort: qosClasses[v1.PodQOSBestEffort],
    }
    m.getNodeAllocatable = getNodeAllocatable
    m.activePods = activePods

    // update qos cgroup tiers on startup and in periodic intervals
    // to ensure desired state is in sync with actual state.
    go wait.Until(func() {
        err := m.UpdateCgroups()
        if err != nil {
            klog.Warningf("[ContainerManager] Failed to reserve QoS requests: %v", err)
        }
    }, periodicQOSCgroupUpdateInterval, wait.NeverStop)

    return nil
}
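
// For illustration: with the common /kubepods root container and the
// cgroupfs driver (both assumptions; actual paths depend on the configured
// cgroup driver and root), the hierarchy created by Start() looks like:
//
//     /sys/fs/cgroup/cpu/kubepods            <- Guaranteed pods (root container)
//     /sys/fs/cgroup/cpu/kubepods/burstable  <- Burstable tier
//     /sys/fs/cgroup/cpu/kubepods/besteffort <- BestEffort tier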

// setHugePagesUnbounded ensures hugetlb is effectively unbounded
func (m *qosContainerManagerImpl) setHugePagesUnbounded(cgroupConfig *CgroupConfig) error {
    hugePageLimit := map[int64]int64{}
    for _, pageSize := range cgroupfs.HugePageSizes {
        pageSizeBytes, err := units.RAMInBytes(pageSize)
        if err != nil {
            return err
        }
        // 1 << 62 (4 EiB) is an arbitrarily large value that still fits in a
        // positive int64, so the hugetlb limit is never hit in practice.
        hugePageLimit[pageSizeBytes] = int64(1 << 62)
    }
    cgroupConfig.ResourceParameters.HugePageLimit = hugePageLimit
    return nil
}

// setHugePagesConfig applies the unbounded hugetlb settings to every QOS
// class config.
func (m *qosContainerManagerImpl) setHugePagesConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
    for _, v := range configs {
        if err := m.setHugePagesUnbounded(v); err != nil {
            return err
        }
    }
    return nil
}

// setCPUCgroupConfig computes cpu.shares for the Burstable and BestEffort
// tiers from the currently active pods.
func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
    pods := m.activePods()
    burstablePodCPURequest := int64(0)
    for i := range pods {
        pod := pods[i]
        qosClass := v1qos.GetPodQOS(pod)
        if qosClass != v1.PodQOSBurstable {
            // we only care about the burstable qos tier
            continue
        }
        req, _ := resource.PodRequestsAndLimits(pod)
        if request, found := req[v1.ResourceCPU]; found {
            burstablePodCPURequest += request.MilliValue()
        }
    }

    // make sure best effort is always 2 shares
    bestEffortCPUShares := uint64(MinShares)
    configs[v1.PodQOSBestEffort].ResourceParameters.CpuShares = &bestEffortCPUShares

    // set burstable shares based on the currently observed state
    burstableCPUShares := MilliCPUToShares(burstablePodCPURequest)
    configs[v1.PodQOSBurstable].ResourceParameters.CpuShares = &burstableCPUShares
    return nil
}
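
// A worked example of the share math above, assuming MilliCPUToShares uses
// the usual kubelet conversion shares = milliCPU * 1024 / 1000, floored at
// MinShares (2): if the active burstable pods request 500m, 250m, and 750m
// of CPU, then
//
//     burstablePodCPURequest = 500 + 250 + 750    // = 1500 millicores
//     burstableCPUShares     = 1500 * 1024 / 1000 // = 1536 cpu.shares
//
// while the besteffort tier stays pinned at 2 shares regardless of load.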

// setMemoryReserve sums the memory requests of all pods in a QOS class,
// calculates the QOS class memory limits, and sets those limits in the
// CgroupConfig for each QOS class.
func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
    qosMemoryRequests := map[v1.PodQOSClass]int64{
        v1.PodQOSGuaranteed: 0,
        v1.PodQOSBurstable:  0,
    }

    // Sum the pod memory requests for pods in each QOS class
    pods := m.activePods()
    for _, pod := range pods {
        podMemoryRequest := int64(0)
        qosClass := v1qos.GetPodQOS(pod)
        if qosClass == v1.PodQOSBestEffort {
            // memory requests are not set for Best Effort pods
            continue
        }
        req, _ := resource.PodRequestsAndLimits(pod)
        if request, found := req[v1.ResourceMemory]; found {
            podMemoryRequest += request.Value()
        }
        qosMemoryRequests[qosClass] += podMemoryRequest
    }

    resources := m.getNodeAllocatable()
    allocatableResource, ok := resources[v1.ResourceMemory]
    if !ok {
        klog.V(2).Infof("[Container Manager] Allocatable memory value could not be determined. Not setting QOS memory limits.")
        return
    }
    allocatable := allocatableResource.Value()
    if allocatable == 0 {
        klog.V(2).Infof("[Container Manager] Memory allocatable reported as 0, might be in standalone mode. Not setting QOS memory limits.")
        return
    }

    for qos, limits := range qosMemoryRequests {
        klog.V(2).Infof("[Container Manager] %s pod requests total %d bytes (reserve %d%%)", qos, limits, percentReserve)
    }

    // Calculate QOS memory limits
    burstableLimit := allocatable - (qosMemoryRequests[v1.PodQOSGuaranteed] * percentReserve / 100)
    bestEffortLimit := burstableLimit - (qosMemoryRequests[v1.PodQOSBurstable] * percentReserve / 100)
    configs[v1.PodQOSBurstable].ResourceParameters.Memory = &burstableLimit
    configs[v1.PodQOSBestEffort].ResourceParameters.Memory = &bestEffortLimit
}
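
// A worked example of the reserve math above (numbers are illustrative):
// with 10Gi allocatable, 4Gi of Guaranteed memory requests, 2Gi of
// Burstable memory requests, and percentReserve = 50,
//
//     burstableLimit  = 10Gi - (4Gi * 50 / 100) // = 8Gi
//     bestEffortLimit =  8Gi - (2Gi * 50 / 100) // = 7Gi
//
// so each lower QoS tier is squeezed in proportion to what the tiers above
// it have requested.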

// retrySetMemoryReserve checks for any QoS cgroups over the limit
// that was attempted to be set in the first Update() and adjusts
// their memory limit to the current usage to prevent further growth.
func (m *qosContainerManagerImpl) retrySetMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
    // Unreclaimable memory usage may have already exceeded the desired limit.
    // Attempt to set the limit near the current usage to put pressure
    // on the cgroup and prevent further growth.
    for qos, config := range configs {
        stats, err := m.cgroupManager.GetResourceStats(config.Name)
        if err != nil {
            klog.V(2).Infof("[Container Manager] %v", err)
            return
        }
        usage := stats.MemoryStats.Usage

        // Because there is no good way to determine if the original Update()
        // on the memory resource was successful, we determine failure of the
        // first attempt by checking if the usage is above the limit we attempted
        // to set. If it is, we assume the first attempt to set the limit failed
        // and try again, setting the limit to the usage. Otherwise we leave
        // the CgroupConfig as is.
        if configs[qos].ResourceParameters.Memory != nil && usage > *configs[qos].ResourceParameters.Memory {
            configs[qos].ResourceParameters.Memory = &usage
        }
    }
}
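
// For example (sizes are illustrative): if the first Update() tried to set
// the burstable limit to 8Gi but the cgroup's current usage is already 9Gi,
// the kernel cannot shrink the limit below unreclaimable usage and the write
// fails; the retry rewrites the config to Memory = 9Gi, which the kernel
// accepts and which caps any further growth.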

func (m *qosContainerManagerImpl) UpdateCgroups() error {
    m.Lock()
    defer m.Unlock()

    qosConfigs := map[v1.PodQOSClass]*CgroupConfig{
        v1.PodQOSBurstable: {
            Name:               m.qosContainersInfo.Burstable,
            ResourceParameters: &ResourceConfig{},
        },
        v1.PodQOSBestEffort: {
            Name:               m.qosContainersInfo.BestEffort,
            ResourceParameters: &ResourceConfig{},
        },
    }

    // update the qos level cgroup settings for cpu shares
    if err := m.setCPUCgroupConfig(qosConfigs); err != nil {
        return err
    }

    // update the qos level cgroup settings for huge pages (ensure they remain unbounded)
    if err := m.setHugePagesConfig(qosConfigs); err != nil {
        return err
    }

    if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.QOSReserved) {
        for resource, percentReserve := range m.qosReserved {
            switch resource {
            case v1.ResourceMemory:
                m.setMemoryReserve(qosConfigs, percentReserve)
            }
        }

        updateSuccess := true
        for _, config := range qosConfigs {
            err := m.cgroupManager.Update(config)
            if err != nil {
                updateSuccess = false
            }
        }
        if updateSuccess {
            klog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration")
            return nil
        }

        // If the resource can adjust the ResourceConfig to increase likelihood of
        // success, call the adjustment function here. Otherwise, the Update() will
        // be called again with the same values.
        for resource, percentReserve := range m.qosReserved {
            switch resource {
            case v1.ResourceMemory:
                m.retrySetMemoryReserve(qosConfigs, percentReserve)
            }
        }
    }

    for _, config := range qosConfigs {
        err := m.cgroupManager.Update(config)
        if err != nil {
            klog.Errorf("[ContainerManager]: Failed to update QoS cgroup configuration")
            return err
        }
    }

    klog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration")
    return nil
}
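
// For illustration, the QOSReserved path above is typically exercised via
// the kubelet's --qos-reserved flag together with the QOSReserved feature
// gate (flag name and format are per the kubelet's alpha QoS reservation
// support, stated here as an assumption), e.g.:
//
//     kubelet --feature-gates=QOSReserved=true --qos-reserved=memory=50%
//
// which populates m.qosReserved with a 50% memory reservation and triggers
// the setMemoryReserve/retrySetMemoryReserve flow above.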

type qosContainerManagerNoop struct {
    cgroupRoot CgroupName
}

var _ QOSContainerManager = &qosContainerManagerNoop{}

func (m *qosContainerManagerNoop) GetQOSContainersInfo() QOSContainersInfo {
    return QOSContainersInfo{}
}

func (m *qosContainerManagerNoop) Start(_ func() v1.ResourceList, _ ActivePodsFunc) error {
    return nil
}

func (m *qosContainerManagerNoop) UpdateCgroups() error {
    return nil
}