topology_manager_test.go

/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2enode

import (
	"fmt"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
	"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
	"k8s.io/kubernetes/test/e2e/framework"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"

	"github.com/onsi/ginkgo"
	"github.com/onsi/gomega"
)
// Helper for makeTopologyManagerPod().
type tmCtnAttribute struct {
	ctnName    string
	cpuRequest string
	cpuLimit   string
}
// makeTopologyManagerPod returns a pod with the provided tmCtnAttributes.
func makeTopologyManagerPod(podName string, tmCtnAttributes []tmCtnAttribute) *v1.Pod {
	var containers []v1.Container
	for _, ctnAttr := range tmCtnAttributes {
		// Print the CPUs the container is allowed to run on, then sleep so
		// the container stays alive long enough for its log to be checked.
		cpusetCmd := "grep Cpus_allowed_list /proc/self/status | cut -f2 && sleep 1d"
		ctn := v1.Container{
			Name:  ctnAttr.ctnName,
			Image: busyboxImage,
			Resources: v1.ResourceRequirements{
				Requests: v1.ResourceList{
					v1.ResourceName(v1.ResourceCPU):    resource.MustParse(ctnAttr.cpuRequest),
					v1.ResourceName(v1.ResourceMemory): resource.MustParse("100Mi"),
				},
				Limits: v1.ResourceList{
					v1.ResourceName(v1.ResourceCPU):    resource.MustParse(ctnAttr.cpuLimit),
					v1.ResourceName(v1.ResourceMemory): resource.MustParse("100Mi"),
				},
			},
			Command: []string{"sh", "-c", cpusetCmd},
		}
		containers = append(containers, ctn)
	}
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name: podName,
		},
		Spec: v1.PodSpec{
			RestartPolicy: v1.RestartPolicyNever,
			Containers:    containers,
		},
	}
}
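
// configureTopologyManagerInKubelet restarts the kubelet with the given
// Topology Manager policy. The static CPU Manager is enabled alongside it,
// since Topology Manager relies on CPU Manager as a hint provider for
// exclusive CPU assignment.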
func configureTopologyManagerInKubelet(f *framework.Framework, policy string) {
	oldCfg, err := getCurrentKubeletConfig()
	framework.ExpectNoError(err)

	newCfg := oldCfg.DeepCopy()
	if newCfg.FeatureGates == nil {
		newCfg.FeatureGates = make(map[string]bool)
	}
	newCfg.FeatureGates["CPUManager"] = true
	newCfg.FeatureGates["TopologyManager"] = true

	deleteStateFile()

	// Set the Topology Manager policy.
	newCfg.TopologyManagerPolicy = policy

	// Set the CPU Manager policy to static.
	newCfg.CPUManagerPolicy = string(cpumanager.PolicyStatic)

	// Set the CPU Manager reconcile period to 1 second.
	newCfg.CPUManagerReconcilePeriod = metav1.Duration{Duration: 1 * time.Second}

	// The Kubelet panics if either kube-reserved or system-reserved is not set
	// when CPU Manager is enabled. Set cpu in kube-reserved > 0 so that
	// kubelet doesn't panic.
	if newCfg.KubeReserved == nil {
		newCfg.KubeReserved = map[string]string{}
	}
	if _, ok := newCfg.KubeReserved["cpu"]; !ok {
		newCfg.KubeReserved["cpu"] = "200m"
	}

	// Log the new config for debugging.
	framework.Logf("new kubelet config: %+v", *newCfg)

	// Update the Kubelet configuration.
	framework.ExpectNoError(setKubeletConfiguration(f, newCfg))

	// Wait for the Kubelet to be ready.
	gomega.Eventually(func() bool {
		nodes, err := e2enode.TotalReady(f.ClientSet)
		framework.ExpectNoError(err)
		return nodes == 1
	}, time.Minute, time.Second).Should(gomega.BeTrue())
}
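
// runTopologyManagerSuiteTests exercises pod placement under the currently
// configured Topology Manager policy: a non-guaranteed pod, guaranteed pods
// with one and multiple containers, mixed workloads, and multiple guaranteed
// pods, verifying the cpuset assigned to each container.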
func runTopologyManagerSuiteTests(f *framework.Framework) {
	var cpuCap, cpuAlloc int64
	var cpuListString, expAllowedCPUsListRegex string
	var cpuList []int
	var cpu1, cpu2 int
	var cset cpuset.CPUSet
	var err error
	var ctnAttrs []tmCtnAttribute
	var pod, pod1, pod2 *v1.Pod

	cpuCap, cpuAlloc, _ = getLocalNodeCPUDetails(f)

	ginkgo.By("running a non-Gu pod")
	ctnAttrs = []tmCtnAttribute{
		{
			ctnName:    "non-gu-container",
			cpuRequest: "100m",
			cpuLimit:   "200m",
		},
	}
	pod = makeTopologyManagerPod("non-gu-pod", ctnAttrs)
	pod = f.PodClient().CreateSync(pod)

	ginkgo.By("checking if the expected cpuset was assigned")
	expAllowedCPUsListRegex = fmt.Sprintf("^0-%d\n$", cpuCap-1)
	err = f.PodClient().MatchContainerOutput(pod.Name, pod.Spec.Containers[0].Name, expAllowedCPUsListRegex)
	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
		pod.Spec.Containers[0].Name, pod.Name)

	ginkgo.By("by deleting the pods and waiting for container removal")
	deletePods(f, []string{pod.Name})
	waitForContainerRemoval(pod.Spec.Containers[0].Name, pod.Name, pod.Namespace)

	ginkgo.By("running a Gu pod")
	ctnAttrs = []tmCtnAttribute{
		{
			ctnName:    "gu-container",
			cpuRequest: "1000m",
			cpuLimit:   "1000m",
		},
	}
	pod = makeTopologyManagerPod("gu-pod", ctnAttrs)
	pod = f.PodClient().CreateSync(pod)

	ginkgo.By("checking if the expected cpuset was assigned")
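	// CPU 0 is reserved for the system via kube-reserved, so without
	// hyperthreading the first exclusively allocatable CPU is CPU 1. With
	// hyperthreading, the static CPU Manager is expected to hand out
	// CPU 0's hyperthread sibling first.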
	cpu1 = 1
	if isHTEnabled() {
		cpuList = cpuset.MustParse(getCPUSiblingList(0)).ToSlice()
		cpu1 = cpuList[1]
	}
	expAllowedCPUsListRegex = fmt.Sprintf("^%d\n$", cpu1)
	err = f.PodClient().MatchContainerOutput(pod.Name, pod.Spec.Containers[0].Name, expAllowedCPUsListRegex)
	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
		pod.Spec.Containers[0].Name, pod.Name)

	ginkgo.By("by deleting the pods and waiting for container removal")
	deletePods(f, []string{pod.Name})
	waitForContainerRemoval(pod.Spec.Containers[0].Name, pod.Name, pod.Namespace)
	ginkgo.By("running multiple Gu and non-Gu pods")
	ctnAttrs = []tmCtnAttribute{
		{
			ctnName:    "gu-container",
			cpuRequest: "1000m",
			cpuLimit:   "1000m",
		},
	}
	pod1 = makeTopologyManagerPod("gu-pod", ctnAttrs)
	pod1 = f.PodClient().CreateSync(pod1)

	ctnAttrs = []tmCtnAttribute{
		{
			ctnName:    "non-gu-container",
			cpuRequest: "200m",
			cpuLimit:   "300m",
		},
	}
	pod2 = makeTopologyManagerPod("non-gu-pod", ctnAttrs)
	pod2 = f.PodClient().CreateSync(pod2)

	ginkgo.By("checking if the expected cpuset was assigned")
	cpu1 = 1
	if isHTEnabled() {
		cpuList = cpuset.MustParse(getCPUSiblingList(0)).ToSlice()
		cpu1 = cpuList[1]
	}
	expAllowedCPUsListRegex = fmt.Sprintf("^%d\n$", cpu1)
	err = f.PodClient().MatchContainerOutput(pod1.Name, pod1.Spec.Containers[0].Name, expAllowedCPUsListRegex)
	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
		pod1.Spec.Containers[0].Name, pod1.Name)
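
	// The non-guaranteed pod runs in the shared pool: every CPU except the
	// one exclusively assigned to the guaranteed container.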
	cpuListString = "0"
	if cpuAlloc > 2 {
		cset = cpuset.MustParse(fmt.Sprintf("0-%d", cpuCap-1))
		cpuListString = cset.Difference(cpuset.NewCPUSet(cpu1)).String()
	}
	expAllowedCPUsListRegex = fmt.Sprintf("^%s\n$", cpuListString)
	err = f.PodClient().MatchContainerOutput(pod2.Name, pod2.Spec.Containers[0].Name, expAllowedCPUsListRegex)
	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
		pod2.Spec.Containers[0].Name, pod2.Name)

	ginkgo.By("by deleting the pods and waiting for container removal")
	deletePods(f, []string{pod1.Name, pod2.Name})
	waitForContainerRemoval(pod1.Spec.Containers[0].Name, pod1.Name, pod1.Namespace)
	waitForContainerRemoval(pod2.Spec.Containers[0].Name, pod2.Name, pod2.Namespace)
	// Skip the remaining tests if the node has fewer than 3 CPUs.
	if cpuCap < 3 {
		e2eskipper.Skipf("Skipping rest of the Topology Manager tests since CPU capacity < 3")
	}
	ginkgo.By("running a Gu pod requesting multiple CPUs")
	ctnAttrs = []tmCtnAttribute{
		{
			ctnName:    "gu-container",
			cpuRequest: "2000m",
			cpuLimit:   "2000m",
		},
	}
	pod = makeTopologyManagerPod("gu-pod", ctnAttrs)
	pod = f.PodClient().CreateSync(pod)

	ginkgo.By("checking if the expected cpuset was assigned")
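	// With hyperthreading, a guaranteed container requesting 2 CPUs should
	// receive both hyperthreads of a single physical core.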
	cpuListString = "1-2"
	if isHTEnabled() {
		cpuListString = "2-3"
		cpuList = cpuset.MustParse(getCPUSiblingList(0)).ToSlice()
		if cpuList[1] != 1 {
			cset = cpuset.MustParse(getCPUSiblingList(1))
			cpuListString = cset.String()
		}
	}
	expAllowedCPUsListRegex = fmt.Sprintf("^%s\n$", cpuListString)
	err = f.PodClient().MatchContainerOutput(pod.Name, pod.Spec.Containers[0].Name, expAllowedCPUsListRegex)
	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
		pod.Spec.Containers[0].Name, pod.Name)

	ginkgo.By("by deleting the pods and waiting for container removal")
	deletePods(f, []string{pod.Name})
	waitForContainerRemoval(pod.Spec.Containers[0].Name, pod.Name, pod.Namespace)
	ginkgo.By("running a Gu pod with multiple containers requesting integer CPUs")
	ctnAttrs = []tmCtnAttribute{
		{
			ctnName:    "gu-container1",
			cpuRequest: "1000m",
			cpuLimit:   "1000m",
		},
		{
			ctnName:    "gu-container2",
			cpuRequest: "1000m",
			cpuLimit:   "1000m",
		},
	}
	pod = makeTopologyManagerPod("gu-pod", ctnAttrs)
	pod = f.PodClient().CreateSync(pod)

	ginkgo.By("checking if the expected cpuset was assigned")
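	// Each container should get one exclusive CPU; either container may be
	// assigned either CPU, so match each container's output against both.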
	cpu1, cpu2 = 1, 2
	if isHTEnabled() {
		cpuList = cpuset.MustParse(getCPUSiblingList(0)).ToSlice()
		if cpuList[1] != 1 {
			cpu1, cpu2 = cpuList[1], 1
		}
	}
	// Group the alternation so the ^ and \n$ anchors apply to both
	// alternatives, not just the first and last.
	expAllowedCPUsListRegex = fmt.Sprintf("^(%d|%d)\n$", cpu1, cpu2)
	err = f.PodClient().MatchContainerOutput(pod.Name, pod.Spec.Containers[0].Name, expAllowedCPUsListRegex)
	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
		pod.Spec.Containers[0].Name, pod.Name)
	err = f.PodClient().MatchContainerOutput(pod.Name, pod.Spec.Containers[1].Name, expAllowedCPUsListRegex)
	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
		pod.Spec.Containers[1].Name, pod.Name)

	ginkgo.By("by deleting the pods and waiting for container removal")
	deletePods(f, []string{pod.Name})
	waitForContainerRemoval(pod.Spec.Containers[0].Name, pod.Name, pod.Namespace)
	waitForContainerRemoval(pod.Spec.Containers[1].Name, pod.Name, pod.Namespace)
	ginkgo.By("running multiple Gu pods")
	ctnAttrs = []tmCtnAttribute{
		{
			ctnName:    "gu-container1",
			cpuRequest: "1000m",
			cpuLimit:   "1000m",
		},
	}
	pod1 = makeTopologyManagerPod("gu-pod1", ctnAttrs)
	pod1 = f.PodClient().CreateSync(pod1)

	ctnAttrs = []tmCtnAttribute{
		{
			ctnName:    "gu-container2",
			cpuRequest: "1000m",
			cpuLimit:   "1000m",
		},
	}
	pod2 = makeTopologyManagerPod("gu-pod2", ctnAttrs)
	pod2 = f.PodClient().CreateSync(pod2)

	ginkgo.By("checking if the expected cpuset was assigned")
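	// The two guaranteed pods should receive distinct exclusive CPUs.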
	cpu1, cpu2 = 1, 2
	if isHTEnabled() {
		cpuList = cpuset.MustParse(getCPUSiblingList(0)).ToSlice()
		if cpuList[1] != 1 {
			cpu1, cpu2 = cpuList[1], 1
		}
	}
	expAllowedCPUsListRegex = fmt.Sprintf("^%d\n$", cpu1)
	err = f.PodClient().MatchContainerOutput(pod1.Name, pod1.Spec.Containers[0].Name, expAllowedCPUsListRegex)
	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
		pod1.Spec.Containers[0].Name, pod1.Name)

	expAllowedCPUsListRegex = fmt.Sprintf("^%d\n$", cpu2)
	err = f.PodClient().MatchContainerOutput(pod2.Name, pod2.Spec.Containers[0].Name, expAllowedCPUsListRegex)
	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
		pod2.Spec.Containers[0].Name, pod2.Name)

	ginkgo.By("by deleting the pods and waiting for container removal")
	deletePods(f, []string{pod1.Name, pod2.Name})
	waitForContainerRemoval(pod1.Spec.Containers[0].Name, pod1.Name, pod1.Namespace)
	waitForContainerRemoval(pod2.Spec.Containers[0].Name, pod2.Name, pod2.Namespace)
}
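
// runTopologyManagerTests registers a spec that runs the suite once per
// Topology Manager policy and restores the original kubelet config afterwards.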
func runTopologyManagerTests(f *framework.Framework) {
	var oldCfg *kubeletconfig.KubeletConfiguration

	ginkgo.It("run Topology Manager test suite", func() {
		// Remember the current kubelet config so it can be restored below.
		var err error
		oldCfg, err = getCurrentKubeletConfig()
		framework.ExpectNoError(err)

		var policies = []string{topologymanager.PolicySingleNumaNode, topologymanager.PolicyRestricted,
			topologymanager.PolicyBestEffort, topologymanager.PolicyNone}

		for _, policy := range policies {
			// Configure Topology Manager.
			ginkgo.By(fmt.Sprintf("by configuring Topology Manager policy to %s", policy))
			configureTopologyManagerInKubelet(f, policy)
			// Run the tests.
			runTopologyManagerSuiteTests(f)
		}

		// Restore the kubelet config.
		setOldKubeletConfig(f, oldCfg)
		// Delete the state file to allow repeated runs.
		deleteStateFile()
	})
}
// Serial because the test updates kubelet configuration.
var _ = SIGDescribe("Topology Manager [Serial] [Feature:TopologyManager][NodeAlphaFeature:TopologyManager]", func() {
	f := framework.NewDefaultFramework("topology-manager-test")

	ginkgo.Context("With kubeconfig updated to static CPU Manager policy run the Topology Manager tests", func() {
		runTopologyManagerTests(f)
	})
})