cluster_size_autoscaling.go 72 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935
  1. /*
  2. Copyright 2016 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package autoscaling
  14. import (
  15. "fmt"
  16. "io/ioutil"
  17. "math"
  18. "net/http"
  19. "os"
  20. "os/exec"
  21. "regexp"
  22. "strconv"
  23. "strings"
  24. "time"
  25. "k8s.io/api/core/v1"
  26. policy "k8s.io/api/policy/v1beta1"
  27. schedulerapi "k8s.io/api/scheduling/v1"
  28. "k8s.io/apimachinery/pkg/api/errors"
  29. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  30. "k8s.io/apimachinery/pkg/fields"
  31. "k8s.io/apimachinery/pkg/labels"
  32. utilerrors "k8s.io/apimachinery/pkg/util/errors"
  33. "k8s.io/apimachinery/pkg/util/intstr"
  34. "k8s.io/apimachinery/pkg/util/sets"
  35. "k8s.io/apimachinery/pkg/util/uuid"
  36. "k8s.io/apimachinery/pkg/util/wait"
  37. clientset "k8s.io/client-go/kubernetes"
  38. api "k8s.io/kubernetes/pkg/apis/core"
  39. "k8s.io/kubernetes/test/e2e/framework"
  40. e2elog "k8s.io/kubernetes/test/e2e/framework/log"
  41. "k8s.io/kubernetes/test/e2e/scheduling"
  42. testutils "k8s.io/kubernetes/test/utils"
  43. imageutils "k8s.io/kubernetes/test/utils/image"
  44. "github.com/onsi/ginkgo"
  45. "github.com/onsi/gomega"
  46. "k8s.io/klog"
  47. )
  48. const (
  49. defaultTimeout = 3 * time.Minute
  50. resizeTimeout = 5 * time.Minute
  51. manualResizeTimeout = 6 * time.Minute
  52. scaleUpTimeout = 5 * time.Minute
  53. scaleUpTriggerTimeout = 2 * time.Minute
  54. scaleDownTimeout = 20 * time.Minute
  55. podTimeout = 2 * time.Minute
  56. nodesRecoverTimeout = 5 * time.Minute
  57. rcCreationRetryTimeout = 4 * time.Minute
  58. rcCreationRetryDelay = 20 * time.Second
  59. makeSchedulableTimeout = 10 * time.Minute
  60. makeSchedulableDelay = 20 * time.Second
  61. freshStatusLimit = 20 * time.Second
  62. gkeUpdateTimeout = 15 * time.Minute
  63. gkeNodepoolNameKey = "cloud.google.com/gke-nodepool"
  64. disabledTaint = "DisabledForAutoscalingTest"
  65. criticalAddonsOnlyTaint = "CriticalAddonsOnly"
  66. newNodesForScaledownTests = 2
  67. unhealthyClusterThreshold = 4
  68. caNoScaleUpStatus = "NoActivity"
  69. caOngoingScaleUpStatus = "InProgress"
  70. timestampFormat = "2006-01-02 15:04:05 -0700 MST"
  71. expendablePriorityClassName = "expendable-priority"
  72. highPriorityClassName = "high-priority"
  73. gpuLabel = "cloud.google.com/gke-accelerator"
  74. )
  75. var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
  76. f := framework.NewDefaultFramework("autoscaling")
  77. var c clientset.Interface
  78. var nodeCount int
  79. var coreCount int64
  80. var memAllocatableMb int
  81. var originalSizes map[string]int
  82. ginkgo.BeforeEach(func() {
  83. c = f.ClientSet
  84. framework.SkipUnlessProviderIs("gce", "gke")
  85. originalSizes = make(map[string]int)
  86. sum := 0
  87. for _, mig := range strings.Split(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") {
  88. size, err := framework.GroupSize(mig)
  89. framework.ExpectNoError(err)
  90. ginkgo.By(fmt.Sprintf("Initial size of %s: %d", mig, size))
  91. originalSizes[mig] = size
  92. sum += size
  93. }
  94. // Give instances time to spin up
  95. framework.ExpectNoError(framework.WaitForReadyNodes(c, sum, scaleUpTimeout))
  96. nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
  97. nodeCount = len(nodes.Items)
  98. coreCount = 0
  99. for _, node := range nodes.Items {
  100. quantity := node.Status.Allocatable[v1.ResourceCPU]
  101. coreCount += quantity.Value()
  102. }
  103. ginkgo.By(fmt.Sprintf("Initial number of schedulable nodes: %v", nodeCount))
  104. gomega.Expect(nodeCount).NotTo(gomega.BeZero())
  105. mem := nodes.Items[0].Status.Allocatable[v1.ResourceMemory]
  106. memAllocatableMb = int((&mem).Value() / 1024 / 1024)
  107. gomega.Expect(nodeCount).Should(gomega.Equal(sum))
  108. if framework.ProviderIs("gke") {
  109. val, err := isAutoscalerEnabled(5)
  110. framework.ExpectNoError(err)
  111. if !val {
  112. err = enableAutoscaler("default-pool", 3, 5)
  113. framework.ExpectNoError(err)
  114. }
  115. }
  116. })
  117. ginkgo.AfterEach(func() {
  118. framework.SkipUnlessProviderIs("gce", "gke")
  119. ginkgo.By(fmt.Sprintf("Restoring initial size of the cluster"))
  120. setMigSizes(originalSizes)
  121. expectedNodes := 0
  122. for _, size := range originalSizes {
  123. expectedNodes += size
  124. }
  125. framework.ExpectNoError(framework.WaitForReadyNodes(c, expectedNodes, scaleDownTimeout))
  126. nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
  127. framework.ExpectNoError(err)
  128. s := time.Now()
  129. makeSchedulableLoop:
  130. for start := time.Now(); time.Since(start) < makeSchedulableTimeout; time.Sleep(makeSchedulableDelay) {
  131. for _, n := range nodes.Items {
  132. err = makeNodeSchedulable(c, &n, true)
  133. switch err.(type) {
  134. case CriticalAddonsOnlyError:
  135. continue makeSchedulableLoop
  136. default:
  137. framework.ExpectNoError(err)
  138. }
  139. }
  140. break
  141. }
  142. klog.Infof("Made nodes schedulable again in %v", time.Since(s).String())
  143. })
  144. ginkgo.It("shouldn't increase cluster size if pending pod is too large [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  145. ginkgo.By("Creating unschedulable pod")
  146. ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memAllocatableMb)), false, defaultTimeout)
  147. defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "memory-reservation")
  148. ginkgo.By("Waiting for scale up hoping it won't happen")
  149. // Verify that the appropriate event was generated
  150. eventFound := false
  151. EventsLoop:
  152. for start := time.Now(); time.Since(start) < scaleUpTimeout; time.Sleep(20 * time.Second) {
  153. ginkgo.By("Waiting for NotTriggerScaleUp event")
  154. events, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(metav1.ListOptions{})
  155. framework.ExpectNoError(err)
  156. for _, e := range events.Items {
  157. if e.InvolvedObject.Kind == "Pod" && e.Reason == "NotTriggerScaleUp" && strings.Contains(e.Message, "it wouldn't fit if a new node is added") {
  158. ginkgo.By("NotTriggerScaleUp event found")
  159. eventFound = true
  160. break EventsLoop
  161. }
  162. }
  163. }
  164. gomega.Expect(eventFound).Should(gomega.Equal(true))
  165. // Verify that cluster size is not changed
  166. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  167. func(size int) bool { return size <= nodeCount }, time.Second))
  168. })
  169. simpleScaleUpTest := func(unready int) {
  170. ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second)
  171. defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "memory-reservation")
  172. // Verify that cluster size is increased
  173. framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(f.ClientSet,
  174. func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout, unready))
  175. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  176. }
  177. ginkgo.It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]",
  178. func() { simpleScaleUpTest(0) })
  179. gpuType := os.Getenv("TESTED_GPU_TYPE")
  180. ginkgo.It(fmt.Sprintf("Should scale up GPU pool from 0 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
  181. framework.SkipUnlessProviderIs("gke")
  182. if gpuType == "" {
  183. framework.Failf("TEST_GPU_TYPE not defined")
  184. return
  185. }
  186. const gpuPoolName = "gpu-pool"
  187. addGpuNodePool(gpuPoolName, gpuType, 1, 0)
  188. defer deleteNodePool(gpuPoolName)
  189. installNvidiaDriversDaemonSet()
  190. ginkgo.By("Enable autoscaler")
  191. framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
  192. defer disableAutoscaler(gpuPoolName, 0, 1)
  193. gomega.Expect(len(getPoolNodes(f, gpuPoolName))).Should(gomega.Equal(0))
  194. ginkgo.By("Schedule a pod which requires GPU")
  195. framework.ExpectNoError(ScheduleAnySingleGpuPod(f, "gpu-pod-rc"))
  196. defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "gpu-pod-rc")
  197. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  198. func(size int) bool { return size == nodeCount+1 }, scaleUpTimeout))
  199. gomega.Expect(len(getPoolNodes(f, gpuPoolName))).Should(gomega.Equal(1))
  200. })
  201. ginkgo.It(fmt.Sprintf("Should scale up GPU pool from 1 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
  202. framework.SkipUnlessProviderIs("gke")
  203. if gpuType == "" {
  204. framework.Failf("TEST_GPU_TYPE not defined")
  205. return
  206. }
  207. const gpuPoolName = "gpu-pool"
  208. addGpuNodePool(gpuPoolName, gpuType, 1, 1)
  209. defer deleteNodePool(gpuPoolName)
  210. installNvidiaDriversDaemonSet()
  211. ginkgo.By("Schedule a single pod which requires GPU")
  212. framework.ExpectNoError(ScheduleAnySingleGpuPod(f, "gpu-pod-rc"))
  213. defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "gpu-pod-rc")
  214. ginkgo.By("Enable autoscaler")
  215. framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 2))
  216. defer disableAutoscaler(gpuPoolName, 0, 2)
  217. gomega.Expect(len(getPoolNodes(f, gpuPoolName))).Should(gomega.Equal(1))
  218. ginkgo.By("Scale GPU deployment")
  219. framework.ScaleRC(f.ClientSet, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc", 2, true)
  220. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  221. func(size int) bool { return size == nodeCount+2 }, scaleUpTimeout))
  222. gomega.Expect(len(getPoolNodes(f, gpuPoolName))).Should(gomega.Equal(2))
  223. })
  224. ginkgo.It(fmt.Sprintf("Should not scale GPU pool up if pod does not require GPUs [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
  225. framework.SkipUnlessProviderIs("gke")
  226. if gpuType == "" {
  227. framework.Failf("TEST_GPU_TYPE not defined")
  228. return
  229. }
  230. const gpuPoolName = "gpu-pool"
  231. addGpuNodePool(gpuPoolName, gpuType, 1, 0)
  232. defer deleteNodePool(gpuPoolName)
  233. installNvidiaDriversDaemonSet()
  234. ginkgo.By("Enable autoscaler")
  235. framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
  236. defer disableAutoscaler(gpuPoolName, 0, 1)
  237. gomega.Expect(len(getPoolNodes(f, gpuPoolName))).Should(gomega.Equal(0))
  238. ginkgo.By("Schedule bunch of pods beyond point of filling default pool but do not request any GPUs")
  239. ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second)
  240. defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "memory-reservation")
  241. // Verify that cluster size is increased
  242. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  243. func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout))
  244. // Expect gpu pool to stay intact
  245. gomega.Expect(len(getPoolNodes(f, gpuPoolName))).Should(gomega.Equal(0))
  246. })
  247. ginkgo.It(fmt.Sprintf("Should scale down GPU pool from 1 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
  248. framework.SkipUnlessProviderIs("gke")
  249. if gpuType == "" {
  250. framework.Failf("TEST_GPU_TYPE not defined")
  251. return
  252. }
  253. const gpuPoolName = "gpu-pool"
  254. addGpuNodePool(gpuPoolName, gpuType, 1, 1)
  255. defer deleteNodePool(gpuPoolName)
  256. installNvidiaDriversDaemonSet()
  257. ginkgo.By("Schedule a single pod which requires GPU")
  258. framework.ExpectNoError(ScheduleAnySingleGpuPod(f, "gpu-pod-rc"))
  259. defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "gpu-pod-rc")
  260. ginkgo.By("Enable autoscaler")
  261. framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
  262. defer disableAutoscaler(gpuPoolName, 0, 1)
  263. gomega.Expect(len(getPoolNodes(f, gpuPoolName))).Should(gomega.Equal(1))
  264. ginkgo.By("Remove the only POD requiring GPU")
  265. framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "gpu-pod-rc")
  266. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  267. func(size int) bool { return size == nodeCount }, scaleDownTimeout))
  268. gomega.Expect(len(getPoolNodes(f, gpuPoolName))).Should(gomega.Equal(0))
  269. })
  270. ginkgo.It("should increase cluster size if pending pods are small and one node is broken [Feature:ClusterSizeAutoscalingScaleUp]",
  271. func() {
  272. framework.TestUnderTemporaryNetworkFailure(c, "default", getAnyNode(c), func() { simpleScaleUpTest(1) })
  273. })
  274. ginkgo.It("shouldn't trigger additional scale-ups during processing scale-up [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  275. // Wait for the situation to stabilize - CA should be running and have up-to-date node readiness info.
  276. status, err := waitForScaleUpStatus(c, func(s *scaleUpStatus) bool {
  277. return s.ready == s.target && s.ready <= nodeCount
  278. }, scaleUpTriggerTimeout)
  279. framework.ExpectNoError(err)
  280. unmanagedNodes := nodeCount - status.ready
  281. ginkgo.By("Schedule more pods than can fit and wait for cluster to scale-up")
  282. ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second)
  283. defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "memory-reservation")
  284. status, err = waitForScaleUpStatus(c, func(s *scaleUpStatus) bool {
  285. return s.status == caOngoingScaleUpStatus
  286. }, scaleUpTriggerTimeout)
  287. framework.ExpectNoError(err)
  288. target := status.target
  289. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  290. ginkgo.By("Expect no more scale-up to be happening after all pods are scheduled")
  291. // wait for a while until scale-up finishes; we cannot read CA status immediately
  292. // after pods are scheduled as status config map is updated by CA once every loop iteration
  293. status, err = waitForScaleUpStatus(c, func(s *scaleUpStatus) bool {
  294. return s.status == caNoScaleUpStatus
  295. }, 2*freshStatusLimit)
  296. framework.ExpectNoError(err)
  297. if status.target != target {
  298. klog.Warningf("Final number of nodes (%v) does not match initial scale-up target (%v).", status.target, target)
  299. }
  300. gomega.Expect(status.timestamp.Add(freshStatusLimit).Before(time.Now())).Should(gomega.Equal(false))
  301. gomega.Expect(status.status).Should(gomega.Equal(caNoScaleUpStatus))
  302. gomega.Expect(status.ready).Should(gomega.Equal(status.target))
  303. gomega.Expect(len(framework.GetReadySchedulableNodesOrDie(f.ClientSet).Items)).Should(gomega.Equal(status.target + unmanagedNodes))
  304. })
  305. ginkgo.It("should increase cluster size if pending pods are small and there is another node pool that is not autoscaled [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  306. framework.SkipUnlessProviderIs("gke")
  307. ginkgo.By("Creating new node-pool with n1-standard-4 machines")
  308. const extraPoolName = "extra-pool"
  309. addNodePool(extraPoolName, "n1-standard-4", 1)
  310. defer deleteNodePool(extraPoolName)
  311. extraNodes := getPoolInitialSize(extraPoolName)
  312. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+extraNodes, resizeTimeout))
  313. // We wait for nodes to become schedulable to make sure the new nodes
  314. // will be returned by getPoolNodes below.
  315. framework.ExpectNoError(framework.WaitForAllNodesSchedulable(c, resizeTimeout))
  316. klog.Infof("Not enabling cluster autoscaler for the node pool (on purpose).")
  317. ginkgo.By("Getting memory available on new nodes, so we can account for it when creating RC")
  318. nodes := getPoolNodes(f, extraPoolName)
  319. gomega.Expect(len(nodes)).Should(gomega.Equal(extraNodes))
  320. extraMemMb := 0
  321. for _, node := range nodes {
  322. mem := node.Status.Allocatable[v1.ResourceMemory]
  323. extraMemMb += int((&mem).Value() / 1024 / 1024)
  324. }
  325. ginkgo.By("Reserving 0.1x more memory than the cluster holds to trigger scale up")
  326. totalMemoryReservation := int(1.1 * float64(nodeCount*memAllocatableMb+extraMemMb))
  327. defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "memory-reservation")
  328. ReserveMemory(f, "memory-reservation", 100, totalMemoryReservation, false, defaultTimeout)
  329. // Verify, that cluster size is increased
  330. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  331. func(size int) bool { return size >= nodeCount+extraNodes+1 }, scaleUpTimeout))
  332. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  333. })
  334. ginkgo.It("should disable node pool autoscaling [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  335. framework.SkipUnlessProviderIs("gke")
  336. ginkgo.By("Creating new node-pool with n1-standard-4 machines")
  337. const extraPoolName = "extra-pool"
  338. addNodePool(extraPoolName, "n1-standard-4", 1)
  339. defer deleteNodePool(extraPoolName)
  340. extraNodes := getPoolInitialSize(extraPoolName)
  341. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+extraNodes, resizeTimeout))
  342. framework.ExpectNoError(enableAutoscaler(extraPoolName, 1, 2))
  343. framework.ExpectNoError(disableAutoscaler(extraPoolName, 1, 2))
  344. })
  345. ginkgo.It("should increase cluster size if pods are pending due to host port conflict [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  346. scheduling.CreateHostPortPods(f, "host-port", nodeCount+2, false)
  347. defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "host-port")
  348. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  349. func(size int) bool { return size >= nodeCount+2 }, scaleUpTimeout))
  350. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  351. })
  352. ginkgo.It("should increase cluster size if pods are pending due to pod anti-affinity [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  353. pods := nodeCount
  354. newPods := 2
  355. labels := map[string]string{
  356. "anti-affinity": "yes",
  357. }
  358. ginkgo.By("starting a pod with anti-affinity on each node")
  359. framework.ExpectNoError(runAntiAffinityPods(f, f.Namespace.Name, pods, "some-pod", labels, labels))
  360. defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "some-pod")
  361. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  362. ginkgo.By("scheduling extra pods with anti-affinity to existing ones")
  363. framework.ExpectNoError(runAntiAffinityPods(f, f.Namespace.Name, newPods, "extra-pod", labels, labels))
  364. defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "extra-pod")
  365. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  366. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+newPods, scaleUpTimeout))
  367. })
  368. ginkgo.It("should increase cluster size if pod requesting EmptyDir volume is pending [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  369. ginkgo.By("creating pods")
  370. pods := nodeCount
  371. newPods := 1
  372. labels := map[string]string{
  373. "anti-affinity": "yes",
  374. }
  375. framework.ExpectNoError(runAntiAffinityPods(f, f.Namespace.Name, pods, "some-pod", labels, labels))
  376. defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "some-pod")
  377. ginkgo.By("waiting for all pods before triggering scale up")
  378. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  379. ginkgo.By("creating a pod requesting EmptyDir")
  380. framework.ExpectNoError(runVolumeAntiAffinityPods(f, f.Namespace.Name, newPods, "extra-pod", labels, labels, emptyDirVolumes))
  381. defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "extra-pod")
  382. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  383. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+newPods, scaleUpTimeout))
  384. })
  385. ginkgo.It("should increase cluster size if pod requesting volume is pending [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  386. framework.SkipUnlessProviderIs("gce", "gke")
  387. volumeLabels := labels.Set{
  388. framework.VolumeSelectorKey: f.Namespace.Name,
  389. }
  390. selector := metav1.SetAsLabelSelector(volumeLabels)
  391. ginkgo.By("creating volume & pvc")
  392. diskName, err := framework.CreatePDWithRetry()
  393. framework.ExpectNoError(err)
  394. pvConfig := framework.PersistentVolumeConfig{
  395. NamePrefix: "gce-",
  396. Labels: volumeLabels,
  397. PVSource: v1.PersistentVolumeSource{
  398. GCEPersistentDisk: &v1.GCEPersistentDiskVolumeSource{
  399. PDName: diskName,
  400. FSType: "ext3",
  401. ReadOnly: false,
  402. },
  403. },
  404. Prebind: nil,
  405. }
  406. emptyStorageClass := ""
  407. pvcConfig := framework.PersistentVolumeClaimConfig{
  408. Selector: selector,
  409. StorageClassName: &emptyStorageClass,
  410. }
  411. pv, pvc, err := framework.CreatePVPVC(c, pvConfig, pvcConfig, f.Namespace.Name, false)
  412. framework.ExpectNoError(err)
  413. framework.ExpectNoError(framework.WaitOnPVandPVC(c, f.Namespace.Name, pv, pvc))
  414. defer func() {
  415. errs := framework.PVPVCCleanup(c, f.Namespace.Name, pv, pvc)
  416. if len(errs) > 0 {
  417. framework.Failf("failed to delete PVC and/or PV. Errors: %v", utilerrors.NewAggregate(errs))
  418. }
  419. pv, pvc = nil, nil
  420. if diskName != "" {
  421. framework.ExpectNoError(framework.DeletePDWithRetry(diskName))
  422. }
  423. }()
  424. ginkgo.By("creating pods")
  425. pods := nodeCount
  426. labels := map[string]string{
  427. "anti-affinity": "yes",
  428. }
  429. framework.ExpectNoError(runAntiAffinityPods(f, f.Namespace.Name, pods, "some-pod", labels, labels))
  430. defer func() {
  431. framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "some-pod")
  432. klog.Infof("RC and pods not using volume deleted")
  433. }()
  434. ginkgo.By("waiting for all pods before triggering scale up")
  435. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  436. ginkgo.By("creating a pod requesting PVC")
  437. pvcPodName := "pvc-pod"
  438. newPods := 1
  439. volumes := buildVolumes(pv, pvc)
  440. framework.ExpectNoError(runVolumeAntiAffinityPods(f, f.Namespace.Name, newPods, pvcPodName, labels, labels, volumes))
  441. defer func() {
  442. framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, pvcPodName)
  443. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  444. }()
  445. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  446. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+newPods, scaleUpTimeout))
  447. })
  448. ginkgo.It("should add node to the particular mig [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  449. labelKey := "cluster-autoscaling-test.special-node"
  450. labelValue := "true"
  451. ginkgo.By("Finding the smallest MIG")
  452. minMig := ""
  453. minSize := nodeCount
  454. for mig, size := range originalSizes {
  455. if size <= minSize {
  456. minMig = mig
  457. minSize = size
  458. }
  459. }
  460. if minSize == 0 {
  461. newSizes := make(map[string]int)
  462. for mig, size := range originalSizes {
  463. newSizes[mig] = size
  464. }
  465. newSizes[minMig] = 1
  466. setMigSizes(newSizes)
  467. }
  468. removeLabels := func(nodesToClean sets.String) {
  469. ginkgo.By("Removing labels from nodes")
  470. for node := range nodesToClean {
  471. framework.RemoveLabelOffNode(c, node, labelKey)
  472. }
  473. }
  474. nodes, err := framework.GetGroupNodes(minMig)
  475. framework.ExpectNoError(err)
  476. nodesSet := sets.NewString(nodes...)
  477. defer removeLabels(nodesSet)
  478. ginkgo.By(fmt.Sprintf("Annotating nodes of the smallest MIG(%s): %v", minMig, nodes))
  479. for node := range nodesSet {
  480. framework.AddOrUpdateLabelOnNode(c, node, labelKey, labelValue)
  481. }
  482. scheduling.CreateNodeSelectorPods(f, "node-selector", minSize+1, map[string]string{labelKey: labelValue}, false)
  483. ginkgo.By("Waiting for new node to appear and annotating it")
  484. framework.WaitForGroupSize(minMig, int32(minSize+1))
  485. // Verify that cluster size is increased
  486. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  487. func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout))
  488. newNodes, err := framework.GetGroupNodes(minMig)
  489. framework.ExpectNoError(err)
  490. newNodesSet := sets.NewString(newNodes...)
  491. newNodesSet.Delete(nodes...)
  492. if len(newNodesSet) > 1 {
  493. ginkgo.By(fmt.Sprintf("Spotted following new nodes in %s: %v", minMig, newNodesSet))
  494. klog.Infof("Usually only 1 new node is expected, investigating")
  495. klog.Infof("Kubectl:%s\n", framework.RunKubectlOrDie("get", "nodes", "-o", "json"))
  496. if output, err := exec.Command("gcloud", "compute", "instances", "list",
  497. "--project="+framework.TestContext.CloudConfig.ProjectID,
  498. "--zone="+framework.TestContext.CloudConfig.Zone).Output(); err == nil {
  499. klog.Infof("Gcloud compute instances list: %s", output)
  500. } else {
  501. klog.Errorf("Failed to get instances list: %v", err)
  502. }
  503. for newNode := range newNodesSet {
  504. if output, err := execCmd("gcloud", "compute", "instances", "describe",
  505. newNode,
  506. "--project="+framework.TestContext.CloudConfig.ProjectID,
  507. "--zone="+framework.TestContext.CloudConfig.Zone).Output(); err == nil {
  508. klog.Infof("Gcloud compute instances describe: %s", output)
  509. } else {
  510. klog.Errorf("Failed to get instances describe: %v", err)
  511. }
  512. }
  513. // TODO: possibly remove broken node from newNodesSet to prevent removeLabel from crashing.
  514. // However at this moment we DO WANT it to crash so that we don't check all test runs for the
  515. // rare behavior, but only the broken ones.
  516. }
  517. ginkgo.By(fmt.Sprintf("New nodes: %v\n", newNodesSet))
  518. registeredNodes := sets.NewString()
  519. for nodeName := range newNodesSet {
  520. node, err := f.ClientSet.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
  521. if err == nil && node != nil {
  522. registeredNodes.Insert(nodeName)
  523. } else {
  524. klog.Errorf("Failed to get node %v: %v", nodeName, err)
  525. }
  526. }
  527. ginkgo.By(fmt.Sprintf("Setting labels for registered new nodes: %v", registeredNodes.List()))
  528. for node := range registeredNodes {
  529. framework.AddOrUpdateLabelOnNode(c, node, labelKey, labelValue)
  530. }
  531. defer removeLabels(registeredNodes)
  532. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  533. framework.ExpectNoError(framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "node-selector"))
  534. })
  535. ginkgo.It("should scale up correct target pool [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  536. framework.SkipUnlessProviderIs("gke")
  537. ginkgo.By("Creating new node-pool with n1-standard-4 machines")
  538. const extraPoolName = "extra-pool"
  539. addNodePool(extraPoolName, "n1-standard-4", 1)
  540. defer deleteNodePool(extraPoolName)
  541. extraNodes := getPoolInitialSize(extraPoolName)
  542. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+extraNodes, resizeTimeout))
  543. framework.ExpectNoError(enableAutoscaler(extraPoolName, 1, 2))
  544. defer disableAutoscaler(extraPoolName, 1, 2)
  545. extraPods := extraNodes + 1
  546. totalMemoryReservation := int(float64(extraPods) * 1.5 * float64(memAllocatableMb))
  547. ginkgo.By(fmt.Sprintf("Creating rc with %v pods too big to fit default-pool but fitting extra-pool", extraPods))
  548. defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "memory-reservation")
  549. ReserveMemory(f, "memory-reservation", extraPods, totalMemoryReservation, false, defaultTimeout)
  550. // Apparently GKE master is restarted couple minutes after the node pool is added
  551. // reseting all the timers in scale down code. Adding 5 extra minutes to workaround
  552. // this issue.
  553. // TODO: Remove the extra time when GKE restart is fixed.
  554. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+extraNodes+1, scaleUpTimeout+5*time.Minute))
  555. })
  556. simpleScaleDownTest := func(unready int) {
  557. cleanup, err := addKubeSystemPdbs(f)
  558. defer cleanup()
  559. framework.ExpectNoError(err)
  560. ginkgo.By("Manually increase cluster size")
  561. increasedSize := 0
  562. newSizes := make(map[string]int)
  563. for key, val := range originalSizes {
  564. newSizes[key] = val + 2 + unready
  565. increasedSize += val + 2 + unready
  566. }
  567. setMigSizes(newSizes)
  568. framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(f.ClientSet,
  569. func(size int) bool { return size >= increasedSize }, manualResizeTimeout, unready))
  570. ginkgo.By("Some node should be removed")
  571. framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(f.ClientSet,
  572. func(size int) bool { return size < increasedSize }, scaleDownTimeout, unready))
  573. }
  574. ginkgo.It("should correctly scale down after a node is not needed [Feature:ClusterSizeAutoscalingScaleDown]",
  575. func() { simpleScaleDownTest(0) })
  576. ginkgo.It("should correctly scale down after a node is not needed and one node is broken [Feature:ClusterSizeAutoscalingScaleDown]",
  577. func() {
  578. framework.TestUnderTemporaryNetworkFailure(c, "default", getAnyNode(c), func() { simpleScaleDownTest(1) })
  579. })
  580. ginkgo.It("should correctly scale down after a node is not needed when there is non autoscaled pool[Feature:ClusterSizeAutoscalingScaleDown]", func() {
  581. framework.SkipUnlessProviderIs("gke")
  582. increasedSize := manuallyIncreaseClusterSize(f, originalSizes)
  583. const extraPoolName = "extra-pool"
  584. addNodePool(extraPoolName, "n1-standard-1", 3)
  585. defer deleteNodePool(extraPoolName)
  586. extraNodes := getPoolInitialSize(extraPoolName)
  587. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  588. func(size int) bool { return size >= increasedSize+extraNodes }, scaleUpTimeout))
  589. ginkgo.By("Some node should be removed")
  590. // Apparently GKE master is restarted couple minutes after the node pool is added
  591. // reseting all the timers in scale down code. Adding 10 extra minutes to workaround
  592. // this issue.
  593. // TODO: Remove the extra time when GKE restart is fixed.
  594. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  595. func(size int) bool { return size < increasedSize+extraNodes }, scaleDownTimeout+10*time.Minute))
  596. })
  597. ginkgo.It("should be able to scale down when rescheduling a pod is required and pdb allows for it[Feature:ClusterSizeAutoscalingScaleDown]", func() {
  598. runDrainTest(f, originalSizes, f.Namespace.Name, 1, 1, func(increasedSize int) {
  599. ginkgo.By("Some node should be removed")
  600. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  601. func(size int) bool { return size < increasedSize }, scaleDownTimeout))
  602. })
  603. })
  604. ginkgo.It("shouldn't be able to scale down when rescheduling a pod is required, but pdb doesn't allow drain[Feature:ClusterSizeAutoscalingScaleDown]", func() {
  605. runDrainTest(f, originalSizes, f.Namespace.Name, 1, 0, func(increasedSize int) {
  606. ginkgo.By("No nodes should be removed")
  607. time.Sleep(scaleDownTimeout)
  608. nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
  609. gomega.Expect(len(nodes.Items)).Should(gomega.Equal(increasedSize))
  610. })
  611. })
  612. ginkgo.It("should be able to scale down by draining multiple pods one by one as dictated by pdb[Feature:ClusterSizeAutoscalingScaleDown]", func() {
  613. runDrainTest(f, originalSizes, f.Namespace.Name, 2, 1, func(increasedSize int) {
  614. ginkgo.By("Some node should be removed")
  615. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  616. func(size int) bool { return size < increasedSize }, scaleDownTimeout))
  617. })
  618. })
  619. ginkgo.It("should be able to scale down by draining system pods with pdb[Feature:ClusterSizeAutoscalingScaleDown]", func() {
  620. runDrainTest(f, originalSizes, "kube-system", 2, 1, func(increasedSize int) {
  621. ginkgo.By("Some node should be removed")
  622. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  623. func(size int) bool { return size < increasedSize }, scaleDownTimeout))
  624. })
  625. })
  626. ginkgo.It("Should be able to scale a node group up from 0[Feature:ClusterSizeAutoscalingScaleUp]", func() {
  627. // Provider-specific setup
  628. if framework.ProviderIs("gke") {
  629. // GKE-specific setup
  630. ginkgo.By("Add a new node pool with 0 nodes and min size 0")
  631. const extraPoolName = "extra-pool"
  632. addNodePool(extraPoolName, "n1-standard-4", 0)
  633. defer deleteNodePool(extraPoolName)
  634. framework.ExpectNoError(enableAutoscaler(extraPoolName, 0, 1))
  635. defer disableAutoscaler(extraPoolName, 0, 1)
  636. } else {
  637. // on GCE, run only if there are already at least 2 node groups
  638. framework.SkipUnlessAtLeast(len(originalSizes), 2, "At least 2 node groups are needed for scale-to-0 tests")
  639. ginkgo.By("Manually scale smallest node group to 0")
  640. minMig := ""
  641. minSize := nodeCount
  642. for mig, size := range originalSizes {
  643. if size <= minSize {
  644. minMig = mig
  645. minSize = size
  646. }
  647. }
  648. framework.ExpectNoError(framework.ResizeGroup(minMig, int32(0)))
  649. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount-minSize, resizeTimeout))
  650. }
  651. ginkgo.By("Make remaining nodes unschedulable")
  652. nodes, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
  653. "spec.unschedulable": "false",
  654. }.AsSelector().String()})
  655. framework.ExpectNoError(err)
  656. for _, node := range nodes.Items {
  657. err = makeNodeUnschedulable(f.ClientSet, &node)
  658. defer func(n v1.Node) {
  659. makeNodeSchedulable(f.ClientSet, &n, false)
  660. }(node)
  661. framework.ExpectNoError(err)
  662. }
  663. ginkgo.By("Run a scale-up test")
  664. ReserveMemory(f, "memory-reservation", 1, 100, false, 1*time.Second)
  665. defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "memory-reservation")
  666. // Verify that cluster size is increased
  667. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  668. func(size int) bool { return size >= len(nodes.Items)+1 }, scaleUpTimeout))
  669. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  670. })
  671. // Scale to 0 test is split into two functions (for GKE & GCE.)
  672. // The reason for it is that scenario is exactly the same,
  673. // but setup & verification use different APIs.
  674. //
  675. // Scenario:
  676. // (GKE only) add an extra node pool with size 1 & enable autoscaling for it
  677. // (GCE only) find the smallest MIG & resize it to 1
  678. // manually drain the single node from this node pool/MIG
  679. // wait for cluster size to decrease
  680. // verify the targeted node pool/MIG is of size 0
  681. gkeScaleToZero := func() {
  682. // GKE-specific setup
  683. ginkgo.By("Add a new node pool with size 1 and min size 0")
  684. const extraPoolName = "extra-pool"
  685. addNodePool(extraPoolName, "n1-standard-4", 1)
  686. defer deleteNodePool(extraPoolName)
  687. extraNodes := getPoolInitialSize(extraPoolName)
  688. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+extraNodes, resizeTimeout))
  689. framework.ExpectNoError(enableAutoscaler(extraPoolName, 0, 1))
  690. defer disableAutoscaler(extraPoolName, 0, 1)
  691. ngNodes := getPoolNodes(f, extraPoolName)
  692. gomega.Expect(len(ngNodes)).To(gomega.Equal(extraNodes))
  693. for _, node := range ngNodes {
  694. ginkgo.By(fmt.Sprintf("Target node for scale-down: %s", node.Name))
  695. }
  696. for _, node := range ngNodes {
  697. drainNode(f, node)
  698. }
  699. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  700. func(size int) bool { return size <= nodeCount }, scaleDownTimeout))
  701. // GKE-specific check
  702. newSize := getPoolSize(f, extraPoolName)
  703. gomega.Expect(newSize).Should(gomega.Equal(0))
  704. }
  705. gceScaleToZero := func() {
  706. // non-GKE only
  707. ginkgo.By("Find smallest node group and manually scale it to a single node")
  708. minMig := ""
  709. minSize := nodeCount
  710. for mig, size := range originalSizes {
  711. if size <= minSize {
  712. minMig = mig
  713. minSize = size
  714. }
  715. }
  716. framework.ExpectNoError(framework.ResizeGroup(minMig, int32(1)))
  717. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount-minSize+1, resizeTimeout))
  718. ngNodes, err := framework.GetGroupNodes(minMig)
  719. framework.ExpectNoError(err)
  720. gomega.Expect(len(ngNodes) == 1).To(gomega.BeTrue())
  721. node, err := f.ClientSet.CoreV1().Nodes().Get(ngNodes[0], metav1.GetOptions{})
  722. ginkgo.By(fmt.Sprintf("Target node for scale-down: %s", node.Name))
  723. framework.ExpectNoError(err)
  724. // this part is identical
  725. drainNode(f, node)
  726. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  727. func(size int) bool { return size < nodeCount-minSize+1 }, scaleDownTimeout))
  728. // non-GKE only
  729. newSize, err := framework.GroupSize(minMig)
  730. framework.ExpectNoError(err)
  731. gomega.Expect(newSize).Should(gomega.Equal(0))
  732. }
  733. ginkgo.It("Should be able to scale a node group down to 0[Feature:ClusterSizeAutoscalingScaleDown]", func() {
  734. if framework.ProviderIs("gke") { // In GKE, we can just add a node pool
  735. gkeScaleToZero()
  736. } else if len(originalSizes) >= 2 {
  737. gceScaleToZero()
  738. } else {
  739. framework.Skipf("At least 2 node groups are needed for scale-to-0 tests")
  740. }
  741. })
  742. ginkgo.It("Shouldn't perform scale up operation and should list unhealthy status if most of the cluster is broken[Feature:ClusterSizeAutoscalingScaleUp]", func() {
  743. clusterSize := nodeCount
  744. for clusterSize < unhealthyClusterThreshold+1 {
  745. clusterSize = manuallyIncreaseClusterSize(f, originalSizes)
  746. }
  747. // If new nodes are disconnected too soon, they'll be considered not started
  748. // instead of unready, and cluster won't be considered unhealthy.
  749. //
  750. // More precisely, Cluster Autoscaler compares last transition time of
  751. // several readiness conditions to node create time. If it's within
  752. // 2 minutes, it'll assume node is just starting and not unhealthy.
  753. //
  754. // Nodes become ready in less than 1 minute after being created,
  755. // so waiting extra 2 minutes before breaking them (which triggers
  756. // readiness condition transition) should be sufficient, while
  757. // making no assumptions about minimal node startup time.
  758. time.Sleep(2 * time.Minute)
  759. ginkgo.By("Block network connectivity to some nodes to simulate unhealthy cluster")
  760. nodesToBreakCount := int(math.Ceil(math.Max(float64(unhealthyClusterThreshold), 0.5*float64(clusterSize))))
  761. nodes, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
  762. "spec.unschedulable": "false",
  763. }.AsSelector().String()})
  764. framework.ExpectNoError(err)
  765. gomega.Expect(nodesToBreakCount <= len(nodes.Items)).To(gomega.BeTrue())
  766. nodesToBreak := nodes.Items[:nodesToBreakCount]
  767. // TestUnderTemporaryNetworkFailure only removes connectivity to a single node,
  768. // and accepts func() callback. This is expanding the loop to recursive call
  769. // to avoid duplicating TestUnderTemporaryNetworkFailure
  770. var testFunction func()
  771. testFunction = func() {
  772. if len(nodesToBreak) > 0 {
  773. ntb := &nodesToBreak[0]
  774. nodesToBreak = nodesToBreak[1:]
  775. framework.TestUnderTemporaryNetworkFailure(c, "default", ntb, testFunction)
  776. } else {
  777. ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, defaultTimeout)
  778. defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "memory-reservation")
  779. time.Sleep(scaleUpTimeout)
  780. currentNodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
  781. e2elog.Logf("Currently available nodes: %v, nodes available at the start of test: %v, disabled nodes: %v", len(currentNodes.Items), len(nodes.Items), nodesToBreakCount)
  782. gomega.Expect(len(currentNodes.Items)).Should(gomega.Equal(len(nodes.Items) - nodesToBreakCount))
  783. status, err := getClusterwideStatus(c)
  784. e2elog.Logf("Clusterwide status: %v", status)
  785. framework.ExpectNoError(err)
  786. gomega.Expect(status).Should(gomega.Equal("Unhealthy"))
  787. }
  788. }
  789. testFunction()
  790. // Give nodes time to recover from network failure
  791. framework.ExpectNoError(framework.WaitForReadyNodes(c, len(nodes.Items), nodesRecoverTimeout))
  792. })
  793. ginkgo.It("shouldn't scale up when expendable pod is created [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  794. defer createPriorityClasses(f)()
  795. // Create nodesCountAfterResize+1 pods allocating 0.7 allocatable on present nodes. One more node will have to be created.
  796. cleanupFunc := ReserveMemoryWithPriority(f, "memory-reservation", nodeCount+1, int(float64(nodeCount+1)*float64(0.7)*float64(memAllocatableMb)), false, time.Second, expendablePriorityClassName)
  797. defer cleanupFunc()
  798. ginkgo.By(fmt.Sprintf("Waiting for scale up hoping it won't happen, sleep for %s", scaleUpTimeout.String()))
  799. time.Sleep(scaleUpTimeout)
  800. // Verify that cluster size is not changed
  801. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  802. func(size int) bool { return size == nodeCount }, time.Second))
  803. })
  804. ginkgo.It("should scale up when non expendable pod is created [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  805. defer createPriorityClasses(f)()
  806. // Create nodesCountAfterResize+1 pods allocating 0.7 allocatable on present nodes. One more node will have to be created.
  807. cleanupFunc := ReserveMemoryWithPriority(f, "memory-reservation", nodeCount+1, int(float64(nodeCount+1)*float64(0.7)*float64(memAllocatableMb)), true, scaleUpTimeout, highPriorityClassName)
  808. defer cleanupFunc()
  809. // Verify that cluster size is not changed
  810. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  811. func(size int) bool { return size > nodeCount }, time.Second))
  812. })
  813. ginkgo.It("shouldn't scale up when expendable pod is preempted [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  814. defer createPriorityClasses(f)()
  815. // Create nodesCountAfterResize pods allocating 0.7 allocatable on present nodes - one pod per node.
  816. cleanupFunc1 := ReserveMemoryWithPriority(f, "memory-reservation1", nodeCount, int(float64(nodeCount)*float64(0.7)*float64(memAllocatableMb)), true, defaultTimeout, expendablePriorityClassName)
  817. defer cleanupFunc1()
  818. // Create nodesCountAfterResize pods allocating 0.7 allocatable on present nodes - one pod per node. Pods created here should preempt pods created above.
  819. cleanupFunc2 := ReserveMemoryWithPriority(f, "memory-reservation2", nodeCount, int(float64(nodeCount)*float64(0.7)*float64(memAllocatableMb)), true, defaultTimeout, highPriorityClassName)
  820. defer cleanupFunc2()
  821. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  822. func(size int) bool { return size == nodeCount }, time.Second))
  823. })
  824. ginkgo.It("should scale down when expendable pod is running [Feature:ClusterSizeAutoscalingScaleDown]", func() {
  825. defer createPriorityClasses(f)()
  826. increasedSize := manuallyIncreaseClusterSize(f, originalSizes)
  827. // Create increasedSize pods allocating 0.7 allocatable on present nodes - one pod per node.
  828. cleanupFunc := ReserveMemoryWithPriority(f, "memory-reservation", increasedSize, int(float64(increasedSize)*float64(0.7)*float64(memAllocatableMb)), true, scaleUpTimeout, expendablePriorityClassName)
  829. defer cleanupFunc()
  830. ginkgo.By("Waiting for scale down")
  831. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  832. func(size int) bool { return size == nodeCount }, scaleDownTimeout))
  833. })
  834. ginkgo.It("shouldn't scale down when non expendable pod is running [Feature:ClusterSizeAutoscalingScaleDown]", func() {
  835. defer createPriorityClasses(f)()
  836. increasedSize := manuallyIncreaseClusterSize(f, originalSizes)
  837. // Create increasedSize pods allocating 0.7 allocatable on present nodes - one pod per node.
  838. cleanupFunc := ReserveMemoryWithPriority(f, "memory-reservation", increasedSize, int(float64(increasedSize)*float64(0.7)*float64(memAllocatableMb)), true, scaleUpTimeout, highPriorityClassName)
  839. defer cleanupFunc()
  840. ginkgo.By(fmt.Sprintf("Waiting for scale down hoping it won't happen, sleep for %s", scaleDownTimeout.String()))
  841. time.Sleep(scaleDownTimeout)
  842. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  843. func(size int) bool { return size == increasedSize }, time.Second))
  844. })
  845. })
  846. func installNvidiaDriversDaemonSet() {
  847. ginkgo.By("Add daemonset which installs nvidia drivers")
  848. // the link differs from one in GKE documentation; discussed with @mindprince this one should be used
  849. framework.RunKubectlOrDie("apply", "-f", "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/daemonset.yaml")
  850. }
  851. func execCmd(args ...string) *exec.Cmd {
  852. klog.Infof("Executing: %s", strings.Join(args, " "))
  853. return exec.Command(args[0], args[1:]...)
  854. }
  855. func runDrainTest(f *framework.Framework, migSizes map[string]int, namespace string, podsPerNode, pdbSize int, verifyFunction func(int)) {
  856. increasedSize := manuallyIncreaseClusterSize(f, migSizes)
  857. nodes, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
  858. "spec.unschedulable": "false",
  859. }.AsSelector().String()})
  860. framework.ExpectNoError(err)
  861. numPods := len(nodes.Items) * podsPerNode
  862. testID := string(uuid.NewUUID()) // So that we can label and find pods
  863. labelMap := map[string]string{"test_id": testID}
  864. framework.ExpectNoError(runReplicatedPodOnEachNode(f, nodes.Items, namespace, podsPerNode, "reschedulable-pods", labelMap, 0))
  865. defer framework.DeleteRCAndWaitForGC(f.ClientSet, namespace, "reschedulable-pods")
  866. ginkgo.By("Create a PodDisruptionBudget")
  867. minAvailable := intstr.FromInt(numPods - pdbSize)
  868. pdb := &policy.PodDisruptionBudget{
  869. ObjectMeta: metav1.ObjectMeta{
  870. Name: "test_pdb",
  871. Namespace: namespace,
  872. },
  873. Spec: policy.PodDisruptionBudgetSpec{
  874. Selector: &metav1.LabelSelector{MatchLabels: labelMap},
  875. MinAvailable: &minAvailable,
  876. },
  877. }
  878. _, err = f.ClientSet.PolicyV1beta1().PodDisruptionBudgets(namespace).Create(pdb)
  879. defer func() {
  880. f.ClientSet.PolicyV1beta1().PodDisruptionBudgets(namespace).Delete(pdb.Name, &metav1.DeleteOptions{})
  881. }()
  882. framework.ExpectNoError(err)
  883. verifyFunction(increasedSize)
  884. }
  885. func getGkeAPIEndpoint() string {
  886. gkeAPIEndpoint := os.Getenv("CLOUDSDK_API_ENDPOINT_OVERRIDES_CONTAINER")
  887. if gkeAPIEndpoint == "" {
  888. gkeAPIEndpoint = "https://test-container.sandbox.googleapis.com"
  889. }
  890. if strings.HasSuffix(gkeAPIEndpoint, "/") {
  891. gkeAPIEndpoint = gkeAPIEndpoint[:len(gkeAPIEndpoint)-1]
  892. }
  893. return gkeAPIEndpoint
  894. }
  895. func getGKEURL(apiVersion string, suffix string) string {
  896. out, err := execCmd("gcloud", "auth", "print-access-token").Output()
  897. framework.ExpectNoError(err)
  898. token := strings.Replace(string(out), "\n", "", -1)
  899. return fmt.Sprintf("%s/%s/%s?access_token=%s",
  900. getGkeAPIEndpoint(),
  901. apiVersion,
  902. suffix,
  903. token)
  904. }
  905. func getGKEClusterURL(apiVersion string) string {
  906. if isRegionalCluster() {
  907. // TODO(bskiba): Use locations API for all clusters once it's graduated to v1.
  908. return getGKEURL(apiVersion, fmt.Sprintf("projects/%s/locations/%s/clusters/%s",
  909. framework.TestContext.CloudConfig.ProjectID,
  910. framework.TestContext.CloudConfig.Region,
  911. framework.TestContext.CloudConfig.Cluster))
  912. }
  913. return getGKEURL(apiVersion, fmt.Sprintf("projects/%s/zones/%s/clusters/%s",
  914. framework.TestContext.CloudConfig.ProjectID,
  915. framework.TestContext.CloudConfig.Zone,
  916. framework.TestContext.CloudConfig.Cluster))
  917. }
  918. func getCluster(apiVersion string) (string, error) {
  919. resp, err := http.Get(getGKEClusterURL(apiVersion))
  920. if err != nil {
  921. return "", err
  922. }
  923. defer resp.Body.Close()
  924. body, err := ioutil.ReadAll(resp.Body)
  925. if err != nil {
  926. return "", err
  927. }
  928. if resp.StatusCode != http.StatusOK {
  929. return "", fmt.Errorf("error: %s %s", resp.Status, body)
  930. }
  931. return string(body), nil
  932. }
  933. func isAutoscalerEnabled(expectedMaxNodeCountInTargetPool int) (bool, error) {
  934. apiVersion := "v1"
  935. if isRegionalCluster() {
  936. apiVersion = "v1beta1"
  937. }
  938. strBody, err := getCluster(apiVersion)
  939. if err != nil {
  940. return false, err
  941. }
  942. if strings.Contains(strBody, "\"maxNodeCount\": "+strconv.Itoa(expectedMaxNodeCountInTargetPool)) {
  943. return true, nil
  944. }
  945. return false, nil
  946. }
  947. func getClusterLocation() string {
  948. if isRegionalCluster() {
  949. return "--region=" + framework.TestContext.CloudConfig.Region
  950. }
  951. return "--zone=" + framework.TestContext.CloudConfig.Zone
  952. }
  953. func getGcloudCommandFromTrack(commandTrack string, args []string) []string {
  954. command := []string{"gcloud"}
  955. if commandTrack == "beta" || commandTrack == "alpha" {
  956. command = append(command, commandTrack)
  957. }
  958. command = append(command, args...)
  959. command = append(command, getClusterLocation())
  960. command = append(command, "--project="+framework.TestContext.CloudConfig.ProjectID)
  961. return command
  962. }
  963. func getGcloudCommand(args []string) []string {
  964. track := ""
  965. if isRegionalCluster() {
  966. track = "beta"
  967. }
  968. return getGcloudCommandFromTrack(track, args)
  969. }
  970. func isRegionalCluster() bool {
  971. // TODO(bskiba): Use an appropriate indicator that the cluster is regional.
  972. return framework.TestContext.CloudConfig.MultiZone
  973. }
  974. func enableAutoscaler(nodePool string, minCount, maxCount int) error {
  975. klog.Infof("Using gcloud to enable autoscaling for pool %s", nodePool)
  976. args := []string{"container", "clusters", "update", framework.TestContext.CloudConfig.Cluster,
  977. "--enable-autoscaling",
  978. "--min-nodes=" + strconv.Itoa(minCount),
  979. "--max-nodes=" + strconv.Itoa(maxCount),
  980. "--node-pool=" + nodePool}
  981. output, err := execCmd(getGcloudCommand(args)...).CombinedOutput()
  982. if err != nil {
  983. klog.Errorf("Failed config update result: %s", output)
  984. return fmt.Errorf("Failed to enable autoscaling: %v", err)
  985. }
  986. klog.Infof("Config update result: %s", output)
  987. var finalErr error
  988. for startTime := time.Now(); startTime.Add(gkeUpdateTimeout).After(time.Now()); time.Sleep(30 * time.Second) {
  989. val, err := isAutoscalerEnabled(maxCount)
  990. if err == nil && val {
  991. return nil
  992. }
  993. finalErr = err
  994. }
  995. return fmt.Errorf("autoscaler not enabled, last error: %v", finalErr)
  996. }
  997. func disableAutoscaler(nodePool string, minCount, maxCount int) error {
  998. klog.Infof("Using gcloud to disable autoscaling for pool %s", nodePool)
  999. args := []string{"container", "clusters", "update", framework.TestContext.CloudConfig.Cluster,
  1000. "--no-enable-autoscaling",
  1001. "--node-pool=" + nodePool}
  1002. output, err := execCmd(getGcloudCommand(args)...).CombinedOutput()
  1003. if err != nil {
  1004. klog.Errorf("Failed config update result: %s", output)
  1005. return fmt.Errorf("Failed to disable autoscaling: %v", err)
  1006. }
  1007. klog.Infof("Config update result: %s", output)
  1008. var finalErr error
  1009. for startTime := time.Now(); startTime.Add(gkeUpdateTimeout).After(time.Now()); time.Sleep(30 * time.Second) {
  1010. val, err := isAutoscalerEnabled(maxCount)
  1011. if err == nil && !val {
  1012. return nil
  1013. }
  1014. finalErr = err
  1015. }
  1016. return fmt.Errorf("autoscaler still enabled, last error: %v", finalErr)
  1017. }
  1018. func addNodePool(name string, machineType string, numNodes int) {
  1019. args := []string{"container", "node-pools", "create", name, "--quiet",
  1020. "--machine-type=" + machineType,
  1021. "--num-nodes=" + strconv.Itoa(numNodes),
  1022. "--cluster=" + framework.TestContext.CloudConfig.Cluster}
  1023. output, err := execCmd(getGcloudCommand(args)...).CombinedOutput()
  1024. klog.Infof("Creating node-pool %s: %s", name, output)
  1025. framework.ExpectNoError(err, string(output))
  1026. }
  1027. func addGpuNodePool(name string, gpuType string, gpuCount int, numNodes int) {
  1028. args := []string{"beta", "container", "node-pools", "create", name, "--quiet",
  1029. "--accelerator", "type=" + gpuType + ",count=" + strconv.Itoa(gpuCount),
  1030. "--num-nodes=" + strconv.Itoa(numNodes),
  1031. "--cluster=" + framework.TestContext.CloudConfig.Cluster}
  1032. output, err := execCmd(getGcloudCommand(args)...).CombinedOutput()
  1033. klog.Infof("Creating node-pool %s: %s", name, output)
  1034. framework.ExpectNoError(err, string(output))
  1035. }
  1036. func deleteNodePool(name string) {
  1037. klog.Infof("Deleting node pool %s", name)
  1038. args := []string{"container", "node-pools", "delete", name, "--quiet",
  1039. "--cluster=" + framework.TestContext.CloudConfig.Cluster}
  1040. err := wait.ExponentialBackoff(
  1041. wait.Backoff{Duration: 1 * time.Minute, Factor: float64(3), Steps: 3},
  1042. func() (bool, error) {
  1043. output, err := execCmd(getGcloudCommand(args)...).CombinedOutput()
  1044. if err != nil {
  1045. klog.Warningf("Error deleting nodegroup - error:%v, output: %s", err, output)
  1046. return false, nil
  1047. }
  1048. klog.Infof("Node-pool deletion output: %s", output)
  1049. return true, nil
  1050. })
  1051. framework.ExpectNoError(err)
  1052. }
  1053. func getPoolNodes(f *framework.Framework, poolName string) []*v1.Node {
  1054. nodes := make([]*v1.Node, 0, 1)
  1055. nodeList := framework.GetReadyNodesIncludingTaintedOrDie(f.ClientSet)
  1056. for _, node := range nodeList.Items {
  1057. if node.Labels[gkeNodepoolNameKey] == poolName {
  1058. nodes = append(nodes, &node)
  1059. }
  1060. }
  1061. return nodes
  1062. }
  1063. // getPoolInitialSize returns the initial size of the node pool taking into
  1064. // account that it may span multiple zones. In that case, node pool consists of
  1065. // multiple migs all containing initialNodeCount nodes.
  1066. func getPoolInitialSize(poolName string) int {
  1067. // get initial node count
  1068. args := []string{"container", "node-pools", "describe", poolName, "--quiet",
  1069. "--cluster=" + framework.TestContext.CloudConfig.Cluster,
  1070. "--format=value(initialNodeCount)"}
  1071. output, err := execCmd(getGcloudCommand(args)...).CombinedOutput()
  1072. klog.Infof("Node-pool initial size: %s", output)
  1073. framework.ExpectNoError(err, string(output))
  1074. fields := strings.Fields(string(output))
  1075. gomega.Expect(len(fields)).Should(gomega.Equal(1))
  1076. size, err := strconv.ParseInt(fields[0], 10, 64)
  1077. framework.ExpectNoError(err)
  1078. // get number of node pools
  1079. args = []string{"container", "node-pools", "describe", poolName, "--quiet",
  1080. "--cluster=" + framework.TestContext.CloudConfig.Cluster,
  1081. "--format=value(instanceGroupUrls)"}
  1082. output, err = execCmd(getGcloudCommand(args)...).CombinedOutput()
  1083. framework.ExpectNoError(err, string(output))
  1084. nodeGroupCount := len(strings.Split(string(output), ";"))
  1085. return int(size) * nodeGroupCount
  1086. }
  1087. func getPoolSize(f *framework.Framework, poolName string) int {
  1088. size := 0
  1089. nodeList := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
  1090. for _, node := range nodeList.Items {
  1091. if node.Labels[gkeNodepoolNameKey] == poolName {
  1092. size++
  1093. }
  1094. }
  1095. return size
  1096. }
  1097. func reserveMemory(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration, selector map[string]string, tolerations []v1.Toleration, priorityClassName string) func() error {
  1098. ginkgo.By(fmt.Sprintf("Running RC which reserves %v MB of memory", megabytes))
  1099. request := int64(1024 * 1024 * megabytes / replicas)
  1100. config := &testutils.RCConfig{
  1101. Client: f.ClientSet,
  1102. Name: id,
  1103. Namespace: f.Namespace.Name,
  1104. Timeout: timeout,
  1105. Image: imageutils.GetPauseImageName(),
  1106. Replicas: replicas,
  1107. MemRequest: request,
  1108. NodeSelector: selector,
  1109. Tolerations: tolerations,
  1110. PriorityClassName: priorityClassName,
  1111. }
  1112. for start := time.Now(); time.Since(start) < rcCreationRetryTimeout; time.Sleep(rcCreationRetryDelay) {
  1113. err := framework.RunRC(*config)
  1114. if err != nil && strings.Contains(err.Error(), "Error creating replication controller") {
  1115. klog.Warningf("Failed to create memory reservation: %v", err)
  1116. continue
  1117. }
  1118. if expectRunning {
  1119. framework.ExpectNoError(err)
  1120. }
  1121. return func() error {
  1122. return framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, id)
  1123. }
  1124. }
  1125. framework.Failf("Failed to reserve memory within timeout")
  1126. return nil
  1127. }
  1128. // ReserveMemoryWithPriority creates a replication controller with pods with priority that, in summation,
  1129. // request the specified amount of memory.
  1130. func ReserveMemoryWithPriority(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration, priorityClassName string) func() error {
  1131. return reserveMemory(f, id, replicas, megabytes, expectRunning, timeout, nil, nil, priorityClassName)
  1132. }
  1133. // ReserveMemoryWithSelectorAndTolerations creates a replication controller with pods with node selector that, in summation,
  1134. // request the specified amount of memory.
  1135. func ReserveMemoryWithSelectorAndTolerations(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration, selector map[string]string, tolerations []v1.Toleration) func() error {
  1136. return reserveMemory(f, id, replicas, megabytes, expectRunning, timeout, selector, tolerations, "")
  1137. }
  1138. // ReserveMemory creates a replication controller with pods that, in summation,
  1139. // request the specified amount of memory.
  1140. func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration) func() error {
  1141. return reserveMemory(f, id, replicas, megabytes, expectRunning, timeout, nil, nil, "")
  1142. }
  1143. // WaitForClusterSizeFunc waits until the cluster size matches the given function.
  1144. func WaitForClusterSizeFunc(c clientset.Interface, sizeFunc func(int) bool, timeout time.Duration) error {
  1145. return WaitForClusterSizeFuncWithUnready(c, sizeFunc, timeout, 0)
  1146. }
  1147. // WaitForClusterSizeFuncWithUnready waits until the cluster size matches the given function and assumes some unready nodes.
  1148. func WaitForClusterSizeFuncWithUnready(c clientset.Interface, sizeFunc func(int) bool, timeout time.Duration, expectedUnready int) error {
  1149. for start := time.Now(); time.Since(start) < timeout; time.Sleep(20 * time.Second) {
  1150. nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
  1151. "spec.unschedulable": "false",
  1152. }.AsSelector().String()})
  1153. if err != nil {
  1154. klog.Warningf("Failed to list nodes: %v", err)
  1155. continue
  1156. }
  1157. numNodes := len(nodes.Items)
  1158. // Filter out not-ready nodes.
  1159. framework.FilterNodes(nodes, func(node v1.Node) bool {
  1160. return framework.IsNodeConditionSetAsExpected(&node, v1.NodeReady, true)
  1161. })
  1162. numReady := len(nodes.Items)
  1163. if numNodes == numReady+expectedUnready && sizeFunc(numNodes) {
  1164. klog.Infof("Cluster has reached the desired size")
  1165. return nil
  1166. }
  1167. klog.Infof("Waiting for cluster with func, current size %d, not ready nodes %d", numNodes, numNodes-numReady)
  1168. }
  1169. return fmt.Errorf("timeout waiting %v for appropriate cluster size", timeout)
  1170. }
  1171. func waitForCaPodsReadyInNamespace(f *framework.Framework, c clientset.Interface, tolerateUnreadyCount int) error {
  1172. var notready []string
  1173. for start := time.Now(); time.Now().Before(start.Add(scaleUpTimeout)); time.Sleep(20 * time.Second) {
  1174. pods, err := c.CoreV1().Pods(f.Namespace.Name).List(metav1.ListOptions{})
  1175. if err != nil {
  1176. return fmt.Errorf("failed to get pods: %v", err)
  1177. }
  1178. notready = make([]string, 0)
  1179. for _, pod := range pods.Items {
  1180. ready := false
  1181. for _, c := range pod.Status.Conditions {
  1182. if c.Type == v1.PodReady && c.Status == v1.ConditionTrue {
  1183. ready = true
  1184. }
  1185. }
  1186. // Failed pods in this context generally mean that they have been
  1187. // double scheduled onto a node, but then failed a constraint check.
  1188. if pod.Status.Phase == v1.PodFailed {
  1189. klog.Warningf("Pod has failed: %v", pod)
  1190. }
  1191. if !ready && pod.Status.Phase != v1.PodFailed {
  1192. notready = append(notready, pod.Name)
  1193. }
  1194. }
  1195. if len(notready) <= tolerateUnreadyCount {
  1196. klog.Infof("sufficient number of pods ready. Tolerating %d unready", tolerateUnreadyCount)
  1197. return nil
  1198. }
  1199. klog.Infof("Too many pods are not ready yet: %v", notready)
  1200. }
  1201. klog.Info("Timeout on waiting for pods being ready")
  1202. klog.Info(framework.RunKubectlOrDie("get", "pods", "-o", "json", "--all-namespaces"))
  1203. klog.Info(framework.RunKubectlOrDie("get", "nodes", "-o", "json"))
  1204. // Some pods are still not running.
  1205. return fmt.Errorf("Too many pods are still not running: %v", notready)
  1206. }
  1207. func waitForAllCaPodsReadyInNamespace(f *framework.Framework, c clientset.Interface) error {
  1208. return waitForCaPodsReadyInNamespace(f, c, 0)
  1209. }
  1210. func getAnyNode(c clientset.Interface) *v1.Node {
  1211. nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
  1212. "spec.unschedulable": "false",
  1213. }.AsSelector().String()})
  1214. if err != nil {
  1215. klog.Errorf("Failed to get node list: %v", err)
  1216. return nil
  1217. }
  1218. if len(nodes.Items) == 0 {
  1219. klog.Errorf("No nodes")
  1220. return nil
  1221. }
  1222. return &nodes.Items[0]
  1223. }
  1224. func setMigSizes(sizes map[string]int) bool {
  1225. madeChanges := false
  1226. for mig, desiredSize := range sizes {
  1227. currentSize, err := framework.GroupSize(mig)
  1228. framework.ExpectNoError(err)
  1229. if desiredSize != currentSize {
  1230. ginkgo.By(fmt.Sprintf("Setting size of %s to %d", mig, desiredSize))
  1231. err = framework.ResizeGroup(mig, int32(desiredSize))
  1232. framework.ExpectNoError(err)
  1233. madeChanges = true
  1234. }
  1235. }
  1236. return madeChanges
  1237. }
  1238. func drainNode(f *framework.Framework, node *v1.Node) {
  1239. ginkgo.By("Make the single node unschedulable")
  1240. makeNodeUnschedulable(f.ClientSet, node)
  1241. ginkgo.By("Manually drain the single node")
  1242. podOpts := metav1.ListOptions{FieldSelector: fields.OneTermEqualSelector(api.PodHostField, node.Name).String()}
  1243. pods, err := f.ClientSet.CoreV1().Pods(metav1.NamespaceAll).List(podOpts)
  1244. framework.ExpectNoError(err)
  1245. for _, pod := range pods.Items {
  1246. err = f.ClientSet.CoreV1().Pods(pod.Namespace).Delete(pod.Name, metav1.NewDeleteOptions(0))
  1247. framework.ExpectNoError(err)
  1248. }
  1249. }
  1250. func makeNodeUnschedulable(c clientset.Interface, node *v1.Node) error {
  1251. ginkgo.By(fmt.Sprintf("Taint node %s", node.Name))
  1252. for j := 0; j < 3; j++ {
  1253. freshNode, err := c.CoreV1().Nodes().Get(node.Name, metav1.GetOptions{})
  1254. if err != nil {
  1255. return err
  1256. }
  1257. for _, taint := range freshNode.Spec.Taints {
  1258. if taint.Key == disabledTaint {
  1259. return nil
  1260. }
  1261. }
  1262. freshNode.Spec.Taints = append(freshNode.Spec.Taints, v1.Taint{
  1263. Key: disabledTaint,
  1264. Value: "DisabledForTest",
  1265. Effect: v1.TaintEffectNoSchedule,
  1266. })
  1267. _, err = c.CoreV1().Nodes().Update(freshNode)
  1268. if err == nil {
  1269. return nil
  1270. }
  1271. if !errors.IsConflict(err) {
  1272. return err
  1273. }
  1274. klog.Warningf("Got 409 conflict when trying to taint node, retries left: %v", 3-j)
  1275. }
  1276. return fmt.Errorf("Failed to taint node in allowed number of retries")
  1277. }
  1278. // CriticalAddonsOnlyError implements the `error` interface, and signifies the
  1279. // presence of the `CriticalAddonsOnly` taint on the node.
  1280. type CriticalAddonsOnlyError struct{}
  1281. func (CriticalAddonsOnlyError) Error() string {
  1282. return fmt.Sprintf("CriticalAddonsOnly taint found on node")
  1283. }
  1284. func makeNodeSchedulable(c clientset.Interface, node *v1.Node, failOnCriticalAddonsOnly bool) error {
  1285. ginkgo.By(fmt.Sprintf("Remove taint from node %s", node.Name))
  1286. for j := 0; j < 3; j++ {
  1287. freshNode, err := c.CoreV1().Nodes().Get(node.Name, metav1.GetOptions{})
  1288. if err != nil {
  1289. return err
  1290. }
  1291. var newTaints []v1.Taint
  1292. for _, taint := range freshNode.Spec.Taints {
  1293. if failOnCriticalAddonsOnly && taint.Key == criticalAddonsOnlyTaint {
  1294. return CriticalAddonsOnlyError{}
  1295. }
  1296. if taint.Key != disabledTaint {
  1297. newTaints = append(newTaints, taint)
  1298. }
  1299. }
  1300. if len(newTaints) == len(freshNode.Spec.Taints) {
  1301. return nil
  1302. }
  1303. freshNode.Spec.Taints = newTaints
  1304. _, err = c.CoreV1().Nodes().Update(freshNode)
  1305. if err == nil {
  1306. return nil
  1307. }
  1308. if !errors.IsConflict(err) {
  1309. return err
  1310. }
  1311. klog.Warningf("Got 409 conflict when trying to taint node, retries left: %v", 3-j)
  1312. }
  1313. return fmt.Errorf("Failed to remove taint from node in allowed number of retries")
  1314. }
  1315. // ScheduleAnySingleGpuPod schedules a pod which requires single GPU of any type
  1316. func ScheduleAnySingleGpuPod(f *framework.Framework, id string) error {
  1317. return ScheduleGpuPod(f, id, "", 1)
  1318. }
  1319. // ScheduleGpuPod schedules a pod which requires a given number of gpus of given type
  1320. func ScheduleGpuPod(f *framework.Framework, id string, gpuType string, gpuLimit int64) error {
  1321. config := &testutils.RCConfig{
  1322. Client: f.ClientSet,
  1323. Name: id,
  1324. Namespace: f.Namespace.Name,
  1325. Timeout: 3 * scaleUpTimeout, // spinning up GPU node is slow
  1326. Image: imageutils.GetPauseImageName(),
  1327. Replicas: 1,
  1328. GpuLimit: gpuLimit,
  1329. Labels: map[string]string{"requires-gpu": "yes"},
  1330. }
  1331. if gpuType != "" {
  1332. config.NodeSelector = map[string]string{gpuLabel: gpuType}
  1333. }
  1334. err := framework.RunRC(*config)
  1335. if err != nil {
  1336. return err
  1337. }
  1338. return nil
  1339. }
  1340. // Create an RC running a given number of pods with anti-affinity
  1341. func runAntiAffinityPods(f *framework.Framework, namespace string, pods int, id string, podLabels, antiAffinityLabels map[string]string) error {
  1342. config := &testutils.RCConfig{
  1343. Affinity: buildAntiAffinity(antiAffinityLabels),
  1344. Client: f.ClientSet,
  1345. Name: id,
  1346. Namespace: namespace,
  1347. Timeout: scaleUpTimeout,
  1348. Image: imageutils.GetPauseImageName(),
  1349. Replicas: pods,
  1350. Labels: podLabels,
  1351. }
  1352. err := framework.RunRC(*config)
  1353. if err != nil {
  1354. return err
  1355. }
  1356. _, err = f.ClientSet.CoreV1().ReplicationControllers(namespace).Get(id, metav1.GetOptions{})
  1357. if err != nil {
  1358. return err
  1359. }
  1360. return nil
  1361. }
  1362. func runVolumeAntiAffinityPods(f *framework.Framework, namespace string, pods int, id string, podLabels, antiAffinityLabels map[string]string, volumes []v1.Volume) error {
  1363. config := &testutils.RCConfig{
  1364. Affinity: buildAntiAffinity(antiAffinityLabels),
  1365. Volumes: volumes,
  1366. Client: f.ClientSet,
  1367. Name: id,
  1368. Namespace: namespace,
  1369. Timeout: scaleUpTimeout,
  1370. Image: imageutils.GetPauseImageName(),
  1371. Replicas: pods,
  1372. Labels: podLabels,
  1373. }
  1374. err := framework.RunRC(*config)
  1375. if err != nil {
  1376. return err
  1377. }
  1378. _, err = f.ClientSet.CoreV1().ReplicationControllers(namespace).Get(id, metav1.GetOptions{})
  1379. if err != nil {
  1380. return err
  1381. }
  1382. return nil
  1383. }
  1384. var emptyDirVolumes = []v1.Volume{
  1385. {
  1386. Name: "empty-volume",
  1387. VolumeSource: v1.VolumeSource{
  1388. EmptyDir: &v1.EmptyDirVolumeSource{},
  1389. },
  1390. },
  1391. }
  1392. func buildVolumes(pv *v1.PersistentVolume, pvc *v1.PersistentVolumeClaim) []v1.Volume {
  1393. return []v1.Volume{
  1394. {
  1395. Name: pv.Name,
  1396. VolumeSource: v1.VolumeSource{
  1397. PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
  1398. ClaimName: pvc.Name,
  1399. ReadOnly: false,
  1400. },
  1401. },
  1402. },
  1403. }
  1404. }
  1405. func buildAntiAffinity(labels map[string]string) *v1.Affinity {
  1406. return &v1.Affinity{
  1407. PodAntiAffinity: &v1.PodAntiAffinity{
  1408. RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
  1409. {
  1410. LabelSelector: &metav1.LabelSelector{
  1411. MatchLabels: labels,
  1412. },
  1413. TopologyKey: "kubernetes.io/hostname",
  1414. },
  1415. },
  1416. },
  1417. }
  1418. }
  1419. // Create an RC running a given number of pods on each node without adding any constraint forcing
  1420. // such pod distribution. This is meant to create a bunch of underutilized (but not unused) nodes
  1421. // with pods that can be rescheduled on different nodes.
  1422. // This is achieved using the following method:
  1423. // 1. disable scheduling on each node
  1424. // 2. create an empty RC
  1425. // 3. for each node:
  1426. // 3a. enable scheduling on that node
  1427. // 3b. increase number of replicas in RC by podsPerNode
  1428. func runReplicatedPodOnEachNode(f *framework.Framework, nodes []v1.Node, namespace string, podsPerNode int, id string, labels map[string]string, memRequest int64) error {
  1429. ginkgo.By("Run a pod on each node")
  1430. for _, node := range nodes {
  1431. err := makeNodeUnschedulable(f.ClientSet, &node)
  1432. defer func(n v1.Node) {
  1433. makeNodeSchedulable(f.ClientSet, &n, false)
  1434. }(node)
  1435. if err != nil {
  1436. return err
  1437. }
  1438. }
  1439. config := &testutils.RCConfig{
  1440. Client: f.ClientSet,
  1441. Name: id,
  1442. Namespace: namespace,
  1443. Timeout: defaultTimeout,
  1444. Image: imageutils.GetPauseImageName(),
  1445. Replicas: 0,
  1446. Labels: labels,
  1447. MemRequest: memRequest,
  1448. }
  1449. err := framework.RunRC(*config)
  1450. if err != nil {
  1451. return err
  1452. }
  1453. rc, err := f.ClientSet.CoreV1().ReplicationControllers(namespace).Get(id, metav1.GetOptions{})
  1454. if err != nil {
  1455. return err
  1456. }
  1457. for i, node := range nodes {
  1458. err = makeNodeSchedulable(f.ClientSet, &node, false)
  1459. if err != nil {
  1460. return err
  1461. }
  1462. // Update replicas count, to create new pods that will be allocated on node
  1463. // (we retry 409 errors in case rc reference got out of sync)
  1464. for j := 0; j < 3; j++ {
  1465. *rc.Spec.Replicas = int32((i + 1) * podsPerNode)
  1466. rc, err = f.ClientSet.CoreV1().ReplicationControllers(namespace).Update(rc)
  1467. if err == nil {
  1468. break
  1469. }
  1470. if !errors.IsConflict(err) {
  1471. return err
  1472. }
  1473. klog.Warningf("Got 409 conflict when trying to scale RC, retries left: %v", 3-j)
  1474. rc, err = f.ClientSet.CoreV1().ReplicationControllers(namespace).Get(id, metav1.GetOptions{})
  1475. if err != nil {
  1476. return err
  1477. }
  1478. }
  1479. err = wait.PollImmediate(5*time.Second, podTimeout, func() (bool, error) {
  1480. rc, err = f.ClientSet.CoreV1().ReplicationControllers(namespace).Get(id, metav1.GetOptions{})
  1481. if err != nil || rc.Status.ReadyReplicas < int32((i+1)*podsPerNode) {
  1482. return false, nil
  1483. }
  1484. return true, nil
  1485. })
  1486. if err != nil {
  1487. return fmt.Errorf("failed to coerce RC into spawning a pod on node %s within timeout", node.Name)
  1488. }
  1489. err = makeNodeUnschedulable(f.ClientSet, &node)
  1490. if err != nil {
  1491. return err
  1492. }
  1493. }
  1494. return nil
  1495. }
  1496. // Increase cluster size by newNodesForScaledownTests to create some unused nodes
  1497. // that can be later removed by cluster autoscaler.
  1498. func manuallyIncreaseClusterSize(f *framework.Framework, originalSizes map[string]int) int {
  1499. ginkgo.By("Manually increase cluster size")
  1500. increasedSize := 0
  1501. newSizes := make(map[string]int)
  1502. for key, val := range originalSizes {
  1503. newSizes[key] = val + newNodesForScaledownTests
  1504. increasedSize += val + newNodesForScaledownTests
  1505. }
  1506. setMigSizes(newSizes)
  1507. checkClusterSize := func(size int) bool {
  1508. if size >= increasedSize {
  1509. return true
  1510. }
  1511. resized := setMigSizes(newSizes)
  1512. if resized {
  1513. klog.Warning("Unexpected node group size while waiting for cluster resize. Setting size to target again.")
  1514. }
  1515. return false
  1516. }
  1517. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet, checkClusterSize, manualResizeTimeout))
  1518. return increasedSize
  1519. }
  1520. // Try to get clusterwide health from CA status configmap.
  1521. // Status configmap is not parsing-friendly, so evil regexpery follows.
  1522. func getClusterwideStatus(c clientset.Interface) (string, error) {
  1523. configMap, err := c.CoreV1().ConfigMaps("kube-system").Get("cluster-autoscaler-status", metav1.GetOptions{})
  1524. if err != nil {
  1525. return "", err
  1526. }
  1527. status, ok := configMap.Data["status"]
  1528. if !ok {
  1529. return "", fmt.Errorf("Status information not found in configmap")
  1530. }
  1531. matcher, err := regexp.Compile("Cluster-wide:\\s*\n\\s*Health:\\s*([A-Za-z]+)")
  1532. if err != nil {
  1533. return "", err
  1534. }
  1535. result := matcher.FindStringSubmatch(status)
  1536. if len(result) < 2 {
  1537. return "", fmt.Errorf("Failed to parse CA status configmap, raw status: %v", status)
  1538. }
  1539. return result[1], nil
  1540. }
  1541. type scaleUpStatus struct {
  1542. status string
  1543. ready int
  1544. target int
  1545. timestamp time.Time
  1546. }
  1547. // Try to get timestamp from status.
  1548. // Status configmap is not parsing-friendly, so evil regexpery follows.
  1549. func getStatusTimestamp(status string) (time.Time, error) {
  1550. timestampMatcher, err := regexp.Compile("Cluster-autoscaler status at \\s*([0-9\\-]+ [0-9]+:[0-9]+:[0-9]+\\.[0-9]+ \\+[0-9]+ [A-Za-z]+)")
  1551. if err != nil {
  1552. return time.Time{}, err
  1553. }
  1554. timestampMatch := timestampMatcher.FindStringSubmatch(status)
  1555. if len(timestampMatch) < 2 {
  1556. return time.Time{}, fmt.Errorf("Failed to parse CA status timestamp, raw status: %v", status)
  1557. }
  1558. timestamp, err := time.Parse(timestampFormat, timestampMatch[1])
  1559. if err != nil {
  1560. return time.Time{}, err
  1561. }
  1562. return timestamp, nil
  1563. }
  1564. // Try to get scaleup statuses of all node groups.
  1565. // Status configmap is not parsing-friendly, so evil regexpery follows.
  1566. func getScaleUpStatus(c clientset.Interface) (*scaleUpStatus, error) {
  1567. configMap, err := c.CoreV1().ConfigMaps("kube-system").Get("cluster-autoscaler-status", metav1.GetOptions{})
  1568. if err != nil {
  1569. return nil, err
  1570. }
  1571. status, ok := configMap.Data["status"]
  1572. if !ok {
  1573. return nil, fmt.Errorf("Status information not found in configmap")
  1574. }
  1575. timestamp, err := getStatusTimestamp(status)
  1576. if err != nil {
  1577. return nil, err
  1578. }
  1579. matcher, err := regexp.Compile("s*ScaleUp:\\s*([A-Za-z]+)\\s*\\(ready=([0-9]+)\\s*cloudProviderTarget=([0-9]+)\\s*\\)")
  1580. if err != nil {
  1581. return nil, err
  1582. }
  1583. matches := matcher.FindAllStringSubmatch(status, -1)
  1584. if len(matches) < 1 {
  1585. return nil, fmt.Errorf("Failed to parse CA status configmap, raw status: %v", status)
  1586. }
  1587. result := scaleUpStatus{
  1588. status: caNoScaleUpStatus,
  1589. ready: 0,
  1590. target: 0,
  1591. timestamp: timestamp,
  1592. }
  1593. for _, match := range matches {
  1594. if match[1] == caOngoingScaleUpStatus {
  1595. result.status = caOngoingScaleUpStatus
  1596. }
  1597. newReady, err := strconv.Atoi(match[2])
  1598. if err != nil {
  1599. return nil, err
  1600. }
  1601. result.ready += newReady
  1602. newTarget, err := strconv.Atoi(match[3])
  1603. if err != nil {
  1604. return nil, err
  1605. }
  1606. result.target += newTarget
  1607. }
  1608. klog.Infof("Cluster-Autoscaler scale-up status: %v (%v, %v)", result.status, result.ready, result.target)
  1609. return &result, nil
  1610. }
  1611. func waitForScaleUpStatus(c clientset.Interface, cond func(s *scaleUpStatus) bool, timeout time.Duration) (*scaleUpStatus, error) {
  1612. var finalErr error
  1613. var status *scaleUpStatus
  1614. err := wait.PollImmediate(5*time.Second, timeout, func() (bool, error) {
  1615. status, finalErr = getScaleUpStatus(c)
  1616. if finalErr != nil {
  1617. return false, nil
  1618. }
  1619. if status.timestamp.Add(freshStatusLimit).Before(time.Now()) {
  1620. // stale status
  1621. finalErr = fmt.Errorf("Status too old")
  1622. return false, nil
  1623. }
  1624. return cond(status), nil
  1625. })
  1626. if err != nil {
  1627. err = fmt.Errorf("Failed to find expected scale up status: %v, last status: %v, final err: %v", err, status, finalErr)
  1628. }
  1629. return status, err
  1630. }
  1631. // This is a temporary fix to allow CA to migrate some kube-system pods
  1632. // TODO: Remove this when the PDB is added for some of those components
  1633. func addKubeSystemPdbs(f *framework.Framework) (func(), error) {
  1634. ginkgo.By("Create PodDisruptionBudgets for kube-system components, so they can be migrated if required")
  1635. var newPdbs []string
  1636. cleanup := func() {
  1637. var finalErr error
  1638. for _, newPdbName := range newPdbs {
  1639. ginkgo.By(fmt.Sprintf("Delete PodDisruptionBudget %v", newPdbName))
  1640. err := f.ClientSet.PolicyV1beta1().PodDisruptionBudgets("kube-system").Delete(newPdbName, &metav1.DeleteOptions{})
  1641. if err != nil {
  1642. // log error, but attempt to remove other pdbs
  1643. klog.Errorf("Failed to delete PodDisruptionBudget %v, err: %v", newPdbName, err)
  1644. finalErr = err
  1645. }
  1646. }
  1647. if finalErr != nil {
  1648. framework.Failf("Error during PodDisruptionBudget cleanup: %v", finalErr)
  1649. }
  1650. }
  1651. type pdbInfo struct {
  1652. label string
  1653. minAvailable int
  1654. }
  1655. pdbsToAdd := []pdbInfo{
  1656. {label: "kube-dns", minAvailable: 1},
  1657. {label: "kube-dns-autoscaler", minAvailable: 0},
  1658. {label: "metrics-server", minAvailable: 0},
  1659. {label: "kubernetes-dashboard", minAvailable: 0},
  1660. {label: "glbc", minAvailable: 0},
  1661. }
  1662. for _, pdbData := range pdbsToAdd {
  1663. ginkgo.By(fmt.Sprintf("Create PodDisruptionBudget for %v", pdbData.label))
  1664. labelMap := map[string]string{"k8s-app": pdbData.label}
  1665. pdbName := fmt.Sprintf("test-pdb-for-%v", pdbData.label)
  1666. minAvailable := intstr.FromInt(pdbData.minAvailable)
  1667. pdb := &policy.PodDisruptionBudget{
  1668. ObjectMeta: metav1.ObjectMeta{
  1669. Name: pdbName,
  1670. Namespace: "kube-system",
  1671. },
  1672. Spec: policy.PodDisruptionBudgetSpec{
  1673. Selector: &metav1.LabelSelector{MatchLabels: labelMap},
  1674. MinAvailable: &minAvailable,
  1675. },
  1676. }
  1677. _, err := f.ClientSet.PolicyV1beta1().PodDisruptionBudgets("kube-system").Create(pdb)
  1678. newPdbs = append(newPdbs, pdbName)
  1679. if err != nil {
  1680. return cleanup, err
  1681. }
  1682. }
  1683. return cleanup, nil
  1684. }
  1685. func createPriorityClasses(f *framework.Framework) func() {
  1686. priorityClasses := map[string]int32{
  1687. expendablePriorityClassName: -15,
  1688. highPriorityClassName: 1000,
  1689. }
  1690. for className, priority := range priorityClasses {
  1691. _, err := f.ClientSet.SchedulingV1().PriorityClasses().Create(&schedulerapi.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: className}, Value: priority})
  1692. if err != nil {
  1693. klog.Errorf("Error creating priority class: %v", err)
  1694. }
  1695. gomega.Expect(err == nil || errors.IsAlreadyExists(err)).To(gomega.Equal(true))
  1696. }
  1697. return func() {
  1698. for className := range priorityClasses {
  1699. err := f.ClientSet.SchedulingV1().PriorityClasses().Delete(className, nil)
  1700. if err != nil {
  1701. klog.Errorf("Error deleting priority class: %v", err)
  1702. }
  1703. }
  1704. }
  1705. }