util.go 70 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970
  1. /*
  2. Copyright 2014 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package framework
  14. import (
  15. "bytes"
  16. "context"
  17. "encoding/json"
  18. "fmt"
  19. "io"
  20. "io/ioutil"
  21. "math/rand"
  22. "net"
  23. "net/http"
  24. "net/url"
  25. "os"
  26. "os/exec"
  27. "path"
  28. "sort"
  29. "strconv"
  30. "strings"
  31. "sync"
  32. "syscall"
  33. "time"
  34. "golang.org/x/net/websocket"
  35. "k8s.io/klog"
  36. "github.com/onsi/ginkgo"
  37. "github.com/onsi/gomega"
  38. gomegatypes "github.com/onsi/gomega/types"
  39. appsv1 "k8s.io/api/apps/v1"
  40. v1 "k8s.io/api/core/v1"
  41. apierrors "k8s.io/apimachinery/pkg/api/errors"
  42. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  43. "k8s.io/apimachinery/pkg/fields"
  44. "k8s.io/apimachinery/pkg/labels"
  45. "k8s.io/apimachinery/pkg/runtime"
  46. "k8s.io/apimachinery/pkg/runtime/schema"
  47. "k8s.io/apimachinery/pkg/util/sets"
  48. "k8s.io/apimachinery/pkg/util/uuid"
  49. "k8s.io/apimachinery/pkg/util/wait"
  50. utilyaml "k8s.io/apimachinery/pkg/util/yaml"
  51. clientset "k8s.io/client-go/kubernetes"
  52. "k8s.io/client-go/kubernetes/scheme"
  53. "k8s.io/client-go/rest"
  54. restclient "k8s.io/client-go/rest"
  55. scaleclient "k8s.io/client-go/scale"
  56. "k8s.io/client-go/tools/clientcmd"
  57. clientcmdapi "k8s.io/client-go/tools/clientcmd/api"
  58. watchtools "k8s.io/client-go/tools/watch"
  59. podutil "k8s.io/kubernetes/pkg/api/v1/pod"
  60. "k8s.io/kubernetes/pkg/client/conditions"
  61. "k8s.io/kubernetes/pkg/controller"
  62. "k8s.io/kubernetes/pkg/master/ports"
  63. taintutils "k8s.io/kubernetes/pkg/util/taints"
  64. testutils "k8s.io/kubernetes/test/utils"
  65. imageutils "k8s.io/kubernetes/test/utils/image"
  66. uexec "k8s.io/utils/exec"
  67. // TODO: Remove the following imports (ref: https://github.com/kubernetes/kubernetes/issues/81245)
  68. e2ekubectl "k8s.io/kubernetes/test/e2e/framework/kubectl"
  69. e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
  70. e2enode "k8s.io/kubernetes/test/e2e/framework/node"
  71. e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
  72. e2eresource "k8s.io/kubernetes/test/e2e/framework/resource"
  73. e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
  74. )
  75. const (
  76. // PodListTimeout is how long to wait for the pod to be listable.
  77. PodListTimeout = time.Minute
  78. // PodStartTimeout is how long to wait for the pod to be started.
  79. PodStartTimeout = 5 * time.Minute
  80. // PodStartShortTimeout is the same as `PodStartTimeout` (how long to wait for the pod to be started), but shorter.
  81. // Use it case by case, only when we are sure pod start will not be delayed
  82. // for minutes by slow docker pulls or something else.
  83. PodStartShortTimeout = 2 * time.Minute
  84. // PodDeleteTimeout is how long to wait for a pod to be deleted.
  85. PodDeleteTimeout = 5 * time.Minute
  86. // PodGetTimeout is how long to wait for a pod to be got.
  87. PodGetTimeout = 2 * time.Minute
  88. // PodEventTimeout is how long we wait for a pod event to occur.
  89. PodEventTimeout = 2 * time.Minute
  90. // ServiceStartTimeout is how long to wait for a service endpoint to be resolvable.
  91. ServiceStartTimeout = 3 * time.Minute
  92. // Poll is how often to poll pods, nodes and claims.
  93. Poll = 2 * time.Second
  94. // PollShortTimeout is the short timeout value in polling.
  95. PollShortTimeout = 1 * time.Minute
  96. // ServiceAccountProvisionTimeout is how long to wait for a service account to be provisioned.
  97. // Service accounts are provisioned after namespace creation;
  98. // a service account is required to support pod creation in a namespace as part of admission control.
  99. ServiceAccountProvisionTimeout = 2 * time.Minute
  100. // SingleCallTimeout is how long to try single API calls (like 'get' or 'list'). Used to prevent
  101. // transient failures from failing tests.
  102. SingleCallTimeout = 5 * time.Minute
  103. // NodeReadyInitialTimeout is how long nodes have to be "ready" when a test begins. They should already
  104. // be "ready" before the test starts, so this is small.
  105. NodeReadyInitialTimeout = 20 * time.Second
  106. // PodReadyBeforeTimeout is how long pods have to be "ready" when a test begins.
  107. PodReadyBeforeTimeout = 5 * time.Minute
  108. // ClaimProvisionShortTimeout is the same as `ClaimProvisionTimeout` (how long to wait for a claim to be dynamically provisioned), but shorter.
  109. // Use it case by case when we are sure this timeout is enough.
  110. ClaimProvisionShortTimeout = 1 * time.Minute
  111. // ClaimProvisionTimeout is how long claims have to become dynamically provisioned.
  112. ClaimProvisionTimeout = 5 * time.Minute
  113. // RestartNodeReadyAgainTimeout is how long a node is allowed to become "Ready" after it is restarted before
  114. // the test is considered failed.
  115. RestartNodeReadyAgainTimeout = 5 * time.Minute
  116. // RestartPodReadyAgainTimeout is how long a pod is allowed to become "running" and "ready" after a node
  117. // restart before test is considered failed.
  118. RestartPodReadyAgainTimeout = 5 * time.Minute
  119. // SnapshotCreateTimeout is how long to wait for a snapshot to create its snapshotContent.
  120. SnapshotCreateTimeout = 5 * time.Minute
  121. // gcThroughput is the number of objects that gc can delete in a second.
  122. // GC issues 2 requests for a single delete.
  123. gcThroughput = 10
  124. // largeClusterThreshold is the minimal number of nodes for the cluster to be considered large.
  125. largeClusterThreshold = 100
  126. // TODO(justinsb): Avoid hardcoding this.
  127. awsMasterIP = "172.20.0.9"
  128. // sshPort is the ssh port, kept as a string.
  129. sshPort = "22"
  130. )
  131. var (
  132. // BusyBoxImage is the image URI of BusyBox.
  133. BusyBoxImage = imageutils.GetE2EImage(imageutils.BusyBox)
  134. // AgnHostImage is the image URI of AgnHost.
  135. AgnHostImage = imageutils.GetE2EImage(imageutils.Agnhost)
  136. // ProvidersWithSSH are those providers where each node is accessible with SSH.
  137. ProvidersWithSSH = []string{"gce", "gke", "aws", "local"}
  138. // ServeHostnameImage is a serve hostname image name; it resolves to the same Agnhost image as AgnHostImage.
  139. ServeHostnameImage = imageutils.GetE2EImage(imageutils.Agnhost)
  140. )
  141. // RunID is a unique identifier of the e2e run, generated once via uuid.NewUUID() when the package is initialized.
  142. // Beware that this ID is not the same for all tests in the e2e run, because each Ginkgo node creates it separately.
  143. var RunID = uuid.NewUUID()
  144. // CreateTestingNSFn is the signature of a function responsible for creating the namespace used for executing e2e tests; CreateTestingNS in this file has this signature.
  145. type CreateTestingNSFn func(baseName string, c clientset.Interface, labels map[string]string) (*v1.Namespace, error)
  146. // GetMasterHost returns a hostname of a master.
  147. func GetMasterHost() string {
  148. masterURL, err := url.Parse(TestContext.Host)
  149. ExpectNoError(err)
  150. return masterURL.Hostname()
  151. }
  152. // ProviderIs returns true if the provider is included is the providers. Otherwise false.
  153. func ProviderIs(providers ...string) bool {
  154. for _, provider := range providers {
  155. if strings.EqualFold(provider, TestContext.Provider) {
  156. return true
  157. }
  158. }
  159. return false
  160. }
  161. // MasterOSDistroIs returns true if the master OS distro is included in the supportedMasterOsDistros. Otherwise false.
  162. func MasterOSDistroIs(supportedMasterOsDistros ...string) bool {
  163. for _, distro := range supportedMasterOsDistros {
  164. if strings.EqualFold(distro, TestContext.MasterOSDistro) {
  165. return true
  166. }
  167. }
  168. return false
  169. }
  170. // NodeOSDistroIs returns true if the node OS distro is included in the supportedNodeOsDistros. Otherwise false.
  171. func NodeOSDistroIs(supportedNodeOsDistros ...string) bool {
  172. for _, distro := range supportedNodeOsDistros {
  173. if strings.EqualFold(distro, TestContext.NodeOSDistro) {
  174. return true
  175. }
  176. }
  177. return false
  178. }
  179. // DeleteNamespaces deletes all namespaces that match the given delete and skip filters.
  180. // Filter is by simple strings.Contains; first skip filter, then delete filter.
  181. // Returns the list of deleted namespaces or an error.
  182. func DeleteNamespaces(c clientset.Interface, deleteFilter, skipFilter []string) ([]string, error) {
  183. ginkgo.By("Deleting namespaces")
  184. nsList, err := c.CoreV1().Namespaces().List(context.TODO(), metav1.ListOptions{})
  185. ExpectNoError(err, "Failed to get namespace list")
  186. var deleted []string
  187. var wg sync.WaitGroup
  188. OUTER:
  189. for _, item := range nsList.Items {
  190. if skipFilter != nil {
  191. for _, pattern := range skipFilter {
  192. if strings.Contains(item.Name, pattern) {
  193. continue OUTER
  194. }
  195. }
  196. }
  197. if deleteFilter != nil {
  198. var shouldDelete bool
  199. for _, pattern := range deleteFilter {
  200. if strings.Contains(item.Name, pattern) {
  201. shouldDelete = true
  202. break
  203. }
  204. }
  205. if !shouldDelete {
  206. continue OUTER
  207. }
  208. }
  209. wg.Add(1)
  210. deleted = append(deleted, item.Name)
  211. go func(nsName string) {
  212. defer wg.Done()
  213. defer ginkgo.GinkgoRecover()
  214. gomega.Expect(c.CoreV1().Namespaces().Delete(context.TODO(), nsName, nil)).To(gomega.Succeed())
  215. Logf("namespace : %v api call to delete is complete ", nsName)
  216. }(item.Name)
  217. }
  218. wg.Wait()
  219. return deleted, nil
  220. }
  221. // WaitForNamespacesDeleted waits for the namespaces to be deleted.
  222. func WaitForNamespacesDeleted(c clientset.Interface, namespaces []string, timeout time.Duration) error {
  223. ginkgo.By("Waiting for namespaces to vanish")
  224. nsMap := map[string]bool{}
  225. for _, ns := range namespaces {
  226. nsMap[ns] = true
  227. }
  228. //Now POLL until all namespaces have been eradicated.
  229. return wait.Poll(2*time.Second, timeout,
  230. func() (bool, error) {
  231. nsList, err := c.CoreV1().Namespaces().List(context.TODO(), metav1.ListOptions{})
  232. if err != nil {
  233. return false, err
  234. }
  235. for _, item := range nsList.Items {
  236. if _, ok := nsMap[item.Name]; ok {
  237. return false, nil
  238. }
  239. }
  240. return true, nil
  241. })
  242. }
  243. func waitForServiceAccountInNamespace(c clientset.Interface, ns, serviceAccountName string, timeout time.Duration) error {
  244. w, err := c.CoreV1().ServiceAccounts(ns).Watch(context.TODO(), metav1.SingleObject(metav1.ObjectMeta{Name: serviceAccountName}))
  245. if err != nil {
  246. return err
  247. }
  248. ctx, cancel := watchtools.ContextWithOptionalTimeout(context.Background(), timeout)
  249. defer cancel()
  250. _, err = watchtools.UntilWithoutRetry(ctx, w, conditions.ServiceAccountHasSecrets)
  251. return err
  252. }
  253. // WaitForDefaultServiceAccountInNamespace waits for the default service account to be provisioned
  254. // the default service account is what is associated with pods when they do not specify a service account
  255. // as a result, pods are not able to be provisioned in a namespace until the service account is provisioned
  256. func WaitForDefaultServiceAccountInNamespace(c clientset.Interface, namespace string) error {
  257. return waitForServiceAccountInNamespace(c, namespace, "default", ServiceAccountProvisionTimeout)
  258. }
  259. // WaitForPersistentVolumeDeleted waits for a PersistentVolume to get deleted or until timeout occurs, whichever comes first.
  260. func WaitForPersistentVolumeDeleted(c clientset.Interface, pvName string, Poll, timeout time.Duration) error {
  261. Logf("Waiting up to %v for PersistentVolume %s to get deleted", timeout, pvName)
  262. for start := time.Now(); time.Since(start) < timeout; time.Sleep(Poll) {
  263. pv, err := c.CoreV1().PersistentVolumes().Get(context.TODO(), pvName, metav1.GetOptions{})
  264. if err == nil {
  265. Logf("PersistentVolume %s found and phase=%s (%v)", pvName, pv.Status.Phase, time.Since(start))
  266. continue
  267. }
  268. if apierrors.IsNotFound(err) {
  269. Logf("PersistentVolume %s was removed", pvName)
  270. return nil
  271. }
  272. Logf("Get persistent volume %s in failed, ignoring for %v: %v", pvName, Poll, err)
  273. }
  274. return fmt.Errorf("PersistentVolume %s still exists within %v", pvName, timeout)
  275. }
  276. // findAvailableNamespaceName random namespace name starting with baseName.
  277. func findAvailableNamespaceName(baseName string, c clientset.Interface) (string, error) {
  278. var name string
  279. err := wait.PollImmediate(Poll, 30*time.Second, func() (bool, error) {
  280. name = fmt.Sprintf("%v-%v", baseName, RandomSuffix())
  281. _, err := c.CoreV1().Namespaces().Get(context.TODO(), name, metav1.GetOptions{})
  282. if err == nil {
  283. // Already taken
  284. return false, nil
  285. }
  286. if apierrors.IsNotFound(err) {
  287. return true, nil
  288. }
  289. Logf("Unexpected error while getting namespace: %v", err)
  290. return false, nil
  291. })
  292. return name, err
  293. }
  294. // CreateTestingNS should be used by every test, note that we append a common prefix to the provided test name.
  295. // Please see NewFramework instead of using this directly.
  296. func CreateTestingNS(baseName string, c clientset.Interface, labels map[string]string) (*v1.Namespace, error) {
  297. if labels == nil {
  298. labels = map[string]string{}
  299. }
  300. labels["e2e-run"] = string(RunID)
  301. // We don't use ObjectMeta.GenerateName feature, as in case of API call
  302. // failure we don't know whether the namespace was created and what is its
  303. // name.
  304. name, err := findAvailableNamespaceName(baseName, c)
  305. if err != nil {
  306. return nil, err
  307. }
  308. namespaceObj := &v1.Namespace{
  309. ObjectMeta: metav1.ObjectMeta{
  310. Name: name,
  311. Namespace: "",
  312. Labels: labels,
  313. },
  314. Status: v1.NamespaceStatus{},
  315. }
  316. // Be robust about making the namespace creation call.
  317. var got *v1.Namespace
  318. if err := wait.PollImmediate(Poll, 30*time.Second, func() (bool, error) {
  319. var err error
  320. got, err = c.CoreV1().Namespaces().Create(context.TODO(), namespaceObj, metav1.CreateOptions{})
  321. if err != nil {
  322. Logf("Unexpected error while creating namespace: %v", err)
  323. return false, nil
  324. }
  325. return true, nil
  326. }); err != nil {
  327. return nil, err
  328. }
  329. if TestContext.VerifyServiceAccount {
  330. if err := WaitForDefaultServiceAccountInNamespace(c, got.Name); err != nil {
  331. // Even if we fail to create serviceAccount in the namespace,
  332. // we have successfully create a namespace.
  333. // So, return the created namespace.
  334. return got, err
  335. }
  336. }
  337. return got, nil
  338. }
  339. // CheckTestingNSDeletedExcept checks whether all e2e based existing namespaces are in the Terminating state
  340. // and waits until they are finally deleted. It ignores namespace skip.
  341. func CheckTestingNSDeletedExcept(c clientset.Interface, skip string) error {
  342. // TODO: Since we don't have support for bulk resource deletion in the API,
  343. // while deleting a namespace we are deleting all objects from that namespace
  344. // one by one (one deletion == one API call). This basically exposes us to
  345. // throttling - currently controller-manager has a limit of max 20 QPS.
  346. // Once #10217 is implemented and used in namespace-controller, deleting all
  347. // object from a given namespace should be much faster and we will be able
  348. // to lower this timeout.
  349. // However, now Density test is producing ~26000 events and Load capacity test
  350. // is producing ~35000 events, thus assuming there are no other requests it will
  351. // take ~30 minutes to fully delete the namespace. Thus I'm setting it to 60
  352. // minutes to avoid any timeouts here.
  353. timeout := 60 * time.Minute
  354. Logf("Waiting for terminating namespaces to be deleted...")
  355. for start := time.Now(); time.Since(start) < timeout; time.Sleep(15 * time.Second) {
  356. namespaces, err := c.CoreV1().Namespaces().List(context.TODO(), metav1.ListOptions{})
  357. if err != nil {
  358. Logf("Listing namespaces failed: %v", err)
  359. continue
  360. }
  361. terminating := 0
  362. for _, ns := range namespaces.Items {
  363. if strings.HasPrefix(ns.ObjectMeta.Name, "e2e-tests-") && ns.ObjectMeta.Name != skip {
  364. if ns.Status.Phase == v1.NamespaceActive {
  365. return fmt.Errorf("Namespace %s is active", ns.ObjectMeta.Name)
  366. }
  367. terminating++
  368. }
  369. }
  370. if terminating == 0 {
  371. return nil
  372. }
  373. }
  374. return fmt.Errorf("Waiting for terminating namespaces to be deleted timed out")
  375. }
  376. // WaitForService waits until the service appears (exist == true), or disappears (exist == false)
  377. func WaitForService(c clientset.Interface, namespace, name string, exist bool, interval, timeout time.Duration) error {
  378. err := wait.PollImmediate(interval, timeout, func() (bool, error) {
  379. _, err := c.CoreV1().Services(namespace).Get(context.TODO(), name, metav1.GetOptions{})
  380. switch {
  381. case err == nil:
  382. Logf("Service %s in namespace %s found.", name, namespace)
  383. return exist, nil
  384. case apierrors.IsNotFound(err):
  385. Logf("Service %s in namespace %s disappeared.", name, namespace)
  386. return !exist, nil
  387. case !testutils.IsRetryableAPIError(err):
  388. Logf("Non-retryable failure while getting service.")
  389. return false, err
  390. default:
  391. Logf("Get service %s in namespace %s failed: %v", name, namespace, err)
  392. return false, nil
  393. }
  394. })
  395. if err != nil {
  396. stateMsg := map[bool]string{true: "to appear", false: "to disappear"}
  397. return fmt.Errorf("error waiting for service %s/%s %s: %v", namespace, name, stateMsg[exist], err)
  398. }
  399. return nil
  400. }
  401. //WaitForServiceEndpointsNum waits until the amount of endpoints that implement service to expectNum.
  402. func WaitForServiceEndpointsNum(c clientset.Interface, namespace, serviceName string, expectNum int, interval, timeout time.Duration) error {
  403. return wait.Poll(interval, timeout, func() (bool, error) {
  404. Logf("Waiting for amount of service:%s endpoints to be %d", serviceName, expectNum)
  405. list, err := c.CoreV1().Endpoints(namespace).List(context.TODO(), metav1.ListOptions{})
  406. if err != nil {
  407. return false, err
  408. }
  409. for _, e := range list.Items {
  410. if e.Name == serviceName && countEndpointsNum(&e) == expectNum {
  411. return true, nil
  412. }
  413. }
  414. return false, nil
  415. })
  416. }
  417. func countEndpointsNum(e *v1.Endpoints) int {
  418. num := 0
  419. for _, sub := range e.Subsets {
  420. num += len(sub.Addresses)
  421. }
  422. return num
  423. }
  424. // restclientConfig returns a config holds the information needed to build connection to kubernetes clusters.
  425. func restclientConfig(kubeContext string) (*clientcmdapi.Config, error) {
  426. Logf(">>> kubeConfig: %s", TestContext.KubeConfig)
  427. if TestContext.KubeConfig == "" {
  428. return nil, fmt.Errorf("KubeConfig must be specified to load client config")
  429. }
  430. c, err := clientcmd.LoadFromFile(TestContext.KubeConfig)
  431. if err != nil {
  432. return nil, fmt.Errorf("error loading KubeConfig: %v", err.Error())
  433. }
  434. if kubeContext != "" {
  435. Logf(">>> kubeContext: %s", kubeContext)
  436. c.CurrentContext = kubeContext
  437. }
  438. return c, nil
  439. }
  440. // ClientConfigGetter is a func that returns a rest client config, or an error; LoadConfig in this file has this signature.
  441. type ClientConfigGetter func() (*restclient.Config, error)
  442. // LoadConfig returns a config for a rest client with the UserAgent set to include the current test name.
  443. func LoadConfig() (config *restclient.Config, err error) {
  444. defer func() {
  445. if err == nil && config != nil {
  446. testDesc := ginkgo.CurrentGinkgoTestDescription()
  447. if len(testDesc.ComponentTexts) > 0 {
  448. componentTexts := strings.Join(testDesc.ComponentTexts, " ")
  449. config.UserAgent = fmt.Sprintf("%s -- %s", rest.DefaultKubernetesUserAgent(), componentTexts)
  450. }
  451. }
  452. }()
  453. if TestContext.NodeE2E {
  454. // This is a node e2e test, apply the node e2e configuration
  455. return &restclient.Config{Host: TestContext.Host}, nil
  456. }
  457. c, err := restclientConfig(TestContext.KubeContext)
  458. if err != nil {
  459. if TestContext.KubeConfig == "" {
  460. return restclient.InClusterConfig()
  461. }
  462. return nil, err
  463. }
  464. // In case Host is not set in TestContext, sets it as
  465. // CurrentContext Server for k8s API client to connect to.
  466. if TestContext.Host == "" && c.Clusters != nil {
  467. currentContext, ok := c.Clusters[c.CurrentContext]
  468. if ok {
  469. TestContext.Host = currentContext.Server
  470. }
  471. }
  472. return clientcmd.NewDefaultClientConfig(*c, &clientcmd.ConfigOverrides{ClusterInfo: clientcmdapi.Cluster{Server: TestContext.Host}}).ClientConfig()
  473. }
  474. // LoadClientset returns clientset for connecting to kubernetes clusters.
  475. func LoadClientset() (*clientset.Clientset, error) {
  476. config, err := LoadConfig()
  477. if err != nil {
  478. return nil, fmt.Errorf("error creating client: %v", err.Error())
  479. }
  480. return clientset.NewForConfig(config)
  481. }
  482. // RandomSuffix provides a random sequence to append to pods,services,rcs.
  483. func RandomSuffix() string {
  484. return strconv.Itoa(rand.Intn(10000))
  485. }
  486. // Cleanup stops everything from filePath from namespace ns and checks if everything matching selectors from the given namespace is correctly stopped.
  487. func Cleanup(filePath, ns string, selectors ...string) {
  488. ginkgo.By("using delete to clean up resources")
  489. var nsArg string
  490. if ns != "" {
  491. nsArg = fmt.Sprintf("--namespace=%s", ns)
  492. }
  493. RunKubectlOrDie(ns, "delete", "--grace-period=0", "-f", filePath, nsArg)
  494. AssertCleanup(ns, selectors...)
  495. }
  496. // AssertCleanup asserts that cleanup of a namespace wrt selectors occurred.
  497. func AssertCleanup(ns string, selectors ...string) {
  498. var nsArg string
  499. if ns != "" {
  500. nsArg = fmt.Sprintf("--namespace=%s", ns)
  501. }
  502. var e error
  503. verifyCleanupFunc := func() (bool, error) {
  504. e = nil
  505. for _, selector := range selectors {
  506. resources := RunKubectlOrDie(ns, "get", "rc,svc", "-l", selector, "--no-headers", nsArg)
  507. if resources != "" {
  508. e = fmt.Errorf("Resources left running after stop:\n%s", resources)
  509. return false, nil
  510. }
  511. pods := RunKubectlOrDie(ns, "get", "pods", "-l", selector, nsArg, "-o", "go-template={{ range .items }}{{ if not .metadata.deletionTimestamp }}{{ .metadata.name }}{{ \"\\n\" }}{{ end }}{{ end }}")
  512. if pods != "" {
  513. e = fmt.Errorf("Pods left unterminated after stop:\n%s", pods)
  514. return false, nil
  515. }
  516. }
  517. return true, nil
  518. }
  519. err := wait.PollImmediate(500*time.Millisecond, 1*time.Minute, verifyCleanupFunc)
  520. if err != nil {
  521. Failf(e.Error())
  522. }
  523. }
  524. // LookForStringInPodExec looks for the given string in the output of a command
  525. // executed in a specific pod container.
  526. // TODO(alejandrox1): move to pod/ subpkg once kubectl methods are refactored.
  527. func LookForStringInPodExec(ns, podName string, command []string, expectedString string, timeout time.Duration) (result string, err error) {
  528. return lookForString(expectedString, timeout, func() string {
  529. // use the first container
  530. args := []string{"exec", podName, fmt.Sprintf("--namespace=%v", ns), "--"}
  531. args = append(args, command...)
  532. return RunKubectlOrDie(ns, args...)
  533. })
  534. }
  535. // lookForString looks for the given string in the output of fn, repeatedly calling fn until
  536. // the timeout is reached or the string is found. Returns last log and possibly
  537. // error if the string was not found.
  538. // TODO(alejandrox1): move to pod/ subpkg once kubectl methods are refactored.
  539. func lookForString(expectedString string, timeout time.Duration, fn func() string) (result string, err error) {
  540. for t := time.Now(); time.Since(t) < timeout; time.Sleep(Poll) {
  541. result = fn()
  542. if strings.Contains(result, expectedString) {
  543. return
  544. }
  545. }
  546. err = fmt.Errorf("Failed to find \"%s\", last result: \"%s\"", expectedString, result)
  547. return
  548. }
  // KubectlBuilder assembles a kubectl invocation step by step and then runs it.
  // Add further With* methods whenever more customization is needed.
  type KubectlBuilder struct {
  	// cmd is the command that Exec eventually starts.
  	cmd *exec.Cmd
  	// timeout, when it fires while the command is running, makes Exec kill
  	// the process and return an error. A nil channel never fires.
  	timeout <-chan time.Time
  }
  555. // NewKubectlCommand returns a KubectlBuilder for running kubectl.
  556. func NewKubectlCommand(namespace string, args ...string) *KubectlBuilder {
  557. b := new(KubectlBuilder)
  558. tk := e2ekubectl.NewTestKubeconfig(TestContext.CertDir, TestContext.Host, TestContext.KubeConfig, TestContext.KubeContext, TestContext.KubectlPath, namespace)
  559. b.cmd = tk.KubectlCmd(args...)
  560. return b
  561. }
  562. // WithEnv sets the given environment and returns itself.
  563. func (b *KubectlBuilder) WithEnv(env []string) *KubectlBuilder {
  564. b.cmd.Env = env
  565. return b
  566. }
  567. // WithTimeout sets the given timeout and returns itself.
  568. func (b *KubectlBuilder) WithTimeout(t <-chan time.Time) *KubectlBuilder {
  569. b.timeout = t
  570. return b
  571. }
  572. // WithStdinData sets the given data to stdin and returns itself.
  573. func (b KubectlBuilder) WithStdinData(data string) *KubectlBuilder {
  574. b.cmd.Stdin = strings.NewReader(data)
  575. return &b
  576. }
  577. // WithStdinReader sets the given reader and returns itself.
  578. func (b KubectlBuilder) WithStdinReader(reader io.Reader) *KubectlBuilder {
  579. b.cmd.Stdin = reader
  580. return &b
  581. }
  582. // ExecOrDie runs the kubectl executable or dies if error occurs.
  583. func (b KubectlBuilder) ExecOrDie(namespace string) string {
  584. str, err := b.Exec()
  585. // In case of i/o timeout error, try talking to the apiserver again after 2s before dying.
  586. // Note that we're still dying after retrying so that we can get visibility to triage it further.
  587. if isTimeout(err) {
  588. Logf("Hit i/o timeout error, talking to the server 2s later to see if it's temporary.")
  589. time.Sleep(2 * time.Second)
  590. retryStr, retryErr := RunKubectl(namespace, "version")
  591. Logf("stdout: %q", retryStr)
  592. Logf("err: %v", retryErr)
  593. }
  594. ExpectNoError(err)
  595. return str
  596. }
  // isTimeout reports whether err is a network timeout. A *url.Error counts only
  // when the error it wraps is a net.Error whose Timeout method returns true;
  // any other error counts when it is itself such a net.Error.
  func isTimeout(err error) bool {
  	// *url.Error must be examined first: it implements net.Error itself, yet
  	// only its wrapped error decides the outcome here.
  	if urlErr, isURLErr := err.(*url.Error); isURLErr {
  		inner, isNetErr := urlErr.Err.(net.Error)
  		return isNetErr && inner.Timeout()
  	}
  	if netErr, isNetErr := err.(net.Error); isNetErr {
  		return netErr.Timeout()
  	}
  	return false
  }
  610. // Exec runs the kubectl executable.
  611. func (b KubectlBuilder) Exec() (string, error) {
  612. var stdout, stderr bytes.Buffer
  613. cmd := b.cmd
  614. cmd.Stdout, cmd.Stderr = &stdout, &stderr
  615. Logf("Running '%s %s'", cmd.Path, strings.Join(cmd.Args[1:], " ")) // skip arg[0] as it is printed separately
  616. if err := cmd.Start(); err != nil {
  617. return "", fmt.Errorf("error starting %v:\nCommand stdout:\n%v\nstderr:\n%v\nerror:\n%v", cmd, cmd.Stdout, cmd.Stderr, err)
  618. }
  619. errCh := make(chan error, 1)
  620. go func() {
  621. errCh <- cmd.Wait()
  622. }()
  623. select {
  624. case err := <-errCh:
  625. if err != nil {
  626. var rc = 127
  627. if ee, ok := err.(*exec.ExitError); ok {
  628. rc = int(ee.Sys().(syscall.WaitStatus).ExitStatus())
  629. Logf("rc: %d", rc)
  630. }
  631. return "", uexec.CodeExitError{
  632. Err: fmt.Errorf("error running %v:\nCommand stdout:\n%v\nstderr:\n%v\nerror:\n%v", cmd, cmd.Stdout, cmd.Stderr, err),
  633. Code: rc,
  634. }
  635. }
  636. case <-b.timeout:
  637. b.cmd.Process.Kill()
  638. return "", fmt.Errorf("timed out waiting for command %v:\nCommand stdout:\n%v\nstderr:\n%v", cmd, cmd.Stdout, cmd.Stderr)
  639. }
  640. Logf("stderr: %q", stderr.String())
  641. Logf("stdout: %q", stdout.String())
  642. return stdout.String(), nil
  643. }
  644. // RunKubectlOrDie is a convenience wrapper over kubectlBuilder
  645. func RunKubectlOrDie(namespace string, args ...string) string {
  646. return NewKubectlCommand(namespace, args...).ExecOrDie(namespace)
  647. }
  648. // RunKubectl is a convenience wrapper over kubectlBuilder
  649. func RunKubectl(namespace string, args ...string) (string, error) {
  650. return NewKubectlCommand(namespace, args...).Exec()
  651. }
  652. // RunKubectlOrDieInput is a convenience wrapper over kubectlBuilder that takes input to stdin
  653. func RunKubectlOrDieInput(namespace string, data string, args ...string) string {
  654. return NewKubectlCommand(namespace, args...).WithStdinData(data).ExecOrDie(namespace)
  655. }
  656. // RunKubectlInput is a convenience wrapper over kubectlBuilder that takes input to stdin
  657. func RunKubectlInput(namespace string, data string, args ...string) (string, error) {
  658. return NewKubectlCommand(namespace, args...).WithStdinData(data).Exec()
  659. }
  660. // RunKubemciWithKubeconfig is a convenience wrapper over RunKubemciCmd
  661. func RunKubemciWithKubeconfig(args ...string) (string, error) {
  662. if TestContext.KubeConfig != "" {
  663. args = append(args, "--"+clientcmd.RecommendedConfigPathFlag+"="+TestContext.KubeConfig)
  664. }
  665. return RunKubemciCmd(args...)
  666. }
  667. // RunKubemciCmd is a convenience wrapper over kubectlBuilder to run kubemci.
  668. // It assumes that kubemci exists in PATH.
  669. func RunKubemciCmd(args ...string) (string, error) {
  670. // kubemci is assumed to be in PATH.
  671. kubemci := "kubemci"
  672. b := new(KubectlBuilder)
  673. args = append(args, "--gcp-project="+TestContext.CloudConfig.ProjectID)
  674. b.cmd = exec.Command(kubemci, args...)
  675. return b.Exec()
  676. }
  677. // StartCmdAndStreamOutput returns stdout and stderr after starting the given cmd.
  678. func StartCmdAndStreamOutput(cmd *exec.Cmd) (stdout, stderr io.ReadCloser, err error) {
  679. stdout, err = cmd.StdoutPipe()
  680. if err != nil {
  681. return
  682. }
  683. stderr, err = cmd.StderrPipe()
  684. if err != nil {
  685. return
  686. }
  687. Logf("Asynchronously running '%s %s'", cmd.Path, strings.Join(cmd.Args, " "))
  688. err = cmd.Start()
  689. return
  690. }
  691. // TryKill is rough equivalent of ctrl+c for cleaning up processes. Intended to be run in defer.
  692. func TryKill(cmd *exec.Cmd) {
  693. if err := cmd.Process.Kill(); err != nil {
  694. Logf("ERROR failed to kill command %v! The process may leak", cmd)
  695. }
  696. }
  697. // testContainerOutputMatcher runs the given pod in the given namespace and waits
  698. // for all of the containers in the podSpec to move into the 'Success' status, and tests
  699. // the specified container log against the given expected output using the given matcher.
  700. func (f *Framework) testContainerOutputMatcher(scenarioName string,
  701. pod *v1.Pod,
  702. containerIndex int,
  703. expectedOutput []string,
  704. matcher func(string, ...interface{}) gomegatypes.GomegaMatcher) {
  705. ginkgo.By(fmt.Sprintf("Creating a pod to test %v", scenarioName))
  706. if containerIndex < 0 || containerIndex >= len(pod.Spec.Containers) {
  707. Failf("Invalid container index: %d", containerIndex)
  708. }
  709. ExpectNoError(f.MatchContainerOutput(pod, pod.Spec.Containers[containerIndex].Name, expectedOutput, matcher))
  710. }
  711. // MatchContainerOutput creates a pod and waits for all it's containers to exit with success.
  712. // It then tests that the matcher with each expectedOutput matches the output of the specified container.
  713. func (f *Framework) MatchContainerOutput(
  714. pod *v1.Pod,
  715. containerName string,
  716. expectedOutput []string,
  717. matcher func(string, ...interface{}) gomegatypes.GomegaMatcher) error {
  718. ns := pod.ObjectMeta.Namespace
  719. if ns == "" {
  720. ns = f.Namespace.Name
  721. }
  722. podClient := f.PodClientNS(ns)
  723. createdPod := podClient.Create(pod)
  724. defer func() {
  725. ginkgo.By("delete the pod")
  726. podClient.DeleteSync(createdPod.Name, &metav1.DeleteOptions{}, DefaultPodDeletionTimeout)
  727. }()
  728. // Wait for client pod to complete.
  729. podErr := e2epod.WaitForPodSuccessInNamespace(f.ClientSet, createdPod.Name, ns)
  730. // Grab its logs. Get host first.
  731. podStatus, err := podClient.Get(context.TODO(), createdPod.Name, metav1.GetOptions{})
  732. if err != nil {
  733. return fmt.Errorf("failed to get pod status: %v", err)
  734. }
  735. if podErr != nil {
  736. // Pod failed. Dump all logs from all containers to see what's wrong
  737. _ = podutil.VisitContainers(&podStatus.Spec, func(c *v1.Container) bool {
  738. logs, err := e2epod.GetPodLogs(f.ClientSet, ns, podStatus.Name, c.Name)
  739. if err != nil {
  740. Logf("Failed to get logs from node %q pod %q container %q: %v",
  741. podStatus.Spec.NodeName, podStatus.Name, c.Name, err)
  742. } else {
  743. Logf("Output of node %q pod %q container %q: %s", podStatus.Spec.NodeName, podStatus.Name, c.Name, logs)
  744. }
  745. return true
  746. })
  747. return fmt.Errorf("expected pod %q success: %v", createdPod.Name, podErr)
  748. }
  749. Logf("Trying to get logs from node %s pod %s container %s: %v",
  750. podStatus.Spec.NodeName, podStatus.Name, containerName, err)
  751. // Sometimes the actual containers take a second to get started, try to get logs for 60s
  752. logs, err := e2epod.GetPodLogs(f.ClientSet, ns, podStatus.Name, containerName)
  753. if err != nil {
  754. Logf("Failed to get logs from node %q pod %q container %q. %v",
  755. podStatus.Spec.NodeName, podStatus.Name, containerName, err)
  756. return fmt.Errorf("failed to get logs from %s for %s: %v", podStatus.Name, containerName, err)
  757. }
  758. for _, expected := range expectedOutput {
  759. m := matcher(expected)
  760. matches, err := m.Match(logs)
  761. if err != nil {
  762. return fmt.Errorf("expected %q in container output: %v", expected, err)
  763. } else if !matches {
  764. return fmt.Errorf("expected %q in container output: %s", expected, m.FailureMessage(logs))
  765. }
  766. }
  767. return nil
  768. }
  769. // EventsLister is a func that lists events.
  770. type EventsLister func(opts metav1.ListOptions, ns string) (*v1.EventList, error)
  771. // dumpEventsInNamespace dumps events in the given namespace.
  772. func dumpEventsInNamespace(eventsLister EventsLister, namespace string) {
  773. ginkgo.By(fmt.Sprintf("Collecting events from namespace %q.", namespace))
  774. events, err := eventsLister(metav1.ListOptions{}, namespace)
  775. ExpectNoError(err, "failed to list events in namespace %q", namespace)
  776. ginkgo.By(fmt.Sprintf("Found %d events.", len(events.Items)))
  777. // Sort events by their first timestamp
  778. sortedEvents := events.Items
  779. if len(sortedEvents) > 1 {
  780. sort.Sort(byFirstTimestamp(sortedEvents))
  781. }
  782. for _, e := range sortedEvents {
  783. Logf("At %v - event for %v: %v %v: %v", e.FirstTimestamp, e.InvolvedObject.Name, e.Source, e.Reason, e.Message)
  784. }
  785. // Note that we don't wait for any Cleanup to propagate, which means
  786. // that if you delete a bunch of pods right before ending your test,
  787. // you may or may not see the killing/deletion/Cleanup events.
  788. }
  789. // DumpAllNamespaceInfo dumps events, pods and nodes information in the given namespace.
  790. func DumpAllNamespaceInfo(c clientset.Interface, namespace string) {
  791. dumpEventsInNamespace(func(opts metav1.ListOptions, ns string) (*v1.EventList, error) {
  792. return c.CoreV1().Events(ns).List(context.TODO(), opts)
  793. }, namespace)
  794. e2epod.DumpAllPodInfoForNamespace(c, namespace)
  795. // If cluster is large, then the following logs are basically useless, because:
  796. // 1. it takes tens of minutes or hours to grab all of them
  797. // 2. there are so many of them that working with them are mostly impossible
  798. // So we dump them only if the cluster is relatively small.
  799. maxNodesForDump := TestContext.MaxNodesToGather
  800. nodes, err := c.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
  801. if err != nil {
  802. Logf("unable to fetch node list: %v", err)
  803. return
  804. }
  805. if len(nodes.Items) <= maxNodesForDump {
  806. dumpAllNodeInfo(c, nodes)
  807. } else {
  808. Logf("skipping dumping cluster info - cluster too large")
  809. }
  810. }
  811. // byFirstTimestamp sorts a slice of events by first timestamp, using their involvedObject's name as a tie breaker.
  812. type byFirstTimestamp []v1.Event
  813. func (o byFirstTimestamp) Len() int { return len(o) }
  814. func (o byFirstTimestamp) Swap(i, j int) { o[i], o[j] = o[j], o[i] }
  815. func (o byFirstTimestamp) Less(i, j int) bool {
  816. if o[i].FirstTimestamp.Equal(&o[j].FirstTimestamp) {
  817. return o[i].InvolvedObject.Name < o[j].InvolvedObject.Name
  818. }
  819. return o[i].FirstTimestamp.Before(&o[j].FirstTimestamp)
  820. }
  821. func dumpAllNodeInfo(c clientset.Interface, nodes *v1.NodeList) {
  822. names := make([]string, len(nodes.Items))
  823. for ix := range nodes.Items {
  824. names[ix] = nodes.Items[ix].Name
  825. }
  826. DumpNodeDebugInfo(c, names, Logf)
  827. }
  828. // DumpNodeDebugInfo dumps debug information of the given nodes.
  829. func DumpNodeDebugInfo(c clientset.Interface, nodeNames []string, logFunc func(fmt string, args ...interface{})) {
  830. for _, n := range nodeNames {
  831. logFunc("\nLogging node info for node %v", n)
  832. node, err := c.CoreV1().Nodes().Get(context.TODO(), n, metav1.GetOptions{})
  833. if err != nil {
  834. logFunc("Error getting node info %v", err)
  835. }
  836. logFunc("Node Info: %v", node)
  837. logFunc("\nLogging kubelet events for node %v", n)
  838. for _, e := range getNodeEvents(c, n) {
  839. logFunc("source %v type %v message %v reason %v first ts %v last ts %v, involved obj %+v",
  840. e.Source, e.Type, e.Message, e.Reason, e.FirstTimestamp, e.LastTimestamp, e.InvolvedObject)
  841. }
  842. logFunc("\nLogging pods the kubelet thinks is on node %v", n)
  843. podList, err := getKubeletPods(c, n)
  844. if err != nil {
  845. logFunc("Unable to retrieve kubelet pods for node %v: %v", n, err)
  846. continue
  847. }
  848. for _, p := range podList.Items {
  849. logFunc("%v started at %v (%d+%d container statuses recorded)", p.Name, p.Status.StartTime, len(p.Status.InitContainerStatuses), len(p.Status.ContainerStatuses))
  850. for _, c := range p.Status.InitContainerStatuses {
  851. logFunc("\tInit container %v ready: %v, restart count %v",
  852. c.Name, c.Ready, c.RestartCount)
  853. }
  854. for _, c := range p.Status.ContainerStatuses {
  855. logFunc("\tContainer %v ready: %v, restart count %v",
  856. c.Name, c.Ready, c.RestartCount)
  857. }
  858. }
  859. e2emetrics.HighLatencyKubeletOperations(c, 10*time.Second, n, logFunc)
  860. // TODO: Log node resource info
  861. }
  862. }
  863. // getKubeletPods retrieves the list of pods on the kubelet.
  864. func getKubeletPods(c clientset.Interface, node string) (*v1.PodList, error) {
  865. var client restclient.Result
  866. finished := make(chan struct{}, 1)
  867. go func() {
  868. // call chain tends to hang in some cases when Node is not ready. Add an artificial timeout for this call. #22165
  869. client = c.CoreV1().RESTClient().Get().
  870. Resource("nodes").
  871. SubResource("proxy").
  872. Name(fmt.Sprintf("%v:%v", node, ports.KubeletPort)).
  873. Suffix("pods").
  874. Do(context.TODO())
  875. finished <- struct{}{}
  876. }()
  877. select {
  878. case <-finished:
  879. result := &v1.PodList{}
  880. if err := client.Into(result); err != nil {
  881. return &v1.PodList{}, err
  882. }
  883. return result, nil
  884. case <-time.After(PodGetTimeout):
  885. return &v1.PodList{}, fmt.Errorf("Waiting up to %v for getting the list of pods", PodGetTimeout)
  886. }
  887. }
  888. // logNodeEvents logs kubelet events from the given node. This includes kubelet
  889. // restart and node unhealthy events. Note that listing events like this will mess
  890. // with latency metrics, beware of calling it during a test.
  891. func getNodeEvents(c clientset.Interface, nodeName string) []v1.Event {
  892. selector := fields.Set{
  893. "involvedObject.kind": "Node",
  894. "involvedObject.name": nodeName,
  895. "involvedObject.namespace": metav1.NamespaceAll,
  896. "source": "kubelet",
  897. }.AsSelector().String()
  898. options := metav1.ListOptions{FieldSelector: selector}
  899. events, err := c.CoreV1().Events(metav1.NamespaceSystem).List(context.TODO(), options)
  900. if err != nil {
  901. Logf("Unexpected error retrieving node events %v", err)
  902. return []v1.Event{}
  903. }
  904. return events.Items
  905. }
  906. // WaitForAllNodesSchedulable waits up to timeout for all
  907. // (but TestContext.AllowedNotReadyNodes) to become scheduable.
  908. func WaitForAllNodesSchedulable(c clientset.Interface, timeout time.Duration) error {
  909. Logf("Waiting up to %v for all (but %d) nodes to be schedulable", timeout, TestContext.AllowedNotReadyNodes)
  910. return wait.PollImmediate(
  911. 30*time.Second,
  912. timeout,
  913. e2enode.CheckReadyForTests(c, TestContext.NonblockingTaints, TestContext.AllowedNotReadyNodes, largeClusterThreshold),
  914. )
  915. }
  916. // GetPodSecretUpdateTimeout reuturns the timeout duration for updating pod secret.
  917. func GetPodSecretUpdateTimeout(c clientset.Interface) time.Duration {
  918. // With SecretManager(ConfigMapManager), we may have to wait up to full sync period +
  919. // TTL of secret(configmap) to elapse before the Kubelet projects the update into the
  920. // volume and the container picks it up.
  921. // So this timeout is based on default Kubelet sync period (1 minute) + maximum TTL for
  922. // secret(configmap) that's based on cluster size + additional time as a fudge factor.
  923. secretTTL, err := getNodeTTLAnnotationValue(c)
  924. if err != nil {
  925. Logf("Couldn't get node TTL annotation (using default value of 0): %v", err)
  926. }
  927. podLogTimeout := 240*time.Second + secretTTL
  928. return podLogTimeout
  929. }
  930. func getNodeTTLAnnotationValue(c clientset.Interface) (time.Duration, error) {
  931. nodes, err := c.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
  932. if err != nil || len(nodes.Items) == 0 {
  933. return time.Duration(0), fmt.Errorf("Couldn't list any nodes to get TTL annotation: %v", err)
  934. }
  935. // Since TTL the kubelet is using is stored in node object, for the timeout
  936. // purpose we take it from the first node (all of them should be the same).
  937. node := &nodes.Items[0]
  938. if node.Annotations == nil {
  939. return time.Duration(0), fmt.Errorf("No annotations found on the node")
  940. }
  941. value, ok := node.Annotations[v1.ObjectTTLAnnotationKey]
  942. if !ok {
  943. return time.Duration(0), fmt.Errorf("No TTL annotation found on the node")
  944. }
  945. intValue, err := strconv.Atoi(value)
  946. if err != nil {
  947. return time.Duration(0), fmt.Errorf("Cannot convert TTL annotation from %#v to int", *node)
  948. }
  949. return time.Duration(intValue) * time.Second, nil
  950. }
  951. // AddOrUpdateLabelOnNode adds the given label key and value to the given node or updates value.
  952. func AddOrUpdateLabelOnNode(c clientset.Interface, nodeName string, labelKey, labelValue string) {
  953. ExpectNoError(testutils.AddLabelsToNode(c, nodeName, map[string]string{labelKey: labelValue}))
  954. }
  955. // ExpectNodeHasLabel expects that the given node has the given label pair.
  956. func ExpectNodeHasLabel(c clientset.Interface, nodeName string, labelKey string, labelValue string) {
  957. ginkgo.By("verifying the node has the label " + labelKey + " " + labelValue)
  958. node, err := c.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
  959. ExpectNoError(err)
  960. ExpectEqual(node.Labels[labelKey], labelValue)
  961. }
  962. // RemoveTaintOffNode removes the given taint from the given node.
  963. func RemoveTaintOffNode(c clientset.Interface, nodeName string, taint v1.Taint) {
  964. ExpectNoError(controller.RemoveTaintOffNode(c, nodeName, nil, &taint))
  965. verifyThatTaintIsGone(c, nodeName, &taint)
  966. }
  967. // AddOrUpdateTaintOnNode adds the given taint to the given node or updates taint.
  968. func AddOrUpdateTaintOnNode(c clientset.Interface, nodeName string, taint v1.Taint) {
  969. ExpectNoError(controller.AddOrUpdateTaintOnNode(c, nodeName, &taint))
  970. }
  971. // RemoveLabelOffNode is for cleaning up labels temporarily added to node,
  972. // won't fail if target label doesn't exist or has been removed.
  973. func RemoveLabelOffNode(c clientset.Interface, nodeName string, labelKey string) {
  974. ginkgo.By("removing the label " + labelKey + " off the node " + nodeName)
  975. ExpectNoError(testutils.RemoveLabelOffNode(c, nodeName, []string{labelKey}))
  976. ginkgo.By("verifying the node doesn't have the label " + labelKey)
  977. ExpectNoError(testutils.VerifyLabelsRemoved(c, nodeName, []string{labelKey}))
  978. }
  979. func verifyThatTaintIsGone(c clientset.Interface, nodeName string, taint *v1.Taint) {
  980. ginkgo.By("verifying the node doesn't have the taint " + taint.ToString())
  981. nodeUpdated, err := c.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
  982. ExpectNoError(err)
  983. if taintutils.TaintExists(nodeUpdated.Spec.Taints, taint) {
  984. Failf("Failed removing taint " + taint.ToString() + " of the node " + nodeName)
  985. }
  986. }
  987. // ExpectNodeHasTaint expects that the node has the given taint.
  988. func ExpectNodeHasTaint(c clientset.Interface, nodeName string, taint *v1.Taint) {
  989. ginkgo.By("verifying the node has the taint " + taint.ToString())
  990. if has, err := NodeHasTaint(c, nodeName, taint); !has {
  991. ExpectNoError(err)
  992. Failf("Failed to find taint %s on node %s", taint.ToString(), nodeName)
  993. }
  994. }
  995. // NodeHasTaint returns true if the node has the given taint, else returns false.
  996. func NodeHasTaint(c clientset.Interface, nodeName string, taint *v1.Taint) (bool, error) {
  997. node, err := c.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
  998. if err != nil {
  999. return false, err
  1000. }
  1001. nodeTaints := node.Spec.Taints
  1002. if len(nodeTaints) == 0 || !taintutils.TaintExists(nodeTaints, taint) {
  1003. return false, nil
  1004. }
  1005. return true, nil
  1006. }
  1007. // ScaleResource scales resource to the given size.
  1008. func ScaleResource(
  1009. clientset clientset.Interface,
  1010. scalesGetter scaleclient.ScalesGetter,
  1011. ns, name string,
  1012. size uint,
  1013. wait bool,
  1014. kind schema.GroupKind,
  1015. gvr schema.GroupVersionResource,
  1016. ) error {
  1017. ginkgo.By(fmt.Sprintf("Scaling %v %s in namespace %s to %d", kind, name, ns, size))
  1018. if err := testutils.ScaleResourceWithRetries(scalesGetter, ns, name, size, gvr); err != nil {
  1019. return fmt.Errorf("error while scaling RC %s to %d replicas: %v", name, size, err)
  1020. }
  1021. if !wait {
  1022. return nil
  1023. }
  1024. return e2epod.WaitForControlledPodsRunning(clientset, ns, name, kind)
  1025. }
  1026. // DeleteResourceAndWaitForGC deletes only given resource and waits for GC to delete the pods.
  1027. func DeleteResourceAndWaitForGC(c clientset.Interface, kind schema.GroupKind, ns, name string) error {
  1028. ginkgo.By(fmt.Sprintf("deleting %v %s in namespace %s, will wait for the garbage collector to delete the pods", kind, name, ns))
  1029. rtObject, err := e2eresource.GetRuntimeObjectForKind(c, kind, ns, name)
  1030. if err != nil {
  1031. if apierrors.IsNotFound(err) {
  1032. Logf("%v %s not found: %v", kind, name, err)
  1033. return nil
  1034. }
  1035. return err
  1036. }
  1037. selector, err := e2eresource.GetSelectorFromRuntimeObject(rtObject)
  1038. if err != nil {
  1039. return err
  1040. }
  1041. replicas, err := e2eresource.GetReplicasFromRuntimeObject(rtObject)
  1042. if err != nil {
  1043. return err
  1044. }
  1045. ps, err := testutils.NewPodStore(c, ns, selector, fields.Everything())
  1046. if err != nil {
  1047. return err
  1048. }
  1049. defer ps.Stop()
  1050. falseVar := false
  1051. deleteOption := &metav1.DeleteOptions{OrphanDependents: &falseVar}
  1052. startTime := time.Now()
  1053. if err := testutils.DeleteResourceWithRetries(c, kind, ns, name, deleteOption); err != nil {
  1054. return err
  1055. }
  1056. deleteTime := time.Since(startTime)
  1057. Logf("Deleting %v %s took: %v", kind, name, deleteTime)
  1058. var interval, timeout time.Duration
  1059. switch {
  1060. case replicas < 100:
  1061. interval = 100 * time.Millisecond
  1062. case replicas < 1000:
  1063. interval = 1 * time.Second
  1064. default:
  1065. interval = 10 * time.Second
  1066. }
  1067. if replicas < 5000 {
  1068. timeout = 10 * time.Minute
  1069. } else {
  1070. timeout = time.Duration(replicas/gcThroughput) * time.Second
  1071. // gcThroughput is pretty strict now, add a bit more to it
  1072. timeout = timeout + 3*time.Minute
  1073. }
  1074. err = waitForPodsInactive(ps, interval, timeout)
  1075. if err != nil {
  1076. return fmt.Errorf("error while waiting for pods to become inactive %s: %v", name, err)
  1077. }
  1078. terminatePodTime := time.Since(startTime) - deleteTime
  1079. Logf("Terminating %v %s pods took: %v", kind, name, terminatePodTime)
  1080. // In gce, at any point, small percentage of nodes can disappear for
  1081. // ~10 minutes due to hostError. 20 minutes should be long enough to
  1082. // restart VM in that case and delete the pod.
  1083. err = waitForPodsGone(ps, interval, 20*time.Minute)
  1084. if err != nil {
  1085. return fmt.Errorf("error while waiting for pods gone %s: %v", name, err)
  1086. }
  1087. return nil
  1088. }
  1089. // waitForPodsGone waits until there are no pods left in the PodStore.
  1090. func waitForPodsGone(ps *testutils.PodStore, interval, timeout time.Duration) error {
  1091. var pods []*v1.Pod
  1092. err := wait.PollImmediate(interval, timeout, func() (bool, error) {
  1093. if pods = ps.List(); len(pods) == 0 {
  1094. return true, nil
  1095. }
  1096. return false, nil
  1097. })
  1098. if err == wait.ErrWaitTimeout {
  1099. for _, pod := range pods {
  1100. Logf("ERROR: Pod %q still exists. Node: %q", pod.Name, pod.Spec.NodeName)
  1101. }
  1102. return fmt.Errorf("there are %d pods left. E.g. %q on node %q", len(pods), pods[0].Name, pods[0].Spec.NodeName)
  1103. }
  1104. return err
  1105. }
  1106. // waitForPodsInactive waits until there are no active pods left in the PodStore.
  1107. // This is to make a fair comparison of deletion time between DeleteRCAndPods
  1108. // and DeleteRCAndWaitForGC, because the RC controller decreases status.replicas
  1109. // when the pod is inactvie.
  1110. func waitForPodsInactive(ps *testutils.PodStore, interval, timeout time.Duration) error {
  1111. var activePods []*v1.Pod
  1112. err := wait.PollImmediate(interval, timeout, func() (bool, error) {
  1113. pods := ps.List()
  1114. activePods = controller.FilterActivePods(pods)
  1115. if len(activePods) != 0 {
  1116. return false, nil
  1117. }
  1118. return true, nil
  1119. })
  1120. if err == wait.ErrWaitTimeout {
  1121. for _, pod := range activePods {
  1122. Logf("ERROR: Pod %q running on %q is still active", pod.Name, pod.Spec.NodeName)
  1123. }
  1124. return fmt.Errorf("there are %d active pods. E.g. %q on node %q", len(activePods), activePods[0].Name, activePods[0].Spec.NodeName)
  1125. }
  1126. return err
  1127. }
  1128. // RunHostCmd runs the given cmd in the context of the given pod using `kubectl exec`
  1129. // inside of a shell.
  1130. func RunHostCmd(ns, name, cmd string) (string, error) {
  1131. return RunKubectl(ns, "exec", fmt.Sprintf("--namespace=%v", ns), name, "--", "/bin/sh", "-x", "-c", cmd)
  1132. }
  1133. // RunHostCmdOrDie calls RunHostCmd and dies on error.
  1134. func RunHostCmdOrDie(ns, name, cmd string) string {
  1135. stdout, err := RunHostCmd(ns, name, cmd)
  1136. Logf("stdout: %v", stdout)
  1137. ExpectNoError(err)
  1138. return stdout
  1139. }
  1140. // RunHostCmdWithRetries calls RunHostCmd and retries all errors
  1141. // until it succeeds or the specified timeout expires.
  1142. // This can be used with idempotent commands to deflake transient Node issues.
  1143. func RunHostCmdWithRetries(ns, name, cmd string, interval, timeout time.Duration) (string, error) {
  1144. start := time.Now()
  1145. for {
  1146. out, err := RunHostCmd(ns, name, cmd)
  1147. if err == nil {
  1148. return out, nil
  1149. }
  1150. if elapsed := time.Since(start); elapsed > timeout {
  1151. return out, fmt.Errorf("RunHostCmd still failed after %v: %v", elapsed, err)
  1152. }
  1153. Logf("Waiting %v to retry failed RunHostCmd: %v", interval, err)
  1154. time.Sleep(interval)
  1155. }
  1156. }
  1157. // AllNodesReady checks whether all registered nodes are ready.
  1158. // TODO: we should change the AllNodesReady call in AfterEach to WaitForAllNodesHealthy,
  1159. // and figure out how to do it in a configurable way, as we can't expect all setups to run
  1160. // default test add-ons.
  1161. func AllNodesReady(c clientset.Interface, timeout time.Duration) error {
  1162. Logf("Waiting up to %v for all (but %d) nodes to be ready", timeout, TestContext.AllowedNotReadyNodes)
  1163. var notReady []*v1.Node
  1164. err := wait.PollImmediate(Poll, timeout, func() (bool, error) {
  1165. notReady = nil
  1166. // It should be OK to list unschedulable Nodes here.
  1167. nodes, err := c.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
  1168. if err != nil {
  1169. if testutils.IsRetryableAPIError(err) {
  1170. return false, nil
  1171. }
  1172. return false, err
  1173. }
  1174. for i := range nodes.Items {
  1175. node := &nodes.Items[i]
  1176. if !e2enode.IsConditionSetAsExpected(node, v1.NodeReady, true) {
  1177. notReady = append(notReady, node)
  1178. }
  1179. }
  1180. // Framework allows for <TestContext.AllowedNotReadyNodes> nodes to be non-ready,
  1181. // to make it possible e.g. for incorrect deployment of some small percentage
  1182. // of nodes (which we allow in cluster validation). Some nodes that are not
  1183. // provisioned correctly at startup will never become ready (e.g. when something
  1184. // won't install correctly), so we can't expect them to be ready at any point.
  1185. return len(notReady) <= TestContext.AllowedNotReadyNodes, nil
  1186. })
  1187. if err != nil && err != wait.ErrWaitTimeout {
  1188. return err
  1189. }
  1190. if len(notReady) > TestContext.AllowedNotReadyNodes {
  1191. msg := ""
  1192. for _, node := range notReady {
  1193. msg = fmt.Sprintf("%s, %s", msg, node.Name)
  1194. }
  1195. return fmt.Errorf("Not ready nodes: %#v", msg)
  1196. }
  1197. return nil
  1198. }
  1199. // RestartKubelet restarts kubelet on the given host.
  1200. func RestartKubelet(host string) error {
  1201. // TODO: Make it work for all providers and distros.
  1202. supportedProviders := []string{"gce", "aws", "vsphere"}
  1203. if !ProviderIs(supportedProviders...) {
  1204. return fmt.Errorf("unsupported provider for RestartKubelet: %s, supported providers are: %v", TestContext.Provider, supportedProviders)
  1205. }
  1206. if ProviderIs("gce") && !NodeOSDistroIs("debian", "gci") {
  1207. return fmt.Errorf("unsupported node OS distro: %s", TestContext.NodeOSDistro)
  1208. }
  1209. var cmd string
  1210. if ProviderIs("gce") && NodeOSDistroIs("debian") {
  1211. cmd = "sudo /etc/init.d/kubelet restart"
  1212. } else if ProviderIs("vsphere") {
  1213. var sudoPresent bool
  1214. sshResult, err := e2essh.SSH("sudo --version", host, TestContext.Provider)
  1215. if err != nil {
  1216. return fmt.Errorf("Unable to ssh to host %s with error %v", host, err)
  1217. }
  1218. if !strings.Contains(sshResult.Stderr, "command not found") {
  1219. sudoPresent = true
  1220. }
  1221. sshResult, err = e2essh.SSH("systemctl --version", host, TestContext.Provider)
  1222. if err != nil {
  1223. return fmt.Errorf("Failed to execute command 'systemctl' on host %s with error %v", host, err)
  1224. }
  1225. if !strings.Contains(sshResult.Stderr, "command not found") {
  1226. cmd = "systemctl restart kubelet"
  1227. } else {
  1228. cmd = "service kubelet restart"
  1229. }
  1230. if sudoPresent {
  1231. cmd = fmt.Sprintf("sudo %s", cmd)
  1232. }
  1233. } else {
  1234. cmd = "sudo systemctl restart kubelet"
  1235. }
  1236. Logf("Restarting kubelet via ssh on host %s with command %s", host, cmd)
  1237. result, err := e2essh.SSH(cmd, host, TestContext.Provider)
  1238. if err != nil || result.Code != 0 {
  1239. e2essh.LogResult(result)
  1240. return fmt.Errorf("couldn't restart kubelet: %v", err)
  1241. }
  1242. return nil
  1243. }
  1244. // RestartApiserver restarts the kube-apiserver.
  1245. func RestartApiserver(namespace string, cs clientset.Interface) error {
  1246. // TODO: Make it work for all providers.
  1247. if !ProviderIs("gce", "gke", "aws") {
  1248. return fmt.Errorf("unsupported provider for RestartApiserver: %s", TestContext.Provider)
  1249. }
  1250. if ProviderIs("gce", "aws") {
  1251. initialRestartCount, err := getApiserverRestartCount(cs)
  1252. if err != nil {
  1253. return fmt.Errorf("failed to get apiserver's restart count: %v", err)
  1254. }
  1255. if err := sshRestartMaster(); err != nil {
  1256. return fmt.Errorf("failed to restart apiserver: %v", err)
  1257. }
  1258. return waitForApiserverRestarted(cs, initialRestartCount)
  1259. }
  1260. // GKE doesn't allow ssh access, so use a same-version master
  1261. // upgrade to teardown/recreate master.
  1262. v, err := cs.Discovery().ServerVersion()
  1263. if err != nil {
  1264. return err
  1265. }
  1266. return masterUpgradeGKE(namespace, v.GitVersion[1:]) // strip leading 'v'
  1267. }
  1268. func sshRestartMaster() error {
  1269. if !ProviderIs("gce", "aws") {
  1270. return fmt.Errorf("unsupported provider for sshRestartMaster: %s", TestContext.Provider)
  1271. }
  1272. var command string
  1273. if ProviderIs("gce") {
  1274. command = "pidof kube-apiserver | xargs sudo kill"
  1275. } else {
  1276. command = "sudo /etc/init.d/kube-apiserver restart"
  1277. }
  1278. Logf("Restarting master via ssh, running: %v", command)
  1279. result, err := e2essh.SSH(command, net.JoinHostPort(GetMasterHost(), sshPort), TestContext.Provider)
  1280. if err != nil || result.Code != 0 {
  1281. e2essh.LogResult(result)
  1282. return fmt.Errorf("couldn't restart apiserver: %v", err)
  1283. }
  1284. return nil
  1285. }
  1286. // waitForApiserverRestarted waits until apiserver's restart count increased.
  1287. func waitForApiserverRestarted(c clientset.Interface, initialRestartCount int32) error {
  1288. for start := time.Now(); time.Since(start) < time.Minute; time.Sleep(5 * time.Second) {
  1289. restartCount, err := getApiserverRestartCount(c)
  1290. if err != nil {
  1291. Logf("Failed to get apiserver's restart count: %v", err)
  1292. continue
  1293. }
  1294. if restartCount > initialRestartCount {
  1295. Logf("Apiserver has restarted.")
  1296. return nil
  1297. }
  1298. Logf("Waiting for apiserver restart count to increase")
  1299. }
  1300. return fmt.Errorf("timed out waiting for apiserver to be restarted")
  1301. }
  1302. func getApiserverRestartCount(c clientset.Interface) (int32, error) {
  1303. label := labels.SelectorFromSet(labels.Set(map[string]string{"component": "kube-apiserver"}))
  1304. listOpts := metav1.ListOptions{LabelSelector: label.String()}
  1305. pods, err := c.CoreV1().Pods(metav1.NamespaceSystem).List(context.TODO(), listOpts)
  1306. if err != nil {
  1307. return -1, err
  1308. }
  1309. if len(pods.Items) != 1 {
  1310. return -1, fmt.Errorf("unexpected number of apiserver pod: %d", len(pods.Items))
  1311. }
  1312. for _, s := range pods.Items[0].Status.ContainerStatuses {
  1313. if s.Name != "kube-apiserver" {
  1314. continue
  1315. }
  1316. return s.RestartCount, nil
  1317. }
  1318. return -1, fmt.Errorf("Failed to find kube-apiserver container in pod")
  1319. }
  1320. // RestartControllerManager restarts the kube-controller-manager.
  1321. func RestartControllerManager() error {
  1322. // TODO: Make it work for all providers and distros.
  1323. if !ProviderIs("gce", "aws") {
  1324. return fmt.Errorf("unsupported provider for RestartControllerManager: %s", TestContext.Provider)
  1325. }
  1326. if ProviderIs("gce") && !MasterOSDistroIs("gci") {
  1327. return fmt.Errorf("unsupported master OS distro: %s", TestContext.MasterOSDistro)
  1328. }
  1329. cmd := "pidof kube-controller-manager | xargs sudo kill"
  1330. Logf("Restarting controller-manager via ssh, running: %v", cmd)
  1331. result, err := e2essh.SSH(cmd, net.JoinHostPort(GetMasterHost(), sshPort), TestContext.Provider)
  1332. if err != nil || result.Code != 0 {
  1333. e2essh.LogResult(result)
  1334. return fmt.Errorf("couldn't restart controller-manager: %v", err)
  1335. }
  1336. return nil
  1337. }
  1338. // WaitForControllerManagerUp waits for the kube-controller-manager to be up.
  1339. func WaitForControllerManagerUp() error {
  1340. cmd := "curl http://localhost:" + strconv.Itoa(ports.InsecureKubeControllerManagerPort) + "/healthz"
  1341. for start := time.Now(); time.Since(start) < time.Minute; time.Sleep(5 * time.Second) {
  1342. result, err := e2essh.SSH(cmd, net.JoinHostPort(GetMasterHost(), sshPort), TestContext.Provider)
  1343. if err != nil || result.Code != 0 {
  1344. e2essh.LogResult(result)
  1345. }
  1346. if result.Stdout == "ok" {
  1347. return nil
  1348. }
  1349. }
  1350. return fmt.Errorf("waiting for controller-manager timed out")
  1351. }
  1352. type extractRT struct {
  1353. http.Header
  1354. }
  1355. func (rt *extractRT) RoundTrip(req *http.Request) (*http.Response, error) {
  1356. rt.Header = req.Header
  1357. return &http.Response{}, nil
  1358. }
  1359. // headersForConfig extracts any http client logic necessary for the provided
  1360. // config.
  1361. func headersForConfig(c *restclient.Config, url *url.URL) (http.Header, error) {
  1362. extract := &extractRT{}
  1363. rt, err := restclient.HTTPWrappersForConfig(c, extract)
  1364. if err != nil {
  1365. return nil, err
  1366. }
  1367. request, err := http.NewRequest("GET", url.String(), nil)
  1368. if err != nil {
  1369. return nil, err
  1370. }
  1371. if _, err := rt.RoundTrip(request); err != nil {
  1372. return nil, err
  1373. }
  1374. return extract.Header, nil
  1375. }
  1376. // OpenWebSocketForURL constructs a websocket connection to the provided URL, using the client
  1377. // config, with the specified protocols.
  1378. func OpenWebSocketForURL(url *url.URL, config *restclient.Config, protocols []string) (*websocket.Conn, error) {
  1379. tlsConfig, err := restclient.TLSConfigFor(config)
  1380. if err != nil {
  1381. return nil, fmt.Errorf("Failed to create tls config: %v", err)
  1382. }
  1383. if url.Scheme == "https" {
  1384. url.Scheme = "wss"
  1385. } else {
  1386. url.Scheme = "ws"
  1387. }
  1388. headers, err := headersForConfig(config, url)
  1389. if err != nil {
  1390. return nil, fmt.Errorf("Failed to load http headers: %v", err)
  1391. }
  1392. cfg, err := websocket.NewConfig(url.String(), "http://localhost")
  1393. if err != nil {
  1394. return nil, fmt.Errorf("Failed to create websocket config: %v", err)
  1395. }
  1396. cfg.Header = headers
  1397. cfg.TlsConfig = tlsConfig
  1398. cfg.Protocol = protocols
  1399. return websocket.DialConfig(cfg)
  1400. }
  1401. // LookForStringInLog looks for the given string in the log of a specific pod container
  1402. func LookForStringInLog(ns, podName, container, expectedString string, timeout time.Duration) (result string, err error) {
  1403. return lookForString(expectedString, timeout, func() string {
  1404. return RunKubectlOrDie(ns, "logs", podName, container, fmt.Sprintf("--namespace=%v", ns))
  1405. })
  1406. }
  1407. // EnsureLoadBalancerResourcesDeleted ensures that cloud load balancer resources that were created
  1408. // are actually cleaned up. Currently only implemented for GCE/GKE.
  1409. func EnsureLoadBalancerResourcesDeleted(ip, portRange string) error {
  1410. return TestContext.CloudConfig.Provider.EnsureLoadBalancerResourcesDeleted(ip, portRange)
  1411. }
  1412. // BlockNetwork blocks network between the given from value and the given to value.
  1413. // The following helper functions can block/unblock network from source
  1414. // host to destination host by manipulating iptable rules.
  1415. // This function assumes it can ssh to the source host.
  1416. //
  1417. // Caution:
  1418. // Recommend to input IP instead of hostnames. Using hostnames will cause iptables to
  1419. // do a DNS lookup to resolve the name to an IP address, which will
  1420. // slow down the test and cause it to fail if DNS is absent or broken.
  1421. //
  1422. // Suggested usage pattern:
  1423. // func foo() {
  1424. // ...
  1425. // defer UnblockNetwork(from, to)
  1426. // BlockNetwork(from, to)
  1427. // ...
  1428. // }
  1429. //
  1430. func BlockNetwork(from string, to string) {
  1431. Logf("block network traffic from %s to %s", from, to)
  1432. iptablesRule := fmt.Sprintf("OUTPUT --destination %s --jump REJECT", to)
  1433. dropCmd := fmt.Sprintf("sudo iptables --insert %s", iptablesRule)
  1434. if result, err := e2essh.SSH(dropCmd, from, TestContext.Provider); result.Code != 0 || err != nil {
  1435. e2essh.LogResult(result)
  1436. Failf("Unexpected error: %v", err)
  1437. }
  1438. }
  1439. // UnblockNetwork unblocks network between the given from value and the given to value.
  1440. func UnblockNetwork(from string, to string) {
  1441. Logf("Unblock network traffic from %s to %s", from, to)
  1442. iptablesRule := fmt.Sprintf("OUTPUT --destination %s --jump REJECT", to)
  1443. undropCmd := fmt.Sprintf("sudo iptables --delete %s", iptablesRule)
  1444. // Undrop command may fail if the rule has never been created.
  1445. // In such case we just lose 30 seconds, but the cluster is healthy.
  1446. // But if the rule had been created and removing it failed, the node is broken and
  1447. // not coming back. Subsequent tests will run or fewer nodes (some of the tests
  1448. // may fail). Manual intervention is required in such case (recreating the
  1449. // cluster solves the problem too).
  1450. err := wait.Poll(time.Millisecond*100, time.Second*30, func() (bool, error) {
  1451. result, err := e2essh.SSH(undropCmd, from, TestContext.Provider)
  1452. if result.Code == 0 && err == nil {
  1453. return true, nil
  1454. }
  1455. e2essh.LogResult(result)
  1456. if err != nil {
  1457. Logf("Unexpected error: %v", err)
  1458. }
  1459. return false, nil
  1460. })
  1461. if err != nil {
  1462. Failf("Failed to remove the iptable REJECT rule. Manual intervention is "+
  1463. "required on host %s: remove rule %s, if exists", from, iptablesRule)
  1464. }
  1465. }
  1466. // CoreDump SSHs to the master and all nodes and dumps their logs into dir.
  1467. // It shells out to cluster/log-dump/log-dump.sh to accomplish this.
  1468. func CoreDump(dir string) {
  1469. if TestContext.DisableLogDump {
  1470. Logf("Skipping dumping logs from cluster")
  1471. return
  1472. }
  1473. var cmd *exec.Cmd
  1474. if TestContext.LogexporterGCSPath != "" {
  1475. Logf("Dumping logs from nodes to GCS directly at path: %s", TestContext.LogexporterGCSPath)
  1476. cmd = exec.Command(path.Join(TestContext.RepoRoot, "cluster", "log-dump", "log-dump.sh"), dir, TestContext.LogexporterGCSPath)
  1477. } else {
  1478. Logf("Dumping logs locally to: %s", dir)
  1479. cmd = exec.Command(path.Join(TestContext.RepoRoot, "cluster", "log-dump", "log-dump.sh"), dir)
  1480. }
  1481. cmd.Env = append(os.Environ(), fmt.Sprintf("LOG_DUMP_SYSTEMD_SERVICES=%s", parseSystemdServices(TestContext.SystemdServices)))
  1482. cmd.Env = append(os.Environ(), fmt.Sprintf("LOG_DUMP_SYSTEMD_JOURNAL=%v", TestContext.DumpSystemdJournal))
  1483. cmd.Stdout = os.Stdout
  1484. cmd.Stderr = os.Stderr
  1485. if err := cmd.Run(); err != nil {
  1486. Logf("Error running cluster/log-dump/log-dump.sh: %v", err)
  1487. }
  1488. }
  1489. // parseSystemdServices converts services separator from comma to space.
  1490. func parseSystemdServices(services string) string {
  1491. return strings.TrimSpace(strings.Replace(services, ",", " ", -1))
  1492. }
  1493. // RunCmd runs cmd using args and returns its stdout and stderr. It also outputs
  1494. // cmd's stdout and stderr to their respective OS streams.
  1495. func RunCmd(command string, args ...string) (string, string, error) {
  1496. return RunCmdEnv(nil, command, args...)
  1497. }
  1498. // RunCmdEnv runs cmd with the provided environment and args and
  1499. // returns its stdout and stderr. It also outputs cmd's stdout and
  1500. // stderr to their respective OS streams.
  1501. func RunCmdEnv(env []string, command string, args ...string) (string, string, error) {
  1502. Logf("Running %s %v", command, args)
  1503. var bout, berr bytes.Buffer
  1504. cmd := exec.Command(command, args...)
  1505. // We also output to the OS stdout/stderr to aid in debugging in case cmd
  1506. // hangs and never returns before the test gets killed.
  1507. //
  1508. // This creates some ugly output because gcloud doesn't always provide
  1509. // newlines.
  1510. cmd.Stdout = io.MultiWriter(os.Stdout, &bout)
  1511. cmd.Stderr = io.MultiWriter(os.Stderr, &berr)
  1512. cmd.Env = env
  1513. err := cmd.Run()
  1514. stdout, stderr := bout.String(), berr.String()
  1515. if err != nil {
  1516. return "", "", fmt.Errorf("error running %s %v; got error %v, stdout %q, stderr %q",
  1517. command, args, err, stdout, stderr)
  1518. }
  1519. return stdout, stderr, nil
  1520. }
  1521. // E2ETestNodePreparer implements testutils.TestNodePreparer interface, which is used
  1522. // to create/modify Nodes before running a test.
  1523. type E2ETestNodePreparer struct {
  1524. client clientset.Interface
  1525. // Specifies how many nodes should be modified using the given strategy.
  1526. // Only one strategy can be applied to a single Node, so there needs to
  1527. // be at least <sum_of_keys> Nodes in the cluster.
  1528. countToStrategy []testutils.CountToStrategy
  1529. nodeToAppliedStrategy map[string]testutils.PrepareNodeStrategy
  1530. }
  1531. // PrepareNodes prepares nodes in the cluster.
  1532. func (p *E2ETestNodePreparer) PrepareNodes() error {
  1533. nodes, err := e2enode.GetReadySchedulableNodes(p.client)
  1534. if err != nil {
  1535. return err
  1536. }
  1537. numTemplates := 0
  1538. for _, v := range p.countToStrategy {
  1539. numTemplates += v.Count
  1540. }
  1541. if numTemplates > len(nodes.Items) {
  1542. return fmt.Errorf("Can't prepare Nodes. Got more templates than existing Nodes")
  1543. }
  1544. index := 0
  1545. sum := 0
  1546. for _, v := range p.countToStrategy {
  1547. sum += v.Count
  1548. for ; index < sum; index++ {
  1549. if err := testutils.DoPrepareNode(p.client, &nodes.Items[index], v.Strategy); err != nil {
  1550. klog.Errorf("Aborting node preparation: %v", err)
  1551. return err
  1552. }
  1553. p.nodeToAppliedStrategy[nodes.Items[index].Name] = v.Strategy
  1554. }
  1555. }
  1556. return nil
  1557. }
  1558. // CleanupNodes cleanups nodes in the cluster.
  1559. func (p *E2ETestNodePreparer) CleanupNodes() error {
  1560. var encounteredError error
  1561. nodes, err := e2enode.GetReadySchedulableNodes(p.client)
  1562. if err != nil {
  1563. return err
  1564. }
  1565. for i := range nodes.Items {
  1566. name := nodes.Items[i].Name
  1567. strategy, found := p.nodeToAppliedStrategy[name]
  1568. if found {
  1569. if err = testutils.DoCleanupNode(p.client, name, strategy); err != nil {
  1570. klog.Errorf("Skipping cleanup of Node: failed update of %v: %v", name, err)
  1571. encounteredError = err
  1572. }
  1573. }
  1574. }
  1575. return encounteredError
  1576. }
  1577. // getMasterAddresses returns the externalIP, internalIP and hostname fields of the master.
  1578. // If any of these is unavailable, it is set to "".
  1579. func getMasterAddresses(c clientset.Interface) (string, string, string) {
  1580. var externalIP, internalIP, hostname string
  1581. // Populate the internal IP.
  1582. eps, err := c.CoreV1().Endpoints(metav1.NamespaceDefault).Get(context.TODO(), "kubernetes", metav1.GetOptions{})
  1583. if err != nil {
  1584. Failf("Failed to get kubernetes endpoints: %v", err)
  1585. }
  1586. if len(eps.Subsets) != 1 || len(eps.Subsets[0].Addresses) != 1 {
  1587. Failf("There are more than 1 endpoints for kubernetes service: %+v", eps)
  1588. }
  1589. internalIP = eps.Subsets[0].Addresses[0].IP
  1590. // Populate the external IP/hostname.
  1591. hostURL, err := url.Parse(TestContext.Host)
  1592. if err != nil {
  1593. Failf("Failed to parse hostname: %v", err)
  1594. }
  1595. if net.ParseIP(hostURL.Host) != nil {
  1596. externalIP = hostURL.Host
  1597. } else {
  1598. hostname = hostURL.Host
  1599. }
  1600. return externalIP, internalIP, hostname
  1601. }
  1602. // GetAllMasterAddresses returns all IP addresses on which the kubelet can reach the master.
  1603. // It may return internal and external IPs, even if we expect for
  1604. // e.g. internal IPs to be used (issue #56787), so that we can be
  1605. // sure to block the master fully during tests.
  1606. func GetAllMasterAddresses(c clientset.Interface) []string {
  1607. externalIP, internalIP, _ := getMasterAddresses(c)
  1608. ips := sets.NewString()
  1609. switch TestContext.Provider {
  1610. case "gce", "gke":
  1611. if externalIP != "" {
  1612. ips.Insert(externalIP)
  1613. }
  1614. if internalIP != "" {
  1615. ips.Insert(internalIP)
  1616. }
  1617. case "aws":
  1618. ips.Insert(awsMasterIP)
  1619. default:
  1620. Failf("This test is not supported for provider %s and should be disabled", TestContext.Provider)
  1621. }
  1622. return ips.List()
  1623. }
  1624. // DescribeIng describes information of ingress by running kubectl describe ing.
  1625. func DescribeIng(ns string) {
  1626. Logf("\nOutput of kubectl describe ing:\n")
  1627. desc, _ := RunKubectl(
  1628. ns, "describe", "ing", fmt.Sprintf("--namespace=%v", ns))
  1629. Logf(desc)
  1630. }
  1631. // NewTestPod returns a pod that has the specified requests and limits
  1632. func (f *Framework) NewTestPod(name string, requests v1.ResourceList, limits v1.ResourceList) *v1.Pod {
  1633. return &v1.Pod{
  1634. ObjectMeta: metav1.ObjectMeta{
  1635. Name: name,
  1636. },
  1637. Spec: v1.PodSpec{
  1638. Containers: []v1.Container{
  1639. {
  1640. Name: "pause",
  1641. Image: imageutils.GetPauseImageName(),
  1642. Resources: v1.ResourceRequirements{
  1643. Requests: requests,
  1644. Limits: limits,
  1645. },
  1646. },
  1647. },
  1648. },
  1649. }
  1650. }
  1651. // NewAgnhostPod returns a pod that uses the agnhost image. The image's binary supports various subcommands
  1652. // that behave the same, no matter the underlying OS.
  1653. func (f *Framework) NewAgnhostPod(name string, args ...string) *v1.Pod {
  1654. return &v1.Pod{
  1655. ObjectMeta: metav1.ObjectMeta{
  1656. Name: name,
  1657. },
  1658. Spec: v1.PodSpec{
  1659. Containers: []v1.Container{
  1660. {
  1661. Name: "agnhost",
  1662. Image: AgnHostImage,
  1663. Args: args,
  1664. },
  1665. },
  1666. },
  1667. }
  1668. }
  1669. // CreateEmptyFileOnPod creates empty file at given path on the pod.
  1670. // TODO(alejandrox1): move to subpkg pod once kubectl methods have been refactored.
  1671. func CreateEmptyFileOnPod(namespace string, podName string, filePath string) error {
  1672. _, err := RunKubectl(namespace, "exec", fmt.Sprintf("--namespace=%s", namespace), podName, "--", "/bin/sh", "-c", fmt.Sprintf("touch %s", filePath))
  1673. return err
  1674. }
  1675. // DumpDebugInfo dumps debug info of tests.
  1676. func DumpDebugInfo(c clientset.Interface, ns string) {
  1677. sl, _ := c.CoreV1().Pods(ns).List(context.TODO(), metav1.ListOptions{LabelSelector: labels.Everything().String()})
  1678. for _, s := range sl.Items {
  1679. desc, _ := RunKubectl(ns, "describe", "po", s.Name, fmt.Sprintf("--namespace=%v", ns))
  1680. Logf("\nOutput of kubectl describe %v:\n%v", s.Name, desc)
  1681. l, _ := RunKubectl(ns, "logs", s.Name, fmt.Sprintf("--namespace=%v", ns), "--tail=100")
  1682. Logf("\nLast 100 log lines of %v:\n%v", s.Name, l)
  1683. }
  1684. }
  1685. // DsFromManifest reads a .json/yaml file and returns the daemonset in it.
  1686. func DsFromManifest(url string) (*appsv1.DaemonSet, error) {
  1687. Logf("Parsing ds from %v", url)
  1688. var response *http.Response
  1689. var err error
  1690. for i := 1; i <= 5; i++ {
  1691. response, err = http.Get(url)
  1692. if err == nil && response.StatusCode == 200 {
  1693. break
  1694. }
  1695. time.Sleep(time.Duration(i) * time.Second)
  1696. }
  1697. if err != nil {
  1698. return nil, fmt.Errorf("Failed to get url: %v", err)
  1699. }
  1700. if response.StatusCode != 200 {
  1701. return nil, fmt.Errorf("invalid http response status: %v", response.StatusCode)
  1702. }
  1703. defer response.Body.Close()
  1704. data, err := ioutil.ReadAll(response.Body)
  1705. if err != nil {
  1706. return nil, fmt.Errorf("Failed to read html response body: %v", err)
  1707. }
  1708. return DsFromData(data)
  1709. }
  1710. // DsFromData reads a byte slice and returns the daemonset in it.
  1711. func DsFromData(data []byte) (*appsv1.DaemonSet, error) {
  1712. var ds appsv1.DaemonSet
  1713. dataJSON, err := utilyaml.ToJSON(data)
  1714. if err != nil {
  1715. return nil, fmt.Errorf("Failed to parse data to json: %v", err)
  1716. }
  1717. err = runtime.DecodeInto(scheme.Codecs.UniversalDecoder(), dataJSON, &ds)
  1718. if err != nil {
  1719. return nil, fmt.Errorf("Failed to decode DaemonSet spec: %v", err)
  1720. }
  1721. return &ds, nil
  1722. }
  1723. // GetClusterZones returns the values of zone label collected from all nodes.
  1724. func GetClusterZones(c clientset.Interface) (sets.String, error) {
  1725. nodes, err := c.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
  1726. if err != nil {
  1727. return nil, fmt.Errorf("Error getting nodes while attempting to list cluster zones: %v", err)
  1728. }
  1729. // collect values of zone label from all nodes
  1730. zones := sets.NewString()
  1731. for _, node := range nodes.Items {
  1732. if zone, found := node.Labels[v1.LabelZoneFailureDomain]; found {
  1733. zones.Insert(zone)
  1734. }
  1735. if zone, found := node.Labels[v1.LabelZoneFailureDomainStable]; found {
  1736. zones.Insert(zone)
  1737. }
  1738. }
  1739. return zones, nil
  1740. }
  1741. // GetFileModeRegex returns a file mode related regex which should be matched by the mounttest pods' output.
  1742. // If the given mask is nil, then the regex will contain the default OS file modes, which are 0644 for Linux and 0775 for Windows.
  1743. func GetFileModeRegex(filePath string, mask *int32) string {
  1744. var (
  1745. linuxMask int32
  1746. windowsMask int32
  1747. )
  1748. if mask == nil {
  1749. linuxMask = int32(0644)
  1750. windowsMask = int32(0775)
  1751. } else {
  1752. linuxMask = *mask
  1753. windowsMask = *mask
  1754. }
  1755. linuxOutput := fmt.Sprintf("mode of file \"%s\": %v", filePath, os.FileMode(linuxMask))
  1756. windowsOutput := fmt.Sprintf("mode of Windows file \"%v\": %s", filePath, os.FileMode(windowsMask))
  1757. return fmt.Sprintf("(%s|%s)", linuxOutput, windowsOutput)
  1758. }
  1759. // PrettyPrintJSON converts metrics to JSON format.
  1760. func PrettyPrintJSON(metrics interface{}) string {
  1761. output := &bytes.Buffer{}
  1762. if err := json.NewEncoder(output).Encode(metrics); err != nil {
  1763. Logf("Error building encoder: %v", err)
  1764. return ""
  1765. }
  1766. formatted := &bytes.Buffer{}
  1767. if err := json.Indent(formatted, output.Bytes(), "", " "); err != nil {
  1768. Logf("Error indenting: %v", err)
  1769. return ""
  1770. }
  1771. return string(formatted.Bytes())
  1772. }