/*
Copyright 2014 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package framework

import (
	"fmt"
	"os"
	"path"
	"path/filepath"
	"strings"
	"sync"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	clientset "k8s.io/client-go/kubernetes"

	// TODO: Remove the following imports (ref: https://github.com/kubernetes/kubernetes/issues/81245)
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
)
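
// etcdImage is the etcd image tag exported to the upgrade scripts via TEST_ETCD_IMAGE.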
const etcdImage = "3.4.3-0"

// EtcdUpgrade upgrades etcd on GCE.
func EtcdUpgrade(targetStorage, targetVersion string) error {
	switch TestContext.Provider {
	case "gce":
		return etcdUpgradeGCE(targetStorage, targetVersion)
	default:
		return fmt.Errorf("EtcdUpgrade() is not implemented for provider %s", TestContext.Provider)
	}
}

// MasterUpgrade upgrades the master node on GCE, GKE, or kubernetes-anywhere,
// depending on TestContext.Provider.
func MasterUpgrade(f *Framework, v string) error {
	switch TestContext.Provider {
	case "gce":
		return masterUpgradeGCE(v, false)
	case "gke":
		return masterUpgradeGKE(f.Namespace.Name, v)
	case "kubernetes-anywhere":
		return masterUpgradeKubernetesAnywhere(v)
	default:
		return fmt.Errorf("MasterUpgrade() is not implemented for provider %s", TestContext.Provider)
	}
}
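
// etcdUpgradeGCE invokes the GCE upgrade script with the target etcd version
// and storage backend exported in the environment.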
func etcdUpgradeGCE(targetStorage, targetVersion string) error {
	env := append(
		os.Environ(),
		"TEST_ETCD_VERSION="+targetVersion,
		"STORAGE_BACKEND="+targetStorage,
		"TEST_ETCD_IMAGE="+etcdImage)
	_, _, err := RunCmdEnv(env, gceUpgradeScript(), "-l", "-M")
	return err
}

// MasterUpgradeGCEWithKubeProxyDaemonSet upgrades the master node on GCE,
// enabling or disabling the kube-proxy DaemonSet.
// TODO(mrhohn): Remove this function when kube-proxy is run as a DaemonSet by default.
func MasterUpgradeGCEWithKubeProxyDaemonSet(v string, enableKubeProxyDaemonSet bool) error {
	return masterUpgradeGCE(v, enableKubeProxyDaemonSet)
}

// TODO(mrhohn): Remove 'enableKubeProxyDaemonSet' when kube-proxy is run as a DaemonSet by default.
func masterUpgradeGCE(rawV string, enableKubeProxyDaemonSet bool) error {
	env := append(os.Environ(), fmt.Sprintf("KUBE_PROXY_DAEMONSET=%v", enableKubeProxyDaemonSet))
	// TODO: Remove these variables when they're no longer needed for downgrades.
	if TestContext.EtcdUpgradeVersion != "" && TestContext.EtcdUpgradeStorage != "" {
		env = append(env,
			"TEST_ETCD_VERSION="+TestContext.EtcdUpgradeVersion,
			"STORAGE_BACKEND="+TestContext.EtcdUpgradeStorage,
			"TEST_ETCD_IMAGE="+etcdImage)
	} else {
		// In e2e tests, we skip the confirmation prompt about
		// implicit etcd upgrades to simulate the user entering "y".
		env = append(env, "TEST_ALLOW_IMPLICIT_ETCD_UPGRADE=true")
	}

	v := "v" + rawV
	_, _, err := RunCmdEnv(env, gceUpgradeScript(), "-M", v)
	return err
}
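
// locationParamGKE returns the gcloud location flag for the cluster under
// test: --region for regional (multi-master) clusters, --zone otherwise.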
func locationParamGKE() string {
	if TestContext.CloudConfig.MultiMaster {
		// GKE Regional Clusters are being tested.
		return fmt.Sprintf("--region=%s", TestContext.CloudConfig.Region)
	}
	return fmt.Sprintf("--zone=%s", TestContext.CloudConfig.Zone)
}
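
// appendContainerCommandGroupIfNeeded prefixes the gcloud args with the
// "beta" command group when a regional cluster is being tested.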
func appendContainerCommandGroupIfNeeded(args []string) []string {
	if TestContext.CloudConfig.Region != "" {
		// TODO(wojtek-t): Get rid of it once Regional Clusters go to GA.
		return append([]string{"beta"}, args...)
	}
	return args
}
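
// masterUpgradeGKE upgrades the cluster master via
// `gcloud container clusters upgrade --master`, then waits for the
// SSH tunnels to re-establish.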
func masterUpgradeGKE(namespace string, v string) error {
	Logf("Upgrading master to %q", v)
	args := []string{
		"container",
		"clusters",
		fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID),
		locationParamGKE(),
		"upgrade",
		TestContext.CloudConfig.Cluster,
		"--master",
		fmt.Sprintf("--cluster-version=%s", v),
		"--quiet",
	}
	_, _, err := RunCmd("gcloud", appendContainerCommandGroupIfNeeded(args)...)
	if err != nil {
		return err
	}

	waitForSSHTunnels(namespace)

	return nil
}
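
// masterUpgradeKubernetesAnywhere upgrades the master of a kubernetes-anywhere
// cluster by rewriting kubernetes_version in its .config and invoking the
// upgrade-master make target.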
func masterUpgradeKubernetesAnywhere(v string) error {
	Logf("Upgrading master to %q", v)

	kaPath := TestContext.KubernetesAnywherePath
	originalConfigPath := filepath.Join(kaPath, ".config")
	backupConfigPath := filepath.Join(kaPath, ".config.bak")
	updatedConfigPath := filepath.Join(kaPath, fmt.Sprintf(".config-%s", v))

	// modify config with specified k8s version
	if _, _, err := RunCmd("sed",
		"-i.bak", // writes original to .config.bak
		fmt.Sprintf(`s/kubernetes_version=.*$/kubernetes_version=%q/`, v),
		originalConfigPath); err != nil {
		return err
	}

	defer func() {
		// revert .config.bak to .config
		if err := os.Rename(backupConfigPath, originalConfigPath); err != nil {
			Logf("Could not rename %s back to %s", backupConfigPath, originalConfigPath)
		}
	}()

	// invoke ka upgrade
	if _, _, err := RunCmd("make", "-C", TestContext.KubernetesAnywherePath,
		"WAIT_FOR_KUBECONFIG=y", "upgrade-master"); err != nil {
		return err
	}

	// move .config to .config.<version>
	if err := os.Rename(originalConfigPath, updatedConfigPath); err != nil {
		return err
	}

	return nil
}

// NodeUpgrade upgrades nodes on GCE/GKE.
func NodeUpgrade(f *Framework, v string, img string) error {
	// Perform the upgrade.
	var err error
	switch TestContext.Provider {
	case "gce":
		err = nodeUpgradeGCE(v, img, false)
	case "gke":
		err = nodeUpgradeGKE(f.Namespace.Name, v, img)
	default:
		err = fmt.Errorf("NodeUpgrade() is not implemented for provider %s", TestContext.Provider)
	}
	if err != nil {
		return err
	}
	return waitForNodesReadyAfterUpgrade(f)
}

// NodeUpgradeGCEWithKubeProxyDaemonSet upgrades nodes on GCE, enabling or
// disabling the kube-proxy DaemonSet.
// TODO(mrhohn): Remove this function when kube-proxy is run as a DaemonSet by default.
func NodeUpgradeGCEWithKubeProxyDaemonSet(f *Framework, v string, img string, enableKubeProxyDaemonSet bool) error {
	// Perform the upgrade.
	if err := nodeUpgradeGCE(v, img, enableKubeProxyDaemonSet); err != nil {
		return err
	}
	return waitForNodesReadyAfterUpgrade(f)
}
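
// waitForNodesReadyAfterUpgrade blocks until every registered node reports
// Ready, or RestartNodeReadyAgainTimeout expires.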
func waitForNodesReadyAfterUpgrade(f *Framework) error {
	// Wait for it to complete and validate nodes are healthy.
	//
	// TODO(ihmccreery) We shouldn't have to wait for nodes to be ready in
	// GKE; the operation shouldn't return until they all are.
	numNodes, err := e2enode.TotalRegistered(f.ClientSet)
	if err != nil {
		return fmt.Errorf("couldn't detect number of nodes: %v", err)
	}
	Logf("Waiting up to %v for all %d nodes to be ready after the upgrade", RestartNodeReadyAgainTimeout, numNodes)
	if _, err := e2enode.CheckReady(f.ClientSet, numNodes, RestartNodeReadyAgainTimeout); err != nil {
		return err
	}
	return nil
}
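
// nodeUpgradeGCE runs the GCE upgrade script with -N to upgrade the nodes;
// a non-empty img additionally switches the node OS distribution via -o.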
// TODO(mrhohn): Remove 'enableKubeProxyDaemonSet' when kube-proxy is run as a DaemonSet by default.
func nodeUpgradeGCE(rawV, img string, enableKubeProxyDaemonSet bool) error {
	v := "v" + rawV
	env := append(os.Environ(), fmt.Sprintf("KUBE_PROXY_DAEMONSET=%v", enableKubeProxyDaemonSet))
	if img != "" {
		env = append(env, "KUBE_NODE_OS_DISTRIBUTION="+img)
		_, _, err := RunCmdEnv(env, gceUpgradeScript(), "-N", "-o", v)
		return err
	}
	_, _, err := RunCmdEnv(env, gceUpgradeScript(), "-N", v)
	return err
}
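
// nodeUpgradeGKE upgrades each node pool of the cluster in turn via
// `gcloud container clusters upgrade`, waiting for the SSH tunnels to
// re-establish after every pool.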
func nodeUpgradeGKE(namespace string, v string, img string) error {
	Logf("Upgrading nodes to version %q and image %q", v, img)
	nps, err := nodePoolsGKE()
	if err != nil {
		return err
	}
	Logf("Found node pools %v", nps)
	for _, np := range nps {
		args := []string{
			"container",
			"clusters",
			fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID),
			locationParamGKE(),
			"upgrade",
			TestContext.CloudConfig.Cluster,
			fmt.Sprintf("--node-pool=%s", np),
			fmt.Sprintf("--cluster-version=%s", v),
			"--quiet",
		}
		if len(img) > 0 {
			args = append(args, fmt.Sprintf("--image-type=%s", img))
		}
		_, _, err = RunCmd("gcloud", appendContainerCommandGroupIfNeeded(args)...)
		if err != nil {
			return err
		}

		waitForSSHTunnels(namespace)
	}
	return nil
}
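
// nodePoolsGKE returns the names of the cluster's node pools, as listed by
// `gcloud container node-pools list`.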
func nodePoolsGKE() ([]string, error) {
	args := []string{
		"container",
		"node-pools",
		fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID),
		locationParamGKE(),
		"list",
		fmt.Sprintf("--cluster=%s", TestContext.CloudConfig.Cluster),
		"--format=get(name)",
	}
	stdout, _, err := RunCmd("gcloud", appendContainerCommandGroupIfNeeded(args)...)
	if err != nil {
		return nil, err
	}
	if len(strings.TrimSpace(stdout)) == 0 {
		return []string{}, nil
	}
	return strings.Fields(stdout), nil
}
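
// gceUpgradeScript returns the path to the upgrade script: the override from
// TestContext.GCEUpgradeScript if set, else cluster/gce/upgrade.sh in the repo.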
func gceUpgradeScript() string {
	if len(TestContext.GCEUpgradeScript) == 0 {
		return path.Join(TestContext.RepoRoot, "cluster/gce/upgrade.sh")
	}
	return TestContext.GCEUpgradeScript
}
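
// waitForSSHTunnels checks that the master can reach pods again after an
// upgrade by running a throwaway busybox pod and polling for its logs;
// log retrieval traverses the master's SSH tunnels, so a successful
// `kubectl logs` implies the tunnels are up.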
func waitForSSHTunnels(namespace string) {
	Logf("Waiting for SSH tunnels to establish")
	RunKubectl(namespace, "run", "ssh-tunnel-test",
		"--image=busybox",
		"--restart=Never",
		"--command", "--",
		"echo", "Hello")
	defer RunKubectl(namespace, "delete", "pod", "ssh-tunnel-test")

	// allow up to a minute for new ssh tunnels to establish
	wait.PollImmediate(5*time.Second, time.Minute, func() (bool, error) {
		_, err := RunKubectl(namespace, "logs", "ssh-tunnel-test")
		return err == nil, nil
	})
}

// NodeKiller is a utility to simulate node failures.
type NodeKiller struct {
	config   NodeKillerConfig
	client   clientset.Interface
	provider string
}

// NewNodeKiller creates a new NodeKiller.
func NewNodeKiller(config NodeKillerConfig, client clientset.Interface, provider string) *NodeKiller {
	config.NodeKillerStopCh = make(chan struct{})
	return &NodeKiller{config, client, provider}
}
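
// A minimal usage sketch (hypothetical caller; cfg, client, and stopCh are
// illustrative names, and cfg is assumed to have Interval, JitterFactor,
// FailureRatio, and SimulatedDowntime populated):
//
//	stopCh := make(chan struct{})
//	killer := NewNodeKiller(cfg, client, TestContext.Provider)
//	go killer.Run(stopCh)
//	// ... exercise the cluster while nodes fail and recover ...
//	close(stopCh)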

// Run starts NodeKiller until stopCh is closed.
func (k *NodeKiller) Run(stopCh <-chan struct{}) {
	// wait.JitterUntil starts work immediately, so wait first.
	time.Sleep(wait.Jitter(k.config.Interval, k.config.JitterFactor))
	wait.JitterUntil(func() {
		nodes := k.pickNodes()
		k.kill(nodes)
	}, k.config.Interval, k.config.JitterFactor, true, stopCh)
}
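
// pickNodes selects a FailureRatio fraction of the ready, schedulable nodes
// as victims for the next round of simulated failures.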
func (k *NodeKiller) pickNodes() []v1.Node {
	nodes, err := e2enode.GetReadySchedulableNodes(k.client)
	ExpectNoError(err)
	numNodes := int(k.config.FailureRatio * float64(len(nodes.Items)))

	nodes, err = e2enode.GetBoundedReadySchedulableNodes(k.client, numNodes)
	ExpectNoError(err)
	return nodes.Items
}
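
// kill simulates a failure on each node in parallel: it stops docker and
// kubelet over SSH, sleeps for the configured downtime, then reboots the node.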
func (k *NodeKiller) kill(nodes []v1.Node) {
	wg := sync.WaitGroup{}
	wg.Add(len(nodes))
	for _, node := range nodes {
		node := node
		go func() {
			defer wg.Done()

			Logf("Stopping docker and kubelet on %q to simulate failure", node.Name)
			err := e2essh.IssueSSHCommand("sudo systemctl stop docker kubelet", k.provider, &node)
			if err != nil {
				Logf("ERROR while stopping node %q: %v", node.Name, err)
				return
			}

			time.Sleep(k.config.SimulatedDowntime)

			Logf("Rebooting %q to repair the node", node.Name)
			err = e2essh.IssueSSHCommand("sudo reboot", k.provider, &node)
			if err != nil {
				Logf("ERROR while rebooting node %q: %v", node.Name, err)
				return
			}
		}()
	}
	wg.Wait()
}