nodes_util.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379
  1. /*
  2. Copyright 2014 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package framework
  14. import (
  15. "fmt"
  16. "os"
  17. "path"
  18. "path/filepath"
  19. "strings"
  20. "sync"
  21. "time"
  22. v1 "k8s.io/api/core/v1"
  23. "k8s.io/apimachinery/pkg/util/wait"
  24. clientset "k8s.io/client-go/kubernetes"
  25. e2elog "k8s.io/kubernetes/test/e2e/framework/log"
  26. e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
  27. )
  28. // EtcdUpgrade upgrades etcd on GCE.
  29. func EtcdUpgrade(targetStorage, targetVersion string) error {
  30. switch TestContext.Provider {
  31. case "gce":
  32. return etcdUpgradeGCE(targetStorage, targetVersion)
  33. default:
  34. return fmt.Errorf("EtcdUpgrade() is not implemented for provider %s", TestContext.Provider)
  35. }
  36. }
  37. // MasterUpgrade upgrades master node on GCE/GKE.
  38. func MasterUpgrade(v string) error {
  39. switch TestContext.Provider {
  40. case "gce":
  41. return masterUpgradeGCE(v, false)
  42. case "gke":
  43. return masterUpgradeGKE(v)
  44. case "kubernetes-anywhere":
  45. return masterUpgradeKubernetesAnywhere(v)
  46. default:
  47. return fmt.Errorf("MasterUpgrade() is not implemented for provider %s", TestContext.Provider)
  48. }
  49. }
  50. func etcdUpgradeGCE(targetStorage, targetVersion string) error {
  51. env := append(
  52. os.Environ(),
  53. "TEST_ETCD_VERSION="+targetVersion,
  54. "STORAGE_BACKEND="+targetStorage,
  55. "TEST_ETCD_IMAGE=3.3.10-1")
  56. _, _, err := RunCmdEnv(env, gceUpgradeScript(), "-l", "-M")
  57. return err
  58. }
  59. // MasterUpgradeGCEWithKubeProxyDaemonSet upgrades master node on GCE with enabling/disabling the daemon set of kube-proxy.
  60. // TODO(mrhohn): Remove this function when kube-proxy is run as a DaemonSet by default.
  61. func MasterUpgradeGCEWithKubeProxyDaemonSet(v string, enableKubeProxyDaemonSet bool) error {
  62. return masterUpgradeGCE(v, enableKubeProxyDaemonSet)
  63. }
  64. // TODO(mrhohn): Remove 'enableKubeProxyDaemonSet' when kube-proxy is run as a DaemonSet by default.
  65. func masterUpgradeGCE(rawV string, enableKubeProxyDaemonSet bool) error {
  66. env := append(os.Environ(), fmt.Sprintf("KUBE_PROXY_DAEMONSET=%v", enableKubeProxyDaemonSet))
  67. // TODO: Remove these variables when they're no longer needed for downgrades.
  68. if TestContext.EtcdUpgradeVersion != "" && TestContext.EtcdUpgradeStorage != "" {
  69. env = append(env,
  70. "TEST_ETCD_VERSION="+TestContext.EtcdUpgradeVersion,
  71. "STORAGE_BACKEND="+TestContext.EtcdUpgradeStorage,
  72. "TEST_ETCD_IMAGE=3.3.10-1")
  73. } else {
  74. // In e2e tests, we skip the confirmation prompt about
  75. // implicit etcd upgrades to simulate the user entering "y".
  76. env = append(env, "TEST_ALLOW_IMPLICIT_ETCD_UPGRADE=true")
  77. }
  78. v := "v" + rawV
  79. _, _, err := RunCmdEnv(env, gceUpgradeScript(), "-M", v)
  80. return err
  81. }
  82. func locationParamGKE() string {
  83. if TestContext.CloudConfig.MultiMaster {
  84. // GKE Regional Clusters are being tested.
  85. return fmt.Sprintf("--region=%s", TestContext.CloudConfig.Region)
  86. }
  87. return fmt.Sprintf("--zone=%s", TestContext.CloudConfig.Zone)
  88. }
  89. func appendContainerCommandGroupIfNeeded(args []string) []string {
  90. if TestContext.CloudConfig.Region != "" {
  91. // TODO(wojtek-t): Get rid of it once Regional Clusters go to GA.
  92. return append([]string{"beta"}, args...)
  93. }
  94. return args
  95. }
  96. func masterUpgradeGKE(v string) error {
  97. e2elog.Logf("Upgrading master to %q", v)
  98. args := []string{
  99. "container",
  100. "clusters",
  101. fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID),
  102. locationParamGKE(),
  103. "upgrade",
  104. TestContext.CloudConfig.Cluster,
  105. "--master",
  106. fmt.Sprintf("--cluster-version=%s", v),
  107. "--quiet",
  108. }
  109. _, _, err := RunCmd("gcloud", appendContainerCommandGroupIfNeeded(args)...)
  110. if err != nil {
  111. return err
  112. }
  113. waitForSSHTunnels()
  114. return nil
  115. }
  116. func masterUpgradeKubernetesAnywhere(v string) error {
  117. e2elog.Logf("Upgrading master to %q", v)
  118. kaPath := TestContext.KubernetesAnywherePath
  119. originalConfigPath := filepath.Join(kaPath, ".config")
  120. backupConfigPath := filepath.Join(kaPath, ".config.bak")
  121. updatedConfigPath := filepath.Join(kaPath, fmt.Sprintf(".config-%s", v))
  122. // modify config with specified k8s version
  123. if _, _, err := RunCmd("sed",
  124. "-i.bak", // writes original to .config.bak
  125. fmt.Sprintf(`s/kubernetes_version=.*$/kubernetes_version=%q/`, v),
  126. originalConfigPath); err != nil {
  127. return err
  128. }
  129. defer func() {
  130. // revert .config.bak to .config
  131. if err := os.Rename(backupConfigPath, originalConfigPath); err != nil {
  132. e2elog.Logf("Could not rename %s back to %s", backupConfigPath, originalConfigPath)
  133. }
  134. }()
  135. // invoke ka upgrade
  136. if _, _, err := RunCmd("make", "-C", TestContext.KubernetesAnywherePath,
  137. "WAIT_FOR_KUBECONFIG=y", "upgrade-master"); err != nil {
  138. return err
  139. }
  140. // move .config to .config.<version>
  141. if err := os.Rename(originalConfigPath, updatedConfigPath); err != nil {
  142. return err
  143. }
  144. return nil
  145. }
  146. // NodeUpgrade upgrades nodes on GCE/GKE.
  147. func NodeUpgrade(f *Framework, v string, img string) error {
  148. // Perform the upgrade.
  149. var err error
  150. switch TestContext.Provider {
  151. case "gce":
  152. err = nodeUpgradeGCE(v, img, false)
  153. case "gke":
  154. err = nodeUpgradeGKE(v, img)
  155. default:
  156. err = fmt.Errorf("NodeUpgrade() is not implemented for provider %s", TestContext.Provider)
  157. }
  158. if err != nil {
  159. return err
  160. }
  161. return waitForNodesReadyAfterUpgrade(f)
  162. }
  163. // NodeUpgradeGCEWithKubeProxyDaemonSet upgrades nodes on GCE with enabling/disabling the daemon set of kube-proxy.
  164. // TODO(mrhohn): Remove this function when kube-proxy is run as a DaemonSet by default.
  165. func NodeUpgradeGCEWithKubeProxyDaemonSet(f *Framework, v string, img string, enableKubeProxyDaemonSet bool) error {
  166. // Perform the upgrade.
  167. if err := nodeUpgradeGCE(v, img, enableKubeProxyDaemonSet); err != nil {
  168. return err
  169. }
  170. return waitForNodesReadyAfterUpgrade(f)
  171. }
  172. func waitForNodesReadyAfterUpgrade(f *Framework) error {
  173. // Wait for it to complete and validate nodes are healthy.
  174. //
  175. // TODO(ihmccreery) We shouldn't have to wait for nodes to be ready in
  176. // GKE; the operation shouldn't return until they all are.
  177. numNodes, err := NumberOfRegisteredNodes(f.ClientSet)
  178. if err != nil {
  179. return fmt.Errorf("couldn't detect number of nodes")
  180. }
  181. e2elog.Logf("Waiting up to %v for all %d nodes to be ready after the upgrade", RestartNodeReadyAgainTimeout, numNodes)
  182. if _, err := CheckNodesReady(f.ClientSet, numNodes, RestartNodeReadyAgainTimeout); err != nil {
  183. return err
  184. }
  185. return nil
  186. }
  187. // TODO(mrhohn): Remove 'enableKubeProxyDaemonSet' when kube-proxy is run as a DaemonSet by default.
  188. func nodeUpgradeGCE(rawV, img string, enableKubeProxyDaemonSet bool) error {
  189. v := "v" + rawV
  190. env := append(os.Environ(), fmt.Sprintf("KUBE_PROXY_DAEMONSET=%v", enableKubeProxyDaemonSet))
  191. if img != "" {
  192. env = append(env, "KUBE_NODE_OS_DISTRIBUTION="+img)
  193. _, _, err := RunCmdEnv(env, gceUpgradeScript(), "-N", "-o", v)
  194. return err
  195. }
  196. _, _, err := RunCmdEnv(env, gceUpgradeScript(), "-N", v)
  197. return err
  198. }
  199. func nodeUpgradeGKE(v string, img string) error {
  200. e2elog.Logf("Upgrading nodes to version %q and image %q", v, img)
  201. args := []string{
  202. "container",
  203. "clusters",
  204. fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID),
  205. locationParamGKE(),
  206. "upgrade",
  207. TestContext.CloudConfig.Cluster,
  208. fmt.Sprintf("--cluster-version=%s", v),
  209. "--quiet",
  210. }
  211. if len(img) > 0 {
  212. args = append(args, fmt.Sprintf("--image-type=%s", img))
  213. }
  214. _, _, err := RunCmd("gcloud", appendContainerCommandGroupIfNeeded(args)...)
  215. if err != nil {
  216. return err
  217. }
  218. waitForSSHTunnels()
  219. return nil
  220. }
  221. // MigTemplate (GCE-only) returns the name of the MIG template that the
  222. // nodes of the cluster use.
  223. func MigTemplate() (string, error) {
  224. var errLast error
  225. var templ string
  226. key := "instanceTemplate"
  227. if wait.Poll(Poll, SingleCallTimeout, func() (bool, error) {
  228. // TODO(mikedanese): make this hit the compute API directly instead of
  229. // shelling out to gcloud.
  230. // An `instance-groups managed describe` call outputs what we want to stdout.
  231. output, _, err := retryCmd("gcloud", "compute", "instance-groups", "managed",
  232. fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID),
  233. "describe",
  234. fmt.Sprintf("--zone=%s", TestContext.CloudConfig.Zone),
  235. TestContext.CloudConfig.NodeInstanceGroup)
  236. if err != nil {
  237. errLast = fmt.Errorf("gcloud compute instance-groups managed describe call failed with err: %v", err)
  238. return false, nil
  239. }
  240. // The 'describe' call probably succeeded; parse the output and try to
  241. // find the line that looks like "instanceTemplate: url/to/<templ>" and
  242. // return <templ>.
  243. if val := ParseKVLines(output, key); len(val) > 0 {
  244. url := strings.Split(val, "/")
  245. templ = url[len(url)-1]
  246. e2elog.Logf("MIG group %s using template: %s", TestContext.CloudConfig.NodeInstanceGroup, templ)
  247. return true, nil
  248. }
  249. errLast = fmt.Errorf("couldn't find %s in output to get MIG template. Output: %s", key, output)
  250. return false, nil
  251. }) != nil {
  252. return "", fmt.Errorf("MigTemplate() failed with last error: %v", errLast)
  253. }
  254. return templ, nil
  255. }
  256. func gceUpgradeScript() string {
  257. if len(TestContext.GCEUpgradeScript) == 0 {
  258. return path.Join(TestContext.RepoRoot, "cluster/gce/upgrade.sh")
  259. }
  260. return TestContext.GCEUpgradeScript
  261. }
  262. func waitForSSHTunnels() {
  263. e2elog.Logf("Waiting for SSH tunnels to establish")
  264. RunKubectl("run", "ssh-tunnel-test",
  265. "--image=busybox",
  266. "--restart=Never",
  267. "--command", "--",
  268. "echo", "Hello")
  269. defer RunKubectl("delete", "pod", "ssh-tunnel-test")
  270. // allow up to a minute for new ssh tunnels to establish
  271. wait.PollImmediate(5*time.Second, time.Minute, func() (bool, error) {
  272. _, err := RunKubectl("logs", "ssh-tunnel-test")
  273. return err == nil, nil
  274. })
  275. }
// NodeKiller is a utility to simulate node failures.
// It periodically picks a fraction of the ready, schedulable nodes, stops
// docker and kubelet on them over SSH, and reboots them after a delay.
type NodeKiller struct {
	// config supplies the interval, jitter factor, failure ratio and
	// simulated downtime used by Run, pickNodes and kill.
	config NodeKillerConfig
	// client is used to list the nodes that are candidates for killing.
	client clientset.Interface
	// provider is passed through to the SSH helper when issuing commands.
	provider string
}
  282. // NewNodeKiller creates new NodeKiller.
  283. func NewNodeKiller(config NodeKillerConfig, client clientset.Interface, provider string) *NodeKiller {
  284. return &NodeKiller{config, client, provider}
  285. }
  286. // Run starts NodeKiller until stopCh is closed.
  287. func (k *NodeKiller) Run(stopCh <-chan struct{}) {
  288. // wait.JitterUntil starts work immediately, so wait first.
  289. time.Sleep(wait.Jitter(k.config.Interval, k.config.JitterFactor))
  290. wait.JitterUntil(func() {
  291. nodes := k.pickNodes()
  292. k.kill(nodes)
  293. }, k.config.Interval, k.config.JitterFactor, true, stopCh)
  294. }
  295. func (k *NodeKiller) pickNodes() []v1.Node {
  296. nodes := GetReadySchedulableNodesOrDie(k.client)
  297. numNodes := int(k.config.FailureRatio * float64(len(nodes.Items)))
  298. shuffledNodes := shuffleNodes(nodes.Items)
  299. if len(shuffledNodes) > numNodes {
  300. return shuffledNodes[:numNodes]
  301. }
  302. return shuffledNodes
  303. }
  304. func (k *NodeKiller) kill(nodes []v1.Node) {
  305. wg := sync.WaitGroup{}
  306. wg.Add(len(nodes))
  307. for _, node := range nodes {
  308. node := node
  309. go func() {
  310. defer wg.Done()
  311. e2elog.Logf("Stopping docker and kubelet on %q to simulate failure", node.Name)
  312. err := e2essh.IssueSSHCommand("sudo systemctl stop docker kubelet", k.provider, &node)
  313. if err != nil {
  314. e2elog.Logf("ERROR while stopping node %q: %v", node.Name, err)
  315. return
  316. }
  317. time.Sleep(k.config.SimulatedDowntime)
  318. e2elog.Logf("Rebooting %q to repair the node", node.Name)
  319. err = e2essh.IssueSSHCommand("sudo reboot", k.provider, &node)
  320. if err != nil {
  321. e2elog.Logf("ERROR while rebooting node %q: %v", node.Name, err)
  322. return
  323. }
  324. }()
  325. }
  326. wg.Wait()
  327. }
  328. // DeleteNodeOnCloudProvider deletes the specified node.
  329. func DeleteNodeOnCloudProvider(node *v1.Node) error {
  330. return TestContext.CloudConfig.Provider.DeleteNode(node)
  331. }