#!/bin/bash

# Copyright 2019 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# A small smoke test to run against a just-deployed kube-up cluster with Windows
# nodes. Performs checks such as:
#   1) Verifying that all Windows nodes have status Ready.
#   2) Verifying that no system pods are attempting to run on Windows nodes.
#   3) Verifying pairwise connectivity between most of the following: Linux
#      pods, Windows pods, K8s services, and the Internet.
#   4) Verifying that basic DNS resolution works in Windows pods.
#
# This script assumes that it is run from the root of the kubernetes repository.
#
# TODOs:
#   - Implement the node-to-pod checks.
#   - Capture stdout for each command to a file and only print it when the test
#     fails.
#   - Move copy-pasted code into reusable functions.
#   - Continue running all checks after one fails.
#   - Test service connectivity by running a test pod with an http server and
#     exposing it as a service (rather than curl-ing from existing system
#     services that don't serve http requests).
#   - Add test retries for transient errors, such as:
#     "error: unable to upgrade connection: Authorization error
#     (user=kube-apiserver, verb=create, resource=nodes, subresource=proxy)"

# Override this to use a different kubectl binary.
# NOTE: $kubectl is expanded unquoted on purpose throughout this script so that
# a value carrying extra arguments still word-splits into command + arguments.
kubectl=kubectl
linux_deployment_timeout=60
windows_deployment_timeout=300
output_file=/tmp/k8s-smoke-test.out

# Exits with status 1 unless every Windows node reports a Ready condition of
# "True". Any other value ("False", "Unknown") is treated as not ready, and a
# failure of the kubectl query itself is reported instead of being mistaken for
# "all nodes ready".
function check_windows_nodes_are_ready {
  local statuses status
  # kubectl filtering is the worst.
  if ! statuses=$(${kubectl} get nodes -l kubernetes.io/os=windows \
    -o jsonpath='{.items[*].status.conditions[?(@.type=="Ready")].status}'); then
    echo "ERROR: failed to query the status of the Windows nodes"
    exit 1
  fi
  # Intentionally unquoted: the jsonpath output is a space-separated list.
  for status in $statuses; do
    if [[ "$status" != "True" ]]; then
      echo "ERROR: some Windows node has status != Ready"
      echo "kubectl get nodes -l kubernetes.io/os=windows"
      ${kubectl} get nodes -l kubernetes.io/os=windows
      exit 1
    fi
  done
  echo "Verified that all Windows nodes have status Ready"
}

# Removes the node.kubernetes.io/os NoSchedule taint from every Windows node.
function untaint_windows_nodes {
  # Untaint the windows nodes to allow test workloads without tolerations to be
  # scheduled onto them.
  local windows_nodes node
  windows_nodes=$(${kubectl} get nodes -l kubernetes.io/os=windows -o name)
  for node in $windows_nodes; do
    ${kubectl} taint node "$node" node.kubernetes.io/os:NoSchedule-
  done
}

# Exits with status 1 if any line of the kube-system pod listing mentions
# "Pending" or "windows".
function check_no_system_pods_on_windows_nodes {
  local windows_system_pods
  windows_system_pods=$(${kubectl} get pods --namespace kube-system \
    -o wide | grep -cE "Pending|windows")

  if [[ $windows_system_pods -ne 0 ]]; then
    echo "ERROR: there are kube-system pods trying to run on Windows nodes"
    echo "kubectl get pods --namespace kube-system -o wide"
    ${kubectl} get pods --namespace kube-system -o wide
    exit 1
  fi
  echo "Verified that all system pods are running on Linux nodes"
}

# Prints the number of pods labeled app=$1 whose Ready condition is "True".
# Each status is matched individually so that not-ready pods are never counted.
function count_ready_pods {
  local pod_label=$1
  ${kubectl} get pods -l app="$pod_label" \
    -o jsonpath='{.items[*].status.conditions[?(@.type=="Ready")].status}' \
    | tr -s '[:space:]' '\n' | grep -c -x "True"
}

# Creates the deployment described by <deployment>.yaml and polls every 10
# seconds until the expected number of pods is Ready.
# Arguments:
#   $1 - deployment name (the manifest is read from $1.yaml)
#   $2 - value of the pods' "app" label
#   $3 - expected number of Ready pods
#   $4 - OS name used in progress messages ("Linux" or "Windows")
#   $5 - timeout in seconds
# Exits the script with status 1 if creation fails or the timeout expires; on
# timeout all test deployments are cleaned up first.
function create_deployment_and_wait {
  local deployment=$1
  local pod_label=$2
  local replicas=$3
  local os_name=$4
  local timeout=$5
  local ready

  if ! ${kubectl} create -f "$deployment.yaml"; then
    echo "kubectl create -f $deployment.yaml failed"
    exit 1
  fi

  while [[ $timeout -gt 0 ]]; do
    echo "Waiting for $replicas $os_name $pod_label pods to become Ready"
    ready=$(count_ready_pods "$pod_label")
    if [[ $ready -eq $replicas ]]; then
      echo "All $pod_label pods became Ready"
      return 0
    fi
    sleep 10
    timeout=$(( timeout - 10 ))
  done

  echo "ERROR: Not all $pod_label pods became Ready"
  echo "kubectl get pods -l app=$pod_label"
  ${kubectl} get pods -l app="$pod_label"
  cleanup_deployments
  exit 1
}

linux_webserver_deployment=linux-nginx
linux_webserver_pod_label=nginx
linux_webserver_replicas=1

# Writes the nginx deployment manifest to the current directory, creates it and
# waits for its pods to become Ready.
function deploy_linux_webserver_pod {
  echo "Writing example deployment to $linux_webserver_deployment.yaml"
  cat <<EOF > "$linux_webserver_deployment.yaml"
apiVersion: apps/v1
kind: Deployment
metadata:
  name: $linux_webserver_deployment
  labels:
    app: $linux_webserver_pod_label
spec:
  replicas: $linux_webserver_replicas
  selector:
    matchLabels:
      app: $linux_webserver_pod_label
  template:
    metadata:
      labels:
        app: $linux_webserver_pod_label
    spec:
      containers:
      - name: nginx
        image: nginx:1.7.9
      nodeSelector:
        kubernetes.io/os: linux
EOF

  create_deployment_and_wait "$linux_webserver_deployment" \
    "$linux_webserver_pod_label" "$linux_webserver_replicas" \
    "Linux" "$linux_deployment_timeout"
}

# Returns the name of an arbitrary Linux webserver pod.
function get_linux_webserver_pod_name {
  $kubectl get pods -l app="$linux_webserver_pod_label" \
    -o jsonpath='{.items[0].metadata.name}'
}

# Returns the IP address of an arbitrary Linux webserver pod.
function get_linux_webserver_pod_ip {
  $kubectl get pods -l app="$linux_webserver_pod_label" \
    -o jsonpath='{.items[0].status.podIP}'
}

# Deletes the Linux webserver deployment.
function undeploy_linux_webserver_pod {
  ${kubectl} delete deployment "$linux_webserver_deployment"
}

linux_command_deployment=linux-ubuntu
linux_command_pod_label=ubuntu
linux_command_replicas=1

# Writes the ubuntu "command pod" deployment manifest to the current directory,
# creates it and waits for its pods to become Ready.
function deploy_linux_command_pod {
  echo "Writing example deployment to $linux_command_deployment.yaml"
  cat <<EOF > "$linux_command_deployment.yaml"
apiVersion: apps/v1
kind: Deployment
metadata:
  name: $linux_command_deployment
  labels:
    app: $linux_command_pod_label
spec:
  replicas: $linux_command_replicas
  selector:
    matchLabels:
      app: $linux_command_pod_label
  template:
    metadata:
      labels:
        app: $linux_command_pod_label
    spec:
      containers:
      - name: ubuntu
        image: ubuntu
        command: ["sleep", "123456"]
      nodeSelector:
        kubernetes.io/os: linux
EOF

  create_deployment_and_wait "$linux_command_deployment" \
    "$linux_command_pod_label" "$linux_command_replicas" \
    "Linux" "$linux_deployment_timeout"
}

# Returns the name of an arbitrary Linux command pod.
function get_linux_command_pod_name {
  $kubectl get pods -l app="$linux_command_pod_label" \
    -o jsonpath='{.items[0].metadata.name}'
}

# Returns the IP address of an arbitrary Linux command pod.
function get_linux_command_pod_ip {
  $kubectl get pods -l app="$linux_command_pod_label" \
    -o jsonpath='{.items[0].status.podIP}'
}

# Installs test executables (ping, curl) in the Linux command pod.
# NOTE: this assumes that there is only one Linux "command pod".
# TODO(pjh): fix this.
function prepare_linux_command_pod {
  local linux_command_pod
  linux_command_pod="$(get_linux_command_pod_name)"

  echo "Installing test utilities in Linux command pod, may take a minute"
  $kubectl exec "$linux_command_pod" -- apt-get update > /dev/null
  $kubectl exec "$linux_command_pod" -- \
    apt-get install -y iputils-ping curl > /dev/null
}

# Deletes the Linux command pod deployment.
function undeploy_linux_command_pod {
  ${kubectl} delete deployment "$linux_command_deployment"
}

windows_webserver_deployment=windows-agnhost
windows_webserver_pod_label=agnhost
# The default port for 'agnhost serve-hostname'. The documentation says that
# this can be changed but the --port arg does not seem to work.
windows_webserver_port=9376
windows_webserver_replicas=1

# Writes the agnhost deployment manifest to the current directory, creates it
# and waits for its pods to become Ready.
function deploy_windows_webserver_pod {
  echo "Writing example deployment to $windows_webserver_deployment.yaml"
  cat <<EOF > "$windows_webserver_deployment.yaml"
# A multi-arch Windows container that runs an HTTP server on port
# $windows_webserver_port that serves the container's hostname.
#   curl -s http://<pod_ip>:$windows_webserver_port
apiVersion: apps/v1
kind: Deployment
metadata:
  name: $windows_webserver_deployment
  labels:
    app: $windows_webserver_pod_label
spec:
  replicas: $windows_webserver_replicas
  selector:
    matchLabels:
      app: $windows_webserver_pod_label
  template:
    metadata:
      labels:
        app: $windows_webserver_pod_label
    spec:
      containers:
      - name: agnhost
        image: e2eteam/agnhost:2.8
        args:
        - serve-hostname
      nodeSelector:
        kubernetes.io/os: windows
      tolerations:
      - effect: NoSchedule
        key: node.kubernetes.io/os
        operator: Equal
        value: windows
EOF

  create_deployment_and_wait "$windows_webserver_deployment" \
    "$windows_webserver_pod_label" "$windows_webserver_replicas" \
    "Windows" "$windows_deployment_timeout"
}

# Returns the name of an arbitrary Windows webserver pod.
function get_windows_webserver_pod_name {
  $kubectl get pods -l app="$windows_webserver_pod_label" \
    -o jsonpath='{.items[0].metadata.name}'
}

# Returns the IP address of an arbitrary Windows webserver pod.
function get_windows_webserver_pod_ip {
  $kubectl get pods -l app="$windows_webserver_pod_label" \
    -o jsonpath='{.items[0].status.podIP}'
}

# Deletes the Windows webserver deployment.
function undeploy_windows_webserver_pod {
  ${kubectl} delete deployment "$windows_webserver_deployment"
}

windows_command_deployment=windows-powershell
windows_command_pod_label=powershell
windows_command_replicas=1

# Deploys a multi-arch Windows pod capable of running PowerShell.
function deploy_windows_command_pod {
  echo "Writing example deployment to $windows_command_deployment.yaml"
  cat <<EOF > "$windows_command_deployment.yaml"
apiVersion: apps/v1
kind: Deployment
metadata:
  name: $windows_command_deployment
  labels:
    app: $windows_command_pod_label
spec:
  replicas: $windows_command_replicas
  selector:
    matchLabels:
      app: $windows_command_pod_label
  template:
    metadata:
      labels:
        app: $windows_command_pod_label
    spec:
      containers:
      - name: pause-win
        image: gcr.io/gke-release/pause-win:1.1.0
      nodeSelector:
        kubernetes.io/os: windows
      tolerations:
      - effect: NoSchedule
        key: node.kubernetes.io/os
        operator: Equal
        value: windows
EOF

  create_deployment_and_wait "$windows_command_deployment" \
    "$windows_command_pod_label" "$windows_command_replicas" \
    "Windows" "$windows_deployment_timeout"
}

# Returns the name of an arbitrary Windows command pod.
function get_windows_command_pod_name {
  $kubectl get pods -l app="$windows_command_pod_label" \
    -o jsonpath='{.items[0].metadata.name}'
}

# Returns the IP address of an arbitrary Windows command pod.
function get_windows_command_pod_ip {
  $kubectl get pods -l app="$windows_command_pod_label" \
    -o jsonpath='{.items[0].status.podIP}'
}

# Deletes the Windows command pod deployment.
function undeploy_windows_command_pod {
  ${kubectl} delete deployment "$windows_command_deployment"
}

function test_linux_node_to_linux_pod {
  echo "TODO: ${FUNCNAME[0]}"
}

function test_linux_node_to_windows_pod {
  echo "TODO: ${FUNCNAME[0]}"
}

# Curls the Linux webserver pod's IP from the Linux command pod. On failure,
# cleans up the deployments, prints the captured output and exits with 1.
function test_linux_pod_to_linux_pod {
  echo "TEST: ${FUNCNAME[0]}"
  local linux_command_pod
  linux_command_pod="$(get_linux_command_pod_name)"
  local linux_webserver_pod_ip
  linux_webserver_pod_ip="$(get_linux_webserver_pod_ip)"

  if ! $kubectl exec "$linux_command_pod" -- curl -s -m 20 \
      "http://$linux_webserver_pod_ip" &> "$output_file"; then
    cleanup_deployments
    echo "Failing output: $(cat "$output_file")"
    echo "FAILED: ${FUNCNAME[0]}"
    exit 1
  fi
}

# TODO(pjh): this test flakily fails on brand-new clusters, not sure why.
#   % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
#                                  Dload  Upload   Total   Spent    Left  Speed
#   0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
# curl: (6) Could not resolve host:
# command terminated with exit code 6
#
# Curls the Windows webserver pod (IP and $windows_webserver_port) from the
# Linux command pod. On failure, cleans up the deployments, prints the captured
# output and exits with 1.
function test_linux_pod_to_windows_pod {
  echo "TEST: ${FUNCNAME[0]}"
  local linux_command_pod
  linux_command_pod="$(get_linux_command_pod_name)"
  local windows_webserver_pod_ip
  windows_webserver_pod_ip="$(get_windows_webserver_pod_ip)"

  if ! $kubectl exec "$linux_command_pod" -- curl -s -m 20 \
      "http://$windows_webserver_pod_ip:$windows_webserver_port" &> "$output_file"; then
    cleanup_deployments
    echo "Failing output: $(cat "$output_file")"
    echo "FAILED: ${FUNCNAME[0]}"
    echo "This test seems to be flaky. TODO(pjh): investigate."
    exit 1
  fi
}

# Curls a public HTTP server from the Linux command pod. Both stdout and stderr
# are captured so that the failure report includes curl/kubectl error text.
function test_linux_pod_to_internet {
  echo "TEST: ${FUNCNAME[0]}"
  local linux_command_pod
  linux_command_pod="$(get_linux_command_pod_name)"
  # A stable (hopefully) HTTP server provided by Cloudflare.
  local internet_ip="1.1.1.1"

  if ! $kubectl exec "$linux_command_pod" -- curl -s -m 20 \
      "http://$internet_ip" &> "$output_file"; then
    cleanup_deployments
    echo "Failing output: $(cat "$output_file")"
    echo "FAILED: ${FUNCNAME[0]}"
    exit 1
  fi
}

# Curls the metrics-server service's cluster IP and TCP port from the Linux
# command pod.
function test_linux_pod_to_k8s_service {
  echo "TEST: ${FUNCNAME[0]}"
  local linux_command_pod
  linux_command_pod="$(get_linux_command_pod_name)"
  local service="metrics-server"
  local service_ip
  service_ip=$($kubectl get service --namespace kube-system "$service" \
    -o jsonpath='{.spec.clusterIP}')
  local service_port
  service_port=$($kubectl get service --namespace kube-system "$service" \
    -o jsonpath='{.spec.ports[?(@.protocol=="TCP")].port}')
  echo "curl-ing $service address from Linux pod: $service_ip:$service_port"

  # curl-ing the metrics-server service downloads 14 bytes of unprintable binary
  # data and sets a return code of success (0).
  if ! $kubectl exec "$linux_command_pod" -- \
      curl -s -m 20 "http://$service_ip:$service_port" &> "$output_file"; then
    cleanup_deployments
    echo "Failing output: $(cat "$output_file")"
    echo "FAILED: ${FUNCNAME[0]}"
    exit 1
  fi
}

function test_windows_node_to_linux_pod {
  echo "TODO: ${FUNCNAME[0]}"
}

function test_windows_node_to_windows_pod {
  echo "TODO: ${FUNCNAME[0]}"
}

# TODO(pjh): this test failed for me once with
#   error: unable to upgrade connection: container not found ("nettest")
# Maybe the container crashed for some reason? Investigate if it happens more.
#
# TODO(pjh): another one-time failure:
#   error: unable to upgrade connection: Authorization error
#   (user=kube-apiserver, verb=create, resource=nodes, subresource=proxy)
#
# Fetches the Linux webserver pod's IP over HTTP from the Windows command pod
# using PowerShell's curl.
function test_windows_pod_to_linux_pod {
  echo "TEST: ${FUNCNAME[0]}"
  local windows_command_pod
  windows_command_pod="$(get_windows_command_pod_name)"
  local linux_webserver_pod_ip
  linux_webserver_pod_ip="$(get_linux_webserver_pod_ip)"

  if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
      "curl -UseBasicParsing http://$linux_webserver_pod_ip" &> \
      "$output_file"; then
    cleanup_deployments
    echo "Failing output: $(cat "$output_file")"
    echo "FAILED: ${FUNCNAME[0]}"
    exit 1
  fi
}

# Fetches the Windows webserver pod (IP and $windows_webserver_port) over HTTP
# from the Windows command pod.
function test_windows_pod_to_windows_pod {
  echo "TEST: ${FUNCNAME[0]}"
  local windows_command_pod
  windows_command_pod="$(get_windows_command_pod_name)"
  local windows_webserver_pod_ip
  windows_webserver_pod_ip="$(get_windows_webserver_pod_ip)"

  if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
      "curl -UseBasicParsing http://$windows_webserver_pod_ip:$windows_webserver_port" \
      &> "$output_file"; then
    cleanup_deployments
    echo "Failing output: $(cat "$output_file")"
    echo "FAILED: ${FUNCNAME[0]}"
    exit 1
  fi
}

# Fetches a public HTTP server from the Windows command pod.
function test_windows_pod_to_internet {
  echo "TEST: ${FUNCNAME[0]}"
  local windows_command_pod
  windows_command_pod="$(get_windows_command_pod_name)"
  # A stable (hopefully) HTTP server provided by Cloudflare. If this ever stops
  # working, we can request from 8.8.8.8 (Google DNS) using https instead.
  local internet_ip="1.1.1.1"

  if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
      "curl -UseBasicParsing http://$internet_ip" &> "$output_file"; then
    cleanup_deployments
    echo "Failing output: $(cat "$output_file")"
    echo "FAILED: ${FUNCNAME[0]}"
    exit 1
  fi
}

# Checks that the metrics-server service address is reachable from the Windows
# command pod; see the comment in the body for what counts as "reachable".
function test_windows_pod_to_k8s_service {
  echo "TEST: ${FUNCNAME[0]}"
  local windows_command_pod
  windows_command_pod="$(get_windows_command_pod_name)"
  local service="metrics-server"
  local service_ip
  service_ip=$($kubectl get service --namespace kube-system "$service" \
    -o jsonpath='{.spec.clusterIP}')
  local service_port
  service_port=$($kubectl get service --namespace kube-system "$service" \
    -o jsonpath='{.spec.ports[?(@.protocol=="TCP")].port}')
  local service_address="$service_ip:$service_port"

  echo "curl-ing $service address from Windows pod: $service_address"
  # curl-ing the metrics-server service results in a ServerProtocolViolation
  # ("The server committed a protocol violation. Section=ResponseStatusLine")
  # exception. Since we don't care about what the metrics-server actually gives
  # back to us, just that we can reach it, we check that we get the expected
  # exception code and not some other exception code.
  # TODO: it might be less fragile to check that we don't get the "Unable to
  # connect to the remote server" exception code (2) instead of specifically
  # expecting the protocol-violation exception code (11).
  #
  # In the PowerShell snippet below, \$ keeps bash from expanding PowerShell
  # variables and \` emits a literal backtick (PowerShell line continuation).
  if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
      "\$result = try { \`
         curl -UseBasicParsing http://$service_address -ErrorAction Stop \`
       } catch [System.Net.WebException] { \`
         \$_ \`
       }; \`
       if ([int]\$result.Exception.Status -eq 11) { \`
         Write-Host \"curl $service_address got expected exception\"
         exit 0 \`
       } else { \`
         Write-Host \"curl $service_address got unexpected result/exception: \$result\"
         exit 1 \`
       }" &> "$output_file"; then
    cleanup_deployments
    echo "Failing output: $(cat "$output_file")"
    echo "FAILED: ${FUNCNAME[0]}"
    exit 1
  fi
}

# Resolves www.bing.com from the Windows command pod, querying the kube-dns
# service's cluster IP directly.
function test_kube_dns_in_windows_pod {
  echo "TEST: ${FUNCNAME[0]}"
  local windows_command_pod
  windows_command_pod="$(get_windows_command_pod_name)"
  local service="kube-dns"
  local service_ip
  service_ip=$($kubectl get service --namespace kube-system "$service" \
    -o jsonpath='{.spec.clusterIP}')

  if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
      "Resolve-DnsName www.bing.com -server $service_ip" &> "$output_file"; then
    cleanup_deployments
    echo "Failing output: $(cat "$output_file")"
    echo "FAILED: ${FUNCNAME[0]}"
    exit 1
  fi
}

# Fetches http://www.bing.com by hostname from the Windows command pod, without
# naming a DNS server explicitly.
function test_dns_just_works_in_windows_pod {
  echo "TEST: ${FUNCNAME[0]}"
  local windows_command_pod
  windows_command_pod="$(get_windows_command_pod_name)"

  if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
      "curl -UseBasicParsing http://www.bing.com" &> "$output_file"; then
    cleanup_deployments
    echo "Failing output: $(cat "$output_file")"
    echo "FAILED: ${FUNCNAME[0]}"
    exit 1
  fi
}

# Deletes all four test deployments.
function cleanup_deployments {
  undeploy_linux_webserver_pod
  undeploy_linux_command_pod
  undeploy_windows_webserver_pod
  undeploy_windows_command_pod
}

check_windows_nodes_are_ready
untaint_windows_nodes
check_no_system_pods_on_windows_nodes

deploy_linux_webserver_pod
deploy_linux_command_pod
deploy_windows_webserver_pod
deploy_windows_command_pod
prepare_linux_command_pod
echo ""

test_linux_node_to_linux_pod
test_linux_node_to_windows_pod
test_linux_pod_to_linux_pod
test_linux_pod_to_windows_pod
# NOTE(review): test_linux_pod_to_internet is defined above but is not invoked
# here; confirm whether that omission is intentional.
test_linux_pod_to_k8s_service

# Note: test_windows_node_to_k8s_service is not supported at this time.
# https://docs.microsoft.com/en-us/virtualization/windowscontainers/kubernetes/common-problems#my-windows-node-cannot-access-my-services-using-the-service-ip
test_windows_node_to_linux_pod
test_windows_node_to_windows_pod
test_windows_pod_to_linux_pod
test_windows_pod_to_windows_pod
test_windows_pod_to_internet
test_windows_pod_to_k8s_service
test_kube_dns_in_windows_pod
test_dns_just_works_in_windows_pod
echo ""

cleanup_deployments
echo "All tests passed!"
exit 0