validate-cluster.sh 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. #!/usr/bin/env bash
  2. # Copyright 2014 The Kubernetes Authors.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. # Validates that the cluster is healthy.
  16. # Error codes are:
  17. # 0 - success
  18. # 1 - fatal (cluster is unlikely to work)
  19. # 2 - non-fatal (encountered some errors, but cluster should be working correctly)
  # Strict mode: abort on command failure, on use of an unset variable, and
  # when any stage of a pipeline fails.
  set -euo pipefail

  # Repository root, resolved relative to the location of this script.
  KUBE_ROOT="$(dirname "${BASH_SOURCE[0]}")/.."

  # cluster/env.sh is optional; load it only when it exists.
  if [[ -f "${KUBE_ROOT}/cluster/env.sh" ]]; then
    source "${KUBE_ROOT}/cluster/env.sh"
  fi

  # Helper libraries that are always sourced.
  source "${KUBE_ROOT}/hack/lib/util.sh"
  source "${KUBE_ROOT}/cluster/kube-util.sh"
  # Run kubectl and retry upon failure.
  #
  # Globals:   KUBE_ROOT (read) - used to locate cluster/kubectl.sh
  # Arguments: passed through unchanged to cluster/kubectl.sh
  # Outputs:   kubectl's own output; retry/give-up notices go to stderr
  # Returns:   0 as soon as one invocation succeeds, 1 after 3 failed attempts
  function kubectl_retry() {
    # Keep the counter local so it neither leaks into nor clobbers a global
    # variable of the same name in the calling script.
    local tries=3
    while ! "${KUBE_ROOT}/cluster/kubectl.sh" "$@"; do
      tries=$((tries-1))
      if [[ ${tries} -le 0 ]]; then
        echo "('kubectl $*' failed, giving up)" >&2
        return 1
      fi
      echo "(kubectl failed, will retry ${tries} times)" >&2
      sleep 1
    done
  }
  # Tunables, overridable from the environment:
  # - ALLOWED_NOTREADY_NODES: how many nodes may be missing/not ready (default 0).
  # - CLUSTER_READY_ADDITIONAL_TIME_SECONDS: extra time, in seconds, to wait for
  #   the remaining nodes once the required number is ready (default 30).
  : "${ALLOWED_NOTREADY_NODES:=0}"
  : "${CLUSTER_READY_ADDITIONAL_TIME_SECONDS:=30}"

  # Work out how many nodes the cluster is expected to have.
  if [[ "${KUBERNETES_PROVIDER:-}" != "gce" ]]; then
    EXPECTED_NUM_NODES="${NUM_NODES}"
  else
    EXPECTED_NUM_NODES="0"
    if [[ "${KUBE_CREATE_NODES}" == "true" ]]; then
      EXPECTED_NUM_NODES="$(get-num-nodes)"
    fi
    echo "Validating gce cluster, MULTIZONE=${MULTIZONE:-}"
    # In multizone mode we need to add instances for all nodes in the region:
    # count the instances whose name matches a node prefix in any zone of REGION.
    if [[ "${MULTIZONE:-}" == "true" ]]; then
      EXPECTED_NUM_NODES=$(gcloud -q compute instances list --project="${PROJECT}" --format="[no-heading]" \
        --filter="(name ~ '${NODE_INSTANCE_PREFIX}.*' OR name ~ '${WINDOWS_NODE_INSTANCE_PREFIX}.*') AND zone:($(gcloud -q compute zones list --project="${PROJECT}" --filter=region="${REGION}" --format="csv[no-heading](name)" | tr "\n" "," | sed "s/,$//"))" | wc -l)
      echo "Computing number of nodes, NODE_INSTANCE_PREFIX=${NODE_INSTANCE_PREFIX}, REGION=${REGION}, EXPECTED_NUM_NODES=${EXPECTED_NUM_NODES}"
    fi
  fi

  # When the master kubelet registers itself, masters show up as nodes too.
  if [[ "${REGISTER_MASTER_KUBELET:-}" == "true" ]]; then
    NUM_MASTERS=1
    if [[ "${KUBERNETES_PROVIDER:-}" == "gce" ]]; then
      NUM_MASTERS=$(get-master-replicas-count)
    fi
    EXPECTED_NUM_NODES=$((EXPECTED_NUM_NODES+NUM_MASTERS))
  fi

  # Minimum number of ready nodes for the validation not to be fatal.
  REQUIRED_NUM_NODES=$((EXPECTED_NUM_NODES - ALLOWED_NOTREADY_NODES))
  # Make several attempts to deal with slow cluster birth.
  return_value=0
  attempt=0
  # MAX_ATTEMPTS iterations with PAUSE_BETWEEN_ITERATIONS_SECONDS between them
  # give a timeout of ~25 minutes (100 x 15 seconds), which avoids timeouts for
  # 1000-node clusters.
  PAUSE_BETWEEN_ITERATIONS_SECONDS=15
  MAX_ATTEMPTS=100
  # CLUSTER_READY_ADDITIONAL_TIME_SECONDS expressed as a number of loop
  # iterations, rounded up.
  ADDITIONAL_ITERATIONS=$(((CLUSTER_READY_ADDITIONAL_TIME_SECONDS + PAUSE_BETWEEN_ITERATIONS_SECONDS - 1)/PAUSE_BETWEEN_ITERATIONS_SECONDS))
  while true; do
    # Pause between iterations of this large outer loop. Use the named constant
    # so the pause stays in sync with the ADDITIONAL_ITERATIONS computation.
    if [[ ${attempt} -gt 0 ]]; then
      sleep "${PAUSE_BETWEEN_ITERATIONS_SECONDS}"
    fi
    attempt=$((attempt+1))

    # "kubectl get nodes --no-headers" prints one line per node.
    #
    # From that output gather 2 counts:
    # - Total number of nodes.
    # - Number of "ready" nodes.
    #
    # kubectl may fail here: during cluster bootstrapping for clusters where the
    # master node is registered, the apiserver will become available and then
    # get restarted as the kubelet configures the docker bridge.
    #
    # The exit status of kubectl_retry is therefore captured into res via
    # "&& ... || ..." instead of being allowed to stop the whole script.
    #
    # Bash command substitution $(kubectl_...) removes all trailing newlines,
    # which are important for line counting. Use the trick from
    # https://unix.stackexchange.com/a/383411 (append a "." sentinel and strip
    # it afterwards) to avoid newline truncation.
    node=$(kubectl_retry get nodes --no-headers; ret=$?; echo .; exit "$ret") && res="$?" || res="$?"
    node="${node%.}"
    if [ "${res}" -ne "0" ]; then
      # Give up once the attempt budget (last_run if set, else MAX_ATTEMPTS) is
      # exhausted; otherwise try again.
      if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
        echo -e "${color_red:-} Failed to get nodes.${color_norm:-}"
        exit 1
      else
        continue
      fi
    fi
    found=$(echo -n "${node}" | wc -l)
    # Use grep || true so that empty result doesn't return nonzero exit code.
    ready=$(echo -n "${node}" | grep -c -v "NotReady" || true)

    # The color_* variables are expanded with ":-" throughout so that the script
    # does not abort under nounset when they are not defined.
    if (( "${found}" == "${EXPECTED_NUM_NODES}" )) && (( "${ready}" == "${EXPECTED_NUM_NODES}")); then
      break
    elif (( "${found}" > "${EXPECTED_NUM_NODES}" )); then
      if [[ "${KUBE_USE_EXISTING_MASTER:-}" != "true" ]]; then
        echo -e "${color_red:-}Found ${found} nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm:-}"
      fi
      break
    elif (( "${ready}" > "${EXPECTED_NUM_NODES}")); then
      echo -e "${color_red:-}Found ${ready} ready nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm:-}"
      break
    else
      # Enough nodes are ready: start (once) a shorter grace period of
      # ADDITIONAL_ITERATIONS for the remaining nodes to join. last_run is only
      # set the first time this branch is reached.
      if [[ "${REQUIRED_NUM_NODES}" -le "${ready}" ]]; then
        echo -e "${color_green:-}Found ${REQUIRED_NUM_NODES} Nodes, allowing additional ${ADDITIONAL_ITERATIONS} iterations for other Nodes to join.${color_norm:-}"
        last_run="${last_run:-$((attempt + ADDITIONAL_ITERATIONS - 1))}"
      fi
      if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
        echo -e "${color_yellow:-}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm:-}"
        kubectl_retry get nodes
        # Fewer ready nodes than required is fatal (exit 1); otherwise record
        # the non-fatal result (2) and continue with the remaining checks.
        if [[ "${REQUIRED_NUM_NODES}" -gt "${ready}" ]]; then
          exit 1
        else
          return_value=2
          break
        fi
      else
        echo -e "${color_yellow:-}Waiting for ${EXPECTED_NUM_NODES} ready nodes. ${ready} ready nodes, ${found} registered. Retrying.${color_norm:-}"
      fi
    fi
  done
  echo "Found ${found} node(s)."
  kubectl_retry get nodes
  attempt=0
  while true; do
    # "kubectl get componentstatuses" with the template below prints, for each
    # component, the type and status of its first condition as "<type>:<status>",
    # one component per line.
    #
    # From that output gather 2 counts:
    # - Total number of componentstatuses reporting a "Healthy" condition.
    # - Number of those that are healthy ("Healthy:True").
    #
    # Failures are tolerated ("|| true"): a failed query or a grep without
    # matches yields a count of 0, which is treated as "not working yet" below.
    cs_status=$(kubectl_retry get componentstatuses -o template --template='{{range .items}}{{with index .conditions 0}}{{.type}}:{{.status}}{{end}}{{"\n"}}{{end}}') || true
    componentstatuses=$(echo "${cs_status}" | grep -c 'Healthy:') || true
    healthy=$(echo "${cs_status}" | grep -c 'Healthy:True') || true

    # The color_* variables are expanded with ":-" so that the script does not
    # abort under nounset when they are not defined.
    if ((componentstatuses > healthy)) || ((componentstatuses == 0)); then
      if ((attempt < 5)); then
        echo -e "${color_yellow:-}Cluster not working yet.${color_norm:-}"
        attempt=$((attempt+1))
        sleep 30
      else
        echo -e " ${color_yellow:-}Validate output:${color_norm:-}"
        # Best effort only: a kubectl failure here must not stop the script
        # under errexit before the final diagnosis below is printed.
        kubectl_retry get cs || true
        echo -e "${color_red:-}Validation returned one or more failed components. Cluster is probably broken.${color_norm:-}"
        exit 1
      fi
    else
      break
    fi
  done
  # Print the final component status listing (best effort) and the verdict, then
  # exit with return_value: 0 on success, 2 when non-fatal problems were seen.
  echo "Validate output:"
  kubectl_retry get cs || true
  # The color_* variables are expanded with ":-" so that the script does not
  # abort under nounset when they are not defined.
  if [[ "${return_value}" == "0" ]]; then
    echo -e "${color_green:-}Cluster validation succeeded${color_norm:-}"
  else
    echo -e "${color_yellow:-}Cluster validation encountered some problems, but cluster should be in working order${color_norm:-}"
  fi
  exit "${return_value}"