health-monitor.sh 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. #!/usr/bin/env bash
  2. # Copyright 2016 The Kubernetes Authors.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. # This script is for master and node instance health monitoring, which is
  16. # packed in kube-manifest tarball. It is executed through a systemd service
  17. # in cluster/gce/gci/<master/node>.yaml. The env variables come from an env
  18. # file provided by the systemd service.
  # Fail fast on typos and hidden pipeline errors: referencing an unset
  # variable is an error, and a pipeline fails if any of its stages fails.
  set -o nounset -o pipefail
  21. # We simply kill the process when there is a failure. Another systemd service will
  22. # automatically restart the process.
  #######################################
  # Monitors the container runtime forever and kills it when its health check
  # fails, relying on systemd to bring it back up.
  # Globals:
  #   KUBE_HOME              (read) directory containing bin/crictl
  #   CONTAINER_RUNTIME      (read, default "docker") selects the health check:
  #                          "docker" -> `docker ps`, anything else -> `crictl pods`
  #   CONTAINER_RUNTIME_NAME (read, default "docker") systemd unit to kill
  #   SLEEP_SECONDS          (read) pause between two successful health checks
  # Arguments:
  #   None
  # Outputs:
  #   Progress and failure messages on stdout.
  # Returns:
  #   Never returns; loops until the script itself is terminated.
  #######################################
  function container_runtime_monitoring {
    local -r max_attempts=5
    local attempt=1
    local delay
    local -r crictl="${KUBE_HOME}/bin/crictl"
    local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}"
    # We still need to use `docker ps` when container runtime is "docker". This is because
    # dockershim is still part of kubelet today. When kubelet is down, crictl pods
    # will also fail, and docker will be killed. This is undesirable especially when
    # docker live restore is disabled.
    #
    # The command is kept as an array so that it is executed word-for-word,
    # without depending on unquoted word splitting of a string.
    local -a healthcheck_command=(docker ps)
    if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then
      healthcheck_command=("${crictl}" pods)
    fi

    # Container runtime startup takes time. Make initial attempts (with
    # exponential backoff: 2, 4, 8, ... seconds) before starting to kill the
    # container runtime.
    until timeout 60 "${healthcheck_command[@]}" > /dev/null; do
      if (( attempt == max_attempts )); then
        echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness."
        break
      fi
      delay=$(( 2 ** attempt ))
      # Report the delay that is actually slept, not the attempt counter.
      echo "${attempt} initial attempt \"${healthcheck_command[*]}\"! Trying again in ${delay} seconds..."
      sleep "${delay}"
      attempt=$(( attempt + 1 ))
    done

    while true; do
      if ! timeout 60 "${healthcheck_command[@]}" > /dev/null; then
        echo "Container runtime ${container_runtime_name} failed!"
        if [[ "${container_runtime_name}" == "docker" ]]; then
          # Dump stack of docker daemon for investigation.
          # Log file name looks like goroutine-stacks-TIMESTAMP and will be saved to
          # the exec root directory, which is /var/run/docker/ on Ubuntu and COS.
          pkill -SIGUSR1 dockerd
        fi
        systemctl kill --kill-who=main "${container_runtime_name}"
        # Wait for a while, as we don't want to kill it again before it is really up.
        sleep 120
      else
        sleep "${SLEEP_SECONDS}"
      fi
    done
  }
  #######################################
  # Monitors the kubelet forever by polling its local healthz endpoint and
  # kills the kubelet systemd unit whenever the probe fails.
  # Globals:
  #   SLEEP_SECONDS (read) pause between two successful health checks
  # Arguments:
  #   None
  # Outputs:
  #   Progress messages and the failing probe's response/errors on stdout.
  # Returns:
  #   Never returns; loops until the script itself is terminated.
  #######################################
  function kubelet_monitoring {
    echo "Wait for 2 minutes for kubelet to be functional"
    # TODO(andyzheng0831): replace it with a more reliable method if possible.
    sleep 120
    local -r max_seconds=10
    local output=""
    while true; do
      # -f makes curl fail on HTTP errors, -s -S silences progress but keeps
      # error messages, and 2>&1 folds those errors into ${output}.
      if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10255/healthz 2>&1); then
        # Print the response and/or errors verbatim; quoting prevents word
        # splitting and glob expansion of the captured text.
        printf '%s\n' "${output}"
        echo "Kubelet is unhealthy!"
        systemctl kill kubelet
        # Wait for a while, as we don't want to kill it again before it is really up.
        sleep 60
      else
        sleep "${SLEEP_SECONDS}"
      fi
    done
  }
  ############## Main Function ################
  # Usage: health-monitor.sh <container-runtime/kubelet>
  #
  # Validates the single argument, loads the environment from
  # ${KUBE_HOME}/kube-env and then hands control to the monitoring loop of the
  # requested component.
  if [[ "$#" -ne 1 ]]; then
    echo "Usage: health-monitor.sh <container-runtime/kubelet>"
    exit 1
  fi

  KUBE_HOME="/home/kubernetes"
  KUBE_ENV="${KUBE_HOME}/kube-env"
  if [[ ! -e "${KUBE_ENV}" ]]; then
    echo "The ${KUBE_ENV} file does not exist!! Terminate health monitoring"
    exit 1
  fi

  # Default polling interval; it is assigned before kube-env is sourced, so a
  # value defined in that file takes precedence.
  SLEEP_SECONDS=10
  component=$1
  echo "Start kubernetes health monitoring for ${component}"
  source "${KUBE_ENV}"
  if [[ "${component}" == "container-runtime" ]]; then
    container_runtime_monitoring
  elif [[ "${component}" == "kubelet" ]]; then
    kubelet_monitoring
  else
    # The inner quotes are escaped so that they are printed and ${component}
    # stays inside the quoted string (no word splitting or globbing).
    echo "Health monitoring for component \"${component}\" is not supported!"
    # An unknown component is a usage error, like the checks above.
    exit 1
  fi