worker_test.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package prober

import (
	"fmt"
	"testing"
	"time"

	"k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes/fake"
	"k8s.io/client-go/tools/record"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
	"k8s.io/kubernetes/pkg/kubelet/prober/results"
	"k8s.io/kubernetes/pkg/kubelet/status"
	statustest "k8s.io/kubernetes/pkg/kubelet/status/testing"
	"k8s.io/kubernetes/pkg/probe"
	"k8s.io/utils/exec"
)
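// ReallyCrash makes HandleCrash from k8s.io/apimachinery/pkg/util/runtime
// re-panic instead of only logging, so unexpected panics fail these tests.
// TestHandleCrash temporarily turns it off to exercise panic recovery.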
func init() {
	runtime.ReallyCrash = true
}
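// TestDoProbe runs doProbe for each probe type against a table of pod
// statuses and checks both the returned "continue" flag and the result
// recorded for the test container.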
func TestDoProbe(t *testing.T) {
	m := newTestManager()

	for _, probeType := range [...]probeType{liveness, readiness, startup} {
		// Test statuses.
		runningStatus := getTestRunningStatusWithStarted(probeType != startup)
		pendingStatus := getTestRunningStatusWithStarted(probeType != startup)
		pendingStatus.ContainerStatuses[0].State.Running = nil
		terminatedStatus := getTestRunningStatusWithStarted(probeType != startup)
		terminatedStatus.ContainerStatuses[0].State.Running = nil
		terminatedStatus.ContainerStatuses[0].State.Terminated = &v1.ContainerStateTerminated{
			StartedAt: metav1.Now(),
		}
		otherStatus := getTestRunningStatusWithStarted(probeType != startup)
		otherStatus.ContainerStatuses[0].Name = "otherContainer"
		failedStatus := getTestRunningStatusWithStarted(probeType != startup)
		failedStatus.Phase = v1.PodFailed

		tests := []struct {
			probe          v1.Probe
			podStatus      *v1.PodStatus
			expectContinue bool
			expectSet      bool
			expectedResult results.Result
		}{
			{ // No status.
				expectContinue: true,
			},
			{ // Pod failed.
				podStatus: &failedStatus,
			},
			{ // No container status.
				podStatus:      &otherStatus,
				expectContinue: true,
			},
			{ // Container waiting.
				podStatus:      &pendingStatus,
				expectContinue: true,
				expectSet:      true,
				expectedResult: results.Failure,
			},
			{ // Container terminated.
				podStatus:      &terminatedStatus,
				expectSet:      true,
				expectedResult: results.Failure,
			},
			{ // Probe successful.
				podStatus:      &runningStatus,
				expectContinue: true,
				expectSet:      true,
				expectedResult: results.Success,
			},
			{ // Initial delay passed.
				podStatus: &runningStatus,
				probe: v1.Probe{
					InitialDelaySeconds: -100,
				},
				expectContinue: true,
				expectSet:      true,
				expectedResult: results.Success,
			},
		}

		for i, test := range tests {
			w := newTestWorker(m, probeType, test.probe)
			if test.podStatus != nil {
				m.statusManager.SetPodStatus(w.pod, *test.podStatus)
			}
			if c := w.doProbe(); c != test.expectContinue {
				t.Errorf("[%s-%d] Expected continue to be %v but got %v", probeType, i, test.expectContinue, c)
			}
			result, ok := resultsManager(m, probeType).Get(testContainerID)
			if ok != test.expectSet {
				t.Errorf("[%s-%d] Expected to have result: %v but got %v", probeType, i, test.expectSet, ok)
			}
			if result != test.expectedResult {
				t.Errorf("[%s-%d] Expected result: %v but got %v", probeType, i, test.expectedResult, result)
			}

			// Clean up.
			m.statusManager = status.NewManager(&fake.Clientset{}, kubepod.NewBasicPodManager(nil, nil, nil, nil), &statustest.FakePodDeletionSafetyProvider{})
			resultsManager(m, probeType).Remove(testContainerID)
		}
	}
}
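// TestInitialDelay verifies that, while InitialDelaySeconds has not yet
// elapsed, each probe type reports its default result, and that the real
// probe result is reported once the delay has passed.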
func TestInitialDelay(t *testing.T) {
	m := newTestManager()

	for _, probeType := range [...]probeType{liveness, readiness, startup} {
		w := newTestWorker(m, probeType, v1.Probe{
			InitialDelaySeconds: 10,
		})
		m.statusManager.SetPodStatus(w.pod, getTestRunningStatusWithStarted(probeType != startup))

		expectContinue(t, w, w.doProbe(), "during initial delay")
		// The default result depends on the probe type: Success for liveness,
		// Failure for readiness, and Unknown for startup.
		switch probeType {
		case liveness:
			expectResult(t, w, results.Success, "during initial delay")
		case readiness:
			expectResult(t, w, results.Failure, "during initial delay")
		case startup:
			expectResult(t, w, results.Unknown, "during initial delay")
		}

		// 100 seconds later...
		laterStatus := getTestRunningStatusWithStarted(probeType != startup)
		laterStatus.ContainerStatuses[0].State.Running.StartedAt.Time =
			time.Now().Add(-100 * time.Second)
		m.statusManager.SetPodStatus(w.pod, laterStatus)

		// Second call should succeed (already waited).
		expectContinue(t, w, w.doProbe(), "after initial delay")
		expectResult(t, w, results.Success, "after initial delay")
	}
}
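// TestFailureThreshold verifies that a readiness worker keeps reporting
// Success until FailureThreshold consecutive probes have failed.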
func TestFailureThreshold(t *testing.T) {
	m := newTestManager()
	w := newTestWorker(m, readiness, v1.Probe{SuccessThreshold: 1, FailureThreshold: 3})
	m.statusManager.SetPodStatus(w.pod, getTestRunningStatus())

	for i := 0; i < 2; i++ {
		// First probe should succeed.
		m.prober.exec = fakeExecProber{probe.Success, nil}

		for j := 0; j < 3; j++ {
			msg := fmt.Sprintf("%d success (%d)", j+1, i)
			expectContinue(t, w, w.doProbe(), msg)
			expectResult(t, w, results.Success, msg)
		}

		// Prober starts failing :(
		m.prober.exec = fakeExecProber{probe.Failure, nil}

		// Next 2 probes should still be "success".
		for j := 0; j < 2; j++ {
			msg := fmt.Sprintf("%d failing (%d)", j+1, i)
			expectContinue(t, w, w.doProbe(), msg)
			expectResult(t, w, results.Success, msg)
		}

		// Third & following fail.
		for j := 0; j < 3; j++ {
			msg := fmt.Sprintf("%d failure (%d)", j+3, i)
			expectContinue(t, w, w.doProbe(), msg)
			expectResult(t, w, results.Failure, msg)
		}
	}
}
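// TestSuccessThreshold verifies that a readiness worker only reports Success
// after SuccessThreshold consecutive successful probes, and that a single
// failure (FailureThreshold: 1) immediately drops the result back to Failure.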
func TestSuccessThreshold(t *testing.T) {
	m := newTestManager()
	w := newTestWorker(m, readiness, v1.Probe{SuccessThreshold: 3, FailureThreshold: 1})
	m.statusManager.SetPodStatus(w.pod, getTestRunningStatus())

	// Start out failure.
	w.resultsManager.Set(testContainerID, results.Failure, &v1.Pod{})

	for i := 0; i < 2; i++ {
		// Probe defaults to Failure.
		for j := 0; j < 2; j++ {
			msg := fmt.Sprintf("%d success (%d)", j+1, i)
			expectContinue(t, w, w.doProbe(), msg)
			expectResult(t, w, results.Failure, msg)
		}

		// Continuing success!
		for j := 0; j < 3; j++ {
			msg := fmt.Sprintf("%d success (%d)", j+3, i)
			expectContinue(t, w, w.doProbe(), msg)
			expectResult(t, w, results.Success, msg)
		}

		// Prober flakes :(
		m.prober.exec = fakeExecProber{probe.Failure, nil}
		msg := fmt.Sprintf("1 failure (%d)", i)
		expectContinue(t, w, w.doProbe(), msg)
		expectResult(t, w, results.Failure, msg)

		// Back to success.
		m.prober.exec = fakeExecProber{probe.Success, nil}
	}
}
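// TestCleanUp verifies that stopping a running worker clears its result and
// removes it from the manager's worker map, and that stop() is safe to call
// repeatedly.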
func TestCleanUp(t *testing.T) {
	m := newTestManager()

	for _, probeType := range [...]probeType{liveness, readiness, startup} {
		key := probeKey{testPodUID, testContainerName, probeType}
		w := newTestWorker(m, probeType, v1.Probe{})
		m.statusManager.SetPodStatus(w.pod, getTestRunningStatusWithStarted(probeType != startup))
		go w.run()
		m.workers[key] = w

		// Wait for worker to run.
		condition := func() (bool, error) {
			ready, _ := resultsManager(m, probeType).Get(testContainerID)
			return ready == results.Success, nil
		}
		if ready, _ := condition(); !ready {
			if err := wait.Poll(100*time.Millisecond, wait.ForeverTestTimeout, condition); err != nil {
				t.Fatalf("[%s] Error waiting for worker ready: %v", probeType, err)
			}
		}

		for i := 0; i < 10; i++ {
			w.stop() // Stop should be callable multiple times without consequence.
		}
		if err := waitForWorkerExit(m, []probeKey{key}); err != nil {
			t.Fatalf("[%s] error waiting for worker exit: %v", probeType, err)
		}

		if _, ok := resultsManager(m, probeType).Get(testContainerID); ok {
			t.Errorf("[%s] Expected result to be cleared.", probeType)
		}
		if _, ok := m.workers[key]; ok {
			t.Errorf("[%s] Expected worker to be cleared.", probeType)
		}
	}
}
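// TestHandleCrash verifies that doProbe recovers from a panicking prober and
// leaves the previously recorded result in place.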
func TestHandleCrash(t *testing.T) {
	runtime.ReallyCrash = false // Test that we *don't* really crash.
	m := newTestManager()
	w := newTestWorker(m, readiness, v1.Probe{})
	m.statusManager.SetPodStatus(w.pod, getTestRunningStatus())

	expectContinue(t, w, w.doProbe(), "Initial successful probe.")
	expectResult(t, w, results.Success, "Initial successful probe.")

	// Prober starts crashing.
	m.prober = &prober{
		refManager: kubecontainer.NewRefManager(),
		recorder:   &record.FakeRecorder{},
		exec:       crashingExecProber{},
	}

	// doProbe should recover from the crash, and keep going.
	expectContinue(t, w, w.doProbe(), "Crashing probe.")
	expectResult(t, w, results.Success, "Crashing probe unchanged.")
}
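// expectResult fails the test if the worker has no recorded result, or if the
// recorded result differs from expectedResult.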
func expectResult(t *testing.T, w *worker, expectedResult results.Result, msg string) {
	result, ok := resultsManager(w.probeManager, w.probeType).Get(w.containerID)
	if !ok {
		t.Errorf("[%s - %s] Expected result to be set, but was not set", w.probeType, msg)
	} else if result != expectedResult {
		t.Errorf("[%s - %s] Expected result to be %v, but was %v",
			w.probeType, msg, expectedResult, result)
	}
}
func expectContinue(t *testing.T, w *worker, c bool, msg string) {
	if !c {
		t.Errorf("[%s - %s] Expected to continue, but did not", w.probeType, msg)
	}
}
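// resultsManager returns the results.Manager of m that tracks results for the
// given probe type.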
func resultsManager(m *manager, probeType probeType) results.Manager {
	switch probeType {
	case readiness:
		return m.readinessManager
	case liveness:
		return m.livenessManager
	case startup:
		return m.startupManager
	}
	panic(fmt.Errorf("Unhandled case: %v", probeType))
}
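// crashingExecProber is an exec prober whose Probe always panics; it is used
// by TestHandleCrash to exercise panic recovery in doProbe.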
type crashingExecProber struct{}

func (p crashingExecProber) Probe(_ exec.Cmd) (probe.Result, string, error) {
	panic("Intentional Probe crash.")
}
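// TestOnHoldOnLivenessOrStartupCheckFailure verifies that after a liveness or
// startup probe failure the worker goes on hold (stops probing) and stays on
// hold until the container is restarted with a new container ID.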
func TestOnHoldOnLivenessOrStartupCheckFailure(t *testing.T) {
	m := newTestManager()

	for _, probeType := range [...]probeType{liveness, startup} {
		w := newTestWorker(m, probeType, v1.Probe{SuccessThreshold: 1, FailureThreshold: 1})
		status := getTestRunningStatusWithStarted(probeType != startup)
		m.statusManager.SetPodStatus(w.pod, status)

		// First probe should fail.
		m.prober.exec = fakeExecProber{probe.Failure, nil}
		msg := "first probe"
		expectContinue(t, w, w.doProbe(), msg)
		expectResult(t, w, results.Failure, msg)
		if !w.onHold {
			t.Errorf("Prober should be on hold due to %s check failure", probeType)
		}

		// Set fakeExecProber to return success. However, the result will remain
		// failure because the worker is on hold and won't probe.
		m.prober.exec = fakeExecProber{probe.Success, nil}
		msg = "while on hold"
		expectContinue(t, w, w.doProbe(), msg)
		expectResult(t, w, results.Failure, msg)
		if !w.onHold {
			t.Errorf("Prober should be on hold due to %s check failure", probeType)
		}

		// Set a new container ID to lift the hold. The next probe will succeed.
		status.ContainerStatuses[0].ContainerID = "test://newCont_ID"
		m.statusManager.SetPodStatus(w.pod, status)
		msg = "hold lifted"
		expectContinue(t, w, w.doProbe(), msg)
		expectResult(t, w, results.Success, msg)
		if w.onHold {
			t.Errorf("Prober should not be on hold anymore")
		}
	}
}
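// TestResultRunOnLivenessCheckFailure verifies that resultRun counts
// consecutive liveness probe failures and resets to 0 once FailureThreshold
// is exceeded and the Failure result is reported.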
func TestResultRunOnLivenessCheckFailure(t *testing.T) {
	m := newTestManager()
	w := newTestWorker(m, liveness, v1.Probe{SuccessThreshold: 1, FailureThreshold: 3})
	m.statusManager.SetPodStatus(w.pod, getTestRunningStatus())

	m.prober.exec = fakeExecProber{probe.Success, nil}
	msg := "initial probe success"
	expectContinue(t, w, w.doProbe(), msg)
	expectResult(t, w, results.Success, msg)
	if w.resultRun != 1 {
		t.Errorf("Prober resultRun should be 1")
	}

	m.prober.exec = fakeExecProber{probe.Failure, nil}
	msg = "probe failure, result success"
	expectContinue(t, w, w.doProbe(), msg)
	expectResult(t, w, results.Success, msg)
	if w.resultRun != 1 {
		t.Errorf("Prober resultRun should be 1")
	}

	m.prober.exec = fakeExecProber{probe.Failure, nil}
	msg = "2nd probe failure, result success"
	expectContinue(t, w, w.doProbe(), msg)
	expectResult(t, w, results.Success, msg)
	if w.resultRun != 2 {
		t.Errorf("Prober resultRun should be 2")
	}

	// Exceeding FailureThreshold should cause resultRun to
	// reset to 0 so that the probe on the restarted pod
	// also gets FailureThreshold attempts to succeed.
	m.prober.exec = fakeExecProber{probe.Failure, nil}
	msg = "3rd probe failure, result failure"
	expectContinue(t, w, w.doProbe(), msg)
	expectResult(t, w, results.Failure, msg)
	if w.resultRun != 0 {
		t.Errorf("Prober resultRun should be reset to 0")
	}
}
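// TestResultRunOnStartupCheckFailure is the startup-probe counterpart of the
// test above: resultRun counts consecutive failures and resets to 0 once
// FailureThreshold is exceeded.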
func TestResultRunOnStartupCheckFailure(t *testing.T) {
	m := newTestManager()
	w := newTestWorker(m, startup, v1.Probe{SuccessThreshold: 1, FailureThreshold: 3})
	m.statusManager.SetPodStatus(w.pod, getTestRunningStatusWithStarted(false))

	// Staying below FailureThreshold leaves the probe state unchanged,
	// which is Unknown for a startup probe at first.
	m.prober.exec = fakeExecProber{probe.Failure, nil}
	msg := "probe failure, result unknown"
	expectContinue(t, w, w.doProbe(), msg)
	expectResult(t, w, results.Unknown, msg)
	if w.resultRun != 1 {
		t.Errorf("Prober resultRun should be 1")
	}

	m.prober.exec = fakeExecProber{probe.Failure, nil}
	msg = "2nd probe failure, result unknown"
	expectContinue(t, w, w.doProbe(), msg)
	expectResult(t, w, results.Unknown, msg)
	if w.resultRun != 2 {
		t.Errorf("Prober resultRun should be 2")
	}

	// Exceeding FailureThreshold should cause resultRun to
	// reset to 0 so that the probe on the restarted pod
	// also gets FailureThreshold attempts to succeed.
	m.prober.exec = fakeExecProber{probe.Failure, nil}
	msg = "3rd probe failure, result failure"
	expectContinue(t, w, w.doProbe(), msg)
	expectResult(t, w, results.Failure, msg)
	if w.resultRun != 0 {
		t.Errorf("Prober resultRun should be reset to 0")
	}
}
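// TestLivenessProbeDisabledByStarted verifies that liveness probe failures
// are ignored (the result stays Success) until the container is marked as
// started.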
func TestLivenessProbeDisabledByStarted(t *testing.T) {
	m := newTestManager()
	w := newTestWorker(m, liveness, v1.Probe{SuccessThreshold: 1, FailureThreshold: 1})
	m.statusManager.SetPodStatus(w.pod, getTestRunningStatusWithStarted(false))

	// livenessProbe fails, but is disabled.
	m.prober.exec = fakeExecProber{probe.Failure, nil}
	msg := "Not started, probe failure, result success"
	expectContinue(t, w, w.doProbe(), msg)
	expectResult(t, w, results.Success, msg)

	// Set the started state.
	m.statusManager.SetContainerStartup(w.pod.UID, w.containerID, true)

	// livenessProbe fails.
	m.prober.exec = fakeExecProber{probe.Failure, nil}
	msg = "Started, probe failure, result failure"
	expectContinue(t, w, w.doProbe(), msg)
	expectResult(t, w, results.Failure, msg)
}
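// TestStartupProbeDisabledByStarted verifies that the startup probe stops
// running once the container is marked as started: later failures no longer
// change the Success result.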
func TestStartupProbeDisabledByStarted(t *testing.T) {
	m := newTestManager()
	w := newTestWorker(m, startup, v1.Probe{SuccessThreshold: 1, FailureThreshold: 2})
	m.statusManager.SetPodStatus(w.pod, getTestRunningStatusWithStarted(false))

	// startupProbe fails < FailureThreshold, stays unknown.
	m.prober.exec = fakeExecProber{probe.Failure, nil}
	msg := "Not started, probe failure, result unknown"
	expectContinue(t, w, w.doProbe(), msg)
	expectResult(t, w, results.Unknown, msg)

	// startupProbe succeeds.
	m.prober.exec = fakeExecProber{probe.Success, nil}
	msg = "Started, probe success, result success"
	expectContinue(t, w, w.doProbe(), msg)
	expectResult(t, w, results.Success, msg)

	// Set the started state.
	m.statusManager.SetContainerStartup(w.pod.UID, w.containerID, true)

	// startupProbe fails, but is disabled.
	m.prober.exec = fakeExecProber{probe.Failure, nil}
	msg = "Started, probe failure, result success"
	expectContinue(t, w, w.doProbe(), msg)
	expectResult(t, w, results.Success, msg)
}