node_problem_detector_linux.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458
  1. // +build cgo,linux
  2. /*
  3. Copyright 2016 The Kubernetes Authors.
  4. Licensed under the Apache License, Version 2.0 (the "License");
  5. you may not use this file except in compliance with the License.
  6. You may obtain a copy of the License at
  7. http://www.apache.org/licenses/LICENSE-2.0
  8. Unless required by applicable law or agreed to in writing, software
  9. distributed under the License is distributed on an "AS IS" BASIS,
  10. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11. See the License for the specific language governing permissions and
  12. limitations under the License.
  13. */
  14. package e2enode
  15. import (
  16. "context"
  17. "fmt"
  18. "os"
  19. "path"
  20. "time"
  21. v1 "k8s.io/api/core/v1"
  22. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  23. "k8s.io/apimachinery/pkg/fields"
  24. "k8s.io/apimachinery/pkg/labels"
  25. "k8s.io/apimachinery/pkg/types"
  26. "k8s.io/apimachinery/pkg/util/uuid"
  27. clientset "k8s.io/client-go/kubernetes"
  28. coreclientset "k8s.io/client-go/kubernetes/typed/core/v1"
  29. "k8s.io/kubernetes/pkg/kubelet/util"
  30. "k8s.io/kubernetes/test/e2e/framework"
  31. e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
  32. testutils "k8s.io/kubernetes/test/utils"
  33. "github.com/onsi/ginkgo"
  34. "github.com/onsi/gomega"
  35. )
  36. var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDetector]", func() {
  37. const (
  38. pollInterval = 1 * time.Second
  39. pollConsistent = 5 * time.Second
  40. pollTimeout = 1 * time.Minute
  41. )
  42. f := framework.NewDefaultFramework("node-problem-detector")
  43. var c clientset.Interface
  44. var uid string
  45. var ns, name, configName, eventNamespace string
  46. var bootTime, nodeTime time.Time
  47. var image string
  48. ginkgo.BeforeEach(func() {
  49. c = f.ClientSet
  50. ns = f.Namespace.Name
  51. uid = string(uuid.NewUUID())
  52. name = "node-problem-detector-" + uid
  53. configName = "node-problem-detector-config-" + uid
  54. // There is no namespace for Node, event recorder will set default namespace for node events.
  55. eventNamespace = metav1.NamespaceDefault
  56. image = getNodeProblemDetectorImage()
  57. ginkgo.By(fmt.Sprintf("Using node-problem-detector image: %s", image))
  58. })
  59. // Test system log monitor. We may add other tests if we have more problem daemons in the future.
  60. framework.KubeDescribe("SystemLogMonitor", func() {
  61. const (
  62. // Use test condition to avoid changing the real node condition in use.
  63. // TODO(random-liu): Now node condition could be arbitrary string, consider whether we need to
  64. // add TestCondition when switching to predefined condition list.
  65. condition = v1.NodeConditionType("TestCondition")
  66. // File paths used in the test.
  67. logFile = "/log/test.log"
  68. configFile = "/config/testconfig.json"
  69. etcLocaltime = "/etc/localtime"
  70. // Volumes used in the test.
  71. configVolume = "config"
  72. logVolume = "log"
  73. localtimeVolume = "localtime"
  74. // Reasons and messages used in the test.
  75. defaultReason = "Default"
  76. defaultMessage = "default message"
  77. tempReason = "Temporary"
  78. tempMessage = "temporary error"
  79. permReason1 = "Permanent1"
  80. permMessage1 = "permanent error 1"
  81. permReason2 = "Permanent2"
  82. permMessage2 = "permanent error 2"
  83. )
  84. var source, config, hostLogFile string
  85. var lookback time.Duration
  86. var eventListOptions metav1.ListOptions
  87. ginkgo.BeforeEach(func() {
  88. ginkgo.By("Calculate Lookback duration")
  89. var err error
  90. nodeTime = time.Now()
  91. bootTime, err = util.GetBootTime()
  92. framework.ExpectNoError(err)
  93. // Set lookback duration longer than node up time.
  94. // Assume the test won't take more than 1 hour, in fact it usually only takes 90 seconds.
  95. lookback = nodeTime.Sub(bootTime) + time.Hour
  96. // Randomize the source name
  97. source = "kernel-monitor-" + uid
  98. config = `
  99. {
  100. "plugin": "filelog",
  101. "pluginConfig": {
  102. "timestamp": "^.{15}",
  103. "message": "kernel: \\[.*\\] (.*)",
  104. "timestampFormat": "` + time.Stamp + `"
  105. },
  106. "logPath": "` + logFile + `",
  107. "lookback": "` + lookback.String() + `",
  108. "bufferSize": 10,
  109. "source": "` + source + `",
  110. "conditions": [
  111. {
  112. "type": "` + string(condition) + `",
  113. "reason": "` + defaultReason + `",
  114. "message": "` + defaultMessage + `"
  115. }
  116. ],
  117. "rules": [
  118. {
  119. "type": "temporary",
  120. "reason": "` + tempReason + `",
  121. "pattern": "` + tempMessage + `"
  122. },
  123. {
  124. "type": "permanent",
  125. "condition": "` + string(condition) + `",
  126. "reason": "` + permReason1 + `",
  127. "pattern": "` + permMessage1 + ".*" + `"
  128. },
  129. {
  130. "type": "permanent",
  131. "condition": "` + string(condition) + `",
  132. "reason": "` + permReason2 + `",
  133. "pattern": "` + permMessage2 + ".*" + `"
  134. }
  135. ]
  136. }`
  137. ginkgo.By("Generate event list options")
  138. selector := fields.Set{
  139. "involvedObject.kind": "Node",
  140. "involvedObject.name": framework.TestContext.NodeName,
  141. "involvedObject.namespace": metav1.NamespaceAll,
  142. "source": source,
  143. }.AsSelector().String()
  144. eventListOptions = metav1.ListOptions{FieldSelector: selector}
  145. ginkgo.By("Create the test log file")
  146. framework.ExpectNoError(err)
  147. ginkgo.By("Create config map for the node problem detector")
  148. _, err = c.CoreV1().ConfigMaps(ns).Create(context.TODO(), &v1.ConfigMap{
  149. ObjectMeta: metav1.ObjectMeta{Name: configName},
  150. Data: map[string]string{path.Base(configFile): config},
  151. }, metav1.CreateOptions{})
  152. framework.ExpectNoError(err)
  153. ginkgo.By("Create the node problem detector")
  154. hostPathType := new(v1.HostPathType)
  155. *hostPathType = v1.HostPathType(string(v1.HostPathFileOrCreate))
  156. f.PodClient().CreateSync(&v1.Pod{
  157. ObjectMeta: metav1.ObjectMeta{
  158. Name: name,
  159. },
  160. Spec: v1.PodSpec{
  161. HostNetwork: true,
  162. SecurityContext: &v1.PodSecurityContext{},
  163. Volumes: []v1.Volume{
  164. {
  165. Name: configVolume,
  166. VolumeSource: v1.VolumeSource{
  167. ConfigMap: &v1.ConfigMapVolumeSource{
  168. LocalObjectReference: v1.LocalObjectReference{Name: configName},
  169. },
  170. },
  171. },
  172. {
  173. Name: logVolume,
  174. VolumeSource: v1.VolumeSource{
  175. EmptyDir: &v1.EmptyDirVolumeSource{},
  176. },
  177. },
  178. {
  179. Name: localtimeVolume,
  180. VolumeSource: v1.VolumeSource{
  181. HostPath: &v1.HostPathVolumeSource{
  182. Path: etcLocaltime,
  183. Type: hostPathType,
  184. },
  185. },
  186. },
  187. },
  188. Containers: []v1.Container{
  189. {
  190. Name: name,
  191. Image: image,
  192. Command: []string{"sh", "-c", "touch " + logFile + " && /node-problem-detector --logtostderr --system-log-monitors=" + configFile + fmt.Sprintf(" --apiserver-override=%s?inClusterConfig=false", framework.TestContext.Host)},
  193. Env: []v1.EnvVar{
  194. {
  195. Name: "NODE_NAME",
  196. ValueFrom: &v1.EnvVarSource{
  197. FieldRef: &v1.ObjectFieldSelector{
  198. APIVersion: "v1",
  199. FieldPath: "spec.nodeName",
  200. },
  201. },
  202. },
  203. },
  204. VolumeMounts: []v1.VolumeMount{
  205. {
  206. Name: logVolume,
  207. MountPath: path.Dir(logFile),
  208. },
  209. {
  210. Name: localtimeVolume,
  211. MountPath: etcLocaltime,
  212. },
  213. {
  214. Name: configVolume,
  215. MountPath: path.Dir(configFile),
  216. },
  217. },
  218. },
  219. },
  220. },
  221. })
  222. pod, err := f.PodClient().Get(context.TODO(), name, metav1.GetOptions{})
  223. framework.ExpectNoError(err)
  224. // TODO: remove hardcoded kubelet volume directory path
  225. // framework.TestContext.KubeVolumeDir is currently not populated for node e2e
  226. hostLogFile = "/var/lib/kubelet/pods/" + string(pod.UID) + "/volumes/kubernetes.io~empty-dir" + logFile
  227. })
  228. ginkgo.It("should generate node condition and events for corresponding errors", func() {
  229. for _, test := range []struct {
  230. description string
  231. timestamp time.Time
  232. message string
  233. messageNum int
  234. tempEvents int // Events for temp errors
  235. totalEvents int // Events for both temp errors and condition changes
  236. conditionReason string
  237. conditionMessage string
  238. conditionType v1.ConditionStatus
  239. }{
  240. {
  241. description: "should generate default node condition",
  242. conditionReason: defaultReason,
  243. conditionMessage: defaultMessage,
  244. conditionType: v1.ConditionFalse,
  245. },
  246. {
  247. description: "should not generate events for too old log",
  248. timestamp: bootTime.Add(-1 * time.Minute),
  249. message: tempMessage,
  250. messageNum: 3,
  251. conditionReason: defaultReason,
  252. conditionMessage: defaultMessage,
  253. conditionType: v1.ConditionFalse,
  254. },
  255. {
  256. description: "should not change node condition for too old log",
  257. timestamp: bootTime.Add(-1 * time.Minute),
  258. message: permMessage1,
  259. messageNum: 1,
  260. conditionReason: defaultReason,
  261. conditionMessage: defaultMessage,
  262. conditionType: v1.ConditionFalse,
  263. },
  264. {
  265. description: "should generate event for old log within lookback duration",
  266. timestamp: nodeTime,
  267. message: tempMessage,
  268. messageNum: 3,
  269. tempEvents: 3,
  270. totalEvents: 3,
  271. conditionReason: defaultReason,
  272. conditionMessage: defaultMessage,
  273. conditionType: v1.ConditionFalse,
  274. },
  275. {
  276. description: "should change node condition for old log within lookback duration",
  277. timestamp: nodeTime,
  278. message: permMessage1,
  279. messageNum: 1,
  280. tempEvents: 3, // event number for temp errors should not change
  281. totalEvents: 4, // add 1 event for condition change
  282. conditionReason: permReason1,
  283. conditionMessage: permMessage1,
  284. conditionType: v1.ConditionTrue,
  285. },
  286. {
  287. description: "should generate event for new log",
  288. timestamp: nodeTime.Add(5 * time.Minute),
  289. message: tempMessage,
  290. messageNum: 3,
  291. tempEvents: 6, // add 3 events for temp errors
  292. totalEvents: 7, // add 3 events for temp errors
  293. conditionReason: permReason1,
  294. conditionMessage: permMessage1,
  295. conditionType: v1.ConditionTrue,
  296. },
  297. {
  298. description: "should not update node condition with the same reason",
  299. timestamp: nodeTime.Add(5 * time.Minute),
  300. message: permMessage1 + "different message",
  301. messageNum: 1,
  302. tempEvents: 6, // event number should not change
  303. totalEvents: 7, // event number should not change
  304. conditionReason: permReason1,
  305. conditionMessage: permMessage1,
  306. conditionType: v1.ConditionTrue,
  307. },
  308. {
  309. description: "should change node condition for new log",
  310. timestamp: nodeTime.Add(5 * time.Minute),
  311. message: permMessage2,
  312. messageNum: 1,
  313. tempEvents: 6, // event number for temp errors should not change
  314. totalEvents: 8, // add 1 event for condition change
  315. conditionReason: permReason2,
  316. conditionMessage: permMessage2,
  317. conditionType: v1.ConditionTrue,
  318. },
  319. } {
  320. ginkgo.By(test.description)
  321. if test.messageNum > 0 {
  322. ginkgo.By(fmt.Sprintf("Inject %d logs: %q", test.messageNum, test.message))
  323. err := injectLog(hostLogFile, test.timestamp, test.message, test.messageNum)
  324. framework.ExpectNoError(err)
  325. }
  326. ginkgo.By(fmt.Sprintf("Wait for %d temp events generated", test.tempEvents))
  327. gomega.Eventually(func() error {
  328. return verifyEvents(c.CoreV1().Events(eventNamespace), eventListOptions, test.tempEvents, tempReason, tempMessage)
  329. }, pollTimeout, pollInterval).Should(gomega.Succeed())
  330. ginkgo.By(fmt.Sprintf("Wait for %d total events generated", test.totalEvents))
  331. gomega.Eventually(func() error {
  332. return verifyTotalEvents(c.CoreV1().Events(eventNamespace), eventListOptions, test.totalEvents)
  333. }, pollTimeout, pollInterval).Should(gomega.Succeed())
  334. ginkgo.By(fmt.Sprintf("Make sure only %d total events generated", test.totalEvents))
  335. gomega.Consistently(func() error {
  336. return verifyTotalEvents(c.CoreV1().Events(eventNamespace), eventListOptions, test.totalEvents)
  337. }, pollConsistent, pollInterval).Should(gomega.Succeed())
  338. ginkgo.By(fmt.Sprintf("Make sure node condition %q is set", condition))
  339. gomega.Eventually(func() error {
  340. return verifyNodeCondition(c.CoreV1().Nodes(), condition, test.conditionType, test.conditionReason, test.conditionMessage)
  341. }, pollTimeout, pollInterval).Should(gomega.Succeed())
  342. ginkgo.By(fmt.Sprintf("Make sure node condition %q is stable", condition))
  343. gomega.Consistently(func() error {
  344. return verifyNodeCondition(c.CoreV1().Nodes(), condition, test.conditionType, test.conditionReason, test.conditionMessage)
  345. }, pollConsistent, pollInterval).Should(gomega.Succeed())
  346. }
  347. })
  348. ginkgo.AfterEach(func() {
  349. if ginkgo.CurrentGinkgoTestDescription().Failed && framework.TestContext.DumpLogsOnFailure {
  350. ginkgo.By("Get node problem detector log")
  351. log, err := e2epod.GetPodLogs(c, ns, name, name)
  352. gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
  353. framework.Logf("Node Problem Detector logs:\n %s", log)
  354. }
  355. ginkgo.By("Delete the node problem detector")
  356. f.PodClient().Delete(context.TODO(), name, metav1.NewDeleteOptions(0))
  357. ginkgo.By("Wait for the node problem detector to disappear")
  358. gomega.Expect(e2epod.WaitForPodToDisappear(c, ns, name, labels.Everything(), pollInterval, pollTimeout)).To(gomega.Succeed())
  359. ginkgo.By("Delete the config map")
  360. c.CoreV1().ConfigMaps(ns).Delete(context.TODO(), configName, nil)
  361. ginkgo.By("Clean up the events")
  362. gomega.Expect(c.CoreV1().Events(eventNamespace).DeleteCollection(context.TODO(), metav1.NewDeleteOptions(0), eventListOptions)).To(gomega.Succeed())
  363. ginkgo.By("Clean up the node condition")
  364. patch := []byte(fmt.Sprintf(`{"status":{"conditions":[{"$patch":"delete","type":"%s"}]}}`, condition))
  365. c.CoreV1().RESTClient().Patch(types.StrategicMergePatchType).Resource("nodes").Name(framework.TestContext.NodeName).SubResource("status").Body(patch).Do(context.TODO())
  366. })
  367. })
  368. })
  369. // injectLog injects kernel log into specified file.
  370. func injectLog(file string, timestamp time.Time, log string, num int) error {
  371. f, err := os.OpenFile(file, os.O_RDWR|os.O_APPEND, 0666)
  372. if err != nil {
  373. return err
  374. }
  375. defer f.Close()
  376. for i := 0; i < num; i++ {
  377. _, err := f.WriteString(fmt.Sprintf("%s kernel: [0.000000] %s\n", timestamp.Format(time.Stamp), log))
  378. if err != nil {
  379. return err
  380. }
  381. }
  382. return nil
  383. }
  384. // verifyEvents verifies there are num specific events generated with given reason and message.
  385. func verifyEvents(e coreclientset.EventInterface, options metav1.ListOptions, num int, reason, message string) error {
  386. events, err := e.List(context.TODO(), options)
  387. if err != nil {
  388. return err
  389. }
  390. count := 0
  391. for _, event := range events.Items {
  392. if event.Reason != reason || event.Message != message {
  393. continue
  394. }
  395. count += int(event.Count)
  396. }
  397. if count != num {
  398. return fmt.Errorf("expect event number %d, got %d: %v", num, count, events.Items)
  399. }
  400. return nil
  401. }
  402. // verifyTotalEvents verifies there are num events in total.
  403. func verifyTotalEvents(e coreclientset.EventInterface, options metav1.ListOptions, num int) error {
  404. events, err := e.List(context.TODO(), options)
  405. if err != nil {
  406. return err
  407. }
  408. count := 0
  409. for _, event := range events.Items {
  410. count += int(event.Count)
  411. }
  412. if count != num {
  413. return fmt.Errorf("expect event number %d, got %d: %v", num, count, events.Items)
  414. }
  415. return nil
  416. }
  417. // verifyNodeCondition verifies specific node condition is generated, if reason and message are empty, they will not be checked
  418. func verifyNodeCondition(n coreclientset.NodeInterface, condition v1.NodeConditionType, status v1.ConditionStatus, reason, message string) error {
  419. node, err := n.Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
  420. if err != nil {
  421. return err
  422. }
  423. _, c := testutils.GetNodeCondition(&node.Status, condition)
  424. if c == nil {
  425. return fmt.Errorf("node condition %q not found", condition)
  426. }
  427. if c.Status != status || c.Reason != reason || c.Message != message {
  428. return fmt.Errorf("unexpected node condition %q: %+v", condition, c)
  429. }
  430. return nil
  431. }