node_problem_detector_linux.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457
  1. // +build cgo,linux
  2. /*
  3. Copyright 2016 The Kubernetes Authors.
  4. Licensed under the Apache License, Version 2.0 (the "License");
  5. you may not use this file except in compliance with the License.
  6. You may obtain a copy of the License at
  7. http://www.apache.org/licenses/LICENSE-2.0
  8. Unless required by applicable law or agreed to in writing, software
  9. distributed under the License is distributed on an "AS IS" BASIS,
  10. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11. See the License for the specific language governing permissions and
  12. limitations under the License.
  13. */
  14. package e2e_node
  15. import (
  16. "fmt"
  17. "os"
  18. "path"
  19. "time"
  20. "k8s.io/api/core/v1"
  21. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  22. "k8s.io/apimachinery/pkg/fields"
  23. "k8s.io/apimachinery/pkg/labels"
  24. "k8s.io/apimachinery/pkg/types"
  25. "k8s.io/apimachinery/pkg/util/uuid"
  26. clientset "k8s.io/client-go/kubernetes"
  27. coreclientset "k8s.io/client-go/kubernetes/typed/core/v1"
  28. "k8s.io/kubernetes/pkg/kubelet/util"
  29. "k8s.io/kubernetes/test/e2e/framework"
  30. e2elog "k8s.io/kubernetes/test/e2e/framework/log"
  31. testutils "k8s.io/kubernetes/test/utils"
  32. . "github.com/onsi/ginkgo"
  33. . "github.com/onsi/gomega"
  34. )
  35. var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDetector]", func() {
  36. const (
  37. pollInterval = 1 * time.Second
  38. pollConsistent = 5 * time.Second
  39. pollTimeout = 1 * time.Minute
  40. )
  41. f := framework.NewDefaultFramework("node-problem-detector")
  42. var c clientset.Interface
  43. var uid string
  44. var ns, name, configName, eventNamespace string
  45. var bootTime, nodeTime time.Time
  46. var image string
  47. BeforeEach(func() {
  48. c = f.ClientSet
  49. ns = f.Namespace.Name
  50. uid = string(uuid.NewUUID())
  51. name = "node-problem-detector-" + uid
  52. configName = "node-problem-detector-config-" + uid
  53. // There is no namespace for Node, event recorder will set default namespace for node events.
  54. eventNamespace = metav1.NamespaceDefault
  55. image = getNodeProblemDetectorImage()
  56. By(fmt.Sprintf("Using node-problem-detector image: %s", image))
  57. })
  58. // Test system log monitor. We may add other tests if we have more problem daemons in the future.
  59. framework.KubeDescribe("SystemLogMonitor", func() {
  60. const (
  61. // Use test condition to avoid changing the real node condition in use.
  62. // TODO(random-liu): Now node condition could be arbitrary string, consider whether we need to
  63. // add TestCondition when switching to predefined condition list.
  64. condition = v1.NodeConditionType("TestCondition")
  65. // File paths used in the test.
  66. logFile = "/log/test.log"
  67. configFile = "/config/testconfig.json"
  68. etcLocaltime = "/etc/localtime"
  69. // Volumes used in the test.
  70. configVolume = "config"
  71. logVolume = "log"
  72. localtimeVolume = "localtime"
  73. // Reasons and messages used in the test.
  74. defaultReason = "Default"
  75. defaultMessage = "default message"
  76. tempReason = "Temporary"
  77. tempMessage = "temporary error"
  78. permReason1 = "Permanent1"
  79. permMessage1 = "permanent error 1"
  80. permReason2 = "Permanent2"
  81. permMessage2 = "permanent error 2"
  82. )
  83. var source, config, hostLogFile string
  84. var lookback time.Duration
  85. var eventListOptions metav1.ListOptions
  86. BeforeEach(func() {
  87. By("Calculate Lookback duration")
  88. var err error
  89. nodeTime = time.Now()
  90. bootTime, err = util.GetBootTime()
  91. Expect(err).To(BeNil())
  92. // Set lookback duration longer than node up time.
  93. // Assume the test won't take more than 1 hour, in fact it usually only takes 90 seconds.
  94. lookback = nodeTime.Sub(bootTime) + time.Hour
  95. // Randomize the source name
  96. source = "kernel-monitor-" + uid
  97. config = `
  98. {
  99. "plugin": "filelog",
  100. "pluginConfig": {
  101. "timestamp": "^.{15}",
  102. "message": "kernel: \\[.*\\] (.*)",
  103. "timestampFormat": "` + time.Stamp + `"
  104. },
  105. "logPath": "` + logFile + `",
  106. "lookback": "` + lookback.String() + `",
  107. "bufferSize": 10,
  108. "source": "` + source + `",
  109. "conditions": [
  110. {
  111. "type": "` + string(condition) + `",
  112. "reason": "` + defaultReason + `",
  113. "message": "` + defaultMessage + `"
  114. }
  115. ],
  116. "rules": [
  117. {
  118. "type": "temporary",
  119. "reason": "` + tempReason + `",
  120. "pattern": "` + tempMessage + `"
  121. },
  122. {
  123. "type": "permanent",
  124. "condition": "` + string(condition) + `",
  125. "reason": "` + permReason1 + `",
  126. "pattern": "` + permMessage1 + ".*" + `"
  127. },
  128. {
  129. "type": "permanent",
  130. "condition": "` + string(condition) + `",
  131. "reason": "` + permReason2 + `",
  132. "pattern": "` + permMessage2 + ".*" + `"
  133. }
  134. ]
  135. }`
  136. By("Generate event list options")
  137. selector := fields.Set{
  138. "involvedObject.kind": "Node",
  139. "involvedObject.name": framework.TestContext.NodeName,
  140. "involvedObject.namespace": metav1.NamespaceAll,
  141. "source": source,
  142. }.AsSelector().String()
  143. eventListOptions = metav1.ListOptions{FieldSelector: selector}
  144. By("Create the test log file")
  145. Expect(err).NotTo(HaveOccurred())
  146. By("Create config map for the node problem detector")
  147. _, err = c.CoreV1().ConfigMaps(ns).Create(&v1.ConfigMap{
  148. ObjectMeta: metav1.ObjectMeta{Name: configName},
  149. Data: map[string]string{path.Base(configFile): config},
  150. })
  151. Expect(err).NotTo(HaveOccurred())
  152. By("Create the node problem detector")
  153. hostPathType := new(v1.HostPathType)
  154. *hostPathType = v1.HostPathType(string(v1.HostPathFileOrCreate))
  155. f.PodClient().CreateSync(&v1.Pod{
  156. ObjectMeta: metav1.ObjectMeta{
  157. Name: name,
  158. },
  159. Spec: v1.PodSpec{
  160. HostNetwork: true,
  161. SecurityContext: &v1.PodSecurityContext{},
  162. Volumes: []v1.Volume{
  163. {
  164. Name: configVolume,
  165. VolumeSource: v1.VolumeSource{
  166. ConfigMap: &v1.ConfigMapVolumeSource{
  167. LocalObjectReference: v1.LocalObjectReference{Name: configName},
  168. },
  169. },
  170. },
  171. {
  172. Name: logVolume,
  173. VolumeSource: v1.VolumeSource{
  174. EmptyDir: &v1.EmptyDirVolumeSource{},
  175. },
  176. },
  177. {
  178. Name: localtimeVolume,
  179. VolumeSource: v1.VolumeSource{
  180. HostPath: &v1.HostPathVolumeSource{
  181. Path: etcLocaltime,
  182. Type: hostPathType,
  183. },
  184. },
  185. },
  186. },
  187. Containers: []v1.Container{
  188. {
  189. Name: name,
  190. Image: image,
  191. Command: []string{"sh", "-c", "touch " + logFile + " && /node-problem-detector --logtostderr --system-log-monitors=" + configFile + fmt.Sprintf(" --apiserver-override=%s?inClusterConfig=false", framework.TestContext.Host)},
  192. Env: []v1.EnvVar{
  193. {
  194. Name: "NODE_NAME",
  195. ValueFrom: &v1.EnvVarSource{
  196. FieldRef: &v1.ObjectFieldSelector{
  197. APIVersion: "v1",
  198. FieldPath: "spec.nodeName",
  199. },
  200. },
  201. },
  202. },
  203. VolumeMounts: []v1.VolumeMount{
  204. {
  205. Name: logVolume,
  206. MountPath: path.Dir(logFile),
  207. },
  208. {
  209. Name: localtimeVolume,
  210. MountPath: etcLocaltime,
  211. },
  212. {
  213. Name: configVolume,
  214. MountPath: path.Dir(configFile),
  215. },
  216. },
  217. },
  218. },
  219. },
  220. })
  221. pod, err := f.PodClient().Get(name, metav1.GetOptions{})
  222. Expect(err).NotTo(HaveOccurred())
  223. // TODO: remove hardcoded kubelet volume directory path
  224. // framework.TestContext.KubeVolumeDir is currently not populated for node e2e
  225. hostLogFile = "/var/lib/kubelet/pods/" + string(pod.UID) + "/volumes/kubernetes.io~empty-dir" + logFile
  226. })
  227. It("should generate node condition and events for corresponding errors", func() {
  228. for _, test := range []struct {
  229. description string
  230. timestamp time.Time
  231. message string
  232. messageNum int
  233. tempEvents int // Events for temp errors
  234. totalEvents int // Events for both temp errors and condition changes
  235. conditionReason string
  236. conditionMessage string
  237. conditionType v1.ConditionStatus
  238. }{
  239. {
  240. description: "should generate default node condition",
  241. conditionReason: defaultReason,
  242. conditionMessage: defaultMessage,
  243. conditionType: v1.ConditionFalse,
  244. },
  245. {
  246. description: "should not generate events for too old log",
  247. timestamp: bootTime.Add(-1 * time.Minute),
  248. message: tempMessage,
  249. messageNum: 3,
  250. conditionReason: defaultReason,
  251. conditionMessage: defaultMessage,
  252. conditionType: v1.ConditionFalse,
  253. },
  254. {
  255. description: "should not change node condition for too old log",
  256. timestamp: bootTime.Add(-1 * time.Minute),
  257. message: permMessage1,
  258. messageNum: 1,
  259. conditionReason: defaultReason,
  260. conditionMessage: defaultMessage,
  261. conditionType: v1.ConditionFalse,
  262. },
  263. {
  264. description: "should generate event for old log within lookback duration",
  265. timestamp: nodeTime,
  266. message: tempMessage,
  267. messageNum: 3,
  268. tempEvents: 3,
  269. totalEvents: 3,
  270. conditionReason: defaultReason,
  271. conditionMessage: defaultMessage,
  272. conditionType: v1.ConditionFalse,
  273. },
  274. {
  275. description: "should change node condition for old log within lookback duration",
  276. timestamp: nodeTime,
  277. message: permMessage1,
  278. messageNum: 1,
  279. tempEvents: 3, // event number for temp errors should not change
  280. totalEvents: 4, // add 1 event for condition change
  281. conditionReason: permReason1,
  282. conditionMessage: permMessage1,
  283. conditionType: v1.ConditionTrue,
  284. },
  285. {
  286. description: "should generate event for new log",
  287. timestamp: nodeTime.Add(5 * time.Minute),
  288. message: tempMessage,
  289. messageNum: 3,
  290. tempEvents: 6, // add 3 events for temp errors
  291. totalEvents: 7, // add 3 events for temp errors
  292. conditionReason: permReason1,
  293. conditionMessage: permMessage1,
  294. conditionType: v1.ConditionTrue,
  295. },
  296. {
  297. description: "should not update node condition with the same reason",
  298. timestamp: nodeTime.Add(5 * time.Minute),
  299. message: permMessage1 + "different message",
  300. messageNum: 1,
  301. tempEvents: 6, // event number should not change
  302. totalEvents: 7, // event number should not change
  303. conditionReason: permReason1,
  304. conditionMessage: permMessage1,
  305. conditionType: v1.ConditionTrue,
  306. },
  307. {
  308. description: "should change node condition for new log",
  309. timestamp: nodeTime.Add(5 * time.Minute),
  310. message: permMessage2,
  311. messageNum: 1,
  312. tempEvents: 6, // event number for temp errors should not change
  313. totalEvents: 8, // add 1 event for condition change
  314. conditionReason: permReason2,
  315. conditionMessage: permMessage2,
  316. conditionType: v1.ConditionTrue,
  317. },
  318. } {
  319. By(test.description)
  320. if test.messageNum > 0 {
  321. By(fmt.Sprintf("Inject %d logs: %q", test.messageNum, test.message))
  322. err := injectLog(hostLogFile, test.timestamp, test.message, test.messageNum)
  323. Expect(err).NotTo(HaveOccurred())
  324. }
  325. By(fmt.Sprintf("Wait for %d temp events generated", test.tempEvents))
  326. Eventually(func() error {
  327. return verifyEvents(c.CoreV1().Events(eventNamespace), eventListOptions, test.tempEvents, tempReason, tempMessage)
  328. }, pollTimeout, pollInterval).Should(Succeed())
  329. By(fmt.Sprintf("Wait for %d total events generated", test.totalEvents))
  330. Eventually(func() error {
  331. return verifyTotalEvents(c.CoreV1().Events(eventNamespace), eventListOptions, test.totalEvents)
  332. }, pollTimeout, pollInterval).Should(Succeed())
  333. By(fmt.Sprintf("Make sure only %d total events generated", test.totalEvents))
  334. Consistently(func() error {
  335. return verifyTotalEvents(c.CoreV1().Events(eventNamespace), eventListOptions, test.totalEvents)
  336. }, pollConsistent, pollInterval).Should(Succeed())
  337. By(fmt.Sprintf("Make sure node condition %q is set", condition))
  338. Eventually(func() error {
  339. return verifyNodeCondition(c.CoreV1().Nodes(), condition, test.conditionType, test.conditionReason, test.conditionMessage)
  340. }, pollTimeout, pollInterval).Should(Succeed())
  341. By(fmt.Sprintf("Make sure node condition %q is stable", condition))
  342. Consistently(func() error {
  343. return verifyNodeCondition(c.CoreV1().Nodes(), condition, test.conditionType, test.conditionReason, test.conditionMessage)
  344. }, pollConsistent, pollInterval).Should(Succeed())
  345. }
  346. })
  347. AfterEach(func() {
  348. if CurrentGinkgoTestDescription().Failed && framework.TestContext.DumpLogsOnFailure {
  349. By("Get node problem detector log")
  350. log, err := framework.GetPodLogs(c, ns, name, name)
  351. Expect(err).ShouldNot(HaveOccurred())
  352. e2elog.Logf("Node Problem Detector logs:\n %s", log)
  353. }
  354. By("Delete the node problem detector")
  355. f.PodClient().Delete(name, metav1.NewDeleteOptions(0))
  356. By("Wait for the node problem detector to disappear")
  357. Expect(framework.WaitForPodToDisappear(c, ns, name, labels.Everything(), pollInterval, pollTimeout)).To(Succeed())
  358. By("Delete the config map")
  359. c.CoreV1().ConfigMaps(ns).Delete(configName, nil)
  360. By("Clean up the events")
  361. Expect(c.CoreV1().Events(eventNamespace).DeleteCollection(metav1.NewDeleteOptions(0), eventListOptions)).To(Succeed())
  362. By("Clean up the node condition")
  363. patch := []byte(fmt.Sprintf(`{"status":{"conditions":[{"$patch":"delete","type":"%s"}]}}`, condition))
  364. c.CoreV1().RESTClient().Patch(types.StrategicMergePatchType).Resource("nodes").Name(framework.TestContext.NodeName).SubResource("status").Body(patch).Do()
  365. })
  366. })
  367. })
  // injectLog appends num copies of a kernel-style log line to file.
  //
  // Each line has the form "<timestamp> kernel: [0.000000] <log>\n", with the
  // timestamp rendered in time.Stamp layout. The file must already exist; it is
  // opened in append mode and is not created. The first error from opening,
  // writing or closing the file is returned.
  func injectLog(file string, timestamp time.Time, log string, num int) (err error) {
  	f, err := os.OpenFile(file, os.O_RDWR|os.O_APPEND, 0666)
  	if err != nil {
  		return err
  	}
  	// A failed Close on a file that was written to can mean lost data, so
  	// surface it unless an earlier error is already being returned.
  	defer func() {
  		if cerr := f.Close(); err == nil {
  			err = cerr
  		}
  	}()
  	// Every injected line is identical, so format it once.
  	line := fmt.Sprintf("%s kernel: [0.000000] %s\n", timestamp.Format(time.Stamp), log)
  	for i := 0; i < num; i++ {
  		if _, err = f.WriteString(line); err != nil {
  			return err
  		}
  	}
  	return nil
  }
  383. // verifyEvents verifies there are num specific events generated with given reason and message.
  384. func verifyEvents(e coreclientset.EventInterface, options metav1.ListOptions, num int, reason, message string) error {
  385. events, err := e.List(options)
  386. if err != nil {
  387. return err
  388. }
  389. count := 0
  390. for _, event := range events.Items {
  391. if event.Reason != reason || event.Message != message {
  392. continue
  393. }
  394. count += int(event.Count)
  395. }
  396. if count != num {
  397. return fmt.Errorf("expect event number %d, got %d: %v", num, count, events.Items)
  398. }
  399. return nil
  400. }
  401. // verifyTotalEvents verifies there are num events in total.
  402. func verifyTotalEvents(e coreclientset.EventInterface, options metav1.ListOptions, num int) error {
  403. events, err := e.List(options)
  404. if err != nil {
  405. return err
  406. }
  407. count := 0
  408. for _, event := range events.Items {
  409. count += int(event.Count)
  410. }
  411. if count != num {
  412. return fmt.Errorf("expect event number %d, got %d: %v", num, count, events.Items)
  413. }
  414. return nil
  415. }
  416. // verifyNodeCondition verifies specific node condition is generated, if reason and message are empty, they will not be checked
  417. func verifyNodeCondition(n coreclientset.NodeInterface, condition v1.NodeConditionType, status v1.ConditionStatus, reason, message string) error {
  418. node, err := n.Get(framework.TestContext.NodeName, metav1.GetOptions{})
  419. if err != nil {
  420. return err
  421. }
  422. _, c := testutils.GetNodeCondition(&node.Status, condition)
  423. if c == nil {
  424. return fmt.Errorf("node condition %q not found", condition)
  425. }
  426. if c.Status != status || c.Reason != reason || c.Message != message {
  427. return fmt.Errorf("unexpected node condition %q: %+v", condition, c)
  428. }
  429. return nil
  430. }