container.go 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679
  1. // Copyright 2014 Google Inc. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package manager
  15. import (
  16. "flag"
  17. "fmt"
  18. "io/ioutil"
  19. "math"
  20. "math/rand"
  21. "os/exec"
  22. "path"
  23. "regexp"
  24. "sort"
  25. "strconv"
  26. "strings"
  27. "sync"
  28. "time"
  29. "github.com/google/cadvisor/accelerators"
  30. "github.com/google/cadvisor/cache/memory"
  31. "github.com/google/cadvisor/collector"
  32. "github.com/google/cadvisor/container"
  33. info "github.com/google/cadvisor/info/v1"
  34. "github.com/google/cadvisor/info/v2"
  35. "github.com/google/cadvisor/summary"
  36. "github.com/google/cadvisor/utils/cpuload"
  37. units "github.com/docker/go-units"
  38. "k8s.io/klog"
  39. "k8s.io/utils/clock"
  40. )
  // Command-line flags that control container housekeeping.
  var (
  	// enableLoadReader selects whether a cpu load reader is created for each container.
  	enableLoadReader = flag.Bool("enable_load_reader", false, "Whether to enable cpu load reader")

  	// HousekeepingInterval is the baseline interval between container housekeepings.
  	HousekeepingInterval = flag.Duration("housekeeping_interval", 1*time.Second, "Interval between container housekeepings")
  )
  // cgroupPathRegExp extracts the memory cgroup path of a process from a cgroup
  // listing (for example "4:memory:/docker/abc,3:cpu:/docker/abc").
  // Memory has been chosen, as it is one of the default cgroups that is enabled for most containers.
  //
  // The captured path ends at the next ',' or ';' separator or at the end of the
  // input. The end-of-input alternative has to live outside the character class:
  // inside brackets '$' is a literal dollar sign, so a memory entry that is the
  // last one in the list would otherwise never match.
  var cgroupPathRegExp = regexp.MustCompile(`memory[^:]*:(.*?)(?:[,;]|$)`)
  47. type containerInfo struct {
  48. info.ContainerReference
  49. Subcontainers []info.ContainerReference
  50. Spec info.ContainerSpec
  51. }
  52. type containerData struct {
  53. handler container.ContainerHandler
  54. info containerInfo
  55. memoryCache *memory.InMemoryCache
  56. lock sync.Mutex
  57. loadReader cpuload.CpuLoadReader
  58. summaryReader *summary.StatsSummary
  59. loadAvg float64 // smoothed load average seen so far.
  60. housekeepingInterval time.Duration
  61. maxHousekeepingInterval time.Duration
  62. allowDynamicHousekeeping bool
  63. infoLastUpdatedTime time.Time
  64. statsLastUpdatedTime time.Time
  65. lastErrorTime time.Time
  66. // used to track time
  67. clock clock.Clock
  68. // Decay value used for load average smoothing. Interval length of 10 seconds is used.
  69. loadDecay float64
  70. // Whether to log the usage of this container when it is updated.
  71. logUsage bool
  72. // Tells the container to stop.
  73. stop chan bool
  74. // Tells the container to immediately collect stats
  75. onDemandChan chan chan struct{}
  76. // Runs custom metric collectors.
  77. collectorManager collector.CollectorManager
  78. // nvidiaCollector updates stats for Nvidia GPUs attached to the container.
  79. nvidiaCollector accelerators.AcceleratorCollector
  80. }
  // jitter returns duration plus a random extra of up to maxFactor * duration,
  // so that periodic callers do not converge on the same schedule.
  // A maxFactor of zero or less is replaced by the default factor 1.0.
  func jitter(duration time.Duration, maxFactor float64) time.Duration {
  	factor := maxFactor
  	if factor <= 0.0 {
  		factor = 1.0
  	}
  	extra := rand.Float64() * factor * float64(duration)
  	return duration + time.Duration(extra)
  }
  91. func (c *containerData) Start() error {
  92. go c.housekeeping()
  93. return nil
  94. }
  95. func (c *containerData) Stop() error {
  96. err := c.memoryCache.RemoveContainer(c.info.Name)
  97. if err != nil {
  98. return err
  99. }
  100. c.stop <- true
  101. return nil
  102. }
  103. func (c *containerData) allowErrorLogging() bool {
  104. if c.clock.Since(c.lastErrorTime) > time.Minute {
  105. c.lastErrorTime = c.clock.Now()
  106. return true
  107. }
  108. return false
  109. }
  110. // OnDemandHousekeeping performs housekeeping on the container and blocks until it has completed.
  111. // It is designed to be used in conjunction with periodic housekeeping, and will cause the timer for
  112. // periodic housekeeping to reset. This should be used sparingly, as calling OnDemandHousekeeping frequently
  113. // can have serious performance costs.
  114. func (c *containerData) OnDemandHousekeeping(maxAge time.Duration) {
  115. if c.clock.Since(c.statsLastUpdatedTime) > maxAge {
  116. housekeepingFinishedChan := make(chan struct{})
  117. c.onDemandChan <- housekeepingFinishedChan
  118. select {
  119. case <-c.stop:
  120. case <-housekeepingFinishedChan:
  121. }
  122. }
  123. }
  124. // notifyOnDemand notifies all calls to OnDemandHousekeeping that housekeeping is finished
  125. func (c *containerData) notifyOnDemand() {
  126. for {
  127. select {
  128. case finishedChan := <-c.onDemandChan:
  129. close(finishedChan)
  130. default:
  131. return
  132. }
  133. }
  134. }
  135. func (c *containerData) GetInfo(shouldUpdateSubcontainers bool) (*containerInfo, error) {
  136. // Get spec and subcontainers.
  137. if c.clock.Since(c.infoLastUpdatedTime) > 5*time.Second {
  138. err := c.updateSpec()
  139. if err != nil {
  140. return nil, err
  141. }
  142. if shouldUpdateSubcontainers {
  143. err = c.updateSubcontainers()
  144. if err != nil {
  145. return nil, err
  146. }
  147. }
  148. c.infoLastUpdatedTime = c.clock.Now()
  149. }
  150. // Make a copy of the info for the user.
  151. c.lock.Lock()
  152. defer c.lock.Unlock()
  153. return &c.info, nil
  154. }
  155. func (c *containerData) DerivedStats() (v2.DerivedStats, error) {
  156. if c.summaryReader == nil {
  157. return v2.DerivedStats{}, fmt.Errorf("derived stats not enabled for container %q", c.info.Name)
  158. }
  159. return c.summaryReader.DerivedStats()
  160. }
  161. func (c *containerData) getCgroupPath(cgroups string) (string, error) {
  162. if cgroups == "-" {
  163. return "/", nil
  164. }
  165. matches := cgroupPathRegExp.FindSubmatch([]byte(cgroups))
  166. if len(matches) != 2 {
  167. klog.V(3).Infof("failed to get memory cgroup path from %q", cgroups)
  168. // return root in case of failures - memory hierarchy might not be enabled.
  169. return "/", nil
  170. }
  171. return string(matches[1]), nil
  172. }
  173. // Returns contents of a file inside the container root.
  174. // Takes in a path relative to container root.
  175. func (c *containerData) ReadFile(filepath string, inHostNamespace bool) ([]byte, error) {
  176. pids, err := c.getContainerPids(inHostNamespace)
  177. if err != nil {
  178. return nil, err
  179. }
  180. // TODO(rjnagal): Optimize by just reading container's cgroup.proc file when in host namespace.
  181. rootfs := "/"
  182. if !inHostNamespace {
  183. rootfs = "/rootfs"
  184. }
  185. for _, pid := range pids {
  186. filePath := path.Join(rootfs, "/proc", pid, "/root", filepath)
  187. klog.V(3).Infof("Trying path %q", filePath)
  188. data, err := ioutil.ReadFile(filePath)
  189. if err == nil {
  190. return data, err
  191. }
  192. }
  193. // No process paths could be found. Declare config non-existent.
  194. return nil, fmt.Errorf("file %q does not exist.", filepath)
  195. }
  196. // Return output for ps command in host /proc with specified format
  197. func (c *containerData) getPsOutput(inHostNamespace bool, format string) ([]byte, error) {
  198. args := []string{}
  199. command := "ps"
  200. if !inHostNamespace {
  201. command = "/usr/sbin/chroot"
  202. args = append(args, "/rootfs", "ps")
  203. }
  204. args = append(args, "-e", "-o", format)
  205. out, err := exec.Command(command, args...).Output()
  206. if err != nil {
  207. return nil, fmt.Errorf("failed to execute %q command: %v", command, err)
  208. }
  209. return out, err
  210. }
  211. // Get pids of processes in this container.
  212. // A slightly lighterweight call than GetProcessList if other details are not required.
  213. func (c *containerData) getContainerPids(inHostNamespace bool) ([]string, error) {
  214. format := "pid,cgroup"
  215. out, err := c.getPsOutput(inHostNamespace, format)
  216. if err != nil {
  217. return nil, err
  218. }
  219. expectedFields := 2
  220. lines := strings.Split(string(out), "\n")
  221. pids := []string{}
  222. for _, line := range lines[1:] {
  223. if len(line) == 0 {
  224. continue
  225. }
  226. fields := strings.Fields(line)
  227. if len(fields) < expectedFields {
  228. return nil, fmt.Errorf("expected at least %d fields, found %d: output: %q", expectedFields, len(fields), line)
  229. }
  230. pid := fields[0]
  231. cgroup, err := c.getCgroupPath(fields[1])
  232. if err != nil {
  233. return nil, fmt.Errorf("could not parse cgroup path from %q: %v", fields[1], err)
  234. }
  235. if c.info.Name == cgroup {
  236. pids = append(pids, pid)
  237. }
  238. }
  239. return pids, nil
  240. }
  241. func (c *containerData) GetProcessList(cadvisorContainer string, inHostNamespace bool) ([]v2.ProcessInfo, error) {
  242. // report all processes for root.
  243. isRoot := c.info.Name == "/"
  244. rootfs := "/"
  245. if !inHostNamespace {
  246. rootfs = "/rootfs"
  247. }
  248. format := "user,pid,ppid,stime,pcpu,pmem,rss,vsz,stat,time,comm,cgroup"
  249. out, err := c.getPsOutput(inHostNamespace, format)
  250. if err != nil {
  251. return nil, err
  252. }
  253. expectedFields := 12
  254. processes := []v2.ProcessInfo{}
  255. lines := strings.Split(string(out), "\n")
  256. for _, line := range lines[1:] {
  257. if len(line) == 0 {
  258. continue
  259. }
  260. fields := strings.Fields(line)
  261. if len(fields) < expectedFields {
  262. return nil, fmt.Errorf("expected at least %d fields, found %d: output: %q", expectedFields, len(fields), line)
  263. }
  264. pid, err := strconv.Atoi(fields[1])
  265. if err != nil {
  266. return nil, fmt.Errorf("invalid pid %q: %v", fields[1], err)
  267. }
  268. ppid, err := strconv.Atoi(fields[2])
  269. if err != nil {
  270. return nil, fmt.Errorf("invalid ppid %q: %v", fields[2], err)
  271. }
  272. percentCpu, err := strconv.ParseFloat(fields[4], 32)
  273. if err != nil {
  274. return nil, fmt.Errorf("invalid cpu percent %q: %v", fields[4], err)
  275. }
  276. percentMem, err := strconv.ParseFloat(fields[5], 32)
  277. if err != nil {
  278. return nil, fmt.Errorf("invalid memory percent %q: %v", fields[5], err)
  279. }
  280. rss, err := strconv.ParseUint(fields[6], 0, 64)
  281. if err != nil {
  282. return nil, fmt.Errorf("invalid rss %q: %v", fields[6], err)
  283. }
  284. // convert to bytes
  285. rss *= 1024
  286. vs, err := strconv.ParseUint(fields[7], 0, 64)
  287. if err != nil {
  288. return nil, fmt.Errorf("invalid virtual size %q: %v", fields[7], err)
  289. }
  290. // convert to bytes
  291. vs *= 1024
  292. cgroup, err := c.getCgroupPath(fields[11])
  293. if err != nil {
  294. return nil, fmt.Errorf("could not parse cgroup path from %q: %v", fields[11], err)
  295. }
  296. // Remove the ps command we just ran from cadvisor container.
  297. // Not necessary, but makes the cadvisor page look cleaner.
  298. if !inHostNamespace && cadvisorContainer == cgroup && fields[10] == "ps" {
  299. continue
  300. }
  301. var cgroupPath string
  302. if isRoot {
  303. cgroupPath = cgroup
  304. }
  305. var fdCount int
  306. dirPath := path.Join(rootfs, "/proc", strconv.Itoa(pid), "fd")
  307. fds, err := ioutil.ReadDir(dirPath)
  308. if err != nil {
  309. klog.V(4).Infof("error while listing directory %q to measure fd count: %v", dirPath, err)
  310. continue
  311. }
  312. fdCount = len(fds)
  313. if isRoot || c.info.Name == cgroup {
  314. processes = append(processes, v2.ProcessInfo{
  315. User: fields[0],
  316. Pid: pid,
  317. Ppid: ppid,
  318. StartTime: fields[3],
  319. PercentCpu: float32(percentCpu),
  320. PercentMemory: float32(percentMem),
  321. RSS: rss,
  322. VirtualSize: vs,
  323. Status: fields[8],
  324. RunningTime: fields[9],
  325. Cmd: fields[10],
  326. CgroupPath: cgroupPath,
  327. FdCount: fdCount,
  328. })
  329. }
  330. }
  331. return processes, nil
  332. }
  333. func newContainerData(containerName string, memoryCache *memory.InMemoryCache, handler container.ContainerHandler, logUsage bool, collectorManager collector.CollectorManager, maxHousekeepingInterval time.Duration, allowDynamicHousekeeping bool, clock clock.Clock) (*containerData, error) {
  334. if memoryCache == nil {
  335. return nil, fmt.Errorf("nil memory storage")
  336. }
  337. if handler == nil {
  338. return nil, fmt.Errorf("nil container handler")
  339. }
  340. ref, err := handler.ContainerReference()
  341. if err != nil {
  342. return nil, err
  343. }
  344. cont := &containerData{
  345. handler: handler,
  346. memoryCache: memoryCache,
  347. housekeepingInterval: *HousekeepingInterval,
  348. maxHousekeepingInterval: maxHousekeepingInterval,
  349. allowDynamicHousekeeping: allowDynamicHousekeeping,
  350. logUsage: logUsage,
  351. loadAvg: -1.0, // negative value indicates uninitialized.
  352. stop: make(chan bool, 1),
  353. collectorManager: collectorManager,
  354. onDemandChan: make(chan chan struct{}, 100),
  355. clock: clock,
  356. }
  357. cont.info.ContainerReference = ref
  358. cont.loadDecay = math.Exp(float64(-cont.housekeepingInterval.Seconds() / 10))
  359. if *enableLoadReader {
  360. // Create cpu load reader.
  361. loadReader, err := cpuload.New()
  362. if err != nil {
  363. klog.Warningf("Could not initialize cpu load reader for %q: %s", ref.Name, err)
  364. } else {
  365. cont.loadReader = loadReader
  366. }
  367. }
  368. err = cont.updateSpec()
  369. if err != nil {
  370. return nil, err
  371. }
  372. cont.summaryReader, err = summary.New(cont.info.Spec)
  373. if err != nil {
  374. cont.summaryReader = nil
  375. klog.Warningf("Failed to create summary reader for %q: %v", ref.Name, err)
  376. }
  377. return cont, nil
  378. }
  379. // Determine when the next housekeeping should occur.
  380. func (self *containerData) nextHousekeepingInterval() time.Duration {
  381. if self.allowDynamicHousekeeping {
  382. var empty time.Time
  383. stats, err := self.memoryCache.RecentStats(self.info.Name, empty, empty, 2)
  384. if err != nil {
  385. if self.allowErrorLogging() {
  386. klog.Warningf("Failed to get RecentStats(%q) while determining the next housekeeping: %v", self.info.Name, err)
  387. }
  388. } else if len(stats) == 2 {
  389. // TODO(vishnuk): Use no processes as a signal.
  390. // Raise the interval if usage hasn't changed in the last housekeeping.
  391. if stats[0].StatsEq(stats[1]) && (self.housekeepingInterval < self.maxHousekeepingInterval) {
  392. self.housekeepingInterval *= 2
  393. if self.housekeepingInterval > self.maxHousekeepingInterval {
  394. self.housekeepingInterval = self.maxHousekeepingInterval
  395. }
  396. } else if self.housekeepingInterval != *HousekeepingInterval {
  397. // Lower interval back to the baseline.
  398. self.housekeepingInterval = *HousekeepingInterval
  399. }
  400. }
  401. }
  402. return jitter(self.housekeepingInterval, 1.0)
  403. }
  404. // TODO(vmarmol): Implement stats collecting as a custom collector.
  405. func (c *containerData) housekeeping() {
  406. // Start any background goroutines - must be cleaned up in c.handler.Cleanup().
  407. c.handler.Start()
  408. defer c.handler.Cleanup()
  409. // Initialize cpuload reader - must be cleaned up in c.loadReader.Stop()
  410. if c.loadReader != nil {
  411. err := c.loadReader.Start()
  412. if err != nil {
  413. klog.Warningf("Could not start cpu load stat collector for %q: %s", c.info.Name, err)
  414. }
  415. defer c.loadReader.Stop()
  416. }
  417. // Long housekeeping is either 100ms or half of the housekeeping interval.
  418. longHousekeeping := 100 * time.Millisecond
  419. if *HousekeepingInterval/2 < longHousekeeping {
  420. longHousekeeping = *HousekeepingInterval / 2
  421. }
  422. // Housekeep every second.
  423. klog.V(3).Infof("Start housekeeping for container %q\n", c.info.Name)
  424. houseKeepingTimer := c.clock.NewTimer(0 * time.Second)
  425. defer houseKeepingTimer.Stop()
  426. for {
  427. if !c.housekeepingTick(houseKeepingTimer.C(), longHousekeeping) {
  428. return
  429. }
  430. // Stop and drain the timer so that it is safe to reset it
  431. if !houseKeepingTimer.Stop() {
  432. select {
  433. case <-houseKeepingTimer.C():
  434. default:
  435. }
  436. }
  437. // Log usage if asked to do so.
  438. if c.logUsage {
  439. const numSamples = 60
  440. var empty time.Time
  441. stats, err := c.memoryCache.RecentStats(c.info.Name, empty, empty, numSamples)
  442. if err != nil {
  443. if c.allowErrorLogging() {
  444. klog.Warningf("[%s] Failed to get recent stats for logging usage: %v", c.info.Name, err)
  445. }
  446. } else if len(stats) < numSamples {
  447. // Ignore, not enough stats yet.
  448. } else {
  449. usageCpuNs := uint64(0)
  450. for i := range stats {
  451. if i > 0 {
  452. usageCpuNs += (stats[i].Cpu.Usage.Total - stats[i-1].Cpu.Usage.Total)
  453. }
  454. }
  455. usageMemory := stats[numSamples-1].Memory.Usage
  456. instantUsageInCores := float64(stats[numSamples-1].Cpu.Usage.Total-stats[numSamples-2].Cpu.Usage.Total) / float64(stats[numSamples-1].Timestamp.Sub(stats[numSamples-2].Timestamp).Nanoseconds())
  457. usageInCores := float64(usageCpuNs) / float64(stats[numSamples-1].Timestamp.Sub(stats[0].Timestamp).Nanoseconds())
  458. usageInHuman := units.HumanSize(float64(usageMemory))
  459. // Don't set verbosity since this is already protected by the logUsage flag.
  460. klog.Infof("[%s] %.3f cores (average: %.3f cores), %s of memory", c.info.Name, instantUsageInCores, usageInCores, usageInHuman)
  461. }
  462. }
  463. houseKeepingTimer.Reset(c.nextHousekeepingInterval())
  464. }
  465. }
  466. func (c *containerData) housekeepingTick(timer <-chan time.Time, longHousekeeping time.Duration) bool {
  467. select {
  468. case <-c.stop:
  469. // Stop housekeeping when signaled.
  470. return false
  471. case finishedChan := <-c.onDemandChan:
  472. // notify the calling function once housekeeping has completed
  473. defer close(finishedChan)
  474. case <-timer:
  475. }
  476. start := c.clock.Now()
  477. err := c.updateStats()
  478. if err != nil {
  479. if c.allowErrorLogging() {
  480. klog.Warningf("Failed to update stats for container \"%s\": %s", c.info.Name, err)
  481. }
  482. }
  483. // Log if housekeeping took too long.
  484. duration := c.clock.Since(start)
  485. if duration >= longHousekeeping {
  486. klog.V(3).Infof("[%s] Housekeeping took %s", c.info.Name, duration)
  487. }
  488. c.notifyOnDemand()
  489. c.statsLastUpdatedTime = c.clock.Now()
  490. return true
  491. }
  492. func (c *containerData) updateSpec() error {
  493. spec, err := c.handler.GetSpec()
  494. if err != nil {
  495. // Ignore errors if the container is dead.
  496. if !c.handler.Exists() {
  497. return nil
  498. }
  499. return err
  500. }
  501. customMetrics, err := c.collectorManager.GetSpec()
  502. if err != nil {
  503. return err
  504. }
  505. if len(customMetrics) > 0 {
  506. spec.HasCustomMetrics = true
  507. spec.CustomMetrics = customMetrics
  508. }
  509. c.lock.Lock()
  510. defer c.lock.Unlock()
  511. c.info.Spec = spec
  512. return nil
  513. }
  514. // Calculate new smoothed load average using the new sample of runnable threads.
  515. // The decay used ensures that the load will stabilize on a new constant value within
  516. // 10 seconds.
  517. func (c *containerData) updateLoad(newLoad uint64) {
  518. if c.loadAvg < 0 {
  519. c.loadAvg = float64(newLoad) // initialize to the first seen sample for faster stabilization.
  520. } else {
  521. c.loadAvg = c.loadAvg*c.loadDecay + float64(newLoad)*(1.0-c.loadDecay)
  522. }
  523. }
  524. func (c *containerData) updateStats() error {
  525. stats, statsErr := c.handler.GetStats()
  526. if statsErr != nil {
  527. // Ignore errors if the container is dead.
  528. if !c.handler.Exists() {
  529. return nil
  530. }
  531. // Stats may be partially populated, push those before we return an error.
  532. statsErr = fmt.Errorf("%v, continuing to push stats", statsErr)
  533. }
  534. if stats == nil {
  535. return statsErr
  536. }
  537. if c.loadReader != nil {
  538. // TODO(vmarmol): Cache this path.
  539. path, err := c.handler.GetCgroupPath("cpu")
  540. if err == nil {
  541. loadStats, err := c.loadReader.GetCpuLoad(c.info.Name, path)
  542. if err != nil {
  543. return fmt.Errorf("failed to get load stat for %q - path %q, error %s", c.info.Name, path, err)
  544. }
  545. stats.TaskStats = loadStats
  546. c.updateLoad(loadStats.NrRunning)
  547. // convert to 'milliLoad' to avoid floats and preserve precision.
  548. stats.Cpu.LoadAverage = int32(c.loadAvg * 1000)
  549. }
  550. }
  551. if c.summaryReader != nil {
  552. err := c.summaryReader.AddSample(*stats)
  553. if err != nil {
  554. // Ignore summary errors for now.
  555. klog.V(2).Infof("Failed to add summary stats for %q: %v", c.info.Name, err)
  556. }
  557. }
  558. var customStatsErr error
  559. cm := c.collectorManager.(*collector.GenericCollectorManager)
  560. if len(cm.Collectors) > 0 {
  561. if cm.NextCollectionTime.Before(c.clock.Now()) {
  562. customStats, err := c.updateCustomStats()
  563. if customStats != nil {
  564. stats.CustomMetrics = customStats
  565. }
  566. if err != nil {
  567. customStatsErr = err
  568. }
  569. }
  570. }
  571. var nvidiaStatsErr error
  572. if c.nvidiaCollector != nil {
  573. // This updates the Accelerators field of the stats struct
  574. nvidiaStatsErr = c.nvidiaCollector.UpdateStats(stats)
  575. }
  576. ref, err := c.handler.ContainerReference()
  577. if err != nil {
  578. // Ignore errors if the container is dead.
  579. if !c.handler.Exists() {
  580. return nil
  581. }
  582. return err
  583. }
  584. cInfo := info.ContainerInfo{
  585. ContainerReference: ref,
  586. }
  587. err = c.memoryCache.AddStats(&cInfo, stats)
  588. if err != nil {
  589. return err
  590. }
  591. if statsErr != nil {
  592. return statsErr
  593. }
  594. if nvidiaStatsErr != nil {
  595. return nvidiaStatsErr
  596. }
  597. return customStatsErr
  598. }
  599. func (c *containerData) updateCustomStats() (map[string][]info.MetricVal, error) {
  600. _, customStats, customStatsErr := c.collectorManager.Collect()
  601. if customStatsErr != nil {
  602. if !c.handler.Exists() {
  603. return customStats, nil
  604. }
  605. customStatsErr = fmt.Errorf("%v, continuing to push custom stats", customStatsErr)
  606. }
  607. return customStats, customStatsErr
  608. }
  609. func (c *containerData) updateSubcontainers() error {
  610. var subcontainers info.ContainerReferenceSlice
  611. subcontainers, err := c.handler.ListContainers(container.ListSelf)
  612. if err != nil {
  613. // Ignore errors if the container is dead.
  614. if !c.handler.Exists() {
  615. return nil
  616. }
  617. return err
  618. }
  619. sort.Sort(subcontainers)
  620. c.lock.Lock()
  621. defer c.lock.Unlock()
  622. c.info.Subcontainers = subcontainers
  623. return nil
  624. }