container_linux.go 58 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061
  1. // +build linux
  2. package libcontainer
  3. import (
  4. "bytes"
  5. "encoding/json"
  6. "errors"
  7. "fmt"
  8. "io"
  9. "io/ioutil"
  10. "net"
  11. "os"
  12. "os/exec"
  13. "path/filepath"
  14. "reflect"
  15. "strings"
  16. "sync"
  17. "syscall" // only for SysProcAttr and Signal
  18. "time"
  19. securejoin "github.com/cyphar/filepath-securejoin"
  20. "github.com/opencontainers/runc/libcontainer/cgroups"
  21. "github.com/opencontainers/runc/libcontainer/configs"
  22. "github.com/opencontainers/runc/libcontainer/intelrdt"
  23. "github.com/opencontainers/runc/libcontainer/system"
  24. "github.com/opencontainers/runc/libcontainer/utils"
  25. "github.com/opencontainers/runtime-spec/specs-go"
  26. criurpc "github.com/checkpoint-restore/go-criu/rpc"
  27. "github.com/golang/protobuf/proto"
  28. "github.com/sirupsen/logrus"
  29. "github.com/vishvananda/netlink/nl"
  30. "golang.org/x/sys/unix"
  31. )
// stdioFdCount is the number of standard I/O file descriptors. It is added to
// an index into exec.Cmd.ExtraFiles to compute the descriptor number that is
// advertised to the child through the _LIBCONTAINER_* environment variables.
const stdioFdCount = 3
// linuxContainer carries the per-container data used by the methods in this
// file.
type linuxContainer struct {
	// id is the value returned by ID().
	id string
	// root is the directory in which the exec fifo is created; it is also
	// exported to the child process as _LIBCONTAINER_STATEDIR.
	root string
	// config is returned (by value) from Config() and replaced by Set().
	config *configs.Config
	// cgroupManager is used for pid listing, stats, freezing and updates.
	cgroupManager cgroups.Manager
	// intelRdtManager may be nil; users in this file check before calling it.
	intelRdtManager intelrdt.Manager
	// initPath is the executable launched by commandTemplate.
	initPath string
	// initArgs[0] becomes argv[0] of the launched command, the remaining
	// elements are passed as its arguments.
	initArgs []string
	// initProcess is assigned by newInitProcess and used by exec and Signal.
	initProcess parentProcess
	// initProcessStartTime is copied from the saved state in start().
	initProcessStartTime uint64
	criuPath             string
	newuidmapPath        string
	newgidmapPath        string
	// m is held by the exported operations that lock the container
	// (Status, State, Set, Start, Exec, Destroy, Pause, Resume, ...).
	m sync.Mutex
	// criuVersion caches the detected CRIU version; 0 means it has not been
	// determined yet.
	criuVersion int
	state       containerState
	// created is set in start() once the process has been started.
	created time.Time
}
// State represents a running container's state.
type State struct {
	BaseState

	// Platform specific fields below here

	// Rootless specifies whether the container was started in rootless mode.
	// Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups
	Rootless bool `json:"rootless"`

	// CgroupPaths holds the paths to all the cgroups set up for a container.
	// Key is the cgroup subsystem name with the value as the path.
	CgroupPaths map[string]string `json:"cgroup_paths"`

	// NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
	// with the value as the path.
	NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`

	// ExternalDescriptors are the container's standard descriptors
	// (std{in,out,err}), needed for checkpoint and restore.
	ExternalDescriptors []string `json:"external_descriptors,omitempty"`

	// IntelRdtPath is the Intel RDT "resource control" filesystem path.
	IntelRdtPath string `json:"intel_rdt_path"`
}
// Container is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found.
type Container interface {
	BaseContainer

	// Methods below here are platform specific

	// Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
	//
	// errors:
	// Systemerror - System error.
	Checkpoint(criuOpts *CriuOpts) error

	// Restore restores the checkpointed container to a running state using the criu(8) utility.
	//
	// errors:
	// Systemerror - System error.
	Restore(process *Process, criuOpts *CriuOpts) error

	// Pause pauses the container.
	//
	// If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses
	// the execution of any user processes. Asynchronously, when the container has finished being
	// paused, the state is changed to PAUSED.
	// If the Container state is PAUSED, do nothing.
	//
	// errors:
	// ContainerNotExists - Container no longer exists,
	// ContainerNotRunning - Container not running or created,
	// Systemerror - System error.
	Pause() error

	// Resume resumes a paused container.
	//
	// If the Container state is PAUSED, resumes the execution of any user processes in the
	// Container before setting the Container state to RUNNING.
	// If the Container state is RUNNING, do nothing.
	//
	// errors:
	// ContainerNotExists - Container no longer exists,
	// ContainerNotPaused - Container is not paused,
	// Systemerror - System error.
	Resume() error

	// NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
	//
	// errors:
	// Systemerror - System error.
	NotifyOOM() (<-chan struct{}, error)

	// NotifyMemoryPressure returns a read-only channel signaling when the
	// container reaches a given pressure level.
	//
	// errors:
	// Systemerror - System error.
	NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
}
  117. // ID returns the container's unique ID
  118. func (c *linuxContainer) ID() string {
  119. return c.id
  120. }
  121. // Config returns the container's configuration
  122. func (c *linuxContainer) Config() configs.Config {
  123. return *c.config
  124. }
  125. func (c *linuxContainer) Status() (Status, error) {
  126. c.m.Lock()
  127. defer c.m.Unlock()
  128. return c.currentStatus()
  129. }
  130. func (c *linuxContainer) State() (*State, error) {
  131. c.m.Lock()
  132. defer c.m.Unlock()
  133. return c.currentState()
  134. }
  135. func (c *linuxContainer) OCIState() (*specs.State, error) {
  136. c.m.Lock()
  137. defer c.m.Unlock()
  138. return c.currentOCIState()
  139. }
  140. func (c *linuxContainer) Processes() ([]int, error) {
  141. pids, err := c.cgroupManager.GetAllPids()
  142. if err != nil {
  143. return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
  144. }
  145. return pids, nil
  146. }
  147. func (c *linuxContainer) Stats() (*Stats, error) {
  148. var (
  149. err error
  150. stats = &Stats{}
  151. )
  152. if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
  153. return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
  154. }
  155. if c.intelRdtManager != nil {
  156. if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil {
  157. return stats, newSystemErrorWithCause(err, "getting container's Intel RDT stats")
  158. }
  159. }
  160. for _, iface := range c.config.Networks {
  161. switch iface.Type {
  162. case "veth":
  163. istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
  164. if err != nil {
  165. return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
  166. }
  167. stats.Interfaces = append(stats.Interfaces, istats)
  168. }
  169. }
  170. return stats, nil
  171. }
  172. func (c *linuxContainer) Set(config configs.Config) error {
  173. c.m.Lock()
  174. defer c.m.Unlock()
  175. status, err := c.currentStatus()
  176. if err != nil {
  177. return err
  178. }
  179. if status == Stopped {
  180. return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
  181. }
  182. if err := c.cgroupManager.Set(&config); err != nil {
  183. // Set configs back
  184. if err2 := c.cgroupManager.Set(c.config); err2 != nil {
  185. logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
  186. }
  187. return err
  188. }
  189. if c.intelRdtManager != nil {
  190. if err := c.intelRdtManager.Set(&config); err != nil {
  191. // Set configs back
  192. if err2 := c.intelRdtManager.Set(c.config); err2 != nil {
  193. logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
  194. }
  195. return err
  196. }
  197. }
  198. // After config setting succeed, update config and states
  199. c.config = &config
  200. _, err = c.updateState(nil)
  201. return err
  202. }
  203. func (c *linuxContainer) Start(process *Process) error {
  204. c.m.Lock()
  205. defer c.m.Unlock()
  206. if process.Init {
  207. if err := c.createExecFifo(); err != nil {
  208. return err
  209. }
  210. }
  211. if err := c.start(process); err != nil {
  212. if process.Init {
  213. c.deleteExecFifo()
  214. }
  215. return err
  216. }
  217. return nil
  218. }
  219. func (c *linuxContainer) Run(process *Process) error {
  220. if err := c.Start(process); err != nil {
  221. return err
  222. }
  223. if process.Init {
  224. return c.exec()
  225. }
  226. return nil
  227. }
  228. func (c *linuxContainer) Exec() error {
  229. c.m.Lock()
  230. defer c.m.Unlock()
  231. return c.exec()
  232. }
  233. func (c *linuxContainer) exec() error {
  234. path := filepath.Join(c.root, execFifoFilename)
  235. pid := c.initProcess.pid()
  236. blockingFifoOpenCh := awaitFifoOpen(path)
  237. for {
  238. select {
  239. case result := <-blockingFifoOpenCh:
  240. return handleFifoResult(result)
  241. case <-time.After(time.Millisecond * 100):
  242. stat, err := system.Stat(pid)
  243. if err != nil || stat.State == system.Zombie {
  244. // could be because process started, ran, and completed between our 100ms timeout and our system.Stat() check.
  245. // see if the fifo exists and has data (with a non-blocking open, which will succeed if the writing process is complete).
  246. if err := handleFifoResult(fifoOpen(path, false)); err != nil {
  247. return errors.New("container process is already dead")
  248. }
  249. return nil
  250. }
  251. }
  252. }
  253. }
  254. func readFromExecFifo(execFifo io.Reader) error {
  255. data, err := ioutil.ReadAll(execFifo)
  256. if err != nil {
  257. return err
  258. }
  259. if len(data) <= 0 {
  260. return fmt.Errorf("cannot start an already running container")
  261. }
  262. return nil
  263. }
  264. func awaitFifoOpen(path string) <-chan openResult {
  265. fifoOpened := make(chan openResult)
  266. go func() {
  267. result := fifoOpen(path, true)
  268. fifoOpened <- result
  269. }()
  270. return fifoOpened
  271. }
  272. func fifoOpen(path string, block bool) openResult {
  273. flags := os.O_RDONLY
  274. if !block {
  275. flags |= syscall.O_NONBLOCK
  276. }
  277. f, err := os.OpenFile(path, flags, 0)
  278. if err != nil {
  279. return openResult{err: newSystemErrorWithCause(err, "open exec fifo for reading")}
  280. }
  281. return openResult{file: f}
  282. }
  283. func handleFifoResult(result openResult) error {
  284. if result.err != nil {
  285. return result.err
  286. }
  287. f := result.file
  288. defer f.Close()
  289. if err := readFromExecFifo(f); err != nil {
  290. return err
  291. }
  292. return os.Remove(f.Name())
  293. }
// openResult is the outcome of fifoOpen: either file is set to the opened
// fifo, or err describes why it could not be opened.
type openResult struct {
	file *os.File
	err  error
}
  298. func (c *linuxContainer) start(process *Process) error {
  299. parent, err := c.newParentProcess(process)
  300. if err != nil {
  301. return newSystemErrorWithCause(err, "creating new parent process")
  302. }
  303. parent.forwardChildLogs()
  304. if err := parent.start(); err != nil {
  305. // terminate the process to ensure that it properly is reaped.
  306. if err := ignoreTerminateErrors(parent.terminate()); err != nil {
  307. logrus.Warn(err)
  308. }
  309. return newSystemErrorWithCause(err, "starting container process")
  310. }
  311. // generate a timestamp indicating when the container was started
  312. c.created = time.Now().UTC()
  313. if process.Init {
  314. c.state = &createdState{
  315. c: c,
  316. }
  317. state, err := c.updateState(parent)
  318. if err != nil {
  319. return err
  320. }
  321. c.initProcessStartTime = state.InitProcessStartTime
  322. if c.config.Hooks != nil {
  323. s, err := c.currentOCIState()
  324. if err != nil {
  325. return err
  326. }
  327. for i, hook := range c.config.Hooks.Poststart {
  328. if err := hook.Run(s); err != nil {
  329. if err := ignoreTerminateErrors(parent.terminate()); err != nil {
  330. logrus.Warn(err)
  331. }
  332. return newSystemErrorWithCausef(err, "running poststart hook %d", i)
  333. }
  334. }
  335. }
  336. }
  337. return nil
  338. }
  339. func (c *linuxContainer) Signal(s os.Signal, all bool) error {
  340. if all {
  341. return signalAllProcesses(c.cgroupManager, s)
  342. }
  343. status, err := c.currentStatus()
  344. if err != nil {
  345. return err
  346. }
  347. // to avoid a PID reuse attack
  348. if status == Running || status == Created || status == Paused {
  349. if err := c.initProcess.signal(s); err != nil {
  350. return newSystemErrorWithCause(err, "signaling init process")
  351. }
  352. return nil
  353. }
  354. return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
  355. }
  356. func (c *linuxContainer) createExecFifo() error {
  357. rootuid, err := c.Config().HostRootUID()
  358. if err != nil {
  359. return err
  360. }
  361. rootgid, err := c.Config().HostRootGID()
  362. if err != nil {
  363. return err
  364. }
  365. fifoName := filepath.Join(c.root, execFifoFilename)
  366. if _, err := os.Stat(fifoName); err == nil {
  367. return fmt.Errorf("exec fifo %s already exists", fifoName)
  368. }
  369. oldMask := unix.Umask(0000)
  370. if err := unix.Mkfifo(fifoName, 0622); err != nil {
  371. unix.Umask(oldMask)
  372. return err
  373. }
  374. unix.Umask(oldMask)
  375. return os.Chown(fifoName, rootuid, rootgid)
  376. }
  377. func (c *linuxContainer) deleteExecFifo() {
  378. fifoName := filepath.Join(c.root, execFifoFilename)
  379. os.Remove(fifoName)
  380. }
  381. // includeExecFifo opens the container's execfifo as a pathfd, so that the
  382. // container cannot access the statedir (and the FIFO itself remains
  383. // un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited
  384. // fd, with _LIBCONTAINER_FIFOFD set to its fd number.
  385. func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
  386. fifoName := filepath.Join(c.root, execFifoFilename)
  387. fifoFd, err := unix.Open(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
  388. if err != nil {
  389. return err
  390. }
  391. cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fifoFd), fifoName))
  392. cmd.Env = append(cmd.Env,
  393. fmt.Sprintf("_LIBCONTAINER_FIFOFD=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
  394. return nil
  395. }
  396. func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
  397. parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
  398. if err != nil {
  399. return nil, newSystemErrorWithCause(err, "creating new init pipe")
  400. }
  401. messageSockPair := filePair{parentInitPipe, childInitPipe}
  402. parentLogPipe, childLogPipe, err := os.Pipe()
  403. if err != nil {
  404. return nil, fmt.Errorf("Unable to create the log pipe: %s", err)
  405. }
  406. logFilePair := filePair{parentLogPipe, childLogPipe}
  407. cmd, err := c.commandTemplate(p, childInitPipe, childLogPipe)
  408. if err != nil {
  409. return nil, newSystemErrorWithCause(err, "creating new command template")
  410. }
  411. if !p.Init {
  412. return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
  413. }
  414. // We only set up fifoFd if we're not doing a `runc exec`. The historic
  415. // reason for this is that previously we would pass a dirfd that allowed
  416. // for container rootfs escape (and not doing it in `runc exec` avoided
  417. // that problem), but we no longer do that. However, there's no need to do
  418. // this for `runc exec` so we just keep it this way to be safe.
  419. if err := c.includeExecFifo(cmd); err != nil {
  420. return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
  421. }
  422. return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
  423. }
  424. func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) (*exec.Cmd, error) {
  425. cmd := exec.Command(c.initPath, c.initArgs[1:]...)
  426. cmd.Args[0] = c.initArgs[0]
  427. cmd.Stdin = p.Stdin
  428. cmd.Stdout = p.Stdout
  429. cmd.Stderr = p.Stderr
  430. cmd.Dir = c.config.Rootfs
  431. if cmd.SysProcAttr == nil {
  432. cmd.SysProcAttr = &syscall.SysProcAttr{}
  433. }
  434. cmd.Env = append(cmd.Env, fmt.Sprintf("GOMAXPROCS=%s", os.Getenv("GOMAXPROCS")))
  435. cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
  436. if p.ConsoleSocket != nil {
  437. cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
  438. cmd.Env = append(cmd.Env,
  439. fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
  440. )
  441. }
  442. cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe)
  443. cmd.Env = append(cmd.Env,
  444. fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
  445. fmt.Sprintf("_LIBCONTAINER_STATEDIR=%s", c.root),
  446. )
  447. cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
  448. cmd.Env = append(cmd.Env,
  449. fmt.Sprintf("_LIBCONTAINER_LOGPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
  450. fmt.Sprintf("_LIBCONTAINER_LOGLEVEL=%s", p.LogLevel),
  451. )
  452. // NOTE: when running a container with no PID namespace and the parent process spawning the container is
  453. // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
  454. // even with the parent still running.
  455. if c.config.ParentDeathSignal > 0 {
  456. cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal)
  457. }
  458. return cmd, nil
  459. }
  460. func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
  461. cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
  462. nsMaps := make(map[configs.NamespaceType]string)
  463. for _, ns := range c.config.Namespaces {
  464. if ns.Path != "" {
  465. nsMaps[ns.Type] = ns.Path
  466. }
  467. }
  468. _, sharePidns := nsMaps[configs.NEWPID]
  469. data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
  470. if err != nil {
  471. return nil, err
  472. }
  473. init := &initProcess{
  474. cmd: cmd,
  475. messageSockPair: messageSockPair,
  476. logFilePair: logFilePair,
  477. manager: c.cgroupManager,
  478. intelRdtManager: c.intelRdtManager,
  479. config: c.newInitConfig(p),
  480. container: c,
  481. process: p,
  482. bootstrapData: data,
  483. sharePidns: sharePidns,
  484. }
  485. c.initProcess = init
  486. return init, nil
  487. }
  488. func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*setnsProcess, error) {
  489. cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
  490. state, err := c.currentState()
  491. if err != nil {
  492. return nil, newSystemErrorWithCause(err, "getting container's current state")
  493. }
  494. // for setns process, we don't have to set cloneflags as the process namespaces
  495. // will only be set via setns syscall
  496. data, err := c.bootstrapData(0, state.NamespacePaths)
  497. if err != nil {
  498. return nil, err
  499. }
  500. return &setnsProcess{
  501. cmd: cmd,
  502. cgroupPaths: c.cgroupManager.GetPaths(),
  503. rootlessCgroups: c.config.RootlessCgroups,
  504. intelRdtPath: state.IntelRdtPath,
  505. messageSockPair: messageSockPair,
  506. logFilePair: logFilePair,
  507. config: c.newInitConfig(p),
  508. process: p,
  509. bootstrapData: data,
  510. }, nil
  511. }
  512. func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
  513. cfg := &initConfig{
  514. Config: c.config,
  515. Args: process.Args,
  516. Env: process.Env,
  517. User: process.User,
  518. AdditionalGroups: process.AdditionalGroups,
  519. Cwd: process.Cwd,
  520. Capabilities: process.Capabilities,
  521. PassedFilesCount: len(process.ExtraFiles),
  522. ContainerId: c.ID(),
  523. NoNewPrivileges: c.config.NoNewPrivileges,
  524. RootlessEUID: c.config.RootlessEUID,
  525. RootlessCgroups: c.config.RootlessCgroups,
  526. AppArmorProfile: c.config.AppArmorProfile,
  527. ProcessLabel: c.config.ProcessLabel,
  528. Rlimits: c.config.Rlimits,
  529. }
  530. if process.NoNewPrivileges != nil {
  531. cfg.NoNewPrivileges = *process.NoNewPrivileges
  532. }
  533. if process.AppArmorProfile != "" {
  534. cfg.AppArmorProfile = process.AppArmorProfile
  535. }
  536. if process.Label != "" {
  537. cfg.ProcessLabel = process.Label
  538. }
  539. if len(process.Rlimits) > 0 {
  540. cfg.Rlimits = process.Rlimits
  541. }
  542. cfg.CreateConsole = process.ConsoleSocket != nil
  543. cfg.ConsoleWidth = process.ConsoleWidth
  544. cfg.ConsoleHeight = process.ConsoleHeight
  545. return cfg
  546. }
  547. func (c *linuxContainer) Destroy() error {
  548. c.m.Lock()
  549. defer c.m.Unlock()
  550. return c.state.destroy()
  551. }
  552. func (c *linuxContainer) Pause() error {
  553. c.m.Lock()
  554. defer c.m.Unlock()
  555. status, err := c.currentStatus()
  556. if err != nil {
  557. return err
  558. }
  559. switch status {
  560. case Running, Created:
  561. if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
  562. return err
  563. }
  564. return c.state.transition(&pausedState{
  565. c: c,
  566. })
  567. }
  568. return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning)
  569. }
  570. func (c *linuxContainer) Resume() error {
  571. c.m.Lock()
  572. defer c.m.Unlock()
  573. status, err := c.currentStatus()
  574. if err != nil {
  575. return err
  576. }
  577. if status != Paused {
  578. return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
  579. }
  580. if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
  581. return err
  582. }
  583. return c.state.transition(&runningState{
  584. c: c,
  585. })
  586. }
  587. func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
  588. // XXX(cyphar): This requires cgroups.
  589. if c.config.RootlessCgroups {
  590. logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups")
  591. }
  592. return notifyOnOOM(c.cgroupManager.GetPaths())
  593. }
  594. func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
  595. // XXX(cyphar): This requires cgroups.
  596. if c.config.RootlessCgroups {
  597. logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups")
  598. }
  599. return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
  600. }
// criuFeatures is package-level scratch space for the CRIU feature check:
// checkCriuFeatures resets it to nil before issuing the request and reads it
// once the request has completed.
// NOTE(review): presumably filled in by the CRIU response handling elsewhere
// in this file — confirm.
var criuFeatures *criurpc.CriuFeatures
  602. func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error {
  603. var t criurpc.CriuReqType
  604. t = criurpc.CriuReqType_FEATURE_CHECK
  605. // criu 1.8 => 10800
  606. if err := c.checkCriuVersion(10800); err != nil {
  607. // Feature checking was introduced with CRIU 1.8.
  608. // Ignore the feature check if an older CRIU version is used
  609. // and just act as before.
  610. // As all automated PR testing is done using CRIU 1.7 this
  611. // code will not be tested by automated PR testing.
  612. return nil
  613. }
  614. // make sure the features we are looking for are really not from
  615. // some previous check
  616. criuFeatures = nil
  617. req := &criurpc.CriuReq{
  618. Type: &t,
  619. // Theoretically this should not be necessary but CRIU
  620. // segfaults if Opts is empty.
  621. // Fixed in CRIU 2.12
  622. Opts: rpcOpts,
  623. Features: criuFeat,
  624. }
  625. err := c.criuSwrk(nil, req, criuOpts, false, nil)
  626. if err != nil {
  627. logrus.Debugf("%s", err)
  628. return fmt.Errorf("CRIU feature check failed")
  629. }
  630. logrus.Debugf("Feature check says: %s", criuFeatures)
  631. missingFeatures := false
  632. // The outer if checks if the fields actually exist
  633. if (criuFeat.MemTrack != nil) &&
  634. (criuFeatures.MemTrack != nil) {
  635. // The inner if checks if they are set to true
  636. if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
  637. missingFeatures = true
  638. logrus.Debugf("CRIU does not support MemTrack")
  639. }
  640. }
  641. // This needs to be repeated for every new feature check.
  642. // Is there a way to put this in a function. Reflection?
  643. if (criuFeat.LazyPages != nil) &&
  644. (criuFeatures.LazyPages != nil) {
  645. if *criuFeat.LazyPages && !*criuFeatures.LazyPages {
  646. missingFeatures = true
  647. logrus.Debugf("CRIU does not support LazyPages")
  648. }
  649. }
  650. if missingFeatures {
  651. return fmt.Errorf("CRIU is missing features")
  652. }
  653. return nil
  654. }
  655. func parseCriuVersion(path string) (int, error) {
  656. var x, y, z int
  657. out, err := exec.Command(path, "-V").Output()
  658. if err != nil {
  659. return 0, fmt.Errorf("Unable to execute CRIU command: %s", path)
  660. }
  661. x = 0
  662. y = 0
  663. z = 0
  664. if ep := strings.Index(string(out), "-"); ep >= 0 {
  665. // criu Git version format
  666. var version string
  667. if sp := strings.Index(string(out), "GitID"); sp > 0 {
  668. version = string(out)[sp:ep]
  669. } else {
  670. return 0, fmt.Errorf("Unable to parse the CRIU version: %s", path)
  671. }
  672. n, err := fmt.Sscanf(version, "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2
  673. if err != nil {
  674. n, err = fmt.Sscanf(version, "GitID: v%d.%d", &x, &y) // 1.6
  675. y++
  676. } else {
  677. z++
  678. }
  679. if n < 2 || err != nil {
  680. return 0, fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err)
  681. }
  682. } else {
  683. // criu release version format
  684. n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2
  685. if err != nil {
  686. n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6
  687. }
  688. if n < 2 || err != nil {
  689. return 0, fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err)
  690. }
  691. }
  692. return x*10000 + y*100 + z, nil
  693. }
  694. func compareCriuVersion(criuVersion int, minVersion int) error {
  695. // simple function to perform the actual version compare
  696. if criuVersion < minVersion {
  697. return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion)
  698. }
  699. return nil
  700. }
// criuVersionRPC stores the result of the most recent CRIU VERSION RPC.
// checkCriuVersion resets it to nil before issuing the request and criuSwrk
// fills it in when a VERSION response arrives; it stays nil if CRIU did not
// answer the VERSION request successfully.
// It is a package-level variable and therefore shared by all containers.
var criuVersionRPC *criurpc.CriuVersion
  703. // checkCriuVersion checks Criu version greater than or equal to minVersion
  704. func (c *linuxContainer) checkCriuVersion(minVersion int) error {
  705. // If the version of criu has already been determined there is no need
  706. // to ask criu for the version again. Use the value from c.criuVersion.
  707. if c.criuVersion != 0 {
  708. return compareCriuVersion(c.criuVersion, minVersion)
  709. }
  710. // First try if this version of CRIU support the version RPC.
  711. // The CRIU version RPC was introduced with CRIU 3.0.
  712. // First, reset the variable for the RPC answer to nil
  713. criuVersionRPC = nil
  714. var t criurpc.CriuReqType
  715. t = criurpc.CriuReqType_VERSION
  716. req := &criurpc.CriuReq{
  717. Type: &t,
  718. }
  719. err := c.criuSwrk(nil, req, nil, false, nil)
  720. if err != nil {
  721. return fmt.Errorf("CRIU version check failed: %s", err)
  722. }
  723. if criuVersionRPC != nil {
  724. logrus.Debugf("CRIU version: %s", criuVersionRPC)
  725. // major and minor are always set
  726. c.criuVersion = int(*criuVersionRPC.Major) * 10000
  727. c.criuVersion += int(*criuVersionRPC.Minor) * 100
  728. if criuVersionRPC.Sublevel != nil {
  729. c.criuVersion += int(*criuVersionRPC.Sublevel)
  730. }
  731. if criuVersionRPC.Gitid != nil {
  732. // runc's convention is that a CRIU git release is
  733. // always the same as increasing the minor by 1
  734. c.criuVersion -= (c.criuVersion % 100)
  735. c.criuVersion += 100
  736. }
  737. return compareCriuVersion(c.criuVersion, minVersion)
  738. }
  739. // This is CRIU without the version RPC and therefore
  740. // older than 3.0. Parsing the output is required.
  741. // This can be remove once runc does not work with criu older than 3.0
  742. c.criuVersion, err = parseCriuVersion(c.criuPath)
  743. if err != nil {
  744. return err
  745. }
  746. return compareCriuVersion(c.criuVersion, minVersion)
  747. }
// descriptorsFilename is the name of the file inside the CRIU images
// directory in which Checkpoint stores the JSON-encoded external descriptors
// of the init process and from which Restore reads them back.
const descriptorsFilename = "descriptors.json"
  749. func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
  750. mountDest := m.Destination
  751. if strings.HasPrefix(mountDest, c.config.Rootfs) {
  752. mountDest = mountDest[len(c.config.Rootfs):]
  753. }
  754. extMnt := &criurpc.ExtMountMap{
  755. Key: proto.String(mountDest),
  756. Val: proto.String(mountDest),
  757. }
  758. req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
  759. }
  760. func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
  761. for _, path := range c.config.MaskPaths {
  762. fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
  763. if err != nil {
  764. if os.IsNotExist(err) {
  765. continue
  766. }
  767. return err
  768. }
  769. if fi.IsDir() {
  770. continue
  771. }
  772. extMnt := &criurpc.ExtMountMap{
  773. Key: proto.String(path),
  774. Val: proto.String("/dev/null"),
  775. }
  776. req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
  777. }
  778. return nil
  779. }
  780. func waitForCriuLazyServer(r *os.File, status string) error {
  781. data := make([]byte, 1)
  782. _, err := r.Read(data)
  783. if err != nil {
  784. return err
  785. }
  786. fd, err := os.OpenFile(status, os.O_TRUNC|os.O_WRONLY, os.ModeAppend)
  787. if err != nil {
  788. return err
  789. }
  790. _, err = fd.Write(data)
  791. if err != nil {
  792. return err
  793. }
  794. fd.Close()
  795. return nil
  796. }
  797. func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) {
  798. // CRIU will evaluate a configuration starting with release 3.11.
  799. // Settings in the configuration file will overwrite RPC settings.
  800. // Look for annotations. The annotation 'org.criu.config'
  801. // specifies if CRIU should use a different, container specific
  802. // configuration file.
  803. _, annotations := utils.Annotations(c.config.Labels)
  804. configFile, exists := annotations["org.criu.config"]
  805. if exists {
  806. // If the annotation 'org.criu.config' exists and is set
  807. // to a non-empty string, tell CRIU to use that as a
  808. // configuration file. If the file does not exist, CRIU
  809. // will just ignore it.
  810. if configFile != "" {
  811. rpcOpts.ConfigFile = proto.String(configFile)
  812. }
  813. // If 'org.criu.config' exists and is set to an empty
  814. // string, a runc specific CRIU configuration file will
  815. // be not set at all.
  816. } else {
  817. // If the mentioned annotation has not been found, specify
  818. // a default CRIU configuration file.
  819. rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf")
  820. }
  821. }
  822. func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
  823. c.m.Lock()
  824. defer c.m.Unlock()
  825. // Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
  826. // (CLI prints a warning)
  827. // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
  828. // support for doing unprivileged dumps, but the setup of
  829. // rootless containers might make this complicated.
  830. // criu 1.5.2 => 10502
  831. if err := c.checkCriuVersion(10502); err != nil {
  832. return err
  833. }
  834. if criuOpts.ImagesDirectory == "" {
  835. return fmt.Errorf("invalid directory to save checkpoint")
  836. }
  837. // Since a container can be C/R'ed multiple times,
  838. // the checkpoint directory may already exist.
  839. if err := os.Mkdir(criuOpts.ImagesDirectory, 0700); err != nil && !os.IsExist(err) {
  840. return err
  841. }
  842. if criuOpts.WorkDirectory == "" {
  843. criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
  844. }
  845. if err := os.Mkdir(criuOpts.WorkDirectory, 0700); err != nil && !os.IsExist(err) {
  846. return err
  847. }
  848. workDir, err := os.Open(criuOpts.WorkDirectory)
  849. if err != nil {
  850. return err
  851. }
  852. defer workDir.Close()
  853. imageDir, err := os.Open(criuOpts.ImagesDirectory)
  854. if err != nil {
  855. return err
  856. }
  857. defer imageDir.Close()
  858. rpcOpts := criurpc.CriuOpts{
  859. ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
  860. WorkDirFd: proto.Int32(int32(workDir.Fd())),
  861. LogLevel: proto.Int32(4),
  862. LogFile: proto.String("dump.log"),
  863. Root: proto.String(c.config.Rootfs),
  864. ManageCgroups: proto.Bool(true),
  865. NotifyScripts: proto.Bool(true),
  866. Pid: proto.Int32(int32(c.initProcess.pid())),
  867. ShellJob: proto.Bool(criuOpts.ShellJob),
  868. LeaveRunning: proto.Bool(criuOpts.LeaveRunning),
  869. TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
  870. ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
  871. FileLocks: proto.Bool(criuOpts.FileLocks),
  872. EmptyNs: proto.Uint32(criuOpts.EmptyNs),
  873. OrphanPtsMaster: proto.Bool(true),
  874. AutoDedup: proto.Bool(criuOpts.AutoDedup),
  875. LazyPages: proto.Bool(criuOpts.LazyPages),
  876. }
  877. c.handleCriuConfigurationFile(&rpcOpts)
  878. // If the container is running in a network namespace and has
  879. // a path to the network namespace configured, we will dump
  880. // that network namespace as an external namespace and we
  881. // will expect that the namespace exists during restore.
  882. // This basically means that CRIU will ignore the namespace
  883. // and expect to be setup correctly.
  884. nsPath := c.config.Namespaces.PathOf(configs.NEWNET)
  885. if nsPath != "" {
  886. // For this to work we need at least criu 3.11.0 => 31100.
  887. // As there was already a successful version check we will
  888. // not error out if it fails. runc will just behave as it used
  889. // to do and ignore external network namespaces.
  890. err := c.checkCriuVersion(31100)
  891. if err == nil {
  892. // CRIU expects the information about an external namespace
  893. // like this: --external net[<inode>]:<key>
  894. // This <key> is always 'extRootNetNS'.
  895. var netns syscall.Stat_t
  896. err = syscall.Stat(nsPath, &netns)
  897. if err != nil {
  898. return err
  899. }
  900. criuExternal := fmt.Sprintf("net[%d]:extRootNetNS", netns.Ino)
  901. rpcOpts.External = append(rpcOpts.External, criuExternal)
  902. }
  903. }
  904. fcg := c.cgroupManager.GetPaths()["freezer"]
  905. if fcg != "" {
  906. rpcOpts.FreezeCgroup = proto.String(fcg)
  907. }
  908. // append optional criu opts, e.g., page-server and port
  909. if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
  910. rpcOpts.Ps = &criurpc.CriuPageServerInfo{
  911. Address: proto.String(criuOpts.PageServer.Address),
  912. Port: proto.Int32(criuOpts.PageServer.Port),
  913. }
  914. }
  915. //pre-dump may need parentImage param to complete iterative migration
  916. if criuOpts.ParentImage != "" {
  917. rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
  918. rpcOpts.TrackMem = proto.Bool(true)
  919. }
  920. // append optional manage cgroups mode
  921. if criuOpts.ManageCgroupsMode != 0 {
  922. // criu 1.7 => 10700
  923. if err := c.checkCriuVersion(10700); err != nil {
  924. return err
  925. }
  926. mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
  927. rpcOpts.ManageCgroupsMode = &mode
  928. }
  929. var t criurpc.CriuReqType
  930. if criuOpts.PreDump {
  931. feat := criurpc.CriuFeatures{
  932. MemTrack: proto.Bool(true),
  933. }
  934. if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
  935. return err
  936. }
  937. t = criurpc.CriuReqType_PRE_DUMP
  938. } else {
  939. t = criurpc.CriuReqType_DUMP
  940. }
  941. req := &criurpc.CriuReq{
  942. Type: &t,
  943. Opts: &rpcOpts,
  944. }
  945. if criuOpts.LazyPages {
  946. // lazy migration requested; check if criu supports it
  947. feat := criurpc.CriuFeatures{
  948. LazyPages: proto.Bool(true),
  949. }
  950. if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
  951. return err
  952. }
  953. statusRead, statusWrite, err := os.Pipe()
  954. if err != nil {
  955. return err
  956. }
  957. rpcOpts.StatusFd = proto.Int32(int32(statusWrite.Fd()))
  958. go waitForCriuLazyServer(statusRead, criuOpts.StatusFd)
  959. }
  960. //no need to dump these information in pre-dump
  961. if !criuOpts.PreDump {
  962. for _, m := range c.config.Mounts {
  963. switch m.Device {
  964. case "bind":
  965. c.addCriuDumpMount(req, m)
  966. case "cgroup":
  967. binds, err := getCgroupMounts(m)
  968. if err != nil {
  969. return err
  970. }
  971. for _, b := range binds {
  972. c.addCriuDumpMount(req, b)
  973. }
  974. }
  975. }
  976. if err := c.addMaskPaths(req); err != nil {
  977. return err
  978. }
  979. for _, node := range c.config.Devices {
  980. m := &configs.Mount{Destination: node.Path, Source: node.Path}
  981. c.addCriuDumpMount(req, m)
  982. }
  983. // Write the FD info to a file in the image directory
  984. fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
  985. if err != nil {
  986. return err
  987. }
  988. err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0600)
  989. if err != nil {
  990. return err
  991. }
  992. }
  993. err = c.criuSwrk(nil, req, criuOpts, false, nil)
  994. if err != nil {
  995. return err
  996. }
  997. return nil
  998. }
  999. func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
  1000. mountDest := m.Destination
  1001. if strings.HasPrefix(mountDest, c.config.Rootfs) {
  1002. mountDest = mountDest[len(c.config.Rootfs):]
  1003. }
  1004. extMnt := &criurpc.ExtMountMap{
  1005. Key: proto.String(mountDest),
  1006. Val: proto.String(m.Source),
  1007. }
  1008. req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
  1009. }
  1010. func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
  1011. for _, iface := range c.config.Networks {
  1012. switch iface.Type {
  1013. case "veth":
  1014. veth := new(criurpc.CriuVethPair)
  1015. veth.IfOut = proto.String(iface.HostInterfaceName)
  1016. veth.IfIn = proto.String(iface.Name)
  1017. req.Opts.Veths = append(req.Opts.Veths, veth)
  1018. case "loopback":
  1019. // Do nothing
  1020. }
  1021. }
  1022. for _, i := range criuOpts.VethPairs {
  1023. veth := new(criurpc.CriuVethPair)
  1024. veth.IfOut = proto.String(i.HostInterfaceName)
  1025. veth.IfIn = proto.String(i.ContainerInterfaceName)
  1026. req.Opts.Veths = append(req.Opts.Veths, veth)
  1027. }
  1028. }
  1029. // makeCriuRestoreMountpoints makes the actual mountpoints for the
  1030. // restore using CRIU. This function is inspired from the code in
  1031. // rootfs_linux.go
  1032. func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error {
  1033. switch m.Device {
  1034. case "cgroup":
  1035. // Do nothing for cgroup, CRIU should handle it
  1036. case "bind":
  1037. // The prepareBindMount() function checks if source
  1038. // exists. So it cannot be used for other filesystem types.
  1039. if err := prepareBindMount(m, c.config.Rootfs); err != nil {
  1040. return err
  1041. }
  1042. default:
  1043. // for all other file-systems just create the mountpoints
  1044. dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination)
  1045. if err != nil {
  1046. return err
  1047. }
  1048. if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil {
  1049. return err
  1050. }
  1051. m.Destination = dest
  1052. if err := os.MkdirAll(dest, 0755); err != nil {
  1053. return err
  1054. }
  1055. }
  1056. return nil
  1057. }
  1058. // isPathInPrefixList is a small function for CRIU restore to make sure
  1059. // mountpoints, which are on a tmpfs, are not created in the roofs
  1060. func isPathInPrefixList(path string, prefix []string) bool {
  1061. for _, p := range prefix {
  1062. if strings.HasPrefix(path, p+"/") {
  1063. return false
  1064. }
  1065. }
  1066. return true
  1067. }
  1068. // prepareCriuRestoreMounts tries to set up the rootfs of the
  1069. // container to be restored in the same way runc does it for
  1070. // initial container creation. Even for a read-only rootfs container
  1071. // runc modifies the rootfs to add mountpoints which do not exist.
  1072. // This function also creates missing mountpoints as long as they
  1073. // are not on top of a tmpfs, as CRIU will restore tmpfs content anyway.
  1074. func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
  1075. // First get a list of a all tmpfs mounts
  1076. tmpfs := []string{}
  1077. for _, m := range mounts {
  1078. switch m.Device {
  1079. case "tmpfs":
  1080. tmpfs = append(tmpfs, m.Destination)
  1081. }
  1082. }
  1083. // Now go through all mounts and create the mountpoints
  1084. // if the mountpoints are not on a tmpfs, as CRIU will
  1085. // restore the complete tmpfs content from its checkpoint.
  1086. for _, m := range mounts {
  1087. if isPathInPrefixList(m.Destination, tmpfs) {
  1088. if err := c.makeCriuRestoreMountpoints(m); err != nil {
  1089. return err
  1090. }
  1091. }
  1092. }
  1093. return nil
  1094. }
  1095. func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
  1096. c.m.Lock()
  1097. defer c.m.Unlock()
  1098. var extraFiles []*os.File
  1099. // Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
  1100. // (CLI prints a warning)
  1101. // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
  1102. // support for unprivileged restore at the moment.
  1103. // criu 1.5.2 => 10502
  1104. if err := c.checkCriuVersion(10502); err != nil {
  1105. return err
  1106. }
  1107. if criuOpts.WorkDirectory == "" {
  1108. criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
  1109. }
  1110. // Since a container can be C/R'ed multiple times,
  1111. // the work directory may already exist.
  1112. if err := os.Mkdir(criuOpts.WorkDirectory, 0700); err != nil && !os.IsExist(err) {
  1113. return err
  1114. }
  1115. workDir, err := os.Open(criuOpts.WorkDirectory)
  1116. if err != nil {
  1117. return err
  1118. }
  1119. defer workDir.Close()
  1120. if criuOpts.ImagesDirectory == "" {
  1121. return fmt.Errorf("invalid directory to restore checkpoint")
  1122. }
  1123. imageDir, err := os.Open(criuOpts.ImagesDirectory)
  1124. if err != nil {
  1125. return err
  1126. }
  1127. defer imageDir.Close()
  1128. // CRIU has a few requirements for a root directory:
  1129. // * it must be a mount point
  1130. // * its parent must not be overmounted
  1131. // c.config.Rootfs is bind-mounted to a temporary directory
  1132. // to satisfy these requirements.
  1133. root := filepath.Join(c.root, "criu-root")
  1134. if err := os.Mkdir(root, 0755); err != nil {
  1135. return err
  1136. }
  1137. defer os.Remove(root)
  1138. root, err = filepath.EvalSymlinks(root)
  1139. if err != nil {
  1140. return err
  1141. }
  1142. err = unix.Mount(c.config.Rootfs, root, "", unix.MS_BIND|unix.MS_REC, "")
  1143. if err != nil {
  1144. return err
  1145. }
  1146. defer unix.Unmount(root, unix.MNT_DETACH)
  1147. t := criurpc.CriuReqType_RESTORE
  1148. req := &criurpc.CriuReq{
  1149. Type: &t,
  1150. Opts: &criurpc.CriuOpts{
  1151. ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
  1152. WorkDirFd: proto.Int32(int32(workDir.Fd())),
  1153. EvasiveDevices: proto.Bool(true),
  1154. LogLevel: proto.Int32(4),
  1155. LogFile: proto.String("restore.log"),
  1156. RstSibling: proto.Bool(true),
  1157. Root: proto.String(root),
  1158. ManageCgroups: proto.Bool(true),
  1159. NotifyScripts: proto.Bool(true),
  1160. ShellJob: proto.Bool(criuOpts.ShellJob),
  1161. ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
  1162. TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
  1163. FileLocks: proto.Bool(criuOpts.FileLocks),
  1164. EmptyNs: proto.Uint32(criuOpts.EmptyNs),
  1165. OrphanPtsMaster: proto.Bool(true),
  1166. AutoDedup: proto.Bool(criuOpts.AutoDedup),
  1167. LazyPages: proto.Bool(criuOpts.LazyPages),
  1168. },
  1169. }
  1170. c.handleCriuConfigurationFile(req.Opts)
  1171. // Same as during checkpointing. If the container has a specific network namespace
  1172. // assigned to it, this now expects that the checkpoint will be restored in a
  1173. // already created network namespace.
  1174. nsPath := c.config.Namespaces.PathOf(configs.NEWNET)
  1175. if nsPath != "" {
  1176. // For this to work we need at least criu 3.11.0 => 31100.
  1177. // As there was already a successful version check we will
  1178. // not error out if it fails. runc will just behave as it used
  1179. // to do and ignore external network namespaces.
  1180. err := c.checkCriuVersion(31100)
  1181. if err == nil {
  1182. // CRIU wants the information about an existing network namespace
  1183. // like this: --inherit-fd fd[<fd>]:<key>
  1184. // The <key> needs to be the same as during checkpointing.
  1185. // We are always using 'extRootNetNS' as the key in this.
  1186. netns, err := os.Open(nsPath)
  1187. defer netns.Close()
  1188. if err != nil {
  1189. logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
  1190. return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
  1191. }
  1192. inheritFd := new(criurpc.InheritFd)
  1193. inheritFd.Key = proto.String("extRootNetNS")
  1194. // The offset of four is necessary because 0, 1, 2 and 3 is already
  1195. // used by stdin, stdout, stderr, 'criu swrk' socket.
  1196. inheritFd.Fd = proto.Int32(int32(4 + len(extraFiles)))
  1197. req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
  1198. // All open FDs need to be transferred to CRIU via extraFiles
  1199. extraFiles = append(extraFiles, netns)
  1200. }
  1201. }
  1202. // This will modify the rootfs of the container in the same way runc
  1203. // modifies the container during initial creation.
  1204. if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil {
  1205. return err
  1206. }
  1207. for _, m := range c.config.Mounts {
  1208. switch m.Device {
  1209. case "bind":
  1210. c.addCriuRestoreMount(req, m)
  1211. case "cgroup":
  1212. binds, err := getCgroupMounts(m)
  1213. if err != nil {
  1214. return err
  1215. }
  1216. for _, b := range binds {
  1217. c.addCriuRestoreMount(req, b)
  1218. }
  1219. }
  1220. }
  1221. if len(c.config.MaskPaths) > 0 {
  1222. m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
  1223. c.addCriuRestoreMount(req, m)
  1224. }
  1225. for _, node := range c.config.Devices {
  1226. m := &configs.Mount{Destination: node.Path, Source: node.Path}
  1227. c.addCriuRestoreMount(req, m)
  1228. }
  1229. if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 {
  1230. c.restoreNetwork(req, criuOpts)
  1231. }
  1232. // append optional manage cgroups mode
  1233. if criuOpts.ManageCgroupsMode != 0 {
  1234. // criu 1.7 => 10700
  1235. if err := c.checkCriuVersion(10700); err != nil {
  1236. return err
  1237. }
  1238. mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
  1239. req.Opts.ManageCgroupsMode = &mode
  1240. }
  1241. var (
  1242. fds []string
  1243. fdJSON []byte
  1244. )
  1245. if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
  1246. return err
  1247. }
  1248. if err := json.Unmarshal(fdJSON, &fds); err != nil {
  1249. return err
  1250. }
  1251. for i := range fds {
  1252. if s := fds[i]; strings.Contains(s, "pipe:") {
  1253. inheritFd := new(criurpc.InheritFd)
  1254. inheritFd.Key = proto.String(s)
  1255. inheritFd.Fd = proto.Int32(int32(i))
  1256. req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
  1257. }
  1258. }
  1259. return c.criuSwrk(process, req, criuOpts, true, extraFiles)
  1260. }
  1261. func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
  1262. // XXX: Do we need to deal with this case? AFAIK criu still requires root.
  1263. if err := c.cgroupManager.Apply(pid); err != nil {
  1264. return err
  1265. }
  1266. if err := c.cgroupManager.Set(c.config); err != nil {
  1267. return newSystemError(err)
  1268. }
  1269. path := fmt.Sprintf("/proc/%d/cgroup", pid)
  1270. cgroupsPaths, err := cgroups.ParseCgroupFile(path)
  1271. if err != nil {
  1272. return err
  1273. }
  1274. for c, p := range cgroupsPaths {
  1275. cgroupRoot := &criurpc.CgroupRoot{
  1276. Ctrl: proto.String(c),
  1277. Path: proto.String(p),
  1278. }
  1279. req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
  1280. }
  1281. return nil
  1282. }
  1283. func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool, extraFiles []*os.File) error {
  1284. fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
  1285. if err != nil {
  1286. return err
  1287. }
  1288. var logPath string
  1289. if opts != nil {
  1290. logPath = filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
  1291. } else {
  1292. // For the VERSION RPC 'opts' is set to 'nil' and therefore
  1293. // opts.WorkDirectory does not exist. Set logPath to "".
  1294. logPath = ""
  1295. }
  1296. criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
  1297. criuClientFileCon, err := net.FileConn(criuClient)
  1298. criuClient.Close()
  1299. if err != nil {
  1300. return err
  1301. }
  1302. criuClientCon := criuClientFileCon.(*net.UnixConn)
  1303. defer criuClientCon.Close()
  1304. criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
  1305. defer criuServer.Close()
  1306. args := []string{"swrk", "3"}
  1307. if c.criuVersion != 0 {
  1308. // If the CRIU Version is still '0' then this is probably
  1309. // the initial CRIU run to detect the version. Skip it.
  1310. logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath)
  1311. }
  1312. logrus.Debugf("Using CRIU with following args: %s", args)
  1313. cmd := exec.Command(c.criuPath, args...)
  1314. if process != nil {
  1315. cmd.Stdin = process.Stdin
  1316. cmd.Stdout = process.Stdout
  1317. cmd.Stderr = process.Stderr
  1318. }
  1319. cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
  1320. if extraFiles != nil {
  1321. cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...)
  1322. }
  1323. if err := cmd.Start(); err != nil {
  1324. return err
  1325. }
  1326. criuServer.Close()
  1327. defer func() {
  1328. criuClientCon.Close()
  1329. _, err := cmd.Process.Wait()
  1330. if err != nil {
  1331. return
  1332. }
  1333. }()
  1334. if applyCgroups {
  1335. err := c.criuApplyCgroups(cmd.Process.Pid, req)
  1336. if err != nil {
  1337. return err
  1338. }
  1339. }
  1340. var extFds []string
  1341. if process != nil {
  1342. extFds, err = getPipeFds(cmd.Process.Pid)
  1343. if err != nil {
  1344. return err
  1345. }
  1346. }
  1347. logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
  1348. // In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts()
  1349. // should be empty. For older CRIU versions it still will be
  1350. // available but empty. criurpc.CriuReqType_VERSION actually
  1351. // has no req.GetOpts().
  1352. if !(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK ||
  1353. req.GetType() == criurpc.CriuReqType_VERSION) {
  1354. val := reflect.ValueOf(req.GetOpts())
  1355. v := reflect.Indirect(val)
  1356. for i := 0; i < v.NumField(); i++ {
  1357. st := v.Type()
  1358. name := st.Field(i).Name
  1359. if strings.HasPrefix(name, "XXX_") {
  1360. continue
  1361. }
  1362. value := val.MethodByName("Get" + name).Call([]reflect.Value{})
  1363. logrus.Debugf("CRIU option %s with value %v", name, value[0])
  1364. }
  1365. }
  1366. data, err := proto.Marshal(req)
  1367. if err != nil {
  1368. return err
  1369. }
  1370. _, err = criuClientCon.Write(data)
  1371. if err != nil {
  1372. return err
  1373. }
  1374. buf := make([]byte, 10*4096)
  1375. oob := make([]byte, 4096)
  1376. for true {
  1377. n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob)
  1378. if err != nil {
  1379. return err
  1380. }
  1381. if n == 0 {
  1382. return fmt.Errorf("unexpected EOF")
  1383. }
  1384. if n == len(buf) {
  1385. return fmt.Errorf("buffer is too small")
  1386. }
  1387. resp := new(criurpc.CriuResp)
  1388. err = proto.Unmarshal(buf[:n], resp)
  1389. if err != nil {
  1390. return err
  1391. }
  1392. if !resp.GetSuccess() {
  1393. typeString := req.GetType().String()
  1394. if typeString == "VERSION" {
  1395. // If the VERSION RPC fails this probably means that the CRIU
  1396. // version is too old for this RPC. Just return 'nil'.
  1397. return nil
  1398. }
  1399. return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
  1400. }
  1401. t := resp.GetType()
  1402. switch {
  1403. case t == criurpc.CriuReqType_VERSION:
  1404. logrus.Debugf("CRIU version: %s", resp)
  1405. criuVersionRPC = resp.GetVersion()
  1406. break
  1407. case t == criurpc.CriuReqType_FEATURE_CHECK:
  1408. logrus.Debugf("Feature check says: %s", resp)
  1409. criuFeatures = resp.GetFeatures()
  1410. case t == criurpc.CriuReqType_NOTIFY:
  1411. if err := c.criuNotifications(resp, process, opts, extFds, oob[:oobn]); err != nil {
  1412. return err
  1413. }
  1414. t = criurpc.CriuReqType_NOTIFY
  1415. req = &criurpc.CriuReq{
  1416. Type: &t,
  1417. NotifySuccess: proto.Bool(true),
  1418. }
  1419. data, err = proto.Marshal(req)
  1420. if err != nil {
  1421. return err
  1422. }
  1423. _, err = criuClientCon.Write(data)
  1424. if err != nil {
  1425. return err
  1426. }
  1427. continue
  1428. case t == criurpc.CriuReqType_RESTORE:
  1429. case t == criurpc.CriuReqType_DUMP:
  1430. case t == criurpc.CriuReqType_PRE_DUMP:
  1431. default:
  1432. return fmt.Errorf("unable to parse the response %s", resp.String())
  1433. }
  1434. break
  1435. }
  1436. criuClientCon.CloseWrite()
  1437. // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
  1438. // Here we want to wait only the CRIU process.
  1439. st, err := cmd.Process.Wait()
  1440. if err != nil {
  1441. return err
  1442. }
  1443. // In pre-dump mode CRIU is in a loop and waits for
  1444. // the final DUMP command.
  1445. // The current runc pre-dump approach, however, is
  1446. // start criu in PRE_DUMP once for a single pre-dump
  1447. // and not the whole series of pre-dump, pre-dump, ...m, dump
  1448. // If we got the message CriuReqType_PRE_DUMP it means
  1449. // CRIU was successful and we need to forcefully stop CRIU
  1450. if !st.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP {
  1451. return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath)
  1452. }
  1453. return nil
  1454. }
  1455. // block any external network activity
  1456. func lockNetwork(config *configs.Config) error {
  1457. for _, config := range config.Networks {
  1458. strategy, err := getStrategy(config.Type)
  1459. if err != nil {
  1460. return err
  1461. }
  1462. if err := strategy.detach(config); err != nil {
  1463. return err
  1464. }
  1465. }
  1466. return nil
  1467. }
  1468. func unlockNetwork(config *configs.Config) error {
  1469. for _, config := range config.Networks {
  1470. strategy, err := getStrategy(config.Type)
  1471. if err != nil {
  1472. return err
  1473. }
  1474. if err = strategy.attach(config); err != nil {
  1475. return err
  1476. }
  1477. }
  1478. return nil
  1479. }
  1480. func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string, oob []byte) error {
  1481. notify := resp.GetNotify()
  1482. if notify == nil {
  1483. return fmt.Errorf("invalid response: %s", resp.String())
  1484. }
  1485. logrus.Debugf("notify: %s\n", notify.GetScript())
  1486. switch {
  1487. case notify.GetScript() == "post-dump":
  1488. f, err := os.Create(filepath.Join(c.root, "checkpoint"))
  1489. if err != nil {
  1490. return err
  1491. }
  1492. f.Close()
  1493. case notify.GetScript() == "network-unlock":
  1494. if err := unlockNetwork(c.config); err != nil {
  1495. return err
  1496. }
  1497. case notify.GetScript() == "network-lock":
  1498. if err := lockNetwork(c.config); err != nil {
  1499. return err
  1500. }
  1501. case notify.GetScript() == "setup-namespaces":
  1502. if c.config.Hooks != nil {
  1503. s, err := c.currentOCIState()
  1504. if err != nil {
  1505. return nil
  1506. }
  1507. s.Pid = int(notify.GetPid())
  1508. for i, hook := range c.config.Hooks.Prestart {
  1509. if err := hook.Run(s); err != nil {
  1510. return newSystemErrorWithCausef(err, "running prestart hook %d", i)
  1511. }
  1512. }
  1513. }
  1514. case notify.GetScript() == "post-restore":
  1515. pid := notify.GetPid()
  1516. r, err := newRestoredProcess(int(pid), fds)
  1517. if err != nil {
  1518. return err
  1519. }
  1520. process.ops = r
  1521. if err := c.state.transition(&restoredState{
  1522. imageDir: opts.ImagesDirectory,
  1523. c: c,
  1524. }); err != nil {
  1525. return err
  1526. }
  1527. // create a timestamp indicating when the restored checkpoint was started
  1528. c.created = time.Now().UTC()
  1529. if _, err := c.updateState(r); err != nil {
  1530. return err
  1531. }
  1532. if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
  1533. if !os.IsNotExist(err) {
  1534. logrus.Error(err)
  1535. }
  1536. }
  1537. case notify.GetScript() == "orphan-pts-master":
  1538. scm, err := unix.ParseSocketControlMessage(oob)
  1539. if err != nil {
  1540. return err
  1541. }
  1542. fds, err := unix.ParseUnixRights(&scm[0])
  1543. if err != nil {
  1544. return err
  1545. }
  1546. master := os.NewFile(uintptr(fds[0]), "orphan-pts-master")
  1547. defer master.Close()
  1548. // While we can access console.master, using the API is a good idea.
  1549. if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil {
  1550. return err
  1551. }
  1552. }
  1553. return nil
  1554. }
  1555. func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
  1556. if process != nil {
  1557. c.initProcess = process
  1558. }
  1559. state, err := c.currentState()
  1560. if err != nil {
  1561. return nil, err
  1562. }
  1563. err = c.saveState(state)
  1564. if err != nil {
  1565. return nil, err
  1566. }
  1567. return state, nil
  1568. }
  1569. func (c *linuxContainer) saveState(s *State) error {
  1570. f, err := os.Create(filepath.Join(c.root, stateFilename))
  1571. if err != nil {
  1572. return err
  1573. }
  1574. defer f.Close()
  1575. return utils.WriteJSON(f, s)
  1576. }
  1577. func (c *linuxContainer) deleteState() error {
  1578. return os.Remove(filepath.Join(c.root, stateFilename))
  1579. }
  1580. func (c *linuxContainer) currentStatus() (Status, error) {
  1581. if err := c.refreshState(); err != nil {
  1582. return -1, err
  1583. }
  1584. return c.state.status(), nil
  1585. }
  1586. // refreshState needs to be called to verify that the current state on the
  1587. // container is what is true. Because consumers of libcontainer can use it
  1588. // out of process we need to verify the container's status based on runtime
  1589. // information and not rely on our in process info.
  1590. func (c *linuxContainer) refreshState() error {
  1591. paused, err := c.isPaused()
  1592. if err != nil {
  1593. return err
  1594. }
  1595. if paused {
  1596. return c.state.transition(&pausedState{c: c})
  1597. }
  1598. t, err := c.runType()
  1599. if err != nil {
  1600. return err
  1601. }
  1602. switch t {
  1603. case Created:
  1604. return c.state.transition(&createdState{c: c})
  1605. case Running:
  1606. return c.state.transition(&runningState{c: c})
  1607. }
  1608. return c.state.transition(&stoppedState{c: c})
  1609. }
  1610. func (c *linuxContainer) runType() (Status, error) {
  1611. if c.initProcess == nil {
  1612. return Stopped, nil
  1613. }
  1614. pid := c.initProcess.pid()
  1615. stat, err := system.Stat(pid)
  1616. if err != nil {
  1617. return Stopped, nil
  1618. }
  1619. if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead {
  1620. return Stopped, nil
  1621. }
  1622. // We'll create exec fifo and blocking on it after container is created,
  1623. // and delete it after start container.
  1624. if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
  1625. return Created, nil
  1626. }
  1627. return Running, nil
  1628. }
  1629. func (c *linuxContainer) isPaused() (bool, error) {
  1630. fcg := c.cgroupManager.GetPaths()["freezer"]
  1631. if fcg == "" {
  1632. // A container doesn't have a freezer cgroup
  1633. return false, nil
  1634. }
  1635. pausedState := "FROZEN"
  1636. filename := "freezer.state"
  1637. if cgroups.IsCgroup2UnifiedMode() {
  1638. filename = "cgroup.freeze"
  1639. pausedState = "1"
  1640. }
  1641. data, err := ioutil.ReadFile(filepath.Join(fcg, filename))
  1642. if err != nil {
  1643. // If freezer cgroup is not mounted, the container would just be not paused.
  1644. if os.IsNotExist(err) || err == syscall.ENODEV {
  1645. return false, nil
  1646. }
  1647. return false, newSystemErrorWithCause(err, "checking if container is paused")
  1648. }
  1649. return bytes.Equal(bytes.TrimSpace(data), []byte(pausedState)), nil
  1650. }
  1651. func (c *linuxContainer) currentState() (*State, error) {
  1652. var (
  1653. startTime uint64
  1654. externalDescriptors []string
  1655. pid = -1
  1656. )
  1657. if c.initProcess != nil {
  1658. pid = c.initProcess.pid()
  1659. startTime, _ = c.initProcess.startTime()
  1660. externalDescriptors = c.initProcess.externalDescriptors()
  1661. }
  1662. intelRdtPath, err := intelrdt.GetIntelRdtPath(c.ID())
  1663. if err != nil {
  1664. intelRdtPath = ""
  1665. }
  1666. state := &State{
  1667. BaseState: BaseState{
  1668. ID: c.ID(),
  1669. Config: *c.config,
  1670. InitProcessPid: pid,
  1671. InitProcessStartTime: startTime,
  1672. Created: c.created,
  1673. },
  1674. Rootless: c.config.RootlessEUID && c.config.RootlessCgroups,
  1675. CgroupPaths: c.cgroupManager.GetPaths(),
  1676. IntelRdtPath: intelRdtPath,
  1677. NamespacePaths: make(map[configs.NamespaceType]string),
  1678. ExternalDescriptors: externalDescriptors,
  1679. }
  1680. if pid > 0 {
  1681. for _, ns := range c.config.Namespaces {
  1682. state.NamespacePaths[ns.Type] = ns.GetPath(pid)
  1683. }
  1684. for _, nsType := range configs.NamespaceTypes() {
  1685. if !configs.IsNamespaceSupported(nsType) {
  1686. continue
  1687. }
  1688. if _, ok := state.NamespacePaths[nsType]; !ok {
  1689. ns := configs.Namespace{Type: nsType}
  1690. state.NamespacePaths[ns.Type] = ns.GetPath(pid)
  1691. }
  1692. }
  1693. }
  1694. return state, nil
  1695. }
  1696. func (c *linuxContainer) currentOCIState() (*specs.State, error) {
  1697. bundle, annotations := utils.Annotations(c.config.Labels)
  1698. state := &specs.State{
  1699. Version: specs.Version,
  1700. ID: c.ID(),
  1701. Bundle: bundle,
  1702. Annotations: annotations,
  1703. }
  1704. status, err := c.currentStatus()
  1705. if err != nil {
  1706. return nil, err
  1707. }
  1708. state.Status = status.String()
  1709. if status != Stopped {
  1710. if c.initProcess != nil {
  1711. state.Pid = c.initProcess.pid()
  1712. }
  1713. }
  1714. return state, nil
  1715. }
  1716. // orderNamespacePaths sorts namespace paths into a list of paths that we
  1717. // can setns in order.
  1718. func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
  1719. paths := []string{}
  1720. for _, ns := range configs.NamespaceTypes() {
  1721. // Remove namespaces that we don't need to join.
  1722. if !c.config.Namespaces.Contains(ns) {
  1723. continue
  1724. }
  1725. if p, ok := namespaces[ns]; ok && p != "" {
  1726. // check if the requested namespace is supported
  1727. if !configs.IsNamespaceSupported(ns) {
  1728. return nil, newSystemError(fmt.Errorf("namespace %s is not supported", ns))
  1729. }
  1730. // only set to join this namespace if it exists
  1731. if _, err := os.Lstat(p); err != nil {
  1732. return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
  1733. }
  1734. // do not allow namespace path with comma as we use it to separate
  1735. // the namespace paths
  1736. if strings.ContainsRune(p, ',') {
  1737. return nil, newSystemError(fmt.Errorf("invalid path %s", p))
  1738. }
  1739. paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p))
  1740. }
  1741. }
  1742. return paths, nil
  1743. }
  1744. func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
  1745. data := bytes.NewBuffer(nil)
  1746. for _, im := range idMap {
  1747. line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
  1748. if _, err := data.WriteString(line); err != nil {
  1749. return nil, err
  1750. }
  1751. }
  1752. return data.Bytes(), nil
  1753. }
  1754. // bootstrapData encodes the necessary data in netlink binary format
  1755. // as a io.Reader.
  1756. // Consumer can write the data to a bootstrap program
  1757. // such as one that uses nsenter package to bootstrap the container's
  1758. // init process correctly, i.e. with correct namespaces, uid/gid
  1759. // mapping etc.
  1760. func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
  1761. // create the netlink message
  1762. r := nl.NewNetlinkRequest(int(InitMsg), 0)
  1763. // write cloneFlags
  1764. r.AddData(&Int32msg{
  1765. Type: CloneFlagsAttr,
  1766. Value: uint32(cloneFlags),
  1767. })
  1768. // write custom namespace paths
  1769. if len(nsMaps) > 0 {
  1770. nsPaths, err := c.orderNamespacePaths(nsMaps)
  1771. if err != nil {
  1772. return nil, err
  1773. }
  1774. r.AddData(&Bytemsg{
  1775. Type: NsPathsAttr,
  1776. Value: []byte(strings.Join(nsPaths, ",")),
  1777. })
  1778. }
  1779. // write namespace paths only when we are not joining an existing user ns
  1780. _, joinExistingUser := nsMaps[configs.NEWUSER]
  1781. if !joinExistingUser {
  1782. // write uid mappings
  1783. if len(c.config.UidMappings) > 0 {
  1784. if c.config.RootlessEUID && c.newuidmapPath != "" {
  1785. r.AddData(&Bytemsg{
  1786. Type: UidmapPathAttr,
  1787. Value: []byte(c.newuidmapPath),
  1788. })
  1789. }
  1790. b, err := encodeIDMapping(c.config.UidMappings)
  1791. if err != nil {
  1792. return nil, err
  1793. }
  1794. r.AddData(&Bytemsg{
  1795. Type: UidmapAttr,
  1796. Value: b,
  1797. })
  1798. }
  1799. // write gid mappings
  1800. if len(c.config.GidMappings) > 0 {
  1801. b, err := encodeIDMapping(c.config.GidMappings)
  1802. if err != nil {
  1803. return nil, err
  1804. }
  1805. r.AddData(&Bytemsg{
  1806. Type: GidmapAttr,
  1807. Value: b,
  1808. })
  1809. if c.config.RootlessEUID && c.newgidmapPath != "" {
  1810. r.AddData(&Bytemsg{
  1811. Type: GidmapPathAttr,
  1812. Value: []byte(c.newgidmapPath),
  1813. })
  1814. }
  1815. if requiresRootOrMappingTool(c.config) {
  1816. r.AddData(&Boolmsg{
  1817. Type: SetgroupAttr,
  1818. Value: true,
  1819. })
  1820. }
  1821. }
  1822. }
  1823. if c.config.OomScoreAdj != nil {
  1824. // write oom_score_adj
  1825. r.AddData(&Bytemsg{
  1826. Type: OomScoreAdjAttr,
  1827. Value: []byte(fmt.Sprintf("%d", *c.config.OomScoreAdj)),
  1828. })
  1829. }
  1830. // write rootless
  1831. r.AddData(&Boolmsg{
  1832. Type: RootlessEUIDAttr,
  1833. Value: c.config.RootlessEUID,
  1834. })
  1835. return bytes.NewReader(r.Serialize()), nil
  1836. }
  1837. // ignoreTerminateErrors returns nil if the given err matches an error known
  1838. // to indicate that the terminate occurred successfully or err was nil, otherwise
  1839. // err is returned unaltered.
  1840. func ignoreTerminateErrors(err error) error {
  1841. if err == nil {
  1842. return nil
  1843. }
  1844. s := err.Error()
  1845. switch {
  1846. case strings.Contains(s, "process already finished"), strings.Contains(s, "Wait was already called"):
  1847. return nil
  1848. }
  1849. return err
  1850. }
  1851. func requiresRootOrMappingTool(c *configs.Config) bool {
  1852. gidMap := []configs.IDMap{
  1853. {ContainerID: 0, HostID: os.Getegid(), Size: 1},
  1854. }
  1855. return !reflect.DeepEqual(c.GidMappings, gidMap)
  1856. }