process_linux.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599
  1. // +build linux
  2. package libcontainer
  3. import (
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "os"
  9. "os/exec"
  10. "path/filepath"
  11. "strconv"
  12. "syscall" // only for Signal
  13. "github.com/opencontainers/runc/libcontainer/cgroups"
  14. "github.com/opencontainers/runc/libcontainer/configs"
  15. "github.com/opencontainers/runc/libcontainer/intelrdt"
  16. "github.com/opencontainers/runc/libcontainer/logs"
  17. "github.com/opencontainers/runc/libcontainer/system"
  18. "github.com/opencontainers/runc/libcontainer/utils"
  19. "golang.org/x/sys/unix"
  20. )
  // createCgroupns is the synchronisation byte the parent writes to the init
  // pipe to request cgroup namespace setup. It must stay equal to the
  // "CREATECGROUPNS" constant defined in nsexec.c.
  const createCgroupns = 0x80
  // parentProcess is the set of operations the parent side performs on a
  // process it has launched.
  type parentProcess interface {
  	// pid reports the pid of the running process.
  	pid() int
  	// start begins execution of the process.
  	start() error
  	// terminate sends SIGKILL to the process and waits for it to exit.
  	terminate() error
  	// wait blocks until the process exits and returns its final state.
  	wait() (*os.ProcessState, error)
  	// startTime reports the start time of the process.
  	startTime() (uint64, error)
  	// signal delivers the given signal to the process.
  	signal(os.Signal) error
  	// externalDescriptors returns the descriptor names previously stored
  	// with setExternalDescriptors.
  	externalDescriptors() []string
  	// setExternalDescriptors stores the descriptor names for later retrieval.
  	setExternalDescriptors(fds []string)
  	// forwardChildLogs starts forwarding the child's log output.
  	forwardChildLogs()
  }
  // filePair holds the two ends of a connected pair of files: the end kept by
  // the parent and the end handed to the child.
  type filePair struct {
  	parent, child *os.File
  }
  44. type setnsProcess struct {
  45. cmd *exec.Cmd
  46. messageSockPair filePair
  47. logFilePair filePair
  48. cgroupPaths map[string]string
  49. rootlessCgroups bool
  50. intelRdtPath string
  51. config *initConfig
  52. fds []string
  53. process *Process
  54. bootstrapData io.Reader
  55. }
  56. func (p *setnsProcess) startTime() (uint64, error) {
  57. stat, err := system.Stat(p.pid())
  58. return stat.StartTime, err
  59. }
  60. func (p *setnsProcess) signal(sig os.Signal) error {
  61. s, ok := sig.(syscall.Signal)
  62. if !ok {
  63. return errors.New("os: unsupported signal type")
  64. }
  65. return unix.Kill(p.pid(), s)
  66. }
  67. func (p *setnsProcess) start() (err error) {
  68. defer p.messageSockPair.parent.Close()
  69. err = p.cmd.Start()
  70. // close the write-side of the pipes (controlled by child)
  71. p.messageSockPair.child.Close()
  72. p.logFilePair.child.Close()
  73. if err != nil {
  74. return newSystemErrorWithCause(err, "starting setns process")
  75. }
  76. if p.bootstrapData != nil {
  77. if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
  78. return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
  79. }
  80. }
  81. if err = p.execSetns(); err != nil {
  82. return newSystemErrorWithCause(err, "executing setns process")
  83. }
  84. if len(p.cgroupPaths) > 0 {
  85. if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups {
  86. return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
  87. }
  88. }
  89. if p.intelRdtPath != "" {
  90. // if Intel RDT "resource control" filesystem path exists
  91. _, err := os.Stat(p.intelRdtPath)
  92. if err == nil {
  93. if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil {
  94. return newSystemErrorWithCausef(err, "adding pid %d to Intel RDT resource control filesystem", p.pid())
  95. }
  96. }
  97. }
  98. // set rlimits, this has to be done here because we lose permissions
  99. // to raise the limits once we enter a user-namespace
  100. if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
  101. return newSystemErrorWithCause(err, "setting rlimits for process")
  102. }
  103. if err := utils.WriteJSON(p.messageSockPair.parent, p.config); err != nil {
  104. return newSystemErrorWithCause(err, "writing config to pipe")
  105. }
  106. ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
  107. switch sync.Type {
  108. case procReady:
  109. // This shouldn't happen.
  110. panic("unexpected procReady in setns")
  111. case procHooks:
  112. // This shouldn't happen.
  113. panic("unexpected procHooks in setns")
  114. default:
  115. return newSystemError(fmt.Errorf("invalid JSON payload from child"))
  116. }
  117. })
  118. if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
  119. return newSystemErrorWithCause(err, "calling shutdown on init pipe")
  120. }
  121. // Must be done after Shutdown so the child will exit and we can wait for it.
  122. if ierr != nil {
  123. p.wait()
  124. return ierr
  125. }
  126. return nil
  127. }
  128. // execSetns runs the process that executes C code to perform the setns calls
  129. // because setns support requires the C process to fork off a child and perform the setns
  130. // before the go runtime boots, we wait on the process to die and receive the child's pid
  131. // over the provided pipe.
  132. func (p *setnsProcess) execSetns() error {
  133. status, err := p.cmd.Process.Wait()
  134. if err != nil {
  135. p.cmd.Wait()
  136. return newSystemErrorWithCause(err, "waiting on setns process to finish")
  137. }
  138. if !status.Success() {
  139. p.cmd.Wait()
  140. return newSystemError(&exec.ExitError{ProcessState: status})
  141. }
  142. var pid *pid
  143. if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil {
  144. p.cmd.Wait()
  145. return newSystemErrorWithCause(err, "reading pid from init pipe")
  146. }
  147. // Clean up the zombie parent process
  148. // On Unix systems FindProcess always succeeds.
  149. firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
  150. // Ignore the error in case the child has already been reaped for any reason
  151. _, _ = firstChildProcess.Wait()
  152. process, err := os.FindProcess(pid.Pid)
  153. if err != nil {
  154. return err
  155. }
  156. p.cmd.Process = process
  157. p.process.ops = p
  158. return nil
  159. }
  160. // terminate sends a SIGKILL to the forked process for the setns routine then waits to
  161. // avoid the process becoming a zombie.
  162. func (p *setnsProcess) terminate() error {
  163. if p.cmd.Process == nil {
  164. return nil
  165. }
  166. err := p.cmd.Process.Kill()
  167. if _, werr := p.wait(); err == nil {
  168. err = werr
  169. }
  170. return err
  171. }
  172. func (p *setnsProcess) wait() (*os.ProcessState, error) {
  173. err := p.cmd.Wait()
  174. // Return actual ProcessState even on Wait error
  175. return p.cmd.ProcessState, err
  176. }
  177. func (p *setnsProcess) pid() int {
  178. return p.cmd.Process.Pid
  179. }
  180. func (p *setnsProcess) externalDescriptors() []string {
  181. return p.fds
  182. }
  183. func (p *setnsProcess) setExternalDescriptors(newFds []string) {
  184. p.fds = newFds
  185. }
  186. func (p *setnsProcess) forwardChildLogs() {
  187. go logs.ForwardLogs(p.logFilePair.parent)
  188. }
  189. type initProcess struct {
  190. cmd *exec.Cmd
  191. messageSockPair filePair
  192. logFilePair filePair
  193. config *initConfig
  194. manager cgroups.Manager
  195. intelRdtManager intelrdt.Manager
  196. container *linuxContainer
  197. fds []string
  198. process *Process
  199. bootstrapData io.Reader
  200. sharePidns bool
  201. }
  202. func (p *initProcess) pid() int {
  203. return p.cmd.Process.Pid
  204. }
  205. func (p *initProcess) externalDescriptors() []string {
  206. return p.fds
  207. }
  208. // getChildPid receives the final child's pid over the provided pipe.
  209. func (p *initProcess) getChildPid() (int, error) {
  210. var pid pid
  211. if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil {
  212. p.cmd.Wait()
  213. return -1, err
  214. }
  215. // Clean up the zombie parent process
  216. // On Unix systems FindProcess always succeeds.
  217. firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
  218. // Ignore the error in case the child has already been reaped for any reason
  219. _, _ = firstChildProcess.Wait()
  220. return pid.Pid, nil
  221. }
  222. func (p *initProcess) waitForChildExit(childPid int) error {
  223. status, err := p.cmd.Process.Wait()
  224. if err != nil {
  225. p.cmd.Wait()
  226. return err
  227. }
  228. if !status.Success() {
  229. p.cmd.Wait()
  230. return &exec.ExitError{ProcessState: status}
  231. }
  232. process, err := os.FindProcess(childPid)
  233. if err != nil {
  234. return err
  235. }
  236. p.cmd.Process = process
  237. p.process.ops = p
  238. return nil
  239. }
  240. func (p *initProcess) start() error {
  241. defer p.messageSockPair.parent.Close()
  242. err := p.cmd.Start()
  243. p.process.ops = p
  244. // close the write-side of the pipes (controlled by child)
  245. p.messageSockPair.child.Close()
  246. p.logFilePair.child.Close()
  247. if err != nil {
  248. p.process.ops = nil
  249. return newSystemErrorWithCause(err, "starting init process command")
  250. }
  251. // Do this before syncing with child so that no children can escape the
  252. // cgroup. We don't need to worry about not doing this and not being root
  253. // because we'd be using the rootless cgroup manager in that case.
  254. if err := p.manager.Apply(p.pid()); err != nil {
  255. return newSystemErrorWithCause(err, "applying cgroup configuration for process")
  256. }
  257. if p.intelRdtManager != nil {
  258. if err := p.intelRdtManager.Apply(p.pid()); err != nil {
  259. return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
  260. }
  261. }
  262. defer func() {
  263. if err != nil {
  264. // TODO: should not be the responsibility to call here
  265. p.manager.Destroy()
  266. if p.intelRdtManager != nil {
  267. p.intelRdtManager.Destroy()
  268. }
  269. }
  270. }()
  271. if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
  272. return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
  273. }
  274. childPid, err := p.getChildPid()
  275. if err != nil {
  276. return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
  277. }
  278. // Save the standard descriptor names before the container process
  279. // can potentially move them (e.g., via dup2()). If we don't do this now,
  280. // we won't know at checkpoint time which file descriptor to look up.
  281. fds, err := getPipeFds(childPid)
  282. if err != nil {
  283. return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid)
  284. }
  285. p.setExternalDescriptors(fds)
  286. // Do this before syncing with child so that no children
  287. // can escape the cgroup
  288. if err := p.manager.Apply(childPid); err != nil {
  289. return newSystemErrorWithCause(err, "applying cgroup configuration for process")
  290. }
  291. if p.intelRdtManager != nil {
  292. if err := p.intelRdtManager.Apply(childPid); err != nil {
  293. return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
  294. }
  295. }
  296. // Now it's time to setup cgroup namesapce
  297. if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
  298. if _, err := p.messageSockPair.parent.Write([]byte{createCgroupns}); err != nil {
  299. return newSystemErrorWithCause(err, "sending synchronization value to init process")
  300. }
  301. }
  302. // Wait for our first child to exit
  303. if err := p.waitForChildExit(childPid); err != nil {
  304. return newSystemErrorWithCause(err, "waiting for our first child to exit")
  305. }
  306. defer func() {
  307. if err != nil {
  308. // TODO: should not be the responsibility to call here
  309. p.manager.Destroy()
  310. if p.intelRdtManager != nil {
  311. p.intelRdtManager.Destroy()
  312. }
  313. }
  314. }()
  315. if err := p.createNetworkInterfaces(); err != nil {
  316. return newSystemErrorWithCause(err, "creating network interfaces")
  317. }
  318. if err := p.sendConfig(); err != nil {
  319. return newSystemErrorWithCause(err, "sending config to init process")
  320. }
  321. var (
  322. sentRun bool
  323. sentResume bool
  324. )
  325. ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
  326. switch sync.Type {
  327. case procReady:
  328. // set rlimits, this has to be done here because we lose permissions
  329. // to raise the limits once we enter a user-namespace
  330. if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
  331. return newSystemErrorWithCause(err, "setting rlimits for ready process")
  332. }
  333. // call prestart hooks
  334. if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
  335. // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
  336. if err := p.manager.Set(p.config.Config); err != nil {
  337. return newSystemErrorWithCause(err, "setting cgroup config for ready process")
  338. }
  339. if p.intelRdtManager != nil {
  340. if err := p.intelRdtManager.Set(p.config.Config); err != nil {
  341. return newSystemErrorWithCause(err, "setting Intel RDT config for ready process")
  342. }
  343. }
  344. if p.config.Config.Hooks != nil {
  345. s, err := p.container.currentOCIState()
  346. if err != nil {
  347. return err
  348. }
  349. // initProcessStartTime hasn't been set yet.
  350. s.Pid = p.cmd.Process.Pid
  351. s.Status = "creating"
  352. for i, hook := range p.config.Config.Hooks.Prestart {
  353. if err := hook.Run(s); err != nil {
  354. return newSystemErrorWithCausef(err, "running prestart hook %d", i)
  355. }
  356. }
  357. }
  358. }
  359. // Sync with child.
  360. if err := writeSync(p.messageSockPair.parent, procRun); err != nil {
  361. return newSystemErrorWithCause(err, "writing syncT 'run'")
  362. }
  363. sentRun = true
  364. case procHooks:
  365. // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
  366. if err := p.manager.Set(p.config.Config); err != nil {
  367. return newSystemErrorWithCause(err, "setting cgroup config for procHooks process")
  368. }
  369. if p.intelRdtManager != nil {
  370. if err := p.intelRdtManager.Set(p.config.Config); err != nil {
  371. return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process")
  372. }
  373. }
  374. if p.config.Config.Hooks != nil {
  375. s, err := p.container.currentOCIState()
  376. if err != nil {
  377. return err
  378. }
  379. // initProcessStartTime hasn't been set yet.
  380. s.Pid = p.cmd.Process.Pid
  381. s.Status = "creating"
  382. for i, hook := range p.config.Config.Hooks.Prestart {
  383. if err := hook.Run(s); err != nil {
  384. return newSystemErrorWithCausef(err, "running prestart hook %d", i)
  385. }
  386. }
  387. }
  388. // Sync with child.
  389. if err := writeSync(p.messageSockPair.parent, procResume); err != nil {
  390. return newSystemErrorWithCause(err, "writing syncT 'resume'")
  391. }
  392. sentResume = true
  393. default:
  394. return newSystemError(fmt.Errorf("invalid JSON payload from child"))
  395. }
  396. return nil
  397. })
  398. if !sentRun {
  399. return newSystemErrorWithCause(ierr, "container init")
  400. }
  401. if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
  402. return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
  403. }
  404. if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
  405. return newSystemErrorWithCause(err, "shutting down init pipe")
  406. }
  407. // Must be done after Shutdown so the child will exit and we can wait for it.
  408. if ierr != nil {
  409. p.wait()
  410. return ierr
  411. }
  412. return nil
  413. }
  414. func (p *initProcess) wait() (*os.ProcessState, error) {
  415. err := p.cmd.Wait()
  416. if err != nil {
  417. return p.cmd.ProcessState, err
  418. }
  419. // we should kill all processes in cgroup when init is died if we use host PID namespace
  420. if p.sharePidns {
  421. signalAllProcesses(p.manager, unix.SIGKILL)
  422. }
  423. return p.cmd.ProcessState, nil
  424. }
  425. func (p *initProcess) terminate() error {
  426. if p.cmd.Process == nil {
  427. return nil
  428. }
  429. err := p.cmd.Process.Kill()
  430. if _, werr := p.wait(); err == nil {
  431. err = werr
  432. }
  433. return err
  434. }
  435. func (p *initProcess) startTime() (uint64, error) {
  436. stat, err := system.Stat(p.pid())
  437. return stat.StartTime, err
  438. }
  439. func (p *initProcess) sendConfig() error {
  440. // send the config to the container's init process, we don't use JSON Encode
  441. // here because there might be a problem in JSON decoder in some cases, see:
  442. // https://github.com/docker/docker/issues/14203#issuecomment-174177790
  443. return utils.WriteJSON(p.messageSockPair.parent, p.config)
  444. }
  445. func (p *initProcess) createNetworkInterfaces() error {
  446. for _, config := range p.config.Config.Networks {
  447. strategy, err := getStrategy(config.Type)
  448. if err != nil {
  449. return err
  450. }
  451. n := &network{
  452. Network: *config,
  453. }
  454. if err := strategy.create(n, p.pid()); err != nil {
  455. return err
  456. }
  457. p.config.Networks = append(p.config.Networks, n)
  458. }
  459. return nil
  460. }
  461. func (p *initProcess) signal(sig os.Signal) error {
  462. s, ok := sig.(syscall.Signal)
  463. if !ok {
  464. return errors.New("os: unsupported signal type")
  465. }
  466. return unix.Kill(p.pid(), s)
  467. }
  468. func (p *initProcess) setExternalDescriptors(newFds []string) {
  469. p.fds = newFds
  470. }
  471. func (p *initProcess) forwardChildLogs() {
  472. go logs.ForwardLogs(p.logFilePair.parent)
  473. }
  // getPipeFds resolves the symlink targets of descriptors 0, 1 and 2 under
  // /proc/<pid>/fd and returns them in a three-element slice indexed by
  // descriptor number.
  //
  // A permission error on a descriptor leaves its entry empty and is not
  // reported. Any other error stops the scan and is returned together with
  // the entries collected so far.
  func getPipeFds(pid int) ([]string, error) {
  	const stdioCount = 3
  	fdDir := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
  	targets := make([]string, stdioCount)
  	for fd := range targets {
  		// XXX: This breaks if the path is not a valid symlink (which can
  		// happen in certain particularly unlucky mount namespace setups).
  		link, err := os.Readlink(filepath.Join(fdDir, strconv.Itoa(fd)))
  		switch {
  		case err == nil:
  			targets[fd] = link
  		case os.IsPermission(err):
  			// Rootless containers and other non-dumpable processes deny
  			// access; there is not much we can do, so skip this fd.
  		default:
  			return targets, err
  		}
  	}
  	return targets, nil
  }
  495. // InitializeIO creates pipes for use with the process's stdio and returns the
  496. // opposite side for each. Do not use this if you want to have a pseudoterminal
  497. // set up for you by libcontainer (TODO: fix that too).
  498. // TODO: This is mostly unnecessary, and should be handled by clients.
  499. func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
  500. var fds []uintptr
  501. i = &IO{}
  502. // cleanup in case of an error
  503. defer func() {
  504. if err != nil {
  505. for _, fd := range fds {
  506. unix.Close(int(fd))
  507. }
  508. }
  509. }()
  510. // STDIN
  511. r, w, err := os.Pipe()
  512. if err != nil {
  513. return nil, err
  514. }
  515. fds = append(fds, r.Fd(), w.Fd())
  516. p.Stdin, i.Stdin = r, w
  517. // STDOUT
  518. if r, w, err = os.Pipe(); err != nil {
  519. return nil, err
  520. }
  521. fds = append(fds, r.Fd(), w.Fd())
  522. p.Stdout, i.Stdout = w, r
  523. // STDERR
  524. if r, w, err = os.Pipe(); err != nil {
  525. return nil, err
  526. }
  527. fds = append(fds, r.Fd(), w.Fd())
  528. p.Stderr, i.Stderr = w, r
  529. // change ownership of the pipes in case we are in a user namespace
  530. for _, fd := range fds {
  531. if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil {
  532. return nil, err
  533. }
  534. }
  535. return i, nil
  536. }