process_linux.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549
  1. // +build linux
  2. package libcontainer
  3. import (
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "os"
  9. "os/exec"
  10. "path/filepath"
  11. "strconv"
  12. "syscall" // only for Signal
  13. "github.com/opencontainers/runc/libcontainer/cgroups"
  14. "github.com/opencontainers/runc/libcontainer/configs"
  15. "github.com/opencontainers/runc/libcontainer/intelrdt"
  16. "github.com/opencontainers/runc/libcontainer/system"
  17. "github.com/opencontainers/runc/libcontainer/utils"
  18. "golang.org/x/sys/unix"
  19. )
  20. type parentProcess interface {
  21. // pid returns the pid for the running process.
  22. pid() int
  23. // start starts the process execution.
  24. start() error
  25. // send a SIGKILL to the process and wait for the exit.
  26. terminate() error
  27. // wait waits on the process returning the process state.
  28. wait() (*os.ProcessState, error)
  29. // startTime returns the process start time.
  30. startTime() (uint64, error)
  31. signal(os.Signal) error
  32. externalDescriptors() []string
  33. setExternalDescriptors(fds []string)
  34. }
  35. type setnsProcess struct {
  36. cmd *exec.Cmd
  37. parentPipe *os.File
  38. childPipe *os.File
  39. cgroupPaths map[string]string
  40. rootlessCgroups bool
  41. intelRdtPath string
  42. config *initConfig
  43. fds []string
  44. process *Process
  45. bootstrapData io.Reader
  46. }
  47. func (p *setnsProcess) startTime() (uint64, error) {
  48. stat, err := system.Stat(p.pid())
  49. return stat.StartTime, err
  50. }
  51. func (p *setnsProcess) signal(sig os.Signal) error {
  52. s, ok := sig.(syscall.Signal)
  53. if !ok {
  54. return errors.New("os: unsupported signal type")
  55. }
  56. return unix.Kill(p.pid(), s)
  57. }
  58. func (p *setnsProcess) start() (err error) {
  59. defer p.parentPipe.Close()
  60. err = p.cmd.Start()
  61. p.childPipe.Close()
  62. if err != nil {
  63. return newSystemErrorWithCause(err, "starting setns process")
  64. }
  65. if p.bootstrapData != nil {
  66. if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
  67. return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
  68. }
  69. }
  70. if err = p.execSetns(); err != nil {
  71. return newSystemErrorWithCause(err, "executing setns process")
  72. }
  73. if len(p.cgroupPaths) > 0 {
  74. if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups {
  75. return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
  76. }
  77. }
  78. if p.intelRdtPath != "" {
  79. // if Intel RDT "resource control" filesystem path exists
  80. _, err := os.Stat(p.intelRdtPath)
  81. if err == nil {
  82. if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil {
  83. return newSystemErrorWithCausef(err, "adding pid %d to Intel RDT resource control filesystem", p.pid())
  84. }
  85. }
  86. }
  87. // set rlimits, this has to be done here because we lose permissions
  88. // to raise the limits once we enter a user-namespace
  89. if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
  90. return newSystemErrorWithCause(err, "setting rlimits for process")
  91. }
  92. if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
  93. return newSystemErrorWithCause(err, "writing config to pipe")
  94. }
  95. ierr := parseSync(p.parentPipe, func(sync *syncT) error {
  96. switch sync.Type {
  97. case procReady:
  98. // This shouldn't happen.
  99. panic("unexpected procReady in setns")
  100. case procHooks:
  101. // This shouldn't happen.
  102. panic("unexpected procHooks in setns")
  103. default:
  104. return newSystemError(fmt.Errorf("invalid JSON payload from child"))
  105. }
  106. })
  107. if err := unix.Shutdown(int(p.parentPipe.Fd()), unix.SHUT_WR); err != nil {
  108. return newSystemErrorWithCause(err, "calling shutdown on init pipe")
  109. }
  110. // Must be done after Shutdown so the child will exit and we can wait for it.
  111. if ierr != nil {
  112. p.wait()
  113. return ierr
  114. }
  115. return nil
  116. }
  117. // execSetns runs the process that executes C code to perform the setns calls
  118. // because setns support requires the C process to fork off a child and perform the setns
  119. // before the go runtime boots, we wait on the process to die and receive the child's pid
  120. // over the provided pipe.
  121. func (p *setnsProcess) execSetns() error {
  122. status, err := p.cmd.Process.Wait()
  123. if err != nil {
  124. p.cmd.Wait()
  125. return newSystemErrorWithCause(err, "waiting on setns process to finish")
  126. }
  127. if !status.Success() {
  128. p.cmd.Wait()
  129. return newSystemError(&exec.ExitError{ProcessState: status})
  130. }
  131. var pid *pid
  132. if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
  133. p.cmd.Wait()
  134. return newSystemErrorWithCause(err, "reading pid from init pipe")
  135. }
  136. // Clean up the zombie parent process
  137. firstChildProcess, err := os.FindProcess(pid.PidFirstChild)
  138. if err != nil {
  139. return err
  140. }
  141. // Ignore the error in case the child has already been reaped for any reason
  142. _, _ = firstChildProcess.Wait()
  143. process, err := os.FindProcess(pid.Pid)
  144. if err != nil {
  145. return err
  146. }
  147. p.cmd.Process = process
  148. p.process.ops = p
  149. return nil
  150. }
  151. // terminate sends a SIGKILL to the forked process for the setns routine then waits to
  152. // avoid the process becoming a zombie.
  153. func (p *setnsProcess) terminate() error {
  154. if p.cmd.Process == nil {
  155. return nil
  156. }
  157. err := p.cmd.Process.Kill()
  158. if _, werr := p.wait(); err == nil {
  159. err = werr
  160. }
  161. return err
  162. }
  163. func (p *setnsProcess) wait() (*os.ProcessState, error) {
  164. err := p.cmd.Wait()
  165. // Return actual ProcessState even on Wait error
  166. return p.cmd.ProcessState, err
  167. }
  168. func (p *setnsProcess) pid() int {
  169. return p.cmd.Process.Pid
  170. }
  171. func (p *setnsProcess) externalDescriptors() []string {
  172. return p.fds
  173. }
  174. func (p *setnsProcess) setExternalDescriptors(newFds []string) {
  175. p.fds = newFds
  176. }
  177. type initProcess struct {
  178. cmd *exec.Cmd
  179. parentPipe *os.File
  180. childPipe *os.File
  181. config *initConfig
  182. manager cgroups.Manager
  183. intelRdtManager intelrdt.Manager
  184. container *linuxContainer
  185. fds []string
  186. process *Process
  187. bootstrapData io.Reader
  188. sharePidns bool
  189. }
  190. func (p *initProcess) pid() int {
  191. return p.cmd.Process.Pid
  192. }
  193. func (p *initProcess) externalDescriptors() []string {
  194. return p.fds
  195. }
  196. // execSetns runs the process that executes C code to perform the setns calls
  197. // because setns support requires the C process to fork off a child and perform the setns
  198. // before the go runtime boots, we wait on the process to die and receive the child's pid
  199. // over the provided pipe.
  200. // This is called by initProcess.start function
  201. func (p *initProcess) execSetns() error {
  202. status, err := p.cmd.Process.Wait()
  203. if err != nil {
  204. p.cmd.Wait()
  205. return err
  206. }
  207. if !status.Success() {
  208. p.cmd.Wait()
  209. return &exec.ExitError{ProcessState: status}
  210. }
  211. var pid *pid
  212. if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
  213. p.cmd.Wait()
  214. return err
  215. }
  216. // Clean up the zombie parent process
  217. firstChildProcess, err := os.FindProcess(pid.PidFirstChild)
  218. if err != nil {
  219. return err
  220. }
  221. // Ignore the error in case the child has already been reaped for any reason
  222. _, _ = firstChildProcess.Wait()
  223. process, err := os.FindProcess(pid.Pid)
  224. if err != nil {
  225. return err
  226. }
  227. p.cmd.Process = process
  228. p.process.ops = p
  229. return nil
  230. }
  231. func (p *initProcess) start() error {
  232. defer p.parentPipe.Close()
  233. err := p.cmd.Start()
  234. p.process.ops = p
  235. p.childPipe.Close()
  236. if err != nil {
  237. p.process.ops = nil
  238. return newSystemErrorWithCause(err, "starting init process command")
  239. }
  240. // Do this before syncing with child so that no children can escape the
  241. // cgroup. We don't need to worry about not doing this and not being root
  242. // because we'd be using the rootless cgroup manager in that case.
  243. if err := p.manager.Apply(p.pid()); err != nil {
  244. return newSystemErrorWithCause(err, "applying cgroup configuration for process")
  245. }
  246. if p.intelRdtManager != nil {
  247. if err := p.intelRdtManager.Apply(p.pid()); err != nil {
  248. return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
  249. }
  250. }
  251. defer func() {
  252. if err != nil {
  253. // TODO: should not be the responsibility to call here
  254. p.manager.Destroy()
  255. if p.intelRdtManager != nil {
  256. p.intelRdtManager.Destroy()
  257. }
  258. }
  259. }()
  260. if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
  261. return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
  262. }
  263. if err := p.execSetns(); err != nil {
  264. return newSystemErrorWithCause(err, "running exec setns process for init")
  265. }
  266. // Save the standard descriptor names before the container process
  267. // can potentially move them (e.g., via dup2()). If we don't do this now,
  268. // we won't know at checkpoint time which file descriptor to look up.
  269. fds, err := getPipeFds(p.pid())
  270. if err != nil {
  271. return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
  272. }
  273. p.setExternalDescriptors(fds)
  274. if err := p.createNetworkInterfaces(); err != nil {
  275. return newSystemErrorWithCause(err, "creating network interfaces")
  276. }
  277. if err := p.sendConfig(); err != nil {
  278. return newSystemErrorWithCause(err, "sending config to init process")
  279. }
  280. var (
  281. sentRun bool
  282. sentResume bool
  283. )
  284. ierr := parseSync(p.parentPipe, func(sync *syncT) error {
  285. switch sync.Type {
  286. case procReady:
  287. // set rlimits, this has to be done here because we lose permissions
  288. // to raise the limits once we enter a user-namespace
  289. if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
  290. return newSystemErrorWithCause(err, "setting rlimits for ready process")
  291. }
  292. // call prestart hooks
  293. if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
  294. // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
  295. if err := p.manager.Set(p.config.Config); err != nil {
  296. return newSystemErrorWithCause(err, "setting cgroup config for ready process")
  297. }
  298. if p.intelRdtManager != nil {
  299. if err := p.intelRdtManager.Set(p.config.Config); err != nil {
  300. return newSystemErrorWithCause(err, "setting Intel RDT config for ready process")
  301. }
  302. }
  303. if p.config.Config.Hooks != nil {
  304. bundle, annotations := utils.Annotations(p.container.config.Labels)
  305. s := configs.HookState{
  306. Version: p.container.config.Version,
  307. ID: p.container.id,
  308. Pid: p.pid(),
  309. Bundle: bundle,
  310. Annotations: annotations,
  311. }
  312. for i, hook := range p.config.Config.Hooks.Prestart {
  313. if err := hook.Run(s); err != nil {
  314. return newSystemErrorWithCausef(err, "running prestart hook %d", i)
  315. }
  316. }
  317. }
  318. }
  319. // Sync with child.
  320. if err := writeSync(p.parentPipe, procRun); err != nil {
  321. return newSystemErrorWithCause(err, "writing syncT 'run'")
  322. }
  323. sentRun = true
  324. case procHooks:
  325. // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
  326. if err := p.manager.Set(p.config.Config); err != nil {
  327. return newSystemErrorWithCause(err, "setting cgroup config for procHooks process")
  328. }
  329. if p.intelRdtManager != nil {
  330. if err := p.intelRdtManager.Set(p.config.Config); err != nil {
  331. return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process")
  332. }
  333. }
  334. if p.config.Config.Hooks != nil {
  335. bundle, annotations := utils.Annotations(p.container.config.Labels)
  336. s := configs.HookState{
  337. Version: p.container.config.Version,
  338. ID: p.container.id,
  339. Pid: p.pid(),
  340. Bundle: bundle,
  341. Annotations: annotations,
  342. }
  343. for i, hook := range p.config.Config.Hooks.Prestart {
  344. if err := hook.Run(s); err != nil {
  345. return newSystemErrorWithCausef(err, "running prestart hook %d", i)
  346. }
  347. }
  348. }
  349. // Sync with child.
  350. if err := writeSync(p.parentPipe, procResume); err != nil {
  351. return newSystemErrorWithCause(err, "writing syncT 'resume'")
  352. }
  353. sentResume = true
  354. default:
  355. return newSystemError(fmt.Errorf("invalid JSON payload from child"))
  356. }
  357. return nil
  358. })
  359. if !sentRun {
  360. return newSystemErrorWithCause(ierr, "container init")
  361. }
  362. if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
  363. return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
  364. }
  365. if err := unix.Shutdown(int(p.parentPipe.Fd()), unix.SHUT_WR); err != nil {
  366. return newSystemErrorWithCause(err, "shutting down init pipe")
  367. }
  368. // Must be done after Shutdown so the child will exit and we can wait for it.
  369. if ierr != nil {
  370. p.wait()
  371. return ierr
  372. }
  373. return nil
  374. }
  375. func (p *initProcess) wait() (*os.ProcessState, error) {
  376. err := p.cmd.Wait()
  377. if err != nil {
  378. return p.cmd.ProcessState, err
  379. }
  380. // we should kill all processes in cgroup when init is died if we use host PID namespace
  381. if p.sharePidns {
  382. signalAllProcesses(p.manager, unix.SIGKILL)
  383. }
  384. return p.cmd.ProcessState, nil
  385. }
  386. func (p *initProcess) terminate() error {
  387. if p.cmd.Process == nil {
  388. return nil
  389. }
  390. err := p.cmd.Process.Kill()
  391. if _, werr := p.wait(); err == nil {
  392. err = werr
  393. }
  394. return err
  395. }
  396. func (p *initProcess) startTime() (uint64, error) {
  397. stat, err := system.Stat(p.pid())
  398. return stat.StartTime, err
  399. }
  400. func (p *initProcess) sendConfig() error {
  401. // send the config to the container's init process, we don't use JSON Encode
  402. // here because there might be a problem in JSON decoder in some cases, see:
  403. // https://github.com/docker/docker/issues/14203#issuecomment-174177790
  404. return utils.WriteJSON(p.parentPipe, p.config)
  405. }
  406. func (p *initProcess) createNetworkInterfaces() error {
  407. for _, config := range p.config.Config.Networks {
  408. strategy, err := getStrategy(config.Type)
  409. if err != nil {
  410. return err
  411. }
  412. n := &network{
  413. Network: *config,
  414. }
  415. if err := strategy.create(n, p.pid()); err != nil {
  416. return err
  417. }
  418. p.config.Networks = append(p.config.Networks, n)
  419. }
  420. return nil
  421. }
  422. func (p *initProcess) signal(sig os.Signal) error {
  423. s, ok := sig.(syscall.Signal)
  424. if !ok {
  425. return errors.New("os: unsupported signal type")
  426. }
  427. return unix.Kill(p.pid(), s)
  428. }
  429. func (p *initProcess) setExternalDescriptors(newFds []string) {
  430. p.fds = newFds
  431. }
  432. func getPipeFds(pid int) ([]string, error) {
  433. fds := make([]string, 3)
  434. dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
  435. for i := 0; i < 3; i++ {
  436. // XXX: This breaks if the path is not a valid symlink (which can
  437. // happen in certain particularly unlucky mount namespace setups).
  438. f := filepath.Join(dirPath, strconv.Itoa(i))
  439. target, err := os.Readlink(f)
  440. if err != nil {
  441. // Ignore permission errors, for rootless containers and other
  442. // non-dumpable processes. if we can't get the fd for a particular
  443. // file, there's not much we can do.
  444. if os.IsPermission(err) {
  445. continue
  446. }
  447. return fds, err
  448. }
  449. fds[i] = target
  450. }
  451. return fds, nil
  452. }
  453. // InitializeIO creates pipes for use with the process's stdio and returns the
  454. // opposite side for each. Do not use this if you want to have a pseudoterminal
  455. // set up for you by libcontainer (TODO: fix that too).
  456. // TODO: This is mostly unnecessary, and should be handled by clients.
  457. func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
  458. var fds []uintptr
  459. i = &IO{}
  460. // cleanup in case of an error
  461. defer func() {
  462. if err != nil {
  463. for _, fd := range fds {
  464. unix.Close(int(fd))
  465. }
  466. }
  467. }()
  468. // STDIN
  469. r, w, err := os.Pipe()
  470. if err != nil {
  471. return nil, err
  472. }
  473. fds = append(fds, r.Fd(), w.Fd())
  474. p.Stdin, i.Stdin = r, w
  475. // STDOUT
  476. if r, w, err = os.Pipe(); err != nil {
  477. return nil, err
  478. }
  479. fds = append(fds, r.Fd(), w.Fd())
  480. p.Stdout, i.Stdout = w, r
  481. // STDERR
  482. if r, w, err = os.Pipe(); err != nil {
  483. return nil, err
  484. }
  485. fds = append(fds, r.Fd(), w.Fd())
  486. p.Stderr, i.Stderr = w, r
  487. // change ownership of the pipes in case we are in a user namespace
  488. for _, fd := range fds {
  489. if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil {
  490. return nil, err
  491. }
  492. }
  493. return i, nil
  494. }