init_linux.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537
  1. // +build linux
  2. package libcontainer
  3. import (
  4. "encoding/json"
  5. "fmt"
  6. "io"
  7. "io/ioutil"
  8. "net"
  9. "os"
  10. "strings"
  11. "syscall" // only for Errno
  12. "unsafe"
  13. "golang.org/x/sys/unix"
  14. "github.com/containerd/console"
  15. "github.com/opencontainers/runc/libcontainer/cgroups"
  16. "github.com/opencontainers/runc/libcontainer/configs"
  17. "github.com/opencontainers/runc/libcontainer/system"
  18. "github.com/opencontainers/runc/libcontainer/user"
  19. "github.com/opencontainers/runc/libcontainer/utils"
  20. "github.com/pkg/errors"
  21. "github.com/sirupsen/logrus"
  22. "github.com/vishvananda/netlink"
  23. )
  24. type initType string
  25. const (
  26. initSetns initType = "setns"
  27. initStandard initType = "standard"
  28. )
  29. type pid struct {
  30. Pid int `json:"pid"`
  31. PidFirstChild int `json:"pid_first"`
  32. }
  33. // network is an internal struct used to setup container networks.
  34. type network struct {
  35. configs.Network
  36. // TempVethPeerName is a unique temporary veth peer name that was placed into
  37. // the container's namespace.
  38. TempVethPeerName string `json:"temp_veth_peer_name"`
  39. }
  40. // initConfig is used for transferring parameters from Exec() to Init()
  41. type initConfig struct {
  42. Args []string `json:"args"`
  43. Env []string `json:"env"`
  44. Cwd string `json:"cwd"`
  45. Capabilities *configs.Capabilities `json:"capabilities"`
  46. ProcessLabel string `json:"process_label"`
  47. AppArmorProfile string `json:"apparmor_profile"`
  48. NoNewPrivileges bool `json:"no_new_privileges"`
  49. User string `json:"user"`
  50. AdditionalGroups []string `json:"additional_groups"`
  51. Config *configs.Config `json:"config"`
  52. Networks []*network `json:"network"`
  53. PassedFilesCount int `json:"passed_files_count"`
  54. ContainerId string `json:"containerid"`
  55. Rlimits []configs.Rlimit `json:"rlimits"`
  56. CreateConsole bool `json:"create_console"`
  57. ConsoleWidth uint16 `json:"console_width"`
  58. ConsoleHeight uint16 `json:"console_height"`
  59. RootlessEUID bool `json:"rootless_euid,omitempty"`
  60. RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
  61. }
  62. type initer interface {
  63. Init() error
  64. }
  65. func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) {
  66. var config *initConfig
  67. if err := json.NewDecoder(pipe).Decode(&config); err != nil {
  68. return nil, err
  69. }
  70. if err := populateProcessEnvironment(config.Env); err != nil {
  71. return nil, err
  72. }
  73. switch t {
  74. case initSetns:
  75. return &linuxSetnsInit{
  76. pipe: pipe,
  77. consoleSocket: consoleSocket,
  78. config: config,
  79. }, nil
  80. case initStandard:
  81. return &linuxStandardInit{
  82. pipe: pipe,
  83. consoleSocket: consoleSocket,
  84. parentPid: unix.Getppid(),
  85. config: config,
  86. fifoFd: fifoFd,
  87. }, nil
  88. }
  89. return nil, fmt.Errorf("unknown init type %q", t)
  90. }
  91. // populateProcessEnvironment loads the provided environment variables into the
  92. // current processes's environment.
  93. func populateProcessEnvironment(env []string) error {
  94. for _, pair := range env {
  95. p := strings.SplitN(pair, "=", 2)
  96. if len(p) < 2 {
  97. return fmt.Errorf("invalid environment '%v'", pair)
  98. }
  99. if err := os.Setenv(p[0], p[1]); err != nil {
  100. return err
  101. }
  102. }
  103. return nil
  104. }
  105. // finalizeNamespace drops the caps, sets the correct user
  106. // and working dir, and closes any leaked file descriptors
  107. // before executing the command inside the namespace
  108. func finalizeNamespace(config *initConfig) error {
  109. // Ensure that all unwanted fds we may have accidentally
  110. // inherited are marked close-on-exec so they stay out of the
  111. // container
  112. if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
  113. return errors.Wrap(err, "close exec fds")
  114. }
  115. capabilities := &configs.Capabilities{}
  116. if config.Capabilities != nil {
  117. capabilities = config.Capabilities
  118. } else if config.Config.Capabilities != nil {
  119. capabilities = config.Config.Capabilities
  120. }
  121. w, err := newContainerCapList(capabilities)
  122. if err != nil {
  123. return err
  124. }
  125. // drop capabilities in bounding set before changing user
  126. if err := w.ApplyBoundingSet(); err != nil {
  127. return errors.Wrap(err, "apply bounding set")
  128. }
  129. // preserve existing capabilities while we change users
  130. if err := system.SetKeepCaps(); err != nil {
  131. return errors.Wrap(err, "set keep caps")
  132. }
  133. if err := setupUser(config); err != nil {
  134. return errors.Wrap(err, "setup user")
  135. }
  136. if err := system.ClearKeepCaps(); err != nil {
  137. return errors.Wrap(err, "clear keep caps")
  138. }
  139. if err := w.ApplyCaps(); err != nil {
  140. return errors.Wrap(err, "apply caps")
  141. }
  142. if config.Cwd != "" {
  143. if err := unix.Chdir(config.Cwd); err != nil {
  144. return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err)
  145. }
  146. }
  147. return nil
  148. }
  149. // setupConsole sets up the console from inside the container, and sends the
  150. // master pty fd to the config.Pipe (using cmsg). This is done to ensure that
  151. // consoles are scoped to a container properly (see runc#814 and the many
  152. // issues related to that). This has to be run *after* we've pivoted to the new
  153. // rootfs (and the users' configuration is entirely set up).
  154. func setupConsole(socket *os.File, config *initConfig, mount bool) error {
  155. defer socket.Close()
  156. // At this point, /dev/ptmx points to something that we would expect. We
  157. // used to change the owner of the slave path, but since the /dev/pts mount
  158. // can have gid=X set (at the users' option). So touching the owner of the
  159. // slave PTY is not necessary, as the kernel will handle that for us. Note
  160. // however, that setupUser (specifically fixStdioPermissions) *will* change
  161. // the UID owner of the console to be the user the process will run as (so
  162. // they can actually control their console).
  163. pty, slavePath, err := console.NewPty()
  164. if err != nil {
  165. return err
  166. }
  167. if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 {
  168. err = pty.Resize(console.WinSize{
  169. Height: config.ConsoleHeight,
  170. Width: config.ConsoleWidth,
  171. })
  172. if err != nil {
  173. return err
  174. }
  175. }
  176. // After we return from here, we don't need the console anymore.
  177. defer pty.Close()
  178. // Mount the console inside our rootfs.
  179. if mount {
  180. if err := mountConsole(slavePath); err != nil {
  181. return err
  182. }
  183. }
  184. // While we can access console.master, using the API is a good idea.
  185. if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil {
  186. return err
  187. }
  188. // Now, dup over all the things.
  189. return dupStdio(slavePath)
  190. }
  191. // syncParentReady sends to the given pipe a JSON payload which indicates that
  192. // the init is ready to Exec the child process. It then waits for the parent to
  193. // indicate that it is cleared to Exec.
  194. func syncParentReady(pipe io.ReadWriter) error {
  195. // Tell parent.
  196. if err := writeSync(pipe, procReady); err != nil {
  197. return err
  198. }
  199. // Wait for parent to give the all-clear.
  200. return readSync(pipe, procRun)
  201. }
  202. // syncParentHooks sends to the given pipe a JSON payload which indicates that
  203. // the parent should execute pre-start hooks. It then waits for the parent to
  204. // indicate that it is cleared to resume.
  205. func syncParentHooks(pipe io.ReadWriter) error {
  206. // Tell parent.
  207. if err := writeSync(pipe, procHooks); err != nil {
  208. return err
  209. }
  210. // Wait for parent to give the all-clear.
  211. return readSync(pipe, procResume)
  212. }
  213. // setupUser changes the groups, gid, and uid for the user inside the container
  214. func setupUser(config *initConfig) error {
  215. // Set up defaults.
  216. defaultExecUser := user.ExecUser{
  217. Uid: 0,
  218. Gid: 0,
  219. Home: "/",
  220. }
  221. passwdPath, err := user.GetPasswdPath()
  222. if err != nil {
  223. return err
  224. }
  225. groupPath, err := user.GetGroupPath()
  226. if err != nil {
  227. return err
  228. }
  229. execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
  230. if err != nil {
  231. return err
  232. }
  233. var addGroups []int
  234. if len(config.AdditionalGroups) > 0 {
  235. addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath)
  236. if err != nil {
  237. return err
  238. }
  239. }
  240. // Rather than just erroring out later in setuid(2) and setgid(2), check
  241. // that the user is mapped here.
  242. if _, err := config.Config.HostUID(execUser.Uid); err != nil {
  243. return fmt.Errorf("cannot set uid to unmapped user in user namespace")
  244. }
  245. if _, err := config.Config.HostGID(execUser.Gid); err != nil {
  246. return fmt.Errorf("cannot set gid to unmapped user in user namespace")
  247. }
  248. if config.RootlessEUID {
  249. // We cannot set any additional groups in a rootless container and thus
  250. // we bail if the user asked us to do so. TODO: We currently can't do
  251. // this check earlier, but if libcontainer.Process.User was typesafe
  252. // this might work.
  253. if len(addGroups) > 0 {
  254. return fmt.Errorf("cannot set any additional groups in a rootless container")
  255. }
  256. }
  257. // Before we change to the container's user make sure that the processes
  258. // STDIO is correctly owned by the user that we are switching to.
  259. if err := fixStdioPermissions(config, execUser); err != nil {
  260. return err
  261. }
  262. setgroups, err := ioutil.ReadFile("/proc/self/setgroups")
  263. if err != nil && !os.IsNotExist(err) {
  264. return err
  265. }
  266. // This isn't allowed in an unprivileged user namespace since Linux 3.19.
  267. // There's nothing we can do about /etc/group entries, so we silently
  268. // ignore setting groups here (since the user didn't explicitly ask us to
  269. // set the group).
  270. allowSupGroups := !config.RootlessEUID && strings.TrimSpace(string(setgroups)) != "deny"
  271. if allowSupGroups {
  272. suppGroups := append(execUser.Sgids, addGroups...)
  273. if err := unix.Setgroups(suppGroups); err != nil {
  274. return err
  275. }
  276. }
  277. if err := system.Setgid(execUser.Gid); err != nil {
  278. return err
  279. }
  280. if err := system.Setuid(execUser.Uid); err != nil {
  281. return err
  282. }
  283. // if we didn't get HOME already, set it based on the user's HOME
  284. if envHome := os.Getenv("HOME"); envHome == "" {
  285. if err := os.Setenv("HOME", execUser.Home); err != nil {
  286. return err
  287. }
  288. }
  289. return nil
  290. }
  291. // fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
  292. // The ownership needs to match because it is created outside of the container and needs to be
  293. // localized.
  294. func fixStdioPermissions(config *initConfig, u *user.ExecUser) error {
  295. var null unix.Stat_t
  296. if err := unix.Stat("/dev/null", &null); err != nil {
  297. return err
  298. }
  299. for _, fd := range []uintptr{
  300. os.Stdin.Fd(),
  301. os.Stderr.Fd(),
  302. os.Stdout.Fd(),
  303. } {
  304. var s unix.Stat_t
  305. if err := unix.Fstat(int(fd), &s); err != nil {
  306. return err
  307. }
  308. // Skip chown of /dev/null if it was used as one of the STDIO fds.
  309. if s.Rdev == null.Rdev {
  310. continue
  311. }
  312. // We only change the uid owner (as it is possible for the mount to
  313. // prefer a different gid, and there's no reason for us to change it).
  314. // The reason why we don't just leave the default uid=X mount setup is
  315. // that users expect to be able to actually use their console. Without
  316. // this code, you couldn't effectively run as a non-root user inside a
  317. // container and also have a console set up.
  318. if err := unix.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil {
  319. // If we've hit an EINVAL then s.Gid isn't mapped in the user
  320. // namespace. If we've hit an EPERM then the inode's current owner
  321. // is not mapped in our user namespace (in particular,
  322. // privileged_wrt_inode_uidgid() has failed). In either case, we
  323. // are in a configuration where it's better for us to just not
  324. // touch the stdio rather than bail at this point.
  325. if err == unix.EINVAL || err == unix.EPERM {
  326. continue
  327. }
  328. return err
  329. }
  330. }
  331. return nil
  332. }
  333. // setupNetwork sets up and initializes any network interface inside the container.
  334. func setupNetwork(config *initConfig) error {
  335. for _, config := range config.Networks {
  336. strategy, err := getStrategy(config.Type)
  337. if err != nil {
  338. return err
  339. }
  340. if err := strategy.initialize(config); err != nil {
  341. return err
  342. }
  343. }
  344. return nil
  345. }
  346. func setupRoute(config *configs.Config) error {
  347. for _, config := range config.Routes {
  348. _, dst, err := net.ParseCIDR(config.Destination)
  349. if err != nil {
  350. return err
  351. }
  352. src := net.ParseIP(config.Source)
  353. if src == nil {
  354. return fmt.Errorf("Invalid source for route: %s", config.Source)
  355. }
  356. gw := net.ParseIP(config.Gateway)
  357. if gw == nil {
  358. return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
  359. }
  360. l, err := netlink.LinkByName(config.InterfaceName)
  361. if err != nil {
  362. return err
  363. }
  364. route := &netlink.Route{
  365. Scope: netlink.SCOPE_UNIVERSE,
  366. Dst: dst,
  367. Src: src,
  368. Gw: gw,
  369. LinkIndex: l.Attrs().Index,
  370. }
  371. if err := netlink.RouteAdd(route); err != nil {
  372. return err
  373. }
  374. }
  375. return nil
  376. }
  377. func setupRlimits(limits []configs.Rlimit, pid int) error {
  378. for _, rlimit := range limits {
  379. if err := system.Prlimit(pid, rlimit.Type, unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil {
  380. return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
  381. }
  382. }
  383. return nil
  384. }
  385. const _P_PID = 1
  386. type siginfo struct {
  387. si_signo int32
  388. si_errno int32
  389. si_code int32
  390. // below here is a union; si_pid is the only field we use
  391. si_pid int32
  392. // Pad to 128 bytes as detailed in blockUntilWaitable
  393. pad [96]byte
  394. }
  395. // isWaitable returns true if the process has exited false otherwise.
  396. // Its based off blockUntilWaitable in src/os/wait_waitid.go
  397. func isWaitable(pid int) (bool, error) {
  398. si := &siginfo{}
  399. _, _, e := unix.Syscall6(unix.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), unix.WEXITED|unix.WNOWAIT|unix.WNOHANG, 0, 0)
  400. if e != 0 {
  401. return false, os.NewSyscallError("waitid", e)
  402. }
  403. return si.si_pid != 0, nil
  404. }
  405. // isNoChildren returns true if err represents a unix.ECHILD (formerly syscall.ECHILD) false otherwise
  406. func isNoChildren(err error) bool {
  407. switch err := err.(type) {
  408. case syscall.Errno:
  409. if err == unix.ECHILD {
  410. return true
  411. }
  412. case *os.SyscallError:
  413. if err.Err == unix.ECHILD {
  414. return true
  415. }
  416. }
  417. return false
  418. }
  419. // signalAllProcesses freezes then iterates over all the processes inside the
  420. // manager's cgroups sending the signal s to them.
  421. // If s is SIGKILL then it will wait for each process to exit.
  422. // For all other signals it will check if the process is ready to report its
  423. // exit status and only if it is will a wait be performed.
  424. func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
  425. var procs []*os.Process
  426. if err := m.Freeze(configs.Frozen); err != nil {
  427. logrus.Warn(err)
  428. }
  429. pids, err := m.GetAllPids()
  430. if err != nil {
  431. m.Freeze(configs.Thawed)
  432. return err
  433. }
  434. for _, pid := range pids {
  435. p, err := os.FindProcess(pid)
  436. if err != nil {
  437. logrus.Warn(err)
  438. continue
  439. }
  440. procs = append(procs, p)
  441. if err := p.Signal(s); err != nil {
  442. logrus.Warn(err)
  443. }
  444. }
  445. if err := m.Freeze(configs.Thawed); err != nil {
  446. logrus.Warn(err)
  447. }
  448. subreaper, err := system.GetSubreaper()
  449. if err != nil {
  450. // The error here means that PR_GET_CHILD_SUBREAPER is not
  451. // supported because this code might run on a kernel older
  452. // than 3.4. We don't want to throw an error in that case,
  453. // and we simplify things, considering there is no subreaper
  454. // set.
  455. subreaper = 0
  456. }
  457. for _, p := range procs {
  458. if s != unix.SIGKILL {
  459. if ok, err := isWaitable(p.Pid); err != nil {
  460. if !isNoChildren(err) {
  461. logrus.Warn("signalAllProcesses: ", p.Pid, err)
  462. }
  463. continue
  464. } else if !ok {
  465. // Not ready to report so don't wait
  466. continue
  467. }
  468. }
  469. // In case a subreaper has been setup, this code must not
  470. // wait for the process. Otherwise, we cannot be sure the
  471. // current process will be reaped by the subreaper, while
  472. // the subreaper might be waiting for this process in order
  473. // to retrieve its exit code.
  474. if subreaper == 0 {
  475. if _, err := p.Wait(); err != nil {
  476. if !isNoChildren(err) {
  477. logrus.Warn("wait: ", err)
  478. }
  479. }
  480. }
  481. }
  482. return nil
  483. }