// Source file: standard_init_linux.go (7.2 KB)

  1. // +build linux
  2. package libcontainer
  3. import (
  4. "fmt"
  5. "os"
  6. "os/exec"
  7. "runtime"
  8. "syscall" //only for Exec
  9. "github.com/opencontainers/runc/libcontainer/apparmor"
  10. "github.com/opencontainers/runc/libcontainer/configs"
  11. "github.com/opencontainers/runc/libcontainer/keys"
  12. "github.com/opencontainers/runc/libcontainer/seccomp"
  13. "github.com/opencontainers/runc/libcontainer/system"
  14. "github.com/opencontainers/selinux/go-selinux/label"
  15. "github.com/pkg/errors"
  16. "golang.org/x/sys/unix"
  17. )
  18. type linuxStandardInit struct {
  19. pipe *os.File
  20. consoleSocket *os.File
  21. parentPid int
  22. fifoFd int
  23. config *initConfig
  24. }
  25. func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
  26. var newperms uint32
  27. if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
  28. // With user ns we need 'other' search permissions.
  29. newperms = 0x8
  30. } else {
  31. // Without user ns we need 'UID' search permissions.
  32. newperms = 0x80000
  33. }
  34. // Create a unique per session container name that we can join in setns;
  35. // However, other containers can also join it.
  36. return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
  37. }
  38. func (l *linuxStandardInit) Init() error {
  39. runtime.LockOSThread()
  40. defer runtime.UnlockOSThread()
  41. if !l.config.Config.NoNewKeyring {
  42. ringname, keepperms, newperms := l.getSessionRingParams()
  43. // Do not inherit the parent's session keyring.
  44. if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
  45. // If keyrings aren't supported then it is likely we are on an
  46. // older kernel (or inside an LXC container). While we could bail,
  47. // the security feature we are using here is best-effort (it only
  48. // really provides marginal protection since VFS credentials are
  49. // the only significant protection of keyrings).
  50. //
  51. // TODO(cyphar): Log this so people know what's going on, once we
  52. // have proper logging in 'runc init'.
  53. if errors.Cause(err) != unix.ENOSYS {
  54. return errors.Wrap(err, "join session keyring")
  55. }
  56. } else {
  57. // Make session keyring searcheable. If we've gotten this far we
  58. // bail on any error -- we don't want to have a keyring with bad
  59. // permissions.
  60. if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
  61. return errors.Wrap(err, "mod keyring permissions")
  62. }
  63. }
  64. }
  65. if err := setupNetwork(l.config); err != nil {
  66. return err
  67. }
  68. if err := setupRoute(l.config.Config); err != nil {
  69. return err
  70. }
  71. label.Init()
  72. if err := prepareRootfs(l.pipe, l.config); err != nil {
  73. return err
  74. }
  75. // Set up the console. This has to be done *before* we finalize the rootfs,
  76. // but *after* we've given the user the chance to set up all of the mounts
  77. // they wanted.
  78. if l.config.CreateConsole {
  79. if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
  80. return err
  81. }
  82. if err := system.Setctty(); err != nil {
  83. return errors.Wrap(err, "setctty")
  84. }
  85. }
  86. // Finish the rootfs setup.
  87. if l.config.Config.Namespaces.Contains(configs.NEWNS) {
  88. if err := finalizeRootfs(l.config.Config); err != nil {
  89. return err
  90. }
  91. }
  92. if hostname := l.config.Config.Hostname; hostname != "" {
  93. if err := unix.Sethostname([]byte(hostname)); err != nil {
  94. return errors.Wrap(err, "sethostname")
  95. }
  96. }
  97. if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
  98. return errors.Wrap(err, "apply apparmor profile")
  99. }
  100. for key, value := range l.config.Config.Sysctl {
  101. if err := writeSystemProperty(key, value); err != nil {
  102. return errors.Wrapf(err, "write sysctl key %s", key)
  103. }
  104. }
  105. for _, path := range l.config.Config.ReadonlyPaths {
  106. if err := readonlyPath(path); err != nil {
  107. return errors.Wrapf(err, "readonly path %s", path)
  108. }
  109. }
  110. for _, path := range l.config.Config.MaskPaths {
  111. if err := maskPath(path, l.config.Config.MountLabel); err != nil {
  112. return errors.Wrapf(err, "mask path %s", path)
  113. }
  114. }
  115. pdeath, err := system.GetParentDeathSignal()
  116. if err != nil {
  117. return errors.Wrap(err, "get pdeath signal")
  118. }
  119. if l.config.NoNewPrivileges {
  120. if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
  121. return errors.Wrap(err, "set nonewprivileges")
  122. }
  123. }
  124. // Tell our parent that we're ready to Execv. This must be done before the
  125. // Seccomp rules have been applied, because we need to be able to read and
  126. // write to a socket.
  127. if err := syncParentReady(l.pipe); err != nil {
  128. return errors.Wrap(err, "sync ready")
  129. }
  130. if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
  131. return errors.Wrap(err, "set process label")
  132. }
  133. defer label.SetProcessLabel("")
  134. // Without NoNewPrivileges seccomp is a privileged operation, so we need to
  135. // do this before dropping capabilities; otherwise do it as late as possible
  136. // just before execve so as few syscalls take place after it as possible.
  137. if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
  138. if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
  139. return err
  140. }
  141. }
  142. if err := finalizeNamespace(l.config); err != nil {
  143. return err
  144. }
  145. // finalizeNamespace can change user/group which clears the parent death
  146. // signal, so we restore it here.
  147. if err := pdeath.Restore(); err != nil {
  148. return errors.Wrap(err, "restore pdeath signal")
  149. }
  150. // Compare the parent from the initial start of the init process and make
  151. // sure that it did not change. if the parent changes that means it died
  152. // and we were reparented to something else so we should just kill ourself
  153. // and not cause problems for someone else.
  154. if unix.Getppid() != l.parentPid {
  155. return unix.Kill(unix.Getpid(), unix.SIGKILL)
  156. }
  157. // Check for the arg before waiting to make sure it exists and it is
  158. // returned as a create time error.
  159. name, err := exec.LookPath(l.config.Args[0])
  160. if err != nil {
  161. return err
  162. }
  163. // Close the pipe to signal that we have completed our init.
  164. l.pipe.Close()
  165. // Wait for the FIFO to be opened on the other side before exec-ing the
  166. // user process. We open it through /proc/self/fd/$fd, because the fd that
  167. // was given to us was an O_PATH fd to the fifo itself. Linux allows us to
  168. // re-open an O_PATH fd through /proc.
  169. fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
  170. if err != nil {
  171. return newSystemErrorWithCause(err, "open exec fifo")
  172. }
  173. if _, err := unix.Write(fd, []byte("0")); err != nil {
  174. return newSystemErrorWithCause(err, "write 0 exec fifo")
  175. }
  176. // Close the O_PATH fifofd fd before exec because the kernel resets
  177. // dumpable in the wrong order. This has been fixed in newer kernels, but
  178. // we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
  179. // N.B. the core issue itself (passing dirfds to the host filesystem) has
  180. // since been resolved.
  181. // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
  182. unix.Close(l.fifoFd)
  183. // Set seccomp as close to execve as possible, so as few syscalls take
  184. // place afterward (reducing the amount of syscalls that users need to
  185. // enable in their seccomp profiles).
  186. if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
  187. if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
  188. return newSystemErrorWithCause(err, "init seccomp")
  189. }
  190. }
  191. if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
  192. return newSystemErrorWithCause(err, "exec user process")
  193. }
  194. return nil
  195. }