standard_init_linux.go 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. // +build linux
  2. package libcontainer
  3. import (
  4. "fmt"
  5. "os"
  6. "os/exec"
  7. "runtime"
  8. "syscall" //only for Exec
  9. "github.com/opencontainers/runc/libcontainer/apparmor"
  10. "github.com/opencontainers/runc/libcontainer/configs"
  11. "github.com/opencontainers/runc/libcontainer/keys"
  12. "github.com/opencontainers/runc/libcontainer/seccomp"
  13. "github.com/opencontainers/runc/libcontainer/system"
  14. "github.com/opencontainers/selinux/go-selinux/label"
  15. "github.com/pkg/errors"
  16. "golang.org/x/sys/unix"
  17. )
  18. type linuxStandardInit struct {
  19. pipe *os.File
  20. consoleSocket *os.File
  21. parentPid int
  22. fifoFd int
  23. config *initConfig
  24. }
  25. func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
  26. var newperms uint32
  27. if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
  28. // With user ns we need 'other' search permissions.
  29. newperms = 0x8
  30. } else {
  31. // Without user ns we need 'UID' search permissions.
  32. newperms = 0x80000
  33. }
  34. // Create a unique per session container name that we can join in setns;
  35. // However, other containers can also join it.
  36. return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
  37. }
  38. func (l *linuxStandardInit) Init() error {
  39. runtime.LockOSThread()
  40. defer runtime.UnlockOSThread()
  41. if !l.config.Config.NoNewKeyring {
  42. if err := label.SetKeyLabel(l.config.ProcessLabel); err != nil {
  43. return err
  44. }
  45. defer label.SetKeyLabel("")
  46. ringname, keepperms, newperms := l.getSessionRingParams()
  47. // Do not inherit the parent's session keyring.
  48. if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
  49. // If keyrings aren't supported then it is likely we are on an
  50. // older kernel (or inside an LXC container). While we could bail,
  51. // the security feature we are using here is best-effort (it only
  52. // really provides marginal protection since VFS credentials are
  53. // the only significant protection of keyrings).
  54. //
  55. // TODO(cyphar): Log this so people know what's going on, once we
  56. // have proper logging in 'runc init'.
  57. if errors.Cause(err) != unix.ENOSYS {
  58. return errors.Wrap(err, "join session keyring")
  59. }
  60. } else {
  61. // Make session keyring searcheable. If we've gotten this far we
  62. // bail on any error -- we don't want to have a keyring with bad
  63. // permissions.
  64. if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
  65. return errors.Wrap(err, "mod keyring permissions")
  66. }
  67. }
  68. }
  69. if err := setupNetwork(l.config); err != nil {
  70. return err
  71. }
  72. if err := setupRoute(l.config.Config); err != nil {
  73. return err
  74. }
  75. label.Init()
  76. if err := prepareRootfs(l.pipe, l.config); err != nil {
  77. return err
  78. }
  79. // Set up the console. This has to be done *before* we finalize the rootfs,
  80. // but *after* we've given the user the chance to set up all of the mounts
  81. // they wanted.
  82. if l.config.CreateConsole {
  83. if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
  84. return err
  85. }
  86. if err := system.Setctty(); err != nil {
  87. return errors.Wrap(err, "setctty")
  88. }
  89. }
  90. // Finish the rootfs setup.
  91. if l.config.Config.Namespaces.Contains(configs.NEWNS) {
  92. if err := finalizeRootfs(l.config.Config); err != nil {
  93. return err
  94. }
  95. }
  96. if hostname := l.config.Config.Hostname; hostname != "" {
  97. if err := unix.Sethostname([]byte(hostname)); err != nil {
  98. return errors.Wrap(err, "sethostname")
  99. }
  100. }
  101. if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
  102. return errors.Wrap(err, "apply apparmor profile")
  103. }
  104. for key, value := range l.config.Config.Sysctl {
  105. if err := writeSystemProperty(key, value); err != nil {
  106. return errors.Wrapf(err, "write sysctl key %s", key)
  107. }
  108. }
  109. for _, path := range l.config.Config.ReadonlyPaths {
  110. if err := readonlyPath(path); err != nil {
  111. return errors.Wrapf(err, "readonly path %s", path)
  112. }
  113. }
  114. for _, path := range l.config.Config.MaskPaths {
  115. if err := maskPath(path, l.config.Config.MountLabel); err != nil {
  116. return errors.Wrapf(err, "mask path %s", path)
  117. }
  118. }
  119. pdeath, err := system.GetParentDeathSignal()
  120. if err != nil {
  121. return errors.Wrap(err, "get pdeath signal")
  122. }
  123. if l.config.NoNewPrivileges {
  124. if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
  125. return errors.Wrap(err, "set nonewprivileges")
  126. }
  127. }
  128. // Tell our parent that we're ready to Execv. This must be done before the
  129. // Seccomp rules have been applied, because we need to be able to read and
  130. // write to a socket.
  131. if err := syncParentReady(l.pipe); err != nil {
  132. return errors.Wrap(err, "sync ready")
  133. }
  134. if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
  135. return errors.Wrap(err, "set process label")
  136. }
  137. defer label.SetProcessLabel("")
  138. // Without NoNewPrivileges seccomp is a privileged operation, so we need to
  139. // do this before dropping capabilities; otherwise do it as late as possible
  140. // just before execve so as few syscalls take place after it as possible.
  141. if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
  142. if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
  143. return err
  144. }
  145. }
  146. if err := finalizeNamespace(l.config); err != nil {
  147. return err
  148. }
  149. // finalizeNamespace can change user/group which clears the parent death
  150. // signal, so we restore it here.
  151. if err := pdeath.Restore(); err != nil {
  152. return errors.Wrap(err, "restore pdeath signal")
  153. }
  154. // Compare the parent from the initial start of the init process and make
  155. // sure that it did not change. if the parent changes that means it died
  156. // and we were reparented to something else so we should just kill ourself
  157. // and not cause problems for someone else.
  158. if unix.Getppid() != l.parentPid {
  159. return unix.Kill(unix.Getpid(), unix.SIGKILL)
  160. }
  161. // Check for the arg before waiting to make sure it exists and it is
  162. // returned as a create time error.
  163. name, err := exec.LookPath(l.config.Args[0])
  164. if err != nil {
  165. return err
  166. }
  167. // Close the pipe to signal that we have completed our init.
  168. l.pipe.Close()
  169. // Wait for the FIFO to be opened on the other side before exec-ing the
  170. // user process. We open it through /proc/self/fd/$fd, because the fd that
  171. // was given to us was an O_PATH fd to the fifo itself. Linux allows us to
  172. // re-open an O_PATH fd through /proc.
  173. fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
  174. if err != nil {
  175. return newSystemErrorWithCause(err, "open exec fifo")
  176. }
  177. if _, err := unix.Write(fd, []byte("0")); err != nil {
  178. return newSystemErrorWithCause(err, "write 0 exec fifo")
  179. }
  180. // Close the O_PATH fifofd fd before exec because the kernel resets
  181. // dumpable in the wrong order. This has been fixed in newer kernels, but
  182. // we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
  183. // N.B. the core issue itself (passing dirfds to the host filesystem) has
  184. // since been resolved.
  185. // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
  186. unix.Close(l.fifoFd)
  187. // Set seccomp as close to execve as possible, so as few syscalls take
  188. // place afterward (reducing the amount of syscalls that users need to
  189. // enable in their seccomp profiles).
  190. if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
  191. if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
  192. return newSystemErrorWithCause(err, "init seccomp")
  193. }
  194. }
  195. if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
  196. return newSystemErrorWithCause(err, "exec user process")
  197. }
  198. return nil
  199. }