12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898 |
- // +build linux
- package libcontainer
- import (
- "bytes"
- "encoding/json"
- "errors"
- "fmt"
- "io"
- "io/ioutil"
- "net"
- "os"
- "os/exec"
- "path/filepath"
- "reflect"
- "strings"
- "sync"
- "syscall" // only for SysProcAttr and Signal
- "time"
- "github.com/opencontainers/runc/libcontainer/cgroups"
- "github.com/opencontainers/runc/libcontainer/configs"
- "github.com/opencontainers/runc/libcontainer/criurpc"
- "github.com/opencontainers/runc/libcontainer/intelrdt"
- "github.com/opencontainers/runc/libcontainer/system"
- "github.com/opencontainers/runc/libcontainer/utils"
- "github.com/golang/protobuf/proto"
- "github.com/sirupsen/logrus"
- "github.com/vishvananda/netlink/nl"
- "golang.org/x/sys/unix"
- )
- const stdioFdCount = 3
- type linuxContainer struct {
- id string
- root string
- config *configs.Config
- cgroupManager cgroups.Manager
- intelRdtManager intelrdt.Manager
- initPath string
- initArgs []string
- initProcess parentProcess
- initProcessStartTime uint64
- criuPath string
- newuidmapPath string
- newgidmapPath string
- m sync.Mutex
- criuVersion int
- state containerState
- created time.Time
- }
- // State represents a running container's state
- type State struct {
- BaseState
- // Platform specific fields below here
- // Specified if the container was started under the rootless mode.
- // Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups
- Rootless bool `json:"rootless"`
- // Path to all the cgroups setup for a container. Key is cgroup subsystem name
- // with the value as the path.
- CgroupPaths map[string]string `json:"cgroup_paths"`
- // NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
- // with the value as the path.
- NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`
- // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
- ExternalDescriptors []string `json:"external_descriptors,omitempty"`
- // Intel RDT "resource control" filesystem path
- IntelRdtPath string `json:"intel_rdt_path"`
- }
- // Container is a libcontainer container object.
- //
- // Each container is thread-safe within the same process. Since a container can
- // be destroyed by a separate process, any function may return that the container
- // was not found.
- type Container interface {
- BaseContainer
- // Methods below here are platform specific
- // Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
- //
- // errors:
- // Systemerror - System error.
- Checkpoint(criuOpts *CriuOpts) error
- // Restore restores the checkpointed container to a running state using the criu(8) utility.
- //
- // errors:
- // Systemerror - System error.
- Restore(process *Process, criuOpts *CriuOpts) error
- // If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses
- // the execution of any user processes. Asynchronously, when the container finished being paused the
- // state is changed to PAUSED.
- // If the Container state is PAUSED, do nothing.
- //
- // errors:
- // ContainerNotExists - Container no longer exists,
- // ContainerNotRunning - Container not running or created,
- // Systemerror - System error.
- Pause() error
- // If the Container state is PAUSED, resumes the execution of any user processes in the
- // Container before setting the Container state to RUNNING.
- // If the Container state is RUNNING, do nothing.
- //
- // errors:
- // ContainerNotExists - Container no longer exists,
- // ContainerNotPaused - Container is not paused,
- // Systemerror - System error.
- Resume() error
- // NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
- //
- // errors:
- // Systemerror - System error.
- NotifyOOM() (<-chan struct{}, error)
- // NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
- //
- // errors:
- // Systemerror - System error.
- NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
- }
- // ID returns the container's unique ID
- func (c *linuxContainer) ID() string {
- return c.id
- }
- // Config returns the container's configuration
- func (c *linuxContainer) Config() configs.Config {
- return *c.config
- }
- func (c *linuxContainer) Status() (Status, error) {
- c.m.Lock()
- defer c.m.Unlock()
- return c.currentStatus()
- }
- func (c *linuxContainer) State() (*State, error) {
- c.m.Lock()
- defer c.m.Unlock()
- return c.currentState()
- }
- func (c *linuxContainer) Processes() ([]int, error) {
- pids, err := c.cgroupManager.GetAllPids()
- if err != nil {
- return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
- }
- return pids, nil
- }
- func (c *linuxContainer) Stats() (*Stats, error) {
- var (
- err error
- stats = &Stats{}
- )
- if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
- return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
- }
- if c.intelRdtManager != nil {
- if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil {
- return stats, newSystemErrorWithCause(err, "getting container's Intel RDT stats")
- }
- }
- for _, iface := range c.config.Networks {
- switch iface.Type {
- case "veth":
- istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
- if err != nil {
- return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
- }
- stats.Interfaces = append(stats.Interfaces, istats)
- }
- }
- return stats, nil
- }
- func (c *linuxContainer) Set(config configs.Config) error {
- c.m.Lock()
- defer c.m.Unlock()
- status, err := c.currentStatus()
- if err != nil {
- return err
- }
- if status == Stopped {
- return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
- }
- if err := c.cgroupManager.Set(&config); err != nil {
- // Set configs back
- if err2 := c.cgroupManager.Set(c.config); err2 != nil {
- logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
- }
- return err
- }
- if c.intelRdtManager != nil {
- if err := c.intelRdtManager.Set(&config); err != nil {
- // Set configs back
- if err2 := c.intelRdtManager.Set(c.config); err2 != nil {
- logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
- }
- return err
- }
- }
- // After config setting succeed, update config and states
- c.config = &config
- _, err = c.updateState(nil)
- return err
- }
- func (c *linuxContainer) Start(process *Process) error {
- c.m.Lock()
- defer c.m.Unlock()
- if process.Init {
- if err := c.createExecFifo(); err != nil {
- return err
- }
- }
- if err := c.start(process); err != nil {
- if process.Init {
- c.deleteExecFifo()
- }
- return err
- }
- return nil
- }
- func (c *linuxContainer) Run(process *Process) error {
- if err := c.Start(process); err != nil {
- return err
- }
- if process.Init {
- return c.exec()
- }
- return nil
- }
- func (c *linuxContainer) Exec() error {
- c.m.Lock()
- defer c.m.Unlock()
- return c.exec()
- }
- func (c *linuxContainer) exec() error {
- path := filepath.Join(c.root, execFifoFilename)
- fifoOpen := make(chan struct{})
- select {
- case <-awaitProcessExit(c.initProcess.pid(), fifoOpen):
- return errors.New("container process is already dead")
- case result := <-awaitFifoOpen(path):
- close(fifoOpen)
- if result.err != nil {
- return result.err
- }
- f := result.file
- defer f.Close()
- if err := readFromExecFifo(f); err != nil {
- return err
- }
- return os.Remove(path)
- }
- }
- func readFromExecFifo(execFifo io.Reader) error {
- data, err := ioutil.ReadAll(execFifo)
- if err != nil {
- return err
- }
- if len(data) <= 0 {
- return fmt.Errorf("cannot start an already running container")
- }
- return nil
- }
- func awaitProcessExit(pid int, exit <-chan struct{}) <-chan struct{} {
- isDead := make(chan struct{})
- go func() {
- for {
- select {
- case <-exit:
- return
- case <-time.After(time.Millisecond * 100):
- stat, err := system.Stat(pid)
- if err != nil || stat.State == system.Zombie {
- close(isDead)
- return
- }
- }
- }
- }()
- return isDead
- }
- func awaitFifoOpen(path string) <-chan openResult {
- fifoOpened := make(chan openResult)
- go func() {
- f, err := os.OpenFile(path, os.O_RDONLY, 0)
- if err != nil {
- fifoOpened <- openResult{err: newSystemErrorWithCause(err, "open exec fifo for reading")}
- return
- }
- fifoOpened <- openResult{file: f}
- }()
- return fifoOpened
- }
- type openResult struct {
- file *os.File
- err error
- }
- func (c *linuxContainer) start(process *Process) error {
- parent, err := c.newParentProcess(process)
- if err != nil {
- return newSystemErrorWithCause(err, "creating new parent process")
- }
- if err := parent.start(); err != nil {
- // terminate the process to ensure that it properly is reaped.
- if err := ignoreTerminateErrors(parent.terminate()); err != nil {
- logrus.Warn(err)
- }
- return newSystemErrorWithCause(err, "starting container process")
- }
- // generate a timestamp indicating when the container was started
- c.created = time.Now().UTC()
- if process.Init {
- c.state = &createdState{
- c: c,
- }
- state, err := c.updateState(parent)
- if err != nil {
- return err
- }
- c.initProcessStartTime = state.InitProcessStartTime
- if c.config.Hooks != nil {
- bundle, annotations := utils.Annotations(c.config.Labels)
- s := configs.HookState{
- Version: c.config.Version,
- ID: c.id,
- Pid: parent.pid(),
- Bundle: bundle,
- Annotations: annotations,
- }
- for i, hook := range c.config.Hooks.Poststart {
- if err := hook.Run(s); err != nil {
- if err := ignoreTerminateErrors(parent.terminate()); err != nil {
- logrus.Warn(err)
- }
- return newSystemErrorWithCausef(err, "running poststart hook %d", i)
- }
- }
- }
- }
- return nil
- }
- func (c *linuxContainer) Signal(s os.Signal, all bool) error {
- if all {
- return signalAllProcesses(c.cgroupManager, s)
- }
- if err := c.initProcess.signal(s); err != nil {
- return newSystemErrorWithCause(err, "signaling init process")
- }
- return nil
- }
- func (c *linuxContainer) createExecFifo() error {
- rootuid, err := c.Config().HostRootUID()
- if err != nil {
- return err
- }
- rootgid, err := c.Config().HostRootGID()
- if err != nil {
- return err
- }
- fifoName := filepath.Join(c.root, execFifoFilename)
- if _, err := os.Stat(fifoName); err == nil {
- return fmt.Errorf("exec fifo %s already exists", fifoName)
- }
- oldMask := unix.Umask(0000)
- if err := unix.Mkfifo(fifoName, 0622); err != nil {
- unix.Umask(oldMask)
- return err
- }
- unix.Umask(oldMask)
- return os.Chown(fifoName, rootuid, rootgid)
- }
- func (c *linuxContainer) deleteExecFifo() {
- fifoName := filepath.Join(c.root, execFifoFilename)
- os.Remove(fifoName)
- }
- // includeExecFifo opens the container's execfifo as a pathfd, so that the
- // container cannot access the statedir (and the FIFO itself remains
- // un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited
- // fd, with _LIBCONTAINER_FIFOFD set to its fd number.
- func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
- fifoName := filepath.Join(c.root, execFifoFilename)
- fifoFd, err := unix.Open(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
- if err != nil {
- return err
- }
- cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fifoFd), fifoName))
- cmd.Env = append(cmd.Env,
- fmt.Sprintf("_LIBCONTAINER_FIFOFD=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
- return nil
- }
- func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
- parentPipe, childPipe, err := utils.NewSockPair("init")
- if err != nil {
- return nil, newSystemErrorWithCause(err, "creating new init pipe")
- }
- cmd, err := c.commandTemplate(p, childPipe)
- if err != nil {
- return nil, newSystemErrorWithCause(err, "creating new command template")
- }
- if !p.Init {
- return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
- }
- // We only set up fifoFd if we're not doing a `runc exec`. The historic
- // reason for this is that previously we would pass a dirfd that allowed
- // for container rootfs escape (and not doing it in `runc exec` avoided
- // that problem), but we no longer do that. However, there's no need to do
- // this for `runc exec` so we just keep it this way to be safe.
- if err := c.includeExecFifo(cmd); err != nil {
- return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
- }
- return c.newInitProcess(p, cmd, parentPipe, childPipe)
- }
- func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
- cmd := exec.Command(c.initPath, c.initArgs[1:]...)
- cmd.Args[0] = c.initArgs[0]
- cmd.Stdin = p.Stdin
- cmd.Stdout = p.Stdout
- cmd.Stderr = p.Stderr
- cmd.Dir = c.config.Rootfs
- if cmd.SysProcAttr == nil {
- cmd.SysProcAttr = &syscall.SysProcAttr{}
- }
- cmd.Env = append(cmd.Env, fmt.Sprintf("GOMAXPROCS=%s", os.Getenv("GOMAXPROCS")))
- cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
- if p.ConsoleSocket != nil {
- cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
- cmd.Env = append(cmd.Env,
- fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
- )
- }
- cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe)
- cmd.Env = append(cmd.Env,
- fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
- )
- // NOTE: when running a container with no PID namespace and the parent process spawning the container is
- // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
- // even with the parent still running.
- if c.config.ParentDeathSignal > 0 {
- cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal)
- }
- return cmd, nil
- }
- func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
- cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
- nsMaps := make(map[configs.NamespaceType]string)
- for _, ns := range c.config.Namespaces {
- if ns.Path != "" {
- nsMaps[ns.Type] = ns.Path
- }
- }
- _, sharePidns := nsMaps[configs.NEWPID]
- data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
- if err != nil {
- return nil, err
- }
- return &initProcess{
- cmd: cmd,
- childPipe: childPipe,
- parentPipe: parentPipe,
- manager: c.cgroupManager,
- intelRdtManager: c.intelRdtManager,
- config: c.newInitConfig(p),
- container: c,
- process: p,
- bootstrapData: data,
- sharePidns: sharePidns,
- }, nil
- }
- func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
- cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
- state, err := c.currentState()
- if err != nil {
- return nil, newSystemErrorWithCause(err, "getting container's current state")
- }
- // for setns process, we don't have to set cloneflags as the process namespaces
- // will only be set via setns syscall
- data, err := c.bootstrapData(0, state.NamespacePaths)
- if err != nil {
- return nil, err
- }
- return &setnsProcess{
- cmd: cmd,
- cgroupPaths: c.cgroupManager.GetPaths(),
- rootlessCgroups: c.config.RootlessCgroups,
- intelRdtPath: state.IntelRdtPath,
- childPipe: childPipe,
- parentPipe: parentPipe,
- config: c.newInitConfig(p),
- process: p,
- bootstrapData: data,
- }, nil
- }
- func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
- cfg := &initConfig{
- Config: c.config,
- Args: process.Args,
- Env: process.Env,
- User: process.User,
- AdditionalGroups: process.AdditionalGroups,
- Cwd: process.Cwd,
- Capabilities: process.Capabilities,
- PassedFilesCount: len(process.ExtraFiles),
- ContainerId: c.ID(),
- NoNewPrivileges: c.config.NoNewPrivileges,
- RootlessEUID: c.config.RootlessEUID,
- RootlessCgroups: c.config.RootlessCgroups,
- AppArmorProfile: c.config.AppArmorProfile,
- ProcessLabel: c.config.ProcessLabel,
- Rlimits: c.config.Rlimits,
- }
- if process.NoNewPrivileges != nil {
- cfg.NoNewPrivileges = *process.NoNewPrivileges
- }
- if process.AppArmorProfile != "" {
- cfg.AppArmorProfile = process.AppArmorProfile
- }
- if process.Label != "" {
- cfg.ProcessLabel = process.Label
- }
- if len(process.Rlimits) > 0 {
- cfg.Rlimits = process.Rlimits
- }
- cfg.CreateConsole = process.ConsoleSocket != nil
- cfg.ConsoleWidth = process.ConsoleWidth
- cfg.ConsoleHeight = process.ConsoleHeight
- return cfg
- }
- func (c *linuxContainer) Destroy() error {
- c.m.Lock()
- defer c.m.Unlock()
- return c.state.destroy()
- }
- func (c *linuxContainer) Pause() error {
- c.m.Lock()
- defer c.m.Unlock()
- status, err := c.currentStatus()
- if err != nil {
- return err
- }
- switch status {
- case Running, Created:
- if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
- return err
- }
- return c.state.transition(&pausedState{
- c: c,
- })
- }
- return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning)
- }
- func (c *linuxContainer) Resume() error {
- c.m.Lock()
- defer c.m.Unlock()
- status, err := c.currentStatus()
- if err != nil {
- return err
- }
- if status != Paused {
- return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
- }
- if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
- return err
- }
- return c.state.transition(&runningState{
- c: c,
- })
- }
- func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
- // XXX(cyphar): This requires cgroups.
- if c.config.RootlessCgroups {
- logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups")
- }
- return notifyOnOOM(c.cgroupManager.GetPaths())
- }
- func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
- // XXX(cyphar): This requires cgroups.
- if c.config.RootlessCgroups {
- logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups")
- }
- return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
- }
- var criuFeatures *criurpc.CriuFeatures
- func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error {
- var t criurpc.CriuReqType
- t = criurpc.CriuReqType_FEATURE_CHECK
- // criu 1.8 => 10800
- if err := c.checkCriuVersion(10800); err != nil {
- // Feature checking was introduced with CRIU 1.8.
- // Ignore the feature check if an older CRIU version is used
- // and just act as before.
- // As all automated PR testing is done using CRIU 1.7 this
- // code will not be tested by automated PR testing.
- return nil
- }
- // make sure the features we are looking for are really not from
- // some previous check
- criuFeatures = nil
- req := &criurpc.CriuReq{
- Type: &t,
- // Theoretically this should not be necessary but CRIU
- // segfaults if Opts is empty.
- // Fixed in CRIU 2.12
- Opts: rpcOpts,
- Features: criuFeat,
- }
- err := c.criuSwrk(nil, req, criuOpts, false, nil)
- if err != nil {
- logrus.Debugf("%s", err)
- return fmt.Errorf("CRIU feature check failed")
- }
- logrus.Debugf("Feature check says: %s", criuFeatures)
- missingFeatures := false
- // The outer if checks if the fields actually exist
- if (criuFeat.MemTrack != nil) &&
- (criuFeatures.MemTrack != nil) {
- // The inner if checks if they are set to true
- if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
- missingFeatures = true
- logrus.Debugf("CRIU does not support MemTrack")
- }
- }
- // This needs to be repeated for every new feature check.
- // Is there a way to put this in a function. Reflection?
- if (criuFeat.LazyPages != nil) &&
- (criuFeatures.LazyPages != nil) {
- if *criuFeat.LazyPages && !*criuFeatures.LazyPages {
- missingFeatures = true
- logrus.Debugf("CRIU does not support LazyPages")
- }
- }
- if missingFeatures {
- return fmt.Errorf("CRIU is missing features")
- }
- return nil
- }
- func parseCriuVersion(path string) (int, error) {
- var x, y, z int
- out, err := exec.Command(path, "-V").Output()
- if err != nil {
- return 0, fmt.Errorf("Unable to execute CRIU command: %s", path)
- }
- x = 0
- y = 0
- z = 0
- if ep := strings.Index(string(out), "-"); ep >= 0 {
- // criu Git version format
- var version string
- if sp := strings.Index(string(out), "GitID"); sp > 0 {
- version = string(out)[sp:ep]
- } else {
- return 0, fmt.Errorf("Unable to parse the CRIU version: %s", path)
- }
- n, err := fmt.Sscanf(version, "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2
- if err != nil {
- n, err = fmt.Sscanf(version, "GitID: v%d.%d", &x, &y) // 1.6
- y++
- } else {
- z++
- }
- if n < 2 || err != nil {
- return 0, fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err)
- }
- } else {
- // criu release version format
- n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2
- if err != nil {
- n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6
- }
- if n < 2 || err != nil {
- return 0, fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err)
- }
- }
- return x*10000 + y*100 + z, nil
- }
- func compareCriuVersion(criuVersion int, minVersion int) error {
- // simple function to perform the actual version compare
- if criuVersion < minVersion {
- return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion)
- }
- return nil
- }
- // This is used to store the result of criu version RPC
- var criuVersionRPC *criurpc.CriuVersion
- // checkCriuVersion checks Criu version greater than or equal to minVersion
- func (c *linuxContainer) checkCriuVersion(minVersion int) error {
- // If the version of criu has already been determined there is no need
- // to ask criu for the version again. Use the value from c.criuVersion.
- if c.criuVersion != 0 {
- return compareCriuVersion(c.criuVersion, minVersion)
- }
- // First try if this version of CRIU support the version RPC.
- // The CRIU version RPC was introduced with CRIU 3.0.
- // First, reset the variable for the RPC answer to nil
- criuVersionRPC = nil
- var t criurpc.CriuReqType
- t = criurpc.CriuReqType_VERSION
- req := &criurpc.CriuReq{
- Type: &t,
- }
- err := c.criuSwrk(nil, req, nil, false, nil)
- if err != nil {
- return fmt.Errorf("CRIU version check failed: %s", err)
- }
- if criuVersionRPC != nil {
- logrus.Debugf("CRIU version: %s", criuVersionRPC)
- // major and minor are always set
- c.criuVersion = int(*criuVersionRPC.Major) * 10000
- c.criuVersion += int(*criuVersionRPC.Minor) * 100
- if criuVersionRPC.Sublevel != nil {
- c.criuVersion += int(*criuVersionRPC.Sublevel)
- }
- if criuVersionRPC.Gitid != nil {
- // runc's convention is that a CRIU git release is
- // always the same as increasing the minor by 1
- c.criuVersion -= (c.criuVersion % 100)
- c.criuVersion += 100
- }
- return compareCriuVersion(c.criuVersion, minVersion)
- }
- // This is CRIU without the version RPC and therefore
- // older than 3.0. Parsing the output is required.
- // This can be remove once runc does not work with criu older than 3.0
- c.criuVersion, err = parseCriuVersion(c.criuPath)
- if err != nil {
- return err
- }
- return compareCriuVersion(c.criuVersion, minVersion)
- }
- const descriptorsFilename = "descriptors.json"
- func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
- mountDest := m.Destination
- if strings.HasPrefix(mountDest, c.config.Rootfs) {
- mountDest = mountDest[len(c.config.Rootfs):]
- }
- extMnt := &criurpc.ExtMountMap{
- Key: proto.String(mountDest),
- Val: proto.String(mountDest),
- }
- req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
- }
- func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
- for _, path := range c.config.MaskPaths {
- fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
- if err != nil {
- if os.IsNotExist(err) {
- continue
- }
- return err
- }
- if fi.IsDir() {
- continue
- }
- extMnt := &criurpc.ExtMountMap{
- Key: proto.String(path),
- Val: proto.String("/dev/null"),
- }
- req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
- }
- return nil
- }
- func waitForCriuLazyServer(r *os.File, status string) error {
- data := make([]byte, 1)
- _, err := r.Read(data)
- if err != nil {
- return err
- }
- fd, err := os.OpenFile(status, os.O_TRUNC|os.O_WRONLY, os.ModeAppend)
- if err != nil {
- return err
- }
- _, err = fd.Write(data)
- if err != nil {
- return err
- }
- fd.Close()
- return nil
- }
- func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
- c.m.Lock()
- defer c.m.Unlock()
- // Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
- // (CLI prints a warning)
- // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
- // support for doing unprivileged dumps, but the setup of
- // rootless containers might make this complicated.
- // criu 1.5.2 => 10502
- if err := c.checkCriuVersion(10502); err != nil {
- return err
- }
- if criuOpts.ImagesDirectory == "" {
- return fmt.Errorf("invalid directory to save checkpoint")
- }
- // Since a container can be C/R'ed multiple times,
- // the checkpoint directory may already exist.
- if err := os.Mkdir(criuOpts.ImagesDirectory, 0755); err != nil && !os.IsExist(err) {
- return err
- }
- if criuOpts.WorkDirectory == "" {
- criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
- }
- if err := os.Mkdir(criuOpts.WorkDirectory, 0755); err != nil && !os.IsExist(err) {
- return err
- }
- workDir, err := os.Open(criuOpts.WorkDirectory)
- if err != nil {
- return err
- }
- defer workDir.Close()
- imageDir, err := os.Open(criuOpts.ImagesDirectory)
- if err != nil {
- return err
- }
- defer imageDir.Close()
- rpcOpts := criurpc.CriuOpts{
- ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
- WorkDirFd: proto.Int32(int32(workDir.Fd())),
- LogLevel: proto.Int32(4),
- LogFile: proto.String("dump.log"),
- Root: proto.String(c.config.Rootfs),
- ManageCgroups: proto.Bool(true),
- NotifyScripts: proto.Bool(true),
- Pid: proto.Int32(int32(c.initProcess.pid())),
- ShellJob: proto.Bool(criuOpts.ShellJob),
- LeaveRunning: proto.Bool(criuOpts.LeaveRunning),
- TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
- ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
- FileLocks: proto.Bool(criuOpts.FileLocks),
- EmptyNs: proto.Uint32(criuOpts.EmptyNs),
- OrphanPtsMaster: proto.Bool(true),
- AutoDedup: proto.Bool(criuOpts.AutoDedup),
- LazyPages: proto.Bool(criuOpts.LazyPages),
- }
- // If the container is running in a network namespace and has
- // a path to the network namespace configured, we will dump
- // that network namespace as an external namespace and we
- // will expect that the namespace exists during restore.
- // This basically means that CRIU will ignore the namespace
- // and expect to be setup correctly.
- nsPath := c.config.Namespaces.PathOf(configs.NEWNET)
- if nsPath != "" {
- // For this to work we need at least criu 3.11.0 => 31100.
- // As there was already a successful version check we will
- // not error out if it fails. runc will just behave as it used
- // to do and ignore external network namespaces.
- err := c.checkCriuVersion(31100)
- if err == nil {
- // CRIU expects the information about an external namespace
- // like this: --external net[<inode>]:<key>
- // This <key> is always 'extRootNetNS'.
- var netns syscall.Stat_t
- err = syscall.Stat(nsPath, &netns)
- if err != nil {
- return err
- }
- criuExternal := fmt.Sprintf("net[%d]:extRootNetNS", netns.Ino)
- rpcOpts.External = append(rpcOpts.External, criuExternal)
- }
- }
- fcg := c.cgroupManager.GetPaths()["freezer"]
- if fcg != "" {
- rpcOpts.FreezeCgroup = proto.String(fcg)
- }
- // append optional criu opts, e.g., page-server and port
- if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
- rpcOpts.Ps = &criurpc.CriuPageServerInfo{
- Address: proto.String(criuOpts.PageServer.Address),
- Port: proto.Int32(criuOpts.PageServer.Port),
- }
- }
- //pre-dump may need parentImage param to complete iterative migration
- if criuOpts.ParentImage != "" {
- rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
- rpcOpts.TrackMem = proto.Bool(true)
- }
- // append optional manage cgroups mode
- if criuOpts.ManageCgroupsMode != 0 {
- // criu 1.7 => 10700
- if err := c.checkCriuVersion(10700); err != nil {
- return err
- }
- mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
- rpcOpts.ManageCgroupsMode = &mode
- }
- var t criurpc.CriuReqType
- if criuOpts.PreDump {
- feat := criurpc.CriuFeatures{
- MemTrack: proto.Bool(true),
- }
- if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
- return err
- }
- t = criurpc.CriuReqType_PRE_DUMP
- } else {
- t = criurpc.CriuReqType_DUMP
- }
- req := &criurpc.CriuReq{
- Type: &t,
- Opts: &rpcOpts,
- }
- if criuOpts.LazyPages {
- // lazy migration requested; check if criu supports it
- feat := criurpc.CriuFeatures{
- LazyPages: proto.Bool(true),
- }
- if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
- return err
- }
- statusRead, statusWrite, err := os.Pipe()
- if err != nil {
- return err
- }
- rpcOpts.StatusFd = proto.Int32(int32(statusWrite.Fd()))
- go waitForCriuLazyServer(statusRead, criuOpts.StatusFd)
- }
- //no need to dump these information in pre-dump
- if !criuOpts.PreDump {
- for _, m := range c.config.Mounts {
- switch m.Device {
- case "bind":
- c.addCriuDumpMount(req, m)
- case "cgroup":
- binds, err := getCgroupMounts(m)
- if err != nil {
- return err
- }
- for _, b := range binds {
- c.addCriuDumpMount(req, b)
- }
- }
- }
- if err := c.addMaskPaths(req); err != nil {
- return err
- }
- for _, node := range c.config.Devices {
- m := &configs.Mount{Destination: node.Path, Source: node.Path}
- c.addCriuDumpMount(req, m)
- }
- // Write the FD info to a file in the image directory
- fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
- if err != nil {
- return err
- }
- err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655)
- if err != nil {
- return err
- }
- }
- err = c.criuSwrk(nil, req, criuOpts, false, nil)
- if err != nil {
- return err
- }
- return nil
- }
- func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
- mountDest := m.Destination
- if strings.HasPrefix(mountDest, c.config.Rootfs) {
- mountDest = mountDest[len(c.config.Rootfs):]
- }
- extMnt := &criurpc.ExtMountMap{
- Key: proto.String(mountDest),
- Val: proto.String(m.Source),
- }
- req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
- }
- func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
- for _, iface := range c.config.Networks {
- switch iface.Type {
- case "veth":
- veth := new(criurpc.CriuVethPair)
- veth.IfOut = proto.String(iface.HostInterfaceName)
- veth.IfIn = proto.String(iface.Name)
- req.Opts.Veths = append(req.Opts.Veths, veth)
- case "loopback":
- // Do nothing
- }
- }
- for _, i := range criuOpts.VethPairs {
- veth := new(criurpc.CriuVethPair)
- veth.IfOut = proto.String(i.HostInterfaceName)
- veth.IfIn = proto.String(i.ContainerInterfaceName)
- req.Opts.Veths = append(req.Opts.Veths, veth)
- }
- }
- func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
- c.m.Lock()
- defer c.m.Unlock()
- var extraFiles []*os.File
- // Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
- // (CLI prints a warning)
- // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
- // support for unprivileged restore at the moment.
- // criu 1.5.2 => 10502
- if err := c.checkCriuVersion(10502); err != nil {
- return err
- }
- if criuOpts.WorkDirectory == "" {
- criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
- }
- // Since a container can be C/R'ed multiple times,
- // the work directory may already exist.
- if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) {
- return err
- }
- workDir, err := os.Open(criuOpts.WorkDirectory)
- if err != nil {
- return err
- }
- defer workDir.Close()
- if criuOpts.ImagesDirectory == "" {
- return fmt.Errorf("invalid directory to restore checkpoint")
- }
- imageDir, err := os.Open(criuOpts.ImagesDirectory)
- if err != nil {
- return err
- }
- defer imageDir.Close()
- // CRIU has a few requirements for a root directory:
- // * it must be a mount point
- // * its parent must not be overmounted
- // c.config.Rootfs is bind-mounted to a temporary directory
- // to satisfy these requirements.
- root := filepath.Join(c.root, "criu-root")
- if err := os.Mkdir(root, 0755); err != nil {
- return err
- }
- defer os.Remove(root)
- root, err = filepath.EvalSymlinks(root)
- if err != nil {
- return err
- }
- err = unix.Mount(c.config.Rootfs, root, "", unix.MS_BIND|unix.MS_REC, "")
- if err != nil {
- return err
- }
- defer unix.Unmount(root, unix.MNT_DETACH)
- t := criurpc.CriuReqType_RESTORE
- req := &criurpc.CriuReq{
- Type: &t,
- Opts: &criurpc.CriuOpts{
- ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
- WorkDirFd: proto.Int32(int32(workDir.Fd())),
- EvasiveDevices: proto.Bool(true),
- LogLevel: proto.Int32(4),
- LogFile: proto.String("restore.log"),
- RstSibling: proto.Bool(true),
- Root: proto.String(root),
- ManageCgroups: proto.Bool(true),
- NotifyScripts: proto.Bool(true),
- ShellJob: proto.Bool(criuOpts.ShellJob),
- ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
- TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
- FileLocks: proto.Bool(criuOpts.FileLocks),
- EmptyNs: proto.Uint32(criuOpts.EmptyNs),
- OrphanPtsMaster: proto.Bool(true),
- AutoDedup: proto.Bool(criuOpts.AutoDedup),
- LazyPages: proto.Bool(criuOpts.LazyPages),
- },
- }
- // Same as during checkpointing. If the container has a specific network namespace
- // assigned to it, this now expects that the checkpoint will be restored in a
- // already created network namespace.
- nsPath := c.config.Namespaces.PathOf(configs.NEWNET)
- if nsPath != "" {
- // For this to work we need at least criu 3.11.0 => 31100.
- // As there was already a successful version check we will
- // not error out if it fails. runc will just behave as it used
- // to do and ignore external network namespaces.
- err := c.checkCriuVersion(31100)
- if err == nil {
- // CRIU wants the information about an existing network namespace
- // like this: --inherit-fd fd[<fd>]:<key>
- // The <key> needs to be the same as during checkpointing.
- // We are always using 'extRootNetNS' as the key in this.
- netns, err := os.Open(nsPath)
- defer netns.Close()
- if err != nil {
- logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
- return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
- }
- inheritFd := new(criurpc.InheritFd)
- inheritFd.Key = proto.String("extRootNetNS")
- // The offset of four is necessary because 0, 1, 2 and 3 is already
- // used by stdin, stdout, stderr, 'criu swrk' socket.
- inheritFd.Fd = proto.Int32(int32(4 + len(extraFiles)))
- req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
- // All open FDs need to be transferred to CRIU via extraFiles
- extraFiles = append(extraFiles, netns)
- }
- }
- for _, m := range c.config.Mounts {
- switch m.Device {
- case "bind":
- c.addCriuRestoreMount(req, m)
- case "cgroup":
- binds, err := getCgroupMounts(m)
- if err != nil {
- return err
- }
- for _, b := range binds {
- c.addCriuRestoreMount(req, b)
- }
- }
- }
- if len(c.config.MaskPaths) > 0 {
- m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
- c.addCriuRestoreMount(req, m)
- }
- for _, node := range c.config.Devices {
- m := &configs.Mount{Destination: node.Path, Source: node.Path}
- c.addCriuRestoreMount(req, m)
- }
- if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 {
- c.restoreNetwork(req, criuOpts)
- }
- // append optional manage cgroups mode
- if criuOpts.ManageCgroupsMode != 0 {
- // criu 1.7 => 10700
- if err := c.checkCriuVersion(10700); err != nil {
- return err
- }
- mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
- req.Opts.ManageCgroupsMode = &mode
- }
- var (
- fds []string
- fdJSON []byte
- )
- if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
- return err
- }
- if err := json.Unmarshal(fdJSON, &fds); err != nil {
- return err
- }
- for i := range fds {
- if s := fds[i]; strings.Contains(s, "pipe:") {
- inheritFd := new(criurpc.InheritFd)
- inheritFd.Key = proto.String(s)
- inheritFd.Fd = proto.Int32(int32(i))
- req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
- }
- }
- return c.criuSwrk(process, req, criuOpts, true, extraFiles)
- }
- func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
- // XXX: Do we need to deal with this case? AFAIK criu still requires root.
- if err := c.cgroupManager.Apply(pid); err != nil {
- return err
- }
- if err := c.cgroupManager.Set(c.config); err != nil {
- return newSystemError(err)
- }
- path := fmt.Sprintf("/proc/%d/cgroup", pid)
- cgroupsPaths, err := cgroups.ParseCgroupFile(path)
- if err != nil {
- return err
- }
- for c, p := range cgroupsPaths {
- cgroupRoot := &criurpc.CgroupRoot{
- Ctrl: proto.String(c),
- Path: proto.String(p),
- }
- req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
- }
- return nil
- }
- func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool, extraFiles []*os.File) error {
- fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
- if err != nil {
- return err
- }
- var logPath string
- if opts != nil {
- logPath = filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
- } else {
- // For the VERSION RPC 'opts' is set to 'nil' and therefore
- // opts.WorkDirectory does not exist. Set logPath to "".
- logPath = ""
- }
- criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
- criuClientFileCon, err := net.FileConn(criuClient)
- criuClient.Close()
- if err != nil {
- return err
- }
- criuClientCon := criuClientFileCon.(*net.UnixConn)
- defer criuClientCon.Close()
- criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
- defer criuServer.Close()
- args := []string{"swrk", "3"}
- if c.criuVersion != 0 {
- // If the CRIU Version is still '0' then this is probably
- // the initial CRIU run to detect the version. Skip it.
- logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath)
- }
- logrus.Debugf("Using CRIU with following args: %s", args)
- cmd := exec.Command(c.criuPath, args...)
- if process != nil {
- cmd.Stdin = process.Stdin
- cmd.Stdout = process.Stdout
- cmd.Stderr = process.Stderr
- }
- cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
- if extraFiles != nil {
- cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...)
- }
- if err := cmd.Start(); err != nil {
- return err
- }
- criuServer.Close()
- defer func() {
- criuClientCon.Close()
- _, err := cmd.Process.Wait()
- if err != nil {
- return
- }
- }()
- if applyCgroups {
- err := c.criuApplyCgroups(cmd.Process.Pid, req)
- if err != nil {
- return err
- }
- }
- var extFds []string
- if process != nil {
- extFds, err = getPipeFds(cmd.Process.Pid)
- if err != nil {
- return err
- }
- }
- logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
- // In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts()
- // should be empty. For older CRIU versions it still will be
- // available but empty. criurpc.CriuReqType_VERSION actually
- // has no req.GetOpts().
- if !(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK ||
- req.GetType() == criurpc.CriuReqType_VERSION) {
- val := reflect.ValueOf(req.GetOpts())
- v := reflect.Indirect(val)
- for i := 0; i < v.NumField(); i++ {
- st := v.Type()
- name := st.Field(i).Name
- if strings.HasPrefix(name, "XXX_") {
- continue
- }
- value := val.MethodByName("Get" + name).Call([]reflect.Value{})
- logrus.Debugf("CRIU option %s with value %v", name, value[0])
- }
- }
- data, err := proto.Marshal(req)
- if err != nil {
- return err
- }
- _, err = criuClientCon.Write(data)
- if err != nil {
- return err
- }
- buf := make([]byte, 10*4096)
- oob := make([]byte, 4096)
- for true {
- n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob)
- if err != nil {
- return err
- }
- if n == 0 {
- return fmt.Errorf("unexpected EOF")
- }
- if n == len(buf) {
- return fmt.Errorf("buffer is too small")
- }
- resp := new(criurpc.CriuResp)
- err = proto.Unmarshal(buf[:n], resp)
- if err != nil {
- return err
- }
- if !resp.GetSuccess() {
- typeString := req.GetType().String()
- if typeString == "VERSION" {
- // If the VERSION RPC fails this probably means that the CRIU
- // version is too old for this RPC. Just return 'nil'.
- return nil
- }
- return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
- }
- t := resp.GetType()
- switch {
- case t == criurpc.CriuReqType_VERSION:
- logrus.Debugf("CRIU version: %s", resp)
- criuVersionRPC = resp.GetVersion()
- break
- case t == criurpc.CriuReqType_FEATURE_CHECK:
- logrus.Debugf("Feature check says: %s", resp)
- criuFeatures = resp.GetFeatures()
- case t == criurpc.CriuReqType_NOTIFY:
- if err := c.criuNotifications(resp, process, opts, extFds, oob[:oobn]); err != nil {
- return err
- }
- t = criurpc.CriuReqType_NOTIFY
- req = &criurpc.CriuReq{
- Type: &t,
- NotifySuccess: proto.Bool(true),
- }
- data, err = proto.Marshal(req)
- if err != nil {
- return err
- }
- _, err = criuClientCon.Write(data)
- if err != nil {
- return err
- }
- continue
- case t == criurpc.CriuReqType_RESTORE:
- case t == criurpc.CriuReqType_DUMP:
- case t == criurpc.CriuReqType_PRE_DUMP:
- default:
- return fmt.Errorf("unable to parse the response %s", resp.String())
- }
- break
- }
- criuClientCon.CloseWrite()
- // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
- // Here we want to wait only the CRIU process.
- st, err := cmd.Process.Wait()
- if err != nil {
- return err
- }
- // In pre-dump mode CRIU is in a loop and waits for
- // the final DUMP command.
- // The current runc pre-dump approach, however, is
- // start criu in PRE_DUMP once for a single pre-dump
- // and not the whole series of pre-dump, pre-dump, ...m, dump
- // If we got the message CriuReqType_PRE_DUMP it means
- // CRIU was successful and we need to forcefully stop CRIU
- if !st.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP {
- return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath)
- }
- return nil
- }
- // block any external network activity
- func lockNetwork(config *configs.Config) error {
- for _, config := range config.Networks {
- strategy, err := getStrategy(config.Type)
- if err != nil {
- return err
- }
- if err := strategy.detach(config); err != nil {
- return err
- }
- }
- return nil
- }
- func unlockNetwork(config *configs.Config) error {
- for _, config := range config.Networks {
- strategy, err := getStrategy(config.Type)
- if err != nil {
- return err
- }
- if err = strategy.attach(config); err != nil {
- return err
- }
- }
- return nil
- }
- func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string, oob []byte) error {
- notify := resp.GetNotify()
- if notify == nil {
- return fmt.Errorf("invalid response: %s", resp.String())
- }
- logrus.Debugf("notify: %s\n", notify.GetScript())
- switch {
- case notify.GetScript() == "post-dump":
- f, err := os.Create(filepath.Join(c.root, "checkpoint"))
- if err != nil {
- return err
- }
- f.Close()
- case notify.GetScript() == "network-unlock":
- if err := unlockNetwork(c.config); err != nil {
- return err
- }
- case notify.GetScript() == "network-lock":
- if err := lockNetwork(c.config); err != nil {
- return err
- }
- case notify.GetScript() == "setup-namespaces":
- if c.config.Hooks != nil {
- bundle, annotations := utils.Annotations(c.config.Labels)
- s := configs.HookState{
- Version: c.config.Version,
- ID: c.id,
- Pid: int(notify.GetPid()),
- Bundle: bundle,
- Annotations: annotations,
- }
- for i, hook := range c.config.Hooks.Prestart {
- if err := hook.Run(s); err != nil {
- return newSystemErrorWithCausef(err, "running prestart hook %d", i)
- }
- }
- }
- case notify.GetScript() == "post-restore":
- pid := notify.GetPid()
- r, err := newRestoredProcess(int(pid), fds)
- if err != nil {
- return err
- }
- process.ops = r
- if err := c.state.transition(&restoredState{
- imageDir: opts.ImagesDirectory,
- c: c,
- }); err != nil {
- return err
- }
- // create a timestamp indicating when the restored checkpoint was started
- c.created = time.Now().UTC()
- if _, err := c.updateState(r); err != nil {
- return err
- }
- if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
- if !os.IsNotExist(err) {
- logrus.Error(err)
- }
- }
- case notify.GetScript() == "orphan-pts-master":
- scm, err := unix.ParseSocketControlMessage(oob)
- if err != nil {
- return err
- }
- fds, err := unix.ParseUnixRights(&scm[0])
- if err != nil {
- return err
- }
- master := os.NewFile(uintptr(fds[0]), "orphan-pts-master")
- defer master.Close()
- // While we can access console.master, using the API is a good idea.
- if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil {
- return err
- }
- }
- return nil
- }
- func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
- if process != nil {
- c.initProcess = process
- }
- state, err := c.currentState()
- if err != nil {
- return nil, err
- }
- err = c.saveState(state)
- if err != nil {
- return nil, err
- }
- return state, nil
- }
- func (c *linuxContainer) saveState(s *State) error {
- f, err := os.Create(filepath.Join(c.root, stateFilename))
- if err != nil {
- return err
- }
- defer f.Close()
- return utils.WriteJSON(f, s)
- }
- func (c *linuxContainer) deleteState() error {
- return os.Remove(filepath.Join(c.root, stateFilename))
- }
- func (c *linuxContainer) currentStatus() (Status, error) {
- if err := c.refreshState(); err != nil {
- return -1, err
- }
- return c.state.status(), nil
- }
- // refreshState needs to be called to verify that the current state on the
- // container is what is true. Because consumers of libcontainer can use it
- // out of process we need to verify the container's status based on runtime
- // information and not rely on our in process info.
- func (c *linuxContainer) refreshState() error {
- paused, err := c.isPaused()
- if err != nil {
- return err
- }
- if paused {
- return c.state.transition(&pausedState{c: c})
- }
- t, err := c.runType()
- if err != nil {
- return err
- }
- switch t {
- case Created:
- return c.state.transition(&createdState{c: c})
- case Running:
- return c.state.transition(&runningState{c: c})
- }
- return c.state.transition(&stoppedState{c: c})
- }
- func (c *linuxContainer) runType() (Status, error) {
- if c.initProcess == nil {
- return Stopped, nil
- }
- pid := c.initProcess.pid()
- stat, err := system.Stat(pid)
- if err != nil {
- return Stopped, nil
- }
- if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead {
- return Stopped, nil
- }
- // We'll create exec fifo and blocking on it after container is created,
- // and delete it after start container.
- if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
- return Created, nil
- }
- return Running, nil
- }
- func (c *linuxContainer) isPaused() (bool, error) {
- fcg := c.cgroupManager.GetPaths()["freezer"]
- if fcg == "" {
- // A container doesn't have a freezer cgroup
- return false, nil
- }
- data, err := ioutil.ReadFile(filepath.Join(fcg, "freezer.state"))
- if err != nil {
- // If freezer cgroup is not mounted, the container would just be not paused.
- if os.IsNotExist(err) {
- return false, nil
- }
- return false, newSystemErrorWithCause(err, "checking if container is paused")
- }
- return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
- }
- func (c *linuxContainer) currentState() (*State, error) {
- var (
- startTime uint64
- externalDescriptors []string
- pid = -1
- )
- if c.initProcess != nil {
- pid = c.initProcess.pid()
- startTime, _ = c.initProcess.startTime()
- externalDescriptors = c.initProcess.externalDescriptors()
- }
- intelRdtPath, err := intelrdt.GetIntelRdtPath(c.ID())
- if err != nil {
- intelRdtPath = ""
- }
- state := &State{
- BaseState: BaseState{
- ID: c.ID(),
- Config: *c.config,
- InitProcessPid: pid,
- InitProcessStartTime: startTime,
- Created: c.created,
- },
- Rootless: c.config.RootlessEUID && c.config.RootlessCgroups,
- CgroupPaths: c.cgroupManager.GetPaths(),
- IntelRdtPath: intelRdtPath,
- NamespacePaths: make(map[configs.NamespaceType]string),
- ExternalDescriptors: externalDescriptors,
- }
- if pid > 0 {
- for _, ns := range c.config.Namespaces {
- state.NamespacePaths[ns.Type] = ns.GetPath(pid)
- }
- for _, nsType := range configs.NamespaceTypes() {
- if !configs.IsNamespaceSupported(nsType) {
- continue
- }
- if _, ok := state.NamespacePaths[nsType]; !ok {
- ns := configs.Namespace{Type: nsType}
- state.NamespacePaths[ns.Type] = ns.GetPath(pid)
- }
- }
- }
- return state, nil
- }
- // orderNamespacePaths sorts namespace paths into a list of paths that we
- // can setns in order.
- func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
- paths := []string{}
- for _, ns := range configs.NamespaceTypes() {
- // Remove namespaces that we don't need to join.
- if !c.config.Namespaces.Contains(ns) {
- continue
- }
- if p, ok := namespaces[ns]; ok && p != "" {
- // check if the requested namespace is supported
- if !configs.IsNamespaceSupported(ns) {
- return nil, newSystemError(fmt.Errorf("namespace %s is not supported", ns))
- }
- // only set to join this namespace if it exists
- if _, err := os.Lstat(p); err != nil {
- return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
- }
- // do not allow namespace path with comma as we use it to separate
- // the namespace paths
- if strings.ContainsRune(p, ',') {
- return nil, newSystemError(fmt.Errorf("invalid path %s", p))
- }
- paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p))
- }
- }
- return paths, nil
- }
- func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
- data := bytes.NewBuffer(nil)
- for _, im := range idMap {
- line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
- if _, err := data.WriteString(line); err != nil {
- return nil, err
- }
- }
- return data.Bytes(), nil
- }
- // bootstrapData encodes the necessary data in netlink binary format
- // as a io.Reader.
- // Consumer can write the data to a bootstrap program
- // such as one that uses nsenter package to bootstrap the container's
- // init process correctly, i.e. with correct namespaces, uid/gid
- // mapping etc.
- func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
- // create the netlink message
- r := nl.NewNetlinkRequest(int(InitMsg), 0)
- // write cloneFlags
- r.AddData(&Int32msg{
- Type: CloneFlagsAttr,
- Value: uint32(cloneFlags),
- })
- // write custom namespace paths
- if len(nsMaps) > 0 {
- nsPaths, err := c.orderNamespacePaths(nsMaps)
- if err != nil {
- return nil, err
- }
- r.AddData(&Bytemsg{
- Type: NsPathsAttr,
- Value: []byte(strings.Join(nsPaths, ",")),
- })
- }
- // write namespace paths only when we are not joining an existing user ns
- _, joinExistingUser := nsMaps[configs.NEWUSER]
- if !joinExistingUser {
- // write uid mappings
- if len(c.config.UidMappings) > 0 {
- if c.config.RootlessEUID && c.newuidmapPath != "" {
- r.AddData(&Bytemsg{
- Type: UidmapPathAttr,
- Value: []byte(c.newuidmapPath),
- })
- }
- b, err := encodeIDMapping(c.config.UidMappings)
- if err != nil {
- return nil, err
- }
- r.AddData(&Bytemsg{
- Type: UidmapAttr,
- Value: b,
- })
- }
- // write gid mappings
- if len(c.config.GidMappings) > 0 {
- b, err := encodeIDMapping(c.config.GidMappings)
- if err != nil {
- return nil, err
- }
- r.AddData(&Bytemsg{
- Type: GidmapAttr,
- Value: b,
- })
- if c.config.RootlessEUID && c.newgidmapPath != "" {
- r.AddData(&Bytemsg{
- Type: GidmapPathAttr,
- Value: []byte(c.newgidmapPath),
- })
- }
- if requiresRootOrMappingTool(c.config) {
- r.AddData(&Boolmsg{
- Type: SetgroupAttr,
- Value: true,
- })
- }
- }
- }
- if c.config.OomScoreAdj != nil {
- // write oom_score_adj
- r.AddData(&Bytemsg{
- Type: OomScoreAdjAttr,
- Value: []byte(fmt.Sprintf("%d", *c.config.OomScoreAdj)),
- })
- }
- // write rootless
- r.AddData(&Boolmsg{
- Type: RootlessEUIDAttr,
- Value: c.config.RootlessEUID,
- })
- return bytes.NewReader(r.Serialize()), nil
- }
- // ignoreTerminateErrors returns nil if the given err matches an error known
- // to indicate that the terminate occurred successfully or err was nil, otherwise
- // err is returned unaltered.
- func ignoreTerminateErrors(err error) error {
- if err == nil {
- return nil
- }
- s := err.Error()
- switch {
- case strings.Contains(s, "process already finished"), strings.Contains(s, "Wait was already called"):
- return nil
- }
- return err
- }
- func requiresRootOrMappingTool(c *configs.Config) bool {
- gidMap := []configs.IDMap{
- {ContainerID: 0, HostID: os.Getegid(), Size: 1},
- }
- return !reflect.DeepEqual(c.GidMappings, gidMap)
- }
|