diff --git a/background/process.go b/background/process.go index dc4ddaa..be0353e 100644 --- a/background/process.go +++ b/background/process.go @@ -4,16 +4,10 @@ import ( "bufio" "bytes" "context" - "fmt" "io" - "os" "strings" - "sync" "sync/atomic" - "syscall" - "time" - "github.com/spf13/afero" "golang.org/x/xerrors" "cdr.dev/slog" @@ -30,8 +24,17 @@ type Process struct { binName string userKilled *int64 - waitCh chan error - mu sync.Mutex + waitCh chan struct{} + err error +} + +func RunCh(ctx context.Context, log slog.Logger, cmd string, args ...string) <-chan error { + proc := New(ctx, log, cmd, args...) + errCh := make(chan error, 1) + go func() { + errCh <- proc.Run() + }() + return errCh } // New returns an instantiated daemon. @@ -40,7 +43,7 @@ func New(ctx context.Context, log slog.Logger, cmd string, args ...string) *Proc return &Process{ ctx: ctx, cancel: cancel, - waitCh: make(chan error, 1), + waitCh: make(chan struct{}, 1), cmd: xunix.GetExecer(ctx).CommandContext(ctx, cmd, args...), log: log.Named(cmd), userKilled: i64ptr(0), @@ -50,58 +53,6 @@ func New(ctx context.Context, log slog.Logger, cmd string, args ...string) *Proc // Start starts the daemon. It functions akin to ox/exec.Command.Start(). func (d *Process) Start() error { - d.mu.Lock() - defer d.mu.Unlock() - - return d.startProcess() -} - -// Wait waits for the process to exit, returning the error on the provided -// channel. -func (d *Process) Wait() <-chan error { - d.mu.Lock() - waitCh := d.waitCh - d.mu.Unlock() - - return waitCh -} - -// Run runs the command and waits for it to exit. It is a convenience -// function that combines both Start() and Wait(). -func (d *Process) Run() <-chan error { - err := d.Start() - if err != nil { - ch := make(chan error, 1) - ch <- err - return ch - } - - return d.Wait() -} - -// Restart kill the running process and reruns the command with the updated -// cmd and args. -func (d *Process) Restart(ctx context.Context, cmd string, args ...string) error { - d.mu.Lock() - defer d.mu.Unlock() - - err := d.kill(syscall.SIGTERM) - if err != nil { - return xerrors.Errorf("kill cmd: %w", err) - } - - ctx, cancel := context.WithCancel(ctx) - d.ctx = ctx - d.cancel = cancel - d.cmd = xunix.GetExecer(ctx).CommandContext(ctx, cmd, args...) - d.waitCh = make(chan error, 1) - d.userKilled = i64ptr(0) - d.binName = cmd - - return d.startProcess() -} - -func (d *Process) startProcess() error { var ( buf bytes.Buffer @@ -142,82 +93,53 @@ func (d *Process) startProcess() error { // If the user killed the application the actual error returned // from wait doesn't really matter. if atomic.LoadInt64(userKilled) == 1 { - d.waitCh <- ErrUserKilled + d.err = ErrUserKilled return } - if err == nil { - d.waitCh <- nil - } else { - d.waitCh <- xerrors.Errorf("%s: %w", buf.Bytes(), err) + if err != nil { + d.err = xerrors.Errorf("%s: %w", buf.Bytes(), err) } }() return nil } -func (d *Process) kill(sig syscall.Signal) error { - if d.cmd.OSProcess() == nil { - return xerrors.Errorf("cmd has not been started") - } - - atomic.StoreInt64(d.userKilled, 1) +// Wait waits for the process to exit, returning the error on the provided +// channel. +func (d *Process) Wait() error { + <-d.waitCh + return d.err +} - pid := d.cmd.OSProcess().Pid - err := d.cmd.OSProcess().Signal(sig) +// Run runs the command and waits for it to exit. It is a convenience +// function that combines both Start() and Wait(). 
+func (d *Process) Run() error { + err := d.Start() if err != nil { - return xerrors.Errorf("kill proc: %w", err) + return err } - ticker := time.NewTicker(time.Millisecond * 10) - defer ticker.Stop() - - fs := xunix.GetFS(d.ctx) + return d.Wait() +} - for { - // Try to find the process in the procfs. If we can't find - // it, it means the process has exited. It's also possible that - // we find the same PID but the cmd is different indicating the PID - // has been reused. - exited, err := isProcExited(fs, pid, d.binName) +func (d *Process) KillAndWait() error { + if atomic.CompareAndSwapInt64(d.userKilled, 0, 1) { + err := d.kill() if err != nil { - return xerrors.Errorf("is proc cmd: %w", err) - } - - if exited { - return nil - } - - select { - case <-d.ctx.Done(): - return d.ctx.Err() - case <-ticker.C: + return xerrors.Errorf("kill: %w", err) } } -} -// isProcExited checks if the provided PID has exited. It does this -// by attempting to read its entry in /proc/. If it can't find the -// entry then the process has exited. If the entry exists we check to see -// if the cmd is the same since it is possible (even if extremely unlikely) -// that the PID may be reclaimed and reused for a separate process. -func isProcExited(fs afero.Fs, pid int, cmd string) (bool, error) { - cmdline, err := afero.ReadFile(fs, fmt.Sprintf("/proc/%d/cmdline", pid)) - if xerrors.Is(err, os.ErrNotExist) { - return true, nil - } - if err != nil { - return false, xerrors.Errorf("read file: %w", err) - } + return d.Wait() +} - args := bytes.Split(cmdline, []byte{'0'}) - if len(args) < 1 { - // Honestly idk. - return false, xerrors.Errorf("cmdline has no output (%s)?", cmdline) +func (d *Process) kill() error { + if d.cmd.OSProcess() == nil { + return xerrors.Errorf("cmd has not been started") } - // If the cmd doesn't match then the PID has been reused for a different - // process indicating the proc we're looking for has successfully exited. - return cmd != string(args[0]), nil + atomic.StoreInt64(d.userKilled, 1) + return d.cmd.OSProcess().Kill() } func scanIntoLog(ctx context.Context, log slog.Logger, scanner *bufio.Scanner, binaryName string) { diff --git a/cli/docker.go b/cli/docker.go index f48ea64..6b6bbf9 100644 --- a/cli/docker.go +++ b/cli/docker.go @@ -7,16 +7,12 @@ import ( "io" "net/url" "os" - "path" "path/filepath" - "sort" - "strconv" "strings" + "sync" - "github.com/docker/docker/api/types/container" "github.com/google/go-containerregistry/pkg/name" "github.com/spf13/cobra" - "golang.org/x/exp/slices" "golang.org/x/xerrors" "cdr.dev/slog" @@ -24,6 +20,7 @@ import ( "github.com/coder/envbox/background" "github.com/coder/envbox/buildlog" "github.com/coder/envbox/cli/cliflag" + "github.com/coder/envbox/cvm" "github.com/coder/envbox/dockerutil" "github.com/coder/envbox/slogkubeterminate" "github.com/coder/envbox/sysboxutil" @@ -41,43 +38,8 @@ const ( ) const ( - defaultNetLink = "eth0" - defaultDockerBridge = "docker0" - // From https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts-technical-overview.html - awsWebIdentityTokenFilePath = "/var/run/secrets/eks.amazonaws.com/serviceaccount/token" //nolint - sysboxErrMsg = "Sysbox exited, possibly because of an unsupported kernel version. Please contact an infrastructure administrator and request a node kernel with seccomp API level >= 5." - - // noSpaceDataDir is the directory to use for the data directory - // for dockerd when the default directory (/var/lib/docker pointing - // to the user's pvc) is at capacity. 
This directory points to - // ephemeral storage allocated by the node and should be more likely - // to have capacity. - noSpaceDataDir = "/var/lib/docker.bak" - // noSpaceDockerDriver is the storage driver to use in cases where - // the default data dir (residing in the user's PVC) is at capacity. - // In such cases we must use the vfs storage driver because overlay2 - // does not work on top of overlay. - noSpaceDockerDriver = "vfs" - - OuterFUSEPath = "/tmp/coder-fuse" - InnerFUSEPath = "/dev/fuse" - - OuterTUNPath = "/tmp/coder-tun" - InnerTUNPath = "/dev/net/tun" - - InnerContainerName = "workspace_cvm" - - // Required for userns mapping. - // This is the ID of the user we apply in `envbox/Dockerfile`. - // - // There should be caution changing this value. - // Source directory permissions on the host are offset by this - // value. For example, folder `/home/coder` inside the container - // with UID/GID 1000 will be mapped to `UserNamespaceOffset` + 1000 - // on the host. Changing this value will result in improper mappings - // on existing containers. - UserNamespaceOffset = 100000 - devDir = "/dev" + defaultNetLink = "eth0" + sysboxErrMsg = "Sysbox exited, possibly because of an unsupported kernel version. Please contact an infrastructure administrator and request a node kernel with seccomp API level >= 5." ) var ( @@ -104,19 +66,6 @@ var ( EnvExtraCertsPath = "CODER_EXTRA_CERTS_PATH" ) -var envboxPrivateMounts = map[string]struct{}{ - "/var/lib/containers": {}, - "/var/lib/docker": {}, - "/var/lib/sysbox": {}, - "/lib/modules": {}, - "/usr/src": {}, - // /var/lib/coder is not technically a mount - // private to envbox but it is specially handled - // by sysbox so it does not require any effort - // on our part. - "/var/lib/coder": {}, -} - type flags struct { innerImage string innerUsername string @@ -155,14 +104,48 @@ func dockerCmd() *cobra.Command { Short: "Create a docker-based CVM", RunE: func(cmd *cobra.Command, args []string) (err error) { var ( - ctx = cmd.Context() - log = slog.Make(slogjson.Sink(cmd.ErrOrStderr()), slogkubeterminate.Make()).Leveled(slog.LevelDebug) - blog buildlog.Logger = buildlog.JSONLogger{Encoder: json.NewEncoder(os.Stderr)} + ctx = cmd.Context() + log = slog.Make( + slogjson.Sink(cmd.ErrOrStderr()), + slogkubeterminate.Make(), + ).Leveled(slog.LevelDebug) + cfg = cvm.Config{ + Username: flags.innerUsername, + AgentToken: flags.agentToken, + OSEnvs: os.Environ(), + + BuildLog: buildlog.JSONLogger{ + Encoder: json.NewEncoder(os.Stderr), + }, + InnerEnvs: strings.Split(flags.innerEnvs, ","), + WorkDir: flags.innerWorkDir, + Hostname: flags.innerHostname, + ImagePullSecret: flags.imagePullSecret, + CoderURL: flags.coderURL, + AddTUN: flags.addTUN, + AddFUSE: flags.addFUSE, + BoostrapScript: flags.boostrapScript, + DockerConfig: flags.dockerConfig, + CPUS: flags.cpus, + Memory: flags.memory, + GPUConfig: cvm.GPUConfig{ + HostUsrLibDir: flags.hostUsrLibDir, + }, + } ) + cfg.Mounts, err = parseMounts(flags.containerMounts) + if err != nil { + return xerrors.Errorf("parse mounts: %w", err) + } + + if flags.addGPU && flags.hostUsrLibDir == "" { + return xerrors.Errorf("when using GPUs, %q must be specified", EnvUsrLibDir) + } + if flags.noStartupLogs { log = slog.Make(slogjson.Sink(io.Discard)) - blog = buildlog.NopLogger{} + cfg.BuildLog = buildlog.NopLogger{} } httpClient, err := xhttp.Client(log, flags.extraCertsPath) @@ -182,17 +165,17 @@ func dockerCmd() *cobra.Command { // an inability to push build logs. 
log.Error(ctx, "failed to instantiate coder build log client, no logs will be pushed", slog.Error(err)) } else { - blog = buildlog.MultiLogger( + cfg.BuildLog = buildlog.MultiLogger( buildlog.OpenCoderLogger(ctx, agent, log), - blog, + cfg.BuildLog, ) } } - defer blog.Close() + defer cfg.BuildLog.Close() defer func(err *error) { if *err != nil { - blog.Errorf("Failed to run envbox: %v", *err) + cfg.BuildLog.Errorf("Failed to run envbox: %v", *err) } }(&err) @@ -205,12 +188,12 @@ func dockerCmd() *cobra.Command { select { // Start sysbox-mgr and sysbox-fs in order to run // sysbox containers. - case err := <-background.New(ctx, log, "sysbox-mgr", sysboxArgs...).Run(): - blog.Info(sysboxErrMsg) + case err := <-background.RunCh(ctx, log, "sysbox-mgr", sysboxArgs...): + cfg.BuildLog.Info(sysboxErrMsg) //nolint log.Fatal(ctx, "sysbox-mgr exited", slog.Error(err)) - case err := <-background.New(ctx, log, "sysbox-fs").Run(): - blog.Info(sysboxErrMsg) + case err := <-background.RunCh(ctx, log, "sysbox-fs"): + cfg.BuildLog.Info(sysboxErrMsg) //nolint log.Fatal(ctx, "sysbox-fs exited", slog.Error(err)) } @@ -222,21 +205,7 @@ func dockerCmd() *cobra.Command { log.Debug(ctx, "using custom docker bridge CIDR", slog.F("cidr", cidr)) } - dargs, err := dockerdArgs(flags.ethlink, cidr, false) - if err != nil { - return xerrors.Errorf("dockerd args: %w", err) - } - - log.Debug(ctx, "starting dockerd", slog.F("args", args)) - - blog.Info("Waiting for sysbox processes to startup...") - dockerd := background.New(ctx, log, "dockerd", dargs...) - err = dockerd.Start() - if err != nil { - return xerrors.Errorf("start dockerd: %w", err) - } - - log.Debug(ctx, "waiting for manager") + cfg.BuildLog.Info("Waiting for sysbox processes to startup...") err = sysboxutil.WaitForManager(ctx) if err != nil { @@ -248,39 +217,36 @@ func dockerCmd() *cobra.Command { return xerrors.Errorf("new docker client: %w", err) } + dockerd, err := dockerutil.StartDaemon(ctx, log, &dockerutil.DaemonOptions{ + Link: flags.ethlink, + CIDR: cidr, + Driver: "vfs", + }) + if err != nil { + return xerrors.Errorf("start dockerd: %w", err) + } + + mustRestartDockerd := mustRestartDockerd(ctx, log, cfg.BuildLog, dockerd, &dockerutil.DaemonOptions{ + Link: flags.ethlink, + CIDR: cidr, + Driver: "vfs", + }) + go func() { - err := <-dockerd.Wait() // It's possible the for the docker daemon to run out of disk // while trying to startup, in such cases we should restart // it and point it to an ephemeral directory. Since this // directory is going to be on top of an overlayfs filesystem // we have to use the vfs storage driver. - if xunix.IsNoSpaceErr(err) { - args, err = dockerdArgs(flags.ethlink, cidr, true) - if err != nil { - blog.Info("Failed to create Container-based Virtual Machine: " + err.Error()) - //nolint - log.Fatal(ctx, "dockerd exited, failed getting args for restart", slog.Error(err)) - } - - err = dockerd.Restart(ctx, "dockerd", args...) - if err != nil { - blog.Info("Failed to create Container-based Virtual Machine: " + err.Error()) - //nolint - log.Fatal(ctx, "restart dockerd", slog.Error(err)) - } - - err = <-dockerd.Wait() - } - - // It's possible lower down in the call stack to restart - // the docker daemon if we run out of disk while starting the - // container. 
- if err != nil && !xerrors.Is(err, background.ErrUserKilled) { - blog.Info("Failed to create Container-based Virtual Machine: " + err.Error()) + if !xunix.IsNoSpaceErr(err) { + cfg.BuildLog.Error("Failed to create Container-based Virtual Machine: " + err.Error()) //nolint log.Fatal(ctx, "dockerd exited", slog.Error(err)) } + cfg.BuildLog.Info("Insufficient space to start inner container. Restarting dockerd using the vfs driver. Your performance will be degraded. Clean up your home volume and then restart the workspace to improve performance.") + log.Debug(ctx, "encountered 'no space left on device' error while starting workspace", slog.Error(err)) + + mustRestartDockerd() }() log.Debug(ctx, "waiting for dockerd") @@ -288,33 +254,39 @@ func dockerCmd() *cobra.Command { // We wait for the daemon after spawning the goroutine in case // startup causes the daemon to encounter encounter a 'no space left // on device' error. - blog.Info("Waiting for dockerd to startup...") + cfg.BuildLog.Info("Waiting for dockerd to startup...") err = dockerutil.WaitForDaemon(ctx, client) if err != nil { return xerrors.Errorf("wait for dockerd: %w", err) } - if flags.extraCertsPath != "" { - // Parse the registry from the inner image - registry, err := name.ParseReference(flags.innerImage) - if err != nil { - return xerrors.Errorf("invalid image: %w", err) - } - registryName := registry.Context().RegistryStr() + tag, err := name.NewTag(flags.innerImage) + if err != nil { + return xerrors.Errorf("parse image: %w", err) + } + if flags.extraCertsPath != "" { + registryName := tag.RegistryStr() // Write certificates for the registry err = dockerutil.WriteCertsForRegistry(ctx, registryName, flags.extraCertsPath) if err != nil { return xerrors.Errorf("write certs for registry: %w", err) } - blog.Infof("Successfully copied certificates from %q to %q", flags.extraCertsPath, filepath.Join("/etc/docker/certs.d", registryName)) + cfg.BuildLog.Infof("Successfully copied certificates from %q to %q", flags.extraCertsPath, filepath.Join("/etc/docker/certs.d", registryName)) log.Debug(ctx, "wrote certificates for registry", slog.F("registry", registryName), slog.F("extra_certs_path", flags.extraCertsPath), ) } - err = runDockerCVM(ctx, log, client, blog, flags) + // Set our OOM score to something really unfavorable to avoid getting killed + // in memory-scarce scenarios. + err = xunix.SetOOMScore(ctx, "self", "-1000") + if err != nil { + return xerrors.Errorf("set oom score: %w", err) + } + + err = cvm.Run(ctx, log, xunix.NewLinuxOS(), client, cfg) if err != nil { // It's possible we failed because we ran out of disk while // pulling the image. We should restart the daemon and use @@ -322,34 +294,19 @@ func dockerCmd() *cobra.Command { // a user can access their workspace and try to delete whatever // is causing their disk to fill up. if xunix.IsNoSpaceErr(err) { - blog.Info("Insufficient space to start inner container. Restarting dockerd using the vfs driver. Your performance will be degraded. Clean up your home volume and then restart the workspace to improve performance.") + cfg.BuildLog.Info("Insufficient space to start inner container. Restarting dockerd using the vfs driver. Your performance will be degraded. 
Clean up your home volume and then restart the workspace to improve performance.") log.Debug(ctx, "encountered 'no space left on device' error while starting workspace", slog.Error(err)) - args, err := dockerdArgs(flags.ethlink, cidr, true) - if err != nil { - return xerrors.Errorf("dockerd args for restart: %w", err) - } - - log.Debug(ctx, "restarting dockerd", slog.F("args", args)) - - err = dockerd.Restart(ctx, "dockerd", args...) - if err != nil { - return xerrors.Errorf("restart dockerd: %w", err) - } - go func() { - err = <-dockerd.Wait() - blog.Errorf("restarted dockerd exited: %v", err) - //nolint - log.Fatal(ctx, "restarted dockerd exited", slog.Error(err)) - }() + + mustRestartDockerd() log.Debug(ctx, "reattempting container creation") - err = runDockerCVM(ctx, log, client, blog, flags) - } - if err != nil { - blog.Errorf("Failed to run envbox: %v", err) - return xerrors.Errorf("run: %w", err) + err = cvm.Run(ctx, log, xunix.NewLinuxOS(), client, cfg) } } + if err != nil { + cfg.BuildLog.Errorf("Failed to run envbox: %v", err) + return xerrors.Errorf("run: %w", err) + } return nil }, @@ -387,420 +344,6 @@ func dockerCmd() *cobra.Command { return cmd } -func runDockerCVM(ctx context.Context, log slog.Logger, client dockerutil.Client, blog buildlog.Logger, flags flags) error { - fs := xunix.GetFS(ctx) - - // Set our OOM score to something really unfavorable to avoid getting killed - // in memory-scarce scenarios. - err := xunix.SetOOMScore(ctx, "self", "-1000") - if err != nil { - return xerrors.Errorf("set oom score: %w", err) - } - ref, err := name.NewTag(flags.innerImage) - if err != nil { - return xerrors.Errorf("parse ref: %w", err) - } - - var dockerAuth dockerutil.AuthConfig - if flags.imagePullSecret != "" { - dockerAuth, err = dockerutil.AuthConfigFromString(flags.imagePullSecret, ref.RegistryStr()) - if err != nil { - return xerrors.Errorf("parse auth config: %w", err) - } - } - - log.Info(ctx, "checking for docker config file", slog.F("path", flags.dockerConfig)) - if _, err := fs.Stat(flags.dockerConfig); err == nil { - log.Info(ctx, "detected file", slog.F("image", flags.innerImage)) - dockerAuth, err = dockerutil.AuthConfigFromPath(flags.dockerConfig, ref.RegistryStr()) - if err != nil && !xerrors.Is(err, os.ErrNotExist) { - return xerrors.Errorf("auth config from file: %w", err) - } - } - - envs := defaultContainerEnvs(ctx, flags.agentToken) - - innerEnvsTokens := strings.Split(flags.innerEnvs, ",") - envs = append(envs, filterElements(xunix.Environ(ctx), innerEnvsTokens...)...) - - mounts := defaultMounts() - // Add any user-specified mounts to our mounts list. - extraMounts, err := parseMounts(flags.containerMounts) - if err != nil { - return xerrors.Errorf("read mounts: %w", err) - } - mounts = append(mounts, extraMounts...) 
- - log.Debug(ctx, "using mounts", slog.F("mounts", mounts)) - - devices := make([]container.DeviceMapping, 0, 2) - if flags.addTUN { - log.Debug(ctx, "creating TUN device", slog.F("path", OuterTUNPath)) - blog.Info("Creating TUN device") - dev, err := xunix.CreateTUNDevice(ctx, OuterTUNPath) - if err != nil { - return xerrors.Errorf("creat tun device: %w", err) - } - - devices = append(devices, container.DeviceMapping{ - PathOnHost: dev.Path, - PathInContainer: InnerTUNPath, - CgroupPermissions: "rwm", - }) - } - - if flags.addFUSE { - log.Debug(ctx, "creating FUSE device", slog.F("path", OuterFUSEPath)) - blog.Info("Creating FUSE device") - dev, err := xunix.CreateFuseDevice(ctx, OuterFUSEPath) - if err != nil { - return xerrors.Errorf("create fuse device: %w", err) - } - - devices = append(devices, container.DeviceMapping{ - PathOnHost: dev.Path, - PathInContainer: InnerFUSEPath, - CgroupPermissions: "rwm", - }) - } - - log.Debug(ctx, "using devices", slog.F("devices", devices)) - - // ID shift the devices so that they reflect the root user - // inside the container. - for _, device := range devices { - log.Debug(ctx, "chowning device", - slog.F("device", device.PathOnHost), - slog.F("uid", UserNamespaceOffset), - slog.F("gid", UserNamespaceOffset), - ) - err = fs.Chown(device.PathOnHost, UserNamespaceOffset, UserNamespaceOffset) - if err != nil { - return xerrors.Errorf("chown device %q: %w", device.PathOnHost, err) - } - } - - log.Debug(ctx, "pulling image", slog.F("image", flags.innerImage)) - - err = dockerutil.PullImage(ctx, &dockerutil.PullImageConfig{ - Client: client, - Image: flags.innerImage, - Auth: dockerAuth, - ProgressFn: dockerutil.DefaultLogImagePullFn(blog), - }) - if err != nil { - return xerrors.Errorf("pull image: %w", err) - } - - log.Debug(ctx, "remounting /sys") - - // After image pull we remount /sys so sysbox can have appropriate perms to create a container. - err = xunix.MountFS(ctx, "/sys", "/sys", "", "remount", "rw") - if err != nil { - return xerrors.Errorf("remount /sys: %w", err) - } - - if flags.addGPU { - if flags.hostUsrLibDir == "" { - return xerrors.Errorf("when using GPUs, %q must be specified", EnvUsrLibDir) - } - - // Unmount GPU drivers in /proc as it causes issues when creating any - // container in some cases (even the image metadata container). - _, err = xunix.TryUnmountProcGPUDrivers(ctx, log) - if err != nil { - return xerrors.Errorf("unmount /proc GPU drivers: %w", err) - } - } - - log.Debug(ctx, "fetching image metadata", - slog.F("image", flags.innerImage), - slog.F("username", flags.innerUsername), - ) - - blog.Info("Getting image metadata...") - // Get metadata about the image. We need to know things like the UID/GID - // of the user so that we can chown directories to the namespaced UID inside - // the inner container as well as whether we should be starting the container - // with /sbin/init or something simple like 'sleep infinity'. 
- imgMeta, err := dockerutil.GetImageMetadata(ctx, client, flags.innerImage, flags.innerUsername) - if err != nil { - return xerrors.Errorf("get image metadata: %w", err) - } - - blog.Infof("Detected entrypoint user '%s:%s' with home directory %q", imgMeta.UID, imgMeta.UID, imgMeta.HomeDir) - - log.Debug(ctx, "fetched image metadata", - slog.F("uid", imgMeta.UID), - slog.F("gid", imgMeta.GID), - slog.F("has_init", imgMeta.HasInit), - ) - - uid, err := strconv.ParseInt(imgMeta.UID, 10, 32) - if err != nil { - return xerrors.Errorf("parse image uid: %w", err) - } - gid, err := strconv.ParseInt(imgMeta.GID, 10, 32) - if err != nil { - return xerrors.Errorf("parse image gid: %w", err) - } - - for _, m := range mounts { - // Don't modify anything private to envbox. - if isPrivateMount(m) { - continue - } - - log.Debug(ctx, "chmod'ing directory", - slog.F("path", m.Source), - slog.F("mode", "02755"), - ) - - // If a mount is read-only we have to remount it rw so that we - // can id shift it correctly. We'll still mount it read-only into - // the inner container. - if m.ReadOnly { - mounter := xunix.Mounter(ctx) - err := mounter.Mount("", m.Source, "", []string{"remount,rw"}) - if err != nil { - return xerrors.Errorf("remount: %w", err) - } - } - - err := fs.Chmod(m.Source, 0o2755) - if err != nil { - return xerrors.Errorf("chmod mountpoint %q: %w", m.Source, err) - } - - var ( - shiftedUID = shiftedID(0) - shiftedGID = shiftedID(0) - ) - - if isHomeDir(m.Source) { - // We want to ensure that the inner directory is ID shifted to - // the namespaced UID of the user in the inner container otherwise - // they won't be able to write files. - shiftedUID = shiftedID(int(uid)) - shiftedGID = shiftedID(int(gid)) - } - - log.Debug(ctx, "chowning mount", - slog.F("source", m.Source), - slog.F("target", m.Mountpoint), - slog.F("uid", shiftedUID), - slog.F("gid", shiftedGID), - ) - - // Any non-home directory we assume should be owned by id-shifted root - // user. - err = fs.Chown(m.Source, shiftedUID, shiftedGID) - if err != nil { - return xerrors.Errorf("chown mountpoint %q: %w", m.Source, err) - } - } - - if flags.addGPU { - devs, binds, err := xunix.GPUs(ctx, log, flags.hostUsrLibDir) - if err != nil { - return xerrors.Errorf("find gpus: %w", err) - } - - for _, dev := range devs { - devices = append(devices, container.DeviceMapping{ - PathOnHost: dev.Path, - PathInContainer: dev.Path, - CgroupPermissions: "rwm", - }) - } - - for _, bind := range binds { - // If the bind has a path that points to the host-mounted /usr/lib - // directory we need to remap it to /usr/lib inside the container. - mountpoint := bind.Path - if strings.HasPrefix(mountpoint, flags.hostUsrLibDir) { - mountpoint = filepath.Join( - "/usr/lib", - strings.TrimPrefix(mountpoint, strings.TrimSuffix(flags.hostUsrLibDir, "/")), - ) - } - mounts = append(mounts, xunix.Mount{ - Source: bind.Path, - Mountpoint: mountpoint, - ReadOnly: slices.Contains(bind.Opts, "ro"), - }) - } - envs = append(envs, xunix.GPUEnvs(ctx)...) - } - - blog.Info("Creating workspace...") - - // Create the inner container. 
- containerID, err := dockerutil.CreateContainer(ctx, client, &dockerutil.ContainerConfig{ - Log: log, - Mounts: mounts, - Devices: devices, - Envs: envs, - Name: InnerContainerName, - Hostname: flags.innerHostname, - WorkingDir: flags.innerWorkDir, - HasInit: imgMeta.HasInit, - Image: flags.innerImage, - CPUs: int64(flags.cpus), - MemoryLimit: int64(flags.memory), - }) - if err != nil { - return xerrors.Errorf("create container: %w", err) - } - - blog.Info("Pruning images to free up disk...") - // Prune images to avoid taking up any unnecessary disk from the user. - _, err = dockerutil.PruneImages(ctx, client) - if err != nil { - return xerrors.Errorf("prune images: %w", err) - } - - // TODO fix iptables when istio detected. - - blog.Info("Starting up workspace...") - err = client.ContainerStart(ctx, containerID, container.StartOptions{}) - if err != nil { - return xerrors.Errorf("start container: %w", err) - } - - log.Debug(ctx, "creating bootstrap directory", slog.F("directory", imgMeta.HomeDir)) - - // Create the directory to which we will download the agent. - // We create this directory because the default behavior is - // to download the agent to /tmp/coder.XXXX. This causes a race to happen - // where we finish downloading the binary but before we can execute - // systemd remounts /tmp. - bootDir := filepath.Join(imgMeta.HomeDir, ".coder") - - blog.Infof("Creating %q directory to host Coder assets...", bootDir) - _, err = dockerutil.ExecContainer(ctx, client, dockerutil.ExecConfig{ - ContainerID: containerID, - User: imgMeta.UID, - Cmd: "mkdir", - Args: []string{"-p", bootDir}, - }) - if err != nil { - return xerrors.Errorf("make bootstrap dir: %w", err) - } - - cpuQuota, err := xunix.ReadCPUQuota(ctx, log) - if err != nil { - blog.Infof("Unable to read CPU quota: %s", err.Error()) - } else { - log.Debug(ctx, "setting CPU quota", - slog.F("quota", cpuQuota.Quota), - slog.F("period", cpuQuota.Period), - slog.F("cgroup", cpuQuota.CGroup.String()), - ) - - // We want the inner container to have the same limits as the outer container - // so that processes inside the container know what they're working with. - if err := dockerutil.SetContainerQuota(ctx, containerID, cpuQuota); err != nil { - blog.Infof("Unable to set quota for inner container: %s", err.Error()) - blog.Info("This is not a fatal error, but it may cause cgroup-aware applications to misbehave.") - } - } - - blog.Info("Envbox startup complete!") - - // The bootstrap script doesn't return since it execs the agent - // meaning that it can get pretty noisy if we were to log by default. - // In order to allow users to discern issues getting the bootstrap script - // to complete successfully we pipe the output to stdout if - // CODER_DEBUG=true. - debugWriter := io.Discard - if flags.debug { - debugWriter = os.Stdout - } - // Bootstrap the container if a script has been provided. - blog.Infof("Bootstrapping workspace...") - err = dockerutil.BootstrapContainer(ctx, client, dockerutil.BootstrapConfig{ - ContainerID: containerID, - User: imgMeta.UID, - Script: flags.boostrapScript, - // We set this because the default behavior is to download the agent - // to /tmp/coder.XXXX. This causes a race to happen where we finish - // downloading the binary but before we can execute systemd remounts - // /tmp. 
- Env: []string{fmt.Sprintf("BINARY_DIR=%s", bootDir)}, - StdOutErr: debugWriter, - }) - if err != nil { - return xerrors.Errorf("boostrap container: %w", err) - } - - return nil -} - -//nolint:revive -func dockerdArgs(link, cidr string, isNoSpace bool) ([]string, error) { - // We need to adjust the MTU for the host otherwise packets will fail delivery. - // 1500 is the standard, but certain deployments (like GKE) use custom MTU values. - // See: https://www.atlantis-press.com/journals/ijndc/125936177/view#sec-s3.1 - - mtu, err := xunix.NetlinkMTU(link) - if err != nil { - return nil, xerrors.Errorf("custom mtu: %w", err) - } - - // We set the Docker Bridge IP explicitly here for a number of reasons: - // 1) It sometimes picks the 172.17.x.x address which conflicts with that of the Docker daemon in the inner container. - // 2) It defaults to a /16 network which is way more than we need for envbox. - // 3) The default may conflict with existing internal network resources, and an operator may wish to override it. - dockerBip, prefixLen := dockerutil.BridgeIPFromCIDR(cidr) - - args := []string{ - "--debug", - "--log-level=debug", - fmt.Sprintf("--mtu=%d", mtu), - "--userns-remap=coder", - "--storage-driver=overlay2", - fmt.Sprintf("--bip=%s/%d", dockerBip, prefixLen), - } - - if isNoSpace { - args = append(args, - fmt.Sprintf("--data-root=%s", noSpaceDataDir), - fmt.Sprintf("--storage-driver=%s", noSpaceDockerDriver), - ) - } - - return args, nil -} - -// TODO This is bad code. -func filterElements(ss []string, filters ...string) []string { - filtered := make([]string, 0, len(ss)) - for _, f := range filters { - f = strings.TrimSpace(f) - for _, s := range ss { - toks := strings.Split(s, "=") - if len(toks) < 2 { - // Malformed environment variable. - continue - } - - key := toks[0] - - if strings.HasSuffix(f, "*") { - filter := strings.TrimSuffix(f, "*") - if strings.HasPrefix(key, filter) { - filtered = append(filtered, s) - } - } else if key == f { - filtered = append(filtered, s) - } - } - } - - return filtered -} - // parseMounts parses a list of mounts from containerMounts. The format should // be "src:dst[:ro],src:dst[:ro]". func parseMounts(containerMounts string) ([]xunix.Mount, error) { @@ -829,79 +372,23 @@ func parseMounts(containerMounts string) ([]xunix.Mount, error) { return mounts, nil } -// defaultContainerEnvs returns environment variables that should always -// be passed to the inner container. -func defaultContainerEnvs(ctx context.Context, agentToken string) []string { - const agentSubsystemEnv = "CODER_AGENT_SUBSYSTEM" - env := xunix.Environ(ctx) - existingSubsystem := "" - for _, e := range env { - if strings.HasPrefix(e, agentSubsystemEnv+"=") { - existingSubsystem = strings.TrimPrefix(e, agentSubsystemEnv+"=") - break +func mustRestartDockerd(ctx context.Context, log slog.Logger, blog buildlog.Logger, dockerd *background.Process, options *dockerutil.DaemonOptions) func() { + return sync.OnceFunc(func() { + err := dockerd.KillAndWait() + if err != nil { + log.Error(ctx, "failed to kill dockerd", slog.Error(err)) } - } - // We should append to the existing agent subsystem if it exists. 
- agentSubsystem := "envbox" - if existingSubsystem != "" { - split := strings.Split(existingSubsystem, ",") - split = append(split, "envbox") - - tidy := make([]string, 0, len(split)) - seen := make(map[string]struct{}) - for _, s := range split { - s := strings.TrimSpace(s) - if _, ok := seen[s]; s == "" || ok { - continue - } - seen[s] = struct{}{} - tidy = append(tidy, s) + dockerd, err = dockerutil.StartDaemon(ctx, log, options) + if err != nil { + log.Fatal(ctx, "failed to start dockerd", slog.Error(err)) } - sort.Strings(tidy) - agentSubsystem = strings.Join(tidy, ",") - } - - return []string{ - fmt.Sprintf("%s=%s", EnvAgentToken, agentToken), - fmt.Sprintf("%s=%s", "CODER_AGENT_SUBSYSTEM", agentSubsystem), - } -} - -// defaultMounts are bind mounts that are always provided to the inner -// container. -func defaultMounts() []xunix.Mount { - return []xunix.Mount{ - { - Source: "/var/lib/coder/docker", - Mountpoint: "/var/lib/docker", - }, - { - Source: "/var/lib/coder/containers", - Mountpoint: "/var/lib/containers", - }, - } -} - -// isPrivateMount returns true if the provided mount points to a mount -// private to the envbox container itself. -func isPrivateMount(m xunix.Mount) bool { - _, ok := envboxPrivateMounts[m.Mountpoint] - return ok -} - -func isHomeDir(fpath string) bool { - if fpath == "/root" { - return true - } - - dir, _ := path.Split(fpath) - return dir == "/home/" -} - -// shiftedID returns the ID but shifted to the user namespace offset we -// use for the inner container. -func shiftedID(id int) int { - return id + UserNamespaceOffset + go func() { + err := dockerd.Wait() + blog.Errorf("restarted dockerd exited: %v", err) + //nolint + log.Fatal(ctx, "restarted dockerd exited", slog.Error(err)) + }() + }) } diff --git a/cli/docker_test.go b/cli/docker_test.go index 88e9bc3..a216ed0 100644 --- a/cli/docker_test.go +++ b/cli/docker_test.go @@ -1,32 +1,15 @@ package cli_test import ( - "bufio" - "bytes" - "context" - "encoding/base64" "fmt" - "io" - "net" - "os" - "path/filepath" - "strings" "testing" - dockertypes "github.com/docker/docker/api/types" - "github.com/docker/docker/api/types/container" - "github.com/docker/docker/api/types/image" - "github.com/docker/docker/api/types/network" - v1 "github.com/opencontainers/image-spec/specs-go/v1" "github.com/spf13/afero" "github.com/stretchr/testify/require" - "k8s.io/mount-utils" testingexec "k8s.io/utils/exec/testing" "github.com/coder/envbox/cli" "github.com/coder/envbox/cli/clitest" - "github.com/coder/envbox/dockerutil" - "github.com/coder/envbox/xunix" "github.com/coder/envbox/xunix/xunixfake" ) @@ -64,98 +47,6 @@ func TestDocker(t *testing.T) { execer.AssertCommandsCalled(t) }) - t.Run("Images", func(t *testing.T) { - t.Parallel() - - type testcase struct { - name string - image string - success bool - } - - testcases := []testcase{ - { - name: "Repository", - image: "ubuntu", - success: true, - }, - { - name: "RepositoryPath", - image: "ubuntu/ubuntu", - success: true, - }, - - { - name: "RepositoryLatest", - image: "ubuntu:latest", - success: true, - }, - { - name: "RepositoryTag", - image: "ubuntu:24.04", - success: true, - }, - { - name: "RepositoryPathTag", - image: "ubuntu/ubuntu:18.04", - success: true, - }, - { - name: "RegistryRepository", - image: "gcr.io/ubuntu", - success: true, - }, - { - name: "RegistryRepositoryTag", - image: "gcr.io/ubuntu:24.04", - success: true, - }, - } - - for _, tc := range testcases { - tc := tc - t.Run(tc.name, func(t *testing.T) { - t.Parallel() - - ctx, cmd := 
clitest.New(t, "docker", - "--image="+tc.image, - "--username=root", - "--agent-token=hi", - ) - - called := make(chan struct{}) - execer := clitest.Execer(ctx) - client := clitest.DockerClient(t, ctx) - execer.AddCommands(&xunixfake.FakeCmd{ - FakeCmd: &testingexec.FakeCmd{ - Argv: []string{ - "sysbox-mgr", - }, - }, - WaitFn: func() error { close(called); select {} }, //nolint:revive - }) - - var created bool - client.ContainerCreateFn = func(_ context.Context, conf *container.Config, _ *container.HostConfig, _ *network.NetworkingConfig, _ *v1.Platform, _ string) (container.CreateResponse, error) { - created = true - require.Equal(t, tc.image, conf.Image) - return container.CreateResponse{}, nil - } - - err := cmd.ExecuteContext(ctx) - if !tc.success { - require.Error(t, err) - return - } - - <-called - require.NoError(t, err) - require.True(t, created, "container create fn not called") - execer.AssertCommandsCalled(t) - }) - } - }) - // Test that dockerd is configured correctly. t.Run("DockerdConfigured", func(t *testing.T) { t.Parallel() @@ -214,324 +105,6 @@ func TestDocker(t *testing.T) { require.Equal(t, []byte("-1000"), score) }) - // Test that user-provided env vars are passed through. - // It is valid to specify a wildcard so that all matching - // env vars are passed through. - t.Run("PassesThroughEnvVars", func(t *testing.T) { - t.Parallel() - var ( - cntEnvs = []string{ - "FOO", - "CODER_VAR", - "bar", - // Test that wildcard works. - "KUBERNETES_*", - "US_*", - } - - expectedEnvs = []string{ - "CODER_AGENT_TOKEN=hi", - "CODER_AGENT_SUBSYSTEM=envbox,exectrace", // sorted - "FOO=bar", - "CODER_VAR=baz", - "bar=123", - "KUBERNETES_SERVICE_HOST=10.0.0.1", - "KUBERNETES_PORT=tcp://10.0.0.1:443", - "KUBERNETES_PORT_443_TCP_PORT=443", - } - - osEnvs = (append([]string{ - "USER=root", - "USA=yay", - "HOME=/root", - "PATH=/usr/bin:/sbin:/bin", - // Envbox should add to this. - "CODER_AGENT_SUBSYSTEM=exectrace", - // Don't include the wildcards. - }, expectedEnvs...)) - ) - - ctx, cmd := clitest.New(t, "docker", - "docker", - "--image=ubuntu", - "--username=root", - "--agent-token=hi", - fmt.Sprintf("--envs=%s", strings.Join(cntEnvs, ",")), - ) - - ctx = xunix.WithEnvironFn(ctx, func() []string { return osEnvs }) - - client := clitest.DockerClient(t, ctx) - var called bool - client.ContainerCreateFn = func(_ context.Context, config *container.Config, _ *container.HostConfig, _ *network.NetworkingConfig, _ *v1.Platform, containerName string) (container.CreateResponse, error) { - if containerName == cli.InnerContainerName { - called = true - require.ElementsMatch(t, expectedEnvs, config.Env) - } - return container.CreateResponse{}, nil - } - - err := cmd.ExecuteContext(ctx) - require.NoError(t, err) - require.True(t, called, "create function was not called") - }) - - // Test that we parse mounts correctly. - t.Run("Mounts", func(t *testing.T) { - t.Parallel() - - var ( - userMounts = []string{"/home/coder:/home/coder", "/etc/hosts:/etc/hosts:ro", "/etc/hostname:/idc/where:ro", "/usr/src:/a/b/c"} - expectedMounts = append([]string{"/var/lib/coder/docker:/var/lib/docker", "/var/lib/coder/containers:/var/lib/containers"}, userMounts...) 
- ) - ctx, cmd := clitest.New(t, "docker", - "docker", - "--image=ubuntu", - "--username=root", - "--agent-token=hi", - fmt.Sprintf("--mounts=%s", strings.Join(userMounts, ",")), - ) - - var ( - client = clitest.DockerClient(t, ctx) - fs = clitest.FS(ctx) - ) - - for _, mount := range userMounts { - src := strings.Split(mount, ":")[0] - - err := afero.WriteFile(fs, src, []byte("hi"), 0o777) - require.NoError(t, err) - } - - // Set the exec response from inspecting the image to some ID - // greater than 0. - client.ContainerExecAttachFn = func(_ context.Context, _ string, _ dockertypes.ExecStartCheck) (dockertypes.HijackedResponse, error) { - return dockertypes.HijackedResponse{ - Reader: bufio.NewReader(strings.NewReader("root:x:1001:1001:root:/root:/bin/bash")), - Conn: &net.IPConn{}, - }, nil - } - - var called bool - client.ContainerCreateFn = func(_ context.Context, _ *container.Config, hostConfig *container.HostConfig, _ *network.NetworkingConfig, _ *v1.Platform, containerName string) (container.CreateResponse, error) { - if containerName == cli.InnerContainerName { - called = true - require.Equal(t, expectedMounts, hostConfig.Binds) - } - - return container.CreateResponse{}, nil - } - - err := cmd.ExecuteContext(ctx) - require.NoError(t, err) - require.True(t, called, "container create fn not called") - - fi, err := fs.Stat("/home/coder") - require.NoError(t, err) - require.Equal(t, os.FileMode(0o755), fi.Mode().Perm()) - // Check that we're calling chown and shifting the ID. - owner, ok := fs.GetFileOwner("/home/coder") - require.True(t, ok) - require.Equal(t, cli.UserNamespaceOffset+1001, owner.UID) - require.Equal(t, cli.UserNamespaceOffset+1001, owner.GID) - }) - - // Test that we remount /sys once we pull the image so that - // sysbox can use it properly. - t.Run("RemountSysfs", func(t *testing.T) { - t.Parallel() - - ctx, cmd := clitest.New(t, "docker", - "--image=ubuntu", - "--username=root", - "--agent-token=hi", - ) - - mounter := clitest.Mounter(ctx) - - err := cmd.ExecuteContext(ctx) - require.NoError(t, err) - - actions := mounter.GetLog() - require.Len(t, actions, 1) - action := actions[0] - require.Equal(t, "mount", action.Action) - require.Equal(t, "", action.FSType) - require.Equal(t, "/sys", action.Source) - require.Equal(t, "/sys", action.Target) - }) - - // Test that devices are created and passed through to the docker - // daemon. 
- t.Run("Devices", func(t *testing.T) { - t.Parallel() - - ctx, cmd := clitest.New(t, "docker", - "docker", - "--image=ubuntu", - "--username=root", - "--agent-token=hi", - "--add-tun", - "--add-fuse", - ) - - var ( - client = clitest.DockerClient(t, ctx) - fs = clitest.FS(ctx) - expectedDevices = []container.DeviceMapping{ - { - PathOnHost: cli.OuterTUNPath, - PathInContainer: cli.InnerTUNPath, - CgroupPermissions: "rwm", - }, - { - PathOnHost: cli.OuterFUSEPath, - PathInContainer: cli.InnerFUSEPath, - CgroupPermissions: "rwm", - }, - } - ) - - var called bool - client.ContainerCreateFn = func(_ context.Context, _ *container.Config, hostConfig *container.HostConfig, _ *network.NetworkingConfig, _ *v1.Platform, containerName string) (container.CreateResponse, error) { - if containerName == cli.InnerContainerName { - called = true - require.Equal(t, expectedDevices, hostConfig.Devices) - } - - return container.CreateResponse{}, nil - } - - err := cmd.ExecuteContext(ctx) - require.NoError(t, err) - require.True(t, called, "container create fn not called") - - // Check that we're calling chown and shifting the ID to - // it maps to root of the inner container. - owner, ok := fs.GetFileOwner(cli.OuterFUSEPath) - require.True(t, ok) - require.Equal(t, cli.UserNamespaceOffset, owner.UID) - require.Equal(t, cli.UserNamespaceOffset, owner.GID) - - owner, ok = fs.GetFileOwner(cli.OuterTUNPath) - require.True(t, ok) - require.Equal(t, cli.UserNamespaceOffset, owner.UID) - require.Equal(t, cli.UserNamespaceOffset, owner.GID) - }) - - // Tests that 'sleep infinity' is used if /sbin/init - // isn't detected. - t.Run("NoInit", func(t *testing.T) { - t.Parallel() - - ctx, cmd := clitest.New(t, "docker", - "docker", - "--image=ubuntu", - "--username=root", - "--agent-token=hi", - "--add-tun", - "--add-fuse", - ) - - var ( - client = clitest.DockerClient(t, ctx) - statExecID = "hi" - ) - - client.ContainerExecCreateFn = func(_ context.Context, _ string, config dockertypes.ExecConfig) (dockertypes.IDResponse, error) { - if config.Cmd[0] == "stat" { - return dockertypes.IDResponse{ - ID: statExecID, - }, nil - } - return dockertypes.IDResponse{}, nil - } - - // Set the exec response from inspecting the image to some ID - // greater than 0. 
- client.ContainerExecInspectFn = func(_ context.Context, execID string) (dockertypes.ContainerExecInspect, error) { - if execID == statExecID { - return dockertypes.ContainerExecInspect{ExitCode: 1}, nil - } - - return dockertypes.ContainerExecInspect{}, nil - } - - var called bool - client.ContainerCreateFn = func(_ context.Context, config *container.Config, _ *container.HostConfig, _ *network.NetworkingConfig, _ *v1.Platform, containerName string) (container.CreateResponse, error) { - if containerName == cli.InnerContainerName { - called = true - require.Equal(t, []string{"sleep", "infinity"}, []string(config.Entrypoint)) - } - - return container.CreateResponse{}, nil - } - - err := cmd.ExecuteContext(ctx) - require.NoError(t, err) - require.True(t, called, "container create fn not called") - }) - - t.Run("DockerAuth", func(t *testing.T) { - t.Parallel() - - ctx, cmd := clitest.New(t, "docker", - "--image=us.gcr.io/ubuntu", - "--username=root", - "--agent-token=hi", - fmt.Sprintf("--image-secret=%s", rawDockerAuth), - ) - - raw := []byte(`{"username":"_json_key","password":"{\"type\": \"service_account\", \"project_id\": \"some-test\", \"private_key_id\": \"blahblah\", \"private_key\": \"-----BEGIN PRIVATE KEY-----mykey-----END PRIVATE KEY-----\", \"client_email\": \"test@test.iam.gserviceaccount.com\", \"client_id\": \"123\", \"auth_uri\": \"https://accounts.google.com/o/oauth2/auth\", \"token_uri\": \"https://oauth2.googleapis.com/token\", \"auth_provider_x509_cert_url\": \"https://www.googleapis.com/oauth2/v1/certs\", \"client_x509_cert_url\": \"https://www.googleapis.com/robot/v1/metadata/x509/test.iam.gserviceaccount.com\" }"}`) - authB64 := base64.URLEncoding.EncodeToString(raw) - - client := clitest.DockerClient(t, ctx) - client.ImagePullFn = func(_ context.Context, _ string, options image.PullOptions) (io.ReadCloser, error) { - // Assert that we call the image pull function with the credentials. - require.Equal(t, authB64, options.RegistryAuth) - return io.NopCloser(bytes.NewReader(nil)), nil - } - - err := cmd.ExecuteContext(ctx) - require.NoError(t, err) - }) - - t.Run("SetsResources", func(t *testing.T) { - t.Parallel() - - const ( - // 4GB. 
- memory = 4 << 30 - cpus = 6 - ) - - ctx, cmd := clitest.New(t, "docker", - "--image=ubuntu", - "--username=root", - "--agent-token=hi", - fmt.Sprintf("--cpus=%d", cpus), - fmt.Sprintf("--memory=%d", memory), - ) - - var called bool - client := clitest.DockerClient(t, ctx) - client.ContainerCreateFn = func(_ context.Context, _ *container.Config, hostConfig *container.HostConfig, _ *network.NetworkingConfig, _ *v1.Platform, containerName string) (container.CreateResponse, error) { - if containerName == cli.InnerContainerName { - called = true - require.Equal(t, int64(memory), hostConfig.Memory) - require.Equal(t, int64(cpus*dockerutil.DefaultCPUPeriod), hostConfig.CPUQuota) - require.Equal(t, int64(dockerutil.DefaultCPUPeriod), hostConfig.CPUPeriod) - } - - return container.CreateResponse{}, nil - } - - err := cmd.ExecuteContext(ctx) - require.NoError(t, err) - require.True(t, called, "create function was not called for inner container") - }) - t.Run("GPUNoUsrLibDir", func(t *testing.T) { t.Parallel() @@ -547,165 +120,6 @@ func TestDocker(t *testing.T) { require.ErrorContains(t, err, fmt.Sprintf("when using GPUs, %q must be specified", cli.EnvUsrLibDir)) }) - t.Run("GPU", func(t *testing.T) { - t.Parallel() - - ctx, cmd := clitest.New(t, "docker", - "--image=ubuntu", - "--username=root", - "--agent-token=hi", - "--add-gpu=true", - "--usr-lib-dir=/var/coder/usr/lib", - ) - - var ( - mounter = clitest.Mounter(ctx) - afs = clitest.FS(ctx) - - procGPUDrivers = []string{ - "/proc/vulkan/foo", - "/proc/nvidia/bar", - "/proc/cuda/baz", - } - - // This path intentionally has a trailing '/' to ensure we are - // trimming correctly when remapping host-mounted /usr/lib dirs to - // /usr/lib inside the container. - usrLibMountpoint = "/var/coder/usr/lib/" - // expectedUsrLibFiles are files that we expect to be returned as bind mounts. - expectedUsrLibFiles = []string{ - filepath.Join(usrLibMountpoint, "nvidia", "libglxserver_nvidia.so"), - filepath.Join(usrLibMountpoint, "libnvidia-ml.so"), - } - expectedEnvs = []string{ - "NVIDIA_TEST=1", - "TEST_NVIDIA=1", - "nvidia_test=1", - } - ) - - environ := func() []string { - return append( - []string{ - "LIBGL_TEST=1", - "VULKAN_TEST=1", - }, expectedEnvs...) - } - - ctx = xunix.WithEnvironFn(ctx, environ) - - // Fake all the files. - for _, file := range append(expectedUsrLibFiles, procGPUDrivers...) { - _, err := afs.Create(file) - require.NoError(t, err) - } - - mounter.MountPoints = []mount.MountPoint{ - { - Device: "/dev/sda1", - Path: "/usr/local/nvidia", - Opts: []string{"rw"}, - }, - { - Device: "/dev/sda2", - Path: "/etc/hosts", - }, - { - Path: "/dev/nvidia0", - }, - { - Path: "/dev/nvidia1", - }, - } - - for _, driver := range procGPUDrivers { - mounter.MountPoints = append(mounter.MountPoints, mount.MountPoint{ - Path: driver, - }) - } - - _, err := afs.Create("/usr/local/nvidia") - require.NoError(t, err) - - unmounts := []string{} - mounter.UnmountFunc = func(path string) error { - unmounts = append(unmounts, path) - return nil - } - - var called bool - client := clitest.DockerClient(t, ctx) - client.ContainerCreateFn = func(_ context.Context, config *container.Config, hostConfig *container.HostConfig, _ *network.NetworkingConfig, _ *v1.Platform, containerName string) (container.CreateResponse, error) { - if containerName == cli.InnerContainerName { - called = true - // Test that '/dev' mounts are passed as devices. 
- require.Contains(t, hostConfig.Devices, container.DeviceMapping{ - PathOnHost: "/dev/nvidia0", - PathInContainer: "/dev/nvidia0", - CgroupPermissions: "rwm", - }) - require.Contains(t, hostConfig.Devices, container.DeviceMapping{ - PathOnHost: "/dev/nvidia1", - PathInContainer: "/dev/nvidia1", - CgroupPermissions: "rwm", - }) - - // Test that the mountpoint that we provided that is not under - // '/dev' is passed as a bind mount. - require.Contains(t, hostConfig.Binds, fmt.Sprintf("%s:%s", "/usr/local/nvidia", "/usr/local/nvidia")) - - // Test that host /usr/lib bind mounts were passed through as read-only. - for _, file := range expectedUsrLibFiles { - require.Contains(t, hostConfig.Binds, fmt.Sprintf("%s:%s:ro", - file, - strings.ReplaceAll(file, usrLibMountpoint, "/usr/lib/"), - )) - } - - // Test that we captured the GPU-related env vars. - for _, env := range expectedEnvs { - require.Contains(t, config.Env, env) - } - } - - return container.CreateResponse{}, nil - } - - err = cmd.ExecuteContext(ctx) - require.NoError(t, err) - require.True(t, called, "create function was not called for inner container") - // Assert that we unmounted /proc GPU drivers. - for _, driver := range procGPUDrivers { - require.Contains(t, unmounts, driver) - } - }) - - t.Run("Hostname", func(t *testing.T) { - t.Parallel() - - ctx, cmd := clitest.New(t, "docker", - "--image=ubuntu", - "--username=root", - "--agent-token=hi", - "--hostname=hello-world", - ) - - var called bool - client := clitest.DockerClient(t, ctx) - client.ContainerCreateFn = func(_ context.Context, config *container.Config, _ *container.HostConfig, _ *network.NetworkingConfig, _ *v1.Platform, containerName string) (container.CreateResponse, error) { - if containerName == cli.InnerContainerName { - called = true - require.Equal(t, "hello-world", config.Hostname) - } - - return container.CreateResponse{}, nil - } - - err := cmd.ExecuteContext(ctx) - require.NoError(t, err) - require.True(t, called, "container create not called") - }) - t.Run("DisableIDMappedMounts", func(t *testing.T) { t.Parallel() @@ -734,7 +148,3 @@ func TestDocker(t *testing.T) { execer.AssertCommandsCalled(t) }) } - -// rawDockerAuth is sample input for a kubernetes secret to a gcr.io private -// registry. 
-const rawDockerAuth = `{"auths":{"us.gcr.io":{"username":"_json_key","password":"{\"type\": \"service_account\", \"project_id\": \"some-test\", \"private_key_id\": \"blahblah\", \"private_key\": \"-----BEGIN PRIVATE KEY-----mykey-----END PRIVATE KEY-----\", \"client_email\": \"test@test.iam.gserviceaccount.com\", \"client_id\": \"123\", \"auth_uri\": \"https://accounts.google.com/o/oauth2/auth\", \"token_uri\": \"https://oauth2.googleapis.com/token\", \"auth_provider_x509_cert_url\": \"https://www.googleapis.com/oauth2/v1/certs\", \"client_x509_cert_url\": \"https://www.googleapis.com/robot/v1/metadata/x509/test.iam.gserviceaccount.com\" }","email":"test@test.iam.gserviceaccount.com","auth":"X2pzb25fa2V5OnsKCgkidHlwZSI6ICJzZXJ2aWNlX2FjY291bnQiLAoJInByb2plY3RfaWQiOiAic29tZS10ZXN0IiwKCSJwcml2YXRlX2tleV9pZCI6ICJibGFoYmxhaCIsCgkicHJpdmF0ZV9rZXkiOiAiLS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCm15a2V5LS0tLS1FTkQgUFJJVkFURSBLRVktLS0tLQoiLAoJImNsaWVudF9lbWFpbCI6ICJ0ZXN0QHRlc3QuaWFtLmdzZXJ2aWNlYWNjb3VudC5jb20iLAoJImNsaWVudF9pZCI6ICIxMjMiLAoJImF1dGhfdXJpIjogImh0dHBzOi8vYWNjb3VudHMuZ29vZ2xlLmNvbS9vL29hdXRoMi9hdXRoIiwKCSJ0b2tlbl91cmkiOiAiaHR0cHM6Ly9vYXV0aDIuZ29vZ2xlYXBpcy5jb20vdG9rZW4iLAoJImF1dGhfcHJvdmlkZXJfeDUwOV9jZXJ0X3VybCI6ICJodHRwczovL3d3dy5nb29nbGVhcGlzLmNvbS9vYXV0aDIvdjEvY2VydHMiLAoJImNsaWVudF94NTA5X2NlcnRfdXJsIjogImh0dHBzOi8vd3d3Lmdvb2dsZWFwaXMuY29tL3JvYm90L3YxL21ldGFkYXRhL3g1MDkvdGVzdC5pYW0uZ3NlcnZpY2VhY2NvdW50LmNvbSIKfQo="}}}` diff --git a/cvm/cvm.go b/cvm/cvm.go new file mode 100644 index 0000000..fbf7f08 --- /dev/null +++ b/cvm/cvm.go @@ -0,0 +1,559 @@ +package cvm + +import ( + "context" + "fmt" + "io" + "os" + "path" + "path/filepath" + "sort" + "strconv" + "strings" + + "cdr.dev/slog" + "github.com/coder/envbox/buildlog" + "github.com/coder/envbox/dockerutil" + "github.com/coder/envbox/xunix" + "github.com/docker/docker/api/types/container" + "github.com/google/go-containerregistry/pkg/name" + "golang.org/x/xerrors" +) + +const ( + EnvAgentToken = "CODER_AGENT_TOKEN" + + OuterFUSEPath = "/tmp/coder-fuse" + InnerFUSEPath = "/dev/fuse" + + OuterTUNPath = "/tmp/coder-tun" + InnerTUNPath = "/dev/net/tun" + + // Required for userns mapping. + // This is the ID of the user we apply in `envbox/Dockerfile`. + // + // There should be caution changing this value. + // Source directory permissions on the host are offset by this + // value. For example, folder `/home/coder` inside the container + // with UID/GID 1000 will be mapped to `UserNamespaceOffset` + 1000 + // on the host. Changing this value will result in improper mappings + // on existing containers. + UserNamespaceOffset = 100000 + InnerContainerName = "workspace_cvm" +) + +type GPUConfig struct { + HostUsrLibDir string +} + +type Config struct { + // Required fields. + Tag name.Tag + Username string + AgentToken string + OSEnvs []string + + // Optional fields. + BuildLog buildlog.Logger + InnerEnvs []string + WorkDir string + Hostname string + ImagePullSecret string + CoderURL string + AddTUN bool + AddFUSE bool + BoostrapScript string + Mounts []xunix.Mount + DockerConfig string + CPUS int + Memory int + + // Test flags. 
+ Debug bool + noStartupLogs bool + ethlink string + + GPUConfig +} + +func Run(ctx context.Context, log slog.Logger, xos xunix.OS, client dockerutil.Client, cfg Config) error { + + log = log.With( + slog.F("image", cfg.Tag.String()), + slog.F("username", cfg.Username), + slog.F("workdir", cfg.WorkDir), + slog.F("hostname", cfg.Hostname), + slog.F("coder_url", cfg.CoderURL), + slog.F("add_tun", cfg.AddTUN), + slog.F("add_fuse", cfg.AddFUSE), + slog.F("host_usr_lib_dir", cfg.HostUsrLibDir), + slog.F("docker_config", cfg.DockerConfig), + slog.F("cpus", cfg.CPUS), + slog.F("memory", cfg.Memory), + slog.F("mounts", cfg.Mounts), + ) + + blog := cfg.BuildLog + if blog == nil { + blog = buildlog.NopLogger{} + } + + var dockerAuth dockerutil.AuthConfig + if cfg.ImagePullSecret != "" { + var err error + dockerAuth, err = dockerutil.AuthConfigFromString(cfg.ImagePullSecret, cfg.Tag.RegistryStr()) + if err != nil { + return xerrors.Errorf("parse auth config: %w", err) + } + } + + if _, err := xos.Stat(cfg.DockerConfig); err == nil { + log.Info(ctx, "detected docker config file") + dockerAuth, err = dockerutil.AuthConfigFromPath(cfg.DockerConfig, cfg.Tag.RegistryStr()) + if err != nil && !xerrors.Is(err, os.ErrNotExist) { + return xerrors.Errorf("auth config from file: %w", err) + } + } + + envs := defaultContainerEnvs(cfg.OSEnvs, cfg.AgentToken) + envs = append(envs, filterEnvs(cfg.OSEnvs, cfg.InnerEnvs...)...) + + mounts := append(defaultMounts(), cfg.Mounts...) + + devices, err := ensureDevices(ctx, xos, log, blog, cfg.AddTUN, cfg.AddFUSE) + if err != nil { + return xerrors.Errorf("create devices: %w", err) + } + + log.Debug(ctx, "pulling image") + + err = dockerutil.PullImage(ctx, &dockerutil.PullImageConfig{ + Client: client, + Image: cfg.Tag.String(), + Auth: dockerAuth, + ProgressFn: dockerutil.DefaultLogImagePullFn(blog), + }) + if err != nil { + return xerrors.Errorf("pull image: %w", err) + } + + // After image pull we remount /sys so sysbox can have appropriate perms to create a container. + err = xos.Mount("/sys", "/sys", "", []string{"remount", "rw"}) + if err != nil { + return xerrors.Errorf("remount /sys: %w", err) + } + + if cfg.GPUConfig.HostUsrLibDir != "" { + // Unmount GPU drivers in /proc as it causes issues when creating any + // container in some cases (even the image metadata container). + _, err = xunix.TryUnmountProcGPUDrivers(ctx, xos, log) + if err != nil { + return xerrors.Errorf("unmount /proc GPU drivers: %w", err) + } + } + + log.Debug(ctx, "fetching image metadata") + + blog.Info("Getting image metadata...") + + // Get metadata about the image. We need to know things like the UID/GID + // of the user so that we can chown directories to the namespaced UID inside + // the inner container as well as whether we should be starting the container + // with /sbin/init or something simple like 'sleep infinity'. 
+	imgMeta, err := dockerutil.GetImageMetadata(ctx, client, cfg.Tag.String(), cfg.Username)
+	if err != nil {
+		return xerrors.Errorf("get image metadata: %w", err)
+	}
+
+	log.Debug(ctx, "fetched image metadata",
+		slog.F("uid", imgMeta.UID),
+		slog.F("gid", imgMeta.GID),
+		slog.F("has_init", imgMeta.HasInit),
+	)
+
+	blog.Infof("Detected entrypoint user '%d:%d' with home directory %q", imgMeta.UID, imgMeta.GID, imgMeta.HomeDir)
+
+	err = idShiftMounts(ctx, log, xos, mounts, imgMeta.UID, imgMeta.GID)
+	if err != nil {
+		return xerrors.Errorf("id shift mounts: %w", err)
+	}
+
+	if cfg.GPUConfig.HostUsrLibDir != "" {
+		devs, binds, gpuEnvs, err := gpuMappings(ctx, log, xos, cfg.OSEnvs, cfg.GPUConfig.HostUsrLibDir)
+		if err != nil {
+			return xerrors.Errorf("gpu mappings: %w", err)
+		}
+
+		mounts = append(mounts, binds...)
+		envs = append(envs, gpuEnvs...)
+		devices = append(devices, devs...)
+	}
+
+	blog.Info("Creating workspace...")
+
+	// Create the inner container.
+	containerID, err := dockerutil.CreateContainer(ctx, client, &dockerutil.ContainerConfig{
+		Log:         log,
+		Mounts:      mounts,
+		Devices:     devices,
+		Envs:        envs,
+		Name:        InnerContainerName,
+		Hostname:    cfg.Hostname,
+		WorkingDir:  cfg.WorkDir,
+		HasInit:     imgMeta.HasInit,
+		Image:       cfg.Tag.String(),
+		CPUs:        int64(cfg.CPUS),
+		MemoryLimit: int64(cfg.Memory),
+	})
+	if err != nil {
+		return xerrors.Errorf("create container: %w", err)
+	}
+
+	blog.Info("Pruning images to free up disk...")
+	// Prune images to avoid taking up any unnecessary disk from the user.
+	_, err = dockerutil.PruneImages(ctx, client)
+	if err != nil {
+		return xerrors.Errorf("prune images: %w", err)
+	}
+
+	// TODO fix iptables when istio detected.
+
+	blog.Info("Starting up workspace...")
+	err = client.ContainerStart(ctx, containerID, container.StartOptions{})
+	if err != nil {
+		return xerrors.Errorf("start container: %w", err)
+	}
+
+	log.Debug(ctx, "creating bootstrap directory", slog.F("directory", imgMeta.HomeDir))
+
+	// Create the directory to which we will download the agent.
+	// We create this directory because the default behavior is to
+	// download the agent to /tmp/coder.XXXX. This causes a race where
+	// we finish downloading the binary, but before we can execute it,
+	// systemd remounts /tmp.
+	bootDir := filepath.Join(imgMeta.HomeDir, ".coder")
+
+	blog.Infof("Creating %q directory to host Coder assets...", bootDir)
+	_, err = dockerutil.ExecContainer(ctx, client, dockerutil.ExecConfig{
+		ContainerID: containerID,
+		User:        strconv.Itoa(imgMeta.UID),
+		Cmd:         "mkdir",
+		Args:        []string{"-p", bootDir},
+	})
+	if err != nil {
+		return xerrors.Errorf("make bootstrap dir: %w", err)
+	}
+
+	cpuQuota, err := xunix.ReadCPUQuota(ctx, log)
+	if err != nil {
+		blog.Infof("Unable to read CPU quota: %s", err.Error())
+	} else {
+		log.Debug(ctx, "setting CPU quota",
+			slog.F("quota", cpuQuota.Quota),
+			slog.F("period", cpuQuota.Period),
+			slog.F("cgroup", cpuQuota.CGroup.String()),
+		)
+
+		// We want the inner container to have the same limits as the outer container
+		// so that processes inside the container know what they're working with.
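+		// As a rough illustration (assuming standard CFS accounting): a
+		// quota of 400000 over a period of 100000 corresponds to 4 CPUs.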
+		if err := dockerutil.SetContainerQuota(ctx, containerID, cpuQuota); err != nil {
+			blog.Infof("Unable to set quota for inner container: %s", err.Error())
+			blog.Info("This is not a fatal error, but it may cause cgroup-aware applications to misbehave.")
+		}
+	}
+
+	blog.Info("Envbox startup complete!")
+
+	// The bootstrap script doesn't return since it execs the agent,
+	// meaning its output can get pretty noisy if we were to log it by
+	// default. To allow users to discern issues getting the bootstrap
+	// script to complete successfully, we pipe the output to stdout if
+	// CODER_DEBUG=true.
+	debugWriter := io.Discard
+	if cfg.Debug {
+		debugWriter = os.Stdout
+	}
+	// Bootstrap the container if a script has been provided.
+	blog.Info("Bootstrapping workspace...")
+	err = dockerutil.BootstrapContainer(ctx, client, dockerutil.BootstrapConfig{
+		ContainerID: containerID,
+		User:        strconv.Itoa(imgMeta.UID),
+		Script:      cfg.BoostrapScript,
+		// We set this because the default behavior is to download the agent
+		// to /tmp/coder.XXXX. This causes a race where we finish downloading
+		// the binary, but before we can execute it, systemd remounts /tmp.
+		Env:       []string{fmt.Sprintf("BINARY_DIR=%s", bootDir)},
+		StdOutErr: debugWriter,
+	})
+	if err != nil {
+		return xerrors.Errorf("bootstrap container: %w", err)
+	}
+
+	return nil
+}
+
+func gpuMappings(ctx context.Context, log slog.Logger, xos xunix.OS, environ []string, usrLibDir string) ([]container.DeviceMapping, []xunix.Mount, []string, error) {
+	devs, binds, err := xunix.GPUs(ctx, log, xos, usrLibDir)
+	if err != nil {
+		return nil, nil, nil, xerrors.Errorf("find gpus: %w", err)
+	}
+
+	devices := make([]container.DeviceMapping, 0, len(devs))
+	for _, dev := range devs {
+		devices = append(devices, container.DeviceMapping{
+			PathOnHost:        dev.Path,
+			PathInContainer:   dev.Path,
+			CgroupPermissions: "rwm",
+		})
+	}
+
+	for i, bind := range binds {
+		// If the bind has a path that points to the host-mounted /usr/lib
+		// directory, we need to remap it to /usr/lib inside the container.
+		bind.Mountpoint = bind.Source
+		if strings.HasPrefix(bind.Mountpoint, usrLibDir) {
+			bind.Mountpoint = filepath.Join(
+				"/usr/lib",
+				strings.TrimPrefix(bind.Mountpoint, strings.TrimSuffix(usrLibDir, "/")),
+			)
+		}
+		binds[i] = bind
+	}
+
+	envs := xunix.GPUEnvs(ctx, environ)
+
+	return devices, binds, envs, nil
+}
+
+// filterEnvs filters a list of environment variables, returning the subset
+// that matches the provided patterns. Patterns can be exact matches or use *
+// as a wildcard. E.g. 'MY_ENV' matches only MY_ENV=value, but 'MY_*' matches
+// both MY_ENV=value and MY_OTHER_ENV=value.
+func filterEnvs(environ []string, matches ...string) []string {
+	filtered := make([]string, 0, len(environ))
+	for _, f := range matches {
+		f = strings.TrimSpace(f)
+		for _, s := range environ {
+			toks := strings.Split(s, "=")
+			if len(toks) < 2 {
+				// Malformed environment variable.
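+				// (No '=' separator, e.g. a bare "FOO" entry.)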
+				continue
+			}
+
+			key := toks[0]
+
+			if strings.HasSuffix(f, "*") {
+				filter := strings.TrimSuffix(f, "*")
+				if strings.HasPrefix(key, filter) {
+					filtered = append(filtered, s)
+				}
+			} else if key == f {
+				filtered = append(filtered, s)
+			}
+		}
+	}
+
+	return filtered
+}
+
+func defaultContainerEnvs(osEnvs []string, agentToken string) []string {
+	const agentSubsystemEnv = "CODER_AGENT_SUBSYSTEM"
+	existingSubsystem := ""
+	for _, e := range osEnvs {
+		if strings.HasPrefix(e, agentSubsystemEnv+"=") {
+			existingSubsystem = strings.TrimPrefix(e, agentSubsystemEnv+"=")
+			break
+		}
+	}
+
+	// We should append to the existing agent subsystem if it exists.
+	agentSubsystem := "envbox"
+	if existingSubsystem != "" {
+		split := strings.Split(existingSubsystem, ",")
+		split = append(split, "envbox")
+
+		tidy := make([]string, 0, len(split))
+		seen := make(map[string]struct{})
+		for _, s := range split {
+			s := strings.TrimSpace(s)
+			if _, ok := seen[s]; s == "" || ok {
+				continue
+			}
+			seen[s] = struct{}{}
+			tidy = append(tidy, s)
+		}
+
+		sort.Strings(tidy)
+		agentSubsystem = strings.Join(tidy, ",")
+	}
+
+	return []string{
+		fmt.Sprintf("%s=%s", EnvAgentToken, agentToken),
+		fmt.Sprintf("%s=%s", agentSubsystemEnv, agentSubsystem),
+	}
+}
+
+// defaultMounts are bind mounts that are always provided to the inner
+// container.
+func defaultMounts() []xunix.Mount {
+	return []xunix.Mount{
+		{
+			Source:     "/var/lib/coder/docker",
+			Mountpoint: "/var/lib/docker",
+		},
+		{
+			Source:     "/var/lib/coder/containers",
+			Mountpoint: "/var/lib/containers",
+		},
+	}
+}
+
+// ensureDevices ensures that the devices are created and returns the resulting mappings. It also
+// shifts the id of the owner of the devices to the user namespace offset.
+func ensureDevices(ctx context.Context, fs xunix.FS, log slog.Logger, blog buildlog.Logger, tun, fuse bool) ([]container.DeviceMapping, error) {
+	devices := make([]container.DeviceMapping, 0, 2)
+	if tun {
+		log.Debug(ctx, "creating TUN device", slog.F("path", OuterTUNPath))
+		blog.Info("Creating TUN device")
+		dev, err := xunix.CreateTUNDevice(fs, OuterTUNPath)
+		if err != nil {
+			return nil, xerrors.Errorf("create tun device: %w", err)
+		}
+
+		devices = append(devices, container.DeviceMapping{
+			PathOnHost:        dev.Path,
+			PathInContainer:   InnerTUNPath,
+			CgroupPermissions: "rwm",
+		})
+	}
+
+	if fuse {
+		log.Debug(ctx, "creating FUSE device", slog.F("path", OuterFUSEPath))
+		blog.Info("Creating FUSE device")
+		dev, err := xunix.CreateFuseDevice(fs, OuterFUSEPath)
+		if err != nil {
+			return nil, xerrors.Errorf("create fuse device: %w", err)
+		}
+
+		devices = append(devices, container.DeviceMapping{
+			PathOnHost:        dev.Path,
+			PathInContainer:   InnerFUSEPath,
+			CgroupPermissions: "rwm",
+		})
+	}
+
+	log.Debug(ctx, "using devices", slog.F("devices", devices))
+
+	for _, device := range devices {
+		log.Debug(ctx, "chowning device",
+			slog.F("device", device.PathOnHost),
+			slog.F("uid", UserNamespaceOffset),
+			slog.F("gid", UserNamespaceOffset),
+		)
+		err := fs.Chown(device.PathOnHost, UserNamespaceOffset, UserNamespaceOffset)
+		if err != nil {
+			return nil, xerrors.Errorf("chown device %q: %w", device.PathOnHost, err)
+		}
+	}
+
+	return devices, nil
+}
+
+func idShiftMounts(ctx context.Context, log slog.Logger, xos xunix.OS, mounts []xunix.Mount, uid, gid int) error {
+	for _, m := range mounts {
+		// Don't modify anything private to envbox.
+		if isPrivateMount(m) {
+			continue
+		}
+
+		log.Debug(ctx, "chmod'ing directory",
+			slog.F("path", m.Source),
+			slog.F("mode", "02755"),
+		)
+
+		// If a mount is read-only, we have to remount it read-write so that
+		// we can ID-shift it correctly. We'll still mount it read-only into
+		// the inner container.
+		if m.ReadOnly {
+			err := xos.Mount("", m.Source, "", []string{"remount", "rw"})
+			if err != nil {
+				return xerrors.Errorf("remount: %w", err)
+			}
+		}
+
+		err := xos.Chmod(m.Source, 0o2755)
+		if err != nil {
+			return xerrors.Errorf("chmod mountpoint %q: %w", m.Source, err)
+		}
+
+		var (
+			shiftedUID = shiftedID(0)
+			shiftedGID = shiftedID(0)
+		)
+
+		if isHomeDir(m.Source) {
+			// We want to ensure that the inner directory is ID shifted to
+			// the namespaced UID of the user in the inner container;
+			// otherwise they won't be able to write files.
+			shiftedUID = shiftedID(uid)
+			shiftedGID = shiftedID(gid)
+		}
+
+		log.Debug(ctx, "chowning mount",
+			slog.F("source", m.Source),
+			slog.F("target", m.Mountpoint),
+			slog.F("uid", shiftedUID),
+			slog.F("gid", shiftedGID),
+		)
+
+		// We assume any non-home directory should be owned by the ID-shifted
+		// root user.
+		err = xos.Chown(m.Source, shiftedUID, shiftedGID)
+		if err != nil {
+			return xerrors.Errorf("chown mountpoint %q: %w", m.Source, err)
+		}
+	}
+
+	return nil
+}
+
+func isHomeDir(fpath string) bool {
+	if fpath == "/root" {
+		return true
+	}
+
+	dir, _ := path.Split(fpath)
+	return dir == "/home/"
+}
+
+// isPrivateMount returns true if the provided mount points to a mount
+// private to the envbox container itself.
+func isPrivateMount(m xunix.Mount) bool {
+	_, ok := envboxPrivateMounts[m.Mountpoint]
+	return ok
+}
+
+var envboxPrivateMounts = map[string]struct{}{
+	"/var/lib/containers": {},
+	"/var/lib/docker":     {},
+	"/var/lib/sysbox":     {},
+	"/lib/modules":        {},
+	"/usr/src":            {},
+	// /var/lib/coder is not technically a mount
+	// private to envbox but it is specially handled
+	// by sysbox so it does not require any effort
+	// on our part.
+	"/var/lib/coder": {},
+}
+
+// shiftedID returns the ID but shifted to the user namespace offset we
+// use for the inner container.
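+// For example, with UserNamespaceOffset = 100000, shiftedID(1000) returns
+// 101000: a file owned by UID 1000 inside the inner container must be owned
+// by UID 101000 on the host.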
+func shiftedID(id int) int {
+	return id + UserNamespaceOffset
+}
diff --git a/cvm/cvm_test.go b/cvm/cvm_test.go
new file mode 100644
index 0000000..6ab25d6
--- /dev/null
+++ b/cvm/cvm_test.go
@@ -0,0 +1,653 @@
+package cvm_test
+
+import (
+	"bufio"
+	"bytes"
+	"context"
+	"encoding/base64"
+	"fmt"
+	"io"
+	"net"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"cdr.dev/slog/sloggers/slogtest"
+	"github.com/coder/envbox/cvm"
+	"github.com/coder/envbox/dockerutil"
+	"github.com/coder/envbox/dockerutil/dockerfake"
+	"github.com/coder/envbox/xunix"
+	"github.com/coder/envbox/xunix/xunixfake"
+	dockertypes "github.com/docker/docker/api/types"
+	"github.com/docker/docker/api/types/container"
+	"github.com/docker/docker/api/types/image"
+	"github.com/docker/docker/api/types/network"
+	"github.com/google/go-containerregistry/pkg/name"
+	v1 "github.com/opencontainers/image-spec/specs-go/v1"
+	"github.com/spf13/afero"
+	"github.com/stretchr/testify/require"
+	mount "k8s.io/mount-utils"
+)
+
+func TestRun(t *testing.T) {
+	t.Parallel()
+
+	t.Run("OK", func(t *testing.T) {
+	})
+
+	t.Run("Images", func(t *testing.T) {
+		t.Parallel()
+		type testcase struct {
+			name    string
+			image   string
+			success bool
+		}
+
+		testcases := []testcase{
+			{
+				name:    "Repository",
+				image:   "ubuntu",
+				success: true,
+			},
+			{
+				name:    "RepositoryPath",
+				image:   "ubuntu/ubuntu",
+				success: true,
+			},
+			{
+				name:    "RepositoryLatest",
+				image:   "ubuntu:latest",
+				success: true,
+			},
+			{
+				name:    "RepositoryTag",
+				image:   "ubuntu:24.04",
+				success: true,
+			},
+			{
+				name:    "RepositoryPathTag",
+				image:   "ubuntu/ubuntu:18.04",
+				success: true,
+			},
+			{
+				name:    "RegistryRepository",
+				image:   "gcr.io/ubuntu",
+				success: true,
+			},
+			{
+				name:    "RegistryRepositoryTag",
+				image:   "gcr.io/ubuntu:24.04",
+				success: true,
+			},
+		}
+
+		for _, tc := range testcases {
+			tc := tc
+			t.Run(tc.name, func(t *testing.T) {
+				t.Parallel()
+
+				tag, err := name.NewTag(tc.image)
+				require.NoError(t, err)
+
+				ctx := context.Background()
+				client := NewFakeDockerClient()
+				log := slogtest.Make(t, nil)
+				xos := xunixfake.NewFakeOS()
+
+				var created bool
+				client.ContainerCreateFn = func(_ context.Context, conf *container.Config, _ *container.HostConfig, _ *network.NetworkingConfig, _ *v1.Platform, _ string) (container.CreateResponse, error) {
+					created = true
+					require.Equal(t, tc.image, conf.Image)
+					return container.CreateResponse{}, nil
+				}
+
+				err = cvm.Run(ctx, log, xos, client, cvm.Config{
+					Tag: tag,
+				})
+
+				if !tc.success {
+					require.Error(t, err)
+					return
+				}
+				require.NoError(t, err)
+				require.True(t, created, "container create fn not called")
+			})
+		}
+	})
+
+	// Test that user-provided env vars are passed through.
+	// It is valid to specify a wildcard so that all matching
+	// env vars are passed through.
+	t.Run("PassesThroughEnvVars", func(t *testing.T) {
+		t.Parallel()
+		var (
+			cntEnvs = []string{
+				"FOO",
+				"CODER_VAR",
+				"bar",
+				// Test that wildcard works.
+				"KUBERNETES_*",
+				"US_*",
+			}
+
+			expectedEnvs = []string{
+				"CODER_AGENT_TOKEN=hi",
+				"CODER_AGENT_SUBSYSTEM=envbox,exectrace", // sorted
+				"FOO=bar",
+				"CODER_VAR=baz",
+				"bar=123",
+				"KUBERNETES_SERVICE_HOST=10.0.0.1",
+				"KUBERNETES_PORT=tcp://10.0.0.1:443",
+				"KUBERNETES_PORT_443_TCP_PORT=443",
+			}
+
+			osEnvs = append([]string{
+				"USER=root",
+				"USA=yay",
+				"HOME=/root",
+				"PATH=/usr/bin:/sbin:/bin",
+				// Envbox should add to this.
+				"CODER_AGENT_SUBSYSTEM=exectrace",
+				// Don't include the wildcards.
+			}, expectedEnvs...)
+		)
+
+		ctx := context.Background()
+		log := slogtest.Make(t, nil)
+		xos := xunixfake.NewFakeOS()
+		client := NewFakeDockerClient()
+
+		var called bool
+		client.ContainerCreateFn = func(_ context.Context, config *container.Config, _ *container.HostConfig, _ *network.NetworkingConfig, _ *v1.Platform, containerName string) (container.CreateResponse, error) {
+			if containerName == cvm.InnerContainerName {
+				called = true
+				require.ElementsMatch(t, expectedEnvs, config.Env)
+			}
+			return container.CreateResponse{}, nil
+		}
+		err := cvm.Run(ctx, log, xos, client, cvm.Config{
+			AgentToken: "hi",
+			Tag:        requireParseImage(t, "ubuntu"),
+			Username:   "root",
+			OSEnvs:     osEnvs,
+			InnerEnvs:  cntEnvs,
+		})
+		require.NoError(t, err)
+		require.True(t, called, "create function was not called")
+	})
+
+	// Test that we parse mounts correctly.
+	t.Run("Mounts", func(t *testing.T) {
+		t.Parallel()
+
+		var (
+			userMounts = []xunix.Mount{
+				{
+					Source:     "/home/coder",
+					Mountpoint: "/home/coder",
+				},
+				{
+					Source:     "/etc/hosts",
+					Mountpoint: "/etc/hosts",
+					ReadOnly:   true,
+				},
+				{
+					Source:     "/etc/hostname",
+					Mountpoint: "/idc/where",
+					ReadOnly:   true,
+				},
+				{
+					Source:     "/usr/src",
+					Mountpoint: "/a/b/c",
+				},
+			}
+
+			expectedMounts = append([]string{"/var/lib/coder/docker:/var/lib/docker", "/var/lib/coder/containers:/var/lib/containers"}, mountsToString(userMounts)...)
+		)
+
+		var (
+			ctx    = context.Background()
+			log    = slogtest.Make(t, nil)
+			xos    = xunixfake.NewFakeOS()
+			client = NewFakeDockerClient()
+		)
+
+		for _, m := range userMounts {
+			err := afero.WriteFile(xos, m.Source, []byte("hi"), 0o777)
+			require.NoError(t, err)
+		}
+
+		// Fake the exec used to inspect the image so that it reports a
+		// user with UID/GID 1001, i.e. an ID greater than 0.
+		client.ContainerExecAttachFn = func(_ context.Context, _ string, _ dockertypes.ExecStartCheck) (dockertypes.HijackedResponse, error) {
+			return dockertypes.HijackedResponse{
+				Reader: bufio.NewReader(strings.NewReader("root:x:1001:1001:root:/root:/bin/bash")),
+				Conn:   &net.IPConn{},
+			}, nil
+		}
+
+		var called bool
+		client.ContainerCreateFn = func(_ context.Context, _ *container.Config, hostConfig *container.HostConfig, _ *network.NetworkingConfig, _ *v1.Platform, containerName string) (container.CreateResponse, error) {
+			if containerName == cvm.InnerContainerName {
+				called = true
+				require.Equal(t, expectedMounts, hostConfig.Binds)
+			}
+
+			return container.CreateResponse{}, nil
+		}
+
+		err := cvm.Run(ctx, log, xos, client, cvm.Config{
+			AgentToken: "hi",
+			Tag:        requireParseImage(t, "ubuntu"),
+			Username:   "root",
+			Mounts:     userMounts,
+		})
+		require.NoError(t, err)
+		require.True(t, called, "container create fn not called")
+
+		fi, err := xos.Stat("/home/coder")
+		require.NoError(t, err)
+		require.Equal(t, os.FileMode(0o755), fi.Mode().Perm())
+		// Check that we're calling chown and shifting the ID.
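+		// The fake exec above reports a passwd entry with UID/GID 1001, so
+		// the home directory should be owned by UserNamespaceOffset+1001 on
+		// the host.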
+		owner, ok := xos.GetFileOwner("/home/coder")
+		require.True(t, ok)
+		require.Equal(t, cvm.UserNamespaceOffset+1001, owner.UID)
+		require.Equal(t, cvm.UserNamespaceOffset+1001, owner.GID)
+	})
+
+	t.Run("RemountSysfs", func(t *testing.T) {
+		t.Parallel()
+
+		ctx := context.Background()
+		log := slogtest.Make(t, nil)
+		xos := xunixfake.NewFakeOS()
+		client := NewFakeDockerClient()
+
+		err := cvm.Run(ctx, log, xos, client, cvm.Config{
+			AgentToken: "hi",
+			Tag:        requireParseImage(t, "ubuntu"),
+			Username:   "root",
+		})
+		require.NoError(t, err)
+
+		actions := xos.GetLog()
+		require.Len(t, actions, 1)
+		action := actions[0]
+		require.Equal(t, "mount", action.Action)
+		require.Equal(t, "", action.FSType)
+		require.Equal(t, "/sys", action.Source)
+		require.Equal(t, "/sys", action.Target)
+	})
+
+	t.Run("Devices", func(t *testing.T) {
+		t.Parallel()
+
+		ctx := context.Background()
+		log := slogtest.Make(t, nil)
+		xos := xunixfake.NewFakeOS()
+		client := NewFakeDockerClient()
+
+		expectedDevices := []container.DeviceMapping{
+			{
+				PathOnHost:        cvm.OuterTUNPath,
+				PathInContainer:   cvm.InnerTUNPath,
+				CgroupPermissions: "rwm",
+			},
+			{
+				PathOnHost:        cvm.OuterFUSEPath,
+				PathInContainer:   cvm.InnerFUSEPath,
+				CgroupPermissions: "rwm",
+			},
+		}
+
+		var called bool
+		client.ContainerCreateFn = func(_ context.Context, _ *container.Config, hostConfig *container.HostConfig, _ *network.NetworkingConfig, _ *v1.Platform, containerName string) (container.CreateResponse, error) {
+			if containerName == cvm.InnerContainerName {
+				called = true
+				require.Equal(t, expectedDevices, hostConfig.Devices)
+			}
+			return container.CreateResponse{}, nil
+		}
+
+		err := cvm.Run(ctx, log, xos, client, cvm.Config{
+			AgentToken: "hi",
+			Tag:        requireParseImage(t, "ubuntu"),
+			Username:   "root",
+			AddTUN:     true,
+			AddFUSE:    true,
+		})
+		require.NoError(t, err)
+		require.True(t, called, "container create fn not called")
+
+		owner, ok := xos.GetFileOwner(cvm.OuterTUNPath)
+		require.True(t, ok)
+		require.Equal(t, cvm.UserNamespaceOffset, owner.UID)
+		require.Equal(t, cvm.UserNamespaceOffset, owner.GID)
+
+		owner, ok = xos.GetFileOwner(cvm.OuterFUSEPath)
+		require.True(t, ok)
+		require.Equal(t, cvm.UserNamespaceOffset, owner.UID)
+		require.Equal(t, cvm.UserNamespaceOffset, owner.GID)
+	})
+
+	// Tests that 'sleep infinity' is used if /sbin/init
+	// isn't detected.
+	t.Run("NoInit", func(t *testing.T) {
+		t.Parallel()
+
+		var (
+			ctx        = context.Background()
+			client     = NewFakeDockerClient()
+			xos        = xunixfake.NewFakeOS()
+			log        = slogtest.Make(t, nil)
+			statExecID = "hi"
+		)
+
+		client.ContainerExecCreateFn = func(_ context.Context, _ string, config dockertypes.ExecConfig) (dockertypes.IDResponse, error) {
+			if config.Cmd[0] == "stat" {
+				return dockertypes.IDResponse{
+					ID: statExecID,
+				}, nil
+			}
+			return dockertypes.IDResponse{}, nil
+		}
+
+		// Fail the 'stat' exec used to detect /sbin/init with a non-zero
+		// exit code so that the container falls back to 'sleep infinity'.
+		client.ContainerExecInspectFn = func(_ context.Context, execID string) (dockertypes.ContainerExecInspect, error) {
+			if execID == statExecID {
+				return dockertypes.ContainerExecInspect{ExitCode: 1}, nil
+			}
+
+			return dockertypes.ContainerExecInspect{}, nil
+		}
+
+		var called bool
+		client.ContainerCreateFn = func(_ context.Context, config *container.Config, _ *container.HostConfig, _ *network.NetworkingConfig, _ *v1.Platform, containerName string) (container.CreateResponse, error) {
+			if containerName == cvm.InnerContainerName {
+				called = true
+				require.Equal(t, []string{"sleep", "infinity"}, []string(config.Entrypoint))
+			}
+
+			return container.CreateResponse{}, nil
+		}
+
+		err := cvm.Run(ctx, log, xos, client, cvm.Config{
+			AgentToken: "hi",
+			Tag:        requireParseImage(t, "ubuntu"),
+			Username:   "root",
+		})
+		require.NoError(t, err)
+		require.True(t, called, "container create fn not called")
+	})
+
+	t.Run("DockerAuth", func(t *testing.T) {
+		t.Parallel()
+
+		var (
+			ctx    = context.Background()
+			log    = slogtest.Make(t, nil)
+			xos    = xunixfake.NewFakeOS()
+			client = NewFakeDockerClient()
+		)
+
+		raw := []byte(`{"username":"_json_key","password":"{\"type\": \"service_account\", \"project_id\": \"some-test\", \"private_key_id\": \"blahblah\", \"private_key\": \"-----BEGIN PRIVATE KEY-----mykey-----END PRIVATE KEY-----\", \"client_email\": \"test@test.iam.gserviceaccount.com\", \"client_id\": \"123\", \"auth_uri\": \"https://accounts.google.com/o/oauth2/auth\", \"token_uri\": \"https://oauth2.googleapis.com/token\", \"auth_provider_x509_cert_url\": \"https://www.googleapis.com/oauth2/v1/certs\", \"client_x509_cert_url\": \"https://www.googleapis.com/robot/v1/metadata/x509/test.iam.gserviceaccount.com\" }"}`)
+
+		authB64 := base64.URLEncoding.EncodeToString(raw)
+
+		var called bool
+		client.ImagePullFn = func(_ context.Context, _ string, options image.PullOptions) (io.ReadCloser, error) {
+			called = true
+			// Assert that we call the image pull function with the credentials.
+			require.Equal(t, authB64, options.RegistryAuth)
+			return io.NopCloser(bytes.NewReader(nil)), nil
+		}
+
+		err := cvm.Run(ctx, log, xos, client, cvm.Config{
+			Tag:             requireParseImage(t, "us.gcr.io/ubuntu"),
+			Username:        "root",
+			AgentToken:      "hi",
+			ImagePullSecret: rawDockerAuth,
+			// Really weird, but afero.Fs doesn't return an error
+			// when calling Stat() on an empty value.
+			DockerConfig: "/root/.config/idontexist",
+		})
+		require.NoError(t, err)
+		require.True(t, called, "image pull fn not called")
+	})
+
+	t.Run("SetsResources", func(t *testing.T) {
+		t.Parallel()
+
+		const (
+			// 4 GiB.
+ memory = 4 << 30 + cpus = 6 + ) + + var ( + ctx = context.Background() + log = slogtest.Make(t, nil) + xos = xunixfake.NewFakeOS() + client = NewFakeDockerClient() + ) + + var called bool + client.ContainerCreateFn = func(_ context.Context, _ *container.Config, hostConfig *container.HostConfig, _ *network.NetworkingConfig, _ *v1.Platform, containerName string) (container.CreateResponse, error) { + if containerName == cvm.InnerContainerName { + called = true + require.Equal(t, int64(memory), hostConfig.Memory) + require.Equal(t, int64(cpus*dockerutil.DefaultCPUPeriod), hostConfig.CPUQuota) + require.Equal(t, int64(dockerutil.DefaultCPUPeriod), hostConfig.CPUPeriod) + } + return container.CreateResponse{}, nil + } + + err := cvm.Run(ctx, log, xos, client, cvm.Config{ + AgentToken: "hi", + Tag: requireParseImage(t, "ubuntu"), + Username: "root", + Memory: memory, + CPUS: cpus, + }) + require.NoError(t, err) + require.True(t, called, "container create fn not called") + }) + + t.Run("GPU", func(t *testing.T) { + t.Parallel() + + var ( + ctx = context.Background() + log = slogtest.Make(t, nil) + xos = xunixfake.NewFakeOS() + client = NewFakeDockerClient() + ) + + var ( + procGPUDrivers = []string{ + "/proc/vulkan/foo", + "/proc/nvidia/bar", + "/proc/cuda/baz", + } + + usrLibMountpoint = "/var/coder/usr/lib/" + + expectedUsrLibFiles = []string{ + filepath.Join(usrLibMountpoint, "nvidia", "libglxserver_nvidia.so"), + filepath.Join(usrLibMountpoint, "libnvidia-ml.so"), + } + + expectedEnvs = []string{ + "NVIDIA_TEST=1", + "TEST_NVIDIA=1", + "nvidia_test=1", + } + ) + + environ := append([]string{ + "LIBGL_TEST=1", + "VULKAN_TEST=1", + }, expectedEnvs...) + + for _, file := range append(expectedUsrLibFiles, procGPUDrivers...) { + _, err := xos.Create(file) + require.NoError(t, err) + } + + xos.MountPoints = []mount.MountPoint{ + { + Device: "/dev/sda1", + Path: "/usr/local/nvidia", + Opts: []string{"rw"}, + }, + { + Device: "/dev/sda2", + Path: "/etc/hosts", + }, + { + Path: "/dev/nvidia0", + }, + { + Path: "/dev/nvidia1", + }, + } + + for _, driver := range procGPUDrivers { + xos.MountPoints = append(xos.MountPoints, mount.MountPoint{ + Path: driver, + }) + } + + _, err := xos.Create("/usr/local/nvidia") + require.NoError(t, err) + + unmounts := []string{} + xos.UnmountFunc = func(path string) error { + unmounts = append(unmounts, path) + return nil + } + + var called bool + client.ContainerCreateFn = func(_ context.Context, config *container.Config, hostConfig *container.HostConfig, _ *network.NetworkingConfig, _ *v1.Platform, containerName string) (container.CreateResponse, error) { + if containerName == cvm.InnerContainerName { + called = true + // Test that '/dev' mounts are passed as devices. + require.Contains(t, hostConfig.Devices, container.DeviceMapping{ + PathOnHost: "/dev/nvidia0", + PathInContainer: "/dev/nvidia0", + CgroupPermissions: "rwm", + }) + require.Contains(t, hostConfig.Devices, container.DeviceMapping{ + PathOnHost: "/dev/nvidia1", + PathInContainer: "/dev/nvidia1", + CgroupPermissions: "rwm", + }) + + // Test that the mountpoint that we provided that is not under + // '/dev' is passed as a bind mount. + require.Contains(t, hostConfig.Binds, fmt.Sprintf("%s:%s", "/usr/local/nvidia", "/usr/local/nvidia")) + + // Test that host /usr/lib bind mounts were passed through as read-only. 
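+				// e.g. /var/coder/usr/lib/libnvidia-ml.so should be bound to
+				// /usr/lib/libnvidia-ml.so inside the inner container.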
+				for _, file := range expectedUsrLibFiles {
+					require.Contains(t, hostConfig.Binds, fmt.Sprintf("%s:%s:ro",
+						file,
+						strings.ReplaceAll(file, usrLibMountpoint, "/usr/lib/"),
+					))
+				}
+
+				// Test that we captured the GPU-related env vars.
+				for _, env := range expectedEnvs {
+					require.Contains(t, config.Env, env)
+				}
+			}
+			return container.CreateResponse{}, nil
+		}
+
+		err = cvm.Run(ctx, log, xos, client, cvm.Config{
+			AgentToken: "hi",
+			Tag:        requireParseImage(t, "ubuntu"),
+			Username:   "root",
+			OSEnvs:     environ,
+			GPUConfig: cvm.GPUConfig{
+				HostUsrLibDir: usrLibMountpoint,
+			},
+		})
+		require.NoError(t, err)
+		require.True(t, called, "container create fn not called")
+		// Assert that we unmounted /proc GPU drivers.
+		for _, driver := range procGPUDrivers {
+			require.Contains(t, unmounts, driver)
+		}
+	})
+
+	t.Run("Hostname", func(t *testing.T) {
+		t.Parallel()
+
+		var (
+			ctx    = context.Background()
+			log    = slogtest.Make(t, nil)
+			xos    = xunixfake.NewFakeOS()
+			client = NewFakeDockerClient()
+		)
+
+		var called bool
+		client.ContainerCreateFn = func(_ context.Context, config *container.Config, _ *container.HostConfig, _ *network.NetworkingConfig, _ *v1.Platform, containerName string) (container.CreateResponse, error) {
+			if containerName == cvm.InnerContainerName {
+				called = true
+				require.Equal(t, "hello-world", config.Hostname)
+			}
+			return container.CreateResponse{}, nil
+		}
+
+		err := cvm.Run(ctx, log, xos, client, cvm.Config{
+			AgentToken: "hi",
+			Tag:        requireParseImage(t, "ubuntu"),
+			Hostname:   "hello-world",
+		})
+		require.NoError(t, err)
+		require.True(t, called, "container create fn not called")
+	})
+}
+
+func requireParseImage(t *testing.T, image string) name.Tag {
+	t.Helper()
+
+	tag, err := name.NewTag(image)
+	require.NoError(t, err)
+	return tag
+}
+
+func NewFakeDockerClient() *dockerfake.MockClient {
+	client := &dockerfake.MockClient{}
+
+	client.ContainerInspectFn = func(_ context.Context, _ string) (dockertypes.ContainerJSON, error) {
+		return dockertypes.ContainerJSON{
+			ContainerJSONBase: &dockertypes.ContainerJSONBase{
+				GraphDriver: dockertypes.GraphDriverData{
+					Data: map[string]string{"MergedDir": "blah"},
+				},
+			},
+		}, nil
+	}
+
+	client.ContainerExecAttachFn = func(_ context.Context, _ string, _ dockertypes.ExecStartCheck) (dockertypes.HijackedResponse, error) {
+		return dockertypes.HijackedResponse{
+			Reader: bufio.NewReader(strings.NewReader("root:x:0:0:root:/root:/bin/bash")),
+			Conn:   &net.IPConn{},
+		}, nil
+	}
+
+	return client
+}
+
+func mountsToString(mounts []xunix.Mount) []string {
+	binds := make([]string, 0, len(mounts))
+	for _, m := range mounts {
+		binds = append(binds, m.String())
+	}
+	return binds
+}
+
+// rawDockerAuth is sample input for a Kubernetes secret to a gcr.io private
+// registry.
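+// The "auths" map is keyed by registry host; each entry carries a
+// username/password pair plus an "auth" field containing the base64-encoded
+// "username:password" string.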
+const rawDockerAuth = `{"auths":{"us.gcr.io":{"username":"_json_key","password":"{\"type\": \"service_account\", \"project_id\": \"some-test\", \"private_key_id\": \"blahblah\", \"private_key\": \"-----BEGIN PRIVATE KEY-----mykey-----END PRIVATE KEY-----\", \"client_email\": \"test@test.iam.gserviceaccount.com\", \"client_id\": \"123\", \"auth_uri\": \"https://accounts.google.com/o/oauth2/auth\", \"token_uri\": \"https://oauth2.googleapis.com/token\", \"auth_provider_x509_cert_url\": \"https://www.googleapis.com/oauth2/v1/certs\", \"client_x509_cert_url\": \"https://www.googleapis.com/robot/v1/metadata/x509/test.iam.gserviceaccount.com\" }","email":"test@test.iam.gserviceaccount.com","auth":"X2pzb25fa2V5OnsKCgkidHlwZSI6ICJzZXJ2aWNlX2FjY291bnQiLAoJInByb2plY3RfaWQiOiAic29tZS10ZXN0IiwKCSJwcml2YXRlX2tleV9pZCI6ICJibGFoYmxhaCIsCgkicHJpdmF0ZV9rZXkiOiAiLS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCm15a2V5LS0tLS1FTkQgUFJJVkFURSBLRVktLS0tLQoiLAoJImNsaWVudF9lbWFpbCI6ICJ0ZXN0QHRlc3QuaWFtLmdzZXJ2aWNlYWNjb3VudC5jb20iLAoJImNsaWVudF9pZCI6ICIxMjMiLAoJImF1dGhfdXJpIjogImh0dHBzOi8vYWNjb3VudHMuZ29vZ2xlLmNvbS9vL29hdXRoMi9hdXRoIiwKCSJ0b2tlbl91cmkiOiAiaHR0cHM6Ly9vYXV0aDIuZ29vZ2xlYXBpcy5jb20vdG9rZW4iLAoJImF1dGhfcHJvdmlkZXJfeDUwOV9jZXJ0X3VybCI6ICJodHRwczovL3d3dy5nb29nbGVhcGlzLmNvbS9vYXV0aDIvdjEvY2VydHMiLAoJImNsaWVudF94NTA5X2NlcnRfdXJsIjogImh0dHBzOi8vd3d3Lmdvb2dsZWFwaXMuY29tL3JvYm90L3YxL21ldGFkYXRhL3g1MDkvdGVzdC5pYW0uZ3NlcnZpY2VhY2NvdW50LmNvbSIKfQo="}}}`
diff --git a/dockerutil/daemon.go b/dockerutil/daemon.go
index 45a602e..64ea755 100644
--- a/dockerutil/daemon.go
+++ b/dockerutil/daemon.go
@@ -2,11 +2,70 @@ package dockerutil

 import (
 	"context"
+	"fmt"
 	"time"

+	"cdr.dev/slog"
+	"github.com/coder/envbox/background"
+	"github.com/coder/envbox/xunix"
 	dockerclient "github.com/docker/docker/client"
+	"golang.org/x/xerrors"
 )

+const noSpaceDataDir = "/var/lib/docker.bak"
+
+type DaemonOptions struct {
+	Link   string
+	CIDR   string
+	Driver string
+}
+
+func StartDaemon(ctx context.Context, log slog.Logger, opts *DaemonOptions) (*background.Process, error) {
+	// We need to adjust the MTU for the host, otherwise packets will fail delivery.
+	// 1500 is the standard, but certain deployments (like GKE) use custom MTU values.
+	// See: https://www.atlantis-press.com/journals/ijndc/125936177/view#sec-s3.1
+	mtu, err := xunix.NetlinkMTU(opts.Link)
+	if err != nil {
+		return nil, xerrors.Errorf("custom mtu: %w", err)
+	}
+
+	// We set the Docker Bridge IP explicitly here for a number of reasons:
+	// 1) It sometimes picks the 172.17.x.x address, which conflicts with that of the Docker daemon in the inner container.
+	// 2) It defaults to a /16 network, which is way more than we need for envbox.
+	// 3) The default may conflict with existing internal network resources, and an operator may wish to override it.
+	dockerBip, prefixLen := BridgeIPFromCIDR(opts.CIDR)
+
+	// Default to the overlay2 storage driver, letting the caller override it,
+	// so that we only ever pass a single --storage-driver flag to dockerd.
+	driver := opts.Driver
+	if driver == "" {
+		driver = "overlay2"
+	}
+
+	args := []string{
+		"--debug",
+		"--log-level=debug",
+		fmt.Sprintf("--mtu=%d", mtu),
+		"--userns-remap=coder",
+		fmt.Sprintf("--storage-driver=%s", driver),
+		fmt.Sprintf("--bip=%s/%d", dockerBip, prefixLen),
+	}
+
+	if driver == "vfs" {
+		args = append(args,
+			fmt.Sprintf("--data-root=%s", noSpaceDataDir),
+		)
+	}
+
+	p := background.New(ctx, log, "dockerd", args...)
+	err = p.Start()
+	if err != nil {
+		return nil, xerrors.Errorf("start dockerd: %w", err)
+	}
+
+	return p, nil
+}
+
 // WaitForDaemon waits for a Docker daemon to startup. It waits a max
 // of 5m before giving up.
func WaitForDaemon(ctx context.Context, client Client) error { diff --git a/dockerutil/image.go b/dockerutil/image.go index ffb2bdb..469ef6e 100644 --- a/dockerutil/image.go +++ b/dockerutil/image.go @@ -6,6 +6,7 @@ import ( "encoding/json" "fmt" "io" + "strconv" "strings" "time" @@ -148,8 +149,8 @@ func processImagePullEvents(r io.Reader, fn ImagePullProgressFn) error { } type ImageMetadata struct { - UID string - GID string + UID int + GID int HomeDir string HasInit bool } @@ -226,9 +227,19 @@ func GetImageMetadata(ctx context.Context, client Client, img, username string) return ImageMetadata{}, xerrors.Errorf("no users returned for username %s", username) } + uid, err := strconv.ParseInt(users[0].Uid, 10, 32) + if err != nil { + return ImageMetadata{}, xerrors.Errorf("parse uid: %w", err) + } + + gid, err := strconv.ParseInt(users[0].Gid, 10, 32) + if err != nil { + return ImageMetadata{}, xerrors.Errorf("parse gid: %w", err) + } + return ImageMetadata{ - UID: users[0].Uid, - GID: users[0].Gid, + UID: int(uid), + GID: int(gid), HomeDir: users[0].HomeDir, HasInit: initExists, }, nil diff --git a/go.mod b/go.mod index 6e8dffc..0e7491d 100644 --- a/go.mod +++ b/go.mod @@ -25,7 +25,6 @@ require ( github.com/stretchr/testify v1.9.0 github.com/vishvananda/netlink v1.2.1-beta.2 golang.org/x/crypto v0.31.0 - golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 golang.org/x/mod v0.19.0 golang.org/x/sys v0.28.0 golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 @@ -201,6 +200,7 @@ require ( go.uber.org/atomic v1.11.0 // indirect go4.org/mem v0.0.0-20220726221520-4f986261bf13 // indirect go4.org/netipx v0.0.0-20230728180743-ad4cb58a6516 // indirect + golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 // indirect golang.org/x/net v0.33.0 // indirect golang.org/x/oauth2 v0.23.0 // indirect golang.org/x/sync v0.10.0 // indirect diff --git a/go.sum b/go.sum index d510e1c..d60cb21 100644 --- a/go.sum +++ b/go.sum @@ -618,8 +618,6 @@ golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96b golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= golang.org/x/net v0.3.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= -golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4= -golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU= golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs= diff --git a/integration/docker_test.go b/integration/docker_test.go index 9f88b09..1e28c1f 100644 --- a/integration/docker_test.go +++ b/integration/docker_test.go @@ -18,6 +18,7 @@ import ( "github.com/stretchr/testify/require" "github.com/coder/envbox/cli" + "github.com/coder/envbox/cvm" "github.com/coder/envbox/dockerutil" "github.com/coder/envbox/integration/integrationtest" ) @@ -178,14 +179,14 @@ func TestDocker(t *testing.T) { // Assert that the FUSE device exists. _, err = integrationtest.ExecInnerContainer(t, pool, integrationtest.ExecConfig{ ContainerID: resource.Container.ID, - Cmd: []string{"stat", cli.InnerFUSEPath}, + Cmd: []string{"stat", cvm.InnerFUSEPath}, }) require.NoError(t, err) // Assert that the TUN device exists. 
_, err = integrationtest.ExecInnerContainer(t, pool, integrationtest.ExecConfig{ ContainerID: resource.Container.ID, - Cmd: []string{"stat", cli.InnerTUNPath}, + Cmd: []string{"stat", cvm.InnerTUNPath}, }) require.NoError(t, err) diff --git a/xunix/device.go b/xunix/device.go index 0392af0..25d3faf 100644 --- a/xunix/device.go +++ b/xunix/device.go @@ -1,7 +1,6 @@ package xunix import ( - "context" "os" "path/filepath" @@ -31,7 +30,7 @@ type Device struct { GID int32 } -func CreateTUNDevice(ctx context.Context, path string) (Device, error) { +func CreateTUNDevice(xfs FS, path string) (Device, error) { const ( major uint = 10 // See https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/devices.txt#L336 @@ -39,7 +38,7 @@ func CreateTUNDevice(ctx context.Context, path string) (Device, error) { ) // TODO offset (from legacy.go) - err := createDevice(ctx, deviceConfig{ + err := createDevice(xfs, deviceConfig{ path: path, mode: charDevMode, dev: dev(major, minor), @@ -60,7 +59,7 @@ func CreateTUNDevice(ctx context.Context, path string) (Device, error) { }, nil } -func CreateFuseDevice(ctx context.Context, path string) (Device, error) { +func CreateFuseDevice(xfs FS, path string) (Device, error) { const ( major uint = 10 @@ -68,7 +67,7 @@ func CreateFuseDevice(ctx context.Context, path string) (Device, error) { minor uint = 229 ) - err := createDevice(ctx, deviceConfig{ + err := createDevice(xfs, deviceConfig{ path: path, mode: charDevMode, dev: dev(major, minor), @@ -98,24 +97,23 @@ type deviceConfig struct { ftype uint32 } -func createDevice(ctx context.Context, conf deviceConfig) error { +func createDevice(xfs FS, conf deviceConfig) error { var ( - fs = GetFS(ctx) dir = filepath.Dir(conf.path) ) - err := fs.MkdirAll(dir, 0o700) + err := xfs.MkdirAll(dir, 0o700) if err != nil { return xerrors.Errorf("ensure parent dir: %w", err) } //nolint:gosec - err = fs.Mknod(conf.path, conf.ftype|conf.mode, int(conf.dev)) + err = xfs.Mknod(conf.path, conf.ftype|conf.mode, int(conf.dev)) if err != nil { return xerrors.Errorf("mknod %s c %d %d: %w", conf.path, conf.major, conf.minor, err) } - err = fs.Chmod(conf.path, os.FileMode(conf.mode)) + err = xfs.Chmod(conf.path, os.FileMode(conf.mode)) if err != nil { return xerrors.Errorf("chown %v %q: %w", conf.mode, conf.path, err) } diff --git a/xunix/gpu.go b/xunix/gpu.go index a494ab5..d684ca8 100644 --- a/xunix/gpu.go +++ b/xunix/gpu.go @@ -6,6 +6,7 @@ import ( "os" "path/filepath" "regexp" + "slices" "sort" "strings" @@ -22,11 +23,9 @@ var ( gpuEnvRegex = regexp.MustCompile("(?i)nvidia") ) -func GPUEnvs(ctx context.Context) []string { - envs := Environ(ctx) - +func GPUEnvs(ctx context.Context, environ []string) []string { gpus := []string{} - for _, env := range envs { + for _, env := range environ { name := strings.Split(env, "=")[0] if gpuEnvRegex.MatchString(name) { gpus = append(gpus, env) @@ -36,14 +35,13 @@ func GPUEnvs(ctx context.Context) []string { return gpus } -func GPUs(ctx context.Context, log slog.Logger, usrLibDir string) ([]Device, []mount.MountPoint, error) { +func GPUs(ctx context.Context, log slog.Logger, xos OS, usrLibDir string) ([]Device, []Mount, error) { var ( - mounter = Mounter(ctx) devices = []Device{} binds = []mount.MountPoint{} ) - mounts, err := mounter.List() + mounts, err := xos.List() if err != nil { return nil, nil, xerrors.Errorf("list mounts: %w", err) } @@ -66,7 +64,7 @@ func GPUs(ctx context.Context, log slog.Logger, usrLibDir string) ([]Device, []m } } - extraGPUS, err := usrLibGPUs(ctx, log, usrLibDir) 
+ extraGPUS, err := usrLibGPUs(ctx, log, xos, usrLibDir) if err != nil { return nil, nil, xerrors.Errorf("find %q gpus: %w", usrLibDir, err) } @@ -84,16 +82,15 @@ func GPUs(ctx context.Context, log slog.Logger, usrLibDir string) ([]Device, []m } } - return devices, binds, nil + return devices, toMounts(binds), nil } -func usrLibGPUs(ctx context.Context, log slog.Logger, usrLibDir string) ([]mount.MountPoint, error) { +func usrLibGPUs(ctx context.Context, log slog.Logger, xfs FS, usrLibDir string) ([]mount.MountPoint, error) { var ( - afs = GetFS(ctx) binds = []string{} ) - err := afero.Walk(afs, usrLibDir, + err := afero.Walk(xfs, usrLibDir, func(path string, _ fs.FileInfo, err error) error { if path == usrLibDir && err != nil { return xerrors.Errorf("stat /usr/lib mountpoint %q: %w", usrLibDir, err) @@ -107,7 +104,7 @@ func usrLibGPUs(ctx context.Context, log slog.Logger, usrLibDir string) ([]mount return nil } - paths, err := recursiveSymlinks(afs, usrLibDir, path) + paths, err := recursiveSymlinks(xfs, usrLibDir, path) if err != nil { log.Error(ctx, "find recursive symlinks", slog.F("path", path), slog.Error(err)) } @@ -178,10 +175,8 @@ func recursiveSymlinks(afs FS, mountpoint string, path string) ([]string, error) // TryUnmountProcGPUDrivers unmounts any GPU-related mounts under /proc as it causes // issues when creating any container in some cases. Errors encountered while // unmounting are treated as non-fatal. -func TryUnmountProcGPUDrivers(ctx context.Context, log slog.Logger) ([]mount.MountPoint, error) { - mounter := Mounter(ctx) - - mounts, err := mounter.List() +func TryUnmountProcGPUDrivers(ctx context.Context, mnt mount.Interface, log slog.Logger) ([]mount.MountPoint, error) { + mounts, err := mnt.List() if err != nil { return nil, xerrors.Errorf("list mounts: %w", err) } @@ -195,7 +190,7 @@ func TryUnmountProcGPUDrivers(ctx context.Context, log slog.Logger) ([]mount.Mou drivers := []mount.MountPoint{} for _, m := range mounts { if strings.HasPrefix(m.Path, "/proc/") && gpuMountRegex.MatchString(m.Path) { - err := mounter.Unmount(m.Path) + err := mnt.Unmount(m.Path) if err != nil { log.Warn(ctx, "umount potentially problematic mount", @@ -210,3 +205,18 @@ func TryUnmountProcGPUDrivers(ctx context.Context, log slog.Logger) ([]mount.Mou return drivers, nil } + +func toMounts(mounts []mount.MountPoint) []Mount { + m := make([]Mount, 0, len(mounts)) + for _, mount := range mounts { + m = append(m, toMount(mount)) + } + return m +} + +func toMount(m mount.MountPoint) Mount { + return Mount{ + Source: m.Path, + ReadOnly: slices.Contains(m.Opts, "ro"), + } +} diff --git a/xunix/gpu_test.go b/xunix/gpu_test.go index 4cbf5f0..d87a4be 100644 --- a/xunix/gpu_test.go +++ b/xunix/gpu_test.go @@ -31,7 +31,14 @@ func TestGPUEnvs(t *testing.T) { } }) - envs := xunix.GPUEnvs(ctx) + envs := xunix.GPUEnvs(ctx, []string{ + "NVIDIA_TEST=1", + "VULKAN_TEST=1", + "LIBGL_TEST=1", + "TEST_NVIDIA=1", + "nvidia_test=1", + }, + ) require.Contains(t, envs, "NVIDIA_TEST=1") require.Contains(t, envs, "TEST_NVIDIA=1") require.Contains(t, envs, "nvidia_test=1") @@ -95,7 +102,11 @@ func TestGPUs(t *testing.T) { require.NoError(t, err) } - devices, binds, err := xunix.GPUs(ctx, log, usrLibMountpoint) + linuxOS := xunix.LinuxOS{ + FS: fs, + Interface: mounter, + } + devices, binds, err := xunix.GPUs(ctx, log, linuxOS, usrLibMountpoint) require.NoError(t, err) require.Len(t, devices, 2, "unexpected 2 nvidia devices") require.Len(t, binds, 3, "expected 4 nvidia binds") diff --git a/xunix/mount.go 
b/xunix/mount.go index 84ce288..3e62427 100644 --- a/xunix/mount.go +++ b/xunix/mount.go @@ -2,6 +2,7 @@ package xunix import ( "context" + "fmt" mount "k8s.io/mount-utils" ) @@ -28,6 +29,15 @@ type Mount struct { ReadOnly bool } +// String returns the bind mount string for the mount. +func (m Mount) String() string { + bind := fmt.Sprintf("%s:%s", m.Source, m.Mountpoint) + if m.ReadOnly { + bind += ":ro" + } + return bind +} + func MountFS(ctx context.Context, source, mountpoint, fstype string, options ...string) error { return Mounter(ctx). Mount(source, mountpoint, fstype, options) diff --git a/xunix/sys.go b/xunix/sys.go index eef07cd..3e806b5 100644 --- a/xunix/sys.go +++ b/xunix/sys.go @@ -9,10 +9,30 @@ import ( "github.com/spf13/afero" "golang.org/x/xerrors" + mount "k8s.io/mount-utils" "cdr.dev/slog" ) +type OS interface { + FS + mount.Interface +} + +var _ OS = &LinuxOS{} + +func NewLinuxOS() *LinuxOS { + return &LinuxOS{ + FS: &osFS{&afero.OsFs{}}, + Interface: mount.New("/bin/mount"), + } +} + +type LinuxOS struct { + FS + mount.Interface +} + type CPUQuota struct { Quota int Period int diff --git a/xunix/xunixfake/fs.go b/xunix/xunixfake/fs.go index 4a54fac..e8c3390 100644 --- a/xunix/xunixfake/fs.go +++ b/xunix/xunixfake/fs.go @@ -7,8 +7,21 @@ import ( "github.com/spf13/afero" "golang.org/x/xerrors" + mount "k8s.io/mount-utils" ) +type FakeOS struct { + *MemFS + *mount.FakeMounter +} + +func NewFakeOS() *FakeOS { + return &FakeOS{ + MemFS: NewMemFS(), + FakeMounter: &mount.FakeMounter{}, + } +} + type FileOwner struct { UID int GID int