diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index f3ffc86..c106dd0 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -8,7 +8,7 @@ on:
   pull_request:
   workflow_dispatch:
-
+
 permissions:
   actions: read
   checks: none
@@ -209,7 +209,7 @@ jobs:
           category: "Trivy"
 
       - name: Upload Trivy scan results as an artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: trivy
           path: trivy-results.sarif
diff --git a/Makefile b/Makefile
index 2c3655b..b230ab7 100644
--- a/Makefile
+++ b/Makefile
@@ -32,3 +32,11 @@ fmt/go:
 .PHONY: fmt/md
 fmt/md:
 	go run github.com/Kunde21/markdownfmt/v3/cmd/markdownfmt@v3.1.0 -w ./README.md
+
+.PHONY: test
+test:
+	go test -v -count=1 ./...
+
+.PHONY: test-integration
+test-integration:
+	CODER_TEST_INTEGRATION=1 go test -v -count=1 ./integration/
diff --git a/README.md b/README.md
index 7ed5bda..d09c4a5 100644
--- a/README.md
+++ b/README.md
@@ -19,8 +19,10 @@ The environment variables can be used to configure various aspects of the inner
 | `CODER_INNER_HOSTNAME` | The hostname to use for the inner container. | false |
 | `CODER_IMAGE_PULL_SECRET` | The docker credentials to use when pulling the inner container. The recommended way to do this is to create an [Image Pull Secret](https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/#create-a-secret-by-providing-credentials-on-the-command-line) and then reference the secret using an [environment variable](https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#define-container-environment-variables-using-secret-data). See below for example. | false |
 | `CODER_DOCKER_BRIDGE_CIDR` | The bridge CIDR to start the Docker daemon with. | false |
+| `CODER_BOOTSTRAP_SCRIPT` | The script to use to bootstrap the container. This should typically install and start the agent. | false |
 | `CODER_MOUNTS` | A list of mounts to mount into the inner container. Mounts default to `rw`. Ex: `CODER_MOUNTS=/home/coder:/home/coder,/var/run/mysecret:/var/run/mysecret:ro` | false |
 | `CODER_USR_LIB_DIR` | The mountpoint of the host `/usr/lib` directory. Only required when using GPUs. | false |
+| `CODER_INNER_USR_LIB_DIR` | The inner /usr/lib mountpoint. This is automatically detected based on `/etc/os-release` in the inner image, but may optionally be overridden. | false |
 | `CODER_ADD_TUN` | If `CODER_ADD_TUN=true` add a TUN device to the inner container. | false |
 | `CODER_ADD_FUSE` | If `CODER_ADD_FUSE=true` add a FUSE device to the inner container. | false |
 | `CODER_ADD_GPU` | If `CODER_ADD_GPU=true` add detected GPUs and related files to the inner container. Requires setting `CODER_USR_LIB_DIR` and mounting in the hosts `/usr/lib/` directory. | false |
@@ -43,7 +45,7 @@ It is not possible to develop `envbox` effectively using a containerized environ
 If a login is required to pull images from a private repository, create a secret following the instructions from the [Kubernetes Documentation](https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/#create-a-secret-by-providing-credentials-on-the-command-line) as such:
-```
+```shell
 kubectl -n create secret docker-registry regcred \
   --docker-server= \
   --docker-username= \
   --docker-password= \
   --docker-email=
@@ -53,7 +55,7 @@ kubectl -n create secret docker-registry regcred \
 Then reference the secret in your template as such:
-```
+```shell
 env {
   name = "CODER_IMAGE_PULL_SECRET"
   value_from {
@@ -86,3 +88,86 @@ env {
 > }
 > }
 > ```
+
+## GPUs
+
+When passing through GPUs to the inner container, you may end up using associated tooling such as the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/index.html) or the [NVIDIA GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/index.html). These will inject required utilities and libraries inside the inner container. You can verify this by directly running (without Envbox) a barebones image like `debian:bookworm` and running `mount` or `nvidia-smi` inside the container.
+
+Envbox will detect these mounts and pass them inside the inner container it creates, so that GPU-aware tools running inside the inner container can still utilize these libraries.
+
+Here's an example Docker command to run a GPU-enabled workload in Envbox. Note the following:
+
+1) The NVIDIA container runtime must be installed on the host (`--runtime=nvidia`).
+2) `CODER_ADD_GPU=true` must be set to enable GPU-specific functionality.
+3) When `CODER_ADD_GPU` is set, it is required to also set `CODER_USR_LIB_DIR` to a location where the relevant host directory has been mounted inside the outer container. In the example below, this is `/usr/lib/x86_64-linux-gnu` on the underlying host. It is mounted into the container under the path `/var/coder/usr/lib`. We then set `CODER_USR_LIB_DIR=/var/coder/usr/lib`. The actual location inside the container is not important **as long as it does not overwrite any pre-existing directories containing system libraries**.
+4) The location to mount the libraries in the inner container is determined by the distribution ID in the `/etc/os-release` of the inner container. If the automatically determined location is incorrect (e.g. `nvidia-smi` complains about not being able to find libraries), you can set it manually via `CODER_INNER_USR_LIB_DIR`.
+
+   > Note: this step is required in case user workloads require libraries from the underlying host that are not added in by the container runtime.
+
+```shell
+docker run -it --rm \
+  --runtime=nvidia \
+  --gpus=all \
+  --name=envbox-gpu-test \
+  -v /tmp/envbox/docker:/var/lib/coder/docker \
+  -v /tmp/envbox/containers:/var/lib/coder/containers \
+  -v /tmp/envbox/sysbox:/var/lib/sysbox \
+  -v /tmp/envbox/docker:/var/lib/docker \
+  -v /usr/src:/usr/src:ro \
+  -v /lib/modules:/lib/modules:ro \
+  -v /usr/lib/x86_64-linux-gnu:/var/coder/usr/lib \
+  --privileged \
+  -e CODER_INNER_IMAGE=nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2 \
+  -e CODER_INNER_USERNAME=root \
+  -e CODER_ADD_GPU=true \
+  -e CODER_USR_LIB_DIR=/var/coder/usr/lib \
+  envbox:latest /envbox docker
+```
+
+To validate GPU functionality, you can run the following commands:
+
+1) To validate that the container runtime correctly passed the required GPU tools into the outer container, run:
+
+   ```shell
+   docker exec -it envbox-gpu-test nvidia-smi
+   ```
+
+2) To validate the same inside the inner container, run:
+
+   ```shell
+   docker exec -it envbox-gpu-test docker exec -it workspace_cvm nvidia-smi
+   ```
+
+3) To validate that the sample CUDA application inside the container runs correctly:
+
+   ```shell
+   docker exec -it envbox-gpu-test docker exec -it workspace_cvm /tmp/vectorAdd
+   ```
+
+## Hacking
+
+Here's a simple one-liner to run the `codercom/enterprise-minimal:ubuntu` image in Envbox using Docker:
+
+```shell
+docker run -it --rm \
+  -v /tmp/envbox/docker:/var/lib/coder/docker \
+  -v /tmp/envbox/containers:/var/lib/coder/containers \
+  -v /tmp/envbox/sysbox:/var/lib/sysbox \
+  -v /tmp/envbox/docker:/var/lib/docker \
+  -v /usr/src:/usr/src:ro \
+  -v /lib/modules:/lib/modules:ro \
+  --privileged \
+  -e CODER_INNER_IMAGE=codercom/enterprise-minimal:ubuntu \
+  -e CODER_INNER_USERNAME=coder \
+  envbox:latest /envbox docker
+```
+
+This will store persistent data under `/tmp/envbox`.
+
+## Troubleshooting
+
+### `failed to write to cgroup.procs: write /sys/fs/cgroup/docker//init.scope/cgroup.procs: operation not supported: unknown`
+
+This issue occurs in Docker if you have `cgroupns-mode` set to `private`. To validate that this is the cause, add `--cgroupns=host` to your `docker run` invocation and re-run.
+
+To permanently set this as the default in your Docker daemon, add `"default-cgroupns-mode": "host"` to your `/etc/docker/daemon.json` and restart Docker.
diff --git a/cli/docker.go b/cli/docker.go
index f48ea64..9ac872c 100644
--- a/cli/docker.go
+++ b/cli/docker.go
@@ -98,6 +98,7 @@ var (
 	EnvMemory = "CODER_MEMORY"
 	EnvAddGPU = "CODER_ADD_GPU"
 	EnvUsrLibDir = "CODER_USR_LIB_DIR"
+	EnvInnerUsrLibDir = "CODER_INNER_USR_LIB_DIR"
 	EnvDockerConfig = "CODER_DOCKER_CONFIG"
 	EnvDebug = "CODER_DEBUG"
 	EnvDisableIDMappedMount = "CODER_DISABLE_IDMAPPED_MOUNT"
@@ -135,6 +136,7 @@ type flags struct {
 	boostrapScript string
 	containerMounts string
 	hostUsrLibDir string
+	innerUsrLibDir string
 	dockerConfig string
 	cpus int
 	memory int
@@ -370,6 +372,7 @@ func dockerCmd() *cobra.Command {
 	cliflag.StringVarP(cmd.Flags(), &flags.boostrapScript, "boostrap-script", "", EnvBootstrap, "", "The script to use to bootstrap the container. This should typically install and start the agent.")
 	cliflag.StringVarP(cmd.Flags(), &flags.containerMounts, "mounts", "", EnvMounts, "", "Comma separated list of mounts in the form of ':[:options]' (e.g. /var/lib/docker:/var/lib/docker:ro,/usr/src:/usr/src).")
 	cliflag.StringVarP(cmd.Flags(), &flags.hostUsrLibDir, "usr-lib-dir", "", EnvUsrLibDir, "", "The host /usr/lib mountpoint.
Used to detect GPU drivers to mount into inner container.") + cliflag.StringVarP(cmd.Flags(), &flags.innerUsrLibDir, "inner-usr-lib-dir", "", EnvInnerUsrLibDir, "", "The inner /usr/lib mountpoint. This is automatically detected based on /etc/os-release in the inner image, but may optionally be overridden.") cliflag.StringVarP(cmd.Flags(), &flags.dockerConfig, "docker-config", "", EnvDockerConfig, "/root/.docker/config.json", "The path to the docker config to consult when pulling an image.") cliflag.BoolVarP(cmd.Flags(), &flags.addTUN, "add-tun", "", EnvAddTun, false, "Add a TUN device to the inner container.") cliflag.BoolVarP(cmd.Flags(), &flags.addFUSE, "add-fuse", "", EnvAddFuse, false, "Add a FUSE device to the inner container.") @@ -396,14 +399,14 @@ func runDockerCVM(ctx context.Context, log slog.Logger, client dockerutil.Client if err != nil { return xerrors.Errorf("set oom score: %w", err) } - ref, err := name.NewTag(flags.innerImage) + ref, err := name.ParseReference(flags.innerImage) if err != nil { return xerrors.Errorf("parse ref: %w", err) } var dockerAuth dockerutil.AuthConfig if flags.imagePullSecret != "" { - dockerAuth, err = dockerutil.AuthConfigFromString(flags.imagePullSecret, ref.RegistryStr()) + dockerAuth, err = dockerutil.AuthConfigFromString(flags.imagePullSecret, ref.Context().RegistryStr()) if err != nil { return xerrors.Errorf("parse auth config: %w", err) } @@ -412,7 +415,7 @@ func runDockerCVM(ctx context.Context, log slog.Logger, client dockerutil.Client log.Info(ctx, "checking for docker config file", slog.F("path", flags.dockerConfig)) if _, err := fs.Stat(flags.dockerConfig); err == nil { log.Info(ctx, "detected file", slog.F("image", flags.innerImage)) - dockerAuth, err = dockerutil.AuthConfigFromPath(flags.dockerConfig, ref.RegistryStr()) + dockerAuth, err = dockerutil.AuthConfigFromPath(flags.dockerConfig, ref.Context().RegistryStr()) if err != nil && !xerrors.Is(err, os.ErrNotExist) { return xerrors.Errorf("auth config from file: %w", err) } @@ -523,7 +526,7 @@ func runDockerCVM(ctx context.Context, log slog.Logger, client dockerutil.Client // of the user so that we can chown directories to the namespaced UID inside // the inner container as well as whether we should be starting the container // with /sbin/init or something simple like 'sleep infinity'. - imgMeta, err := dockerutil.GetImageMetadata(ctx, client, flags.innerImage, flags.innerUsername) + imgMeta, err := dockerutil.GetImageMetadata(ctx, log, client, flags.innerImage, flags.innerUsername) if err != nil { return xerrors.Errorf("get image metadata: %w", err) } @@ -534,6 +537,8 @@ func runDockerCVM(ctx context.Context, log slog.Logger, client dockerutil.Client slog.F("uid", imgMeta.UID), slog.F("gid", imgMeta.GID), slog.F("has_init", imgMeta.HasInit), + slog.F("os_release", imgMeta.OsReleaseID), + slog.F("home_dir", imgMeta.HomeDir), ) uid, err := strconv.ParseInt(imgMeta.UID, 10, 32) @@ -614,16 +619,33 @@ func runDockerCVM(ctx context.Context, log slog.Logger, client dockerutil.Client }) } + innerUsrLibDir := imgMeta.UsrLibDir() + if flags.innerUsrLibDir != "" { + log.Info(ctx, "overriding auto-detected inner usr lib dir ", + slog.F("before", innerUsrLibDir), + slog.F("after", flags.innerUsrLibDir)) + innerUsrLibDir = flags.innerUsrLibDir + } for _, bind := range binds { // If the bind has a path that points to the host-mounted /usr/lib // directory we need to remap it to /usr/lib inside the container. 
mountpoint := bind.Path if strings.HasPrefix(mountpoint, flags.hostUsrLibDir) { mountpoint = filepath.Join( - "/usr/lib", + // Note: we used to mount into /usr/lib, but this can change + // based on the distro inside the container. + innerUsrLibDir, strings.TrimPrefix(mountpoint, strings.TrimSuffix(flags.hostUsrLibDir, "/")), ) } + // Even though xunix.GPUs checks for duplicate mounts, we need to check + // for duplicates again here after remapping the path. + if slices.ContainsFunc(mounts, func(m xunix.Mount) bool { + return m.Mountpoint == mountpoint + }) { + log.Debug(ctx, "skipping duplicate mount", slog.F("path", mountpoint)) + continue + } mounts = append(mounts, xunix.Mount{ Source: bind.Path, Mountpoint: mountpoint, @@ -634,6 +656,13 @@ func runDockerCVM(ctx context.Context, log slog.Logger, client dockerutil.Client } blog.Info("Creating workspace...") + // If imgMeta.HasInit is true, we just use flags.boostrapScript as the entrypoint. + // But if it's false, we need to run /sbin/init as the entrypoint. + // We need to mount or run some exec command that injects a systemd service for starting + // the coder agent. + + // We need to check that if PID1 is systemd (or /sbin/init) that systemd propagates SIGTERM + // to service units. If it doesn't then this solution doesn't help us. // Create the inner container. containerID, err := dockerutil.CreateContainer(ctx, client, &dockerutil.ContainerConfig{ diff --git a/cli/docker_test.go b/cli/docker_test.go index 88e9bc3..9f05201 100644 --- a/cli/docker_test.go +++ b/cli/docker_test.go @@ -110,6 +110,11 @@ func TestDocker(t *testing.T) { image: "gcr.io/ubuntu:24.04", success: true, }, + { + name: "RegistryRepositorySha", + image: "gcr.io/images/helloworld@sha256:13e101dd511a26a2147e123456bdff5845c9461aaa53d856845745b063001234", + success: true, + }, } for _, tc := range testcases { diff --git a/dockerutil/image.go b/dockerutil/image.go index ffb2bdb..f49cba0 100644 --- a/dockerutil/image.go +++ b/dockerutil/image.go @@ -15,6 +15,8 @@ import ( "github.com/docker/docker/api/types/image" "golang.org/x/xerrors" + "cdr.dev/slog" + "github.com/coder/envbox/buildlog" "github.com/coder/envbox/xunix" "github.com/coder/retry" @@ -22,6 +24,23 @@ import ( const diskFullStorageDriver = "vfs" +// Adapted from https://github.com/NVIDIA/libnvidia-container/blob/v1.15.0/src/nvc_container.c#L152-L165 +var UsrLibDirs = map[string]string{ + // Debian-based distros use a multi-arch directory. + "debian": usrLibMultiarchDir, + "ubuntu": usrLibMultiarchDir, + // Fedora and Redhat use the standard /usr/lib64. + "fedora": "/usr/lib64", + "rhel": "/usr/lib64", + // Fall back to the standard /usr/lib. + "linux": "/usr/lib", +} + +// /etc/os-release is the standard location for system identification data on +// Linux systems running systemd. +// Ref: https://www.freedesktop.org/software/systemd/man/latest/os-release.html +var etcOsRelease = "/etc/os-release" + type PullImageConfig struct { Client Client Image string @@ -148,15 +167,16 @@ func processImagePullEvents(r io.Reader, fn ImagePullProgressFn) error { } type ImageMetadata struct { - UID string - GID string - HomeDir string - HasInit bool + UID string + GID string + HomeDir string + HasInit bool + OsReleaseID string } // GetImageMetadata returns metadata about an image such as the UID/GID of the // provided username and whether it contains an /sbin/init that we should run. 
-func GetImageMetadata(ctx context.Context, client Client, img, username string) (ImageMetadata, error) { +func GetImageMetadata(ctx context.Context, log slog.Logger, client Client, img, username string) (ImageMetadata, error) { // Creating a dummy container to inspect the filesystem. created, err := client.ContainerCreate(ctx, &container.Config{ @@ -226,14 +246,58 @@ func GetImageMetadata(ctx context.Context, client Client, img, username string) return ImageMetadata{}, xerrors.Errorf("no users returned for username %s", username) } + // Read the /etc/os-release file to get the ID of the OS. + // We only care about the ID field. + var osReleaseID string + out, err = ExecContainer(ctx, client, ExecConfig{ + ContainerID: inspect.ID, + Cmd: "cat", + Args: []string{etcOsRelease}, + }) + if err != nil { + log.Error(ctx, "read os-release", slog.Error(err)) + log.Error(ctx, "falling back to linux for os-release ID") + osReleaseID = "linux" + } else { + osReleaseID = GetOSReleaseID(out) + } + return ImageMetadata{ - UID: users[0].Uid, - GID: users[0].Gid, - HomeDir: users[0].HomeDir, - HasInit: initExists, + UID: users[0].Uid, + GID: users[0].Gid, + HomeDir: users[0].HomeDir, + HasInit: initExists, + OsReleaseID: osReleaseID, }, nil } +// UsrLibDir returns the path to the /usr/lib directory for the given +// operating system determined by the /etc/os-release file. +func (im ImageMetadata) UsrLibDir() string { + if val, ok := UsrLibDirs[im.OsReleaseID]; ok && val != "" { + return val + } + return UsrLibDirs["linux"] // fallback +} + +// GetOSReleaseID returns the ID of the operating system from the +// raw contents of /etc/os-release. +func GetOSReleaseID(raw []byte) string { + var osReleaseID string + for _, line := range strings.Split(string(raw), "\n") { + if strings.HasPrefix(line, "ID=") { + osReleaseID = strings.TrimPrefix(line, "ID=") + // The value may be quoted. + osReleaseID = strings.Trim(osReleaseID, "\"") + break + } + } + if osReleaseID == "" { + return "linux" + } + return osReleaseID +} + func DefaultLogImagePullFn(log buildlog.Logger) func(ImagePullEvent) error { var ( // Avoid spamming too frequently, the messages can come quickly diff --git a/dockerutil/image_linux_amd64.go b/dockerutil/image_linux_amd64.go new file mode 100644 index 0000000..5952a29 --- /dev/null +++ b/dockerutil/image_linux_amd64.go @@ -0,0 +1,5 @@ +package dockerutil + +// usrLibMultiarchDir is defined for arm64 and amd64 architectures. +// Envbox is not published for other architectures. +var usrLibMultiarchDir = "/usr/lib/x86_64-linux-gnu" diff --git a/dockerutil/image_linux_arm64.go b/dockerutil/image_linux_arm64.go new file mode 100644 index 0000000..577c1be --- /dev/null +++ b/dockerutil/image_linux_arm64.go @@ -0,0 +1,5 @@ +package dockerutil + +// usrLibMultiarchDir is defined for arm64 and amd64 architectures. +// Envbox is not published for other architectures. +var usrLibMultiarchDir = "/usr/lib/aarch64-linux-gnu" diff --git a/integration/docker_test.go b/integration/docker_test.go index 9f88b09..533d2ba 100644 --- a/integration/docker_test.go +++ b/integration/docker_test.go @@ -1,6 +1,3 @@ -//go:build integration -// +build integration - package integration_test import ( @@ -24,6 +21,9 @@ import ( func TestDocker(t *testing.T) { t.Parallel() + if val, ok := os.LookupEnv("CODER_TEST_INTEGRATION"); !ok || val != "1" { + t.Skip("integration tests are skipped unless CODER_TEST_INTEGRATION=1") + } // Dockerd just tests that dockerd can spin up and function correctly. 
t.Run("Dockerd", func(t *testing.T) { @@ -240,28 +240,53 @@ func TestDocker(t *testing.T) { require.Equal(t, "1000", strings.TrimSpace(string(out))) // Validate that memory limit is being applied to the inner container. - out, err = integrationtest.ExecInnerContainer(t, pool, integrationtest.ExecConfig{ + // First check under cgroupv2 path. + if out, err = integrationtest.ExecInnerContainer(t, pool, integrationtest.ExecConfig{ ContainerID: resource.Container.ID, - Cmd: []string{"cat", "/sys/fs/cgroup/memory/memory.limit_in_bytes"}, - }) - require.NoError(t, err) - require.Equal(t, expectedMemoryLimit, strings.TrimSpace(string(out))) + Cmd: []string{"cat", "/sys/fs/cgroup/memory.max"}, + }); err == nil { + require.Equal(t, expectedMemoryLimit, strings.TrimSpace(string(out))) + } else { // fall back to cgroupv1 path. + out, err = integrationtest.ExecInnerContainer(t, pool, integrationtest.ExecConfig{ + ContainerID: resource.Container.ID, + Cmd: []string{"cat", "/sys/fs/cgroup/memory/memory.limit_in_bytes"}, + }) + require.NoError(t, err) + require.Equal(t, expectedMemoryLimit, strings.TrimSpace(string(out))) + } - periodStr, err := integrationtest.ExecInnerContainer(t, pool, integrationtest.ExecConfig{ + // Validate the cpu limits are being applied to the inner container. + // First check under cgroupv2 path. + var quota, period int64 + if out, err = integrationtest.ExecInnerContainer(t, pool, integrationtest.ExecConfig{ ContainerID: resource.Container.ID, - Cmd: []string{"cat", "/sys/fs/cgroup/cpu,cpuacct/cpu.cfs_period_us"}, - }) - require.NoError(t, err) - period, err := strconv.ParseInt(strings.TrimSpace(string(periodStr)), 10, 64) - require.NoError(t, err) + Cmd: []string{"cat", "/sys/fs/cgroup/cpu.max"}, + }); err == nil { + // out is in the format "period quota" + // e.g. "100000 100000" + fields := strings.Fields(string(out)) + require.Len(t, fields, 2) + period, err = strconv.ParseInt(fields[0], 10, 64) + require.NoError(t, err) + quota, err = strconv.ParseInt(fields[1], 10, 64) + require.NoError(t, err) + } else { // fall back to cgroupv1 path. + periodStr, err := integrationtest.ExecInnerContainer(t, pool, integrationtest.ExecConfig{ + ContainerID: resource.Container.ID, + Cmd: []string{"cat", "/sys/fs/cgroup/cpu,cpuacct/cpu.cfs_period_us"}, + }) + require.NoError(t, err) + period, err = strconv.ParseInt(strings.TrimSpace(string(periodStr)), 10, 64) + require.NoError(t, err) - quotaStr, err := integrationtest.ExecInnerContainer(t, pool, integrationtest.ExecConfig{ - ContainerID: resource.Container.ID, - Cmd: []string{"cat", "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"}, - }) - require.NoError(t, err) - quota, err := strconv.ParseInt(strings.TrimSpace(string(quotaStr)), 10, 64) - require.NoError(t, err) + quotaStr, err := integrationtest.ExecInnerContainer(t, pool, integrationtest.ExecConfig{ + ContainerID: resource.Container.ID, + Cmd: []string{"cat", "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"}, + }) + require.NoError(t, err) + quota, err = strconv.ParseInt(strings.TrimSpace(string(quotaStr)), 10, 64) + require.NoError(t, err) + } // Validate that the CPU limit is being applied to the inner container. 
actualLimit := float64(quota) / float64(period) diff --git a/integration/gpu_test.go b/integration/gpu_test.go new file mode 100644 index 0000000..f3fc1ac --- /dev/null +++ b/integration/gpu_test.go @@ -0,0 +1,305 @@ +package integration_test + +import ( + "context" + "os" + "os/exec" + "slices" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/coder/envbox/integration/integrationtest" +) + +func TestDocker_Nvidia(t *testing.T) { + t.Parallel() + if val, ok := os.LookupEnv("CODER_TEST_INTEGRATION"); !ok || val != "1" { + t.Skip("integration tests are skipped unless CODER_TEST_INTEGRATION=1") + } + // Only run this test if the nvidia container runtime is detected. + // Check if the nvidia runtime is available using `docker info`. + // The docker client doesn't expose this information so we need to fetch it ourselves. + if !slices.Contains(dockerRuntimes(t), "nvidia") { + t.Skip("this test requires nvidia runtime to be available") + } + + t.Run("Ubuntu", func(t *testing.T) { + t.Parallel() + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + // Start the envbox container. + ctID := startEnvboxCmd(ctx, t, integrationtest.UbuntuImage, "root", + "-v", "/usr/lib/x86_64-linux-gnu:/var/coder/usr/lib", + "--env", "CODER_ADD_GPU=true", + "--env", "CODER_USR_LIB_DIR=/var/coder/usr/lib", + "--runtime=nvidia", + "--gpus=all", + ) + + // Assert that we can run nvidia-smi in the inner container. + assertInnerNvidiaSMI(ctx, t, ctID) + }) + + t.Run("Redhat", func(t *testing.T) { + t.Parallel() + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + // Start the envbox container. + ctID := startEnvboxCmd(ctx, t, integrationtest.RedhatImage, "root", + "-v", "/usr/lib/x86_64-linux-gnu:/var/coder/usr/lib", + "--env", "CODER_ADD_GPU=true", + "--env", "CODER_USR_LIB_DIR=/var/coder/usr/lib", + "--runtime=nvidia", + "--gpus=all", + ) + + // Assert that we can run nvidia-smi in the inner container. + assertInnerNvidiaSMI(ctx, t, ctID) + + // Make sure dnf still works. This checks for a regression due to + // gpuExtraRegex matching `libglib.so` in the outer container. + // This had a dependency on `libpcre.so.3` which would cause dnf to fail. + out, err := execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "dnf") + if !assert.NoError(t, err, "failed to run dnf in the inner container") { + t.Logf("dnf output:\n%s", strings.TrimSpace(out)) + } + + // Make sure libglib.so is not present in the inner container. + out, err = execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "ls", "-1", "/usr/lib/x86_64-linux-gnu/libglib*") + // An error is expected here. + assert.Error(t, err, "libglib should not be present in the inner container") + assert.Contains(t, out, "No such file or directory", "libglib should not be present in the inner container") + }) + + t.Run("InnerUsrLibDirOverride", func(t *testing.T) { + t.Parallel() + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + // Start the envbox container. + ctID := startEnvboxCmd(ctx, t, integrationtest.UbuntuImage, "root", + "-v", "/usr/lib/x86_64-linux-gnu:/var/coder/usr/lib", + "--env", "CODER_ADD_GPU=true", + "--env", "CODER_USR_LIB_DIR=/var/coder/usr/lib", + "--env", "CODER_INNER_USR_LIB_DIR=/usr/lib/coder", + "--runtime=nvidia", + "--gpus=all", + ) + + // Assert that the libraries end up in the expected location in the inner + // container. 
+ out, err := execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "ls", "-1", "/usr/lib/coder") + require.NoError(t, err, "inner usr lib dir override failed") + require.Regexp(t, `(?i)(libgl|nvidia|vulkan|cuda)`, out) + }) + + t.Run("EmptyHostUsrLibDir", func(t *testing.T) { + t.Parallel() + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + emptyUsrLibDir := t.TempDir() + + // Start the envbox container. + ctID := startEnvboxCmd(ctx, t, integrationtest.UbuntuImage, "root", + "-v", emptyUsrLibDir+":/var/coder/usr/lib", + "--env", "CODER_ADD_GPU=true", + "--env", "CODER_USR_LIB_DIR=/var/coder/usr/lib", + "--runtime=nvidia", + "--gpus=all", + ) + + ofs := outerFiles(ctx, t, ctID, "/usr/lib/x86_64-linux-gnu/libnv*") + // Assert invariant: the outer container has the files we expect. + require.NotEmpty(t, ofs, "failed to list outer container files") + // Assert that expected files are available in the inner container. + assertInnerFiles(ctx, t, ctID, "/usr/lib/x86_64-linux-gnu/libnv*", ofs...) + assertInnerNvidiaSMI(ctx, t, ctID) + }) + + t.Run("CUDASample", func(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + // Start the envbox container. + ctID := startEnvboxCmd(ctx, t, integrationtest.CUDASampleImage, "root", + "-v", "/usr/lib/x86_64-linux-gnu:/var/coder/usr/lib", + "--env", "CODER_ADD_GPU=true", + "--env", "CODER_USR_LIB_DIR=/var/coder/usr/lib", + "--runtime=nvidia", + "--gpus=all", + ) + + // Assert that we can run nvidia-smi in the inner container. + assertInnerNvidiaSMI(ctx, t, ctID) + + // Assert that /tmp/vectorAdd runs successfully in the inner container. + _, err := execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "/tmp/vectorAdd") + require.NoError(t, err, "failed to run /tmp/vectorAdd in the inner container") + }) +} + +// dockerRuntimes returns the list of container runtimes available on the host. +// It does this by running `docker info` and parsing the output. +func dockerRuntimes(t *testing.T) []string { + t.Helper() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "docker", "info", "--format", "{{ range $k, $v := .Runtimes}}{{ println $k }}{{ end }}") + out, err := cmd.CombinedOutput() + require.NoError(t, err, "failed to get docker runtimes: %s", out) + raw := strings.TrimSpace(string(out)) + return strings.Split(raw, "\n") +} + +// outerFiles returns the list of files in the outer container matching the +// given pattern. It does this by running `ls -1` in the outer container. +func outerFiles(ctx context.Context, t *testing.T, containerID, pattern string) []string { + t.Helper() + // We need to use /bin/sh -c to avoid the shell interpreting the glob. + out, err := execContainerCmd(ctx, t, containerID, "/bin/sh", "-c", "ls -1 "+pattern) + require.NoError(t, err, "failed to list outer container files") + files := strings.Split(strings.TrimSpace(out), "\n") + slices.Sort(files) + return files +} + +// assertInnerFiles checks that all the files matching the given pattern exist in the +// inner container. +func assertInnerFiles(ctx context.Context, t *testing.T, containerID, pattern string, expected ...string) { + t.Helper() + + // Get the list of files in the inner container. + // We need to use /bin/sh -c to avoid the shell interpreting the glob. 
+ out, err := execContainerCmd(ctx, t, containerID, "docker", "exec", "workspace_cvm", "/bin/sh", "-c", "ls -1 "+pattern) + require.NoError(t, err, "failed to list inner container files") + innerFiles := strings.Split(strings.TrimSpace(out), "\n") + + // Check that the expected files exist in the inner container. + missingFiles := make([]string, 0) + for _, expectedFile := range expected { + if !slices.Contains(innerFiles, expectedFile) { + missingFiles = append(missingFiles, expectedFile) + } + } + require.Empty(t, missingFiles, "missing files in inner container: %s", strings.Join(missingFiles, ", ")) +} + +// assertInnerNvidiaSMI checks that nvidia-smi runs successfully in the inner +// container. +func assertInnerNvidiaSMI(ctx context.Context, t *testing.T, containerID string) { + t.Helper() + // Assert that we can run nvidia-smi in the inner container. + out, err := execContainerCmd(ctx, t, containerID, "docker", "exec", "workspace_cvm", "nvidia-smi") + require.NoError(t, err, "failed to run nvidia-smi in the inner container") + require.Contains(t, out, "NVIDIA-SMI", "nvidia-smi output does not contain NVIDIA-SMI") +} + +// startEnvboxCmd starts the envbox container with the given arguments. +// Ideally we would use ory/dockertest for this, but it doesn't support +// specifying the runtime. We have alternatively used the docker client library, +// but a nice property of using the docker cli is that if a test fails, you can +// just run the command manually to debug! +func startEnvboxCmd(ctx context.Context, t *testing.T, innerImage, innerUser string, addlArgs ...string) (containerID string) { + t.Helper() + + var ( + tmpDir = integrationtest.TmpDir(t) + binds = integrationtest.DefaultBinds(t, tmpDir) + cancelCtx, cancel = context.WithCancel(ctx) + ) + t.Cleanup(cancel) + + // Unfortunately ory/dockertest does not allow us to specify runtime. + // We're instead going to just run the container directly via the docker cli. + startEnvboxArgs := []string{ + "run", + "--detach", + "--rm", + "--privileged", + "--env", "CODER_INNER_IMAGE=" + innerImage, + "--env", "CODER_INNER_USERNAME=" + innerUser, + } + for _, bind := range binds { + bindParts := []string{bind.Source, bind.Target} + if bind.ReadOnly { + bindParts = append(bindParts, "ro") + } + startEnvboxArgs = append(startEnvboxArgs, []string{"-v", strings.Join(bindParts, ":")}...) + } + startEnvboxArgs = append(startEnvboxArgs, addlArgs...) + startEnvboxArgs = append(startEnvboxArgs, "envbox:latest", "/envbox", "docker") + t.Logf("envbox docker cmd: docker %s", strings.Join(startEnvboxArgs, " ")) + + // Start the envbox container without attaching. + startEnvboxCmd := exec.CommandContext(cancelCtx, "docker", startEnvboxArgs...) + out, err := startEnvboxCmd.CombinedOutput() + require.NoError(t, err, "failed to start envbox container") + containerID = strings.TrimSpace(string(out)) + t.Logf("envbox container ID: %s", containerID) + t.Cleanup(func() { + if t.Failed() { + // Dump the logs if the test failed. + logsCmd := exec.Command("docker", "logs", containerID) + out, err := logsCmd.CombinedOutput() + if err != nil { + t.Logf("failed to read logs: %s", err) + } + t.Logf("envbox logs:\n%s", string(out)) + } + // Stop the envbox container. + stopEnvboxCmd := exec.Command("docker", "rm", "-f", containerID) + out, err := stopEnvboxCmd.CombinedOutput() + if err != nil { + t.Errorf("failed to stop envbox container: %s", out) + } + }) + + // Wait for the Docker CVM to come up. 
+ waitCtx, waitCancel := context.WithTimeout(cancelCtx, 5*time.Minute) + defer waitCancel() +WAITLOOP: + for { + select { + case <-waitCtx.Done(): + t.Fatal("timed out waiting for inner container to come up") + default: + execCmd := exec.CommandContext(cancelCtx, "docker", "exec", containerID, "docker", "inspect", "workspace_cvm") + out, err := execCmd.CombinedOutput() + if err != nil { + t.Logf("waiting for inner container to come up:\n%s", string(out)) + <-time.After(time.Second) + continue WAITLOOP + } + t.Logf("inner container is up") + break WAITLOOP + } + } + + return containerID +} + +func execContainerCmd(ctx context.Context, t *testing.T, containerID string, cmdArgs ...string) (string, error) { + t.Helper() + + execArgs := []string{"exec", containerID} + execArgs = append(execArgs, cmdArgs...) + t.Logf("exec cmd: docker %s", strings.Join(execArgs, " ")) + execCmd := exec.CommandContext(ctx, "docker", execArgs...) + out, err := execCmd.CombinedOutput() + if err != nil { + t.Logf("exec cmd failed: %s\n%s", err.Error(), string(out)) + } else { + t.Logf("exec cmd success: %s", out) + } + return strings.TrimSpace(string(out)), err +} diff --git a/integration/integrationtest/docker.go b/integration/integrationtest/docker.go index 6b5e07c..5b42bbe 100644 --- a/integration/integrationtest/docker.go +++ b/integration/integrationtest/docker.go @@ -40,6 +40,11 @@ const ( // UbuntuImage is just vanilla ubuntu (80MB) but the user is set to a non-root // user . UbuntuImage = "gcr.io/coder-dev-1/sreya/ubuntu-coder" + // Redhat UBI9 image as of 2025-03-05 + RedhatImage = "registry.access.redhat.com/ubi9/ubi:9.5" + // CUDASampleImage is a CUDA sample image from NVIDIA's container registry. + // It contains a binary /tmp/vectorAdd which can be run to test the CUDA setup. + CUDASampleImage = "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2" // RegistryImage is used to assert that we add certs // correctly to the docker daemon when pulling an image diff --git a/xunix/gpu.go b/xunix/gpu.go index a494ab5..a9129d5 100644 --- a/xunix/gpu.go +++ b/xunix/gpu.go @@ -6,6 +6,7 @@ import ( "os" "path/filepath" "regexp" + "slices" "sort" "strings" @@ -17,9 +18,10 @@ import ( ) var ( - gpuMountRegex = regexp.MustCompile("(?i)(nvidia|vulkan|cuda)") - gpuExtraRegex = regexp.MustCompile("(?i)(libgl|nvidia|vulkan|cuda)") - gpuEnvRegex = regexp.MustCompile("(?i)nvidia") + gpuMountRegex = regexp.MustCompile(`(?i)(nvidia|vulkan|cuda)`) + gpuExtraRegex = regexp.MustCompile(`(?i)(libgl(e|sx|\.)|nvidia|vulkan|cuda)`) + gpuEnvRegex = regexp.MustCompile(`(?i)nvidia`) + sharedObjectRegex = regexp.MustCompile(`\.so(\.[0-9\.]+)?$`) ) func GPUEnvs(ctx context.Context) []string { @@ -38,6 +40,7 @@ func GPUEnvs(ctx context.Context) []string { func GPUs(ctx context.Context, log slog.Logger, usrLibDir string) ([]Device, []mount.MountPoint, error) { var ( + afs = GetFS(ctx) mounter = Mounter(ctx) devices = []Device{} binds = []mount.MountPoint{} @@ -63,6 +66,22 @@ func GPUs(ctx context.Context, log slog.Logger, usrLibDir string) ([]Device, []m // If it's not in /dev treat it as a bind mount. binds = append(binds, m) + // We also want to find any symlinks that point to the target. + // This is important for the nvidia driver as it mounts the driver + // files with the driver version appended to the end, and creates + // symlinks that point to the actual files. 
+ links, err := SameDirSymlinks(afs, m.Path) + if err != nil { + log.Error(ctx, "find symlinks", slog.F("path", m.Path), slog.Error(err)) + } else { + for _, link := range links { + log.Debug(ctx, "found symlink", slog.F("link", link), slog.F("target", m.Path)) + binds = append(binds, mount.MountPoint{ + Path: link, + Opts: []string{"ro"}, + }) + } + } } } @@ -103,7 +122,11 @@ func usrLibGPUs(ctx context.Context, log slog.Logger, usrLibDir string) ([]mount return nil } - if filepath.Ext(path) != ".so" || !gpuExtraRegex.MatchString(path) { + if !gpuExtraRegex.MatchString(path) { + return nil + } + + if !sharedObjectRegex.MatchString(path) { return nil } @@ -175,6 +198,75 @@ func recursiveSymlinks(afs FS, mountpoint string, path string) ([]string, error) return paths, nil } +// SameDirSymlinks returns all links in the same directory as `target` that +// point to target, either indirectly or directly. Only symlinks in the same +// directory as `target` are considered. +func SameDirSymlinks(afs FS, target string) ([]string, error) { + // Get the list of files in the directory of the target. + fis, err := afero.ReadDir(afs, filepath.Dir(target)) + if err != nil { + return nil, xerrors.Errorf("read dir %q: %w", filepath.Dir(target), err) + } + + // Do an initial pass to map all symlinks to their destinations. + allLinks := make(map[string]string) + for _, fi := range fis { + // Ignore non-symlinks. + if fi.Mode()&os.ModeSymlink == 0 { + continue + } + + absPath := filepath.Join(filepath.Dir(target), fi.Name()) + link, err := afs.Readlink(filepath.Join(filepath.Dir(target), fi.Name())) + if err != nil { + return nil, xerrors.Errorf("readlink %q: %w", fi.Name(), err) + } + + if !filepath.IsAbs(link) { + link = filepath.Join(filepath.Dir(target), link) + } + allLinks[absPath] = link + } + + // Now we can start checking for symlinks that point to the target. + var ( + found = make([]string, 0) + // Set an arbitrary upper limit to prevent infinite loops. + maxIterations = 10 + ) + for range maxIterations { + var foundThisTime bool + for linkName, linkDest := range allLinks { + // Ignore symlinks that point outside of target's directory. + if filepath.Dir(linkName) != filepath.Dir(target) { + continue + } + + // If the symlink points to the target, add it to the list. + if linkDest == target { + if !slices.Contains(found, linkName) { + found = append(found, linkName) + foundThisTime = true + } + } + + // If the symlink points to another symlink that we already determined + // points to the target, add it to the list. + if slices.Contains(found, linkDest) { + if !slices.Contains(found, linkName) { + found = append(found, linkName) + foundThisTime = true + } + } + } + // If we didn't find any new symlinks, we're done. + if !foundThisTime { + break + } + } + return found, nil +} + // TryUnmountProcGPUDrivers unmounts any GPU-related mounts under /proc as it causes // issues when creating any container in some cases. Errors encountered while // unmounting are treated as non-fatal. 
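As an illustrative aside (not part of the patch): the tightened `gpuExtraRegex` together with the new `sharedObjectRegex` is what keeps `libglib.so` out of the inner container while still matching versioned NVIDIA driver objects. Below is a minimal, self-contained sketch of that matching behavior; the two patterns are copied from the diff above, and the file paths are made up for illustration.

```go
package main

import (
	"fmt"
	"regexp"
)

// Patterns copied verbatim from xunix/gpu.go in this change.
var (
	gpuExtraRegex     = regexp.MustCompile(`(?i)(libgl(e|sx|\.)|nvidia|vulkan|cuda)`)
	sharedObjectRegex = regexp.MustCompile(`\.so(\.[0-9\.]+)?$`)
)

func main() {
	// Made-up example paths under a mounted host /usr/lib directory.
	for _, p := range []string{
		"/var/coder/usr/lib/libnvidia-ml.so.550.54.14", // versioned driver lib: mounted
		"/var/coder/usr/lib/libcuda.so.1",              // matches "cuda": mounted
		"/var/coder/usr/lib/libglib.so",                // no longer matches "libgl": skipped
		"/var/coder/usr/lib/libcurl-gnutls.so",         // unrelated library: skipped
	} {
		mount := gpuExtraRegex.MatchString(p) && sharedObjectRegex.MatchString(p)
		fmt.Printf("%-50s mount=%v\n", p, mount)
	}
}
```

This mirrors the two checks in `usrLibGPUs`: a file is only bind-mounted if it matches `gpuExtraRegex` and looks like a shared object, which is what avoids the `libglib.so`/`dnf` regression exercised in the Red Hat integration test.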
diff --git a/xunix/gpu_test.go b/xunix/gpu_test.go index 4cbf5f0..4324fcf 100644 --- a/xunix/gpu_test.go +++ b/xunix/gpu_test.go @@ -2,10 +2,13 @@ package xunix_test import ( "context" + "os" "path/filepath" + "sort" "testing" "github.com/spf13/afero" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "k8s.io/mount-utils" @@ -56,14 +59,20 @@ func TestGPUs(t *testing.T) { expectedUsrLibFiles = []string{ filepath.Join(usrLibMountpoint, "nvidia", "libglxserver_nvidia.so"), filepath.Join(usrLibMountpoint, "libnvidia-ml.so"), + filepath.Join(usrLibMountpoint, "nvidia", "libglxserver_nvidia.so.1"), } - // fakeUsrLibFiles are files that should be written to the "mounted" - // /usr/lib directory. It includes files that shouldn't be returned. - fakeUsrLibFiles = append([]string{ + // fakeUsrLibFiles are files that we do not expect to be returned + // bind mounts for. + fakeUsrLibFiles = []string{ filepath.Join(usrLibMountpoint, "libcurl-gnutls.so"), - filepath.Join(usrLibMountpoint, "nvidia", "libglxserver_nvidia.so.1"), - }, expectedUsrLibFiles...) + filepath.Join(usrLibMountpoint, "libglib.so"), + } + + // allUsrLibFiles are all the files that should be written to the + // "mounted" /usr/lib directory. It includes files that shouldn't + // be returned. + allUsrLibFiles = append(expectedUsrLibFiles, fakeUsrLibFiles...) ) ctx := xunix.WithFS(context.Background(), fs) @@ -90,15 +99,19 @@ func TestGPUs(t *testing.T) { err := fs.MkdirAll(filepath.Join(usrLibMountpoint, "nvidia"), 0o755) require.NoError(t, err) - for _, file := range fakeUsrLibFiles { + for _, file := range allUsrLibFiles { _, err = fs.Create(file) require.NoError(t, err) } + for _, mp := range mounter.MountPoints { + _, err = fs.Create(mp.Path) + require.NoError(t, err) + } devices, binds, err := xunix.GPUs(ctx, log, usrLibMountpoint) require.NoError(t, err) require.Len(t, devices, 2, "unexpected 2 nvidia devices") - require.Len(t, binds, 3, "expected 4 nvidia binds") + require.Len(t, binds, 4, "expected 4 nvidia binds") require.Contains(t, binds, mount.MountPoint{ Device: "/dev/sda1", Path: "/usr/local/nvidia", @@ -110,5 +123,111 @@ func TestGPUs(t *testing.T) { Opts: []string{"ro"}, }) } + for _, file := range fakeUsrLibFiles { + require.NotContains(t, binds, mount.MountPoint{ + Path: file, + Opts: []string{"ro"}, + }) + } }) } + +func Test_SameDirSymlinks(t *testing.T) { + t.Parallel() + + var ( + ctx = context.Background() + // We need to test with a real filesystem as the fake one doesn't + // support creating symlinks. + tmpDir = t.TempDir() + // We do test with the interface though! + afs = xunix.GetFS(ctx) + ) + + // Create some files in the temporary directory. + _, err := os.Create(filepath.Join(tmpDir, "file1.real")) + require.NoError(t, err, "create file") + _, err = os.Create(filepath.Join(tmpDir, "file2.real")) + require.NoError(t, err, "create file2") + _, err = os.Create(filepath.Join(tmpDir, "file3.real")) + require.NoError(t, err, "create file3") + _, err = os.Create(filepath.Join(tmpDir, "file4.real")) + require.NoError(t, err, "create file4") + + // Create a symlink to the file in the temporary directory. + // This needs to be done by the real os package. + err = os.Symlink(filepath.Join(tmpDir, "file1.real"), filepath.Join(tmpDir, "file1.link1")) + require.NoError(t, err, "create first symlink") + + // Create another symlink to the previous symlink. 
+ err = os.Symlink(filepath.Join(tmpDir, "file1.link1"), filepath.Join(tmpDir, "file1.link2")) + require.NoError(t, err, "create second symlink") + + // Create a symlink to a file outside of the temporary directory. + err = os.MkdirAll(filepath.Join(tmpDir, "dir"), 0o755) + require.NoError(t, err, "create dir") + // Create a symlink from file2 to inside the dir. + err = os.Symlink(filepath.Join(tmpDir, "file2.real"), filepath.Join(tmpDir, "dir", "file2.link1")) + require.NoError(t, err, "create dir symlink") + + // Create a symlink with a relative path. To do this, we need to + // change the working directory to the temporary directory. + oldWorkingDir, err := os.Getwd() + require.NoError(t, err, "get working dir") + // Change the working directory to the temporary directory. + require.NoError(t, os.Chdir(tmpDir), "change working dir") + err = os.Symlink(filepath.Join(tmpDir, "file4.real"), "file4.link1") + require.NoError(t, err, "create relative symlink") + // Change the working directory back to the original. + require.NoError(t, os.Chdir(oldWorkingDir), "change working dir back") + + for _, tt := range []struct { + name string + expected []string + }{ + { + // Two symlinks to the same file. + name: "file1.real", + expected: []string{ + filepath.Join(tmpDir, "file1.link1"), + filepath.Join(tmpDir, "file1.link2"), + }, + }, + { + // Mid-way in the symlink chain. + name: "file1.link1", + expected: []string{ + filepath.Join(tmpDir, "file1.link2"), + }, + }, + { + // End of the symlink chain. + name: "file1.link2", + expected: []string{}, + }, + { + // Symlink to a file outside of the temporary directory. + name: "file2.real", + expected: []string{}, + }, + { + // No symlinks to this file. + name: "file3.real", + expected: []string{}, + }, + { + // One relative symlink. + name: "file4.real", + expected: []string{filepath.Join(tmpDir, "file4.link1")}, + }, + } { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + fullPath := filepath.Join(tmpDir, tt.name) + actual, err := xunix.SameDirSymlinks(afs, fullPath) + require.NoError(t, err, "find symlink") + sort.Strings(actual) + assert.Equal(t, tt.expected, actual, "find symlinks %q", tt.name) + }) + } +}
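Finally, an illustrative sketch (not part of the patch) of how the new `dockerutil` helpers compose: `GetOSReleaseID` extracts the `ID=` field from `/etc/os-release`, and `ImageMetadata.UsrLibDir` maps that ID to the inner mount location that `CODER_INNER_USR_LIB_DIR` can override. The import path follows the module path used elsewhere in the repo; the sample os-release contents are made up.

```go
package main

import (
	"fmt"

	"github.com/coder/envbox/dockerutil"
)

func main() {
	// Made-up /etc/os-release contents for an Ubuntu-based inner image.
	raw := []byte("NAME=\"Ubuntu\"\nID=ubuntu\nID_LIKE=debian\n")

	id := dockerutil.GetOSReleaseID(raw) // "ubuntu"
	meta := dockerutil.ImageMetadata{OsReleaseID: id}

	// On amd64 this prints "ubuntu /usr/lib/x86_64-linux-gnu"; unknown IDs
	// fall back to "/usr/lib" via the "linux" key in UsrLibDirs.
	fmt.Println(id, meta.UsrLibDir())
}
```

On arm64 builds the same call would yield `/usr/lib/aarch64-linux-gnu`, per the per-architecture files added above.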