Skip to content

chore: cache terraform providers between CI test runs #17373

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 28, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
set up a terraform providers mirror for tests
  • Loading branch information
hugodutka committed Apr 16, 2025
commit f11fea6f3010429f6b9779d85586ab91f0f3a220
8 changes: 6 additions & 2 deletions provisioner/terraform/executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@ type executor struct {
mut *sync.Mutex
binaryPath string
// cachePath and workdir must not be used by multiple processes at once.
cachePath string
workdir string
cachePath string
cliConfigPath string
workdir string
// used to capture execution times at various stages
timings *timingAggregator
}
Expand All @@ -50,6 +51,9 @@ func (e *executor) basicEnv() []string {
if e.cachePath != "" && runtime.GOOS == "linux" {
env = append(env, "TF_PLUGIN_CACHE_DIR="+e.cachePath)
}
if e.cliConfigPath != "" {
env = append(env, "TF_CLI_CONFIG_FILE="+e.cliConfigPath)
}
return env
}

Expand Down
187 changes: 175 additions & 12 deletions provisioner/terraform/provision_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,17 @@
package terraform_test

import (
"bytes"
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"net"
"net/http"
"os"
"os/exec"
"path/filepath"
"sort"
"strings"
Expand All @@ -29,10 +33,11 @@ import (
)

type provisionerServeOptions struct {
binaryPath string
exitTimeout time.Duration
workDir string
logger *slog.Logger
binaryPath string
cliConfigPath string
exitTimeout time.Duration
workDir string
logger *slog.Logger
}

func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Context, proto.DRPCProvisionerClient) {
Expand Down Expand Up @@ -66,9 +71,10 @@ func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Cont
Logger: *opts.logger,
WorkDirectory: opts.workDir,
},
BinaryPath: opts.binaryPath,
CachePath: cachePath,
ExitTimeout: opts.exitTimeout,
BinaryPath: opts.binaryPath,
CachePath: cachePath,
ExitTimeout: opts.exitTimeout,
CliConfigPath: opts.cliConfigPath,
})
}()
api := proto.NewDRPCProvisionerClient(client)
Expand All @@ -85,6 +91,149 @@ func configure(ctx context.Context, t *testing.T, client proto.DRPCProvisionerCl
return sess
}

func hashTemplateFilesAndTestName(t *testing.T, testName string, templateFiles map[string]string) string {
t.Helper()

sortedFileNames := make([]string, 0, len(templateFiles))
for fileName := range templateFiles {
sortedFileNames = append(sortedFileNames, fileName)
}
sort.Strings(sortedFileNames)

hasher := sha256.New()
for _, fileName := range sortedFileNames {
file := templateFiles[fileName]
_, err := hasher.Write([]byte(fileName))
require.NoError(t, err)
_, err = hasher.Write([]byte(file))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need to include some sort of delimiter or dogfood: stuff will hash the same as dog: foodstuff

require.NoError(t, err)
}
_, err := hasher.Write([]byte(testName))
require.NoError(t, err)
return hex.EncodeToString(hasher.Sum(nil))
}

const (
terraformConfigFileName = "terraform.rc"
cacheProvidersDirName = "providers"
cacheTemplateFilesDirName = "files"
)

// Writes a Terraform CLI config file (`terraform.rc`) in `dir` to enforce using the local provider mirror.
// This blocks network access for providers, forcing Terraform to use only what's cached in `dir`.
// Returns the path to the generated config file.
func writeCliConfig(t *testing.T, dir string) string {
t.Helper()

cliConfigPath := filepath.Join(dir, terraformConfigFileName)
require.NoError(t, os.MkdirAll(filepath.Dir(cliConfigPath), 0o700))

content := fmt.Sprintf(`
provider_installation {
filesystem_mirror {
path = "%s"
include = ["*/*"]
}
direct {
exclude = ["*/*"]
}
}
`, filepath.Join(dir, cacheProvidersDirName))
require.NoError(t, os.WriteFile(cliConfigPath, []byte(content), 0o600))
return cliConfigPath
}

func runCmd(t *testing.T, dir string, args ...string) {
t.Helper()

stdout, stderr := bytes.NewBuffer(nil), bytes.NewBuffer(nil)
cmd := exec.Command(args[0], args[1:]...) //#nosec
cmd.Dir = dir
cmd.Stdout = stdout
cmd.Stderr = stderr
if err := cmd.Run(); err != nil {
t.Fatalf("failed to run %s: %s\nstdout: %s\nstderr: %s", strings.Join(args, " "), err, stdout.String(), stderr.String())
}
}

// Ensures Terraform providers are downloaded and cached locally in a unique directory for this test.
// Uses `terraform init` then `mirror` to populate the cache if needed.
// Returns the cache directory path.
func downloadProviders(t *testing.T, rootDir string, templateFiles map[string]string) string {
t.Helper()

// Each test gets a unique cache dir based on its name and template files.
// This ensures that tests can download providers in parallel and that they
// will redownload providers if the template files change.
hash := hashTemplateFilesAndTestName(t, t.Name(), templateFiles)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Providers are versioned, so I don't think each test or each version of each test needs its own unique mirror.

It results in multiple copies of the providers being downloaded on each system, and there doesn't seem to be any mechanism to clean up old versions of tests. The CI caches expire after 14 days, but people's personal development environments won't get cleaned up.

terraform providers mirror can be run multiple times on the same target directory and it will merge the results:

You can run terraform providers mirror again on an existing mirror directory to update it with new packages.

https://developer.hashicorp.com/terraform/cli/commands/providers/mirror

So, what do you think of doing two passes over the test cases: one serially that sets of the mirror with the totality of the modules we need, then a second, parallel pass that runs the tests?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doing a serial pass is pretty slow if you haven't cached the providers beforehand. I just tried it, and it takes 27 seconds to cache the providers serially in my Coder workspace. In contrast, doing it in parallel only takes 5 seconds.

The cache size across all tests is currently around 70 MB - probably not a big deal. That said, it could balloon if someone starts iterating on a test locally. To avoid that, maybe we should add cleanup logic instead of doing a serial pass?

We already know the paths to provider directories based on test hashes, so at the start of TestProvision, we could check which ones are needed and remove the rest.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cleanup logic would also be fine here, so we don't blow up people's cache.

dir := filepath.Join(rootDir, hash[:12])
if _, err := os.Stat(dir); err == nil {
t.Logf("%s: using cached terraform providers", t.Name())
return dir
}
filesDir := filepath.Join(dir, cacheTemplateFilesDirName)
defer func() {
// The files dir will contain a copy of terraform providers generated
// by the terraform init command. We don't want to persist them since
// we already have a registry mirror in the providers dir.
if err := os.RemoveAll(filesDir); err != nil {
t.Logf("failed to remove files dir %s: %s", filesDir, err)
}
if !t.Failed() {
return
}
if err := os.RemoveAll(dir); err != nil {
t.Logf("failed to remove dir %s: %s", dir, err)
}
}()

require.NoError(t, os.MkdirAll(filesDir, 0o700))

for fileName, file := range templateFiles {
filePath := filepath.Join(filesDir, fileName)
if _, err := os.Stat(filePath); os.IsNotExist(err) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

didn't we just check that the directory doesn't already exist? How could the file already exist if the directory doesn't?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right, this check is redundant. I'll remove it.

require.NoError(t, os.MkdirAll(filepath.Dir(filePath), 0o700))
require.NoError(t, os.WriteFile(filePath, []byte(file), 0o600))
}
}

providersDir := filepath.Join(dir, cacheProvidersDirName)
require.NoError(t, os.MkdirAll(providersDir, 0o700))

// We need to run init because if a test uses modules in its template,
// the mirror command will fail without it.
runCmd(t, filesDir, "terraform", "init")
// Now, mirror the providers into `providersDir`. We use this explicit mirror
// instead of relying only on the standard Terraform plugin cache.
//
// Why? Because this mirror, when used with the CLI config from `writeCliConfig`,
// prevents Terraform from hitting the network registry during `plan`. This cuts
// down on network calls, making CI tests less flaky.
//
// In contrast, the standard cache *still* contacts the registry for metadata
// during `init`, even if the plugins are already cached locally - see link below.
//
// Ref: https://developer.hashicorp.com/terraform/cli/config/config-file#provider-plugin-cache
// > When a plugin cache directory is enabled, the terraform init command will
// > still use the configured or implied installation methods to obtain metadata
// > about which plugins are available
runCmd(t, filesDir, "terraform", "providers", "mirror", providersDir)

return dir
}

// Caches providers locally and generates a Terraform CLI config to use *only* that cache.
// This setup prevents network access for providers during `terraform init`, improving reliability
// in subsequent test runs.
// Returns the path to the generated CLI config file.
func cacheProviders(t *testing.T, templateFiles map[string]string, rootDir string) string {
t.Helper()

providersParentDir := downloadProviders(t, rootDir, templateFiles)
cliConfigPath := writeCliConfig(t, providersParentDir)
return cliConfigPath
}

func readProvisionLog(t *testing.T, response proto.DRPCProvisioner_SessionClient) string {
var logBuf strings.Builder
for {
Expand Down Expand Up @@ -352,6 +501,8 @@ func TestProvision(t *testing.T) {
Apply bool
// Some tests may need to be skipped until the relevant provider version is released.
SkipReason string
// If SkipCacheProviders is true, then skip caching the terraform providers for this test.
SkipCacheProviders bool
}{
{
Name: "missing-variable",
Expand Down Expand Up @@ -422,16 +573,18 @@ func TestProvision(t *testing.T) {
Files: map[string]string{
"main.tf": `a`,
},
ErrorContains: "initialize terraform",
ExpectLogContains: "Argument or block definition required",
ErrorContains: "initialize terraform",
ExpectLogContains: "Argument or block definition required",
SkipCacheProviders: true,
},
{
Name: "bad-syntax-2",
Files: map[string]string{
"main.tf": `;asdf;`,
},
ErrorContains: "initialize terraform",
ExpectLogContains: `The ";" character is not valid.`,
ErrorContains: "initialize terraform",
ExpectLogContains: `The ";" character is not valid.`,
SkipCacheProviders: true,
},
{
Name: "destroy-no-state",
Expand Down Expand Up @@ -847,7 +1000,17 @@ func TestProvision(t *testing.T) {
t.Skip(testCase.SkipReason)
}

ctx, api := setupProvisioner(t, nil)
cliConfigPath := ""
if !testCase.SkipCacheProviders {
cliConfigPath = cacheProviders(
t,
testCase.Files,
filepath.Join(testutil.PersistentCacheDir(t), "terraform_provision_test"),
)
}
ctx, api := setupProvisioner(t, &provisionerServeOptions{
cliConfigPath: cliConfigPath,
})
sess := configure(ctx, t, api, &proto.Config{
TemplateSourceArchive: testutil.CreateTar(t, testCase.Files),
})
Expand Down
45 changes: 25 additions & 20 deletions provisioner/terraform/serve.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ type ServeOptions struct {
BinaryPath string
// CachePath must not be used by multiple processes at once.
CachePath string
Tracer trace.Tracer
// CliConfigPath is the path to the Terraform CLI config file.
CliConfigPath string
Tracer trace.Tracer

// ExitTimeout defines how long we will wait for a running Terraform
// command to exit (cleanly) if the provision was stopped. This
Expand Down Expand Up @@ -132,22 +134,24 @@ func Serve(ctx context.Context, options *ServeOptions) error {
options.ExitTimeout = unhanger.HungJobExitTimeout
}
return provisionersdk.Serve(ctx, &server{
execMut: &sync.Mutex{},
binaryPath: options.BinaryPath,
cachePath: options.CachePath,
logger: options.Logger,
tracer: options.Tracer,
exitTimeout: options.ExitTimeout,
execMut: &sync.Mutex{},
binaryPath: options.BinaryPath,
cachePath: options.CachePath,
cliConfigPath: options.CliConfigPath,
logger: options.Logger,
tracer: options.Tracer,
exitTimeout: options.ExitTimeout,
}, options.ServeOptions)
}

type server struct {
execMut *sync.Mutex
binaryPath string
cachePath string
logger slog.Logger
tracer trace.Tracer
exitTimeout time.Duration
execMut *sync.Mutex
binaryPath string
cachePath string
cliConfigPath string
logger slog.Logger
tracer trace.Tracer
exitTimeout time.Duration
}

func (s *server) startTrace(ctx context.Context, name string, opts ...trace.SpanStartOption) (context.Context, trace.Span) {
Expand All @@ -158,12 +162,13 @@ func (s *server) startTrace(ctx context.Context, name string, opts ...trace.Span

func (s *server) executor(workdir string, stage database.ProvisionerJobTimingStage) *executor {
return &executor{
server: s,
mut: s.execMut,
binaryPath: s.binaryPath,
cachePath: s.cachePath,
workdir: workdir,
logger: s.logger.Named("executor"),
timings: newTimingAggregator(stage),
server: s,
mut: s.execMut,
binaryPath: s.binaryPath,
cachePath: s.cachePath,
cliConfigPath: s.cliConfigPath,
workdir: workdir,
logger: s.logger.Named("executor"),
timings: newTimingAggregator(stage),
}
}
25 changes: 25 additions & 0 deletions testutil/cache.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package testutil

import (
"os"
"path/filepath"
"testing"

"github.com/stretchr/testify/require"
)

// PersistentCacheDir returns a path to a directory
// that will be cached between test runs in Github Actions.
func PersistentCacheDir(t *testing.T) string {
t.Helper()

// We don't use os.UserCacheDir() because the path it
// returns is different on different operating systems.
// This would make it harder to specify which cache dir to use
// in Github Actions.
home, err := os.UserHomeDir()
require.NoError(t, err)
dir := filepath.Join(home, ".cache", "coderv2-test")

return dir
}