Skip to content

Commit 6651c16

Browse files
authored
fix: avoid terraform state concurrent access, remove global mutex (#5273)
1 parent 85a6d14 commit 6651c16

File tree

5 files changed

+63
-54
lines changed

5 files changed

+63
-54
lines changed

cli/server.go

+15-5
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,14 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co
112112
notifyCtx, notifyStop := signal.NotifyContext(ctx, InterruptSignals...)
113113
defer notifyStop()
114114

115+
// Ensure we have a unique cache directory for this process.
116+
cacheDir := filepath.Join(cfg.CacheDirectory.Value, uuid.NewString())
117+
err = os.MkdirAll(cacheDir, 0o700)
118+
if err != nil {
119+
return xerrors.Errorf("create cache directory: %w", err)
120+
}
121+
defer os.RemoveAll(cacheDir)
122+
115123
// Clean up idle connections at the end, e.g.
116124
// embedded-postgres can leave an idle connection
117125
// which is caught by goleaks.
@@ -355,7 +363,7 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co
355363
Database: databasefake.New(),
356364
DERPMap: derpMap,
357365
Pubsub: database.NewPubsubInMemory(),
358-
CacheDir: cfg.CacheDirectory.Value,
366+
CacheDir: cacheDir,
359367
GoogleTokenValidator: googleTokenValidator,
360368
GitAuthConfigs: gitAuthConfigs,
361369
RealIPConfig: realIPConfig,
@@ -632,7 +640,8 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co
632640
}()
633641
provisionerdMetrics := provisionerd.NewMetrics(options.PrometheusRegistry)
634642
for i := 0; i < cfg.Provisioner.Daemons.Value; i++ {
635-
daemon, err := newProvisionerDaemon(ctx, coderAPI, provisionerdMetrics, logger, cfg, errCh, false)
643+
daemonCacheDir := filepath.Join(cacheDir, fmt.Sprintf("provisioner-%d", i))
644+
daemon, err := newProvisionerDaemon(ctx, coderAPI, provisionerdMetrics, logger, cfg, daemonCacheDir, errCh, false)
636645
if err != nil {
637646
return xerrors.Errorf("create provisioner daemon: %w", err)
638647
}
@@ -902,6 +911,7 @@ func newProvisionerDaemon(
902911
metrics provisionerd.Metrics,
903912
logger slog.Logger,
904913
cfg *codersdk.DeploymentConfig,
914+
cacheDir string,
905915
errCh chan error,
906916
dev bool,
907917
) (srv *provisionerd.Server, err error) {
@@ -912,9 +922,9 @@ func newProvisionerDaemon(
912922
}
913923
}()
914924

915-
err = os.MkdirAll(cfg.CacheDirectory.Value, 0o700)
925+
err = os.MkdirAll(cacheDir, 0o700)
916926
if err != nil {
917-
return nil, xerrors.Errorf("mkdir %q: %w", cfg.CacheDirectory.Value, err)
927+
return nil, xerrors.Errorf("mkdir %q: %w", cacheDir, err)
918928
}
919929

920930
terraformClient, terraformServer := provisionersdk.MemTransportPipe()
@@ -930,7 +940,7 @@ func newProvisionerDaemon(
930940
ServeOptions: &provisionersdk.ServeOptions{
931941
Listener: terraformServer,
932942
},
933-
CachePath: cfg.CacheDirectory.Value,
943+
CachePath: cacheDir,
934944
Logger: logger,
935945
})
936946
if err != nil && !xerrors.Is(err, context.Canceled) {

provisioner/terraform/executor.go

+34-41
Original file line numberDiff line numberDiff line change
@@ -23,27 +23,15 @@ import (
2323
"github.com/coder/coder/provisionersdk/proto"
2424
)
2525

26-
// initMut is a global mutex that protects the Terraform cache directory from
27-
// concurrent usage by path. Only `terraform init` commands are guarded by this
28-
// mutex.
29-
//
30-
// When cache path is set, we must protect against multiple calls to
31-
// `terraform init`.
32-
//
33-
// From the Terraform documentation:
34-
//
35-
// Note: The plugin cache directory is not guaranteed to be concurrency
36-
// safe. The provider installer's behavior in environments with multiple
37-
// terraform init calls is undefined.
38-
var initMut = &sync.Mutex{}
39-
4026
type executor struct {
27+
mut *sync.Mutex
4128
binaryPath string
42-
cachePath string
43-
workdir string
29+
// cachePath and workdir must not be used by multiple processes at once.
30+
cachePath string
31+
workdir string
4432
}
4533

46-
func (e executor) basicEnv() []string {
34+
func (e *executor) basicEnv() []string {
4735
// Required for "terraform init" to find "git" to
4836
// clone Terraform modules.
4937
env := safeEnviron()
@@ -55,7 +43,8 @@ func (e executor) basicEnv() []string {
5543
return env
5644
}
5745

58-
func (e executor) execWriteOutput(ctx, killCtx context.Context, args, env []string, stdOutWriter, stdErrWriter io.WriteCloser) (err error) {
46+
// execWriteOutput must only be called while the lock is held.
47+
func (e *executor) execWriteOutput(ctx, killCtx context.Context, args, env []string, stdOutWriter, stdErrWriter io.WriteCloser) (err error) {
5948
defer func() {
6049
closeErr := stdOutWriter.Close()
6150
if err == nil && closeErr != nil {
@@ -98,7 +87,8 @@ func (e executor) execWriteOutput(ctx, killCtx context.Context, args, env []stri
9887
return cmd.Wait()
9988
}
10089

101-
func (e executor) execParseJSON(ctx, killCtx context.Context, args, env []string, v interface{}) error {
90+
// execParseJSON must only be called while the lock is held.
91+
func (e *executor) execParseJSON(ctx, killCtx context.Context, args, env []string, v interface{}) error {
10292
if ctx.Err() != nil {
10393
return ctx.Err()
10494
}
@@ -133,7 +123,7 @@ func (e executor) execParseJSON(ctx, killCtx context.Context, args, env []string
133123
return nil
134124
}
135125

136-
func (e executor) checkMinVersion(ctx context.Context) error {
126+
func (e *executor) checkMinVersion(ctx context.Context) error {
137127
v, err := e.version(ctx)
138128
if err != nil {
139129
return err
@@ -147,7 +137,8 @@ func (e executor) checkMinVersion(ctx context.Context) error {
147137
return nil
148138
}
149139

150-
func (e executor) version(ctx context.Context) (*version.Version, error) {
140+
// version doesn't need the lock because it doesn't read or write to any state.
141+
func (e *executor) version(ctx context.Context) (*version.Version, error) {
151142
return versionFromBinaryPath(ctx, e.binaryPath)
152143
}
153144

@@ -177,7 +168,10 @@ func versionFromBinaryPath(ctx context.Context, binaryPath string) (*version.Ver
177168
return version.NewVersion(vj.Version)
178169
}
179170

180-
func (e executor) init(ctx, killCtx context.Context, logr logSink) error {
171+
func (e *executor) init(ctx, killCtx context.Context, logr logSink) error {
172+
e.mut.Lock()
173+
defer e.mut.Unlock()
174+
181175
outWriter, doneOut := logWriter(logr, proto.LogLevel_DEBUG)
182176
errWriter, doneErr := logWriter(logr, proto.LogLevel_ERROR)
183177
defer func() {
@@ -193,23 +187,14 @@ func (e executor) init(ctx, killCtx context.Context, logr logSink) error {
193187
"-input=false",
194188
}
195189

196-
// When cache path is set, we must protect against multiple calls
197-
// to `terraform init`.
198-
//
199-
// From the Terraform documentation:
200-
// Note: The plugin cache directory is not guaranteed to be
201-
// concurrency safe. The provider installer's behavior in
202-
// environments with multiple terraform init calls is undefined.
203-
if e.cachePath != "" {
204-
initMut.Lock()
205-
defer initMut.Unlock()
206-
}
207-
208190
return e.execWriteOutput(ctx, killCtx, args, e.basicEnv(), outWriter, errWriter)
209191
}
210192

211193
// revive:disable-next-line:flag-parameter
212-
func (e executor) plan(ctx, killCtx context.Context, env, vars []string, logr logSink, destroy bool) (*proto.Provision_Response, error) {
194+
func (e *executor) plan(ctx, killCtx context.Context, env, vars []string, logr logSink, destroy bool) (*proto.Provision_Response, error) {
195+
e.mut.Lock()
196+
defer e.mut.Unlock()
197+
213198
planfilePath := filepath.Join(e.workdir, "terraform.tfplan")
214199
args := []string{
215200
"plan",
@@ -257,7 +242,8 @@ func (e executor) plan(ctx, killCtx context.Context, env, vars []string, logr lo
257242
}, nil
258243
}
259244

260-
func (e executor) planResources(ctx, killCtx context.Context, planfilePath string) ([]*proto.Resource, error) {
245+
// planResources must only be called while the lock is held.
246+
func (e *executor) planResources(ctx, killCtx context.Context, planfilePath string) ([]*proto.Resource, error) {
261247
plan, err := e.showPlan(ctx, killCtx, planfilePath)
262248
if err != nil {
263249
return nil, xerrors.Errorf("show terraform plan file: %w", err)
@@ -270,14 +256,16 @@ func (e executor) planResources(ctx, killCtx context.Context, planfilePath strin
270256
return ConvertResources(plan.PlannedValues.RootModule, rawGraph)
271257
}
272258

273-
func (e executor) showPlan(ctx, killCtx context.Context, planfilePath string) (*tfjson.Plan, error) {
259+
// showPlan must only be called while the lock is held.
260+
func (e *executor) showPlan(ctx, killCtx context.Context, planfilePath string) (*tfjson.Plan, error) {
274261
args := []string{"show", "-json", "-no-color", planfilePath}
275262
p := new(tfjson.Plan)
276263
err := e.execParseJSON(ctx, killCtx, args, e.basicEnv(), p)
277264
return p, err
278265
}
279266

280-
func (e executor) graph(ctx, killCtx context.Context) (string, error) {
267+
// graph must only be called while the lock is held.
268+
func (e *executor) graph(ctx, killCtx context.Context) (string, error) {
281269
if ctx.Err() != nil {
282270
return "", ctx.Err()
283271
}
@@ -302,9 +290,12 @@ func (e executor) graph(ctx, killCtx context.Context) (string, error) {
302290
}
303291

304292
// revive:disable-next-line:flag-parameter
305-
func (e executor) apply(
293+
func (e *executor) apply(
306294
ctx, killCtx context.Context, plan []byte, env []string, logr logSink,
307295
) (*proto.Provision_Response, error) {
296+
e.mut.Lock()
297+
defer e.mut.Unlock()
298+
308299
planFile, err := ioutil.TempFile("", "coder-terrafrom-plan")
309300
if err != nil {
310301
return nil, xerrors.Errorf("create plan file: %w", err)
@@ -356,7 +347,8 @@ func (e executor) apply(
356347
}, nil
357348
}
358349

359-
func (e executor) stateResources(ctx, killCtx context.Context) ([]*proto.Resource, error) {
350+
// stateResources must only be called while the lock is held.
351+
func (e *executor) stateResources(ctx, killCtx context.Context) ([]*proto.Resource, error) {
360352
state, err := e.state(ctx, killCtx)
361353
if err != nil {
362354
return nil, err
@@ -375,7 +367,8 @@ func (e executor) stateResources(ctx, killCtx context.Context) ([]*proto.Resourc
375367
return resources, nil
376368
}
377369

378-
func (e executor) state(ctx, killCtx context.Context) (*tfjson.State, error) {
370+
// state must only be called while the lock is held.
371+
func (e *executor) state(ctx, killCtx context.Context) (*tfjson.State, error) {
379372
args := []string{"show", "-json", "-no-color"}
380373
state := &tfjson.State{}
381374
err := e.execParseJSON(ctx, killCtx, args, e.basicEnv(), state)

provisioner/terraform/provision_test.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ func readProvisionLog(t *testing.T, response proto.DRPCProvisioner_ProvisionClie
7777

7878
if log := msg.GetLog(); log != nil {
7979
t.Log(log.Level.String(), log.Output)
80-
logBuf.WriteString(log.Output)
80+
_, _ = logBuf.WriteString(log.Output)
8181
}
8282
if c = msg.GetComplete(); c != nil {
8383
require.Empty(t, c.Error)
@@ -190,8 +190,6 @@ func TestProvision_Cancel(t *testing.T) {
190190
func TestProvision(t *testing.T) {
191191
t.Parallel()
192192

193-
ctx, api := setupProvisioner(t, nil)
194-
195193
testCases := []struct {
196194
Name string
197195
Files map[string]string
@@ -329,6 +327,8 @@ func TestProvision(t *testing.T) {
329327
t.Run(testCase.Name, func(t *testing.T) {
330328
t.Parallel()
331329

330+
ctx, api := setupProvisioner(t, nil)
331+
332332
directory := t.TempDir()
333333
for path, content := range testCase.Files {
334334
err := os.WriteFile(filepath.Join(directory, path), []byte(content), 0o600)

provisioner/terraform/serve.go

+9-4
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package terraform
33
import (
44
"context"
55
"path/filepath"
6+
"sync"
67
"time"
78

89
"github.com/cli/safeexec"
@@ -22,8 +23,9 @@ type ServeOptions struct {
2223
// BinaryPath specifies the "terraform" binary to use.
2324
// If omitted, the $PATH will attempt to find it.
2425
BinaryPath string
25-
CachePath string
26-
Logger slog.Logger
26+
// CachePath must not be used by multiple processes at once.
27+
CachePath string
28+
Logger slog.Logger
2729

2830
// ExitTimeout defines how long we will wait for a running Terraform
2931
// command to exit (cleanly) if the provision was stopped. This only
@@ -91,6 +93,7 @@ func Serve(ctx context.Context, options *ServeOptions) error {
9193
options.ExitTimeout = defaultExitTimeout
9294
}
9395
return provisionersdk.Serve(ctx, &server{
96+
execMut: &sync.Mutex{},
9497
binaryPath: options.BinaryPath,
9598
cachePath: options.CachePath,
9699
logger: options.Logger,
@@ -99,14 +102,16 @@ func Serve(ctx context.Context, options *ServeOptions) error {
99102
}
100103

101104
type server struct {
105+
execMut *sync.Mutex
102106
binaryPath string
103107
cachePath string
104108
logger slog.Logger
105109
exitTimeout time.Duration
106110
}
107111

108-
func (s *server) executor(workdir string) executor {
109-
return executor{
112+
func (s *server) executor(workdir string) *executor {
113+
return &executor{
114+
mut: s.execMut,
110115
binaryPath: s.binaryPath,
111116
cachePath: s.cachePath,
112117
workdir: workdir,

provisionerd/provisionerd.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,8 @@ type Options struct {
5757
JobPollJitter time.Duration
5858
JobPollDebounce time.Duration
5959
Provisioners Provisioners
60-
WorkDirectory string
60+
// WorkDirectory must not be used by multiple processes at once.
61+
WorkDirectory string
6162
}
6263

6364
// New creates and starts a provisioner daemon.

0 commit comments

Comments
 (0)