From 2a38c601afc8baf27dfe18d29d318d51ffadbd8e Mon Sep 17 00:00:00 2001 From: Danielle Maywood Date: Fri, 13 Dec 2024 16:59:27 +0000 Subject: [PATCH] chore: acquire lock for individual workspace transition (#15859) When Coder is run in High Availability mode, each Coder instance has a lifecycle executor. These lifecycle executors are all trying to do the same work, and whilst transactions save us from this causing an issue, we are still doing extra work that could be prevented. This PR adds a `TryAcquireLock` call for each attempted workspace transition, meaning two Coder instances shouldn't duplicate effort. (cherry picked from commit 50ff06cc3c1845f5aecd260d743be5cb0a0f4431) --- coderd/autobuild/lifecycle_executor.go | 12 +++- coderd/autobuild/lifecycle_executor_test.go | 71 +++++++++++++++++++++ 2 files changed, 82 insertions(+), 1 deletion(-) diff --git a/coderd/autobuild/lifecycle_executor.go b/coderd/autobuild/lifecycle_executor.go index ac2930c9e32c8..8bd266362e7dc 100644 --- a/coderd/autobuild/lifecycle_executor.go +++ b/coderd/autobuild/lifecycle_executor.go @@ -3,6 +3,7 @@ package autobuild import ( "context" "database/sql" + "fmt" "net/http" "sync" "sync/atomic" @@ -177,6 +178,15 @@ func (e *Executor) runOnce(t time.Time) Stats { err := e.db.InTx(func(tx database.Store) error { var err error + ok, err := tx.TryAcquireLock(e.ctx, database.GenLockID(fmt.Sprintf("lifecycle-executor:%s", wsID))) + if err != nil { + return xerrors.Errorf("try acquire lifecycle executor lock: %w", err) + } + if !ok { + log.Debug(e.ctx, "unable to acquire lock for workspace, skipping") + return nil + } + // Re-check eligibility since the first check was outside the // transaction and the workspace settings may have changed. 
ws, err = tx.GetWorkspaceByID(e.ctx, wsID) @@ -372,7 +382,7 @@ func (e *Executor) runOnce(t time.Time) Stats { } return nil }() - if err != nil { + if err != nil && !xerrors.Is(err, context.Canceled) { log.Error(e.ctx, "failed to transition workspace", slog.Error(err)) statsMu.Lock() stats.Errors[wsID] = err diff --git a/coderd/autobuild/lifecycle_executor_test.go b/coderd/autobuild/lifecycle_executor_test.go index 667b20dd9fd4f..7bcd1ccd5836f 100644 --- a/coderd/autobuild/lifecycle_executor_test.go +++ b/coderd/autobuild/lifecycle_executor_test.go @@ -18,6 +18,7 @@ import ( "github.com/coder/coder/v2/coderd/coderdtest" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbauthz" + "github.com/coder/coder/v2/coderd/database/dbtestutil" "github.com/coder/coder/v2/coderd/notifications" "github.com/coder/coder/v2/coderd/notifications/notificationstest" "github.com/coder/coder/v2/coderd/schedule" @@ -72,6 +73,76 @@ func TestExecutorAutostartOK(t *testing.T) { require.Equal(t, template.AutostartRequirement.DaysOfWeek, []string{"monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"}) } +func TestMultipleLifecycleExecutors(t *testing.T) { + t.Parallel() + + db, ps := dbtestutil.NewDB(t) + + var ( + sched = mustSchedule(t, "CRON_TZ=UTC 0 * * * *") + // Create our first client + tickCh = make(chan time.Time, 2) + statsChA = make(chan autobuild.Stats) + clientA = coderdtest.New(t, &coderdtest.Options{ + IncludeProvisionerDaemon: true, + AutobuildTicker: tickCh, + AutobuildStats: statsChA, + Database: db, + Pubsub: ps, + }) + // ... 
And then our second client + statsChB = make(chan autobuild.Stats) + _ = coderdtest.New(t, &coderdtest.Options{ + IncludeProvisionerDaemon: true, + AutobuildTicker: tickCh, + AutobuildStats: statsChB, + Database: db, + Pubsub: ps, + }) + // Now create a workspace (we can use either client, it doesn't matter) + workspace = mustProvisionWorkspace(t, clientA, func(cwr *codersdk.CreateWorkspaceRequest) { + cwr.AutostartSchedule = ptr.Ref(sched.String()) + }) + ) + + // Have the workspace stopped so we can perform an autostart + workspace = coderdtest.MustTransitionWorkspace(t, clientA, workspace.ID, database.WorkspaceTransitionStart, database.WorkspaceTransitionStop) + + // Get both clients to perform a lifecycle execution tick + next := sched.Next(workspace.LatestBuild.CreatedAt) + + startCh := make(chan struct{}) + go func() { + <-startCh + tickCh <- next + }() + go func() { + <-startCh + tickCh <- next + }() + close(startCh) + + // Now we want to check the stats for both clients + statsA := <-statsChA + statsB := <-statsChB + + // We expect there to be no errors + assert.Len(t, statsA.Errors, 0) + assert.Len(t, statsB.Errors, 0) + + // We also expect there to have been only one transition + require.Equal(t, 1, len(statsA.Transitions)+len(statsB.Transitions)) + + stats := statsA + if len(statsB.Transitions) == 1 { + stats = statsB + } + + // And we expect this transition to have been a start transition + assert.Contains(t, stats.Transitions, workspace.ID) + assert.Equal(t, database.WorkspaceTransitionStart, stats.Transitions[workspace.ID]) +} + func TestExecutorAutostartTemplateUpdated(t *testing.T) { t.Parallel()