Skip to content

Commit 50ff06c

Browse files
chore: acquire lock for individual workspace transition (#15859)
When Coder is ran in High Availability mode, each Coder instance has a lifecycle executor. These lifecycle executors are all trying to do the same work, and whilst transactions saves us from this causing an issue, we are still doing extra work that could be prevented. This PR adds a `TryAcquireLock` call for each attempted workspace transition, meaning two Coder instances shouldn't duplicate effort.
1 parent d504e0e commit 50ff06c

File tree

2 files changed

+82
-1
lines changed

2 files changed

+82
-1
lines changed

coderd/autobuild/lifecycle_executor.go

+11-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package autobuild
33
import (
44
"context"
55
"database/sql"
6+
"fmt"
67
"net/http"
78
"sync"
89
"sync/atomic"
@@ -177,6 +178,15 @@ func (e *Executor) runOnce(t time.Time) Stats {
177178
err := e.db.InTx(func(tx database.Store) error {
178179
var err error
179180

181+
ok, err := tx.TryAcquireLock(e.ctx, database.GenLockID(fmt.Sprintf("lifecycle-executor:%s", wsID)))
182+
if err != nil {
183+
return xerrors.Errorf("try acquire lifecycle executor lock: %w", err)
184+
}
185+
if !ok {
186+
log.Debug(e.ctx, "unable to acquire lock for workspace, skipping")
187+
return nil
188+
}
189+
180190
// Re-check eligibility since the first check was outside the
181191
// transaction and the workspace settings may have changed.
182192
ws, err = tx.GetWorkspaceByID(e.ctx, wsID)
@@ -389,7 +399,7 @@ func (e *Executor) runOnce(t time.Time) Stats {
389399
}
390400
return nil
391401
}()
392-
if err != nil {
402+
if err != nil && !xerrors.Is(err, context.Canceled) {
393403
log.Error(e.ctx, "failed to transition workspace", slog.Error(err))
394404
statsMu.Lock()
395405
stats.Errors[wsID] = err

coderd/autobuild/lifecycle_executor_test.go

+71
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/coder/coder/v2/coderd/coderdtest"
1919
"github.com/coder/coder/v2/coderd/database"
2020
"github.com/coder/coder/v2/coderd/database/dbauthz"
21+
"github.com/coder/coder/v2/coderd/database/dbtestutil"
2122
"github.com/coder/coder/v2/coderd/notifications"
2223
"github.com/coder/coder/v2/coderd/notifications/notificationstest"
2324
"github.com/coder/coder/v2/coderd/schedule"
@@ -72,6 +73,76 @@ func TestExecutorAutostartOK(t *testing.T) {
7273
require.Equal(t, template.AutostartRequirement.DaysOfWeek, []string{"monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"})
7374
}
7475

76+
func TestMultipleLifecycleExecutors(t *testing.T) {
77+
t.Parallel()
78+
79+
db, ps := dbtestutil.NewDB(t)
80+
81+
var (
82+
sched = mustSchedule(t, "CRON_TZ=UTC 0 * * * *")
83+
// Create our first client
84+
tickCh = make(chan time.Time, 2)
85+
statsChA = make(chan autobuild.Stats)
86+
clientA = coderdtest.New(t, &coderdtest.Options{
87+
IncludeProvisionerDaemon: true,
88+
AutobuildTicker: tickCh,
89+
AutobuildStats: statsChA,
90+
Database: db,
91+
Pubsub: ps,
92+
})
93+
// ... And then our second client
94+
statsChB = make(chan autobuild.Stats)
95+
_ = coderdtest.New(t, &coderdtest.Options{
96+
IncludeProvisionerDaemon: true,
97+
AutobuildTicker: tickCh,
98+
AutobuildStats: statsChB,
99+
Database: db,
100+
Pubsub: ps,
101+
})
102+
// Now create a workspace (we can use either client, it doesn't matter)
103+
workspace = mustProvisionWorkspace(t, clientA, func(cwr *codersdk.CreateWorkspaceRequest) {
104+
cwr.AutostartSchedule = ptr.Ref(sched.String())
105+
})
106+
)
107+
108+
// Have the workspace stopped so we can perform an autostart
109+
workspace = coderdtest.MustTransitionWorkspace(t, clientA, workspace.ID, database.WorkspaceTransitionStart, database.WorkspaceTransitionStop)
110+
111+
// Get both clients to perform a lifecycle execution tick
112+
next := sched.Next(workspace.LatestBuild.CreatedAt)
113+
114+
startCh := make(chan struct{})
115+
go func() {
116+
<-startCh
117+
tickCh <- next
118+
}()
119+
go func() {
120+
<-startCh
121+
tickCh <- next
122+
}()
123+
close(startCh)
124+
125+
// Now we want to check the stats for both clients
126+
statsA := <-statsChA
127+
statsB := <-statsChB
128+
129+
// We expect there to be no errors
130+
assert.Len(t, statsA.Errors, 0)
131+
assert.Len(t, statsB.Errors, 0)
132+
133+
// We also expect there to have been only one transition
134+
require.Equal(t, 1, len(statsA.Transitions)+len(statsB.Transitions))
135+
136+
stats := statsA
137+
if len(statsB.Transitions) == 1 {
138+
stats = statsB
139+
}
140+
141+
// And we expect this transition to have been a start transition
142+
assert.Contains(t, stats.Transitions, workspace.ID)
143+
assert.Equal(t, database.WorkspaceTransitionStart, stats.Transitions[workspace.ID])
144+
}
145+
75146
func TestExecutorAutostartTemplateUpdated(t *testing.T) {
76147
t.Parallel()
77148

0 commit comments

Comments
 (0)