Skip to content

Commit 48d69c9

Browse files
authored
fix: update autostart context to include querying users (#10929)
1 parent e9c12c3 commit 48d69c9

File tree

5 files changed

+164
-158
lines changed

5 files changed

+164
-158
lines changed

coderd/autobuild/lifecycle_executor.go

+130-128
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ type Executor struct {
4343
type Stats struct {
4444
Transitions map[uuid.UUID]database.WorkspaceTransition
4545
Elapsed time.Duration
46-
Error error
46+
Errors map[uuid.UUID]error
4747
}
4848

4949
// New returns a new wsactions executor.
@@ -83,9 +83,6 @@ func (e *Executor) Run() {
8383
return
8484
}
8585
stats := e.runOnce(t)
86-
if stats.Error != nil {
87-
e.log.Error(e.ctx, "error running once", slog.Error(stats.Error))
88-
}
8986
if e.statsCh != nil {
9087
select {
9188
case <-e.ctx.Done():
@@ -100,15 +97,14 @@ func (e *Executor) Run() {
10097
}
10198

10299
func (e *Executor) runOnce(t time.Time) Stats {
103-
var err error
104100
stats := Stats{
105101
Transitions: make(map[uuid.UUID]database.WorkspaceTransition),
102+
Errors: make(map[uuid.UUID]error),
106103
}
107104
// we build the map of transitions concurrently, so need a mutex to serialize writes to the map
108105
statsMu := sync.Mutex{}
109106
defer func() {
110107
stats.Elapsed = time.Since(t)
111-
stats.Error = err
112108
}()
113109
currentTick := t.Truncate(time.Minute)
114110

@@ -139,152 +135,158 @@ func (e *Executor) runOnce(t time.Time) Stats {
139135
log := e.log.With(slog.F("workspace_id", wsID))
140136

141137
eg.Go(func() error {
142-
var job *database.ProvisionerJob
143-
var auditLog *auditParams
144-
err := e.db.InTx(func(tx database.Store) error {
145-
// Re-check eligibility since the first check was outside the
146-
// transaction and the workspace settings may have changed.
147-
ws, err := tx.GetWorkspaceByID(e.ctx, wsID)
148-
if err != nil {
149-
return xerrors.Errorf("get workspace by id: %w", err)
150-
}
138+
err := func() error {
139+
var job *database.ProvisionerJob
140+
var auditLog *auditParams
141+
err := e.db.InTx(func(tx database.Store) error {
142+
// Re-check eligibility since the first check was outside the
143+
// transaction and the workspace settings may have changed.
144+
ws, err := tx.GetWorkspaceByID(e.ctx, wsID)
145+
if err != nil {
146+
return xerrors.Errorf("get workspace by id: %w", err)
147+
}
151148

152-
user, err := tx.GetUserByID(e.ctx, ws.OwnerID)
153-
if err != nil {
154-
return xerrors.Errorf("get user by id: %w", err)
155-
}
149+
user, err := tx.GetUserByID(e.ctx, ws.OwnerID)
150+
if err != nil {
151+
return xerrors.Errorf("get user by id: %w", err)
152+
}
156153

157-
// Determine the workspace state based on its latest build.
158-
latestBuild, err := tx.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
159-
if err != nil {
160-
return xerrors.Errorf("get latest workspace build: %w", err)
161-
}
154+
// Determine the workspace state based on its latest build.
155+
latestBuild, err := tx.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
156+
if err != nil {
157+
return xerrors.Errorf("get latest workspace build: %w", err)
158+
}
162159

163-
latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID)
164-
if err != nil {
165-
return xerrors.Errorf("get latest provisioner job: %w", err)
166-
}
160+
latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID)
161+
if err != nil {
162+
return xerrors.Errorf("get latest provisioner job: %w", err)
163+
}
167164

168-
templateSchedule, err := (*(e.templateScheduleStore.Load())).Get(e.ctx, tx, ws.TemplateID)
169-
if err != nil {
170-
return xerrors.Errorf("get template scheduling options: %w", err)
171-
}
165+
templateSchedule, err := (*(e.templateScheduleStore.Load())).Get(e.ctx, tx, ws.TemplateID)
166+
if err != nil {
167+
return xerrors.Errorf("get template scheduling options: %w", err)
168+
}
172169

173-
template, err := tx.GetTemplateByID(e.ctx, ws.TemplateID)
174-
if err != nil {
175-
return xerrors.Errorf("get template by ID: %w", err)
176-
}
170+
template, err := tx.GetTemplateByID(e.ctx, ws.TemplateID)
171+
if err != nil {
172+
return xerrors.Errorf("get template by ID: %w", err)
173+
}
177174

178-
accessControl := (*(e.accessControlStore.Load())).GetTemplateAccessControl(template)
175+
accessControl := (*(e.accessControlStore.Load())).GetTemplateAccessControl(template)
179176

180-
nextTransition, reason, err := getNextTransition(user, ws, latestBuild, latestJob, templateSchedule, currentTick)
181-
if err != nil {
182-
log.Debug(e.ctx, "skipping workspace", slog.Error(err))
183-
// err is used to indicate that a workspace is not eligible
184-
// so returning nil here is ok although ultimately the distinction
185-
// doesn't matter since the transaction is read-only up to
186-
// this point.
187-
return nil
188-
}
177+
nextTransition, reason, err := getNextTransition(user, ws, latestBuild, latestJob, templateSchedule, currentTick)
178+
if err != nil {
179+
log.Debug(e.ctx, "skipping workspace", slog.Error(err))
180+
// err is used to indicate that a workspace is not eligible
181+
// so returning nil here is ok although ultimately the distinction
182+
// doesn't matter since the transaction is read-only up to
183+
// this point.
184+
return nil
185+
}
189186

190-
var build *database.WorkspaceBuild
191-
if nextTransition != "" {
192-
builder := wsbuilder.New(ws, nextTransition).
193-
SetLastWorkspaceBuildInTx(&latestBuild).
194-
SetLastWorkspaceBuildJobInTx(&latestJob).
195-
Reason(reason)
196-
log.Debug(e.ctx, "auto building workspace", slog.F("transition", nextTransition))
197-
if nextTransition == database.WorkspaceTransitionStart &&
198-
useActiveVersion(accessControl, ws) {
199-
log.Debug(e.ctx, "autostarting with active version")
200-
builder = builder.ActiveVersion()
187+
var build *database.WorkspaceBuild
188+
if nextTransition != "" {
189+
builder := wsbuilder.New(ws, nextTransition).
190+
SetLastWorkspaceBuildInTx(&latestBuild).
191+
SetLastWorkspaceBuildJobInTx(&latestJob).
192+
Reason(reason)
193+
log.Debug(e.ctx, "auto building workspace", slog.F("transition", nextTransition))
194+
if nextTransition == database.WorkspaceTransitionStart &&
195+
useActiveVersion(accessControl, ws) {
196+
log.Debug(e.ctx, "autostarting with active version")
197+
builder = builder.ActiveVersion()
198+
}
199+
200+
build, job, err = builder.Build(e.ctx, tx, nil, audit.WorkspaceBuildBaggage{IP: "127.0.0.1"})
201+
if err != nil {
202+
return xerrors.Errorf("build workspace with transition %q: %w", nextTransition, err)
203+
}
201204
}
202205

203-
build, job, err = builder.Build(e.ctx, tx, nil, audit.WorkspaceBuildBaggage{IP: "127.0.0.1"})
204-
if err != nil {
205-
log.Error(e.ctx, "unable to transition workspace",
206-
slog.F("transition", nextTransition),
207-
slog.Error(err),
206+
// Transition the workspace to dormant if it has breached the template's
207+
// threshold for inactivity.
208+
if reason == database.BuildReasonAutolock {
209+
wsOld := ws
210+
ws, err = tx.UpdateWorkspaceDormantDeletingAt(e.ctx, database.UpdateWorkspaceDormantDeletingAtParams{
211+
ID: ws.ID,
212+
DormantAt: sql.NullTime{
213+
Time: dbtime.Now(),
214+
Valid: true,
215+
},
216+
})
217+
218+
auditLog = &auditParams{
219+
Build: build,
220+
Job: latestJob,
221+
Reason: reason,
222+
Old: wsOld,
223+
New: ws,
224+
}
225+
if err != nil {
226+
return xerrors.Errorf("update workspace dormant deleting at: %w", err)
227+
}
228+
229+
log.Info(e.ctx, "dormant workspace",
230+
slog.F("last_used_at", ws.LastUsedAt),
231+
slog.F("time_til_dormant", templateSchedule.TimeTilDormant),
232+
slog.F("since_last_used_at", time.Since(ws.LastUsedAt)),
208233
)
209-
return xerrors.Errorf("build workspace: %w", err)
210234
}
211-
}
212235

213-
// Transition the workspace to dormant if it has breached the template's
214-
// threshold for inactivity.
215-
if reason == database.BuildReasonAutolock {
216-
wsOld := ws
217-
ws, err = tx.UpdateWorkspaceDormantDeletingAt(e.ctx, database.UpdateWorkspaceDormantDeletingAtParams{
218-
ID: ws.ID,
219-
DormantAt: sql.NullTime{
220-
Time: dbtime.Now(),
221-
Valid: true,
222-
},
223-
})
224-
225-
auditLog = &auditParams{
226-
Build: build,
227-
Job: latestJob,
228-
Reason: reason,
229-
Old: wsOld,
230-
New: ws,
236+
if reason == database.BuildReasonAutodelete {
237+
log.Info(e.ctx, "deleted workspace",
238+
slog.F("dormant_at", ws.DormantAt.Time),
239+
slog.F("time_til_dormant_autodelete", templateSchedule.TimeTilDormantAutoDelete),
240+
)
231241
}
232-
if err != nil {
233-
return xerrors.Errorf("update workspace dormant deleting at: %w", err)
242+
243+
if nextTransition == "" {
244+
return nil
234245
}
235246

236-
log.Info(e.ctx, "dormant workspace",
237-
slog.F("last_used_at", ws.LastUsedAt),
238-
slog.F("time_til_dormant", templateSchedule.TimeTilDormant),
239-
slog.F("since_last_used_at", time.Since(ws.LastUsedAt)),
240-
)
241-
}
247+
statsMu.Lock()
248+
stats.Transitions[ws.ID] = nextTransition
249+
statsMu.Unlock()
242250

243-
if reason == database.BuildReasonAutodelete {
244-
log.Info(e.ctx, "deleted workspace",
245-
slog.F("dormant_at", ws.DormantAt.Time),
246-
slog.F("time_til_dormant_autodelete", templateSchedule.TimeTilDormantAutoDelete),
251+
log.Info(e.ctx, "scheduling workspace transition",
252+
slog.F("transition", nextTransition),
253+
slog.F("reason", reason),
247254
)
248-
}
249255

250-
if nextTransition == "" {
251256
return nil
252-
}
253-
254-
statsMu.Lock()
255-
stats.Transitions[ws.ID] = nextTransition
256-
statsMu.Unlock()
257257

258-
log.Info(e.ctx, "scheduling workspace transition",
259-
slog.F("transition", nextTransition),
260-
slog.F("reason", reason),
261-
)
262-
263-
return nil
264-
265-
// Run with RepeatableRead isolation so that the build process sees the same data
266-
// as our calculation that determines whether an autobuild is necessary.
267-
}, &sql.TxOptions{Isolation: sql.LevelRepeatableRead})
268-
if err != nil {
269-
log.Error(e.ctx, "workspace scheduling failed", slog.Error(err))
270-
}
271-
if auditLog != nil {
272-
// If the transition didn't succeed then updating the workspace
273-
// to indicate dormant didn't either.
274-
auditLog.Success = err == nil
275-
auditBuild(e.ctx, e.log, *e.auditor.Load(), *auditLog)
276-
}
277-
if job != nil && err == nil {
278-
// Note that we can't refactor such that posting the job happens inside wsbuilder because it's called
279-
// with an outer transaction like this, and we need to make sure the outer transaction commits before
280-
// posting the job. If we post before the transaction commits, provisionerd might try to acquire the
281-
// job, fail, and then sit idle instead of picking up the job.
282-
err = provisionerjobs.PostJob(e.ps, *job)
258+
// Run with RepeatableRead isolation so that the build process sees the same data
259+
// as our calculation that determines whether an autobuild is necessary.
260+
}, &sql.TxOptions{Isolation: sql.LevelRepeatableRead})
261+
if auditLog != nil {
262+
// If the transition didn't succeed then updating the workspace
263+
// to indicate dormant didn't either.
264+
auditLog.Success = err == nil
265+
auditBuild(e.ctx, e.log, *e.auditor.Load(), *auditLog)
266+
}
283267
if err != nil {
284-
// Client probably doesn't care about this error, so just log it.
285-
log.Error(e.ctx, "failed to post provisioner job to pubsub", slog.Error(err))
268+
return xerrors.Errorf("transition workspace: %w", err)
286269
}
270+
if job != nil {
271+
// Note that we can't refactor such that posting the job happens inside wsbuilder because it's called
272+
// with an outer transaction like this, and we need to make sure the outer transaction commits before
273+
// posting the job. If we post before the transaction commits, provisionerd might try to acquire the
274+
// job, fail, and then sit idle instead of picking up the job.
275+
err = provisionerjobs.PostJob(e.ps, *job)
276+
if err != nil {
277+
return xerrors.Errorf("post provisioner job to pubsub: %w", err)
278+
}
279+
}
280+
return nil
281+
}()
282+
if err != nil {
283+
e.log.Error(e.ctx, "failed to transition workspace", slog.Error(err))
284+
statsMu.Lock()
285+
stats.Errors[wsID] = err
286+
statsMu.Unlock()
287287
}
288+
// Even though we got an error we still return nil to avoid
289+
// short-circuiting the evaluation loop.
288290
return nil
289291
})
290292
}

0 commit comments

Comments
 (0)