Skip to content

Commit 590f76a

Browse files
committed
comments
1 parent 9e0ae3b commit 590f76a

File tree

4 files changed

+68
-49
lines changed

4 files changed

+68
-49
lines changed

cli/server.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -902,6 +902,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
902902
defer hangDetectorTicker.Stop()
903903
hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C)
904904
hangDetector.Start()
905+
defer hangDetector.Close()
905906

906907
// Currently there is no way to ask the server to shut
907908
// itself down, so any exit signal will result in a non-zero

coderd/coderdtest/coderdtest.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,7 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can
255255
defer hangDetectorTicker.Stop()
256256
hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, slogtest.Make(t, nil).Named("unhanger.detector"), hangDetectorTicker.C)
257257
hangDetector.Start()
258+
t.Cleanup(hangDetector.Close)
258259

259260
var mutex sync.RWMutex
260261
var handler http.Handler

coderd/unhanger/detector.go

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -62,12 +62,14 @@ func (e *acquireLockError) Unwrap() error {
6262
// build log and terminates them as failed.
6363
type Detector struct {
6464
ctx context.Context
65+
cancel context.CancelFunc
66+
done chan struct{}
67+
6568
db database.Store
6669
pubsub database.Pubsub
6770
log slog.Logger
6871
tick <-chan time.Time
6972
stats chan<- Stats
70-
done chan struct{}
7173
}
7274

7375
// Stats contains statistics about the last run of the detector.
@@ -83,14 +85,17 @@ type Stats struct {
8385

8486
// New returns a new hang detector.
8587
func New(ctx context.Context, db database.Store, pubsub database.Pubsub, log slog.Logger, tick <-chan time.Time) *Detector {
88+
//nolint:gocritic // Hang detector has a limited set of permissions.
89+
ctx, cancel := context.WithCancel(dbauthz.AsHangDetector(ctx))
8690
le := &Detector{
87-
//nolint:gocritic // Hang detector has a limited set of permissions.
88-
ctx: dbauthz.AsHangDetector(ctx),
91+
ctx: ctx,
92+
cancel: cancel,
93+
done: make(chan struct{}),
8994
db: db,
9095
pubsub: pubsub,
91-
tick: tick,
9296
log: log,
93-
done: make(chan struct{}),
97+
tick: tick,
98+
stats: nil,
9499
}
95100
return le
96101
}
@@ -111,6 +116,8 @@ func (d *Detector) WithStatsChannel(ch chan<- Stats) *Detector {
111116
func (d *Detector) Start() {
112117
go func() {
113118
defer close(d.done)
119+
defer d.cancel()
120+
114121
for {
115122
select {
116123
case <-d.ctx.Done():
@@ -143,6 +150,12 @@ func (d *Detector) Wait() {
143150
<-d.done
144151
}
145152

153+
// Close will stop the detector.
154+
func (d *Detector) Close() {
155+
d.cancel()
156+
<-d.done
157+
}
158+
146159
func (d *Detector) run(t time.Time) Stats {
147160
ctx, cancel := context.WithTimeout(d.ctx, 5*time.Minute)
148161
defer cancel()
@@ -153,8 +166,8 @@ func (d *Detector) run(t time.Time) Stats {
153166
}
154167

155168
err := d.db.InTx(func(db database.Store) error {
156-
err := db.AcquireLock(ctx, database.LockIDHangDetector)
157-
if err != nil {
169+
locked, err := db.TryAcquireLock(ctx, database.LockIDHangDetector)
170+
if !locked {
158171
// If we can't acquire the lock, it means another instance of the
159172
// hang detector is already running in another coder replica.
160173
// There's no point in waiting to run it again, so we'll just retry
@@ -163,6 +176,10 @@ func (d *Detector) run(t time.Time) Stats {
163176
// This error is ignored.
164177
return &acquireLockError{err: err}
165178
}
179+
if err != nil {
180+
d.log.Warn(ctx, "skipping workspace build hang detector run due to error acquiring lock", slog.Error(err))
181+
return xerrors.Errorf("acquire lock: %w", err)
182+
}
166183
d.log.Info(ctx, "running workspace build hang detector")
167184

168185
// Find all provisioner jobs that are currently running but have not

coderd/unhanger/detector_test.go

Lines changed: 42 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,11 @@ func TestDetectorNoJobs(t *testing.T) {
2929
t.Parallel()
3030

3131
var (
32-
ctx, cancel = context.WithCancel(testutil.Context(t, testutil.WaitLong))
33-
db, pubsub = dbtestutil.NewDB(t)
34-
log = slogtest.Make(t, nil)
35-
tickCh = make(chan time.Time)
36-
statsCh = make(chan unhanger.Stats)
32+
ctx = testutil.Context(t, testutil.WaitLong)
33+
db, pubsub = dbtestutil.NewDB(t)
34+
log = slogtest.Make(t, nil)
35+
tickCh = make(chan time.Time)
36+
statsCh = make(chan unhanger.Stats)
3737
)
3838

3939
detector := unhanger.New(ctx, db, pubsub, log, tickCh).WithStatsChannel(statsCh)
@@ -44,19 +44,19 @@ func TestDetectorNoJobs(t *testing.T) {
4444
require.NoError(t, stats.Error)
4545
require.Empty(t, stats.HungJobIDs)
4646

47-
cancel()
47+
detector.Close()
4848
detector.Wait()
4949
}
5050

5151
func TestDetectorNoHungJobs(t *testing.T) {
5252
t.Parallel()
5353

5454
var (
55-
ctx, cancel = context.WithCancel(testutil.Context(t, testutil.WaitLong))
56-
db, pubsub = dbtestutil.NewDB(t)
57-
log = slogtest.Make(t, nil)
58-
tickCh = make(chan time.Time)
59-
statsCh = make(chan unhanger.Stats)
55+
ctx = testutil.Context(t, testutil.WaitLong)
56+
db, pubsub = dbtestutil.NewDB(t)
57+
log = slogtest.Make(t, nil)
58+
tickCh = make(chan time.Time)
59+
statsCh = make(chan unhanger.Stats)
6060
)
6161

6262
// Insert some jobs that are running and haven't been updated in a while,
@@ -91,19 +91,19 @@ func TestDetectorNoHungJobs(t *testing.T) {
9191
require.NoError(t, stats.Error)
9292
require.Empty(t, stats.HungJobIDs)
9393

94-
cancel()
94+
detector.Close()
9595
detector.Wait()
9696
}
9797

9898
func TestDetectorHungWorkspaceBuild(t *testing.T) {
9999
t.Parallel()
100100

101101
var (
102-
ctx, cancel = context.WithCancel(testutil.Context(t, testutil.WaitLong))
103-
db, pubsub = dbtestutil.NewDB(t)
104-
log = slogtest.Make(t, nil)
105-
tickCh = make(chan time.Time)
106-
statsCh = make(chan unhanger.Stats)
102+
ctx = testutil.Context(t, testutil.WaitLong)
103+
db, pubsub = dbtestutil.NewDB(t)
104+
log = slogtest.Make(t, nil)
105+
tickCh = make(chan time.Time)
106+
statsCh = make(chan unhanger.Stats)
107107
)
108108

109109
var (
@@ -195,19 +195,19 @@ func TestDetectorHungWorkspaceBuild(t *testing.T) {
195195
require.NoError(t, err)
196196
require.Equal(t, expectedWorkspaceBuildState, build.ProvisionerState)
197197

198-
cancel()
198+
detector.Close()
199199
detector.Wait()
200200
}
201201

202202
func TestDetectorHungWorkspaceBuildNoOverrideState(t *testing.T) {
203203
t.Parallel()
204204

205205
var (
206-
ctx, cancel = context.WithCancel(testutil.Context(t, testutil.WaitLong))
207-
db, pubsub = dbtestutil.NewDB(t)
208-
log = slogtest.Make(t, nil)
209-
tickCh = make(chan time.Time)
210-
statsCh = make(chan unhanger.Stats)
206+
ctx = testutil.Context(t, testutil.WaitLong)
207+
db, pubsub = dbtestutil.NewDB(t)
208+
log = slogtest.Make(t, nil)
209+
tickCh = make(chan time.Time)
210+
statsCh = make(chan unhanger.Stats)
211211
)
212212

213213
var (
@@ -300,19 +300,19 @@ func TestDetectorHungWorkspaceBuildNoOverrideState(t *testing.T) {
300300
require.NoError(t, err)
301301
require.Equal(t, expectedWorkspaceBuildState, build.ProvisionerState)
302302

303-
cancel()
303+
detector.Close()
304304
detector.Wait()
305305
}
306306

307307
func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T) {
308308
t.Parallel()
309309

310310
var (
311-
ctx, cancel = context.WithCancel(testutil.Context(t, testutil.WaitLong))
312-
db, pubsub = dbtestutil.NewDB(t)
313-
log = slogtest.Make(t, nil)
314-
tickCh = make(chan time.Time)
315-
statsCh = make(chan unhanger.Stats)
311+
ctx = testutil.Context(t, testutil.WaitLong)
312+
db, pubsub = dbtestutil.NewDB(t)
313+
log = slogtest.Make(t, nil)
314+
tickCh = make(chan time.Time)
315+
statsCh = make(chan unhanger.Stats)
316316
)
317317

318318
var (
@@ -376,19 +376,19 @@ func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T
376376
require.NoError(t, err)
377377
require.Equal(t, expectedWorkspaceBuildState, build.ProvisionerState)
378378

379-
cancel()
379+
detector.Close()
380380
detector.Wait()
381381
}
382382

383383
func TestDetectorHungOtherJobTypes(t *testing.T) {
384384
t.Parallel()
385385

386386
var (
387-
ctx, cancel = context.WithCancel(testutil.Context(t, testutil.WaitLong))
388-
db, pubsub = dbtestutil.NewDB(t)
389-
log = slogtest.Make(t, nil)
390-
tickCh = make(chan time.Time)
391-
statsCh = make(chan unhanger.Stats)
387+
ctx = testutil.Context(t, testutil.WaitLong)
388+
db, pubsub = dbtestutil.NewDB(t)
389+
log = slogtest.Make(t, nil)
390+
tickCh = make(chan time.Time)
391+
statsCh = make(chan unhanger.Stats)
392392
)
393393

394394
var (
@@ -467,7 +467,7 @@ func TestDetectorHungOtherJobTypes(t *testing.T) {
467467
require.Contains(t, job.Error.String, "Build has been detected as hung")
468468
require.False(t, job.ErrorCode.Valid)
469469

470-
cancel()
470+
detector.Close()
471471
detector.Wait()
472472
}
473473

@@ -506,11 +506,11 @@ func TestDetectorPushesLogs(t *testing.T) {
506506
t.Parallel()
507507

508508
var (
509-
ctx, cancel = context.WithCancel(testutil.Context(t, testutil.WaitLong))
510-
db, pubsub = dbtestutil.NewDB(t)
511-
log = slogtest.Make(t, nil)
512-
tickCh = make(chan time.Time)
513-
statsCh = make(chan unhanger.Stats)
509+
ctx = testutil.Context(t, testutil.WaitLong)
510+
db, pubsub = dbtestutil.NewDB(t)
511+
log = slogtest.Make(t, nil)
512+
tickCh = make(chan time.Time)
513+
statsCh = make(chan unhanger.Stats)
514514
)
515515

516516
var (
@@ -608,7 +608,7 @@ func TestDetectorPushesLogs(t *testing.T) {
608608
require.NoError(t, err)
609609
require.Len(t, logs, c.preLogCount+len(unhanger.HungJobLogMessages))
610610

611-
cancel()
611+
detector.Close()
612612
detector.Wait()
613613
})
614614
}

0 commit comments

Comments
 (0)