Skip to content

Commit 9e0ae3b

Browse files
committed
fixup! fix cancel timeout test
1 parent 0218151 commit 9e0ae3b

File tree

6 files changed

+24
-28
lines changed

6 files changed

+24
-28
lines changed

cli/server.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -901,7 +901,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
901901
hangDetectorTicker := time.NewTicker(cfg.JobHangDetectorInterval.Value())
902902
defer hangDetectorTicker.Stop()
903903
hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C)
904-
hangDetector.Run()
904+
hangDetector.Start()
905905

906906
// Currently there is no way to ask the server to shut
907907
// itself down, so any exit signal will result in a non-zero

coderd/coderdtest/coderdtest.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can
254254
hangDetectorTicker := time.NewTicker(options.DeploymentValues.JobHangDetectorInterval.Value())
255255
defer hangDetectorTicker.Stop()
256256
hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, slogtest.Make(t, nil).Named("unhanger.detector"), hangDetectorTicker.C)
257-
hangDetector.Run()
257+
hangDetector.Start()
258258

259259
var mutex sync.RWMutex
260260
var handler http.Handler

coderd/database/lock.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ package database
44
// change. If locks are deprecated, they should be kept in this list to avoid
55
// reusing the same ID.
66
const (
7-
lockIDUnused = iota
8-
LockIDDeploymentSetup = iota
9-
LockIDHangDetector = iota
7+
lockIDUnused = iota
8+
LockIDDeploymentSetup
9+
LockIDHangDetector
1010
)

coderd/unhanger/detector.go

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import (
66
"encoding/json"
77
"time"
88

9-
"golang.org/x/sync/errgroup"
109
"golang.org/x/xerrors"
1110

1211
"github.com/google/uuid"
@@ -27,6 +26,9 @@ const (
2726
// HungJobExitTimeout is the duration of time that provisioners should allow
2827
// for a graceful exit upon cancellation due to failing to send an update to
2928
// a job.
29+
//
30+
// Provisioners should avoid keeping a job "running" for longer than this
31+
// time after failing to send an update to the job.
3032
HungJobExitTimeout = 3 * time.Minute
3133
)
3234

@@ -95,18 +97,18 @@ func New(ctx context.Context, db database.Store, pubsub database.Pubsub, log slo
9597

9698
// WithStatsChannel will cause Executor to push a RunStats to ch after
9799
// every tick. This push is blocking, so if ch is not read, the detector will
98-
// hang.
100+
// hang. This should only be used in tests.
99101
func (d *Detector) WithStatsChannel(ch chan<- Stats) *Detector {
100102
d.stats = ch
101103
return d
102104
}
103105

104-
// Run will cause the detector to detect and unhang provisioner jobs on every
106+
// Start will cause the detector to detect and unhang provisioner jobs on every
105107
// tick from its channel. It will stop when its context is Done, or when its
106108
// channel is closed.
107109
//
108-
// Run should only be called once.
109-
func (d *Detector) Run() {
110+
// Start should only be called once.
111+
func (d *Detector) Start() {
110112
go func() {
111113
defer close(d.done)
112114
for {
@@ -117,7 +119,7 @@ func (d *Detector) Run() {
117119
if !ok {
118120
return
119121
}
120-
stats := d.runOnce(t)
122+
stats := d.run(t)
121123
if stats.Error != nil && !xerrors.As(stats.Error, &acquireLockError{}) {
122124
d.log.Warn(d.ctx, "error running workspace build hang detector once", slog.Error(stats.Error))
123125
}
@@ -141,7 +143,7 @@ func (d *Detector) Wait() {
141143
<-d.done
142144
}
143145

144-
func (d *Detector) runOnce(t time.Time) Stats {
146+
func (d *Detector) run(t time.Time) Stats {
145147
ctx, cancel := context.WithTimeout(d.ctx, 5*time.Minute)
146148
defer cancel()
147149

@@ -170,12 +172,6 @@ func (d *Detector) runOnce(t time.Time) Stats {
170172
return xerrors.Errorf("get hung provisioner jobs: %w", err)
171173
}
172174

173-
// We only use errgroup here for convenience of API, not for early
174-
// cancellation. This means we only return nil errors in th eg.Go.
175-
eg := errgroup.Group{}
176-
// Limit the concurrency to avoid overloading the database.
177-
eg.SetLimit(10)
178-
179175
// Send a message into the build log for each hung job saying that it
180176
// has been detected and will be terminated, then mark the job as
181177
// failed.

coderd/unhanger/detector_test.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ func TestDetectorNoJobs(t *testing.T) {
3737
)
3838

3939
detector := unhanger.New(ctx, db, pubsub, log, tickCh).WithStatsChannel(statsCh)
40-
detector.Run()
40+
detector.Start()
4141
tickCh <- time.Now()
4242

4343
stats := <-statsCh
@@ -84,7 +84,7 @@ func TestDetectorNoHungJobs(t *testing.T) {
8484
}
8585

8686
detector := unhanger.New(ctx, db, pubsub, log, tickCh).WithStatsChannel(statsCh)
87-
detector.Run()
87+
detector.Start()
8888
tickCh <- now
8989

9090
stats := <-statsCh
@@ -172,7 +172,7 @@ func TestDetectorHungWorkspaceBuild(t *testing.T) {
172172
t.Log("current job ID: ", currentWorkspaceBuildJob.ID)
173173

174174
detector := unhanger.New(ctx, db, pubsub, log, tickCh).WithStatsChannel(statsCh)
175-
detector.Run()
175+
detector.Start()
176176
tickCh <- now
177177

178178
stats := <-statsCh
@@ -277,7 +277,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideState(t *testing.T) {
277277
t.Log("current job ID: ", currentWorkspaceBuildJob.ID)
278278

279279
detector := unhanger.New(ctx, db, pubsub, log, tickCh).WithStatsChannel(statsCh)
280-
detector.Run()
280+
detector.Start()
281281
tickCh <- now
282282

283283
stats := <-statsCh
@@ -353,7 +353,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T
353353
t.Log("current job ID: ", currentWorkspaceBuildJob.ID)
354354

355355
detector := unhanger.New(ctx, db, pubsub, log, tickCh).WithStatsChannel(statsCh)
356-
detector.Run()
356+
detector.Start()
357357
tickCh <- now
358358

359359
stats := <-statsCh
@@ -438,7 +438,7 @@ func TestDetectorHungOtherJobTypes(t *testing.T) {
438438
t.Log("template dry-run job ID: ", templateDryRunJob.ID)
439439

440440
detector := unhanger.New(ctx, db, pubsub, log, tickCh).WithStatsChannel(statsCh)
441-
detector.Run()
441+
detector.Start()
442442
tickCh <- now
443443

444444
stats := <-statsCh
@@ -559,7 +559,7 @@ func TestDetectorPushesLogs(t *testing.T) {
559559
}
560560

561561
detector := unhanger.New(ctx, db, pubsub, log, tickCh).WithStatsChannel(statsCh)
562-
detector.Run()
562+
detector.Start()
563563

564564
// Create pubsub subscription to listen for new log events.
565565
pubsubCalled := make(chan int64, 1)

provisioner/terraform/serve.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ type ServeOptions struct {
3434
//
3535
// This is a no-op on Windows where the process can't be interrupted.
3636
//
37-
// Default value: 3 minutes. This value should be kept less than the value
38-
// that Coder uses to mark hung jobs as failed, which is 5 minutes (see
39-
// unhanger package).
37+
// Default value: 3 minutes (unhanger.HungJobExitTimeout). This value should
38+
// be kept less than the value that Coder uses to mark hung jobs as failed,
39+
// which is 5 minutes (see unhanger package).
4040
ExitTimeout time.Duration
4141
}
4242

0 commit comments

Comments
 (0)