Skip to content

Commit 3f95841

Browse files
committed
Back off acquiring provisioner jobs when the database is unreachable
Signed-off-by: Danny Kopping <dannykopping@gmail.com>
1 parent 9e2af3e commit 3f95841

File tree

1 file changed

+13
-8
lines changed

1 file changed

+13
-8
lines changed

provisionerd/provisionerd.go

+13 -8
Original file line number | Diff line number | Diff line change
@@ -20,12 +20,13 @@ import (
2020
"golang.org/x/xerrors"
2121

2222
"cdr.dev/slog"
23+
"github.com/coder/retry"
24+
2325
"github.com/coder/coder/v2/coderd/tracing"
2426
"github.com/coder/coder/v2/codersdk"
2527
"github.com/coder/coder/v2/provisionerd/proto"
2628
"github.com/coder/coder/v2/provisionerd/runner"
2729
sdkproto "github.com/coder/coder/v2/provisionersdk/proto"
28-
"github.com/coder/retry"
2930
)
3031

3132
// Dialer represents the function to create a daemon client connection.
@@ -290,7 +291,7 @@ func (p *Server) acquireLoop() {
290291
defer p.wg.Done()
291292
defer func() { close(p.acquireDoneCh) }()
292293
ctx := p.closeContext
293-
for {
294+
for retrier := retry.New(10*time.Millisecond, 1*time.Second); retrier.Wait(ctx); {
294295
if p.acquireExit() {
295296
return
296297
}
@@ -299,7 +300,10 @@ func (p *Server) acquireLoop() {
299300
p.opts.Logger.Debug(ctx, "shut down before client (re) connected")
300301
return
301302
}
302-
p.acquireAndRunOne(client)
303+
err := p.acquireAndRunOne(client)
304+
if err != nil && ctx.Err() == nil { // Only log if context is not done.
305+
p.opts.Logger.Debug(ctx, "retrying to acquire job", slog.F("retry_in_ms", retrier.Delay.Milliseconds()), slog.Error(err))
306+
}
303307
}
304308
}
305309

@@ -318,7 +322,7 @@ func (p *Server) acquireExit() bool {
318322
return false
319323
}
320324

321-
func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
325+
func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) error {
322326
ctx := p.closeContext
323327
p.opts.Logger.Debug(ctx, "start of acquireAndRunOne")
324328
job, err := p.acquireGraceful(client)
@@ -327,15 +331,15 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
327331
if errors.Is(err, context.Canceled) ||
328332
errors.Is(err, yamux.ErrSessionShutdown) ||
329333
errors.Is(err, fasthttputil.ErrInmemoryListenerClosed) {
330-
return
334+
return err
331335
}
332336

333337
p.opts.Logger.Warn(ctx, "provisionerd was unable to acquire job", slog.Error(err))
334-
return
338+
return xerrors.Errorf("failed to acquire job: %w", err)
335339
}
336340
if job.JobId == "" {
337341
p.opts.Logger.Debug(ctx, "acquire job successfully canceled")
338-
return
342+
return xerrors.New("canceled")
339343
}
340344

341345
if len(job.TraceMetadata) > 0 {
@@ -392,7 +396,7 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
392396
if err != nil {
393397
p.opts.Logger.Error(ctx, "provisioner job failed", slog.F("job_id", job.JobId), slog.Error(err))
394398
}
395-
return
399+
return xerrors.Errorf("provisioner job failed: %w", err)
396400
}
397401

398402
p.mutex.Lock()
@@ -416,6 +420,7 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
416420
p.mutex.Lock()
417421
p.activeJob = nil
418422
p.mutex.Unlock()
423+
return nil
419424
}
420425

421426
// acquireGraceful attempts to acquire a job from the server, handling canceling the acquisition if we gracefully shut

0 commit comments

Comments
 (0)