Skip to content

Commit 03885f3

Browse files
committed
WIP: exponential backoff for reconciliation if failed loop possible
Signed-off-by: Danny Kopping <dannykopping@gmail.com>
1 parent e3cecab commit 03885f3

File tree

15 files changed

+307
-39
lines changed

15 files changed

+307
-39
lines changed

coderd/database/dbauthz/dbauthz.go

+7
Original file line numberDiff line numberDiff line change
@@ -2056,6 +2056,13 @@ func (q *querier) GetPresetParametersByTemplateVersionID(ctx context.Context, te
20562056
return q.db.GetPresetParametersByTemplateVersionID(ctx, templateVersionID)
20572057
}
20582058

2059+
func (q *querier) GetPresetsBackoff(ctx context.Context) ([]database.GetPresetsBackoffRow, error) {
2060+
if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceTemplate); err != nil {
2061+
return nil, err
2062+
}
2063+
return q.db.GetPresetsBackoff(ctx)
2064+
}
2065+
20592066
func (q *querier) GetPresetsByTemplateVersionID(ctx context.Context, templateVersionID uuid.UUID) ([]database.TemplateVersionPreset, error) {
20602067
// An actor can read template version presets if they can read the related template version.
20612068
_, err := q.GetTemplateVersionByID(ctx, templateVersionID)

coderd/database/dbmem/dbmem.go

+4
Original file line numberDiff line numberDiff line change
@@ -4066,6 +4066,10 @@ func (q *FakeQuerier) GetPresetParametersByTemplateVersionID(_ context.Context,
40664066
return parameters, nil
40674067
}
40684068

4069+
func (q *FakeQuerier) GetPresetsBackoff(ctx context.Context) ([]database.GetPresetsBackoffRow, error) {
4070+
panic("not implemented")
4071+
}
4072+
40694073
func (q *FakeQuerier) GetPresetsByTemplateVersionID(_ context.Context, templateVersionID uuid.UUID) ([]database.TemplateVersionPreset, error) {
40704074
q.mutex.RLock()
40714075
defer q.mutex.RUnlock()

coderd/database/dbmetrics/querymetrics.go

+7
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/dbmock/dbmock.go

+15
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/querier.go

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries.sql.go

+72-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/prebuilds.sql

+33
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,39 @@ FROM workspace_latest_build wlb
4444
WHERE pj.job_status IN ('pending'::provisioner_job_status, 'running'::provisioner_job_status)
4545
GROUP BY t.id, wpb.template_version_id, wpb.transition;
4646

47+
-- name: GetPresetsBackoff :many
48+
WITH filtered_builds AS (
49+
-- Only select builds which are for prebuild creations
50+
SELECT wlb.*, tvp.id AS preset_id, pj.job_status
51+
FROM template_version_presets tvp
52+
JOIN workspace_latest_build wlb ON wlb.template_version_preset_id = tvp.id
53+
JOIN provisioner_jobs pj ON wlb.job_id = pj.id
54+
JOIN template_versions tv ON wlb.template_version_id = tv.id
55+
JOIN templates t ON tv.template_id = t.id AND t.active_version_id = tv.id
56+
JOIN template_version_preset_prebuilds tvpp ON tvpp.preset_id = tvp.id
57+
WHERE wlb.transition = 'start'::workspace_transition),
58+
latest_builds AS (
59+
-- Select only the latest build per template_version AND preset
60+
SELECT fb.*,
61+
ROW_NUMBER() OVER (PARTITION BY fb.template_version_preset_id ORDER BY fb.created_at DESC) as rn
62+
FROM filtered_builds fb),
63+
failed_count AS (
64+
-- Count failed builds per template version/preset in the last hour
65+
SELECT preset_id, COUNT(*) AS num_failed
66+
FROM filtered_builds
67+
WHERE job_status = 'failed'::provisioner_job_status
68+
AND created_at >= NOW() - INTERVAL '1 hour'
69+
GROUP BY preset_id)
70+
SELECT lb.template_version_id,
71+
lb.preset_id,
72+
lb.job_status AS latest_build_status,
73+
COALESCE(fc.num_failed, 0)::int AS num_failed,
74+
lb.created_at AS last_build_at
75+
FROM latest_builds lb
76+
LEFT JOIN failed_count fc ON fc.preset_id = lb.preset_id
77+
WHERE lb.rn = 1
78+
AND lb.job_status = 'failed'::provisioner_job_status;
79+
4780
-- name: ClaimPrebuild :one
4881
-- TODO: rewrite to use named CTE instead?
4982
UPDATE workspaces w

coderd/prebuilds/reconcile.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ func NewNoopReconciler() Reconciler {
1111
}
1212

1313
func (noopReconciler) RunLoop(context.Context) {}
14-
func (noopReconciler) Stop(context.Context, error) {}
14+
func (noopReconciler) Stop(context.Context, error) {}
1515
func (noopReconciler) ReconcileAll(context.Context) error { return nil }
1616
func (noopReconciler) ReconcileTemplate() error { return nil }
1717

codersdk/deployment.go

+12-1
Original file line numberDiff line numberDiff line change
@@ -764,7 +764,8 @@ type NotificationsWebhookConfig struct {
764764
}
765765

766766
type PrebuildsConfig struct {
767-
ReconciliationInterval serpent.Duration `json:"reconciliation_interval" typescript:",notnull"`
767+
ReconciliationInterval serpent.Duration `json:"reconciliation_interval" typescript:",notnull"`
768+
ReconciliationBackoffInterval serpent.Duration `json:"reconciliation_backoff_interval" typescript:",notnull"`
768769
}
769770

770771
const (
@@ -2969,6 +2970,16 @@ Write out the current server config as YAML to stdout.`,
29692970
Group: &deploymentGroupPrebuilds,
29702971
YAML: "reconciliation_interval",
29712972
},
2973+
{
2974+
Name: "Reconciliation Backoff Interval",
2975+
Description: "Interval to increase reconciliation backoff by when unrecoverable errors occur.",
2976+
Flag: "workspace-prebuilds-reconciliation-backoff-interval",
2977+
Env: "CODER_WORKSPACE_PREBUILDS_RECONCILIATION_INTERVAL",
2978+
Value: &c.Prebuilds.ReconciliationBackoffInterval,
2979+
Default: (time.Second * 15).String(),
2980+
Group: &deploymentGroupPrebuilds,
2981+
YAML: "reconciliation_backoff_interval",
2982+
},
29722983
}
29732984

29742985
return opts

enterprise/coderd/prebuilds/reconcile.go

+33-5
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"database/sql"
77
"encoding/base32"
88
"fmt"
9+
"math"
910
"strings"
1011
"sync/atomic"
1112
"time"
@@ -31,6 +32,8 @@ import (
3132
"golang.org/x/xerrors"
3233
)
3334

35+
var ErrBackoff = xerrors.New("reconciliation in backoff")
36+
3437
type storeReconciler struct {
3538
store database.Store
3639
cfg codersdk.PrebuildsConfig
@@ -179,7 +182,7 @@ func (c *storeReconciler) ReconcileAll(ctx context.Context) error {
179182
}
180183

181184
if !preset.UsingActiveVersion && len(ps.running) == 0 && len(ps.inProgress) == 0 {
182-
logger.Debug(ctx, "skipping reconciliation for preset; inactive, no running prebuilds, and no in-progress operationss",
185+
logger.Debug(ctx, "skipping reconciliation for preset; inactive, no running prebuilds, and no in-progress operations",
183186
slog.F("preset_id", preset.PresetID.String()))
184187
continue
185188
}
@@ -246,7 +249,12 @@ func (c *storeReconciler) determineState(ctx context.Context, store database.Sto
246249
return xerrors.Errorf("failed to get prebuilds in progress: %w", err)
247250
}
248251

249-
state = newReconciliationState(presetsWithPrebuilds, allRunningPrebuilds, allPrebuildsInProgress)
252+
presetsBackoff, err := db.GetPresetsBackoff(ctx)
253+
if err != nil {
254+
return xerrors.Errorf("failed to get backoffs for presets: %w", err)
255+
}
256+
257+
state = newReconciliationState(presetsWithPrebuilds, allRunningPrebuilds, allPrebuildsInProgress, presetsBackoff)
250258
return nil
251259
}, &database.TxOptions{
252260
Isolation: sql.LevelRepeatableRead, // This mirrors the MVCC snapshotting Postgres does when using CTEs
@@ -268,7 +276,7 @@ func (c *storeReconciler) reconcilePrebuildsForPreset(ctx context.Context, ps *p
268276
vlogger := logger.With(slog.F("template_version_id", ps.preset.TemplateVersionID), slog.F("preset_id", ps.preset.PresetID))
269277

270278
// TODO: move log lines up from calculateActions.
271-
actions, err := ps.calculateActions()
279+
actions, err := ps.calculateActions(c.cfg.ReconciliationBackoffInterval.Value())
272280
if err != nil {
273281
vlogger.Error(ctx, "failed to calculate reconciliation actions", slog.Error(err))
274282
return xerrors.Errorf("failed to calculate reconciliation actions: %w", err)
@@ -286,14 +294,34 @@ func (c *storeReconciler) reconcilePrebuildsForPreset(ctx context.Context, ps *p
286294
if actions.create > 0 || len(actions.deleteIDs) > 0 {
287295
// Only log with info level when there's a change that needs to be effected.
288296
levelFn = vlogger.Info
297+
} else if dbtime.Now().Before(actions.backoffUntil) {
298+
levelFn = vlogger.Warn
289299
}
290-
levelFn(ctx, "template prebuild state retrieved",
300+
301+
fields := []any{
291302
slog.F("create_count", actions.create), slog.F("delete_count", len(actions.deleteIDs)),
292303
slog.F("to_delete", actions.deleteIDs),
293304
slog.F("desired", actions.desired), slog.F("actual", actions.actual),
294305
slog.F("outdated", actions.outdated), slog.F("extraneous", actions.extraneous),
295306
slog.F("starting", actions.starting), slog.F("stopping", actions.stopping),
296-
slog.F("deleting", actions.deleting), slog.F("eligible", actions.eligible))
307+
slog.F("deleting", actions.deleting), slog.F("eligible", actions.eligible),
308+
}
309+
310+
// TODO: add quartz
311+
312+
// If there is anything to backoff for (usually a cycle of failed prebuilds), then log and bail out.
313+
if actions.backoffUntil.After(dbtime.Now()) {
314+
levelFn(ctx, "template prebuild state retrieved, backing off",
315+
append(fields,
316+
slog.F("backoff_until", actions.backoffUntil.Format(time.RFC3339)),
317+
slog.F("backoff_secs", math.Round(actions.backoffUntil.Sub(dbtime.Now()).Seconds())),
318+
)...)
319+
320+
// return ErrBackoff
321+
return nil
322+
} else {
323+
levelFn(ctx, "template prebuild state retrieved", fields...)
324+
}
297325

298326
// Provision workspaces within the same tx so we don't get any timing issues here.
299327
// i.e. we hold the advisory lock until all "reconciliatory" actions have been taken.

enterprise/coderd/prebuilds/reconcile_test.go

+7-3
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ import (
77
"testing"
88
"time"
99

10+
"cdr.dev/slog"
11+
"cdr.dev/slog/sloggers/slogtest"
1012
"github.com/google/uuid"
1113
"github.com/stretchr/testify/require"
1214

@@ -297,7 +299,9 @@ func TestPrebuildReconciliation(t *testing.T) {
297299
t.Parallel()
298300
ctx := testutil.Context(t, testutil.WaitShort)
299301
cfg := codersdk.PrebuildsConfig{}
300-
logger := testutil.Logger(t)
302+
logger := slogtest.Make(
303+
t, &slogtest.Options{IgnoreErrors: true},
304+
).Leveled(slog.LevelDebug)
301305
db, pubsub := dbtestutil.NewDB(t)
302306
controller := prebuilds.NewStoreReconciler(db, pubsub, cfg, logger)
303307

@@ -339,8 +343,8 @@ func TestPrebuildReconciliation(t *testing.T) {
339343

340344
// Run the reconciliation multiple times to ensure idempotency
341345
// 8 was arbitrary, but large enough to reasonably trust the result
342-
for range 8 {
343-
controller.ReconcileAll(ctx)
346+
for i := 1; i <= 8; i++ {
347+
require.NoErrorf(t, controller.ReconcileAll(ctx), "failed on iteration %d", i)
344348

345349
if tc.shouldCreateNewPrebuild != nil {
346350
newPrebuildCount := 0

0 commit comments

Comments
 (0)