Skip to content

Commit 8abc973

Browse files
committed
Guarding against overzealous prebuild creations/deletions
Signed-off-by: Danny Kopping <dannykopping@gmail.com>
1 parent c78374f commit 8abc973

File tree

7 files changed

+178
-97
lines changed

7 files changed

+178
-97
lines changed

coderd/database/queries.sql.go

+30-29
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/prebuilds.sql

+30-29
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ FROM templates t
3636
WHERE (t.id = sqlc.narg('template_id')::uuid OR sqlc.narg('template_id') IS NULL);
3737

3838
-- name: GetPrebuildsInProgress :many
39-
SELECT t.id AS template_id, wpb.template_version_id, wpb.transition, COUNT(wpb.transition) AS count
39+
SELECT t.id AS template_id, wpb.template_version_id, wpb.transition, COUNT(wpb.transition)::int AS count
4040
FROM workspace_latest_build wlb
4141
INNER JOIN provisioner_jobs pj ON wlb.job_id = pj.id
4242
INNER JOIN workspace_prebuild_builds wpb ON wpb.id = wlb.id
@@ -46,36 +46,37 @@ GROUP BY t.id, wpb.template_version_id, wpb.transition;
4646

4747
-- name: GetPresetsBackoff :many
4848
WITH filtered_builds AS (
49-
-- Only select builds which are for prebuild creations
50-
SELECT wlb.*, tvp.id AS preset_id, pj.job_status
51-
FROM template_version_presets tvp
52-
JOIN workspace_latest_build wlb ON wlb.template_version_preset_id = tvp.id
53-
JOIN provisioner_jobs pj ON wlb.job_id = pj.id
54-
JOIN template_versions tv ON wlb.template_version_id = tv.id
55-
JOIN templates t ON tv.template_id = t.id AND t.active_version_id = tv.id
56-
JOIN template_version_preset_prebuilds tvpp ON tvpp.preset_id = tvp.id
57-
WHERE wlb.transition = 'start'::workspace_transition),
58-
latest_builds AS (
59-
-- Select only the latest build per template_version AND preset
60-
SELECT fb.*,
61-
ROW_NUMBER() OVER (PARTITION BY fb.template_version_preset_id ORDER BY fb.created_at DESC) as rn
62-
FROM filtered_builds fb),
63-
failed_count AS (
64-
-- Count failed builds per template version/preset in the given period
65-
SELECT preset_id, COUNT(*) AS num_failed
66-
FROM filtered_builds
67-
WHERE job_status = 'failed'::provisioner_job_status
68-
AND created_at >= @lookback::timestamptz
69-
GROUP BY preset_id)
49+
-- Only select builds which are for prebuild creations
50+
SELECT wlb.*, tvp.id AS preset_id, pj.job_status, tvpp.desired_instances
51+
FROM template_version_presets tvp
52+
JOIN workspace_latest_build wlb ON wlb.template_version_preset_id = tvp.id
53+
JOIN provisioner_jobs pj ON wlb.job_id = pj.id
54+
JOIN template_versions tv ON wlb.template_version_id = tv.id
55+
JOIN templates t ON tv.template_id = t.id AND t.active_version_id = tv.id
56+
JOIN template_version_preset_prebuilds tvpp ON tvpp.preset_id = tvp.id
57+
WHERE wlb.transition = 'start'::workspace_transition),
58+
latest_builds AS (
59+
-- Select only the latest build per template_version AND preset
60+
SELECT fb.*,
61+
ROW_NUMBER() OVER (PARTITION BY fb.template_version_preset_id ORDER BY fb.created_at DESC) as rn
62+
FROM filtered_builds fb),
63+
failed_count AS (
64+
-- Count failed builds per template version/preset in the given period
65+
SELECT preset_id, COUNT(*) AS num_failed
66+
FROM filtered_builds
67+
WHERE job_status = 'failed'::provisioner_job_status
68+
AND created_at >= @lookback::timestamptz
69+
GROUP BY preset_id)
7070
SELECT lb.template_version_id,
71-
lb.preset_id,
72-
lb.job_status AS latest_build_status,
73-
COALESCE(fc.num_failed, 0)::int AS num_failed,
74-
lb.created_at AS last_build_at
71+
lb.preset_id,
72+
MAX(lb.job_status)::provisioner_job_status AS latest_build_status,
73+
MAX(COALESCE(fc.num_failed, 0))::int AS num_failed,
74+
MAX(lb.created_at)::timestamptz AS last_build_at
7575
FROM latest_builds lb
76-
LEFT JOIN failed_count fc ON fc.preset_id = lb.preset_id
77-
WHERE lb.rn = 1
78-
AND lb.job_status = 'failed'::provisioner_job_status;
76+
LEFT JOIN failed_count fc ON fc.preset_id = lb.preset_id
77+
WHERE lb.rn <= lb.desired_instances -- Fetch the last N builds, where N is the number of desired instances; if any fail, we backoff
78+
AND lb.job_status = 'failed'::provisioner_job_status
79+
GROUP BY lb.template_version_id, lb.preset_id, lb.job_status;
7980

8081
-- name: ClaimPrebuild :one
8182
-- TODO: rewrite to use named CTE instead?

coderd/prebuilds/reconcile.go

+1
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ type ReconciliationActions struct {
3434
Outdated int32 // Prebuilds which no longer match the active template version.
3535
Extraneous int32 // Extra running prebuilds for active version (somehow).
3636
Starting, Stopping, Deleting int32 // Prebuilds currently being provisioned up or down.
37+
Failed int32 // Number of prebuilds which have failed in the past CODER_WORKSPACE_PREBUILDS_RECONCILIATION_BACKOFF_LOOKBACK_PERIOD.
3738
Create int32 // The number of prebuilds required to be created to reconcile required state.
3839
DeleteIDs []uuid.UUID // IDs of running prebuilds required to be deleted to reconcile required state.
3940
BackoffUntil time.Time // The time to wait until before trying to provision a new prebuild.

coderd/prebuilds/state.go

+8-4
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,14 @@ func (p PresetState) CalculateActions(clock quartz.Clock, backoffInterval time.D
4848
// In-progress builds are common across all presets belonging to a given template.
4949
// In other words: these values will be identical across all presets belonging to this template.
5050
for _, progress := range p.InProgress {
51+
num := progress.Count
5152
switch progress.Transition {
5253
case database.WorkspaceTransitionStart:
53-
starting++
54+
starting+=num
5455
case database.WorkspaceTransitionStop:
55-
stopping++
56+
stopping+=num
5657
case database.WorkspaceTransitionDelete:
57-
deleting++
58+
deleting+=num
5859
}
5960
}
6061

@@ -90,6 +91,8 @@ func (p PresetState) CalculateActions(clock quartz.Clock, backoffInterval time.D
9091
// We backoff when the last build failed, to give the operator some time to investigate the issue and to not provision
9192
// a tonne of prebuilds (_n_ on each reconciliation iteration).
9293
if p.Backoff != nil && p.Backoff.NumFailed > 0 {
94+
actions.Failed = p.Backoff.NumFailed
95+
9396
backoffUntil := p.Backoff.LastBuildAt.Add(time.Duration(p.Backoff.NumFailed) * backoffInterval)
9497

9598
if clock.Now().Before(backoffUntil) {
@@ -104,7 +107,8 @@ func (p PresetState) CalculateActions(clock quartz.Clock, backoffInterval time.D
104107

105108
// It's possible that an operator could stop/start prebuilds which interfere with the reconciliation loop, so
106109
// we check if there are somehow more prebuilds than we expect, and then pick random victims to be deleted.
107-
if extraneous > 0 {
110+
// There must be no active deletions in order for extraneous prebuilds to be deleted.
111+
if extraneous > 0 && deleting == 0 {
108112
// Sort running IDs by creation time so we always delete the oldest prebuilds.
109113
// In general, we want fresher prebuilds (imagine a mono-repo is cloned; newer is better).
110114
slices.SortFunc(p.Running, func(a, b database.GetRunningPrebuildsRow) int {

0 commit comments

Comments
 (0)