Skip to content

Commit 8e2732a

Browse files
committed
Refactor API to expose reconciliation internals for better testability
Signed-off-by: Danny Kopping <dannykopping@gmail.com>
1 parent dbb6afa commit 8e2732a

File tree

10 files changed

+493
-415
lines changed

10 files changed

+493
-415
lines changed

coderd/prebuilds/api.go

+12-2
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,20 @@ import (
88
"github.com/coder/coder/v2/coderd/database"
99
)
1010

11-
type Reconciler interface {
11+
type ReconciliationOrchestrator interface {
12+
Reconciler
13+
1214
RunLoop(ctx context.Context)
1315
Stop(ctx context.Context, cause error)
14-
ReconcileAll(ctx context.Context) error
16+
}
17+
18+
type Reconciler interface {
19+
// SnapshotState MUST be called inside a repeatable-read tx.
20+
SnapshotState(ctx context.Context, store database.Store) (*ReconciliationState, error)
21+
// DetermineActions MUST be called inside a repeatable-read tx.
22+
DetermineActions(ctx context.Context, state PresetState) (*ReconciliationActions, error)
23+
// Reconcile MUST be called inside a repeatable-read tx.
24+
Reconcile(ctx context.Context, state PresetState, actions ReconciliationActions) error
1525
}
1626

1727
type Claimer interface {

coderd/prebuilds/noop.go

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
package prebuilds
2+
3+
import (
4+
"context"
5+
6+
"github.com/coder/coder/v2/coderd/database"
7+
)
8+
9+
type NoopReconciler struct{}
10+
11+
func NewNoopReconciler() *NoopReconciler {
12+
return &NoopReconciler{}
13+
}
14+
15+
func (NoopReconciler) RunLoop(ctx context.Context) {}
16+
func (NoopReconciler) Stop(ctx context.Context, cause error) {}
17+
func (NoopReconciler) SnapshotState(ctx context.Context, store database.Store) (*ReconciliationState, error) { return &ReconciliationState{}, nil }
18+
func (NoopReconciler) DetermineActions(ctx context.Context, state PresetState) (*ReconciliationActions, error) { return &ReconciliationActions{}, nil}
19+
func (NoopReconciler) Reconcile(ctx context.Context, state PresetState, actions ReconciliationActions) error { return nil}
20+
21+
var _ ReconciliationOrchestrator = NoopReconciler{}

coderd/prebuilds/reconcile.go

+78-8
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,87 @@
11
package prebuilds
22

33
import (
4-
"context"
4+
"time"
5+
6+
"github.com/google/uuid"
7+
"golang.org/x/xerrors"
8+
9+
"github.com/coder/coder/v2/coderd/database"
10+
"github.com/coder/coder/v2/coderd/util/slice"
511
)
612

7-
type noopReconciler struct{}
13+
// ReconciliationState represents a full point-in-time snapshot of state relating to prebuilds across all templates.
14+
type ReconciliationState struct {
15+
Presets []database.GetTemplatePresetsWithPrebuildsRow
16+
RunningPrebuilds []database.GetRunningPrebuildsRow
17+
PrebuildsInProgress []database.GetPrebuildsInProgressRow
18+
Backoffs []database.GetPresetsBackoffRow
19+
}
20+
21+
// PresetState is a subset of ReconciliationState but specifically for a single preset.
22+
type PresetState struct {
23+
Preset database.GetTemplatePresetsWithPrebuildsRow
24+
Running []database.GetRunningPrebuildsRow
25+
InProgress []database.GetPrebuildsInProgressRow
26+
Backoff *database.GetPresetsBackoffRow
27+
}
828

9-
func NewNoopReconciler() Reconciler {
10-
return &noopReconciler{}
29+
// ReconciliationActions represents the set of actions which must be taken to achieve the desired state for prebuilds.
30+
type ReconciliationActions struct {
31+
Actual int32 // Running prebuilds for active version.
32+
Desired int32 // Active template version's desired instances as defined in preset.
33+
Eligible int32 // Prebuilds which can be claimed.
34+
Outdated int32 // Prebuilds which no longer match the active template version.
35+
Extraneous int32 // Extra running prebuilds for active version (somehow).
36+
Starting, Stopping, Deleting int32 // Prebuilds currently being provisioned up or down.
37+
Create int32 // The number of prebuilds required to be created to reconcile required state.
38+
DeleteIDs []uuid.UUID // IDs of running prebuilds required to be deleted to reconcile required state.
39+
BackoffUntil time.Time // The time to wait until before trying to provision a new prebuild.
1140
}
1241

13-
func (noopReconciler) RunLoop(context.Context) {}
14-
func (noopReconciler) Stop(context.Context, error) {}
15-
func (noopReconciler) ReconcileAll(context.Context) error { return nil }
42+
func NewReconciliationState(presets []database.GetTemplatePresetsWithPrebuildsRow, runningPrebuilds []database.GetRunningPrebuildsRow,
43+
prebuildsInProgress []database.GetPrebuildsInProgressRow, backoffs []database.GetPresetsBackoffRow,
44+
) ReconciliationState {
45+
return ReconciliationState{Presets: presets, RunningPrebuilds: runningPrebuilds, PrebuildsInProgress: prebuildsInProgress, Backoffs: backoffs}
46+
}
1647

17-
var _ Reconciler = noopReconciler{}
48+
func (s ReconciliationState) FilterByPreset(presetID uuid.UUID) (*PresetState, error) {
49+
preset, found := slice.Find(s.Presets, func(preset database.GetTemplatePresetsWithPrebuildsRow) bool {
50+
return preset.PresetID == presetID
51+
})
52+
if !found {
53+
return nil, xerrors.Errorf("no preset found with ID %q", presetID)
54+
}
55+
56+
running := slice.Filter(s.RunningPrebuilds, func(prebuild database.GetRunningPrebuildsRow) bool {
57+
if !prebuild.CurrentPresetID.Valid {
58+
return false
59+
}
60+
return prebuild.CurrentPresetID.UUID == preset.PresetID &&
61+
prebuild.TemplateVersionID == preset.TemplateVersionID // Not strictly necessary since presets are 1:1 with template versions, but no harm in being extra safe.
62+
})
63+
64+
// These aren't preset-specific, but they need to inhibit all presets of this template from operating since they could
65+
// be in-progress builds which might impact another preset. For example, if a template goes from no defined prebuilds to defined prebuilds
66+
// and back, or a template is updated from one version to another.
67+
// We group by the template so that all prebuilds being provisioned for a prebuild are inhibited if any prebuild for
68+
// any preset in that template are in progress, to prevent clobbering.
69+
inProgress := slice.Filter(s.PrebuildsInProgress, func(prebuild database.GetPrebuildsInProgressRow) bool {
70+
return prebuild.TemplateID == preset.TemplateID
71+
})
72+
73+
var backoff *database.GetPresetsBackoffRow
74+
backoffs := slice.Filter(s.Backoffs, func(row database.GetPresetsBackoffRow) bool {
75+
return row.PresetID == preset.PresetID
76+
})
77+
if len(backoffs) == 1 {
78+
backoff = &backoffs[0]
79+
}
80+
81+
return &PresetState{
82+
Preset: preset,
83+
Running: running,
84+
InProgress: inProgress,
85+
Backoff: backoff,
86+
}, nil
87+
}

coderd/prebuilds/state.go

+164
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
package prebuilds
2+
3+
import (
4+
"math"
5+
"slices"
6+
"time"
7+
8+
"github.com/coder/quartz"
9+
10+
"github.com/coder/coder/v2/coderd/database"
11+
)
12+
13+
func (p PresetState) CalculateActions(clock quartz.Clock, backoffInterval time.Duration) (*ReconciliationActions, error) {
14+
// TODO: align workspace states with how we represent them on the FE and the CLI
15+
// right now there's some slight differences which can lead to additional prebuilds being created
16+
17+
// TODO: add mechanism to prevent prebuilds being reconciled from being claimable by users; i.e. if a prebuild is
18+
// about to be deleted, it should not be deleted if it has been claimed - beware of TOCTOU races!
19+
20+
var (
21+
actual int32 // Running prebuilds for active version.
22+
desired int32 // Active template version's desired instances as defined in preset.
23+
eligible int32 // Prebuilds which can be claimed.
24+
outdated int32 // Prebuilds which no longer match the active template version.
25+
extraneous int32 // Extra running prebuilds for active version (somehow).
26+
starting, stopping, deleting int32 // Prebuilds currently being provisioned up or down.
27+
)
28+
29+
if p.Preset.UsingActiveVersion {
30+
actual = int32(len(p.Running))
31+
desired = p.Preset.DesiredInstances
32+
}
33+
34+
for _, prebuild := range p.Running {
35+
if p.Preset.UsingActiveVersion {
36+
if prebuild.Ready {
37+
eligible++
38+
}
39+
40+
extraneous = int32(math.Max(float64(actual-p.Preset.DesiredInstances), 0))
41+
}
42+
43+
if prebuild.TemplateVersionID == p.Preset.TemplateVersionID && !p.Preset.UsingActiveVersion {
44+
outdated++
45+
}
46+
}
47+
48+
// In-progress builds are common across all presets belonging to a given template.
49+
// In other words: these values will be identical across all presets belonging to this template.
50+
for _, progress := range p.InProgress {
51+
switch progress.Transition {
52+
case database.WorkspaceTransitionStart:
53+
starting++
54+
case database.WorkspaceTransitionStop:
55+
stopping++
56+
case database.WorkspaceTransitionDelete:
57+
deleting++
58+
}
59+
}
60+
61+
var (
62+
toCreate = int(math.Max(0, float64(
63+
desired-(actual+starting)), // The number of prebuilds currently being stopped (should be 0)
64+
))
65+
toDelete = int(math.Max(0, float64(
66+
outdated- // The number of prebuilds running above the desired count for active version
67+
deleting), // The number of prebuilds currently being deleted
68+
))
69+
70+
actions = &ReconciliationActions{
71+
Actual: actual,
72+
Desired: desired,
73+
Eligible: eligible,
74+
Outdated: outdated,
75+
Extraneous: extraneous,
76+
Starting: starting,
77+
Stopping: stopping,
78+
Deleting: deleting,
79+
}
80+
)
81+
82+
// If the template has become deleted or deprecated since the last reconciliation, we need to ensure we
83+
// scale those prebuilds down to zero.
84+
if p.Preset.Deleted || p.Preset.Deprecated {
85+
toCreate = 0
86+
toDelete = int(actual + outdated)
87+
actions.Desired = 0
88+
}
89+
90+
// We backoff when the last build failed, to give the operator some time to investigate the issue and to not provision
91+
// a tonne of prebuilds (_n_ on each reconciliation iteration).
92+
if p.Backoff != nil && p.Backoff.NumFailed > 0 {
93+
backoffUntil := p.Backoff.LastBuildAt.Add(time.Duration(p.Backoff.NumFailed) * backoffInterval)
94+
95+
if clock.Now().Before(backoffUntil) {
96+
actions.Create = 0
97+
actions.DeleteIDs = nil
98+
actions.BackoffUntil = backoffUntil
99+
100+
// Return early here; we should not perform any reconciliation actions if we're in a backoff period.
101+
return actions, nil
102+
}
103+
}
104+
105+
// It's possible that an operator could stop/start prebuilds which interfere with the reconciliation loop, so
106+
// we check if there are somehow more prebuilds than we expect, and then pick random victims to be deleted.
107+
if extraneous > 0 {
108+
// Sort running IDs by creation time so we always delete the oldest prebuilds.
109+
// In general, we want fresher prebuilds (imagine a mono-repo is cloned; newer is better).
110+
slices.SortFunc(p.Running, func(a, b database.GetRunningPrebuildsRow) int {
111+
if a.CreatedAt.Before(b.CreatedAt) {
112+
return -1
113+
}
114+
if a.CreatedAt.After(b.CreatedAt) {
115+
return 1
116+
}
117+
118+
return 0
119+
})
120+
121+
for i := 0; i < int(extraneous); i++ {
122+
if i >= len(p.Running) {
123+
// This should never happen.
124+
// TODO: move up
125+
// c.logger.Warn(ctx, "unexpected reconciliation state; extraneous count exceeds running prebuilds count!",
126+
// slog.F("running_count", len(p.Running)),
127+
// slog.F("extraneous", extraneous))
128+
continue
129+
}
130+
131+
actions.DeleteIDs = append(actions.DeleteIDs, p.Running[i].WorkspaceID)
132+
}
133+
134+
// TODO: move up
135+
// c.logger.Warn(ctx, "found extra prebuilds running, picking random victim(s)",
136+
// slog.F("template_id", p.Preset.TemplateID.String()), slog.F("desired", desired), slog.F("actual", actual), slog.F("extra", extraneous),
137+
// slog.F("victims", victims))
138+
139+
// Prevent the rest of the reconciliation from completing
140+
return actions, nil
141+
}
142+
143+
actions.Create = int32(toCreate)
144+
145+
if toDelete > 0 && len(p.Running) != toDelete {
146+
// TODO: move up
147+
// c.logger.Warn(ctx, "mismatch between running prebuilds and expected deletion count!",
148+
// slog.F("template_id", s.preset.TemplateID.String()), slog.F("running", len(p.Running)), slog.F("to_delete", toDelete))
149+
}
150+
151+
// TODO: implement lookup to not perform same action on workspace multiple times in $period
152+
// i.e. a workspace cannot be deleted for some reason, which continually makes it eligible for deletion
153+
for i := 0; i < toDelete; i++ {
154+
if i >= len(p.Running) {
155+
// TODO: move up
156+
// Above warning will have already addressed this.
157+
continue
158+
}
159+
160+
actions.DeleteIDs = append(actions.DeleteIDs, p.Running[i].WorkspaceID)
161+
}
162+
163+
return actions, nil
164+
}

0 commit comments

Comments
 (0)