Skip to content

Commit e3a79c4

Browse files
feat: implement reconciliation loop
1 parent 9ba7c2a commit e3a79c4

File tree

10 files changed

+2083
-1
lines changed

10 files changed

+2083
-1
lines changed

coderd/prebuilds/api.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package prebuilds
2+
3+
import (
4+
"context"
5+
6+
"github.com/coder/coder/v2/coderd/database"
7+
)
8+
9+
// ReconciliationOrchestrator is a Reconciler that can additionally be run as a
// loop (RunLoop) and stopped (Stop).
type ReconciliationOrchestrator interface {
10+
Reconciler
11+
12+
// RunLoop takes the context for the loop.
// NOTE(review): presumably blocks until ctx is done or Stop is called — confirm against the implementation.
RunLoop(ctx context.Context)
13+
// Stop takes a context and the cause for stopping.
// NOTE(review): how cause is surfaced (logged, propagated via context) is not visible here — confirm.
Stop(ctx context.Context, cause error)
14+
}
15+
16+
// Reconciler exposes the three steps of a reconciliation pass: snapshotting
// state, determining the actions for one preset, and applying those actions.
// Every method documents that it must run inside a repeatable-read transaction.
type Reconciler interface {
17+
// SnapshotState MUST be called inside a repeatable-read tx.
// It returns a ReconciliationState; presumably read through store — verify against the implementation.
18+
SnapshotState(ctx context.Context, store database.Store) (*ReconciliationState, error)
19+
// DetermineActions MUST be called inside a repeatable-read tx.
// It returns the ReconciliationActions for the given single-preset state.
20+
DetermineActions(ctx context.Context, state PresetState) (*ReconciliationActions, error)
21+
// Reconcile MUST be called inside a repeatable-read tx.
// It takes the single-preset state together with the actions determined for it.
22+
Reconcile(ctx context.Context, state PresetState, actions ReconciliationActions) error
23+
}

coderd/prebuilds/noop.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
package prebuilds
2+
3+
import (
4+
"context"
5+
6+
"github.com/coder/coder/v2/coderd/database"
7+
)
8+
9+
type NoopReconciler struct{}
10+
11+
func NewNoopReconciler() *NoopReconciler {
12+
return &NoopReconciler{}
13+
}
14+
15+
func (NoopReconciler) RunLoop(context.Context) {}
16+
func (NoopReconciler) Stop(context.Context, error) {}
17+
func (NoopReconciler) SnapshotState(context.Context, database.Store) (*ReconciliationState, error) {
18+
return &ReconciliationState{}, nil
19+
}
20+
21+
func (NoopReconciler) DetermineActions(context.Context, PresetState) (*ReconciliationActions, error) {
22+
return &ReconciliationActions{}, nil
23+
}
24+
25+
func (NoopReconciler) Reconcile(context.Context, PresetState, ReconciliationActions) error {
26+
return nil
27+
}
28+
29+
var _ ReconciliationOrchestrator = NoopReconciler{}

coderd/prebuilds/reconcile.go

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
package prebuilds
2+
3+
import (
4+
"time"
5+
6+
"github.com/google/uuid"
7+
"golang.org/x/xerrors"
8+
9+
"github.com/coder/coder/v2/coderd/database"
10+
"github.com/coder/coder/v2/coderd/util/slice"
11+
)
12+
13+
// ReconciliationState represents a full point-in-time snapshot of state relating to prebuilds across all templates.
// FilterByPreset narrows it down to a single preset.
14+
type ReconciliationState struct {
15+
// Presets is searched by PresetID when filtering for a single preset.
Presets []database.GetTemplatePresetsWithPrebuildsRow
16+
// RunningPrebuilds are matched to a preset by CurrentPresetID and TemplateVersionID.
RunningPrebuilds []database.GetRunningPrebuildsRow
17+
// PrebuildsInProgress are matched by TemplateID, i.e. per template rather than per preset.
PrebuildsInProgress []database.GetPrebuildsInProgressRow
18+
// Backoffs are matched to a preset by PresetID.
Backoffs []database.GetPresetsBackoffRow
19+
}
20+
21+
// PresetState is a subset of ReconciliationState but specifically for a single preset.
// It is produced by ReconciliationState.FilterByPreset.
22+
type PresetState struct {
23+
// Preset is the preset this state was filtered for.
Preset database.GetTemplatePresetsWithPrebuildsRow
24+
// Running holds the running prebuilds whose current preset and template version match Preset.
Running []database.GetRunningPrebuildsRow
25+
// InProgress holds in-progress builds for the whole template of Preset, not just this preset.
InProgress []database.GetPrebuildsInProgressRow
26+
// Backoff is nil unless exactly one backoff row matched Preset.
Backoff *database.GetPresetsBackoffRow
27+
}
28+
29+
// ReconciliationActions represents the set of actions which must be taken to achieve the desired state for prebuilds.
// The counters describe the observed state; Create, DeleteIDs and BackoffUntil are the resulting instructions.
// PresetState.CalculateActions populates it.
30+
type ReconciliationActions struct {
31+
Actual int32 // Running prebuilds for active version.
32+
Desired int32 // Active template version's desired instances as defined in preset.
33+
Eligible int32 // Prebuilds which can be claimed.
34+
Outdated int32 // Prebuilds which no longer match the active template version.
35+
Extraneous int32 // Extra running prebuilds for active version (somehow).
36+
Starting, Stopping, Deleting int32 // Prebuilds currently being provisioned up or down.
37+
Failed int32 // Number of prebuilds which have failed in the past CODER_WORKSPACE_PREBUILDS_RECONCILIATION_BACKOFF_LOOKBACK_PERIOD.
38+
Create int32 // The number of prebuilds required to be created to reconcile required state.
39+
DeleteIDs []uuid.UUID // IDs of running prebuilds required to be deleted to reconcile required state.
40+
BackoffUntil time.Time // The time to wait until before trying to provision a new prebuild.
41+
}
42+
43+
func NewReconciliationState(presets []database.GetTemplatePresetsWithPrebuildsRow, runningPrebuilds []database.GetRunningPrebuildsRow,
44+
prebuildsInProgress []database.GetPrebuildsInProgressRow, backoffs []database.GetPresetsBackoffRow,
45+
) ReconciliationState {
46+
return ReconciliationState{Presets: presets, RunningPrebuilds: runningPrebuilds, PrebuildsInProgress: prebuildsInProgress, Backoffs: backoffs}
47+
}
48+
49+
func (s ReconciliationState) FilterByPreset(presetID uuid.UUID) (*PresetState, error) {
50+
preset, found := slice.Find(s.Presets, func(preset database.GetTemplatePresetsWithPrebuildsRow) bool {
51+
return preset.PresetID == presetID
52+
})
53+
if !found {
54+
return nil, xerrors.Errorf("no preset found with ID %q", presetID)
55+
}
56+
57+
running := slice.Filter(s.RunningPrebuilds, func(prebuild database.GetRunningPrebuildsRow) bool {
58+
if !prebuild.CurrentPresetID.Valid {
59+
return false
60+
}
61+
return prebuild.CurrentPresetID.UUID == preset.PresetID &&
62+
prebuild.TemplateVersionID == preset.TemplateVersionID // Not strictly necessary since presets are 1:1 with template versions, but no harm in being extra safe.
63+
})
64+
65+
// These aren't preset-specific, but they need to inhibit all presets of this template from operating since they could
66+
// be in-progress builds which might impact another preset. For example, if a template goes from no defined prebuilds to defined prebuilds
67+
// and back, or a template is updated from one version to another.
68+
// We group by the template so that all prebuilds being provisioned for a prebuild are inhibited if any prebuild for
69+
// any preset in that template are in progress, to prevent clobbering.
70+
inProgress := slice.Filter(s.PrebuildsInProgress, func(prebuild database.GetPrebuildsInProgressRow) bool {
71+
return prebuild.TemplateID == preset.TemplateID
72+
})
73+
74+
var backoff *database.GetPresetsBackoffRow
75+
backoffs := slice.Filter(s.Backoffs, func(row database.GetPresetsBackoffRow) bool {
76+
return row.PresetID == preset.PresetID
77+
})
78+
if len(backoffs) == 1 {
79+
backoff = &backoffs[0]
80+
}
81+
82+
return &PresetState{
83+
Preset: preset,
84+
Running: running,
85+
InProgress: inProgress,
86+
Backoff: backoff,
87+
}, nil
88+
}

coderd/prebuilds/state.go

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
package prebuilds
2+
3+
import (
4+
"math"
5+
"slices"
6+
"time"
7+
8+
"github.com/coder/quartz"
9+
10+
"github.com/coder/coder/v2/coderd/database"
11+
)
12+
13+
func (p PresetState) CalculateActions(clock quartz.Clock, backoffInterval time.Duration) (*ReconciliationActions, error) {
14+
// TODO: align workspace states with how we represent them on the FE and the CLI
15+
// right now there's some slight differences which can lead to additional prebuilds being created
16+
17+
// TODO: add mechanism to prevent prebuilds being reconciled from being claimable by users; i.e. if a prebuild is
18+
// about to be deleted, it should not be deleted if it has been claimed - beware of TOCTOU races!
19+
20+
var (
21+
actual int32 // Running prebuilds for active version.
22+
desired int32 // Active template version's desired instances as defined in preset.
23+
eligible int32 // Prebuilds which can be claimed.
24+
outdated int32 // Prebuilds which no longer match the active template version.
25+
extraneous int32 // Extra running prebuilds for active version (somehow).
26+
starting, stopping, deleting int32 // Prebuilds currently being provisioned up or down.
27+
)
28+
29+
if p.Preset.UsingActiveVersion {
30+
actual = int32(len(p.Running))
31+
desired = p.Preset.DesiredInstances
32+
}
33+
34+
for _, prebuild := range p.Running {
35+
if p.Preset.UsingActiveVersion {
36+
if prebuild.Ready {
37+
eligible++
38+
}
39+
40+
extraneous = int32(math.Max(float64(actual-p.Preset.DesiredInstances), 0))
41+
}
42+
43+
if prebuild.TemplateVersionID == p.Preset.TemplateVersionID && !p.Preset.UsingActiveVersion {
44+
outdated++
45+
}
46+
}
47+
48+
// In-progress builds are common across all presets belonging to a given template.
49+
// In other words: these values will be identical across all presets belonging to this template.
50+
for _, progress := range p.InProgress {
51+
num := progress.Count
52+
switch progress.Transition {
53+
case database.WorkspaceTransitionStart:
54+
starting += num
55+
case database.WorkspaceTransitionStop:
56+
stopping += num
57+
case database.WorkspaceTransitionDelete:
58+
deleting += num
59+
}
60+
}
61+
62+
var (
63+
toCreate = int(math.Max(0, float64(
64+
desired-(actual+starting)), // The number of prebuilds currently being stopped (should be 0)
65+
))
66+
toDelete = int(math.Max(0, float64(
67+
outdated- // The number of prebuilds running above the desired count for active version
68+
deleting), // The number of prebuilds currently being deleted
69+
))
70+
71+
actions = &ReconciliationActions{
72+
Actual: actual,
73+
Desired: desired,
74+
Eligible: eligible,
75+
Outdated: outdated,
76+
Extraneous: extraneous,
77+
Starting: starting,
78+
Stopping: stopping,
79+
Deleting: deleting,
80+
}
81+
)
82+
83+
// If the template has become deleted or deprecated since the last reconciliation, we need to ensure we
84+
// scale those prebuilds down to zero.
85+
if p.Preset.Deleted || p.Preset.Deprecated {
86+
toCreate = 0
87+
toDelete = int(actual + outdated)
88+
actions.Desired = 0
89+
}
90+
91+
// We backoff when the last build failed, to give the operator some time to investigate the issue and to not provision
92+
// a tonne of prebuilds (_n_ on each reconciliation iteration).
93+
if p.Backoff != nil && p.Backoff.NumFailed > 0 {
94+
actions.Failed = p.Backoff.NumFailed
95+
96+
backoffUntil := p.Backoff.LastBuildAt.Add(time.Duration(p.Backoff.NumFailed) * backoffInterval)
97+
98+
if clock.Now().Before(backoffUntil) {
99+
actions.Create = 0
100+
actions.DeleteIDs = nil
101+
actions.BackoffUntil = backoffUntil
102+
103+
// Return early here; we should not perform any reconciliation actions if we're in a backoff period.
104+
return actions, nil
105+
}
106+
}
107+
108+
// It's possible that an operator could stop/start prebuilds which interfere with the reconciliation loop, so
109+
// we check if there are somehow more prebuilds than we expect, and then pick random victims to be deleted.
110+
if extraneous > 0 {
111+
// Sort running IDs by creation time so we always delete the oldest prebuilds.
112+
// In general, we want fresher prebuilds (imagine a mono-repo is cloned; newer is better).
113+
slices.SortFunc(p.Running, func(a, b database.GetRunningPrebuildsRow) int {
114+
if a.CreatedAt.Before(b.CreatedAt) {
115+
return -1
116+
}
117+
if a.CreatedAt.After(b.CreatedAt) {
118+
return 1
119+
}
120+
121+
return 0
122+
})
123+
124+
for i := 0; i < int(extraneous); i++ {
125+
if i >= len(p.Running) {
126+
// This should never happen.
127+
// TODO: move up
128+
// c.logger.Warn(ctx, "unexpected reconciliation state; extraneous count exceeds running prebuilds count!",
129+
// slog.F("running_count", len(p.Running)),
130+
// slog.F("extraneous", extraneous))
131+
continue
132+
}
133+
134+
actions.DeleteIDs = append(actions.DeleteIDs, p.Running[i].WorkspaceID)
135+
}
136+
137+
// TODO: move up
138+
// c.logger.Warn(ctx, "found extra prebuilds running, picking random victim(s)",
139+
// slog.F("template_id", p.Preset.TemplateID.String()), slog.F("desired", desired), slog.F("actual", actual), slog.F("extra", extraneous),
140+
// slog.F("victims", victims))
141+
142+
// Prevent the rest of the reconciliation from completing
143+
return actions, nil
144+
}
145+
146+
actions.Create = int32(toCreate)
147+
148+
// if toDelete > 0 && len(p.Running) != toDelete {
149+
// TODO: move up
150+
// c.logger.Warn(ctx, "mismatch between running prebuilds and expected deletion count!",
151+
// slog.F("template_id", s.preset.TemplateID.String()), slog.F("running", len(p.Running)), slog.F("to_delete", toDelete))
152+
// }
153+
154+
// TODO: implement lookup to not perform same action on workspace multiple times in $period
155+
// i.e. a workspace cannot be deleted for some reason, which continually makes it eligible for deletion
156+
for i := 0; i < toDelete; i++ {
157+
if i >= len(p.Running) {
158+
// TODO: move up
159+
// Above warning will have already addressed this.
160+
continue
161+
}
162+
163+
actions.DeleteIDs = append(actions.DeleteIDs, p.Running[i].WorkspaceID)
164+
}
165+
166+
return actions, nil
167+
}

0 commit comments

Comments
 (0)