Skip to content

Commit b330c08

Browse files
fix: reimplement reporting of preset-hard-limited metric (coder#18055)
Addresses concerns raised in coder#18045
1 parent 6a2f22a commit b330c08

File tree

4 files changed

+72
-45
lines changed

4 files changed

+72
-45
lines changed

coderd/prebuilds/global_snapshot.go

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@ import (
1212

1313
// GlobalSnapshot represents a full point-in-time snapshot of state relating to prebuilds across all templates.
1414
type GlobalSnapshot struct {
15-
Presets []database.GetTemplatePresetsWithPrebuildsRow
16-
RunningPrebuilds []database.GetRunningPrebuiltWorkspacesRow
17-
PrebuildsInProgress []database.CountInProgressPrebuildsRow
18-
Backoffs []database.GetPresetsBackoffRow
19-
HardLimitedPresets []database.GetPresetsAtFailureLimitRow
15+
Presets []database.GetTemplatePresetsWithPrebuildsRow
16+
RunningPrebuilds []database.GetRunningPrebuiltWorkspacesRow
17+
PrebuildsInProgress []database.CountInProgressPrebuildsRow
18+
Backoffs []database.GetPresetsBackoffRow
19+
HardLimitedPresetsMap map[uuid.UUID]database.GetPresetsAtFailureLimitRow
2020
}
2121

2222
func NewGlobalSnapshot(
@@ -26,12 +26,17 @@ func NewGlobalSnapshot(
2626
backoffs []database.GetPresetsBackoffRow,
2727
hardLimitedPresets []database.GetPresetsAtFailureLimitRow,
2828
) GlobalSnapshot {
29+
hardLimitedPresetsMap := make(map[uuid.UUID]database.GetPresetsAtFailureLimitRow, len(hardLimitedPresets))
30+
for _, preset := range hardLimitedPresets {
31+
hardLimitedPresetsMap[preset.PresetID] = preset
32+
}
33+
2934
return GlobalSnapshot{
30-
Presets: presets,
31-
RunningPrebuilds: runningPrebuilds,
32-
PrebuildsInProgress: prebuildsInProgress,
33-
Backoffs: backoffs,
34-
HardLimitedPresets: hardLimitedPresets,
35+
Presets: presets,
36+
RunningPrebuilds: runningPrebuilds,
37+
PrebuildsInProgress: prebuildsInProgress,
38+
Backoffs: backoffs,
39+
HardLimitedPresetsMap: hardLimitedPresetsMap,
3540
}
3641
}
3742

@@ -66,9 +71,7 @@ func (s GlobalSnapshot) FilterByPreset(presetID uuid.UUID) (*PresetSnapshot, err
6671
backoffPtr = &backoff
6772
}
6873

69-
_, isHardLimited := slice.Find(s.HardLimitedPresets, func(row database.GetPresetsAtFailureLimitRow) bool {
70-
return row.PresetID == preset.ID
71-
})
74+
_, isHardLimited := s.HardLimitedPresetsMap[preset.ID]
7275

7376
return &PresetSnapshot{
7477
Preset: preset,
@@ -80,6 +83,12 @@ func (s GlobalSnapshot) FilterByPreset(presetID uuid.UUID) (*PresetSnapshot, err
8083
}, nil
8184
}
8285

86+
func (s GlobalSnapshot) IsHardLimited(presetID uuid.UUID) bool {
87+
_, isHardLimited := s.HardLimitedPresetsMap[presetID]
88+
89+
return isHardLimited
90+
}
91+
8392
// filterExpiredWorkspaces splits running workspaces into expired and non-expired
8493
// based on the preset's TTL.
8594
// If TTL is missing or zero, all workspaces are considered non-expired.

enterprise/coderd/prebuilds/metricscollector.go

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -280,16 +280,9 @@ func (k hardLimitedPresetKey) String() string {
280280
return fmt.Sprintf("%s:%s:%s", k.orgName, k.templateName, k.presetName)
281281
}
282282

283-
// nolint:revive // isHardLimited determines if the preset should be reported as hard-limited in Prometheus.
284-
func (mc *MetricsCollector) trackHardLimitedStatus(orgName, templateName, presetName string, isHardLimited bool) {
283+
func (mc *MetricsCollector) registerHardLimitedPresets(isPresetHardLimited map[hardLimitedPresetKey]bool) {
285284
mc.isPresetHardLimitedMu.Lock()
286285
defer mc.isPresetHardLimitedMu.Unlock()
287286

288-
key := hardLimitedPresetKey{orgName: orgName, templateName: templateName, presetName: presetName}
289-
290-
if isHardLimited {
291-
mc.isPresetHardLimited[key] = true
292-
} else {
293-
delete(mc.isPresetHardLimited, key)
294-
}
287+
mc.isPresetHardLimited = isPresetHardLimited
295288
}

enterprise/coderd/prebuilds/reconcile.go

Lines changed: 46 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,9 @@ func (c *StoreReconciler) ReconcileAll(ctx context.Context) error {
256256
if err != nil {
257257
return xerrors.Errorf("determine current snapshot: %w", err)
258258
}
259+
260+
c.reportHardLimitedPresets(snapshot)
261+
259262
if len(snapshot.Presets) == 0 {
260263
logger.Debug(ctx, "no templates found with prebuilds configured")
261264
return nil
@@ -296,6 +299,49 @@ func (c *StoreReconciler) ReconcileAll(ctx context.Context) error {
296299
return err
297300
}
298301

302+
func (c *StoreReconciler) reportHardLimitedPresets(snapshot *prebuilds.GlobalSnapshot) {
303+
// presetsMap is a map from key (orgName:templateName:presetName) to list of corresponding presets.
304+
// Multiple versions of a preset can exist with the same orgName, templateName, and presetName,
305+
// because templates can have multiple versions — or deleted templates can share the same name.
306+
presetsMap := make(map[hardLimitedPresetKey][]database.GetTemplatePresetsWithPrebuildsRow)
307+
for _, preset := range snapshot.Presets {
308+
key := hardLimitedPresetKey{
309+
orgName: preset.OrganizationName,
310+
templateName: preset.TemplateName,
311+
presetName: preset.Name,
312+
}
313+
314+
presetsMap[key] = append(presetsMap[key], preset)
315+
}
316+
317+
// Report a preset as hard-limited only if all the following conditions are met:
318+
// - The preset is marked as hard-limited
319+
// - The preset is using the active version of its template, and the template has not been deleted
320+
//
321+
// The second condition is important because a hard-limited preset that has become outdated is no longer relevant.
322+
// Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it
323+
// as hard-limited to the admin.
324+
//
325+
// This approach accounts for all relevant scenarios:
326+
// Scenario #1: The admin created a new template version with the same preset names.
327+
// Scenario #2: The admin created a new template version and renamed the presets.
328+
// Scenario #3: The admin deleted a template version that contained hard-limited presets.
329+
//
330+
// In all of these cases, only the latest and non-deleted presets will be reported.
331+
// All other presets will be ignored and eventually removed from Prometheus.
332+
isPresetHardLimited := make(map[hardLimitedPresetKey]bool)
333+
for key, presets := range presetsMap {
334+
for _, preset := range presets {
335+
if preset.UsingActiveVersion && !preset.Deleted && snapshot.IsHardLimited(preset.ID) {
336+
isPresetHardLimited[key] = true
337+
break
338+
}
339+
}
340+
}
341+
342+
c.metrics.registerHardLimitedPresets(isPresetHardLimited)
343+
}
344+
299345
// SnapshotState captures the current state of all prebuilds across templates.
300346
func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Store) (*prebuilds.GlobalSnapshot, error) {
301347
if err := ctx.Err(); err != nil {
@@ -361,24 +407,6 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
361407
slog.F("preset_name", ps.Preset.Name),
362408
)
363409

364-
// Report a metric only if the preset uses the latest version of the template and the template is not deleted.
365-
// This avoids conflicts between metrics from old and new template versions.
366-
//
367-
// NOTE: Multiple versions of a preset can exist with the same orgName, templateName, and presetName,
368-
// because templates can have multiple versions — or deleted templates can share the same name.
369-
//
370-
// The safest approach is to report the metric only for the latest version of the preset.
371-
// When a new template version is released, the metric for the new preset should overwrite
372-
// the old value in Prometheus.
373-
//
374-
// However, there’s one edge case: if an admin creates a template, it becomes hard-limited,
375-
// then deletes the template and never creates another with the same name,
376-
// the old preset will continue to be reported as hard-limited —
377-
// even though it’s deleted. This will persist until `coderd` is restarted.
378-
if ps.Preset.UsingActiveVersion && !ps.Preset.Deleted {
379-
c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, ps.IsHardLimited)
380-
}
381-
382410
// If the preset reached the hard failure limit for the first time during this iteration:
383411
// - Mark it as hard-limited in the database
384412
// - Send notifications to template admins

enterprise/coderd/prebuilds/reconcile_test.go

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1034,18 +1034,15 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
10341034
require.Equal(t, database.WorkspaceTransitionDelete, workspaceBuilds[0].Transition)
10351035
require.Equal(t, database.WorkspaceTransitionStart, workspaceBuilds[1].Transition)
10361036

1037-
// The metric is still set to 1, even though the preset has become outdated.
1038-
// This happens because the old value hasn't been overwritten by a newer preset yet.
1037+
// Metric is deleted after preset became outdated.
10391038
mf, err = registry.Gather()
10401039
require.NoError(t, err)
10411040
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
10421041
"template_name": template.Name,
10431042
"preset_name": preset.Name,
10441043
"org_name": org.Name,
10451044
})
1046-
require.NotNil(t, metric)
1047-
require.NotNil(t, metric.GetGauge())
1048-
require.EqualValues(t, 1, metric.GetGauge().GetValue())
1045+
require.Nil(t, metric)
10491046
})
10501047
}
10511048
}

0 commit comments

Comments
 (0)