Skip to content

fix: reimplement reporting of preset-hard-limited metric #18055

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 22 additions & 13 deletions coderd/prebuilds/global_snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ import (

// GlobalSnapshot represents a full point-in-time snapshot of state relating to prebuilds across all templates.
type GlobalSnapshot struct {
Presets []database.GetTemplatePresetsWithPrebuildsRow
RunningPrebuilds []database.GetRunningPrebuiltWorkspacesRow
PrebuildsInProgress []database.CountInProgressPrebuildsRow
Backoffs []database.GetPresetsBackoffRow
HardLimitedPresets []database.GetPresetsAtFailureLimitRow
Presets []database.GetTemplatePresetsWithPrebuildsRow
RunningPrebuilds []database.GetRunningPrebuiltWorkspacesRow
PrebuildsInProgress []database.CountInProgressPrebuildsRow
Backoffs []database.GetPresetsBackoffRow
HardLimitedPresetsMap map[uuid.UUID]database.GetPresetsAtFailureLimitRow
}

func NewGlobalSnapshot(
Expand All @@ -26,12 +26,17 @@ func NewGlobalSnapshot(
backoffs []database.GetPresetsBackoffRow,
hardLimitedPresets []database.GetPresetsAtFailureLimitRow,
) GlobalSnapshot {
hardLimitedPresetsMap := make(map[uuid.UUID]database.GetPresetsAtFailureLimitRow, len(hardLimitedPresets))
for _, preset := range hardLimitedPresets {
hardLimitedPresetsMap[preset.PresetID] = preset
}

return GlobalSnapshot{
Presets: presets,
RunningPrebuilds: runningPrebuilds,
PrebuildsInProgress: prebuildsInProgress,
Backoffs: backoffs,
HardLimitedPresets: hardLimitedPresets,
Presets: presets,
RunningPrebuilds: runningPrebuilds,
PrebuildsInProgress: prebuildsInProgress,
Backoffs: backoffs,
HardLimitedPresetsMap: hardLimitedPresetsMap,
}
}

Expand Down Expand Up @@ -66,9 +71,7 @@ func (s GlobalSnapshot) FilterByPreset(presetID uuid.UUID) (*PresetSnapshot, err
backoffPtr = &backoff
}

_, isHardLimited := slice.Find(s.HardLimitedPresets, func(row database.GetPresetsAtFailureLimitRow) bool {
return row.PresetID == preset.ID
})
_, isHardLimited := s.HardLimitedPresetsMap[preset.ID]

return &PresetSnapshot{
Preset: preset,
Expand All @@ -80,6 +83,12 @@ func (s GlobalSnapshot) FilterByPreset(presetID uuid.UUID) (*PresetSnapshot, err
}, nil
}

func (s GlobalSnapshot) IsHardLimited(presetID uuid.UUID) bool {
_, isHardLimited := s.HardLimitedPresetsMap[presetID]

return isHardLimited
}

// filterExpiredWorkspaces splits running workspaces into expired and non-expired
// based on the preset's TTL.
// If TTL is missing or zero, all workspaces are considered non-expired.
Expand Down
11 changes: 2 additions & 9 deletions enterprise/coderd/prebuilds/metricscollector.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,16 +280,9 @@ func (k hardLimitedPresetKey) String() string {
return fmt.Sprintf("%s:%s:%s", k.orgName, k.templateName, k.presetName)
}

// nolint:revive // isHardLimited determines if the preset should be reported as hard-limited in Prometheus.
func (mc *MetricsCollector) trackHardLimitedStatus(orgName, templateName, presetName string, isHardLimited bool) {
func (mc *MetricsCollector) registerHardLimitedPresets(isPresetHardLimited map[hardLimitedPresetKey]bool) {
mc.isPresetHardLimitedMu.Lock()
defer mc.isPresetHardLimitedMu.Unlock()

key := hardLimitedPresetKey{orgName: orgName, templateName: templateName, presetName: presetName}

if isHardLimited {
mc.isPresetHardLimited[key] = true
} else {
delete(mc.isPresetHardLimited, key)
}
mc.isPresetHardLimited = isPresetHardLimited
}
64 changes: 46 additions & 18 deletions enterprise/coderd/prebuilds/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,9 @@ func (c *StoreReconciler) ReconcileAll(ctx context.Context) error {
if err != nil {
return xerrors.Errorf("determine current snapshot: %w", err)
}

c.reportHardLimitedPresets(snapshot)

if len(snapshot.Presets) == 0 {
logger.Debug(ctx, "no templates found with prebuilds configured")
return nil
Expand Down Expand Up @@ -296,6 +299,49 @@ func (c *StoreReconciler) ReconcileAll(ctx context.Context) error {
return err
}

func (c *StoreReconciler) reportHardLimitedPresets(snapshot *prebuilds.GlobalSnapshot) {
// presetsMap is a map from key (orgName:templateName:presetName) to list of corresponding presets.
// Multiple versions of a preset can exist with the same orgName, templateName, and presetName,
// because templates can have multiple versions — or deleted templates can share the same name.
presetsMap := make(map[hardLimitedPresetKey][]database.GetTemplatePresetsWithPrebuildsRow)
for _, preset := range snapshot.Presets {
key := hardLimitedPresetKey{
orgName: preset.OrganizationName,
templateName: preset.TemplateName,
presetName: preset.Name,
}

presetsMap[key] = append(presetsMap[key], preset)
}

// Report a preset as hard-limited only if all the following conditions are met:
// - The preset is marked as hard-limited
// - The preset is using the active version of its template, and the template has not been deleted
//
// The second condition is important because a hard-limited preset that has become outdated is no longer relevant.
// Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it
// as hard-limited to the admin.
//
// This approach accounts for all relevant scenarios:
// Scenario #1: The admin created a new template version with the same preset names.
// Scenario #2: The admin created a new template version and renamed the presets.
// Scenario #3: The admin deleted a template version that contained hard-limited presets.
//
// In all of these cases, only the latest and non-deleted presets will be reported.
// All other presets will be ignored and eventually removed from Prometheus.
isPresetHardLimited := make(map[hardLimitedPresetKey]bool)
for key, presets := range presetsMap {
for _, preset := range presets {
if preset.UsingActiveVersion && !preset.Deleted && snapshot.IsHardLimited(preset.ID) {
isPresetHardLimited[key] = true
break
}
}
}

c.metrics.registerHardLimitedPresets(isPresetHardLimited)
}

// SnapshotState captures the current state of all prebuilds across templates.
func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Store) (*prebuilds.GlobalSnapshot, error) {
if err := ctx.Err(); err != nil {
Expand Down Expand Up @@ -361,24 +407,6 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
slog.F("preset_name", ps.Preset.Name),
)

// Report a metric only if the preset uses the latest version of the template and the template is not deleted.
// This avoids conflicts between metrics from old and new template versions.
//
// NOTE: Multiple versions of a preset can exist with the same orgName, templateName, and presetName,
// because templates can have multiple versions — or deleted templates can share the same name.
//
// The safest approach is to report the metric only for the latest version of the preset.
// When a new template version is released, the metric for the new preset should overwrite
// the old value in Prometheus.
//
// However, there’s one edge case: if an admin creates a template, it becomes hard-limited,
// then deletes the template and never creates another with the same name,
// the old preset will continue to be reported as hard-limited —
// even though it’s deleted. This will persist until `coderd` is restarted.
if ps.Preset.UsingActiveVersion && !ps.Preset.Deleted {
c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, ps.IsHardLimited)
}

// If the preset reached the hard failure limit for the first time during this iteration:
// - Mark it as hard-limited in the database
// - Send notifications to template admins
Expand Down
7 changes: 2 additions & 5 deletions enterprise/coderd/prebuilds/reconcile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1034,18 +1034,15 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
require.Equal(t, database.WorkspaceTransitionDelete, workspaceBuilds[0].Transition)
require.Equal(t, database.WorkspaceTransitionStart, workspaceBuilds[1].Transition)

// The metric is still set to 1, even though the preset has become outdated.
// This happens because the old value hasn't been overwritten by a newer preset yet.
// Metric is deleted after preset became outdated.
mf, err = registry.Gather()
require.NoError(t, err)
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
"template_name": template.Name,
"preset_name": preset.Name,
"org_name": org.Name,
})
require.NotNil(t, metric)
require.NotNil(t, metric.GetGauge())
require.EqualValues(t, 1, metric.GetGauge().GetValue())
require.Nil(t, metric)
})
}
}
Expand Down
Loading