Skip to content

fix: exclude deleted templates from metrics collection #17839

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 15, 2025
4 changes: 4 additions & 0 deletions coderd/database/queries.sql.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions coderd/database/queries/prebuilds.sql
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ WHERE w.id IN (
AND b.template_version_id = t.active_version_id
AND p.current_preset_id = @preset_id::uuid
AND p.ready
AND NOT t.deleted
LIMIT 1 FOR UPDATE OF p SKIP LOCKED -- Ensure that a concurrent request will not select the same prebuild.
)
RETURNING w.id, w.name;
Expand All @@ -40,6 +41,7 @@ FROM templates t
INNER JOIN template_version_presets tvp ON tvp.template_version_id = tv.id
INNER JOIN organizations o ON o.id = t.organization_id
WHERE tvp.desired_instances IS NOT NULL -- Consider only presets that have a prebuild configuration.
-- AND NOT t.deleted -- We don't exclude deleted templates because there's no constraint in the DB preventing a soft deletion on a template while workspaces are running.
AND (t.id = sqlc.narg('template_id')::uuid OR sqlc.narg('template_id') IS NULL);

-- name: GetRunningPrebuiltWorkspaces :many
Expand Down Expand Up @@ -70,6 +72,7 @@ FROM workspace_latest_builds wlb
-- prebuilds that are still building.
INNER JOIN templates t ON t.active_version_id = wlb.template_version_id
WHERE wlb.job_status IN ('pending'::provisioner_job_status, 'running'::provisioner_job_status)
-- AND NOT t.deleted -- We don't exclude deleted templates because there's no constraint in the DB preventing a soft deletion on a template while workspaces are running.
GROUP BY t.id, wpb.template_version_id, wpb.transition, wlb.template_version_preset_id;

-- GetPresetsBackoff groups workspace builds by preset ID.
Expand Down Expand Up @@ -98,6 +101,7 @@ WITH filtered_builds AS (
WHERE tvp.desired_instances IS NOT NULL -- Consider only presets that have a prebuild configuration.
AND wlb.transition = 'start'::workspace_transition
AND w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0'
AND NOT t.deleted
),
time_sorted_builds AS (
-- Group builds by preset, then sort each group by created_at.
Expand Down
4 changes: 4 additions & 0 deletions enterprise/coderd/prebuilds/metricscollector.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,10 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
continue
}

if preset.Deleted {
continue
}

presetSnapshot, err := currentState.snapshot.FilterByPreset(preset.ID)
if err != nil {
mc.logger.Error(context.Background(), "failed to filter by preset", slog.Error(err))
Expand Down
189 changes: 168 additions & 21 deletions enterprise/coderd/prebuilds/metricscollector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbgen"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
"github.com/coder/coder/v2/coderd/database/dbtime"
agplprebuilds "github.com/coder/coder/v2/coderd/prebuilds"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/enterprise/coderd/prebuilds"
Expand Down Expand Up @@ -165,27 +166,12 @@ func TestMetricsCollector(t *testing.T) {
eligible: []bool{false},
},
{
name: "deleted templates never desire prebuilds",
transitions: allTransitions,
jobStatuses: allJobStatuses,
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()},
metrics: []metricCheck{
{prebuilds.MetricDesiredGauge, ptr.To(0.0), false},
},
templateDeleted: []bool{true},
eligible: []bool{false},
},
{
name: "running prebuilds for deleted templates are still counted, so that they can be deleted",
transitions: []database.WorkspaceTransition{database.WorkspaceTransitionStart},
jobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded},
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID},
metrics: []metricCheck{
{prebuilds.MetricRunningGauge, ptr.To(1.0), false},
{prebuilds.MetricEligibleGauge, ptr.To(0.0), false},
},
name: "deleted templates should not be included in exported metrics",
transitions: allTransitions,
jobStatuses: allJobStatuses,
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()},
metrics: nil,
templateDeleted: []bool{true},
eligible: []bool{false},
},
Expand Down Expand Up @@ -281,6 +267,12 @@ func TestMetricsCollector(t *testing.T) {
"organization_name": org.Name,
}

// If no expected metrics have been defined, ensure we don't find any metric series (i.e. metrics with given labels).
if test.metrics == nil {
series := findAllMetricSeries(metricsFamilies, labels)
require.Empty(t, series)
}

for _, check := range test.metrics {
metric := findMetric(metricsFamilies, check.name, labels)
if check.value == nil {
Expand All @@ -307,6 +299,131 @@ func TestMetricsCollector(t *testing.T) {
}
}

// TestMetricsCollector_DuplicateTemplateNames validates a bug that we saw previously which caused duplicate metric series
// registration when a template was deleted and a new one created with the same name (and preset name).
// We are now excluding deleted templates from our metric collection.
func TestMetricsCollector_DuplicateTemplateNames(t *testing.T) {
t.Parallel()

if !dbtestutil.WillUsePostgres() {
t.Skip("this test requires postgres")
}

type metricCheck struct {
name string
value *float64
isCounter bool
}

type testCase struct {
transition database.WorkspaceTransition
jobStatus database.ProvisionerJobStatus
initiatorID uuid.UUID
ownerID uuid.UUID
metrics []metricCheck
eligible bool
}

test := testCase{
transition: database.WorkspaceTransitionStart,
jobStatus: database.ProvisionerJobStatusSucceeded,
initiatorID: agplprebuilds.SystemUserID,
ownerID: agplprebuilds.SystemUserID,
metrics: []metricCheck{
{prebuilds.MetricCreatedCount, ptr.To(1.0), true},
{prebuilds.MetricClaimedCount, ptr.To(0.0), true},
{prebuilds.MetricFailedCount, ptr.To(0.0), true},
{prebuilds.MetricDesiredGauge, ptr.To(1.0), false},
{prebuilds.MetricRunningGauge, ptr.To(1.0), false},
{prebuilds.MetricEligibleGauge, ptr.To(1.0), false},
},
eligible: true,
}

logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
clock := quartz.NewMock(t)
db, pubsub := dbtestutil.NewDB(t)
reconciler := prebuilds.NewStoreReconciler(db, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry(), newNoopEnqueuer())
ctx := testutil.Context(t, testutil.WaitLong)

collector := prebuilds.NewMetricsCollector(db, logger, reconciler)
registry := prometheus.NewPedanticRegistry()
registry.Register(collector)

presetName := "default-preset"
defaultOrg := dbgen.Organization(t, db, database.Organization{})
setupTemplateWithDeps := func() database.Template {
template := setupTestDBTemplateWithinOrg(t, db, test.ownerID, false, "default-template", defaultOrg)
templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubsub, defaultOrg.ID, test.ownerID, template.ID)
preset := setupTestDBPreset(t, db, templateVersionID, 1, "default-preset")
workspace, _ := setupTestDBWorkspace(
t, clock, db, pubsub,
test.transition, test.jobStatus, defaultOrg.ID, preset, template.ID, templateVersionID, test.initiatorID, test.ownerID,
)
setupTestDBWorkspaceAgent(t, db, workspace.ID, test.eligible)
return template
}

// When: starting with a regular template.
template := setupTemplateWithDeps()
labels := map[string]string{
"template_name": template.Name,
"preset_name": presetName,
"organization_name": defaultOrg.Name,
}

// nolint:gocritic // Authz context needed to retrieve state.
ctx = dbauthz.AsPrebuildsOrchestrator(ctx)

// Then: metrics collect successfully.
require.NoError(t, collector.UpdateState(ctx, testutil.WaitLong))
metricsFamilies, err := registry.Gather()
require.NoError(t, err)
require.NotEmpty(t, findAllMetricSeries(metricsFamilies, labels))

// When: the template is deleted.
require.NoError(t, db.UpdateTemplateDeletedByID(ctx, database.UpdateTemplateDeletedByIDParams{
ID: template.ID,
Deleted: true,
UpdatedAt: dbtime.Now(),
}))

// Then: metrics collect successfully but are empty because the template is deleted.
require.NoError(t, collector.UpdateState(ctx, testutil.WaitLong))
metricsFamilies, err = registry.Gather()
require.NoError(t, err)
require.Empty(t, findAllMetricSeries(metricsFamilies, labels))

// When: a new template is created with the same name as the deleted template.
newTemplate := setupTemplateWithDeps()

// Ensure the database has both the new and old (delete) template.
{
deleted, err := db.GetTemplateByOrganizationAndName(ctx, database.GetTemplateByOrganizationAndNameParams{
OrganizationID: template.OrganizationID,
Deleted: true,
Name: template.Name,
})
require.NoError(t, err)
require.Equal(t, template.ID, deleted.ID)

current, err := db.GetTemplateByOrganizationAndName(ctx, database.GetTemplateByOrganizationAndNameParams{
// Use details from deleted template to ensure they're aligned.
OrganizationID: template.OrganizationID,
Deleted: false,
Name: template.Name,
})
require.NoError(t, err)
require.Equal(t, newTemplate.ID, current.ID)
}

// Then: metrics collect successfully.
require.NoError(t, collector.UpdateState(ctx, testutil.WaitLong))
metricsFamilies, err = registry.Gather()
require.NoError(t, err)
require.NotEmpty(t, findAllMetricSeries(metricsFamilies, labels))
}

func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string, labels map[string]string) *prometheus_client.Metric {
for _, metricFamily := range metricsFamilies {
if metricFamily.GetName() != name {
Expand Down Expand Up @@ -334,3 +451,33 @@ func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string,
}
return nil
}

// findAllMetricSeries finds all metrics with a given set of labels.
func findAllMetricSeries(metricsFamilies []*prometheus_client.MetricFamily, labels map[string]string) map[string]*prometheus_client.Metric {
series := make(map[string]*prometheus_client.Metric)
for _, metricFamily := range metricsFamilies {
for _, metric := range metricFamily.GetMetric() {
labelPairs := metric.GetLabel()

if len(labelPairs) != len(labels) {
continue
}

// Convert label pairs to map for easier lookup
metricLabels := make(map[string]string, len(labelPairs))
for _, label := range labelPairs {
metricLabels[label.GetName()] = label.GetValue()
}

// Check if all requested labels match
for wantName, wantValue := range labels {
if metricLabels[wantName] != wantValue {
continue
}
}

series[metricFamily.GetName()] = metric
}
}
return series
}
34 changes: 33 additions & 1 deletion enterprise/coderd/prebuilds/reconcile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -294,10 +294,15 @@ func TestPrebuildReconciliation(t *testing.T) {
templateDeleted: []bool{false},
},
{
name: "delete prebuilds for deleted templates",
// Templates can be soft-deleted (`deleted=true`) or hard-deleted (row is removed).
// On the former there is *no* DB constraint to prevent soft deletion, so we have to ensure that if somehow
// the template was soft-deleted any running prebuilds will be removed.
// On the latter there is a DB constraint to prevent row deletion if any workspaces reference the deleting template.
name: "soft-deleted templates MAY have prebuilds",
prebuildLatestTransitions: []database.WorkspaceTransition{database.WorkspaceTransitionStart},
prebuildJobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded},
templateVersionActive: []bool{true, false},
shouldCreateNewPrebuild: ptr.To(false),
shouldDeleteOldPrebuild: ptr.To(true),
templateDeleted: []bool{true},
},
Expand Down Expand Up @@ -1060,6 +1065,33 @@ func setupTestDBTemplate(
return org, template
}

// nolint:revive // It's a control flag, but this is a test.
func setupTestDBTemplateWithinOrg(
t *testing.T,
db database.Store,
userID uuid.UUID,
templateDeleted bool,
templateName string,
org database.Organization,
) database.Template {
t.Helper()

template := dbgen.Template(t, db, database.Template{
Name: templateName,
CreatedBy: userID,
OrganizationID: org.ID,
CreatedAt: time.Now().Add(muchEarlier),
})
if templateDeleted {
ctx := testutil.Context(t, testutil.WaitShort)
require.NoError(t, db.UpdateTemplateDeletedByID(ctx, database.UpdateTemplateDeletedByIDParams{
ID: template.ID,
Deleted: true,
}))
}
return template
}

const (
earlier = -time.Hour
muchEarlier = -time.Hour * 2
Expand Down
Loading