Skip to content

Commit cef7313

Browse files
fix: exclude deleted templates from metrics collection (#17839)
Also add some clarification about the lack of database constraints for soft template deletion. --------- Signed-off-by: Danny Kopping <dannykopping@gmail.com> Co-authored-by: Danny Kopping <dannykopping@gmail.com>
1 parent fd0a84d commit cef7313

File tree

5 files changed

+213
-22
lines changed

5 files changed

+213
-22
lines changed

coderd/database/queries.sql.go

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/prebuilds.sql

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ WHERE w.id IN (
1515
AND b.template_version_id = t.active_version_id
1616
AND p.current_preset_id = @preset_id::uuid
1717
AND p.ready
18+
AND NOT t.deleted
1819
LIMIT 1 FOR UPDATE OF p SKIP LOCKED -- Ensure that a concurrent request will not select the same prebuild.
1920
)
2021
RETURNING w.id, w.name;
@@ -40,6 +41,7 @@ FROM templates t
4041
INNER JOIN template_version_presets tvp ON tvp.template_version_id = tv.id
4142
INNER JOIN organizations o ON o.id = t.organization_id
4243
WHERE tvp.desired_instances IS NOT NULL -- Consider only presets that have a prebuild configuration.
44+
-- AND NOT t.deleted -- We don't exclude deleted templates because there's no constraint in the DB preventing a soft deletion on a template while workspaces are running.
4345
AND (t.id = sqlc.narg('template_id')::uuid OR sqlc.narg('template_id') IS NULL);
4446

4547
-- name: GetRunningPrebuiltWorkspaces :many
@@ -70,6 +72,7 @@ FROM workspace_latest_builds wlb
7072
-- prebuilds that are still building.
7173
INNER JOIN templates t ON t.active_version_id = wlb.template_version_id
7274
WHERE wlb.job_status IN ('pending'::provisioner_job_status, 'running'::provisioner_job_status)
75+
-- AND NOT t.deleted -- We don't exclude deleted templates because there's no constraint in the DB preventing a soft deletion on a template while workspaces are running.
7376
GROUP BY t.id, wpb.template_version_id, wpb.transition, wlb.template_version_preset_id;
7477

7578
-- GetPresetsBackoff groups workspace builds by preset ID.
@@ -98,6 +101,7 @@ WITH filtered_builds AS (
98101
WHERE tvp.desired_instances IS NOT NULL -- Consider only presets that have a prebuild configuration.
99102
AND wlb.transition = 'start'::workspace_transition
100103
AND w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0'
104+
AND NOT t.deleted
101105
),
102106
time_sorted_builds AS (
103107
-- Group builds by preset, then sort each group by created_at.

enterprise/coderd/prebuilds/metricscollector.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,10 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
157157
continue
158158
}
159159

160+
if preset.Deleted {
161+
continue
162+
}
163+
160164
presetSnapshot, err := currentState.snapshot.FilterByPreset(preset.ID)
161165
if err != nil {
162166
mc.logger.Error(context.Background(), "failed to filter by preset", slog.Error(err))

enterprise/coderd/prebuilds/metricscollector_test.go

Lines changed: 168 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/coder/coder/v2/coderd/database/dbauthz"
2020
"github.com/coder/coder/v2/coderd/database/dbgen"
2121
"github.com/coder/coder/v2/coderd/database/dbtestutil"
22+
"github.com/coder/coder/v2/coderd/database/dbtime"
2223
agplprebuilds "github.com/coder/coder/v2/coderd/prebuilds"
2324
"github.com/coder/coder/v2/codersdk"
2425
"github.com/coder/coder/v2/enterprise/coderd/prebuilds"
@@ -165,27 +166,12 @@ func TestMetricsCollector(t *testing.T) {
165166
eligible: []bool{false},
166167
},
167168
{
168-
name: "deleted templates never desire prebuilds",
169-
transitions: allTransitions,
170-
jobStatuses: allJobStatuses,
171-
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
172-
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()},
173-
metrics: []metricCheck{
174-
{prebuilds.MetricDesiredGauge, ptr.To(0.0), false},
175-
},
176-
templateDeleted: []bool{true},
177-
eligible: []bool{false},
178-
},
179-
{
180-
name: "running prebuilds for deleted templates are still counted, so that they can be deleted",
181-
transitions: []database.WorkspaceTransition{database.WorkspaceTransitionStart},
182-
jobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded},
183-
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
184-
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID},
185-
metrics: []metricCheck{
186-
{prebuilds.MetricRunningGauge, ptr.To(1.0), false},
187-
{prebuilds.MetricEligibleGauge, ptr.To(0.0), false},
188-
},
169+
name: "deleted templates should not be included in exported metrics",
170+
transitions: allTransitions,
171+
jobStatuses: allJobStatuses,
172+
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
173+
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()},
174+
metrics: nil,
189175
templateDeleted: []bool{true},
190176
eligible: []bool{false},
191177
},
@@ -281,6 +267,12 @@ func TestMetricsCollector(t *testing.T) {
281267
"organization_name": org.Name,
282268
}
283269

270+
// If no expected metrics have been defined, ensure we don't find any metric series (i.e. metrics with given labels).
271+
if test.metrics == nil {
272+
series := findAllMetricSeries(metricsFamilies, labels)
273+
require.Empty(t, series)
274+
}
275+
284276
for _, check := range test.metrics {
285277
metric := findMetric(metricsFamilies, check.name, labels)
286278
if check.value == nil {
@@ -307,6 +299,131 @@ func TestMetricsCollector(t *testing.T) {
307299
}
308300
}
309301

302+
// TestMetricsCollector_DuplicateTemplateNames validates a bug that we saw previously which caused duplicate metric series
303+
// registration when a template was deleted and a new one created with the same name (and preset name).
304+
// We are now excluding deleted templates from our metric collection.
305+
func TestMetricsCollector_DuplicateTemplateNames(t *testing.T) {
306+
t.Parallel()
307+
308+
if !dbtestutil.WillUsePostgres() {
309+
t.Skip("this test requires postgres")
310+
}
311+
312+
type metricCheck struct {
313+
name string
314+
value *float64
315+
isCounter bool
316+
}
317+
318+
type testCase struct {
319+
transition database.WorkspaceTransition
320+
jobStatus database.ProvisionerJobStatus
321+
initiatorID uuid.UUID
322+
ownerID uuid.UUID
323+
metrics []metricCheck
324+
eligible bool
325+
}
326+
327+
test := testCase{
328+
transition: database.WorkspaceTransitionStart,
329+
jobStatus: database.ProvisionerJobStatusSucceeded,
330+
initiatorID: agplprebuilds.SystemUserID,
331+
ownerID: agplprebuilds.SystemUserID,
332+
metrics: []metricCheck{
333+
{prebuilds.MetricCreatedCount, ptr.To(1.0), true},
334+
{prebuilds.MetricClaimedCount, ptr.To(0.0), true},
335+
{prebuilds.MetricFailedCount, ptr.To(0.0), true},
336+
{prebuilds.MetricDesiredGauge, ptr.To(1.0), false},
337+
{prebuilds.MetricRunningGauge, ptr.To(1.0), false},
338+
{prebuilds.MetricEligibleGauge, ptr.To(1.0), false},
339+
},
340+
eligible: true,
341+
}
342+
343+
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
344+
clock := quartz.NewMock(t)
345+
db, pubsub := dbtestutil.NewDB(t)
346+
reconciler := prebuilds.NewStoreReconciler(db, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry(), newNoopEnqueuer())
347+
ctx := testutil.Context(t, testutil.WaitLong)
348+
349+
collector := prebuilds.NewMetricsCollector(db, logger, reconciler)
350+
registry := prometheus.NewPedanticRegistry()
351+
registry.Register(collector)
352+
353+
presetName := "default-preset"
354+
defaultOrg := dbgen.Organization(t, db, database.Organization{})
355+
setupTemplateWithDeps := func() database.Template {
356+
template := setupTestDBTemplateWithinOrg(t, db, test.ownerID, false, "default-template", defaultOrg)
357+
templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubsub, defaultOrg.ID, test.ownerID, template.ID)
358+
preset := setupTestDBPreset(t, db, templateVersionID, 1, "default-preset")
359+
workspace, _ := setupTestDBWorkspace(
360+
t, clock, db, pubsub,
361+
test.transition, test.jobStatus, defaultOrg.ID, preset, template.ID, templateVersionID, test.initiatorID, test.ownerID,
362+
)
363+
setupTestDBWorkspaceAgent(t, db, workspace.ID, test.eligible)
364+
return template
365+
}
366+
367+
// When: starting with a regular template.
368+
template := setupTemplateWithDeps()
369+
labels := map[string]string{
370+
"template_name": template.Name,
371+
"preset_name": presetName,
372+
"organization_name": defaultOrg.Name,
373+
}
374+
375+
// nolint:gocritic // Authz context needed to retrieve state.
376+
ctx = dbauthz.AsPrebuildsOrchestrator(ctx)
377+
378+
// Then: metrics collect successfully.
379+
require.NoError(t, collector.UpdateState(ctx, testutil.WaitLong))
380+
metricsFamilies, err := registry.Gather()
381+
require.NoError(t, err)
382+
require.NotEmpty(t, findAllMetricSeries(metricsFamilies, labels))
383+
384+
// When: the template is deleted.
385+
require.NoError(t, db.UpdateTemplateDeletedByID(ctx, database.UpdateTemplateDeletedByIDParams{
386+
ID: template.ID,
387+
Deleted: true,
388+
UpdatedAt: dbtime.Now(),
389+
}))
390+
391+
// Then: metrics collect successfully but are empty because the template is deleted.
392+
require.NoError(t, collector.UpdateState(ctx, testutil.WaitLong))
393+
metricsFamilies, err = registry.Gather()
394+
require.NoError(t, err)
395+
require.Empty(t, findAllMetricSeries(metricsFamilies, labels))
396+
397+
// When: a new template is created with the same name as the deleted template.
398+
newTemplate := setupTemplateWithDeps()
399+
400+
// Ensure the database has both the new and old (delete) template.
401+
{
402+
deleted, err := db.GetTemplateByOrganizationAndName(ctx, database.GetTemplateByOrganizationAndNameParams{
403+
OrganizationID: template.OrganizationID,
404+
Deleted: true,
405+
Name: template.Name,
406+
})
407+
require.NoError(t, err)
408+
require.Equal(t, template.ID, deleted.ID)
409+
410+
current, err := db.GetTemplateByOrganizationAndName(ctx, database.GetTemplateByOrganizationAndNameParams{
411+
// Use details from deleted template to ensure they're aligned.
412+
OrganizationID: template.OrganizationID,
413+
Deleted: false,
414+
Name: template.Name,
415+
})
416+
require.NoError(t, err)
417+
require.Equal(t, newTemplate.ID, current.ID)
418+
}
419+
420+
// Then: metrics collect successfully.
421+
require.NoError(t, collector.UpdateState(ctx, testutil.WaitLong))
422+
metricsFamilies, err = registry.Gather()
423+
require.NoError(t, err)
424+
require.NotEmpty(t, findAllMetricSeries(metricsFamilies, labels))
425+
}
426+
310427
func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string, labels map[string]string) *prometheus_client.Metric {
311428
for _, metricFamily := range metricsFamilies {
312429
if metricFamily.GetName() != name {
@@ -334,3 +451,33 @@ func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string,
334451
}
335452
return nil
336453
}
454+
455+
// findAllMetricSeries finds all metrics with a given set of labels.
456+
func findAllMetricSeries(metricsFamilies []*prometheus_client.MetricFamily, labels map[string]string) map[string]*prometheus_client.Metric {
457+
series := make(map[string]*prometheus_client.Metric)
458+
for _, metricFamily := range metricsFamilies {
459+
for _, metric := range metricFamily.GetMetric() {
460+
labelPairs := metric.GetLabel()
461+
462+
if len(labelPairs) != len(labels) {
463+
continue
464+
}
465+
466+
// Convert label pairs to map for easier lookup
467+
metricLabels := make(map[string]string, len(labelPairs))
468+
for _, label := range labelPairs {
469+
metricLabels[label.GetName()] = label.GetValue()
470+
}
471+
472+
// Check if all requested labels match
473+
for wantName, wantValue := range labels {
474+
if metricLabels[wantName] != wantValue {
475+
continue
476+
}
477+
}
478+
479+
series[metricFamily.GetName()] = metric
480+
}
481+
}
482+
return series
483+
}

enterprise/coderd/prebuilds/reconcile_test.go

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,10 +294,15 @@ func TestPrebuildReconciliation(t *testing.T) {
294294
templateDeleted: []bool{false},
295295
},
296296
{
297-
name: "delete prebuilds for deleted templates",
297+
// Templates can be soft-deleted (`deleted=true`) or hard-deleted (row is removed).
298+
// On the former there is *no* DB constraint to prevent soft deletion, so we have to ensure that if somehow
299+
// the template was soft-deleted any running prebuilds will be removed.
300+
// On the latter there is a DB constraint to prevent row deletion if any workspaces reference the deleting template.
301+
name: "soft-deleted templates MAY have prebuilds",
298302
prebuildLatestTransitions: []database.WorkspaceTransition{database.WorkspaceTransitionStart},
299303
prebuildJobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded},
300304
templateVersionActive: []bool{true, false},
305+
shouldCreateNewPrebuild: ptr.To(false),
301306
shouldDeleteOldPrebuild: ptr.To(true),
302307
templateDeleted: []bool{true},
303308
},
@@ -1060,6 +1065,33 @@ func setupTestDBTemplate(
10601065
return org, template
10611066
}
10621067

1068+
// nolint:revive // It's a control flag, but this is a test.
1069+
func setupTestDBTemplateWithinOrg(
1070+
t *testing.T,
1071+
db database.Store,
1072+
userID uuid.UUID,
1073+
templateDeleted bool,
1074+
templateName string,
1075+
org database.Organization,
1076+
) database.Template {
1077+
t.Helper()
1078+
1079+
template := dbgen.Template(t, db, database.Template{
1080+
Name: templateName,
1081+
CreatedBy: userID,
1082+
OrganizationID: org.ID,
1083+
CreatedAt: time.Now().Add(muchEarlier),
1084+
})
1085+
if templateDeleted {
1086+
ctx := testutil.Context(t, testutil.WaitShort)
1087+
require.NoError(t, db.UpdateTemplateDeletedByID(ctx, database.UpdateTemplateDeletedByIDParams{
1088+
ID: template.ID,
1089+
Deleted: true,
1090+
}))
1091+
}
1092+
return template
1093+
}
1094+
10631095
const (
10641096
earlier = -time.Hour
10651097
muchEarlier = -time.Hour * 2

0 commit comments

Comments
 (0)