From e8ff926abf2725a5a4386a8abfc8ed6d643ef394 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Wed, 23 Apr 2025 17:03:34 +0200 Subject: [PATCH 1/6] Add metrics collector Signed-off-by: Danny Kopping Reverting unnecessary changes that crept in Signed-off-by: Danny Kopping --- coderd/prebuilds/api.go | 13 + enterprise/coderd/coderd.go | 2 +- enterprise/coderd/prebuilds/claim_test.go | 3 +- .../coderd/prebuilds/metricscollector.go | 126 ++++++++ .../coderd/prebuilds/metricscollector_test.go | 284 ++++++++++++++++++ enterprise/coderd/prebuilds/reconcile.go | 51 +++- enterprise/coderd/prebuilds/reconcile_test.go | 18 +- 7 files changed, 473 insertions(+), 24 deletions(-) create mode 100644 enterprise/coderd/prebuilds/metricscollector.go create mode 100644 enterprise/coderd/prebuilds/metricscollector_test.go diff --git a/coderd/prebuilds/api.go b/coderd/prebuilds/api.go index ba174d318d5fa..2342da5d62c07 100644 --- a/coderd/prebuilds/api.go +++ b/coderd/prebuilds/api.go @@ -5,6 +5,8 @@ import ( "github.com/google/uuid" "golang.org/x/xerrors" + + "github.com/coder/coder/v2/coderd/database" ) var ErrNoClaimablePrebuiltWorkspaces = xerrors.New("no claimable prebuilt workspaces found") @@ -25,12 +27,23 @@ type ReconciliationOrchestrator interface { } type Reconciler interface { + StateSnapshotter + // ReconcileAll orchestrates the reconciliation of all prebuilds across all templates. // It takes a global snapshot of the system state and then reconciles each preset // in parallel, creating or deleting prebuilds as needed to reach their desired states. ReconcileAll(ctx context.Context) error } +// StateSnapshotter defines the operations necessary to capture workspace prebuilds state. +type StateSnapshotter interface { + // SnapshotState captures the current state of all prebuilds across templates. + // It creates a global database snapshot that can be viewed as a collection of PresetSnapshots, + // each representing the state of prebuilds for a specific preset. + // MUST be called inside a repeatable-read transaction. + SnapshotState(ctx context.Context, store database.Store) (*GlobalSnapshot, error) +} + type Claimer interface { Claim(ctx context.Context, userID uuid.UUID, name string, presetID uuid.UUID) (*uuid.UUID, error) Initiator() uuid.UUID diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 1f468997ac220..ca3531b60db78 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -1165,6 +1165,6 @@ func (api *API) setupPrebuilds(featureEnabled bool) (agplprebuilds.Reconciliatio } reconciler := prebuilds.NewStoreReconciler(api.Database, api.Pubsub, api.DeploymentValues.Prebuilds, - api.Logger.Named("prebuilds"), quartz.NewReal()) + api.Logger.Named("prebuilds"), quartz.NewReal(), api.PrometheusRegistry) return reconciler, prebuilds.EnterpriseClaimer{} } diff --git a/enterprise/coderd/prebuilds/claim_test.go b/enterprise/coderd/prebuilds/claim_test.go index 4f398724b8265..e8d66fae18a07 100644 --- a/enterprise/coderd/prebuilds/claim_test.go +++ b/enterprise/coderd/prebuilds/claim_test.go @@ -10,6 +10,7 @@ import ( "time" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/require" "golang.org/x/xerrors" @@ -142,7 +143,7 @@ func TestClaimPrebuild(t *testing.T) { EntitlementsUpdateInterval: time.Second, }) - reconciler := prebuilds.NewStoreReconciler(spy, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t)) + reconciler := prebuilds.NewStoreReconciler(spy, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry()) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer(spy) api.AGPL.PrebuildsClaimer.Store(&claimer) diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go new file mode 100644 index 0000000000000..507dbddd383ef --- /dev/null +++ b/enterprise/coderd/prebuilds/metricscollector.go @@ -0,0 +1,126 @@ +package prebuilds + +import ( + "context" + "time" + + "cdr.dev/slog" + + "github.com/prometheus/client_golang/prometheus" + + "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/database/dbauthz" + "github.com/coder/coder/v2/coderd/prebuilds" +) + +var ( + labels = []string{"template_name", "preset_name", "organization_name"} + createdPrebuildsDesc = prometheus.NewDesc( + "coderd_prebuilds_created_total", + "The number of prebuilds that have been created to meet the desired count set by presets.", + labels, + nil, + ) + failedPrebuildsDesc = prometheus.NewDesc( + "coderd_prebuilds_failed_total", + "The number of prebuilds that failed to build during creation.", + labels, + nil, + ) + claimedPrebuildsDesc = prometheus.NewDesc( + "coderd_prebuilds_claimed_total", + "The number of prebuilds that were claimed by a user. Each count means that a user created a workspace using a preset and was assigned a prebuild instead of a brand new workspace.", + labels, + nil, + ) + usedPresetsDesc = prometheus.NewDesc( + "coderd_prebuilds_used_presets", + "The number of times a preset was used to build a prebuild.", + labels, + nil, + ) + desiredPrebuildsDesc = prometheus.NewDesc( + "coderd_prebuilds_desired", + "The number of prebuilds desired by each preset of each template.", + labels, + nil, + ) + runningPrebuildsDesc = prometheus.NewDesc( + "coderd_prebuilds_running", + "The number of prebuilds that are currently running. Running prebuilds have successfully started, but they may not be ready to be claimed by a user yet.", + labels, + nil, + ) + eligiblePrebuildsDesc = prometheus.NewDesc( + "coderd_prebuilds_eligible", + "The number of eligible prebuilds. Eligible prebuilds are prebuilds that are ready to be claimed by a user.", + labels, + nil, + ) +) + +type MetricsCollector struct { + database database.Store + logger slog.Logger + snapshotter prebuilds.StateSnapshotter +} + +var _ prometheus.Collector = new(MetricsCollector) + +func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector { + return &MetricsCollector{ + database: db, + logger: logger.Named("prebuilds_metrics_collector"), + snapshotter: snapshotter, + } +} + +func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) { + descCh <- createdPrebuildsDesc + descCh <- failedPrebuildsDesc + descCh <- claimedPrebuildsDesc + descCh <- usedPresetsDesc + descCh <- desiredPrebuildsDesc + descCh <- runningPrebuildsDesc + descCh <- eligiblePrebuildsDesc +} + +func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { + ctx, cancel := context.WithTimeout(dbauthz.AsPrebuildsOrchestrator(context.Background()), 10*time.Second) + defer cancel() + // nolint:gocritic // just until we get back to this + prebuildMetrics, err := mc.database.GetPrebuildMetrics(ctx) + if err != nil { + mc.logger.Error(ctx, "failed to get prebuild metrics", slog.Error(err)) + return + } + + for _, metric := range prebuildMetrics { + metricsCh <- prometheus.MustNewConstMetric(createdPrebuildsDesc, prometheus.CounterValue, float64(metric.CreatedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) + metricsCh <- prometheus.MustNewConstMetric(failedPrebuildsDesc, prometheus.CounterValue, float64(metric.FailedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) + metricsCh <- prometheus.MustNewConstMetric(claimedPrebuildsDesc, prometheus.CounterValue, float64(metric.ClaimedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) + } + + snapshot, err := mc.snapshotter.SnapshotState(ctx, mc.database) + if err != nil { + mc.logger.Error(ctx, "failed to get latest prebuild state", slog.Error(err)) + return + } + + for _, preset := range snapshot.Presets { + if !preset.UsingActiveVersion { + continue + } + + presetSnapshot, err := snapshot.FilterByPreset(preset.ID) + if err != nil { + mc.logger.Error(ctx, "failed to filter by preset", slog.Error(err)) + continue + } + state := presetSnapshot.CalculateState() + + metricsCh <- prometheus.MustNewConstMetric(desiredPrebuildsDesc, prometheus.GaugeValue, float64(state.Desired), preset.TemplateName, preset.Name, preset.OrganizationName) + metricsCh <- prometheus.MustNewConstMetric(runningPrebuildsDesc, prometheus.GaugeValue, float64(state.Actual), preset.TemplateName, preset.Name, preset.OrganizationName) + metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName) + } +} diff --git a/enterprise/coderd/prebuilds/metricscollector_test.go b/enterprise/coderd/prebuilds/metricscollector_test.go new file mode 100644 index 0000000000000..0387ecea5e1db --- /dev/null +++ b/enterprise/coderd/prebuilds/metricscollector_test.go @@ -0,0 +1,284 @@ +package prebuilds_test + +import ( + "fmt" + "slices" + "testing" + + "github.com/google/uuid" + "github.com/stretchr/testify/require" + "tailscale.com/types/ptr" + + "github.com/prometheus/client_golang/prometheus" + prometheus_client "github.com/prometheus/client_model/go" + + "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/quartz" + + "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/database/dbgen" + "github.com/coder/coder/v2/coderd/database/dbtestutil" + agplprebuilds "github.com/coder/coder/v2/coderd/prebuilds" + "github.com/coder/coder/v2/codersdk" + "github.com/coder/coder/v2/enterprise/coderd/prebuilds" + "github.com/coder/coder/v2/testutil" +) + +func TestMetricsCollector(t *testing.T) { + t.Parallel() + + if !dbtestutil.WillUsePostgres() { + t.Skip("this test requires postgres") + } + + type metricCheck struct { + name string + value *float64 + isCounter bool + } + + type testCase struct { + name string + transitions []database.WorkspaceTransition + jobStatuses []database.ProvisionerJobStatus + initiatorIDs []uuid.UUID + ownerIDs []uuid.UUID + metrics []metricCheck + templateDeleted []bool + } + + tests := []testCase{ + { + name: "prebuild created", + transitions: allTransitions, + jobStatuses: allJobStatuses, + initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + // TODO: reexamine and refactor the test cases and assertions: + // * a running prebuild that is not elibible to be claimed currently seems to be eligible. + // * a prebuild that was claimed should not be deemed running, not eligible. + ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()}, + metrics: []metricCheck{ + {"coderd_prebuilds_created_total", ptr.To(1.0), true}, + {"coderd_prebuilds_desired", ptr.To(1.0), false}, + // {"coderd_prebuilds_running", ptr.To(0.0), false}, + // {"coderd_prebuilds_eligible", ptr.To(0.0), false}, + }, + templateDeleted: []bool{false}, + }, + { + name: "prebuild running", + transitions: []database.WorkspaceTransition{database.WorkspaceTransitionStart}, + jobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded}, + initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + metrics: []metricCheck{ + {"coderd_prebuilds_created_total", ptr.To(1.0), true}, + {"coderd_prebuilds_desired", ptr.To(1.0), false}, + {"coderd_prebuilds_running", ptr.To(1.0), false}, + {"coderd_prebuilds_eligible", ptr.To(0.0), false}, + }, + templateDeleted: []bool{false}, + }, + { + name: "prebuild failed", + transitions: allTransitions, + jobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusFailed}, + initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()}, + metrics: []metricCheck{ + {"coderd_prebuilds_created_total", ptr.To(1.0), true}, + {"coderd_prebuilds_failed_total", ptr.To(1.0), true}, + {"coderd_prebuilds_desired", ptr.To(1.0), false}, + {"coderd_prebuilds_running", ptr.To(0.0), false}, + {"coderd_prebuilds_eligible", ptr.To(0.0), false}, + }, + templateDeleted: []bool{false}, + }, + { + name: "prebuild assigned", + transitions: allTransitions, + jobStatuses: allJobStatuses, + initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + ownerIDs: []uuid.UUID{uuid.New()}, + metrics: []metricCheck{ + {"coderd_prebuilds_created_total", ptr.To(1.0), true}, + {"coderd_prebuilds_claimed_total", ptr.To(1.0), true}, + {"coderd_prebuilds_desired", ptr.To(1.0), false}, + {"coderd_prebuilds_running", ptr.To(0.0), false}, + {"coderd_prebuilds_eligible", ptr.To(0.0), false}, + }, + templateDeleted: []bool{false}, + }, + { + name: "workspaces that were not created by the prebuilds user are not counted", + transitions: allTransitions, + jobStatuses: allJobStatuses, + initiatorIDs: []uuid.UUID{uuid.New()}, + ownerIDs: []uuid.UUID{uuid.New()}, + metrics: []metricCheck{ + {"coderd_prebuilds_desired", ptr.To(1.0), false}, + {"coderd_prebuilds_running", ptr.To(0.0), false}, + {"coderd_prebuilds_eligible", ptr.To(0.0), false}, + }, + templateDeleted: []bool{false}, + }, + { + name: "deleted templates never desire prebuilds", + transitions: allTransitions, + jobStatuses: allJobStatuses, + initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()}, + metrics: []metricCheck{ + {"coderd_prebuilds_desired", ptr.To(0.0), false}, + }, + templateDeleted: []bool{true}, + }, + { + name: "running prebuilds for deleted templates are still counted, so that they can be deleted", + transitions: []database.WorkspaceTransition{database.WorkspaceTransitionStart}, + jobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded}, + initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + metrics: []metricCheck{ + {"coderd_prebuilds_running", ptr.To(1.0), false}, + {"coderd_prebuilds_eligible", ptr.To(0.0), false}, + }, + templateDeleted: []bool{true}, + }, + } + for _, test := range tests { + test := test // capture for parallel + for _, transition := range test.transitions { + transition := transition // capture for parallel + for _, jobStatus := range test.jobStatuses { + jobStatus := jobStatus // capture for parallel + for _, initiatorID := range test.initiatorIDs { + initiatorID := initiatorID // capture for parallel + for _, ownerID := range test.ownerIDs { + ownerID := ownerID // capture for parallel + for _, templateDeleted := range test.templateDeleted { + templateDeleted := templateDeleted // capture for parallel + t.Run(fmt.Sprintf("%v/transition:%s/jobStatus:%s", test.name, transition, jobStatus), func(t *testing.T) { + t.Parallel() + + logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}) + t.Cleanup(func() { + if t.Failed() { + t.Logf("failed to run test: %s", test.name) + t.Logf("transition: %s", transition) + t.Logf("jobStatus: %s", jobStatus) + t.Logf("initiatorID: %s", initiatorID) + t.Logf("ownerID: %s", ownerID) + t.Logf("templateDeleted: %t", templateDeleted) + } + }) + clock := quartz.NewMock(t) + db, pubsub := dbtestutil.NewDB(t) + reconciler := prebuilds.NewStoreReconciler(db, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry()) + ctx := testutil.Context(t, testutil.WaitLong) + + createdUsers := []uuid.UUID{agplprebuilds.SystemUserID} + for _, user := range slices.Concat(test.ownerIDs, test.initiatorIDs) { + if !slices.Contains(createdUsers, user) { + dbgen.User(t, db, database.User{ + ID: user, + }) + createdUsers = append(createdUsers, user) + } + } + + collector := prebuilds.NewMetricsCollector(db, logger, reconciler) + registry := prometheus.NewPedanticRegistry() + registry.Register(collector) + + numTemplates := 2 + for i := 0; i < numTemplates; i++ { + org, template := setupTestDBTemplate(t, db, ownerID, templateDeleted) + templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubsub, org.ID, ownerID, template.ID) + preset := setupTestDBPreset(t, db, templateVersionID, 1, uuid.New().String()) + setupTestDBWorkspace( + t, clock, db, pubsub, + transition, jobStatus, org.ID, preset, template.ID, templateVersionID, initiatorID, ownerID, + ) + } + + metricsFamilies, err := registry.Gather() + require.NoError(t, err) + + templates, err := db.GetTemplates(ctx) + require.NoError(t, err) + require.Equal(t, numTemplates, len(templates)) + + for _, template := range templates { + org, err := db.GetOrganizationByID(ctx, template.OrganizationID) + require.NoError(t, err) + templateVersions, err := db.GetTemplateVersionsByTemplateID(ctx, database.GetTemplateVersionsByTemplateIDParams{ + TemplateID: template.ID, + }) + require.NoError(t, err) + require.Equal(t, 1, len(templateVersions)) + + presets, err := db.GetPresetsByTemplateVersionID(ctx, templateVersions[0].ID) + require.NoError(t, err) + require.Equal(t, 1, len(presets)) + + for _, preset := range presets { + preset := preset // capture for parallel + labels := map[string]string{ + "template_name": template.Name, + "preset_name": preset.Name, + "organization_name": org.Name, + } + + for _, check := range test.metrics { + metric := findMetric(metricsFamilies, check.name, labels) + if check.value == nil { + continue + } + + require.NotNil(t, metric, "metric %s should exist", check.name) + + if check.isCounter { + require.Equal(t, *check.value, metric.GetCounter().GetValue(), "counter %s value mismatch", check.name) + } else { + require.Equal(t, *check.value, metric.GetGauge().GetValue(), "gauge %s value mismatch", check.name) + } + } + } + } + }) + } + } + } + } + } + } +} + +func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string, labels map[string]string) *prometheus_client.Metric { + for _, metricFamily := range metricsFamilies { + if metricFamily.GetName() != name { + continue + } + + for _, metric := range metricFamily.GetMetric() { + labelPairs := metric.GetLabel() + + // Convert label pairs to map for easier lookup + metricLabels := make(map[string]string, len(labelPairs)) + for _, label := range labelPairs { + metricLabels[label.GetName()] = label.GetValue() + } + + // Check if all requested labels match + for wantName, wantValue := range labels { + if metricLabels[wantName] != wantValue { + continue + } + } + + return metric + } + } + return nil +} diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go index 081b4223a93c4..134365b65766b 100644 --- a/enterprise/coderd/prebuilds/reconcile.go +++ b/enterprise/coderd/prebuilds/reconcile.go @@ -9,6 +9,7 @@ import ( "time" "github.com/hashicorp/go-multierror" + "github.com/prometheus/client_golang/prometheus" "github.com/coder/quartz" @@ -31,11 +32,13 @@ import ( ) type StoreReconciler struct { - store database.Store - cfg codersdk.PrebuildsConfig - pubsub pubsub.Pubsub - logger slog.Logger - clock quartz.Clock + store database.Store + cfg codersdk.PrebuildsConfig + pubsub pubsub.Pubsub + logger slog.Logger + clock quartz.Clock + registerer prometheus.Registerer + metrics *MetricsCollector cancelFn context.CancelCauseFunc running atomic.Bool @@ -45,21 +48,30 @@ type StoreReconciler struct { var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{} -func NewStoreReconciler( - store database.Store, +func NewStoreReconciler(store database.Store, ps pubsub.Pubsub, cfg codersdk.PrebuildsConfig, logger slog.Logger, clock quartz.Clock, + registerer prometheus.Registerer, ) *StoreReconciler { - return &StoreReconciler{ - store: store, - pubsub: ps, - logger: logger, - cfg: cfg, - clock: clock, - done: make(chan struct{}, 1), + reconciler := &StoreReconciler{ + store: store, + pubsub: ps, + logger: logger, + cfg: cfg, + clock: clock, + registerer: registerer, + done: make(chan struct{}, 1), } + + reconciler.metrics = NewMetricsCollector(store, logger, reconciler) + if err := registerer.Register(reconciler.metrics); err != nil { + // If the registerer fails to register the metrics collector, it's not fatal. + logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err)) + } + + return reconciler } func (c *StoreReconciler) Run(ctx context.Context) { @@ -128,6 +140,17 @@ func (c *StoreReconciler) Stop(ctx context.Context, cause error) { return } + // Unregister the metrics collector. + if c.metrics != nil && c.registerer != nil { + if !c.registerer.Unregister(c.metrics) { + // The API doesn't allow us to know why the de-registration failed, but it's not very consequential. + // The only time this would be an issue is if the premium license is removed, leading to the feature being + // disabled (and consequently this Stop method being called), and then adding a new license which enables the + // feature again. If the metrics cannot be registered, it'll log an error from NewStoreReconciler. + c.logger.Warn(context.Background(), "failed to unregister metrics collector") + } + } + // If the reconciler is not running, there's nothing else to do. if !c.running.Load() { return diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go index 5c1ffe993ec42..3c18e557152bb 100644 --- a/enterprise/coderd/prebuilds/reconcile_test.go +++ b/enterprise/coderd/prebuilds/reconcile_test.go @@ -8,6 +8,8 @@ import ( "testing" "time" + "github.com/prometheus/client_golang/prometheus" + "github.com/coder/coder/v2/coderd/util/slice" "github.com/google/uuid" @@ -45,7 +47,7 @@ func TestNoReconciliationActionsIfNoPresets(t *testing.T) { ReconciliationInterval: serpent.Duration(testutil.WaitLong), } logger := testutil.Logger(t) - controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t)) + controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry()) // given a template version with no presets org := dbgen.Organization(t, db, database.Organization{}) @@ -90,7 +92,7 @@ func TestNoReconciliationActionsIfNoPrebuilds(t *testing.T) { ReconciliationInterval: serpent.Duration(testutil.WaitLong), } logger := testutil.Logger(t) - controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t)) + controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry()) // given there are presets, but no prebuilds org := dbgen.Organization(t, db, database.Organization{}) @@ -317,7 +319,7 @@ func TestPrebuildReconciliation(t *testing.T) { t, &slogtest.Options{IgnoreErrors: true}, ).Leveled(slog.LevelDebug) db, pubSub := dbtestutil.NewDB(t) - controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t)) + controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry()) ownerID := uuid.New() dbgen.User(t, db, database.User{ @@ -419,7 +421,7 @@ func TestMultiplePresetsPerTemplateVersion(t *testing.T) { t, &slogtest.Options{IgnoreErrors: true}, ).Leveled(slog.LevelDebug) db, pubSub := dbtestutil.NewDB(t) - controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t)) + controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry()) ownerID := uuid.New() dbgen.User(t, db, database.User{ @@ -503,7 +505,7 @@ func TestInvalidPreset(t *testing.T) { t, &slogtest.Options{IgnoreErrors: true}, ).Leveled(slog.LevelDebug) db, pubSub := dbtestutil.NewDB(t) - controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t)) + controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry()) ownerID := uuid.New() dbgen.User(t, db, database.User{ @@ -575,7 +577,7 @@ func TestRunLoop(t *testing.T) { t, &slogtest.Options{IgnoreErrors: true}, ).Leveled(slog.LevelDebug) db, pubSub := dbtestutil.NewDB(t) - reconciler := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock) + reconciler := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, prometheus.NewRegistry()) ownerID := uuid.New() dbgen.User(t, db, database.User{ @@ -705,7 +707,7 @@ func TestFailedBuildBackoff(t *testing.T) { t, &slogtest.Options{IgnoreErrors: true}, ).Leveled(slog.LevelDebug) db, ps := dbtestutil.NewDB(t) - reconciler := prebuilds.NewStoreReconciler(db, ps, cfg, logger, clock) + reconciler := prebuilds.NewStoreReconciler(db, ps, cfg, logger, clock, prometheus.NewRegistry()) // Given: an active template version with presets and prebuilds configured. const desiredInstances = 2 @@ -820,7 +822,7 @@ func TestReconciliationLock(t *testing.T) { codersdk.PrebuildsConfig{}, slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug), quartz.NewMock(t), - ) + prometheus.NewRegistry()) reconciler.WithReconciliationLock(ctx, logger, func(_ context.Context, _ database.Store) error { lockObtained := mutex.TryLock() // As long as the postgres lock is held, this mutex should always be unlocked when we get here. From 7b78523e09750156b68f2879fccf65261f79cf15 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Thu, 24 Apr 2025 16:29:11 +0200 Subject: [PATCH 2/6] Renaming prebuilds to "prebuilt workspaces" in metrics Signed-off-by: Danny Kopping --- .../coderd/prebuilds/metricscollector.go | 31 +++++------- .../coderd/prebuilds/metricscollector_test.go | 50 +++++++++---------- 2 files changed, 37 insertions(+), 44 deletions(-) diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go index 507dbddd383ef..48bae252c3b02 100644 --- a/enterprise/coderd/prebuilds/metricscollector.go +++ b/enterprise/coderd/prebuilds/metricscollector.go @@ -16,44 +16,38 @@ import ( var ( labels = []string{"template_name", "preset_name", "organization_name"} createdPrebuildsDesc = prometheus.NewDesc( - "coderd_prebuilds_created_total", - "The number of prebuilds that have been created to meet the desired count set by presets.", + "coderd_prebuilt_workspaces_created_total", + "The number of prebuilt workspaces that have been created to meet the desired count set by presets.", labels, nil, ) failedPrebuildsDesc = prometheus.NewDesc( - "coderd_prebuilds_failed_total", - "The number of prebuilds that failed to build during creation.", + "coderd_prebuilt_workspaces_failed_total", + "The number of prebuilt workspaces that failed to build.", labels, nil, ) claimedPrebuildsDesc = prometheus.NewDesc( - "coderd_prebuilds_claimed_total", - "The number of prebuilds that were claimed by a user. Each count means that a user created a workspace using a preset and was assigned a prebuild instead of a brand new workspace.", - labels, - nil, - ) - usedPresetsDesc = prometheus.NewDesc( - "coderd_prebuilds_used_presets", - "The number of times a preset was used to build a prebuild.", + "coderd_prebuilt_workspaces_claimed_total", + "The number of prebuilt workspaces that were claimed by a user. Each count means that a user created a workspace using a preset and claimed a prebuilt workspace instead of a brand new workspace being created.", labels, nil, ) desiredPrebuildsDesc = prometheus.NewDesc( - "coderd_prebuilds_desired", - "The number of prebuilds desired by each preset of each template.", + "coderd_prebuilt_workspaces_desired", + "The number of prebuilt workspaces desired by each preset of each template.", labels, nil, ) runningPrebuildsDesc = prometheus.NewDesc( - "coderd_prebuilds_running", - "The number of prebuilds that are currently running. Running prebuilds have successfully started, but they may not be ready to be claimed by a user yet.", + "coderd_prebuilt_workspaces_running", + "The number of prebuilt workspaces that are currently running. Running prebuilt workspaces have successfully started, but includes both eligible and ineligible workspaces.", labels, nil, ) eligiblePrebuildsDesc = prometheus.NewDesc( - "coderd_prebuilds_eligible", - "The number of eligible prebuilds. Eligible prebuilds are prebuilds that are ready to be claimed by a user.", + "coderd_prebuilt_workspaces_eligible", + "The number of eligible prebuilt workspaces. Eligible prebuilt workspaces are ones whose agent is marked 'ready', and can be claimed by a user.", labels, nil, ) @@ -79,7 +73,6 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) { descCh <- createdPrebuildsDesc descCh <- failedPrebuildsDesc descCh <- claimedPrebuildsDesc - descCh <- usedPresetsDesc descCh <- desiredPrebuildsDesc descCh <- runningPrebuildsDesc descCh <- eligiblePrebuildsDesc diff --git a/enterprise/coderd/prebuilds/metricscollector_test.go b/enterprise/coderd/prebuilds/metricscollector_test.go index 0387ecea5e1db..29e0e371418be 100644 --- a/enterprise/coderd/prebuilds/metricscollector_test.go +++ b/enterprise/coderd/prebuilds/metricscollector_test.go @@ -58,10 +58,10 @@ func TestMetricsCollector(t *testing.T) { // * a prebuild that was claimed should not be deemed running, not eligible. ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()}, metrics: []metricCheck{ - {"coderd_prebuilds_created_total", ptr.To(1.0), true}, - {"coderd_prebuilds_desired", ptr.To(1.0), false}, - // {"coderd_prebuilds_running", ptr.To(0.0), false}, - // {"coderd_prebuilds_eligible", ptr.To(0.0), false}, + {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_running", ptr.To(0.0), false}, + {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, }, templateDeleted: []bool{false}, }, @@ -72,10 +72,10 @@ func TestMetricsCollector(t *testing.T) { initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID}, metrics: []metricCheck{ - {"coderd_prebuilds_created_total", ptr.To(1.0), true}, - {"coderd_prebuilds_desired", ptr.To(1.0), false}, - {"coderd_prebuilds_running", ptr.To(1.0), false}, - {"coderd_prebuilds_eligible", ptr.To(0.0), false}, + {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_running", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, }, templateDeleted: []bool{false}, }, @@ -86,26 +86,26 @@ func TestMetricsCollector(t *testing.T) { initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()}, metrics: []metricCheck{ - {"coderd_prebuilds_created_total", ptr.To(1.0), true}, - {"coderd_prebuilds_failed_total", ptr.To(1.0), true}, - {"coderd_prebuilds_desired", ptr.To(1.0), false}, - {"coderd_prebuilds_running", ptr.To(0.0), false}, - {"coderd_prebuilds_eligible", ptr.To(0.0), false}, + {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_failed_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_running", ptr.To(0.0), false}, + {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, }, templateDeleted: []bool{false}, }, { - name: "prebuild assigned", + name: "prebuild claimed", transitions: allTransitions, jobStatuses: allJobStatuses, initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, ownerIDs: []uuid.UUID{uuid.New()}, metrics: []metricCheck{ - {"coderd_prebuilds_created_total", ptr.To(1.0), true}, - {"coderd_prebuilds_claimed_total", ptr.To(1.0), true}, - {"coderd_prebuilds_desired", ptr.To(1.0), false}, - {"coderd_prebuilds_running", ptr.To(0.0), false}, - {"coderd_prebuilds_eligible", ptr.To(0.0), false}, + {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_claimed_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_running", ptr.To(0.0), false}, + {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, }, templateDeleted: []bool{false}, }, @@ -116,9 +116,9 @@ func TestMetricsCollector(t *testing.T) { initiatorIDs: []uuid.UUID{uuid.New()}, ownerIDs: []uuid.UUID{uuid.New()}, metrics: []metricCheck{ - {"coderd_prebuilds_desired", ptr.To(1.0), false}, - {"coderd_prebuilds_running", ptr.To(0.0), false}, - {"coderd_prebuilds_eligible", ptr.To(0.0), false}, + {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_running", ptr.To(0.0), false}, + {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, }, templateDeleted: []bool{false}, }, @@ -129,7 +129,7 @@ func TestMetricsCollector(t *testing.T) { initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()}, metrics: []metricCheck{ - {"coderd_prebuilds_desired", ptr.To(0.0), false}, + {"coderd_prebuilt_workspaces_desired", ptr.To(0.0), false}, }, templateDeleted: []bool{true}, }, @@ -140,8 +140,8 @@ func TestMetricsCollector(t *testing.T) { initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID}, metrics: []metricCheck{ - {"coderd_prebuilds_running", ptr.To(1.0), false}, - {"coderd_prebuilds_eligible", ptr.To(0.0), false}, + {"coderd_prebuilt_workspaces_running", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, }, templateDeleted: []bool{true}, }, From 547c1b049044e88cb2a2c641d6f195c4894f5949 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Thu, 24 Apr 2025 17:00:17 +0200 Subject: [PATCH 3/6] Eligibility tests Signed-off-by: Danny Kopping --- enterprise/coderd/prebuilds/claim_test.go | 2 +- .../coderd/prebuilds/metricscollector_test.go | 203 +++++++++++------- enterprise/coderd/prebuilds/reconcile_test.go | 30 ++- 3 files changed, 155 insertions(+), 80 deletions(-) diff --git a/enterprise/coderd/prebuilds/claim_test.go b/enterprise/coderd/prebuilds/claim_test.go index e8d66fae18a07..1573aab9387f1 100644 --- a/enterprise/coderd/prebuilds/claim_test.go +++ b/enterprise/coderd/prebuilds/claim_test.go @@ -420,7 +420,7 @@ func TestClaimPrebuild_CheckDifferentErrors(t *testing.T) { EntitlementsUpdateInterval: time.Second, }) - reconciler := prebuilds.NewStoreReconciler(errorStore, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t)) + reconciler := prebuilds.NewStoreReconciler(errorStore, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), api.PrometheusRegistry) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer(errorStore) api.AGPL.PrebuildsClaimer.Store(&claimer) diff --git a/enterprise/coderd/prebuilds/metricscollector_test.go b/enterprise/coderd/prebuilds/metricscollector_test.go index 29e0e371418be..859509ced6635 100644 --- a/enterprise/coderd/prebuilds/metricscollector_test.go +++ b/enterprise/coderd/prebuilds/metricscollector_test.go @@ -45,25 +45,26 @@ func TestMetricsCollector(t *testing.T) { ownerIDs []uuid.UUID metrics []metricCheck templateDeleted []bool + eligible []bool } tests := []testCase{ { - name: "prebuild created", + name: "prebuild provisioned but not completed", transitions: allTransitions, - jobStatuses: allJobStatuses, + jobStatuses: allJobStatusesExcept(database.ProvisionerJobStatusPending, database.ProvisionerJobStatusRunning, database.ProvisionerJobStatusCanceling), initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, - // TODO: reexamine and refactor the test cases and assertions: - // * a running prebuild that is not elibible to be claimed currently seems to be eligible. - // * a prebuild that was claimed should not be deemed running, not eligible. - ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()}, + ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID}, metrics: []metricCheck{ {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true}, + {"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true}, {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false}, {"coderd_prebuilt_workspaces_running", ptr.To(0.0), false}, {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, }, templateDeleted: []bool{false}, + eligible: []bool{false}, }, { name: "prebuild running", @@ -73,11 +74,14 @@ func TestMetricsCollector(t *testing.T) { ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID}, metrics: []metricCheck{ {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true}, + {"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true}, {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false}, {"coderd_prebuilt_workspaces_running", ptr.To(1.0), false}, {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, }, templateDeleted: []bool{false}, + eligible: []bool{false}, }, { name: "prebuild failed", @@ -93,6 +97,41 @@ func TestMetricsCollector(t *testing.T) { {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, }, templateDeleted: []bool{false}, + eligible: []bool{false}, + }, + { + name: "prebuild eligible", + transitions: []database.WorkspaceTransition{database.WorkspaceTransitionStart}, + jobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded}, + initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + metrics: []metricCheck{ + {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true}, + {"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true}, + {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_running", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_eligible", ptr.To(1.0), false}, + }, + templateDeleted: []bool{false}, + eligible: []bool{true}, + }, + { + name: "prebuild ineligible", + transitions: allTransitions, + jobStatuses: allJobStatusesExcept(database.ProvisionerJobStatusSucceeded), + initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + metrics: []metricCheck{ + {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true}, + {"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true}, + {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_running", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, + }, + templateDeleted: []bool{false}, + eligible: []bool{false}, }, { name: "prebuild claimed", @@ -108,6 +147,7 @@ func TestMetricsCollector(t *testing.T) { {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, }, templateDeleted: []bool{false}, + eligible: []bool{false}, }, { name: "workspaces that were not created by the prebuilds user are not counted", @@ -121,6 +161,7 @@ func TestMetricsCollector(t *testing.T) { {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, }, templateDeleted: []bool{false}, + eligible: []bool{false}, }, { name: "deleted templates never desire prebuilds", @@ -132,6 +173,7 @@ func TestMetricsCollector(t *testing.T) { {"coderd_prebuilt_workspaces_desired", ptr.To(0.0), false}, }, templateDeleted: []bool{true}, + eligible: []bool{false}, }, { name: "running prebuilds for deleted templates are still counted, so that they can be deleted", @@ -144,6 +186,7 @@ func TestMetricsCollector(t *testing.T) { {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, }, templateDeleted: []bool{true}, + eligible: []bool{false}, }, } for _, test := range tests { @@ -158,95 +201,99 @@ func TestMetricsCollector(t *testing.T) { ownerID := ownerID // capture for parallel for _, templateDeleted := range test.templateDeleted { templateDeleted := templateDeleted // capture for parallel - t.Run(fmt.Sprintf("%v/transition:%s/jobStatus:%s", test.name, transition, jobStatus), func(t *testing.T) { - t.Parallel() + for _, eligible := range test.eligible { + eligible := eligible // capture for parallel + t.Run(fmt.Sprintf("%v/transition:%s/jobStatus:%s", test.name, transition, jobStatus), func(t *testing.T) { + t.Parallel() - logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}) - t.Cleanup(func() { - if t.Failed() { - t.Logf("failed to run test: %s", test.name) - t.Logf("transition: %s", transition) - t.Logf("jobStatus: %s", jobStatus) - t.Logf("initiatorID: %s", initiatorID) - t.Logf("ownerID: %s", ownerID) - t.Logf("templateDeleted: %t", templateDeleted) - } - }) - clock := quartz.NewMock(t) - db, pubsub := dbtestutil.NewDB(t) - reconciler := prebuilds.NewStoreReconciler(db, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry()) - ctx := testutil.Context(t, testutil.WaitLong) + logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}) + t.Cleanup(func() { + if t.Failed() { + t.Logf("failed to run test: %s", test.name) + t.Logf("transition: %s", transition) + t.Logf("jobStatus: %s", jobStatus) + t.Logf("initiatorID: %s", initiatorID) + t.Logf("ownerID: %s", ownerID) + t.Logf("templateDeleted: %t", templateDeleted) + } + }) + clock := quartz.NewMock(t) + db, pubsub := dbtestutil.NewDB(t) + reconciler := prebuilds.NewStoreReconciler(db, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry()) + ctx := testutil.Context(t, testutil.WaitLong) - createdUsers := []uuid.UUID{agplprebuilds.SystemUserID} - for _, user := range slices.Concat(test.ownerIDs, test.initiatorIDs) { - if !slices.Contains(createdUsers, user) { - dbgen.User(t, db, database.User{ - ID: user, - }) - createdUsers = append(createdUsers, user) + createdUsers := []uuid.UUID{agplprebuilds.SystemUserID} + for _, user := range slices.Concat(test.ownerIDs, test.initiatorIDs) { + if !slices.Contains(createdUsers, user) { + dbgen.User(t, db, database.User{ + ID: user, + }) + createdUsers = append(createdUsers, user) + } } - } - collector := prebuilds.NewMetricsCollector(db, logger, reconciler) - registry := prometheus.NewPedanticRegistry() - registry.Register(collector) + collector := prebuilds.NewMetricsCollector(db, logger, reconciler) + registry := prometheus.NewPedanticRegistry() + registry.Register(collector) - numTemplates := 2 - for i := 0; i < numTemplates; i++ { - org, template := setupTestDBTemplate(t, db, ownerID, templateDeleted) - templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubsub, org.ID, ownerID, template.ID) - preset := setupTestDBPreset(t, db, templateVersionID, 1, uuid.New().String()) - setupTestDBWorkspace( - t, clock, db, pubsub, - transition, jobStatus, org.ID, preset, template.ID, templateVersionID, initiatorID, ownerID, - ) - } - - metricsFamilies, err := registry.Gather() - require.NoError(t, err) - - templates, err := db.GetTemplates(ctx) - require.NoError(t, err) - require.Equal(t, numTemplates, len(templates)) + numTemplates := 2 + for i := 0; i < numTemplates; i++ { + org, template := setupTestDBTemplate(t, db, ownerID, templateDeleted) + templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubsub, org.ID, ownerID, template.ID) + preset := setupTestDBPreset(t, db, templateVersionID, 1, uuid.New().String()) + workspace := setupTestDBWorkspace( + t, clock, db, pubsub, + transition, jobStatus, org.ID, preset, template.ID, templateVersionID, initiatorID, ownerID, + ) + setupTestDBWorkspaceAgent(t, db, workspace.ID, eligible) + } - for _, template := range templates { - org, err := db.GetOrganizationByID(ctx, template.OrganizationID) - require.NoError(t, err) - templateVersions, err := db.GetTemplateVersionsByTemplateID(ctx, database.GetTemplateVersionsByTemplateIDParams{ - TemplateID: template.ID, - }) + metricsFamilies, err := registry.Gather() require.NoError(t, err) - require.Equal(t, 1, len(templateVersions)) - presets, err := db.GetPresetsByTemplateVersionID(ctx, templateVersions[0].ID) + templates, err := db.GetTemplates(ctx) require.NoError(t, err) - require.Equal(t, 1, len(presets)) + require.Equal(t, numTemplates, len(templates)) - for _, preset := range presets { - preset := preset // capture for parallel - labels := map[string]string{ - "template_name": template.Name, - "preset_name": preset.Name, - "organization_name": org.Name, - } + for _, template := range templates { + org, err := db.GetOrganizationByID(ctx, template.OrganizationID) + require.NoError(t, err) + templateVersions, err := db.GetTemplateVersionsByTemplateID(ctx, database.GetTemplateVersionsByTemplateIDParams{ + TemplateID: template.ID, + }) + require.NoError(t, err) + require.Equal(t, 1, len(templateVersions)) + + presets, err := db.GetPresetsByTemplateVersionID(ctx, templateVersions[0].ID) + require.NoError(t, err) + require.Equal(t, 1, len(presets)) - for _, check := range test.metrics { - metric := findMetric(metricsFamilies, check.name, labels) - if check.value == nil { - continue + for _, preset := range presets { + preset := preset // capture for parallel + labels := map[string]string{ + "template_name": template.Name, + "preset_name": preset.Name, + "organization_name": org.Name, } - require.NotNil(t, metric, "metric %s should exist", check.name) + for _, check := range test.metrics { + metric := findMetric(metricsFamilies, check.name, labels) + if check.value == nil { + continue + } - if check.isCounter { - require.Equal(t, *check.value, metric.GetCounter().GetValue(), "counter %s value mismatch", check.name) - } else { - require.Equal(t, *check.value, metric.GetGauge().GetValue(), "gauge %s value mismatch", check.name) + require.NotNil(t, metric, "metric %s should exist", check.name) + + if check.isCounter { + require.Equal(t, *check.value, metric.GetCounter().GetValue(), "counter %s value mismatch", check.name) + } else { + require.Equal(t, *check.value, metric.GetGauge().GetValue(), "gauge %s value mismatch", check.name) + } } } } - } - }) + }) + } } } } diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go index 3c18e557152bb..881aebdbbc893 100644 --- a/enterprise/coderd/prebuilds/reconcile_test.go +++ b/enterprise/coderd/prebuilds/reconcile_test.go @@ -10,6 +10,7 @@ import ( "github.com/prometheus/client_golang/prometheus" + "github.com/coder/coder/v2/coderd/database/dbtime" "github.com/coder/coder/v2/coderd/util/slice" "github.com/google/uuid" @@ -1011,6 +1012,29 @@ func setupTestDBWorkspace( return workspace } +func setupTestDBWorkspaceAgent(t *testing.T, db database.Store, workspaceID uuid.UUID, eligible bool) database.WorkspaceAgent { + build, err := db.GetLatestWorkspaceBuildByWorkspaceID(t.Context(), workspaceID) + require.NoError(t, err) + + res := dbgen.WorkspaceResource(t, db, database.WorkspaceResource{JobID: build.JobID}) + agent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{ + ResourceID: res.ID, + }) + + // A prebuilt workspace is considered eligible when its agent is in a "ready" lifecycle state. + // i.e. connected to the control plane and all startup scripts have run. + if eligible { + require.NoError(t, db.UpdateWorkspaceAgentLifecycleStateByID(t.Context(), database.UpdateWorkspaceAgentLifecycleStateByIDParams{ + ID: agent.ID, + LifecycleState: database.WorkspaceAgentLifecycleStateReady, + StartedAt: sql.NullTime{Time: dbtime.Now().Add(-time.Minute), Valid: true}, + ReadyAt: sql.NullTime{Time: dbtime.Now(), Valid: true}, + })) + } + + return agent +} + var allTransitions = []database.WorkspaceTransition{ database.WorkspaceTransitionStart, database.WorkspaceTransitionStop, @@ -1026,4 +1050,8 @@ var allJobStatuses = []database.ProvisionerJobStatus{ database.ProvisionerJobStatusCanceling, } -// TODO (sasswart): test mutual exclusion +func allJobStatusesExcept(except ...database.ProvisionerJobStatus) []database.ProvisionerJobStatus { + return slice.Filter(except, func(status database.ProvisionerJobStatus) bool { + return !slice.Contains(allJobStatuses, status) + }) +} From 131307a301919fda7232e68133d3838535da7dc7 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 25 Apr 2025 09:13:11 +0200 Subject: [PATCH 4/6] Review & linter feedback Signed-off-by: Danny Kopping --- enterprise/coderd/prebuilds/metricscollector.go | 2 +- enterprise/coderd/prebuilds/reconcile_test.go | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go index 48bae252c3b02..15e62904e9dc9 100644 --- a/enterprise/coderd/prebuilds/metricscollector.go +++ b/enterprise/coderd/prebuilds/metricscollector.go @@ -79,9 +79,9 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) { } func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { + // nolint:gocritic // We need to set an authz context to read metrics from the db. ctx, cancel := context.WithTimeout(dbauthz.AsPrebuildsOrchestrator(context.Background()), 10*time.Second) defer cancel() - // nolint:gocritic // just until we get back to this prebuildMetrics, err := mc.database.GetPrebuildMetrics(ctx) if err != nil { mc.logger.Error(ctx, "failed to get prebuild metrics", slog.Error(err)) diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go index 881aebdbbc893..9783b215f185b 100644 --- a/enterprise/coderd/prebuilds/reconcile_test.go +++ b/enterprise/coderd/prebuilds/reconcile_test.go @@ -1012,6 +1012,7 @@ func setupTestDBWorkspace( return workspace } +// nolint:revive // It's a control flag, but this is a test. func setupTestDBWorkspaceAgent(t *testing.T, db database.Store, workspaceID uuid.UUID, eligible bool) database.WorkspaceAgent { build, err := db.GetLatestWorkspaceBuildByWorkspaceID(t.Context(), workspaceID) require.NoError(t, err) From 7287bd8ce9236767808ed15a6461211c2c9fc530 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 25 Apr 2025 09:48:54 +0200 Subject: [PATCH 5/6] Reword metric descriptions for clarity Signed-off-by: Danny Kopping --- enterprise/coderd/prebuilds/metricscollector.go | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go index 15e62904e9dc9..0d782d609b50b 100644 --- a/enterprise/coderd/prebuilds/metricscollector.go +++ b/enterprise/coderd/prebuilds/metricscollector.go @@ -17,37 +17,41 @@ var ( labels = []string{"template_name", "preset_name", "organization_name"} createdPrebuildsDesc = prometheus.NewDesc( "coderd_prebuilt_workspaces_created_total", - "The number of prebuilt workspaces that have been created to meet the desired count set by presets.", + "Total number of prebuilt workspaces that have been created to meet the desired instance count of each " + + "template preset.", labels, nil, ) failedPrebuildsDesc = prometheus.NewDesc( "coderd_prebuilt_workspaces_failed_total", - "The number of prebuilt workspaces that failed to build.", + "Total number of prebuilt workspaces that failed to build.", labels, nil, ) claimedPrebuildsDesc = prometheus.NewDesc( "coderd_prebuilt_workspaces_claimed_total", - "The number of prebuilt workspaces that were claimed by a user. Each count means that a user created a workspace using a preset and claimed a prebuilt workspace instead of a brand new workspace being created.", + "Total number of prebuilt workspaces which were claimed by users. Claiming refers to creating a workspace "+ + "with a preset selected for which eligible prebuilt workspaces are available and one is reassigned to a user.", labels, nil, ) desiredPrebuildsDesc = prometheus.NewDesc( "coderd_prebuilt_workspaces_desired", - "The number of prebuilt workspaces desired by each preset of each template.", + "Target number of prebuilt workspaces that should be available for each template preset.", labels, nil, ) runningPrebuildsDesc = prometheus.NewDesc( "coderd_prebuilt_workspaces_running", - "The number of prebuilt workspaces that are currently running. Running prebuilt workspaces have successfully started, but includes both eligible and ineligible workspaces.", + "Current number of prebuilt workspaces that are in a running state. These workspaces have started "+ + "successfully but may not yet be claimable by users (see coderd_prebuilt_workspaces_eligible).", labels, nil, ) eligiblePrebuildsDesc = prometheus.NewDesc( "coderd_prebuilt_workspaces_eligible", - "The number of eligible prebuilt workspaces. Eligible prebuilt workspaces are ones whose agent is marked 'ready', and can be claimed by a user.", + "Current number of prebuilt workspaces that are eligible to be claimed by users. These are workspaces that " + + "have completed their build process with their agent reporting 'ready' status.", labels, nil, ) From 1d66ade3fdfebb5cafc88eb6468e61ba494e5622 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 25 Apr 2025 09:53:55 +0200 Subject: [PATCH 6/6] make fmt Signed-off-by: Danny Kopping --- enterprise/coderd/prebuilds/metricscollector.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go index 0d782d609b50b..7b55227effffa 100644 --- a/enterprise/coderd/prebuilds/metricscollector.go +++ b/enterprise/coderd/prebuilds/metricscollector.go @@ -17,7 +17,7 @@ var ( labels = []string{"template_name", "preset_name", "organization_name"} createdPrebuildsDesc = prometheus.NewDesc( "coderd_prebuilt_workspaces_created_total", - "Total number of prebuilt workspaces that have been created to meet the desired instance count of each " + + "Total number of prebuilt workspaces that have been created to meet the desired instance count of each "+ "template preset.", labels, nil, @@ -50,7 +50,7 @@ var ( ) eligiblePrebuildsDesc = prometheus.NewDesc( "coderd_prebuilt_workspaces_eligible", - "Current number of prebuilt workspaces that are eligible to be claimed by users. These are workspaces that " + + "Current number of prebuilt workspaces that are eligible to be claimed by users. These are workspaces that "+ "have completed their build process with their agent reporting 'ready' status.", labels, nil,