Skip to content

Commit b2a1de9

Browse files
authored
feat: fetch prebuilds metrics state in background (#17792)
`Collect()` is called whenever the `/metrics` endpoint is hit to retrieve metrics. The queries used in prebuilds metrics collection are quite heavy, and we want to avoid having them running concurrently / too often to keep db load down. Here I'm moving towards a background retrieval of the state required to set the metrics, which gets invalidated every interval. Also introduces `coderd_prebuilt_workspaces_metrics_last_updated` which operators can use to determine when these metrics go stale. See #17789 as well. --------- Signed-off-by: Danny Kopping <dannykopping@gmail.com>
1 parent 709445e commit b2a1de9

File tree

3 files changed

+109
-24
lines changed

3 files changed

+109
-24
lines changed

enterprise/coderd/prebuilds/metricscollector.go

+86-20
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,17 @@ package prebuilds
22

33
import (
44
"context"
5+
"fmt"
6+
"sync/atomic"
57
"time"
68

7-
"cdr.dev/slog"
8-
99
"github.com/prometheus/client_golang/prometheus"
10+
"golang.org/x/xerrors"
11+
12+
"cdr.dev/slog"
1013

1114
"github.com/coder/coder/v2/coderd/database"
12-
"github.com/coder/coder/v2/coderd/database/dbauthz"
15+
"github.com/coder/coder/v2/coderd/database/dbtime"
1316
"github.com/coder/coder/v2/coderd/prebuilds"
1417
)
1518

@@ -55,20 +58,34 @@ var (
5558
labels,
5659
nil,
5760
)
61+
lastUpdateDesc = prometheus.NewDesc(
62+
"coderd_prebuilt_workspaces_metrics_last_updated",
63+
"The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.",
64+
[]string{},
65+
nil,
66+
)
67+
)
68+
69+
const (
70+
metricsUpdateInterval = time.Second * 15
71+
metricsUpdateTimeout = time.Second * 10
5872
)
5973

6074
type MetricsCollector struct {
6175
database database.Store
6276
logger slog.Logger
6377
snapshotter prebuilds.StateSnapshotter
78+
79+
latestState atomic.Pointer[metricsState]
6480
}
6581

6682
var _ prometheus.Collector = new(MetricsCollector)
6783

6884
func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector {
85+
log := logger.Named("prebuilds_metrics_collector")
6986
return &MetricsCollector{
7087
database: db,
71-
logger: logger.Named("prebuilds_metrics_collector"),
88+
logger: log,
7289
snapshotter: snapshotter,
7390
}
7491
}
@@ -80,38 +97,34 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
8097
descCh <- desiredPrebuildsDesc
8198
descCh <- runningPrebuildsDesc
8299
descCh <- eligiblePrebuildsDesc
100+
descCh <- lastUpdateDesc
83101
}
84102

103+
// Collect uses the cached state to set configured metrics.
104+
// The state is cached because this function can be called multiple times per second and retrieving the current state
105+
// is an expensive operation.
85106
func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
86-
// nolint:gocritic // We need to set an authz context to read metrics from the db.
87-
ctx, cancel := context.WithTimeout(dbauthz.AsPrebuildsOrchestrator(context.Background()), 10*time.Second)
88-
defer cancel()
89-
prebuildMetrics, err := mc.database.GetPrebuildMetrics(ctx)
90-
if err != nil {
91-
mc.logger.Error(ctx, "failed to get prebuild metrics", slog.Error(err))
107+
currentState := mc.latestState.Load() // Grab a copy; it's ok if it goes stale during the course of this func.
108+
if currentState == nil {
109+
mc.logger.Warn(context.Background(), "failed to set prebuilds metrics; state not set")
110+
metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, 0)
92111
return
93112
}
94113

95-
for _, metric := range prebuildMetrics {
114+
for _, metric := range currentState.prebuildMetrics {
96115
metricsCh <- prometheus.MustNewConstMetric(createdPrebuildsDesc, prometheus.CounterValue, float64(metric.CreatedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
97116
metricsCh <- prometheus.MustNewConstMetric(failedPrebuildsDesc, prometheus.CounterValue, float64(metric.FailedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
98117
metricsCh <- prometheus.MustNewConstMetric(claimedPrebuildsDesc, prometheus.CounterValue, float64(metric.ClaimedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
99118
}
100119

101-
snapshot, err := mc.snapshotter.SnapshotState(ctx, mc.database)
102-
if err != nil {
103-
mc.logger.Error(ctx, "failed to get latest prebuild state", slog.Error(err))
104-
return
105-
}
106-
107-
for _, preset := range snapshot.Presets {
120+
for _, preset := range currentState.snapshot.Presets {
108121
if !preset.UsingActiveVersion {
109122
continue
110123
}
111124

112-
presetSnapshot, err := snapshot.FilterByPreset(preset.ID)
125+
presetSnapshot, err := currentState.snapshot.FilterByPreset(preset.ID)
113126
if err != nil {
114-
mc.logger.Error(ctx, "failed to filter by preset", slog.Error(err))
127+
mc.logger.Error(context.Background(), "failed to filter by preset", slog.Error(err))
115128
continue
116129
}
117130
state := presetSnapshot.CalculateState()
@@ -120,4 +133,57 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
120133
metricsCh <- prometheus.MustNewConstMetric(runningPrebuildsDesc, prometheus.GaugeValue, float64(state.Actual), preset.TemplateName, preset.Name, preset.OrganizationName)
121134
metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName)
122135
}
136+
137+
metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix()))
138+
}
139+
140+
type metricsState struct {
141+
prebuildMetrics []database.GetPrebuildMetricsRow
142+
snapshot *prebuilds.GlobalSnapshot
143+
createdAt time.Time
144+
}
145+
146+
// BackgroundFetch updates the metrics state every given interval.
147+
func (mc *MetricsCollector) BackgroundFetch(ctx context.Context, updateInterval, updateTimeout time.Duration) {
148+
tick := time.NewTicker(time.Nanosecond)
149+
defer tick.Stop()
150+
151+
for {
152+
select {
153+
case <-ctx.Done():
154+
return
155+
case <-tick.C:
156+
// Tick immediately, then set regular interval.
157+
tick.Reset(updateInterval)
158+
159+
if err := mc.UpdateState(ctx, updateTimeout); err != nil {
160+
mc.logger.Error(ctx, "failed to update prebuilds metrics state", slog.Error(err))
161+
}
162+
}
163+
}
164+
}
165+
166+
// UpdateState builds the current metrics state.
167+
func (mc *MetricsCollector) UpdateState(ctx context.Context, timeout time.Duration) error {
168+
start := time.Now()
169+
fetchCtx, fetchCancel := context.WithTimeout(ctx, timeout)
170+
defer fetchCancel()
171+
172+
prebuildMetrics, err := mc.database.GetPrebuildMetrics(fetchCtx)
173+
if err != nil {
174+
return xerrors.Errorf("fetch prebuild metrics: %w", err)
175+
}
176+
177+
snapshot, err := mc.snapshotter.SnapshotState(fetchCtx, mc.database)
178+
if err != nil {
179+
return xerrors.Errorf("snapshot state: %w", err)
180+
}
181+
mc.logger.Debug(ctx, "fetched prebuilds metrics state", slog.F("duration_secs", fmt.Sprintf("%.2f", time.Since(start).Seconds())))
182+
183+
mc.latestState.Store(&metricsState{
184+
prebuildMetrics: prebuildMetrics,
185+
snapshot: snapshot,
186+
createdAt: dbtime.Now(),
187+
})
188+
return nil
123189
}

enterprise/coderd/prebuilds/metricscollector_test.go

+5
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"github.com/coder/quartz"
1717

1818
"github.com/coder/coder/v2/coderd/database"
19+
"github.com/coder/coder/v2/coderd/database/dbauthz"
1920
"github.com/coder/coder/v2/coderd/database/dbgen"
2021
"github.com/coder/coder/v2/coderd/database/dbtestutil"
2122
agplprebuilds "github.com/coder/coder/v2/coderd/prebuilds"
@@ -248,6 +249,10 @@ func TestMetricsCollector(t *testing.T) {
248249
setupTestDBWorkspaceAgent(t, db, workspace.ID, eligible)
249250
}
250251

252+
// Force an update to the metrics state to allow the collector to collect fresh metrics.
253+
// nolint:gocritic // Authz context needed to retrieve state.
254+
require.NoError(t, collector.UpdateState(dbauthz.AsPrebuildsOrchestrator(ctx), testutil.WaitLong))
255+
251256
metricsFamilies, err := registry.Gather()
252257
require.NoError(t, err)
253258

enterprise/coderd/prebuilds/reconcile.go

+18-4
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"database/sql"
66
"fmt"
77
"math"
8+
"sync"
89
"sync/atomic"
910
"time"
1011

@@ -67,10 +68,12 @@ func NewStoreReconciler(store database.Store,
6768
provisionNotifyCh: make(chan database.ProvisionerJob, 10),
6869
}
6970

70-
reconciler.metrics = NewMetricsCollector(store, logger, reconciler)
71-
if err := registerer.Register(reconciler.metrics); err != nil {
72-
// If the registerer fails to register the metrics collector, it's not fatal.
73-
logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err))
71+
if registerer != nil {
72+
reconciler.metrics = NewMetricsCollector(store, logger, reconciler)
73+
if err := registerer.Register(reconciler.metrics); err != nil {
74+
// If the registerer fails to register the metrics collector, it's not fatal.
75+
logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err))
76+
}
7477
}
7578

7679
return reconciler
@@ -87,16 +90,27 @@ func (c *StoreReconciler) Run(ctx context.Context) {
8790
slog.F("backoff_interval", c.cfg.ReconciliationBackoffInterval.String()),
8891
slog.F("backoff_lookback", c.cfg.ReconciliationBackoffLookback.String()))
8992

93+
var wg sync.WaitGroup
9094
ticker := c.clock.NewTicker(reconciliationInterval)
9195
defer ticker.Stop()
9296
defer func() {
97+
wg.Wait()
9398
c.done <- struct{}{}
9499
}()
95100

96101
// nolint:gocritic // Reconciliation Loop needs Prebuilds Orchestrator permissions.
97102
ctx, cancel := context.WithCancelCause(dbauthz.AsPrebuildsOrchestrator(ctx))
98103
c.cancelFn = cancel
99104

105+
// Start updating metrics in the background.
106+
if c.metrics != nil {
107+
wg.Add(1)
108+
go func() {
109+
defer wg.Done()
110+
c.metrics.BackgroundFetch(ctx, metricsUpdateInterval, metricsUpdateTimeout)
111+
}()
112+
}
113+
100114
// Everything is in place, reconciler can now be considered as running.
101115
//
102116
// NOTE: without this atomic bool, Stop might race with Run for the c.cancelFn above.

0 commit comments

Comments
 (0)