Skip to content

Commit 8ef4bd6

Browse files
committed
Control cardinality of coderd metrics as well
Signed-off-by: Danny Kopping <danny@coder.com>
1 parent: 7e0c8f0 · commit: 8ef4bd6

File tree

3 files changed

+37
-19
lines changed

3 files changed

+37
-19
lines changed

cli/server.go

+1-1
Original file line number | Diff line number | Diff line change
@@ -229,7 +229,7 @@ func enablePrometheus(
229229
afterCtx(ctx, closeInsightsMetricsCollector)
230230

231231
if vals.Prometheus.CollectAgentStats {
232-
closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, time.Now(), 0)
232+
closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, time.Now(), 0, options.DeploymentValues.Prometheus.AggregateAgentStatsBy.Value())
233233
if err != nil {
234234
return nil, xerrors.Errorf("register agent stats prometheus metric: %w", err)
235235
}

coderd/prometheusmetrics/prometheusmetrics.go

+35-17
Original file line number | Diff line number | Diff line change
@@ -329,11 +329,17 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
329329
}, nil
330330
}
331331

332-
func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration) (func(), error) {
332+
var DefaultAgentStatsLabels = []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel}
333+
334+
func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration, aggregateByLabels []string) (func(), error) {
333335
if duration == 0 {
334336
duration = 1 * time.Minute
335337
}
336338

339+
if len(aggregateByLabels) == 0 {
340+
aggregateByLabels = DefaultAgentStatsLabels
341+
}
342+
337343
metricsCollectorAgentStats := prometheus.NewHistogram(prometheus.HistogramOpts{
338344
Namespace: "coderd",
339345
Subsystem: "prometheusmetrics",
@@ -351,7 +357,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
351357
Subsystem: "agentstats",
352358
Name: "tx_bytes",
353359
Help: "Agent Tx bytes",
354-
}, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel}))
360+
}, aggregateByLabels))
355361
err = registerer.Register(agentStatsTxBytesGauge)
356362
if err != nil {
357363
return nil, err
@@ -362,7 +368,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
362368
Subsystem: "agentstats",
363369
Name: "rx_bytes",
364370
Help: "Agent Rx bytes",
365-
}, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel}))
371+
}, aggregateByLabels))
366372
err = registerer.Register(agentStatsRxBytesGauge)
367373
if err != nil {
368374
return nil, err
@@ -373,7 +379,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
373379
Subsystem: "agentstats",
374380
Name: "connection_count",
375381
Help: "The number of established connections by agent",
376-
}, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel}))
382+
}, aggregateByLabels))
377383
err = registerer.Register(agentStatsConnectionCountGauge)
378384
if err != nil {
379385
return nil, err
@@ -384,7 +390,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
384390
Subsystem: "agentstats",
385391
Name: "connection_median_latency_seconds",
386392
Help: "The median agent connection latency in seconds",
387-
}, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel}))
393+
}, aggregateByLabels))
388394
err = registerer.Register(agentStatsConnectionMedianLatencyGauge)
389395
if err != nil {
390396
return nil, err
@@ -395,7 +401,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
395401
Subsystem: "agentstats",
396402
Name: "session_count_jetbrains",
397403
Help: "The number of session established by JetBrains",
398-
}, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel}))
404+
}, aggregateByLabels))
399405
err = registerer.Register(agentStatsSessionCountJetBrainsGauge)
400406
if err != nil {
401407
return nil, err
@@ -406,7 +412,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
406412
Subsystem: "agentstats",
407413
Name: "session_count_reconnecting_pty",
408414
Help: "The number of session established by reconnecting PTY",
409-
}, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel}))
415+
}, aggregateByLabels))
410416
err = registerer.Register(agentStatsSessionCountReconnectingPTYGauge)
411417
if err != nil {
412418
return nil, err
@@ -417,7 +423,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
417423
Subsystem: "agentstats",
418424
Name: "session_count_ssh",
419425
Help: "The number of session established by SSH",
420-
}, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel}))
426+
}, aggregateByLabels))
421427
err = registerer.Register(agentStatsSessionCountSSHGauge)
422428
if err != nil {
423429
return nil, err
@@ -428,7 +434,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
428434
Subsystem: "agentstats",
429435
Name: "session_count_vscode",
430436
Help: "The number of session established by VSCode",
431-
}, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel}))
437+
}, aggregateByLabels))
432438
err = registerer.Register(agentStatsSessionCountVSCodeGauge)
433439
if err != nil {
434440
return nil, err
@@ -460,16 +466,28 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
460466
logger.Error(ctx, "can't get agent stats", slog.Error(err))
461467
} else {
462468
for _, agentStat := range stats {
463-
agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.RxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
464-
agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.TxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
469+
var labelValues []string
470+
for _, label := range aggregateByLabels {
471+
switch label {
472+
case agentmetrics.UsernameLabel:
473+
labelValues = append(labelValues, agentStat.Username)
474+
case agentmetrics.WorkspaceNameLabel:
475+
labelValues = append(labelValues, agentStat.WorkspaceName)
476+
case agentmetrics.AgentNameLabel:
477+
labelValues = append(labelValues, agentStat.AgentName)
478+
}
479+
}
480+
481+
agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.RxBytes), labelValues...)
482+
agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.TxBytes), labelValues...)
465483

466-
agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
467-
agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
484+
agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), labelValues...)
485+
agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, labelValues...)
468486

469-
agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
470-
agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
471-
agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
472-
agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
487+
agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), labelValues...)
488+
agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), labelValues...)
489+
agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), labelValues...)
490+
agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), labelValues...)
473491
}
474492

475493
if len(stats) > 0 {

coderd/prometheusmetrics/prometheusmetrics_test.go

+1-1
Original file line number | Diff line number | Diff line change
@@ -451,7 +451,7 @@ func TestAgentStats(t *testing.T) {
451451
// and it doesn't depend on the real time.
452452
closeFunc, err := prometheusmetrics.AgentStats(ctx, slogtest.Make(t, &slogtest.Options{
453453
IgnoreErrors: true,
454-
}), registry, db, time.Now().Add(-time.Minute), time.Millisecond)
454+
}), registry, db, time.Now().Add(-time.Minute), time.Millisecond, nil)
455455
require.NoError(t, err)
456456
t.Cleanup(closeFunc)
457457

0 commit comments

Comments
 (0)