Skip to content

Commit b6e86a2

Browse files
committed
Refactoring
Signed-off-by: Danny Kopping <danny@coder.com>
1 parent fcbffe0 commit b6e86a2

File tree

2 files changed

+122
-41
lines changed

2 files changed

+122
-41
lines changed

coderd/prometheusmetrics/aggregator.go

+38-31
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99

1010
"github.com/coder/coder/v2/coderd/agentmetrics"
1111
"github.com/prometheus/client_golang/prometheus"
12+
"github.com/prometheus/common/model"
1213
"golang.org/x/xerrors"
1314

1415
"cdr.dev/slog"
@@ -107,8 +108,9 @@ var _ prometheus.Collector = new(MetricsAggregator)
107108

108109
func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) {
109110
var (
110-
baseLabelNames []string = am.aggregateByLabels
111+
baseLabelNames = am.aggregateByLabels
111112
baseLabelValues []string
113+
extraLabels = am.Labels
112114
)
113115

114116
for _, label := range am.aggregateByLabels {
@@ -120,19 +122,17 @@ func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) {
120122
baseLabelValues = append(baseLabelValues, val)
121123
}
122124

123-
labels := make([]string, 0, len(baseLabelNames)+len(am.Labels))
124-
labelValues := make([]string, 0, len(baseLabelNames)+len(am.Labels))
125+
labels := make([]string, 0, len(baseLabelNames)+len(extraLabels))
126+
labelValues := make([]string, 0, len(baseLabelNames)+len(extraLabels))
125127

126128
labels = append(labels, baseLabelNames...)
127129
labelValues = append(labelValues, baseLabelValues...)
128130

129-
for _, l := range am.Labels {
131+
for _, l := range extraLabels {
130132
labels = append(labels, l.Name)
131133
labelValues = append(labelValues, l.Value)
132134
}
133135

134-
//fmt.Printf(">>>>[%s] [%s] %s [%q] [%q]: %v\n", time.Now().Format(time.RFC3339Nano), am.Type, am.Name, labels, labelValues, am.Value)
135-
136136
desc := prometheus.NewDesc(am.Name, metricHelpForAgent, labels, nil)
137137
valueType, err := asPrometheusValueType(am.Type)
138138
if err != nil {
@@ -237,50 +237,56 @@ func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer,
237237
}, nil
238238
}
239239

240-
type MetricAggregator struct {
240+
// labelAggregator is used to control cardinality of collected Prometheus metrics by pre-aggregating series based on given labels.
241+
type labelAggregator struct {
241242
aggregations map[string]float64
242243
metrics map[string]annotatedMetric
243244
}
244245

245-
func NewMetricAggregator(size int) *MetricAggregator {
246-
return &MetricAggregator{
246+
func newLabelAggregator(size int) *labelAggregator {
247+
return &labelAggregator{
247248
aggregations: make(map[string]float64, size),
248249
metrics: make(map[string]annotatedMetric, size),
249250
}
250251
}
251252

252-
func (a *MetricAggregator) Aggregate(am annotatedMetric, labels []string) error {
253-
// if we already have an entry for this key, don't clone this am afresh - rather use the existing one
254-
// this will be a bit more memory efficient
255-
// ...do this after unit-test is written
256-
257-
clone := am.clone()
258-
259-
fields := make(map[string]string, len(labels))
253+
func (a *labelAggregator) aggregate(am annotatedMetric, labels []string) error {
254+
// Use a LabelSet because it can give deterministic fingerprints of label combinations regardless of map ordering.
255+
labelSet := make(model.LabelSet, len(labels))
260256
labelValues := make([]string, 0, len(labels))
261257

262258
for _, label := range labels {
263-
val, err := clone.getFieldByLabel(label)
259+
val, err := am.getFieldByLabel(label)
264260
if err != nil {
265261
return err
266262
}
267263

268-
fields[label] = val
264+
labelSet[model.LabelName(label)] = model.LabelValue(val)
269265
labelValues = append(labelValues, val)
270266
}
271267

272-
key := fmt.Sprintf("%s:%v", clone.Stats_Metric.Name, fields)
268+
// Memoize based on the metric name & the unique combination of labels.
269+
key := fmt.Sprintf("%s:%v", am.Stats_Metric.Name, labelSet.FastFingerprint())
270+
271+
// Aggregate the value based on the key.
272+
a.aggregations[key] += am.Value
273+
274+
metric, found := a.metrics[key]
275+
if !found {
276+
// Take a copy of the given annotatedMetric because it may be manipulated later and contains pointers.
277+
metric = am.clone()
278+
}
273279

274-
clone.aggregateByLabels = labels
275-
a.aggregations[key] += clone.Value
280+
// Store the metric.
281+
metric.aggregateByLabels = labels
282+
metric.Value = a.aggregations[key]
276283

277-
clone.Value = a.aggregations[key]
278-
a.metrics[key] = clone
284+
a.metrics[key] = metric
279285

280286
return nil
281287
}
282288

283-
func (a *MetricAggregator) asMetrics() (out []annotatedMetric) {
289+
func (a *labelAggregator) toMetrics() (out []annotatedMetric) {
284290
for _, am := range a.metrics {
285291
out = append(out, am)
286292
}
@@ -331,24 +337,25 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() {
331337

332338
// If custom aggregation labels have not been chosen, generate Prometheus metrics without any pre-aggregation.
333339
// This results in higher cardinality, but may be desirable in larger deployments.
340+
// Default behaviour.
334341
if len(ma.aggregateByLabels) == 0 {
335342
for _, m := range ma.store {
336-
// Aggregate by high cardinality labels.
337-
m.aggregateByLabels = agentMetricsLabels
343+
// Aggregate by all available metrics.
344+
m.aggregateByLabels = defaultAgentMetricsLabels
338345
input = append(input, m)
339346
}
340347
} else {
341348
// However, if custom aggregations have been chosen, we need to aggregate the values from the annotated
342349
// metrics because we cannot register multiple metric series with the same labels.
343-
aggregator := NewMetricAggregator(len(ma.store) * len(ma.aggregateByLabels))
350+
la := newLabelAggregator(len(ma.store))
344351

345352
for _, m := range ma.store {
346-
if err := aggregator.Aggregate(m, ma.aggregateByLabels); err != nil {
353+
if err := la.aggregate(m, ma.aggregateByLabels); err != nil {
347354
ma.log.Error(ctx, "can't aggregate labels", slog.F("labels", strings.Join(ma.aggregateByLabels, ",")), slog.Error(err))
348355
}
349356
}
350357

351-
input = aggregator.asMetrics()
358+
input = la.toMetrics()
352359
}
353360

354361
for _, m := range input {
@@ -395,7 +402,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() {
395402
func (*MetricsAggregator) Describe(_ chan<- *prometheus.Desc) {
396403
}
397404

398-
var agentMetricsLabels = []string{agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, agentmetrics.AgentNameLabel, agentmetrics.TemplateNameLabel}
405+
var defaultAgentMetricsLabels = []string{agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, agentmetrics.AgentNameLabel, agentmetrics.TemplateNameLabel}
399406

400407
// AgentMetricLabels are the labels used to decorate an agent's metrics.
401408
// This list should match the list of labels in agentMetricsLabels.

coderd/prometheusmetrics/aggregator_test.go

+84-10
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,13 @@ import (
88
"testing"
99
"time"
1010

11+
"cdr.dev/slog/sloggers/slogtest"
1112
"github.com/coder/coder/v2/coderd/agentmetrics"
1213
"github.com/prometheus/client_golang/prometheus"
1314
dto "github.com/prometheus/client_model/go"
1415
"github.com/stretchr/testify/assert"
1516
"github.com/stretchr/testify/require"
1617

17-
"cdr.dev/slog/sloggers/slogtest"
18-
1918
agentproto "github.com/coder/coder/v2/agent/proto"
2019
"github.com/coder/coder/v2/coderd/prometheusmetrics"
2120
"github.com/coder/coder/v2/cryptorand"
@@ -324,23 +323,19 @@ func TestLabelsAggregation(t *testing.T) {
324323
{
325324
labels: testLabels,
326325
metrics: []*agentproto.Stats_Metric{
327-
{Name: "no_extra_labels", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
326+
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
328327
},
329328
},
330329
{
331330
labels: testLabels,
332331
metrics: []*agentproto.Stats_Metric{
333-
{Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: []*agentproto.Stats_Metric_Label{
334-
{Name: "lizz", Value: "rizz"},
335-
}},
332+
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4},
336333
},
337334
},
338335
},
339336
expected: []*agentproto.Stats_Metric{
340-
{Name: "no_extra_labels", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels},
341-
{Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: append([]*agentproto.Stats_Metric_Label{
342-
{Name: "lizz", Value: "rizz"},
343-
}, commonLabels...)},
337+
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels},
338+
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4, Labels: commonLabels},
344339
},
345340
},
346341
{
@@ -484,6 +479,85 @@ func TestLabelsAggregation(t *testing.T) {
484479
}},
485480
},
486481
},
482+
{
483+
name: "extra labels are retained, even with label aggregations",
484+
aggregateOn: []string{agentmetrics.UsernameLabel},
485+
given: []statCollection{
486+
{
487+
labels: testLabels,
488+
metrics: []*agentproto.Stats_Metric{
489+
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
490+
},
491+
},
492+
{
493+
labels: testLabels,
494+
metrics: []*agentproto.Stats_Metric{
495+
{Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: []*agentproto.Stats_Metric_Label{
496+
{Name: "lizz", Value: "rizz"},
497+
}},
498+
},
499+
},
500+
},
501+
expected: []*agentproto.Stats_Metric{
502+
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: []*agentproto.Stats_Metric_Label{
503+
{Name: agentmetrics.UsernameLabel, Value: testUsername},
504+
}},
505+
{Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: []*agentproto.Stats_Metric_Label{
506+
{Name: "lizz", Value: "rizz"},
507+
{Name: agentmetrics.UsernameLabel, Value: testUsername},
508+
}},
509+
},
510+
},
511+
{
512+
// Both counters and gauges should have all their values summed to produce the correct output.
513+
name: "counters & gauges behave identically",
514+
aggregateOn: []string{agentmetrics.TemplateNameLabel},
515+
given: []statCollection{
516+
{
517+
labels: prometheusmetrics.AgentMetricLabels{
518+
Username: "username1",
519+
TemplateName: "template1",
520+
},
521+
metrics: []*agentproto.Stats_Metric{
522+
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
523+
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 3},
524+
},
525+
},
526+
{
527+
labels: prometheusmetrics.AgentMetricLabels{
528+
Username: "username2",
529+
TemplateName: "template1",
530+
},
531+
metrics: []*agentproto.Stats_Metric{
532+
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 2},
533+
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4},
534+
},
535+
},
536+
},
537+
expected: []*agentproto.Stats_Metric{
538+
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 3, Labels: []*agentproto.Stats_Metric_Label{
539+
{Name: agentmetrics.TemplateNameLabel, Value: "template1"},
540+
}},
541+
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 7, Labels: []*agentproto.Stats_Metric_Label{
542+
{Name: agentmetrics.TemplateNameLabel, Value: "template1"},
543+
}},
544+
},
545+
},
546+
{
547+
// Scenario: validation fails and an invalid label is selected for aggregation.
548+
name: "invalid label aggregation",
549+
aggregateOn: []string{"nonsense"},
550+
given: []statCollection{
551+
{
552+
labels: testLabels,
553+
metrics: []*agentproto.Stats_Metric{
554+
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
555+
},
556+
},
557+
},
558+
// Nothing will be returned.
559+
expected: []*agentproto.Stats_Metric{},
560+
},
487561
}
488562

489563
for _, tc := range tests {

0 commit comments

Comments
 (0)