Skip to content

feat: make agent stats' cardinality configurable #12468

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Mar 11, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
1172e09
Initial implementation
dannykopping Mar 7, 2024
ddd563e
Refactoring
dannykopping Mar 8, 2024
6a1ab6e
Drive-by change to add delve to nix flake so ./scripts/develop.sh --d…
dannykopping Mar 8, 2024
ce0c22d
Update deployment config
dannykopping Mar 8, 2024
25fd616
Linting
dannykopping Mar 8, 2024
cc1a0b0
Control cardinality of coderd metrics as well
dannykopping Mar 8, 2024
122f68d
Remove default value
dannykopping Mar 8, 2024
3e569ff
make fmt && make gen
dannykopping Mar 8, 2024
62e2624
Appeasing CI
dannykopping Mar 8, 2024
6544d2d
Appeasing CI, again...
dannykopping Mar 8, 2024
5e89d05
...
dannykopping Mar 8, 2024
ae8a912
Fix loop var capture
dannykopping Mar 8, 2024
023f7d4
Review comments & CI appeasement
dannykopping Mar 11, 2024
9aedd97
Merge branch 'dk/configurable-cardinality' of github.com:dannykopping…
dannykopping Mar 11, 2024
92be1d6
More review comments & CI appeasement
dannykopping Mar 11, 2024
9b16a3b
Fixing lints
dannykopping Mar 11, 2024
6c7d1bd
Merge branch 'main' of github.com:/coder/coder into dk/configurable-c…
dannykopping Mar 11, 2024
3538e78
chore(dogfood): update keys
johnstcn Mar 11, 2024
5a97817
Merge remote-tracking branch 'upstream/cj/dogfood-update-keys' into d…
dannykopping Mar 11, 2024
c861500
Merge branch 'main' of github.com:/coder/coder into dk/configurable-c…
dannykopping Mar 11, 2024
6bcfe99
Merge branch 'main' of github.com:/coder/coder into dk/configurable-c…
dannykopping Mar 11, 2024
765fe9d
Fix flaky label evaluation by sorting both sides
dannykopping Mar 11, 2024
37c3628
Merge branch 'main' of github.com:/coder/coder into dk/configurable-c…
dannykopping Mar 11, 2024
f1d2821
Update node key
dannykopping Mar 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Refactoring
Signed-off-by: Danny Kopping <danny@coder.com>
  • Loading branch information
dannykopping committed Mar 8, 2024
commit ddd563eab50ce6c893d36ba981a2e337b07b71a6
69 changes: 38 additions & 31 deletions coderd/prometheusmetrics/aggregator.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/coder/coder/v2/coderd/agentmetrics"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
"golang.org/x/xerrors"

"cdr.dev/slog"
Expand Down Expand Up @@ -107,8 +108,9 @@ var _ prometheus.Collector = new(MetricsAggregator)

func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) {
var (
baseLabelNames []string = am.aggregateByLabels
baseLabelNames = am.aggregateByLabels
baseLabelValues []string
extraLabels = am.Labels
)

for _, label := range am.aggregateByLabels {
Expand All @@ -120,19 +122,17 @@ func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) {
baseLabelValues = append(baseLabelValues, val)
}

labels := make([]string, 0, len(baseLabelNames)+len(am.Labels))
labelValues := make([]string, 0, len(baseLabelNames)+len(am.Labels))
labels := make([]string, 0, len(baseLabelNames)+len(extraLabels))
labelValues := make([]string, 0, len(baseLabelNames)+len(extraLabels))

labels = append(labels, baseLabelNames...)
labelValues = append(labelValues, baseLabelValues...)

for _, l := range am.Labels {
for _, l := range extraLabels {
labels = append(labels, l.Name)
labelValues = append(labelValues, l.Value)
}

//fmt.Printf(">>>>[%s] [%s] %s [%q] [%q]: %v\n", time.Now().Format(time.RFC3339Nano), am.Type, am.Name, labels, labelValues, am.Value)

desc := prometheus.NewDesc(am.Name, metricHelpForAgent, labels, nil)
valueType, err := asPrometheusValueType(am.Type)
if err != nil {
Expand Down Expand Up @@ -237,50 +237,56 @@ func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer,
}, nil
}

type MetricAggregator struct {
// labelAggregator is used to control cardinality of collected Prometheus metrics by pre-aggregating series based on given labels.
type labelAggregator struct {
aggregations map[string]float64
metrics map[string]annotatedMetric
}

func NewMetricAggregator(size int) *MetricAggregator {
return &MetricAggregator{
func newLabelAggregator(size int) *labelAggregator {
return &labelAggregator{
aggregations: make(map[string]float64, size),
metrics: make(map[string]annotatedMetric, size),
}
}

func (a *MetricAggregator) Aggregate(am annotatedMetric, labels []string) error {
// if we already have an entry for this key, don't clone this am afresh - rather use the existing one
// this will be a bit more memory efficient
// ...do this after unit-test is written

clone := am.clone()

fields := make(map[string]string, len(labels))
func (a *labelAggregator) aggregate(am annotatedMetric, labels []string) error {
// Use a LabelSet because it can give deterministic fingerprints of label combinations regardless of map ordering.
labelSet := make(model.LabelSet, len(labels))
labelValues := make([]string, 0, len(labels))

for _, label := range labels {
val, err := clone.getFieldByLabel(label)
val, err := am.getFieldByLabel(label)
if err != nil {
return err
}

fields[label] = val
labelSet[model.LabelName(label)] = model.LabelValue(val)
labelValues = append(labelValues, val)
}

key := fmt.Sprintf("%s:%v", clone.Stats_Metric.Name, fields)
// Memoize based on the metric name & the unique combination of labels.
key := fmt.Sprintf("%s:%v", am.Stats_Metric.Name, labelSet.FastFingerprint())

// Aggregate the value based on the key.
a.aggregations[key] += am.Value

metric, found := a.metrics[key]
if !found {
// Take a copy of the given annotatedMetric because it may be manipulated later and contains pointers.
metric = am.clone()
}

clone.aggregateByLabels = labels
a.aggregations[key] += clone.Value
// Store the metric.
metric.aggregateByLabels = labels
metric.Value = a.aggregations[key]

clone.Value = a.aggregations[key]
a.metrics[key] = clone
a.metrics[key] = metric

return nil
}

func (a *MetricAggregator) asMetrics() (out []annotatedMetric) {
func (a *labelAggregator) toMetrics() (out []annotatedMetric) {
for _, am := range a.metrics {
out = append(out, am)
}
Expand Down Expand Up @@ -331,24 +337,25 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() {

// If custom aggregation labels have not been chosen, generate Prometheus metrics without any pre-aggregation.
// This results in higher cardinality, but may be desirable in larger deployments.
// Default behaviour.
if len(ma.aggregateByLabels) == 0 {
for _, m := range ma.store {
// Aggregate by high cardinality labels.
m.aggregateByLabels = agentMetricsLabels
// Aggregate by all available metrics.
m.aggregateByLabels = defaultAgentMetricsLabels
input = append(input, m)
}
} else {
// However, if custom aggregations have been chosen, we need to aggregate the values from the annotated
// metrics because we cannot register multiple metric series with the same labels.
aggregator := NewMetricAggregator(len(ma.store) * len(ma.aggregateByLabels))
la := newLabelAggregator(len(ma.store))

for _, m := range ma.store {
if err := aggregator.Aggregate(m, ma.aggregateByLabels); err != nil {
if err := la.aggregate(m, ma.aggregateByLabels); err != nil {
ma.log.Error(ctx, "can't aggregate labels", slog.F("labels", strings.Join(ma.aggregateByLabels, ",")), slog.Error(err))
}
}

input = aggregator.asMetrics()
input = la.toMetrics()
}

for _, m := range input {
Expand Down Expand Up @@ -395,7 +402,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() {
func (*MetricsAggregator) Describe(_ chan<- *prometheus.Desc) {
}

var agentMetricsLabels = []string{agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, agentmetrics.AgentNameLabel, agentmetrics.TemplateNameLabel}
var defaultAgentMetricsLabels = []string{agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, agentmetrics.AgentNameLabel, agentmetrics.TemplateNameLabel}

// AgentMetricLabels are the labels used to decorate an agent's metrics.
// This list should match the list of labels in agentMetricsLabels.
Expand Down
94 changes: 84 additions & 10 deletions coderd/prometheusmetrics/aggregator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,13 @@ import (
"testing"
"time"

"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/coder/v2/coderd/agentmetrics"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"cdr.dev/slog/sloggers/slogtest"

agentproto "github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/prometheusmetrics"
"github.com/coder/coder/v2/cryptorand"
Expand Down Expand Up @@ -324,23 +323,19 @@ func TestLabelsAggregation(t *testing.T) {
{
labels: testLabels,
metrics: []*agentproto.Stats_Metric{
{Name: "no_extra_labels", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
},
},
{
labels: testLabels,
metrics: []*agentproto.Stats_Metric{
{Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: []*agentproto.Stats_Metric_Label{
{Name: "lizz", Value: "rizz"},
}},
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4},
},
},
},
expected: []*agentproto.Stats_Metric{
{Name: "no_extra_labels", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels},
{Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: append([]*agentproto.Stats_Metric_Label{
{Name: "lizz", Value: "rizz"},
}, commonLabels...)},
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels},
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4, Labels: commonLabels},
},
},
{
Expand Down Expand Up @@ -484,6 +479,85 @@ func TestLabelsAggregation(t *testing.T) {
}},
},
},
{
name: "extra labels are retained, even with label aggregations",
aggregateOn: []string{agentmetrics.UsernameLabel},
given: []statCollection{
{
labels: testLabels,
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
},
},
{
labels: testLabels,
metrics: []*agentproto.Stats_Metric{
{Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: []*agentproto.Stats_Metric_Label{
{Name: "lizz", Value: "rizz"},
}},
},
},
},
expected: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: []*agentproto.Stats_Metric_Label{
{Name: agentmetrics.UsernameLabel, Value: testUsername},
}},
{Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: []*agentproto.Stats_Metric_Label{
{Name: "lizz", Value: "rizz"},
{Name: agentmetrics.UsernameLabel, Value: testUsername},
}},
},
},
{
// Both counters and gauges should have all their values summed to produce the correct output.
name: "counters & gauges behave identically",
aggregateOn: []string{agentmetrics.TemplateNameLabel},
given: []statCollection{
{
labels: prometheusmetrics.AgentMetricLabels{
Username: "username1",
TemplateName: "template1",
},
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 3},
},
},
{
labels: prometheusmetrics.AgentMetricLabels{
Username: "username2",
TemplateName: "template1",
},
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 2},
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4},
},
},
},
expected: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 3, Labels: []*agentproto.Stats_Metric_Label{
{Name: agentmetrics.TemplateNameLabel, Value: "template1"},
}},
{Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 7, Labels: []*agentproto.Stats_Metric_Label{
{Name: agentmetrics.TemplateNameLabel, Value: "template1"},
}},
},
},
{
// Scenario: validation fails and an invalid label is selected for aggregation.
name: "invalid label aggregation",
aggregateOn: []string{"nonsense"},
given: []statCollection{
{
labels: testLabels,
metrics: []*agentproto.Stats_Metric{
{Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
},
},
},
// Nothing will be returned.
expected: []*agentproto.Stats_Metric{},
},
}

for _, tc := range tests {
Expand Down