From 975caa55e5c92e708bc17e6aee00406dbc0e2682 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Thu, 7 Mar 2024 15:26:18 +0200 Subject: [PATCH 01/19] Initial implementation Signed-off-by: Danny Kopping --- cli/server.go | 2 +- coderd/agentmetrics/labels.go | 8 + coderd/prometheusmetrics/aggregator.go | 155 ++++++++- coderd/prometheusmetrics/aggregator_test.go | 293 ++++++++++++++++-- coderd/prometheusmetrics/prometheusmetrics.go | 32 +- codersdk/deployment.go | 45 ++- 6 files changed, 470 insertions(+), 65 deletions(-) create mode 100644 coderd/agentmetrics/labels.go diff --git a/cli/server.go b/cli/server.go index 6788639193fdd..0134cdf4b7df2 100644 --- a/cli/server.go +++ b/cli/server.go @@ -235,7 +235,7 @@ func enablePrometheus( } afterCtx(ctx, closeAgentStatsFunc) - metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(logger, options.PrometheusRegistry, 0) + metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(logger, options.PrometheusRegistry, 0, options.DeploymentValues.Prometheus.AggregateAgentStatsBy.Value()) if err != nil { return nil, xerrors.Errorf("can't initialize metrics aggregator: %w", err) } diff --git a/coderd/agentmetrics/labels.go b/coderd/agentmetrics/labels.go new file mode 100644 index 0000000000000..90ebcad6f963e --- /dev/null +++ b/coderd/agentmetrics/labels.go @@ -0,0 +1,8 @@ +package agentmetrics + +const ( + TemplateNameLabel = "template_name" + AgentNameLabel = "agent_name" + UsernameLabel = "username" + WorkspaceNameLabel = "workspace_name" +) diff --git a/coderd/prometheusmetrics/aggregator.go b/coderd/prometheusmetrics/aggregator.go index 40ad6c7b2f621..d7798c6dde538 100644 --- a/coderd/prometheusmetrics/aggregator.go +++ b/coderd/prometheusmetrics/aggregator.go @@ -7,6 +7,7 @@ import ( "strings" "time" + "github.com/coder/coder/v2/coderd/agentmetrics" "github.com/prometheus/client_golang/prometheus" "golang.org/x/xerrors" @@ -43,9 +44,10 @@ type MetricsAggregator struct { collectCh chan (chan []prometheus.Metric) updateCh chan updateRequest - storeSizeGauge prometheus.Gauge - updateHistogram prometheus.Histogram - cleanupHistogram prometheus.Histogram + storeSizeGauge prometheus.Gauge + updateHistogram prometheus.Histogram + cleanupHistogram prometheus.Histogram + aggregateByLabels []string } type updateRequest struct { @@ -68,6 +70,8 @@ type annotatedMetric struct { templateName string expiryDate time.Time + + aggregateByLabels []string } type metricKey struct { @@ -102,26 +106,80 @@ func hashKey(req *updateRequest, m *agentproto.Stats_Metric) metricKey { var _ prometheus.Collector = new(MetricsAggregator) func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) { - labels := make([]string, 0, len(agentMetricsLabels)+len(am.Labels)) - labelValues := make([]string, 0, len(agentMetricsLabels)+len(am.Labels)) + var ( + baseLabelNames []string = am.aggregateByLabels + baseLabelValues []string + ) + + for _, label := range am.aggregateByLabels { + val, err := am.getFieldByLabel(label) + if err != nil { + return nil, err + } + + baseLabelValues = append(baseLabelValues, val) + } + + labels := make([]string, 0, len(baseLabelNames)+len(am.Labels)) + labelValues := make([]string, 0, len(baseLabelNames)+len(am.Labels)) - labels = append(labels, agentMetricsLabels...) - labelValues = append(labelValues, am.username, am.workspaceName, am.agentName, am.templateName) + labels = append(labels, baseLabelNames...) + labelValues = append(labelValues, baseLabelValues...) for _, l := range am.Labels { labels = append(labels, l.Name) labelValues = append(labelValues, l.Value) } + //fmt.Printf(">>>>[%s] [%s] %s [%q] [%q]: %v\n", time.Now().Format(time.RFC3339Nano), am.Type, am.Name, labels, labelValues, am.Value) + desc := prometheus.NewDesc(am.Name, metricHelpForAgent, labels, nil) valueType, err := asPrometheusValueType(am.Type) if err != nil { return nil, err } + return prometheus.MustNewConstMetric(desc, valueType, am.Value, labelValues...), nil } -func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, duration time.Duration) (*MetricsAggregator, error) { +// getFieldByLabel returns the related field value for a given label +func (am *annotatedMetric) getFieldByLabel(label string) (string, error) { + var labelVal string + switch label { + case agentmetrics.WorkspaceNameLabel: + labelVal = am.workspaceName + case agentmetrics.TemplateNameLabel: + labelVal = am.templateName + case agentmetrics.AgentNameLabel: + labelVal = am.agentName + case agentmetrics.UsernameLabel: + labelVal = am.username + default: + return "", xerrors.Errorf("unexpected label: %q", label) + } + + return labelVal, nil +} + +func (am *annotatedMetric) clone() annotatedMetric { + stats := &agentproto.Stats_Metric{ + Name: am.Name, + Type: am.Type, + Value: am.Value, + Labels: am.Labels, + } + + return annotatedMetric{ + Stats_Metric: stats, + username: am.username, + workspaceName: am.workspaceName, + agentName: am.agentName, + templateName: am.templateName, + expiryDate: am.expiryDate, + } +} + +func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, duration time.Duration, aggregateByLabels []string) (*MetricsAggregator, error) { metricsCleanupInterval := defaultMetricsCleanupInterval if duration > 0 { metricsCleanupInterval = duration @@ -174,9 +232,61 @@ func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, storeSizeGauge: storeSizeGauge, updateHistogram: updateHistogram, cleanupHistogram: cleanupHistogram, + + aggregateByLabels: aggregateByLabels, }, nil } +type MetricAggregator struct { + aggregations map[string]float64 + metrics map[string]annotatedMetric +} + +func NewMetricAggregator(size int) *MetricAggregator { + return &MetricAggregator{ + aggregations: make(map[string]float64, size), + metrics: make(map[string]annotatedMetric, size), + } +} + +func (a *MetricAggregator) Aggregate(am annotatedMetric, labels []string) error { + // if we already have an entry for this key, don't clone this am afresh - rather use the existing one + // this will be a bit more memory efficient + // ...do this after unit-test is written + + clone := am.clone() + + fields := make(map[string]string, len(labels)) + labelValues := make([]string, 0, len(labels)) + + for _, label := range labels { + val, err := clone.getFieldByLabel(label) + if err != nil { + return err + } + + fields[label] = val + labelValues = append(labelValues, val) + } + + key := fmt.Sprintf("%s:%v", clone.Stats_Metric.Name, fields) + + clone.aggregateByLabels = labels + a.aggregations[key] += clone.Value + + clone.Value = a.aggregations[key] + a.metrics[key] = clone + + return nil +} + +func (a *MetricAggregator) asMetrics() (out []annotatedMetric) { + for _, am := range a.metrics { + out = append(out, am) + } + return +} + func (ma *MetricsAggregator) Run(ctx context.Context) func() { ctx, cancelFunc := context.WithCancel(ctx) done := make(chan struct{}) @@ -216,8 +326,32 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { case outputCh := <-ma.collectCh: ma.log.Debug(ctx, "collect metrics") + var input []annotatedMetric output := make([]prometheus.Metric, 0, len(ma.store)) - for _, m := range ma.store { + + // If custom aggregation labels have not been chosen, generate Prometheus metrics without any pre-aggregation. + // This results in higher cardinality, but may be desirable in larger deployments. + if len(ma.aggregateByLabels) == 0 { + for _, m := range ma.store { + // Aggregate by high cardinality labels. + m.aggregateByLabels = agentMetricsLabels + input = append(input, m) + } + } else { + // However, if custom aggregations have been chosen, we need to aggregate the values from the annotated + // metrics because we cannot register multiple metric series with the same labels. + aggregator := NewMetricAggregator(len(ma.store) * len(ma.aggregateByLabels)) + + for _, m := range ma.store { + if err := aggregator.Aggregate(m, ma.aggregateByLabels); err != nil { + ma.log.Error(ctx, "can't aggregate labels", slog.F("labels", strings.Join(ma.aggregateByLabels, ",")), slog.Error(err)) + } + } + + input = aggregator.asMetrics() + } + + for _, m := range input { promMetric, err := m.asPrometheus() if err != nil { ma.log.Error(ctx, "can't convert Prometheus value type", slog.F("name", m.Name), slog.F("type", m.Type), slog.F("value", m.Value), slog.Error(err)) @@ -225,6 +359,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { } output = append(output, promMetric) } + outputCh <- output close(outputCh) case <-cleanupTicker.C: @@ -260,7 +395,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { func (*MetricsAggregator) Describe(_ chan<- *prometheus.Desc) { } -var agentMetricsLabels = []string{usernameLabel, workspaceNameLabel, agentNameLabel, templateNameLabel} +var agentMetricsLabels = []string{agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, agentmetrics.AgentNameLabel, agentmetrics.TemplateNameLabel} // AgentMetricLabels are the labels used to decorate an agent's metrics. // This list should match the list of labels in agentMetricsLabels. diff --git a/coderd/prometheusmetrics/aggregator_test.go b/coderd/prometheusmetrics/aggregator_test.go index bc17dc9be7a11..27a85280d5db8 100644 --- a/coderd/prometheusmetrics/aggregator_test.go +++ b/coderd/prometheusmetrics/aggregator_test.go @@ -8,6 +8,7 @@ import ( "testing" "time" + "github.com/coder/coder/v2/coderd/agentmetrics" "github.com/prometheus/client_golang/prometheus" dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" @@ -40,7 +41,7 @@ func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) { // given registry := prometheus.NewRegistry() - metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour) // time.Hour, so metrics won't expire + metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour, nil) // time.Hour, so metrics won't expire require.NoError(t, err) ctx, cancelFunc := context.WithCancel(context.Background()) @@ -93,54 +94,54 @@ func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) { } commonLabels := []*agentproto.Stats_Metric_Label{ - {Name: "agent_name", Value: testAgentName}, - {Name: "username", Value: testUsername}, - {Name: "workspace_name", Value: testWorkspaceName}, - {Name: "template_name", Value: testTemplateName}, + {Name: agentmetrics.AgentNameLabel, Value: testAgentName}, + {Name: agentmetrics.UsernameLabel, Value: testUsername}, + {Name: agentmetrics.WorkspaceNameLabel, Value: testWorkspaceName}, + {Name: agentmetrics.TemplateNameLabel, Value: testTemplateName}, } expected := []*agentproto.Stats_Metric{ {Name: "a_counter_one", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels}, {Name: "b_counter_two", Type: agentproto.Stats_Metric_COUNTER, Value: -9, Labels: []*agentproto.Stats_Metric_Label{ - {Name: "agent_name", Value: testAgentName}, + {Name: agentmetrics.AgentNameLabel, Value: testAgentName}, {Name: "lizz", Value: "rizz"}, - {Name: "username", Value: testUsername}, - {Name: "workspace_name", Value: testWorkspaceName}, - {Name: "template_name", Value: testTemplateName}, + {Name: agentmetrics.UsernameLabel, Value: testUsername}, + {Name: agentmetrics.WorkspaceNameLabel, Value: testWorkspaceName}, + {Name: agentmetrics.TemplateNameLabel, Value: testTemplateName}, }}, {Name: "b_counter_two", Type: agentproto.Stats_Metric_COUNTER, Value: 4, Labels: commonLabels}, {Name: "c_gauge_three", Type: agentproto.Stats_Metric_GAUGE, Value: 2, Labels: []*agentproto.Stats_Metric_Label{ - {Name: "agent_name", Value: testAgentName}, + {Name: agentmetrics.AgentNameLabel, Value: testAgentName}, {Name: "foobar", Value: "Foobaz"}, {Name: "hello", Value: "world"}, - {Name: "username", Value: testUsername}, - {Name: "workspace_name", Value: testWorkspaceName}, - {Name: "template_name", Value: testTemplateName}, + {Name: agentmetrics.UsernameLabel, Value: testUsername}, + {Name: agentmetrics.WorkspaceNameLabel, Value: testWorkspaceName}, + {Name: agentmetrics.TemplateNameLabel, Value: testTemplateName}, }}, {Name: "c_gauge_three", Type: agentproto.Stats_Metric_GAUGE, Value: 5, Labels: commonLabels}, {Name: "d_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 6, Labels: commonLabels}, {Name: "e_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 17, Labels: []*agentproto.Stats_Metric_Label{ - {Name: "agent_name", Value: testAgentName}, + {Name: agentmetrics.AgentNameLabel, Value: testAgentName}, {Name: "cat", Value: "do,=g"}, {Name: "hello", Value: "wo,,rld"}, - {Name: "username", Value: testUsername}, - {Name: "workspace_name", Value: testWorkspaceName}, - {Name: "template_name", Value: testTemplateName}, + {Name: agentmetrics.UsernameLabel, Value: testUsername}, + {Name: agentmetrics.WorkspaceNameLabel, Value: testWorkspaceName}, + {Name: agentmetrics.TemplateNameLabel, Value: testTemplateName}, }}, {Name: "e_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 15, Labels: []*agentproto.Stats_Metric_Label{ - {Name: "agent_name", Value: testAgentName}, + {Name: agentmetrics.AgentNameLabel, Value: testAgentName}, {Name: "foobar", Value: "Foo,ba=z"}, {Name: "halo", Value: "wor\\,d=1,e=\\,2"}, {Name: "hello", Value: "wo,,r=d"}, - {Name: "username", Value: testUsername}, - {Name: "workspace_name", Value: testWorkspaceName}, - {Name: "template_name", Value: testTemplateName}, + {Name: agentmetrics.UsernameLabel, Value: testUsername}, + {Name: agentmetrics.WorkspaceNameLabel, Value: testWorkspaceName}, + {Name: agentmetrics.TemplateNameLabel, Value: testTemplateName}, }}, {Name: "f_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 8, Labels: []*agentproto.Stats_Metric_Label{ - {Name: "agent_name", Value: testAgentName}, + {Name: agentmetrics.AgentNameLabel, Value: testAgentName}, {Name: "foobar", Value: "foobaz"}, - {Name: "username", Value: testUsername}, - {Name: "workspace_name", Value: testWorkspaceName}, - {Name: "template_name", Value: testTemplateName}, + {Name: agentmetrics.UsernameLabel, Value: testUsername}, + {Name: agentmetrics.WorkspaceNameLabel, Value: testWorkspaceName}, + {Name: agentmetrics.TemplateNameLabel, Value: testTemplateName}, }}, } @@ -175,6 +176,11 @@ func verifyCollectedMetrics(t *testing.T, expected []*agentproto.Stats_Metric, a return false } + // ensure stable iteration order + sort.Slice(expected, func(i, j int) bool { + return expected[i].Name < expected[j].Name + }) + sort.Slice(actual, func(i, j int) bool { m1 := prometheusMetricToString(t, actual[i]) m2 := prometheusMetricToString(t, actual[j]) @@ -253,7 +259,7 @@ func TestUpdateMetrics_MetricsExpire(t *testing.T) { // given registry := prometheus.NewRegistry() - metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Millisecond) + metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Millisecond, nil) require.NoError(t, err) ctx, cancelFunc := context.WithCancel(context.Background()) @@ -291,6 +297,235 @@ func TestUpdateMetrics_MetricsExpire(t *testing.T) { }, testutil.WaitShort, testutil.IntervalFast) } +func TestLabelsAggregation(t *testing.T) { + t.Parallel() + + type statCollection struct { + labels prometheusmetrics.AgentMetricLabels + metrics []*agentproto.Stats_Metric + } + + commonLabels := []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.UsernameLabel, Value: testUsername}, + {Name: agentmetrics.AgentNameLabel, Value: testAgentName}, + {Name: agentmetrics.WorkspaceNameLabel, Value: testWorkspaceName}, + {Name: agentmetrics.TemplateNameLabel, Value: testTemplateName}, + } + + tests := []struct { + name string + given []statCollection + expected []*agentproto.Stats_Metric + aggregateOn []string + }{ + { + name: "label aggregations not specified, keep all (high cardinality, default behaviour)", + given: []statCollection{ + { + labels: testLabels, + metrics: []*agentproto.Stats_Metric{ + {Name: "no_extra_labels", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + }, + }, + { + labels: testLabels, + metrics: []*agentproto.Stats_Metric{ + {Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: []*agentproto.Stats_Metric_Label{ + {Name: "lizz", Value: "rizz"}, + }}, + }, + }, + }, + expected: []*agentproto.Stats_Metric{ + {Name: "no_extra_labels", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels}, + {Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: append([]*agentproto.Stats_Metric_Label{ + {Name: "lizz", Value: "rizz"}, + }, commonLabels...)}, + }, + }, + { + // Scenario: 2 users are using the same agent and we've configured the deployment to aggregate on the "agent_name" label. + name: "single label aggregation, aggregating to single metric", + aggregateOn: []string{agentmetrics.AgentNameLabel}, + given: []statCollection{ + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "user1", + AgentName: "agent1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + }, + }, + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "user2", + AgentName: "agent1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7}, + }, + }, + }, + expected: []*agentproto.Stats_Metric{ + // We only observed one agent_name value, so all metrics are aggregated to a single series. + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 8, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.AgentNameLabel, Value: "agent1"}, + }}, + }, + }, + { + // Scenario: as above, but we're aggregating on two invariant labels. + name: "multiple label aggregation, aggregating to single metric", + aggregateOn: []string{agentmetrics.AgentNameLabel, agentmetrics.TemplateNameLabel}, + given: []statCollection{ + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "user1", + AgentName: "agent1", + TemplateName: "template1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + }, + }, + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "user2", + AgentName: "agent1", + TemplateName: "template1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7}, + }, + }, + }, + expected: []*agentproto.Stats_Metric{ + // We only observed one agent_name & template_name tuple, so all metrics are aggregated to a single series. + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 8, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.AgentNameLabel, Value: "agent1"}, + {Name: agentmetrics.TemplateNameLabel, Value: "template1"}, + }}, + }, + }, + { + // Scenario: aggregating on a label which is unique across all metrics. + name: "single label aggregation, aggregating to multiple metrics", + aggregateOn: []string{agentmetrics.UsernameLabel}, + given: []statCollection{ + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "user1", + AgentName: "agent1", + TemplateName: "template1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + }, + }, + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "user2", + AgentName: "agent1", + TemplateName: "template1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7}, + }, + }, + }, + expected: []*agentproto.Stats_Metric{ + // We observed two unique username values, and therefore we have a metric for each. + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.UsernameLabel, Value: "user1"}, + }}, + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.UsernameLabel, Value: "user2"}, + }}, + }, + }, + { + // Scenario: aggregating on a label which is unique across all metrics, plus two invariant labels. + name: "multiple label aggregation, aggregating to multiple metrics", + aggregateOn: []string{agentmetrics.UsernameLabel, agentmetrics.AgentNameLabel, agentmetrics.TemplateNameLabel}, + given: []statCollection{ + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "user1", + AgentName: "agent1", + TemplateName: "template1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + }, + }, + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "user2", + AgentName: "agent1", + TemplateName: "template1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7}, + }, + }, + }, + expected: []*agentproto.Stats_Metric{ + // We observed two unique username values, and therefore we have a metric for each. + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.UsernameLabel, Value: "user1"}, + {Name: agentmetrics.AgentNameLabel, Value: "agent1"}, + {Name: agentmetrics.TemplateNameLabel, Value: "template1"}, + }}, + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.UsernameLabel, Value: "user2"}, + {Name: agentmetrics.AgentNameLabel, Value: "agent1"}, + {Name: agentmetrics.TemplateNameLabel, Value: "template1"}, + }}, + }, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + // given + registry := prometheus.NewRegistry() + metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour, tc.aggregateOn) // time.Hour, so metrics won't expire + require.NoError(t, err) + + ctx, cancelFunc := context.WithCancel(context.Background()) + t.Cleanup(cancelFunc) + + closeFunc := metricsAggregator.Run(ctx) + t.Cleanup(closeFunc) + + // when + for _, sc := range tc.given { + metricsAggregator.Update(ctx, sc.labels, sc.metrics) + } + + // then + require.Eventually(t, func() bool { + var actual []prometheus.Metric + metricsCh := make(chan prometheus.Metric) + + done := make(chan struct{}, 1) + defer close(done) + go func() { + for m := range metricsCh { + actual = append(actual, m) + } + done <- struct{}{} + }() + metricsAggregator.Collect(metricsCh) + close(metricsCh) + <-done + return verifyCollectedMetrics(t, tc.expected, actual) + }, testutil.WaitMedium, testutil.IntervalSlow) + }) + } +} + func Benchmark_MetricsAggregator_Run(b *testing.B) { // Number of metrics to generate and send in each iteration. // Hard-coded to 1024 to avoid overflowing the queue in the metrics aggregator. @@ -298,11 +533,7 @@ func Benchmark_MetricsAggregator_Run(b *testing.B) { // given registry := prometheus.NewRegistry() - metricsAggregator := must(prometheusmetrics.NewMetricsAggregator( - slogtest.Make(b, &slogtest.Options{IgnoreErrors: true}), - registry, - time.Hour, - )) + metricsAggregator := must(prometheusmetrics.NewMetricsAggregator(slogtest.Make(b, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour, nil)) ctx, cancelFunc := context.WithCancel(context.Background()) b.Cleanup(cancelFunc) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index e1928fec5fa15..01ef1cb1e9442 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -10,6 +10,7 @@ import ( "sync/atomic" "time" + "github.com/coder/coder/v2/coderd/agentmetrics" "github.com/coder/coder/v2/codersdk" "github.com/google/uuid" @@ -24,13 +25,6 @@ import ( "github.com/coder/coder/v2/tailnet" ) -const ( - templateNameLabel = "template_name" - agentNameLabel = "agent_name" - usernameLabel = "username" - workspaceNameLabel = "workspace_name" -) - // ActiveUsers tracks the number of users that have authenticated within the past hour. func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) { if duration == 0 { @@ -156,7 +150,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis Subsystem: "agents", Name: "up", Help: "The number of active agents per workspace.", - }, []string{usernameLabel, workspaceNameLabel, templateNameLabel, "template_version"})) + }, []string{agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, agentmetrics.TemplateNameLabel, "template_version"})) err := registerer.Register(agentsGauge) if err != nil { return nil, err @@ -167,7 +161,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis Subsystem: "agents", Name: "connections", Help: "Agent connections with statuses.", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel, "status", "lifecycle_state", "tailnet_node"})) + }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, "status", "lifecycle_state", "tailnet_node"})) err = registerer.Register(agentsConnectionsGauge) if err != nil { return nil, err @@ -178,7 +172,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis Subsystem: "agents", Name: "connection_latencies_seconds", Help: "Agent connection latencies in seconds.", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel, "derp_region", "preferred"})) + }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, "derp_region", "preferred"})) err = registerer.Register(agentsConnectionLatenciesGauge) if err != nil { return nil, err @@ -189,7 +183,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis Subsystem: "agents", Name: "apps", Help: "Agent applications with statuses.", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel, "app_name", "health"})) + }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, "app_name", "health"})) err = registerer.Register(agentsAppsGauge) if err != nil { return nil, err @@ -357,7 +351,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "tx_bytes", Help: "Agent Tx bytes", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) + }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel})) err = registerer.Register(agentStatsTxBytesGauge) if err != nil { return nil, err @@ -368,7 +362,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "rx_bytes", Help: "Agent Rx bytes", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) + }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel})) err = registerer.Register(agentStatsRxBytesGauge) if err != nil { return nil, err @@ -379,7 +373,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "connection_count", Help: "The number of established connections by agent", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) + }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel})) err = registerer.Register(agentStatsConnectionCountGauge) if err != nil { return nil, err @@ -390,7 +384,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "connection_median_latency_seconds", Help: "The median agent connection latency in seconds", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) + }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel})) err = registerer.Register(agentStatsConnectionMedianLatencyGauge) if err != nil { return nil, err @@ -401,7 +395,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "session_count_jetbrains", Help: "The number of session established by JetBrains", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) + }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel})) err = registerer.Register(agentStatsSessionCountJetBrainsGauge) if err != nil { return nil, err @@ -412,7 +406,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "session_count_reconnecting_pty", Help: "The number of session established by reconnecting PTY", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) + }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel})) err = registerer.Register(agentStatsSessionCountReconnectingPTYGauge) if err != nil { return nil, err @@ -423,7 +417,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "session_count_ssh", Help: "The number of session established by SSH", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) + }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel})) err = registerer.Register(agentStatsSessionCountSSHGauge) if err != nil { return nil, err @@ -434,7 +428,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "session_count_vscode", Help: "The number of session established by VSCode", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) + }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel})) err = registerer.Register(agentStatsSessionCountVSCodeGauge) if err != nil { return nil, err diff --git a/codersdk/deployment.go b/codersdk/deployment.go index ef4feeab06372..e49f4219a86dd 100644 --- a/codersdk/deployment.go +++ b/codersdk/deployment.go @@ -11,6 +11,7 @@ import ( "strings" "time" + "github.com/coder/coder/v2/coderd/agentmetrics" "golang.org/x/mod/semver" "golang.org/x/xerrors" @@ -55,6 +56,13 @@ const ( FeatureControlSharedPorts FeatureName = "control_shared_ports" ) +var ( + AcceptedMetricAggregationLabels = []string{ + agentmetrics.TemplateNameLabel, agentmetrics.AgentNameLabel, + agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, + } +) + // FeatureNames must be kept in-sync with the Feature enum above. var FeatureNames = []FeatureName{ FeatureUserLimit, @@ -255,10 +263,11 @@ type DERPConfig struct { } type PrometheusConfig struct { - Enable clibase.Bool `json:"enable" typescript:",notnull"` - Address clibase.HostPort `json:"address" typescript:",notnull"` - CollectAgentStats clibase.Bool `json:"collect_agent_stats" typescript:",notnull"` - CollectDBMetrics clibase.Bool `json:"collect_db_metrics" typescript:",notnull"` + Enable clibase.Bool `json:"enable" typescript:",notnull"` + Address clibase.HostPort `json:"address" typescript:",notnull"` + CollectAgentStats clibase.Bool `json:"collect_agent_stats" typescript:",notnull"` + CollectDBMetrics clibase.Bool `json:"collect_db_metrics" typescript:",notnull"` + AggregateAgentStatsBy clibase.StringArray `json:"aggregate_agent_stats_by" typescript:",notnull"` } type PprofConfig struct { @@ -942,6 +951,34 @@ when required by your organization's security policy.`, Group: &deploymentGroupIntrospectionPrometheus, YAML: "collect_agent_stats", }, + { + Name: "TODO", + Description: "TODO.", + Flag: "prometheus-aggregate-agent-stats-by", + Env: "CODER_PROMETHEUS_AGGREGATE_BY_LABELS", + Value: clibase.Validate(&c.Prometheus.AggregateAgentStatsBy, func(value *clibase.StringArray) error { + if value == nil { + return nil + } + + acceptable := make(map[string]any, len(AcceptedMetricAggregationLabels)) + for _, label := range AcceptedMetricAggregationLabels { + acceptable[label] = nil + } + + for _, label := range value.Value() { + if _, found := acceptable[label]; !found { + return xerrors.Errorf("%q is not a valid aggregation label; only one or more of %q are acceptable", + label, strings.Join(AcceptedMetricAggregationLabels, ", ")) + } + } + + return nil + }), + Group: &deploymentGroupIntrospectionPrometheus, + YAML: "aggregate_agent_stats_by", + Default: strings.Join(AcceptedMetricAggregationLabels, ","), + }, { Name: "Prometheus Collect Database Metrics", Description: "Collect database metrics (may increase charges for metrics storage).", From fcaebb30ed3135e818b270d81ed1ca185c743b5d Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 8 Mar 2024 11:44:21 +0200 Subject: [PATCH 02/19] Refactoring Signed-off-by: Danny Kopping --- coderd/prometheusmetrics/aggregator.go | 69 ++++++++------- coderd/prometheusmetrics/aggregator_test.go | 94 ++++++++++++++++++--- 2 files changed, 122 insertions(+), 41 deletions(-) diff --git a/coderd/prometheusmetrics/aggregator.go b/coderd/prometheusmetrics/aggregator.go index d7798c6dde538..511050820ea60 100644 --- a/coderd/prometheusmetrics/aggregator.go +++ b/coderd/prometheusmetrics/aggregator.go @@ -9,6 +9,7 @@ import ( "github.com/coder/coder/v2/coderd/agentmetrics" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/model" "golang.org/x/xerrors" "cdr.dev/slog" @@ -107,8 +108,9 @@ var _ prometheus.Collector = new(MetricsAggregator) func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) { var ( - baseLabelNames []string = am.aggregateByLabels + baseLabelNames = am.aggregateByLabels baseLabelValues []string + extraLabels = am.Labels ) for _, label := range am.aggregateByLabels { @@ -120,19 +122,17 @@ func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) { baseLabelValues = append(baseLabelValues, val) } - labels := make([]string, 0, len(baseLabelNames)+len(am.Labels)) - labelValues := make([]string, 0, len(baseLabelNames)+len(am.Labels)) + labels := make([]string, 0, len(baseLabelNames)+len(extraLabels)) + labelValues := make([]string, 0, len(baseLabelNames)+len(extraLabels)) labels = append(labels, baseLabelNames...) labelValues = append(labelValues, baseLabelValues...) - for _, l := range am.Labels { + for _, l := range extraLabels { labels = append(labels, l.Name) labelValues = append(labelValues, l.Value) } - //fmt.Printf(">>>>[%s] [%s] %s [%q] [%q]: %v\n", time.Now().Format(time.RFC3339Nano), am.Type, am.Name, labels, labelValues, am.Value) - desc := prometheus.NewDesc(am.Name, metricHelpForAgent, labels, nil) valueType, err := asPrometheusValueType(am.Type) if err != nil { @@ -237,50 +237,56 @@ func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, }, nil } -type MetricAggregator struct { +// labelAggregator is used to control cardinality of collected Prometheus metrics by pre-aggregating series based on given labels. +type labelAggregator struct { aggregations map[string]float64 metrics map[string]annotatedMetric } -func NewMetricAggregator(size int) *MetricAggregator { - return &MetricAggregator{ +func newLabelAggregator(size int) *labelAggregator { + return &labelAggregator{ aggregations: make(map[string]float64, size), metrics: make(map[string]annotatedMetric, size), } } -func (a *MetricAggregator) Aggregate(am annotatedMetric, labels []string) error { - // if we already have an entry for this key, don't clone this am afresh - rather use the existing one - // this will be a bit more memory efficient - // ...do this after unit-test is written - - clone := am.clone() - - fields := make(map[string]string, len(labels)) +func (a *labelAggregator) aggregate(am annotatedMetric, labels []string) error { + // Use a LabelSet because it can give deterministic fingerprints of label combinations regardless of map ordering. + labelSet := make(model.LabelSet, len(labels)) labelValues := make([]string, 0, len(labels)) for _, label := range labels { - val, err := clone.getFieldByLabel(label) + val, err := am.getFieldByLabel(label) if err != nil { return err } - fields[label] = val + labelSet[model.LabelName(label)] = model.LabelValue(val) labelValues = append(labelValues, val) } - key := fmt.Sprintf("%s:%v", clone.Stats_Metric.Name, fields) + // Memoize based on the metric name & the unique combination of labels. + key := fmt.Sprintf("%s:%v", am.Stats_Metric.Name, labelSet.FastFingerprint()) + + // Aggregate the value based on the key. + a.aggregations[key] += am.Value + + metric, found := a.metrics[key] + if !found { + // Take a copy of the given annotatedMetric because it may be manipulated later and contains pointers. + metric = am.clone() + } - clone.aggregateByLabels = labels - a.aggregations[key] += clone.Value + // Store the metric. + metric.aggregateByLabels = labels + metric.Value = a.aggregations[key] - clone.Value = a.aggregations[key] - a.metrics[key] = clone + a.metrics[key] = metric return nil } -func (a *MetricAggregator) asMetrics() (out []annotatedMetric) { +func (a *labelAggregator) toMetrics() (out []annotatedMetric) { for _, am := range a.metrics { out = append(out, am) } @@ -331,24 +337,25 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { // If custom aggregation labels have not been chosen, generate Prometheus metrics without any pre-aggregation. // This results in higher cardinality, but may be desirable in larger deployments. + // Default behaviour. if len(ma.aggregateByLabels) == 0 { for _, m := range ma.store { - // Aggregate by high cardinality labels. - m.aggregateByLabels = agentMetricsLabels + // Aggregate by all available metrics. + m.aggregateByLabels = defaultAgentMetricsLabels input = append(input, m) } } else { // However, if custom aggregations have been chosen, we need to aggregate the values from the annotated // metrics because we cannot register multiple metric series with the same labels. - aggregator := NewMetricAggregator(len(ma.store) * len(ma.aggregateByLabels)) + la := newLabelAggregator(len(ma.store)) for _, m := range ma.store { - if err := aggregator.Aggregate(m, ma.aggregateByLabels); err != nil { + if err := la.aggregate(m, ma.aggregateByLabels); err != nil { ma.log.Error(ctx, "can't aggregate labels", slog.F("labels", strings.Join(ma.aggregateByLabels, ",")), slog.Error(err)) } } - input = aggregator.asMetrics() + input = la.toMetrics() } for _, m := range input { @@ -395,7 +402,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { func (*MetricsAggregator) Describe(_ chan<- *prometheus.Desc) { } -var agentMetricsLabels = []string{agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, agentmetrics.AgentNameLabel, agentmetrics.TemplateNameLabel} +var defaultAgentMetricsLabels = []string{agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, agentmetrics.AgentNameLabel, agentmetrics.TemplateNameLabel} // AgentMetricLabels are the labels used to decorate an agent's metrics. // This list should match the list of labels in agentMetricsLabels. diff --git a/coderd/prometheusmetrics/aggregator_test.go b/coderd/prometheusmetrics/aggregator_test.go index 27a85280d5db8..5619d9546525a 100644 --- a/coderd/prometheusmetrics/aggregator_test.go +++ b/coderd/prometheusmetrics/aggregator_test.go @@ -8,14 +8,13 @@ import ( "testing" "time" + "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/v2/coderd/agentmetrics" "github.com/prometheus/client_golang/prometheus" dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "cdr.dev/slog/sloggers/slogtest" - agentproto "github.com/coder/coder/v2/agent/proto" "github.com/coder/coder/v2/coderd/prometheusmetrics" "github.com/coder/coder/v2/cryptorand" @@ -324,23 +323,19 @@ func TestLabelsAggregation(t *testing.T) { { labels: testLabels, metrics: []*agentproto.Stats_Metric{ - {Name: "no_extra_labels", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, }, }, { labels: testLabels, metrics: []*agentproto.Stats_Metric{ - {Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: []*agentproto.Stats_Metric_Label{ - {Name: "lizz", Value: "rizz"}, - }}, + {Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4}, }, }, }, expected: []*agentproto.Stats_Metric{ - {Name: "no_extra_labels", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels}, - {Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: append([]*agentproto.Stats_Metric_Label{ - {Name: "lizz", Value: "rizz"}, - }, commonLabels...)}, + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels}, + {Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4, Labels: commonLabels}, }, }, { @@ -484,6 +479,85 @@ func TestLabelsAggregation(t *testing.T) { }}, }, }, + { + name: "extra labels are retained, even with label aggregations", + aggregateOn: []string{agentmetrics.UsernameLabel}, + given: []statCollection{ + { + labels: testLabels, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + }, + }, + { + labels: testLabels, + metrics: []*agentproto.Stats_Metric{ + {Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: []*agentproto.Stats_Metric_Label{ + {Name: "lizz", Value: "rizz"}, + }}, + }, + }, + }, + expected: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.UsernameLabel, Value: testUsername}, + }}, + {Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: []*agentproto.Stats_Metric_Label{ + {Name: "lizz", Value: "rizz"}, + {Name: agentmetrics.UsernameLabel, Value: testUsername}, + }}, + }, + }, + { + // Both counters and gauges should have all their values summed to produce the correct output. + name: "counters & gauges behave identically", + aggregateOn: []string{agentmetrics.TemplateNameLabel}, + given: []statCollection{ + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "username1", + TemplateName: "template1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + {Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 3}, + }, + }, + { + labels: prometheusmetrics.AgentMetricLabels{ + Username: "username2", + TemplateName: "template1", + }, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 2}, + {Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 4}, + }, + }, + }, + expected: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 3, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.TemplateNameLabel, Value: "template1"}, + }}, + {Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 7, Labels: []*agentproto.Stats_Metric_Label{ + {Name: agentmetrics.TemplateNameLabel, Value: "template1"}, + }}, + }, + }, + { + // Scenario: validation fails and an invalid label is selected for aggregation. + name: "invalid label aggregation", + aggregateOn: []string{"nonsense"}, + given: []statCollection{ + { + labels: testLabels, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + }, + }, + }, + // Nothing will be returned. + expected: []*agentproto.Stats_Metric{}, + }, } for _, tc := range tests { From 70102f83dc1d53d56b377a47cf345a2f18d3a88a Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 8 Mar 2024 11:44:48 +0200 Subject: [PATCH 03/19] Drive-by change to add delve to nix flake so ./scripts/develop.sh --debug works Signed-off-by: Danny Kopping --- flake.nix | 1 + 1 file changed, 1 insertion(+) diff --git a/flake.nix b/flake.nix index cd82bfd84e35e..f906f3c3d5e68 100644 --- a/flake.nix +++ b/flake.nix @@ -26,6 +26,7 @@ bat cairo curl + delve drpc.defaultPackage.${system} gcc gdk From e3bfa0d78e53621bab89fd3c8f932a0bb6a3da4e Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 8 Mar 2024 12:03:06 +0200 Subject: [PATCH 04/19] Update deployment config Signed-off-by: Danny Kopping --- codersdk/deployment.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/codersdk/deployment.go b/codersdk/deployment.go index e49f4219a86dd..11ce55aeee65e 100644 --- a/codersdk/deployment.go +++ b/codersdk/deployment.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "flag" + "fmt" "net/http" "os" "path/filepath" @@ -952,10 +953,10 @@ when required by your organization's security policy.`, YAML: "collect_agent_stats", }, { - Name: "TODO", - Description: "TODO.", + Name: "Prometheus Aggregate Agent Stats By", + Description: fmt.Sprintf("When collecting agent stats, aggregate metrics by a given set of comma-separated labels to reduce cardinality. Accepted values are %q", AcceptedMetricAggregationLabels), Flag: "prometheus-aggregate-agent-stats-by", - Env: "CODER_PROMETHEUS_AGGREGATE_BY_LABELS", + Env: "CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY", Value: clibase.Validate(&c.Prometheus.AggregateAgentStatsBy, func(value *clibase.StringArray) error { if value == nil { return nil From 92e538ac34864b24dc09307ebb50f08566d33472 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 8 Mar 2024 12:09:38 +0200 Subject: [PATCH 05/19] Linting Signed-off-by: Danny Kopping --- coderd/prometheusmetrics/aggregator.go | 14 +++++++------- codersdk/deployment.go | 5 +++-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/coderd/prometheusmetrics/aggregator.go b/coderd/prometheusmetrics/aggregator.go index 511050820ea60..6687504bf764f 100644 --- a/coderd/prometheusmetrics/aggregator.go +++ b/coderd/prometheusmetrics/aggregator.go @@ -7,11 +7,12 @@ import ( "strings" "time" - "github.com/coder/coder/v2/coderd/agentmetrics" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" "golang.org/x/xerrors" + "github.com/coder/coder/v2/coderd/agentmetrics" + "cdr.dev/slog" agentproto "github.com/coder/coder/v2/agent/proto" @@ -253,7 +254,6 @@ func newLabelAggregator(size int) *labelAggregator { func (a *labelAggregator) aggregate(am annotatedMetric, labels []string) error { // Use a LabelSet because it can give deterministic fingerprints of label combinations regardless of map ordering. labelSet := make(model.LabelSet, len(labels)) - labelValues := make([]string, 0, len(labels)) for _, label := range labels { val, err := am.getFieldByLabel(label) @@ -262,7 +262,6 @@ func (a *labelAggregator) aggregate(am annotatedMetric, labels []string) error { } labelSet[model.LabelName(label)] = model.LabelValue(val) - labelValues = append(labelValues, val) } // Memoize based on the metric name & the unique combination of labels. @@ -286,11 +285,12 @@ func (a *labelAggregator) aggregate(am annotatedMetric, labels []string) error { return nil } -func (a *labelAggregator) toMetrics() (out []annotatedMetric) { +func (a *labelAggregator) listMetrics() []annotatedMetric { + var out []annotatedMetric for _, am := range a.metrics { out = append(out, am) } - return + return out } func (ma *MetricsAggregator) Run(ctx context.Context) func() { @@ -337,7 +337,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { // If custom aggregation labels have not been chosen, generate Prometheus metrics without any pre-aggregation. // This results in higher cardinality, but may be desirable in larger deployments. - // Default behaviour. + // Default behavior. if len(ma.aggregateByLabels) == 0 { for _, m := range ma.store { // Aggregate by all available metrics. @@ -355,7 +355,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { } } - input = la.toMetrics() + input = la.listMetrics() } for _, m := range input { diff --git a/codersdk/deployment.go b/codersdk/deployment.go index 11ce55aeee65e..75c29cb4a1e90 100644 --- a/codersdk/deployment.go +++ b/codersdk/deployment.go @@ -12,10 +12,11 @@ import ( "strings" "time" - "github.com/coder/coder/v2/coderd/agentmetrics" "golang.org/x/mod/semver" "golang.org/x/xerrors" + "github.com/coder/coder/v2/coderd/agentmetrics" + "github.com/coreos/go-oidc/v3/oidc" "github.com/coder/coder/v2/buildinfo" @@ -954,7 +955,7 @@ when required by your organization's security policy.`, }, { Name: "Prometheus Aggregate Agent Stats By", - Description: fmt.Sprintf("When collecting agent stats, aggregate metrics by a given set of comma-separated labels to reduce cardinality. Accepted values are %q", AcceptedMetricAggregationLabels), + Description: fmt.Sprintf("When collecting agent stats, aggregate metrics by a given set of comma-separated labels to reduce cardinality. Accepted values are %q.", AcceptedMetricAggregationLabels), Flag: "prometheus-aggregate-agent-stats-by", Env: "CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY", Value: clibase.Validate(&c.Prometheus.AggregateAgentStatsBy, func(value *clibase.StringArray) error { From fd19896d5f5ed99104e4181b841ad1c71904bb42 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 8 Mar 2024 13:04:15 +0200 Subject: [PATCH 06/19] Control cardinality of coderd metrics as well Signed-off-by: Danny Kopping --- cli/server.go | 2 +- coderd/prometheusmetrics/prometheusmetrics.go | 52 +++++++++++++------ .../prometheusmetrics_test.go | 2 +- 3 files changed, 37 insertions(+), 19 deletions(-) diff --git a/cli/server.go b/cli/server.go index 0134cdf4b7df2..e02a891022e02 100644 --- a/cli/server.go +++ b/cli/server.go @@ -229,7 +229,7 @@ func enablePrometheus( afterCtx(ctx, closeInsightsMetricsCollector) if vals.Prometheus.CollectAgentStats { - closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, time.Now(), 0) + closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, time.Now(), 0, options.DeploymentValues.Prometheus.AggregateAgentStatsBy.Value()) if err != nil { return nil, xerrors.Errorf("register agent stats prometheus metric: %w", err) } diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 01ef1cb1e9442..632dc9600658c 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -329,11 +329,17 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis }, nil } -func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration) (func(), error) { +var DefaultAgentStatsLabels = []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel} + +func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration, aggregateByLabels []string) (func(), error) { if duration == 0 { duration = 1 * time.Minute } + if len(aggregateByLabels) == 0 { + aggregateByLabels = DefaultAgentStatsLabels + } + metricsCollectorAgentStats := prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: "coderd", Subsystem: "prometheusmetrics", @@ -351,7 +357,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "tx_bytes", Help: "Agent Tx bytes", - }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel})) + }, aggregateByLabels)) err = registerer.Register(agentStatsTxBytesGauge) if err != nil { return nil, err @@ -362,7 +368,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "rx_bytes", Help: "Agent Rx bytes", - }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel})) + }, aggregateByLabels)) err = registerer.Register(agentStatsRxBytesGauge) if err != nil { return nil, err @@ -373,7 +379,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "connection_count", Help: "The number of established connections by agent", - }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel})) + }, aggregateByLabels)) err = registerer.Register(agentStatsConnectionCountGauge) if err != nil { return nil, err @@ -384,7 +390,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "connection_median_latency_seconds", Help: "The median agent connection latency in seconds", - }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel})) + }, aggregateByLabels)) err = registerer.Register(agentStatsConnectionMedianLatencyGauge) if err != nil { return nil, err @@ -395,7 +401,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "session_count_jetbrains", Help: "The number of session established by JetBrains", - }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel})) + }, aggregateByLabels)) err = registerer.Register(agentStatsSessionCountJetBrainsGauge) if err != nil { return nil, err @@ -406,7 +412,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "session_count_reconnecting_pty", Help: "The number of session established by reconnecting PTY", - }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel})) + }, aggregateByLabels)) err = registerer.Register(agentStatsSessionCountReconnectingPTYGauge) if err != nil { return nil, err @@ -417,7 +423,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "session_count_ssh", Help: "The number of session established by SSH", - }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel})) + }, aggregateByLabels)) err = registerer.Register(agentStatsSessionCountSSHGauge) if err != nil { return nil, err @@ -428,7 +434,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R Subsystem: "agentstats", Name: "session_count_vscode", Help: "The number of session established by VSCode", - }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel})) + }, aggregateByLabels)) err = registerer.Register(agentStatsSessionCountVSCodeGauge) if err != nil { return nil, err @@ -460,16 +466,28 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R logger.Error(ctx, "can't get agent stats", slog.Error(err)) } else { for _, agentStat := range stats { - agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.RxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) - agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.TxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + var labelValues []string + for _, label := range aggregateByLabels { + switch label { + case agentmetrics.UsernameLabel: + labelValues = append(labelValues, agentStat.Username) + case agentmetrics.WorkspaceNameLabel: + labelValues = append(labelValues, agentStat.WorkspaceName) + case agentmetrics.AgentNameLabel: + labelValues = append(labelValues, agentStat.AgentName) + } + } + + agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.RxBytes), labelValues...) + agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.TxBytes), labelValues...) - agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) - agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), labelValues...) + agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, labelValues...) - agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) - agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) - agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) - agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), labelValues...) + agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), labelValues...) + agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), labelValues...) + agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), labelValues...) } if len(stats) > 0 { diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index 645f179256fa4..c16f4dfba2b76 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -451,7 +451,7 @@ func TestAgentStats(t *testing.T) { // and it doesn't depend on the real time. closeFunc, err := prometheusmetrics.AgentStats(ctx, slogtest.Make(t, &slogtest.Options{ IgnoreErrors: true, - }), registry, db, time.Now().Add(-time.Minute), time.Millisecond) + }), registry, db, time.Now().Add(-time.Minute), time.Millisecond, nil) require.NoError(t, err) t.Cleanup(closeFunc) From a6828433722023461cef9481fcb84a44f1d417b2 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 8 Mar 2024 13:46:46 +0200 Subject: [PATCH 07/19] Remove default value Signed-off-by: Danny Kopping --- codersdk/deployment.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/codersdk/deployment.go b/codersdk/deployment.go index 75c29cb4a1e90..9f4b62eca9b90 100644 --- a/codersdk/deployment.go +++ b/codersdk/deployment.go @@ -977,9 +977,8 @@ when required by your organization's security policy.`, return nil }), - Group: &deploymentGroupIntrospectionPrometheus, - YAML: "aggregate_agent_stats_by", - Default: strings.Join(AcceptedMetricAggregationLabels, ","), + Group: &deploymentGroupIntrospectionPrometheus, + YAML: "aggregate_agent_stats_by", }, { Name: "Prometheus Collect Database Metrics", From 48b68adf88f638c18e48842bafe93803c210825f Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 8 Mar 2024 14:37:34 +0200 Subject: [PATCH 08/19] make fmt && make gen Signed-off-by: Danny Kopping --- coderd/apidoc/docs.go | 6 ++++++ coderd/apidoc/swagger.json | 6 ++++++ codersdk/deployment.go | 12 +++++------- docs/api/general.md | 1 + docs/api/schemas.md | 16 ++++++++++------ docs/cli/server.md | 10 ++++++++++ site/src/api/typesGenerated.ts | 1 + 7 files changed, 39 insertions(+), 13 deletions(-) diff --git a/coderd/apidoc/docs.go b/coderd/apidoc/docs.go index 79e24f32d0af4..74ad5f3214a17 100644 --- a/coderd/apidoc/docs.go +++ b/coderd/apidoc/docs.go @@ -10952,6 +10952,12 @@ const docTemplate = `{ "address": { "$ref": "#/definitions/clibase.HostPort" }, + "aggregate_agent_stats_by": { + "type": "array", + "items": { + "type": "string" + } + }, "collect_agent_stats": { "type": "boolean" }, diff --git a/coderd/apidoc/swagger.json b/coderd/apidoc/swagger.json index 20934d817be49..ef3670825586c 100644 --- a/coderd/apidoc/swagger.json +++ b/coderd/apidoc/swagger.json @@ -9860,6 +9860,12 @@ "address": { "$ref": "#/definitions/clibase.HostPort" }, + "aggregate_agent_stats_by": { + "type": "array", + "items": { + "type": "string" + } + }, "collect_agent_stats": { "type": "boolean" }, diff --git a/codersdk/deployment.go b/codersdk/deployment.go index 9f4b62eca9b90..312ef9cc68979 100644 --- a/codersdk/deployment.go +++ b/codersdk/deployment.go @@ -58,12 +58,10 @@ const ( FeatureControlSharedPorts FeatureName = "control_shared_ports" ) -var ( - AcceptedMetricAggregationLabels = []string{ - agentmetrics.TemplateNameLabel, agentmetrics.AgentNameLabel, - agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, - } -) +var AcceptedMetricAggregationLabels = []string{ + agentmetrics.TemplateNameLabel, agentmetrics.AgentNameLabel, + agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, +} // FeatureNames must be kept in-sync with the Feature enum above. var FeatureNames = []FeatureName{ @@ -955,7 +953,7 @@ when required by your organization's security policy.`, }, { Name: "Prometheus Aggregate Agent Stats By", - Description: fmt.Sprintf("When collecting agent stats, aggregate metrics by a given set of comma-separated labels to reduce cardinality. Accepted values are %q.", AcceptedMetricAggregationLabels), + Description: fmt.Sprintf("When collecting agent stats, aggregate metrics by a given set of comma-separated labels to reduce cardinality. Accepted values are %s.", strings.Join(AcceptedMetricAggregationLabels, ", ")), Flag: "prometheus-aggregate-agent-stats-by", Env: "CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY", Value: clibase.Validate(&c.Prometheus.AggregateAgentStatsBy, func(value *clibase.StringArray) error { diff --git a/docs/api/general.md b/docs/api/general.md index 764972919761d..b21adb8acf6a4 100644 --- a/docs/api/general.md +++ b/docs/api/general.md @@ -317,6 +317,7 @@ curl -X GET http://coder-server:8080/api/v2/deployment/config \ "host": "string", "port": "string" }, + "aggregate_agent_stats_by": ["string"], "collect_agent_stats": true, "collect_db_metrics": true, "enable": true diff --git a/docs/api/schemas.md b/docs/api/schemas.md index 8dbc25d3713b4..5f295cf774f08 100644 --- a/docs/api/schemas.md +++ b/docs/api/schemas.md @@ -2786,6 +2786,7 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in "host": "string", "port": "string" }, + "aggregate_agent_stats_by": ["string"], "collect_agent_stats": true, "collect_db_metrics": true, "enable": true @@ -3154,6 +3155,7 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in "host": "string", "port": "string" }, + "aggregate_agent_stats_by": ["string"], "collect_agent_stats": true, "collect_db_metrics": true, "enable": true @@ -4783,6 +4785,7 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in "host": "string", "port": "string" }, + "aggregate_agent_stats_by": ["string"], "collect_agent_stats": true, "collect_db_metrics": true, "enable": true @@ -4791,12 +4794,13 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in ### Properties -| Name | Type | Required | Restrictions | Description | -| --------------------- | ------------------------------------ | -------- | ------------ | ----------- | -| `address` | [clibase.HostPort](#clibasehostport) | false | | | -| `collect_agent_stats` | boolean | false | | | -| `collect_db_metrics` | boolean | false | | | -| `enable` | boolean | false | | | +| Name | Type | Required | Restrictions | Description | +| -------------------------- | ------------------------------------ | -------- | ------------ | ----------- | +| `address` | [clibase.HostPort](#clibasehostport) | false | | | +| `aggregate_agent_stats_by` | array of string | false | | | +| `collect_agent_stats` | boolean | false | | | +| `collect_db_metrics` | boolean | false | | | +| `enable` | boolean | false | | | ## codersdk.ProvisionerConfig diff --git a/docs/cli/server.md b/docs/cli/server.md index 5a32845378991..27e6996558aaa 100644 --- a/docs/cli/server.md +++ b/docs/cli/server.md @@ -742,6 +742,16 @@ URL of a PostgreSQL database. If empty, PostgreSQL binaries will be downloaded f The bind address to serve prometheus metrics. +### --prometheus-aggregate-agent-stats-by + +| | | +| ----------- | -------------------------------------------------------------- | +| Type | string-array | +| Environment | $CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY | +| YAML | introspection.prometheus.aggregate_agent_stats_by | + +When collecting agent stats, aggregate metrics by a given set of comma-separated labels to reduce cardinality. Accepted values are template_name, agent_name, username, workspace_name. + ### --prometheus-collect-agent-stats | | | diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index d5ce9dd5ece81..f129ea54a573d 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -925,6 +925,7 @@ export interface PrometheusConfig { readonly address: string; readonly collect_agent_stats: boolean; readonly collect_db_metrics: boolean; + readonly aggregate_agent_stats_by: string[]; } // From codersdk/deployment.go From 661f5c24d856e85ee11cb7bd8c853a06918ec4dc Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 8 Mar 2024 14:51:12 +0200 Subject: [PATCH 09/19] Appeasing CI Signed-off-by: Danny Kopping --- cli/testdata/server-config.yaml.golden | 5 +++++ coderd/prometheusmetrics/aggregator_test.go | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/cli/testdata/server-config.yaml.golden b/cli/testdata/server-config.yaml.golden index 8996387ff4bca..90b1991aa6429 100644 --- a/cli/testdata/server-config.yaml.golden +++ b/cli/testdata/server-config.yaml.golden @@ -191,6 +191,11 @@ introspection: # Collect database metrics (may increase charges for metrics storage). # (default: false, type: bool) collect_db_metrics: false + # When collecting agent stats, aggregate metrics by a given set of comma-separated + # labels to reduce cardinality. Accepted values are template_name, agent_name, + # username, workspace_name. + # (default: , type: string-array) + aggregate_agent_stats_by: [] pprof: # Serve pprof metrics on the address defined by pprof address. # (default: , type: bool) diff --git a/coderd/prometheusmetrics/aggregator_test.go b/coderd/prometheusmetrics/aggregator_test.go index 5619d9546525a..ec91fe33f7221 100644 --- a/coderd/prometheusmetrics/aggregator_test.go +++ b/coderd/prometheusmetrics/aggregator_test.go @@ -297,8 +297,6 @@ func TestUpdateMetrics_MetricsExpire(t *testing.T) { } func TestLabelsAggregation(t *testing.T) { - t.Parallel() - type statCollection struct { labels prometheusmetrics.AgentMetricLabels metrics []*agentproto.Stats_Metric @@ -561,6 +559,8 @@ func TestLabelsAggregation(t *testing.T) { } for _, tc := range tests { + t.Parallel() + t.Run(tc.name, func(t *testing.T) { // given registry := prometheus.NewRegistry() From 7d3ded4fd2e5d6f97333895bb42ba941a7dfd8cb Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 8 Mar 2024 15:01:43 +0200 Subject: [PATCH 10/19] Appeasing CI, again... Signed-off-by: Danny Kopping --- cli/testdata/server-config.yaml.golden | 6 +++--- coderd/prometheusmetrics/aggregator_test.go | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/testdata/server-config.yaml.golden b/cli/testdata/server-config.yaml.golden index 90b1991aa6429..90db9e018b4e8 100644 --- a/cli/testdata/server-config.yaml.golden +++ b/cli/testdata/server-config.yaml.golden @@ -188,14 +188,14 @@ introspection: # Collect agent stats (may increase charges for metrics storage). # (default: , type: bool) collect_agent_stats: false - # Collect database metrics (may increase charges for metrics storage). - # (default: false, type: bool) - collect_db_metrics: false # When collecting agent stats, aggregate metrics by a given set of comma-separated # labels to reduce cardinality. Accepted values are template_name, agent_name, # username, workspace_name. # (default: , type: string-array) aggregate_agent_stats_by: [] + # Collect database metrics (may increase charges for metrics storage). + # (default: false, type: bool) + collect_db_metrics: false pprof: # Serve pprof metrics on the address defined by pprof address. # (default: , type: bool) diff --git a/coderd/prometheusmetrics/aggregator_test.go b/coderd/prometheusmetrics/aggregator_test.go index ec91fe33f7221..5619d9546525a 100644 --- a/coderd/prometheusmetrics/aggregator_test.go +++ b/coderd/prometheusmetrics/aggregator_test.go @@ -297,6 +297,8 @@ func TestUpdateMetrics_MetricsExpire(t *testing.T) { } func TestLabelsAggregation(t *testing.T) { + t.Parallel() + type statCollection struct { labels prometheusmetrics.AgentMetricLabels metrics []*agentproto.Stats_Metric @@ -559,8 +561,6 @@ func TestLabelsAggregation(t *testing.T) { } for _, tc := range tests { - t.Parallel() - t.Run(tc.name, func(t *testing.T) { // given registry := prometheus.NewRegistry() From defcf749b4a0c60cd0bc61ad7e93793923187ac3 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 8 Mar 2024 15:14:03 +0200 Subject: [PATCH 11/19] ... Signed-off-by: Danny Kopping --- cli/testdata/coder_server_--help.golden | 7 ++++++- coderd/prometheusmetrics/aggregator_test.go | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/cli/testdata/coder_server_--help.golden b/cli/testdata/coder_server_--help.golden index 632a96a470354..ce551dc888763 100644 --- a/cli/testdata/coder_server_--help.golden +++ b/cli/testdata/coder_server_--help.golden @@ -119,10 +119,15 @@ INTROSPECTION / LOGGING OPTIONS: --log-stackdriver string, $CODER_LOGGING_STACKDRIVER Output Stackdriver compatible logs to a given file. -INTROSPECTION / PROMETHEUS OPTIONS: +INTROSPECTION / PROMETHEUS OPTIONS: --prometheus-address host:port, $CODER_PROMETHEUS_ADDRESS (default: 127.0.0.1:2112) The bind address to serve prometheus metrics. + --prometheus-aggregate-agent-stats-by string-array, $CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY + When collecting agent stats, aggregate metrics by a given set of + comma-separated labels to reduce cardinality. Accepted values are + template_name, agent_name, username, workspace_name. + --prometheus-collect-agent-stats bool, $CODER_PROMETHEUS_COLLECT_AGENT_STATS Collect agent stats (may increase charges for metrics storage). diff --git a/coderd/prometheusmetrics/aggregator_test.go b/coderd/prometheusmetrics/aggregator_test.go index 5619d9546525a..55bc66eb651e1 100644 --- a/coderd/prometheusmetrics/aggregator_test.go +++ b/coderd/prometheusmetrics/aggregator_test.go @@ -562,6 +562,8 @@ func TestLabelsAggregation(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { + t.Parallel() + // given registry := prometheus.NewRegistry() metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour, tc.aggregateOn) // time.Hour, so metrics won't expire From 97f2bd1d72a53bec076149dea37211aed04515e9 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 11 Mar 2024 09:54:33 +0200 Subject: [PATCH 12/19] Review comments & CI appeasement Signed-off-by: Danny Kopping --- cli/testdata/coder_server_--help.golden | 6 +- cli/testdata/server-config.yaml.golden | 10 +- coderd/agentmetrics/labels.go | 13 +- coderd/prometheusmetrics/aggregator.go | 19 +-- coderd/prometheusmetrics/aggregator_test.go | 125 ++++++++++-------- coderd/prometheusmetrics/prometheusmetrics.go | 18 ++- codersdk/deployment.go | 18 +-- .../cli/testdata/coder_server_--help.golden | 5 + 8 files changed, 121 insertions(+), 93 deletions(-) diff --git a/cli/testdata/coder_server_--help.golden b/cli/testdata/coder_server_--help.golden index ce551dc888763..3c3c0f4031194 100644 --- a/cli/testdata/coder_server_--help.golden +++ b/cli/testdata/coder_server_--help.golden @@ -119,14 +119,14 @@ INTROSPECTION / LOGGING OPTIONS: --log-stackdriver string, $CODER_LOGGING_STACKDRIVER Output Stackdriver compatible logs to a given file. -INTROSPECTION / PROMETHEUS OPTIONS: +INTROSPECTION / PROMETHEUS OPTIONS: --prometheus-address host:port, $CODER_PROMETHEUS_ADDRESS (default: 127.0.0.1:2112) The bind address to serve prometheus metrics. - --prometheus-aggregate-agent-stats-by string-array, $CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY + --prometheus-aggregate-agent-stats-by string-array, $CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY (default: agent_name,template_name,username,workspace_name) When collecting agent stats, aggregate metrics by a given set of comma-separated labels to reduce cardinality. Accepted values are - template_name, agent_name, username, workspace_name. + agent_name, template_name, username, workspace_name. --prometheus-collect-agent-stats bool, $CODER_PROMETHEUS_COLLECT_AGENT_STATS Collect agent stats (may increase charges for metrics storage). diff --git a/cli/testdata/server-config.yaml.golden b/cli/testdata/server-config.yaml.golden index 90db9e018b4e8..14ad5df67f08b 100644 --- a/cli/testdata/server-config.yaml.golden +++ b/cli/testdata/server-config.yaml.golden @@ -189,10 +189,14 @@ introspection: # (default: , type: bool) collect_agent_stats: false # When collecting agent stats, aggregate metrics by a given set of comma-separated - # labels to reduce cardinality. Accepted values are template_name, agent_name, + # labels to reduce cardinality. Accepted values are agent_name, template_name, # username, workspace_name. - # (default: , type: string-array) - aggregate_agent_stats_by: [] + # (default: agent_name,template_name,username,workspace_name, type: string-array) + aggregate_agent_stats_by: + - agent_name + - template_name + - username + - workspace_name # Collect database metrics (may increase charges for metrics storage). # (default: false, type: bool) collect_db_metrics: false diff --git a/coderd/agentmetrics/labels.go b/coderd/agentmetrics/labels.go index 90ebcad6f963e..b8259e4a14d54 100644 --- a/coderd/agentmetrics/labels.go +++ b/coderd/agentmetrics/labels.go @@ -1,8 +1,13 @@ package agentmetrics const ( - TemplateNameLabel = "template_name" - AgentNameLabel = "agent_name" - UsernameLabel = "username" - WorkspaceNameLabel = "workspace_name" + LabelAgentName = "agent_name" + LabelUsername = "username" + LabelTemplateName = "template_name" + LabelWorkspaceName = "workspace_name" +) + +var ( + LabelAll = []string{LabelAgentName, LabelTemplateName, LabelUsername, LabelWorkspaceName} + LabelAgentStats = []string{LabelAgentName, LabelUsername, LabelWorkspaceName} ) diff --git a/coderd/prometheusmetrics/aggregator.go b/coderd/prometheusmetrics/aggregator.go index 6687504bf764f..6f22d429a36f4 100644 --- a/coderd/prometheusmetrics/aggregator.go +++ b/coderd/prometheusmetrics/aggregator.go @@ -114,7 +114,7 @@ func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) { extraLabels = am.Labels ) - for _, label := range am.aggregateByLabels { + for _, label := range baseLabelNames { val, err := am.getFieldByLabel(label) if err != nil { return nil, err @@ -147,13 +147,13 @@ func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) { func (am *annotatedMetric) getFieldByLabel(label string) (string, error) { var labelVal string switch label { - case agentmetrics.WorkspaceNameLabel: + case agentmetrics.LabelWorkspaceName: labelVal = am.workspaceName - case agentmetrics.TemplateNameLabel: + case agentmetrics.LabelTemplateName: labelVal = am.templateName - case agentmetrics.AgentNameLabel: + case agentmetrics.LabelAgentName: labelVal = am.agentName - case agentmetrics.UsernameLabel: + case agentmetrics.LabelUsername: labelVal = am.username default: return "", xerrors.Errorf("unexpected label: %q", label) @@ -162,7 +162,7 @@ func (am *annotatedMetric) getFieldByLabel(label string) (string, error) { return labelVal, nil } -func (am *annotatedMetric) clone() annotatedMetric { +func (am *annotatedMetric) shallowCopy() annotatedMetric { stats := &agentproto.Stats_Metric{ Name: am.Name, Type: am.Type, @@ -273,7 +273,7 @@ func (a *labelAggregator) aggregate(am annotatedMetric, labels []string) error { metric, found := a.metrics[key] if !found { // Take a copy of the given annotatedMetric because it may be manipulated later and contains pointers. - metric = am.clone() + metric = am.shallowCopy() } // Store the metric. @@ -337,8 +337,9 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { // If custom aggregation labels have not been chosen, generate Prometheus metrics without any pre-aggregation. // This results in higher cardinality, but may be desirable in larger deployments. + // // Default behavior. - if len(ma.aggregateByLabels) == 0 { + if len(ma.aggregateByLabels) == len(agentmetrics.LabelAll) { for _, m := range ma.store { // Aggregate by all available metrics. m.aggregateByLabels = defaultAgentMetricsLabels @@ -402,7 +403,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { func (*MetricsAggregator) Describe(_ chan<- *prometheus.Desc) { } -var defaultAgentMetricsLabels = []string{agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, agentmetrics.AgentNameLabel, agentmetrics.TemplateNameLabel} +var defaultAgentMetricsLabels = []string{agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, agentmetrics.LabelAgentName, agentmetrics.LabelTemplateName} // AgentMetricLabels are the labels used to decorate an agent's metrics. // This list should match the list of labels in agentMetricsLabels. diff --git a/coderd/prometheusmetrics/aggregator_test.go b/coderd/prometheusmetrics/aggregator_test.go index 55bc66eb651e1..cb6ad427eb029 100644 --- a/coderd/prometheusmetrics/aggregator_test.go +++ b/coderd/prometheusmetrics/aggregator_test.go @@ -2,19 +2,21 @@ package prometheusmetrics_test import ( "context" + "fmt" "sort" "strings" "sync/atomic" "testing" "time" - "cdr.dev/slog/sloggers/slogtest" - "github.com/coder/coder/v2/coderd/agentmetrics" "github.com/prometheus/client_golang/prometheus" dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/v2/coderd/agentmetrics" + agentproto "github.com/coder/coder/v2/agent/proto" "github.com/coder/coder/v2/coderd/prometheusmetrics" "github.com/coder/coder/v2/cryptorand" @@ -93,54 +95,54 @@ func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) { } commonLabels := []*agentproto.Stats_Metric_Label{ - {Name: agentmetrics.AgentNameLabel, Value: testAgentName}, - {Name: agentmetrics.UsernameLabel, Value: testUsername}, - {Name: agentmetrics.WorkspaceNameLabel, Value: testWorkspaceName}, - {Name: agentmetrics.TemplateNameLabel, Value: testTemplateName}, + {Name: agentmetrics.LabelAgentName, Value: testAgentName}, + {Name: agentmetrics.LabelUsername, Value: testUsername}, + {Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName}, + {Name: agentmetrics.LabelTemplateName, Value: testTemplateName}, } expected := []*agentproto.Stats_Metric{ {Name: "a_counter_one", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels}, {Name: "b_counter_two", Type: agentproto.Stats_Metric_COUNTER, Value: -9, Labels: []*agentproto.Stats_Metric_Label{ - {Name: agentmetrics.AgentNameLabel, Value: testAgentName}, + {Name: agentmetrics.LabelAgentName, Value: testAgentName}, {Name: "lizz", Value: "rizz"}, - {Name: agentmetrics.UsernameLabel, Value: testUsername}, - {Name: agentmetrics.WorkspaceNameLabel, Value: testWorkspaceName}, - {Name: agentmetrics.TemplateNameLabel, Value: testTemplateName}, + {Name: agentmetrics.LabelUsername, Value: testUsername}, + {Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName}, + {Name: agentmetrics.LabelTemplateName, Value: testTemplateName}, }}, {Name: "b_counter_two", Type: agentproto.Stats_Metric_COUNTER, Value: 4, Labels: commonLabels}, {Name: "c_gauge_three", Type: agentproto.Stats_Metric_GAUGE, Value: 2, Labels: []*agentproto.Stats_Metric_Label{ - {Name: agentmetrics.AgentNameLabel, Value: testAgentName}, + {Name: agentmetrics.LabelAgentName, Value: testAgentName}, {Name: "foobar", Value: "Foobaz"}, {Name: "hello", Value: "world"}, - {Name: agentmetrics.UsernameLabel, Value: testUsername}, - {Name: agentmetrics.WorkspaceNameLabel, Value: testWorkspaceName}, - {Name: agentmetrics.TemplateNameLabel, Value: testTemplateName}, + {Name: agentmetrics.LabelUsername, Value: testUsername}, + {Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName}, + {Name: agentmetrics.LabelTemplateName, Value: testTemplateName}, }}, {Name: "c_gauge_three", Type: agentproto.Stats_Metric_GAUGE, Value: 5, Labels: commonLabels}, {Name: "d_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 6, Labels: commonLabels}, {Name: "e_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 17, Labels: []*agentproto.Stats_Metric_Label{ - {Name: agentmetrics.AgentNameLabel, Value: testAgentName}, + {Name: agentmetrics.LabelAgentName, Value: testAgentName}, {Name: "cat", Value: "do,=g"}, {Name: "hello", Value: "wo,,rld"}, - {Name: agentmetrics.UsernameLabel, Value: testUsername}, - {Name: agentmetrics.WorkspaceNameLabel, Value: testWorkspaceName}, - {Name: agentmetrics.TemplateNameLabel, Value: testTemplateName}, + {Name: agentmetrics.LabelUsername, Value: testUsername}, + {Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName}, + {Name: agentmetrics.LabelTemplateName, Value: testTemplateName}, }}, {Name: "e_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 15, Labels: []*agentproto.Stats_Metric_Label{ - {Name: agentmetrics.AgentNameLabel, Value: testAgentName}, + {Name: agentmetrics.LabelAgentName, Value: testAgentName}, {Name: "foobar", Value: "Foo,ba=z"}, {Name: "halo", Value: "wor\\,d=1,e=\\,2"}, {Name: "hello", Value: "wo,,r=d"}, - {Name: agentmetrics.UsernameLabel, Value: testUsername}, - {Name: agentmetrics.WorkspaceNameLabel, Value: testWorkspaceName}, - {Name: agentmetrics.TemplateNameLabel, Value: testTemplateName}, + {Name: agentmetrics.LabelUsername, Value: testUsername}, + {Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName}, + {Name: agentmetrics.LabelTemplateName, Value: testTemplateName}, }}, {Name: "f_gauge_four", Type: agentproto.Stats_Metric_GAUGE, Value: 8, Labels: []*agentproto.Stats_Metric_Label{ - {Name: agentmetrics.AgentNameLabel, Value: testAgentName}, + {Name: agentmetrics.LabelAgentName, Value: testAgentName}, {Name: "foobar", Value: "foobaz"}, - {Name: agentmetrics.UsernameLabel, Value: testUsername}, - {Name: agentmetrics.WorkspaceNameLabel, Value: testWorkspaceName}, - {Name: agentmetrics.TemplateNameLabel, Value: testTemplateName}, + {Name: agentmetrics.LabelUsername, Value: testUsername}, + {Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName}, + {Name: agentmetrics.LabelTemplateName, Value: testTemplateName}, }}, } @@ -305,10 +307,10 @@ func TestLabelsAggregation(t *testing.T) { } commonLabels := []*agentproto.Stats_Metric_Label{ - {Name: agentmetrics.UsernameLabel, Value: testUsername}, - {Name: agentmetrics.AgentNameLabel, Value: testAgentName}, - {Name: agentmetrics.WorkspaceNameLabel, Value: testWorkspaceName}, - {Name: agentmetrics.TemplateNameLabel, Value: testTemplateName}, + {Name: agentmetrics.LabelUsername, Value: testUsername}, + {Name: agentmetrics.LabelAgentName, Value: testAgentName}, + {Name: agentmetrics.LabelWorkspaceName, Value: testWorkspaceName}, + {Name: agentmetrics.LabelTemplateName, Value: testTemplateName}, } tests := []struct { @@ -318,7 +320,8 @@ func TestLabelsAggregation(t *testing.T) { aggregateOn []string }{ { - name: "label aggregations not specified, keep all (high cardinality, default behaviour)", + name: "label aggregations not specified, keep all (high cardinality, default behavior)", + aggregateOn: agentmetrics.LabelAll, given: []statCollection{ { labels: testLabels, @@ -341,7 +344,7 @@ func TestLabelsAggregation(t *testing.T) { { // Scenario: 2 users are using the same agent and we've configured the deployment to aggregate on the "agent_name" label. name: "single label aggregation, aggregating to single metric", - aggregateOn: []string{agentmetrics.AgentNameLabel}, + aggregateOn: []string{agentmetrics.LabelAgentName}, given: []statCollection{ { labels: prometheusmetrics.AgentMetricLabels{ @@ -365,14 +368,14 @@ func TestLabelsAggregation(t *testing.T) { expected: []*agentproto.Stats_Metric{ // We only observed one agent_name value, so all metrics are aggregated to a single series. {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 8, Labels: []*agentproto.Stats_Metric_Label{ - {Name: agentmetrics.AgentNameLabel, Value: "agent1"}, + {Name: agentmetrics.LabelAgentName, Value: "agent1"}, }}, }, }, { // Scenario: as above, but we're aggregating on two invariant labels. name: "multiple label aggregation, aggregating to single metric", - aggregateOn: []string{agentmetrics.AgentNameLabel, agentmetrics.TemplateNameLabel}, + aggregateOn: []string{agentmetrics.LabelAgentName, agentmetrics.LabelTemplateName}, given: []statCollection{ { labels: prometheusmetrics.AgentMetricLabels{ @@ -398,15 +401,15 @@ func TestLabelsAggregation(t *testing.T) { expected: []*agentproto.Stats_Metric{ // We only observed one agent_name & template_name tuple, so all metrics are aggregated to a single series. {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 8, Labels: []*agentproto.Stats_Metric_Label{ - {Name: agentmetrics.AgentNameLabel, Value: "agent1"}, - {Name: agentmetrics.TemplateNameLabel, Value: "template1"}, + {Name: agentmetrics.LabelAgentName, Value: "agent1"}, + {Name: agentmetrics.LabelTemplateName, Value: "template1"}, }}, }, }, { // Scenario: aggregating on a label which is unique across all metrics. name: "single label aggregation, aggregating to multiple metrics", - aggregateOn: []string{agentmetrics.UsernameLabel}, + aggregateOn: []string{agentmetrics.LabelUsername}, given: []statCollection{ { labels: prometheusmetrics.AgentMetricLabels{ @@ -432,17 +435,17 @@ func TestLabelsAggregation(t *testing.T) { expected: []*agentproto.Stats_Metric{ // We observed two unique username values, and therefore we have a metric for each. {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: []*agentproto.Stats_Metric_Label{ - {Name: agentmetrics.UsernameLabel, Value: "user1"}, + {Name: agentmetrics.LabelUsername, Value: "user1"}, }}, {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7, Labels: []*agentproto.Stats_Metric_Label{ - {Name: agentmetrics.UsernameLabel, Value: "user2"}, + {Name: agentmetrics.LabelUsername, Value: "user2"}, }}, }, }, { // Scenario: aggregating on a label which is unique across all metrics, plus two invariant labels. name: "multiple label aggregation, aggregating to multiple metrics", - aggregateOn: []string{agentmetrics.UsernameLabel, agentmetrics.AgentNameLabel, agentmetrics.TemplateNameLabel}, + aggregateOn: []string{agentmetrics.LabelUsername, agentmetrics.LabelAgentName, agentmetrics.LabelTemplateName}, given: []statCollection{ { labels: prometheusmetrics.AgentMetricLabels{ @@ -468,20 +471,20 @@ func TestLabelsAggregation(t *testing.T) { expected: []*agentproto.Stats_Metric{ // We observed two unique username values, and therefore we have a metric for each. {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: []*agentproto.Stats_Metric_Label{ - {Name: agentmetrics.UsernameLabel, Value: "user1"}, - {Name: agentmetrics.AgentNameLabel, Value: "agent1"}, - {Name: agentmetrics.TemplateNameLabel, Value: "template1"}, + {Name: agentmetrics.LabelUsername, Value: "user1"}, + {Name: agentmetrics.LabelAgentName, Value: "agent1"}, + {Name: agentmetrics.LabelTemplateName, Value: "template1"}, }}, {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 7, Labels: []*agentproto.Stats_Metric_Label{ - {Name: agentmetrics.UsernameLabel, Value: "user2"}, - {Name: agentmetrics.AgentNameLabel, Value: "agent1"}, - {Name: agentmetrics.TemplateNameLabel, Value: "template1"}, + {Name: agentmetrics.LabelUsername, Value: "user2"}, + {Name: agentmetrics.LabelAgentName, Value: "agent1"}, + {Name: agentmetrics.LabelTemplateName, Value: "template1"}, }}, }, }, { name: "extra labels are retained, even with label aggregations", - aggregateOn: []string{agentmetrics.UsernameLabel}, + aggregateOn: []string{agentmetrics.LabelUsername}, given: []statCollection{ { labels: testLabels, @@ -500,18 +503,18 @@ func TestLabelsAggregation(t *testing.T) { }, expected: []*agentproto.Stats_Metric{ {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: []*agentproto.Stats_Metric_Label{ - {Name: agentmetrics.UsernameLabel, Value: testUsername}, + {Name: agentmetrics.LabelUsername, Value: testUsername}, }}, {Name: "extra_label", Type: agentproto.Stats_Metric_COUNTER, Value: 27, Labels: []*agentproto.Stats_Metric_Label{ {Name: "lizz", Value: "rizz"}, - {Name: agentmetrics.UsernameLabel, Value: testUsername}, + {Name: agentmetrics.LabelUsername, Value: testUsername}, }}, }, }, { // Both counters and gauges should have all their values summed to produce the correct output. name: "counters & gauges behave identically", - aggregateOn: []string{agentmetrics.TemplateNameLabel}, + aggregateOn: []string{agentmetrics.LabelTemplateName}, given: []statCollection{ { labels: prometheusmetrics.AgentMetricLabels{ @@ -536,10 +539,10 @@ func TestLabelsAggregation(t *testing.T) { }, expected: []*agentproto.Stats_Metric{ {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 3, Labels: []*agentproto.Stats_Metric_Label{ - {Name: agentmetrics.TemplateNameLabel, Value: "template1"}, + {Name: agentmetrics.LabelTemplateName, Value: "template1"}, }}, {Name: "active_conns", Type: agentproto.Stats_Metric_GAUGE, Value: 7, Labels: []*agentproto.Stats_Metric_Label{ - {Name: agentmetrics.TemplateNameLabel, Value: "template1"}, + {Name: agentmetrics.LabelTemplateName, Value: "template1"}, }}, }, }, @@ -561,6 +564,8 @@ func TestLabelsAggregation(t *testing.T) { } for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { t.Parallel() @@ -603,13 +608,27 @@ func TestLabelsAggregation(t *testing.T) { } func Benchmark_MetricsAggregator_Run(b *testing.B) { + benchmarkRunner(b, agentmetrics.LabelAll) +} + +func Benchmark_MetricsAggregator_RunWithAggregations(b *testing.B) { + for i := 1; i <= len(agentmetrics.LabelAll); i++ { + b.Run(fmt.Sprintf("%d labels", i), func(b *testing.B) { + benchmarkRunner(b, agentmetrics.LabelAll[0:i]) + }) + } +} + +func benchmarkRunner(b *testing.B, aggregateByLabels []string) { + b.ReportAllocs() + // Number of metrics to generate and send in each iteration. // Hard-coded to 1024 to avoid overflowing the queue in the metrics aggregator. numMetrics := 1024 // given registry := prometheus.NewRegistry() - metricsAggregator := must(prometheusmetrics.NewMetricsAggregator(slogtest.Make(b, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour, nil)) + metricsAggregator := must(prometheusmetrics.NewMetricsAggregator(slogtest.Make(b, &slogtest.Options{IgnoreErrors: true}), registry, time.Hour, aggregateByLabels)) ctx, cancelFunc := context.WithCancel(context.Background()) b.Cleanup(cancelFunc) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 632dc9600658c..08b650dd87b28 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -150,7 +150,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis Subsystem: "agents", Name: "up", Help: "The number of active agents per workspace.", - }, []string{agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, agentmetrics.TemplateNameLabel, "template_version"})) + }, []string{agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, agentmetrics.LabelTemplateName, "template_version"})) err := registerer.Register(agentsGauge) if err != nil { return nil, err @@ -161,7 +161,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis Subsystem: "agents", Name: "connections", Help: "Agent connections with statuses.", - }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, "status", "lifecycle_state", "tailnet_node"})) + }, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "status", "lifecycle_state", "tailnet_node"})) err = registerer.Register(agentsConnectionsGauge) if err != nil { return nil, err @@ -172,7 +172,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis Subsystem: "agents", Name: "connection_latencies_seconds", Help: "Agent connection latencies in seconds.", - }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, "derp_region", "preferred"})) + }, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "derp_region", "preferred"})) err = registerer.Register(agentsConnectionLatenciesGauge) if err != nil { return nil, err @@ -183,7 +183,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis Subsystem: "agents", Name: "apps", Help: "Agent applications with statuses.", - }, []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, "app_name", "health"})) + }, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "app_name", "health"})) err = registerer.Register(agentsAppsGauge) if err != nil { return nil, err @@ -329,15 +329,13 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis }, nil } -var DefaultAgentStatsLabels = []string{agentmetrics.AgentNameLabel, agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel} - func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration, aggregateByLabels []string) (func(), error) { if duration == 0 { duration = 1 * time.Minute } if len(aggregateByLabels) == 0 { - aggregateByLabels = DefaultAgentStatsLabels + aggregateByLabels = agentmetrics.LabelAgentStats } metricsCollectorAgentStats := prometheus.NewHistogram(prometheus.HistogramOpts{ @@ -469,11 +467,11 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R var labelValues []string for _, label := range aggregateByLabels { switch label { - case agentmetrics.UsernameLabel: + case agentmetrics.LabelUsername: labelValues = append(labelValues, agentStat.Username) - case agentmetrics.WorkspaceNameLabel: + case agentmetrics.LabelWorkspaceName: labelValues = append(labelValues, agentStat.WorkspaceName) - case agentmetrics.AgentNameLabel: + case agentmetrics.LabelAgentName: labelValues = append(labelValues, agentStat.AgentName) } } diff --git a/codersdk/deployment.go b/codersdk/deployment.go index 312ef9cc68979..b37076ac34167 100644 --- a/codersdk/deployment.go +++ b/codersdk/deployment.go @@ -58,11 +58,6 @@ const ( FeatureControlSharedPorts FeatureName = "control_shared_ports" ) -var AcceptedMetricAggregationLabels = []string{ - agentmetrics.TemplateNameLabel, agentmetrics.AgentNameLabel, - agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, -} - // FeatureNames must be kept in-sync with the Feature enum above. var FeatureNames = []FeatureName{ FeatureUserLimit, @@ -953,7 +948,7 @@ when required by your organization's security policy.`, }, { Name: "Prometheus Aggregate Agent Stats By", - Description: fmt.Sprintf("When collecting agent stats, aggregate metrics by a given set of comma-separated labels to reduce cardinality. Accepted values are %s.", strings.Join(AcceptedMetricAggregationLabels, ", ")), + Description: fmt.Sprintf("When collecting agent stats, aggregate metrics by a given set of comma-separated labels to reduce cardinality. Accepted values are %s.", strings.Join(agentmetrics.LabelAll, ", ")), Flag: "prometheus-aggregate-agent-stats-by", Env: "CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY", Value: clibase.Validate(&c.Prometheus.AggregateAgentStatsBy, func(value *clibase.StringArray) error { @@ -961,22 +956,23 @@ when required by your organization's security policy.`, return nil } - acceptable := make(map[string]any, len(AcceptedMetricAggregationLabels)) - for _, label := range AcceptedMetricAggregationLabels { + acceptable := make(map[string]any, len(agentmetrics.LabelAll)) + for _, label := range agentmetrics.LabelAll { acceptable[label] = nil } for _, label := range value.Value() { if _, found := acceptable[label]; !found { return xerrors.Errorf("%q is not a valid aggregation label; only one or more of %q are acceptable", - label, strings.Join(AcceptedMetricAggregationLabels, ", ")) + label, strings.Join(agentmetrics.LabelAll, ", ")) } } return nil }), - Group: &deploymentGroupIntrospectionPrometheus, - YAML: "aggregate_agent_stats_by", + Group: &deploymentGroupIntrospectionPrometheus, + YAML: "aggregate_agent_stats_by", + Default: strings.Join(agentmetrics.LabelAll, ","), }, { Name: "Prometheus Collect Database Metrics", diff --git a/enterprise/cli/testdata/coder_server_--help.golden b/enterprise/cli/testdata/coder_server_--help.golden index 8e9ccac868c1e..30c2f778e81e9 100644 --- a/enterprise/cli/testdata/coder_server_--help.golden +++ b/enterprise/cli/testdata/coder_server_--help.golden @@ -124,6 +124,11 @@ INTROSPECTION / PROMETHEUS OPTIONS: --prometheus-address host:port, $CODER_PROMETHEUS_ADDRESS (default: 127.0.0.1:2112) The bind address to serve prometheus metrics. + --prometheus-aggregate-agent-stats-by string-array, $CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY (default: agent_name,template_name,username,workspace_name) + When collecting agent stats, aggregate metrics by a given set of + comma-separated labels to reduce cardinality. Accepted values are + agent_name, template_name, username, workspace_name. + --prometheus-collect-agent-stats bool, $CODER_PROMETHEUS_COLLECT_AGENT_STATS Collect agent stats (may increase charges for metrics storage). From 8f2c50532ba0f2604a1f4f8f34f7f12f7bda289c Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 11 Mar 2024 10:17:41 +0200 Subject: [PATCH 13/19] More review comments & CI appeasement Signed-off-by: Danny Kopping --- coderd/agentmetrics/labels.go | 25 +++++++++++ coderd/agentmetrics/labels_test.go | 48 +++++++++++++++++++++ coderd/prometheusmetrics/aggregator.go | 4 ++ coderd/prometheusmetrics/aggregator_test.go | 17 ++++++++ codersdk/deployment.go | 14 +----- docs/cli/server.md | 3 +- 6 files changed, 97 insertions(+), 14 deletions(-) create mode 100644 coderd/agentmetrics/labels_test.go diff --git a/coderd/agentmetrics/labels.go b/coderd/agentmetrics/labels.go index b8259e4a14d54..b4cb976111a21 100644 --- a/coderd/agentmetrics/labels.go +++ b/coderd/agentmetrics/labels.go @@ -1,5 +1,11 @@ package agentmetrics +import ( + "strings" + + "golang.org/x/xerrors" +) + const ( LabelAgentName = "agent_name" LabelUsername = "username" @@ -11,3 +17,22 @@ var ( LabelAll = []string{LabelAgentName, LabelTemplateName, LabelUsername, LabelWorkspaceName} LabelAgentStats = []string{LabelAgentName, LabelUsername, LabelWorkspaceName} ) + +// ValidateAggregationLabels ensures a given set of labels are valid aggregation labels. +func ValidateAggregationLabels(labels []string) error { + acceptable := LabelAll + + seen := make(map[string]any, len(acceptable)) + for _, label := range acceptable { + seen[label] = nil + } + + for _, label := range labels { + if _, found := seen[label]; !found { + return xerrors.Errorf("%q is not a valid aggregation label; only one or more of %q are acceptable", + label, strings.Join(acceptable, ", ")) + } + } + + return nil +} diff --git a/coderd/agentmetrics/labels_test.go b/coderd/agentmetrics/labels_test.go new file mode 100644 index 0000000000000..af7091fe5e7a8 --- /dev/null +++ b/coderd/agentmetrics/labels_test.go @@ -0,0 +1,48 @@ +package agentmetrics_test + +import ( + "testing" + + "github.com/coder/coder/v2/coderd/agentmetrics" + "github.com/stretchr/testify/require" +) + +func TestValidateAggregationLabels(t *testing.T) { + tests := []struct { + name string + labels []string + expectedErr bool + }{ + { + name: "empty list is valid", + }, + { + name: "single valid entry", + labels: []string{agentmetrics.LabelTemplateName}, + }, + { + name: "multiple valid entries", + labels: []string{agentmetrics.LabelTemplateName, agentmetrics.LabelUsername}, + }, + { + name: "repeated valid entries are not invalid", + labels: []string{agentmetrics.LabelTemplateName, agentmetrics.LabelUsername, agentmetrics.LabelUsername, agentmetrics.LabelUsername}, + }, + { + name: "empty entry is invalid", + labels: []string{""}, + expectedErr: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + err := agentmetrics.ValidateAggregationLabels(tc.labels) + if tc.expectedErr { + require.Error(t, err) + } + }) + } +} diff --git a/coderd/prometheusmetrics/aggregator.go b/coderd/prometheusmetrics/aggregator.go index 6f22d429a36f4..44ade677d5cff 100644 --- a/coderd/prometheusmetrics/aggregator.go +++ b/coderd/prometheusmetrics/aggregator.go @@ -335,6 +335,10 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { var input []annotatedMetric output := make([]prometheus.Metric, 0, len(ma.store)) + if len(ma.aggregateByLabels) == 0 { + ma.aggregateByLabels = agentmetrics.LabelAll + } + // If custom aggregation labels have not been chosen, generate Prometheus metrics without any pre-aggregation. // This results in higher cardinality, but may be desirable in larger deployments. // diff --git a/coderd/prometheusmetrics/aggregator_test.go b/coderd/prometheusmetrics/aggregator_test.go index cb6ad427eb029..8ec8080e5da34 100644 --- a/coderd/prometheusmetrics/aggregator_test.go +++ b/coderd/prometheusmetrics/aggregator_test.go @@ -561,6 +561,23 @@ func TestLabelsAggregation(t *testing.T) { // Nothing will be returned. expected: []*agentproto.Stats_Metric{}, }, + { + // Scenario: validation fails and an empty list is given for aggregation. + name: "empty label aggregation list", + aggregateOn: []string{}, + given: []statCollection{ + { + labels: testLabels, + metrics: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1}, + }, + }, + }, + // Default aggregation will be used. + expected: []*agentproto.Stats_Metric{ + {Name: "user_counter", Type: agentproto.Stats_Metric_COUNTER, Value: 1, Labels: commonLabels}, + }, + }, } for _, tc := range tests { diff --git a/codersdk/deployment.go b/codersdk/deployment.go index b37076ac34167..f1d4df519ac01 100644 --- a/codersdk/deployment.go +++ b/codersdk/deployment.go @@ -956,19 +956,7 @@ when required by your organization's security policy.`, return nil } - acceptable := make(map[string]any, len(agentmetrics.LabelAll)) - for _, label := range agentmetrics.LabelAll { - acceptable[label] = nil - } - - for _, label := range value.Value() { - if _, found := acceptable[label]; !found { - return xerrors.Errorf("%q is not a valid aggregation label; only one or more of %q are acceptable", - label, strings.Join(agentmetrics.LabelAll, ", ")) - } - } - - return nil + return agentmetrics.ValidateAggregationLabels(value.Value()) }), Group: &deploymentGroupIntrospectionPrometheus, YAML: "aggregate_agent_stats_by", diff --git a/docs/cli/server.md b/docs/cli/server.md index 27e6996558aaa..6c44cd70d382e 100644 --- a/docs/cli/server.md +++ b/docs/cli/server.md @@ -749,8 +749,9 @@ The bind address to serve prometheus metrics. | Type | string-array | | Environment | $CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY | | YAML | introspection.prometheus.aggregate_agent_stats_by | +| Default | agent_name,template_name,username,workspace_name | -When collecting agent stats, aggregate metrics by a given set of comma-separated labels to reduce cardinality. Accepted values are template_name, agent_name, username, workspace_name. +When collecting agent stats, aggregate metrics by a given set of comma-separated labels to reduce cardinality. Accepted values are agent_name, template_name, username, workspace_name. ### --prometheus-collect-agent-stats From c0896a77241524afbdc5b8097e197a109590f0a3 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 11 Mar 2024 10:26:53 +0200 Subject: [PATCH 14/19] Fixing lints Signed-off-by: Danny Kopping --- coderd/agentmetrics/labels.go | 2 +- coderd/agentmetrics/labels_test.go | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/coderd/agentmetrics/labels.go b/coderd/agentmetrics/labels.go index b4cb976111a21..7257f1bb618f8 100644 --- a/coderd/agentmetrics/labels.go +++ b/coderd/agentmetrics/labels.go @@ -8,8 +8,8 @@ import ( const ( LabelAgentName = "agent_name" - LabelUsername = "username" LabelTemplateName = "template_name" + LabelUsername = "username" LabelWorkspaceName = "workspace_name" ) diff --git a/coderd/agentmetrics/labels_test.go b/coderd/agentmetrics/labels_test.go index af7091fe5e7a8..3bcdd4a68afeb 100644 --- a/coderd/agentmetrics/labels_test.go +++ b/coderd/agentmetrics/labels_test.go @@ -3,11 +3,14 @@ package agentmetrics_test import ( "testing" - "github.com/coder/coder/v2/coderd/agentmetrics" "github.com/stretchr/testify/require" + + "github.com/coder/coder/v2/coderd/agentmetrics" ) func TestValidateAggregationLabels(t *testing.T) { + t.Parallel() + tests := []struct { name string labels []string @@ -36,6 +39,8 @@ func TestValidateAggregationLabels(t *testing.T) { } for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { t.Parallel() From c11dc611db18fe2dd05948dc8dd376d843285e20 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Mon, 11 Mar 2024 09:22:13 +0000 Subject: [PATCH 15/19] chore(dogfood): update keys --- .../files/usr/share/keyrings/nodesource.gpg | Bin 1185 -> 2206 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/dogfood/files/usr/share/keyrings/nodesource.gpg b/dogfood/files/usr/share/keyrings/nodesource.gpg index a8c38d432dbd84d6d385b83ef5d190120a65a879..4f3ec4ed793b397c15b9cba46c45cac6315dcc62 100644 GIT binary patch literal 2206 zcmV;P2x0e`0u2OHn zGb2pT0FAJHnQ9LOen#@%F5|z&XQSYC`H_RzY$X#LGtDigk9KP_M=>ehkUPDXTu;B(V3ubAZ5m-sKU^kff*x~ zZ`Cm{)>!{y$rAt(0RRECA5L#%Wm9i;a${v6JZErcKyGhjWpi(Ja${vKV{dIfi2^tT z69EDMA_W3dnMyIxRsH;0PMkg$clS^{YTdUXw+PSWLNu=D0z2yO(O z_L)x-nN}nt%nnbK_KGzgha(e!#vOg#< zDV3pDqn*)=Wy$Atk^4+1CMYk-4;}Z1C+o<{-7M|O=%x6uUWppkB0=VIvbXoDGIP-N zq-m;2y#N%yudo`pz(4g63%S%092Zt2H^_rSC8@r4%OuDm1PL6`jbBvv&sE@N5AEzZy|9o#Den98Na`3}LhG;}Feurl9!o_En{3W5c`HWwC zvSc;MaJ?9&zEC5mh?eByw7&Ppa*vpR17SkBOr1P`{D)bx#<9JOtw?%LWi~MSqr{D{ zNjSP_F1-9@r_I*qq@)UvjrYg^>X!8+E} zMwR_59u1pS&LPPvZk6#cM&0>l7ErehWFIuqaG#&4Ss;8^akyF#TUGg3cgWiK74@o9!uIFErdKdjjITX|v%3 zUsg#~(bn4CnDl)bGku-5sX_4x@XYO}ba)la7GdHy$Y9)s2Tbpjc2)mqec%8wK)oeH zzf1hLr$%$7#PmA*m|+E-0|z^d;WJU%OJiZtIE&iROSaP6d~Rq$Tta(Ayl`-c<|&nfrwbw=OFZp*dq>L1av);N3wE+aeWA@@&(c_& zN2PxK=Q;j}EHb^6r7iLOnpdDXB(?$@3;+rV5EfOSt7unZfP>Hw0GwhUD@R{&F_wRZ1~2By zI%84XB+wDNIiGz3sLe0y#;2Uq6F0%fi_`)PxM*cuSB7^?5x?|R&E{jBF|dT)L!2P; z)Q-A^MBsW8jj{Dwp5ygBlBh3#1Opok&DJ21f%;F@xWJMBAp3s0;Zqb`x1+?$GV+=x zPki{BDcOI}xZGi?mk1f9TeS^apSix1lpg*3{%l*6d2`~ur^_x;oamqi;R^bh#Q5dw=z ze+M&hc}-v|&mj?`V^PuBYxBa+-Mi+nnag z9eQ6NssJ;!_Nnu}AX&>9GxCH~+^$})WE9M0H`$0SR}BL08A;u-$u3-jNK{Z@SnEeO03#02lO?o~>rC*DA4s+0G#EbI&sEr!W(OV delta 1171 zcmV;E1Z?}B5upixnE?$1S3@^Y0SExvHQ&hIJ?)>39eD#_&$vO@+td2YmwTx=W@lm% z!=DE6YmqMjgw(q3-*S->ExL79nYn#>be>4ZK!U7(!-Wc&LsCY) z_Vh^n?lfpePG%6^@g|M}qCT3FWI#=i{*p4{&H+}=0UvBC+p{XV@k$uvQ^B6@854MeW|oMn z0;|%;lnb~;fY%c_czC&z+_j({#{dxl00FcjPE&7eX=ETgZgX#JX=E*DaA!bnZ)9b2 zZ*_8GWiDfHZ9a(sI0O>`0stZf0#`#fP?JCcAb&4ewfUPH;IzxU2mUTGou>Ec+iPKN z(Pskui=>=3Gcn>nep6X1{9c&0rJeYk0&Evrubk0>98Yn9D*mVYF}MI>%Qbl;u1x0p zpTDTZOdTI+g0y%_hjMG$WN_9?I|Z>>JRVA#Oh;%yUoAcz_>XmFwB4wM@OO*klR!n0Ol)z@RJ}PDj<)Z$bN%;!bncZ8*JpM(hXI15 zx3o_F;Ox40h_e|nW)Bgq8tpDF@)x-vLQpN=8xzRowZt#2+V>W3BqL&A1euhXeh*3b zZcHIsNBo`&-PECPJl(rJE%wvQjC~A|seeR^ykkNO-M%c3@u+25xd9CXS3@^Y0SEx9 z(`YWXYb@$GgWB$Nz0eWIa!wiFcDA*Vf4YbF^Ak>sC5DARE+ye;O)=-?Br4=va=U`J zD`(qoo38u=&=*cXYFP7DK*lu1xTPr|IjbGY$o$_|F0#tUUu0NIZLovQBO z#abeKS3h=5{bC!oeqE~?X^~&pknYBanQ10i?)JC$kfjJiGLTylY-}J8AbJ+Ry2tet zv4A+Og?mQnh&e}Ao&~a6Jk`th3V%sp5?hGKY1B*`=EQEh#{;UeE<;w z00D^s9|RZy0ssjG0#`#fPy!na0162ZFIlzun;YP?PtFJZ6*tcVM|L-U>ay-yzjoMU zpRnqi))#h)*_U$l=VycIczr5r13)J)cVa zn~8%n5jw8Z`?Pnh%l=IB9l}p5jDsFVEH;Ev@NTkn!~6UHd=oH4uiR(XZshca;U09Z z+b#NrG7Dy?$AUIW2<;d5811nFv5@JPHmx^!FW-aYand01BF6z)OW@vZ&gp}CEYY%; z63UFWD{rB|r8o Date: Mon, 11 Mar 2024 15:23:13 +0200 Subject: [PATCH 16/19] Fix flaky label evaluation by sorting both sides Signed-off-by: Danny Kopping --- coderd/prometheusmetrics/aggregator_test.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/coderd/prometheusmetrics/aggregator_test.go b/coderd/prometheusmetrics/aggregator_test.go index 8ec8080e5da34..7f0b0b2e4faee 100644 --- a/coderd/prometheusmetrics/aggregator_test.go +++ b/coderd/prometheusmetrics/aggregator_test.go @@ -206,9 +206,11 @@ func verifyCollectedMetrics(t *testing.T, expected []*agentproto.Stats_Metric, a dtoLabels := asMetricAgentLabels(d.GetLabel()) // dto labels are sorted in alphabetical order. - sort.Slice(e.Labels, func(i, j int) bool { + sortFn := func(i, j int) bool { return e.Labels[i].Name < e.Labels[j].Name - }) + } + sort.Slice(e.Labels, sortFn) + sort.Slice(dtoLabels, sortFn) require.Equal(t, e.Labels, dtoLabels, d.String()) } return true From 06f2f872411450ed5c5672614d3519028f3613ea Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 11 Mar 2024 15:36:23 +0200 Subject: [PATCH 17/19] Update node key Signed-off-by: Danny Kopping --- .../files/usr/share/keyrings/nodesource.gpg | Bin 2206 -> 1185 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/dogfood/files/usr/share/keyrings/nodesource.gpg b/dogfood/files/usr/share/keyrings/nodesource.gpg index 4f3ec4ed793b397c15b9cba46c45cac6315dcc62..a8c38d432dbd84d6d385b83ef5d190120a65a879 100644 GIT binary patch delta 1171 zcmV;E1Z?}B5upixnE?$1S3@^Y0SExvHQ&hIJ?)>39eD#_&$vO@+td2YmwTx=W@lm% z!=DE6YmqMjgw(q3-*S->ExL79nYn#>be>4ZK!U7(!-Wc&LsCY) z_Vh^n?lfpePG%6^@g|M}qCT3FWI#=i{*p4{&H+}=0UvBC+p{XV@k$uvQ^B6@854MeW|oMn z0;|%;lnb~;fY%c_czC&z+_j({#{dxl00FcjPE&7eX=ETgZgX#JX=E*DaA!bnZ)9b2 zZ*_8GWiDfHZ9a(sI0O>`0stZf0#`#fP?JCcAb&4ewfUPH;IzxU2mUTGou>Ec+iPKN z(Pskui=>=3Gcn>nep6X1{9c&0rJeYk0&Evrubk0>98Yn9D*mVYF}MI>%Qbl;u1x0p zpTDTZOdTI+g0y%_hjMG$WN_9?I|Z>>JRVA#Oh;%yUoAcz_>XmFwB4wM@OO*klR!n0Ol)z@RJ}PDj<)Z$bN%;!bncZ8*JpM(hXI15 zx3o_F;Ox40h_e|nW)Bgq8tpDF@)x-vLQpN=8xzRowZt#2+V>W3BqL&A1euhXeh*3b zZcHIsNBo`&-PECPJl(rJE%wvQjC~A|seeR^ykkNO-M%c3@u+25xd9CXS3@^Y0SEx9 z(`YWXYb@$GgWB$Nz0eWIa!wiFcDA*Vf4YbF^Ak>sC5DARE+ye;O)=-?Br4=va=U`J zD`(qoo38u=&=*cXYFP7DK*lu1xTPr|IjbGY$o$_|F0#tUUu0NIZLovQBO z#abeKS3h=5{bC!oeqE~?X^~&pknYBanQ10i?)JC$kfjJiGLTylY-}J8AbJ+Ry2tet zv4A+Og?mQnh&e}Ao&~a6Jk`th3V%sp5?hGKY1B*`=EQEh#{;UeE<;w z00D^s9|RZy0ssjG0#`#fPy!na0162ZFIlzun;YP?PtFJZ6*tcVM|L-U>ay-yzjoMU zpRnqi))#h)*_U$l=VycIczr5r13)J)cVa zn~8%n5jw8Z`?Pnh%l=IB9l}p5jDsFVEH;Ev@NTkn!~6UHd=oH4uiR(XZshca;U09Z z+b#NrG7Dy?$AUIW2<;d5811nFv5@JPHmx^!FW-aYand01BF6z)OW@vZ&gp}CEYY%; z63UFWD{rB|r8o zGb2pT0FAJHnQ9LOen#@%F5|z&XQSYC`H_RzY$X#LGtDigk9KP_M=>ehkUPDXTu;B(V3ubAZ5m-sKU^kff*x~ zZ`Cm{)>!{y$rAt(0RRECA5L#%Wm9i;a${v6JZErcKyGhjWpi(Ja${vKV{dIfi2^tT z69EDMA_W3dnMyIxRsH;0PMkg$clS^{YTdUXw+PSWLNu=D0z2yO(O z_L)x-nN}nt%nnbK_KGzgha(e!#vOg#< zDV3pDqn*)=Wy$Atk^4+1CMYk-4;}Z1C+o<{-7M|O=%x6uUWppkB0=VIvbXoDGIP-N zq-m;2y#N%yudo`pz(4g63%S%092Zt2H^_rSC8@r4%OuDm1PL6`jbBvv&sE@N5AEzZy|9o#Den98Na`3}LhG;}Feurl9!o_En{3W5c`HWwC zvSc;MaJ?9&zEC5mh?eByw7&Ppa*vpR17SkBOr1P`{D)bx#<9JOtw?%LWi~MSqr{D{ zNjSP_F1-9@r_I*qq@)UvjrYg^>X!8+E} zMwR_59u1pS&LPPvZk6#cM&0>l7ErehWFIuqaG#&4Ss;8^akyF#TUGg3cgWiK74@o9!uIFErdKdjjITX|v%3 zUsg#~(bn4CnDl)bGku-5sX_4x@XYO}ba)la7GdHy$Y9)s2Tbpjc2)mqec%8wK)oeH zzf1hLr$%$7#PmA*m|+E-0|z^d;WJU%OJiZtIE&iROSaP6d~Rq$Tta(Ayl`-c<|&nfrwbw=OFZp*dq>L1av);N3wE+aeWA@@&(c_& zN2PxK=Q;j}EHb^6r7iLOnpdDXB(?$@3;+rV5EfOSt7unZfP>Hw0GwhUD@R{&F_wRZ1~2By zI%84XB+wDNIiGz3sLe0y#;2Uq6F0%fi_`)PxM*cuSB7^?5x?|R&E{jBF|dT)L!2P; z)Q-A^MBsW8jj{Dwp5ygBlBh3#1Opok&DJ21f%;F@xWJMBAp3s0;Zqb`x1+?$GV+=x zPki{BDcOI}xZGi?mk1f9TeS^apSix1lpg*3{%l*6d2`~ur^_x;oamqi;R^bh#Q5dw=z ze+M&hc}-v|&mj?`V^PuBYxBa+-Mi+nnag z9eQ6NssJ;!_Nnu}AX&>9GxCH~+^$})WE9M0H`$0SR}BL08A;u-$u3-jNK{Z@SnEeO03#02lO?o~>rC*DA4s+0G#EbI&sEr!W(OV From 3c1d016a82d640dbcb258fc3576899b7678a928e Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 11 Mar 2024 16:09:13 +0000 Subject: [PATCH 18/19] Filter out unacceptable labels in AgentStats Signed-off-by: Danny Kopping --- coderd/prometheusmetrics/aggregator_test.go | 2 +- coderd/prometheusmetrics/prometheusmetrics.go | 19 +++++++++++++++++++ .../prometheusmetrics_test.go | 3 ++- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/coderd/prometheusmetrics/aggregator_test.go b/coderd/prometheusmetrics/aggregator_test.go index 7f0b0b2e4faee..412dfae1e7559 100644 --- a/coderd/prometheusmetrics/aggregator_test.go +++ b/coderd/prometheusmetrics/aggregator_test.go @@ -262,7 +262,7 @@ func TestUpdateMetrics_MetricsExpire(t *testing.T) { // given registry := prometheus.NewRegistry() - metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Millisecond, nil) + metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Millisecond, agentmetrics.LabelAll) require.NoError(t, err) ctx, cancelFunc := context.WithCancel(context.Background()) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 08b650dd87b28..c3141e1e58775 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -338,6 +338,8 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R aggregateByLabels = agentmetrics.LabelAgentStats } + aggregateByLabels = filterInvalidLabels(aggregateByLabels) + metricsCollectorAgentStats := prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: "coderd", Subsystem: "prometheusmetrics", @@ -514,3 +516,20 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R <-done }, nil } + +// filterInvalidLabels handles a slightly messy situation whereby `prometheus-aggregate-agent-stats-by` can control on +// which labels agent stats are aggregated, but for these specific metrics in this file there is no `template` label value, +// and therefore we have to exclude it from the list of acceptable labels. +func filterInvalidLabels(labels []string) []string { + var out []string + + for _, label := range labels { + if label == agentmetrics.LabelTemplateName { + continue + } + + out = append(out, label) + } + + return out +} diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index c16f4dfba2b76..1712497b4fe3d 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -5,6 +5,7 @@ import ( "database/sql" "encoding/json" "fmt" + "github.com/coder/coder/v2/coderd/agentmetrics" "os" "reflect" "sync/atomic" @@ -451,7 +452,7 @@ func TestAgentStats(t *testing.T) { // and it doesn't depend on the real time. closeFunc, err := prometheusmetrics.AgentStats(ctx, slogtest.Make(t, &slogtest.Options{ IgnoreErrors: true, - }), registry, db, time.Now().Add(-time.Minute), time.Millisecond, nil) + }), registry, db, time.Now().Add(-time.Minute), time.Millisecond, agentmetrics.LabelAll) require.NoError(t, err) t.Cleanup(closeFunc) From f8ebdf2182ebf5aeaea5d0c1ce3bd15cf9e978e5 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 11 Mar 2024 16:20:46 +0000 Subject: [PATCH 19/19] make fmt Signed-off-by: Danny Kopping --- coderd/prometheusmetrics/prometheusmetrics_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index 1712497b4fe3d..3b76d9669c9a0 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -5,13 +5,14 @@ import ( "database/sql" "encoding/json" "fmt" - "github.com/coder/coder/v2/coderd/agentmetrics" "os" "reflect" "sync/atomic" "testing" "time" + "github.com/coder/coder/v2/coderd/agentmetrics" + "github.com/coder/coder/v2/coderd/batchstats" "github.com/coder/coder/v2/coderd/database/dbtestutil" "github.com/coder/coder/v2/coderd/database/dbtime"