Skip to content

feat: make agent stats' cardinality configurable #12468

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Mar 11, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
1172e09
Initial implementation
dannykopping Mar 7, 2024
ddd563e
Refactoring
dannykopping Mar 8, 2024
6a1ab6e
Drive-by change to add delve to nix flake so ./scripts/develop.sh --d…
dannykopping Mar 8, 2024
ce0c22d
Update deployment config
dannykopping Mar 8, 2024
25fd616
Linting
dannykopping Mar 8, 2024
cc1a0b0
Control cardinality of coderd metrics as well
dannykopping Mar 8, 2024
122f68d
Remove default value
dannykopping Mar 8, 2024
3e569ff
make fmt && make gen
dannykopping Mar 8, 2024
62e2624
Appeasing CI
dannykopping Mar 8, 2024
6544d2d
Appeasing CI, again...
dannykopping Mar 8, 2024
5e89d05
...
dannykopping Mar 8, 2024
ae8a912
Fix loop var capture
dannykopping Mar 8, 2024
023f7d4
Review comments & CI appeasement
dannykopping Mar 11, 2024
9aedd97
Merge branch 'dk/configurable-cardinality' of github.com:dannykopping…
dannykopping Mar 11, 2024
92be1d6
More review comments & CI appeasement
dannykopping Mar 11, 2024
9b16a3b
Fixing lints
dannykopping Mar 11, 2024
6c7d1bd
Merge branch 'main' of github.com:/coder/coder into dk/configurable-c…
dannykopping Mar 11, 2024
3538e78
chore(dogfood): update keys
johnstcn Mar 11, 2024
5a97817
Merge remote-tracking branch 'upstream/cj/dogfood-update-keys' into d…
dannykopping Mar 11, 2024
c861500
Merge branch 'main' of github.com:/coder/coder into dk/configurable-c…
dannykopping Mar 11, 2024
6bcfe99
Merge branch 'main' of github.com:/coder/coder into dk/configurable-c…
dannykopping Mar 11, 2024
765fe9d
Fix flaky label evaluation by sorting both sides
dannykopping Mar 11, 2024
37c3628
Merge branch 'main' of github.com:/coder/coder into dk/configurable-c…
dannykopping Mar 11, 2024
f1d2821
Update node key
dannykopping Mar 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cli/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -229,13 +229,13 @@ func enablePrometheus(
afterCtx(ctx, closeInsightsMetricsCollector)

if vals.Prometheus.CollectAgentStats {
closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, time.Now(), 0)
closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, time.Now(), 0, options.DeploymentValues.Prometheus.AggregateAgentStatsBy.Value())
if err != nil {
return nil, xerrors.Errorf("register agent stats prometheus metric: %w", err)
}
afterCtx(ctx, closeAgentStatsFunc)

metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(logger, options.PrometheusRegistry, 0)
metricsAggregator, err := prometheusmetrics.NewMetricsAggregator(logger, options.PrometheusRegistry, 0, options.DeploymentValues.Prometheus.AggregateAgentStatsBy.Value())
if err != nil {
return nil, xerrors.Errorf("can't initialize metrics aggregator: %w", err)
}
Expand Down
7 changes: 6 additions & 1 deletion cli/testdata/coder_server_--help.golden
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,15 @@ INTROSPECTION / LOGGING OPTIONS:
--log-stackdriver string, $CODER_LOGGING_STACKDRIVER
Output Stackdriver compatible logs to a given file.

INTROSPECTION / PROMETHEUS OPTIONS:
INTROSPECTION / PROMETHEUS OPTIONS:
--prometheus-address host:port, $CODER_PROMETHEUS_ADDRESS (default: 127.0.0.1:2112)
The bind address to serve prometheus metrics.

--prometheus-aggregate-agent-stats-by string-array, $CODER_PROMETHEUS_AGGREGATE_AGENT_STATS_BY
When collecting agent stats, aggregate metrics by a given set of
comma-separated labels to reduce cardinality. Accepted values are
template_name, agent_name, username, workspace_name.

--prometheus-collect-agent-stats bool, $CODER_PROMETHEUS_COLLECT_AGENT_STATS
Collect agent stats (may increase charges for metrics storage).

Expand Down
5 changes: 5 additions & 0 deletions cli/testdata/server-config.yaml.golden
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,11 @@ introspection:
# Collect agent stats (may increase charges for metrics storage).
# (default: <unset>, type: bool)
collect_agent_stats: false
# When collecting agent stats, aggregate metrics by a given set of comma-separated
# labels to reduce cardinality. Accepted values are template_name, agent_name,
# username, workspace_name.
# (default: <unset>, type: string-array)
aggregate_agent_stats_by: []
# Collect database metrics (may increase charges for metrics storage).
# (default: false, type: bool)
collect_db_metrics: false
Expand Down
8 changes: 8 additions & 0 deletions coderd/agentmetrics/labels.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package agentmetrics

const (
TemplateNameLabel = "template_name"
AgentNameLabel = "agent_name"
UsernameLabel = "username"
WorkspaceNameLabel = "workspace_name"
)
6 changes: 6 additions & 0 deletions coderd/apidoc/docs.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions coderd/apidoc/swagger.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

164 changes: 153 additions & 11 deletions coderd/prometheusmetrics/aggregator.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@ import (
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
"golang.org/x/xerrors"

"github.com/coder/coder/v2/coderd/agentmetrics"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately our formatter doesn't handle merging import groups and leaves things in a messy state (depending on what program injected them). 😔

If you notice these, please feel free to fix, but the standard is we try our best but sometimes these slip through, so don't worry too much.


"cdr.dev/slog"

agentproto "github.com/coder/coder/v2/agent/proto"
Expand Down Expand Up @@ -43,9 +46,10 @@ type MetricsAggregator struct {
collectCh chan (chan []prometheus.Metric)
updateCh chan updateRequest

storeSizeGauge prometheus.Gauge
updateHistogram prometheus.Histogram
cleanupHistogram prometheus.Histogram
storeSizeGauge prometheus.Gauge
updateHistogram prometheus.Histogram
cleanupHistogram prometheus.Histogram
aggregateByLabels []string
}

type updateRequest struct {
Expand All @@ -68,6 +72,8 @@ type annotatedMetric struct {
templateName string

expiryDate time.Time

aggregateByLabels []string
}

type metricKey struct {
Expand Down Expand Up @@ -102,13 +108,28 @@ func hashKey(req *updateRequest, m *agentproto.Stats_Metric) metricKey {
var _ prometheus.Collector = new(MetricsAggregator)

func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) {
labels := make([]string, 0, len(agentMetricsLabels)+len(am.Labels))
labelValues := make([]string, 0, len(agentMetricsLabels)+len(am.Labels))
var (
baseLabelNames = am.aggregateByLabels
baseLabelValues []string
extraLabels = am.Labels
)

for _, label := range am.aggregateByLabels {
val, err := am.getFieldByLabel(label)
if err != nil {
return nil, err
}

baseLabelValues = append(baseLabelValues, val)
}

labels := make([]string, 0, len(baseLabelNames)+len(extraLabels))
labelValues := make([]string, 0, len(baseLabelNames)+len(extraLabels))

labels = append(labels, agentMetricsLabels...)
labelValues = append(labelValues, am.username, am.workspaceName, am.agentName, am.templateName)
labels = append(labels, baseLabelNames...)
labelValues = append(labelValues, baseLabelValues...)

for _, l := range am.Labels {
for _, l := range extraLabels {
labels = append(labels, l.Name)
labelValues = append(labelValues, l.Value)
}
Expand All @@ -118,10 +139,48 @@ func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) {
if err != nil {
return nil, err
}

return prometheus.MustNewConstMetric(desc, valueType, am.Value, labelValues...), nil
}

func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, duration time.Duration) (*MetricsAggregator, error) {
// getFieldByLabel returns the related field value for a given label
func (am *annotatedMetric) getFieldByLabel(label string) (string, error) {
var labelVal string
switch label {
case agentmetrics.WorkspaceNameLabel:
labelVal = am.workspaceName
case agentmetrics.TemplateNameLabel:
labelVal = am.templateName
case agentmetrics.AgentNameLabel:
labelVal = am.agentName
case agentmetrics.UsernameLabel:
labelVal = am.username
default:
return "", xerrors.Errorf("unexpected label: %q", label)
}

return labelVal, nil
}

func (am *annotatedMetric) clone() annotatedMetric {
stats := &agentproto.Stats_Metric{
Name: am.Name,
Type: am.Type,
Value: am.Value,
Labels: am.Labels,
}

return annotatedMetric{
Stats_Metric: stats,
username: am.username,
workspaceName: am.workspaceName,
agentName: am.agentName,
templateName: am.templateName,
expiryDate: am.expiryDate,
}
}

func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, duration time.Duration, aggregateByLabels []string) (*MetricsAggregator, error) {
metricsCleanupInterval := defaultMetricsCleanupInterval
if duration > 0 {
metricsCleanupInterval = duration
Expand Down Expand Up @@ -174,9 +233,66 @@ func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer,
storeSizeGauge: storeSizeGauge,
updateHistogram: updateHistogram,
cleanupHistogram: cleanupHistogram,

aggregateByLabels: aggregateByLabels,
}, nil
}

// labelAggregator is used to control cardinality of collected Prometheus metrics by pre-aggregating series based on given labels.
type labelAggregator struct {
aggregations map[string]float64
metrics map[string]annotatedMetric
}

func newLabelAggregator(size int) *labelAggregator {
return &labelAggregator{
aggregations: make(map[string]float64, size),
metrics: make(map[string]annotatedMetric, size),
}
}

func (a *labelAggregator) aggregate(am annotatedMetric, labels []string) error {
// Use a LabelSet because it can give deterministic fingerprints of label combinations regardless of map ordering.
labelSet := make(model.LabelSet, len(labels))

for _, label := range labels {
val, err := am.getFieldByLabel(label)
if err != nil {
return err
}

labelSet[model.LabelName(label)] = model.LabelValue(val)
}

// Memoize based on the metric name & the unique combination of labels.
key := fmt.Sprintf("%s:%v", am.Stats_Metric.Name, labelSet.FastFingerprint())

// Aggregate the value based on the key.
a.aggregations[key] += am.Value

metric, found := a.metrics[key]
if !found {
// Take a copy of the given annotatedMetric because it may be manipulated later and contains pointers.
metric = am.clone()
}

// Store the metric.
metric.aggregateByLabels = labels
metric.Value = a.aggregations[key]

a.metrics[key] = metric

return nil
}

func (a *labelAggregator) listMetrics() []annotatedMetric {
var out []annotatedMetric
for _, am := range a.metrics {
out = append(out, am)
}
return out
}

func (ma *MetricsAggregator) Run(ctx context.Context) func() {
ctx, cancelFunc := context.WithCancel(ctx)
done := make(chan struct{})
Expand Down Expand Up @@ -216,15 +332,41 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() {
case outputCh := <-ma.collectCh:
ma.log.Debug(ctx, "collect metrics")

var input []annotatedMetric
output := make([]prometheus.Metric, 0, len(ma.store))
for _, m := range ma.store {

// If custom aggregation labels have not been chosen, generate Prometheus metrics without any pre-aggregation.
// This results in higher cardinality, but may be desirable in larger deployments.
// Default behavior.
if len(ma.aggregateByLabels) == 0 {
for _, m := range ma.store {
// Aggregate by all available metrics.
m.aggregateByLabels = defaultAgentMetricsLabels
input = append(input, m)
}
} else {
// However, if custom aggregations have been chosen, we need to aggregate the values from the annotated
// metrics because we cannot register multiple metric series with the same labels.
la := newLabelAggregator(len(ma.store))

for _, m := range ma.store {
if err := la.aggregate(m, ma.aggregateByLabels); err != nil {
ma.log.Error(ctx, "can't aggregate labels", slog.F("labels", strings.Join(ma.aggregateByLabels, ",")), slog.Error(err))
}
}

input = la.listMetrics()
}

for _, m := range input {
promMetric, err := m.asPrometheus()
if err != nil {
ma.log.Error(ctx, "can't convert Prometheus value type", slog.F("name", m.Name), slog.F("type", m.Type), slog.F("value", m.Value), slog.Error(err))
continue
}
output = append(output, promMetric)
}

outputCh <- output
close(outputCh)
case <-cleanupTicker.C:
Expand Down Expand Up @@ -260,7 +402,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() {
func (*MetricsAggregator) Describe(_ chan<- *prometheus.Desc) {
}

var agentMetricsLabels = []string{usernameLabel, workspaceNameLabel, agentNameLabel, templateNameLabel}
var defaultAgentMetricsLabels = []string{agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, agentmetrics.AgentNameLabel, agentmetrics.TemplateNameLabel}

// AgentMetricLabels are the labels used to decorate an agent's metrics.
// This list should match the list of labels in agentMetricsLabels.
Expand Down
Loading