-
Notifications
You must be signed in to change notification settings - Fork 928
feat: make agent stats' cardinality configurable #12468
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
1172e09
ddd563e
6a1ab6e
ce0c22d
25fd616
cc1a0b0
122f68d
3e569ff
62e2624
6544d2d
5e89d05
ae8a912
023f7d4
9aedd97
92be1d6
9b16a3b
6c7d1bd
3538e78
5a97817
c861500
6bcfe99
765fe9d
37c3628
f1d2821
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
package agentmetrics | ||
|
||
// Names of the labels that can be attached to agent metrics.
const (
	// TemplateNameLabel is the label name for the template name.
	TemplateNameLabel = "template_name"
	// AgentNameLabel is the label name for the agent name.
	AgentNameLabel = "agent_name"
	// UsernameLabel is the label name for the username.
	UsernameLabel = "username"
	// WorkspaceNameLabel is the label name for the workspace name.
	WorkspaceNameLabel = "workspace_name"
)
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,8 +8,11 @@ import ( | |
"time" | ||
|
||
"github.com/prometheus/client_golang/prometheus" | ||
"github.com/prometheus/common/model" | ||
"golang.org/x/xerrors" | ||
|
||
"github.com/coder/coder/v2/coderd/agentmetrics" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Unfortunately our formatter doesn't handle merging import groups and leaves things in a messy state (depending on what program injected them). 😔 If you notice these, please feel free to fix them; the standard is that we try our best, but sometimes these slip through, so don't worry too much. |
||
|
||
"cdr.dev/slog" | ||
|
||
agentproto "github.com/coder/coder/v2/agent/proto" | ||
|
@@ -43,9 +46,10 @@ type MetricsAggregator struct { | |
collectCh chan (chan []prometheus.Metric) | ||
updateCh chan updateRequest | ||
|
||
storeSizeGauge prometheus.Gauge | ||
updateHistogram prometheus.Histogram | ||
cleanupHistogram prometheus.Histogram | ||
storeSizeGauge prometheus.Gauge | ||
updateHistogram prometheus.Histogram | ||
cleanupHistogram prometheus.Histogram | ||
aggregateByLabels []string | ||
} | ||
|
||
type updateRequest struct { | ||
|
@@ -68,6 +72,8 @@ type annotatedMetric struct { | |
templateName string | ||
|
||
expiryDate time.Time | ||
|
||
aggregateByLabels []string | ||
} | ||
|
||
type metricKey struct { | ||
|
@@ -102,13 +108,28 @@ func hashKey(req *updateRequest, m *agentproto.Stats_Metric) metricKey { | |
var _ prometheus.Collector = new(MetricsAggregator) | ||
|
||
func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) { | ||
labels := make([]string, 0, len(agentMetricsLabels)+len(am.Labels)) | ||
labelValues := make([]string, 0, len(agentMetricsLabels)+len(am.Labels)) | ||
var ( | ||
baseLabelNames = am.aggregateByLabels | ||
baseLabelValues []string | ||
extraLabels = am.Labels | ||
) | ||
|
||
for _, label := range am.aggregateByLabels { | ||
dannykopping marked this conversation as resolved.
Show resolved
Hide resolved
|
||
val, err := am.getFieldByLabel(label) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
baseLabelValues = append(baseLabelValues, val) | ||
} | ||
|
||
labels := make([]string, 0, len(baseLabelNames)+len(extraLabels)) | ||
labelValues := make([]string, 0, len(baseLabelNames)+len(extraLabels)) | ||
|
||
labels = append(labels, agentMetricsLabels...) | ||
labelValues = append(labelValues, am.username, am.workspaceName, am.agentName, am.templateName) | ||
labels = append(labels, baseLabelNames...) | ||
labelValues = append(labelValues, baseLabelValues...) | ||
|
||
for _, l := range am.Labels { | ||
for _, l := range extraLabels { | ||
labels = append(labels, l.Name) | ||
labelValues = append(labelValues, l.Value) | ||
} | ||
|
@@ -118,10 +139,48 @@ func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) { | |
if err != nil { | ||
return nil, err | ||
} | ||
|
||
return prometheus.MustNewConstMetric(desc, valueType, am.Value, labelValues...), nil | ||
} | ||
|
||
func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, duration time.Duration) (*MetricsAggregator, error) { | ||
// getFieldByLabel returns the related field value for a given label | ||
func (am *annotatedMetric) getFieldByLabel(label string) (string, error) { | ||
var labelVal string | ||
switch label { | ||
case agentmetrics.WorkspaceNameLabel: | ||
labelVal = am.workspaceName | ||
case agentmetrics.TemplateNameLabel: | ||
labelVal = am.templateName | ||
case agentmetrics.AgentNameLabel: | ||
labelVal = am.agentName | ||
case agentmetrics.UsernameLabel: | ||
labelVal = am.username | ||
default: | ||
return "", xerrors.Errorf("unexpected label: %q", label) | ||
} | ||
|
||
return labelVal, nil | ||
} | ||
|
||
func (am *annotatedMetric) clone() annotatedMetric { | ||
dannykopping marked this conversation as resolved.
Show resolved
Hide resolved
|
||
stats := &agentproto.Stats_Metric{ | ||
Name: am.Name, | ||
Type: am.Type, | ||
Value: am.Value, | ||
Labels: am.Labels, | ||
} | ||
|
||
return annotatedMetric{ | ||
Stats_Metric: stats, | ||
username: am.username, | ||
workspaceName: am.workspaceName, | ||
agentName: am.agentName, | ||
templateName: am.templateName, | ||
expiryDate: am.expiryDate, | ||
} | ||
} | ||
|
||
func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, duration time.Duration, aggregateByLabels []string) (*MetricsAggregator, error) { | ||
metricsCleanupInterval := defaultMetricsCleanupInterval | ||
if duration > 0 { | ||
metricsCleanupInterval = duration | ||
|
@@ -174,9 +233,66 @@ func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, | |
storeSizeGauge: storeSizeGauge, | ||
updateHistogram: updateHistogram, | ||
cleanupHistogram: cleanupHistogram, | ||
|
||
aggregateByLabels: aggregateByLabels, | ||
}, nil | ||
} | ||
|
||
// labelAggregator is used to control cardinality of collected Prometheus metrics by pre-aggregating series based on given labels. | ||
type labelAggregator struct { | ||
aggregations map[string]float64 | ||
metrics map[string]annotatedMetric | ||
} | ||
|
||
func newLabelAggregator(size int) *labelAggregator { | ||
return &labelAggregator{ | ||
aggregations: make(map[string]float64, size), | ||
metrics: make(map[string]annotatedMetric, size), | ||
} | ||
} | ||
|
||
func (a *labelAggregator) aggregate(am annotatedMetric, labels []string) error { | ||
dannykopping marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Use a LabelSet because it can give deterministic fingerprints of label combinations regardless of map ordering. | ||
labelSet := make(model.LabelSet, len(labels)) | ||
|
||
for _, label := range labels { | ||
val, err := am.getFieldByLabel(label) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
labelSet[model.LabelName(label)] = model.LabelValue(val) | ||
} | ||
|
||
// Memoize based on the metric name & the unique combination of labels. | ||
key := fmt.Sprintf("%s:%v", am.Stats_Metric.Name, labelSet.FastFingerprint()) | ||
|
||
// Aggregate the value based on the key. | ||
a.aggregations[key] += am.Value | ||
|
||
metric, found := a.metrics[key] | ||
if !found { | ||
// Take a copy of the given annotatedMetric because it may be manipulated later and contains pointers. | ||
metric = am.clone() | ||
} | ||
|
||
// Store the metric. | ||
metric.aggregateByLabels = labels | ||
metric.Value = a.aggregations[key] | ||
|
||
a.metrics[key] = metric | ||
|
||
return nil | ||
} | ||
|
||
func (a *labelAggregator) listMetrics() []annotatedMetric { | ||
var out []annotatedMetric | ||
for _, am := range a.metrics { | ||
out = append(out, am) | ||
} | ||
return out | ||
} | ||
|
||
func (ma *MetricsAggregator) Run(ctx context.Context) func() { | ||
ctx, cancelFunc := context.WithCancel(ctx) | ||
done := make(chan struct{}) | ||
|
@@ -216,15 +332,41 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { | |
case outputCh := <-ma.collectCh: | ||
ma.log.Debug(ctx, "collect metrics") | ||
|
||
var input []annotatedMetric | ||
output := make([]prometheus.Metric, 0, len(ma.store)) | ||
for _, m := range ma.store { | ||
|
||
// If custom aggregation labels have not been chosen, generate Prometheus metrics without any pre-aggregation. | ||
// This results in higher cardinality, but may be desirable in larger deployments. | ||
// Default behavior. | ||
dannykopping marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if len(ma.aggregateByLabels) == 0 { | ||
for _, m := range ma.store { | ||
// Aggregate by all available metrics. | ||
m.aggregateByLabels = defaultAgentMetricsLabels | ||
input = append(input, m) | ||
} | ||
} else { | ||
// However, if custom aggregations have been chosen, we need to aggregate the values from the annotated | ||
// metrics because we cannot register multiple metric series with the same labels. | ||
la := newLabelAggregator(len(ma.store)) | ||
|
||
for _, m := range ma.store { | ||
if err := la.aggregate(m, ma.aggregateByLabels); err != nil { | ||
ma.log.Error(ctx, "can't aggregate labels", slog.F("labels", strings.Join(ma.aggregateByLabels, ",")), slog.Error(err)) | ||
} | ||
} | ||
|
||
input = la.listMetrics() | ||
} | ||
|
||
for _, m := range input { | ||
promMetric, err := m.asPrometheus() | ||
if err != nil { | ||
ma.log.Error(ctx, "can't convert Prometheus value type", slog.F("name", m.Name), slog.F("type", m.Type), slog.F("value", m.Value), slog.Error(err)) | ||
continue | ||
} | ||
output = append(output, promMetric) | ||
} | ||
|
||
outputCh <- output | ||
close(outputCh) | ||
case <-cleanupTicker.C: | ||
|
@@ -260,7 +402,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { | |
func (*MetricsAggregator) Describe(_ chan<- *prometheus.Desc) { | ||
} | ||
|
||
var agentMetricsLabels = []string{usernameLabel, workspaceNameLabel, agentNameLabel, templateNameLabel} | ||
var defaultAgentMetricsLabels = []string{agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, agentmetrics.AgentNameLabel, agentmetrics.TemplateNameLabel} | ||
|
||
// AgentMetricLabels are the labels used to decorate an agent's metrics. | ||
// This list should match the list of labels in defaultAgentMetricsLabels. | ||
|
Uh oh!
There was an error while loading. Please reload this page.