Skip to content

Commit 4509d55

Browse files
committed
Hacky initial fix
Signed-off-by: Danny Kopping <danny@coder.com>
1 parent d7e8627 commit 4509d55

File tree

3 files changed

+162
-66
lines changed

3 files changed

+162
-66
lines changed

coderd/prometheusmetrics/prometheusmetrics.go

+63-16
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,18 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
592592
if err != nil {
593593
logger.Error(ctx, "can't get agent stats", slog.Error(err))
594594
} else {
595+
aggr := map[string][]float64{}
596+
assoc := map[string]*CachedGaugeVec{
597+
"rx_bytes": agentStatsRxBytesGauge,
598+
"tx_bytes": agentStatsTxBytesGauge,
599+
"median_latency_ms": agentStatsConnectionMedianLatencyGauge,
600+
"conn_count": agentStatsConnectionCountGauge,
601+
"session_count_jb": agentStatsSessionCountJetBrainsGauge,
602+
"session_count_pty": agentStatsSessionCountReconnectingPTYGauge,
603+
"session_count_ssh": agentStatsSessionCountSSHGauge,
604+
"session_count_vscode": agentStatsSessionCountVSCodeGauge,
605+
}
606+
595607
for _, agentStat := range stats {
596608
var labelValues []string
597609
for _, label := range aggregateByLabels {
@@ -605,29 +617,64 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
605617
}
606618
}
607619

608-
agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.RxBytes), labelValues...)
609-
agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.TxBytes), labelValues...)
620+
metricsMap := map[string]float64{
621+
"rx_bytes": float64(agentStat.RxBytes),
622+
"tx_bytes": float64(agentStat.TxBytes),
623+
"median_latency_ms": agentStat.ConnectionMedianLatencyMS,
624+
"conn_count": float64(agentStat.ConnectionCount),
625+
"session_count_jb": float64(agentStat.SessionCountJetBrains),
626+
"session_count_pty": float64(agentStat.SessionCountReconnectingPTY),
627+
"session_count_ssh": float64(agentStat.SessionCountSSH),
628+
"session_count_vscode": float64(agentStat.SessionCountVSCode),
629+
}
610630

611-
agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), labelValues...)
612-
agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, labelValues...)
631+
for m, v := range metricsMap {
632+
key := fmt.Sprintf("%s:%s", m, strings.Join(labelValues, ","))
633+
if _, ok := aggr[key]; !ok {
634+
aggr[key] = []float64{}
635+
}
613636

614-
agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), labelValues...)
615-
agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), labelValues...)
616-
agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), labelValues...)
617-
agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), labelValues...)
637+
aggr[key] = append(aggr[key], v)
638+
}
618639
}
619640

620641
if len(stats) > 0 {
621-
agentStatsRxBytesGauge.Commit()
622-
agentStatsTxBytesGauge.Commit()
642+
for k, values := range aggr {
643+
name, labels, found := strings.Cut(k, ":")
644+
if !found {
645+
// TODO error
646+
panic("oops 1")
647+
}
648+
649+
metric, ok := assoc[name]
650+
if !ok {
651+
// TODO error
652+
panic("oops 2")
653+
}
623654

624-
agentStatsConnectionCountGauge.Commit()
625-
agentStatsConnectionMedianLatencyGauge.Commit()
655+
var value float64
656+
for _, v := range values {
657+
value += v
658+
}
659+
660+
// special-case handling for this metric
661+
if name == "median_latency_ms" {
662+
value = value /
663+
1000 / // convert to seconds
664+
float64(len(values)) // average over all samples
665+
}
666+
667+
operation := VectorOperationSet
668+
if name == "rx_bytes" || name == "tx_bytes" {
669+
operation = VectorOperationAdd
670+
}
671+
672+
metric.WithLabelValues(operation, value, strings.Split(labels, ",")...)
673+
}
626674

627-
agentStatsSessionCountJetBrainsGauge.Commit()
628-
agentStatsSessionCountReconnectingPTYGauge.Commit()
629-
agentStatsSessionCountSSHGauge.Commit()
630-
agentStatsSessionCountVSCodeGauge.Commit()
675+
for _, metric := range assoc {
676+
metric.Commit()
677+
}
631678
}
632679
}
633680

coderd/prometheusmetrics/prometheusmetrics_test.go

+89-50
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,16 @@ import (
1313

1414
"github.com/google/uuid"
1515
"github.com/prometheus/client_golang/prometheus"
16+
promClient "github.com/prometheus/client_model/go"
1617
"github.com/stretchr/testify/assert"
1718
"github.com/stretchr/testify/require"
1819
"tailscale.com/tailcfg"
1920

2021
"cdr.dev/slog"
2122
"cdr.dev/slog/sloggers/slogtest"
2223

24+
"github.com/coder/quartz"
25+
2326
agentproto "github.com/coder/coder/v2/agent/proto"
2427
"github.com/coder/coder/v2/coderd/agentmetrics"
2528
"github.com/coder/coder/v2/coderd/coderdtest"
@@ -38,7 +41,6 @@ import (
3841
"github.com/coder/coder/v2/tailnet"
3942
"github.com/coder/coder/v2/tailnet/tailnettest"
4043
"github.com/coder/coder/v2/testutil"
41-
"github.com/coder/quartz"
4244
)
4345

4446
func TestActiveUsers(t *testing.T) {
@@ -518,11 +520,10 @@ func TestAgentStats(t *testing.T) {
518520
defer agent2.DRPCConn().Close()
519521
defer agent3.DRPCConn().Close()
520522

521-
registry := prometheus.NewRegistry()
522-
523523
// given
524+
const workspaceCount = 3
524525
var i int64
525-
for i = 0; i < 3; i++ {
526+
for i = 0; i < workspaceCount; i++ {
526527
_, err = agent1.UpdateStats(ctx, &agentproto.UpdateStatsRequest{
527528
Stats: &agentproto.Stats{
528529
TxBytes: 1 + i, RxBytes: 2 + i,
@@ -559,58 +560,96 @@ func TestAgentStats(t *testing.T) {
559560
// to be posted after this.
560561
closeBatcher()
561562

562-
// when
563-
//
564-
// Set initialCreateAfter to some time in the past, so that AgentStats would include all above PostStats,
565-
// and it doesn't depend on the real time.
566-
closeFunc, err := prometheusmetrics.AgentStats(ctx, slogtest.Make(t, &slogtest.Options{
567-
IgnoreErrors: true,
568-
}), registry, db, time.Now().Add(-time.Minute), time.Millisecond, agentmetrics.LabelAll, false)
569-
require.NoError(t, err)
570-
t.Cleanup(closeFunc)
563+
tests := []struct {
564+
name string
565+
aggregateByLabels []string
566+
goldenFile string
567+
metricKeyFn func(metric *promClient.MetricFamily, sample *promClient.Metric) string
568+
}{
569+
{
570+
name: "unaggregated",
571+
aggregateByLabels: agentmetrics.LabelAll,
572+
goldenFile: "testdata/agent-stats.json",
573+
metricKeyFn: func(metric *promClient.MetricFamily, sample *promClient.Metric) string {
574+
return fmt.Sprintf("%s:%s:%s:%s", sample.Label[1].GetValue(), sample.Label[2].GetValue(), sample.Label[0].GetValue(), metric.GetName())
575+
},
576+
},
577+
{
578+
name: "single label aggregation",
579+
aggregateByLabels: []string{agentmetrics.LabelUsername},
580+
goldenFile: "testdata/agent-stats-aggregated.json",
581+
metricKeyFn: func(metric *promClient.MetricFamily, sample *promClient.Metric) string {
582+
return fmt.Sprintf("%s:%s", sample.Label[0].GetValue(), metric.GetName())
583+
},
584+
},
585+
}
571586

572-
// then
573-
goldenFile, err := os.ReadFile("testdata/agent-stats.json")
574-
require.NoError(t, err)
575-
golden := map[string]int{}
576-
err = json.Unmarshal(goldenFile, &golden)
577-
require.NoError(t, err)
587+
for _, tc := range tests {
588+
t.Run(tc.name, func(t *testing.T) {
589+
t.Parallel()
578590

579-
collected := map[string]int{}
580-
var executionSeconds bool
581-
assert.Eventually(t, func() bool {
582-
metrics, err := registry.Gather()
583-
assert.NoError(t, err)
591+
// when
592+
//
593+
// Set initialCreateAfter to some time in the past, so that AgentStats would include all above PostStats,
594+
// and it doesn't depend on the real time.
595+
registry := prometheus.NewRegistry()
596+
closeFunc, err := prometheusmetrics.AgentStats(ctx, slogtest.Make(t, &slogtest.Options{
597+
IgnoreErrors: true,
598+
}), registry, db, time.Now().Add(-time.Minute), time.Millisecond, tc.aggregateByLabels, false) // TODO: make conditional on ExperimentWorkspaceUsage like in server.go?
599+
require.NoError(t, err)
600+
t.Cleanup(closeFunc)
584601

585-
if len(metrics) < 1 {
586-
return false
587-
}
602+
// then
603+
goldenFile, err := os.ReadFile(tc.goldenFile)
604+
require.NoError(t, err)
605+
golden := map[string]float64{}
606+
err = json.Unmarshal(goldenFile, &golden)
607+
require.NoError(t, err)
588608

589-
for _, metric := range metrics {
590-
switch metric.GetName() {
591-
case "coderd_prometheusmetrics_agentstats_execution_seconds":
592-
executionSeconds = true
593-
case "coderd_agentstats_connection_count",
594-
"coderd_agentstats_connection_median_latency_seconds",
595-
"coderd_agentstats_rx_bytes",
596-
"coderd_agentstats_tx_bytes",
597-
"coderd_agentstats_session_count_jetbrains",
598-
"coderd_agentstats_session_count_reconnecting_pty",
599-
"coderd_agentstats_session_count_ssh",
600-
"coderd_agentstats_session_count_vscode":
601-
for _, m := range metric.Metric {
602-
// username:workspace:agent:metric = value
603-
collected[m.Label[1].GetValue()+":"+m.Label[2].GetValue()+":"+m.Label[0].GetValue()+":"+metric.GetName()] = int(m.Gauge.GetValue())
609+
collected := map[string]float64{}
610+
// sampleCount := map[string]int{}
611+
var executionSeconds bool
612+
assert.Eventually(t, func() bool {
613+
metrics, err := registry.Gather()
614+
assert.NoError(t, err)
615+
616+
if len(metrics) < 1 {
617+
return false
604618
}
605-
default:
606-
require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName())
607-
}
608-
}
609-
return executionSeconds && reflect.DeepEqual(golden, collected)
610-
}, testutil.WaitShort, testutil.IntervalFast)
611619

612-
// Keep this assertion, so that "go test" can print differences instead of "Condition never satisfied"
613-
assert.EqualValues(t, golden, collected)
620+
for _, metric := range metrics {
621+
switch metric.GetName() {
622+
// This metric is not aggregated, but we need to validate we saw it, at least.
623+
case "coderd_prometheusmetrics_agentstats_execution_seconds":
624+
executionSeconds = true
625+
case "coderd_agentstats_connection_count",
626+
"coderd_agentstats_connection_median_latency_seconds",
627+
"coderd_agentstats_rx_bytes",
628+
"coderd_agentstats_tx_bytes",
629+
"coderd_agentstats_session_count_jetbrains",
630+
"coderd_agentstats_session_count_reconnecting_pty",
631+
"coderd_agentstats_session_count_ssh",
632+
"coderd_agentstats_session_count_vscode":
633+
for _, sample := range metric.Metric {
634+
key := tc.metricKeyFn(metric, sample)
635+
collected[key] = sample.Gauge.GetValue()
636+
// if _, ok := sampleCount[key]; !ok {
637+
// sampleCount[key] = 0
638+
// }
639+
// sampleCount[key]++
640+
}
641+
default:
642+
require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName())
643+
}
644+
}
645+
646+
return executionSeconds && reflect.DeepEqual(golden, collected)
647+
}, testutil.WaitShort, testutil.IntervalFast)
648+
649+
// Keep this assertion, so that "go test" can print differences instead of "Condition never satisfied"
650+
assert.EqualValues(t, golden, collected)
651+
})
652+
}
614653
}
615654

616655
func TestExperimentsMetric(t *testing.T) {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"testuser:coderd_agentstats_connection_count": 30,
3+
"testuser:coderd_agentstats_connection_median_latency_seconds": 10,
4+
"testuser:coderd_agentstats_rx_bytes": 45,
5+
"testuser:coderd_agentstats_session_count_jetbrains": 32,
6+
"testuser:coderd_agentstats_session_count_reconnecting_pty": 37,
7+
"testuser:coderd_agentstats_session_count_ssh": 42,
8+
"testuser:coderd_agentstats_session_count_vscode": 27,
9+
"testuser:coderd_agentstats_tx_bytes": 27
10+
}

0 commit comments

Comments
 (0)