Skip to content

Commit 5c154ea

Browse files
committed
feat: expose current agent connections by type via prometheus
1 parent 9da6467 commit 5c154ea

File tree

5 files changed

+82
-43
lines changed

5 files changed

+82
-43
lines changed

agent/agent.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1510,6 +1510,8 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect
15101510
var mu sync.Mutex
15111511
status := a.network.Status()
15121512
durations := []float64{}
1513+
p2pConns := 0
1514+
derpConns := 0
15131515
pingCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second)
15141516
defer cancelFunc()
15151517
for nodeID, peer := range status.Peer {
@@ -1526,13 +1528,18 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect
15261528
wg.Add(1)
15271529
go func() {
15281530
defer wg.Done()
1529-
duration, _, _, err := a.network.Ping(pingCtx, addresses[0].Addr())
1531+
duration, p2p, _, err := a.network.Ping(pingCtx, addresses[0].Addr())
15301532
if err != nil {
15311533
return
15321534
}
15331535
mu.Lock()
15341536
defer mu.Unlock()
15351537
durations = append(durations, float64(duration.Microseconds()))
1538+
if p2p {
1539+
p2pConns++
1540+
} else {
1541+
derpConns++
1542+
}
15361543
}()
15371544
}
15381545
wg.Wait()
@@ -1552,6 +1559,9 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect
15521559
// Agent metrics are changing all the time, so there is no need to perform
15531560
// reflect.DeepEqual to see if stats should be transferred.
15541561

1562+
// currentConnections behaves like a hypothetical `GaugeFuncVec` and is only set at collection time.
1563+
a.metrics.currentConnections.WithLabelValues("p2p").Set(float64(p2pConns))
1564+
a.metrics.currentConnections.WithLabelValues("derp").Set(float64(derpConns))
15551565
metricsCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second)
15561566
defer cancelFunc()
15571567
a.logger.Debug(ctx, "collecting agent metrics for stats")

agent/agent_test.go

Lines changed: 52 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -2531,17 +2531,17 @@ func TestAgent_Metrics_SSH(t *testing.T) {
25312531
err = session.Shell()
25322532
require.NoError(t, err)
25332533

2534-
expected := []agentsdk.AgentMetric{
2534+
expected := []*proto.Stats_Metric{
25352535
{
25362536
Name: "agent_reconnecting_pty_connections_total",
2537-
Type: agentsdk.AgentMetricTypeCounter,
2537+
Type: proto.Stats_Metric_COUNTER,
25382538
Value: 0,
25392539
},
25402540
{
25412541
Name: "agent_sessions_total",
2542-
Type: agentsdk.AgentMetricTypeCounter,
2542+
Type: proto.Stats_Metric_COUNTER,
25432543
Value: 1,
2544-
Labels: []agentsdk.AgentMetricLabel{
2544+
Labels: []*proto.Stats_Metric_Label{
25452545
{
25462546
Name: "magic_type",
25472547
Value: "ssh",
@@ -2554,30 +2554,46 @@ func TestAgent_Metrics_SSH(t *testing.T) {
25542554
},
25552555
{
25562556
Name: "agent_ssh_server_failed_connections_total",
2557-
Type: agentsdk.AgentMetricTypeCounter,
2557+
Type: proto.Stats_Metric_COUNTER,
25582558
Value: 0,
25592559
},
25602560
{
25612561
Name: "agent_ssh_server_sftp_connections_total",
2562-
Type: agentsdk.AgentMetricTypeCounter,
2562+
Type: proto.Stats_Metric_COUNTER,
25632563
Value: 0,
25642564
},
25652565
{
25662566
Name: "agent_ssh_server_sftp_server_errors_total",
2567-
Type: agentsdk.AgentMetricTypeCounter,
2567+
Type: proto.Stats_Metric_COUNTER,
25682568
Value: 0,
25692569
},
25702570
{
2571-
Name: "coderd_agentstats_startup_script_seconds",
2572-
Type: agentsdk.AgentMetricTypeGauge,
2571+
Name: "coderd_agentstats_currently_reachable_peers",
2572+
Type: proto.Stats_Metric_GAUGE,
25732573
Value: 0,
2574-
Labels: []agentsdk.AgentMetricLabel{
2574+
Labels: []*proto.Stats_Metric_Label{
2575+
{
2576+
Name: "connection_type",
2577+
Value: "derp",
2578+
},
2579+
},
2580+
},
2581+
{
2582+
Name: "coderd_agentstats_currently_reachable_peers",
2583+
Type: proto.Stats_Metric_GAUGE,
2584+
Value: 1,
2585+
Labels: []*proto.Stats_Metric_Label{
25752586
{
2576-
Name: "success",
2577-
Value: "true",
2587+
Name: "connection_type",
2588+
Value: "p2p",
25782589
},
25792590
},
25802591
},
2592+
{
2593+
Name: "coderd_agentstats_startup_script_seconds",
2594+
Type: proto.Stats_Metric_GAUGE,
2595+
Value: 1,
2596+
},
25812597
}
25822598

25832599
var actual []*promgo.MetricFamily
@@ -2586,17 +2602,33 @@ func TestAgent_Metrics_SSH(t *testing.T) {
25862602
if err != nil {
25872603
return false
25882604
}
2589-
2590-
if len(expected) != len(actual) {
2591-
return false
2605+
count := 0
2606+
for _, m := range actual {
2607+
count += len(m.GetMetric())
25922608
}
2593-
2594-
return verifyCollectedMetrics(t, expected, actual)
2609+
return count == len(expected)
25952610
}, testutil.WaitLong, testutil.IntervalFast)
25962611

2597-
require.Len(t, actual, len(expected))
2598-
collected := verifyCollectedMetrics(t, expected, actual)
2599-
require.True(t, collected, "expected metrics were not collected")
2612+
i := 0
2613+
for _, mf := range actual {
2614+
for _, m := range mf.GetMetric() {
2615+
assert.Equal(t, expected[i].Name, mf.GetName())
2616+
assert.Equal(t, expected[i].Type.String(), mf.GetType().String())
2617+
// The expected value is an upper bound; the observed value may be lower.
2618+
if expected[i].Type == proto.Stats_Metric_GAUGE {
2619+
assert.GreaterOrEqualf(t, expected[i].Value, m.GetGauge().GetValue(), "expected %s to be greater than or equal to %f, got %f", expected[i].Name, expected[i].Value, m.GetGauge().GetValue())
2620+
} else if expected[i].Type == proto.Stats_Metric_COUNTER {
2621+
assert.GreaterOrEqualf(t, expected[i].Value, m.GetCounter().GetValue(), "expected %s to be greater than or equal to %f, got %f", expected[i].Name, expected[i].Value, m.GetCounter().GetValue())
2622+
}
2623+
for j, lbl := range expected[i].Labels {
2624+
assert.Equal(t, m.GetLabel()[j], &promgo.LabelPair{
2625+
Name: &lbl.Name,
2626+
Value: &lbl.Value,
2627+
})
2628+
}
2629+
i++
2630+
}
2631+
}
26002632

26012633
_ = stdin.Close()
26022634
err = session.Wait()
@@ -2828,28 +2860,6 @@ func TestAgent_ManageProcessPriority(t *testing.T) {
28282860
})
28292861
}
28302862

2831-
func verifyCollectedMetrics(t *testing.T, expected []agentsdk.AgentMetric, actual []*promgo.MetricFamily) bool {
2832-
t.Helper()
2833-
2834-
for i, e := range expected {
2835-
assert.Equal(t, e.Name, actual[i].GetName())
2836-
assert.Equal(t, string(e.Type), strings.ToLower(actual[i].GetType().String()))
2837-
2838-
for _, m := range actual[i].GetMetric() {
2839-
assert.Equal(t, e.Value, m.Counter.GetValue())
2840-
2841-
if len(m.GetLabel()) > 0 {
2842-
for j, lbl := range m.GetLabel() {
2843-
assert.Equal(t, e.Labels[j].Name, lbl.GetName())
2844-
assert.Equal(t, e.Labels[j].Value, lbl.GetValue())
2845-
}
2846-
}
2847-
m.GetLabel()
2848-
}
2849-
}
2850-
return true
2851-
}
2852-
28532863
type syncWriter struct {
28542864
mu sync.Mutex
28552865
w io.Writer

agent/metrics.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ type agentMetrics struct {
1919
// startupScriptSeconds is the time in seconds that the start script(s)
2020
// took to run. This is reported once per agent.
2121
startupScriptSeconds *prometheus.GaugeVec
22+
currentConnections *prometheus.GaugeVec
2223
}
2324

2425
func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics {
@@ -45,10 +46,19 @@ func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics {
4546
}, []string{"success"})
4647
registerer.MustRegister(startupScriptSeconds)
4748

49+
currentConnections := prometheus.NewGaugeVec(prometheus.GaugeOpts{
50+
Namespace: "coderd",
51+
Subsystem: "agentstats",
52+
Name: "currently_reachable_peers",
53+
Help: "The number of peers (e.g. clients) that are currently reachable over the encrypted network.",
54+
}, []string{"connection_type"})
55+
registerer.MustRegister(currentConnections)
56+
4857
return &agentMetrics{
4958
connectionsTotal: connectionsTotal,
5059
reconnectingPTYErrors: reconnectingPTYErrors,
5160
startupScriptSeconds: startupScriptSeconds,
61+
currentConnections: currentConnections,
5262
}
5363
}
5464

docs/admin/prometheus.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,11 @@ spec:
9696

9797
## Available metrics
9898

99+
`coderd_agentstats_*` metrics must first be enabled with the flag
100+
`--prometheus-collect-agent-stats`, or the environment variable
101+
`CODER_PROMETHEUS_COLLECT_AGENT_STATS`, before they can be retrieved from the
102+
deployment. They will always be available from the agent.
103+
99104
<!-- Code generated by 'make docs/admin/prometheus.md'. DO NOT EDIT -->
100105

101106
| Name | Type | Description | Labels |
@@ -107,6 +112,7 @@ spec:
107112
| `coderd_agents_up` | gauge | The number of active agents per workspace. | `template_name` `username` `workspace_name` |
108113
| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` |
109114
| `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` |
115+
| `coderd_agentstats_currently_reachable_peers` | gauge | The number of peers (e.g. clients) that are currently reachable over the encrypted network. | `agent_name` `connection_type` `template_name` `username` `workspace_name` |
110116
| `coderd_agentstats_rx_bytes` | gauge | Agent Rx bytes | `agent_name` `username` `workspace_name` |
111117
| `coderd_agentstats_session_count_jetbrains` | gauge | The number of session established by JetBrains | `agent_name` `username` `workspace_name` |
112118
| `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of session established by reconnecting PTY | `agent_name` `username` `workspace_name` |

scripts/metricsdocgen/metrics

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ coderd_agentstats_connection_count{agent_name="main",username="admin",workspace_
6363
# HELP coderd_agentstats_connection_median_latency_seconds The median agent connection latency
6464
# TYPE coderd_agentstats_connection_median_latency_seconds gauge
6565
coderd_agentstats_connection_median_latency_seconds{agent_name="main",username="admin",workspace_name="workspace1"} 0.001784
66+
# HELP coderd_agentstats_currently_reachable_peers The number of peers (e.g. clients) that are currently reachable over the encrypted network.
67+
# TYPE coderd_agentstats_currently_reachable_peers gauge
68+
coderd_agentstats_currently_reachable_peers{agent_name="main",connection_type="derp",template_name="docker",username="admin",workspace_name="workspace1"} 0
6669
# HELP coderd_agentstats_rx_bytes Agent Rx bytes
6770
# TYPE coderd_agentstats_rx_bytes gauge
6871
coderd_agentstats_rx_bytes{agent_name="main",username="admin",workspace_name="workspace1"} 7731

0 commit comments

Comments
 (0)