Skip to content

feat: expose current agent connections by type via prometheus #14612

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -1510,6 +1510,8 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect
var mu sync.Mutex
status := a.network.Status()
durations := []float64{}
p2pConns := 0
derpConns := 0
pingCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second)
defer cancelFunc()
for nodeID, peer := range status.Peer {
Expand All @@ -1526,13 +1528,18 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect
wg.Add(1)
go func() {
defer wg.Done()
duration, _, _, err := a.network.Ping(pingCtx, addresses[0].Addr())
duration, p2p, _, err := a.network.Ping(pingCtx, addresses[0].Addr())
if err != nil {
return
}
mu.Lock()
defer mu.Unlock()
durations = append(durations, float64(duration.Microseconds()))
if p2p {
p2pConns++
} else {
derpConns++
}
}()
}
wg.Wait()
Expand All @@ -1552,6 +1559,9 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect
// Agent metrics are changing all the time, so there is no need to perform
// reflect.DeepEqual to see if stats should be transferred.

// currentConnections behaves like a hypothetical `GaugeFuncVec` and is only set at collection time.
a.metrics.currentConnections.WithLabelValues("p2p").Set(float64(p2pConns))
a.metrics.currentConnections.WithLabelValues("derp").Set(float64(derpConns))
metricsCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second)
defer cancelFunc()
a.logger.Debug(ctx, "collecting agent metrics for stats")
Expand Down
94 changes: 52 additions & 42 deletions agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2531,17 +2531,17 @@ func TestAgent_Metrics_SSH(t *testing.T) {
err = session.Shell()
require.NoError(t, err)

expected := []agentsdk.AgentMetric{
expected := []*proto.Stats_Metric{
{
Name: "agent_reconnecting_pty_connections_total",
Type: agentsdk.AgentMetricTypeCounter,
Type: proto.Stats_Metric_COUNTER,
Value: 0,
},
{
Name: "agent_sessions_total",
Type: agentsdk.AgentMetricTypeCounter,
Type: proto.Stats_Metric_COUNTER,
Value: 1,
Labels: []agentsdk.AgentMetricLabel{
Labels: []*proto.Stats_Metric_Label{
{
Name: "magic_type",
Value: "ssh",
Expand All @@ -2554,30 +2554,46 @@ func TestAgent_Metrics_SSH(t *testing.T) {
},
{
Name: "agent_ssh_server_failed_connections_total",
Type: agentsdk.AgentMetricTypeCounter,
Type: proto.Stats_Metric_COUNTER,
Value: 0,
},
{
Name: "agent_ssh_server_sftp_connections_total",
Type: agentsdk.AgentMetricTypeCounter,
Type: proto.Stats_Metric_COUNTER,
Value: 0,
},
{
Name: "agent_ssh_server_sftp_server_errors_total",
Type: agentsdk.AgentMetricTypeCounter,
Type: proto.Stats_Metric_COUNTER,
Value: 0,
},
{
Name: "coderd_agentstats_startup_script_seconds",
Type: agentsdk.AgentMetricTypeGauge,
Name: "coderd_agentstats_currently_reachable_peers",
Type: proto.Stats_Metric_GAUGE,
Value: 0,
Labels: []agentsdk.AgentMetricLabel{
Labels: []*proto.Stats_Metric_Label{
{
Name: "connection_type",
Value: "derp",
},
},
},
{
Name: "coderd_agentstats_currently_reachable_peers",
Type: proto.Stats_Metric_GAUGE,
Value: 1,
Labels: []*proto.Stats_Metric_Label{
{
Name: "success",
Value: "true",
Name: "connection_type",
Value: "p2p",
},
},
},
{
Name: "coderd_agentstats_startup_script_seconds",
Type: proto.Stats_Metric_GAUGE,
Value: 1,
},
}

var actual []*promgo.MetricFamily
Expand All @@ -2586,17 +2602,33 @@ func TestAgent_Metrics_SSH(t *testing.T) {
if err != nil {
return false
}

if len(expected) != len(actual) {
return false
count := 0
for _, m := range actual {
count += len(m.GetMetric())
}

return verifyCollectedMetrics(t, expected, actual)
return count == len(expected)
}, testutil.WaitLong, testutil.IntervalFast)

require.Len(t, actual, len(expected))
collected := verifyCollectedMetrics(t, expected, actual)
require.True(t, collected, "expected metrics were not collected")
i := 0
for _, mf := range actual {
for _, m := range mf.GetMetric() {
assert.Equal(t, expected[i].Name, mf.GetName())
assert.Equal(t, expected[i].Type.String(), mf.GetType().String())
// Value is max expected
if expected[i].Type == proto.Stats_Metric_GAUGE {
assert.GreaterOrEqualf(t, expected[i].Value, m.GetGauge().GetValue(), "expected %s to be greater than or equal to %f, got %f", expected[i].Name, expected[i].Value, m.GetGauge().GetValue())
} else if expected[i].Type == proto.Stats_Metric_COUNTER {
assert.GreaterOrEqualf(t, expected[i].Value, m.GetCounter().GetValue(), "expected %s to be greater than or equal to %f, got %f", expected[i].Name, expected[i].Value, m.GetCounter().GetValue())
}
for j, lbl := range expected[i].Labels {
assert.Equal(t, m.GetLabel()[j], &promgo.LabelPair{
Name: &lbl.Name,
Value: &lbl.Value,
})
}
i++
}
}

_ = stdin.Close()
err = session.Wait()
Expand Down Expand Up @@ -2828,28 +2860,6 @@ func TestAgent_ManageProcessPriority(t *testing.T) {
})
}

func verifyCollectedMetrics(t *testing.T, expected []agentsdk.AgentMetric, actual []*promgo.MetricFamily) bool {
t.Helper()

for i, e := range expected {
assert.Equal(t, e.Name, actual[i].GetName())
assert.Equal(t, string(e.Type), strings.ToLower(actual[i].GetType().String()))

for _, m := range actual[i].GetMetric() {
assert.Equal(t, e.Value, m.Counter.GetValue())

if len(m.GetLabel()) > 0 {
for j, lbl := range m.GetLabel() {
assert.Equal(t, e.Labels[j].Name, lbl.GetName())
assert.Equal(t, e.Labels[j].Value, lbl.GetValue())
}
}
m.GetLabel()
}
}
return true
}

type syncWriter struct {
mu sync.Mutex
w io.Writer
Expand Down
10 changes: 10 additions & 0 deletions agent/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ type agentMetrics struct {
// startupScriptSeconds is the time in seconds that the start script(s)
// took to run. This is reported once per agent.
startupScriptSeconds *prometheus.GaugeVec
currentConnections *prometheus.GaugeVec
}

func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics {
Expand All @@ -45,10 +46,19 @@ func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics {
}, []string{"success"})
registerer.MustRegister(startupScriptSeconds)

currentConnections := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "agentstats",
Name: "currently_reachable_peers",
Help: "The number of peers (e.g. clients) that are currently reachable over the encrypted network.",
}, []string{"connection_type"})
registerer.MustRegister(currentConnections)

return &agentMetrics{
connectionsTotal: connectionsTotal,
reconnectingPTYErrors: reconnectingPTYErrors,
startupScriptSeconds: startupScriptSeconds,
currentConnections: currentConnections,
}
}

Expand Down
6 changes: 6 additions & 0 deletions docs/admin/prometheus.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,11 @@ spec:

## Available metrics

`coderd_agentstats_*` metrics must first be enabled with the flag
`--prometheus-collect-agent-stats`, or the environment variable
`CODER_PROMETHEUS_COLLECT_AGENT_STATS` before they can be retrieved from the
deployment. They will always be available from the agent.

<!-- Code generated by 'make docs/admin/prometheus.md'. DO NOT EDIT -->

| Name | Type | Description | Labels |
Expand All @@ -107,6 +112,7 @@ spec:
| `coderd_agents_up` | gauge | The number of active agents per workspace. | `template_name` `username` `workspace_name` |
| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` |
| `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` |
| `coderd_agentstats_currently_reachable_peers` | gauge | The number of peers (e.g. clients) that are currently reachable over the encrypted network. | `agent_name` `connection_type` `template_name` `username` `workspace_name` |
| `coderd_agentstats_rx_bytes` | gauge | Agent Rx bytes | `agent_name` `username` `workspace_name` |
| `coderd_agentstats_session_count_jetbrains` | gauge | The number of session established by JetBrains | `agent_name` `username` `workspace_name` |
| `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of session established by reconnecting PTY | `agent_name` `username` `workspace_name` |
Expand Down
3 changes: 3 additions & 0 deletions scripts/metricsdocgen/metrics
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ coderd_agentstats_connection_count{agent_name="main",username="admin",workspace_
# HELP coderd_agentstats_connection_median_latency_seconds The median agent connection latency
# TYPE coderd_agentstats_connection_median_latency_seconds gauge
coderd_agentstats_connection_median_latency_seconds{agent_name="main",username="admin",workspace_name="workspace1"} 0.001784
# HELP coderd_agentstats_currently_reachable_peers The number of peers (e.g. clients) that are currently reachable over the encrypted network.
# TYPE coderd_agentstats_currently_reachable_peers gauge
coderd_agentstats_currently_reachable_peers{agent_name="main",connection_type="derp",template_name="docker",username="admin",workspace_name="workspace1"} 0
# HELP coderd_agentstats_rx_bytes Agent Rx bytes
# TYPE coderd_agentstats_rx_bytes gauge
coderd_agentstats_rx_bytes{agent_name="main",username="admin",workspace_name="workspace1"} 7731
Expand Down
Loading