From 5c154ea90e69ae1446a08917a4f51d73a0055293 Mon Sep 17 00:00:00 2001 From: Ethan Dickson Date: Mon, 9 Sep 2024 11:29:10 +0000 Subject: [PATCH] feat: expose current agent connections by type via prometheus --- agent/agent.go | 12 ++++- agent/agent_test.go | 94 +++++++++++++++++++---------------- agent/metrics.go | 10 ++++ docs/admin/prometheus.md | 6 +++ scripts/metricsdocgen/metrics | 3 ++ 5 files changed, 82 insertions(+), 43 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index f0e357479bc47..dbd355669b95c 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1510,6 +1510,8 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect var mu sync.Mutex status := a.network.Status() durations := []float64{} + p2pConns := 0 + derpConns := 0 pingCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second) defer cancelFunc() for nodeID, peer := range status.Peer { @@ -1526,13 +1528,18 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect wg.Add(1) go func() { defer wg.Done() - duration, _, _, err := a.network.Ping(pingCtx, addresses[0].Addr()) + duration, p2p, _, err := a.network.Ping(pingCtx, addresses[0].Addr()) if err != nil { return } mu.Lock() defer mu.Unlock() durations = append(durations, float64(duration.Microseconds())) + if p2p { + p2pConns++ + } else { + derpConns++ + } }() } wg.Wait() @@ -1552,6 +1559,9 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect // Agent metrics are changing all the time, so there is no need to perform // reflect.DeepEqual to see if stats should be transferred. + // currentConnections behaves like a hypothetical `GaugeFuncVec` and is only set at collection time. 
+ a.metrics.currentConnections.WithLabelValues("p2p").Set(float64(p2pConns)) + a.metrics.currentConnections.WithLabelValues("derp").Set(float64(derpConns)) metricsCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second) defer cancelFunc() a.logger.Debug(ctx, "collecting agent metrics for stats") diff --git a/agent/agent_test.go b/agent/agent_test.go index 4b0712bcf93c6..e4aac04e0eedd 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -2531,17 +2531,17 @@ func TestAgent_Metrics_SSH(t *testing.T) { err = session.Shell() require.NoError(t, err) - expected := []agentsdk.AgentMetric{ + expected := []*proto.Stats_Metric{ { Name: "agent_reconnecting_pty_connections_total", - Type: agentsdk.AgentMetricTypeCounter, + Type: proto.Stats_Metric_COUNTER, Value: 0, }, { Name: "agent_sessions_total", - Type: agentsdk.AgentMetricTypeCounter, + Type: proto.Stats_Metric_COUNTER, Value: 1, - Labels: []agentsdk.AgentMetricLabel{ + Labels: []*proto.Stats_Metric_Label{ { Name: "magic_type", Value: "ssh", @@ -2554,30 +2554,46 @@ func TestAgent_Metrics_SSH(t *testing.T) { }, { Name: "agent_ssh_server_failed_connections_total", - Type: agentsdk.AgentMetricTypeCounter, + Type: proto.Stats_Metric_COUNTER, Value: 0, }, { Name: "agent_ssh_server_sftp_connections_total", - Type: agentsdk.AgentMetricTypeCounter, + Type: proto.Stats_Metric_COUNTER, Value: 0, }, { Name: "agent_ssh_server_sftp_server_errors_total", - Type: agentsdk.AgentMetricTypeCounter, + Type: proto.Stats_Metric_COUNTER, Value: 0, }, { - Name: "coderd_agentstats_startup_script_seconds", - Type: agentsdk.AgentMetricTypeGauge, + Name: "coderd_agentstats_currently_reachable_peers", + Type: proto.Stats_Metric_GAUGE, Value: 0, - Labels: []agentsdk.AgentMetricLabel{ + Labels: []*proto.Stats_Metric_Label{ + { + Name: "connection_type", + Value: "derp", + }, + }, + }, + { + Name: "coderd_agentstats_currently_reachable_peers", + Type: proto.Stats_Metric_GAUGE, + Value: 1, + Labels: []*proto.Stats_Metric_Label{ { - Name: 
"success", - Value: "true", + Name: "connection_type", + Value: "p2p", }, }, }, + { + Name: "coderd_agentstats_startup_script_seconds", + Type: proto.Stats_Metric_GAUGE, + Value: 1, + }, } var actual []*promgo.MetricFamily @@ -2586,17 +2602,33 @@ func TestAgent_Metrics_SSH(t *testing.T) { if err != nil { return false } - - if len(expected) != len(actual) { - return false + count := 0 + for _, m := range actual { + count += len(m.GetMetric()) } - - return verifyCollectedMetrics(t, expected, actual) + return count == len(expected) }, testutil.WaitLong, testutil.IntervalFast) - require.Len(t, actual, len(expected)) - collected := verifyCollectedMetrics(t, expected, actual) - require.True(t, collected, "expected metrics were not collected") + i := 0 + for _, mf := range actual { + for _, m := range mf.GetMetric() { + assert.Equal(t, expected[i].Name, mf.GetName()) + assert.Equal(t, expected[i].Type.String(), mf.GetType().String()) + // Value is max expected + if expected[i].Type == proto.Stats_Metric_GAUGE { + assert.GreaterOrEqualf(t, expected[i].Value, m.GetGauge().GetValue(), "expected %s to be greater than or equal to %f, got %f", expected[i].Name, expected[i].Value, m.GetGauge().GetValue()) + } else if expected[i].Type == proto.Stats_Metric_COUNTER { + assert.GreaterOrEqualf(t, expected[i].Value, m.GetCounter().GetValue(), "expected %s to be greater than or equal to %f, got %f", expected[i].Name, expected[i].Value, m.GetCounter().GetValue()) + } + for j, lbl := range expected[i].Labels { + assert.Equal(t, m.GetLabel()[j], &promgo.LabelPair{ + Name: &lbl.Name, + Value: &lbl.Value, + }) + } + i++ + } + } _ = stdin.Close() err = session.Wait() @@ -2828,28 +2860,6 @@ func TestAgent_ManageProcessPriority(t *testing.T) { }) } -func verifyCollectedMetrics(t *testing.T, expected []agentsdk.AgentMetric, actual []*promgo.MetricFamily) bool { - t.Helper() - - for i, e := range expected { - assert.Equal(t, e.Name, actual[i].GetName()) - assert.Equal(t, string(e.Type), 
strings.ToLower(actual[i].GetType().String())) - - for _, m := range actual[i].GetMetric() { - assert.Equal(t, e.Value, m.Counter.GetValue()) - - if len(m.GetLabel()) > 0 { - for j, lbl := range m.GetLabel() { - assert.Equal(t, e.Labels[j].Name, lbl.GetName()) - assert.Equal(t, e.Labels[j].Value, lbl.GetValue()) - } - } - m.GetLabel() - } - } - return true -} - type syncWriter struct { mu sync.Mutex w io.Writer diff --git a/agent/metrics.go b/agent/metrics.go index 5a60740c4c969..6c89827d2c2ee 100644 --- a/agent/metrics.go +++ b/agent/metrics.go @@ -19,6 +19,7 @@ type agentMetrics struct { // startupScriptSeconds is the time in seconds that the start script(s) // took to run. This is reported once per agent. startupScriptSeconds *prometheus.GaugeVec + currentConnections *prometheus.GaugeVec } func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics { @@ -45,10 +46,19 @@ func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics { }, []string{"success"}) registerer.MustRegister(startupScriptSeconds) + currentConnections := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agentstats", + Name: "currently_reachable_peers", + Help: "The number of peers (e.g. 
clients) that are currently reachable over the encrypted network.", + }, []string{"connection_type"}) + registerer.MustRegister(currentConnections) + return &agentMetrics{ connectionsTotal: connectionsTotal, reconnectingPTYErrors: reconnectingPTYErrors, startupScriptSeconds: startupScriptSeconds, + currentConnections: currentConnections, } } diff --git a/docs/admin/prometheus.md b/docs/admin/prometheus.md index 99d36b5b15e31..005770b5c10e9 100644 --- a/docs/admin/prometheus.md +++ b/docs/admin/prometheus.md @@ -96,6 +96,11 @@ spec: ## Available metrics +`coderd_agentstats_*` metrics must first be enabled with the flag +`--prometheus-collect-agent-stats`, or the environment variable +`CODER_PROMETHEUS_COLLECT_AGENT_STATS`, before they can be retrieved from the +deployment. They will always be available from the agent. + | Name | Type | Description | Labels | @@ -107,6 +112,7 @@ spec: | `coderd_agents_up` | gauge | The number of active agents per workspace. | `template_name` `username` `workspace_name` | | `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` | | `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_currently_reachable_peers` | gauge | The number of peers (e.g. 
| `agent_name` `connection_type` `template_name` `username` `workspace_name` | | `coderd_agentstats_rx_bytes` | gauge | Agent Rx bytes | `agent_name` `username` `workspace_name` | | `coderd_agentstats_session_count_jetbrains` | gauge | The number of session established by JetBrains | `agent_name` `username` `workspace_name` | | `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of session established by reconnecting PTY | `agent_name` `username` `workspace_name` | diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index fda7cebd1938f..414605de1cd70 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -63,6 +63,9 @@ coderd_agentstats_connection_count{agent_name="main",username="admin",workspace_ # HELP coderd_agentstats_connection_median_latency_seconds The median agent connection latency # TYPE coderd_agentstats_connection_median_latency_seconds gauge coderd_agentstats_connection_median_latency_seconds{agent_name="main",username="admin",workspace_name="workspace1"} 0.001784 +# HELP coderd_agentstats_currently_reachable_peers The number of peers (e.g. clients) that are currently reachable over the encrypted network. +# TYPE coderd_agentstats_currently_reachable_peers gauge +coderd_agentstats_currently_reachable_peers{agent_name="main",connection_type="derp",template_name="docker",username="admin",workspace_name="workspace1"} 0 # HELP coderd_agentstats_rx_bytes Agent Rx bytes # TYPE coderd_agentstats_rx_bytes gauge coderd_agentstats_rx_bytes{agent_name="main",username="admin",workspace_name="workspace1"} 7731