Skip to content

feat: expose agent stats via Prometheus endpoint #7115

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 43 commits into from
Apr 14, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
8d4e67d
WIP
mtojek Apr 3, 2023
da729e6
Merge branch 'main' into 6724-metrics
mtojek Apr 4, 2023
9ad09b2
WIP
mtojek Apr 4, 2023
440657c
WIP
mtojek Apr 5, 2023
8764f89
Agents
mtojek Apr 5, 2023
663b5d5
fix
mtojek Apr 5, 2023
63aff5e
1min
mtojek Apr 5, 2023
3905481
fix
mtojek Apr 5, 2023
f8d6f46
WIP
mtojek Apr 5, 2023
d487a77
Test
mtojek Apr 5, 2023
7acbaf0
docs
mtojek Apr 5, 2023
7418779
fmt
mtojek Apr 5, 2023
3a8e4e6
Add timer to measure the metrics collection
mtojek Apr 6, 2023
b5d0581
Use CachedGaugeVec
mtojek Apr 6, 2023
e4d708b
Unit tests
mtojek Apr 6, 2023
199e549
WIP
mtojek Apr 7, 2023
7307bd3
Merge branch 'main' into 6724-metrics-2
mtojek Apr 12, 2023
d0b8398
WIP
mtojek Apr 13, 2023
f0c0418
db: GetWorkspaceAgentStatsAndLabels
mtojek Apr 13, 2023
970d35a
fmt
mtojek Apr 13, 2023
229f546
WIP
mtojek Apr 13, 2023
7070e0e
Merge branch 'main' into 6724-metrics-2
mtojek Apr 13, 2023
8c6f96b
gauges
mtojek Apr 13, 2023
1ed37b4
feat: collect
mtojek Apr 13, 2023
7ee1bfc
fix
mtojek Apr 13, 2023
2b8a9e4
fmt
mtojek Apr 13, 2023
322f7e8
minor fixes
mtojek Apr 14, 2023
c7af75a
Prometheus flag
mtojek Apr 14, 2023
9693fa8
fix
mtojek Apr 14, 2023
28f7a13
WIP
mtojek Apr 14, 2023
7878167
fix tests
mtojek Apr 14, 2023
d9e4903
WIP
mtojek Apr 14, 2023
0d37c85
fix json
mtojek Apr 14, 2023
f752c6f
Rx Tx bytes
mtojek Apr 14, 2023
9c7aef8
CloseFunc
mtojek Apr 14, 2023
5290571
fix
mtojek Apr 14, 2023
1cbe59b
fix
mtojek Apr 14, 2023
f8f11eb
Fixes
mtojek Apr 14, 2023
4ffae11
fix
mtojek Apr 14, 2023
7ba16b5
fix: IgnoreErrors
mtojek Apr 14, 2023
2a4c674
Fix: Windows
mtojek Apr 14, 2023
201da83
fix
mtojek Apr 14, 2023
ba52c45
reflect.DeepEquals
mtojek Apr 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
gauges
  • Loading branch information
mtojek committed Apr 13, 2023
commit 8c6f96bbe3b64b223fb89d86441bb49b8ec4dd45
86 changes: 85 additions & 1 deletion coderd/prometheusmetrics/prometheusmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,73 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
return nil, err
}

createdAfter := database.Now().Add(-duration)
agentStatsConnectionCountGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "agentstats",
Name: "connection_count",
Help: "The number of established connections by agent",
}, []string{"agent_name", "username", "workspace_name"}))
err = registerer.Register(agentStatsConnectionCountGauge)
if err != nil {
return nil, err
}

agentStatsConnectionMedianLatencyGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "agentstats",
Name: "connection_median_latency",
Help: "The median agent connection latency",
}, []string{"agent_name", "username", "workspace_name"}))
err = registerer.Register(agentStatsConnectionMedianLatencyGauge)
if err != nil {
return nil, err
}

agentStatsSessionCountJetBrainsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "agentstats",
Name: "session_count_jetbrains",
Help: "The number of session established by JetBrains",
}, []string{"agent_name", "username", "workspace_name"}))
err = registerer.Register(agentStatsSessionCountJetBrainsGauge)
if err != nil {
return nil, err
}

agentStatsSessionCountReconnectingPTYGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "agentstats",
Name: "session_count_reconnecting_pty",
Help: "The number of session established by reconnecting PTY",
}, []string{"agent_name", "username", "workspace_name"}))
err = registerer.Register(agentStatsSessionCountReconnectingPTYGauge)
if err != nil {
return nil, err
}

agentStatsSessionCountSSHGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "agentstats",
Name: "session_count_ssh",
Help: "The number of session established by SSH",
}, []string{"agent_name", "username", "workspace_name"}))
err = registerer.Register(agentStatsSessionCountSSHGauge)
if err != nil {
return nil, err
}

agentStatsSessionCountVSCodeGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "agentstats",
Name: "session_count_vscode",
Help: "The number of session established by VSCode",
}, []string{"agent_name", "username", "workspace_name"}))
err = registerer.Register(agentStatsSessionCountVSCodeGauge)
if err != nil {
return nil, err
}

createdAfter := time.Now()
ctx, cancelFunc := context.WithCancel(ctx)
ticker := time.NewTicker(duration)
go func() {
Expand All @@ -354,14 +420,32 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
for _, agentStat := range stats {
agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceTxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceRxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)

agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)

agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
}

agentStatsRxBytesGauge.Commit()
agentStatsTxBytesGauge.Commit()

agentStatsConnectionCountGauge.Commit()
agentStatsConnectionMedianLatencyGauge.Commit()

agentStatsSessionCountJetBrainsGauge.Commit()
agentStatsSessionCountReconnectingPTYGauge.Commit()
agentStatsSessionCountSSHGauge.Commit()
agentStatsSessionCountVSCodeGauge.Commit()

done:
logger.Debug(ctx, "Agent metrics collection is done")
metricsCollectorAgentStats.Observe(timer.ObserveDuration().Seconds())

createdAfter = time.Now()
}
}()
return cancelFunc, nil
Expand Down
10 changes: 9 additions & 1 deletion docs/admin/prometheus.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,17 @@ The environment variable `CODER_PROMETHEUS_ENABLE` will be enabled automatically
| Name | Type | Description | Labels |
| --------------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- |
| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` |
| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_id` `derp_region` `preferred` `username` `workspace_name` |
| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_name` `derp_region` `preferred` `username` `workspace_name` |
| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` |
| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` |
| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` |
| `coderd_agentstats_connection_median_latency` | gauge | The median agent connection latency in seconds | `agent_name` `username` `workspace_name` |
| `coderd_agentstats_rx_bytes` | gauge | Agent Rx bytes | `agent_name` `username` `workspace_name` |
| `coderd_agentstats_session_count_jetbrains` | gauge | The number of sessions established by JetBrains | `agent_name` `username` `workspace_name` |
| `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of sessions established by reconnecting PTY | `agent_name` `username` `workspace_name` |
| `coderd_agentstats_session_count_ssh` | gauge | The number of sessions established by SSH | `agent_name` `username` `workspace_name` |
| `coderd_agentstats_session_count_vscode` | gauge | The number of sessions established by VSCode | `agent_name` `username` `workspace_name` |
| `coderd_agentstats_tx_bytes` | gauge | Agent Tx bytes | `agent_name` `username` `workspace_name` |
| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | |
| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | |
| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | |
Expand Down
30 changes: 27 additions & 3 deletions scripts/metricsdocgen/metrics
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",use
coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-3"} 1
# HELP coderd_agents_connection_latencies_seconds Agent connection latencies in seconds.
# TYPE coderd_agents_connection_latencies_seconds gauge
coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-1"} 0.03018125
coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-2"} 0.028658416
coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-3"} 0.028041416
coderd_agents_connection_latencies_seconds{agent_name="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-1"} 0.03018125
coderd_agents_connection_latencies_seconds{agent_name="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-2"} 0.028658416
coderd_agents_connection_latencies_seconds{agent_name="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-3"} 0.028041416
# HELP coderd_agents_connections Agent connections with statuses.
# TYPE coderd_agents_connections gauge
coderd_agents_connections{agent_name="main",lifecycle_state="ready",status="connected",tailnet_node="nodeid:16966f7df70d8cc5",username="admin",workspace_name="workspace-3"} 1
Expand All @@ -18,6 +18,30 @@ coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",stat
coderd_agents_up{username="admin",workspace_name="workspace-1"} 1
coderd_agents_up{username="admin",workspace_name="workspace-2"} 1
coderd_agents_up{username="admin",workspace_name="workspace-3"} 1
# HELP coderd_agentstats_connection_count The number of established connections by agent
# TYPE coderd_agentstats_connection_count gauge
coderd_agentstats_connection_count{agent_name="main",username="admin",workspace_name="workspace1"} 2
# HELP coderd_agentstats_connection_median_latency The median agent connection latency
# TYPE coderd_agentstats_connection_median_latency gauge
coderd_agentstats_connection_median_latency{agent_name="main",username="admin",workspace_name="workspace1"} 0.001784
# HELP coderd_agentstats_rx_bytes Agent Rx bytes
# TYPE coderd_agentstats_rx_bytes gauge
coderd_agentstats_rx_bytes{agent_name="main",username="admin",workspace_name="workspace1"} 7731
# HELP coderd_agentstats_session_count_jetbrains The number of session established by JetBrains
# TYPE coderd_agentstats_session_count_jetbrains gauge
coderd_agentstats_session_count_jetbrains{agent_name="main",username="admin",workspace_name="workspace1"} 0
# HELP coderd_agentstats_session_count_reconnecting_pty The number of session established by reconnecting PTY
# TYPE coderd_agentstats_session_count_reconnecting_pty gauge
coderd_agentstats_session_count_reconnecting_pty{agent_name="main",username="admin",workspace_name="workspace1"} 1
# HELP coderd_agentstats_session_count_ssh The number of session established by SSH
# TYPE coderd_agentstats_session_count_ssh gauge
coderd_agentstats_session_count_ssh{agent_name="main",username="admin",workspace_name="workspace1"} 0
# HELP coderd_agentstats_session_count_vscode The number of session established by VSCode
# TYPE coderd_agentstats_session_count_vscode gauge
coderd_agentstats_session_count_vscode{agent_name="main",username="admin",workspace_name="workspace1"} 0
# HELP coderd_agentstats_tx_bytes Agent Tx bytes
# TYPE coderd_agentstats_tx_bytes gauge
coderd_agentstats_tx_bytes{agent_name="main",username="admin",workspace_name="workspace1"} 6643
# HELP coderd_api_websocket_durations_seconds Websocket duration distribution of requests in seconds.
# TYPE coderd_api_websocket_durations_seconds histogram
coderd_api_websocket_durations_seconds_bucket{path="/api/v2/workspaceagents/me/coordinate",le="0.001"} 0
Expand Down