Skip to content

Commit 942aba3

Browse files
authored
feat: expose agent stats via Prometheus endpoint (#7115)
* WIP * WIP * WIP * Agents * fix * 1min * fix * WIP * Test * docs * fmt * Add timer to measure the metrics collection * Use CachedGaugeVec * Unit tests * WIP * WIP * db: GetWorkspaceAgentStatsAndLabels * fmt * WIP * gauges * feat: collect * fix * fmt * minor fixes * Prometheus flag * fix * WIP * fix tests * WIP * fix json * Rx Tx bytes * CloseFunc * fix * fix * Fixes * fix * fix: IgnoreErrors * Fix: Windows * fix * reflect.DeepEquals
1 parent e068945 commit 942aba3

20 files changed

+732
-79
lines changed

cli/server.go

+8
Original file line numberDiff line numberDiff line change
@@ -704,6 +704,14 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
704704
}
705705
defer closeWorkspacesFunc()
706706

707+
if cfg.Prometheus.CollectAgentStats {
708+
closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, time.Now(), 0)
709+
if err != nil {
710+
return xerrors.Errorf("register agent stats prometheus metric: %w", err)
711+
}
712+
defer closeAgentStatsFunc()
713+
}
714+
707715
//nolint:revive
708716
defer ServeHandler(ctx, logger, promhttp.InstrumentMetricHandler(
709717
options.PrometheusRegistry, promhttp.HandlerFor(options.PrometheusRegistry, promhttp.HandlerOpts{}),

cli/testdata/coder_server_--help.golden

+3
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,9 @@ Use a YAML configuration file when your server launch become unwieldy.
9090
--prometheus-address host:port, $CODER_PROMETHEUS_ADDRESS (default: 127.0.0.1:2112)
9191
The bind address to serve prometheus metrics.
9292

93+
--prometheus-collect-agent-stats bool, $CODER_PROMETHEUS_COLLECT_AGENT_STATS
94+
Collect agent stats (may increase charges for metrics storage).
95+
9396
--prometheus-enable bool, $CODER_PROMETHEUS_ENABLE
9497
Serve prometheus metrics on the address defined by prometheus address.
9598

cli/testdata/server-config.yaml.golden

+3
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,9 @@ introspection:
146146
# The bind address to serve prometheus metrics.
147147
# (default: 127.0.0.1:2112, type: host:port)
148148
address: 127.0.0.1:2112
149+
# Collect agent stats (may increase charges for metrics storage).
150+
# (default: <unset>, type: bool)
151+
collect_agent_stats: false
149152
pprof:
150153
# Serve pprof metrics on the address defined by pprof address.
151154
# (default: <unset>, type: bool)

coderd/apidoc/docs.go

+3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/apidoc/swagger.json

+3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/dbauthz/system.go

+4
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,10 @@ func (q *querier) GetWorkspaceAgentStats(ctx context.Context, createdAfter time.
302302
return q.db.GetWorkspaceAgentStats(ctx, createdAfter)
303303
}
304304

305+
// GetWorkspaceAgentStatsAndLabels forwards the call to the wrapped q.db and
// returns its rows and error unchanged.
// NOTE(review): no authorization check is performed in this wrapper; presumably
// that is intentional for the queries collected in this file — confirm.
func (q *querier) GetWorkspaceAgentStatsAndLabels(ctx context.Context, createdAfter time.Time) ([]database.GetWorkspaceAgentStatsAndLabelsRow, error) {
306+
return q.db.GetWorkspaceAgentStatsAndLabels(ctx, createdAfter)
307+
}
308+
305309
// GetDeploymentWorkspaceStats forwards the call to the wrapped q.db and returns
// its row and error unchanged.
func (q *querier) GetDeploymentWorkspaceStats(ctx context.Context) (database.GetDeploymentWorkspaceStatsRow, error) {
306310
return q.db.GetDeploymentWorkspaceStats(ctx)
307311
}

coderd/database/dbfake/databasefake.go

+71
Original file line numberDiff line numberDiff line change
@@ -3998,6 +3998,77 @@ func (q *fakeQuerier) GetWorkspaceAgentStats(_ context.Context, createdAfter tim
39983998
return stats, nil
39993999
}
40004000

4001+
// GetWorkspaceAgentStatsAndLabels builds one GetWorkspaceAgentStatsAndLabelsRow
// per agent from the rows in q.workspaceAgentStats whose CreatedAt is strictly
// after createdAfter.
//
// For each agent the session counts, connection count and median latency are
// taken from a single stat row, RxBytes/TxBytes are summed over all matching
// rows, and Username/WorkspaceName/AgentName are resolved through lookups on q.
// The first lookup error aborts the call and is returned as-is with a nil slice.
// The returned slice is filled by ranging over a map, so its order is not stable.
func (q *fakeQuerier) GetWorkspaceAgentStatsAndLabels(ctx context.Context, createdAfter time.Time) ([]database.GetWorkspaceAgentStatsAndLabelsRow, error) {
4002+
// The read lock is held for the whole function, including the lookups below.
q.mutex.RLock()
4003+
defer q.mutex.RUnlock()
4004+
4005+
// Every stat row newer than createdAfter, in the order it appears in q.workspaceAgentStats.
agentStatsCreatedAfter := make([]database.WorkspaceAgentStat, 0)
4006+
// One stat row per agent; see the overwrite in the loop below.
latestAgentStats := map[uuid.UUID]database.WorkspaceAgentStat{}
4007+
4008+
for _, agentStat := range q.workspaceAgentStats {
4009+
if agentStat.CreatedAt.After(createdAfter) {
4010+
agentStatsCreatedAfter = append(agentStatsCreatedAfter, agentStat)
4011+
// A later row in the slice overwrites an earlier one for the same agent.
// NOTE(review): this is the most recent row only if q.workspaceAgentStats
// is ordered by CreatedAt — confirm.
latestAgentStats[agentStat.AgentID] = agentStat
4012+
}
4013+
}
4014+
4015+
// Accumulator for the result, keyed by agent ID.
statByAgent := map[uuid.UUID]database.GetWorkspaceAgentStatsAndLabelsRow{}
4016+
4017+
// Session and connection metrics
// These come from the single row kept per agent in latestAgentStats, so each
// "+=" below is applied once per agent to a zero-valued accumulator entry.
for _, agentStat := range latestAgentStats {
4018+
stat := statByAgent[agentStat.AgentID]
4019+
stat.SessionCountVSCode += agentStat.SessionCountVSCode
4020+
stat.SessionCountJetBrains += agentStat.SessionCountJetBrains
4021+
stat.SessionCountReconnectingPTY += agentStat.SessionCountReconnectingPTY
4022+
stat.SessionCountSSH += agentStat.SessionCountSSH
4023+
stat.ConnectionCount += agentStat.ConnectionCount
4024+
// Only take the reported latency when it is non-negative and larger than
// the value already recorded for this agent.
if agentStat.ConnectionMedianLatencyMS >= 0 && stat.ConnectionMedianLatencyMS < agentStat.ConnectionMedianLatencyMS {
4025+
stat.ConnectionMedianLatencyMS = agentStat.ConnectionMedianLatencyMS
4026+
}
4027+
statByAgent[agentStat.AgentID] = stat
4028+
}
4029+
4030+
4031+
// Tx, Rx metrics
// Unlike the session metrics, byte counters are summed over every matching row.
for _, agentStat := range agentStatsCreatedAfter {
4032+
stat := statByAgent[agentStat.AgentID]
4033+
stat.RxBytes += agentStat.RxBytes
4034+
stat.TxBytes += agentStat.TxBytes
4035+
statByAgent[agentStat.AgentID] = stat
4036+
}
4037+
4038+
4039+
// Labels
// This loop runs once per stat row, so an agent with several rows has its
// labels looked up and assigned repeatedly.
for _, agentStat := range agentStatsCreatedAfter {
4040+
stat := statByAgent[agentStat.AgentID]
4041+
4042+
4043+
user, err := q.getUserByIDNoLock(agentStat.UserID)
4044+
if err != nil {
4045+
return nil, err
4046+
}
4047+
4048+
stat.Username = user.Username
4049+
4050+
// NOTE(review): q.mutex.RLock is already held here. If GetWorkspaceByID
// (and GetWorkspaceAgentByID below) acquire q.mutex.RLock themselves, this
// is recursive read locking, which sync.RWMutex prohibits because it can
// deadlock when a writer is waiting — confirm and prefer lock-free helpers.
workspace, err := q.GetWorkspaceByID(ctx, agentStat.WorkspaceID)
4051+
if err != nil {
4052+
return nil, err
4053+
}
4054+
stat.WorkspaceName = workspace.Name
4055+
4056+
agent, err := q.GetWorkspaceAgentByID(ctx, agentStat.AgentID)
4057+
if err != nil {
4058+
return nil, err
4059+
}
4060+
stat.AgentName = agent.Name
4061+
4062+
statByAgent[agentStat.AgentID] = stat
4063+
}
4064+
4065+
// Flatten the per-agent map into a slice; map iteration makes the order arbitrary.
stats := make([]database.GetWorkspaceAgentStatsAndLabelsRow, 0, len(statByAgent))
4066+
for _, agent := range statByAgent {
4067+
stats = append(stats, agent)
4068+
}
4069+
return stats, nil
4070+
}
4071+
40014072
func (q *fakeQuerier) GetWorkspacesEligibleForAutoStartStop(ctx context.Context, now time.Time) ([]database.Workspace, error) {
40024073
q.mutex.RLock()
40034074
defer q.mutex.RUnlock()

coderd/database/querier.go

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries.sql.go

+102
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/workspaceagentstats.sql

+52
Original file line numberDiff line numberDiff line change
@@ -103,3 +103,55 @@ WITH agent_stats AS (
103103
) AS a WHERE a.rn = 1 GROUP BY a.user_id, a.agent_id, a.workspace_id, a.template_id
104104
)
105105
SELECT * FROM agent_stats JOIN latest_agent_stats ON agent_stats.agent_id = latest_agent_stats.agent_id;
106+
107+
-- name: GetWorkspaceAgentStatsAndLabels :many
108+
WITH agent_stats AS (
109+
-- Total rx/tx bytes per (user, agent, workspace) over all stat rows created after $1.
SELECT
110+
user_id,
111+
agent_id,
112+
workspace_id,
113+
coalesce(SUM(rx_bytes), 0)::bigint AS rx_bytes,
114+
coalesce(SUM(tx_bytes), 0)::bigint AS tx_bytes
115+
FROM workspace_agent_stats
116+
WHERE workspace_agent_stats.created_at > $1
117+
GROUP BY user_id, agent_id, workspace_id
118+
), latest_agent_stats AS (
119+
-- Session counts, connection count and latency from the newest qualifying row per agent.
-- The inner query keeps only rn = 1 per agent_id, so each SUM/MAX below sees a single row.
SELECT
120+
a.agent_id,
121+
coalesce(SUM(session_count_vscode), 0)::bigint AS session_count_vscode,
122+
coalesce(SUM(session_count_ssh), 0)::bigint AS session_count_ssh,
123+
coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains,
124+
coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty,
125+
coalesce(SUM(connection_count), 0)::bigint AS connection_count,
126+
coalesce(MAX(connection_median_latency_ms), 0)::float AS connection_median_latency_ms
127+
FROM (
128+
-- Rank each agent's rows newest-first; rn = 1 is the most recent row that passes the filter.
SELECT *, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn
129+
FROM workspace_agent_stats
130+
-- The greater than 0 is to support legacy agents that don't report connection_median_latency_ms.
131+
WHERE created_at > $1 AND connection_median_latency_ms > 0
132+
) AS a
133+
WHERE a.rn = 1
134+
GROUP BY a.user_id, a.agent_id, a.workspace_id
135+
)
136+
-- Combine byte totals with the latest session metrics and resolve the label columns.
SELECT
137+
users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, rx_bytes, tx_bytes,
138+
session_count_vscode, session_count_ssh, session_count_jetbrains, session_count_reconnecting_pty,
139+
connection_count, connection_median_latency_ms
140+
FROM
141+
agent_stats
142+
-- Inner join: an agent with no row in latest_agent_stats (no row after $1 with
-- connection_median_latency_ms > 0) is dropped from the result, rx/tx totals included.
-- NOTE(review): confirm that omitting such agents is intended.
JOIN
143+
latest_agent_stats
144+
ON
145+
agent_stats.agent_id = latest_agent_stats.agent_id
146+
JOIN
147+
users
148+
ON
149+
users.id = agent_stats.user_id
150+
JOIN
151+
workspace_agents
152+
ON
153+
workspace_agents.id = agent_stats.agent_id
154+
JOIN
155+
workspaces
156+
ON
157+
workspaces.id = agent_stats.workspace_id;

0 commit comments

Comments
 (0)