Skip to content

Commit 1ed37b4

Browse files
committed
feat: collect
1 parent 8c6f96b commit 1ed37b4

File tree

8 files changed

+231
-74
lines changed

8 files changed

+231
-74
lines changed

cli/server.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -844,7 +844,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
844844
}
845845
defer closeWorkspacesFunc()
846846

847-
closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, 0)
847+
closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, time.Now(), 0)
848848
if err != nil {
849849
return xerrors.Errorf("register agent stats prometheus metric: %w", err)
850850
}

coderd/database/dbfake/databasefake.go

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3998,8 +3998,75 @@ func (q *fakeQuerier) GetWorkspaceAgentStats(_ context.Context, createdAfter tim
39983998
return stats, nil
39993999
}
40004000

4001-
func (q *fakeQuerier) GetWorkspaceAgentStatsAndLabels(_ context.Context, createdAfter time.Time) ([]database.GetWorkspaceAgentStatsAndLabelsRow, error) {
4002-
panic("not implemented yet")
4001+
func (q *fakeQuerier) GetWorkspaceAgentStatsAndLabels(ctx context.Context, createdAfter time.Time) ([]database.GetWorkspaceAgentStatsAndLabelsRow, error) {
4002+
q.mutex.RLock()
4003+
defer q.mutex.RUnlock()
4004+
4005+
agentStatsCreatedAfter := make([]database.WorkspaceAgentStat, 0)
4006+
latestAgentStats := map[uuid.UUID]database.WorkspaceAgentStat{}
4007+
4008+
for _, agentStat := range q.workspaceAgentStats {
4009+
if agentStat.CreatedAt.After(createdAfter) {
4010+
agentStatsCreatedAfter = append(agentStatsCreatedAfter, agentStat)
4011+
latestAgentStats[agentStat.AgentID] = agentStat
4012+
}
4013+
}
4014+
4015+
statByAgent := map[uuid.UUID]database.GetWorkspaceAgentStatsAndLabelsRow{}
4016+
4017+
// Session and connection metrics
4018+
for _, agentStat := range latestAgentStats {
4019+
stat := statByAgent[agentStat.AgentID]
4020+
stat.SessionCountVSCode += agentStat.SessionCountVSCode
4021+
stat.SessionCountJetBrains += agentStat.SessionCountJetBrains
4022+
stat.SessionCountReconnectingPTY += agentStat.SessionCountReconnectingPTY
4023+
stat.SessionCountSSH += agentStat.SessionCountSSH
4024+
stat.ConnectionCount += agentStat.ConnectionCount
4025+
if agentStat.ConnectionMedianLatencyMS >= 0 && stat.ConnectionMedianLatencyMS < agentStat.ConnectionMedianLatencyMS {
4026+
stat.ConnectionMedianLatencyMS = agentStat.ConnectionMedianLatencyMS
4027+
}
4028+
statByAgent[agentStat.AgentID] = stat
4029+
}
4030+
4031+
// Tx, Rx metrics
4032+
for _, agentStat := range agentStatsCreatedAfter {
4033+
stat := statByAgent[agentStat.AgentID]
4034+
stat.WorkspaceRxBytes += agentStat.RxBytes
4035+
stat.WorkspaceTxBytes += agentStat.TxBytes
4036+
statByAgent[agentStat.AgentID] = stat
4037+
}
4038+
4039+
// Labels
4040+
for _, agentStat := range agentStatsCreatedAfter {
4041+
stat := statByAgent[agentStat.AgentID]
4042+
4043+
user, err := q.getUserByIDNoLock(agentStat.UserID)
4044+
if err != nil {
4045+
return nil, err
4046+
}
4047+
4048+
stat.Username = user.Username
4049+
4050+
workspace, err := q.GetWorkspaceByID(ctx, agentStat.WorkspaceID)
4051+
if err != nil {
4052+
return nil, err
4053+
}
4054+
stat.WorkspaceName = workspace.Name
4055+
4056+
agent, err := q.GetWorkspaceAgentByID(ctx, agentStat.AgentID)
4057+
if err != nil {
4058+
return nil, err
4059+
}
4060+
stat.AgentName = agent.Name
4061+
4062+
statByAgent[agentStat.AgentID] = stat
4063+
}
4064+
4065+
stats := make([]database.GetWorkspaceAgentStatsAndLabelsRow, 0, len(statByAgent))
4066+
for _, agent := range statByAgent {
4067+
stats = append(stats, agent)
4068+
}
4069+
return stats, nil
40034070
}
40044071

40054072
func (q *fakeQuerier) GetWorkspacesEligibleForAutoStartStop(ctx context.Context, now time.Time) ([]database.Workspace, error) {

coderd/database/queries.sql.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/workspaceagentstats.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ WITH agent_stats AS (
123123
coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains,
124124
coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty,
125125
coalesce(SUM(connection_count), 0)::bigint AS connection_count,
126-
coalesce(SUM(connection_median_latency_ms), 0)::float AS connection_median_latency_ms
126+
coalesce(MAX(connection_median_latency_ms), 0)::float AS connection_median_latency_ms
127127
FROM (
128128
SELECT *, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn
129129
FROM workspace_agent_stats

coderd/prometheusmetrics/prometheusmetrics.go

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
291291
return cancelFunc, nil
292292
}
293293

294-
func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (context.CancelFunc, error) {
294+
func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration) (context.CancelFunc, error) {
295295
if duration == 0 {
296296
duration = 1 * time.Minute
297297
}
@@ -344,8 +344,8 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
344344
agentStatsConnectionMedianLatencyGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
345345
Namespace: "coderd",
346346
Subsystem: "agentstats",
347-
Name: "connection_median_latency",
348-
Help: "The median agent connection latency",
347+
Name: "connection_median_latency_seconds",
348+
Help: "The median agent connection latency in seconds",
349349
}, []string{"agent_name", "username", "workspace_name"}))
350350
err = registerer.Register(agentStatsConnectionMedianLatencyGauge)
351351
if err != nil {
@@ -396,7 +396,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
396396
return nil, err
397397
}
398398

399-
createdAfter := time.Now()
399+
createdAfter := initialCreateAfter
400400
ctx, cancelFunc := context.WithCancel(ctx)
401401
ticker := time.NewTicker(duration)
402402
go func() {
@@ -411,12 +411,18 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
411411
logger.Debug(ctx, "Agent metrics collection is starting")
412412
timer := prometheus.NewTimer(metricsCollectorAgentStats)
413413

414+
checkpoint := time.Now()
414415
stats, err := db.GetWorkspaceAgentStatsAndLabels(ctx, createdAfter)
416+
415417
if err != nil {
416418
logger.Error(ctx, "can't get agent stats", slog.Error(err))
417419
goto done
418420
}
419421

422+
if len(stats) == 0 {
423+
goto done
424+
}
425+
420426
for _, agentStat := range stats {
421427
agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceTxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
422428
agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceRxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
@@ -445,7 +451,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
445451
logger.Debug(ctx, "Agent metrics collection is done")
446452
metricsCollectorAgentStats.Observe(timer.ObserveDuration().Seconds())
447453

448-
createdAfter = time.Now()
454+
createdAfter = checkpoint
449455
}
450456
}()
451457
return cancelFunc, nil

coderd/prometheusmetrics/prometheusmetrics_test.go

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"github.com/coder/coder/coderd/database/dbgen"
2121
"github.com/coder/coder/coderd/prometheusmetrics"
2222
"github.com/coder/coder/codersdk"
23+
"github.com/coder/coder/codersdk/agentsdk"
2324
"github.com/coder/coder/provisioner/echo"
2425
"github.com/coder/coder/provisionersdk/proto"
2526
"github.com/coder/coder/tailnet"
@@ -352,3 +353,86 @@ func TestAgents(t *testing.T) {
352353
return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds
353354
}, testutil.WaitShort, testutil.IntervalFast)
354355
}
356+
357+
func TestAgentStats(t *testing.T) {
358+
t.Parallel()
359+
360+
// Build a sample workspace with test agent and fake agent client
361+
client, _, api := coderdtest.NewWithAPI(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
362+
db := api.Database
363+
364+
user := coderdtest.CreateFirstUser(t, client)
365+
authToken := uuid.NewString()
366+
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
367+
Parse: echo.ParseComplete,
368+
ProvisionPlan: echo.ProvisionComplete,
369+
ProvisionApply: echo.ProvisionApplyWithAgent(authToken),
370+
})
371+
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
372+
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
373+
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
374+
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)
375+
376+
agentClient := agentsdk.New(client.URL)
377+
agentClient.SetSessionToken(authToken)
378+
379+
registry := prometheus.NewRegistry()
380+
381+
// given
382+
cancel, err := prometheusmetrics.AgentStats(context.Background(), slogtest.Make(t, nil), registry, db, time.Now(), time.Second)
383+
require.NoError(t, err)
384+
t.Cleanup(cancel)
385+
386+
// when
387+
_, err = agentClient.PostStats(context.Background(), &agentsdk.Stats{
388+
ConnectionsByProto: map[string]int64{"TCP": 1},
389+
ConnectionCount: 2,
390+
RxPackets: 3,
391+
RxBytes: 4,
392+
TxPackets: 5,
393+
TxBytes: 6,
394+
SessionCountVSCode: 7,
395+
SessionCountJetBrains: 8,
396+
SessionCountReconnectingPTY: 9,
397+
SessionCountSSH: 10,
398+
ConnectionMedianLatencyMS: 10000,
399+
})
400+
require.NoError(t, err)
401+
402+
// then
403+
require.NoError(t, err)
404+
405+
collectedMetrics := map[string]struct{}{}
406+
require.Eventually(t, func() bool {
407+
metrics, err := registry.Gather()
408+
assert.NoError(t, err)
409+
410+
if len(metrics) < 1 {
411+
return false
412+
}
413+
414+
for _, metric := range metrics {
415+
switch metric.GetName() {
416+
case "coderd_prometheusmetrics_agentstats_execution_seconds":
417+
collectedMetrics[metric.GetName()] = struct{}{}
418+
case "coderd_agentstats_connection_count",
419+
"coderd_agentstats_connection_median_latency_seconds",
420+
"coderd_agentstats_rx_bytes",
421+
"coderd_agentstats_tx_bytes",
422+
"coderd_agentstats_session_count_jetbrains",
423+
"coderd_agentstats_session_count_reconnecting_pty",
424+
"coderd_agentstats_session_count_ssh",
425+
"coderd_agentstats_session_count_vscode":
426+
collectedMetrics[metric.GetName()] = struct{}{}
427+
assert.Equal(t, "example", metric.Metric[0].Label[0].GetValue()) // Agent name
428+
assert.Equal(t, "testuser", metric.Metric[0].Label[1].GetValue()) // Username
429+
assert.Equal(t, workspace.Name, metric.Metric[0].Label[2].GetValue()) // Workspace name
430+
assert.NotZero(t, int(metric.Metric[0].Gauge.GetValue()), metric.GetName()) // Metric value
431+
default:
432+
require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName())
433+
}
434+
}
435+
436+
return len(collectedMetrics) == 9
437+
}, testutil.WaitShort, testutil.IntervalFast, "collected metrics: %v", collectedMetrics)
438+
}

0 commit comments

Comments
 (0)