Skip to content

Commit 28f7a13

Browse files
committed
WIP
1 parent 9693fa8 commit 28f7a13

File tree

3 files changed

+118
-62
lines changed

3 files changed

+118
-62
lines changed

coderd/prometheusmetrics/prometheusmetrics.go

Lines changed: 23 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -417,38 +417,34 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R
417417
stats, err := db.GetWorkspaceAgentStatsAndLabels(ctx, createdAfter)
418418
if err != nil {
419419
logger.Error(ctx, "can't get agent stats", slog.Error(err))
420-
goto done
421-
}
422-
423-
if len(stats) == 0 {
424-
goto done
425-
}
420+
} else {
421+
for _, agentStat := range stats {
422+
agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceRxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
423+
agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceTxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
424+
425+
agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
426+
agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
427+
428+
agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
429+
agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
430+
agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
431+
agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
432+
}
426433

427-
for _, agentStat := range stats {
428-
agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceTxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
429-
agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceRxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
434+
if len(stats) > 0 {
435+
agentStatsRxBytesGauge.Commit()
436+
agentStatsTxBytesGauge.Commit()
430437

431-
agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
432-
agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
438+
agentStatsConnectionCountGauge.Commit()
439+
agentStatsConnectionMedianLatencyGauge.Commit()
433440

434-
agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
435-
agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
436-
agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
437-
agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName)
441+
agentStatsSessionCountJetBrainsGauge.Commit()
442+
agentStatsSessionCountReconnectingPTYGauge.Commit()
443+
agentStatsSessionCountSSHGauge.Commit()
444+
agentStatsSessionCountVSCodeGauge.Commit()
445+
}
438446
}
439447

440-
agentStatsRxBytesGauge.Commit()
441-
agentStatsTxBytesGauge.Commit()
442-
443-
agentStatsConnectionCountGauge.Commit()
444-
agentStatsConnectionMedianLatencyGauge.Commit()
445-
446-
agentStatsSessionCountJetBrainsGauge.Commit()
447-
agentStatsSessionCountReconnectingPTYGauge.Commit()
448-
agentStatsSessionCountSSHGauge.Commit()
449-
agentStatsSessionCountVSCodeGauge.Commit()
450-
451-
done:
452448
logger.Debug(ctx, "Agent metrics collection is done")
453449
metricsCollectorAgentStats.Observe(timer.ObserveDuration().Seconds())
454450

coderd/prometheusmetrics/prometheusmetrics_test.go

Lines changed: 69 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,9 @@ package prometheusmetrics_test
33
import (
44
"context"
55
"database/sql"
6+
"encoding/json"
7+
"fmt"
8+
"os"
69
"sync/atomic"
710
"testing"
811
"time"
@@ -357,24 +360,15 @@ func TestAgents(t *testing.T) {
357360
func TestAgentStats(t *testing.T) {
358361
t.Parallel()
359362

360-
// Build a sample workspace with test agent and fake agent client
363+
// Build sample workspaces with test agents and fake agent client
361364
client, _, api := coderdtest.NewWithAPI(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
362365
db := api.Database
363366

364367
user := coderdtest.CreateFirstUser(t, client)
365-
authToken := uuid.NewString()
366-
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
367-
Parse: echo.ParseComplete,
368-
ProvisionPlan: echo.ProvisionComplete,
369-
ProvisionApply: echo.ProvisionApplyWithAgent(authToken),
370-
})
371-
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
372-
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
373-
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
374-
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)
375368

376-
agentClient := agentsdk.New(client.URL)
377-
agentClient.SetSessionToken(authToken)
369+
agent1, _ := prepareWorkspaceAndAgent(t, client, user, 1)
370+
agent2, _ := prepareWorkspaceAndAgent(t, client, user, 2)
371+
agent3, _ := prepareWorkspaceAndAgent(t, client, user, 3)
378372

379373
registry := prometheus.NewRegistry()
380374

@@ -384,24 +378,45 @@ func TestAgentStats(t *testing.T) {
384378
t.Cleanup(cancel)
385379

386380
// when
387-
_, err = agentClient.PostStats(context.Background(), &agentsdk.Stats{
388-
ConnectionsByProto: map[string]int64{"TCP": 1},
389-
ConnectionCount: 2,
390-
RxPackets: 3,
391-
RxBytes: 4,
392-
TxPackets: 5,
393-
TxBytes: 6,
394-
SessionCountVSCode: 7,
395-
SessionCountJetBrains: 8,
396-
SessionCountReconnectingPTY: 9,
397-
SessionCountSSH: 10,
398-
ConnectionMedianLatencyMS: 10000,
399-
})
381+
var i int64
382+
for i = 0; i < 3; i++ {
383+
_, err = agent1.PostStats(context.Background(), &agentsdk.Stats{
384+
TxBytes: 1 + i, RxBytes: 2 + i,
385+
SessionCountVSCode: 3 + i, SessionCountJetBrains: 4 + i, SessionCountReconnectingPTY: 5 + i, SessionCountSSH: 6 + i,
386+
ConnectionCount: 7 + i, ConnectionMedianLatencyMS: 8000,
387+
ConnectionsByProto: map[string]int64{"TCP": 1},
388+
})
389+
require.NoError(t, err)
390+
391+
_, err = agent2.PostStats(context.Background(), &agentsdk.Stats{
392+
TxBytes: 2 + i, RxBytes: 4 + i,
393+
SessionCountVSCode: 6 + i, SessionCountJetBrains: 8 + i, SessionCountReconnectingPTY: 10 + i, SessionCountSSH: 12 + i,
394+
ConnectionCount: 8 + i, ConnectionMedianLatencyMS: 10000,
395+
ConnectionsByProto: map[string]int64{"TCP": 1},
396+
})
397+
require.NoError(t, err)
398+
399+
_, err = agent3.PostStats(context.Background(), &agentsdk.Stats{
400+
TxBytes: 3 + i, RxBytes: 6 + i,
401+
SessionCountVSCode: 12 + i, SessionCountJetBrains: 14 + i, SessionCountReconnectingPTY: 16 + i, SessionCountSSH: 18 + i,
402+
ConnectionCount: 9 + i, ConnectionMedianLatencyMS: 12000,
403+
ConnectionsByProto: map[string]int64{"TCP": 1},
404+
})
405+
require.NoError(t, err)
406+
}
400407

401408
// then
409+
goldenFile, err := os.ReadFile("testdata/agent-stats.json")
402410
require.NoError(t, err)
411+
areMetricsValid := func(collected map[string]int) bool {
412+
out, err := json.MarshalIndent(collected, " ", " ")
413+
require.NoError(t, err)
414+
os.WriteFile("testdata/agent-stats.json", out, 0644)
415+
return string(goldenFile) == string(out)
416+
}
403417

404-
collectedMetrics := map[string]struct{}{}
418+
collected := map[string]int{}
419+
var executionSeconds bool
405420
require.Eventually(t, func() bool {
406421
metrics, err := registry.Gather()
407422
assert.NoError(t, err)
@@ -413,7 +428,7 @@ func TestAgentStats(t *testing.T) {
413428
for _, metric := range metrics {
414429
switch metric.GetName() {
415430
case "coderd_prometheusmetrics_agentstats_execution_seconds":
416-
collectedMetrics[metric.GetName()] = struct{}{}
431+
executionSeconds = true
417432
case "coderd_agentstats_connection_count",
418433
"coderd_agentstats_connection_median_latency_seconds",
419434
"coderd_agentstats_rx_bytes",
@@ -422,16 +437,35 @@ func TestAgentStats(t *testing.T) {
422437
"coderd_agentstats_session_count_reconnecting_pty",
423438
"coderd_agentstats_session_count_ssh",
424439
"coderd_agentstats_session_count_vscode":
425-
collectedMetrics[metric.GetName()] = struct{}{}
426-
assert.Equal(t, "example", metric.Metric[0].Label[0].GetValue()) // Agent name
427-
assert.Equal(t, "testuser", metric.Metric[0].Label[1].GetValue()) // Username
428-
assert.Equal(t, workspace.Name, metric.Metric[0].Label[2].GetValue()) // Workspace name
429-
assert.NotZero(t, int(metric.Metric[0].Gauge.GetValue()), metric.GetName()) // Metric value
440+
for _, m := range metric.Metric {
441+
// username:workspace:agent:metric = value
442+
collected[m.Label[1].GetValue()+":"+m.Label[2].GetValue()+":"+m.Label[0].GetValue()+":"+metric.GetName()] = int(m.Gauge.GetValue())
443+
}
430444
default:
431445
require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName())
432446
}
433447
}
448+
return executionSeconds && areMetricsValid(collected)
449+
}, testutil.WaitLong, testutil.IntervalMedium)
450+
}
451+
452+
func prepareWorkspaceAndAgent(t *testing.T, client *codersdk.Client, user codersdk.CreateFirstUserResponse, workspaceNum int) (*agentsdk.Client, codersdk.Workspace) {
453+
authToken := uuid.NewString()
454+
455+
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
456+
Parse: echo.ParseComplete,
457+
ProvisionPlan: echo.ProvisionComplete,
458+
ProvisionApply: echo.ProvisionApplyWithAgent(authToken),
459+
})
460+
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
461+
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
462+
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID, func(cwr *codersdk.CreateWorkspaceRequest) {
463+
cwr.Name = fmt.Sprintf("workspace-%d", workspaceNum)
464+
})
465+
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)
466+
467+
agentClient := agentsdk.New(client.URL)
468+
agentClient.SetSessionToken(authToken)
434469

435-
return len(collectedMetrics) == 9
436-
}, testutil.WaitShort, testutil.IntervalFast, "collected metrics: %v", collectedMetrics)
470+
return agentClient, workspace
437471
}
coderd/prometheusmetrics/testdata/agent-stats.json

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,26 @@
1+
{
2+
"testuser:workspace-1:example:coderd_agentstats_connection_count": 9,
3+
"testuser:workspace-1:example:coderd_agentstats_connection_median_latency_seconds": 8,
4+
"testuser:workspace-1:example:coderd_agentstats_rx_bytes": 9,
5+
"testuser:workspace-1:example:coderd_agentstats_session_count_jetbrains": 6,
6+
"testuser:workspace-1:example:coderd_agentstats_session_count_reconnecting_pty": 7,
7+
"testuser:workspace-1:example:coderd_agentstats_session_count_ssh": 8,
8+
"testuser:workspace-1:example:coderd_agentstats_session_count_vscode": 5,
9+
"testuser:workspace-1:example:coderd_agentstats_tx_bytes": 6,
10+
"testuser:workspace-2:example:coderd_agentstats_connection_count": 10,
11+
"testuser:workspace-2:example:coderd_agentstats_connection_median_latency_seconds": 10,
12+
"testuser:workspace-2:example:coderd_agentstats_rx_bytes": 15,
13+
"testuser:workspace-2:example:coderd_agentstats_session_count_jetbrains": 10,
14+
"testuser:workspace-2:example:coderd_agentstats_session_count_reconnecting_pty": 12,
15+
"testuser:workspace-2:example:coderd_agentstats_session_count_ssh": 14,
16+
"testuser:workspace-2:example:coderd_agentstats_session_count_vscode": 8,
17+
"testuser:workspace-2:example:coderd_agentstats_tx_bytes": 9,
18+
"testuser:workspace-3:example:coderd_agentstats_connection_count": 11,
19+
"testuser:workspace-3:example:coderd_agentstats_connection_median_latency_seconds": 12,
20+
"testuser:workspace-3:example:coderd_agentstats_rx_bytes": 21,
21+
"testuser:workspace-3:example:coderd_agentstats_session_count_jetbrains": 16,
22+
"testuser:workspace-3:example:coderd_agentstats_session_count_reconnecting_pty": 18,
23+
"testuser:workspace-3:example:coderd_agentstats_session_count_ssh": 20,
24+
"testuser:workspace-3:example:coderd_agentstats_session_count_vscode": 14,
25+
"testuser:workspace-3:example:coderd_agentstats_tx_bytes": 12
26+
}

0 commit comments

Comments
 (0)