Skip to content

Commit b7bdb17

Browse files
authored
feat: add metrics to workspace agent scripts (#11132)
* push startup script metrics to agent
1 parent 41ed581 commit b7bdb17

File tree

20 files changed

+306
-127
lines changed

20 files changed

+306
-127
lines changed

agent/agent.go

+18-3
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ import (
3535
"tailscale.com/types/netlogtype"
3636

3737
"cdr.dev/slog"
38+
"github.com/coder/retry"
39+
3840
"github.com/coder/coder/v2/agent/agentproc"
3941
"github.com/coder/coder/v2/agent/agentscripts"
4042
"github.com/coder/coder/v2/agent/agentssh"
@@ -45,7 +47,6 @@ import (
4547
"github.com/coder/coder/v2/codersdk"
4648
"github.com/coder/coder/v2/codersdk/agentsdk"
4749
"github.com/coder/coder/v2/tailnet"
48-
"github.com/coder/retry"
4950
)
5051

5152
const (
@@ -222,8 +223,10 @@ type agent struct {
222223
connCountReconnectingPTY atomic.Int64
223224

224225
prometheusRegistry *prometheus.Registry
225-
metrics *agentMetrics
226-
syscaller agentproc.Syscaller
226+
// metrics are prometheus registered metrics that will be collected and
227+
// labeled in Coder with the agent + workspace.
228+
metrics *agentMetrics
229+
syscaller agentproc.Syscaller
227230

228231
// modifiedProcs is used for testing process priority management.
229232
modifiedProcs chan []*agentproc.Process
@@ -252,6 +255,9 @@ func (a *agent) init(ctx context.Context) {
252255
Filesystem: a.filesystem,
253256
PatchLogs: a.client.PatchLogs,
254257
})
258+
// Register runner metrics. If the prom registry is nil, the metrics
259+
// will not report anywhere.
260+
a.scriptRunner.RegisterMetrics(a.prometheusRegistry)
255261
go a.runLoop(ctx)
256262
}
257263

@@ -745,9 +751,12 @@ func (a *agent) run(ctx context.Context) error {
745751
return xerrors.Errorf("init script runner: %w", err)
746752
}
747753
err = a.trackConnGoroutine(func() {
754+
start := time.Now()
748755
err := a.scriptRunner.Execute(ctx, func(script codersdk.WorkspaceAgentScript) bool {
749756
return script.RunOnStart
750757
})
758+
// Capture the elapsed time immediately after Execute returns, so the lifecycle handling below is not included in the measured duration.
759+
dur := time.Since(start).Seconds()
751760
if err != nil {
752761
a.logger.Warn(ctx, "startup script(s) failed", slog.Error(err))
753762
if errors.Is(err, agentscripts.ErrTimeout) {
@@ -758,6 +767,12 @@ func (a *agent) run(ctx context.Context) error {
758767
} else {
759768
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleReady)
760769
}
770+
771+
label := "false"
772+
if err == nil {
773+
label = "true"
774+
}
775+
a.metrics.startupScriptSeconds.WithLabelValues(label).Set(dur)
761776
a.scriptRunner.StartCron()
762777
})
763778
if err != nil {

agent/agent_test.go

+12
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ import (
4646
"cdr.dev/slog"
4747
"cdr.dev/slog/sloggers/sloghuman"
4848
"cdr.dev/slog/sloggers/slogtest"
49+
4950
"github.com/coder/coder/v2/agent"
5051
"github.com/coder/coder/v2/agent/agentproc"
5152
"github.com/coder/coder/v2/agent/agentproc/agentproctest"
@@ -2235,6 +2236,17 @@ func TestAgent_Metrics_SSH(t *testing.T) {
22352236
Type: agentsdk.AgentMetricTypeCounter,
22362237
Value: 0,
22372238
},
2239+
{
2240+
Name: "coderd_agentstats_startup_script_seconds",
2241+
Type: agentsdk.AgentMetricTypeGauge,
2242+
Value: 0,
2243+
Labels: []agentsdk.AgentMetricLabel{
2244+
{
2245+
Name: "success",
2246+
Value: "true",
2247+
},
2248+
},
2249+
},
22382250
}
22392251

22402252
var actual []*promgo.MetricFamily

agent/agentscripts/agentscripts.go

+33-2
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@ import (
1313
"sync/atomic"
1414
"time"
1515

16+
"github.com/prometheus/client_golang/prometheus"
1617
"github.com/robfig/cron/v3"
1718
"github.com/spf13/afero"
1819
"golang.org/x/sync/errgroup"
1920
"golang.org/x/xerrors"
2021

2122
"cdr.dev/slog"
23+
2224
"github.com/coder/coder/v2/agent/agentssh"
2325
"github.com/coder/coder/v2/codersdk"
2426
"github.com/coder/coder/v2/codersdk/agentsdk"
@@ -57,6 +59,11 @@ func New(opts Options) *Runner {
5759
cronCtxCancel: cronCtxCancel,
5860
cron: cron.New(cron.WithParser(parser)),
5961
closed: make(chan struct{}),
62+
scriptsExecuted: prometheus.NewCounterVec(prometheus.CounterOpts{
63+
Namespace: "agent",
64+
Subsystem: "scripts",
65+
Name: "executed_total",
66+
}, []string{"success"}),
6067
}
6168
}
6269

@@ -71,6 +78,19 @@ type Runner struct {
7178
cron *cron.Cron
7279
initialized atomic.Bool
7380
scripts []codersdk.WorkspaceAgentScript
81+
82+
// scriptsExecuted includes all scripts executed by the workspace agent. Agents
83+
// execute startup scripts, and scripts on a cron schedule. Both will increment
84+
// this counter.
85+
scriptsExecuted *prometheus.CounterVec
86+
}
87+
88+
func (r *Runner) RegisterMetrics(reg prometheus.Registerer) {
89+
if reg == nil {
90+
// If no registry, do nothing.
91+
return
92+
}
93+
reg.MustRegister(r.scriptsExecuted)
7494
}
7595

7696
// Init initializes the runner with the provided scripts.
@@ -90,7 +110,7 @@ func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript) error {
90110
}
91111
script := script
92112
_, err := r.cron.AddFunc(script.Cron, func() {
93-
err := r.run(r.cronCtx, script)
113+
err := r.trackRun(r.cronCtx, script)
94114
if err != nil {
95115
r.Logger.Warn(context.Background(), "run agent script on schedule", slog.Error(err))
96116
}
@@ -131,7 +151,7 @@ func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.Worksp
131151
}
132152
script := script
133153
eg.Go(func() error {
134-
err := r.run(ctx, script)
154+
err := r.trackRun(ctx, script)
135155
if err != nil {
136156
return xerrors.Errorf("run agent script %q: %w", script.LogSourceID, err)
137157
}
@@ -141,6 +161,17 @@ func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.Worksp
141161
return eg.Wait()
142162
}
143163

164+
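// trackRun wraps "run" and increments the scriptsExecuted counter, labeling the sample success="true" when run returns nil and success="false" otherwise. The error from run is returned unchanged.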
165+
func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScript) error {
166+
err := r.run(ctx, script)
167+
if err != nil {
168+
r.scriptsExecuted.WithLabelValues("false").Add(1)
169+
} else {
170+
r.scriptsExecuted.WithLabelValues("true").Add(1)
171+
}
172+
return err
173+
}
174+
144175
// run executes the provided script with the timeout.
145176
// If the timeout is exceeded, the process is sent an interrupt signal.
146177
// If the process does not exit after a few seconds, it is forcefully killed.

agent/metrics.go

+12
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ import (
1717
type agentMetrics struct {
1818
connectionsTotal prometheus.Counter
1919
reconnectingPTYErrors *prometheus.CounterVec
20+
// startupScriptSeconds is the time in seconds that the start script(s)
21+
// took to run. This is reported once per agent.
22+
startupScriptSeconds *prometheus.GaugeVec
2023
}
2124

2225
func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics {
@@ -35,9 +38,18 @@ func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics {
3538
)
3639
registerer.MustRegister(reconnectingPTYErrors)
3740

41+
startupScriptSeconds := prometheus.NewGaugeVec(prometheus.GaugeOpts{
42+
Namespace: "coderd",
43+
Subsystem: "agentstats",
44+
Name: "startup_script_seconds",
45+
Help: "Amount of time taken to run the startup script in seconds.",
46+
}, []string{"success"})
47+
registerer.MustRegister(startupScriptSeconds)
48+
3849
return &agentMetrics{
3950
connectionsTotal: connectionsTotal,
4051
reconnectingPTYErrors: reconnectingPTYErrors,
52+
startupScriptSeconds: startupScriptSeconds,
4153
}
4254
}
4355

coderd/coderd.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,10 @@ import (
3838
_ "github.com/coder/coder/v2/coderd/apidoc"
3939
"github.com/coder/coder/v2/coderd/externalauth"
4040
"github.com/coder/coder/v2/coderd/healthcheck/derphealth"
41+
"github.com/coder/coder/v2/coderd/prometheusmetrics"
4142

4243
"cdr.dev/slog"
44+
4345
"github.com/coder/coder/v2/buildinfo"
4446
"github.com/coder/coder/v2/cli/clibase"
4547
"github.com/coder/coder/v2/coderd/audit"
@@ -168,7 +170,7 @@ type Options struct {
168170

169171
HTTPClient *http.Client
170172

171-
UpdateAgentMetrics func(ctx context.Context, username, workspaceName, agentName string, metrics []agentsdk.AgentMetric)
173+
UpdateAgentMetrics func(ctx context.Context, labels prometheusmetrics.AgentMetricLabels, metrics []agentsdk.AgentMetric)
172174
StatsBatcher *batchstats.Batcher
173175

174176
WorkspaceAppsStatsCollectorOptions workspaceapps.StatsCollectorOptions

coderd/database/dbauthz/dbauthz.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1918,7 +1918,7 @@ func (q *querier) GetWorkspaceBuildsCreatedAfter(ctx context.Context, createdAt
19181918
return q.db.GetWorkspaceBuildsCreatedAfter(ctx, createdAt)
19191919
}
19201920

1921-
func (q *querier) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (database.Workspace, error) {
1921+
func (q *querier) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (database.GetWorkspaceByAgentIDRow, error) {
19221922
return fetch(q.log, q.auth, q.db.GetWorkspaceByAgentID)(ctx, agentID)
19231923
}
19241924

coderd/database/dbauthz/dbauthz_test.go

+40-10
Original file line numberDiff line numberDiff line change
@@ -1065,21 +1065,30 @@ func (s *MethodTestSuite) TestWorkspace() {
10651065
check.Args(ws.ID).Asserts(ws, rbac.ActionRead).Returns(b)
10661066
}))
10671067
s.Run("GetWorkspaceAgentByID", s.Subtest(func(db database.Store, check *expects) {
1068-
ws := dbgen.Workspace(s.T(), db, database.Workspace{})
1068+
tpl := dbgen.Template(s.T(), db, database.Template{})
1069+
ws := dbgen.Workspace(s.T(), db, database.Workspace{
1070+
TemplateID: tpl.ID,
1071+
})
10691072
build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()})
10701073
res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID})
10711074
agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID})
10721075
check.Args(agt.ID).Asserts(ws, rbac.ActionRead).Returns(agt)
10731076
}))
10741077
s.Run("GetWorkspaceAgentByInstanceID", s.Subtest(func(db database.Store, check *expects) {
1075-
ws := dbgen.Workspace(s.T(), db, database.Workspace{})
1078+
tpl := dbgen.Template(s.T(), db, database.Template{})
1079+
ws := dbgen.Workspace(s.T(), db, database.Workspace{
1080+
TemplateID: tpl.ID,
1081+
})
10761082
build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()})
10771083
res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID})
10781084
agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID})
10791085
check.Args(agt.AuthInstanceID.String).Asserts(ws, rbac.ActionRead).Returns(agt)
10801086
}))
10811087
s.Run("UpdateWorkspaceAgentLifecycleStateByID", s.Subtest(func(db database.Store, check *expects) {
1082-
ws := dbgen.Workspace(s.T(), db, database.Workspace{})
1088+
tpl := dbgen.Template(s.T(), db, database.Template{})
1089+
ws := dbgen.Workspace(s.T(), db, database.Workspace{
1090+
TemplateID: tpl.ID,
1091+
})
10831092
build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()})
10841093
res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID})
10851094
agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID})
@@ -1089,7 +1098,10 @@ func (s *MethodTestSuite) TestWorkspace() {
10891098
}).Asserts(ws, rbac.ActionUpdate).Returns()
10901099
}))
10911100
s.Run("UpdateWorkspaceAgentLogOverflowByID", s.Subtest(func(db database.Store, check *expects) {
1092-
ws := dbgen.Workspace(s.T(), db, database.Workspace{})
1101+
tpl := dbgen.Template(s.T(), db, database.Template{})
1102+
ws := dbgen.Workspace(s.T(), db, database.Workspace{
1103+
TemplateID: tpl.ID,
1104+
})
10931105
build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()})
10941106
res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID})
10951107
agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID})
@@ -1099,7 +1111,10 @@ func (s *MethodTestSuite) TestWorkspace() {
10991111
}).Asserts(ws, rbac.ActionUpdate).Returns()
11001112
}))
11011113
s.Run("UpdateWorkspaceAgentStartupByID", s.Subtest(func(db database.Store, check *expects) {
1102-
ws := dbgen.Workspace(s.T(), db, database.Workspace{})
1114+
tpl := dbgen.Template(s.T(), db, database.Template{})
1115+
ws := dbgen.Workspace(s.T(), db, database.Workspace{
1116+
TemplateID: tpl.ID,
1117+
})
11031118
build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()})
11041119
res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID})
11051120
agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID})
@@ -1111,7 +1126,10 @@ func (s *MethodTestSuite) TestWorkspace() {
11111126
}).Asserts(ws, rbac.ActionUpdate).Returns()
11121127
}))
11131128
s.Run("GetWorkspaceAgentLogsAfter", s.Subtest(func(db database.Store, check *expects) {
1114-
ws := dbgen.Workspace(s.T(), db, database.Workspace{})
1129+
tpl := dbgen.Template(s.T(), db, database.Template{})
1130+
ws := dbgen.Workspace(s.T(), db, database.Workspace{
1131+
TemplateID: tpl.ID,
1132+
})
11151133
build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()})
11161134
res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID})
11171135
agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID})
@@ -1120,7 +1138,10 @@ func (s *MethodTestSuite) TestWorkspace() {
11201138
}).Asserts(ws, rbac.ActionRead).Returns([]database.WorkspaceAgentLog{})
11211139
}))
11221140
s.Run("GetWorkspaceAppByAgentIDAndSlug", s.Subtest(func(db database.Store, check *expects) {
1123-
ws := dbgen.Workspace(s.T(), db, database.Workspace{})
1141+
tpl := dbgen.Template(s.T(), db, database.Template{})
1142+
ws := dbgen.Workspace(s.T(), db, database.Workspace{
1143+
TemplateID: tpl.ID,
1144+
})
11241145
build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()})
11251146
res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID})
11261147
agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID})
@@ -1132,7 +1153,10 @@ func (s *MethodTestSuite) TestWorkspace() {
11321153
}).Asserts(ws, rbac.ActionRead).Returns(app)
11331154
}))
11341155
s.Run("GetWorkspaceAppsByAgentID", s.Subtest(func(db database.Store, check *expects) {
1135-
ws := dbgen.Workspace(s.T(), db, database.Workspace{})
1156+
tpl := dbgen.Template(s.T(), db, database.Template{})
1157+
ws := dbgen.Workspace(s.T(), db, database.Workspace{
1158+
TemplateID: tpl.ID,
1159+
})
11361160
build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()})
11371161
res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID})
11381162
agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID})
@@ -1173,11 +1197,17 @@ func (s *MethodTestSuite) TestWorkspace() {
11731197
check.Args(database.GetWorkspaceBuildsByWorkspaceIDParams{WorkspaceID: ws.ID}).Asserts(ws, rbac.ActionRead) // ordering
11741198
}))
11751199
s.Run("GetWorkspaceByAgentID", s.Subtest(func(db database.Store, check *expects) {
1176-
ws := dbgen.Workspace(s.T(), db, database.Workspace{})
1200+
tpl := dbgen.Template(s.T(), db, database.Template{})
1201+
ws := dbgen.Workspace(s.T(), db, database.Workspace{
1202+
TemplateID: tpl.ID,
1203+
})
11771204
build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()})
11781205
res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID})
11791206
agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID})
1180-
check.Args(agt.ID).Asserts(ws, rbac.ActionRead).Returns(ws)
1207+
check.Args(agt.ID).Asserts(ws, rbac.ActionRead).Returns(database.GetWorkspaceByAgentIDRow{
1208+
Workspace: ws,
1209+
TemplateName: tpl.Name,
1210+
})
11811211
}))
11821212
s.Run("GetWorkspaceByOwnerIDAndName", s.Subtest(func(db database.Store, check *expects) {
11831213
ws := dbgen.Workspace(s.T(), db, database.Workspace{})

coderd/database/dbmem/dbmem.go

+15-2
Original file line numberDiff line numberDiff line change
@@ -4293,11 +4293,24 @@ func (q *FakeQuerier) GetWorkspaceBuildsCreatedAfter(_ context.Context, after ti
42934293
return workspaceBuilds, nil
42944294
}
42954295

4296-
func (q *FakeQuerier) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (database.Workspace, error) {
4296+
func (q *FakeQuerier) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (database.GetWorkspaceByAgentIDRow, error) {
42974297
q.mutex.RLock()
42984298
defer q.mutex.RUnlock()
42994299

4300-
return q.getWorkspaceByAgentIDNoLock(ctx, agentID)
4300+
w, err := q.getWorkspaceByAgentIDNoLock(ctx, agentID)
4301+
if err != nil {
4302+
return database.GetWorkspaceByAgentIDRow{}, err
4303+
}
4304+
4305+
tpl, err := q.getTemplateByIDNoLock(ctx, w.TemplateID)
4306+
if err != nil {
4307+
return database.GetWorkspaceByAgentIDRow{}, err
4308+
}
4309+
4310+
return database.GetWorkspaceByAgentIDRow{
4311+
Workspace: w,
4312+
TemplateName: tpl.Name,
4313+
}, nil
43014314
}
43024315

43034316
func (q *FakeQuerier) GetWorkspaceByID(ctx context.Context, id uuid.UUID) (database.Workspace, error) {

coderd/database/dbmetrics/dbmetrics.go

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/dbmock/dbmock.go

+2-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)