Skip to content

Commit 7ebe6bd

Browse files
committed
wip: work on agent script metrics
1 parent 6d66cb2 commit 7ebe6bd

File tree

9 files changed

+91
-13
lines changed

9 files changed

+91
-13
lines changed

agent/agent.go

+21-3
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ import (
3535
"tailscale.com/types/netlogtype"
3636

3737
"cdr.dev/slog"
38+
"github.com/coder/retry"
39+
3840
"github.com/coder/coder/v2/agent/agentproc"
3941
"github.com/coder/coder/v2/agent/agentscripts"
4042
"github.com/coder/coder/v2/agent/agentssh"
@@ -45,7 +47,6 @@ import (
4547
"github.com/coder/coder/v2/codersdk"
4648
"github.com/coder/coder/v2/codersdk/agentsdk"
4749
"github.com/coder/coder/v2/tailnet"
48-
"github.com/coder/retry"
4950
)
5051

5152
const (
@@ -222,8 +223,11 @@ type agent struct {
222223
connCountReconnectingPTY atomic.Int64
223224

224225
prometheusRegistry *prometheus.Registry
225-
metrics *agentMetrics
226-
syscaller agentproc.Syscaller
226+
// metrics are prometheus registered metrics that will be collected and
227+
// labeled in Coder with the agent + workspace.
228+
metrics *agentMetrics
229+
stats agentStats
230+
syscaller agentproc.Syscaller
227231

228232
// modifiedProcs is used for testing process priority management.
229233
modifiedProcs chan []*agentproc.Process
@@ -252,6 +256,9 @@ func (a *agent) init(ctx context.Context) {
252256
Filesystem: a.filesystem,
253257
PatchLogs: a.client.PatchLogs,
254258
})
259+
// Register runner metrics. If the prom registry is nil, the metrics
260+
// will not report anywhere.
261+
a.scriptRunner.RegisterMetrics(a.prometheusRegistry)
255262
go a.runLoop(ctx)
256263
}
257264

@@ -745,6 +752,7 @@ func (a *agent) run(ctx context.Context) error {
745752
return xerrors.Errorf("init script runner: %w", err)
746753
}
747754
err = a.trackConnGoroutine(func() {
755+
start := time.Now()
748756
err := a.scriptRunner.Execute(ctx, func(script codersdk.WorkspaceAgentScript) bool {
749757
return script.RunOnStart
750758
})
@@ -758,6 +766,16 @@ func (a *agent) run(ctx context.Context) error {
758766
} else {
759767
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleReady)
760768
}
769+
770+
dur := time.Since(start).Nanoseconds()
771+
// If something really took 0 ns, just set it to 1 to indicate that it ran.
772+
// Otherwise, 0 looks like the startup script has not run yet. I don't think
773+
// this will ever be 1ns
774+
if dur == 0 {
775+
dur = 1
776+
}
777+
a.stats.startScriptNs.Store(dur)
778+
a.stats.startScriptSuccess.Store(err == nil)
761779
a.scriptRunner.StartCron()
762780
})
763781
if err != nil {

agent/agentscripts/agentscripts.go

+32-2
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@ import (
1313
"sync/atomic"
1414
"time"
1515

16+
"github.com/prometheus/client_golang/prometheus"
1617
"github.com/robfig/cron/v3"
1718
"github.com/spf13/afero"
1819
"golang.org/x/sync/errgroup"
1920
"golang.org/x/xerrors"
2021

2122
"cdr.dev/slog"
23+
2224
"github.com/coder/coder/v2/agent/agentssh"
2325
"github.com/coder/coder/v2/codersdk"
2426
"github.com/coder/coder/v2/codersdk/agentsdk"
@@ -57,6 +59,11 @@ func New(opts Options) *Runner {
5759
cronCtxCancel: cronCtxCancel,
5860
cron: cron.New(cron.WithParser(parser)),
5961
closed: make(chan struct{}),
62+
scriptsExecuted: prometheus.NewCounterVec(prometheus.CounterOpts{
63+
Namespace: "agent",
64+
Subsystem: "scripts",
65+
Name: "executed_total",
66+
}, []string{"success"}),
6067
}
6168
}
6269

@@ -71,6 +78,18 @@ type Runner struct {
7178
cron *cron.Cron
7279
initialized atomic.Bool
7380
scripts []codersdk.WorkspaceAgentScript
81+
82+
// Metrics
83+
// scriptsExecuted includes scripts that are scheduled.
84+
scriptsExecuted *prometheus.CounterVec
85+
}
86+
87+
func (r *Runner) RegisterMetrics(reg prometheus.Registerer) {
88+
if reg == nil {
89+
// If no registry, do nothing.
90+
return
91+
}
92+
reg.MustRegister(r.scriptsExecuted)
7493
}
7594

7695
// Init initializes the runner with the provided scripts.
@@ -90,7 +109,7 @@ func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript) error {
90109
}
91110
script := script
92111
_, err := r.cron.AddFunc(script.Cron, func() {
93-
err := r.run(r.cronCtx, script)
112+
err := r.trackRun(r.cronCtx, script)
94113
if err != nil {
95114
r.Logger.Warn(context.Background(), "run agent script on schedule", slog.Error(err))
96115
}
@@ -131,7 +150,7 @@ func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.Worksp
131150
}
132151
script := script
133152
eg.Go(func() error {
134-
err := r.run(ctx, script)
153+
err := r.trackRun(ctx, script)
135154
if err != nil {
136155
return xerrors.Errorf("run agent script %q: %w", script.LogSourceID, err)
137156
}
@@ -141,6 +160,17 @@ func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.Worksp
141160
return eg.Wait()
142161
}
143162

163+
// trackRun wraps "run" with metrics.
164+
func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScript) error {
165+
err := r.run(ctx, script)
166+
if err != nil {
167+
r.scriptsExecuted.WithLabelValues("false").Add(1)
168+
} else {
169+
r.scriptsExecuted.WithLabelValues("true").Add(1)
170+
}
171+
return err
172+
}
173+
144174
// run executes the provided script with the timeout.
145175
// If the timeout is exceeded, the process is sent an interrupt signal.
146176
// If the process does not exit after a few seconds, it is forcefully killed.

agent/metrics.go

+14
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,27 @@ import (
77

88
"github.com/prometheus/client_golang/prometheus"
99
prompb "github.com/prometheus/client_model/go"
10+
"go.uber.org/atomic"
1011
"tailscale.com/util/clientmetric"
1112

1213
"cdr.dev/slog"
1314

1415
"github.com/coder/coder/v2/codersdk/agentsdk"
1516
)
1617

18+
// agentStats, unlike agentMetrics, are not prometheus metrics. Prometheus metrics
19+
// are sent to Coder as generic "metrics" that get labeled and reported for each
20+
// workspace. agentStats are sent to Coder as first-class metrics that Coder decides
21+
// how to aggregate and report.
22+
type agentStats struct {
23+
// startScriptNs is the time in nanoseconds that the start script(s)
24+
// took to run. This is reported once per agent, and is collected into a
25+
// histogram by Coder.
26+
startScriptNs atomic.Int64
27+
// startScriptSuccess should be ignored if startScriptNs is 0.
28+
startScriptSuccess atomic.Bool
29+
}
30+
1731
type agentMetrics struct {
1832
connectionsTotal prometheus.Counter
1933
reconnectingPTYErrors *prometheus.CounterVec

coderd/database/models.go

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/querier.go

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries.sql.go

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/workspaceagentstats.sql

+8-3
Original file line numberDiff line numberDiff line change
@@ -171,10 +171,11 @@ WITH agent_stats AS (
171171
WHERE created_at > $1 AND connection_median_latency_ms > 0
172172
) AS a
173173
WHERE a.rn = 1
174-
GROUP BY a.user_id, a.agent_id, a.workspace_id
174+
GROUP BY a.user_id, a.agent_id, a.workspace_id, a.template_id
175175
)
176176
SELECT
177-
users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, rx_bytes, tx_bytes,
177+
users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name,
178+
workspaces.template_id AS template_id, rx_bytes, tx_bytes,
178179
session_count_vscode, session_count_ssh, session_count_jetbrains, session_count_reconnecting_pty,
179180
connection_count, connection_median_latency_ms
180181
FROM
@@ -194,4 +195,8 @@ ON
194195
JOIN
195196
workspaces
196197
ON
197-
workspaces.id = agent_stats.workspace_id;
198+
workspaces.id = agent_stats.workspace_id
199+
JOIN
200+
templates
201+
ON
202+
templates.id = workspaces.template_id;

coderd/workspaceagents.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
"tailscale.com/tailcfg"
3131

3232
"cdr.dev/slog"
33+
3334
"github.com/coder/coder/v2/coderd/autobuild"
3435
"github.com/coder/coder/v2/coderd/database"
3536
"github.com/coder/coder/v2/coderd/database/dbauthz"
@@ -1677,7 +1678,7 @@ func (api *API) workspaceAgentReportStats(rw http.ResponseWriter, r *http.Reques
16771678
var nextAutostart time.Time
16781679
if workspace.AutostartSchedule.String != "" {
16791680
templateSchedule, err := (*(api.TemplateScheduleStore.Load())).Get(ctx, api.Database, workspace.TemplateID)
1680-
// If the template schedule fails to load, just default to bumping without the next trasition and log it.
1681+
// If the template schedule fails to load, just default to bumping without the next transition and log it.
16811682
if err != nil {
16821683
api.Logger.Warn(ctx, "failed to load template schedule bumping activity, defaulting to bumping by 60min",
16831684
slog.F("workspace_id", workspace.ID),

codersdk/agentsdk/agentsdk.go

+11-1
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,9 @@ import (
1919
"tailscale.com/tailcfg"
2020

2121
"cdr.dev/slog"
22-
"github.com/coder/coder/v2/codersdk"
2322
"github.com/coder/retry"
23+
24+
"github.com/coder/coder/v2/codersdk"
2425
)
2526

2627
// ExternalLogSourceID is the statically-defined ID of a log-source that
@@ -574,6 +575,15 @@ type Stats struct {
574575
// that are normal, non-tagged SSH sessions.
575576
SessionCountSSH int64 `json:"session_count_ssh"`
576577

578+
// Script stats relate to all scripts executed by the agent.
579+
// StartupScriptNs is the duration in nanoseconds the startup scripts
580+
// took to execute. If there are no scripts, this still has some value > 0.
581+
// This is because the act of "no script" still takes time to eval, and still
582+
// has a "success" value.
583+
StartupScriptNs int64 `json:"startup_script_ns"`
584+
// StartupScriptSuccess is true if the startup script(s) executed successfully.
585+
StartupScriptSuccess bool `json:"startup_script_success"`
586+
577587
// Metrics collected by the agent
578588
Metrics []AgentMetric `json:"metrics"`
579589
}

0 commit comments

Comments
 (0)