Skip to content

Commit 79fb8e4

Browse files
authored
feat: expose workspace statuses (with details) as a prometheus metric (coder#12762)
Implements coder#12462
1 parent 114830d commit 79fb8e4

File tree

8 files changed

+375
-161
lines changed

8 files changed

+375
-161
lines changed

cli/server.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ func enablePrometheus(
209209
}
210210
afterCtx(ctx, closeUsersFunc)
211211

212-
closeWorkspacesFunc, err := prometheusmetrics.Workspaces(ctx, options.PrometheusRegistry, options.Database, 0)
212+
closeWorkspacesFunc, err := prometheusmetrics.Workspaces(ctx, options.Logger.Named("workspaces_metrics"), options.PrometheusRegistry, options.Database, 0)
213213
if err != nil {
214214
return nil, xerrors.Errorf("register workspaces prometheus metric: %w", err)
215215
}

cli/server_test.go

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -973,26 +973,20 @@ func TestServer(t *testing.T) {
973973

974974
scanner := bufio.NewScanner(res.Body)
975975
hasActiveUsers := false
976-
hasWorkspaces := false
977976
for scanner.Scan() {
978977
// This metric is manually registered to be tracked in the server. That's
979978
// why we test it's tracked here.
980979
if strings.HasPrefix(scanner.Text(), "coderd_api_active_users_duration_hour") {
981980
hasActiveUsers = true
982981
continue
983982
}
984-
if strings.HasPrefix(scanner.Text(), "coderd_api_workspace_latest_build_total") {
985-
hasWorkspaces = true
986-
continue
987-
}
988983
if strings.HasPrefix(scanner.Text(), "coderd_db_query_latencies_seconds") {
989984
t.Fatal("db metrics should not be tracked when --prometheus-collect-db-metrics is not enabled")
990985
}
991986
t.Logf("scanned %s", scanner.Text())
992987
}
993988
require.NoError(t, scanner.Err())
994989
require.True(t, hasActiveUsers)
995-
require.True(t, hasWorkspaces)
996990
})
997991

998992
t.Run("DBMetricsEnabled", func(t *testing.T) {

coderd/database/dbmem/dbmem.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,16 @@ func (q *FakeQuerier) convertToWorkspaceRowsNoLock(ctx context.Context, workspac
404404
break
405405
}
406406
}
407+
408+
if pj, err := q.getProvisionerJobByIDNoLock(ctx, build.JobID); err == nil {
409+
wr.LatestBuildStatus = pj.JobStatus
410+
}
411+
412+
wr.LatestBuildTransition = build.Transition
413+
}
414+
415+
if u, err := q.getUserByIDNoLock(w.OwnerID); err == nil {
416+
wr.Username = u.Username
407417
}
408418

409419
rows = append(rows, wr)

coderd/database/modelqueries.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,7 @@ func (q *sqlQuerier) GetAuthorizedWorkspaces(ctx context.Context, arg GetWorkspa
266266
&i.LatestBuildCanceledAt,
267267
&i.LatestBuildError,
268268
&i.LatestBuildTransition,
269+
&i.LatestBuildStatus,
269270
&i.Count,
270271
); err != nil {
271272
return nil, err

coderd/database/queries.sql.go

Lines changed: 34 additions & 30 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/workspaces.sql

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,8 @@ SELECT
9696
latest_build.completed_at as latest_build_completed_at,
9797
latest_build.canceled_at as latest_build_canceled_at,
9898
latest_build.error as latest_build_error,
99-
latest_build.transition as latest_build_transition
99+
latest_build.transition as latest_build_transition,
100+
latest_build.job_status as latest_build_status
100101
FROM
101102
workspaces
102103
JOIN
@@ -118,7 +119,7 @@ LEFT JOIN LATERAL (
118119
provisioner_jobs.job_status
119120
FROM
120121
workspace_builds
121-
LEFT JOIN
122+
JOIN
122123
provisioner_jobs
123124
ON
124125
provisioner_jobs.id = workspace_builds.job_id
@@ -374,7 +375,8 @@ WHERE
374375
'0001-01-01 00:00:00+00'::timestamptz, -- latest_build_completed_at,
375376
'0001-01-01 00:00:00+00'::timestamptz, -- latest_build_canceled_at,
376377
'', -- latest_build_error
377-
'start'::workspace_transition -- latest_build_transition
378+
'start'::workspace_transition, -- latest_build_transition
379+
'unknown'::provisioner_job_status -- latest_build_status
378380
WHERE
379381
@with_summary :: boolean = true
380382
), total_count AS (

coderd/prometheusmetrics/prometheusmetrics.go

Lines changed: 65 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,12 @@ import (
2424
"github.com/coder/coder/v2/tailnet"
2525
)
2626

27+
const defaultRefreshRate = time.Minute
28+
2729
// ActiveUsers tracks the number of users that have authenticated within the past hour.
2830
func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
2931
if duration == 0 {
30-
duration = 5 * time.Minute
32+
duration = defaultRefreshRate
3133
}
3234

3335
gauge := prometheus.NewGauge(prometheus.GaugeOpts{
@@ -72,36 +74,42 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab
7274
}
7375

7476
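// Workspaces tracks the number of latest workspace builds by status, and the latest build status of each workspace labelled by template, template version, owner, and transition.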
75-
func Workspaces(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
77+
func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
7678
if duration == 0 {
77-
duration = 5 * time.Minute
79+
duration = defaultRefreshRate
7880
}
7981

80-
gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
82+
workspaceLatestBuildTotals := prometheus.NewGaugeVec(prometheus.GaugeOpts{
8183
Namespace: "coderd",
8284
Subsystem: "api",
8385
Name: "workspace_latest_build_total",
84-
Help: "The latest workspace builds with a status.",
86+
Help: "The current number of workspace builds by status.",
8587
}, []string{"status"})
86-
err := registerer.Register(gauge)
87-
if err != nil {
88+
if err := registerer.Register(workspaceLatestBuildTotals); err != nil {
89+
return nil, err
90+
}
91+
92+
workspaceLatestBuildStatuses := prometheus.NewGaugeVec(prometheus.GaugeOpts{
93+
Namespace: "coderd",
94+
Name: "workspace_latest_build_status",
95+
Help: "The current workspace statuses by template, transition, and owner.",
96+
}, []string{"status", "template_name", "template_version", "workspace_owner", "workspace_transition"})
97+
if err := registerer.Register(workspaceLatestBuildStatuses); err != nil {
8898
return nil, err
8999
}
90-
// This exists so the prometheus metric exports immediately when set.
91-
// It helps with tests so they don't have to wait for a tick.
92-
gauge.WithLabelValues("pending").Set(0)
93100

94101
ctx, cancelFunc := context.WithCancel(ctx)
95102
done := make(chan struct{})
96103

97-
// Use time.Nanosecond to force an initial tick. It will be reset to the
98-
// correct duration after executing once.
99-
ticker := time.NewTicker(time.Nanosecond)
100-
doTick := func() {
101-
defer ticker.Reset(duration)
102-
104+
updateWorkspaceTotals := func() {
103105
builds, err := db.GetLatestWorkspaceBuilds(ctx)
104106
if err != nil {
107+
if errors.Is(err, sql.ErrNoRows) {
108+
// clear all series if there are no database entries
109+
workspaceLatestBuildTotals.Reset()
110+
}
111+
112+
logger.Warn(ctx, "failed to load latest workspace builds", slog.Error(err))
105113
return
106114
}
107115
jobIDs := make([]uuid.UUID, 0, len(builds))
@@ -110,16 +118,53 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
110118
}
111119
jobs, err := db.GetProvisionerJobsByIDs(ctx, jobIDs)
112120
if err != nil {
121+
ids := make([]string, 0, len(jobIDs))
122+
for _, id := range jobIDs {
123+
ids = append(ids, id.String())
124+
}
125+
126+
logger.Warn(ctx, "failed to load provisioner jobs", slog.F("ids", ids), slog.Error(err))
113127
return
114128
}
115129

116-
gauge.Reset()
130+
workspaceLatestBuildTotals.Reset()
117131
for _, job := range jobs {
118132
status := codersdk.ProvisionerJobStatus(job.JobStatus)
119-
gauge.WithLabelValues(string(status)).Add(1)
133+
workspaceLatestBuildTotals.WithLabelValues(string(status)).Add(1)
120134
}
121135
}
122136

137+
updateWorkspaceStatuses := func() {
138+
ws, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{
139+
Deleted: false,
140+
WithSummary: false,
141+
})
142+
if err != nil {
143+
if errors.Is(err, sql.ErrNoRows) {
144+
// clear all series if there are no database entries
145+
workspaceLatestBuildStatuses.Reset()
146+
}
147+
148+
logger.Warn(ctx, "failed to load active workspaces", slog.Error(err))
149+
return
150+
}
151+
152+
workspaceLatestBuildStatuses.Reset()
153+
for _, w := range ws {
154+
workspaceLatestBuildStatuses.WithLabelValues(string(w.LatestBuildStatus), w.TemplateName, w.TemplateVersionName.String, w.Username, string(w.LatestBuildTransition)).Add(1)
155+
}
156+
}
157+
158+
// Use time.Nanosecond to force an initial tick. It will be reset to the
159+
// correct duration after executing once.
160+
ticker := time.NewTicker(time.Nanosecond)
161+
doTick := func() {
162+
defer ticker.Reset(duration)
163+
164+
updateWorkspaceTotals()
165+
updateWorkspaceStatuses()
166+
}
167+
123168
go func() {
124169
defer close(done)
125170
defer ticker.Stop()
@@ -141,7 +186,7 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
141186
// Agents tracks agent-level metrics (not workspaces), beginning with the cached agents gauge created below.
142187
func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMapFn func() *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (func(), error) {
143188
if duration == 0 {
144-
duration = 1 * time.Minute
189+
duration = defaultRefreshRate
145190
}
146191

147192
agentsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
@@ -330,7 +375,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
330375

331376
func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration, aggregateByLabels []string) (func(), error) {
332377
if duration == 0 {
333-
duration = 1 * time.Minute
378+
duration = defaultRefreshRate
334379
}
335380

336381
if len(aggregateByLabels) == 0 {

0 commit comments

Comments
 (0)