Skip to content

Commit 843d650

Browse files
committed
Add metric to show operators what statuses workspaces are in, with relevant detail
Light refactoring

Signed-off-by: Danny Kopping <danny@coder.com>
1 parent f34592f commit 843d650

File tree

6 files changed

+100
-47
lines changed

6 files changed

+100
-47
lines changed

cli/server.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -209,7 +209,7 @@ func enablePrometheus(
209209
}
210210
afterCtx(ctx, closeUsersFunc)
211211

212-
closeWorkspacesFunc, err := prometheusmetrics.Workspaces(ctx, options.PrometheusRegistry, options.Database, 0)
212+
closeWorkspacesFunc, err := prometheusmetrics.Workspaces(ctx, options.Logger.Named("workspaces_metrics"), options.PrometheusRegistry, options.Database, 0)
213213
if err != nil {
214214
return nil, xerrors.Errorf("register workspaces prometheus metric: %w", err)
215215
}

coderd/database/modelqueries.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -266,6 +266,7 @@ func (q *sqlQuerier) GetAuthorizedWorkspaces(ctx context.Context, arg GetWorkspa
266266
&i.LatestBuildCanceledAt,
267267
&i.LatestBuildError,
268268
&i.LatestBuildTransition,
269+
&i.LatestBuildStatus,
269270
&i.Count,
270271
); err != nil {
271272
return nil, err

coderd/database/queries.sql.go

Lines changed: 33 additions & 29 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/workspaces.sql

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -96,7 +96,8 @@ SELECT
9696
latest_build.completed_at as latest_build_completed_at,
9797
latest_build.canceled_at as latest_build_canceled_at,
9898
latest_build.error as latest_build_error,
99-
latest_build.transition as latest_build_transition
99+
latest_build.transition as latest_build_transition,
100+
latest_build.job_status as latest_build_status
100101
FROM
101102
workspaces
102103
JOIN
@@ -374,7 +375,8 @@ WHERE
374375
'0001-01-01 00:00:00+00'::timestamptz, -- latest_build_completed_at,
375376
'0001-01-01 00:00:00+00'::timestamptz, -- latest_build_canceled_at,
376377
'', -- latest_build_error
377-
'start'::workspace_transition -- latest_build_transition
378+
'start'::workspace_transition, -- latest_build_transition
379+
'unknown'::provisioner_job_status -- latest_build_status
378380
WHERE
379381
@with_summary :: boolean = true
380382
), total_count AS (

coderd/prometheusmetrics/prometheusmetrics.go

Lines changed: 60 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -72,36 +72,42 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab
7272
}
7373

7474
// Workspaces tracks the total number of workspaces with labels on status.
75-
func Workspaces(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
75+
func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
7676
if duration == 0 {
7777
duration = 5 * time.Minute
7878
}
7979

80-
gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
80+
workspacesByStatus := prometheus.NewGaugeVec(prometheus.GaugeOpts{
8181
Namespace: "coderd",
8282
Subsystem: "api",
8383
Name: "workspace_latest_build_total",
84-
Help: "The latest workspace builds with a status.",
84+
Help: "The current number of workspace builds by status.",
8585
}, []string{"status"})
86-
err := registerer.Register(gauge)
87-
if err != nil {
86+
if err := registerer.Register(workspacesByStatus); err != nil {
87+
return nil, err
88+
}
89+
90+
workspacesDetail := prometheus.NewGaugeVec(prometheus.GaugeOpts{
91+
Namespace: "coderd",
92+
Subsystem: "api",
93+
Name: "workspace_detail",
94+
Help: "The current workspace details by template, transition, owner, and status.",
95+
}, []string{"status", "template_name", "template_version", "workspace_name", "workspace_owner", "workspace_transition"})
96+
if err := registerer.Register(workspacesDetail); err != nil {
8897
return nil, err
8998
}
9099
// This exists so the prometheus metric exports immediately when set.
91100
// It helps with tests so they don't have to wait for a tick.
92-
gauge.WithLabelValues("pending").Set(0)
101+
workspacesByStatus.WithLabelValues(string(database.ProvisionerJobStatusPending)).Set(0)
102+
workspacesDetail.WithLabelValues(string(database.ProvisionerJobStatusPending), "", "", "", "", "").Set(0)
93103

94104
ctx, cancelFunc := context.WithCancel(ctx)
95105
done := make(chan struct{})
96106

97-
// Use time.Nanosecond to force an initial tick. It will be reset to the
98-
// correct duration after executing once.
99-
ticker := time.NewTicker(time.Nanosecond)
100-
doTick := func() {
101-
defer ticker.Reset(duration)
102-
107+
updateWorkspacesByStatus := func() {
103108
builds, err := db.GetLatestWorkspaceBuilds(ctx)
104109
if err != nil {
110+
logger.Warn(ctx, "failed to load latest workspace builds", slog.Error(err))
105111
return
106112
}
107113
jobIDs := make([]uuid.UUID, 0, len(builds))
@@ -110,16 +116,56 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
110116
}
111117
jobs, err := db.GetProvisionerJobsByIDs(ctx, jobIDs)
112118
if err != nil {
119+
ids := make([]string, 0, len(jobIDs))
120+
for _, id := range jobIDs {
121+
ids = append(ids, id.String())
122+
}
123+
124+
logger.Warn(ctx, "failed to load provisioner jobs", slog.F("ids", ids), slog.Error(err))
113125
return
114126
}
115127

116-
gauge.Reset()
128+
workspacesByStatus.Reset()
117129
for _, job := range jobs {
118130
status := codersdk.ProvisionerJobStatus(job.JobStatus)
119-
gauge.WithLabelValues(string(status)).Add(1)
131+
workspacesByStatus.WithLabelValues(string(status)).Add(1)
120132
}
121133
}
122134

135+
updateWorkspacesDetail := func() {
136+
ws, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{
137+
Deleted: false,
138+
WithSummary: false,
139+
})
140+
if err != nil {
141+
logger.Warn(ctx, "failed to load active workspaces", slog.Error(err))
142+
return
143+
}
144+
145+
workspacesDetail.Reset()
146+
for _, w := range ws {
147+
// TODO: there may be a more elegant/idiomatic way to do this?
148+
buildStatus := string(database.ProvisionerJobStatusUnknown)
149+
if val, err := w.LatestBuildStatus.Value(); err == nil {
150+
if status, ok := val.(string); ok {
151+
buildStatus = status
152+
}
153+
}
154+
155+
workspacesDetail.WithLabelValues(buildStatus, w.TemplateName, w.TemplateVersionName.String, w.Name, w.Username, string(w.LatestBuildTransition)).Set(1)
156+
}
157+
}
158+
159+
// Use time.Nanosecond to force an initial tick. It will be reset to the
160+
// correct duration after executing once.
161+
ticker := time.NewTicker(time.Nanosecond)
162+
doTick := func() {
163+
defer ticker.Reset(duration)
164+
165+
updateWorkspacesByStatus()
166+
updateWorkspacesDetail()
167+
}
168+
123169
go func() {
124170
defer close(done)
125171
defer ticker.Stop()

coderd/prometheusmetrics/prometheusmetrics_test.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -229,7 +229,7 @@ func TestWorkspaces(t *testing.T) {
229229
t.Run(tc.Name, func(t *testing.T) {
230230
t.Parallel()
231231
registry := prometheus.NewRegistry()
232-
closeFunc, err := prometheusmetrics.Workspaces(context.Background(), registry, tc.Database(), time.Millisecond)
232+
closeFunc, err := prometheusmetrics.Workspaces(context.Background(), slogtest.Make(t, nil), registry, tc.Database(), time.Millisecond)
233233
require.NoError(t, err)
234234
t.Cleanup(closeFunc)
235235

0 commit comments

Comments
 (0)