-
Notifications
You must be signed in to change notification settings - Fork 988
feat: expose workspace statuses (with details) as a prometheus metric #12762
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
843d650
2ed42a3
c31b498
2f5a948
fc61d37
6f95371
f42af07
2cb8ccc
b118044
acd104c
a333f98
c920508
cf14b9d
a94914f
8e6cde9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
Signed-off-by: Danny Kopping <danny@coder.com>
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -79,23 +79,22 @@ func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.R | |
duration = defaultRefreshRate | ||
} | ||
|
||
workspaceStatuses := prometheus.NewGaugeVec(prometheus.GaugeOpts{ | ||
workspaceStatusTotals := prometheus.NewGaugeVec(prometheus.GaugeOpts{ | ||
Namespace: "coderd", | ||
Subsystem: "api", | ||
Name: "workspace_latest_build_total", | ||
Help: "The current number of workspace builds by status.", | ||
}, []string{"status"}) | ||
if err := registerer.Register(workspaceStatuses); err != nil { | ||
if err := registerer.Register(workspaceStatusTotals); err != nil { | ||
return nil, err | ||
} | ||
|
||
workspaceDetails := prometheus.NewGaugeVec(prometheus.GaugeOpts{ | ||
workspaceStatuses := prometheus.NewGaugeVec(prometheus.GaugeOpts{ | ||
Namespace: "coderd", | ||
Subsystem: "api", | ||
Name: "workspace_detail", | ||
Help: "The current workspace details by template, transition, owner, and status.", | ||
}, []string{"status", "template_name", "template_version", "workspace_name", "workspace_owner", "workspace_transition"}) | ||
if err := registerer.Register(workspaceDetails); err != nil { | ||
Name: "workspace_latest_build_status", | ||
Help: "The current workspace statuses by template, transition, and owner.", | ||
}, []string{"status", "template_name", "template_version", "workspace_owner", "workspace_transition"}) | ||
if err := registerer.Register(workspaceStatuses); err != nil { | ||
return nil, err | ||
} | ||
|
||
|
@@ -107,7 +106,7 @@ func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.R | |
if err != nil { | ||
if errors.Is(err, sql.ErrNoRows) { | ||
// clear all series if there are no database entries | ||
workspaceStatuses.Reset() | ||
workspaceStatusTotals.Reset() | ||
} | ||
|
||
logger.Warn(ctx, "failed to load latest workspace builds", slog.Error(err)) | ||
|
@@ -128,10 +127,10 @@ func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.R | |
return | ||
} | ||
|
||
workspaceStatuses.Reset() | ||
workspaceStatusTotals.Reset() | ||
for _, job := range jobs { | ||
status := codersdk.ProvisionerJobStatus(job.JobStatus) | ||
workspaceStatuses.WithLabelValues(string(status)).Add(1) | ||
workspaceStatusTotals.WithLabelValues(string(status)).Add(1) | ||
} | ||
} | ||
|
||
|
@@ -143,16 +142,16 @@ func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.R | |
if err != nil { | ||
if errors.Is(err, sql.ErrNoRows) { | ||
// clear all series if there are no database entries | ||
workspaceDetails.Reset() | ||
workspaceStatuses.Reset() | ||
} | ||
|
||
logger.Warn(ctx, "failed to load active workspaces", slog.Error(err)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: maybe There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wouldn't strictly call it an error because it could be an instance of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Will it also return There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Believe so, yes. |
||
return | ||
} | ||
|
||
workspaceDetails.Reset() | ||
workspaceStatuses.Reset() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this has a race condition where if the registry collects the metrics before the for-loop below is complete, it will get partial data. This can result in glitchy stats were they are zeroed out for a single data point and then return to "normal." If anyone puts alerts on those stats it could result in false alerting in addition to visual noise on the dashboard. I realize this is following the pattern of the existing metric, which has the same bug. Instead these stats should be a custom collector that queries the database and returns the requested data on demand. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I'm not sure if we can fix this race without the change to the collector pattern. Do you have any ideas? I'll add a follow-up issue for this in any case. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
for _, w := range ws { | ||
workspaceDetails.WithLabelValues(string(w.LatestBuildStatus), w.TemplateName, w.TemplateVersionName.String, w.Name, w.Username, string(w.LatestBuildTransition)).Set(1) | ||
workspaceStatuses.WithLabelValues(string(w.LatestBuildStatus), w.TemplateName, w.TemplateVersionName.String, w.Username, string(w.LatestBuildTransition)).Add(1) | ||
} | ||
} | ||
|
||
|
Uh oh!
There was an error while loading. Please reload this page.