Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Remove workspace cardinality; rename for clarity
Signed-off-by: Danny Kopping <danny@coder.com>
  • Loading branch information
dannykopping committed Mar 28, 2024
commit a333f983c6b0bc401271836aaceedd1feba2560d
27 changes: 13 additions & 14 deletions coderd/prometheusmetrics/prometheusmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,23 +79,22 @@ func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.R
duration = defaultRefreshRate
}

workspaceStatuses := prometheus.NewGaugeVec(prometheus.GaugeOpts{
workspaceStatusTotals := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "api",
Name: "workspace_latest_build_total",
Help: "The current number of workspace builds by status.",
}, []string{"status"})
if err := registerer.Register(workspaceStatuses); err != nil {
if err := registerer.Register(workspaceStatusTotals); err != nil {
return nil, err
}

workspaceDetails := prometheus.NewGaugeVec(prometheus.GaugeOpts{
workspaceStatuses := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "api",
Name: "workspace_detail",
Help: "The current workspace details by template, transition, owner, and status.",
}, []string{"status", "template_name", "template_version", "workspace_name", "workspace_owner", "workspace_transition"})
if err := registerer.Register(workspaceDetails); err != nil {
Name: "workspace_latest_build_status",
Help: "The current workspace statuses by template, transition, and owner.",
}, []string{"status", "template_name", "template_version", "workspace_owner", "workspace_transition"})
if err := registerer.Register(workspaceStatuses); err != nil {
return nil, err
}

Expand All @@ -107,7 +106,7 @@ func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.R
if err != nil {
if errors.Is(err, sql.ErrNoRows) {
// clear all series if there are no database entries
workspaceStatuses.Reset()
workspaceStatusTotals.Reset()
}

logger.Warn(ctx, "failed to load latest workspace builds", slog.Error(err))
Expand All @@ -128,10 +127,10 @@ func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.R
return
}

workspaceStatuses.Reset()
workspaceStatusTotals.Reset()
for _, job := range jobs {
status := codersdk.ProvisionerJobStatus(job.JobStatus)
workspaceStatuses.WithLabelValues(string(status)).Add(1)
workspaceStatusTotals.WithLabelValues(string(status)).Add(1)
}
}

Expand All @@ -143,16 +142,16 @@ func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.R
if err != nil {
if errors.Is(err, sql.ErrNoRows) {
// clear all series if there are no database entries
workspaceDetails.Reset()
workspaceStatuses.Reset()
}

logger.Warn(ctx, "failed to load active workspaces", slog.Error(err))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: maybe logger.Error?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wouldn't strictly call it an error because it could be an instance of sql.ErrNoRows which is valid but undesirable. To be the most correct I would make it warn in case of sql.ErrNoRows and error in all other cases, but that feels unnecessary.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

which is valid but undesirable

Will it also return sql.ErrNoRows if there are zero workspaces = fresh deployment?

Copy link
Contributor Author

@dannykopping dannykopping Mar 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Believe so, yes.
Or if all workspaces have been deleted.

return
}

workspaceDetails.Reset()
workspaceStatuses.Reset()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this has a race condition where if the registry collects the metrics before the for-loop below is complete, it will get partial data. This can result in glitchy stats were they are zeroed out for a single data point and then return to "normal." If anyone puts alerts on those stats it could result in false alerting in addition to visual noise on the dashboard.

I realize this is following the pattern of the existing metric, which has the same bug.

Instead these stats should be a custom collector that queries the database and returns the requested data on demand.

Copy link
Contributor Author

@dannykopping dannykopping Mar 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead these stats should be a custom collector that queries the database and returns the requested data on demand.
Definitely agree! Called this out in the description as well.

I'm not sure if we can fix this race without the change to the collector pattern. Do you have any ideas?
In reality the race would have to be on the nanosecond scale because this loop will iterate very quickly, but your point is absolutely valid.

I'll add a follow-up issue for this in any case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for _, w := range ws {
workspaceDetails.WithLabelValues(string(w.LatestBuildStatus), w.TemplateName, w.TemplateVersionName.String, w.Name, w.Username, string(w.LatestBuildTransition)).Set(1)
workspaceStatuses.WithLabelValues(string(w.LatestBuildStatus), w.TemplateName, w.TemplateVersionName.String, w.Username, string(w.LatestBuildTransition)).Add(1)
}
}

Expand Down