-
Notifications
You must be signed in to change notification settings - Fork 894
feat: expose workspace statuses (with details) as a prometheus metric #12762
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
843d650
2ed42a3
c31b498
2f5a948
fc61d37
6f95371
f42af07
2cb8ccc
b118044
acd104c
a333f98
c920508
cf14b9d
a94914f
8e6cde9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,10 +24,12 @@ import ( | |
"github.com/coder/coder/v2/tailnet" | ||
) | ||
|
||
const defaultRefreshRate = time.Minute | ||
|
||
// ActiveUsers tracks the number of users that have authenticated within the past hour. | ||
func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) { | ||
if duration == 0 { | ||
duration = 5 * time.Minute | ||
duration = defaultRefreshRate | ||
} | ||
|
||
gauge := prometheus.NewGauge(prometheus.GaugeOpts{ | ||
|
@@ -72,36 +74,43 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab | |
} | ||
|
||
// Workspaces tracks the total number of workspaces with labels on status. | ||
func Workspaces(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) { | ||
func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) { | ||
if duration == 0 { | ||
duration = 5 * time.Minute | ||
duration = defaultRefreshRate | ||
} | ||
|
||
gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ | ||
workspaceStatuses := prometheus.NewGaugeVec(prometheus.GaugeOpts{ | ||
Namespace: "coderd", | ||
Subsystem: "api", | ||
Name: "workspace_latest_build_total", | ||
Help: "The latest workspace builds with a status.", | ||
Help: "The current number of workspace builds by status.", | ||
}, []string{"status"}) | ||
err := registerer.Register(gauge) | ||
if err != nil { | ||
if err := registerer.Register(workspaceStatuses); err != nil { | ||
return nil, err | ||
} | ||
|
||
workspaceDetails := prometheus.NewGaugeVec(prometheus.GaugeOpts{ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Thinking out loud: during the last scale tests we hit 2000 workspaces. Does that mean we're going to swamp our Prometheus endpoint with details for all of them? If so, maybe we should make it configurable. cc @johnstcn
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I think a time-series per workspace is excessive here. Operators want to know general counts of workspaces in different states. If they want to know specific workspace names, they can just hit our API or look at the […]. This is especially true since the metrics will be reported by each Coderd, so if you have M workspaces and N Coderd replicas, that's M*N time-series that Prometheus needs to track.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
That's fair. I based the decision to include it on the original request, so I'll defer to @bpmct before removing it.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
See also Spike's comment: #12462 (comment)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Oh wow, it seems I misunderstood the initial ask. I was under the incorrect assumption that @bpmct wanted a metric to match the format described in the issue, and "the builds metric" which he was referring to was the existing […].
Plus, we don't have to worry about cardinality because these metrics are written by the provisioner when executing a job, not read from the database state. I have amended the new metric to drop the […]. I think this will satisfy the requirements, but @bpmct please confirm.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Yep! |
||
Namespace: "coderd", | ||
Subsystem: "api", | ||
Name: "workspace_detail", | ||
mtojek marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Help: "The current workspace details by template, transition, owner, and status.", | ||
}, []string{"status", "template_name", "template_version", "workspace_name", "workspace_owner", "workspace_transition"}) | ||
if err := registerer.Register(workspaceDetails); err != nil { | ||
return nil, err | ||
} | ||
// This exists so the prometheus metric exports immediately when set. | ||
// It helps with tests so they don't have to wait for a tick. | ||
gauge.WithLabelValues("pending").Set(0) | ||
|
||
ctx, cancelFunc := context.WithCancel(ctx) | ||
done := make(chan struct{}) | ||
|
||
// Use time.Nanosecond to force an initial tick. It will be reset to the | ||
// correct duration after executing once. | ||
ticker := time.NewTicker(time.Nanosecond) | ||
doTick := func() { | ||
defer ticker.Reset(duration) | ||
|
||
updateWorkspaceStatuses := func() { | ||
builds, err := db.GetLatestWorkspaceBuilds(ctx) | ||
if err != nil { | ||
if errors.Is(err, sql.ErrNoRows) { | ||
// clear all series if there are no database entries | ||
workspaceStatuses.Reset() | ||
} | ||
|
||
logger.Warn(ctx, "failed to load latest workspace builds", slog.Error(err)) | ||
return | ||
} | ||
jobIDs := make([]uuid.UUID, 0, len(builds)) | ||
|
@@ -110,14 +119,59 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa | |
} | ||
jobs, err := db.GetProvisionerJobsByIDs(ctx, jobIDs) | ||
if err != nil { | ||
ids := make([]string, 0, len(jobIDs)) | ||
for _, id := range jobIDs { | ||
ids = append(ids, id.String()) | ||
} | ||
|
||
logger.Warn(ctx, "failed to load provisioner jobs", slog.F("ids", ids), slog.Error(err)) | ||
return | ||
} | ||
|
||
gauge.Reset() | ||
workspaceStatuses.Reset() | ||
for _, job := range jobs { | ||
status := codersdk.ProvisionerJobStatus(job.JobStatus) | ||
gauge.WithLabelValues(string(status)).Add(1) | ||
workspaceStatuses.WithLabelValues(string(status)).Add(1) | ||
} | ||
} | ||
|
||
updateWorkspaceDetails := func() { | ||
ws, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{ | ||
Deleted: false, | ||
WithSummary: false, | ||
}) | ||
if err != nil { | ||
if errors.Is(err, sql.ErrNoRows) { | ||
// clear all series if there are no database entries | ||
workspaceDetails.Reset() | ||
} | ||
|
||
logger.Warn(ctx, "failed to load active workspaces", slog.Error(err)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
nit: maybe […]
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I wouldn't strictly call it an error because it could be an instance of […]
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Will it also return […]
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Believe so, yes. |
||
return | ||
} | ||
|
||
workspaceDetails.Reset() | ||
for _, w := range ws { | ||
// TODO: there may be a more elegant/idiomatic way to do this? | ||
buildStatus := string(database.ProvisionerJobStatusUnknown) | ||
if val, err := w.LatestBuildStatus.Value(); err == nil { | ||
if status, ok := val.(string); ok { | ||
buildStatus = status | ||
} | ||
} | ||
|
||
workspaceDetails.WithLabelValues(buildStatus, w.TemplateName, w.TemplateVersionName.String, w.Name, w.Username, string(w.LatestBuildTransition)).Set(1) | ||
} | ||
} | ||
|
||
// Use time.Nanosecond to force an initial tick. It will be reset to the | ||
// correct duration after executing once. | ||
ticker := time.NewTicker(time.Nanosecond) | ||
doTick := func() { | ||
defer ticker.Reset(duration) | ||
|
||
updateWorkspaceStatuses() | ||
updateWorkspaceDetails() | ||
} | ||
|
||
go func() { | ||
|
@@ -141,7 +195,7 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa | |
// Agents tracks the total number of workspaces with labels on status. | ||
func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMapFn func() *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (func(), error) { | ||
if duration == 0 { | ||
duration = 1 * time.Minute | ||
duration = defaultRefreshRate | ||
} | ||
|
||
agentsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ | ||
|
@@ -330,7 +384,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis | |
|
||
func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration, aggregateByLabels []string) (func(), error) { | ||
if duration == 0 { | ||
duration = 1 * time.Minute | ||
duration = defaultRefreshRate | ||
} | ||
|
||
if len(aggregateByLabels) == 0 { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Side-effect of clearing the gauge when no db rows are loaded.
This wasn't strictly necessary for the test since other manually-registered metrics are included to validate this test.