feat: expose workspace statuses (with details) as a prometheus metric #12762


Merged (15 commits) on Apr 2, 2024
Changes from 6 commits
2 changes: 1 addition & 1 deletion cli/server.go
@@ -209,7 +209,7 @@ func enablePrometheus(
}
afterCtx(ctx, closeUsersFunc)

closeWorkspacesFunc, err := prometheusmetrics.Workspaces(ctx, options.PrometheusRegistry, options.Database, 0)
closeWorkspacesFunc, err := prometheusmetrics.Workspaces(ctx, options.Logger.Named("workspaces_metrics"), options.PrometheusRegistry, options.Database, 0)
if err != nil {
return nil, xerrors.Errorf("register workspaces prometheus metric: %w", err)
}
6 changes: 0 additions & 6 deletions cli/server_test.go
@@ -973,26 +973,20 @@ func TestServer(t *testing.T) {

scanner := bufio.NewScanner(res.Body)
hasActiveUsers := false
hasWorkspaces := false
for scanner.Scan() {
// This metric is manually registered to be tracked in the server. That's
// why we test it's tracked here.
if strings.HasPrefix(scanner.Text(), "coderd_api_active_users_duration_hour") {
hasActiveUsers = true
continue
}
if strings.HasPrefix(scanner.Text(), "coderd_api_workspace_latest_build_total") {
hasWorkspaces = true
continue
}
Comment on lines -984 to -987 (Contributor Author):

Side-effect of clearing the gauge when no db rows are loaded.
This wasn't strictly necessary for the test since other manually-registered metrics are included to validate this test.
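
For context, a minimal client_golang sketch (not part of the diff) of the side effect described above: once the collector calls Reset() after an empty query result, the GaugeVec has no child series left, so the coderd_api_workspace_latest_build_total line disappears from the scrape output entirely, and an assertion on its presence could fail spuriously on a fresh deployment.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	reg := prometheus.NewRegistry()
	gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: "coderd_api_workspace_latest_build_total",
		Help: "Example of a vector gauge being reset.",
	}, []string{"status"})
	reg.MustRegister(gauge)

	gauge.WithLabelValues("pending").Set(0)
	families, _ := reg.Gather()
	fmt.Println(len(families)) // 1: the "pending" series would appear in a scrape

	// Reset drops every child series; the next Gather (or scrape) emits no
	// lines for this metric at all.
	gauge.Reset()
	families, _ = reg.Gather()
	fmt.Println(len(families)) // 0
}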

if strings.HasPrefix(scanner.Text(), "coderd_db_query_latencies_seconds") {
t.Fatal("db metrics should not be tracked when --prometheus-collect-db-metrics is not enabled")
}
t.Logf("scanned %s", scanner.Text())
}
require.NoError(t, scanner.Err())
require.True(t, hasActiveUsers)
require.True(t, hasWorkspaces)
})

t.Run("DBMetricsEnabled", func(t *testing.T) {
10 changes: 10 additions & 0 deletions coderd/database/dbmem/dbmem.go
@@ -403,6 +403,16 @@ func (q *FakeQuerier) convertToWorkspaceRowsNoLock(ctx context.Context, workspac
break
}
}

if pj, err := q.GetProvisionerJobByID(ctx, build.JobID); err == nil {
wr.LatestBuildStatus = database.NullProvisionerJobStatus{ProvisionerJobStatus: pj.JobStatus, Valid: true}
}

wr.LatestBuildTransition = build.Transition
}

if u, err := q.GetUserByID(ctx, w.OwnerID); err == nil {
wr.Username = u.Username
}

rows = append(rows, wr)
1 change: 1 addition & 0 deletions coderd/database/modelqueries.go
@@ -266,6 +266,7 @@ func (q *sqlQuerier) GetAuthorizedWorkspaces(ctx context.Context, arg GetWorkspa
&i.LatestBuildCanceledAt,
&i.LatestBuildError,
&i.LatestBuildTransition,
&i.LatestBuildStatus,
&i.Count,
); err != nil {
return nil, err
62 changes: 33 additions & 29 deletions coderd/database/queries.sql.go

Some generated files are not rendered by default.

6 changes: 4 additions & 2 deletions coderd/database/queries/workspaces.sql
@@ -96,7 +96,8 @@ SELECT
latest_build.completed_at as latest_build_completed_at,
latest_build.canceled_at as latest_build_canceled_at,
latest_build.error as latest_build_error,
latest_build.transition as latest_build_transition
latest_build.transition as latest_build_transition,
latest_build.job_status as latest_build_status
FROM
workspaces
JOIN
@@ -374,7 +375,8 @@ WHERE
'0001-01-01 00:00:00+00'::timestamptz, -- latest_build_completed_at,
'0001-01-01 00:00:00+00'::timestamptz, -- latest_build_canceled_at,
'', -- latest_build_error
'start'::workspace_transition -- latest_build_transition
'start'::workspace_transition, -- latest_build_transition
'unknown'::provisioner_job_status -- latest_build_status
WHERE
@with_summary :: boolean = true
), total_count AS (
94 changes: 74 additions & 20 deletions coderd/prometheusmetrics/prometheusmetrics.go
@@ -24,10 +24,12 @@ import (
"github.com/coder/coder/v2/tailnet"
)

const defaultRefreshRate = time.Minute

// ActiveUsers tracks the number of users that have authenticated within the past hour.
func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
if duration == 0 {
duration = 5 * time.Minute
duration = defaultRefreshRate
}

gauge := prometheus.NewGauge(prometheus.GaugeOpts{
@@ -72,36 +74,43 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab
}

// Workspaces tracks the total number of workspaces with labels on status.
func Workspaces(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
if duration == 0 {
duration = 5 * time.Minute
duration = defaultRefreshRate
}

gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
workspaceStatuses := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "api",
Name: "workspace_latest_build_total",
Help: "The latest workspace builds with a status.",
Help: "The current number of workspace builds by status.",
}, []string{"status"})
err := registerer.Register(gauge)
if err != nil {
if err := registerer.Register(workspaceStatuses); err != nil {
return nil, err
}

workspaceDetails := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Member:

Thinking out loud:

During the last scale tests we hit 2000 workspaces. Does that mean we're going to swamp our Prometheus endpoint with details for all of them? If so, maybe we should make it configurable? cc @johnstcn

Contributor:

I think a time-series per workspace is excessive here. Operators want to know general counts of workspaces in different states. If they want to know specific workspace names they can just hit our API or look at the /workspaces page in our frontend.

This is especially true since the metrics will be reported by each Coderd, so if you have M workspaces and N Coderd replicas, that's M*N time-series that prometheus needs to track.

Contributor Author:

That's fair. I based the decision to include it on the original request, so I'll defer to @bpmct before removing it.

Member (@johnstcn, Mar 27, 2024):

See also Spike's comment: #12462 (comment)

I think what they want is a GAUGE that tracks the current number of workspaces in a particular state, not a counter of builds.

Contributor Author:

Oh wow, it seems I misunderstood the initial ask.

I was under the incorrect assumption that @bpmct wanted a metric matching the format described in the issue, and that "the builds metric" he was referring to was the existing coderd_api_workspace_latest_build_total metric 🤦

coderd_workspace_builds_total already exists!

# HELP coderd_workspace_builds_total The number of workspaces started, updated, or deleted.
# TYPE coderd_workspace_builds_total counter
coderd_workspace_builds_total{status="failed",template_name="docker",template_version="vibrant_davinci2",workspace_name="asfsasfafs",workspace_owner="danny",workspace_transition="START"} 1
coderd_workspace_builds_total{status="success",template_name="docker",template_version="beautiful_visvesvaraya1",workspace_name="2",workspace_owner="danny",workspace_transition="START"} 1
coderd_workspace_builds_total{status="success",template_name="docker",template_version="beautiful_visvesvaraya1",workspace_name="asfsasfafs",workspace_owner="danny",workspace_transition="START"} 1
coderd_workspace_builds_total{status="success",template_name="docker",template_version="beautiful_visvesvaraya1",workspace_name="asfsasfafs",workspace_owner="danny",workspace_transition="STOP"} 1
coderd_workspace_builds_total{status="success",template_name="docker",template_version="mystifying_driscoll3",workspace_name="2",workspace_owner="danny",workspace_transition="STOP"} 1

Plus, we don't have to worry about cardinality because these metrics are written by the provisioner when executing a job rather than read from the database state.

I have amended the new metric to drop the workspace label.

I think this will satisfy the requirements but @bpmct please confirm.

# HELP coderd_workspace_latest_build_status The current workspace statuses by template, transition, and owner.
# TYPE coderd_workspace_latest_build_status gauge
coderd_workspace_latest_build_status{status="failed",template_name="docker",template_version="happy_feynman7",workspace_owner="danny",workspace_transition="stop"} 1
coderd_workspace_latest_build_status{status="succeeded",template_name="docker",template_version="beautiful_visvesvaraya1",workspace_owner="danny",workspace_transition="start"
} 2
coderd_workspace_latest_build_status{status="succeeded",template_name="docker",template_version="distracted_hopper6",workspace_owner="danny",workspace_transition="start"} 1
coderd_workspace_latest_build_status{status="succeeded",template_name="docker",template_version="ecstatic_greider3",workspace_owner="danny",workspace_transition="start"} 1

Member:

Quoting "I think this will satisfy the requirements but @bpmct please confirm":

Yep!
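
As a rough sketch only (the hunk below still reflects commit 6, which keeps the workspace_name label), the amended gauge described above drops the per-workspace label so cardinality is bounded by templates, owners, and transitions rather than by workspace count. Metric and label names follow the sample output in this thread, not necessarily the final merged code.

// Sketch of the amended label set discussed above; not part of this hunk.
latestBuildStatus := prometheus.NewGaugeVec(prometheus.GaugeOpts{
	Namespace: "coderd",
	Name:      "workspace_latest_build_status",
	Help:      "The current workspace statuses by template, transition, and owner.",
}, []string{"status", "template_name", "template_version", "workspace_owner", "workspace_transition"})
_ = latestBuildStatus // would be registered and updated the same way as workspaceDetails below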

Namespace: "coderd",
Subsystem: "api",
Name: "workspace_detail",
Help: "The current workspace details by template, transition, owner, and status.",
}, []string{"status", "template_name", "template_version", "workspace_name", "workspace_owner", "workspace_transition"})
if err := registerer.Register(workspaceDetails); err != nil {
return nil, err
}
// This exists so the prometheus metric exports immediately when set.
// It helps with tests so they don't have to wait for a tick.
gauge.WithLabelValues("pending").Set(0)

ctx, cancelFunc := context.WithCancel(ctx)
done := make(chan struct{})

// Use time.Nanosecond to force an initial tick. It will be reset to the
// correct duration after executing once.
ticker := time.NewTicker(time.Nanosecond)
doTick := func() {
defer ticker.Reset(duration)

updateWorkspaceStatuses := func() {
builds, err := db.GetLatestWorkspaceBuilds(ctx)
if err != nil {
if errors.Is(err, sql.ErrNoRows) {
// clear all series if there are no database entries
workspaceStatuses.Reset()
}

logger.Warn(ctx, "failed to load latest workspace builds", slog.Error(err))
return
}
jobIDs := make([]uuid.UUID, 0, len(builds))
@@ -110,14 +119,59 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
}
jobs, err := db.GetProvisionerJobsByIDs(ctx, jobIDs)
if err != nil {
ids := make([]string, 0, len(jobIDs))
for _, id := range jobIDs {
ids = append(ids, id.String())
}

logger.Warn(ctx, "failed to load provisioner jobs", slog.F("ids", ids), slog.Error(err))
return
}

gauge.Reset()
workspaceStatuses.Reset()
for _, job := range jobs {
status := codersdk.ProvisionerJobStatus(job.JobStatus)
gauge.WithLabelValues(string(status)).Add(1)
workspaceStatuses.WithLabelValues(string(status)).Add(1)
}
}

updateWorkspaceDetails := func() {
ws, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{
Deleted: false,
WithSummary: false,
})
if err != nil {
if errors.Is(err, sql.ErrNoRows) {
// clear all series if there are no database entries
workspaceDetails.Reset()
}

logger.Warn(ctx, "failed to load active workspaces", slog.Error(err))
Member:

nit: maybe logger.Error?

Contributor Author:

I wouldn't strictly call it an error because it could be an instance of sql.ErrNoRows which is valid but undesirable. To be the most correct I would make it warn in case of sql.ErrNoRows and error in all other cases, but that feels unnecessary.

Member, quoting "which is valid but undesirable":

Will it also return sql.ErrNoRows if there are zero workspaces = fresh deployment?

Contributor Author (@dannykopping, Mar 28, 2024):

Believe so, yes.
Or if all workspaces have been deleted.
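
For what it's worth, a small sketch of the stricter warn-vs-error split the author describes above as feeling unnecessary; this is hypothetical, not part of the PR, and assumes the same cdr.dev/slog logger, prometheus GaugeVec, and database/sql imports already used in this file.

// handleEmptyOrFailedQuery treats the benign empty-result case as a warning
// and everything else as an error.
func handleEmptyOrFailedQuery(ctx context.Context, logger slog.Logger, gauge *prometheus.GaugeVec, err error) {
	if errors.Is(err, sql.ErrNoRows) {
		// Fresh deployment, or every workspace has been deleted: clear the
		// exported series and note it at a lower severity.
		gauge.Reset()
		logger.Warn(ctx, "no workspaces found", slog.Error(err))
		return
	}
	logger.Error(ctx, "failed to load active workspaces", slog.Error(err))
}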

return
}

workspaceDetails.Reset()
for _, w := range ws {
// TODO: there may be a more elegant/idiomatic way to do this?
buildStatus := string(database.ProvisionerJobStatusUnknown)
if val, err := w.LatestBuildStatus.Value(); err == nil {
if status, ok := val.(string); ok {
buildStatus = status
}
}

workspaceDetails.WithLabelValues(buildStatus, w.TemplateName, w.TemplateVersionName.String, w.Name, w.Username, string(w.LatestBuildTransition)).Set(1)
}
}
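
// (Illustrative sketch, not part of this diff.) One possibly more idiomatic
// answer to the TODO above: read the NullProvisionerJobStatus fields directly
// instead of going through the driver.Valuer interface. Field names follow
// the dbmem change earlier in this PR.
func buildStatusLabel(s database.NullProvisionerJobStatus) string {
	if s.Valid {
		return string(s.ProvisionerJobStatus)
	}
	return string(database.ProvisionerJobStatusUnknown)
}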

// Use time.Nanosecond to force an initial tick. It will be reset to the
// correct duration after executing once.
ticker := time.NewTicker(time.Nanosecond)
doTick := func() {
defer ticker.Reset(duration)

updateWorkspaceStatuses()
updateWorkspaceDetails()
}

go func() {
@@ -141,7 +195,7 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
// Agents tracks the total number of workspaces with labels on status.
func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMapFn func() *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (func(), error) {
if duration == 0 {
duration = 1 * time.Minute
duration = defaultRefreshRate
}

agentsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
@@ -330,7 +384,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis

func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration, aggregateByLabels []string) (func(), error) {
if duration == 0 {
duration = 1 * time.Minute
duration = defaultRefreshRate
}

if len(aggregateByLabels) == 0 {