diff --git a/cli/server.go b/cli/server.go
index b29b39b05fb4a..f19c0df86fd1b 100644
--- a/cli/server.go
+++ b/cli/server.go
@@ -212,10 +212,16 @@ func enablePrometheus(
 	options.PrometheusRegistry.MustRegister(collectors.NewGoCollector())
 	options.PrometheusRegistry.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
 
-	closeUsersFunc, err := prometheusmetrics.ActiveUsers(ctx, options.PrometheusRegistry, options.Database, 0)
+	closeActiveUsersFunc, err := prometheusmetrics.ActiveUsers(ctx, options.Logger.Named("active_user_metrics"), options.PrometheusRegistry, options.Database, 0)
 	if err != nil {
 		return nil, xerrors.Errorf("register active users prometheus metric: %w", err)
 	}
+	afterCtx(ctx, closeActiveUsersFunc)
+
+	closeUsersFunc, err := prometheusmetrics.Users(ctx, options.Logger.Named("user_metrics"), quartz.NewReal(), options.PrometheusRegistry, options.Database, 0)
+	if err != nil {
+		return nil, xerrors.Errorf("register users prometheus metric: %w", err)
+	}
 	afterCtx(ctx, closeUsersFunc)
 
 	closeWorkspacesFunc, err := prometheusmetrics.Workspaces(ctx, options.Logger.Named("workspaces_metrics"), options.PrometheusRegistry, options.Database, 0)
diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go
index ebd50ff0f42ce..ccd88a9e3fc1d 100644
--- a/coderd/prometheusmetrics/prometheusmetrics.go
+++ b/coderd/prometheusmetrics/prometheusmetrics.go
@@ -12,6 +12,7 @@ import (
 
 	"github.com/google/uuid"
 	"github.com/prometheus/client_golang/prometheus"
+	"golang.org/x/xerrors"
 	"tailscale.com/tailcfg"
 
 	"cdr.dev/slog"
@@ -22,12 +23,13 @@ import (
 	"github.com/coder/coder/v2/coderd/database/dbtime"
 	"github.com/coder/coder/v2/codersdk"
 	"github.com/coder/coder/v2/tailnet"
+	"github.com/coder/quartz"
 )
 
 const defaultRefreshRate = time.Minute
 
 // ActiveUsers tracks the number of users that have authenticated within the past hour.
-func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
+func ActiveUsers(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
 	if duration == 0 {
 		duration = defaultRefreshRate
 	}
@@ -58,6 +60,7 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab
 
 			apiKeys, err := db.GetAPIKeysLastUsedAfter(ctx, dbtime.Now().Add(-1*time.Hour))
 			if err != nil {
+				logger.Error(ctx, "get api keys for active users prometheus metric", slog.Error(err))
 				continue
 			}
 			distinctUsers := map[uuid.UUID]struct{}{}
@@ -73,6 +76,69 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab
 	}, nil
 }
 
+// Users tracks the total number of registered users, partitioned by status.
+// The gauge is refreshed on every tick of a clk ticker with the given
+// duration; a zero duration selects five times defaultRefreshRate. The
+// returned function stops the refresh loop and waits for it to exit.
+func Users(ctx context.Context, logger slog.Logger, clk quartz.Clock, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
+	if duration == 0 {
+		// It's not super important this tracks real-time.
+		duration = defaultRefreshRate * 5
+	}
+
+	gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: "coderd",
+		Subsystem: "api",
+		Name:      "total_user_count",
+		Help:      "The total number of registered users, partitioned by status.",
+	}, []string{"status"})
+	err := registerer.Register(gauge)
+	if err != nil {
+		return nil, xerrors.Errorf("register total_user_count gauge: %w", err)
+	}
+
+	ctx, cancelFunc := context.WithCancel(ctx)
+	done := make(chan struct{})
+	ticker := clk.NewTicker(duration)
+	go func() {
+		defer close(done)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-ticker.C:
+			}
+
+			//nolint:gocritic // This is a system service that needs full access
+			//to the users table.
+			users, err := db.GetUsers(dbauthz.AsSystemRestricted(ctx), database.GetUsersParams{})
+			if err != nil {
+				// Keep the previously published values: wiping the gauge on
+				// a transient database error would make the series vanish.
+				logger.Error(ctx, "get all users for prometheus metrics", slog.Error(err))
+				continue
+			}
+
+			// Tally before touching the gauge so that the window in which a
+			// scrape can observe an empty or partial vector stays minimal.
+			counts := make(map[string]int)
+			for _, user := range users {
+				counts[string(user.Status)]++
+			}
+
+			gauge.Reset()
+			for status, count := range counts {
+				gauge.WithLabelValues(status).Set(float64(count))
+			}
+		}
+	}()
+	return func() {
+		cancelFunc()
+		<-done
+	}, nil
+}
+
 // Workspaces tracks the total number of workspaces with labels on status.
 func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
 	if duration == 0 {
diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go
index 1c904d9f342e2..84aeda148662e 100644
--- a/coderd/prometheusmetrics/prometheusmetrics_test.go
+++ b/coderd/prometheusmetrics/prometheusmetrics_test.go
@@ -38,6 +38,7 @@ import (
 	"github.com/coder/coder/v2/tailnet"
 	"github.com/coder/coder/v2/tailnet/tailnettest"
 	"github.com/coder/coder/v2/testutil"
+	"github.com/coder/quartz"
 )
 
 func TestActiveUsers(t *testing.T) {
@@ -98,7 +99,7 @@ func TestActiveUsers(t *testing.T) {
 		t.Run(tc.Name, func(t *testing.T) {
 			t.Parallel()
 			registry := prometheus.NewRegistry()
-			closeFunc, err := prometheusmetrics.ActiveUsers(context.Background(), registry, tc.Database(t), time.Millisecond)
+			closeFunc, err := prometheusmetrics.ActiveUsers(context.Background(), slogtest.Make(t, nil), registry, tc.Database(t), time.Millisecond)
 			require.NoError(t, err)
 			t.Cleanup(closeFunc)
 
@@ -112,6 +113,106 @@ func TestActiveUsers(t *testing.T) {
 	}
 }
 
+func TestUsers(t *testing.T) {
+	t.Parallel()
+
+	for _, tc := range []struct {
+		Name     string
+		Database func(t *testing.T) database.Store
+		Count    map[database.UserStatus]int
+	}{{
+		Name: "None",
+		Database: func(t *testing.T) database.Store {
+			return dbmem.New()
+		},
+		Count: map[database.UserStatus]int{},
+	}, {
+		Name: "One",
+		Database: func(t *testing.T) database.Store {
+			db := dbmem.New()
+			dbgen.User(t, db, database.User{Status: database.UserStatusActive})
+			return db
+		},
+		Count: map[database.UserStatus]int{database.UserStatusActive: 1},
+	}, {
+		Name: "MultipleStatuses",
+		Database: func(t *testing.T) database.Store {
+			db := dbmem.New()
+
+			dbgen.User(t, db, database.User{Status: database.UserStatusActive})
+			dbgen.User(t, db, database.User{Status: database.UserStatusDormant})
+
+			return db
+		},
+		Count: map[database.UserStatus]int{database.UserStatusActive: 1, database.UserStatusDormant: 1},
+	}, {
+		Name: "MultipleActive",
+		Database: func(t *testing.T) database.Store {
+			db := dbmem.New()
+			dbgen.User(t, db, database.User{Status: database.UserStatusActive})
+			dbgen.User(t, db, database.User{Status: database.UserStatusActive})
+			dbgen.User(t, db, database.User{Status: database.UserStatusActive})
+			return db
+		},
+		Count: map[database.UserStatus]int{database.UserStatusActive: 3},
+	}} {
+		tc := tc
+		t.Run(tc.Name, func(t *testing.T) {
+			t.Parallel()
+			ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
+			defer cancel()
+
+			registry := prometheus.NewRegistry()
+			mClock := quartz.NewMock(t)
+			db := tc.Database(t)
+			closeFunc, err := prometheusmetrics.Users(context.Background(), slogtest.Make(t, nil), mClock, registry, db, time.Millisecond)
+			require.NoError(t, err)
+			t.Cleanup(closeFunc)
+
+			_, w := mClock.AdvanceNext()
+			w.MustWait(ctx)
+
+			checkFn := func() bool {
+				metrics, err := registry.Gather()
+				if err != nil {
+					return false
+				}
+
+				// If we get no metrics and we know none should exist, bail
+				// early. If we get no metrics but we expect some, retry.
+				if len(metrics) == 0 {
+					return len(tc.Count) == 0
+				}
+
+				// Every expected status must be present: without this, a
+				// stale gauge that simply lacks a status would still pass.
+				if len(metrics[0].Metric) != len(tc.Count) {
+					return false
+				}
+
+				for _, metric := range metrics[0].Metric {
+					if tc.Count[database.UserStatus(*metric.Label[0].Value)] != int(metric.Gauge.GetValue()) {
+						return false
+					}
+				}
+
+				return true
+			}
+
+			require.Eventually(t, checkFn, testutil.WaitShort, testutil.IntervalFast)
+
+			// Add another dormant user and ensure it updates
+			dbgen.User(t, db, database.User{Status: database.UserStatusDormant})
+			tc.Count[database.UserStatusDormant]++
+
+			_, w = mClock.AdvanceNext()
+			w.MustWait(ctx)
+
+			require.Eventually(t, checkFn, testutil.WaitShort, testutil.IntervalFast)
+		})
+	}
+}
+
 func TestWorkspaceLatestBuildTotals(t *testing.T) {
 	t.Parallel()
 