Skip to content

Commit 3de98c2

Browse files
authored
feat: add prometheus metric for tracking user statuses (coder#15281)
1 parent e9fbfcc commit 3de98c2

File tree

3 files changed

+158
-3
lines changed

3 files changed

+158
-3
lines changed

cli/server.go

+7-1
Original file line numberDiff line numberDiff line change
@@ -212,10 +212,16 @@ func enablePrometheus(
212212
options.PrometheusRegistry.MustRegister(collectors.NewGoCollector())
213213
options.PrometheusRegistry.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
214214

215-
closeUsersFunc, err := prometheusmetrics.ActiveUsers(ctx, options.PrometheusRegistry, options.Database, 0)
215+
closeActiveUsersFunc, err := prometheusmetrics.ActiveUsers(ctx, options.Logger.Named("active_user_metrics"), options.PrometheusRegistry, options.Database, 0)
216216
if err != nil {
217217
return nil, xerrors.Errorf("register active users prometheus metric: %w", err)
218218
}
219+
afterCtx(ctx, closeActiveUsersFunc)
220+
221+
closeUsersFunc, err := prometheusmetrics.Users(ctx, options.Logger.Named("user_metrics"), quartz.NewReal(), options.PrometheusRegistry, options.Database, 0)
222+
if err != nil {
223+
return nil, xerrors.Errorf("register users prometheus metric: %w", err)
224+
}
219225
afterCtx(ctx, closeUsersFunc)
220226

221227
closeWorkspacesFunc, err := prometheusmetrics.Workspaces(ctx, options.Logger.Named("workspaces_metrics"), options.PrometheusRegistry, options.Database, 0)

coderd/prometheusmetrics/prometheusmetrics.go

+55-1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212

1313
"github.com/google/uuid"
1414
"github.com/prometheus/client_golang/prometheus"
15+
"golang.org/x/xerrors"
1516
"tailscale.com/tailcfg"
1617

1718
"cdr.dev/slog"
@@ -22,12 +23,13 @@ import (
2223
"github.com/coder/coder/v2/coderd/database/dbtime"
2324
"github.com/coder/coder/v2/codersdk"
2425
"github.com/coder/coder/v2/tailnet"
26+
"github.com/coder/quartz"
2527
)
2628

2729
const defaultRefreshRate = time.Minute
2830

2931
// ActiveUsers tracks the number of users that have authenticated within the past hour.
30-
func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
32+
func ActiveUsers(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
3133
if duration == 0 {
3234
duration = defaultRefreshRate
3335
}
@@ -58,6 +60,7 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab
5860

5961
apiKeys, err := db.GetAPIKeysLastUsedAfter(ctx, dbtime.Now().Add(-1*time.Hour))
6062
if err != nil {
63+
logger.Error(ctx, "get api keys for active users prometheus metric", slog.Error(err))
6164
continue
6265
}
6366
distinctUsers := map[uuid.UUID]struct{}{}
@@ -73,6 +76,57 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab
7376
}, nil
7477
}
7578

79+
// Users tracks the total number of registered users, partitioned by status.
80+
func Users(ctx context.Context, logger slog.Logger, clk quartz.Clock, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
81+
if duration == 0 {
82+
// It's not super important this tracks real-time.
83+
duration = defaultRefreshRate * 5
84+
}
85+
86+
gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
87+
Namespace: "coderd",
88+
Subsystem: "api",
89+
Name: "total_user_count",
90+
Help: "The total number of registered users, partitioned by status.",
91+
}, []string{"status"})
92+
err := registerer.Register(gauge)
93+
if err != nil {
94+
return nil, xerrors.Errorf("register total_user_count gauge: %w", err)
95+
}
96+
97+
ctx, cancelFunc := context.WithCancel(ctx)
98+
done := make(chan struct{})
99+
ticker := clk.NewTicker(duration)
100+
go func() {
101+
defer close(done)
102+
defer ticker.Stop()
103+
for {
104+
select {
105+
case <-ctx.Done():
106+
return
107+
case <-ticker.C:
108+
}
109+
110+
gauge.Reset()
111+
//nolint:gocritic // This is a system service that needs full access
112+
//to the users table.
113+
users, err := db.GetUsers(dbauthz.AsSystemRestricted(ctx), database.GetUsersParams{})
114+
if err != nil {
115+
logger.Error(ctx, "get all users for prometheus metrics", slog.Error(err))
116+
continue
117+
}
118+
119+
for _, user := range users {
120+
gauge.WithLabelValues(string(user.Status)).Inc()
121+
}
122+
}
123+
}()
124+
return func() {
125+
cancelFunc()
126+
<-done
127+
}, nil
128+
}
129+
76130
// Workspaces tracks the total number of workspaces with labels on status.
77131
func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
78132
if duration == 0 {

coderd/prometheusmetrics/prometheusmetrics_test.go

+96-1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ import (
3838
"github.com/coder/coder/v2/tailnet"
3939
"github.com/coder/coder/v2/tailnet/tailnettest"
4040
"github.com/coder/coder/v2/testutil"
41+
"github.com/coder/quartz"
4142
)
4243

4344
func TestActiveUsers(t *testing.T) {
@@ -98,7 +99,7 @@ func TestActiveUsers(t *testing.T) {
9899
t.Run(tc.Name, func(t *testing.T) {
99100
t.Parallel()
100101
registry := prometheus.NewRegistry()
101-
closeFunc, err := prometheusmetrics.ActiveUsers(context.Background(), registry, tc.Database(t), time.Millisecond)
102+
closeFunc, err := prometheusmetrics.ActiveUsers(context.Background(), slogtest.Make(t, nil), registry, tc.Database(t), time.Millisecond)
102103
require.NoError(t, err)
103104
t.Cleanup(closeFunc)
104105

@@ -112,6 +113,100 @@ func TestActiveUsers(t *testing.T) {
112113
}
113114
}
114115

116+
func TestUsers(t *testing.T) {
117+
t.Parallel()
118+
119+
for _, tc := range []struct {
120+
Name string
121+
Database func(t *testing.T) database.Store
122+
Count map[database.UserStatus]int
123+
}{{
124+
Name: "None",
125+
Database: func(t *testing.T) database.Store {
126+
return dbmem.New()
127+
},
128+
Count: map[database.UserStatus]int{},
129+
}, {
130+
Name: "One",
131+
Database: func(t *testing.T) database.Store {
132+
db := dbmem.New()
133+
dbgen.User(t, db, database.User{Status: database.UserStatusActive})
134+
return db
135+
},
136+
Count: map[database.UserStatus]int{database.UserStatusActive: 1},
137+
}, {
138+
Name: "MultipleStatuses",
139+
Database: func(t *testing.T) database.Store {
140+
db := dbmem.New()
141+
142+
dbgen.User(t, db, database.User{Status: database.UserStatusActive})
143+
dbgen.User(t, db, database.User{Status: database.UserStatusDormant})
144+
145+
return db
146+
},
147+
Count: map[database.UserStatus]int{database.UserStatusActive: 1, database.UserStatusDormant: 1},
148+
}, {
149+
Name: "MultipleActive",
150+
Database: func(t *testing.T) database.Store {
151+
db := dbmem.New()
152+
dbgen.User(t, db, database.User{Status: database.UserStatusActive})
153+
dbgen.User(t, db, database.User{Status: database.UserStatusActive})
154+
dbgen.User(t, db, database.User{Status: database.UserStatusActive})
155+
return db
156+
},
157+
Count: map[database.UserStatus]int{database.UserStatusActive: 3},
158+
}} {
159+
tc := tc
160+
t.Run(tc.Name, func(t *testing.T) {
161+
t.Parallel()
162+
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
163+
defer cancel()
164+
165+
registry := prometheus.NewRegistry()
166+
mClock := quartz.NewMock(t)
167+
db := tc.Database(t)
168+
closeFunc, err := prometheusmetrics.Users(context.Background(), slogtest.Make(t, nil), mClock, registry, db, time.Millisecond)
169+
require.NoError(t, err)
170+
t.Cleanup(closeFunc)
171+
172+
_, w := mClock.AdvanceNext()
173+
w.MustWait(ctx)
174+
175+
checkFn := func() bool {
176+
metrics, err := registry.Gather()
177+
if err != nil {
178+
return false
179+
}
180+
181+
// If we get no metrics and we know none should exist, bail
182+
// early. If we get no metrics but we expect some, retry.
183+
if len(metrics) == 0 {
184+
return len(tc.Count) == 0
185+
}
186+
187+
for _, metric := range metrics[0].Metric {
188+
if tc.Count[database.UserStatus(*metric.Label[0].Value)] != int(metric.Gauge.GetValue()) {
189+
return false
190+
}
191+
}
192+
193+
return true
194+
}
195+
196+
require.Eventually(t, checkFn, testutil.WaitShort, testutil.IntervalFast)
197+
198+
// Add another dormant user and ensure it updates
199+
dbgen.User(t, db, database.User{Status: database.UserStatusDormant})
200+
tc.Count[database.UserStatusDormant]++
201+
202+
_, w = mClock.AdvanceNext()
203+
w.MustWait(ctx)
204+
205+
require.Eventually(t, checkFn, testutil.WaitShort, testutil.IntervalFast)
206+
})
207+
}
208+
}
209+
115210
func TestWorkspaceLatestBuildTotals(t *testing.T) {
116211
t.Parallel()
117212

0 commit comments

Comments
 (0)