Skip to content

Commit ccfffc6

Browse files
authored
chore: add tx metrics and logs for serialization errors (coder#15215)
Before db_metrics were all or nothing. Now `InTx` metrics are always recorded, and query metrics are opt in. Adds instrumentation & logging around serialization failures in the database.
1 parent df34858 commit ccfffc6

32 files changed

+3124
-2801
lines changed

cli/server.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -718,7 +718,9 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
718718
}
719719

720720
if options.DeploymentValues.Prometheus.Enable && options.DeploymentValues.Prometheus.CollectDBMetrics {
721-
options.Database = dbmetrics.New(options.Database, options.PrometheusRegistry)
721+
options.Database = dbmetrics.NewQueryMetrics(options.Database, options.Logger, options.PrometheusRegistry)
722+
} else {
723+
options.Database = dbmetrics.NewDBMetrics(options.Database, options.Logger, options.PrometheusRegistry)
722724
}
723725

724726
var deploymentID string

cli/testdata/coder_server_--help.golden

+3-1
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,9 @@ INTROSPECTION / PROMETHEUS OPTIONS:
145145
Collect agent stats (may increase charges for metrics storage).
146146

147147
--prometheus-collect-db-metrics bool, $CODER_PROMETHEUS_COLLECT_DB_METRICS (default: false)
148-
Collect database metrics (may increase charges for metrics storage).
148+
Collect database query metrics (may increase charges for metrics
149+
storage). If set to false, a reduced set of database metrics are still
150+
collected.
149151

150152
--prometheus-enable bool, $CODER_PROMETHEUS_ENABLE
151153
Serve prometheus metrics on the address defined by prometheus address.

cli/testdata/server-config.yaml.golden

+2-1
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,8 @@ introspection:
197197
- template_name
198198
- username
199199
- workspace_name
200-
# Collect database metrics (may increase charges for metrics storage).
200+
# Collect database query metrics (may increase charges for metrics storage). If
201+
# set to false, a reduced set of database metrics are still collected.
201202
# (default: false, type: bool)
202203
collect_db_metrics: false
203204
pprof:

coderd/autobuild/lifecycle_executor.go

+4-1
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,10 @@ func (e *Executor) runOnce(t time.Time) Stats {
285285

286286
// Run with RepeatableRead isolation so that the build process sees the same data
287287
// as our calculation that determines whether an autobuild is necessary.
288-
}, &sql.TxOptions{Isolation: sql.LevelRepeatableRead})
288+
}, &database.TxOptions{
289+
Isolation: sql.LevelRepeatableRead,
290+
TxIdentifier: "lifecycle",
291+
})
289292
if auditLog != nil {
290293
// If the transition didn't succeed then updating the workspace
291294
// to indicate dormant didn't either.

coderd/coderdtest/promhelp/doc.go

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
// Package promhelp provides helper functions for asserting Prometheus
2+
// metric values in unit tests.
3+
package promhelp

coderd/coderdtest/promhelp/metrics.go

+87
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
package promhelp
2+
3+
import (
4+
"context"
5+
"io"
6+
"maps"
7+
"net/http"
8+
"net/http/httptest"
9+
"strings"
10+
"testing"
11+
12+
"github.com/prometheus/client_golang/prometheus"
13+
"github.com/prometheus/client_golang/prometheus/promhttp"
14+
ptestutil "github.com/prometheus/client_golang/prometheus/testutil"
15+
io_prometheus_client "github.com/prometheus/client_model/go"
16+
"github.com/stretchr/testify/require"
17+
)
18+
19+
// RegistryDump returns the http page for a given registry's metrics.
20+
// Very useful for visual debugging.
21+
func RegistryDump(reg *prometheus.Registry) string {
22+
h := promhttp.HandlerFor(reg, promhttp.HandlerOpts{})
23+
rec := httptest.NewRecorder()
24+
req, _ := http.NewRequestWithContext(context.Background(), http.MethodGet, "/", nil)
25+
h.ServeHTTP(rec, req)
26+
resp := rec.Result()
27+
data, _ := io.ReadAll(resp.Body)
28+
_ = resp.Body.Close()
29+
return string(data)
30+
}
31+
32+
// Compare can be used to compare a registry to some prometheus formatted
33+
// text. If any values differ, an error is returned.
34+
// If metric names are passed in, only those metrics will be compared.
35+
// Usage: `Compare(reg, RegistryDump(reg))`
36+
func Compare(reg prometheus.Gatherer, compare string, metricNames ...string) error {
37+
return ptestutil.GatherAndCompare(reg, strings.NewReader(compare), metricNames...)
38+
}
39+
40+
// HistogramValue returns the value of a histogram metric with the given name and labels.
41+
func HistogramValue(t testing.TB, reg prometheus.Gatherer, metricName string, labels prometheus.Labels) *io_prometheus_client.Histogram {
42+
t.Helper()
43+
44+
labeled := MetricValue(t, reg, metricName, labels)
45+
require.NotNilf(t, labeled, "metric %q with labels %v not found", metricName, labels)
46+
return labeled.GetHistogram()
47+
}
48+
49+
// GaugeValue returns the value of a gauge metric with the given name and labels.
50+
func GaugeValue(t testing.TB, reg prometheus.Gatherer, metricName string, labels prometheus.Labels) int {
51+
t.Helper()
52+
53+
labeled := MetricValue(t, reg, metricName, labels)
54+
require.NotNilf(t, labeled, "metric %q with labels %v not found", metricName, labels)
55+
return int(labeled.GetGauge().GetValue())
56+
}
57+
58+
// CounterValue returns the value of a counter metric with the given name and labels.
59+
func CounterValue(t testing.TB, reg prometheus.Gatherer, metricName string, labels prometheus.Labels) int {
60+
t.Helper()
61+
62+
labeled := MetricValue(t, reg, metricName, labels)
63+
require.NotNilf(t, labeled, "metric %q with labels %v not found", metricName, labels)
64+
return int(labeled.GetCounter().GetValue())
65+
}
66+
67+
func MetricValue(t testing.TB, reg prometheus.Gatherer, metricName string, labels prometheus.Labels) *io_prometheus_client.Metric {
68+
t.Helper()
69+
70+
metrics, err := reg.Gather()
71+
require.NoError(t, err)
72+
73+
for _, m := range metrics {
74+
if m.GetName() == metricName {
75+
for _, labeled := range m.GetMetric() {
76+
mLabels := make(prometheus.Labels)
77+
for _, v := range labeled.GetLabel() {
78+
mLabels[v.GetName()] = v.GetValue()
79+
}
80+
if maps.Equal(mLabels, labels) {
81+
return labeled
82+
}
83+
}
84+
}
85+
}
86+
return nil
87+
}

coderd/cryptokeys/rotate.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -161,8 +161,9 @@ func (k *rotator) rotateKeys(ctx context.Context) error {
161161
}
162162
}
163163
return nil
164-
}, &sql.TxOptions{
165-
Isolation: sql.LevelRepeatableRead,
164+
}, &database.TxOptions{
165+
Isolation: sql.LevelRepeatableRead,
166+
TxIdentifier: "rotate_keys",
166167
})
167168
}
168169

coderd/database/db.go

+61-8
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ type Store interface {
2828
wrapper
2929

3030
Ping(ctx context.Context) (time.Duration, error)
31-
InTx(func(Store) error, *sql.TxOptions) error
31+
InTx(func(Store) error, *TxOptions) error
3232
}
3333

3434
type wrapper interface {
@@ -57,6 +57,43 @@ func New(sdb *sql.DB) Store {
5757
}
5858
}
5959

60+
// TxOptions is used to pass some execution metadata to the callers.
61+
// Ideally we could throw this into a context, but no context is used for
62+
// transactions. So instead, the return context is attached to the options
63+
// passed in.
64+
// This metadata should not be returned in the method signature, because it
65+
// is only used for metric tracking. It should never be used by business logic.
66+
type TxOptions struct {
67+
// Isolation is the transaction isolation level.
68+
// If zero, the driver or database's default level is used.
69+
Isolation sql.IsolationLevel
70+
ReadOnly bool
71+
72+
// -- Coder specific metadata --
73+
// TxIdentifier is a unique identifier for the transaction to be used
74+
// in metrics. Can be any string.
75+
TxIdentifier string
76+
77+
// Set by InTx
78+
executionCount int
79+
}
80+
81+
// IncrementExecutionCount is a helper function for external packages
82+
// to increment the unexported count.
83+
// Mainly for `dbmem`.
84+
func IncrementExecutionCount(opts *TxOptions) {
85+
opts.executionCount++
86+
}
87+
88+
func (o TxOptions) ExecutionCount() int {
89+
return o.executionCount
90+
}
91+
92+
func (o *TxOptions) WithID(id string) *TxOptions {
93+
o.TxIdentifier = id
94+
return o
95+
}
96+
6097
// queries encompasses both are sqlc generated
6198
// queries and our custom queries.
6299
type querier interface {
@@ -80,25 +117,39 @@ func (q *sqlQuerier) Ping(ctx context.Context) (time.Duration, error) {
80117
return time.Since(start), err
81118
}
82119

83-
func (q *sqlQuerier) InTx(function func(Store) error, txOpts *sql.TxOptions) error {
120+
func DefaultTXOptions() *TxOptions {
121+
return &TxOptions{
122+
Isolation: sql.LevelDefault,
123+
ReadOnly: false,
124+
}
125+
}
126+
127+
func (q *sqlQuerier) InTx(function func(Store) error, txOpts *TxOptions) error {
84128
_, inTx := q.db.(*sqlx.Tx)
85-
isolation := sql.LevelDefault
86-
if txOpts != nil {
87-
isolation = txOpts.Isolation
129+
130+
if txOpts == nil {
131+
// create a default txOpts if left to nil
132+
txOpts = DefaultTXOptions()
133+
}
134+
135+
sqlOpts := &sql.TxOptions{
136+
Isolation: txOpts.Isolation,
137+
ReadOnly: txOpts.ReadOnly,
88138
}
89139

90140
// If we are not already in a transaction, and we are running in serializable
91141
// mode, we need to run the transaction in a retry loop. The caller should be
92142
// prepared to allow retries if using serializable mode.
93143
// If we are in a transaction already, the parent InTx call will handle the retry.
94144
// We do not want to duplicate those retries.
95-
if !inTx && isolation == sql.LevelSerializable {
145+
if !inTx && sqlOpts.Isolation == sql.LevelSerializable {
96146
// This is an arbitrarily chosen number.
97147
const retryAmount = 3
98148
var err error
99149
attempts := 0
100150
for attempts = 0; attempts < retryAmount; attempts++ {
101-
err = q.runTx(function, txOpts)
151+
txOpts.executionCount++
152+
err = q.runTx(function, sqlOpts)
102153
if err == nil {
103154
// Transaction succeeded.
104155
return nil
@@ -111,7 +162,9 @@ func (q *sqlQuerier) InTx(function func(Store) error, txOpts *sql.TxOptions) err
111162
// Transaction kept failing in serializable mode.
112163
return xerrors.Errorf("transaction failed after %d attempts: %w", attempts, err)
113164
}
114-
return q.runTx(function, txOpts)
165+
166+
txOpts.executionCount++
167+
return q.runTx(function, sqlOpts)
115168
}
116169

117170
// InTx performs database operations inside a transaction.

coderd/database/db_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ func TestSerializedRetry(t *testing.T) {
2727
db := database.New(sqlDB)
2828

2929
called := 0
30-
txOpts := &sql.TxOptions{Isolation: sql.LevelSerializable}
30+
txOpts := &database.TxOptions{Isolation: sql.LevelSerializable}
3131
err := db.InTx(func(tx database.Store) error {
3232
// Test nested error
3333
return tx.InTx(func(tx database.Store) error {

coderd/database/dbauthz/dbauthz.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -558,7 +558,7 @@ func (q *querier) Ping(ctx context.Context) (time.Duration, error) {
558558
}
559559

560560
// InTx runs the given function in a transaction.
561-
func (q *querier) InTx(function func(querier database.Store) error, txOpts *sql.TxOptions) error {
561+
func (q *querier) InTx(function func(querier database.Store) error, txOpts *database.TxOptions) error {
562562
return q.db.InTx(func(tx database.Store) error {
563563
// Wrap the transaction store in a querier.
564564
wrapped := New(tx, q.auth, q.log, q.acs)

coderd/database/dbmem/dbmem.go

+4-1
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ func (tx *fakeTx) releaseLocks() {
365365
}
366366

367367
// InTx doesn't rollback data properly for in-memory yet.
368-
func (q *FakeQuerier) InTx(fn func(database.Store) error, _ *sql.TxOptions) error {
368+
func (q *FakeQuerier) InTx(fn func(database.Store) error, opts *database.TxOptions) error {
369369
q.mutex.Lock()
370370
defer q.mutex.Unlock()
371371
tx := &fakeTx{
@@ -374,6 +374,9 @@ func (q *FakeQuerier) InTx(fn func(database.Store) error, _ *sql.TxOptions) erro
374374
}
375375
defer tx.releaseLocks()
376376

377+
if opts != nil {
378+
database.IncrementExecutionCount(opts)
379+
}
377380
return fn(tx)
378381
}
379382

0 commit comments

Comments
 (0)