Skip to content

Commit e740aeb

Browse files
authored
feat: add provisionerd prometheus metrics (#4909)
1 parent 8dd567d commit e740aeb

File tree

5 files changed

+95
-16
lines changed

5 files changed

+95
-16
lines changed

cli/server.go

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"github.com/google/go-github/v43/github"
3030
"github.com/google/uuid"
3131
"github.com/prometheus/client_golang/prometheus"
32+
"github.com/prometheus/client_golang/prometheus/collectors"
3233
"github.com/prometheus/client_golang/prometheus/promhttp"
3334
"github.com/spf13/afero"
3435
"github.com/spf13/cobra"
@@ -358,6 +359,7 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co
358359
AgentStatsRefreshInterval: cfg.AgentStatRefreshInterval.Value,
359360
Experimental: ExperimentalEnabled(cmd),
360361
DeploymentConfig: cfg,
362+
PrometheusRegistry: prometheus.NewRegistry(),
361363
}
362364
if tlsConfig != nil {
363365
options.TLSCertificates = tlsConfig.Certificates
@@ -505,21 +507,25 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co
505507
defer serveHandler(ctx, logger, nil, cfg.Pprof.Address.Value, "pprof")()
506508
}
507509
if cfg.Prometheus.Enable.Value {
508-
options.PrometheusRegisterer = prometheus.DefaultRegisterer
509-
closeUsersFunc, err := prometheusmetrics.ActiveUsers(ctx, options.PrometheusRegisterer, options.Database, 0)
510+
options.PrometheusRegistry.MustRegister(collectors.NewGoCollector())
511+
options.PrometheusRegistry.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
512+
513+
closeUsersFunc, err := prometheusmetrics.ActiveUsers(ctx, options.PrometheusRegistry, options.Database, 0)
510514
if err != nil {
511515
return xerrors.Errorf("register active users prometheus metric: %w", err)
512516
}
513517
defer closeUsersFunc()
514518

515-
closeWorkspacesFunc, err := prometheusmetrics.Workspaces(ctx, options.PrometheusRegisterer, options.Database, 0)
519+
closeWorkspacesFunc, err := prometheusmetrics.Workspaces(ctx, options.PrometheusRegistry, options.Database, 0)
516520
if err != nil {
517521
return xerrors.Errorf("register workspaces prometheus metric: %w", err)
518522
}
519523
defer closeWorkspacesFunc()
520524

521525
//nolint:revive
522-
defer serveHandler(ctx, logger, promhttp.Handler(), cfg.Prometheus.Address.Value, "prometheus")()
526+
defer serveHandler(ctx, logger, promhttp.InstrumentMetricHandler(
527+
options.PrometheusRegistry, promhttp.HandlerFor(options.PrometheusRegistry, promhttp.HandlerOpts{}),
528+
), cfg.Prometheus.Address.Value, "prometheus")()
523529
}
524530

525531
// We use a separate coderAPICloser so the Enterprise API
@@ -555,8 +561,9 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co
555561
_ = daemon.Close()
556562
}
557563
}()
564+
provisionerdMetrics := provisionerd.NewMetrics(options.PrometheusRegistry)
558565
for i := 0; i < cfg.ProvisionerDaemons.Value; i++ {
559-
daemon, err := newProvisionerDaemon(ctx, coderAPI, logger, cfg.CacheDirectory.Value, errCh, false)
566+
daemon, err := newProvisionerDaemon(ctx, coderAPI, provisionerdMetrics, logger, cfg.CacheDirectory.Value, errCh, false)
560567
if err != nil {
561568
return xerrors.Errorf("create provisioner daemon: %w", err)
562569
}
@@ -823,6 +830,7 @@ func shutdownWithTimeout(shutdown func(context.Context) error, timeout time.Dura
823830
func newProvisionerDaemon(
824831
ctx context.Context,
825832
coderAPI *coderd.API,
833+
metrics provisionerd.Metrics,
826834
logger slog.Logger,
827835
cacheDir string,
828836
errCh chan error,
@@ -899,7 +907,8 @@ func newProvisionerDaemon(
899907
UpdateInterval: 500 * time.Millisecond,
900908
Provisioners: provisioners,
901909
WorkDirectory: tempDir,
902-
Tracer: coderAPI.TracerProvider,
910+
TracerProvider: coderAPI.TracerProvider,
911+
Metrics: &metrics,
903912
}), nil
904913
}
905914

coderd/coderd.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ type Options struct {
7878
GoogleTokenValidator *idtoken.Validator
7979
GithubOAuth2Config *GithubOAuth2Config
8080
OIDCConfig *OIDCConfig
81-
PrometheusRegisterer prometheus.Registerer
81+
PrometheusRegistry *prometheus.Registry
8282
SecureAuthCookie bool
8383
SSHKeygenAlgorithm gitsshkey.Algorithm
8484
Telemetry telemetry.Reporter
@@ -132,8 +132,8 @@ func New(options *Options) *API {
132132
if options.Authorizer == nil {
133133
options.Authorizer = rbac.NewAuthorizer()
134134
}
135-
if options.PrometheusRegisterer == nil {
136-
options.PrometheusRegisterer = prometheus.NewRegistry()
135+
if options.PrometheusRegistry == nil {
136+
options.PrometheusRegistry = prometheus.NewRegistry()
137137
}
138138
if options.TailnetCoordinator == nil {
139139
options.TailnetCoordinator = tailnet.NewCoordinator()
@@ -204,7 +204,7 @@ func New(options *Options) *API {
204204
httpmw.Recover(api.Logger),
205205
httpmw.ExtractRealIP(api.RealIPConfig),
206206
httpmw.Logger(api.Logger),
207-
httpmw.Prometheus(options.PrometheusRegisterer),
207+
httpmw.Prometheus(options.PrometheusRegistry),
208208
// handleSubdomainApplications checks if the first subdomain is a valid
209209
// app URL. If it is, it will serve that application.
210210
api.handleSubdomainApplications(

coderd/prometheusmetrics/prometheusmetrics.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab
3838
return
3939
case <-ticker.C:
4040
}
41+
4142
apiKeys, err := db.GetAPIKeysLastUsedAfter(ctx, database.Now().Add(-1*time.Hour))
4243
if err != nil {
4344
continue
@@ -82,6 +83,7 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
8283
return
8384
case <-ticker.C:
8485
}
86+
8587
builds, err := db.GetLatestWorkspaceBuilds(ctx)
8688
if err != nil {
8789
continue

provisionerd/provisionerd.go

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import (
1111
"time"
1212

1313
"github.com/hashicorp/yamux"
14+
"github.com/prometheus/client_golang/prometheus"
15+
"github.com/prometheus/client_golang/prometheus/promauto"
1416
"github.com/spf13/afero"
1517
"go.opentelemetry.io/otel/attribute"
1618
semconv "go.opentelemetry.io/otel/semconv/v1.11.0"
@@ -41,9 +43,10 @@ type Provisioners map[string]sdkproto.DRPCProvisionerClient
4143

4244
// Options provides customizations to the behavior of a provisioner daemon.
4345
type Options struct {
44-
Filesystem afero.Fs
45-
Logger slog.Logger
46-
Tracer trace.TracerProvider
46+
Filesystem afero.Fs
47+
Logger slog.Logger
48+
TracerProvider trace.TracerProvider
49+
Metrics *Metrics
4750

4851
ForceCancelInterval time.Duration
4952
UpdateInterval time.Duration
@@ -66,14 +69,19 @@ func New(clientDialer Dialer, opts *Options) *Server {
6669
if opts.Filesystem == nil {
6770
opts.Filesystem = afero.NewOsFs()
6871
}
69-
if opts.Tracer == nil {
70-
opts.Tracer = trace.NewNoopTracerProvider()
72+
if opts.TracerProvider == nil {
73+
opts.TracerProvider = trace.NewNoopTracerProvider()
74+
}
75+
if opts.Metrics == nil {
76+
reg := prometheus.NewRegistry()
77+
mets := NewMetrics(reg)
78+
opts.Metrics = &mets
7179
}
7280

7381
ctx, ctxCancel := context.WithCancel(context.Background())
7482
daemon := &Server{
7583
opts: opts,
76-
tracer: opts.Tracer.Tracer(tracing.TracerName),
84+
tracer: opts.TracerProvider.Tracer(tracing.TracerName),
7785

7886
clientDialer: clientDialer,
7987

@@ -103,6 +111,42 @@ type Server struct {
103111
activeJob *runner.Runner
104112
}
105113

114+
// Metrics groups the Prometheus collectors used by the provisioner daemon.
// The Runner field is passed to each job runner when a job is acquired.
type Metrics struct {
115+
Runner runner.Metrics
116+
}
117+
118+
// NewMetrics creates the provisionerd metrics and registers them with reg.
//
// The collectors are built through a promauto factory bound to reg, so they
// are registered as they are created; promauto panics if a registration
// fails (for example when the same metrics are registered twice on one
// registry), so call this once per registry.
//
// Two collectors are created, both under the "coderd" namespace and
// "provisionerd" subsystem:
//   - jobs_current: a gauge vector labelled by "provisioner".
//   - job_timings_ms: a histogram vector labelled by "provisioner" and
//     "status", with bucket upper bounds from 1 second to 1 hour expressed
//     in milliseconds.
func NewMetrics(reg prometheus.Registerer) Metrics {
119+
auto := promauto.With(reg)
120+
// durationToFloatMs converts a duration to a whole number of milliseconds
// as a float64, the unit used for the histogram buckets below.
durationToFloatMs := func(d time.Duration) float64 {
121+
return float64(d.Milliseconds())
122+
}
123+
124+
return Metrics{
125+
Runner: runner.Metrics{
126+
ConcurrentJobs: auto.NewGaugeVec(prometheus.GaugeOpts{
127+
Namespace: "coderd",
128+
Subsystem: "provisionerd",
129+
Name: "jobs_current",
130+
}, []string{"provisioner"}),
131+
JobTimings: auto.NewHistogramVec(prometheus.HistogramOpts{
132+
Namespace: "coderd",
133+
Subsystem: "provisionerd",
134+
Name: "job_timings_ms",
135+
// Bucket bounds are in milliseconds to match the values observed by
// the runner, which records elapsed time in milliseconds.
Buckets: []float64{
136+
durationToFloatMs(1 * time.Second),
137+
durationToFloatMs(10 * time.Second),
138+
durationToFloatMs(30 * time.Second),
139+
durationToFloatMs(1 * time.Minute),
140+
durationToFloatMs(5 * time.Minute),
141+
durationToFloatMs(10 * time.Minute),
142+
durationToFloatMs(30 * time.Minute),
143+
durationToFloatMs(1 * time.Hour),
144+
},
145+
}, []string{"provisioner", "status"}),
146+
},
147+
}
148+
}
149+
106150
// connect establishes a connection to coderd.
107151
func (p *Server) connect(ctx context.Context) {
108152
// An exponential back-off occurs when the connection is failing to dial.
@@ -282,6 +326,7 @@ func (p *Server) acquireJob(ctx context.Context) {
282326
p.opts.UpdateInterval,
283327
p.opts.ForceCancelInterval,
284328
p.tracer,
329+
p.opts.Metrics.Runner,
285330
)
286331

287332
go p.activeJob.Run()

provisionerd/runner/runner.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"time"
1717

1818
"github.com/google/uuid"
19+
"github.com/prometheus/client_golang/prometheus"
1920
"github.com/spf13/afero"
2021
"go.opentelemetry.io/otel/codes"
2122
semconv "go.opentelemetry.io/otel/semconv/v1.11.0"
@@ -34,6 +35,7 @@ const (
3435

3536
type Runner struct {
3637
tracer trace.Tracer
38+
metrics Metrics
3739
job *proto.AcquiredJob
3840
sender JobUpdater
3941
logger slog.Logger
@@ -65,6 +67,12 @@ type Runner struct {
6567
okToSend bool
6668
}
6769

70+
// Metrics holds the Prometheus collectors a Runner updates while executing
// a job.
type Metrics struct {
71+
// ConcurrentJobs is incremented when Run starts and decremented when it
// returns, labelled by the job's provisioner.
ConcurrentJobs *prometheus.GaugeVec
72+
// JobTimings also counts the total amount of jobs.
// Run observes the elapsed time in milliseconds when it returns, labelled
// by the job's provisioner and a status of "success" or "failed".
73+
JobTimings *prometheus.HistogramVec
74+
}
75+
6876
type JobUpdater interface {
6977
UpdateJob(ctx context.Context, in *proto.UpdateJobRequest) (*proto.UpdateJobResponse, error)
7078
FailJob(ctx context.Context, in *proto.FailedJob) error
@@ -82,6 +90,7 @@ func NewRunner(
8290
updateInterval time.Duration,
8391
forceCancelInterval time.Duration,
8492
tracer trace.Tracer,
93+
metrics Metrics,
8594
) *Runner {
8695
m := new(sync.Mutex)
8796

@@ -91,6 +100,7 @@ func NewRunner(
91100

92101
return &Runner{
93102
tracer: tracer,
103+
metrics: metrics,
94104
job: job,
95105
sender: updater,
96106
logger: logger.With(slog.F("job_id", job.JobId)),
@@ -120,9 +130,22 @@ func NewRunner(
120130
// that goroutine on the context passed into Fail(), and it marks okToSend false to signal us here
121131
// that this function should not also send a terminal message.
122132
func (r *Runner) Run() {
133+
start := time.Now()
123134
ctx, span := r.startTrace(r.notStopped, tracing.FuncName())
124135
defer span.End()
125136

137+
concurrentGauge := r.metrics.ConcurrentJobs.WithLabelValues(r.job.Provisioner)
138+
concurrentGauge.Inc()
139+
defer func() {
140+
status := "success"
141+
if r.failedJob != nil {
142+
status = "failed"
143+
}
144+
145+
concurrentGauge.Dec()
146+
r.metrics.JobTimings.WithLabelValues(r.job.Provisioner, status).Observe(float64(time.Since(start).Milliseconds()))
147+
}()
148+
126149
r.mutex.Lock()
127150
defer r.mutex.Unlock()
128151
defer r.stop()

0 commit comments

Comments
 (0)