Skip to content

Commit c44dcf6

Browse files
committed
feat: add provisionerd prometheus metrics
Also adds back the go runtime metrics.
1 parent 8f4ae5b commit c44dcf6

File tree

4 files changed

+96
-19
lines changed

4 files changed

+96
-19
lines changed

cli/server.go

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"github.com/google/go-github/v43/github"
3030
"github.com/google/uuid"
3131
"github.com/prometheus/client_golang/prometheus"
32+
"github.com/prometheus/client_golang/prometheus/collectors"
3233
"github.com/prometheus/client_golang/prometheus/promhttp"
3334
"github.com/spf13/afero"
3435
"github.com/spf13/cobra"
@@ -358,6 +359,7 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co
358359
AgentStatsRefreshInterval: cfg.AgentStatRefreshInterval.Value,
359360
Experimental: ExperimentalEnabled(cmd),
360361
DeploymentConfig: cfg,
362+
PrometheusRegistry: prometheus.NewRegistry(),
361363
}
362364
if tlsConfig != nil {
363365
options.TLSCertificates = tlsConfig.Certificates
@@ -505,7 +507,9 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co
505507
defer serveHandler(ctx, logger, nil, cfg.Pprof.Address.Value, "pprof")()
506508
}
507509
if cfg.Prometheus.Enable.Value {
508-
options.PrometheusRegistry = prometheus.NewRegistry()
510+
options.PrometheusRegistry.MustRegister(collectors.NewGoCollector())
511+
options.PrometheusRegistry.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
512+
509513
closeUsersFunc, err := prometheusmetrics.ActiveUsers(ctx, options.PrometheusRegistry, options.Database, 0)
510514
if err != nil {
511515
return xerrors.Errorf("register active users prometheus metric: %w", err)
@@ -557,8 +561,9 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co
557561
_ = daemon.Close()
558562
}
559563
}()
564+
provisionerdMetrics := provisionerd.NewMetrics(options.PrometheusRegistry)
560565
for i := 0; i < cfg.ProvisionerDaemons.Value; i++ {
561-
daemon, err := newProvisionerDaemon(ctx, coderAPI, logger, cfg.CacheDirectory.Value, errCh, false)
566+
daemon, err := newProvisionerDaemon(ctx, coderAPI, provisionerdMetrics, logger, cfg.CacheDirectory.Value, errCh, false)
562567
if err != nil {
563568
return xerrors.Errorf("create provisioner daemon: %w", err)
564569
}
@@ -825,6 +830,7 @@ func shutdownWithTimeout(shutdown func(context.Context) error, timeout time.Dura
825830
func newProvisionerDaemon(
826831
ctx context.Context,
827832
coderAPI *coderd.API,
833+
metrics provisionerd.Metrics,
828834
logger slog.Logger,
829835
cacheDir string,
830836
errCh chan error,
@@ -901,7 +907,8 @@ func newProvisionerDaemon(
901907
UpdateInterval: 500 * time.Millisecond,
902908
Provisioners: provisioners,
903909
WorkDirectory: tempDir,
904-
Tracer: coderAPI.TracerProvider,
910+
TracerProvider: coderAPI.TracerProvider,
911+
Metrics: &metrics,
905912
}), nil
906913
}
907914

coderd/prometheusmetrics/prometheusmetrics.go

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,6 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab
3333
go func() {
3434
defer ticker.Stop()
3535
for {
36-
select {
37-
case <-ctx.Done():
38-
return
39-
case <-ticker.C:
40-
}
4136
apiKeys, err := db.GetAPIKeysLastUsedAfter(ctx, database.Now().Add(-1*time.Hour))
4237
if err != nil {
4338
continue
@@ -47,6 +42,12 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab
4742
distinctUsers[apiKey.UserID] = struct{}{}
4843
}
4944
gauge.Set(float64(len(distinctUsers)))
45+
46+
select {
47+
case <-ctx.Done():
48+
return
49+
case <-ticker.C:
50+
}
5051
}
5152
}()
5253
return cancelFunc, nil
@@ -77,11 +78,6 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
7778
go func() {
7879
defer ticker.Stop()
7980
for {
80-
select {
81-
case <-ctx.Done():
82-
return
83-
case <-ticker.C:
84-
}
8581
builds, err := db.GetLatestWorkspaceBuilds(ctx)
8682
if err != nil {
8783
continue
@@ -100,6 +96,12 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
10096
status := coderd.ConvertProvisionerJobStatus(job)
10197
gauge.WithLabelValues(string(status)).Add(1)
10298
}
99+
100+
select {
101+
case <-ctx.Done():
102+
return
103+
case <-ticker.C:
104+
}
103105
}
104106
}()
105107
return cancelFunc, nil

provisionerd/provisionerd.go

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import (
1111
"time"
1212

1313
"github.com/hashicorp/yamux"
14+
"github.com/prometheus/client_golang/prometheus"
15+
"github.com/prometheus/client_golang/prometheus/promauto"
1416
"github.com/spf13/afero"
1517
"go.opentelemetry.io/otel/attribute"
1618
semconv "go.opentelemetry.io/otel/semconv/v1.11.0"
@@ -41,9 +43,10 @@ type Provisioners map[string]sdkproto.DRPCProvisionerClient
4143

4244
// Options provides customizations to the behavior of a provisioner daemon.
4345
type Options struct {
44-
Filesystem afero.Fs
45-
Logger slog.Logger
46-
Tracer trace.TracerProvider
46+
Filesystem afero.Fs
47+
Logger slog.Logger
48+
TracerProvider trace.TracerProvider
49+
Metrics *Metrics
4750

4851
ForceCancelInterval time.Duration
4952
UpdateInterval time.Duration
@@ -66,14 +69,19 @@ func New(clientDialer Dialer, opts *Options) *Server {
6669
if opts.Filesystem == nil {
6770
opts.Filesystem = afero.NewOsFs()
6871
}
69-
if opts.Tracer == nil {
70-
opts.Tracer = trace.NewNoopTracerProvider()
72+
if opts.TracerProvider == nil {
73+
opts.TracerProvider = trace.NewNoopTracerProvider()
74+
}
75+
if opts.Metrics == nil {
76+
reg := prometheus.NewRegistry()
77+
mets := NewMetrics(reg)
78+
opts.Metrics = &mets
7179
}
7280

7381
ctx, ctxCancel := context.WithCancel(context.Background())
7482
daemon := &Server{
7583
opts: opts,
76-
tracer: opts.Tracer.Tracer(tracing.TracerName),
84+
tracer: opts.TracerProvider.Tracer(tracing.TracerName),
7785

7886
clientDialer: clientDialer,
7987

@@ -103,6 +111,42 @@ type Server struct {
103111
activeJob *runner.Runner
104112
}
105113

114+
// Metrics groups the Prometheus collectors used by the provisioner daemon.
type Metrics struct {
115+
// Runner holds the collectors that are handed to each job runner.
Runner runner.Metrics
116+
}
117+
118+
// NewMetrics builds the provisioner daemon metrics. The collectors are
// created through promauto bound to reg, so they are registered with reg as
// they are constructed.
func NewMetrics(reg prometheus.Registerer) Metrics {
119+
auto := promauto.With(reg)
120+
// durationToFloatMs converts a duration to whole milliseconds as a float64,
// matching the unit used by the job_timings_ms histogram.
durationToFloatMs := func(d time.Duration) float64 {
121+
return float64(d.Milliseconds())
122+
}
123+
124+
return Metrics{
125+
Runner: runner.Metrics{
126+
// Exposed as coderd_provisionerd_jobs_current, labeled by provisioner.
ConcurrentJobs: auto.NewGaugeVec(prometheus.GaugeOpts{
127+
Namespace: "coderd",
128+
Subsystem: "provisionerd",
129+
Name: "jobs_current",
130+
}, []string{"provisioner"}),
131+
// Exposed as coderd_provisionerd_job_timings_ms, labeled by provisioner
// and status.
JobTimings: auto.NewHistogramVec(prometheus.HistogramOpts{
132+
Namespace: "coderd",
133+
Subsystem: "provisionerd",
134+
Name: "job_timings_ms",
135+
// Bucket upper bounds in milliseconds, ranging from 1 second to 1 hour.
Buckets: []float64{
136+
durationToFloatMs(1 * time.Second),
137+
durationToFloatMs(10 * time.Second),
138+
durationToFloatMs(30 * time.Second),
139+
durationToFloatMs(1 * time.Minute),
140+
durationToFloatMs(5 * time.Minute),
141+
durationToFloatMs(10 * time.Minute),
142+
durationToFloatMs(30 * time.Minute),
143+
durationToFloatMs(1 * time.Hour),
144+
},
145+
}, []string{"provisioner", "status"}),
146+
},
147+
}
148+
}
149+
106150
// connect establishes a connection to coderd.
107151
func (p *Server) connect(ctx context.Context) {
108152
// An exponential back-off occurs when the connection is failing to dial.
@@ -282,6 +326,7 @@ func (p *Server) acquireJob(ctx context.Context) {
282326
p.opts.UpdateInterval,
283327
p.opts.ForceCancelInterval,
284328
p.tracer,
329+
p.opts.Metrics.Runner,
285330
)
286331

287332
go p.activeJob.Run()

provisionerd/runner/runner.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"time"
1717

1818
"github.com/google/uuid"
19+
"github.com/prometheus/client_golang/prometheus"
1920
"github.com/spf13/afero"
2021
"go.opentelemetry.io/otel/codes"
2122
semconv "go.opentelemetry.io/otel/semconv/v1.11.0"
@@ -34,6 +35,7 @@ const (
3435

3536
type Runner struct {
3637
tracer trace.Tracer
38+
metrics Metrics
3739
job *proto.AcquiredJob
3840
sender JobUpdater
3941
logger slog.Logger
@@ -65,6 +67,12 @@ type Runner struct {
6567
okToSend bool
6668
}
6769

70+
// Metrics holds the Prometheus collectors updated by a Runner while it
// executes a job.
type Metrics struct {
71+
// ConcurrentJobs tracks the number of jobs currently running, labeled by
// provisioner. Run increments it when it starts and decrements it when it
// returns.
ConcurrentJobs *prometheus.GaugeVec
72+
// JobTimings records how long each job ran, in milliseconds, labeled by
// provisioner and status ("success" or "failed"). It also counts the total
// number of jobs.
73+
JobTimings *prometheus.HistogramVec
74+
}
75+
6876
type JobUpdater interface {
6977
UpdateJob(ctx context.Context, in *proto.UpdateJobRequest) (*proto.UpdateJobResponse, error)
7078
FailJob(ctx context.Context, in *proto.FailedJob) error
@@ -82,6 +90,7 @@ func NewRunner(
8290
updateInterval time.Duration,
8391
forceCancelInterval time.Duration,
8492
tracer trace.Tracer,
93+
metrics Metrics,
8594
) *Runner {
8695
m := new(sync.Mutex)
8796

@@ -91,6 +100,7 @@ func NewRunner(
91100

92101
return &Runner{
93102
tracer: tracer,
103+
metrics: metrics,
94104
job: job,
95105
sender: updater,
96106
logger: logger.With(slog.F("job_id", job.JobId)),
@@ -120,9 +130,22 @@ func NewRunner(
120130
// that goroutine on the context passed into Fail(), and it marks okToSend false to signal us here
121131
// that this function should not also send a terminal message.
122132
func (r *Runner) Run() {
133+
start := time.Now()
123134
ctx, span := r.startTrace(r.notStopped, tracing.FuncName())
124135
defer span.End()
125136

137+
concurrentGauge := r.metrics.ConcurrentJobs.WithLabelValues(r.job.Provisioner)
138+
concurrentGauge.Inc()
139+
defer func() {
140+
status := "success"
141+
if r.failedJob != nil {
142+
status = "failed"
143+
}
144+
145+
concurrentGauge.Dec()
146+
r.metrics.JobTimings.WithLabelValues(r.job.Provisioner, status).Observe(float64(time.Since(start).Milliseconds()))
147+
}()
148+
126149
r.mutex.Lock()
127150
defer r.mutex.Unlock()
128151
defer r.stop()

0 commit comments

Comments
 (0)