From c44dcf659eb993a9a5826a96dd13cc250e737176 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 4 Nov 2022 14:48:19 -0500 Subject: [PATCH 1/4] feat: add provisionerd prometheus metrics Also adds back the go runtime metrics. --- cli/server.go | 13 ++++- coderd/prometheusmetrics/prometheusmetrics.go | 22 +++---- provisionerd/provisionerd.go | 57 +++++++++++++++++-- provisionerd/runner/runner.go | 23 ++++++++ 4 files changed, 96 insertions(+), 19 deletions(-) diff --git a/cli/server.go b/cli/server.go index 8a8e4998db5d7..88f787b9589f3 100644 --- a/cli/server.go +++ b/cli/server.go @@ -29,6 +29,7 @@ import ( "github.com/google/go-github/v43/github" "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/collectors" "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/spf13/afero" "github.com/spf13/cobra" @@ -358,6 +359,7 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co AgentStatsRefreshInterval: cfg.AgentStatRefreshInterval.Value, Experimental: ExperimentalEnabled(cmd), DeploymentConfig: cfg, + PrometheusRegistry: prometheus.NewRegistry(), } if tlsConfig != nil { options.TLSCertificates = tlsConfig.Certificates @@ -505,7 +507,9 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co defer serveHandler(ctx, logger, nil, cfg.Pprof.Address.Value, "pprof")() } if cfg.Prometheus.Enable.Value { - options.PrometheusRegistry = prometheus.NewRegistry() + options.PrometheusRegistry.MustRegister(collectors.NewGoCollector()) + options.PrometheusRegistry.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{})) + closeUsersFunc, err := prometheusmetrics.ActiveUsers(ctx, options.PrometheusRegistry, options.Database, 0) if err != nil { return xerrors.Errorf("register active users prometheus metric: %w", err) @@ -557,8 +561,9 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co _ = daemon.Close() } }() + provisionerdMetrics := provisionerd.NewMetrics(options.PrometheusRegistry) for i := 0; i < cfg.ProvisionerDaemons.Value; i++ { - daemon, err := newProvisionerDaemon(ctx, coderAPI, logger, cfg.CacheDirectory.Value, errCh, false) + daemon, err := newProvisionerDaemon(ctx, coderAPI, provisionerdMetrics, logger, cfg.CacheDirectory.Value, errCh, false) if err != nil { return xerrors.Errorf("create provisioner daemon: %w", err) } @@ -825,6 +830,7 @@ func shutdownWithTimeout(shutdown func(context.Context) error, timeout time.Dura func newProvisionerDaemon( ctx context.Context, coderAPI *coderd.API, + metrics provisionerd.Metrics, logger slog.Logger, cacheDir string, errCh chan error, @@ -901,7 +907,8 @@ func newProvisionerDaemon( UpdateInterval: 500 * time.Millisecond, Provisioners: provisioners, WorkDirectory: tempDir, - Tracer: coderAPI.TracerProvider, + TracerProvider: coderAPI.TracerProvider, + Metrics: &metrics, }), nil } diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index ec8cfe80e3a08..4048a4ad4212f 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -33,11 +33,6 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab go func() { defer ticker.Stop() for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - } apiKeys, err := db.GetAPIKeysLastUsedAfter(ctx, database.Now().Add(-1*time.Hour)) if err != nil { continue @@ -47,6 +42,12 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab distinctUsers[apiKey.UserID] = struct{}{} } gauge.Set(float64(len(distinctUsers))) + + select { + case <-ctx.Done(): + return + case <-ticker.C: + } } }() return cancelFunc, nil @@ -77,11 +78,6 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa go func() { defer ticker.Stop() for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - } builds, err := db.GetLatestWorkspaceBuilds(ctx) if err != nil { continue @@ -100,6 +96,12 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa status := coderd.ConvertProvisionerJobStatus(job) gauge.WithLabelValues(string(status)).Add(1) } + + select { + case <-ctx.Done(): + return + case <-ticker.C: + } } }() return cancelFunc, nil diff --git a/provisionerd/provisionerd.go b/provisionerd/provisionerd.go index fc8edab3dea2b..fa71ed3057a03 100644 --- a/provisionerd/provisionerd.go +++ b/provisionerd/provisionerd.go @@ -11,6 +11,8 @@ import ( "time" "github.com/hashicorp/yamux" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" "github.com/spf13/afero" "go.opentelemetry.io/otel/attribute" semconv "go.opentelemetry.io/otel/semconv/v1.11.0" @@ -41,9 +43,10 @@ type Provisioners map[string]sdkproto.DRPCProvisionerClient // Options provides customizations to the behavior of a provisioner daemon. type Options struct { - Filesystem afero.Fs - Logger slog.Logger - Tracer trace.TracerProvider + Filesystem afero.Fs + Logger slog.Logger + TracerProvider trace.TracerProvider + Metrics *Metrics ForceCancelInterval time.Duration UpdateInterval time.Duration @@ -66,14 +69,19 @@ func New(clientDialer Dialer, opts *Options) *Server { if opts.Filesystem == nil { opts.Filesystem = afero.NewOsFs() } - if opts.Tracer == nil { - opts.Tracer = trace.NewNoopTracerProvider() + if opts.TracerProvider == nil { + opts.TracerProvider = trace.NewNoopTracerProvider() + } + if opts.Metrics == nil { + reg := prometheus.NewRegistry() + mets := NewMetrics(reg) + opts.Metrics = &mets } ctx, ctxCancel := context.WithCancel(context.Background()) daemon := &Server{ opts: opts, - tracer: opts.Tracer.Tracer(tracing.TracerName), + tracer: opts.TracerProvider.Tracer(tracing.TracerName), clientDialer: clientDialer, @@ -103,6 +111,42 @@ type Server struct { activeJob *runner.Runner } +type Metrics struct { + Runner runner.Metrics +} + +func NewMetrics(reg prometheus.Registerer) Metrics { + auto := promauto.With(reg) + durationToFloatMs := func(d time.Duration) float64 { + return float64(d.Milliseconds()) + } + + return Metrics{ + Runner: runner.Metrics{ + ConcurrentJobs: auto.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "provisionerd", + Name: "jobs_current", + }, []string{"provisioner"}), + JobTimings: auto.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "coderd", + Subsystem: "provisionerd", + Name: "job_timings_ms", + Buckets: []float64{ + durationToFloatMs(1 * time.Second), + durationToFloatMs(10 * time.Second), + durationToFloatMs(30 * time.Second), + durationToFloatMs(1 * time.Minute), + durationToFloatMs(5 * time.Minute), + durationToFloatMs(10 * time.Minute), + durationToFloatMs(30 * time.Minute), + durationToFloatMs(1 * time.Hour), + }, + }, []string{"provisioner", "status"}), + }, + } +} + // Connect establishes a connection to coderd. func (p *Server) connect(ctx context.Context) { // An exponential back-off occurs when the connection is failing to dial. @@ -282,6 +326,7 @@ func (p *Server) acquireJob(ctx context.Context) { p.opts.UpdateInterval, p.opts.ForceCancelInterval, p.tracer, + p.opts.Metrics.Runner, ) go p.activeJob.Run() diff --git a/provisionerd/runner/runner.go b/provisionerd/runner/runner.go index 319d0ff5750e5..13aafdbb1d373 100644 --- a/provisionerd/runner/runner.go +++ b/provisionerd/runner/runner.go @@ -16,6 +16,7 @@ import ( "time" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" "github.com/spf13/afero" "go.opentelemetry.io/otel/codes" semconv "go.opentelemetry.io/otel/semconv/v1.11.0" @@ -34,6 +35,7 @@ const ( type Runner struct { tracer trace.Tracer + metrics Metrics job *proto.AcquiredJob sender JobUpdater logger slog.Logger @@ -65,6 +67,12 @@ type Runner struct { okToSend bool } +type Metrics struct { + ConcurrentJobs *prometheus.GaugeVec + // JobTimings also counts the total amount of jobs. + JobTimings *prometheus.HistogramVec +} + type JobUpdater interface { UpdateJob(ctx context.Context, in *proto.UpdateJobRequest) (*proto.UpdateJobResponse, error) FailJob(ctx context.Context, in *proto.FailedJob) error @@ -82,6 +90,7 @@ func NewRunner( updateInterval time.Duration, forceCancelInterval time.Duration, tracer trace.Tracer, + metrics Metrics, ) *Runner { m := new(sync.Mutex) @@ -91,6 +100,7 @@ func NewRunner( return &Runner{ tracer: tracer, + metrics: metrics, job: job, sender: updater, logger: logger.With(slog.F("job_id", job.JobId)), @@ -120,9 +130,22 @@ func NewRunner( // that goroutine on the context passed into Fail(), and it marks okToSend false to signal us here // that this function should not also send a terminal message. func (r *Runner) Run() { + start := time.Now() ctx, span := r.startTrace(r.notStopped, tracing.FuncName()) defer span.End() + concurrentGauge := r.metrics.ConcurrentJobs.WithLabelValues(r.job.Provisioner) + concurrentGauge.Inc() + defer func() { + status := "success" + if r.failedJob != nil { + status = "failed" + } + + concurrentGauge.Dec() + r.metrics.JobTimings.WithLabelValues(r.job.Provisioner, status).Observe(float64(time.Since(start).Milliseconds())) + }() + r.mutex.Lock() defer r.mutex.Unlock() defer r.stop() From 4212ccf667915dca345d335257e80d7ca6cc9c7d Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 4 Nov 2022 17:06:02 -0500 Subject: [PATCH 2/4] fixup! Merge branch 'main' into colin/pd-metrics --- cli/server.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cli/server.go b/cli/server.go index a54e78368e68d..88f787b9589f3 100644 --- a/cli/server.go +++ b/cli/server.go @@ -516,14 +516,16 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co } defer closeUsersFunc() - closeWorkspacesFunc, err := prometheusmetrics.Workspaces(ctx, options.PrometheusRegisterer, options.Database, 0) + closeWorkspacesFunc, err := prometheusmetrics.Workspaces(ctx, options.PrometheusRegistry, options.Database, 0) if err != nil { return xerrors.Errorf("register workspaces prometheus metric: %w", err) } defer closeWorkspacesFunc() //nolint:revive - defer serveHandler(ctx, logger, promhttp.Handler(), cfg.Prometheus.Address.Value, "prometheus")() + defer serveHandler(ctx, logger, promhttp.InstrumentMetricHandler( + options.PrometheusRegistry, promhttp.HandlerFor(options.PrometheusRegistry, promhttp.HandlerOpts{}), + ), cfg.Prometheus.Address.Value, "prometheus")() } // We use a separate coderAPICloser so the Enterprise API From b938110b6405e3e6a344b26cca1b8c78d804bd6a Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 4 Nov 2022 17:57:50 -0500 Subject: [PATCH 3/4] revert prometheusmetrics.go --- coderd/prometheusmetrics/prometheusmetrics.go | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 4048a4ad4212f..788feaf54abf1 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -33,6 +33,12 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab go func() { defer ticker.Stop() for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + apiKeys, err := db.GetAPIKeysLastUsedAfter(ctx, database.Now().Add(-1*time.Hour)) if err != nil { continue @@ -43,11 +49,6 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab } gauge.Set(float64(len(distinctUsers))) - select { - case <-ctx.Done(): - return - case <-ticker.C: - } } }() return cancelFunc, nil @@ -78,6 +79,12 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa go func() { defer ticker.Stop() for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + builds, err := db.GetLatestWorkspaceBuilds(ctx) if err != nil { continue @@ -96,12 +103,6 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa status := coderd.ConvertProvisionerJobStatus(job) gauge.WithLabelValues(string(status)).Add(1) } - - select { - case <-ctx.Done(): - return - case <-ticker.C: - } } }() return cancelFunc, nil From a520a170f207c5d48d98e2befe61ff6ad15378a9 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 4 Nov 2022 17:58:24 -0500 Subject: [PATCH 4/4] fixup! revert prometheusmetrics.go --- coderd/prometheusmetrics/prometheusmetrics.go | 1 - 1 file changed, 1 deletion(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 788feaf54abf1..536522bf73e04 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -48,7 +48,6 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab distinctUsers[apiKey.UserID] = struct{}{} } gauge.Set(float64(len(distinctUsers))) - } }() return cancelFunc, nil