Skip to content

feat: expose agent metrics via Prometheus endpoint #7011

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Apr 7, 2023
9 changes: 9 additions & 0 deletions cli/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -889,6 +889,15 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
return xerrors.Errorf("create coder API: %w", err)
}

if cfg.Prometheus.Enable {
// Agent metrics require reference to the tailnet coordinator, so must be initiated after Coder API.
closeAgentsFunc, err := prometheusmetrics.Agents(ctx, logger, options.PrometheusRegistry, coderAPI.Database, &coderAPI.TailnetCoordinator, options.DERPMap, coderAPI.Options.AgentInactiveDisconnectTimeout, 0)
if err != nil {
return xerrors.Errorf("register agents prometheus metric: %w", err)
}
defer closeAgentsFunc()
}

client := codersdk.New(localURL)
if localURL.Scheme == "https" && isLocalhost(localURL.Hostname()) {
// The certificate will likely be self-signed or for a different
Expand Down
95 changes: 95 additions & 0 deletions coderd/prometheusmetrics/collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package prometheusmetrics

import (
"sync"

"github.com/prometheus/client_golang/prometheus"
)

// CachedGaugeVec is a wrapper for the prometheus.GaugeVec which allows
// for staging changes in the metrics vector. Calling "WithLabelValues(...)"
// only records the requested change; it will not be returned by
// "Collect(...)" until the "Commit()" method is called. The "Commit()" method
// resets the internal gauge and applies all staged changes to it.
//
// CachedGaugeVec is recommended for use cases where there is a risk
// that the Prometheus collector receives incomplete metrics, collected
// in the middle of metrics recalculation, between "Reset()" and the last
// "WithLabelValues()" call.
type CachedGaugeVec struct {
// m is locked by Collect, WithLabelValues and Commit, serializing access
// to gaugeVec and records.
m sync.Mutex

// gaugeVec holds the committed values exposed through Describe and Collect.
gaugeVec *prometheus.GaugeVec
// records is the list of staged operations waiting for the next Commit.
records []vectorRecord
}
Comment on lines +19 to +24
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you doc the usage? And the why?

Eg:

CachedGaugeVec does .....
Calling WithLabelValues will update the internal gauge value. The value will not be returned by 'Collect' until 'Commit' is called.
'Commit' will reset the internal value, requiring the next set of values to build upon a completely reset metric.

Or something...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added 👍


// Compile-time check that *CachedGaugeVec satisfies prometheus.Collector.
var _ prometheus.Collector = (*CachedGaugeVec)(nil)

// VectorOperation selects how a staged value is applied to its gauge when
// the staged records are replayed by CachedGaugeVec.Commit.
type VectorOperation int

const (
// VectorOperationAdd adds the staged value to the gauge's current value.
VectorOperationAdd VectorOperation = iota
// VectorOperationSet overwrites the gauge with the staged value.
VectorOperationSet
)

// vectorRecord is a single staged change: an operation, the value to apply,
// and the label values selecting the gauge it applies to.
type vectorRecord struct {
// operation decides whether value is added to or set on the gauge.
operation VectorOperation
// value is the operand of the operation.
value float64
// labelValues is passed to GaugeVec.WithLabelValues to select the gauge.
labelValues []string
}

// NewCachedGaugeVec returns a CachedGaugeVec that stages changes for, and
// publishes them through, the given gauge vector. No changes are staged
// initially.
func NewCachedGaugeVec(gaugeVec *prometheus.GaugeVec) *CachedGaugeVec {
	cached := new(CachedGaugeVec)
	cached.gaugeVec = gaugeVec
	return cached
}

// Describe forwards to the wrapped gauge vector's Describe, sending its
// descriptors to desc. Part of the prometheus.Collector interface.
func (v *CachedGaugeVec) Describe(desc chan<- *prometheus.Desc) {
v.gaugeVec.Describe(desc)
}

// Collect sends the committed metrics of the wrapped gauge vector to ch.
// Part of the prometheus.Collector interface.
//
// The mutex is held for the whole call; Commit takes the same mutex, so a
// collection cannot run while Commit is resetting and refilling the gauge.
func (v *CachedGaugeVec) Collect(ch chan<- prometheus.Metric) {
v.m.Lock()
defer v.m.Unlock()

v.gaugeVec.Collect(ch)
}

// WithLabelValues stages a change for the gauge selected by labelValues.
// The change is only recorded here; it becomes visible to Collect after the
// next call to Commit.
//
// It panics if operation is neither VectorOperationAdd nor
// VectorOperationSet. The labelValues slice is stored as passed (not copied).
func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float64, labelValues ...string) {
switch operation {
case VectorOperationAdd:
case VectorOperationSet:
default:
Comment on lines +59 to +62
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I usually prefer this. But it does not matter.

switch operation {
	case VectorOperationAdd, VectorOperationSet:
	default:
}

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

panic("unsupported vector operation")
}

v.m.Lock()
defer v.m.Unlock()

// Append to the staging list; Commit replays it in insertion order.
v.records = append(v.records, vectorRecord{
operation: operation,
value: value,
labelValues: labelValues,
})
}

// Commit publishes every change staged through "WithLabelValues()" so that
// it is returned by "Collect()". The underlying gauge vector is reset first,
// then the staged records are replayed in the order they were recorded, and
// finally the staging list is cleared, so the next round of
// "WithLabelValues()" calls starts from an empty set of staged changes.
func (v *CachedGaugeVec) Commit() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// Commit will set the internal value as the cached value to return from 'Collect'.
// The internal metric value is completely reset, so the caller should expect
// the gauge to be empty for the next 'WithLabelValues' values.

Suggested change
func (v *CachedGaugeVec) Commit() {
func (v *CachedGaugeVec) Commit() {

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment added.

v.m.Lock()
defer v.m.Unlock()

// Drop all previously committed series; only staged records survive.
v.gaugeVec.Reset()
for _, record := range v.records {
g := v.gaugeVec.WithLabelValues(record.labelValues...)
switch record.operation {
case VectorOperationAdd:
g.Add(record.value)
case VectorOperationSet:
g.Set(record.value)
}
}

// Clear the staging list now that it has been applied.
v.records = nil
}
140 changes: 140 additions & 0 deletions coderd/prometheusmetrics/collector_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
package prometheusmetrics_test

import (
"sort"
"testing"

"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/coder/coder/coderd/prometheusmetrics"
)

// TestCollector_Add verifies that several staged Add operations for the same
// label set are summed into a single gauge value once Commit is called.
func TestCollector_Add(t *testing.T) {
	t.Parallel()

	// given
	agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "up",
		Help:      "The number of active agents per workspace.",
	}, []string{"username", "workspace_name"}))

	// when
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 23, "second user", "your workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 1, "first user", "my workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 25, "second user", "your workspace")
	agentsGauge.Commit()

	// then
	// collectAndSortMetrics performs the collection itself, so no separate
	// Collect call into a throwaway channel is needed here.
	metrics := collectAndSortMetrics(t, agentsGauge, 2)

	assert.Equal(t, "first user", metrics[0].Label[0].GetValue())   // Username
	assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name
	assert.Equal(t, 8, int(metrics[0].Gauge.GetValue()))            // Metric value

	assert.Equal(t, "second user", metrics[1].Label[0].GetValue())    // Username
	assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name
	assert.Equal(t, 48, int(metrics[1].Gauge.GetValue()))             // Metric value
}

// TestCollector_Set verifies that when several Set operations are staged for
// the same label set, the last staged value wins after Commit.
func TestCollector_Set(t *testing.T) {
	t.Parallel()

	// given
	agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "up",
		Help:      "The number of active agents per workspace.",
	}, []string{"username", "workspace_name"}))

	// when
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 3, "first user", "my workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 6, "second user", "your workspace")
	agentsGauge.Commit()

	// then
	// collectAndSortMetrics performs the collection itself, so no separate
	// Collect call into a throwaway channel is needed here.
	metrics := collectAndSortMetrics(t, agentsGauge, 2)

	assert.Equal(t, "first user", metrics[0].Label[0].GetValue())   // Username
	assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name
	assert.Equal(t, 5, int(metrics[0].Gauge.GetValue()))            // Metric value

	assert.Equal(t, "second user", metrics[1].Label[0].GetValue())    // Username
	assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name
	assert.Equal(t, 6, int(metrics[1].Gauge.GetValue()))              // Metric value
}

// TestCollector_Set_Add verifies that mixed Add and Set operations are
// replayed in staging order: a Set discards the sum accumulated before it and
// later Adds build on the value that was set.
func TestCollector_Set_Add(t *testing.T) {
	t.Parallel()

	// given
	agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "up",
		Help:      "The number of active agents per workspace.",
	}, []string{"username", "workspace_name"}))

	// when
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 9, "first user", "my workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 8, "second user", "your workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 6, "second user", "your workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 3, "first user", "my workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 2, "second user", "your workspace")
	agentsGauge.Commit()

	// then
	// collectAndSortMetrics performs the collection itself, so no separate
	// Collect call into a throwaway channel is needed here.
	metrics := collectAndSortMetrics(t, agentsGauge, 2)

	assert.Equal(t, "first user", metrics[0].Label[0].GetValue())   // Username
	assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name
	assert.Equal(t, 8, int(metrics[0].Gauge.GetValue()))            // Metric value (5 set, then +3)

	assert.Equal(t, "second user", metrics[1].Label[0].GetValue())    // Username
	assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name
	assert.Equal(t, 6, int(metrics[1].Gauge.GetValue()))              // Metric value (4 set, then +2)
}

// collectAndSortMetrics collects the metrics emitted by collector, decodes
// each into its protobuf representation and returns them sorted by the value
// of their first label, so callers can index the result deterministically.
//
// count is the number of metrics the collector is expected to emit; it sizes
// the channel buffer, and the test fails if a different number is received.
// A collector emitting more than count metrics blocks in Collect, because
// nothing drains the channel concurrently.
func collectAndSortMetrics(t *testing.T, collector prometheus.Collector, count int) []dto.Metric {
	t.Helper()

	ch := make(chan prometheus.Metric, count)
	collector.Collect(ch)
	// Collect returns only after the last metric has been sent, so the
	// channel can be closed and drained. Ranging over it (instead of
	// receiving exactly count times) turns "too few metrics" into a test
	// failure rather than a hang.
	close(ch)

	metrics := make([]dto.Metric, 0, count)
	for m := range ch {
		// Decode straight into the slice element to avoid copying a
		// protobuf message by value.
		metrics = append(metrics, dto.Metric{})
		err := m.Write(&metrics[len(metrics)-1])
		require.NoError(t, err)
	}
	require.Len(t, metrics, count)

	// Ensure always the same order of metrics. The comparison must be a
	// strict "less than" on the same label of both elements.
	sort.Slice(metrics, func(i, j int) bool {
		return metrics[i].Label[0].GetValue() < metrics[j].Label[0].GetValue()
	})
	return metrics
}
Loading