Skip to content

feat: expose agent metrics via Prometheus endpoint #7011

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Apr 7, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Use CachedGaugeVec
  • Loading branch information
mtojek committed Apr 6, 2023
commit b5d0581caec093a09ae40e0477af35e095202e82
80 changes: 80 additions & 0 deletions coderd/prometheusmetrics/collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package prometheusmetrics

import (
"sync"

"github.com/prometheus/client_golang/prometheus"
)

// CachedGaugeVec wraps a prometheus.GaugeVec and buffers updates to it.
//
// WithLabelValues does not touch the wrapped vector; it only queues the
// requested operation. Commit resets the wrapped vector and replays the queued
// operations onto it, so Collect keeps returning the values of the previous
// Commit until the next one finishes. After Commit the queue is empty, and the
// next round of WithLabelValues calls starts from a fully reset metric.
type CachedGaugeVec struct {
// m guards records and keeps Collect from running while Commit rebuilds
// gaugeVec.
m sync.Mutex

// gaugeVec is the wrapped vector that Describe and Collect delegate to.
gaugeVec *prometheus.GaugeVec
// records holds the operations queued by WithLabelValues since the last
// Commit, in call order.
records []vectorRecord
}
Comment on lines +19 to +24
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you doc the usage? And the why?

Eg:

CachedGaugeVec does .....
Calling WithLabelValues will update the internal gauge value. The value will not be returned by 'Collect' until 'Commit' is called.
'Commit' will reset the internal value, requiring the next set of values to build upon a completely reset metric.

Or something...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added 👍


// Compile-time check that *CachedGaugeVec satisfies prometheus.Collector.
var _ prometheus.Collector = (*CachedGaugeVec)(nil)

// VectorOperation selects how a queued value is applied to its gauge when
// CachedGaugeVec.Commit replays it.
type VectorOperation int

const (
// VectorOperationAdd applies the value with the gauge's Add method.
VectorOperationAdd VectorOperation = iota
// VectorOperationSet applies the value with the gauge's Set method.
VectorOperationSet
)

// vectorRecord is one queued gauge update: which operation to perform, the
// value to apply, and the label values identifying the gauge within the vector.
type vectorRecord struct {
// operation decides whether value is added to or set on the gauge.
operation VectorOperation
// value is the operand passed to the gauge's Add or Set.
value float64
// labelValues are passed to GaugeVec.WithLabelValues to pick the gauge.
labelValues []string
}

// NewCachedGaugeVec returns a CachedGaugeVec that wraps gaugeVec and starts
// with no queued records.
func NewCachedGaugeVec(gaugeVec *prometheus.GaugeVec) *CachedGaugeVec {
	cached := new(CachedGaugeVec)
	cached.gaugeVec = gaugeVec
	return cached
}

// Describe implements prometheus.Collector by forwarding to the wrapped
// GaugeVec.
//
// No lock is taken: the mutex protects the queued records and the gauge values
// rebuilt by Commit, and Describe reads neither. Skipping the lock also avoids
// stalling WithLabelValues and Commit while this method is blocked sending on
// desc.
func (v *CachedGaugeVec) Describe(desc chan<- *prometheus.Desc) {
	v.gaugeVec.Describe(desc)
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This actually does not need the mutex. Describe is safe and does not return the counter, which is what you are protecting.

Describe is not really called much in prod, if ever, so it's not that big of a deal.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wasn't sure if Describe should be guarded or not, so thanks for raising it. I removed the mutex from the function.


// Collect implements prometheus.Collector by forwarding to the wrapped
// GaugeVec. It holds the mutex for the duration of the call, so it cannot run
// while Commit is resetting and repopulating the vector.
func (v *CachedGaugeVec) Collect(ch chan<- prometheus.Metric) {
v.m.Lock()
defer v.m.Unlock()

v.gaugeVec.Collect(ch)
}

// WithLabelValues queues an update for the gauge identified by labelValues.
// The wrapped vector is not modified here; the update is applied, in call
// order, the next time Commit runs.
//
// operation must be VectorOperationAdd or VectorOperationSet; any other value
// panics immediately so the failure points at the faulty call site instead of
// surfacing later inside Commit.
func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float64, labelValues ...string) {
	switch operation {
	case VectorOperationAdd, VectorOperationSet:
	default:
		panic("unsupported vector operation")
	}

	// Detach from the caller's backing array: the record outlives this call,
	// and a caller that passed `labels...` could otherwise mutate it before
	// Commit replays it.
	labels := append([]string(nil), labelValues...)

	v.m.Lock()
	defer v.m.Unlock()

	v.records = append(v.records, vectorRecord{
		operation:   operation,
		value:       value,
		labelValues: labels,
	})
}

// Commit publishes the updates queued by WithLabelValues. Under the mutex it
// resets the wrapped GaugeVec, replays every queued record in call order, and
// then clears the queue, so the next round of WithLabelValues calls builds on
// an empty set of records. It panics if a record carries an operation other
// than VectorOperationAdd or VectorOperationSet.
func (v *CachedGaugeVec) Commit() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// Commit will set the internal value as the cached value to return from 'Collect'.
// The internal metric value is completely reset, so the caller should expect
// the gauge to be empty for the next 'WithLabelValues' values.

Suggested change
func (v *CachedGaugeVec) Commit() {
func (v *CachedGaugeVec) Commit() {

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment added.

v.m.Lock()
defer v.m.Unlock()

// Drop every gauge from the previous commit so label sets that were not
// queued this round disappear from the output.
v.gaugeVec.Reset()
for _, record := range v.records {
g := v.gaugeVec.WithLabelValues(record.labelValues...)
switch record.operation {
case VectorOperationAdd:
g.Add(record.value)
case VectorOperationSet:
g.Set(record.value)
default:
panic("unsupported vector operation")
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might want this switch statement on the WithLabelValues call so the panic is closer to the source.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This switch is also useful to pick the right operation, so I will add another switch-case-panic to WithLabelValues.

}

// Start the next collection round with an empty queue.
v.records = nil
}
40 changes: 20 additions & 20 deletions coderd/prometheusmetrics/prometheusmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,45 +124,45 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
duration = 1 * time.Minute
}

agentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
agentsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "agents",
Name: "up",
Help: "The number of active agents per workspace.",
}, []string{"username", "workspace_name"})
}, []string{"username", "workspace_name"}))
err := registerer.Register(agentsGauge)
if err != nil {
return nil, err
}

agentsConnectionsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
agentsConnectionsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "agents",
Name: "connections",
Help: "Agent connections with statuses.",
}, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"})
}, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"}))
err = registerer.Register(agentsConnectionsGauge)
if err != nil {
return nil, err
}

agentsConnectionLatenciesGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
agentsConnectionLatenciesGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "agents",
Name: "connection_latencies_seconds",
Help: "Agent connection latencies in seconds.",
}, []string{"agent_id", "username", "workspace_name", "derp_region", "preferred"})
}, []string{"agent_id", "username", "workspace_name", "derp_region", "preferred"}))
err = registerer.Register(agentsConnectionLatenciesGauge)
if err != nil {
return nil, err
}

agentsAppsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
agentsAppsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "agents",
Name: "apps",
Help: "Agent applications with statuses.",
}, []string{"agent_name", "username", "workspace_name", "app_name", "health"})
}, []string{"agent_name", "username", "workspace_name", "app_name", "health"}))
err = registerer.Register(agentsAppsGauge)
if err != nil {
return nil, err
Expand Down Expand Up @@ -203,35 +203,30 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
continue
}

agentsGauge.Reset()
agentsConnectionsGauge.Reset()
agentsConnectionLatenciesGauge.Reset()
agentsAppsGauge.Reset()

for _, workspace := range workspaceRows {
user, err := db.GetUserByID(ctx, workspace.OwnerID)
if err != nil {
logger.Error(ctx, "can't get user", slog.F("user_id", workspace.OwnerID), slog.Error(err))
agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0)
agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name)
continue
}

agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID)
if err != nil {
logger.Error(ctx, "can't get workspace agents", slog.F("workspace_id", workspace.ID), slog.Error(err))
agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0)
agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name)
continue
}

if len(agents) == 0 {
logger.Debug(ctx, "workspace agents are unavailable", slog.F("workspace_id", workspace.ID))
agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0)
agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name)
continue
}

for _, agent := range agents {
// Collect information about agents
agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(1)
agentsGauge.WithLabelValues(VectorOperationAdd, 1, user.Username, workspace.Name)

connectionStatus := agent.Status(agentInactiveDisconnectTimeout)
node := (*coordinator.Load()).Node(agent.ID)
Expand All @@ -241,7 +236,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
tailnetNode = node.ID.String()
}

agentsConnectionsGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode).Set(1)
agentsConnectionsGauge.WithLabelValues(VectorOperationSet, 1, agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode)

if node == nil {
logger.Debug(ctx, "can't read in-memory node for agent", slog.F("agent_id", agent.ID))
Expand All @@ -266,7 +261,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
}
}

agentsConnectionLatenciesGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency)
agentsConnectionLatenciesGauge.WithLabelValues(VectorOperationSet, latency, agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID))
}
}

Expand All @@ -278,11 +273,16 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
}

for _, app := range apps {
agentsAppsGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, app.DisplayName, string(app.Health)).Add(1)
agentsAppsGauge.WithLabelValues(VectorOperationAdd, 1, agent.Name, user.Username, workspace.Name, app.DisplayName, string(app.Health))
}
}
}

agentsGauge.Commit()
agentsConnectionsGauge.Commit()
agentsConnectionLatenciesGauge.Commit()
agentsAppsGauge.Commit()

logger.Debug(ctx, "Agent metrics collection is done")
metricsCollectorAgents.Observe(timer.ObserveDuration().Seconds())
}
Expand Down