Skip to content

Commit 8764f89

Browse files
committed
Agents
1 parent 440657c commit 8764f89

File tree

2 files changed

+92
-74
lines changed

2 files changed

+92
-74
lines changed

cli/server.go

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -849,16 +849,6 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
849849
defer options.Telemetry.Close()
850850
}
851851

852-
databaseStoreWithoutAuth := options.Database
853-
854-
// We use a separate coderAPICloser so the Enterprise API
855-
// can have its own close functions. This is cleaner
856-
// than abstracting the Coder API itself.
857-
coderAPI, coderAPICloser, err := newAPI(ctx, options)
858-
if err != nil {
859-
return xerrors.Errorf("create coder API: %w", err)
860-
}
861-
862852
// This prevents the pprof import from being accidentally deleted.
863853
_ = pprof.Handler
864854
if cfg.Pprof.Enable {
@@ -881,12 +871,6 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
881871
}
882872
defer closeWorkspacesFunc()
883873

884-
closeAgentsFunc, err := prometheusmetrics.Agents(ctx, options.PrometheusRegistry, databaseStoreWithoutAuth, &coderAPI.TailnetCoordinator, options.DERPMap, 0)
885-
if err != nil {
886-
return xerrors.Errorf("register agents prometheus metric: %w", err)
887-
}
888-
defer closeAgentsFunc()
889-
890874
//nolint:revive
891875
defer serveHandler(ctx, logger, promhttp.InstrumentMetricHandler(
892876
options.PrometheusRegistry, promhttp.HandlerFor(options.PrometheusRegistry, promhttp.HandlerOpts{}),
@@ -897,6 +881,23 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
897881
options.SwaggerEndpoint = cfg.Swagger.Enable.Value()
898882
}
899883

884+
// We use a separate coderAPICloser so the Enterprise API
885+
// can have its own close functions. This is cleaner
886+
// than abstracting the Coder API itself.
887+
coderAPI, coderAPICloser, err := newAPI(ctx, options)
888+
if err != nil {
889+
return xerrors.Errorf("create coder API: %w", err)
890+
}
891+
892+
if cfg.Prometheus.Enable {
893+
// Agent metrics require a reference to the tailnet coordinator, so they must be initialized after the Coder API.
894+
closeAgentsFunc, err := prometheusmetrics.Agents(ctx, logger, options.PrometheusRegistry, coderAPI.Database, &coderAPI.TailnetCoordinator, options.DERPMap, coderAPI.Options.AgentInactiveDisconnectTimeout, 0)
895+
if err != nil {
896+
return xerrors.Errorf("register agents prometheus metric: %w", err)
897+
}
898+
defer closeAgentsFunc()
899+
}
900+
900901
client := codersdk.New(localURL)
901902
if localURL.Scheme == "https" && isLocalhost(localURL.Hostname()) {
902903
// The certificate will likely be self-signed or for a different

coderd/prometheusmetrics/prometheusmetrics.go

Lines changed: 75 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package prometheusmetrics
33
import (
44
"context"
55
"fmt"
6-
"log"
76
"strconv"
87
"strings"
98
"sync/atomic"
@@ -13,8 +12,11 @@ import (
1312
"github.com/prometheus/client_golang/prometheus"
1413
"tailscale.com/tailcfg"
1514

15+
"cdr.dev/slog"
16+
1617
"github.com/coder/coder/coderd"
1718
"github.com/coder/coder/coderd/database"
19+
"github.com/coder/coder/coderd/database/dbauthz"
1820
"github.com/coder/coder/tailnet"
1921
)
2022

@@ -115,119 +117,134 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
115117
}
116118

117119
// Agents tracks workspace agent metrics: the number of agents per workspace, agent connection statuses, and agent connection latencies.
118-
func Agents(ctx context.Context, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, duration time.Duration) (context.CancelFunc, error) {
120+
func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (context.CancelFunc, error) {
119121
if duration == 0 {
120122
duration = 15 * time.Second // TODO 5 * time.Minute
121123
}
122124

123-
agentsConnectionGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
125+
workspaceAgentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
124126
Namespace: "coderd",
125127
Subsystem: "agents",
126-
Name: "connection",
127-
Help: "The agent connection with a status.",
128-
}, []string{"agent_name", "workspace_name", "status"})
129-
err := registerer.Register(agentsConnectionGauge)
128+
Name: "up",
129+
Help: "The number of active agents per workspace.",
130+
}, []string{"username", "workspace_name"})
131+
err := registerer.Register(workspaceAgentsGauge)
130132
if err != nil {
131133
return nil, err
132134
}
133135

134-
agentsUserLatenciesGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
136+
agentsConnectionGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
135137
Namespace: "coderd",
136138
Subsystem: "agents",
137-
Name: "user_latencies_seconds",
138-
Help: "The user's agent latency in seconds.",
139-
}, []string{"agent_id", "workspace_name", "derp_region", "preferred"})
140-
err = registerer.Register(agentsUserLatenciesGauge)
139+
Name: "connections",
140+
Help: "Agent connections with statuses.",
141+
}, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"})
142+
err = registerer.Register(agentsConnectionGauge)
141143
if err != nil {
142144
return nil, err
143145
}
144146

145-
// FIXME connection_type ide
147+
agentsConnectionLatenciesGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
148+
Namespace: "coderd",
149+
Subsystem: "agents",
150+
Name: "connection_latencies_seconds",
151+
Help: "Agent connection latencies in seconds.",
152+
}, []string{"agent_id", "username", "workspace_name", "derp_region", "preferred"})
153+
err = registerer.Register(agentsConnectionLatenciesGauge)
154+
if err != nil {
155+
return nil, err
156+
}
146157

147-
ctx, cancelFunc := context.WithCancel(ctx)
158+
// nolint:gocritic // Prometheus must collect metrics for all Coder users.
159+
ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx))
148160
ticker := time.NewTicker(duration)
149161
go func() {
150162
defer ticker.Stop()
151163
for {
152-
log.Println("Agents!!!")
153-
154164
select {
155165
case <-ctx.Done():
156166
return
157167
case <-ticker.C:
158168
}
159169

160-
// FIXME Optimize this routine: SQL db calls
170+
logger.Info(ctx, "Collect agent metrics now")
161171

162-
builds, err := db.GetLatestWorkspaceBuilds(ctx)
172+
workspaceRows, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{
173+
AgentInactiveDisconnectTimeoutSeconds: int64(agentInactiveDisconnectTimeout.Seconds()),
174+
})
163175
if err != nil {
164-
log.Println("1", err)
176+
logger.Error(ctx, "can't get workspace rows", slog.Error(err))
165177
continue
166178
}
167179

180+
workspaceAgentsGauge.Reset()
168181
agentsConnectionGauge.Reset()
169-
agentsUserLatenciesGauge.Reset()
170-
for _, build := range builds {
171-
workspace, err := db.GetWorkspaceByID(ctx, build.WorkspaceID)
182+
agentsConnectionLatenciesGauge.Reset()
183+
184+
for _, workspace := range workspaceRows {
185+
user, err := db.GetUserByID(ctx, workspace.OwnerID)
172186
if err != nil {
173-
log.Println("2", err)
187+
logger.Error(ctx, "can't get user", slog.Error(err), slog.F("user_id", workspace.OwnerID))
188+
workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0)
174189
continue
175190
}
176191

177-
agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, build.WorkspaceID)
192+
agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID)
178193
if err != nil {
179-
log.Println("3", err)
194+
logger.Error(ctx, "can't get workspace agents", slog.F("workspace_name", workspace.Name), slog.Error(err))
195+
workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0)
180196
continue
181197
}
182198

183199
if len(agents) == 0 {
200+
logger.Info(ctx, "workspace agents are unavailable", slog.F("workspace_name", workspace.Name))
201+
workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0)
184202
continue
185203
}
186204

187-
// FIXME publish workspace even if no agents
188-
189205
for _, agent := range agents {
190-
connectionStatus := agent.Status(6 * time.Second)
191-
192-
// FIXME AgentInactiveDisconnectTimeout
193-
// ? connection_timeout_seconds
194-
// obok latency lifecycle_state
195-
log.Println("with value " + agent.Name)
196-
agentsConnectionGauge.WithLabelValues(agent.Name, workspace.Name, string(connectionStatus.Status)).Set(1)
206+
// Collect information about agents
207+
workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(1)
197208

209+
connectionStatus := agent.Status(agentInactiveDisconnectTimeout)
198210
node := (*coordinator.Load()).Node(agent.ID)
211+
212+
tailnetNode := "unknown"
199213
if node != nil {
200-
log.Println("coordinator")
214+
tailnetNode = node.ID.String()
215+
}
201216

202-
for rawRegion, latency := range node.DERPLatency {
203-
log.Println(rawRegion, latency)
217+
agentsConnectionGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode).Set(1)
204218

205-
regionParts := strings.SplitN(rawRegion, "-", 2)
206-
regionID, err := strconv.Atoi(regionParts[0])
207-
if err != nil {
208-
continue // xerrors.Errorf("convert derp region id %q: %w", rawRegion, err)
209-
}
210-
region, found := derpMap.Regions[regionID]
211-
if !found {
212-
// It's possible that a workspace agent is using an old DERPMap
213-
// and reports regions that do not exist. If that's the case,
214-
// report the region as unknown!
215-
region = &tailcfg.DERPRegion{
216-
RegionID: regionID,
217-
RegionName: fmt.Sprintf("Unnamed %d", regionID),
218-
}
219-
}
219+
if node == nil {
220+
logger.Info(ctx, "can't read in-memory node for agent", slog.F("workspace_name", workspace.Name), slog.F("agent_name", agent.Name))
221+
continue
222+
}
220223

221-
log.Println(region, latency)
222-
agentsUserLatenciesGauge.WithLabelValues(agent.Name, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency)
224+
// Collect information about connection latencies
225+
for rawRegion, latency := range node.DERPLatency {
226+
regionParts := strings.SplitN(rawRegion, "-", 2)
227+
regionID, err := strconv.Atoi(regionParts[0])
228+
if err != nil {
229+
logger.Error(ctx, "can't convert DERP region", slog.Error(err), slog.F("agent_name", agent.Name), slog.F("raw_region", rawRegion))
230+
continue
223231
}
224-
} else {
225-
log.Println("node is null")
232+
region, found := derpMap.Regions[regionID]
233+
if !found {
234+
// It's possible that a workspace agent is using an old DERPMap
235+
// and reports regions that do not exist. If that's the case,
236+
// report the region as unknown!
237+
region = &tailcfg.DERPRegion{
238+
RegionID: regionID,
239+
RegionName: fmt.Sprintf("Unnamed %d", regionID),
240+
}
241+
}
242+
243+
agentsConnectionLatenciesGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency)
226244
}
227245

228-
// FIXME publish agent even if DERP is missing
229246
// FIXME IDE?
230-
// FIXME agent connection zero
247+
// FIXME connection_type ide
231248
}
232249
}
233250
}

0 commit comments

Comments
 (0)