@@ -3,7 +3,6 @@ package prometheusmetrics
3
3
import (
4
4
"context"
5
5
"fmt"
6
- "log"
7
6
"strconv"
8
7
"strings"
9
8
"sync/atomic"
@@ -13,8 +12,11 @@ import (
13
12
"github.com/prometheus/client_golang/prometheus"
14
13
"tailscale.com/tailcfg"
15
14
15
+ "cdr.dev/slog"
16
+
16
17
"github.com/coder/coder/coderd"
17
18
"github.com/coder/coder/coderd/database"
19
+ "github.com/coder/coder/coderd/database/dbauthz"
18
20
"github.com/coder/coder/tailnet"
19
21
)
20
22
@@ -115,119 +117,134 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
115
117
}
116
118
117
119
// Agents tracks workspace agents: the number of agents per workspace, their connection statuses, and their connection latencies.
118
- func Agents (ctx context.Context , registerer prometheus.Registerer , db database.Store , coordinator * atomic.Pointer [tailnet.Coordinator ], derpMap * tailcfg.DERPMap , duration time.Duration ) (context.CancelFunc , error ) {
120
+ func Agents (ctx context.Context , logger slog. Logger , registerer prometheus.Registerer , db database.Store , coordinator * atomic.Pointer [tailnet.Coordinator ], derpMap * tailcfg.DERPMap , agentInactiveDisconnectTimeout , duration time.Duration ) (context.CancelFunc , error ) {
119
121
if duration == 0 {
120
122
duration = 15 * time .Second // TODO 5 * time.Minute
121
123
}
122
124
123
- agentsConnectionGauge := prometheus .NewGaugeVec (prometheus.GaugeOpts {
125
+ workspaceAgentsGauge := prometheus .NewGaugeVec (prometheus.GaugeOpts {
124
126
Namespace : "coderd" ,
125
127
Subsystem : "agents" ,
126
- Name : "connection " ,
127
- Help : "The agent connection with a status ." ,
128
- }, []string {"agent_name " , "workspace_name" , "status " })
129
- err := registerer .Register (agentsConnectionGauge )
128
+ Name : "up " ,
129
+ Help : "The number of active agents per workspace ." ,
130
+ }, []string {"username " , "workspace_name" })
131
+ err := registerer .Register (workspaceAgentsGauge )
130
132
if err != nil {
131
133
return nil , err
132
134
}
133
135
134
- agentsUserLatenciesGauge := prometheus .NewGaugeVec (prometheus.GaugeOpts {
136
+ agentsConnectionGauge := prometheus .NewGaugeVec (prometheus.GaugeOpts {
135
137
Namespace : "coderd" ,
136
138
Subsystem : "agents" ,
137
- Name : "user_latencies_seconds " ,
138
- Help : "The user's agent latency in seconds ." ,
139
- }, []string {"agent_id " , "workspace_name" , "derp_region " , "preferred " })
140
- err = registerer .Register (agentsUserLatenciesGauge )
139
+ Name : "connections " ,
140
+ Help : "Agent connections with statuses ." ,
141
+ }, []string {"agent_name " , "username" , " workspace_name" , "status " , "lifecycle_state" , "tailnet_node " })
142
+ err = registerer .Register (agentsConnectionGauge )
141
143
if err != nil {
142
144
return nil , err
143
145
}
144
146
145
- // FIXME connection_type ide
147
+ agentsConnectionLatenciesGauge := prometheus .NewGaugeVec (prometheus.GaugeOpts {
148
+ Namespace : "coderd" ,
149
+ Subsystem : "agents" ,
150
+ Name : "connection_latencies_seconds" ,
151
+ Help : "Agent connection latencies in seconds." ,
152
+ }, []string {"agent_id" , "username" , "workspace_name" , "derp_region" , "preferred" })
153
+ err = registerer .Register (agentsConnectionLatenciesGauge )
154
+ if err != nil {
155
+ return nil , err
156
+ }
146
157
147
- ctx , cancelFunc := context .WithCancel (ctx )
158
+ // nolint:gocritic // Prometheus must collect metrics for all Coder users.
159
+ ctx , cancelFunc := context .WithCancel (dbauthz .AsSystemRestricted (ctx ))
148
160
ticker := time .NewTicker (duration )
149
161
go func () {
150
162
defer ticker .Stop ()
151
163
for {
152
- log .Println ("Agents!!!" )
153
-
154
164
select {
155
165
case <- ctx .Done ():
156
166
return
157
167
case <- ticker .C :
158
168
}
159
169
160
- // FIXME Optimize this routine: SQL db calls
170
+ logger . Info ( ctx , "Collect agent metrics now" )
161
171
162
- builds , err := db .GetLatestWorkspaceBuilds (ctx )
172
+ workspaceRows , err := db .GetWorkspaces (ctx , database.GetWorkspacesParams {
173
+ AgentInactiveDisconnectTimeoutSeconds : int64 (agentInactiveDisconnectTimeout .Seconds ()),
174
+ })
163
175
if err != nil {
164
- log . Println ( "1 " , err )
176
+ logger . Error ( ctx , "can't get workspace rows " , slog . Error ( err ) )
165
177
continue
166
178
}
167
179
180
+ workspaceAgentsGauge .Reset ()
168
181
agentsConnectionGauge .Reset ()
169
- agentsUserLatenciesGauge .Reset ()
170
- for _ , build := range builds {
171
- workspace , err := db .GetWorkspaceByID (ctx , build .WorkspaceID )
182
+ agentsConnectionLatenciesGauge .Reset ()
183
+
184
+ for _ , workspace := range workspaceRows {
185
+ user , err := db .GetUserByID (ctx , workspace .OwnerID )
172
186
if err != nil {
173
- log .Println ("2" , err )
187
+ logger .Error (ctx , "can't get user" , slog .Error (err ), slog .F ("user_id" , workspace .OwnerID ))
188
+ workspaceAgentsGauge .WithLabelValues (user .Username , workspace .Name ).Add (0 )
174
189
continue
175
190
}
176
191
177
- agents , err := db .GetWorkspaceAgentsInLatestBuildByWorkspaceID (ctx , build . WorkspaceID )
192
+ agents , err := db .GetWorkspaceAgentsInLatestBuildByWorkspaceID (ctx , workspace . ID )
178
193
if err != nil {
179
- log .Println ("3" , err )
194
+ logger .Error (ctx , "can't get workspace agents" , slog .F ("workspace_name" , workspace .Name ), slog .Error (err ))
195
+ workspaceAgentsGauge .WithLabelValues (user .Username , workspace .Name ).Add (0 )
180
196
continue
181
197
}
182
198
183
199
if len (agents ) == 0 {
200
+ logger .Info (ctx , "workspace agents are unavailable" , slog .F ("workspace_name" , workspace .Name ))
201
+ workspaceAgentsGauge .WithLabelValues (user .Username , workspace .Name ).Add (0 )
184
202
continue
185
203
}
186
204
187
- // FIXME publish workspace even if no agents
188
-
189
205
for _ , agent := range agents {
190
- connectionStatus := agent .Status (6 * time .Second )
191
-
192
- // FIXME AgentInactiveDisconnectTimeout
193
- // ? connection_timeout_seconds
194
- // obok latency lifecycle_state
195
- log .Println ("with value " + agent .Name )
196
- agentsConnectionGauge .WithLabelValues (agent .Name , workspace .Name , string (connectionStatus .Status )).Set (1 )
206
+ // Collect information about agents
207
+ workspaceAgentsGauge .WithLabelValues (user .Username , workspace .Name ).Add (1 )
197
208
209
+ connectionStatus := agent .Status (agentInactiveDisconnectTimeout )
198
210
node := (* coordinator .Load ()).Node (agent .ID )
211
+
212
+ tailnetNode := "unknown"
199
213
if node != nil {
200
- log .Println ("coordinator" )
214
+ tailnetNode = node .ID .String ()
215
+ }
201
216
202
- for rawRegion , latency := range node .DERPLatency {
203
- log .Println (rawRegion , latency )
217
+ agentsConnectionGauge .WithLabelValues (agent .Name , user .Username , workspace .Name , string (connectionStatus .Status ), string (agent .LifecycleState ), tailnetNode ).Set (1 )
204
218
205
- regionParts := strings .SplitN (rawRegion , "-" , 2 )
206
- regionID , err := strconv .Atoi (regionParts [0 ])
207
- if err != nil {
208
- continue // xerrors.Errorf("convert derp region id %q: %w", rawRegion, err)
209
- }
210
- region , found := derpMap .Regions [regionID ]
211
- if ! found {
212
- // It's possible that a workspace agent is using an old DERPMap
213
- // and reports regions that do not exist. If that's the case,
214
- // report the region as unknown!
215
- region = & tailcfg.DERPRegion {
216
- RegionID : regionID ,
217
- RegionName : fmt .Sprintf ("Unnamed %d" , regionID ),
218
- }
219
- }
219
+ if node == nil {
220
+ logger .Info (ctx , "can't read in-memory node for agent" , slog .F ("workspace_name" , workspace .Name ), slog .F ("agent_name" , agent .Name ))
221
+ continue
222
+ }
220
223
221
- log .Println (region , latency )
222
- agentsUserLatenciesGauge .WithLabelValues (agent .Name , workspace .Name , region .RegionName , fmt .Sprintf ("%v" , node .PreferredDERP == regionID )).Set (latency )
224
+ // Collect information about connection latencies
225
+ for rawRegion , latency := range node .DERPLatency {
226
+ regionParts := strings .SplitN (rawRegion , "-" , 2 )
227
+ regionID , err := strconv .Atoi (regionParts [0 ])
228
+ if err != nil {
229
+ logger .Error (ctx , "can't convert DERP region" , slog .Error (err ), slog .F ("agent_name" , agent .Name ), slog .F ("raw_region" , rawRegion ))
230
+ continue
223
231
}
224
- } else {
225
- log .Println ("node is null" )
232
+ region , found := derpMap .Regions [regionID ]
233
+ if ! found {
234
+ // It's possible that a workspace agent is using an old DERPMap
235
+ // and reports regions that do not exist. If that's the case,
236
+ // report the region as unknown!
237
+ region = & tailcfg.DERPRegion {
238
+ RegionID : regionID ,
239
+ RegionName : fmt .Sprintf ("Unnamed %d" , regionID ),
240
+ }
241
+ }
242
+
243
+ agentsConnectionLatenciesGauge .WithLabelValues (agent .Name , user .Username , workspace .Name , region .RegionName , fmt .Sprintf ("%v" , node .PreferredDERP == regionID )).Set (latency )
226
244
}
227
245
228
- // FIXME publish agent even if DERP is missing
229
246
// FIXME IDE?
230
- // FIXME agent connection zero
247
+ // FIXME connection_type ide
231
248
}
232
249
}
233
250
}
0 commit comments