Skip to content

Commit 0c0df91

Browse files
committed
more metrics
1 parent 092e1d0 commit 0c0df91

File tree

4 files changed

+59
-32
lines changed

4 files changed

+59
-32
lines changed

agent/agent.go

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -979,6 +979,7 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
979979

980980
func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, msg codersdk.WorkspaceAgentReconnectingPTYInit, conn net.Conn) (retErr error) {
981981
defer conn.Close()
982+
metricReconnectingPTYHandler.Add(1)
982983

983984
a.connCountReconnectingPTY.Add(1)
984985
defer a.connCountReconnectingPTY.Add(-1)
@@ -999,6 +1000,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
9991000
logger.Debug(ctx, "session error after agent close", slog.Error(err))
10001001
} else {
10011002
logger.Error(ctx, "session error", slog.Error(err))
1003+
metricReconnectingPTYError.Add(1)
10021004
}
10031005
}
10041006
logger.Debug(ctx, "session closed")
@@ -1018,6 +1020,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10181020
// Empty command will default to the users shell!
10191021
cmd, err := a.sshServer.CreateCommand(ctx, msg.Command, nil)
10201022
if err != nil {
1023+
metricReconnectingPTYCreateCommandError.Add(1)
10211024
return xerrors.Errorf("create command: %w", err)
10221025
}
10231026
cmd.Env = append(cmd.Env, "TERM=xterm-256color")
@@ -1030,6 +1033,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10301033

10311034
ptty, process, err := pty.Start(cmd)
10321035
if err != nil {
1036+
metricReconnectingPTYCmdStartError.Add(1)
10331037
return xerrors.Errorf("start command: %w", err)
10341038
}
10351039

@@ -1056,7 +1060,12 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10561060
if err != nil {
10571061
// When the PTY is closed, this is triggered.
10581062
// Error is typically a benign EOF, so only log for debugging.
1059-
logger.Debug(ctx, "unable to read pty output, command exited?", slog.Error(err))
1063+
if errors.Is(err, io.EOF) {
1064+
logger.Debug(ctx, "unable to read pty output, command exited?", slog.Error(err))
1065+
} else {
1066+
logger.Warn(ctx, "unable to read pty output, command exited?", slog.Error(err))
1067+
metricReconnectingPTYOutputReaderError.Add(1)
1068+
}
10601069
break
10611070
}
10621071
part := buffer[:read]
@@ -1071,11 +1080,12 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10711080
for cid, conn := range rpty.activeConns {
10721081
_, err = conn.Write(part)
10731082
if err != nil {
1074-
logger.Debug(ctx,
1083+
logger.Warn(ctx,
10751084
"error writing to active conn",
10761085
slog.F("other_conn_id", cid),
10771086
slog.Error(err),
10781087
)
1088+
metricReconnectingPTYWriteError.Add(1)
10791089
}
10801090
}
10811091
rpty.activeConnsMutex.Unlock()
@@ -1095,6 +1105,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10951105
if err != nil {
10961106
// We can continue after this, it's not fatal!
10971107
logger.Error(ctx, "resize", slog.Error(err))
1108+
metricReconnectingPTYResizeError.Add(1)
10981109
}
10991110
// Write any previously stored data for the TTY.
11001111
rpty.circularBufferMutex.RLock()
@@ -1107,6 +1118,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
11071118
// while also holding circularBufferMutex seems dangerous.
11081119
_, err = conn.Write(prevBuf)
11091120
if err != nil {
1121+
metricReconnectingPTYWriteError.Add(1)
11101122
return xerrors.Errorf("write buffer to conn: %w", err)
11111123
}
11121124
// Multiple connections to the same TTY are permitted.
@@ -1157,6 +1169,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
11571169
_, err = rpty.ptty.InputWriter().Write([]byte(req.Data))
11581170
if err != nil {
11591171
logger.Warn(ctx, "write to pty", slog.Error(err))
1172+
metricReconnectingPTYInputWriterError.Add(1)
11601173
return nil
11611174
}
11621175
// Check if a resize needs to happen!
@@ -1167,6 +1180,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
11671180
if err != nil {
11681181
// We can continue after this, it's not fatal!
11691182
logger.Error(ctx, "resize", slog.Error(err))
1183+
metricReconnectingPTYResizeError.Add(1)
11701184
}
11711185
}
11721186
}

agent/agentssh/agentssh.go

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ func (s *Server) sessionHandler(session ssh.Session) {
209209
s.logger.Warn(ctx, "ssh session failed", slog.Error(err))
210210
// This exit code is designed to be unlikely to be confused for a legit exit code
211211
// from the process.
212-
metricsSessionError.Add(1)
212+
metricSessionError.Add(1)
213213
_ = session.Exit(MagicSessionErrorCode)
214214
return
215215
}
@@ -243,14 +243,14 @@ func (s *Server) sessionStart(session ssh.Session, extraEnv []string) (retErr er
243243

244244
cmd, err := s.CreateCommand(ctx, session.RawCommand(), env)
245245
if err != nil {
246-
metricsAgentCreateCommandError.Add(1)
246+
metricAgentCreateCommandError.Add(1)
247247
return err
248248
}
249249

250250
if ssh.AgentRequested(session) {
251251
l, err := ssh.NewAgentListener()
252252
if err != nil {
253-
metricsAgentListenerError.Add(1)
253+
metricAgentListenerError.Add(1)
254254
return xerrors.Errorf("new agent listener: %w", err)
255255
}
256256
defer l.Close()
@@ -266,27 +266,27 @@ func (s *Server) sessionStart(session ssh.Session, extraEnv []string) (retErr er
266266
}
267267

268268
func startNonPTYSession(session ssh.Session, cmd *exec.Cmd) error {
269-
metricsStartNonPTYSession.Add(1)
269+
metricStartNonPTYSession.Add(1)
270270

271271
cmd.Stdout = session
272272
cmd.Stderr = session.Stderr()
273273
// This blocks forever until stdin is received if we don't
274274
// use StdinPipe. It's unknown what causes this.
275275
stdinPipe, err := cmd.StdinPipe()
276276
if err != nil {
277-
metricsNonPTYStdinPipeError.Add(1)
277+
metricNonPTYStdinPipeError.Add(1)
278278
return xerrors.Errorf("create stdin pipe: %w", err)
279279
}
280280
go func() {
281281
_, err := io.Copy(stdinPipe, session)
282282
if err != nil {
283-
metricsNonPTYStdinIoCopyError.Add(1)
283+
metricNonPTYStdinIoCopyError.Add(1)
284284
}
285285
_ = stdinPipe.Close()
286286
}()
287287
err = cmd.Start()
288288
if err != nil {
289-
metricsNonPTYCmdStartError.Add(1)
289+
metricNonPTYCmdStartError.Add(1)
290290
return xerrors.Errorf("start: %w", err)
291291
}
292292
return cmd.Wait()
@@ -302,7 +302,7 @@ type ptySession interface {
302302
}
303303

304304
func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pty, windowSize <-chan ssh.Window) (retErr error) {
305-
metricsStartPTYSession.Add(1)
305+
metricStartPTYSession.Add(1)
306306

307307
ctx := session.Context()
308308
// Disable minimal PTY emulation set by gliderlabs/ssh (NL-to-CRNL).
@@ -315,7 +315,7 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt
315315
err := showMOTD(session, manifest.MOTDFile)
316316
if err != nil {
317317
s.logger.Error(ctx, "show MOTD", slog.Error(err))
318-
metricsPTYMotdError.Add(1)
318+
metricPTYMotdError.Add(1)
319319
}
320320
} else {
321321
s.logger.Warn(ctx, "metadata lookup failed, unable to show MOTD")
@@ -330,14 +330,14 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt
330330
pty.WithLogger(slog.Stdlib(ctx, s.logger, slog.LevelInfo)),
331331
))
332332
if err != nil {
333-
metricsPTYCmdStartError.Add(1)
333+
metricPTYCmdStartError.Add(1)
334334
return xerrors.Errorf("start command: %w", err)
335335
}
336336
defer func() {
337337
closeErr := ptty.Close()
338338
if closeErr != nil {
339339
s.logger.Warn(ctx, "failed to close tty", slog.Error(closeErr))
340-
metricsPTYCloseError.Add(1)
340+
metricPTYCloseError.Add(1)
341341
if retErr == nil {
342342
retErr = closeErr
343343
}
@@ -349,15 +349,15 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt
349349
// If the pty is closed, then command has exited, no need to log.
350350
if resizeErr != nil && !errors.Is(resizeErr, pty.ErrClosed) {
351351
s.logger.Warn(ctx, "failed to resize tty", slog.Error(resizeErr))
352-
metricsPTYResizeError.Add(1)
352+
metricPTYResizeError.Add(1)
353353
}
354354
}
355355
}()
356356

357357
go func() {
358358
_, err := io.Copy(ptty.InputWriter(), session)
359359
if err != nil {
360-
metricsPTYInputIoCopyError.Add(1)
360+
metricPTYInputIoCopyError.Add(1)
361361
}
362362
}()
363363

@@ -372,7 +372,7 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt
372372
n, err := io.Copy(session, ptty.OutputReader())
373373
s.logger.Debug(ctx, "copy output done", slog.F("bytes", n), slog.Error(err))
374374
if err != nil {
375-
metricsPTYOutputIoCopyError.Add(1)
375+
metricPTYOutputIoCopyError.Add(1)
376376
return xerrors.Errorf("copy error: %w", err)
377377
}
378378
// We've gotten all the output, but we need to wait for the process to
@@ -384,7 +384,7 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt
384384
// and not something to be concerned about. But, if it's something else, we should log it.
385385
if err != nil && !xerrors.As(err, &exitErr) {
386386
s.logger.Warn(ctx, "wait error", slog.Error(err))
387-
metricsPTYWaitError.Add(1)
387+
metricPTYWaitError.Add(1)
388388
}
389389
if err != nil {
390390
return xerrors.Errorf("process wait: %w", err)

agent/agentssh/metrics.go

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,25 +11,25 @@ var (
1111
metricX11Callback = clientmetric.NewCounter("ssh_x11_callback")
1212

1313
// Agent sessions
14-
metricsAgentCreateCommandError = clientmetric.NewCounter("ssh_agent_create_command_error")
15-
metricsAgentListenerError = clientmetric.NewCounter("ssh_agent_listener_error")
16-
metricsStartPTYSession = clientmetric.NewCounter("ssh_agent_start_pty_session")
17-
metricsStartNonPTYSession = clientmetric.NewCounter("ssh_agent_start_non_pty_session")
18-
metricsSessionError = clientmetric.NewCounter("ssh_session_error")
14+
metricAgentCreateCommandError = clientmetric.NewCounter("ssh_agent_create_command_error")
15+
metricAgentListenerError = clientmetric.NewCounter("ssh_agent_listener_error")
16+
metricStartPTYSession = clientmetric.NewCounter("ssh_agent_start_pty_session")
17+
metricStartNonPTYSession = clientmetric.NewCounter("ssh_agent_start_non_pty_session")
18+
metricSessionError = clientmetric.NewCounter("ssh_session_error")
1919

2020
// Non-PTY sessions
21-
metricsNonPTYStdinPipeError = clientmetric.NewCounter("ssh_non_pty_stdin_pipe_error")
22-
metricsNonPTYStdinIoCopyError = clientmetric.NewCounter("ssh_non_pty_stdin_io_copy_error")
23-
metricsNonPTYCmdStartError = clientmetric.NewCounter("ssh_non_pty_cmd_start_error")
21+
metricNonPTYStdinPipeError = clientmetric.NewCounter("ssh_non_pty_stdin_pipe_error")
22+
metricNonPTYStdinIoCopyError = clientmetric.NewCounter("ssh_non_pty_stdin_io_copy_error")
23+
metricNonPTYCmdStartError = clientmetric.NewCounter("ssh_non_pty_cmd_start_error")
2424

2525
// PTY sessions
26-
metricsPTYMotdError = clientmetric.NewCounter("ssh_pty_motd_error")
27-
metricsPTYCmdStartError = clientmetric.NewCounter("ssh_pty_cmd_start_error")
28-
metricsPTYCloseError = clientmetric.NewCounter("ssh_pty_close_error")
29-
metricsPTYResizeError = clientmetric.NewCounter("ssh_pty_resize_error")
30-
metricsPTYInputIoCopyError = clientmetric.NewCounter("ssh_pty_input_io_copy_error")
31-
metricsPTYOutputIoCopyError = clientmetric.NewCounter("ssh_pty_output_io_copy_error")
32-
metricsPTYWaitError = clientmetric.NewCounter("ssh_pty_wait_error")
26+
metricPTYMotdError = clientmetric.NewCounter("ssh_pty_motd_error")
27+
metricPTYCmdStartError = clientmetric.NewCounter("ssh_pty_cmd_start_error")
28+
metricPTYCloseError = clientmetric.NewCounter("ssh_pty_close_error")
29+
metricPTYResizeError = clientmetric.NewCounter("ssh_pty_resize_error")
30+
metricPTYInputIoCopyError = clientmetric.NewCounter("ssh_pty_input_io_copy_error")
31+
metricPTYOutputIoCopyError = clientmetric.NewCounter("ssh_pty_output_io_copy_error")
32+
metricPTYWaitError = clientmetric.NewCounter("ssh_pty_wait_error")
3333

3434
// SFTP
3535
metricSftpHandler = clientmetric.NewCounter("ssh_sftp_handler")

agent/metrics.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,19 @@ import (
99
"github.com/coder/coder/codersdk/agentsdk"
1010
)
1111

12+
var (
13+
// ReconnectingPTY
14+
metricReconnectingPTYHandler = clientmetric.NewCounter("ssh_reconnecting_pty_handler")
15+
metricReconnectingPTYError = clientmetric.NewCounter("ssh_reconnecting_pty_error")
16+
metricReconnectingPTYCreateCommandError = clientmetric.NewCounter("ssh_reconnecting_pty_create_command_error")
17+
metricReconnectingPTYCmdStartError = clientmetric.NewCounter("ssh_reconnecting_pty_cmd_start_error")
18+
metricReconnectingPTYReadError = clientmetric.NewCounter("ssh_reconnecting_pty_read_error")
19+
metricReconnectingPTYWriteError = clientmetric.NewCounter("ssh_reconnecting_pty_write_error")
20+
metricReconnectingPTYResizeError = clientmetric.NewCounter("ssh_reconnecting_pty_resize_error")
21+
metricReconnectingPTYOutputReaderError = clientmetric.NewCounter("ssh_reconnecting_pty_output_reader_error")
22+
metricReconnectingPTYInputWriterError = clientmetric.NewCounter("ssh_reconnecting_pty_input_writer_error")
23+
)
24+
1225
func collectMetrics() []agentsdk.AgentMetric {
1326
// Tailscale metrics
1427
metrics := clientmetric.Metrics()

0 commit comments

Comments
 (0)