Skip to content

feat: Collect agent SSH metrics #7584

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 42 commits into from
May 25, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
092e1d0
feat: Collect agent SSH metrics
mtojek May 17, 2023
0c0df91
more metrics
mtojek May 17, 2023
84b59ea
err
mtojek May 17, 2023
000586a
session metrics
mtojek May 18, 2023
11fb056
session error
mtojek May 18, 2023
07b2b0e
fix
mtojek May 18, 2023
e38a7be
fmt
mtojek May 18, 2023
ba4bb4d
WIP
mtojek May 18, 2023
0d0f300
Refactored to client_golang/prometheus
mtojek May 18, 2023
315b5ce
fix
mtojek May 18, 2023
43d5d40
fix
mtojek May 18, 2023
85f8860
refactor
mtojek May 18, 2023
7b26267
Merge branch 'main' into 6724-ssh-metrics
mtojek May 18, 2023
34f07fc
refactor
mtojek May 18, 2023
59fd585
fix test
mtojek May 18, 2023
8cd927c
fix
mtojek May 18, 2023
6eec4d7
fix
mtojek May 18, 2023
a059edf
fix
mtojek May 18, 2023
c004c04
fix
mtojek May 18, 2023
9b0e31a
Address PR comments
mtojek May 19, 2023
7d4ccce
x11HostnameError
mtojek May 19, 2023
90b351d
Remove callbacks
mtojek May 19, 2023
1773c24
failedConnectionsTotal
mtojek May 19, 2023
5ac27b7
connectionsTotal
mtojek May 19, 2023
6eb1a95
sftpConnectionsTotal
mtojek May 19, 2023
e3d7493
sessionError
mtojek May 19, 2023
9620452
sftpServerErrors
mtojek May 19, 2023
f05466c
remove handlerError
mtojek May 19, 2023
5887ee8
WIP
mtojek May 19, 2023
bb3602b
WIP
mtojek May 22, 2023
27fc9a0
WIP
mtojek May 22, 2023
3f4696b
Finish impl
mtojek May 23, 2023
8cd07f2
Aggregator: labels
mtojek May 23, 2023
a51cde9
Merge branch 'main' into 6724-ssh-metrics
mtojek May 23, 2023
389dd9f
TestAgent_Metrics_SSH
mtojek May 23, 2023
8e10d6d
Address PR comments
mtojek May 24, 2023
1858dc2
use labelIndex
mtojek May 24, 2023
8dde9f9
Merge branch 'main' into 6724-ssh-metrics
mtojek May 24, 2023
db725a3
Merge branch 'main' into 6724-ssh-metrics
mtojek May 25, 2023
f416287
PR comments part 1
mtojek May 25, 2023
4daf37d
PR comments part 2
mtojek May 25, 2023
d9203b8
PR comments part 3
mtojek May 25, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
WIP
  • Loading branch information
mtojek committed May 19, 2023
commit 5887ee875d35b18e458df7e67e456e1522f70817
17 changes: 8 additions & 9 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -1019,7 +1019,6 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
logger.Debug(ctx, "session error after agent close", slog.Error(err))
} else {
logger.Error(ctx, "session error", slog.Error(err))
a.metrics.handlerError.Add(1)
}
}
logger.Debug(ctx, "session closed")
Expand All @@ -1039,7 +1038,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
// Empty command will default to the users shell!
cmd, err := a.sshServer.CreateCommand(ctx, msg.Command, nil)
if err != nil {
a.metrics.createCommandError.Add(1)
a.metrics.reconnectingPTYErrors.WithLabelValues("create_command").Add(1)
return xerrors.Errorf("create command: %w", err)
}
cmd.Env = append(cmd.Env, "TERM=xterm-256color")
Expand All @@ -1052,7 +1051,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m

ptty, process, err := pty.Start(cmd)
if err != nil {
a.metrics.cmdStartError.Add(1)
a.metrics.reconnectingPTYErrors.WithLabelValues("start_command").Add(1)
return xerrors.Errorf("start command: %w", err)
}

Expand Down Expand Up @@ -1083,7 +1082,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
logger.Debug(ctx, "unable to read pty output, command exited?", slog.Error(err))
} else {
logger.Warn(ctx, "unable to read pty output, command exited?", slog.Error(err))
a.metrics.outputReaderError.Add(1)
a.metrics.reconnectingPTYErrors.WithLabelValues("output_reader").Add(1)
}
break
}
Expand All @@ -1104,7 +1103,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
slog.F("other_conn_id", cid),
slog.Error(err),
)
a.metrics.writeError.Add(1)
a.metrics.reconnectingPTYErrors.WithLabelValues("write").Add(1)
}
}
rpty.activeConnsMutex.Unlock()
Expand All @@ -1124,7 +1123,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
if err != nil {
// We can continue after this, it's not fatal!
logger.Error(ctx, "resize", slog.Error(err))
a.metrics.resizeError.Add(1)
a.metrics.reconnectingPTYErrors.WithLabelValues("resize").Add(1)
}
// Write any previously stored data for the TTY.
rpty.circularBufferMutex.RLock()
Expand All @@ -1137,7 +1136,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
// while also holding circularBufferMutex seems dangerous.
_, err = conn.Write(prevBuf)
if err != nil {
a.metrics.writeError.Add(1)
a.metrics.reconnectingPTYErrors.WithLabelValues("write").Add(1)
return xerrors.Errorf("write buffer to conn: %w", err)
}
// Multiple connections to the same TTY are permitted.
Expand Down Expand Up @@ -1188,7 +1187,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
_, err = rpty.ptty.InputWriter().Write([]byte(req.Data))
if err != nil {
logger.Warn(ctx, "write to pty", slog.Error(err))
a.metrics.inputWriterError.Add(1)
a.metrics.reconnectingPTYErrors.WithLabelValues("input_writer").Add(1)
return nil
}
// Check if a resize needs to happen!
Expand All @@ -1199,7 +1198,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
if err != nil {
// We can continue after this, it's not fatal!
logger.Error(ctx, "resize", slog.Error(err))
a.metrics.resizeError.Add(1)
a.metrics.reconnectingPTYErrors.WithLabelValues("resize").Add(1)
}
}
}
Expand Down
43 changes: 22 additions & 21 deletions agent/agentssh/agentssh.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,8 +201,7 @@ func (s *Server) sessionHandler(session ssh.Session) {
return
}

m := metricsForSession(s.metrics.sessions, magicType(session))
err := s.sessionStart(session, m, extraEnv)
err := s.sessionStart(session, extraEnv)
var exitError *exec.ExitError
if xerrors.As(err, &exitError) {
s.logger.Warn(ctx, "ssh session returned", slog.Error(exitError))
Expand All @@ -229,7 +228,7 @@ func magicType(session ssh.Session) string {
return ""
}

func (s *Server) sessionStart(session ssh.Session, m sessionMetricsObject, extraEnv []string) (retErr error) {
func (s *Server) sessionStart(session ssh.Session, extraEnv []string) (retErr error) {
ctx := session.Context()
env := append(session.Environ(), extraEnv...)
var magicType string
Expand All @@ -254,16 +253,18 @@ func (s *Server) sessionStart(session ssh.Session, m sessionMetricsObject, extra
s.logger.Warn(ctx, "invalid magic ssh session type specified", slog.F("type", magicType))
}

magicTypeLabel := magicTypeMetricLabel(magicType)

cmd, err := s.CreateCommand(ctx, session.RawCommand(), env)
if err != nil {
m.agentCreateCommandError.Add(1)
s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "create_command").Add(1)
return err
}

if ssh.AgentRequested(session) {
l, err := ssh.NewAgentListener()
if err != nil {
m.agentListenerError.Add(1)
s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "listener").Add(1)
return xerrors.Errorf("new agent listener: %w", err)
}
defer l.Close()
Expand All @@ -273,33 +274,33 @@ func (s *Server) sessionStart(session ssh.Session, m sessionMetricsObject, extra

sshPty, windowSize, isPty := session.Pty()
if isPty {
return s.startPTYSession(session, m, cmd, sshPty, windowSize)
return s.startPTYSession(session, magicTypeLabel, cmd, sshPty, windowSize)
}
return startNonPTYSession(session, m, cmd.AsExec())
return s.startNonPTYSession(session, magicTypeLabel, cmd.AsExec())
}

func startNonPTYSession(session ssh.Session, m sessionMetricsObject, cmd *exec.Cmd) error {
m.startNonPTYSession.Add(1)
func (s *Server) startNonPTYSession(session ssh.Session, magicTypeLabel string, cmd *exec.Cmd) error {
s.metrics.sessionsTotal.WithLabelValues(magicTypeLabel, "no").Add(1)

cmd.Stdout = session
cmd.Stderr = session.Stderr()
// This blocks forever until stdin is received if we don't
// use StdinPipe. It's unknown what causes this.
stdinPipe, err := cmd.StdinPipe()
if err != nil {
m.nonPTYStdinPipeError.Add(1)
s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "no", "stdin_pipe").Add(1)
return xerrors.Errorf("create stdin pipe: %w", err)
}
go func() {
_, err := io.Copy(stdinPipe, session)
if err != nil {
m.nonPTYStdinIoCopyError.Add(1)
s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "no", "stdin_io_copy").Add(1)
}
_ = stdinPipe.Close()
}()
err = cmd.Start()
if err != nil {
m.nonPTYCmdStartError.Add(1)
s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "no", "start_command").Add(1)
return xerrors.Errorf("start: %w", err)
}
return cmd.Wait()
Expand All @@ -314,8 +315,8 @@ type ptySession interface {
RawCommand() string
}

func (s *Server) startPTYSession(session ptySession, m sessionMetricsObject, cmd *pty.Cmd, sshPty ssh.Pty, windowSize <-chan ssh.Window) (retErr error) {
m.startPTYSession.Add(1)
func (s *Server) startPTYSession(session ptySession, magicTypeLabel string, cmd *pty.Cmd, sshPty ssh.Pty, windowSize <-chan ssh.Window) (retErr error) {
s.metrics.sessionsTotal.WithLabelValues(magicTypeLabel, "yes").Add(1)

ctx := session.Context()
// Disable minimal PTY emulation set by gliderlabs/ssh (NL-to-CRNL).
Expand All @@ -328,7 +329,7 @@ func (s *Server) startPTYSession(session ptySession, m sessionMetricsObject, cmd
err := showMOTD(session, manifest.MOTDFile)
if err != nil {
s.logger.Error(ctx, "show MOTD", slog.Error(err))
m.ptyMotdError.Add(1)
s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "yes", "motd").Add(1)
}
} else {
s.logger.Warn(ctx, "metadata lookup failed, unable to show MOTD")
Expand All @@ -343,14 +344,14 @@ func (s *Server) startPTYSession(session ptySession, m sessionMetricsObject, cmd
pty.WithLogger(slog.Stdlib(ctx, s.logger, slog.LevelInfo)),
))
if err != nil {
m.ptyCmdStartError.Add(1)
s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "yes", "start_command").Add(1)
return xerrors.Errorf("start command: %w", err)
}
defer func() {
closeErr := ptty.Close()
if closeErr != nil {
s.logger.Warn(ctx, "failed to close tty", slog.Error(closeErr))
m.ptyCloseError.Add(1)
s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "yes", "close").Add(1)
if retErr == nil {
retErr = closeErr
}
Expand All @@ -362,15 +363,15 @@ func (s *Server) startPTYSession(session ptySession, m sessionMetricsObject, cmd
// If the pty is closed, then command has exited, no need to log.
if resizeErr != nil && !errors.Is(resizeErr, pty.ErrClosed) {
s.logger.Warn(ctx, "failed to resize tty", slog.Error(resizeErr))
m.ptyResizeError.Add(1)
s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "yes", "resize").Add(1)
}
}
}()

go func() {
_, err := io.Copy(ptty.InputWriter(), session)
if err != nil {
m.ptyInputIoCopyError.Add(1)
s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "yes", "input_io_copy").Add(1)
}
}()

Expand All @@ -385,7 +386,7 @@ func (s *Server) startPTYSession(session ptySession, m sessionMetricsObject, cmd
n, err := io.Copy(session, ptty.OutputReader())
s.logger.Debug(ctx, "copy output done", slog.F("bytes", n), slog.Error(err))
if err != nil {
m.ptyOutputIoCopyError.Add(1)
s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "yes", "output_io_copy").Add(1)
return xerrors.Errorf("copy error: %w", err)
}
// We've gotten all the output, but we need to wait for the process to
Expand All @@ -397,7 +398,7 @@ func (s *Server) startPTYSession(session ptySession, m sessionMetricsObject, cmd
// and not something to be concerned about. But, if it's something else, we should log it.
if err != nil && !xerrors.As(err, &exitErr) {
s.logger.Warn(ctx, "wait error", slog.Error(err))
m.ptyWaitError.Add(1)
s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "yes", "wait").Add(1)
}
if err != nil {
return xerrors.Errorf("process wait: %w", err)
Expand Down
Loading