From 092e1d0b7819e3d61ad655fe21fe181f52a92eff Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 17 May 2023 14:58:35 +0200 Subject: [PATCH 01/38] feat: Collect agent SSH metrics --- agent/agentssh/agentssh.go | 34 ++++++++++++++++++++++++++++--- agent/agentssh/metrics.go | 41 ++++++++++++++++++++++++++++++++++++++ agent/agentssh/x11.go | 4 ++++ 3 files changed, 76 insertions(+), 3 deletions(-) create mode 100644 agent/agentssh/metrics.go diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index 6221751ae8c82..bb44257afd43d 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -106,7 +106,8 @@ func NewServer(ctx context.Context, logger slog.Logger, fs afero.Fs, maxTimeout "session": ssh.DefaultSessionHandler, }, ConnectionFailedCallback: func(_ net.Conn, err error) { - s.logger.Info(ctx, "ssh connection ended", slog.Error(err)) + s.logger.Warn(ctx, "ssh connection failed", slog.Error(err)) + metricConnectionFailedCallback.Add(1) }, Handler: s.sessionHandler, HostSigners: []ssh.Signer{randomSigner}, @@ -115,9 +116,11 @@ func NewServer(ctx context.Context, logger slog.Logger, fs afero.Fs, maxTimeout s.logger.Debug(ctx, "local port forward", slog.F("destination-host", destinationHost), slog.F("destination-port", destinationPort)) + metricLocalPortForwardingCallback.Add(1) return true }, PtyCallback: func(ctx ssh.Context, pty ssh.Pty) bool { + metricPtyCallback.Add(1) return true }, ReversePortForwardingCallback: func(ctx ssh.Context, bindHost string, bindPort uint32) bool { @@ -125,6 +128,7 @@ func NewServer(ctx context.Context, logger slog.Logger, fs afero.Fs, maxTimeout s.logger.Debug(ctx, "local port forward", slog.F("bind-host", bindHost), slog.F("bind-port", bindPort)) + metricReversePortForwardingCallback.Add(1) return true }, RequestHandlers: map[string]ssh.RequestHandler{ @@ -205,6 +209,7 @@ func (s *Server) sessionHandler(session ssh.Session) { s.logger.Warn(ctx, "ssh session failed", slog.Error(err)) // This exit code is designed to be unlikely to be confused for a legit exit code // from the process. + metricsSessionError.Add(1) _ = session.Exit(MagicSessionErrorCode) return } @@ -238,12 +243,14 @@ func (s *Server) sessionStart(session ssh.Session, extraEnv []string) (retErr er cmd, err := s.CreateCommand(ctx, session.RawCommand(), env) if err != nil { + metricsAgentCreateCommandError.Add(1) return err } if ssh.AgentRequested(session) { l, err := ssh.NewAgentListener() if err != nil { + metricsAgentListenerError.Add(1) return xerrors.Errorf("new agent listener: %w", err) } defer l.Close() @@ -259,20 +266,27 @@ func (s *Server) sessionStart(session ssh.Session, extraEnv []string) (retErr er } func startNonPTYSession(session ssh.Session, cmd *exec.Cmd) error { + metricsStartNonPTYSession.Add(1) + cmd.Stdout = session cmd.Stderr = session.Stderr() // This blocks forever until stdin is received if we don't // use StdinPipe. It's unknown what causes this. stdinPipe, err := cmd.StdinPipe() if err != nil { + metricsNonPTYStdinPipeError.Add(1) return xerrors.Errorf("create stdin pipe: %w", err) } go func() { - _, _ = io.Copy(stdinPipe, session) + _, err := io.Copy(stdinPipe, session) + if err != nil { + metricsNonPTYStdinIoCopyError.Add(1) + } _ = stdinPipe.Close() }() err = cmd.Start() if err != nil { + metricsNonPTYCmdStartError.Add(1) return xerrors.Errorf("start: %w", err) } return cmd.Wait() @@ -288,6 +302,8 @@ type ptySession interface { } func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pty, windowSize <-chan ssh.Window) (retErr error) { + metricsStartPTYSession.Add(1) + ctx := session.Context() // Disable minimal PTY emulation set by gliderlabs/ssh (NL-to-CRNL). // See https://github.com/coder/coder/issues/3371. @@ -299,6 +315,7 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt err := showMOTD(session, manifest.MOTDFile) if err != nil { s.logger.Error(ctx, "show MOTD", slog.Error(err)) + metricsPTYMotdError.Add(1) } } else { s.logger.Warn(ctx, "metadata lookup failed, unable to show MOTD") @@ -313,12 +330,14 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt pty.WithLogger(slog.Stdlib(ctx, s.logger, slog.LevelInfo)), )) if err != nil { + metricsPTYCmdStartError.Add(1) return xerrors.Errorf("start command: %w", err) } defer func() { closeErr := ptty.Close() if closeErr != nil { s.logger.Warn(ctx, "failed to close tty", slog.Error(closeErr)) + metricsPTYCloseError.Add(1) if retErr == nil { retErr = closeErr } @@ -330,12 +349,16 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt // If the pty is closed, then command has exited, no need to log. if resizeErr != nil && !errors.Is(resizeErr, pty.ErrClosed) { s.logger.Warn(ctx, "failed to resize tty", slog.Error(resizeErr)) + metricsPTYResizeError.Add(1) } } }() go func() { - _, _ = io.Copy(ptty.InputWriter(), session) + _, err := io.Copy(ptty.InputWriter(), session) + if err != nil { + metricsPTYInputIoCopyError.Add(1) + } }() // We need to wait for the command output to finish copying. It's safe to @@ -349,6 +372,7 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt n, err := io.Copy(session, ptty.OutputReader()) s.logger.Debug(ctx, "copy output done", slog.F("bytes", n), slog.Error(err)) if err != nil { + metricsPTYOutputIoCopyError.Add(1) return xerrors.Errorf("copy error: %w", err) } // We've gotten all the output, but we need to wait for the process to @@ -360,6 +384,7 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt // and not something to be concerned about. But, if it's something else, we should log it. if err != nil && !xerrors.As(err, &exitErr) { s.logger.Warn(ctx, "wait error", slog.Error(err)) + metricsPTYWaitError.Add(1) } if err != nil { return xerrors.Errorf("process wait: %w", err) @@ -368,6 +393,8 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt } func (s *Server) sftpHandler(session ssh.Session) { + metricSftpHandler.Add(1) + ctx := session.Context() // Typically sftp sessions don't request a TTY, but if they do, @@ -407,6 +434,7 @@ func (s *Server) sftpHandler(session ssh.Session) { return } s.logger.Warn(ctx, "sftp server closed with error", slog.Error(err)) + metricSftpServerError.Add(1) _ = session.Exit(1) } diff --git a/agent/agentssh/metrics.go b/agent/agentssh/metrics.go new file mode 100644 index 0000000000000..bbf77774a630f --- /dev/null +++ b/agent/agentssh/metrics.go @@ -0,0 +1,41 @@ +package agentssh + +import "tailscale.com/util/clientmetric" + +var ( + // SSH callbacks + metricConnectionFailedCallback = clientmetric.NewCounter("ssh_connection_failed_callback") + metricLocalPortForwardingCallback = clientmetric.NewCounter("ssh_local_port_forwarding_callback") + metricPtyCallback = clientmetric.NewCounter("ssh_pty_callback") + metricReversePortForwardingCallback = clientmetric.NewCounter("ssh_reverse_port_forwarding_callback") + metricX11Callback = clientmetric.NewCounter("ssh_x11_callback") + + // Agent sessions + metricsAgentCreateCommandError = clientmetric.NewCounter("ssh_agent_create_command_error") + metricsAgentListenerError = clientmetric.NewCounter("ssh_agent_listener_error") + metricsStartPTYSession = clientmetric.NewCounter("ssh_agent_start_pty_session") + metricsStartNonPTYSession = clientmetric.NewCounter("ssh_agent_start_non_pty_session") + metricsSessionError = clientmetric.NewCounter("ssh_session_error") + + // Non-PTY sessions + metricsNonPTYStdinPipeError = clientmetric.NewCounter("ssh_non_pty_stdin_pipe_error") + metricsNonPTYStdinIoCopyError = clientmetric.NewCounter("ssh_non_pty_stdin_io_copy_error") + metricsNonPTYCmdStartError = clientmetric.NewCounter("ssh_non_pty_cmd_start_error") + + // PTY sessions + metricsPTYMotdError = clientmetric.NewCounter("ssh_pty_motd_error") + metricsPTYCmdStartError = clientmetric.NewCounter("ssh_pty_cmd_start_error") + metricsPTYCloseError = clientmetric.NewCounter("ssh_pty_close_error") + metricsPTYResizeError = clientmetric.NewCounter("ssh_pty_resize_error") + metricsPTYInputIoCopyError = clientmetric.NewCounter("ssh_pty_input_io_copy_error") + metricsPTYOutputIoCopyError = clientmetric.NewCounter("ssh_pty_output_io_copy_error") + metricsPTYWaitError = clientmetric.NewCounter("ssh_pty_wait_error") + + // SFTP + metricSftpHandler = clientmetric.NewCounter("ssh_sftp_handler") + metricSftpServerError = clientmetric.NewCounter("ssh_sftp_server_error") + + // X11 + metricX11SocketDirError = clientmetric.NewCounter("ssh_x11_socket_dir_error") + metricX11XauthorityError = clientmetric.NewCounter("ssh_x11_xauthority_error") +) diff --git a/agent/agentssh/x11.go b/agent/agentssh/x11.go index b301326a0acb3..340ab6b1b0eab 100644 --- a/agent/agentssh/x11.go +++ b/agent/agentssh/x11.go @@ -24,6 +24,8 @@ import ( // x11Callback is called when the client requests X11 forwarding. // It adds an Xauthority entry to the Xauthority file. func (s *Server) x11Callback(ctx ssh.Context, x11 ssh.X11) bool { + metricX11Callback.Add(1) + hostname, err := os.Hostname() if err != nil { s.logger.Warn(ctx, "failed to get hostname", slog.Error(err)) @@ -33,12 +35,14 @@ func (s *Server) x11Callback(ctx ssh.Context, x11 ssh.X11) bool { err = s.fs.MkdirAll(s.x11SocketDir, 0o700) if err != nil { s.logger.Warn(ctx, "failed to make the x11 socket dir", slog.F("dir", s.x11SocketDir), slog.Error(err)) + metricX11SocketDirError.Add(1) return false } err = addXauthEntry(ctx, s.fs, hostname, strconv.Itoa(int(x11.ScreenNumber)), x11.AuthProtocol, x11.AuthCookie) if err != nil { s.logger.Warn(ctx, "failed to add Xauthority entry", slog.Error(err)) + metricX11XauthorityError.Add(1) return false } return true From 0c0df91dbc861e263ca99a691061b80ca3286fc1 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 17 May 2023 15:51:06 +0200 Subject: [PATCH 02/38] more metrics --- agent/agent.go | 18 ++++++++++++++++-- agent/agentssh/agentssh.go | 30 +++++++++++++++--------------- agent/agentssh/metrics.go | 30 +++++++++++++++--------------- agent/metrics.go | 13 +++++++++++++ 4 files changed, 59 insertions(+), 32 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index ea22069fd08fe..6e0b454aa4f12 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -979,6 +979,7 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, msg codersdk.WorkspaceAgentReconnectingPTYInit, conn net.Conn) (retErr error) { defer conn.Close() + metricReconnectingPTYHandler.Add(1) a.connCountReconnectingPTY.Add(1) defer a.connCountReconnectingPTY.Add(-1) @@ -999,6 +1000,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m logger.Debug(ctx, "session error after agent close", slog.Error(err)) } else { logger.Error(ctx, "session error", slog.Error(err)) + metricReconnectingPTYError.Add(1) } } logger.Debug(ctx, "session closed") @@ -1018,6 +1020,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m // Empty command will default to the users shell! cmd, err := a.sshServer.CreateCommand(ctx, msg.Command, nil) if err != nil { + metricReconnectingPTYCreateCommandError.Add(1) return xerrors.Errorf("create command: %w", err) } cmd.Env = append(cmd.Env, "TERM=xterm-256color") @@ -1030,6 +1033,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m ptty, process, err := pty.Start(cmd) if err != nil { + metricReconnectingPTYCmdStartError.Add(1) return xerrors.Errorf("start command: %w", err) } @@ -1056,7 +1060,12 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m if err != nil { // When the PTY is closed, this is triggered. // Error is typically a benign EOF, so only log for debugging. - logger.Debug(ctx, "unable to read pty output, command exited?", slog.Error(err)) + if errors.Is(err, io.EOF) { + logger.Debug(ctx, "unable to read pty output, command exited?", slog.Error(err)) + } else { + logger.Warn(ctx, "unable to read pty output, command exited?", slog.Error(err)) + metricReconnectingPTYOutputReaderError.Add(1) + } break } part := buffer[:read] @@ -1071,11 +1080,12 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m for cid, conn := range rpty.activeConns { _, err = conn.Write(part) if err != nil { - logger.Debug(ctx, + logger.Warn(ctx, "error writing to active conn", slog.F("other_conn_id", cid), slog.Error(err), ) + metricReconnectingPTYWriteError.Add(1) } } rpty.activeConnsMutex.Unlock() @@ -1095,6 +1105,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m if err != nil { // We can continue after this, it's not fatal! logger.Error(ctx, "resize", slog.Error(err)) + metricReconnectingPTYResizeError.Add(1) } // Write any previously stored data for the TTY. rpty.circularBufferMutex.RLock() @@ -1107,6 +1118,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m // while also holding circularBufferMutex seems dangerous. _, err = conn.Write(prevBuf) if err != nil { + metricReconnectingPTYWriteError.Add(1) return xerrors.Errorf("write buffer to conn: %w", err) } // Multiple connections to the same TTY are permitted. @@ -1157,6 +1169,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m _, err = rpty.ptty.InputWriter().Write([]byte(req.Data)) if err != nil { logger.Warn(ctx, "write to pty", slog.Error(err)) + metricReconnectingPTYInputWriterError.Add(1) return nil } // Check if a resize needs to happen! @@ -1167,6 +1180,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m if err != nil { // We can continue after this, it's not fatal! logger.Error(ctx, "resize", slog.Error(err)) + metricReconnectingPTYResizeError.Add(1) } } } diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index bb44257afd43d..5f0d1cbce3b17 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -209,7 +209,7 @@ func (s *Server) sessionHandler(session ssh.Session) { s.logger.Warn(ctx, "ssh session failed", slog.Error(err)) // This exit code is designed to be unlikely to be confused for a legit exit code // from the process. - metricsSessionError.Add(1) + metricSessionError.Add(1) _ = session.Exit(MagicSessionErrorCode) return } @@ -243,14 +243,14 @@ func (s *Server) sessionStart(session ssh.Session, extraEnv []string) (retErr er cmd, err := s.CreateCommand(ctx, session.RawCommand(), env) if err != nil { - metricsAgentCreateCommandError.Add(1) + metricAgentCreateCommandError.Add(1) return err } if ssh.AgentRequested(session) { l, err := ssh.NewAgentListener() if err != nil { - metricsAgentListenerError.Add(1) + metricAgentListenerError.Add(1) return xerrors.Errorf("new agent listener: %w", err) } defer l.Close() @@ -266,7 +266,7 @@ func (s *Server) sessionStart(session ssh.Session, extraEnv []string) (retErr er } func startNonPTYSession(session ssh.Session, cmd *exec.Cmd) error { - metricsStartNonPTYSession.Add(1) + metricStartNonPTYSession.Add(1) cmd.Stdout = session cmd.Stderr = session.Stderr() @@ -274,19 +274,19 @@ func startNonPTYSession(session ssh.Session, cmd *exec.Cmd) error { // use StdinPipe. It's unknown what causes this. stdinPipe, err := cmd.StdinPipe() if err != nil { - metricsNonPTYStdinPipeError.Add(1) + metricNonPTYStdinPipeError.Add(1) return xerrors.Errorf("create stdin pipe: %w", err) } go func() { _, err := io.Copy(stdinPipe, session) if err != nil { - metricsNonPTYStdinIoCopyError.Add(1) + metricNonPTYStdinIoCopyError.Add(1) } _ = stdinPipe.Close() }() err = cmd.Start() if err != nil { - metricsNonPTYCmdStartError.Add(1) + metricNonPTYCmdStartError.Add(1) return xerrors.Errorf("start: %w", err) } return cmd.Wait() @@ -302,7 +302,7 @@ type ptySession interface { } func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pty, windowSize <-chan ssh.Window) (retErr error) { - metricsStartPTYSession.Add(1) + metricStartPTYSession.Add(1) ctx := session.Context() // Disable minimal PTY emulation set by gliderlabs/ssh (NL-to-CRNL). @@ -315,7 +315,7 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt err := showMOTD(session, manifest.MOTDFile) if err != nil { s.logger.Error(ctx, "show MOTD", slog.Error(err)) - metricsPTYMotdError.Add(1) + metricPTYMotdError.Add(1) } } else { s.logger.Warn(ctx, "metadata lookup failed, unable to show MOTD") @@ -330,14 +330,14 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt pty.WithLogger(slog.Stdlib(ctx, s.logger, slog.LevelInfo)), )) if err != nil { - metricsPTYCmdStartError.Add(1) + metricPTYCmdStartError.Add(1) return xerrors.Errorf("start command: %w", err) } defer func() { closeErr := ptty.Close() if closeErr != nil { s.logger.Warn(ctx, "failed to close tty", slog.Error(closeErr)) - metricsPTYCloseError.Add(1) + metricPTYCloseError.Add(1) if retErr == nil { retErr = closeErr } @@ -349,7 +349,7 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt // If the pty is closed, then command has exited, no need to log. if resizeErr != nil && !errors.Is(resizeErr, pty.ErrClosed) { s.logger.Warn(ctx, "failed to resize tty", slog.Error(resizeErr)) - metricsPTYResizeError.Add(1) + metricPTYResizeError.Add(1) } } }() @@ -357,7 +357,7 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt go func() { _, err := io.Copy(ptty.InputWriter(), session) if err != nil { - metricsPTYInputIoCopyError.Add(1) + metricPTYInputIoCopyError.Add(1) } }() @@ -372,7 +372,7 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt n, err := io.Copy(session, ptty.OutputReader()) s.logger.Debug(ctx, "copy output done", slog.F("bytes", n), slog.Error(err)) if err != nil { - metricsPTYOutputIoCopyError.Add(1) + metricPTYOutputIoCopyError.Add(1) return xerrors.Errorf("copy error: %w", err) } // We've gotten all the output, but we need to wait for the process to @@ -384,7 +384,7 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt // and not something to be concerned about. But, if it's something else, we should log it. if err != nil && !xerrors.As(err, &exitErr) { s.logger.Warn(ctx, "wait error", slog.Error(err)) - metricsPTYWaitError.Add(1) + metricPTYWaitError.Add(1) } if err != nil { return xerrors.Errorf("process wait: %w", err) diff --git a/agent/agentssh/metrics.go b/agent/agentssh/metrics.go index bbf77774a630f..87ccc459e41bb 100644 --- a/agent/agentssh/metrics.go +++ b/agent/agentssh/metrics.go @@ -11,25 +11,25 @@ var ( metricX11Callback = clientmetric.NewCounter("ssh_x11_callback") // Agent sessions - metricsAgentCreateCommandError = clientmetric.NewCounter("ssh_agent_create_command_error") - metricsAgentListenerError = clientmetric.NewCounter("ssh_agent_listener_error") - metricsStartPTYSession = clientmetric.NewCounter("ssh_agent_start_pty_session") - metricsStartNonPTYSession = clientmetric.NewCounter("ssh_agent_start_non_pty_session") - metricsSessionError = clientmetric.NewCounter("ssh_session_error") + metricAgentCreateCommandError = clientmetric.NewCounter("ssh_agent_create_command_error") + metricAgentListenerError = clientmetric.NewCounter("ssh_agent_listener_error") + metricStartPTYSession = clientmetric.NewCounter("ssh_agent_start_pty_session") + metricStartNonPTYSession = clientmetric.NewCounter("ssh_agent_start_non_pty_session") + metricSessionError = clientmetric.NewCounter("ssh_session_error") // Non-PTY sessions - metricsNonPTYStdinPipeError = clientmetric.NewCounter("ssh_non_pty_stdin_pipe_error") - metricsNonPTYStdinIoCopyError = clientmetric.NewCounter("ssh_non_pty_stdin_io_copy_error") - metricsNonPTYCmdStartError = clientmetric.NewCounter("ssh_non_pty_cmd_start_error") + metricNonPTYStdinPipeError = clientmetric.NewCounter("ssh_non_pty_stdin_pipe_error") + metricNonPTYStdinIoCopyError = clientmetric.NewCounter("ssh_non_pty_stdin_io_copy_error") + metricNonPTYCmdStartError = clientmetric.NewCounter("ssh_non_pty_cmd_start_error") // PTY sessions - metricsPTYMotdError = clientmetric.NewCounter("ssh_pty_motd_error") - metricsPTYCmdStartError = clientmetric.NewCounter("ssh_pty_cmd_start_error") - metricsPTYCloseError = clientmetric.NewCounter("ssh_pty_close_error") - metricsPTYResizeError = clientmetric.NewCounter("ssh_pty_resize_error") - metricsPTYInputIoCopyError = clientmetric.NewCounter("ssh_pty_input_io_copy_error") - metricsPTYOutputIoCopyError = clientmetric.NewCounter("ssh_pty_output_io_copy_error") - metricsPTYWaitError = clientmetric.NewCounter("ssh_pty_wait_error") + metricPTYMotdError = clientmetric.NewCounter("ssh_pty_motd_error") + metricPTYCmdStartError = clientmetric.NewCounter("ssh_pty_cmd_start_error") + metricPTYCloseError = clientmetric.NewCounter("ssh_pty_close_error") + metricPTYResizeError = clientmetric.NewCounter("ssh_pty_resize_error") + metricPTYInputIoCopyError = clientmetric.NewCounter("ssh_pty_input_io_copy_error") + metricPTYOutputIoCopyError = clientmetric.NewCounter("ssh_pty_output_io_copy_error") + metricPTYWaitError = clientmetric.NewCounter("ssh_pty_wait_error") // SFTP metricSftpHandler = clientmetric.NewCounter("ssh_sftp_handler") diff --git a/agent/metrics.go b/agent/metrics.go index fd195202c0086..397f3d706989c 100644 --- a/agent/metrics.go +++ b/agent/metrics.go @@ -9,6 +9,19 @@ import ( "github.com/coder/coder/codersdk/agentsdk" ) +var ( + // ReconnectingPTY + metricReconnectingPTYHandler = clientmetric.NewCounter("ssh_reconnecting_pty_handler") + metricReconnectingPTYError = clientmetric.NewCounter("ssh_reconnecting_pty_error") + metricReconnectingPTYCreateCommandError = clientmetric.NewCounter("ssh_reconnecting_pty_create_command_error") + metricReconnectingPTYCmdStartError = clientmetric.NewCounter("ssh_reconnecting_pty_cmd_start_error") + metricReconnectingPTYReadError = clientmetric.NewCounter("ssh_reconnecting_pty_read_error") + metricReconnectingPTYWriteError = clientmetric.NewCounter("ssh_reconnecting_pty_write_error") + metricReconnectingPTYResizeError = clientmetric.NewCounter("ssh_reconnecting_pty_resize_error") + metricReconnectingPTYOutputReaderError = clientmetric.NewCounter("ssh_reconnecting_pty_output_reader_error") + metricReconnectingPTYInputWriterError = clientmetric.NewCounter("ssh_reconnecting_pty_input_writer_error") +) + func collectMetrics() []agentsdk.AgentMetric { // Tailscale metrics metrics := clientmetric.Metrics() From 84b59ea2dbd62b3297204ba7a09bd84f4b29558f Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 17 May 2023 15:58:52 +0200 Subject: [PATCH 03/38] err --- agent/metrics.go | 1 - 1 file changed, 1 deletion(-) diff --git a/agent/metrics.go b/agent/metrics.go index 397f3d706989c..9d16e4498c1c3 100644 --- a/agent/metrics.go +++ b/agent/metrics.go @@ -15,7 +15,6 @@ var ( metricReconnectingPTYError = clientmetric.NewCounter("ssh_reconnecting_pty_error") metricReconnectingPTYCreateCommandError = clientmetric.NewCounter("ssh_reconnecting_pty_create_command_error") metricReconnectingPTYCmdStartError = clientmetric.NewCounter("ssh_reconnecting_pty_cmd_start_error") - metricReconnectingPTYReadError = clientmetric.NewCounter("ssh_reconnecting_pty_read_error") metricReconnectingPTYWriteError = clientmetric.NewCounter("ssh_reconnecting_pty_write_error") metricReconnectingPTYResizeError = clientmetric.NewCounter("ssh_reconnecting_pty_resize_error") metricReconnectingPTYOutputReaderError = clientmetric.NewCounter("ssh_reconnecting_pty_output_reader_error") From 000586a79724b96e96004ae82d8b77f5f6cc4d0f Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 18 May 2023 12:52:35 +0200 Subject: [PATCH 04/38] session metrics --- agent/agentssh/agentssh.go | 37 +++++----- agent/agentssh/agentssh_internal_test.go | 3 +- agent/agentssh/metrics.go | 86 ++++++++++++++++++------ 3 files changed, 88 insertions(+), 38 deletions(-) diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index 5f0d1cbce3b17..a95bb8ed6f981 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -241,16 +241,17 @@ func (s *Server) sessionStart(session ssh.Session, extraEnv []string) (retErr er s.logger.Warn(ctx, "invalid magic ssh session type specified", slog.F("type", magicType)) } + m := metricsForSession(magicType) cmd, err := s.CreateCommand(ctx, session.RawCommand(), env) if err != nil { - metricAgentCreateCommandError.Add(1) + m.agentCreateCommandError.Add(1) return err } if ssh.AgentRequested(session) { l, err := ssh.NewAgentListener() if err != nil { - metricAgentListenerError.Add(1) + m.agentListenerError.Add(1) return xerrors.Errorf("new agent listener: %w", err) } defer l.Close() @@ -260,13 +261,13 @@ func (s *Server) sessionStart(session ssh.Session, extraEnv []string) (retErr er sshPty, windowSize, isPty := session.Pty() if isPty { - return s.startPTYSession(session, cmd, sshPty, windowSize) + return s.startPTYSession(session, m, cmd, sshPty, windowSize) } - return startNonPTYSession(session, cmd.AsExec()) + return startNonPTYSession(session, m, cmd.AsExec()) } -func startNonPTYSession(session ssh.Session, cmd *exec.Cmd) error { - metricStartNonPTYSession.Add(1) +func startNonPTYSession(session ssh.Session, m sessionMetricsObject, cmd *exec.Cmd) error { + m.startNonPTYSession.Add(1) cmd.Stdout = session cmd.Stderr = session.Stderr() @@ -274,19 +275,19 @@ func startNonPTYSession(session ssh.Session, cmd *exec.Cmd) error { // use StdinPipe. It's unknown what causes this. stdinPipe, err := cmd.StdinPipe() if err != nil { - metricNonPTYStdinPipeError.Add(1) + m.nonPTYStdinPipeError.Add(1) return xerrors.Errorf("create stdin pipe: %w", err) } go func() { _, err := io.Copy(stdinPipe, session) if err != nil { - metricNonPTYStdinIoCopyError.Add(1) + m.nonPTYStdinIoCopyError.Add(1) } _ = stdinPipe.Close() }() err = cmd.Start() if err != nil { - metricNonPTYCmdStartError.Add(1) + m.nonPTYCmdStartError.Add(1) return xerrors.Errorf("start: %w", err) } return cmd.Wait() @@ -301,8 +302,8 @@ type ptySession interface { RawCommand() string } -func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pty, windowSize <-chan ssh.Window) (retErr error) { - metricStartPTYSession.Add(1) +func (s *Server) startPTYSession(session ptySession, m sessionMetricsObject, cmd *pty.Cmd, sshPty ssh.Pty, windowSize <-chan ssh.Window) (retErr error) { + m.startPTYSession.Add(1) ctx := session.Context() // Disable minimal PTY emulation set by gliderlabs/ssh (NL-to-CRNL). @@ -315,7 +316,7 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt err := showMOTD(session, manifest.MOTDFile) if err != nil { s.logger.Error(ctx, "show MOTD", slog.Error(err)) - metricPTYMotdError.Add(1) + m.ptyMotdError.Add(1) } } else { s.logger.Warn(ctx, "metadata lookup failed, unable to show MOTD") @@ -330,14 +331,14 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt pty.WithLogger(slog.Stdlib(ctx, s.logger, slog.LevelInfo)), )) if err != nil { - metricPTYCmdStartError.Add(1) + m.ptyCmdStartError.Add(1) return xerrors.Errorf("start command: %w", err) } defer func() { closeErr := ptty.Close() if closeErr != nil { s.logger.Warn(ctx, "failed to close tty", slog.Error(closeErr)) - metricPTYCloseError.Add(1) + m.ptyCloseError.Add(1) if retErr == nil { retErr = closeErr } @@ -349,7 +350,7 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt // If the pty is closed, then command has exited, no need to log. if resizeErr != nil && !errors.Is(resizeErr, pty.ErrClosed) { s.logger.Warn(ctx, "failed to resize tty", slog.Error(resizeErr)) - metricPTYResizeError.Add(1) + m.ptyResizeError.Add(1) } } }() @@ -357,7 +358,7 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt go func() { _, err := io.Copy(ptty.InputWriter(), session) if err != nil { - metricPTYInputIoCopyError.Add(1) + m.ptyInputIoCopyError.Add(1) } }() @@ -372,7 +373,7 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt n, err := io.Copy(session, ptty.OutputReader()) s.logger.Debug(ctx, "copy output done", slog.F("bytes", n), slog.Error(err)) if err != nil { - metricPTYOutputIoCopyError.Add(1) + m.ptyOutputIoCopyError.Add(1) return xerrors.Errorf("copy error: %w", err) } // We've gotten all the output, but we need to wait for the process to @@ -384,7 +385,7 @@ func (s *Server) startPTYSession(session ptySession, cmd *pty.Cmd, sshPty ssh.Pt // and not something to be concerned about. But, if it's something else, we should log it. if err != nil && !xerrors.As(err, &exitErr) { s.logger.Warn(ctx, "wait error", slog.Error(err)) - metricPTYWaitError.Add(1) + m.ptyWaitError.Add(1) } if err != nil { return xerrors.Errorf("process wait: %w", err) diff --git a/agent/agentssh/agentssh_internal_test.go b/agent/agentssh/agentssh_internal_test.go index ed05e53a04ba7..983ba436b7370 100644 --- a/agent/agentssh/agentssh_internal_test.go +++ b/agent/agentssh/agentssh_internal_test.go @@ -60,7 +60,8 @@ func Test_sessionStart_orphan(t *testing.T) { // we don't really care what the error is here. In the larger scenario, // the client has disconnected, so we can't return any error information // to them. - _ = s.startPTYSession(sess, cmd, ptyInfo, windowSize) + m := metricsForSession("ssh") + _ = s.startPTYSession(sess, m, cmd, ptyInfo, windowSize) }() readDone := make(chan struct{}) diff --git a/agent/agentssh/metrics.go b/agent/agentssh/metrics.go index 87ccc459e41bb..71e15ebba4d63 100644 --- a/agent/agentssh/metrics.go +++ b/agent/agentssh/metrics.go @@ -1,6 +1,10 @@ package agentssh -import "tailscale.com/util/clientmetric" +import ( + "fmt" + + "tailscale.com/util/clientmetric" +) var ( // SSH callbacks @@ -10,26 +14,9 @@ var ( metricReversePortForwardingCallback = clientmetric.NewCounter("ssh_reverse_port_forwarding_callback") metricX11Callback = clientmetric.NewCounter("ssh_x11_callback") - // Agent sessions - metricAgentCreateCommandError = clientmetric.NewCounter("ssh_agent_create_command_error") - metricAgentListenerError = clientmetric.NewCounter("ssh_agent_listener_error") - metricStartPTYSession = clientmetric.NewCounter("ssh_agent_start_pty_session") - metricStartNonPTYSession = clientmetric.NewCounter("ssh_agent_start_non_pty_session") - metricSessionError = clientmetric.NewCounter("ssh_session_error") - - // Non-PTY sessions - metricNonPTYStdinPipeError = clientmetric.NewCounter("ssh_non_pty_stdin_pipe_error") - metricNonPTYStdinIoCopyError = clientmetric.NewCounter("ssh_non_pty_stdin_io_copy_error") - metricNonPTYCmdStartError = clientmetric.NewCounter("ssh_non_pty_cmd_start_error") + metricSessionError = clientmetric.NewCounter("ssh_session_error") // PTY sessions - metricPTYMotdError = clientmetric.NewCounter("ssh_pty_motd_error") - metricPTYCmdStartError = clientmetric.NewCounter("ssh_pty_cmd_start_error") - metricPTYCloseError = clientmetric.NewCounter("ssh_pty_close_error") - metricPTYResizeError = clientmetric.NewCounter("ssh_pty_resize_error") - metricPTYInputIoCopyError = clientmetric.NewCounter("ssh_pty_input_io_copy_error") - metricPTYOutputIoCopyError = clientmetric.NewCounter("ssh_pty_output_io_copy_error") - metricPTYWaitError = clientmetric.NewCounter("ssh_pty_wait_error") // SFTP metricSftpHandler = clientmetric.NewCounter("ssh_sftp_handler") @@ -39,3 +26,64 @@ var ( metricX11SocketDirError = clientmetric.NewCounter("ssh_x11_socket_dir_error") metricX11XauthorityError = clientmetric.NewCounter("ssh_x11_xauthority_error") ) + +var ( + sessionMetrics = map[string]sessionMetricsObject{} +) + +type sessionMetricsObject struct { + // Agent sessions + agentCreateCommandError *clientmetric.Metric + agentListenerError *clientmetric.Metric + startPTYSession *clientmetric.Metric + startNonPTYSession *clientmetric.Metric + + // Non-PTY sessions + nonPTYStdinPipeError *clientmetric.Metric + nonPTYStdinIoCopyError *clientmetric.Metric + nonPTYCmdStartError *clientmetric.Metric + + // PTY sessions + ptyMotdError *clientmetric.Metric + ptyCmdStartError *clientmetric.Metric + ptyCloseError *clientmetric.Metric + ptyResizeError *clientmetric.Metric + ptyInputIoCopyError *clientmetric.Metric + ptyOutputIoCopyError *clientmetric.Metric + ptyWaitError *clientmetric.Metric +} + +func init() { + for _, magicType := range []string{MagicSessionTypeVSCode, MagicSessionTypeJetBrains, "ssh", "unknown"} { + sessionMetrics[magicType] = sessionMetricsObject{ + agentCreateCommandError: clientmetric.NewCounter(fmt.Sprintf("ssh_agent_%s_create_command_error", magicType)), + agentListenerError: clientmetric.NewCounter(fmt.Sprintf("ssh_agent_%s_listener_error", magicType)), + startPTYSession: clientmetric.NewCounter(fmt.Sprintf("ssh_agent_%s_start_pty_session", magicType)), + startNonPTYSession: clientmetric.NewCounter(fmt.Sprintf("ssh_agent_%s_start_non_pty_session", magicType)), + + nonPTYStdinPipeError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_non_pty_stdin_pipe_error", magicType)), + nonPTYStdinIoCopyError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_non_pty_stdin_io_copy_error", magicType)), + nonPTYCmdStartError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_non_pty_cmd_start_error", magicType)), + + ptyMotdError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_pty_motd_error", magicType)), + ptyCmdStartError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_pty_cmd_start_error", magicType)), + ptyCloseError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_pty_close_error", magicType)), + ptyResizeError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_pty_resize_error", magicType)), + ptyInputIoCopyError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_pty_input_io_copy_error", magicType)), + ptyOutputIoCopyError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_pty_output_io_copy_error", magicType)), + ptyWaitError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_pty_wait_error", magicType)), + } + } +} + +func metricsForSession(magicType string) sessionMetricsObject { + switch magicType { + case MagicSessionTypeVSCode: + case MagicSessionTypeJetBrains: + case "": + magicType = "ssh" + default: + magicType = "unknown" + } + return sessionMetrics[magicType] +} From 11fb0568ae71bcc0f922d132cb3f2cb547a3270e Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 18 May 2023 13:16:34 +0200 Subject: [PATCH 05/38] session error --- agent/agentssh/agentssh.go | 21 ++++++++++++++++----- agent/agentssh/metrics.go | 4 ++-- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index a95bb8ed6f981..da4663a6d72ea 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -198,10 +198,12 @@ func (s *Server) sessionHandler(session ssh.Session) { return } - err := s.sessionStart(session, extraEnv) + m := metricsForSession(magicType(session)) + err := s.sessionStart(session, m, extraEnv) var exitError *exec.ExitError if xerrors.As(err, &exitError) { - s.logger.Debug(ctx, "ssh session returned", slog.Error(exitError)) + s.logger.Warn(ctx, "ssh session returned", slog.Error(exitError)) + m.sessionError.Add(1) _ = session.Exit(exitError.ExitCode()) return } @@ -209,14 +211,24 @@ func (s *Server) sessionHandler(session ssh.Session) { s.logger.Warn(ctx, "ssh session failed", slog.Error(err)) // This exit code is designed to be unlikely to be confused for a legit exit code // from the process. - metricSessionError.Add(1) + m.sessionError.Add(1) _ = session.Exit(MagicSessionErrorCode) return } _ = session.Exit(0) } -func (s *Server) sessionStart(session ssh.Session, extraEnv []string) (retErr error) { +func magicType(session ssh.Session) string { + for _, kv := range session.Environ() { + if !strings.HasPrefix(kv, MagicSessionTypeEnvironmentVariable) { + continue + } + return strings.TrimPrefix(kv, MagicSessionTypeEnvironmentVariable+"=") + } + return "" +} + +func (s *Server) sessionStart(session ssh.Session, m sessionMetricsObject, extraEnv []string) (retErr error) { ctx := session.Context() env := append(session.Environ(), extraEnv...) var magicType string @@ -241,7 +253,6 @@ func (s *Server) sessionStart(session ssh.Session, extraEnv []string) (retErr er s.logger.Warn(ctx, "invalid magic ssh session type specified", slog.F("type", magicType)) } - m := metricsForSession(magicType) cmd, err := s.CreateCommand(ctx, session.RawCommand(), env) if err != nil { m.agentCreateCommandError.Add(1) diff --git a/agent/agentssh/metrics.go b/agent/agentssh/metrics.go index 71e15ebba4d63..4a0eed616c389 100644 --- a/agent/agentssh/metrics.go +++ b/agent/agentssh/metrics.go @@ -14,8 +14,6 @@ var ( metricReversePortForwardingCallback = clientmetric.NewCounter("ssh_reverse_port_forwarding_callback") metricX11Callback = clientmetric.NewCounter("ssh_x11_callback") - metricSessionError = clientmetric.NewCounter("ssh_session_error") - // PTY sessions // SFTP @@ -37,6 +35,7 @@ type sessionMetricsObject struct { agentListenerError *clientmetric.Metric startPTYSession *clientmetric.Metric startNonPTYSession *clientmetric.Metric + sessionError *clientmetric.Metric // Non-PTY sessions nonPTYStdinPipeError *clientmetric.Metric @@ -60,6 +59,7 @@ func init() { agentListenerError: clientmetric.NewCounter(fmt.Sprintf("ssh_agent_%s_listener_error", magicType)), startPTYSession: clientmetric.NewCounter(fmt.Sprintf("ssh_agent_%s_start_pty_session", magicType)), startNonPTYSession: clientmetric.NewCounter(fmt.Sprintf("ssh_agent_%s_start_non_pty_session", magicType)), + sessionError: clientmetric.NewCounter(fmt.Sprintf("ssh_agent_%s_session_error", magicType)), nonPTYStdinPipeError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_non_pty_stdin_pipe_error", magicType)), nonPTYStdinIoCopyError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_non_pty_stdin_io_copy_error", magicType)), From 07b2b0e91427d820514e6d337fd292a006710511 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 18 May 2023 13:18:13 +0200 Subject: [PATCH 06/38] fix --- agent/agentssh/metrics.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/agent/agentssh/metrics.go b/agent/agentssh/metrics.go index 4a0eed616c389..a761ef82a389d 100644 --- a/agent/agentssh/metrics.go +++ b/agent/agentssh/metrics.go @@ -14,8 +14,6 @@ var ( metricReversePortForwardingCallback = clientmetric.NewCounter("ssh_reverse_port_forwarding_callback") metricX11Callback = clientmetric.NewCounter("ssh_x11_callback") - // PTY sessions - // SFTP metricSftpHandler = clientmetric.NewCounter("ssh_sftp_handler") metricSftpServerError = clientmetric.NewCounter("ssh_sftp_server_error") From e38a7bef316a4d00de2339ada42b6c45ed3ce696 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 18 May 2023 13:28:47 +0200 Subject: [PATCH 07/38] fmt --- agent/agentssh/metrics.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/agent/agentssh/metrics.go b/agent/agentssh/metrics.go index a761ef82a389d..e116091a7a7c2 100644 --- a/agent/agentssh/metrics.go +++ b/agent/agentssh/metrics.go @@ -23,9 +23,7 @@ var ( metricX11XauthorityError = clientmetric.NewCounter("ssh_x11_xauthority_error") ) -var ( - sessionMetrics = map[string]sessionMetricsObject{} -) +var sessionMetrics = map[string]sessionMetricsObject{} type sessionMetricsObject struct { // Agent sessions From ba4bb4d0af683d25edba84f4a329dc0089a672f5 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 18 May 2023 16:55:48 +0200 Subject: [PATCH 08/38] WIP --- agent/agent.go | 43 +++++++++----- agent/agentssh/agentssh.go | 22 +++++--- agent/agentssh/metrics.go | 82 +++++++++++++++++++++++---- agent/metrics.go | 112 ++++++++++++++++++++++++++++++++----- cli/agent.go | 24 +++++++- 5 files changed, 234 insertions(+), 49 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 6e0b454aa4f12..f49858e7c6049 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -24,6 +24,7 @@ import ( "github.com/armon/circbuf" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" "github.com/spf13/afero" "go.uber.org/atomic" "golang.org/x/exp/slices" @@ -62,6 +63,8 @@ type Options struct { IgnorePorts map[int]string SSHMaxTimeout time.Duration TailnetListenPort uint16 + + PrometheusRegistry *prometheus.Registry } type Client interface { @@ -101,6 +104,10 @@ func New(options Options) Agent { return "", nil } } + if options.PrometheusRegistry == nil { + options.PrometheusRegistry = prometheus.NewRegistry() + } + ctx, cancelFunc := context.WithCancel(context.Background()) a := &agent{ tailnetListenPort: options.TailnetListenPort, @@ -119,6 +126,8 @@ func New(options Options) Agent { ignorePorts: options.IgnorePorts, connStatsChan: make(chan *agentsdk.Stats, 1), sshMaxTimeout: options.SSHMaxTimeout, + prometheusRegistry: options.PrometheusRegistry, + metrics: newAgentMetrics(options.PrometheusRegistry), } a.init(ctx) return a @@ -162,10 +171,13 @@ type agent struct { latestStat atomic.Pointer[agentsdk.Stats] connCountReconnectingPTY atomic.Int64 + + prometheusRegistry *prometheus.Registry + metrics *agentMetrics } func (a *agent) init(ctx context.Context) { - sshSrv, err := agentssh.NewServer(ctx, a.logger.Named("ssh-server"), a.filesystem, a.sshMaxTimeout, "") + sshSrv, err := agentssh.NewServer(ctx, a.logger.Named("ssh-server"), a.prometheusRegistry, a.filesystem, a.sshMaxTimeout, "") if err != nil { panic(err) } @@ -979,7 +991,7 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, msg codersdk.WorkspaceAgentReconnectingPTYInit, conn net.Conn) (retErr error) { defer conn.Close() - metricReconnectingPTYHandler.Add(1) + a.metrics.handler.Add(1) a.connCountReconnectingPTY.Add(1) defer a.connCountReconnectingPTY.Add(-1) @@ -1000,7 +1012,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m logger.Debug(ctx, "session error after agent close", slog.Error(err)) } else { logger.Error(ctx, "session error", slog.Error(err)) - metricReconnectingPTYError.Add(1) + a.metrics.handlerError.Add(1) } } logger.Debug(ctx, "session closed") @@ -1020,7 +1032,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m // Empty command will default to the users shell! cmd, err := a.sshServer.CreateCommand(ctx, msg.Command, nil) if err != nil { - metricReconnectingPTYCreateCommandError.Add(1) + a.metrics.createCommandError.Add(1) return xerrors.Errorf("create command: %w", err) } cmd.Env = append(cmd.Env, "TERM=xterm-256color") @@ -1033,7 +1045,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m ptty, process, err := pty.Start(cmd) if err != nil { - metricReconnectingPTYCmdStartError.Add(1) + a.metrics.cmdStartError.Add(1) return xerrors.Errorf("start command: %w", err) } @@ -1064,7 +1076,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m logger.Debug(ctx, "unable to read pty output, command exited?", slog.Error(err)) } else { logger.Warn(ctx, "unable to read pty output, command exited?", slog.Error(err)) - metricReconnectingPTYOutputReaderError.Add(1) + a.metrics.outputReaderError.Add(1) } break } @@ -1085,7 +1097,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m slog.F("other_conn_id", cid), slog.Error(err), ) - metricReconnectingPTYWriteError.Add(1) + a.metrics.writeError.Add(1) } } rpty.activeConnsMutex.Unlock() @@ -1105,7 +1117,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m if err != nil { // We can continue after this, it's not fatal! logger.Error(ctx, "resize", slog.Error(err)) - metricReconnectingPTYResizeError.Add(1) + a.metrics.resizeError.Add(1) } // Write any previously stored data for the TTY. rpty.circularBufferMutex.RLock() @@ -1118,7 +1130,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m // while also holding circularBufferMutex seems dangerous. _, err = conn.Write(prevBuf) if err != nil { - metricReconnectingPTYWriteError.Add(1) + a.metrics.writeError.Add(1) return xerrors.Errorf("write buffer to conn: %w", err) } // Multiple connections to the same TTY are permitted. @@ -1169,7 +1181,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m _, err = rpty.ptty.InputWriter().Write([]byte(req.Data)) if err != nil { logger.Warn(ctx, "write to pty", slog.Error(err)) - metricReconnectingPTYInputWriterError.Add(1) + a.metrics.inputWriterError.Add(1) return nil } // Check if a resize needs to happen! @@ -1180,7 +1192,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m if err != nil { // We can continue after this, it's not fatal! logger.Error(ctx, "resize", slog.Error(err)) - metricReconnectingPTYResizeError.Add(1) + a.metrics.resizeError.Add(1) } } } @@ -1213,7 +1225,7 @@ func (a *agent) startReportingConnectionStats(ctx context.Context) { var mu sync.Mutex status := a.network.Status() durations := []float64{} - ctx, cancelFunc := context.WithTimeout(ctx, 5*time.Second) + pingCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second) defer cancelFunc() for nodeID, peer := range status.Peer { if !peer.Active { @@ -1229,7 +1241,7 @@ func (a *agent) startReportingConnectionStats(ctx context.Context) { wg.Add(1) go func() { defer wg.Done() - duration, _, _, err := a.network.Ping(ctx, addresses[0].Addr()) + duration, _, _, err := a.network.Ping(pingCtx, addresses[0].Addr()) if err != nil { return } @@ -1254,7 +1266,10 @@ func (a *agent) startReportingConnectionStats(ctx context.Context) { // Collect agent metrics. // Agent metrics are changing all the time, so there is no need to perform // reflect.DeepEqual to see if stats should be transferred. - stats.Metrics = collectMetrics() + + metricsCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second) + defer cancelFunc() + stats.Metrics = a.collectMetrics(metricsCtx) a.latestStat.Store(stats) diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index da4663a6d72ea..62e669dc20a90 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -20,6 +20,7 @@ import ( "github.com/gliderlabs/ssh" "github.com/pkg/sftp" + "github.com/prometheus/client_golang/prometheus" "github.com/spf13/afero" "go.uber.org/atomic" gossh "golang.org/x/crypto/ssh" @@ -69,9 +70,12 @@ type Server struct { connCountVSCode atomic.Int64 connCountJetBrains atomic.Int64 connCountSSHSession atomic.Int64 + + prometheusRegistry *prometheus.Registry + metrics *sshServerMetrics } -func NewServer(ctx context.Context, logger slog.Logger, fs afero.Fs, maxTimeout time.Duration, x11SocketDir string) (*Server, error) { +func NewServer(ctx context.Context, logger slog.Logger, prometheusRegistry *prometheus.Registry, fs afero.Fs, maxTimeout time.Duration, x11SocketDir string) (*Server, error) { // Clients' should ignore the host key when connecting. // The agent needs to authenticate with coderd to SSH, // so SSH authentication doesn't improve security. @@ -90,6 +94,7 @@ func NewServer(ctx context.Context, logger slog.Logger, fs afero.Fs, maxTimeout forwardHandler := &ssh.ForwardedTCPHandler{} unixForwardHandler := &forwardedUnixHandler{log: logger} + metrics := newSSHServerMetrics(prometheusRegistry) s := &Server{ listeners: make(map[net.Listener]struct{}), fs: fs, @@ -97,6 +102,9 @@ func NewServer(ctx context.Context, logger slog.Logger, fs afero.Fs, maxTimeout sessions: make(map[ssh.Session]struct{}), logger: logger, x11SocketDir: x11SocketDir, + + prometheusRegistry: prometheusRegistry, + metrics: metrics, } s.srv = &ssh.Server{ @@ -107,7 +115,7 @@ func NewServer(ctx context.Context, logger slog.Logger, fs afero.Fs, maxTimeout }, ConnectionFailedCallback: func(_ net.Conn, err error) { s.logger.Warn(ctx, "ssh connection failed", slog.Error(err)) - metricConnectionFailedCallback.Add(1) + metrics.connectionFailedCallback.Add(1) }, Handler: s.sessionHandler, HostSigners: []ssh.Signer{randomSigner}, @@ -116,11 +124,11 @@ func NewServer(ctx context.Context, logger slog.Logger, fs afero.Fs, maxTimeout s.logger.Debug(ctx, "local port forward", slog.F("destination-host", destinationHost), slog.F("destination-port", destinationPort)) - metricLocalPortForwardingCallback.Add(1) + metrics.localPortForwardingCallback.Add(1) return true }, PtyCallback: func(ctx ssh.Context, pty ssh.Pty) bool { - metricPtyCallback.Add(1) + metrics.ptyCallback.Add(1) return true }, ReversePortForwardingCallback: func(ctx ssh.Context, bindHost string, bindPort uint32) bool { @@ -128,7 +136,7 @@ func NewServer(ctx context.Context, logger slog.Logger, fs afero.Fs, maxTimeout s.logger.Debug(ctx, "local port forward", slog.F("bind-host", bindHost), slog.F("bind-port", bindPort)) - metricReversePortForwardingCallback.Add(1) + metrics.reversePortForwardingCallback.Add(1) return true }, RequestHandlers: map[string]ssh.RequestHandler{ @@ -405,7 +413,7 @@ func (s *Server) startPTYSession(session ptySession, m sessionMetricsObject, cmd } func (s *Server) sftpHandler(session ssh.Session) { - metricSftpHandler.Add(1) + s.metrics.sftpHandler.Add(1) ctx := session.Context() @@ -446,7 +454,7 @@ func (s *Server) sftpHandler(session ssh.Session) { return } s.logger.Warn(ctx, "sftp server closed with error", slog.Error(err)) - metricSftpServerError.Add(1) + s.metrics.sftpServerError.Add(1) _ = session.Exit(1) } diff --git a/agent/agentssh/metrics.go b/agent/agentssh/metrics.go index e116091a7a7c2..53e10ec46c57f 100644 --- a/agent/agentssh/metrics.go +++ b/agent/agentssh/metrics.go @@ -3,25 +3,85 @@ package agentssh import ( "fmt" + "github.com/prometheus/client_golang/prometheus" "tailscale.com/util/clientmetric" ) -var ( +type sshServerMetrics struct { // SSH callbacks - metricConnectionFailedCallback = clientmetric.NewCounter("ssh_connection_failed_callback") - metricLocalPortForwardingCallback = clientmetric.NewCounter("ssh_local_port_forwarding_callback") - metricPtyCallback = clientmetric.NewCounter("ssh_pty_callback") - metricReversePortForwardingCallback = clientmetric.NewCounter("ssh_reverse_port_forwarding_callback") - metricX11Callback = clientmetric.NewCounter("ssh_x11_callback") + connectionFailedCallback prometheus.Counter + localPortForwardingCallback prometheus.Counter + ptyCallback prometheus.Counter + reversePortForwardingCallback prometheus.Counter + x11Callback prometheus.Counter // SFTP - metricSftpHandler = clientmetric.NewCounter("ssh_sftp_handler") - metricSftpServerError = clientmetric.NewCounter("ssh_sftp_server_error") + sftpHandler prometheus.Counter + sftpServerError prometheus.Counter // X11 - metricX11SocketDirError = clientmetric.NewCounter("ssh_x11_socket_dir_error") - metricX11XauthorityError = clientmetric.NewCounter("ssh_x11_xauthority_error") -) + x11SocketDirError prometheus.Counter + x11XauthorityError prometheus.Counter +} + +func newSSHServerMetrics(registerer prometheus.Registerer) *sshServerMetrics { + connectionFailedCallback := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "ssh_server", Name: "connection_failed_callback", + }) + registerer.MustRegister(connectionFailedCallback) + + localPortForwardingCallback := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "ssh_server", Name: "local_port_forwarding_callback", + }) + registerer.MustRegister(localPortForwardingCallback) + + ptyCallback := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "ssh_server", Name: "pty_callback", + }) + registerer.MustRegister(ptyCallback) + + reversePortForwardingCallback := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "ssh_server", Name: "reverse_port_forwarding_callback", + }) + registerer.MustRegister(reversePortForwardingCallback) + + x11Callback := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "ssh_server", Name: "x11_callback", + }) + registerer.MustRegister(x11Callback) + + sftpHandler := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "ssh_server", Name: "sftp_handler", + }) + registerer.MustRegister(sftpHandler) + + sftpServerError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "ssh_server", Name: "sftp_server_error", + }) + registerer.MustRegister(sftpServerError) + + x11SocketDirError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "ssh_server", Name: "x11_socket_dir_error", + }) + registerer.MustRegister(x11SocketDirError) + + x11XauthorityError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "ssh_server", Name: "x11_xauthority_error", + }) + registerer.MustRegister(x11XauthorityError) + + return &sshServerMetrics{ + connectionFailedCallback: connectionFailedCallback, + localPortForwardingCallback: localPortForwardingCallback, + ptyCallback: ptyCallback, + reversePortForwardingCallback: reversePortForwardingCallback, + x11Callback: x11Callback, + sftpHandler: sftpHandler, + sftpServerError: sftpServerError, + x11SocketDirError: x11SocketDirError, + x11XauthorityError: x11XauthorityError, + } +} var sessionMetrics = map[string]sessionMetricsObject{} diff --git a/agent/metrics.go b/agent/metrics.go index 9d16e4498c1c3..ea82df42ab028 100644 --- a/agent/metrics.go +++ b/agent/metrics.go @@ -1,30 +1,87 @@ package agent import ( + "context" "fmt" "strings" + "github.com/prometheus/client_golang/prometheus" "tailscale.com/util/clientmetric" + "cdr.dev/slog" + "github.com/coder/coder/codersdk/agentsdk" ) -var ( - // ReconnectingPTY - metricReconnectingPTYHandler = clientmetric.NewCounter("ssh_reconnecting_pty_handler") - metricReconnectingPTYError = clientmetric.NewCounter("ssh_reconnecting_pty_error") - metricReconnectingPTYCreateCommandError = clientmetric.NewCounter("ssh_reconnecting_pty_create_command_error") - metricReconnectingPTYCmdStartError = clientmetric.NewCounter("ssh_reconnecting_pty_cmd_start_error") - metricReconnectingPTYWriteError = clientmetric.NewCounter("ssh_reconnecting_pty_write_error") - metricReconnectingPTYResizeError = clientmetric.NewCounter("ssh_reconnecting_pty_resize_error") - metricReconnectingPTYOutputReaderError = clientmetric.NewCounter("ssh_reconnecting_pty_output_reader_error") - metricReconnectingPTYInputWriterError = clientmetric.NewCounter("ssh_reconnecting_pty_input_writer_error") -) +type agentMetrics struct { + handler prometheus.Counter + handlerError prometheus.Counter + createCommandError prometheus.Counter + cmdStartError prometheus.Counter + writeError prometheus.Counter + resizeError prometheus.Counter + outputReaderError prometheus.Counter + inputWriterError prometheus.Counter +} + +func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics { + handler := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "reconnecting_pty", Name: "handler", + }) + registerer.MustRegister(handler) + + handlerError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "reconnecting_pty", Name: "error", + }) + registerer.MustRegister(handlerError) + + createCommandError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "reconnecting_pty", Name: "create_command_error", + }) + registerer.MustRegister(createCommandError) + + cmdStartError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "reconnecting_pty", Name: "cmd_start_error", + }) + registerer.MustRegister(cmdStartError) + + writeError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "reconnecting_pty", Name: "write_error", + }) + registerer.MustRegister(writeError) + + resizeError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "reconnecting_pty", Name: "resize_error", + }) + registerer.MustRegister(resizeError) + + outputReaderError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "reconnecting_pty", Name: "output_reader_error", + }) + registerer.MustRegister(outputReaderError) -func collectMetrics() []agentsdk.AgentMetric { - // Tailscale metrics + inputWriterError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "reconnecting_pty", Name: "input_writer_error", + }) + registerer.MustRegister(inputWriterError) + + return &agentMetrics{ + handler: handler, + handlerError: handlerError, + createCommandError: createCommandError, + cmdStartError: cmdStartError, + writeError: writeError, + resizeError: resizeError, + outputReaderError: outputReaderError, + inputWriterError: inputWriterError, + } +} + +func (a *agent) collectMetrics(ctx context.Context) []agentsdk.AgentMetric { + var collected []agentsdk.AgentMetric + + // Tailscale internal metrics metrics := clientmetric.Metrics() - collected := make([]agentsdk.AgentMetric, 0, len(metrics)) for _, m := range metrics { if isIgnoredMetric(m.Name()) { continue @@ -36,6 +93,33 @@ func collectMetrics() []agentsdk.AgentMetric { Value: float64(m.Value()), }) } + + // Agent metrics + metricFamilies, err := a.prometheusRegistry.Gather() + if err != nil { + a.logger.Error(ctx, "can't gather agent metrics", slog.Error(err)) + return collected + } + + for _, metricFamily := range metricFamilies { + for _, metric := range metricFamily.GetMetric() { + if metric.Counter != nil { + collected = append(collected, agentsdk.AgentMetric{ + Name: metricFamily.GetName(), + Type: agentsdk.AgentMetricTypeCounter, + Value: metric.Counter.GetValue(), + }) + } else if metric.Gauge != nil { + collected = append(collected, agentsdk.AgentMetric{ + Name: metricFamily.GetName(), + Type: agentsdk.AgentMetricTypeGauge, + Value: metric.Gauge.GetValue(), + }) + } else { + a.logger.Error(ctx, "unsupported metric type", slog.F("type", metricFamily.Type.String())) + } + } + } return collected } diff --git a/cli/agent.go b/cli/agent.go index da39f1caa22b9..1cddd0055fe81 100644 --- a/cli/agent.go +++ b/cli/agent.go @@ -27,6 +27,8 @@ import ( "github.com/coder/coder/buildinfo" "github.com/coder/coder/cli/clibase" "github.com/coder/coder/codersdk/agentsdk" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/expfmt" ) func (r *RootCmd) workspaceAgent() *clibase.Cmd { @@ -129,8 +131,6 @@ func (r *RootCmd) workspaceAgent() *clibase.Cmd { ignorePorts[port] = "pprof" } - prometheusSrvClose := ServeHandler(ctx, logger, prometheusMetricsHandler(), prometheusAddress, "prometheus") - defer prometheusSrvClose() // Do a best effort here. If this fails, it's not a big deal. if port, err := urlPort(prometheusAddress); err == nil { ignorePorts[port] = "prometheus" @@ -197,6 +197,8 @@ func (r *RootCmd) workspaceAgent() *clibase.Cmd { return xerrors.Errorf("add executable to $PATH: %w", err) } + prometheusRegistry := prometheus.NewRegistry() + agnt := agent.New(agent.Options{ Client: client, Logger: logger, @@ -218,8 +220,13 @@ func (r *RootCmd) workspaceAgent() *clibase.Cmd { }, IgnorePorts: ignorePorts, SSHMaxTimeout: sshMaxTimeout, + + PrometheusRegistry: prometheusRegistry, }) + prometheusSrvClose := ServeHandler(ctx, logger, prometheusMetricsHandler(prometheusRegistry, logger), prometheusAddress, "prometheus") + defer prometheusSrvClose() + debugSrvClose := ServeHandler(ctx, logger, agnt.HTTPDebug(), debugAddress, "debug") defer debugSrvClose() // Do a best effort here. If this fails, it's not a big deal. @@ -376,11 +383,22 @@ func urlPort(u string) (int, error) { return -1, xerrors.Errorf("invalid port: %s", u) } -func prometheusMetricsHandler() http.Handler { +func prometheusMetricsHandler(prometheusRegistry *prometheus.Registry, logger slog.Logger) http.Handler { // We don't have any other internal metrics so far, so it's safe to expose metrics this way. // Based on: https://github.com/tailscale/tailscale/blob/280255acae604796a1113861f5a84e6fa2dc6121/ipn/localapi/localapi.go#L489 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain") + clientmetric.WritePrometheusExpositionFormat(w) + + metricFamilies, err := prometheusRegistry.Gather() + if err != nil { + logger.Error(context.Background(), "Prometheus handler can't gather metric families", slog.Error(err)) + return + } + + for _, metricFamily := range metricFamilies { + expfmt.MetricFamilyToText(w, metricFamily) + } }) } From 0d0f3008107e8723d1ede25e0b99b261e0f3b8d2 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 18 May 2023 17:31:09 +0200 Subject: [PATCH 09/38] Refactored to client_golang/prometheus --- agent/agentssh/metrics.go | 154 ++++++++++++++++++++++++++++---------- agent/agentssh/x11.go | 6 +- 2 files changed, 118 insertions(+), 42 deletions(-) diff --git a/agent/agentssh/metrics.go b/agent/agentssh/metrics.go index 53e10ec46c57f..371f8965011ec 100644 --- a/agent/agentssh/metrics.go +++ b/agent/agentssh/metrics.go @@ -4,7 +4,6 @@ import ( "fmt" "github.com/prometheus/client_golang/prometheus" - "tailscale.com/util/clientmetric" ) type sshServerMetrics struct { @@ -83,56 +82,133 @@ func newSSHServerMetrics(registerer prometheus.Registerer) *sshServerMetrics { } } -var sessionMetrics = map[string]sessionMetricsObject{} - type sessionMetricsObject struct { // Agent sessions - agentCreateCommandError *clientmetric.Metric - agentListenerError *clientmetric.Metric - startPTYSession *clientmetric.Metric - startNonPTYSession *clientmetric.Metric - sessionError *clientmetric.Metric + agentCreateCommandError prometheus.Counter + agentListenerError prometheus.Counter + startPTYSession prometheus.Counter + startNonPTYSession prometheus.Counter + sessionError prometheus.Counter // Non-PTY sessions - nonPTYStdinPipeError *clientmetric.Metric - nonPTYStdinIoCopyError *clientmetric.Metric - nonPTYCmdStartError *clientmetric.Metric + nonPTYStdinPipeError prometheus.Counter + nonPTYStdinIoCopyError prometheus.Counter + nonPTYCmdStartError prometheus.Counter // PTY sessions - ptyMotdError *clientmetric.Metric - ptyCmdStartError *clientmetric.Metric - ptyCloseError *clientmetric.Metric - ptyResizeError *clientmetric.Metric - ptyInputIoCopyError *clientmetric.Metric - ptyOutputIoCopyError *clientmetric.Metric - ptyWaitError *clientmetric.Metric + ptyMotdError prometheus.Counter + ptyCmdStartError prometheus.Counter + ptyCloseError prometheus.Counter + ptyResizeError prometheus.Counter + ptyInputIoCopyError prometheus.Counter + ptyOutputIoCopyError prometheus.Counter + ptyWaitError prometheus.Counter } -func init() { +type sessionMetrics map[string]sessionMetricsObject + +func newSessionMetrics(registerer prometheus.Registerer) sessionMetrics { + sm := sessionMetrics{} for _, magicType := range []string{MagicSessionTypeVSCode, MagicSessionTypeJetBrains, "ssh", "unknown"} { - sessionMetrics[magicType] = sessionMetricsObject{ - agentCreateCommandError: clientmetric.NewCounter(fmt.Sprintf("ssh_agent_%s_create_command_error", magicType)), - agentListenerError: clientmetric.NewCounter(fmt.Sprintf("ssh_agent_%s_listener_error", magicType)), - startPTYSession: clientmetric.NewCounter(fmt.Sprintf("ssh_agent_%s_start_pty_session", magicType)), - startNonPTYSession: clientmetric.NewCounter(fmt.Sprintf("ssh_agent_%s_start_non_pty_session", magicType)), - sessionError: clientmetric.NewCounter(fmt.Sprintf("ssh_agent_%s_session_error", magicType)), - - nonPTYStdinPipeError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_non_pty_stdin_pipe_error", magicType)), - nonPTYStdinIoCopyError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_non_pty_stdin_io_copy_error", magicType)), - nonPTYCmdStartError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_non_pty_cmd_start_error", magicType)), - - ptyMotdError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_pty_motd_error", magicType)), - ptyCmdStartError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_pty_cmd_start_error", magicType)), - ptyCloseError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_pty_close_error", magicType)), - ptyResizeError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_pty_resize_error", magicType)), - ptyInputIoCopyError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_pty_input_io_copy_error", magicType)), - ptyOutputIoCopyError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_pty_output_io_copy_error", magicType)), - ptyWaitError: clientmetric.NewCounter(fmt.Sprintf("ssh_server_%s_pty_wait_error", magicType)), + agentCreateCommandError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "create_command_error", + }) + registerer.MustRegister(agentCreateCommandError) + + agentListenerError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "listener_error", + }) + registerer.MustRegister(agentListenerError) + + startPTYSession := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "start_pty_session", + }) + registerer.MustRegister(startPTYSession) + + startNonPTYSession := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "start_non_pty_session", + }) + registerer.MustRegister(startNonPTYSession) + + sessionError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "session_error", + }) + registerer.MustRegister(sessionError) + + nonPTYStdinPipeError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "non_pty_stdin_pipe_error", + }) + registerer.MustRegister(nonPTYStdinPipeError) + + nonPTYStdinIoCopyError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "non_pty_io_copy_error", + }) + registerer.MustRegister(nonPTYStdinIoCopyError) + + nonPTYCmdStartError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "non_pty_io_start_error", + }) + registerer.MustRegister(nonPTYCmdStartError) + + ptyMotdError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "pty_motd_error", + }) + registerer.MustRegister(ptyMotdError) + + ptyCmdStartError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "pty_cmd_start_error", + }) + registerer.MustRegister(ptyCmdStartError) + + ptyCloseError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "pty_close_error", + }) + registerer.MustRegister(ptyCloseError) + + ptyResizeError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "pty_resize_error", + }) + registerer.MustRegister(ptyResizeError) + + ptyInputIoCopyError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "pty_input_io_copy_error", + }) + registerer.MustRegister(ptyInputIoCopyError) + + ptyOutputIoCopyError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "pty_output_io_copy_error", + }) + registerer.MustRegister(ptyOutputIoCopyError) + + ptyWaitError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "pty_wait_error", + }) + registerer.MustRegister(ptyWaitError) + + sm[magicType] = sessionMetricsObject{ + agentCreateCommandError: agentCreateCommandError, + agentListenerError: agentListenerError, + startPTYSession: startPTYSession, + startNonPTYSession: startNonPTYSession, + sessionError: sessionError, + + nonPTYStdinPipeError: nonPTYStdinPipeError, + nonPTYStdinIoCopyError: nonPTYStdinIoCopyError, + nonPTYCmdStartError: nonPTYCmdStartError, + + ptyMotdError: ptyMotdError, + ptyCmdStartError: ptyCmdStartError, + ptyCloseError: ptyCloseError, + ptyResizeError: ptyResizeError, + ptyInputIoCopyError: ptyInputIoCopyError, + ptyOutputIoCopyError: ptyOutputIoCopyError, + ptyWaitError: ptyWaitError, } } + return sm } -func metricsForSession(magicType string) sessionMetricsObject { +func metricsForSession(m sessionMetrics, magicType string) sessionMetricsObject { switch magicType { case MagicSessionTypeVSCode: case MagicSessionTypeJetBrains: @@ -141,5 +217,5 @@ func metricsForSession(magicType string) sessionMetricsObject { default: magicType = "unknown" } - return sessionMetrics[magicType] + return m[magicType] } diff --git a/agent/agentssh/x11.go b/agent/agentssh/x11.go index 340ab6b1b0eab..fe072e0f3fdd1 100644 --- a/agent/agentssh/x11.go +++ b/agent/agentssh/x11.go @@ -24,7 +24,7 @@ import ( // x11Callback is called when the client requests X11 forwarding. // It adds an Xauthority entry to the Xauthority file. func (s *Server) x11Callback(ctx ssh.Context, x11 ssh.X11) bool { - metricX11Callback.Add(1) + s.metrics.x11Callback.Add(1) hostname, err := os.Hostname() if err != nil { @@ -35,14 +35,14 @@ func (s *Server) x11Callback(ctx ssh.Context, x11 ssh.X11) bool { err = s.fs.MkdirAll(s.x11SocketDir, 0o700) if err != nil { s.logger.Warn(ctx, "failed to make the x11 socket dir", slog.F("dir", s.x11SocketDir), slog.Error(err)) - metricX11SocketDirError.Add(1) + s.metrics.x11SocketDirError.Add(1) return false } err = addXauthEntry(ctx, s.fs, hostname, strconv.Itoa(int(x11.ScreenNumber)), x11.AuthProtocol, x11.AuthCookie) if err != nil { s.logger.Warn(ctx, "failed to add Xauthority entry", slog.Error(err)) - metricX11XauthorityError.Add(1) + s.metrics.x11XauthorityError.Add(1) return false } return true From 315b5ce4a5a34cd46b41b2e3982516b8b66d57fa Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 18 May 2023 17:33:54 +0200 Subject: [PATCH 10/38] fix --- agent/agentssh/agentssh.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index 62e669dc20a90..0ac5a4a5d67b0 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -72,7 +72,9 @@ type Server struct { connCountSSHSession atomic.Int64 prometheusRegistry *prometheus.Registry - metrics *sshServerMetrics + + metrics *sshServerMetrics + sessionMetrics sessionMetrics } func NewServer(ctx context.Context, logger slog.Logger, prometheusRegistry *prometheus.Registry, fs afero.Fs, maxTimeout time.Duration, x11SocketDir string) (*Server, error) { @@ -95,6 +97,7 @@ func NewServer(ctx context.Context, logger slog.Logger, prometheusRegistry *prom unixForwardHandler := &forwardedUnixHandler{log: logger} metrics := newSSHServerMetrics(prometheusRegistry) + sessionMetrics := newSessionMetrics(prometheusRegistry) s := &Server{ listeners: make(map[net.Listener]struct{}), fs: fs, @@ -105,6 +108,7 @@ func NewServer(ctx context.Context, logger slog.Logger, prometheusRegistry *prom prometheusRegistry: prometheusRegistry, metrics: metrics, + sessionMetrics: sessionMetrics, } s.srv = &ssh.Server{ @@ -206,7 +210,7 @@ func (s *Server) sessionHandler(session ssh.Session) { return } - m := metricsForSession(magicType(session)) + m := metricsForSession(s.sessionMetrics, magicType(session)) err := s.sessionStart(session, m, extraEnv) var exitError *exec.ExitError if xerrors.As(err, &exitError) { From 43d5d4058af47aa0611313f6705a1328721f76b8 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 18 May 2023 17:38:03 +0200 Subject: [PATCH 11/38] fix --- agent/agentssh/metrics.go | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/agent/agentssh/metrics.go b/agent/agentssh/metrics.go index 371f8965011ec..4a0657345c443 100644 --- a/agent/agentssh/metrics.go +++ b/agent/agentssh/metrics.go @@ -111,77 +111,77 @@ func newSessionMetrics(registerer prometheus.Registerer) sessionMetrics { sm := sessionMetrics{} for _, magicType := range []string{MagicSessionTypeVSCode, MagicSessionTypeJetBrains, "ssh", "unknown"} { agentCreateCommandError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "create_command_error", + Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "create_command_error", }) registerer.MustRegister(agentCreateCommandError) agentListenerError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "listener_error", + Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "listener_error", }) registerer.MustRegister(agentListenerError) startPTYSession := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "start_pty_session", + Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "start_pty_session", }) registerer.MustRegister(startPTYSession) startNonPTYSession := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "start_non_pty_session", + Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "start_non_pty_session", }) registerer.MustRegister(startNonPTYSession) sessionError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "session_error", + Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "error", }) registerer.MustRegister(sessionError) nonPTYStdinPipeError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "non_pty_stdin_pipe_error", + Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "non_pty_stdin_pipe_error", }) registerer.MustRegister(nonPTYStdinPipeError) nonPTYStdinIoCopyError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "non_pty_io_copy_error", + Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "non_pty_io_copy_error", }) registerer.MustRegister(nonPTYStdinIoCopyError) nonPTYCmdStartError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "non_pty_io_start_error", + Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "non_pty_io_start_error", }) registerer.MustRegister(nonPTYCmdStartError) ptyMotdError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "pty_motd_error", + Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "pty_motd_error", }) registerer.MustRegister(ptyMotdError) ptyCmdStartError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "pty_cmd_start_error", + Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "pty_cmd_start_error", }) registerer.MustRegister(ptyCmdStartError) ptyCloseError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "pty_close_error", + Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "pty_close_error", }) registerer.MustRegister(ptyCloseError) ptyResizeError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "pty_resize_error", + Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "pty_resize_error", }) registerer.MustRegister(ptyResizeError) ptyInputIoCopyError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "pty_input_io_copy_error", + Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "pty_input_io_copy_error", }) registerer.MustRegister(ptyInputIoCopyError) ptyOutputIoCopyError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "pty_output_io_copy_error", + Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "pty_output_io_copy_error", }) registerer.MustRegister(ptyOutputIoCopyError) ptyWaitError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("ssh_session_%s", magicType), Name: "pty_wait_error", + Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "pty_wait_error", }) registerer.MustRegister(ptyWaitError) From 85f8860a71dc40a914744245de11b566f0a18c02 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 18 May 2023 17:42:51 +0200 Subject: [PATCH 12/38] refactor --- agent/agentssh/agentssh.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index 0ac5a4a5d67b0..afbcf4222f71e 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -106,9 +106,8 @@ func NewServer(ctx context.Context, logger slog.Logger, prometheusRegistry *prom logger: logger, x11SocketDir: x11SocketDir, - prometheusRegistry: prometheusRegistry, - metrics: metrics, - sessionMetrics: sessionMetrics, + metrics: metrics, + sessionMetrics: sessionMetrics, } s.srv = &ssh.Server{ From 34f07fc76b1f74b6f743e4277efff73f766fe8ff Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 18 May 2023 17:48:38 +0200 Subject: [PATCH 13/38] refactor --- agent/agentssh/agentssh.go | 9 ++---- agent/agentssh/metrics.go | 56 +++++++++++++++++++++----------------- 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index afbcf4222f71e..6669979e11e2c 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -73,8 +73,7 @@ type Server struct { prometheusRegistry *prometheus.Registry - metrics *sshServerMetrics - sessionMetrics sessionMetrics + metrics *sshServerMetrics } func NewServer(ctx context.Context, logger slog.Logger, prometheusRegistry *prometheus.Registry, fs afero.Fs, maxTimeout time.Duration, x11SocketDir string) (*Server, error) { @@ -97,7 +96,6 @@ func NewServer(ctx context.Context, logger slog.Logger, prometheusRegistry *prom unixForwardHandler := &forwardedUnixHandler{log: logger} metrics := newSSHServerMetrics(prometheusRegistry) - sessionMetrics := newSessionMetrics(prometheusRegistry) s := &Server{ listeners: make(map[net.Listener]struct{}), fs: fs, @@ -106,8 +104,7 @@ func NewServer(ctx context.Context, logger slog.Logger, prometheusRegistry *prom logger: logger, x11SocketDir: x11SocketDir, - metrics: metrics, - sessionMetrics: sessionMetrics, + metrics: metrics, } s.srv = &ssh.Server{ @@ -209,7 +206,7 @@ func (s *Server) sessionHandler(session ssh.Session) { return } - m := metricsForSession(s.sessionMetrics, magicType(session)) + m := metricsForSession(s.metrics.sessions, magicType(session)) err := s.sessionStart(session, m, extraEnv) var exitError *exec.ExitError if xerrors.As(err, &exitError) { diff --git a/agent/agentssh/metrics.go b/agent/agentssh/metrics.go index 4a0657345c443..012a0af47924d 100644 --- a/agent/agentssh/metrics.go +++ b/agent/agentssh/metrics.go @@ -21,8 +21,35 @@ type sshServerMetrics struct { // X11 x11SocketDirError prometheus.Counter x11XauthorityError prometheus.Counter + + sessions sessionMetrics } +type sessionMetricsObject struct { + // Agent sessions + agentCreateCommandError prometheus.Counter + agentListenerError prometheus.Counter + startPTYSession prometheus.Counter + startNonPTYSession prometheus.Counter + sessionError prometheus.Counter + + // Non-PTY sessions + nonPTYStdinPipeError prometheus.Counter + nonPTYStdinIoCopyError prometheus.Counter + nonPTYCmdStartError prometheus.Counter + + // PTY sessions + ptyMotdError prometheus.Counter + ptyCmdStartError prometheus.Counter + ptyCloseError prometheus.Counter + ptyResizeError prometheus.Counter + ptyInputIoCopyError prometheus.Counter + ptyOutputIoCopyError prometheus.Counter + ptyWaitError prometheus.Counter +} + +type sessionMetrics map[string]sessionMetricsObject + func newSSHServerMetrics(registerer prometheus.Registerer) *sshServerMetrics { connectionFailedCallback := prometheus.NewCounter(prometheus.CounterOpts{ Namespace: "agent", Subsystem: "ssh_server", Name: "connection_failed_callback", @@ -69,6 +96,8 @@ func newSSHServerMetrics(registerer prometheus.Registerer) *sshServerMetrics { }) registerer.MustRegister(x11XauthorityError) + sessions := newSessionMetrics(registerer) + return &sshServerMetrics{ connectionFailedCallback: connectionFailedCallback, localPortForwardingCallback: localPortForwardingCallback, @@ -79,34 +108,11 @@ func newSSHServerMetrics(registerer prometheus.Registerer) *sshServerMetrics { sftpServerError: sftpServerError, x11SocketDirError: x11SocketDirError, x11XauthorityError: x11XauthorityError, - } -} - -type sessionMetricsObject struct { - // Agent sessions - agentCreateCommandError prometheus.Counter - agentListenerError prometheus.Counter - startPTYSession prometheus.Counter - startNonPTYSession prometheus.Counter - sessionError prometheus.Counter - - // Non-PTY sessions - nonPTYStdinPipeError prometheus.Counter - nonPTYStdinIoCopyError prometheus.Counter - nonPTYCmdStartError prometheus.Counter - // PTY sessions - ptyMotdError prometheus.Counter - ptyCmdStartError prometheus.Counter - ptyCloseError prometheus.Counter - ptyResizeError prometheus.Counter - ptyInputIoCopyError prometheus.Counter - ptyOutputIoCopyError prometheus.Counter - ptyWaitError prometheus.Counter + sessions: sessions, + } } -type sessionMetrics map[string]sessionMetricsObject - func newSessionMetrics(registerer prometheus.Registerer) sessionMetrics { sm := sessionMetrics{} for _, magicType := range []string{MagicSessionTypeVSCode, MagicSessionTypeJetBrains, "ssh", "unknown"} { From 59fd58501f22e24e567e25532de3ccd0ef16b446 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 18 May 2023 18:04:49 +0200 Subject: [PATCH 14/38] fix test --- agent/agentssh/agentssh_internal_test.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/agent/agentssh/agentssh_internal_test.go b/agent/agentssh/agentssh_internal_test.go index 983ba436b7370..fe46632c9f237 100644 --- a/agent/agentssh/agentssh_internal_test.go +++ b/agent/agentssh/agentssh_internal_test.go @@ -10,6 +10,7 @@ import ( "testing" gliderssh "github.com/gliderlabs/ssh" + "github.com/prometheus/client_golang/prometheus" "github.com/spf13/afero" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -36,7 +37,7 @@ func Test_sessionStart_orphan(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitMedium) defer cancel() logger := slogtest.Make(t, nil) - s, err := NewServer(ctx, logger, afero.NewMemMapFs(), 0, "") + s, err := NewServer(ctx, logger, prometheus.NewRegistry(), afero.NewMemMapFs(), 0, "") require.NoError(t, err) // Here we're going to call the handler directly with a faked SSH session @@ -60,7 +61,7 @@ func Test_sessionStart_orphan(t *testing.T) { // we don't really care what the error is here. In the larger scenario, // the client has disconnected, so we can't return any error information // to them. - m := metricsForSession("ssh") + m := metricsForSession(s.metrics.sessions, "ssh") _ = s.startPTYSession(sess, m, cmd, ptyInfo, windowSize) }() From 8cd927c757bbc6464b121033ef6db90f5cb25e16 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 18 May 2023 18:16:36 +0200 Subject: [PATCH 15/38] fix --- agent/agentssh/agentssh_test.go | 5 +++-- agent/agentssh/x11_test.go | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/agent/agentssh/agentssh_test.go b/agent/agentssh/agentssh_test.go index b1675f0029a2c..d2cb56d44df10 100644 --- a/agent/agentssh/agentssh_test.go +++ b/agent/agentssh/agentssh_test.go @@ -10,6 +10,7 @@ import ( "sync" "testing" + "github.com/prometheus/client_golang/prometheus" "github.com/spf13/afero" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -33,7 +34,7 @@ func TestNewServer_ServeClient(t *testing.T) { ctx := context.Background() logger := slogtest.Make(t, nil) - s, err := agentssh.NewServer(ctx, logger, afero.NewMemMapFs(), 0, "") + s, err := agentssh.NewServer(ctx, logger, prometheus.NewRegistry(), afero.NewMemMapFs(), 0, "") require.NoError(t, err) // The assumption is that these are set before serving SSH connections. @@ -74,7 +75,7 @@ func TestNewServer_CloseActiveConnections(t *testing.T) { ctx := context.Background() logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}) - s, err := agentssh.NewServer(ctx, logger, afero.NewMemMapFs(), 0, "") + s, err := agentssh.NewServer(ctx, logger, prometheus.NewRegistry(), afero.NewMemMapFs(), 0, "") require.NoError(t, err) // The assumption is that these are set before serving SSH connections. diff --git a/agent/agentssh/x11_test.go b/agent/agentssh/x11_test.go index cd935d326858c..1fce885bab780 100644 --- a/agent/agentssh/x11_test.go +++ b/agent/agentssh/x11_test.go @@ -10,6 +10,7 @@ import ( "testing" "github.com/gliderlabs/ssh" + "github.com/prometheus/client_golang/prometheus" "github.com/spf13/afero" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -33,7 +34,7 @@ func TestServer_X11(t *testing.T) { logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug) fs := afero.NewOsFs() dir := t.TempDir() - s, err := agentssh.NewServer(ctx, logger, fs, 0, dir) + s, err := agentssh.NewServer(ctx, logger, prometheus.NewRegistry(), fs, 0, dir) require.NoError(t, err) defer s.Close() From 6eec4d7692e48942f727184c49bed741a8b8ccd6 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 18 May 2023 20:29:18 +0200 Subject: [PATCH 16/38] fix --- agent/agent.go | 8 +++++--- agent/agentssh/agentssh.go | 2 -- cli/agent.go | 12 +++++++++--- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 293c77a3b1649..4fe6ff5706d8a 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -105,8 +105,10 @@ func New(options Options) Agent { return "", nil } } - if options.PrometheusRegistry == nil { - options.PrometheusRegistry = prometheus.NewRegistry() + + prometheusRegistry := options.PrometheusRegistry + if prometheusRegistry == nil { + prometheusRegistry = prometheus.NewRegistry() } ctx, cancelFunc := context.WithCancel(context.Background()) @@ -129,7 +131,7 @@ func New(options Options) Agent { sshMaxTimeout: options.SSHMaxTimeout, subsystem: options.Subsystem, - prometheusRegistry: options.PrometheusRegistry, + prometheusRegistry: prometheusRegistry, metrics: newAgentMetrics(options.PrometheusRegistry), } a.init(ctx) diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index 6669979e11e2c..7ade72cf370d1 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -71,8 +71,6 @@ type Server struct { connCountJetBrains atomic.Int64 connCountSSHSession atomic.Int64 - prometheusRegistry *prometheus.Registry - metrics *sshServerMetrics } diff --git a/cli/agent.go b/cli/agent.go index db6b62ec4711a..e344d2eb45742 100644 --- a/cli/agent.go +++ b/cli/agent.go @@ -20,6 +20,9 @@ import ( "gopkg.in/natefinch/lumberjack.v2" "tailscale.com/util/clientmetric" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/expfmt" + "cdr.dev/slog" "cdr.dev/slog/sloggers/sloghuman" "github.com/coder/coder/agent" @@ -28,8 +31,6 @@ import ( "github.com/coder/coder/cli/clibase" "github.com/coder/coder/codersdk" "github.com/coder/coder/codersdk/agentsdk" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/common/expfmt" ) func (r *RootCmd) workspaceAgent() *clibase.Cmd { @@ -400,7 +401,12 @@ func prometheusMetricsHandler(prometheusRegistry *prometheus.Registry, logger sl } for _, metricFamily := range metricFamilies { - expfmt.MetricFamilyToText(w, metricFamily) + _, err = expfmt.MetricFamilyToText(w, metricFamily) + if err != nil { + logger.Error(context.Background(), "expfmt.MetricFamilyToText failed", slog.Error(err)) + return + + } } }) } From a059edfec6ecc3bff6eab64e3f029d0c6b678c61 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 18 May 2023 20:49:03 +0200 Subject: [PATCH 17/38] fix --- cli/agent.go | 1 - 1 file changed, 1 deletion(-) diff --git a/cli/agent.go b/cli/agent.go index e344d2eb45742..911e014a0de70 100644 --- a/cli/agent.go +++ b/cli/agent.go @@ -405,7 +405,6 @@ func prometheusMetricsHandler(prometheusRegistry *prometheus.Registry, logger sl if err != nil { logger.Error(context.Background(), "expfmt.MetricFamilyToText failed", slog.Error(err)) return - } } }) From c004c043b2c26d777258caabe0f0df8f2aeb74ca Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 18 May 2023 20:59:12 +0200 Subject: [PATCH 18/38] fix --- agent/agent.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent/agent.go b/agent/agent.go index 4fe6ff5706d8a..0ca7988de2110 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -132,7 +132,7 @@ func New(options Options) Agent { subsystem: options.Subsystem, prometheusRegistry: prometheusRegistry, - metrics: newAgentMetrics(options.PrometheusRegistry), + metrics: newAgentMetrics(prometheusRegistry), } a.init(ctx) return a From 9b0e31a033bcb6cb8f4ae2736cc24f5e307c8f64 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 19 May 2023 11:14:37 +0200 Subject: [PATCH 19/38] Address PR comments --- agent/agentssh/agentssh_internal_test.go | 3 ++- agent/agentssh/metrics.go | 7 +++++++ cli/agent.go | 3 +-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/agent/agentssh/agentssh_internal_test.go b/agent/agentssh/agentssh_internal_test.go index fe46632c9f237..cb0e4e6741cce 100644 --- a/agent/agentssh/agentssh_internal_test.go +++ b/agent/agentssh/agentssh_internal_test.go @@ -58,10 +58,11 @@ func Test_sessionStart_orphan(t *testing.T) { done := make(chan struct{}) go func() { defer close(done) + + m := metricsForSession(s.metrics.sessions, "ssh") // we don't really care what the error is here. In the larger scenario, // the client has disconnected, so we can't return any error information // to them. - m := metricsForSession(s.metrics.sessions, "ssh") _ = s.startPTYSession(sess, m, cmd, ptyInfo, windowSize) }() diff --git a/agent/agentssh/metrics.go b/agent/agentssh/metrics.go index 012a0af47924d..978eed8b43d8a 100644 --- a/agent/agentssh/metrics.go +++ b/agent/agentssh/metrics.go @@ -20,6 +20,7 @@ type sshServerMetrics struct { // X11 x11SocketDirError prometheus.Counter + x11HostnameError prometheus.Counter x11XauthorityError prometheus.Counter sessions sessionMetrics @@ -86,6 +87,11 @@ func newSSHServerMetrics(registerer prometheus.Registerer) *sshServerMetrics { }) registerer.MustRegister(sftpServerError) + x11HostnameError := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "ssh_server", Name: "x11_hostname_error", + }) + registerer.MustRegister(x11HostnameError) + x11SocketDirError := prometheus.NewCounter(prometheus.CounterOpts{ Namespace: "agent", Subsystem: "ssh_server", Name: "x11_socket_dir_error", }) @@ -106,6 +112,7 @@ func newSSHServerMetrics(registerer prometheus.Registerer) *sshServerMetrics { x11Callback: x11Callback, sftpHandler: sftpHandler, sftpServerError: sftpServerError, + x11HostnameError: x11HostnameError, x11SocketDirError: x11SocketDirError, x11XauthorityError: x11XauthorityError, diff --git a/cli/agent.go b/cli/agent.go index 911e014a0de70..0bf045c63aadf 100644 --- a/cli/agent.go +++ b/cli/agent.go @@ -387,11 +387,10 @@ func urlPort(u string) (int, error) { } func prometheusMetricsHandler(prometheusRegistry *prometheus.Registry, logger slog.Logger) http.Handler { - // We don't have any other internal metrics so far, so it's safe to expose metrics this way. - // Based on: https://github.com/tailscale/tailscale/blob/280255acae604796a1113861f5a84e6fa2dc6121/ipn/localapi/localapi.go#L489 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain") + // Based on: https://github.com/tailscale/tailscale/blob/280255acae604796a1113861f5a84e6fa2dc6121/ipn/localapi/localapi.go#L489 clientmetric.WritePrometheusExpositionFormat(w) metricFamilies, err := prometheusRegistry.Gather() From 7d4ccce30cd94fa93df168a7d7e3b28f23073592 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 19 May 2023 12:25:07 +0200 Subject: [PATCH 20/38] x11HostnameError --- agent/agentssh/x11.go | 1 + 1 file changed, 1 insertion(+) diff --git a/agent/agentssh/x11.go b/agent/agentssh/x11.go index fe072e0f3fdd1..01eba086286c7 100644 --- a/agent/agentssh/x11.go +++ b/agent/agentssh/x11.go @@ -29,6 +29,7 @@ func (s *Server) x11Callback(ctx ssh.Context, x11 ssh.X11) bool { hostname, err := os.Hostname() if err != nil { s.logger.Warn(ctx, "failed to get hostname", slog.Error(err)) + s.metrics.x11HostnameError.Add(1) return false } From 90b351d168488f3c1cc27f1024bcd48292401e22 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 19 May 2023 12:28:02 +0200 Subject: [PATCH 21/38] Remove callbacks --- agent/agentssh/agentssh.go | 3 --- agent/agentssh/metrics.go | 43 +++++++------------------------------- agent/agentssh/x11.go | 2 -- 3 files changed, 7 insertions(+), 41 deletions(-) diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index 7ade72cf370d1..e599043b299a2 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -122,11 +122,9 @@ func NewServer(ctx context.Context, logger slog.Logger, prometheusRegistry *prom s.logger.Debug(ctx, "local port forward", slog.F("destination-host", destinationHost), slog.F("destination-port", destinationPort)) - metrics.localPortForwardingCallback.Add(1) return true }, PtyCallback: func(ctx ssh.Context, pty ssh.Pty) bool { - metrics.ptyCallback.Add(1) return true }, ReversePortForwardingCallback: func(ctx ssh.Context, bindHost string, bindPort uint32) bool { @@ -134,7 +132,6 @@ func NewServer(ctx context.Context, logger slog.Logger, prometheusRegistry *prom s.logger.Debug(ctx, "local port forward", slog.F("bind-host", bindHost), slog.F("bind-port", bindPort)) - metrics.reversePortForwardingCallback.Add(1) return true }, RequestHandlers: map[string]ssh.RequestHandler{ diff --git a/agent/agentssh/metrics.go b/agent/agentssh/metrics.go index 978eed8b43d8a..b74ed6119f83e 100644 --- a/agent/agentssh/metrics.go +++ b/agent/agentssh/metrics.go @@ -7,12 +7,7 @@ import ( ) type sshServerMetrics struct { - // SSH callbacks - connectionFailedCallback prometheus.Counter - localPortForwardingCallback prometheus.Counter - ptyCallback prometheus.Counter - reversePortForwardingCallback prometheus.Counter - x11Callback prometheus.Counter + connectionFailedCallback prometheus.Counter // SFTP sftpHandler prometheus.Counter @@ -57,26 +52,6 @@ func newSSHServerMetrics(registerer prometheus.Registerer) *sshServerMetrics { }) registerer.MustRegister(connectionFailedCallback) - localPortForwardingCallback := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "ssh_server", Name: "local_port_forwarding_callback", - }) - registerer.MustRegister(localPortForwardingCallback) - - ptyCallback := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "ssh_server", Name: "pty_callback", - }) - registerer.MustRegister(ptyCallback) - - reversePortForwardingCallback := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "ssh_server", Name: "reverse_port_forwarding_callback", - }) - registerer.MustRegister(reversePortForwardingCallback) - - x11Callback := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "ssh_server", Name: "x11_callback", - }) - registerer.MustRegister(x11Callback) - sftpHandler := prometheus.NewCounter(prometheus.CounterOpts{ Namespace: "agent", Subsystem: "ssh_server", Name: "sftp_handler", }) @@ -105,16 +80,12 @@ func newSSHServerMetrics(registerer prometheus.Registerer) *sshServerMetrics { sessions := newSessionMetrics(registerer) return &sshServerMetrics{ - connectionFailedCallback: connectionFailedCallback, - localPortForwardingCallback: localPortForwardingCallback, - ptyCallback: ptyCallback, - reversePortForwardingCallback: reversePortForwardingCallback, - x11Callback: x11Callback, - sftpHandler: sftpHandler, - sftpServerError: sftpServerError, - x11HostnameError: x11HostnameError, - x11SocketDirError: x11SocketDirError, - x11XauthorityError: x11XauthorityError, + connectionFailedCallback: connectionFailedCallback, + sftpHandler: sftpHandler, + sftpServerError: sftpServerError, + x11HostnameError: x11HostnameError, + x11SocketDirError: x11SocketDirError, + x11XauthorityError: x11XauthorityError, sessions: sessions, } diff --git a/agent/agentssh/x11.go b/agent/agentssh/x11.go index 01eba086286c7..e473bc52db495 100644 --- a/agent/agentssh/x11.go +++ b/agent/agentssh/x11.go @@ -24,8 +24,6 @@ import ( // x11Callback is called when the client requests X11 forwarding. // It adds an Xauthority entry to the Xauthority file. func (s *Server) x11Callback(ctx ssh.Context, x11 ssh.X11) bool { - s.metrics.x11Callback.Add(1) - hostname, err := os.Hostname() if err != nil { s.logger.Warn(ctx, "failed to get hostname", slog.Error(err)) From 1773c2486faf9996a4bcc829c7118050d5b90bef Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 19 May 2023 12:30:05 +0200 Subject: [PATCH 22/38] failedConnectionsTotal --- agent/agentssh/agentssh.go | 2 +- agent/agentssh/metrics.go | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index e599043b299a2..6198f377fad68 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -113,7 +113,7 @@ func NewServer(ctx context.Context, logger slog.Logger, prometheusRegistry *prom }, ConnectionFailedCallback: func(_ net.Conn, err error) { s.logger.Warn(ctx, "ssh connection failed", slog.Error(err)) - metrics.connectionFailedCallback.Add(1) + metrics.failedConnectionsTotal.Add(1) }, Handler: s.sessionHandler, HostSigners: []ssh.Signer{randomSigner}, diff --git a/agent/agentssh/metrics.go b/agent/agentssh/metrics.go index b74ed6119f83e..27cca3100c946 100644 --- a/agent/agentssh/metrics.go +++ b/agent/agentssh/metrics.go @@ -7,7 +7,7 @@ import ( ) type sshServerMetrics struct { - connectionFailedCallback prometheus.Counter + failedConnectionsTotal prometheus.Counter // SFTP sftpHandler prometheus.Counter @@ -47,10 +47,10 @@ type sessionMetricsObject struct { type sessionMetrics map[string]sessionMetricsObject func newSSHServerMetrics(registerer prometheus.Registerer) *sshServerMetrics { - connectionFailedCallback := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "ssh_server", Name: "connection_failed_callback", + failedConnectionsTotal := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "ssh_server", Name: "failed_connections_total", }) - registerer.MustRegister(connectionFailedCallback) + registerer.MustRegister(failedConnectionsTotal) sftpHandler := prometheus.NewCounter(prometheus.CounterOpts{ Namespace: "agent", Subsystem: "ssh_server", Name: "sftp_handler", @@ -80,12 +80,12 @@ func newSSHServerMetrics(registerer prometheus.Registerer) *sshServerMetrics { sessions := newSessionMetrics(registerer) return &sshServerMetrics{ - connectionFailedCallback: connectionFailedCallback, - sftpHandler: sftpHandler, - sftpServerError: sftpServerError, - x11HostnameError: x11HostnameError, - x11SocketDirError: x11SocketDirError, - x11XauthorityError: x11XauthorityError, + failedConnectionsTotal: failedConnectionsTotal, + sftpHandler: sftpHandler, + sftpServerError: sftpServerError, + x11HostnameError: x11HostnameError, + x11SocketDirError: x11SocketDirError, + x11XauthorityError: x11XauthorityError, sessions: sessions, } From 5ac27b75c3711bc44d68cbe475a36d88600c1f68 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 19 May 2023 12:32:28 +0200 Subject: [PATCH 23/38] connectionsTotal --- agent/agent.go | 2 +- agent/metrics.go | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 0ca7988de2110..1c09c5723f0e7 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -998,7 +998,7 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, msg codersdk.WorkspaceAgentReconnectingPTYInit, conn net.Conn) (retErr error) { defer conn.Close() - a.metrics.handler.Add(1) + a.metrics.connectionsTotal.Add(1) a.connCountReconnectingPTY.Add(1) defer a.connCountReconnectingPTY.Add(-1) diff --git a/agent/metrics.go b/agent/metrics.go index ea82df42ab028..1f3570ed5291d 100644 --- a/agent/metrics.go +++ b/agent/metrics.go @@ -14,7 +14,7 @@ import ( ) type agentMetrics struct { - handler prometheus.Counter + connectionsTotal prometheus.Counter handlerError prometheus.Counter createCommandError prometheus.Counter cmdStartError prometheus.Counter @@ -25,10 +25,10 @@ type agentMetrics struct { } func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics { - handler := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "reconnecting_pty", Name: "handler", + connectionsTotal := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "reconnecting_pty", Name: "connections_total", }) - registerer.MustRegister(handler) + registerer.MustRegister(connectionsTotal) handlerError := prometheus.NewCounter(prometheus.CounterOpts{ Namespace: "agent", Subsystem: "reconnecting_pty", Name: "error", @@ -66,7 +66,7 @@ func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics { registerer.MustRegister(inputWriterError) return &agentMetrics{ - handler: handler, + connectionsTotal: connectionsTotal, handlerError: handlerError, createCommandError: createCommandError, cmdStartError: cmdStartError, From 6eb1a95e34d41f77be70aede38652e14983841e1 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 19 May 2023 12:34:39 +0200 Subject: [PATCH 24/38] sftpConnectionsTotal --- agent/agentssh/agentssh.go | 2 +- agent/agentssh/metrics.go | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index 6198f377fad68..44121b5321d11 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -408,7 +408,7 @@ func (s *Server) startPTYSession(session ptySession, m sessionMetricsObject, cmd } func (s *Server) sftpHandler(session ssh.Session) { - s.metrics.sftpHandler.Add(1) + s.metrics.sftpConnectionsTotal.Add(1) ctx := session.Context() diff --git a/agent/agentssh/metrics.go b/agent/agentssh/metrics.go index 27cca3100c946..857b51909de12 100644 --- a/agent/agentssh/metrics.go +++ b/agent/agentssh/metrics.go @@ -10,8 +10,8 @@ type sshServerMetrics struct { failedConnectionsTotal prometheus.Counter // SFTP - sftpHandler prometheus.Counter - sftpServerError prometheus.Counter + sftpConnectionsTotal prometheus.Counter + sftpServerError prometheus.Counter // X11 x11SocketDirError prometheus.Counter @@ -52,10 +52,10 @@ func newSSHServerMetrics(registerer prometheus.Registerer) *sshServerMetrics { }) registerer.MustRegister(failedConnectionsTotal) - sftpHandler := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "ssh_server", Name: "sftp_handler", + sftpConnectionsTotal := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "ssh_server", Name: "sftp_connections_total", }) - registerer.MustRegister(sftpHandler) + registerer.MustRegister(sftpConnectionsTotal) sftpServerError := prometheus.NewCounter(prometheus.CounterOpts{ Namespace: "agent", Subsystem: "ssh_server", Name: "sftp_server_error", @@ -81,7 +81,7 @@ func newSSHServerMetrics(registerer prometheus.Registerer) *sshServerMetrics { return &sshServerMetrics{ failedConnectionsTotal: failedConnectionsTotal, - sftpHandler: sftpHandler, + sftpConnectionsTotal: sftpConnectionsTotal, sftpServerError: sftpServerError, x11HostnameError: x11HostnameError, x11SocketDirError: x11SocketDirError, From e3d7493a752f485789c954906c82ac706b273bdf Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 19 May 2023 12:36:28 +0200 Subject: [PATCH 25/38] sessionError --- agent/agentssh/agentssh.go | 2 -- agent/agentssh/metrics.go | 7 ------- 2 files changed, 9 deletions(-) diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index 44121b5321d11..72d79493025cf 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -206,7 +206,6 @@ func (s *Server) sessionHandler(session ssh.Session) { var exitError *exec.ExitError if xerrors.As(err, &exitError) { s.logger.Warn(ctx, "ssh session returned", slog.Error(exitError)) - m.sessionError.Add(1) _ = session.Exit(exitError.ExitCode()) return } @@ -214,7 +213,6 @@ func (s *Server) sessionHandler(session ssh.Session) { s.logger.Warn(ctx, "ssh session failed", slog.Error(err)) // This exit code is designed to be unlikely to be confused for a legit exit code // from the process. - m.sessionError.Add(1) _ = session.Exit(MagicSessionErrorCode) return } diff --git a/agent/agentssh/metrics.go b/agent/agentssh/metrics.go index 857b51909de12..853a52b1f4300 100644 --- a/agent/agentssh/metrics.go +++ b/agent/agentssh/metrics.go @@ -27,7 +27,6 @@ type sessionMetricsObject struct { agentListenerError prometheus.Counter startPTYSession prometheus.Counter startNonPTYSession prometheus.Counter - sessionError prometheus.Counter // Non-PTY sessions nonPTYStdinPipeError prometheus.Counter @@ -114,11 +113,6 @@ func newSessionMetrics(registerer prometheus.Registerer) sessionMetrics { }) registerer.MustRegister(startNonPTYSession) - sessionError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "error", - }) - registerer.MustRegister(sessionError) - nonPTYStdinPipeError := prometheus.NewCounter(prometheus.CounterOpts{ Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "non_pty_stdin_pipe_error", }) @@ -174,7 +168,6 @@ func newSessionMetrics(registerer prometheus.Registerer) sessionMetrics { agentListenerError: agentListenerError, startPTYSession: startPTYSession, startNonPTYSession: startNonPTYSession, - sessionError: sessionError, nonPTYStdinPipeError: nonPTYStdinPipeError, nonPTYStdinIoCopyError: nonPTYStdinIoCopyError, From 96204527180772cbe3e31dea0454acd5a6db8ffb Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 19 May 2023 12:56:44 +0200 Subject: [PATCH 26/38] sftpServerErrors --- agent/agentssh/agentssh.go | 2 +- agent/agentssh/metrics.go | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index 72d79493025cf..c598363bb5cb3 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -447,7 +447,7 @@ func (s *Server) sftpHandler(session ssh.Session) { return } s.logger.Warn(ctx, "sftp server closed with error", slog.Error(err)) - s.metrics.sftpServerError.Add(1) + s.metrics.sftpServerErrors.Add(1) _ = session.Exit(1) } diff --git a/agent/agentssh/metrics.go b/agent/agentssh/metrics.go index 853a52b1f4300..6cb9c675f8d19 100644 --- a/agent/agentssh/metrics.go +++ b/agent/agentssh/metrics.go @@ -11,7 +11,7 @@ type sshServerMetrics struct { // SFTP sftpConnectionsTotal prometheus.Counter - sftpServerError prometheus.Counter + sftpServerErrors prometheus.Counter // X11 x11SocketDirError prometheus.Counter @@ -56,10 +56,10 @@ func newSSHServerMetrics(registerer prometheus.Registerer) *sshServerMetrics { }) registerer.MustRegister(sftpConnectionsTotal) - sftpServerError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "ssh_server", Name: "sftp_server_error", + sftpServerErrors := prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", Subsystem: "ssh_server", Name: "sftp_server_errors_total", }) - registerer.MustRegister(sftpServerError) + registerer.MustRegister(sftpServerErrors) x11HostnameError := prometheus.NewCounter(prometheus.CounterOpts{ Namespace: "agent", Subsystem: "ssh_server", Name: "x11_hostname_error", @@ -81,7 +81,7 @@ func newSSHServerMetrics(registerer prometheus.Registerer) *sshServerMetrics { return &sshServerMetrics{ failedConnectionsTotal: failedConnectionsTotal, sftpConnectionsTotal: sftpConnectionsTotal, - sftpServerError: sftpServerError, + sftpServerErrors: sftpServerErrors, x11HostnameError: x11HostnameError, x11SocketDirError: x11SocketDirError, x11XauthorityError: x11XauthorityError, From f05466c924db1bf84f1b87b138cf2103443703fb Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 19 May 2023 13:31:09 +0200 Subject: [PATCH 27/38] remove handlerError --- agent/metrics.go | 7 ------- 1 file changed, 7 deletions(-) diff --git a/agent/metrics.go b/agent/metrics.go index 1f3570ed5291d..a846c6e332562 100644 --- a/agent/metrics.go +++ b/agent/metrics.go @@ -15,7 +15,6 @@ import ( type agentMetrics struct { connectionsTotal prometheus.Counter - handlerError prometheus.Counter createCommandError prometheus.Counter cmdStartError prometheus.Counter writeError prometheus.Counter @@ -30,11 +29,6 @@ func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics { }) registerer.MustRegister(connectionsTotal) - handlerError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "reconnecting_pty", Name: "error", - }) - registerer.MustRegister(handlerError) - createCommandError := prometheus.NewCounter(prometheus.CounterOpts{ Namespace: "agent", Subsystem: "reconnecting_pty", Name: "create_command_error", }) @@ -67,7 +61,6 @@ func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics { return &agentMetrics{ connectionsTotal: connectionsTotal, - handlerError: handlerError, createCommandError: createCommandError, cmdStartError: cmdStartError, writeError: writeError, From 5887ee875d35b18e458df7e67e456e1522f70817 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 19 May 2023 14:36:00 +0200 Subject: [PATCH 28/38] WIP --- agent/agent.go | 17 ++-- agent/agentssh/agentssh.go | 43 ++++---- agent/agentssh/metrics.go | 194 ++++++++----------------------------- agent/agentssh/x11.go | 6 +- agent/metrics.go | 64 +++--------- 5 files changed, 88 insertions(+), 236 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 1c09c5723f0e7..165c73598939c 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1019,7 +1019,6 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m logger.Debug(ctx, "session error after agent close", slog.Error(err)) } else { logger.Error(ctx, "session error", slog.Error(err)) - a.metrics.handlerError.Add(1) } } logger.Debug(ctx, "session closed") @@ -1039,7 +1038,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m // Empty command will default to the users shell! cmd, err := a.sshServer.CreateCommand(ctx, msg.Command, nil) if err != nil { - a.metrics.createCommandError.Add(1) + a.metrics.reconnectingPTYErrors.WithLabelValues("create_command").Add(1) return xerrors.Errorf("create command: %w", err) } cmd.Env = append(cmd.Env, "TERM=xterm-256color") @@ -1052,7 +1051,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m ptty, process, err := pty.Start(cmd) if err != nil { - a.metrics.cmdStartError.Add(1) + a.metrics.reconnectingPTYErrors.WithLabelValues("start_command").Add(1) return xerrors.Errorf("start command: %w", err) } @@ -1083,7 +1082,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m logger.Debug(ctx, "unable to read pty output, command exited?", slog.Error(err)) } else { logger.Warn(ctx, "unable to read pty output, command exited?", slog.Error(err)) - a.metrics.outputReaderError.Add(1) + a.metrics.reconnectingPTYErrors.WithLabelValues("output_reader").Add(1) } break } @@ -1104,7 +1103,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m slog.F("other_conn_id", cid), slog.Error(err), ) - a.metrics.writeError.Add(1) + a.metrics.reconnectingPTYErrors.WithLabelValues("write").Add(1) } } rpty.activeConnsMutex.Unlock() @@ -1124,7 +1123,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m if err != nil { // We can continue after this, it's not fatal! logger.Error(ctx, "resize", slog.Error(err)) - a.metrics.resizeError.Add(1) + a.metrics.reconnectingPTYErrors.WithLabelValues("resize").Add(1) } // Write any previously stored data for the TTY. rpty.circularBufferMutex.RLock() @@ -1137,7 +1136,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m // while also holding circularBufferMutex seems dangerous. _, err = conn.Write(prevBuf) if err != nil { - a.metrics.writeError.Add(1) + a.metrics.reconnectingPTYErrors.WithLabelValues("write").Add(1) return xerrors.Errorf("write buffer to conn: %w", err) } // Multiple connections to the same TTY are permitted. @@ -1188,7 +1187,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m _, err = rpty.ptty.InputWriter().Write([]byte(req.Data)) if err != nil { logger.Warn(ctx, "write to pty", slog.Error(err)) - a.metrics.inputWriterError.Add(1) + a.metrics.reconnectingPTYErrors.WithLabelValues("input_writer").Add(1) return nil } // Check if a resize needs to happen! @@ -1199,7 +1198,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m if err != nil { // We can continue after this, it's not fatal! logger.Error(ctx, "resize", slog.Error(err)) - a.metrics.resizeError.Add(1) + a.metrics.reconnectingPTYErrors.WithLabelValues("resize").Add(1) } } } diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index c598363bb5cb3..8f369aff3ec32 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -201,8 +201,7 @@ func (s *Server) sessionHandler(session ssh.Session) { return } - m := metricsForSession(s.metrics.sessions, magicType(session)) - err := s.sessionStart(session, m, extraEnv) + err := s.sessionStart(session, extraEnv) var exitError *exec.ExitError if xerrors.As(err, &exitError) { s.logger.Warn(ctx, "ssh session returned", slog.Error(exitError)) @@ -229,7 +228,7 @@ func magicType(session ssh.Session) string { return "" } -func (s *Server) sessionStart(session ssh.Session, m sessionMetricsObject, extraEnv []string) (retErr error) { +func (s *Server) sessionStart(session ssh.Session, extraEnv []string) (retErr error) { ctx := session.Context() env := append(session.Environ(), extraEnv...) var magicType string @@ -254,16 +253,18 @@ func (s *Server) sessionStart(session ssh.Session, m sessionMetricsObject, extra s.logger.Warn(ctx, "invalid magic ssh session type specified", slog.F("type", magicType)) } + magicTypeLabel := magicTypeMetricLabel(magicType) + cmd, err := s.CreateCommand(ctx, session.RawCommand(), env) if err != nil { - m.agentCreateCommandError.Add(1) + s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "create_command").Add(1) return err } if ssh.AgentRequested(session) { l, err := ssh.NewAgentListener() if err != nil { - m.agentListenerError.Add(1) + s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "listener").Add(1) return xerrors.Errorf("new agent listener: %w", err) } defer l.Close() @@ -273,13 +274,13 @@ func (s *Server) sessionStart(session ssh.Session, m sessionMetricsObject, extra sshPty, windowSize, isPty := session.Pty() if isPty { - return s.startPTYSession(session, m, cmd, sshPty, windowSize) + return s.startPTYSession(session, magicTypeLabel, cmd, sshPty, windowSize) } - return startNonPTYSession(session, m, cmd.AsExec()) + return s.startNonPTYSession(session, magicTypeLabel, cmd.AsExec()) } -func startNonPTYSession(session ssh.Session, m sessionMetricsObject, cmd *exec.Cmd) error { - m.startNonPTYSession.Add(1) +func (s *Server) startNonPTYSession(session ssh.Session, magicTypeLabel string, cmd *exec.Cmd) error { + s.metrics.sessionsTotal.WithLabelValues(magicTypeLabel, "no").Add(1) cmd.Stdout = session cmd.Stderr = session.Stderr() @@ -287,19 +288,19 @@ func startNonPTYSession(session ssh.Session, m sessionMetricsObject, cmd *exec.C // use StdinPipe. It's unknown what causes this. stdinPipe, err := cmd.StdinPipe() if err != nil { - m.nonPTYStdinPipeError.Add(1) + s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "no", "stdin_pipe").Add(1) return xerrors.Errorf("create stdin pipe: %w", err) } go func() { _, err := io.Copy(stdinPipe, session) if err != nil { - m.nonPTYStdinIoCopyError.Add(1) + s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "no", "stdin_io_copy").Add(1) } _ = stdinPipe.Close() }() err = cmd.Start() if err != nil { - m.nonPTYCmdStartError.Add(1) + s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "no", "start_command").Add(1) return xerrors.Errorf("start: %w", err) } return cmd.Wait() @@ -314,8 +315,8 @@ type ptySession interface { RawCommand() string } -func (s *Server) startPTYSession(session ptySession, m sessionMetricsObject, cmd *pty.Cmd, sshPty ssh.Pty, windowSize <-chan ssh.Window) (retErr error) { - m.startPTYSession.Add(1) +func (s *Server) startPTYSession(session ptySession, magicTypeLabel string, cmd *pty.Cmd, sshPty ssh.Pty, windowSize <-chan ssh.Window) (retErr error) { + s.metrics.sessionsTotal.WithLabelValues(magicTypeLabel, "yes").Add(1) ctx := session.Context() // Disable minimal PTY emulation set by gliderlabs/ssh (NL-to-CRNL). @@ -328,7 +329,7 @@ func (s *Server) startPTYSession(session ptySession, m sessionMetricsObject, cmd err := showMOTD(session, manifest.MOTDFile) if err != nil { s.logger.Error(ctx, "show MOTD", slog.Error(err)) - m.ptyMotdError.Add(1) + s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "yes", "motd").Add(1) } } else { s.logger.Warn(ctx, "metadata lookup failed, unable to show MOTD") @@ -343,14 +344,14 @@ func (s *Server) startPTYSession(session ptySession, m sessionMetricsObject, cmd pty.WithLogger(slog.Stdlib(ctx, s.logger, slog.LevelInfo)), )) if err != nil { - m.ptyCmdStartError.Add(1) + s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "yes", "start_command").Add(1) return xerrors.Errorf("start command: %w", err) } defer func() { closeErr := ptty.Close() if closeErr != nil { s.logger.Warn(ctx, "failed to close tty", slog.Error(closeErr)) - m.ptyCloseError.Add(1) + s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "yes", "close").Add(1) if retErr == nil { retErr = closeErr } @@ -362,7 +363,7 @@ func (s *Server) startPTYSession(session ptySession, m sessionMetricsObject, cmd // If the pty is closed, then command has exited, no need to log. if resizeErr != nil && !errors.Is(resizeErr, pty.ErrClosed) { s.logger.Warn(ctx, "failed to resize tty", slog.Error(resizeErr)) - m.ptyResizeError.Add(1) + s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "yes", "resize").Add(1) } } }() @@ -370,7 +371,7 @@ func (s *Server) startPTYSession(session ptySession, m sessionMetricsObject, cmd go func() { _, err := io.Copy(ptty.InputWriter(), session) if err != nil { - m.ptyInputIoCopyError.Add(1) + s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "yes", "input_io_copy").Add(1) } }() @@ -385,7 +386,7 @@ func (s *Server) startPTYSession(session ptySession, m sessionMetricsObject, cmd n, err := io.Copy(session, ptty.OutputReader()) s.logger.Debug(ctx, "copy output done", slog.F("bytes", n), slog.Error(err)) if err != nil { - m.ptyOutputIoCopyError.Add(1) + s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "yes", "output_io_copy").Add(1) return xerrors.Errorf("copy error: %w", err) } // We've gotten all the output, but we need to wait for the process to @@ -397,7 +398,7 @@ func (s *Server) startPTYSession(session ptySession, m sessionMetricsObject, cmd // and not something to be concerned about. But, if it's something else, we should log it. if err != nil && !xerrors.As(err, &exitErr) { s.logger.Warn(ctx, "wait error", slog.Error(err)) - m.ptyWaitError.Add(1) + s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "yes", "wait").Add(1) } if err != nil { return xerrors.Errorf("process wait: %w", err) diff --git a/agent/agentssh/metrics.go b/agent/agentssh/metrics.go index 6cb9c675f8d19..88ee100d65d9a 100644 --- a/agent/agentssh/metrics.go +++ b/agent/agentssh/metrics.go @@ -1,50 +1,18 @@ package agentssh import ( - "fmt" - "github.com/prometheus/client_golang/prometheus" ) type sshServerMetrics struct { failedConnectionsTotal prometheus.Counter - - // SFTP - sftpConnectionsTotal prometheus.Counter - sftpServerErrors prometheus.Counter - - // X11 - x11SocketDirError prometheus.Counter - x11HostnameError prometheus.Counter - x11XauthorityError prometheus.Counter - - sessions sessionMetrics + sftpConnectionsTotal prometheus.Counter + sftpServerErrors prometheus.Counter + x11HandlerErrors *prometheus.CounterVec + sessionsTotal *prometheus.CounterVec + sessionErrors *prometheus.CounterVec } -type sessionMetricsObject struct { - // Agent sessions - agentCreateCommandError prometheus.Counter - agentListenerError prometheus.Counter - startPTYSession prometheus.Counter - startNonPTYSession prometheus.Counter - - // Non-PTY sessions - nonPTYStdinPipeError prometheus.Counter - nonPTYStdinIoCopyError prometheus.Counter - nonPTYCmdStartError prometheus.Counter - - // PTY sessions - ptyMotdError prometheus.Counter - ptyCmdStartError prometheus.Counter - ptyCloseError prometheus.Counter - ptyResizeError prometheus.Counter - ptyInputIoCopyError prometheus.Counter - ptyOutputIoCopyError prometheus.Counter - ptyWaitError prometheus.Counter -} - -type sessionMetrics map[string]sessionMetricsObject - func newSSHServerMetrics(registerer prometheus.Registerer) *sshServerMetrics { failedConnectionsTotal := prometheus.NewCounter(prometheus.CounterOpts{ Namespace: "agent", Subsystem: "ssh_server", Name: "failed_connections_total", @@ -61,131 +29,47 @@ func newSSHServerMetrics(registerer prometheus.Registerer) *sshServerMetrics { }) registerer.MustRegister(sftpServerErrors) - x11HostnameError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "ssh_server", Name: "x11_hostname_error", - }) - registerer.MustRegister(x11HostnameError) - - x11SocketDirError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "ssh_server", Name: "x11_socket_dir_error", - }) - registerer.MustRegister(x11SocketDirError) - - x11XauthorityError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "ssh_server", Name: "x11_xauthority_error", - }) - registerer.MustRegister(x11XauthorityError) - - sessions := newSessionMetrics(registerer) + x11HandlerErrors := prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "agent", + Subsystem: "x11_handler", + Name: "errors_total", + }, + []string{"error_type"}, + ) + registerer.MustRegister(x11HandlerErrors) + + sessionsTotal := prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "agent", + Subsystem: "sessions", + Name: "total", + }, + []string{"magic_type", "pty"}, + ) + registerer.MustRegister(sessionsTotal) + + sessionErrors := prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "agent", + Subsystem: "sessions", + Name: "errors_total", + }, + []string{"magic_type", "pty", "error_type"}, + ) + registerer.MustRegister(sessionErrors) return &sshServerMetrics{ failedConnectionsTotal: failedConnectionsTotal, sftpConnectionsTotal: sftpConnectionsTotal, sftpServerErrors: sftpServerErrors, - x11HostnameError: x11HostnameError, - x11SocketDirError: x11SocketDirError, - x11XauthorityError: x11XauthorityError, - - sessions: sessions, - } -} - -func newSessionMetrics(registerer prometheus.Registerer) sessionMetrics { - sm := sessionMetrics{} - for _, magicType := range []string{MagicSessionTypeVSCode, MagicSessionTypeJetBrains, "ssh", "unknown"} { - agentCreateCommandError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "create_command_error", - }) - registerer.MustRegister(agentCreateCommandError) - - agentListenerError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "listener_error", - }) - registerer.MustRegister(agentListenerError) - - startPTYSession := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "start_pty_session", - }) - registerer.MustRegister(startPTYSession) - - startNonPTYSession := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "start_non_pty_session", - }) - registerer.MustRegister(startNonPTYSession) - - nonPTYStdinPipeError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "non_pty_stdin_pipe_error", - }) - registerer.MustRegister(nonPTYStdinPipeError) - - nonPTYStdinIoCopyError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "non_pty_io_copy_error", - }) - registerer.MustRegister(nonPTYStdinIoCopyError) - - nonPTYCmdStartError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "non_pty_io_start_error", - }) - registerer.MustRegister(nonPTYCmdStartError) - - ptyMotdError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "pty_motd_error", - }) - registerer.MustRegister(ptyMotdError) - - ptyCmdStartError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "pty_cmd_start_error", - }) - registerer.MustRegister(ptyCmdStartError) - - ptyCloseError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "pty_close_error", - }) - registerer.MustRegister(ptyCloseError) - - ptyResizeError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "pty_resize_error", - }) - registerer.MustRegister(ptyResizeError) - - ptyInputIoCopyError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "pty_input_io_copy_error", - }) - registerer.MustRegister(ptyInputIoCopyError) - - ptyOutputIoCopyError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "pty_output_io_copy_error", - }) - registerer.MustRegister(ptyOutputIoCopyError) - - ptyWaitError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: fmt.Sprintf("sessions_%s", magicType), Name: "pty_wait_error", - }) - registerer.MustRegister(ptyWaitError) - - sm[magicType] = sessionMetricsObject{ - agentCreateCommandError: agentCreateCommandError, - agentListenerError: agentListenerError, - startPTYSession: startPTYSession, - startNonPTYSession: startNonPTYSession, - - nonPTYStdinPipeError: nonPTYStdinPipeError, - nonPTYStdinIoCopyError: nonPTYStdinIoCopyError, - nonPTYCmdStartError: nonPTYCmdStartError, - - ptyMotdError: ptyMotdError, - ptyCmdStartError: ptyCmdStartError, - ptyCloseError: ptyCloseError, - ptyResizeError: ptyResizeError, - ptyInputIoCopyError: ptyInputIoCopyError, - ptyOutputIoCopyError: ptyOutputIoCopyError, - ptyWaitError: ptyWaitError, - } + x11HandlerErrors: x11HandlerErrors, + sessionsTotal: sessionsTotal, + sessionErrors: sessionErrors, } - return sm } -func metricsForSession(m sessionMetrics, magicType string) sessionMetricsObject { +func magicTypeMetricLabel(magicType string) string { switch magicType { case MagicSessionTypeVSCode: case MagicSessionTypeJetBrains: @@ -194,5 +78,5 @@ func metricsForSession(m sessionMetrics, magicType string) sessionMetricsObject default: magicType = "unknown" } - return m[magicType] + return magicType } diff --git a/agent/agentssh/x11.go b/agent/agentssh/x11.go index e473bc52db495..00c2819cc0155 100644 --- a/agent/agentssh/x11.go +++ b/agent/agentssh/x11.go @@ -27,21 +27,21 @@ func (s *Server) x11Callback(ctx ssh.Context, x11 ssh.X11) bool { hostname, err := os.Hostname() if err != nil { s.logger.Warn(ctx, "failed to get hostname", slog.Error(err)) - s.metrics.x11HostnameError.Add(1) + s.metrics.x11HandlerErrors.WithLabelValues("hostname").Add(1) return false } err = s.fs.MkdirAll(s.x11SocketDir, 0o700) if err != nil { s.logger.Warn(ctx, "failed to make the x11 socket dir", slog.F("dir", s.x11SocketDir), slog.Error(err)) - s.metrics.x11SocketDirError.Add(1) + s.metrics.x11HandlerErrors.WithLabelValues("socker_dir").Add(1) return false } err = addXauthEntry(ctx, s.fs, hostname, strconv.Itoa(int(x11.ScreenNumber)), x11.AuthProtocol, x11.AuthCookie) if err != nil { s.logger.Warn(ctx, "failed to add Xauthority entry", slog.Error(err)) - s.metrics.x11XauthorityError.Add(1) + s.metrics.x11HandlerErrors.WithLabelValues("xauthority").Add(1) return false } return true diff --git a/agent/metrics.go b/agent/metrics.go index a846c6e332562..e65d353492912 100644 --- a/agent/metrics.go +++ b/agent/metrics.go @@ -8,19 +8,12 @@ import ( "github.com/prometheus/client_golang/prometheus" "tailscale.com/util/clientmetric" - "cdr.dev/slog" - "github.com/coder/coder/codersdk/agentsdk" ) type agentMetrics struct { - connectionsTotal prometheus.Counter - createCommandError prometheus.Counter - cmdStartError prometheus.Counter - writeError prometheus.Counter - resizeError prometheus.Counter - outputReaderError prometheus.Counter - inputWriterError prometheus.Counter + connectionsTotal prometheus.Counter + reconnectingPTYErrors *prometheus.CounterVec } func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics { @@ -29,44 +22,19 @@ func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics { }) registerer.MustRegister(connectionsTotal) - createCommandError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "reconnecting_pty", Name: "create_command_error", - }) - registerer.MustRegister(createCommandError) - - cmdStartError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "reconnecting_pty", Name: "cmd_start_error", - }) - registerer.MustRegister(cmdStartError) - - writeError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "reconnecting_pty", Name: "write_error", - }) - registerer.MustRegister(writeError) - - resizeError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "reconnecting_pty", Name: "resize_error", - }) - registerer.MustRegister(resizeError) - - outputReaderError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "reconnecting_pty", Name: "output_reader_error", - }) - registerer.MustRegister(outputReaderError) - - inputWriterError := prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "agent", Subsystem: "reconnecting_pty", Name: "input_writer_error", - }) - registerer.MustRegister(inputWriterError) + reconnectingPTYErrors := prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "agent", + Subsystem: "reconnecting_pty", + Name: "errors_total", + }, + []string{"error_type"}, + ) + registerer.MustRegister(reconnectingPTYErrors) return &agentMetrics{ - connectionsTotal: connectionsTotal, - createCommandError: createCommandError, - cmdStartError: cmdStartError, - writeError: writeError, - resizeError: resizeError, - outputReaderError: outputReaderError, - inputWriterError: inputWriterError, + connectionsTotal: connectionsTotal, + reconnectingPTYErrors: reconnectingPTYErrors, } } @@ -87,8 +55,8 @@ func (a *agent) collectMetrics(ctx context.Context) []agentsdk.AgentMetric { }) } - // Agent metrics - metricFamilies, err := a.prometheusRegistry.Gather() + // FIXME Agent metrics + /*metricFamilies, err := a.prometheusRegistry.Gather() if err != nil { a.logger.Error(ctx, "can't gather agent metrics", slog.Error(err)) return collected @@ -112,7 +80,7 @@ func (a *agent) collectMetrics(ctx context.Context) []agentsdk.AgentMetric { a.logger.Error(ctx, "unsupported metric type", slog.F("type", metricFamily.Type.String())) } } - } + }*/ return collected } From bb3602bbc53266f53794d2ee0f0aa6438966a93a Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Mon, 22 May 2023 09:09:11 +0200 Subject: [PATCH 29/38] WIP --- agent/agentssh/agentssh_internal_test.go | 3 +- agent/metrics.go | 39 ++++++++++++++++++------ coderd/apidoc/docs.go | 21 +++++++++++++ coderd/apidoc/swagger.json | 18 +++++++++++ codersdk/agentsdk/agentsdk.go | 12 ++++++-- docs/api/schemas.md | 39 +++++++++++++++++++++--- 6 files changed, 113 insertions(+), 19 deletions(-) diff --git a/agent/agentssh/agentssh_internal_test.go b/agent/agentssh/agentssh_internal_test.go index cb0e4e6741cce..8cee9b7a8632c 100644 --- a/agent/agentssh/agentssh_internal_test.go +++ b/agent/agentssh/agentssh_internal_test.go @@ -59,11 +59,10 @@ func Test_sessionStart_orphan(t *testing.T) { go func() { defer close(done) - m := metricsForSession(s.metrics.sessions, "ssh") // we don't really care what the error is here. In the larger scenario, // the client has disconnected, so we can't return any error information // to them. - _ = s.startPTYSession(sess, m, cmd, ptyInfo, windowSize) + _ = s.startPTYSession(sess, "ssh", cmd, ptyInfo, windowSize) }() readDone := make(chan struct{}) diff --git a/agent/metrics.go b/agent/metrics.go index e65d353492912..dc5fb6c018474 100644 --- a/agent/metrics.go +++ b/agent/metrics.go @@ -6,8 +6,11 @@ import ( "strings" "github.com/prometheus/client_golang/prometheus" + prompb "github.com/prometheus/client_model/go" "tailscale.com/util/clientmetric" + "cdr.dev/slog" + "github.com/coder/coder/codersdk/agentsdk" ) @@ -55,8 +58,7 @@ func (a *agent) collectMetrics(ctx context.Context) []agentsdk.AgentMetric { }) } - // FIXME Agent metrics - /*metricFamilies, err := a.prometheusRegistry.Gather() + metricFamilies, err := a.prometheusRegistry.Gather() if err != nil { a.logger.Error(ctx, "can't gather agent metrics", slog.Error(err)) return collected @@ -64,26 +66,45 @@ func (a *agent) collectMetrics(ctx context.Context) []agentsdk.AgentMetric { for _, metricFamily := range metricFamilies { for _, metric := range metricFamily.GetMetric() { + labels := toAgentMetricLabels(metric.Label) + if metric.Counter != nil { collected = append(collected, agentsdk.AgentMetric{ - Name: metricFamily.GetName(), - Type: agentsdk.AgentMetricTypeCounter, - Value: metric.Counter.GetValue(), + Name: metricFamily.GetName(), + Type: agentsdk.AgentMetricTypeCounter, + Value: metric.Counter.GetValue(), + Labels: labels, }) } else if metric.Gauge != nil { collected = append(collected, agentsdk.AgentMetric{ - Name: metricFamily.GetName(), - Type: agentsdk.AgentMetricTypeGauge, - Value: metric.Gauge.GetValue(), + Name: metricFamily.GetName(), + Type: agentsdk.AgentMetricTypeGauge, + Value: metric.Gauge.GetValue(), + Labels: labels, }) } else { a.logger.Error(ctx, "unsupported metric type", slog.F("type", metricFamily.Type.String())) } } - }*/ + } return collected } +func toAgentMetricLabels(metricLabels []*prompb.LabelPair) []agentsdk.AgentMetricLabel { + if len(metricLabels) == 0 { + return nil + } + + labels := make([]agentsdk.AgentMetricLabel, 0, len(metricLabels)) + for _, metricLabel := range metricLabels { + labels = append(labels, agentsdk.AgentMetricLabel{ + Name: metricLabel.GetName(), + Value: metricLabel.GetValue(), + }) + } + return labels +} + // isIgnoredMetric checks if the metric should be ignored, as Coder agent doesn't use related features. // Expected metric families: magicsock_*, derp_*, tstun_*, netcheck_*, portmap_*, etc. func isIgnoredMetric(metricName string) bool { diff --git a/coderd/apidoc/docs.go b/coderd/apidoc/docs.go index d58afadd533d9..5494fa4542564 100644 --- a/coderd/apidoc/docs.go +++ b/coderd/apidoc/docs.go @@ -5770,6 +5770,12 @@ const docTemplate = `{ "value" ], "properties": { + "labels": { + "type": "array", + "items": { + "$ref": "#/definitions/agentsdk.AgentMetricLabel" + } + }, "name": { "type": "string" }, @@ -5789,6 +5795,21 @@ const docTemplate = `{ } } }, + "agentsdk.AgentMetricLabel": { + "type": "object", + "required": [ + "name", + "value" + ], + "properties": { + "name": { + "type": "string" + }, + "value": { + "type": "string" + } + } + }, "agentsdk.AgentMetricType": { "type": "string", "enum": [ diff --git a/coderd/apidoc/swagger.json b/coderd/apidoc/swagger.json index dd75cdec2f03d..c15abd76c6588 100644 --- a/coderd/apidoc/swagger.json +++ b/coderd/apidoc/swagger.json @@ -5076,6 +5076,12 @@ "type": "object", "required": ["name", "type", "value"], "properties": { + "labels": { + "type": "array", + "items": { + "$ref": "#/definitions/agentsdk.AgentMetricLabel" + } + }, "name": { "type": "string" }, @@ -5092,6 +5098,18 @@ } } }, + "agentsdk.AgentMetricLabel": { + "type": "object", + "required": ["name", "value"], + "properties": { + "name": { + "type": "string" + }, + "value": { + "type": "string" + } + } + }, "agentsdk.AgentMetricType": { "type": "string", "enum": ["counter", "gauge"], diff --git a/codersdk/agentsdk/agentsdk.go b/codersdk/agentsdk/agentsdk.go index 0a5d30c0f1d9a..a14dbb54ccccc 100644 --- a/codersdk/agentsdk/agentsdk.go +++ b/codersdk/agentsdk/agentsdk.go @@ -496,9 +496,15 @@ const ( ) type AgentMetric struct { - Name string `json:"name" validate:"required"` - Type AgentMetricType `json:"type" validate:"required" enums:"counter,gauge"` - Value float64 `json:"value" validate:"required"` + Name string `json:"name" validate:"required"` + Type AgentMetricType `json:"type" validate:"required" enums:"counter,gauge"` + Value float64 `json:"value" validate:"required"` + Labels []AgentMetricLabel `json:"labels,omitempty"` +} + +type AgentMetricLabel struct { + Name string `json:"name" validate:"required"` + Value string `json:"value" validate:"required"` } type StatsResponse struct { diff --git a/docs/api/schemas.md b/docs/api/schemas.md index 307b83544a449..1f467a923e3e3 100644 --- a/docs/api/schemas.md +++ b/docs/api/schemas.md @@ -20,6 +20,12 @@ ```json { + "labels": [ + { + "name": "string", + "value": "string" + } + ], "name": "string", "type": "counter", "value": 0 @@ -28,11 +34,12 @@ ### Properties -| Name | Type | Required | Restrictions | Description | -| ------- | ---------------------------------------------------- | -------- | ------------ | ----------- | -| `name` | string | true | | | -| `type` | [agentsdk.AgentMetricType](#agentsdkagentmetrictype) | true | | | -| `value` | number | true | | | +| Name | Type | Required | Restrictions | Description | +| -------- | --------------------------------------------------------------- | -------- | ------------ | ----------- | +| `labels` | array of [agentsdk.AgentMetricLabel](#agentsdkagentmetriclabel) | false | | | +| `name` | string | true | | | +| `type` | [agentsdk.AgentMetricType](#agentsdkagentmetrictype) | true | | | +| `value` | number | true | | | #### Enumerated Values @@ -41,6 +48,22 @@ | `type` | `counter` | | `type` | `gauge` | +## agentsdk.AgentMetricLabel + +```json +{ + "name": "string", + "value": "string" +} +``` + +### Properties + +| Name | Type | Required | Restrictions | Description | +| ------- | ------ | -------- | ------------ | ----------- | +| `name` | string | true | | | +| `value` | string | true | | | + ## agentsdk.AgentMetricType ```json @@ -370,6 +393,12 @@ }, "metrics": [ { + "labels": [ + { + "name": "string", + "value": "string" + } + ], "name": "string", "type": "counter", "value": 0 From 27fc9a0459dfd51d5d3b36dd63d0f3f352b5a114 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Mon, 22 May 2023 13:58:19 +0200 Subject: [PATCH 30/38] WIP --- coderd/prometheusmetrics/aggregator.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/coderd/prometheusmetrics/aggregator.go b/coderd/prometheusmetrics/aggregator.go index ba3d520468690..80ac205142f44 100644 --- a/coderd/prometheusmetrics/aggregator.go +++ b/coderd/prometheusmetrics/aggregator.go @@ -56,6 +56,7 @@ type annotatedMetric struct { username string workspaceName string agentName string + labels []agentsdk.AgentMetricLabel expiryDate time.Time } @@ -122,7 +123,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { UpdateLoop: for _, m := range req.metrics { for i, q := range ma.queue { - if q.username == req.username && q.workspaceName == req.workspaceName && q.agentName == req.agentName && q.Name == m.Name { + if q.username == req.username && q.workspaceName == req.workspaceName && q.agentName == req.agentName && q.Name == m.Name && q.labels == m.Labels { ma.queue[i].AgentMetric.Value = m.Value ma.queue[i].expiryDate = req.timestamp.Add(ma.metricsCleanupInterval) continue UpdateLoop From 3f4696b7bd3d9a513833ce82b5f8ef318600f333 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Tue, 23 May 2023 13:21:30 +0200 Subject: [PATCH 31/38] Finish impl --- agent/agentssh/agentssh.go | 10 -------- coderd/prometheusmetrics/aggregator.go | 35 +++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index 8f369aff3ec32..18ff830ff3b14 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -218,16 +218,6 @@ func (s *Server) sessionHandler(session ssh.Session) { _ = session.Exit(0) } -func magicType(session ssh.Session) string { - for _, kv := range session.Environ() { - if !strings.HasPrefix(kv, MagicSessionTypeEnvironmentVariable) { - continue - } - return strings.TrimPrefix(kv, MagicSessionTypeEnvironmentVariable+"=") - } - return "" -} - func (s *Server) sessionStart(session ssh.Session, extraEnv []string) (retErr error) { ctx := session.Context() env := append(session.Environ(), extraEnv...) diff --git a/coderd/prometheusmetrics/aggregator.go b/coderd/prometheusmetrics/aggregator.go index 80ac205142f44..5e0164c5038e6 100644 --- a/coderd/prometheusmetrics/aggregator.go +++ b/coderd/prometheusmetrics/aggregator.go @@ -5,6 +5,7 @@ import ( "time" "github.com/prometheus/client_golang/prometheus" + "golang.org/x/exp/slices" "golang.org/x/xerrors" "cdr.dev/slog" @@ -56,7 +57,6 @@ type annotatedMetric struct { username string workspaceName string agentName string - labels []agentsdk.AgentMetricLabel expiryDate time.Time } @@ -123,7 +123,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { UpdateLoop: for _, m := range req.metrics { for i, q := range ma.queue { - if q.username == req.username && q.workspaceName == req.workspaceName && q.agentName == req.agentName && q.Name == m.Name && q.labels == m.Labels { + if q.username == req.username && q.workspaceName == req.workspaceName && q.agentName == req.agentName && q.Name == m.Name && slices.Equal(q.Labels, m.Labels) { ma.queue[i].AgentMetric.Value = m.Value ma.queue[i].expiryDate = req.timestamp.Add(ma.metricsCleanupInterval) continue UpdateLoop @@ -147,13 +147,40 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { output := make([]prometheus.Metric, 0, len(ma.queue)) for _, m := range ma.queue { - desc := prometheus.NewDesc(m.Name, metricHelpForAgent, agentMetricsLabels, nil) + labels := make([]string, len(agentMetricsLabels)+len(m.Labels)) + var i int + for _, l := range agentMetricsLabels { + labels[i] = l + i++ + } + + for _, l := range m.Labels { + labels[i] = l.Name + i++ + } + + desc := prometheus.NewDesc(m.Name, metricHelpForAgent, labels, nil) valueType, err := asPrometheusValueType(m.Type) if err != nil { ma.log.Error(ctx, "can't convert Prometheus value type", slog.F("name", m.Name), slog.F("type", m.Type), slog.F("value", m.Value), slog.Error(err)) continue } - constMetric := prometheus.MustNewConstMetric(desc, valueType, m.Value, m.username, m.workspaceName, m.agentName) + + labelValues := make([]string, len(agentMetricsLabels)+len(m.Labels)) + labelValues[0] = m.username + labelValues[1] = m.workspaceName + labelValues[2] = m.agentName + + if len(m.Labels) > 0 { + i = 3 + + for _, l := range m.Labels { + labelValues[i] = l.Value + i++ + } + } + + constMetric := prometheus.MustNewConstMetric(desc, valueType, m.Value, labelValues...) output = append(output, constMetric) } outputCh <- output From 8cd07f28178b1820e8e25b89245c42ee959acd95 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Tue, 23 May 2023 14:45:07 +0200 Subject: [PATCH 32/38] Aggregator: labels --- coderd/prometheusmetrics/aggregator_test.go | 26 +++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/coderd/prometheusmetrics/aggregator_test.go b/coderd/prometheusmetrics/aggregator_test.go index 68b5f94e464ee..e799caf4a98fe 100644 --- a/coderd/prometheusmetrics/aggregator_test.go +++ b/coderd/prometheusmetrics/aggregator_test.go @@ -44,13 +44,35 @@ func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) { given2 := []agentsdk.AgentMetric{ {Name: "b_counter_two", Type: agentsdk.AgentMetricTypeCounter, Value: 4}, + {Name: "c_gauge_three", Type: agentsdk.AgentMetricTypeGauge, Value: 5}, + {Name: "c_gauge_three", Type: agentsdk.AgentMetricTypeGauge, Value: 2, Labels: []agentsdk.AgentMetricLabel{ + // Metrics are sorted alphabetically, and we have to prepend "z", so common labels (agent name, username, workspace name) go first. + { + Name: "zfoobar", + Value: "Foobaz", + }, + { + Name: "zhello", + Value: "world", + }, + }}, {Name: "d_gauge_four", Type: agentsdk.AgentMetricTypeGauge, Value: 6}, } expected := []agentsdk.AgentMetric{ {Name: "a_counter_one", Type: agentsdk.AgentMetricTypeCounter, Value: 1}, {Name: "b_counter_two", Type: agentsdk.AgentMetricTypeCounter, Value: 4}, - {Name: "c_gauge_three", Type: agentsdk.AgentMetricTypeGauge, Value: 3}, + {Name: "c_gauge_three", Type: agentsdk.AgentMetricTypeGauge, Value: 5}, + {Name: "c_gauge_three", Type: agentsdk.AgentMetricTypeGauge, Value: 2, Labels: []agentsdk.AgentMetricLabel{ + { + Name: "zfoobar", + Value: "Foobaz", + }, + { + Name: "zhello", + Value: "world", + }, + }}, {Name: "d_gauge_four", Type: agentsdk.AgentMetricTypeGauge, Value: 6}, } @@ -92,7 +114,7 @@ func verifyCollectedMetrics(t *testing.T, expected []agentsdk.AgentMetric, actua err := actual[i].Write(&d) require.NoError(t, err) - require.Equal(t, "agent_name", *d.Label[0].Name) + require.Equal(t, "agent_name", *d.Label[0].Name, d.String()) require.Equal(t, testAgentName, *d.Label[0].Value) require.Equal(t, "username", *d.Label[1].Name) require.Equal(t, testUsername, *d.Label[1].Value) From 389dd9f5b6dbd6410ce762f854c93d56e3a0e097 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Tue, 23 May 2023 16:33:36 +0200 Subject: [PATCH 33/38] TestAgent_Metrics_SSH --- agent/agent_test.go | 119 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 116 insertions(+), 3 deletions(-) diff --git a/agent/agent_test.go b/agent/agent_test.go index 1dab56c2c7696..1d4d90a323fff 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -27,6 +27,8 @@ import ( "github.com/google/uuid" "github.com/pion/udp" "github.com/pkg/sftp" + "github.com/prometheus/client_golang/prometheus" + promgo "github.com/prometheus/client_model/go" "github.com/spf13/afero" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -1724,7 +1726,7 @@ func (c closeFunc) Close() error { return c() } -func setupAgent(t *testing.T, metadata agentsdk.Manifest, ptyTimeout time.Duration) ( +func setupAgent(t *testing.T, metadata agentsdk.Manifest, ptyTimeout time.Duration, opts ...func(agent.Options) agent.Options) ( *codersdk.WorkspaceAgentConn, *client, <-chan *agentsdk.Stats, @@ -1749,12 +1751,19 @@ func setupAgent(t *testing.T, metadata agentsdk.Manifest, ptyTimeout time.Durati statsChan: statsCh, coordinator: coordinator, } - closer := agent.New(agent.Options{ + + options := agent.Options{ Client: c, Filesystem: fs, Logger: logger.Named("agent"), ReconnectingPTYTimeout: ptyTimeout, - }) + } + + for _, opt := range opts { + options = opt(options) + } + + closer := agent.New(options) t.Cleanup(func() { _ = closer.Close() }) @@ -1979,3 +1988,107 @@ func tempDirUnixSocket(t *testing.T) string { return t.TempDir() } + +func TestAgent_Metrics_SSH(t *testing.T) { + t.Parallel() + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) + defer cancel() + + registry := prometheus.NewRegistry() + + //nolint:dogsled + conn, _, _, _, _ := setupAgent(t, agentsdk.Manifest{}, 0, func(o agent.Options) agent.Options { + o.PrometheusRegistry = registry + return o + }) + + sshClient, err := conn.SSHClient(ctx) + require.NoError(t, err) + defer sshClient.Close() + session, err := sshClient.NewSession() + require.NoError(t, err) + defer session.Close() + stdin, err := session.StdinPipe() + require.NoError(t, err) + err = session.Shell() + require.NoError(t, err) + + expected := []agentsdk.AgentMetric{ + { + Name: "agent_reconnecting_pty_connections_total", + Type: agentsdk.AgentMetricTypeCounter, + Value: 0, + }, + { + Name: "agent_sessions_total", + Type: agentsdk.AgentMetricTypeCounter, + Value: 1, + Labels: []agentsdk.AgentMetricLabel{ + { + Name: "magic_type", + Value: "ssh", + }, + { + Name: "pty", + Value: "no", + }, + }, + }, + { + Name: "agent_ssh_server_failed_connections_total", + Type: agentsdk.AgentMetricTypeCounter, + Value: 0, + }, + { + Name: "agent_ssh_server_sftp_connections_total", + Type: agentsdk.AgentMetricTypeCounter, + Value: 0, + }, + { + Name: "agent_ssh_server_sftp_server_errors_total", + Type: agentsdk.AgentMetricTypeCounter, + Value: 0, + }, + } + + var actual []*promgo.MetricFamily + assert.Eventually(t, func() bool { + actual, err = registry.Gather() + if err != nil { + return false + } + + if len(expected) != len(actual) { + return false + } + + return verifyCollectedMetrics(t, expected, actual) + }, testutil.WaitLong, testutil.IntervalFast) + + require.Len(t, actual, len(expected)) + verifyCollectedMetrics(t, expected, actual) + + _ = stdin.Close() + err = session.Wait() + require.NoError(t, err) +} + +func verifyCollectedMetrics(t *testing.T, expected []agentsdk.AgentMetric, actual []*promgo.MetricFamily) bool { + for i, e := range expected { + require.Equal(t, e.Name, actual[i].GetName()) + require.Equal(t, string(e.Type), strings.ToLower(actual[i].GetType().String())) + + for _, m := range actual[i].GetMetric() { + require.Equal(t, e.Value, m.Counter.GetValue()) + + if len(m.GetLabel()) > 0 { + for j, lbl := range m.GetLabel() { + require.Equal(t, e.Labels[j].Name, lbl.GetName()) + require.Equal(t, e.Labels[j].Value, lbl.GetValue()) + } + } + m.GetLabel() + } + } + return true +} From 8e10d6dd7e6bbe1f4ff47fa2bcd84222f1040779 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 24 May 2023 10:12:27 +0200 Subject: [PATCH 34/38] Address PR comments --- agent/agent_test.go | 15 +++++++++------ coderd/prometheusmetrics/aggregator_test.go | 10 +++++----- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/agent/agent_test.go b/agent/agent_test.go index 1d4d90a323fff..832919e7bf5df 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -2066,7 +2066,8 @@ func TestAgent_Metrics_SSH(t *testing.T) { }, testutil.WaitLong, testutil.IntervalFast) require.Len(t, actual, len(expected)) - verifyCollectedMetrics(t, expected, actual) + collected := verifyCollectedMetrics(t, expected, actual) + require.True(t, collected, "expected metrics were not collected") _ = stdin.Close() err = session.Wait() @@ -2074,17 +2075,19 @@ func TestAgent_Metrics_SSH(t *testing.T) { } func verifyCollectedMetrics(t *testing.T, expected []agentsdk.AgentMetric, actual []*promgo.MetricFamily) bool { + t.Helper() + for i, e := range expected { - require.Equal(t, e.Name, actual[i].GetName()) - require.Equal(t, string(e.Type), strings.ToLower(actual[i].GetType().String())) + assert.Equal(t, e.Name, actual[i].GetName()) + assert.Equal(t, string(e.Type), strings.ToLower(actual[i].GetType().String())) for _, m := range actual[i].GetMetric() { - require.Equal(t, e.Value, m.Counter.GetValue()) + assert.Equal(t, e.Value, m.Counter.GetValue()) if len(m.GetLabel()) > 0 { for j, lbl := range m.GetLabel() { - require.Equal(t, e.Labels[j].Name, lbl.GetName()) - require.Equal(t, e.Labels[j].Value, lbl.GetValue()) + assert.Equal(t, e.Labels[j].Name, lbl.GetName()) + assert.Equal(t, e.Labels[j].Value, lbl.GetValue()) } } m.GetLabel() diff --git a/coderd/prometheusmetrics/aggregator_test.go b/coderd/prometheusmetrics/aggregator_test.go index e799caf4a98fe..c20aeadb3a8d6 100644 --- a/coderd/prometheusmetrics/aggregator_test.go +++ b/coderd/prometheusmetrics/aggregator_test.go @@ -115,11 +115,11 @@ func verifyCollectedMetrics(t *testing.T, expected []agentsdk.AgentMetric, actua require.NoError(t, err) require.Equal(t, "agent_name", *d.Label[0].Name, d.String()) - require.Equal(t, testAgentName, *d.Label[0].Value) - require.Equal(t, "username", *d.Label[1].Name) - require.Equal(t, testUsername, *d.Label[1].Value) - require.Equal(t, "workspace_name", *d.Label[2].Name) - require.Equal(t, testWorkspaceName, *d.Label[2].Value) + require.Equal(t, testAgentName, *d.Label[0].Value, d.String()) + require.Equal(t, "username", *d.Label[1].Name, d.String()) + require.Equal(t, testUsername, *d.Label[1].Value, d.String()) + require.Equal(t, "workspace_name", *d.Label[2].Name, d.String()) + require.Equal(t, testWorkspaceName, *d.Label[2].Value, d.String()) if e.Type == agentsdk.AgentMetricTypeCounter { require.Equal(t, e.Value, *d.Counter.Value) From 1858dc2af97c43c345a390e9ae5ef8add80b7503 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 24 May 2023 11:06:00 +0200 Subject: [PATCH 35/38] use labelIndex --- coderd/prometheusmetrics/aggregator.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/coderd/prometheusmetrics/aggregator.go b/coderd/prometheusmetrics/aggregator.go index 5e0164c5038e6..80c3e4ec90eac 100644 --- a/coderd/prometheusmetrics/aggregator.go +++ b/coderd/prometheusmetrics/aggregator.go @@ -148,15 +148,15 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { output := make([]prometheus.Metric, 0, len(ma.queue)) for _, m := range ma.queue { labels := make([]string, len(agentMetricsLabels)+len(m.Labels)) - var i int + var labelIndex int for _, l := range agentMetricsLabels { - labels[i] = l - i++ + labels[labelIndex] = l + labelIndex++ } for _, l := range m.Labels { - labels[i] = l.Name - i++ + labels[labelIndex] = l.Name + labelIndex++ } desc := prometheus.NewDesc(m.Name, metricHelpForAgent, labels, nil) @@ -172,11 +172,11 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { labelValues[2] = m.agentName if len(m.Labels) > 0 { - i = 3 + labelIndex = 3 for _, l := range m.Labels { - labelValues[i] = l.Value - i++ + labelValues[labelIndex] = l.Value + labelIndex++ } } From f4162879b6db79d3f4e14fe6fc603d9bfac54dbd Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 25 May 2023 10:30:01 +0200 Subject: [PATCH 36/38] PR comments part 1 --- agent/agentssh/agentssh.go | 15 ++++++++++++--- coderd/prometheusmetrics/aggregator.go | 6 +++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index 18ff830ff3b14..fcf562a888293 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -244,17 +244,27 @@ func (s *Server) sessionStart(session ssh.Session, extraEnv []string) (retErr er } magicTypeLabel := magicTypeMetricLabel(magicType) + sshPty, windowSize, isPty := session.Pty() cmd, err := s.CreateCommand(ctx, session.RawCommand(), env) if err != nil { - s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "create_command").Add(1) + ptyLabel := "no" + if isPty { + ptyLabel = "yes" + } + s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, ptyLabel, "create_command").Add(1) return err } if ssh.AgentRequested(session) { l, err := ssh.NewAgentListener() if err != nil { - s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "listener").Add(1) + ptyLabel := "no" + if isPty { + ptyLabel = "yes" + } + + s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, ptyLabel, "listener").Add(1) return xerrors.Errorf("new agent listener: %w", err) } defer l.Close() @@ -262,7 +272,6 @@ func (s *Server) sessionStart(session ssh.Session, extraEnv []string) (retErr er cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", "SSH_AUTH_SOCK", l.Addr().String())) } - sshPty, windowSize, isPty := session.Pty() if isPty { return s.startPTYSession(session, magicTypeLabel, cmd, sshPty, windowSize) } diff --git a/coderd/prometheusmetrics/aggregator.go b/coderd/prometheusmetrics/aggregator.go index 80c3e4ec90eac..23ccc653cee5b 100644 --- a/coderd/prometheusmetrics/aggregator.go +++ b/coderd/prometheusmetrics/aggregator.go @@ -63,6 +63,10 @@ type annotatedMetric struct { var _ prometheus.Collector = new(MetricsAggregator) +func (am *annotatedMetric) is(req updateRequest, m agentsdk.AgentMetric) bool { + return am.username == req.username && am.workspaceName == req.workspaceName && am.agentName == req.agentName && am.Name == m.Name && slices.Equal(am.Labels, m.Labels) +} + func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, duration time.Duration) (*MetricsAggregator, error) { metricsCleanupInterval := defaultMetricsCleanupInterval if duration > 0 { @@ -123,7 +127,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { UpdateLoop: for _, m := range req.metrics { for i, q := range ma.queue { - if q.username == req.username && q.workspaceName == req.workspaceName && q.agentName == req.agentName && q.Name == m.Name && slices.Equal(q.Labels, m.Labels) { + if q.is(req, m) { ma.queue[i].AgentMetric.Value = m.Value ma.queue[i].expiryDate = req.timestamp.Add(ma.metricsCleanupInterval) continue UpdateLoop From 4daf37d7444bc635e885aad9fa8e62acb37aa9ed Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 25 May 2023 10:54:59 +0200 Subject: [PATCH 37/38] PR comments part 2 --- coderd/prometheusmetrics/aggregator.go | 53 +++++++++++--------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/coderd/prometheusmetrics/aggregator.go b/coderd/prometheusmetrics/aggregator.go index 23ccc653cee5b..b236bac5f6aa1 100644 --- a/coderd/prometheusmetrics/aggregator.go +++ b/coderd/prometheusmetrics/aggregator.go @@ -67,6 +67,26 @@ func (am *annotatedMetric) is(req updateRequest, m agentsdk.AgentMetric) bool { return am.username == req.username && am.workspaceName == req.workspaceName && am.agentName == req.agentName && am.Name == m.Name && slices.Equal(am.Labels, m.Labels) } +func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) { + labels := make([]string, 0, len(agentMetricsLabels)+len(am.Labels)) + labelValues := make([]string, 0, len(agentMetricsLabels)+len(am.Labels)) + + labels = append(labels, agentMetricsLabels...) + labelValues = append(labelValues, am.username, am.workspaceName, am.agentName) + + for _, l := range am.Labels { + labels = append(labels, l.Name) + labelValues = append(labelValues, l.Value) + } + + desc := prometheus.NewDesc(am.Name, metricHelpForAgent, labels, nil) + valueType, err := asPrometheusValueType(am.Type) + if err != nil { + return nil, err + } + return prometheus.MustNewConstMetric(desc, valueType, am.Value, labelValues...), nil +} + func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, duration time.Duration) (*MetricsAggregator, error) { metricsCleanupInterval := defaultMetricsCleanupInterval if duration > 0 { @@ -151,41 +171,12 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { output := make([]prometheus.Metric, 0, len(ma.queue)) for _, m := range ma.queue { - labels := make([]string, len(agentMetricsLabels)+len(m.Labels)) - var labelIndex int - for _, l := range agentMetricsLabels { - labels[labelIndex] = l - labelIndex++ - } - - for _, l := range m.Labels { - labels[labelIndex] = l.Name - labelIndex++ - } - - desc := prometheus.NewDesc(m.Name, metricHelpForAgent, labels, nil) - valueType, err := asPrometheusValueType(m.Type) + promMetric, err := m.asPrometheus() if err != nil { ma.log.Error(ctx, "can't convert Prometheus value type", slog.F("name", m.Name), slog.F("type", m.Type), slog.F("value", m.Value), slog.Error(err)) continue } - - labelValues := make([]string, len(agentMetricsLabels)+len(m.Labels)) - labelValues[0] = m.username - labelValues[1] = m.workspaceName - labelValues[2] = m.agentName - - if len(m.Labels) > 0 { - labelIndex = 3 - - for _, l := range m.Labels { - labelValues[labelIndex] = l.Value - labelIndex++ - } - } - - constMetric := prometheus.MustNewConstMetric(desc, valueType, m.Value, labelValues...) - output = append(output, constMetric) + output = append(output, promMetric) } outputCh <- output close(outputCh) From d9203b8d6fd0f69706ada936457df1b1fa2bb08a Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 25 May 2023 11:50:39 +0200 Subject: [PATCH 38/38] PR comments part 3 --- coderd/prometheusmetrics/aggregator_test.go | 63 +++++++++++---------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/coderd/prometheusmetrics/aggregator_test.go b/coderd/prometheusmetrics/aggregator_test.go index c20aeadb3a8d6..d7a15a547e6d3 100644 --- a/coderd/prometheusmetrics/aggregator_test.go +++ b/coderd/prometheusmetrics/aggregator_test.go @@ -46,34 +46,29 @@ func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) { {Name: "b_counter_two", Type: agentsdk.AgentMetricTypeCounter, Value: 4}, {Name: "c_gauge_three", Type: agentsdk.AgentMetricTypeGauge, Value: 5}, {Name: "c_gauge_three", Type: agentsdk.AgentMetricTypeGauge, Value: 2, Labels: []agentsdk.AgentMetricLabel{ - // Metrics are sorted alphabetically, and we have to prepend "z", so common labels (agent name, username, workspace name) go first. - { - Name: "zfoobar", - Value: "Foobaz", - }, - { - Name: "zhello", - Value: "world", - }, + {Name: "foobar", Value: "Foobaz"}, + {Name: "hello", Value: "world"}, }}, {Name: "d_gauge_four", Type: agentsdk.AgentMetricTypeGauge, Value: 6}, } + commonLabels := []agentsdk.AgentMetricLabel{ + {Name: "agent_name", Value: testAgentName}, + {Name: "username", Value: testUsername}, + {Name: "workspace_name", Value: testWorkspaceName}, + } expected := []agentsdk.AgentMetric{ - {Name: "a_counter_one", Type: agentsdk.AgentMetricTypeCounter, Value: 1}, - {Name: "b_counter_two", Type: agentsdk.AgentMetricTypeCounter, Value: 4}, - {Name: "c_gauge_three", Type: agentsdk.AgentMetricTypeGauge, Value: 5}, + {Name: "a_counter_one", Type: agentsdk.AgentMetricTypeCounter, Value: 1, Labels: commonLabels}, + {Name: "b_counter_two", Type: agentsdk.AgentMetricTypeCounter, Value: 4, Labels: commonLabels}, + {Name: "c_gauge_three", Type: agentsdk.AgentMetricTypeGauge, Value: 5, Labels: commonLabels}, {Name: "c_gauge_three", Type: agentsdk.AgentMetricTypeGauge, Value: 2, Labels: []agentsdk.AgentMetricLabel{ - { - Name: "zfoobar", - Value: "Foobaz", - }, - { - Name: "zhello", - Value: "world", - }, + {Name: "agent_name", Value: testAgentName}, + {Name: "foobar", Value: "Foobaz"}, + {Name: "hello", Value: "world"}, + {Name: "username", Value: testUsername}, + {Name: "workspace_name", Value: testWorkspaceName}, }}, - {Name: "d_gauge_four", Type: agentsdk.AgentMetricTypeGauge, Value: 6}, + {Name: "d_gauge_four", Type: agentsdk.AgentMetricTypeGauge, Value: 6, Labels: commonLabels}, } // when @@ -105,7 +100,6 @@ func verifyCollectedMetrics(t *testing.T, expected []agentsdk.AgentMetric, actua return false } - // Metrics are expected to arrive in order for i, e := range expected { desc := actual[i].Desc() assert.Contains(t, desc.String(), e.Name) @@ -114,24 +108,31 @@ func verifyCollectedMetrics(t *testing.T, expected []agentsdk.AgentMetric, actua err := actual[i].Write(&d) require.NoError(t, err) - require.Equal(t, "agent_name", *d.Label[0].Name, d.String()) - require.Equal(t, testAgentName, *d.Label[0].Value, d.String()) - require.Equal(t, "username", *d.Label[1].Name, d.String()) - require.Equal(t, testUsername, *d.Label[1].Value, d.String()) - require.Equal(t, "workspace_name", *d.Label[2].Name, d.String()) - require.Equal(t, testWorkspaceName, *d.Label[2].Value, d.String()) - if e.Type == agentsdk.AgentMetricTypeCounter { - require.Equal(t, e.Value, *d.Counter.Value) + require.Equal(t, e.Value, d.Counter.GetValue()) } else if e.Type == agentsdk.AgentMetricTypeGauge { - require.Equal(t, e.Value, *d.Gauge.Value) + require.Equal(t, e.Value, d.Gauge.GetValue()) } else { require.Failf(t, "unsupported type: %s", string(e.Type)) } + + dtoLabels := asMetricAgentLabels(d.GetLabel()) + require.Equal(t, e.Labels, dtoLabels, d.String()) } return true } +func asMetricAgentLabels(dtoLabels []*dto.LabelPair) []agentsdk.AgentMetricLabel { + metricLabels := make([]agentsdk.AgentMetricLabel, 0, len(dtoLabels)) + for _, dtoLabel := range dtoLabels { + metricLabels = append(metricLabels, agentsdk.AgentMetricLabel{ + Name: dtoLabel.GetName(), + Value: dtoLabel.GetValue(), + }) + } + return metricLabels +} + func TestUpdateMetrics_MetricsExpire(t *testing.T) { t.Parallel()