-
Notifications
You must be signed in to change notification settings - Fork 875
fix(agent): ensure SSH server shutdown with process groups #17227
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -582,6 +582,12 @@ func (s *Server) sessionStart(logger slog.Logger, session ssh.Session, env []str | |
func (s *Server) startNonPTYSession(logger slog.Logger, session ssh.Session, magicTypeLabel string, cmd *exec.Cmd) error { | ||
s.metrics.sessionsTotal.WithLabelValues(magicTypeLabel, "no").Add(1) | ||
|
||
// Create a process group and send SIGHUP to child processes, | ||
// otherwise context cancellation will not propagate properly | ||
// and SSH server close may be delayed. | ||
cmd.SysProcAttr = cmdSysProcAttr() | ||
cmd.Cancel = cmdCancel(session.Context(), logger, cmd) | ||
|
||
cmd.Stdout = session | ||
cmd.Stderr = session.Stderr() | ||
// This blocks forever until stdin is received if we don't | ||
|
@@ -926,7 +932,12 @@ func (s *Server) CreateCommand(ctx context.Context, script string, env []string, | |
// Serve starts the server to handle incoming connections on the provided listener. | ||
// It returns an error if no host keys are set or if there is an issue accepting connections. | ||
func (s *Server) Serve(l net.Listener) (retErr error) { | ||
if len(s.srv.HostSigners) == 0 { | ||
// Ensure we're not mutating HostSigners as we're reading it. | ||
s.mu.RLock() | ||
noHostKeys := len(s.srv.HostSigners) == 0 | ||
s.mu.RUnlock() | ||
|
||
if noHostKeys { | ||
return xerrors.New("no host keys set") | ||
} | ||
|
||
|
@@ -1044,6 +1055,11 @@ func (s *Server) trackSession(ss ssh.Session, add bool) (ok bool) { | |
// Close the server and all active connections. Server can be re-used | ||
// after Close is done. | ||
func (s *Server) Close() error { | ||
return s.close(context.Background()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need this extra method with the context argument? I don't see it used elsewhere. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Originally I intended to use |
||
} | ||
|
||
//nolint:revive // Ignore the similarity of close and Close. | ||
func (s *Server) close(ctx context.Context) error { | ||
s.mu.Lock() | ||
|
||
// Guard against multiple calls to Close and | ||
|
@@ -1054,24 +1070,29 @@ func (s *Server) Close() error { | |
} | ||
s.closing = make(chan struct{}) | ||
|
||
s.logger.Debug(ctx, "closing server") | ||
|
||
// Stop accepting new connections. | ||
s.logger.Debug(ctx, "closing all active listeners") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion: log the number of active listeners |
||
for l := range s.listeners { | ||
_ = l.Close() | ||
} | ||
|
||
// Close all active sessions to gracefully | ||
// terminate client connections. | ||
s.logger.Debug(ctx, "closing all active sessions") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion: log the number of active sessions |
||
for ss := range s.sessions { | ||
// We call Close on the underlying channel here because we don't | ||
// want to send an exit status to the client (via Exit()). | ||
// Typically OpenSSH clients will return 255 as the exit status. | ||
_ = ss.Close() | ||
} | ||
|
||
// Close all active listeners and connections. | ||
for l := range s.listeners { | ||
_ = l.Close() | ||
} | ||
s.logger.Debug(ctx, "closing all active connections") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion: log the number of active conns |
||
for c := range s.conns { | ||
_ = c.Close() | ||
} | ||
|
||
// Close the underlying SSH server. | ||
s.logger.Debug(ctx, "closing SSH server") | ||
err := s.srv.Close() | ||
|
||
s.mu.Unlock() | ||
|
@@ -1082,15 +1103,36 @@ func (s *Server) Close() error { | |
s.closing = nil | ||
s.mu.Unlock() | ||
|
||
s.logger.Debug(ctx, "closing server done") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion: log elapsed time for closing |
||
|
||
return err | ||
} | ||
|
||
// Shutdown gracefully closes all active SSH connections and stops | ||
// Shutdown ~~gracefully~~ closes all active SSH connections and stops | ||
johnstcn marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// accepting new connections. | ||
// | ||
// Shutdown is not implemented. | ||
func (*Server) Shutdown(_ context.Context) error { | ||
// TODO(mafredri): Implement shutdown, SIGHUP running commands, etc. | ||
// For now, simply calls Close and allows early return via context | ||
// cancellation. | ||
func (s *Server) Shutdown(ctx context.Context) error { | ||
ch := make(chan error, 1) | ||
go func() { | ||
// TODO(mafredri): Implement shutdown, SIGHUP running commands, etc. | ||
// For now we just close the server. | ||
ch <- s.Close() | ||
}() | ||
var err error | ||
select { | ||
case <-ctx.Done(): | ||
err = ctx.Err() | ||
case err = <-ch: | ||
} | ||
// Re-check for context cancellation precedence. | ||
if ctx.Err() != nil { | ||
err = ctx.Err() | ||
} | ||
if err != nil { | ||
return xerrors.Errorf("close server: %w", err) | ||
} | ||
return nil | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
//go:build !windows | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Review: These are copy-pasta from agentscripts package. |
||
|
||
package agentssh | ||
|
||
import ( | ||
"context" | ||
"os/exec" | ||
"syscall" | ||
|
||
"cdr.dev/slog" | ||
) | ||
|
||
func cmdSysProcAttr() *syscall.SysProcAttr { | ||
return &syscall.SysProcAttr{ | ||
Setsid: true, | ||
} | ||
} | ||
|
||
func cmdCancel(ctx context.Context, logger slog.Logger, cmd *exec.Cmd) func() error { | ||
return func() error { | ||
logger.Debug(ctx, "cmdCancel: sending SIGHUP to process and children", slog.F("pid", cmd.Process.Pid)) | ||
johnstcn marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return syscall.Kill(-cmd.Process.Pid, syscall.SIGHUP) | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
package agentssh | ||
|
||
import ( | ||
"context" | ||
"os" | ||
"os/exec" | ||
"syscall" | ||
|
||
"cdr.dev/slog" | ||
) | ||
|
||
func cmdSysProcAttr() *syscall.SysProcAttr { | ||
return &syscall.SysProcAttr{} | ||
} | ||
|
||
func cmdCancel(ctx context.Context, logger slog.Logger, cmd *exec.Cmd) func() error { | ||
return func() error { | ||
logger.Debug(ctx, "cmdCancel: sending interrupt to process", slog.F("pid", cmd.Process.Pid)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 📋🍝 suggestion: check that |
||
return cmd.Process.Signal(os.Interrupt) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can foresee someone wanting to adjust this timeout. Maybe add a CLI option for it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think that if we do get requests to make this configurable, it will help drive the implementation. I'm not certain this approach is the end-goal we want and I would like to avoid exposing functionality that may limit future implementations.
When we start solving #6175, changing this behavior may be relevant.
I'll go ahead and reduce this to 5 seconds as well as the default docker grace timeout is 10s I believe.
The original motivation behind closing SSH before running shutdown scripts is to ensure that commands doing work exit before potentially doing backups of state or something along those lines. I think giving those commands a grace of 5 seconds is plenty in most cases.