Skip to content

feat: add file logger to coder ssh #7646

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
233 changes: 157 additions & 76 deletions cli/ssh.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"net/url"
"os"
"os/exec"
"path"
"path/filepath"
"strings"
"time"
Expand All @@ -23,6 +24,9 @@ import (
"golang.org/x/term"
"golang.org/x/xerrors"

"cdr.dev/slog"
"cdr.dev/slog/sloggers/sloghuman"

"github.com/coder/coder/agent/agentssh"
"github.com/coder/coder/cli/clibase"
"github.com/coder/coder/cli/cliui"
Expand All @@ -46,6 +50,8 @@ func (r *RootCmd) ssh() *clibase.Cmd {
identityAgent string
wsPollInterval time.Duration
noWait bool
logDir string
logToFile bool
)
client := new(codersdk.Client)
cmd := &clibase.Cmd{
Expand All @@ -56,10 +62,44 @@ func (r *RootCmd) ssh() *clibase.Cmd {
clibase.RequireNArgs(1),
r.InitClient(client),
),
Handler: func(inv *clibase.Invocation) error {
Handler: func(inv *clibase.Invocation) (retErr error) {
ctx, cancel := context.WithCancel(inv.Context())
defer cancel()

logger := slog.Make() // empty logger
defer func() {
if retErr != nil {
// catch and log all returned errors so we see them in the
// log file (if there is one)
logger.Error(ctx, "command exit", slog.Error(retErr))
}
}()
if logToFile {
// we need a way to ensure different ssh invocations don't clobber
// each other's logs. Date-time strings will likely have collisions
// in unit tests and/or scripts unless we extend precision out to
// sub-millisecond, which seems unwieldy. A simple 5-character random
// string will do it, since the operating system already tracks
// dates and times for file IO.
qual, err := cryptorand.String(5)
if err != nil {
return xerrors.Errorf("generate random qualifier: %w", err)
}
logPth := path.Join(logDir, fmt.Sprintf("coder-ssh-%s.log", qual))
logFile, err := os.Create(logPth)
if err != nil {
return xerrors.Errorf("error opening %s for logging: %w", logPth, err)
}
logger = slog.Make(sloghuman.Sink(logFile))
defer logFile.Close()
if r.verbose {
logger = logger.Leveled(slog.LevelDebug)
}

// log HTTP requests
client.Logger = logger
}

workspace, workspaceAgent, err := getWorkspaceAndAgent(ctx, inv, client, codersdk.Me, inv.Args[0])
if err != nil {
return err
Expand Down Expand Up @@ -92,110 +132,71 @@ func (r *RootCmd) ssh() *clibase.Cmd {
// We don't print the error because cliui.Agent does that for us.
}

conn, err := client.DialWorkspaceAgent(ctx, workspaceAgent.ID, &codersdk.DialWorkspaceAgentOptions{})
conn, err := client.DialWorkspaceAgent(ctx, workspaceAgent.ID, &codersdk.DialWorkspaceAgentOptions{
Logger: logger,
})
if err != nil {
return err
return xerrors.Errorf("dial agent: %w", err)
}
defer conn.Close()
conn.AwaitReachable(ctx)
stopPolling := tryPollWorkspaceAutostop(ctx, client, workspace)
defer stopPolling()

// Enure connection is closed if the context is canceled or
// the workspace reaches the stopped state.
//
// Watching the stopped state is a work-around for cases
// where the agent is not gracefully shut down and the
// connection is left open. If, for instance, the networking
// is stopped before the agent is shut down, the disconnect
// will usually not propagate.
//
// See: https://github.com/coder/coder/issues/6180
watchAndClose := func(closer func() error) {
// Ensure session is ended on both context cancellation
// and workspace stop.
defer func() {
_ = closer()
}()

startWatchLoop:
for {
// (Re)connect to the coder server and watch workspace events.
var wsWatch <-chan codersdk.Workspace
var err error
for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
wsWatch, err = client.WatchWorkspace(ctx, workspace.ID)
if err == nil {
break
}
if ctx.Err() != nil {
return
}
}

for {
select {
case <-ctx.Done():
return
case w, ok := <-wsWatch:
if !ok {
continue startWatchLoop
}

// Transitioning to stop or delete could mean that
// the agent will still gracefully stop. If a new
// build is starting, there's no reason to wait for
// the agent, it should be long gone.
if workspace.LatestBuild.ID != w.LatestBuild.ID && w.LatestBuild.Transition == codersdk.WorkspaceTransitionStart {
return
}
// Note, we only react to the stopped state here because we
// want to give the agent a chance to gracefully shut down
// during "stopping".
if w.LatestBuild.Status == codersdk.WorkspaceStatusStopped {
return
}
}
}
}
}

if stdio {
rawSSH, err := conn.SSH(ctx)
if err != nil {
return err
return xerrors.Errorf("connect SSH: %w", err)
}
defer rawSSH.Close()
go watchAndClose(rawSSH.Close)
go watchAndClose(ctx, rawSSH.Close, logger, client, workspace)

go func() {
// Ensure stdout copy closes incase stdin is closed
// unexpectedly. Typically we wouldn't worry about
// this since OpenSSH should kill the proxy command.
defer rawSSH.Close()

_, _ = io.Copy(rawSSH, inv.Stdin)
_, err := io.Copy(rawSSH, inv.Stdin)
if err != nil {
logger.Error(ctx, "copy stdin error", slog.Error(err))
} else {
logger.Debug(ctx, "copy stdin complete")
}
}()
_, _ = io.Copy(inv.Stdout, rawSSH)
_, err = io.Copy(inv.Stdout, rawSSH)
if err != nil {
logger.Error(ctx, "copy stdout error", slog.Error(err))
} else {
logger.Debug(ctx, "copy stdout complete")
}
return nil
}

sshClient, err := conn.SSHClient(ctx)
if err != nil {
return err
return xerrors.Errorf("ssh client: %w", err)
}
defer sshClient.Close()

sshSession, err := sshClient.NewSession()
if err != nil {
return err
return xerrors.Errorf("ssh session: %w", err)
}
defer sshSession.Close()
go watchAndClose(func() error {
_ = sshSession.Close()
_ = sshClient.Close()
return nil
})
go watchAndClose(
ctx,
func() error {
err := sshSession.Close()
logger.Debug(ctx, "session close", slog.Error(err))
err = sshClient.Close()
logger.Debug(ctx, "client close", slog.Error(err))
return nil
},
logger,
client,
workspace,
)

if identityAgent == "" {
identityAgent = os.Getenv("SSH_AUTH_SOCK")
Expand Down Expand Up @@ -257,7 +258,7 @@ func (r *RootCmd) ssh() *clibase.Cmd {

err = sshSession.RequestPty("xterm-256color", 128, 128, gossh.TerminalModes{})
if err != nil {
return err
return xerrors.Errorf("request pty: %w", err)
}

sshSession.Stdin = inv.Stdin
Expand All @@ -266,7 +267,7 @@ func (r *RootCmd) ssh() *clibase.Cmd {

err = sshSession.Shell()
if err != nil {
return err
return xerrors.Errorf("start shell: %w", err)
}

// Put cancel at the top of the defer stack to initiate
Expand All @@ -289,7 +290,7 @@ func (r *RootCmd) ssh() *clibase.Cmd {
if errors.Is(err, &gossh.ExitMissingError{}) {
return xerrors.New("SSH connection ended unexpectedly")
}
return err
return xerrors.Errorf("session ended: %w", err)
}

return nil
Expand Down Expand Up @@ -335,10 +336,90 @@ func (r *RootCmd) ssh() *clibase.Cmd {
Description: "Specifies whether to wait for a workspace to become ready before logging in (only applicable when the login before ready option has not been enabled). Note that the workspace agent may still be in the process of executing the startup script and the workspace may be in an incomplete state.",
Value: clibase.BoolOf(&noWait),
},
{
Flag: "log-dir",
Default: os.TempDir(),
Description: "Specify the location for the log files.",
Env: "CODER_SSH_LOG_DIR",
Value: clibase.StringOf(&logDir),
},
{
Flag: "log-to-file",
FlagShorthand: "l",
Env: "CODER_SSH_LOG_TO_FILE",
Description: "Enable diagnostic logging to file.",
Value: clibase.BoolOf(&logToFile),
},
}
return cmd
}

// watchAndClose ensures closer is called if the context is canceled or
// the workspace reaches the stopped state.
//
// Watching the stopped state is a work-around for cases
// where the agent is not gracefully shut down and the
// connection is left open. If, for instance, the networking
// is stopped before the agent is shut down, the disconnect
// will usually not propagate.
//
// See: https://github.com/coder/coder/issues/6180
func watchAndClose(ctx context.Context, closer func() error, logger slog.Logger, client *codersdk.Client, workspace codersdk.Workspace) {
// Ensure session is ended on both context cancellation
// and workspace stop.
defer func() {
err := closer()
if err != nil {
logger.Error(ctx, "error closing session", slog.Error(err))
}
}()

startWatchLoop:
for {
logger.Debug(ctx, "(re)connecting to the coder server to watch workspace events.")
var wsWatch <-chan codersdk.Workspace
var err error
for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
wsWatch, err = client.WatchWorkspace(ctx, workspace.ID)
if err == nil {
break
}
if ctx.Err() != nil {
logger.Info(ctx, "context expired", slog.Error(ctx.Err()))
return
}
}

for {
select {
case <-ctx.Done():
logger.Info(ctx, "context expired", slog.Error(ctx.Err()))
return
case w, ok := <-wsWatch:
if !ok {
continue startWatchLoop
}

// Transitioning to stop or delete could mean that
// the agent will still gracefully stop. If a new
// build is starting, there's no reason to wait for
// the agent, it should be long gone.
if workspace.LatestBuild.ID != w.LatestBuild.ID && w.LatestBuild.Transition == codersdk.WorkspaceTransitionStart {
logger.Info(ctx, "new build started")
return
}
// Note, we only react to the stopped state here because we
// want to give the agent a chance to gracefully shut down
// during "stopping".
if w.LatestBuild.Status == codersdk.WorkspaceStatusStopped {
logger.Info(ctx, "workspace stopped")
return
}
}
}
}
}

// getWorkspaceAgent returns the workspace and agent selected using either the
// `<workspace>[.<agent>]` syntax via `in` or picks a random workspace and agent
// if `shuffle` is true.
Expand Down
44 changes: 44 additions & 0 deletions cli/ssh_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,50 @@ func TestSSH(t *testing.T) {
pty.WriteLine("exit")
<-cmdDone
})

t.Run("FileLogging", func(t *testing.T) {
t.Parallel()

dir := t.TempDir()

client, workspace, agentToken := setupWorkspaceForAgent(t, nil)
inv, root := clitest.New(t, "ssh", workspace.Name, "-l", "--log-dir", dir)
clitest.SetupConfig(t, client, root)
pty := ptytest.New(t).Attach(inv)

ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()

cmdDone := tGo(t, func() {
err := inv.WithContext(ctx).Run()
assert.NoError(t, err)
})
pty.ExpectMatch("Waiting")

agentClient := agentsdk.New(client.URL)
agentClient.SetSessionToken(agentToken)
agentCloser := agent.New(agent.Options{
Client: agentClient,
Logger: slogtest.Make(t, nil).Named("agent"),
})
defer func() {
_ = agentCloser.Close()
}()

// Shells on Mac, Windows, and Linux all exit shells with the "exit" command.
pty.WriteLine("exit")
<-cmdDone

entries, err := os.ReadDir(dir)
require.NoError(t, err)
for _, e := range entries {
t.Logf("logdir entry: %s", e.Name())
if strings.HasPrefix(e.Name(), "coder-ssh") {
return
}
}
t.Fatal("failed to find ssh logfile")
})
}

//nolint:paralleltest // This test uses t.Setenv, parent test MUST NOT be parallel.
Expand Down
6 changes: 6 additions & 0 deletions cli/testdata/coder_ssh_--help.golden
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ Start a shell into a workspace
Specifies which identity agent to use (overrides $SSH_AUTH_SOCK),
forward agent must also be enabled.

--log-dir string, $CODER_SSH_LOG_DIR (default: /tmp)
Specify the location for the log files.

-l, --log-to-file bool, $CODER_SSH_LOG_TO_FILE
Enable diagnostic logging to file.

--no-wait bool, $CODER_SSH_NO_WAIT
Specifies whether to wait for a workspace to become ready before
logging in (only applicable when the login before ready option has not
Expand Down
Loading