coder · spikecurtis · May 25, 2023 · May 23, 2023 · May 23, 2023 · May 23, 2023
diff --git a/cli/ssh.go b/cli/ssh.go
@@ -10,6 +10,7 @@ import (
 	"net/url"
 	"os"
 	"os/exec"
+	"path"
 	"path/filepath"
 	"strings"
 	"time"
@@ -23,6 +24,9 @@ import (
 	"golang.org/x/term"
 	"golang.org/x/xerrors"
 
+	"cdr.dev/slog"
+	"cdr.dev/slog/sloggers/sloghuman"
+
 	"github.com/coder/coder/agent/agentssh"
 	"github.com/coder/coder/cli/clibase"
 	"github.com/coder/coder/cli/cliui"
@@ -46,6 +50,8 @@ func (r *RootCmd) ssh() *clibase.Cmd {
 		identityAgent  string
 		wsPollInterval time.Duration
 		noWait         bool
+		logDir         string
+		logToFile      bool
 	)
 	client := new(codersdk.Client)
 	cmd := &clibase.Cmd{
@@ -56,10 +62,44 @@ func (r *RootCmd) ssh() *clibase.Cmd {
 			clibase.RequireNArgs(1),
 			r.InitClient(client),
 		),
-		Handler: func(inv *clibase.Invocation) error {
+		Handler: func(inv *clibase.Invocation) (retErr error) {
 			ctx, cancel := context.WithCancel(inv.Context())
 			defer cancel()
 
+			logger := slog.Make() // empty logger
+			defer func() {
+				if retErr != nil {
+					// catch and log all returned errors so we see them in the
+					// log file (if there is one)
+					logger.Error(ctx, "command exit", slog.Error(retErr))
+				}
+			}()
+			if logToFile {
+				// we need a way to ensure different ssh invocations don't clobber
+				// each other's logs. Date-time strings will likely have collisions
+				// in unit tests and/or scripts unless we extend precision out to
+				// sub-millisecond, which seems unwieldy.  A simple 5-character random
+				// string will do it, since the operating system already tracks
+				// dates and times for file IO.
+				qual, err := cryptorand.String(5)
+				if err != nil {
+					return xerrors.Errorf("generate random qualifier: %w", err)
+				}
+				logPth := path.Join(logDir, fmt.Sprintf("coder-ssh-%s.log", qual))
+				logFile, err := os.Create(logPth)
+				if err != nil {
+					return xerrors.Errorf("error opening %s for logging: %w", logPth, err)
+				}
+				logger = slog.Make(sloghuman.Sink(logFile))
+				defer logFile.Close()
+				if r.verbose {
+					logger = logger.Leveled(slog.LevelDebug)
+				}
+
+				// log HTTP requests
+				client.Logger = logger
+			}
+
 			workspace, workspaceAgent, err := getWorkspaceAndAgent(ctx, inv, client, codersdk.Me, inv.Args[0])
 			if err != nil {
 				return err
@@ -92,110 +132,71 @@ func (r *RootCmd) ssh() *clibase.Cmd {
 				// We don't print the error because cliui.Agent does that for us.
 			}
 
-			conn, err := client.DialWorkspaceAgent(ctx, workspaceAgent.ID, &codersdk.DialWorkspaceAgentOptions{})
+			conn, err := client.DialWorkspaceAgent(ctx, workspaceAgent.ID, &codersdk.DialWorkspaceAgentOptions{
+				Logger: logger,
+			})
 			if err != nil {
-				return err
+				return xerrors.Errorf("dial agent: %w", err)
 			}
 			defer conn.Close()
 			conn.AwaitReachable(ctx)
 			stopPolling := tryPollWorkspaceAutostop(ctx, client, workspace)
 			defer stopPolling()
 
-			// Enure connection is closed if the context is canceled or
-			// the workspace reaches the stopped state.
-			//
-			// Watching the stopped state is a work-around for cases
-			// where the agent is not gracefully shut down and the
-			// connection is left open. If, for instance, the networking
-			// is stopped before the agent is shut down, the disconnect
-			// will usually not propagate.
-			//
-			// See: https://github.com/coder/coder/issues/6180
-			watchAndClose := func(closer func() error) {
-				// Ensure session is ended on both context cancellation
-				// and workspace stop.
-				defer func() {
-					_ = closer()
-				}()
-
-			startWatchLoop:
-				for {
-					// (Re)connect to the coder server and watch workspace events.
-					var wsWatch <-chan codersdk.Workspace
-					var err error
-					for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
-						wsWatch, err = client.WatchWorkspace(ctx, workspace.ID)
-						if err == nil {
-							break
-						}
-						if ctx.Err() != nil {
-							return
-						}
-					}
-
-					for {
-						select {
-						case <-ctx.Done():
-							return
-						case w, ok := <-wsWatch:
-							if !ok {
-								continue startWatchLoop
-							}
-
-							// Transitioning to stop or delete could mean that
-							// the agent will still gracefully stop. If a new
-							// build is starting, there's no reason to wait for
-							// the agent, it should be long gone.
-							if workspace.LatestBuild.ID != w.LatestBuild.ID && w.LatestBuild.Transition == codersdk.WorkspaceTransitionStart {
-								return
-							}
-							// Note, we only react to the stopped state here because we
-							// want to give the agent a chance to gracefully shut down
-							// during "stopping".
-							if w.LatestBuild.Status == codersdk.WorkspaceStatusStopped {
-								return
-							}
-						}
-					}
-				}
-			}
-
 			if stdio {
 				rawSSH, err := conn.SSH(ctx)
 				if err != nil {
-					return err
+					return xerrors.Errorf("connect SSH: %w", err)
 				}
 				defer rawSSH.Close()
-				go watchAndClose(rawSSH.Close)
+				go watchAndClose(ctx, rawSSH.Close, logger, client, workspace)
 
 				go func() {
 					// Ensure stdout copy closes incase stdin is closed
 					// unexpectedly. Typically we wouldn't worry about
 					// this since OpenSSH should kill the proxy command.
 					defer rawSSH.Close()
 
-					_, _ = io.Copy(rawSSH, inv.Stdin)
+					_, err := io.Copy(rawSSH, inv.Stdin)
+					if err != nil {
+						logger.Error(ctx, "copy stdin error", slog.Error(err))
+					} else {
+						logger.Debug(ctx, "copy stdin complete")
+					}
 				}()
-				_, _ = io.Copy(inv.Stdout, rawSSH)
+				_, err = io.Copy(inv.Stdout, rawSSH)
+				if err != nil {
+					logger.Error(ctx, "copy stdout error", slog.Error(err))
+				} else {
+					logger.Debug(ctx, "copy stdout complete")
+				}
 				return nil
 			}
 
 			sshClient, err := conn.SSHClient(ctx)
 			if err != nil {
-				return err
+				return xerrors.Errorf("ssh client: %w", err)
 			}
 			defer sshClient.Close()
 
 			sshSession, err := sshClient.NewSession()
 			if err != nil {
-				return err
+				return xerrors.Errorf("ssh session: %w", err)
 			}
 			defer sshSession.Close()
-			go watchAndClose(func() error {
-				_ = sshSession.Close()
-				_ = sshClient.Close()
-				return nil
-			})
+			go watchAndClose(
+				ctx,
+				func() error {
+					err := sshSession.Close()
+					logger.Debug(ctx, "session close", slog.Error(err))
+					err = sshClient.Close()
+					logger.Debug(ctx, "client close", slog.Error(err))
+					return nil
+				},
+				logger,
+				client,
+				workspace,
+			)
 
 			if identityAgent == "" {
 				identityAgent = os.Getenv("SSH_AUTH_SOCK")
@@ -257,7 +258,7 @@ func (r *RootCmd) ssh() *clibase.Cmd {
 
 			err = sshSession.RequestPty("xterm-256color", 128, 128, gossh.TerminalModes{})
 			if err != nil {
-				return err
+				return xerrors.Errorf("request pty: %w", err)
 			}
 
 			sshSession.Stdin = inv.Stdin
@@ -266,7 +267,7 @@ func (r *RootCmd) ssh() *clibase.Cmd {
 
 			err = sshSession.Shell()
 			if err != nil {
-				return err
+				return xerrors.Errorf("start shell: %w", err)
 			}
 
 			// Put cancel at the top of the defer stack to initiate
@@ -289,7 +290,7 @@ func (r *RootCmd) ssh() *clibase.Cmd {
 				if errors.Is(err, &gossh.ExitMissingError{}) {
 					return xerrors.New("SSH connection ended unexpectedly")
 				}
-				return err
+				return xerrors.Errorf("session ended: %w", err)
 			}
 
 			return nil
@@ -335,10 +336,90 @@ func (r *RootCmd) ssh() *clibase.Cmd {
 			Description: "Specifies whether to wait for a workspace to become ready before logging in (only applicable when the login before ready option has not been enabled). Note that the workspace agent may still be in the process of executing the startup script and the workspace may be in an incomplete state.",
 			Value:       clibase.BoolOf(&noWait),
 		},
+		{
+			Flag:        "log-dir",
+			Default:     os.TempDir(),
+			Description: "Specify the location for the log files.",
+			Env:         "CODER_SSH_LOG_DIR",
+			Value:       clibase.StringOf(&logDir),
+		},
+		{
+			Flag:          "log-to-file",
+			FlagShorthand: "l",
+			Env:           "CODER_SSH_LOG_TO_FILE",
+			Description:   "Enable diagnostic logging to file.",
+			Value:         clibase.BoolOf(&logToFile),
+		},
 	}
 	return cmd
 }
 
+// watchAndClose ensures closer is called if the context is canceled or
+// the workspace reaches the stopped state.
+//
+// Watching the stopped state is a work-around for cases
+// where the agent is not gracefully shut down and the
+// connection is left open. If, for instance, the networking
+// is stopped before the agent is shut down, the disconnect
+// will usually not propagate.
+//
+// See: https://github.com/coder/coder/issues/6180
+func watchAndClose(ctx context.Context, closer func() error, logger slog.Logger, client *codersdk.Client, workspace codersdk.Workspace) {
+	// Ensure session is ended on both context cancellation
+	// and workspace stop.
+	defer func() {
+		err := closer()
+		if err != nil {
+			logger.Error(ctx, "error closing session", slog.Error(err))
+		}
+	}()
+
+startWatchLoop:
+	for {
+		logger.Debug(ctx, "(re)connecting to the coder server to watch workspace events.")
+		var wsWatch <-chan codersdk.Workspace
+		var err error
+		for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
+			wsWatch, err = client.WatchWorkspace(ctx, workspace.ID)
+			if err == nil {
+				break
+			}
+			if ctx.Err() != nil {
+				logger.Info(ctx, "context expired", slog.Error(ctx.Err()))
+				return
+			}
+		}
+
+		for {
+			select {
+			case <-ctx.Done():
+				logger.Info(ctx, "context expired", slog.Error(ctx.Err()))
+				return
+			case w, ok := <-wsWatch:
+				if !ok {
+					continue startWatchLoop
+				}
+
+				// Transitioning to stop or delete could mean that
+				// the agent will still gracefully stop. If a new
+				// build is starting, there's no reason to wait for
+				// the agent, it should be long gone.
+				if workspace.LatestBuild.ID != w.LatestBuild.ID && w.LatestBuild.Transition == codersdk.WorkspaceTransitionStart {
+					logger.Info(ctx, "new build started")
+					return
+				}
+				// Note, we only react to the stopped state here because we
+				// want to give the agent a chance to gracefully shut down
+				// during "stopping".
+				if w.LatestBuild.Status == codersdk.WorkspaceStatusStopped {
+					logger.Info(ctx, "workspace stopped")
+					return
+				}
+			}
+		}
+	}
+}
+
 // getWorkspaceAgent returns the workspace and agent selected using either the
 // `<workspace>[.<agent>]` syntax via `in` or picks a random workspace and agent
 // if `shuffle` is true.

diff --git a/cli/ssh_test.go b/cli/ssh_test.go
@@ -407,6 +407,50 @@ func TestSSH(t *testing.T) {
 		pty.WriteLine("exit")
 		<-cmdDone
 	})
+
+	t.Run("FileLogging", func(t *testing.T) {
+		t.Parallel()
+
+		dir := t.TempDir()
+
+		client, workspace, agentToken := setupWorkspaceForAgent(t, nil)
+		inv, root := clitest.New(t, "ssh", workspace.Name, "-l", "--log-dir", dir)
+		clitest.SetupConfig(t, client, root)
+		pty := ptytest.New(t).Attach(inv)
+
+		ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
+		defer cancel()
+
+		cmdDone := tGo(t, func() {
+			err := inv.WithContext(ctx).Run()
+			assert.NoError(t, err)
+		})
+		pty.ExpectMatch("Waiting")
+
+		agentClient := agentsdk.New(client.URL)
+		agentClient.SetSessionToken(agentToken)
+		agentCloser := agent.New(agent.Options{
+			Client: agentClient,
+			Logger: slogtest.Make(t, nil).Named("agent"),
+		})
+		defer func() {
+			_ = agentCloser.Close()
+		}()
+
+		// Shells on Mac, Windows, and Linux all exit shells with the "exit" command.
+		pty.WriteLine("exit")
+		<-cmdDone
+
+		entries, err := os.ReadDir(dir)
+		require.NoError(t, err)
+		for _, e := range entries {
+			t.Logf("logdir entry: %s", e.Name())
+			if strings.HasPrefix(e.Name(), "coder-ssh") {
+				return
+			}
+		}
+		t.Fatal("failed to find ssh logfile")
+	})
 }
 
 //nolint:paralleltest // This test uses t.Setenv, parent test MUST NOT be parallel.

diff --git a/cli/testdata/coder_ssh_--help.golden b/cli/testdata/coder_ssh_--help.golden
@@ -18,6 +18,12 @@ Start a shell into a workspace
           Specifies which identity agent to use (overrides $SSH_AUTH_SOCK),
           forward agent must also be enabled.
 
+      --log-dir string, $CODER_SSH_LOG_DIR (default: /tmp)
+          Specify the location for the log files.
+
+  -l, --log-to-file bool, $CODER_SSH_LOG_TO_FILE
+          Enable diagnostic logging to file.
+
       --no-wait bool, $CODER_SSH_NO_WAIT
           Specifies whether to wait for a workspace to become ready before
           logging in (only applicable when the login before ready option has not