diff --git a/cli/ssh.go b/cli/ssh.go index 5b89ca9fc56ad..04923bb1aa48c 100644 --- a/cli/ssh.go +++ b/cli/ssh.go @@ -10,6 +10,7 @@ import ( "net/url" "os" "os/exec" + "path" "path/filepath" "strings" "time" @@ -23,6 +24,9 @@ import ( "golang.org/x/term" "golang.org/x/xerrors" + "cdr.dev/slog" + "cdr.dev/slog/sloggers/sloghuman" + "github.com/coder/coder/agent/agentssh" "github.com/coder/coder/cli/clibase" "github.com/coder/coder/cli/cliui" @@ -46,6 +50,8 @@ func (r *RootCmd) ssh() *clibase.Cmd { identityAgent string wsPollInterval time.Duration noWait bool + logDir string + logToFile bool ) client := new(codersdk.Client) cmd := &clibase.Cmd{ @@ -56,10 +62,44 @@ func (r *RootCmd) ssh() *clibase.Cmd { clibase.RequireNArgs(1), r.InitClient(client), ), - Handler: func(inv *clibase.Invocation) error { + Handler: func(inv *clibase.Invocation) (retErr error) { ctx, cancel := context.WithCancel(inv.Context()) defer cancel() + logger := slog.Make() // empty logger + defer func() { + if retErr != nil { + // catch and log all returned errors so we see them in the + // log file (if there is one) + logger.Error(ctx, "command exit", slog.Error(retErr)) + } + }() + if logToFile { + // we need a way to ensure different ssh invocations don't clobber + // each other's logs. Date-time strings will likely have collisions + // in unit tests and/or scripts unless we extend precision out to + // sub-millisecond, which seems unwieldy. A simple 5-character random + // string will do it, since the operating system already tracks + // dates and times for file IO. + qual, err := cryptorand.String(5) + if err != nil { + return xerrors.Errorf("generate random qualifier: %w", err) + } + logPth := path.Join(logDir, fmt.Sprintf("coder-ssh-%s.log", qual)) + logFile, err := os.Create(logPth) + if err != nil { + return xerrors.Errorf("error opening %s for logging: %w", logPth, err) + } + logger = slog.Make(sloghuman.Sink(logFile)) + defer logFile.Close() + if r.verbose { + logger = logger.Leveled(slog.LevelDebug) + } + + // log HTTP requests + client.Logger = logger + } + workspace, workspaceAgent, err := getWorkspaceAndAgent(ctx, inv, client, codersdk.Me, inv.Args[0]) if err != nil { return err @@ -92,81 +132,24 @@ func (r *RootCmd) ssh() *clibase.Cmd { // We don't print the error because cliui.Agent does that for us. } - conn, err := client.DialWorkspaceAgent(ctx, workspaceAgent.ID, &codersdk.DialWorkspaceAgentOptions{}) + conn, err := client.DialWorkspaceAgent(ctx, workspaceAgent.ID, &codersdk.DialWorkspaceAgentOptions{ + Logger: logger, + }) if err != nil { - return err + return xerrors.Errorf("dial agent: %w", err) } defer conn.Close() conn.AwaitReachable(ctx) stopPolling := tryPollWorkspaceAutostop(ctx, client, workspace) defer stopPolling() - // Enure connection is closed if the context is canceled or - // the workspace reaches the stopped state. - // - // Watching the stopped state is a work-around for cases - // where the agent is not gracefully shut down and the - // connection is left open. If, for instance, the networking - // is stopped before the agent is shut down, the disconnect - // will usually not propagate. - // - // See: https://github.com/coder/coder/issues/6180 - watchAndClose := func(closer func() error) { - // Ensure session is ended on both context cancellation - // and workspace stop. - defer func() { - _ = closer() - }() - - startWatchLoop: - for { - // (Re)connect to the coder server and watch workspace events. - var wsWatch <-chan codersdk.Workspace - var err error - for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); { - wsWatch, err = client.WatchWorkspace(ctx, workspace.ID) - if err == nil { - break - } - if ctx.Err() != nil { - return - } - } - - for { - select { - case <-ctx.Done(): - return - case w, ok := <-wsWatch: - if !ok { - continue startWatchLoop - } - - // Transitioning to stop or delete could mean that - // the agent will still gracefully stop. If a new - // build is starting, there's no reason to wait for - // the agent, it should be long gone. - if workspace.LatestBuild.ID != w.LatestBuild.ID && w.LatestBuild.Transition == codersdk.WorkspaceTransitionStart { - return - } - // Note, we only react to the stopped state here because we - // want to give the agent a chance to gracefully shut down - // during "stopping". - if w.LatestBuild.Status == codersdk.WorkspaceStatusStopped { - return - } - } - } - } - } - if stdio { rawSSH, err := conn.SSH(ctx) if err != nil { - return err + return xerrors.Errorf("connect SSH: %w", err) } defer rawSSH.Close() - go watchAndClose(rawSSH.Close) + go watchAndClose(ctx, rawSSH.Close, logger, client, workspace) go func() { // Ensure stdout copy closes incase stdin is closed @@ -174,28 +157,46 @@ func (r *RootCmd) ssh() *clibase.Cmd { // this since OpenSSH should kill the proxy command. defer rawSSH.Close() - _, _ = io.Copy(rawSSH, inv.Stdin) + _, err := io.Copy(rawSSH, inv.Stdin) + if err != nil { + logger.Error(ctx, "copy stdin error", slog.Error(err)) + } else { + logger.Debug(ctx, "copy stdin complete") + } }() - _, _ = io.Copy(inv.Stdout, rawSSH) + _, err = io.Copy(inv.Stdout, rawSSH) + if err != nil { + logger.Error(ctx, "copy stdout error", slog.Error(err)) + } else { + logger.Debug(ctx, "copy stdout complete") + } return nil } sshClient, err := conn.SSHClient(ctx) if err != nil { - return err + return xerrors.Errorf("ssh client: %w", err) } defer sshClient.Close() sshSession, err := sshClient.NewSession() if err != nil { - return err + return xerrors.Errorf("ssh session: %w", err) } defer sshSession.Close() - go watchAndClose(func() error { - _ = sshSession.Close() - _ = sshClient.Close() - return nil - }) + go watchAndClose( + ctx, + func() error { + err := sshSession.Close() + logger.Debug(ctx, "session close", slog.Error(err)) + err = sshClient.Close() + logger.Debug(ctx, "client close", slog.Error(err)) + return nil + }, + logger, + client, + workspace, + ) if identityAgent == "" { identityAgent = os.Getenv("SSH_AUTH_SOCK") @@ -257,7 +258,7 @@ func (r *RootCmd) ssh() *clibase.Cmd { err = sshSession.RequestPty("xterm-256color", 128, 128, gossh.TerminalModes{}) if err != nil { - return err + return xerrors.Errorf("request pty: %w", err) } sshSession.Stdin = inv.Stdin @@ -266,7 +267,7 @@ func (r *RootCmd) ssh() *clibase.Cmd { err = sshSession.Shell() if err != nil { - return err + return xerrors.Errorf("start shell: %w", err) } // Put cancel at the top of the defer stack to initiate @@ -289,7 +290,7 @@ func (r *RootCmd) ssh() *clibase.Cmd { if errors.Is(err, &gossh.ExitMissingError{}) { return xerrors.New("SSH connection ended unexpectedly") } - return err + return xerrors.Errorf("session ended: %w", err) } return nil @@ -335,10 +336,90 @@ func (r *RootCmd) ssh() *clibase.Cmd { Description: "Specifies whether to wait for a workspace to become ready before logging in (only applicable when the login before ready option has not been enabled). Note that the workspace agent may still be in the process of executing the startup script and the workspace may be in an incomplete state.", Value: clibase.BoolOf(&noWait), }, + { + Flag: "log-dir", + Default: os.TempDir(), + Description: "Specify the location for the log files.", + Env: "CODER_SSH_LOG_DIR", + Value: clibase.StringOf(&logDir), + }, + { + Flag: "log-to-file", + FlagShorthand: "l", + Env: "CODER_SSH_LOG_TO_FILE", + Description: "Enable diagnostic logging to file.", + Value: clibase.BoolOf(&logToFile), + }, } return cmd } +// watchAndClose ensures closer is called if the context is canceled or +// the workspace reaches the stopped state. +// +// Watching the stopped state is a work-around for cases +// where the agent is not gracefully shut down and the +// connection is left open. If, for instance, the networking +// is stopped before the agent is shut down, the disconnect +// will usually not propagate. +// +// See: https://github.com/coder/coder/issues/6180 +func watchAndClose(ctx context.Context, closer func() error, logger slog.Logger, client *codersdk.Client, workspace codersdk.Workspace) { + // Ensure session is ended on both context cancellation + // and workspace stop. + defer func() { + err := closer() + if err != nil { + logger.Error(ctx, "error closing session", slog.Error(err)) + } + }() + +startWatchLoop: + for { + logger.Debug(ctx, "(re)connecting to the coder server to watch workspace events.") + var wsWatch <-chan codersdk.Workspace + var err error + for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); { + wsWatch, err = client.WatchWorkspace(ctx, workspace.ID) + if err == nil { + break + } + if ctx.Err() != nil { + logger.Info(ctx, "context expired", slog.Error(ctx.Err())) + return + } + } + + for { + select { + case <-ctx.Done(): + logger.Info(ctx, "context expired", slog.Error(ctx.Err())) + return + case w, ok := <-wsWatch: + if !ok { + continue startWatchLoop + } + + // Transitioning to stop or delete could mean that + // the agent will still gracefully stop. If a new + // build is starting, there's no reason to wait for + // the agent, it should be long gone. + if workspace.LatestBuild.ID != w.LatestBuild.ID && w.LatestBuild.Transition == codersdk.WorkspaceTransitionStart { + logger.Info(ctx, "new build started") + return + } + // Note, we only react to the stopped state here because we + // want to give the agent a chance to gracefully shut down + // during "stopping". + if w.LatestBuild.Status == codersdk.WorkspaceStatusStopped { + logger.Info(ctx, "workspace stopped") + return + } + } + } + } +} + // getWorkspaceAgent returns the workspace and agent selected using either the // `[.]` syntax via `in` or picks a random workspace and agent // if `shuffle` is true. diff --git a/cli/ssh_test.go b/cli/ssh_test.go index 01d81107ab7ce..88e64d6fe48d7 100644 --- a/cli/ssh_test.go +++ b/cli/ssh_test.go @@ -407,6 +407,50 @@ func TestSSH(t *testing.T) { pty.WriteLine("exit") <-cmdDone }) + + t.Run("FileLogging", func(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + + client, workspace, agentToken := setupWorkspaceForAgent(t, nil) + inv, root := clitest.New(t, "ssh", workspace.Name, "-l", "--log-dir", dir) + clitest.SetupConfig(t, client, root) + pty := ptytest.New(t).Attach(inv) + + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) + defer cancel() + + cmdDone := tGo(t, func() { + err := inv.WithContext(ctx).Run() + assert.NoError(t, err) + }) + pty.ExpectMatch("Waiting") + + agentClient := agentsdk.New(client.URL) + agentClient.SetSessionToken(agentToken) + agentCloser := agent.New(agent.Options{ + Client: agentClient, + Logger: slogtest.Make(t, nil).Named("agent"), + }) + defer func() { + _ = agentCloser.Close() + }() + + // Shells on Mac, Windows, and Linux all exit shells with the "exit" command. + pty.WriteLine("exit") + <-cmdDone + + entries, err := os.ReadDir(dir) + require.NoError(t, err) + for _, e := range entries { + t.Logf("logdir entry: %s", e.Name()) + if strings.HasPrefix(e.Name(), "coder-ssh") { + return + } + } + t.Fatal("failed to find ssh logfile") + }) } //nolint:paralleltest // This test uses t.Setenv, parent test MUST NOT be parallel. diff --git a/cli/testdata/coder_ssh_--help.golden b/cli/testdata/coder_ssh_--help.golden index 69214e74066d8..632003ffdbdb8 100644 --- a/cli/testdata/coder_ssh_--help.golden +++ b/cli/testdata/coder_ssh_--help.golden @@ -18,6 +18,12 @@ Start a shell into a workspace Specifies which identity agent to use (overrides $SSH_AUTH_SOCK), forward agent must also be enabled. + --log-dir string, $CODER_SSH_LOG_DIR (default: /tmp) + Specify the location for the log files. + + -l, --log-to-file bool, $CODER_SSH_LOG_TO_FILE + Enable diagnostic logging to file. + --no-wait bool, $CODER_SSH_NO_WAIT Specifies whether to wait for a workspace to become ready before logging in (only applicable when the login before ready option has not diff --git a/docs/cli/ssh.md b/docs/cli/ssh.md index 0015e69cc404f..2ec3ff5beb4dd 100644 --- a/docs/cli/ssh.md +++ b/docs/cli/ssh.md @@ -39,6 +39,25 @@ Specifies whether to forward the GPG agent. Unsupported on Windows workspaces, b Specifies which identity agent to use (overrides $SSH_AUTH_SOCK), forward agent must also be enabled. +### --log-dir + +| | | +| ----------- | ------------------------------- | +| Type | string | +| Environment | $CODER_SSH_LOG_DIR | +| Default | /tmp | + +Specify the location for the log files. + +### -l, --log-to-file + +| | | +| ----------- | ----------------------------------- | +| Type | bool | +| Environment | $CODER_SSH_LOG_TO_FILE | + +Enable diagnostic logging to file. + ### --no-wait | | |