From 0b8499defde283fd696b2c33fe048e0866911977 Mon Sep 17 00:00:00 2001
From: Asher
Date: Tue, 18 Jul 2023 14:24:30 -0800
Subject: [PATCH 01/28] Add screen backend for reconnecting ptys

The screen portion is a port from wsep. There is an interface that lets
you choose between screen and the previous method. By default it will
choose screen if it is installed, but this can be overridden (mostly for
tests).

The tests use a scanner instead of a reader now because the reader will
loop infinitely at the end of a stream.

Replace /bin/bash with bash since bash is not always in /bin.
---
 agent/agent.go                           | 209 ++--------------
 agent/agent_test.go                      | 149 ++++++++----
 agent/reconnectingpty/buffered.go        | 167 +++++++++++++
 agent/reconnectingpty/reconnectingpty.go | 298 +++++++++++++++++++++++
 agent/reconnectingpty/screen.go          | 283 +++++++++++++++++++++
 cli/exp_scaletest.go                     |   9 +-
 coderd/httpapi/queryparams.go            |   2 +-
 coderd/insights_test.go                  |   9 +-
 coderd/workspaceapps/apptest/apptest.go  | 253 ++++++++++---------
 coderd/workspaceapps/proxy.go            |   3 +-
 codersdk/workspaceagentconn.go           |  38 ++-
 codersdk/workspaceagents.go              |  12 +-
 dogfood/Dockerfile                       |   1 +
 flake.nix                                |   1 +
 scaletest/createworkspaces/run_test.go   |  14 +-
 scaletest/reconnectingpty/config_test.go |   9 +-
 scaletest/reconnectingpty/run.go         |  12 +-
 scaletest/reconnectingpty/run_test.go    |  24 +-
 scaletest/workspacetraffic/conn.go       |  11 +-
 site/src/api/typesGenerated.ts           |   8 +
 20 files changed, 1114 insertions(+), 398 deletions(-)
 create mode 100644 agent/reconnectingpty/buffered.go
 create mode 100644 agent/reconnectingpty/reconnectingpty.go
 create mode 100644 agent/reconnectingpty/screen.go

diff --git a/agent/agent.go b/agent/agent.go
index d736564056791..208dbb2b1d70f 100644
--- a/agent/agent.go
+++ b/agent/agent.go
@@ -21,7 +21,6 @@ import (
 	"sync"
 	"time"
 
-	"github.com/armon/circbuf"
 	"github.com/google/uuid"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/spf13/afero"
@@ -35,12 +34,12 @@ import (
 
 	"cdr.dev/slog"
 	"github.com/coder/coder/agent/agentssh"
+	"github.com/coder/coder/agent/reconnectingpty"
 	"github.com/coder/coder/buildinfo"
 	"github.com/coder/coder/coderd/database"
 	"github.com/coder/coder/coderd/gitauth"
 	"github.com/coder/coder/codersdk"
 	"github.com/coder/coder/codersdk/agentsdk"
-	"github.com/coder/coder/pty"
 	"github.com/coder/coder/tailnet"
 	"github.com/coder/retry"
 )
@@ -89,9 +88,6 @@ type Agent interface {
 }
 
 func New(options Options) Agent {
-	if options.ReconnectingPTYTimeout == 0 {
-		options.ReconnectingPTYTimeout = 5 * time.Minute
-	}
 	if options.Filesystem == nil {
 		options.Filesystem = afero.NewOsFs()
 	}
@@ -1078,22 +1074,22 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
 			// If the agent is closed, we don't want to
 			// log this as an error since it's expected.
 			if closed {
-				logger.Debug(ctx, "reconnecting PTY failed with session error (agent closed)", slog.Error(err))
+				logger.Debug(ctx, "reconnecting pty failed with attach error (agent closed)", slog.Error(err))
 			} else {
-				logger.Error(ctx, "reconnecting PTY failed with session error", slog.Error(err))
+				logger.Error(ctx, "reconnecting pty failed with attach error", slog.Error(err))
 			}
 		}
-		logger.Debug(ctx, "session closed")
+		logger.Debug(ctx, "reconnecting pty connection closed")
 	}()
 
-	var rpty *reconnectingPTY
-	sendConnected := make(chan *reconnectingPTY, 1)
+	var rpty *reconnectingpty.ReconnectingPTY
+	sendConnected := make(chan *reconnectingpty.ReconnectingPTY, 1)
 	// On store, reserve this ID to prevent multiple concurrent new connections.
waitReady, ok := a.reconnectingPTYs.LoadOrStore(msg.ID, sendConnected) if ok { close(sendConnected) // Unused. - logger.Debug(ctx, "connecting to existing session") - c, ok := waitReady.(chan *reconnectingPTY) + logger.Debug(ctx, "connecting to existing reconnecting pty") + c, ok := waitReady.(chan *reconnectingpty.ReconnectingPTY) if !ok { return xerrors.Errorf("found invalid type in reconnecting pty map: %T", waitReady) } @@ -1103,7 +1099,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m } c <- rpty // Put it back for the next reconnect. } else { - logger.Debug(ctx, "creating new session") + logger.Debug(ctx, "creating new reconnecting pty", slog.F("backend", msg.BackendType)) connected := false defer func() { @@ -1119,169 +1115,25 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m a.metrics.reconnectingPTYErrors.WithLabelValues("create_command").Add(1) return xerrors.Errorf("create command: %w", err) } - cmd.Env = append(cmd.Env, "TERM=xterm-256color") - - // Default to buffer 64KiB. - circularBuffer, err := circbuf.NewBuffer(64 << 10) - if err != nil { - return xerrors.Errorf("create circular buffer: %w", err) - } - ptty, process, err := pty.Start(cmd) - if err != nil { - a.metrics.reconnectingPTYErrors.WithLabelValues("start_command").Add(1) - return xerrors.Errorf("start command: %w", err) - } + rpty = reconnectingpty.New(ctx, cmd, &reconnectingpty.Options{ + BackendType: msg.BackendType, + Timeout: a.reconnectingPTYTimeout, + Metrics: a.metrics.reconnectingPTYErrors, + }, logger) - ctx, cancel := context.WithCancel(ctx) - rpty = &reconnectingPTY{ - activeConns: map[string]net.Conn{ - // We have to put the connection in the map instantly otherwise - // the connection won't be closed if the process instantly dies. - connectionID: conn, - }, - ptty: ptty, - // Timeouts created with an after func can be reset! - timeout: time.AfterFunc(a.reconnectingPTYTimeout, cancel), - circularBuffer: circularBuffer, - } - // We don't need to separately monitor for the process exiting. - // When it exits, our ptty.OutputReader() will return EOF after - // reading all process output. if err = a.trackConnGoroutine(func() { - buffer := make([]byte, 1024) - for { - read, err := rpty.ptty.OutputReader().Read(buffer) - if err != nil { - // When the PTY is closed, this is triggered. - // Error is typically a benign EOF, so only log for debugging. - if errors.Is(err, io.EOF) { - logger.Debug(ctx, "unable to read pty output, command might have exited", slog.Error(err)) - } else { - logger.Warn(ctx, "unable to read pty output, command might have exited", slog.Error(err)) - a.metrics.reconnectingPTYErrors.WithLabelValues("output_reader").Add(1) - } - break - } - part := buffer[:read] - rpty.circularBufferMutex.Lock() - _, err = rpty.circularBuffer.Write(part) - rpty.circularBufferMutex.Unlock() - if err != nil { - logger.Error(ctx, "write to circular buffer", slog.Error(err)) - break - } - rpty.activeConnsMutex.Lock() - for cid, conn := range rpty.activeConns { - _, err = conn.Write(part) - if err != nil { - logger.Warn(ctx, - "error writing to active conn", - slog.F("other_conn_id", cid), - slog.Error(err), - ) - a.metrics.reconnectingPTYErrors.WithLabelValues("write").Add(1) - } - } - rpty.activeConnsMutex.Unlock() - } - - // Cleanup the process, PTY, and delete it's - // ID from memory. 
- _ = process.Kill() - rpty.Close() + rpty.Wait() a.reconnectingPTYs.Delete(msg.ID) }); err != nil { - _ = process.Kill() - _ = ptty.Close() + rpty.Close(err.Error()) return xerrors.Errorf("start routine: %w", err) } + connected = true sendConnected <- rpty } - // Resize the PTY to initial height + width. - err := rpty.ptty.Resize(msg.Height, msg.Width) - if err != nil { - // We can continue after this, it's not fatal! - logger.Error(ctx, "reconnecting PTY initial resize failed, but will continue", slog.Error(err)) - a.metrics.reconnectingPTYErrors.WithLabelValues("resize").Add(1) - } - // Write any previously stored data for the TTY. - rpty.circularBufferMutex.RLock() - prevBuf := slices.Clone(rpty.circularBuffer.Bytes()) - rpty.circularBufferMutex.RUnlock() - // Note that there is a small race here between writing buffered - // data and storing conn in activeConns. This is likely a very minor - // edge case, but we should look into ways to avoid it. Holding - // activeConnsMutex would be one option, but holding this mutex - // while also holding circularBufferMutex seems dangerous. - _, err = conn.Write(prevBuf) - if err != nil { - a.metrics.reconnectingPTYErrors.WithLabelValues("write").Add(1) - return xerrors.Errorf("write buffer to conn: %w", err) - } - // Multiple connections to the same TTY are permitted. - // This could easily be used for terminal sharing, but - // we do it because it's a nice user experience to - // copy/paste a terminal URL and have it _just work_. - rpty.activeConnsMutex.Lock() - rpty.activeConns[connectionID] = conn - rpty.activeConnsMutex.Unlock() - // Resetting this timeout prevents the PTY from exiting. - rpty.timeout.Reset(a.reconnectingPTYTimeout) - - ctx, cancelFunc := context.WithCancel(ctx) - defer cancelFunc() - heartbeat := time.NewTicker(a.reconnectingPTYTimeout / 2) - defer heartbeat.Stop() - go func() { - // Keep updating the activity while this - // connection is alive! - for { - select { - case <-ctx.Done(): - return - case <-heartbeat.C: - } - rpty.timeout.Reset(a.reconnectingPTYTimeout) - } - }() - defer func() { - // After this connection ends, remove it from - // the PTYs active connections. If it isn't - // removed, all PTY data will be sent to it. - rpty.activeConnsMutex.Lock() - delete(rpty.activeConns, connectionID) - rpty.activeConnsMutex.Unlock() - }() - decoder := json.NewDecoder(conn) - var req codersdk.ReconnectingPTYRequest - for { - err = decoder.Decode(&req) - if xerrors.Is(err, io.EOF) { - return nil - } - if err != nil { - logger.Warn(ctx, "reconnecting PTY failed with read error", slog.Error(err)) - return nil - } - _, err = rpty.ptty.InputWriter().Write([]byte(req.Data)) - if err != nil { - logger.Warn(ctx, "reconnecting PTY failed with write error", slog.Error(err)) - a.metrics.reconnectingPTYErrors.WithLabelValues("input_writer").Add(1) - return nil - } - // Check if a resize needs to happen! - if req.Height == 0 || req.Width == 0 { - continue - } - err = rpty.ptty.Resize(req.Height, req.Width) - if err != nil { - // We can continue after this, it's not fatal! - logger.Error(ctx, "reconnecting PTY resize failed, but will continue", slog.Error(err)) - a.metrics.reconnectingPTYErrors.WithLabelValues("resize").Add(1) - } - } + return rpty.Attach(ctx, connectionID, conn, msg.Height, msg.Width, logger) } // startReportingConnectionStats runs the connection stats reporting goroutine. 
@@ -1499,31 +1351,6 @@ lifecycleWaitLoop: return nil } -type reconnectingPTY struct { - activeConnsMutex sync.Mutex - activeConns map[string]net.Conn - - circularBuffer *circbuf.Buffer - circularBufferMutex sync.RWMutex - timeout *time.Timer - ptty pty.PTYCmd -} - -// Close ends all connections to the reconnecting -// PTY and clear the circular buffer. -func (r *reconnectingPTY) Close() { - r.activeConnsMutex.Lock() - defer r.activeConnsMutex.Unlock() - for _, conn := range r.activeConns { - _ = conn.Close() - } - _ = r.ptty.Close() - r.circularBufferMutex.Lock() - r.circularBuffer.Reset() - r.circularBufferMutex.Unlock() - r.timeout.Stop() -} - // userHomeDir returns the home directory of the current user, giving // priority to the $HOME environment variable. func userHomeDir() (string, error) { diff --git a/agent/agent_test.go b/agent/agent_test.go index d897951496896..16d4b4d7094c6 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -12,6 +12,7 @@ import ( "net/http/httptest" "net/netip" "os" + "os/exec" "os/user" "path" "path/filepath" @@ -102,7 +103,7 @@ func TestAgent_Stats_ReconnectingPTY(t *testing.T) { //nolint:dogsled conn, _, stats, _, _ := setupAgent(t, agentsdk.Manifest{}, 0) - ptyConn, err := conn.ReconnectingPTY(ctx, uuid.New(), 128, 128, "/bin/bash") + ptyConn, err := conn.ReconnectingPTY(ctx, uuid.New(), 128, 128, "bash", codersdk.ReconnectingPTYBackendTypeBuffered) require.NoError(t, err) defer ptyConn.Close() @@ -1587,6 +1588,10 @@ func TestAgent_Startup(t *testing.T) { }) } +const ansi = "[\u001B\u009B][[\\]()#;?]*(?:(?:(?:[a-zA-Z\\d]*(?:;[a-zA-Z\\d]*)*)?\u0007)|(?:(?:\\d{1,4}(?:;\\d{0,4})*)?[\\dA-PRZcf-ntqry=><~]))" + +var re = regexp.MustCompile(ansi) + func TestAgent_ReconnectingPTY(t *testing.T) { t.Parallel() if runtime.GOOS == "windows" { @@ -1596,61 +1601,117 @@ func TestAgent_ReconnectingPTY(t *testing.T) { t.Skip("ConPTY appears to be inconsistent on Windows.") } - ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) - defer cancel() + backends := []codersdk.ReconnectingPTYBackendType{ + codersdk.ReconnectingPTYBackendTypeBuffered, + codersdk.ReconnectingPTYBackendTypeScreen, + } - //nolint:dogsled - conn, _, _, _, _ := setupAgent(t, agentsdk.Manifest{}, 0) - id := uuid.New() - netConn, err := conn.ReconnectingPTY(ctx, id, 100, 100, "/bin/bash") - require.NoError(t, err) - defer netConn.Close() + for _, backendType := range backends { + backendType := backendType + t.Run(string(backendType), func(t *testing.T) { + t.Parallel() + if runtime.GOOS == "darwin" { + t.Skip("`screen` is flaky on darwin") + } else if backendType == codersdk.ReconnectingPTYBackendTypeScreen { + _, err := exec.LookPath("screen") + if err != nil { + t.Skip("`screen` not found") + } + } - bufRead := bufio.NewReader(netConn) + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) + defer cancel() - // Brief pause to reduce the likelihood that we send keystrokes while - // the shell is simultaneously sending a prompt. 
- time.Sleep(100 * time.Millisecond) + //nolint:dogsled + conn, _, _, _, _ := setupAgent(t, agentsdk.Manifest{}, 0) + id := uuid.New() + netConn1, err := conn.ReconnectingPTY(ctx, id, 100, 100, "bash", backendType) + require.NoError(t, err) + defer netConn1.Close() - data, err := json.Marshal(codersdk.ReconnectingPTYRequest{ - Data: "echo test\r\n", - }) - require.NoError(t, err) - _, err = netConn.Write(data) - require.NoError(t, err) + scanner1 := bufio.NewScanner(netConn1) - expectLine := func(matcher func(string) bool) { - for { - line, err := bufRead.ReadString('\n') + // A second simultaneous connection. + netConn2, err := conn.ReconnectingPTY(ctx, id, 100, 100, "bash", backendType) + require.NoError(t, err) + defer netConn2.Close() + scanner2 := bufio.NewScanner(netConn2) + + // Brief pause to reduce the likelihood that we send keystrokes while + // the shell is simultaneously sending a prompt. + time.Sleep(100 * time.Millisecond) + + data, err := json.Marshal(codersdk.ReconnectingPTYRequest{ + Data: "echo test\r\n", + }) require.NoError(t, err) - if matcher(line) { - break + _, err = netConn1.Write(data) + require.NoError(t, err) + + hasLine := func(scanner *bufio.Scanner, matcher func(string) bool) bool { + for scanner.Scan() { + line := scanner.Text() + t.Logf("bash tty stdout = %s", re.ReplaceAllString(line, "")) + if matcher(line) { + return true + } + } + return false } - } - } - matchEchoCommand := func(line string) bool { - return strings.Contains(line, "echo test") - } - matchEchoOutput := func(line string) bool { - return strings.Contains(line, "test") && !strings.Contains(line, "echo") - } + matchEchoCommand := func(line string) bool { + return strings.Contains(line, "echo test") + } + matchEchoOutput := func(line string) bool { + return strings.Contains(line, "test") && !strings.Contains(line, "echo") + } + matchExitCommand := func(line string) bool { + return strings.Contains(line, "exit") + } + matchExitOutput := func(line string) bool { + return strings.Contains(line, "exit") || strings.Contains(line, "logout") + } - // Once for typing the command... - expectLine(matchEchoCommand) - // And another time for the actual output. - expectLine(matchEchoOutput) + // Once for typing the command... + require.True(t, hasLine(scanner1, matchEchoCommand), "find echo command") + // And another time for the actual output. + require.True(t, hasLine(scanner1, matchEchoOutput), "find echo output") - _ = netConn.Close() - netConn, err = conn.ReconnectingPTY(ctx, id, 100, 100, "/bin/bash") - require.NoError(t, err) - defer netConn.Close() + // Same for the other connection. + require.True(t, hasLine(scanner2, matchEchoCommand), "find echo command") + require.True(t, hasLine(scanner2, matchEchoOutput), "find echo output") + + _ = netConn1.Close() + _ = netConn2.Close() + netConn3, err := conn.ReconnectingPTY(ctx, id, 100, 100, "bash", backendType) + require.NoError(t, err) + defer netConn3.Close() + + scanner3 := bufio.NewScanner(netConn3) + + // Same output again! + require.True(t, hasLine(scanner3, matchEchoCommand), "find echo command") + require.True(t, hasLine(scanner3, matchEchoOutput), "find echo output") + + // Exit should cause the connection to close. + data, err = json.Marshal(codersdk.ReconnectingPTYRequest{ + Data: "exit\r\n", + }) + require.NoError(t, err) + _, err = netConn3.Write(data) + require.NoError(t, err) - bufRead = bufio.NewReader(netConn) + // Once for the input and again for the output. 
+ require.True(t, hasLine(scanner3, matchExitCommand), "find exit command") + require.True(t, hasLine(scanner3, matchExitOutput), "find exit output") - // Same output again! - expectLine(matchEchoCommand) - expectLine(matchEchoOutput) + // Wait for the connection to close. + for scanner3.Scan() { + line := scanner3.Text() + t.Logf("bash tty stdout = %s", re.ReplaceAllString(line, "")) + } + }) + } } func TestAgent_Dial(t *testing.T) { diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go new file mode 100644 index 0000000000000..4b3d0b1d70339 --- /dev/null +++ b/agent/reconnectingpty/buffered.go @@ -0,0 +1,167 @@ +package reconnectingpty + +import ( + "context" + "errors" + "io" + "net" + "sync" + + "github.com/armon/circbuf" + "github.com/prometheus/client_golang/prometheus" + "golang.org/x/exp/slices" + "golang.org/x/xerrors" + + "cdr.dev/slog" + + "github.com/coder/coder/pty" +) + +// bufferedBackend provides a reconnectable PTY by using a ring buffer to store +// scrollback. +type bufferedBackend struct { + command *pty.Cmd + + // mutex protects writing to the circular buffer and connections. + mutex sync.RWMutex + + activeConns map[string]net.Conn + circularBuffer *circbuf.Buffer + + ptty pty.PTYCmd + process pty.Process + + metrics *prometheus.CounterVec + + // closeSession is used to close the session when the process dies. + closeSession func(reason string) +} + +// start initializes the backend and starts the pty. It must be called only +// once. If the context ends the process will be killed. +func (b *bufferedBackend) start(ctx context.Context, logger slog.Logger) error { + b.activeConns = map[string]net.Conn{} + + // Default to buffer 64KiB. + circularBuffer, err := circbuf.NewBuffer(64 << 10) + if err != nil { + return xerrors.Errorf("create circular buffer: %w", err) + } + b.circularBuffer = circularBuffer + + // pty.Cmd duplicates Path as the first argument so remove it. + cmd := pty.CommandContext(ctx, b.command.Path, b.command.Args[1:]...) + cmd.Env = append(b.command.Env, "TERM=xterm-256color") + cmd.Dir = b.command.Dir + ptty, process, err := pty.Start(cmd) + if err != nil { + return err + } + b.ptty = ptty + b.process = process + + // Multiplex the output onto the circular buffer and each active connection. + // + // We do not need to separately monitor for the process exiting. When it + // exits, our ptty.OutputReader() will return EOF after reading all process + // output. + go func() { + buffer := make([]byte, 1024) + for { + read, err := ptty.OutputReader().Read(buffer) + if err != nil { + // When the PTY is closed, this is triggered. + // Error is typically a benign EOF, so only log for debugging. + if errors.Is(err, io.EOF) { + logger.Debug(ctx, "unable to read pty output, command might have exited", slog.Error(err)) + } else { + logger.Warn(ctx, "unable to read pty output, command might have exited", slog.Error(err)) + b.metrics.WithLabelValues("output_reader").Add(1) + } + // Could have been killed externally or failed to start at all (command + // not found for example). 
+ b.closeSession("unable to read pty output, command might have exited") + break + } + part := buffer[:read] + b.mutex.Lock() + _, err = b.circularBuffer.Write(part) + if err != nil { + logger.Error(ctx, "write to circular buffer", slog.Error(err)) + b.metrics.WithLabelValues("write_buffer").Add(1) + } + for cid, conn := range b.activeConns { + _, err = conn.Write(part) + if err != nil { + logger.Warn(ctx, + "error writing to active connection", + slog.F("other_conn_id", cid), + slog.Error(err), + ) + b.metrics.WithLabelValues("write").Add(1) + } + } + b.mutex.Unlock() + } + }() + + return nil +} + +// attach attaches to the pty and replays the buffer. If the context closes it +// will detach the connection but leave the process up. A connection ID is +// required so that logs in the pty goroutine can reference the same ID +// reference in logs output by each individual connection when acting on those +// connections. +func (b *bufferedBackend) attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) (pty.PTYCmd, error) { + // Resize the PTY to initial height + width. + err := b.ptty.Resize(height, width) + if err != nil { + // We can continue after this, it's not fatal! + logger.Warn(ctx, "reconnecting PTY initial resize failed, but will continue", slog.Error(err)) + b.metrics.WithLabelValues("resize").Add(1) + } + + // Write any previously stored data for the TTY and store the connection for + // future writes. + b.mutex.Lock() + defer b.mutex.Unlock() + prevBuf := slices.Clone(b.circularBuffer.Bytes()) + _, err = conn.Write(prevBuf) + if err != nil { + b.metrics.WithLabelValues("write").Add(1) + return nil, xerrors.Errorf("write buffer to conn: %w", err) + } + b.activeConns[connID] = conn + + // Detach the connection when it or the reconnecting pty closes. + go func() { + <-ctx.Done() + b.mutex.Lock() + defer b.mutex.Unlock() + delete(b.activeConns, connID) + }() + + return b.ptty, nil +} + +// close closes all connections to the reconnecting PTY, clears the circular +// buffer, and kills the process. +func (b *bufferedBackend) close(ctx context.Context, logger slog.Logger) error { + var err error + b.mutex.Lock() + defer b.mutex.Unlock() + b.circularBuffer.Reset() + for _, conn := range b.activeConns { + err = errors.Join(err, conn.Close()) + } + pttyErr := b.ptty.Close() + if pttyErr != nil { + logger.Debug(ctx, "closed ptty with error", slog.Error(pttyErr)) + } + procErr := b.process.Kill() + if procErr != nil { + logger.Debug(ctx, "killed process with error", slog.Error(procErr)) + } + return err +} diff --git a/agent/reconnectingpty/reconnectingpty.go b/agent/reconnectingpty/reconnectingpty.go new file mode 100644 index 0000000000000..2e6f4a4b73a5e --- /dev/null +++ b/agent/reconnectingpty/reconnectingpty.go @@ -0,0 +1,298 @@ +package reconnectingpty + +import ( + "context" + "encoding/json" + "io" + "net" + "os/exec" + "runtime" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + "golang.org/x/xerrors" + + "cdr.dev/slog" + + "github.com/coder/coder/codersdk" + "github.com/coder/coder/pty" +) + +const attachTimeout = 30 * time.Second + +// Options allows configuring the reconnecting pty. +type Options struct { + // Timeout describes how long to keep the pty alive without any connections. + // Once elapsed the pty will be killed. + Timeout time.Duration + // Metrics tracks various error counters. + Metrics *prometheus.CounterVec + // BackendType indicates which backend to use for reconnections. 
+ BackendType codersdk.ReconnectingPTYBackendType +} + +// State represents the current state of the reconnecting pty. States are +// sequential and will only move forward. +type State int + +const ( + // StateStarting is the default/start state. Attaching will block until the + // reconnecting pty becomes ready. + StateStarting = iota + // StateReady means the reconnecting pty is ready to be attached. + StateReady + // StateClosing means the reconnecting pty has begun closing. The underlying + // process may still be exiting. Attaching will result in an error. + StateClosing + // StateDone means the reconnecting pty has completely shut down and the + // process has exited. Attaching will result in an error. + StateDone +) + +type backend interface { + start(ctx context.Context, logger slog.Logger) error + attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) (pty.PTYCmd, error) + close(ctx context.Context, logger slog.Logger) error +} + +// ReconnectingPTY is a pty that can be reconnected within a timeout and to +// simultaneous connections. +type ReconnectingPTY struct { + // The reconnecting pty can be backed by screen if installed or a (buggy) + // buffer replay fallback. + backend backend + // cond broadcasts state changes and any accompanying errors. + cond *sync.Cond + // error is used when a new connection attaches for cases where the session + // failed to start or has since been closed. It is not safe to access outside + // of cond.L. + error error + // options holds options for configuring the reconnecting pty. + options *Options + // state holds the current reconnecting pty state. It is not safe to access + // this outside of cond.L. + state State + // timer will close the reconnecting pty when it expires. The timer will be + // reset as long as there are active connections. + timer *time.Timer +} + +// New sets up a new reconnecting pty that wraps the provided command. Any +// errors with starting are returned on Attach(). The reconnecting pty will +// close itself (and all connections to it) if nothing is attached for the +// duration of the timeout, if the context ends, or the process exits (buffered +// backend only). +func New(ctx context.Context, cmd *pty.Cmd, options *Options, logger slog.Logger) *ReconnectingPTY { + if options.Timeout == 0 { + options.Timeout = 5 * time.Minute + } + // Screen seems flaky on Darwin. Locally the tests pass 100% of the time (100 + // runs) but in CI screen often incorrectly claims the session name does not + // exist even though screen -list shows it. + if runtime.GOOS == "darwin" { + options.BackendType = codersdk.ReconnectingPTYBackendTypeBuffered + } else if options.BackendType == "" || options.BackendType == codersdk.ReconnectingPTYBackendTypeAuto { + _, err := exec.LookPath("screen") + if err == nil { + options.BackendType = codersdk.ReconnectingPTYBackendTypeScreen + } else { + options.BackendType = codersdk.ReconnectingPTYBackendTypeBuffered + } + logger.Debug(ctx, "auto backend selection", slog.F("backend", options.BackendType)) + } + rpty := &ReconnectingPTY{ + cond: sync.NewCond(&sync.Mutex{}), + options: options, + state: StateStarting, + } + switch options.BackendType { + case codersdk.ReconnectingPTYBackendTypeScreen: + // The screen backend is not passed closeSession because we have no way of + // knowing when the screen daemon dies externally anyway. 
The consequence is + // that we might leave reconnecting ptys in memory around longer than they + // need to be but they will eventually clean up with the timer or context, + // or the next attach will respawn the screen daemon which is fine too. + rpty.backend = &screenBackend{ + command: cmd, + metrics: rpty.options.Metrics, + } + default: + rpty.backend = &bufferedBackend{ + command: cmd, + metrics: rpty.options.Metrics, + closeSession: func(reason string) { + rpty.Close(reason) + }, + } + } + go rpty.lifecycle(ctx, logger) + return rpty +} + +// Attach attaches the provided connection to the pty, waits for the attach to +// complete, then pipes the pty and the connection and blocks until EOF. The +// connection is expected to send JSON-encoded messages and accept raw output +// from the ptty. If the context ends or the process dies the connection will +// be detached and closed. +func (rpty *ReconnectingPTY) Attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) error { + // This will kill the heartbeat and detach the connection from the backend + // once we hit EOF on the connection (or an error). + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + state, err := rpty.waitForStateOrContext(ctx, StateReady) + if state != StateReady { + return xerrors.Errorf("reconnecting pty ready wait: %w", err) + } + + go rpty.heartbeat(ctx) + + ptty, err := rpty.backend.attach(ctx, connID, conn, height, width, logger) + if err != nil { + return xerrors.Errorf("reconnecting pty attach: %w", err) + } + + decoder := json.NewDecoder(conn) + var req codersdk.ReconnectingPTYRequest + for { + err = decoder.Decode(&req) + if xerrors.Is(err, io.EOF) { + return nil + } + if err != nil { + logger.Warn(ctx, "reconnecting pty failed with read error", slog.Error(err)) + return nil + } + _, err = ptty.InputWriter().Write([]byte(req.Data)) + if err != nil { + logger.Warn(ctx, "reconnecting pty failed with write error", slog.Error(err)) + rpty.options.Metrics.WithLabelValues("input_writer").Add(1) + return nil + } + // Check if a resize needs to happen! + if req.Height == 0 || req.Width == 0 { + continue + } + err = ptty.Resize(req.Height, req.Width) + if err != nil { + // We can continue after this, it's not fatal! + logger.Warn(ctx, "reconnecting pty resize failed, but will continue", slog.Error(err)) + rpty.options.Metrics.WithLabelValues("resize").Add(1) + } + } +} + +// Wait waits for the reconnecting pty to close. The underlying process might +// still be exiting. +func (rpty *ReconnectingPTY) Wait() { + _, _ = rpty.waitForState(StateClosing) +} + +// Close kills the reconnecting pty process. +func (rpty *ReconnectingPTY) Close(reason string) { + // The closing state change will be handled by the lifecycle. + rpty.setState(StateClosing, xerrors.Errorf("reconnecting pty closing: %s", reason)) +} + +// lifecycle manages the lifecycle of the reconnecting pty. If the context ends +// the reconnecting pty will be closed. +func (rpty *ReconnectingPTY) lifecycle(ctx context.Context, logger slog.Logger) { + err := rpty.backend.start(ctx, logger) + if err != nil { + rpty.setState(StateDone, xerrors.Errorf("reconnecting pty start: %w", err)) + return + } + + // The initial timeout for attaching will probably be far shorter than the + // reconnect timeout in most cases; in tests it might be longer. It should be + // at least long enough for the first screen attach to be able to start up the + // daemon. 
+ rpty.timer = time.AfterFunc(attachTimeout, func() { + rpty.Close("reconnecting pty timeout") + }) + + logger.Debug(ctx, "reconnecting pty ready") + rpty.setState(StateReady, nil) + + state, reasonErr := rpty.waitForStateOrContext(ctx, StateClosing) + if state < StateClosing { + // If we have not closed yet then the context is what unblocked us (which + // means the agent is shutting down) so move into the closing phase. + rpty.setState(StateClosing, xerrors.Errorf("reconnecting pty closing: %w", reasonErr)) + } + rpty.timer.Stop() + closeErr := rpty.backend.close(ctx, logger) + if closeErr != nil { + logger.Error(ctx, "closed reconnecting pty", slog.Error(closeErr)) + } else { + logger.Debug(ctx, "closed reconnecting pty") + } + rpty.setState(StateDone, xerrors.Errorf("reconnecting pty closed: %w", reasonErr)) +} + +// heartbeat keeps the pty alive while the provided context is not done. +func (rpty *ReconnectingPTY) heartbeat(ctx context.Context) { + // We just connected so reset the timer now in case it is near the end. + rpty.timer.Reset(rpty.options.Timeout) + + // Reset when the connection closes to ensure the pty stays up for the full + // timeout. + defer rpty.timer.Reset(rpty.options.Timeout) + + heartbeat := time.NewTicker(rpty.options.Timeout / 2) + defer heartbeat.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-heartbeat.C: + rpty.timer.Reset(rpty.options.Timeout) + } + } +} + +// setState sets and broadcasts the provided state if it is greater than the +// current state and the error if one has not already been set. +func (rpty *ReconnectingPTY) setState(state State, err error) { + rpty.cond.L.Lock() + defer rpty.cond.L.Unlock() + // Cannot regress states. For example, trying to close after the process is + // done should leave us in the done state and not the closing state. + if state <= rpty.state { + return + } + rpty.error = err + rpty.state = state + rpty.cond.Broadcast() +} + +// waitForState blocks until the state or a greater one is reached. +func (rpty *ReconnectingPTY) waitForState(state State) (State, error) { + rpty.cond.L.Lock() + defer rpty.cond.L.Unlock() + for state > rpty.state { + rpty.cond.Wait() + } + return rpty.state, rpty.error +} + +// waitForStateOrContext blocks until the state or a greater one is reached or +// the provided context ends. +func (rpty *ReconnectingPTY) waitForStateOrContext(ctx context.Context, state State) (State, error) { + go func() { + // Wake up when the context ends. + defer rpty.cond.Broadcast() + <-ctx.Done() + }() + rpty.cond.L.Lock() + defer rpty.cond.L.Unlock() + for ctx.Err() == nil && state > rpty.state { + rpty.cond.Wait() + } + if ctx.Err() != nil { + return rpty.state, ctx.Err() + } + return rpty.state, rpty.error +} diff --git a/agent/reconnectingpty/screen.go b/agent/reconnectingpty/screen.go new file mode 100644 index 0000000000000..21b630ef4e3b2 --- /dev/null +++ b/agent/reconnectingpty/screen.go @@ -0,0 +1,283 @@ +package reconnectingpty + +import ( + "bytes" + "context" + "crypto/rand" + "encoding/hex" + "errors" + "io" + "net" + "os" + "os/exec" + "path/filepath" + "strings" + "sync" + "time" + + "github.com/gliderlabs/ssh" + "github.com/prometheus/client_golang/prometheus" + "golang.org/x/xerrors" + + "cdr.dev/slog" + "github.com/coder/coder/pty" +) + +// screenBackend provides a reconnectable PTY via `screen`. +type screenBackend struct { + command *pty.Cmd + + // id holds the id of the session for both creating and attaching. 
This will + // be generated uniquely for each session because without control of the + // screen daemon we do not have its PID and without the PID screen will do + // partial matching. Enforcing a unique ID should guarantee we match on the + // right session. + id string + + // mutex prevents concurrent attaches to the session. Screen will happily + // spawn two separate sessions with the same name if multiple attaches happen + // in a close enough interval. We are not able to control the screen daemon + // ourselves to prevent this because the daemon will spawn with a hardcoded + // 24x80 size which results in confusing padding above the prompt once the + // attach comes in and resizes. + mutex sync.Mutex + + configFile string + + metrics *prometheus.CounterVec +} + +// start writes config settings and creates the socket directory. It must only +// be called once. If we could, we would want to spawn the daemon here and +// attach each connection to it but since doing that spawns the daemon with a +// hardcoded 24x80 size it is not a very good user experience. Instead we will +// let the attach command spawn the daemon on its own which causes it to spawn +// with the specified size. +func (b *screenBackend) start(_ context.Context, _ slog.Logger) error { + // Socket paths are limited to around 100 characters on Linux and macOS which + // depending on the temporary directory can be a problem. To give more leeway + // use a short ID. + buf := make([]byte, 4) + _, err := rand.Read(buf) + if err != nil { + return xerrors.Errorf("generate screen id: %w", err) + } + b.id = hex.EncodeToString(buf) + + settings := []string{ + // Tell screen not to handle motion for xterm* terminals which allows + // scrolling the terminal via the mouse wheel or scroll bar (by default + // screen uses it to cycle through the command history). There does not + // seem to be a way to make screen itself scroll on mouse wheel. tmux can + // do it but then there is no scroll bar and it kicks you into copy mode + // where keys stop working until you exit copy mode which seems like it + // could be confusing. + "termcapinfo xterm* ti@:te@", + // Enable alternate screen emulation otherwise applications get rendered in + // the current window which wipes out visible output resulting in missing + // output when scrolling back with the mouse wheel (copy mode still works + // since that is screen itself scrolling). + "altscreen on", + // Remap the control key to C-s since C-a may be used in applications. C-s + // is chosen because it cannot actually be used because by default it will + // pause and C-q to resume will just kill the browser window. We may not + // want people using the control key anyway since it will not be obvious + // they are in screen and doing things like switching windows makes mouse + // wheel scroll wonky due to the terminal doing the scrolling rather than + // screen itself (but again copy mode will work just fine). + "escape ^Ss", + } + + b.configFile = filepath.Join(os.TempDir(), "coder-screen", "config") + err = os.MkdirAll(filepath.Dir(b.configFile), 0o700) + if err != nil { + return err + } + + return os.WriteFile(b.configFile, []byte(strings.Join(settings, "\n")), 0o600) +} + +// attach attaches to the screen session by ID (which will spawn the daemon if +// necessary) and hooks up the output to the connection. If the context ends it +// will kill the connection's process (not the daemon), detaching it. 
+func (b *screenBackend) attach(ctx context.Context, _ string, conn net.Conn, height, width uint16, logger slog.Logger) (pty.PTYCmd, error) { + // Ensure another attach does not come in and spawn a duplicate session. + b.mutex.Lock() + defer b.mutex.Unlock() + + logger.Debug(ctx, "spawning screen client", slog.F("screen_id", b.id)) + + // Wrap the command with screen and tie it to the connection's context. + cmd := pty.CommandContext(ctx, "screen", append([]string{ + // -S is for setting the session's name. + "-S", b.id, + // -x allows attaching to an already attached session. + // -RR reattaches to the daemon or creates the session daemon if missing. + // -q disables the "New screen..." message that appears for five seconds + // when creating a new session with -RR. + // -c is the flag for the config file. + "-xRRqc", b.configFile, + b.command.Path, + // pty.Cmd duplicates Path as the first argument so remove it. + }, b.command.Args[1:]...)...) + cmd.Env = append(b.command.Env, "TERM=xterm-256color") + cmd.Dir = b.command.Dir + ptty, process, err := pty.Start(cmd, pty.WithPTYOption( + pty.WithSSHRequest(ssh.Pty{ + Window: ssh.Window{ + // Make sure to spawn at the right size because if we resize afterward it + // leaves confusing padding (screen will resize such that the screen + // contents are aligned to the bottom). + Height: int(height), + Width: int(width), + }, + }), + )) + if err != nil { + b.metrics.WithLabelValues("screen_spawn").Add(1) + return nil, err + } + + cleanup := func() { + pttyErr := ptty.Close() + if pttyErr != nil { + logger.Debug(ctx, "closed ptty with error", slog.Error(pttyErr)) + } + procErr := process.Kill() + if procErr != nil { + logger.Debug(ctx, "killed process with error", slog.Error(procErr)) + } + connErr := conn.Close() + if connErr != nil { + logger.Debug(ctx, "closed connection with error", slog.Error(connErr)) + } + } + + // Pipe the process's output to the connection. + // We do not need to separately monitor for the process exiting. When it + // exits, our ptty.OutputReader() will return EOF after reading all process + // output. + go func() { + defer cleanup() + buffer := make([]byte, 1024) + for { + read, err := ptty.OutputReader().Read(buffer) + if err != nil { + // When the PTY is closed, this is triggered. + // Error is typically a benign EOF, so only log for debugging. + if errors.Is(err, io.EOF) { + logger.Debug(ctx, "unable to read pty output; screen might have exited", slog.Error(err)) + } else { + logger.Warn(ctx, "unable to read pty output; screen might have exited", slog.Error(err)) + b.metrics.WithLabelValues("screen_output_reader").Add(1) + } + // The process might have died because the session itself died or it + // might have been separately killed and the session is still up ( + // example `exit` or we killed it when the connection closed). + break + } + part := buffer[:read] + _, err = conn.Write(part) + if err != nil { + // Connection might have been closed. + if errors.Unwrap(err).Error() != "endpoint is closed for send" { + logger.Warn(ctx, "error writing to active conn", slog.Error(err)) + b.metrics.WithLabelValues("screen_write").Add(1) + } + break + } + } + }() + + // Clean up the process when the connection or reconnecting pty closes. + go func() { + defer cleanup() + <-ctx.Done() + }() + + // Version seems to be the only command without a side effect (other than + // making the version pop up briefly) so use it to wait for the session to + // come up. 
If we do not wait we could end up spawning multiple sessions with + // the same name. + err = b.sendCommand(ctx, "version", nil) + if err != nil { + cleanup() + b.metrics.WithLabelValues("screen_wait").Add(1) + return nil, err + } + + return ptty, nil +} + +// close asks screen to kill the session by its ID. +func (b *screenBackend) close(_ context.Context, _ slog.Logger) error { + // If the command errors that the session is already gone that is fine. + return b.sendCommand(context.Background(), "quit", []string{"No screen session found"}) +} + +// sendCommand runs a screen command against a running screen session. If the +// command fails with an error matching anything in successErrors it will be +// considered a success state (for example "no session" when quitting and the +// session is already dead). The command will be retried until successful, the +// timeout is reached, or the context ends in which case the context error is +// returned together with the last error from the command. +func (b *screenBackend) sendCommand(ctx context.Context, command string, successErrors []string) error { + ctx, cancel := context.WithTimeout(ctx, attachTimeout) + defer cancel() + + var lastErr error + run := func() bool { + var stdout bytes.Buffer + //nolint:gosec + cmd := exec.CommandContext(ctx, "screen", + // -x targets an attached session. + "-x", b.id, + // -c is the flag for the config file. + "-c", b.configFile, + // -X runs a command in the matching session. + "-X", command, + ) + cmd.Env = append(b.command.Env, "TERM=xterm-256color") + cmd.Dir = b.command.Dir + cmd.Stdout = &stdout + err := cmd.Run() + if err == nil { + return true + } + + stdoutStr := stdout.String() + for _, se := range successErrors { + if strings.Contains(stdoutStr, se) { + return true + } + } + + // Things like "exit status 1" are imprecise so include stdout as it may + // contain more information ("no screen session found" for example). + if !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) { + lastErr = xerrors.Errorf("`screen -x %s -X %s`: %w: %s", b.id, command, err, stdoutStr) + } + + return false + } + + // Run immediately. + if done := run(); done { + return nil + } + + // Then run on an interval. + ticker := time.NewTicker(250 * time.Millisecond) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return errors.Join(ctx.Err(), lastErr) + case <-ticker.C: + if done := run(); done { + return nil + } + } + } +} diff --git a/cli/exp_scaletest.go b/cli/exp_scaletest.go index f74cc49eb693f..49c8d6ec74dab 100644 --- a/cli/exp_scaletest.go +++ b/cli/exp_scaletest.go @@ -650,10 +650,11 @@ func (r *RootCmd) scaletestCreateWorkspaces() *clibase.Cmd { config.ReconnectingPTY = &reconnectingpty.Config{ // AgentID is set by the test automatically. Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - ID: uuid.Nil, - Height: 24, - Width: 80, - Command: runCommand, + ID: uuid.Nil, + Height: 24, + Width: 80, + Command: runCommand, + BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, }, Timeout: httpapi.Duration(runTimeout), ExpectTimeout: runExpectTimeout, diff --git a/coderd/httpapi/queryparams.go b/coderd/httpapi/queryparams.go index 3f16565e1dd20..1ca9824a0d998 100644 --- a/coderd/httpapi/queryparams.go +++ b/coderd/httpapi/queryparams.go @@ -161,7 +161,7 @@ func (p *QueryParamParser) Strings(vals url.Values, def []string, queryParam str // ValidEnum parses enum query params. Add more to the list as needed. 
type ValidEnum interface { database.ResourceType | database.AuditAction | database.BuildReason | database.UserStatus | - database.WorkspaceStatus + database.WorkspaceStatus | codersdk.ReconnectingPTYBackendType // Valid is required on the enum type to be used with ParseEnum. Valid() bool diff --git a/coderd/insights_test.go b/coderd/insights_test.go index f108882299b9a..0ecb902fd1110 100644 --- a/coderd/insights_test.go +++ b/coderd/insights_test.go @@ -294,10 +294,11 @@ func TestTemplateInsights(t *testing.T) { // Start an rpty session to generate rpty usage stats. rpty, err := client.WorkspaceAgentReconnectingPTY(ctx, codersdk.WorkspaceAgentReconnectingPTYOpts{ - AgentID: resources[0].Agents[0].ID, - Reconnect: uuid.New(), - Width: 80, - Height: 24, + AgentID: resources[0].Agents[0].ID, + Reconnect: uuid.New(), + Width: 80, + Height: 24, + BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, }) require.NoError(t, err) defer rpty.Close() diff --git a/coderd/workspaceapps/apptest/apptest.go b/coderd/workspaceapps/apptest/apptest.go index f64ba7c30bf31..512e91eeb70a4 100644 --- a/coderd/workspaceapps/apptest/apptest.go +++ b/coderd/workspaceapps/apptest/apptest.go @@ -11,7 +11,9 @@ import ( "net/http/cookiejar" "net/http/httputil" "net/url" + "os/exec" "path" + "regexp" "runtime" "strconv" "strings" @@ -30,6 +32,10 @@ import ( "github.com/coder/coder/testutil" ) +const ansi = "[\u001B\u009B][[\\]()#;?]*(?:(?:(?:[a-zA-Z\\d]*(?:;[a-zA-Z\\d]*)*)?\u0007)|(?:(?:\\d{1,4}(?:;\\d{0,4})*)?[\\dA-PRZcf-ntqry=><~]))" + +var re = regexp.MustCompile(ansi) + // Run runs the entire workspace app test suite against deployments minted // by the provided factory. // @@ -51,128 +57,81 @@ func Run(t *testing.T, appHostIsPrimary bool, factory DeploymentFactory) { t.Skip("ConPTY appears to be inconsistent on Windows.") } - expectLine := func(t *testing.T, r *bufio.Reader, matcher func(string) bool) { - for { - line, err := r.ReadString('\n') - require.NoError(t, err) - if matcher(line) { - break - } - } - } - matchEchoCommand := func(line string) bool { - return strings.Contains(line, "echo test") + backends := []codersdk.ReconnectingPTYBackendType{ + codersdk.ReconnectingPTYBackendTypeBuffered, + codersdk.ReconnectingPTYBackendTypeScreen, } - matchEchoOutput := func(line string) bool { - return strings.Contains(line, "test") && !strings.Contains(line, "echo") - } - - t.Run("OK", func(t *testing.T) { - appDetails := setupProxyTest(t, nil) - - ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) - defer cancel() - - // Run the test against the path app hostname since that's where the - // reconnecting-pty proxy server we want to test is mounted. - client := appDetails.AppClient(t) - conn, err := client.WorkspaceAgentReconnectingPTY(ctx, codersdk.WorkspaceAgentReconnectingPTYOpts{ - AgentID: appDetails.Agent.ID, - Reconnect: uuid.New(), - Height: 80, - Width: 80, - Command: "/bin/bash", - }) - require.NoError(t, err) - defer conn.Close() - - // First attempt to resize the TTY. - // The websocket will close if it fails! - data, err := json.Marshal(codersdk.ReconnectingPTYRequest{ - Height: 250, - Width: 250, - }) - require.NoError(t, err) - _, err = conn.Write(data) - require.NoError(t, err) - bufRead := bufio.NewReader(conn) - - // Brief pause to reduce the likelihood that we send keystrokes while - // the shell is simultaneously sending a prompt. 
- time.Sleep(100 * time.Millisecond) - - data, err = json.Marshal(codersdk.ReconnectingPTYRequest{ - Data: "echo test\r\n", - }) - require.NoError(t, err) - _, err = conn.Write(data) - require.NoError(t, err) - expectLine(t, bufRead, matchEchoCommand) - expectLine(t, bufRead, matchEchoOutput) - }) + for _, backendType := range backends { + backendType := backendType + t.Run(string(backendType), func(t *testing.T) { + t.Parallel() + if runtime.GOOS == "darwin" { + t.Skip("`screen` is flaky on darwin") + } else if backendType == codersdk.ReconnectingPTYBackendTypeScreen { + _, err := exec.LookPath("screen") + if err != nil { + t.Skip("`screen` not found") + } + } - t.Run("SignedTokenQueryParameter", func(t *testing.T) { - t.Parallel() - if appHostIsPrimary { - t.Skip("Tickets are not used for terminal requests on the primary.") - } + t.Run("OK", func(t *testing.T) { + t.Parallel() + appDetails := setupProxyTest(t, nil) - appDetails := setupProxyTest(t, nil) + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) + defer cancel() - u := *appDetails.PathAppBaseURL - if u.Scheme == "http" { - u.Scheme = "ws" - } else { - u.Scheme = "wss" - } - u.Path = fmt.Sprintf("/api/v2/workspaceagents/%s/pty", appDetails.Agent.ID.String()) + // Run the test against the path app hostname since that's where the + // reconnecting-pty proxy server we want to test is mounted. + client := appDetails.AppClient(t) + testReconnectingPTY(ctx, t, client, codersdk.WorkspaceAgentReconnectingPTYOpts{ + AgentID: appDetails.Agent.ID, + Reconnect: uuid.New(), + Height: 80, + Width: 80, + Command: "bash", + BackendType: backendType, + }) + }) - ctx := testutil.Context(t, testutil.WaitLong) - issueRes, err := appDetails.SDKClient.IssueReconnectingPTYSignedToken(ctx, codersdk.IssueReconnectingPTYSignedTokenRequest{ - URL: u.String(), - AgentID: appDetails.Agent.ID, - }) - require.NoError(t, err) + t.Run("SignedTokenQueryParameter", func(t *testing.T) { + t.Parallel() + if appHostIsPrimary { + t.Skip("Tickets are not used for terminal requests on the primary.") + } - // Make an unauthenticated client. - unauthedAppClient := codersdk.New(appDetails.AppClient(t).URL) - conn, err := unauthedAppClient.WorkspaceAgentReconnectingPTY(ctx, codersdk.WorkspaceAgentReconnectingPTYOpts{ - AgentID: appDetails.Agent.ID, - Reconnect: uuid.New(), - Height: 80, - Width: 80, - Command: "/bin/bash", - SignedToken: issueRes.SignedToken, - }) - require.NoError(t, err) - defer conn.Close() + appDetails := setupProxyTest(t, nil) - // First attempt to resize the TTY. - // The websocket will close if it fails! - data, err := json.Marshal(codersdk.ReconnectingPTYRequest{ - Height: 250, - Width: 250, - }) - require.NoError(t, err) - _, err = conn.Write(data) - require.NoError(t, err) - bufRead := bufio.NewReader(conn) + u := *appDetails.PathAppBaseURL + if u.Scheme == "http" { + u.Scheme = "ws" + } else { + u.Scheme = "wss" + } + u.Path = fmt.Sprintf("/api/v2/workspaceagents/%s/pty", appDetails.Agent.ID.String()) - // Brief pause to reduce the likelihood that we send keystrokes while - // the shell is simultaneously sending a prompt. 
- time.Sleep(100 * time.Millisecond) + ctx := testutil.Context(t, testutil.WaitLong) + issueRes, err := appDetails.SDKClient.IssueReconnectingPTYSignedToken(ctx, codersdk.IssueReconnectingPTYSignedTokenRequest{ + URL: u.String(), + AgentID: appDetails.Agent.ID, + }) + require.NoError(t, err) - data, err = json.Marshal(codersdk.ReconnectingPTYRequest{ - Data: "echo test\r\n", + // Make an unauthenticated client. + unauthedAppClient := codersdk.New(appDetails.AppClient(t).URL) + testReconnectingPTY(ctx, t, unauthedAppClient, codersdk.WorkspaceAgentReconnectingPTYOpts{ + AgentID: appDetails.Agent.ID, + Reconnect: uuid.New(), + Height: 80, + Width: 80, + Command: "bash", + SignedToken: issueRes.SignedToken, + BackendType: backendType, + }) + }) }) - require.NoError(t, err) - _, err = conn.Write(data) - require.NoError(t, err) - - expectLine(t, bufRead, matchEchoCommand) - expectLine(t, bufRead, matchEchoOutput) - }) + } }) t.Run("WorkspaceAppsProxyPath", func(t *testing.T) { @@ -1407,3 +1366,75 @@ func Run(t *testing.T, appHostIsPrimary bool, factory DeploymentFactory) { require.Equal(t, []string{"baz"}, resp.Header.Values("X-Foobar")) }) } + +func testReconnectingPTY(ctx context.Context, t *testing.T, client *codersdk.Client, opts codersdk.WorkspaceAgentReconnectingPTYOpts) { + hasLine := func(scanner *bufio.Scanner, matcher func(string) bool) bool { + for scanner.Scan() { + line := scanner.Text() + t.Logf("bash tty stdout = %s", re.ReplaceAllString(line, "")) + if matcher(line) { + return true + } + } + return false + } + matchEchoCommand := func(line string) bool { + return strings.Contains(line, "echo test") + } + matchEchoOutput := func(line string) bool { + return strings.Contains(line, "test") && !strings.Contains(line, "echo") + } + matchExitCommand := func(line string) bool { + return strings.Contains(line, "exit") + } + matchExitOutput := func(line string) bool { + return strings.Contains(line, "exit") || strings.Contains(line, "logout") + } + + conn, err := client.WorkspaceAgentReconnectingPTY(ctx, opts) + require.NoError(t, err) + defer conn.Close() + + // First attempt to resize the TTY. + // The websocket will close if it fails! + data, err := json.Marshal(codersdk.ReconnectingPTYRequest{ + Height: 250, + Width: 250, + }) + require.NoError(t, err) + _, err = conn.Write(data) + require.NoError(t, err) + scanner := bufio.NewScanner(conn) + + // Brief pause to reduce the likelihood that we send keystrokes while + // the shell is simultaneously sending a prompt. + time.Sleep(100 * time.Millisecond) + + data, err = json.Marshal(codersdk.ReconnectingPTYRequest{ + Data: "echo test\r\n", + }) + require.NoError(t, err) + _, err = conn.Write(data) + require.NoError(t, err) + + require.True(t, hasLine(scanner, matchEchoCommand), "find echo command") + require.True(t, hasLine(scanner, matchEchoOutput), "find echo output") + + // Exit should cause the connection to close. + data, err = json.Marshal(codersdk.ReconnectingPTYRequest{ + Data: "exit\r\n", + }) + require.NoError(t, err) + _, err = conn.Write(data) + require.NoError(t, err) + + // Once for the input and again for the output. + require.True(t, hasLine(scanner, matchExitCommand), "find exit command") + require.True(t, hasLine(scanner, matchExitOutput), "find exit output") + + // Ensure the connection closes. 
+ for scanner.Scan() { + line := scanner.Text() + t.Logf("bash tty stdout = %s", re.ReplaceAllString(line, "")) + } +} diff --git a/coderd/workspaceapps/proxy.go b/coderd/workspaceapps/proxy.go index 9b2d9c4bfa297..19e670aa47d9b 100644 --- a/coderd/workspaceapps/proxy.go +++ b/coderd/workspaceapps/proxy.go @@ -632,6 +632,7 @@ func (s *Server) workspaceAgentPTY(rw http.ResponseWriter, r *http.Request) { reconnect := parser.Required("reconnect").UUID(values, uuid.New(), "reconnect") height := parser.UInt(values, 80, "height") width := parser.UInt(values, 80, "width") + backendType := httpapi.ParseCustom(parser, values, codersdk.ReconnectingPTYBackendTypeAuto, "backend", httpapi.ParseEnum[codersdk.ReconnectingPTYBackendType]) if len(parser.Errors) > 0 { httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{ Message: "Invalid query parameters.", @@ -671,7 +672,7 @@ func (s *Server) workspaceAgentPTY(rw http.ResponseWriter, r *http.Request) { defer release() defer agentConn.Close() log.Debug(ctx, "dialed workspace agent") - ptNetConn, err := agentConn.ReconnectingPTY(ctx, reconnect, uint16(height), uint16(width), r.URL.Query().Get("command")) + ptNetConn, err := agentConn.ReconnectingPTY(ctx, reconnect, uint16(height), uint16(width), r.URL.Query().Get("command"), backendType) if err != nil { log.Debug(ctx, "dial reconnecting pty server in workspace agent", slog.Error(err)) _ = conn.Close(websocket.StatusInternalError, httpapi.WebsocketCloseSprintf("dial: %s", err)) diff --git a/codersdk/workspaceagentconn.go b/codersdk/workspaceagentconn.go index 6b9b6f0d33f44..259fe2713375c 100644 --- a/codersdk/workspaceagentconn.go +++ b/codersdk/workspaceagentconn.go @@ -193,13 +193,32 @@ func (c *WorkspaceAgentConn) Close() error { return c.Conn.Close() } +type ReconnectingPTYBackendType string + +const ( + ReconnectingPTYBackendTypeAuto ReconnectingPTYBackendType = "auto" + ReconnectingPTYBackendTypeScreen ReconnectingPTYBackendType = "screen" + ReconnectingPTYBackendTypeBuffered ReconnectingPTYBackendType = "buffered" +) + +func (s ReconnectingPTYBackendType) Valid() bool { + switch s { + case ReconnectingPTYBackendTypeAuto, ReconnectingPTYBackendTypeBuffered, + ReconnectingPTYBackendTypeScreen: + return true + default: + return false + } +} + // WorkspaceAgentReconnectingPTYInit initializes a new reconnecting PTY session. // @typescript-ignore WorkspaceAgentReconnectingPTYInit type WorkspaceAgentReconnectingPTYInit struct { - ID uuid.UUID - Height uint16 - Width uint16 - Command string + ID uuid.UUID + Height uint16 + Width uint16 + Command string + BackendType ReconnectingPTYBackendType } // ReconnectingPTYRequest is sent from the client to the server @@ -214,7 +233,7 @@ type ReconnectingPTYRequest struct { // ReconnectingPTY spawns a new reconnecting terminal session. // `ReconnectingPTYRequest` should be JSON marshaled and written to the returned net.Conn. // Raw terminal output will be read from the returned net.Conn. 
-func (c *WorkspaceAgentConn) ReconnectingPTY(ctx context.Context, id uuid.UUID, height, width uint16, command string) (net.Conn, error) { +func (c *WorkspaceAgentConn) ReconnectingPTY(ctx context.Context, id uuid.UUID, height, width uint16, command string, backendType ReconnectingPTYBackendType) (net.Conn, error) { ctx, span := tracing.StartSpan(ctx) defer span.End() @@ -227,10 +246,11 @@ func (c *WorkspaceAgentConn) ReconnectingPTY(ctx context.Context, id uuid.UUID, return nil, err } data, err := json.Marshal(WorkspaceAgentReconnectingPTYInit{ - ID: id, - Height: height, - Width: width, - Command: command, + ID: id, + Height: height, + Width: width, + Command: command, + BackendType: backendType, }) if err != nil { _ = conn.Close() diff --git a/codersdk/workspaceagents.go b/codersdk/workspaceagents.go index 994f32026b61a..d60b8d7910df0 100644 --- a/codersdk/workspaceagents.go +++ b/codersdk/workspaceagents.go @@ -547,11 +547,12 @@ func (c *Client) IssueReconnectingPTYSignedToken(ctx context.Context, req IssueR // @typescript-ignore:WorkspaceAgentReconnectingPTYOpts type WorkspaceAgentReconnectingPTYOpts struct { - AgentID uuid.UUID - Reconnect uuid.UUID - Width uint16 - Height uint16 - Command string + AgentID uuid.UUID + Reconnect uuid.UUID + Width uint16 + Height uint16 + Command string + BackendType ReconnectingPTYBackendType // SignedToken is an optional signed token from the // issue-reconnecting-pty-signed-token endpoint. If set, the session token @@ -572,6 +573,7 @@ func (c *Client) WorkspaceAgentReconnectingPTY(ctx context.Context, opts Workspa q.Set("width", strconv.Itoa(int(opts.Width))) q.Set("height", strconv.Itoa(int(opts.Height))) q.Set("command", opts.Command) + q.Set("backend", string(opts.BackendType)) // If we're using a signed token, set the query parameter. 
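As an aside, a minimal sketch of how a caller might exercise the new option through the SDK; the agent ID, the command, and the choice of the screen backend are illustrative placeholders, and a real caller would pass an already authenticated codersdk.Client.

package example

import (
	"context"
	"encoding/json"
	"io"
	"os"

	"github.com/google/uuid"

	"github.com/coder/coder/codersdk"
)

// attachReconnectingPTY dials a reconnecting PTY on the agent, asks for the
// screen backend, sends one line of input, and streams raw output to stdout.
func attachReconnectingPTY(ctx context.Context, client *codersdk.Client, agentID uuid.UUID) error {
	conn, err := client.WorkspaceAgentReconnectingPTY(ctx, codersdk.WorkspaceAgentReconnectingPTYOpts{
		AgentID:     agentID,
		Reconnect:   uuid.New(), // reuse this ID later to reattach to the same session
		Width:       80,
		Height:      24,
		Command:     "bash",
		BackendType: codersdk.ReconnectingPTYBackendTypeScreen,
	})
	if err != nil {
		return err
	}
	defer conn.Close()

	// Input goes over the wire as JSON-encoded requests; output comes back raw.
	data, err := json.Marshal(codersdk.ReconnectingPTYRequest{Data: "echo hello\r\n"})
	if err != nil {
		return err
	}
	if _, err := conn.Write(data); err != nil {
		return err
	}
	_, err = io.Copy(os.Stdout, conn)
	return err
}
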
if opts.SignedToken != "" { q.Set(SignedAppTokenQueryParameter, opts.SignedToken) diff --git a/dogfood/Dockerfile b/dogfood/Dockerfile index 95075c63f311e..fe80795f750c7 100644 --- a/dogfood/Dockerfile +++ b/dogfood/Dockerfile @@ -162,6 +162,7 @@ RUN apt-get update --quiet && apt-get install --yes \ fish \ unzip \ zstd \ + screen \ gettext-base && \ # Delete package cache to avoid consuming space in layer apt-get clean && \ diff --git a/flake.nix b/flake.nix index e720f28df7004..347468f387da0 100644 --- a/flake.nix +++ b/flake.nix @@ -43,6 +43,7 @@ pkg-config protoc-gen-go ripgrep + screen shellcheck shfmt sqlc diff --git a/scaletest/createworkspaces/run_test.go b/scaletest/createworkspaces/run_test.go index ac9812dccf3ce..039cdbbb8708e 100644 --- a/scaletest/createworkspaces/run_test.go +++ b/scaletest/createworkspaces/run_test.go @@ -106,9 +106,10 @@ func Test_Runner(t *testing.T) { }, ReconnectingPTY: &reconnectingpty.Config{ Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Height: 24, - Width: 80, - Command: "echo hello", + Height: 24, + Width: 80, + Command: "echo hello", + BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, }, Timeout: httpapi.Duration(testutil.WaitLong), }, @@ -230,9 +231,10 @@ func Test_Runner(t *testing.T) { }, ReconnectingPTY: &reconnectingpty.Config{ Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Height: 24, - Width: 80, - Command: "echo hello", + Height: 24, + Width: 80, + Command: "echo hello", + BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, }, Timeout: httpapi.Duration(testutil.WaitLong), }, diff --git a/scaletest/reconnectingpty/config_test.go b/scaletest/reconnectingpty/config_test.go index 0c5200bfd7fe6..5b7e3feef6501 100644 --- a/scaletest/reconnectingpty/config_test.go +++ b/scaletest/reconnectingpty/config_test.go @@ -32,10 +32,11 @@ func Test_Config(t *testing.T) { config: reconnectingpty.Config{ AgentID: id, Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - ID: id, - Width: 80, - Height: 24, - Command: "echo 'hello world'", + ID: id, + Width: 80, + Height: 24, + Command: "echo 'hello world'", + BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, }, Timeout: httpapi.Duration(time.Minute), ExpectTimeout: false, diff --git a/scaletest/reconnectingpty/run.go b/scaletest/reconnectingpty/run.go index afded88eb0412..929e8bf1c612c 100644 --- a/scaletest/reconnectingpty/run.go +++ b/scaletest/reconnectingpty/run.go @@ -62,14 +62,16 @@ func (r *Runner) Run(ctx context.Context, _ string, logs io.Writer) error { _, _ = fmt.Fprintf(logs, "\tID: %s\n", id.String()) _, _ = fmt.Fprintf(logs, "\tWidth: %d\n", width) _, _ = fmt.Fprintf(logs, "\tHeight: %d\n", height) + _, _ = fmt.Fprintf(logs, "\tBackend: %s\n", r.cfg.Init.BackendType) _, _ = fmt.Fprintf(logs, "\tCommand: %q\n\n", r.cfg.Init.Command) conn, err := r.client.WorkspaceAgentReconnectingPTY(ctx, codersdk.WorkspaceAgentReconnectingPTYOpts{ - AgentID: r.cfg.AgentID, - Reconnect: id, - Width: width, - Height: height, - Command: r.cfg.Init.Command, + AgentID: r.cfg.AgentID, + Reconnect: id, + Width: width, + Height: height, + Command: r.cfg.Init.Command, + BackendType: r.cfg.Init.BackendType, }) if err != nil { return xerrors.Errorf("open reconnecting PTY: %w", err) diff --git a/scaletest/reconnectingpty/run_test.go b/scaletest/reconnectingpty/run_test.go index 382a3718436f9..be92313eda5c4 100644 --- a/scaletest/reconnectingpty/run_test.go +++ b/scaletest/reconnectingpty/run_test.go @@ -34,7 +34,8 @@ func Test_Runner(t *testing.T) { AgentID: agentID, Init: 
codersdk.WorkspaceAgentReconnectingPTYInit{ // Use ; here because it's powershell compatible (vs &&). - Command: "echo 'hello world'; sleep 1", + Command: "echo 'hello world'; sleep 1", + BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, }, LogOutput: true, }) @@ -62,7 +63,8 @@ func Test_Runner(t *testing.T) { runner := reconnectingpty.NewRunner(client, reconnectingpty.Config{ AgentID: agentID, Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Command: "echo 'hello world'", + Command: "echo 'hello world'", + BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, }, LogOutput: false, }) @@ -90,7 +92,8 @@ func Test_Runner(t *testing.T) { runner := reconnectingpty.NewRunner(client, reconnectingpty.Config{ AgentID: agentID, Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Command: "echo 'hello world'", + Command: "echo 'hello world'", + BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, }, Timeout: httpapi.Duration(2 * testutil.WaitSuperLong), LogOutput: true, @@ -114,7 +117,8 @@ func Test_Runner(t *testing.T) { runner := reconnectingpty.NewRunner(client, reconnectingpty.Config{ AgentID: agentID, Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Command: "sleep 120", + Command: "sleep 120", + BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, }, Timeout: httpapi.Duration(2 * time.Second), LogOutput: true, @@ -143,7 +147,8 @@ func Test_Runner(t *testing.T) { runner := reconnectingpty.NewRunner(client, reconnectingpty.Config{ AgentID: agentID, Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Command: "sleep 120", + Command: "sleep 120", + BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, }, Timeout: httpapi.Duration(2 * time.Second), ExpectTimeout: true, @@ -168,7 +173,8 @@ func Test_Runner(t *testing.T) { runner := reconnectingpty.NewRunner(client, reconnectingpty.Config{ AgentID: agentID, Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Command: "echo 'hello world'", + Command: "echo 'hello world'", + BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, }, Timeout: httpapi.Duration(2 * testutil.WaitSuperLong), ExpectTimeout: true, @@ -198,7 +204,8 @@ func Test_Runner(t *testing.T) { runner := reconnectingpty.NewRunner(client, reconnectingpty.Config{ AgentID: agentID, Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Command: "echo 'hello world'; sleep 1", + Command: "echo 'hello world'; sleep 1", + BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, }, ExpectOutput: "hello world", LogOutput: false, @@ -222,7 +229,8 @@ func Test_Runner(t *testing.T) { runner := reconnectingpty.NewRunner(client, reconnectingpty.Config{ AgentID: agentID, Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Command: "echo 'hello world'; sleep 1", + Command: "echo 'hello world'; sleep 1", + BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, }, ExpectOutput: "bello borld", LogOutput: false, diff --git a/scaletest/workspacetraffic/conn.go b/scaletest/workspacetraffic/conn.go index 167164c5ef33f..aced56e6e0576 100644 --- a/scaletest/workspacetraffic/conn.go +++ b/scaletest/workspacetraffic/conn.go @@ -15,11 +15,12 @@ import ( func connectPTY(ctx context.Context, client *codersdk.Client, agentID, reconnect uuid.UUID) (*countReadWriteCloser, error) { conn, err := client.WorkspaceAgentReconnectingPTY(ctx, codersdk.WorkspaceAgentReconnectingPTYOpts{ - AgentID: agentID, - Reconnect: reconnect, - Height: 25, - Width: 80, - Command: "/bin/sh", + AgentID: agentID, + Reconnect: reconnect, + Height: 25, + Width: 80, + Command: "sh", + BackendType: 
codersdk.ReconnectingPTYBackendTypeBuffered, }) if err != nil { return nil, xerrors.Errorf("connect pty: %w", err) diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index fcb71675e11e6..09551eaa00f25 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -1739,6 +1739,14 @@ export const RBACResources: RBACResource[] = [ "workspace_proxy", ] +// From codersdk/workspaceagentconn.go +export type ReconnectingPTYBackendType = "auto" | "buffered" | "screen" +export const ReconnectingPTYBackendTypes: ReconnectingPTYBackendType[] = [ + "auto", + "buffered", + "screen", +] + // From codersdk/audit.go export type ResourceType = | "api_key" From 9cb043c7d077ac80271e2f3051760c258057a081 Mon Sep 17 00:00:00 2001 From: Asher Date: Tue, 8 Aug 2023 09:41:31 -0800 Subject: [PATCH 02/28] Fix leaking goroutine in wait --- agent/reconnectingpty/reconnectingpty.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/agent/reconnectingpty/reconnectingpty.go b/agent/reconnectingpty/reconnectingpty.go index 2e6f4a4b73a5e..97b2f8c896653 100644 --- a/agent/reconnectingpty/reconnectingpty.go +++ b/agent/reconnectingpty/reconnectingpty.go @@ -281,11 +281,17 @@ func (rpty *ReconnectingPTY) waitForState(state State) (State, error) { // waitForStateOrContext blocks until the state or a greater one is reached or // the provided context ends. func (rpty *ReconnectingPTY) waitForStateOrContext(ctx context.Context, state State) (State, error) { + nevermind := make(chan struct{}) + defer close(nevermind) go func() { - // Wake up when the context ends. - defer rpty.cond.Broadcast() - <-ctx.Done() + select { + case <-ctx.Done(): + // Wake up when the context ends. + rpty.cond.Broadcast() + case <-nevermind: + } }() + rpty.cond.L.Lock() defer rpty.cond.L.Unlock() for ctx.Err() == nil && state > rpty.state { From 19633c99f3a9d2939fe70357f52de795954d0c2b Mon Sep 17 00:00:00 2001 From: Asher Date: Tue, 8 Aug 2023 10:07:18 -0800 Subject: [PATCH 03/28] Remove connection_id from reconnecting PTY This serves multiple connections so it makes no sense to scope it to a single connection. Also lets us use "connection_id" when logging write errors instead of "other_conn_id". --- agent/agent.go | 2 +- agent/reconnectingpty/buffered.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 208dbb2b1d70f..0f33c38337579 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1120,7 +1120,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m BackendType: msg.BackendType, Timeout: a.reconnectingPTYTimeout, Metrics: a.metrics.reconnectingPTYErrors, - }, logger) + }, logger.With(slog.F("message_id", msg.ID))) if err = a.trackConnGoroutine(func() { rpty.Wait() diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index 4b3d0b1d70339..528850e2f7bb8 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -95,7 +95,7 @@ func (b *bufferedBackend) start(ctx context.Context, logger slog.Logger) error { if err != nil { logger.Warn(ctx, "error writing to active connection", - slog.F("other_conn_id", cid), + slog.F("connection_id", cid), slog.Error(err), ) b.metrics.WithLabelValues("write").Add(1) From 8f4956f76b071aa66633405179ff84eb23871b3e Mon Sep 17 00:00:00 2001 From: Asher Date: Tue, 8 Aug 2023 10:25:15 -0800 Subject: [PATCH 04/28] Remove error from close return Some errors are returned and some are logged at the debug level. 
The reason for the debug logging is that the process might already be closed so in that case I did not want to return an error. This will be fixed in a future commit but since this error is only logged by the caller anyway, go ahead and just log it in the backend to simplify things. --- agent/reconnectingpty/buffered.go | 9 +++++---- agent/reconnectingpty/reconnectingpty.go | 10 +++------- agent/reconnectingpty/screen.go | 7 +++++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index 528850e2f7bb8..f3241b3c798ed 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -147,13 +147,15 @@ func (b *bufferedBackend) attach(ctx context.Context, connID string, conn net.Co // close closes all connections to the reconnecting PTY, clears the circular // buffer, and kills the process. -func (b *bufferedBackend) close(ctx context.Context, logger slog.Logger) error { - var err error +func (b *bufferedBackend) close(ctx context.Context, logger slog.Logger) { b.mutex.Lock() defer b.mutex.Unlock() b.circularBuffer.Reset() for _, conn := range b.activeConns { - err = errors.Join(err, conn.Close()) + connErr := conn.Close() + if connErr != nil { + logger.Error(ctx, "closed conn with error", slog.Error(connErr)) + } } pttyErr := b.ptty.Close() if pttyErr != nil { @@ -163,5 +165,4 @@ func (b *bufferedBackend) close(ctx context.Context, logger slog.Logger) error { if procErr != nil { logger.Debug(ctx, "killed process with error", slog.Error(procErr)) } - return err } diff --git a/agent/reconnectingpty/reconnectingpty.go b/agent/reconnectingpty/reconnectingpty.go index 97b2f8c896653..59c98f67a09d1 100644 --- a/agent/reconnectingpty/reconnectingpty.go +++ b/agent/reconnectingpty/reconnectingpty.go @@ -53,7 +53,7 @@ const ( type backend interface { start(ctx context.Context, logger slog.Logger) error attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) (pty.PTYCmd, error) - close(ctx context.Context, logger slog.Logger) error + close(ctx context.Context, logger slog.Logger) } // ReconnectingPTY is a pty that can be reconnected within a timeout and to @@ -222,12 +222,8 @@ func (rpty *ReconnectingPTY) lifecycle(ctx context.Context, logger slog.Logger) rpty.setState(StateClosing, xerrors.Errorf("reconnecting pty closing: %w", reasonErr)) } rpty.timer.Stop() - closeErr := rpty.backend.close(ctx, logger) - if closeErr != nil { - logger.Error(ctx, "closed reconnecting pty", slog.Error(closeErr)) - } else { - logger.Debug(ctx, "closed reconnecting pty") - } + rpty.backend.close(ctx, logger) + logger.Debug(ctx, "closed reconnecting pty") rpty.setState(StateDone, xerrors.Errorf("reconnecting pty closed: %w", reasonErr)) } diff --git a/agent/reconnectingpty/screen.go b/agent/reconnectingpty/screen.go index 21b630ef4e3b2..c4fb8e634718e 100644 --- a/agent/reconnectingpty/screen.go +++ b/agent/reconnectingpty/screen.go @@ -210,9 +210,12 @@ func (b *screenBackend) attach(ctx context.Context, _ string, conn net.Conn, hei } // close asks screen to kill the session by its ID. -func (b *screenBackend) close(_ context.Context, _ slog.Logger) error { +func (b *screenBackend) close(ctx context.Context, logger slog.Logger) { // If the command errors that the session is already gone that is fine. 
- return b.sendCommand(context.Background(), "quit", []string{"No screen session found"}) + err := b.sendCommand(context.Background(), "quit", []string{"No screen session found"}) + if err != nil { + logger.Error(ctx, "close screen session", slog.Error(err)) + } } // sendCommand runs a screen command against a running screen session. If the From ee887b9f68a8192102498bce721f0c3fcbe89541 Mon Sep 17 00:00:00 2001 From: Asher Date: Tue, 8 Aug 2023 12:33:17 -0800 Subject: [PATCH 05/28] Refactor reconnecting PTY backends Instead of a shared layer, directly implement both and share code where possible with helper functions. This makes cleanup easier with the screen implementation too. --- agent/agent.go | 6 +- agent/reconnectingpty/buffered.go | 209 ++++++++++----- agent/reconnectingpty/reconnectingpty.go | 311 +++++++++-------------- agent/reconnectingpty/screen.go | 261 ++++++++++++------- 4 files changed, 432 insertions(+), 355 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 0f33c38337579..a2ef9ef9cd2a1 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1082,14 +1082,14 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m logger.Debug(ctx, "reconnecting pty connection closed") }() - var rpty *reconnectingpty.ReconnectingPTY - sendConnected := make(chan *reconnectingpty.ReconnectingPTY, 1) + var rpty reconnectingpty.ReconnectingPTY + sendConnected := make(chan reconnectingpty.ReconnectingPTY, 1) // On store, reserve this ID to prevent multiple concurrent new connections. waitReady, ok := a.reconnectingPTYs.LoadOrStore(msg.ID, sendConnected) if ok { close(sendConnected) // Unused. logger.Debug(ctx, "connecting to existing reconnecting pty") - c, ok := waitReady.(chan *reconnectingpty.ReconnectingPTY) + c, ok := waitReady.(chan reconnectingpty.ReconnectingPTY) if !ok { return xerrors.Errorf("found invalid type in reconnecting pty map: %T", waitReady) } diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index f3241b3c798ed..1e472db2ccc0a 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -6,6 +6,7 @@ import ( "io" "net" "sync" + "time" "github.com/armon/circbuf" "github.com/prometheus/client_golang/prometheus" @@ -17,9 +18,9 @@ import ( "github.com/coder/coder/pty" ) -// bufferedBackend provides a reconnectable PTY by using a ring buffer to store +// bufferedReconnectingPTY provides a reconnectable PTY by using a ring buffer to store // scrollback. -type bufferedBackend struct { +type bufferedReconnectingPTY struct { command *pty.Cmd // mutex protects writing to the circular buffer and connections. @@ -33,35 +34,48 @@ type bufferedBackend struct { metrics *prometheus.CounterVec - // closeSession is used to close the session when the process dies. - closeSession func(reason string) + state *ptyState + // timer will close the reconnecting pty when it expires. The timer will be + // reset as long as there are active connections. + timer *time.Timer + timeout time.Duration } -// start initializes the backend and starts the pty. It must be called only -// once. If the context ends the process will be killed. -func (b *bufferedBackend) start(ctx context.Context, logger slog.Logger) error { - b.activeConns = map[string]net.Conn{} +// newBuffered starts the buffered pty. If the context ends the process will be +// killed. 
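For orientation, a standalone sketch of the scrollback idea this backend is built around: keep only the most recent output in a fixed-size ring buffer and replay that history to each connection that attaches later.

package main

import (
	"fmt"

	"github.com/armon/circbuf"
)

func main() {
	// 64KiB, the same default the buffered backend uses.
	buf, err := circbuf.NewBuffer(64 << 10)
	if err != nil {
		panic(err)
	}
	// Pretend this is pty output arriving over time; once the buffer fills up,
	// the oldest bytes fall off the front.
	_, _ = buf.Write([]byte("$ echo test\r\ntest\r\n"))
	// A newly attached connection would receive this history before live output.
	fmt.Printf("replaying %d bytes of scrollback\n", len(buf.Bytes()))
}
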
+func newBuffered(ctx context.Context, cmd *pty.Cmd, options *Options, logger slog.Logger) *bufferedReconnectingPTY { + rpty := &bufferedReconnectingPTY{ + activeConns: map[string]net.Conn{}, + command: cmd, + metrics: options.Metrics, + state: newState(), + timeout: options.Timeout, + } + + go rpty.lifecycle(ctx, logger) // Default to buffer 64KiB. circularBuffer, err := circbuf.NewBuffer(64 << 10) if err != nil { - return xerrors.Errorf("create circular buffer: %w", err) + rpty.state.setState(StateDone, xerrors.Errorf("generate screen id: %w", err)) + return rpty } - b.circularBuffer = circularBuffer - - // pty.Cmd duplicates Path as the first argument so remove it. - cmd := pty.CommandContext(ctx, b.command.Path, b.command.Args[1:]...) - cmd.Env = append(b.command.Env, "TERM=xterm-256color") - cmd.Dir = b.command.Dir - ptty, process, err := pty.Start(cmd) + rpty.circularBuffer = circularBuffer + + // Add TERM then start the command with a pty. pty.Cmd duplicates Path as the + // first argument so remove it. + cmdWithEnv := pty.CommandContext(ctx, cmd.Path, cmd.Args[1:]...) + cmdWithEnv.Env = append(rpty.command.Env, "TERM=xterm-256color") + cmdWithEnv.Dir = rpty.command.Dir + ptty, process, err := pty.Start(cmdWithEnv) if err != nil { - return err + rpty.state.setState(StateDone, xerrors.Errorf("generate screen id: %w", err)) + return rpty } - b.ptty = ptty - b.process = process + rpty.ptty = ptty + rpty.process = process // Multiplex the output onto the circular buffer and each active connection. - // // We do not need to separately monitor for the process exiting. When it // exits, our ptty.OutputReader() will return EOF after reading all process // output. @@ -76,21 +90,23 @@ func (b *bufferedBackend) start(ctx context.Context, logger slog.Logger) error { logger.Debug(ctx, "unable to read pty output, command might have exited", slog.Error(err)) } else { logger.Warn(ctx, "unable to read pty output, command might have exited", slog.Error(err)) - b.metrics.WithLabelValues("output_reader").Add(1) + rpty.metrics.WithLabelValues("output_reader").Add(1) } // Could have been killed externally or failed to start at all (command // not found for example). - b.closeSession("unable to read pty output, command might have exited") + rpty.Close("unable to read pty output, command might have exited") break } part := buffer[:read] - b.mutex.Lock() - _, err = b.circularBuffer.Write(part) + rpty.mutex.Lock() + _, err = rpty.circularBuffer.Write(part) if err != nil { logger.Error(ctx, "write to circular buffer", slog.Error(err)) - b.metrics.WithLabelValues("write_buffer").Add(1) + rpty.metrics.WithLabelValues("write_buffer").Add(1) } - for cid, conn := range b.activeConns { + // TODO: Instead of ranging over a map, could we send the output to a + // channel and have each individual Attach read from that? + for cid, conn := range rpty.activeConns { _, err = conn.Write(part) if err != nil { logger.Warn(ctx, @@ -98,71 +114,124 @@ func (b *bufferedBackend) start(ctx context.Context, logger slog.Logger) error { slog.F("connection_id", cid), slog.Error(err), ) - b.metrics.WithLabelValues("write").Add(1) + rpty.metrics.WithLabelValues("write").Add(1) } } - b.mutex.Unlock() + rpty.mutex.Unlock() } }() + return rpty +} + +// lifecycle manages the lifecycle of the reconnecting pty. If the context ends +// the reconnecting pty will be closed. 
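A compact standalone sketch of the idle-timeout mechanism the lifecycle sets up: an AfterFunc timer closes the session unless heartbeats from attached connections keep resetting it.

package main

import (
	"context"
	"fmt"
	"time"
)

func main() {
	const timeout = 100 * time.Millisecond
	closed := make(chan struct{})
	timer := time.AfterFunc(timeout, func() { close(closed) })

	// Stand-in for an attached connection: heartbeat for 250ms, then go away.
	ctx, cancel := context.WithTimeout(context.Background(), 250*time.Millisecond)
	defer cancel()
	ticker := time.NewTicker(timeout / 2)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			timer.Reset(timeout) // each heartbeat pushes the deadline out
		case <-ctx.Done():
			<-closed // with no more heartbeats the timer eventually fires
			fmt.Println("session closed after idle timeout")
			return
		}
	}
}
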
+func (rpty *bufferedReconnectingPTY) lifecycle(ctx context.Context, logger slog.Logger) { + rpty.timer = time.AfterFunc(attachTimeout, func() { + rpty.Close("reconnecting pty timeout") + }) + + logger.Debug(ctx, "reconnecting pty ready") + rpty.state.setState(StateReady, nil) + + state, reasonErr := rpty.state.waitForStateOrContext(ctx, StateClosing) + if state < StateClosing { + // If we have not closed yet then the context is what unblocked us (which + // means the agent is shutting down) so move into the closing phase. + rpty.Close(reasonErr.Error()) + } + rpty.timer.Stop() + + rpty.mutex.Lock() + defer rpty.mutex.Unlock() + + rpty.circularBuffer.Reset() + + for _, conn := range rpty.activeConns { + err := conn.Close() + if err != nil { + logger.Debug(ctx, "closed conn with error", slog.Error(err)) + } + } + + err := rpty.ptty.Close() + if err != nil { + logger.Debug(ctx, "closed ptty with error", slog.Error(err)) + } + + err = rpty.process.Kill() + if err != nil { + logger.Debug(ctx, "killed process with error", slog.Error(err)) + } + + logger.Debug(ctx, "closed reconnecting pty") + rpty.state.setState(StateDone, xerrors.Errorf("reconnecting pty closed: %w", reasonErr)) +} + +func (rpty *bufferedReconnectingPTY) Attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) error { + // This will kill the heartbeat once we hit EOF or an error. + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + logger.Debug(ctx, "reconnecting pty attach") + err := rpty.attach(ctx, connID, conn, height, width, logger) + if err != nil { + return err + } + + defer func() { + rpty.mutex.Lock() + defer rpty.mutex.Unlock() + delete(rpty.activeConns, connID) + }() + + // Pipe conn -> pty and block. pty -> conn is handled in newBuffered(). + readConnLoop(ctx, conn, rpty.ptty, rpty.metrics, logger) return nil } -// attach attaches to the pty and replays the buffer. If the context closes it -// will detach the connection but leave the process up. A connection ID is -// required so that logs in the pty goroutine can reference the same ID -// reference in logs output by each individual connection when acting on those -// connections. -func (b *bufferedBackend) attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) (pty.PTYCmd, error) { +// attach adds the connection to the map, replays the buffer, and starts the +// heartbeat. It exists separately only so we can defer the mutex unlock which +// is not possible in Attach since it blocks. +func (rpty *bufferedReconnectingPTY) attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) error { + // Ensure we do not write to or close connections while we attach. + rpty.mutex.Lock() + defer rpty.mutex.Unlock() + + state, err := rpty.state.waitForStateOrContext(ctx, StateReady) + if state != StateReady { + return xerrors.Errorf("reconnecting pty ready wait: %w", err) + } + + go heartbeat(ctx, rpty.timer, rpty.timeout) + // Resize the PTY to initial height + width. - err := b.ptty.Resize(height, width) + err = rpty.ptty.Resize(height, width) if err != nil { // We can continue after this, it's not fatal! logger.Warn(ctx, "reconnecting PTY initial resize failed, but will continue", slog.Error(err)) - b.metrics.WithLabelValues("resize").Add(1) + rpty.metrics.WithLabelValues("resize").Add(1) } // Write any previously stored data for the TTY and store the connection for // future writes. 
- b.mutex.Lock() - defer b.mutex.Unlock() - prevBuf := slices.Clone(b.circularBuffer.Bytes()) + prevBuf := slices.Clone(rpty.circularBuffer.Bytes()) _, err = conn.Write(prevBuf) if err != nil { - b.metrics.WithLabelValues("write").Add(1) - return nil, xerrors.Errorf("write buffer to conn: %w", err) + rpty.metrics.WithLabelValues("write").Add(1) + rpty.mutex.Unlock() + return xerrors.Errorf("write buffer to conn: %w", err) } - b.activeConns[connID] = conn + rpty.activeConns[connID] = conn - // Detach the connection when it or the reconnecting pty closes. - go func() { - <-ctx.Done() - b.mutex.Lock() - defer b.mutex.Unlock() - delete(b.activeConns, connID) - }() + return nil +} - return b.ptty, nil +func (rpty *bufferedReconnectingPTY) Wait() { + _, _ = rpty.state.waitForState(StateClosing) } -// close closes all connections to the reconnecting PTY, clears the circular -// buffer, and kills the process. -func (b *bufferedBackend) close(ctx context.Context, logger slog.Logger) { - b.mutex.Lock() - defer b.mutex.Unlock() - b.circularBuffer.Reset() - for _, conn := range b.activeConns { - connErr := conn.Close() - if connErr != nil { - logger.Error(ctx, "closed conn with error", slog.Error(connErr)) - } - } - pttyErr := b.ptty.Close() - if pttyErr != nil { - logger.Debug(ctx, "closed ptty with error", slog.Error(pttyErr)) - } - procErr := b.process.Kill() - if procErr != nil { - logger.Debug(ctx, "killed process with error", slog.Error(procErr)) - } +func (rpty *bufferedReconnectingPTY) Close(reason string) { + // The closing state change will be handled by the lifecycle. + rpty.state.setState(StateClosing, xerrors.Errorf("reconnecting pty closing: %s", reason)) } diff --git a/agent/reconnectingpty/reconnectingpty.go b/agent/reconnectingpty/reconnectingpty.go index 59c98f67a09d1..9000ad851d1c8 100644 --- a/agent/reconnectingpty/reconnectingpty.go +++ b/agent/reconnectingpty/reconnectingpty.go @@ -19,6 +19,10 @@ import ( "github.com/coder/coder/pty" ) +// attachTimeout is the initial timeout for attaching and will probably be far +// shorter than the reconnect timeout in most cases; in tests it might be +// longer. It should be at least long enough for the first screen attach to be +// able to start up the daemon and for the buffered pty to start. const attachTimeout = 30 * time.Second // Options allows configuring the reconnecting pty. @@ -32,50 +36,21 @@ type Options struct { BackendType codersdk.ReconnectingPTYBackendType } -// State represents the current state of the reconnecting pty. States are -// sequential and will only move forward. -type State int - -const ( - // StateStarting is the default/start state. Attaching will block until the - // reconnecting pty becomes ready. - StateStarting = iota - // StateReady means the reconnecting pty is ready to be attached. - StateReady - // StateClosing means the reconnecting pty has begun closing. The underlying - // process may still be exiting. Attaching will result in an error. - StateClosing - // StateDone means the reconnecting pty has completely shut down and the - // process has exited. Attaching will result in an error. - StateDone -) - -type backend interface { - start(ctx context.Context, logger slog.Logger) error - attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) (pty.PTYCmd, error) - close(ctx context.Context, logger slog.Logger) -} - // ReconnectingPTY is a pty that can be reconnected within a timeout and to -// simultaneous connections. 
-type ReconnectingPTY struct { - // The reconnecting pty can be backed by screen if installed or a (buggy) - // buffer replay fallback. - backend backend - // cond broadcasts state changes and any accompanying errors. - cond *sync.Cond - // error is used when a new connection attaches for cases where the session - // failed to start or has since been closed. It is not safe to access outside - // of cond.L. - error error - // options holds options for configuring the reconnecting pty. - options *Options - // state holds the current reconnecting pty state. It is not safe to access - // this outside of cond.L. - state State - // timer will close the reconnecting pty when it expires. The timer will be - // reset as long as there are active connections. - timer *time.Timer +// simultaneous connections. The reconnecting pty can be backed by screen if +// installed or a (buggy) buffer replay fallback. +type ReconnectingPTY interface { + // Attach pipes the connection and pty, spawning it if necessary, replays + // history, then blocks until EOF, an error, or the context's end. The + // connection is expected to send JSON-encoded messages and accept raw output + // from the ptty. If the context ends or the process dies the connection will + // be detached and closed. + Attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) error + // Wait waits for the reconnecting pty to close. The underlying process might + // still be exiting. + Wait() + // Close kills the reconnecting pty process. + Close(reason string) } // New sets up a new reconnecting pty that wraps the provided command. Any @@ -83,7 +58,7 @@ type ReconnectingPTY struct { // close itself (and all connections to it) if nothing is attached for the // duration of the timeout, if the context ends, or the process exits (buffered // backend only). -func New(ctx context.Context, cmd *pty.Cmd, options *Options, logger slog.Logger) *ReconnectingPTY { +func New(ctx context.Context, cmd *pty.Cmd, options *Options, logger slog.Logger) ReconnectingPTY { if options.Timeout == 0 { options.Timeout = 5 * time.Minute } @@ -101,200 +76,152 @@ func New(ctx context.Context, cmd *pty.Cmd, options *Options, logger slog.Logger } logger.Debug(ctx, "auto backend selection", slog.F("backend", options.BackendType)) } - rpty := &ReconnectingPTY{ - cond: sync.NewCond(&sync.Mutex{}), - options: options, - state: StateStarting, - } switch options.BackendType { case codersdk.ReconnectingPTYBackendTypeScreen: - // The screen backend is not passed closeSession because we have no way of - // knowing when the screen daemon dies externally anyway. The consequence is - // that we might leave reconnecting ptys in memory around longer than they - // need to be but they will eventually clean up with the timer or context, - // or the next attach will respawn the screen daemon which is fine too. - rpty.backend = &screenBackend{ - command: cmd, - metrics: rpty.options.Metrics, - } + return newScreen(ctx, cmd, options, logger) default: - rpty.backend = &bufferedBackend{ - command: cmd, - metrics: rpty.options.Metrics, - closeSession: func(reason string) { - rpty.Close(reason) - }, - } + return newBuffered(ctx, cmd, options, logger) } - go rpty.lifecycle(ctx, logger) - return rpty } -// Attach attaches the provided connection to the pty, waits for the attach to -// complete, then pipes the pty and the connection and blocks until EOF. The -// connection is expected to send JSON-encoded messages and accept raw output -// from the ptty. 
If the context ends or the process dies the connection will -// be detached and closed. -func (rpty *ReconnectingPTY) Attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) error { - // This will kill the heartbeat and detach the connection from the backend - // once we hit EOF on the connection (or an error). - ctx, cancel := context.WithCancel(ctx) - defer cancel() - - state, err := rpty.waitForStateOrContext(ctx, StateReady) - if state != StateReady { - return xerrors.Errorf("reconnecting pty ready wait: %w", err) - } +// heartbeat resets timer before timeout elapses and blocks until ctx ends. +func heartbeat(ctx context.Context, timer *time.Timer, timeout time.Duration) { + // Reset now in case it is near the end. + timer.Reset(timeout) - go rpty.heartbeat(ctx) + // Reset when the context ends to ensure the pty stays up for the full + // timeout. + defer timer.Reset(timeout) - ptty, err := rpty.backend.attach(ctx, connID, conn, height, width, logger) - if err != nil { - return xerrors.Errorf("reconnecting pty attach: %w", err) - } + heartbeat := time.NewTicker(timeout / 2) + defer heartbeat.Stop() - decoder := json.NewDecoder(conn) - var req codersdk.ReconnectingPTYRequest for { - err = decoder.Decode(&req) - if xerrors.Is(err, io.EOF) { - return nil - } - if err != nil { - logger.Warn(ctx, "reconnecting pty failed with read error", slog.Error(err)) - return nil - } - _, err = ptty.InputWriter().Write([]byte(req.Data)) - if err != nil { - logger.Warn(ctx, "reconnecting pty failed with write error", slog.Error(err)) - rpty.options.Metrics.WithLabelValues("input_writer").Add(1) - return nil - } - // Check if a resize needs to happen! - if req.Height == 0 || req.Width == 0 { - continue - } - err = ptty.Resize(req.Height, req.Width) - if err != nil { - // We can continue after this, it's not fatal! - logger.Warn(ctx, "reconnecting pty resize failed, but will continue", slog.Error(err)) - rpty.options.Metrics.WithLabelValues("resize").Add(1) + select { + case <-ctx.Done(): + return + case <-heartbeat.C: + timer.Reset(timeout) } } } -// Wait waits for the reconnecting pty to close. The underlying process might -// still be exiting. -func (rpty *ReconnectingPTY) Wait() { - _, _ = rpty.waitForState(StateClosing) -} - -// Close kills the reconnecting pty process. -func (rpty *ReconnectingPTY) Close(reason string) { - // The closing state change will be handled by the lifecycle. - rpty.setState(StateClosing, xerrors.Errorf("reconnecting pty closing: %s", reason)) -} - -// lifecycle manages the lifecycle of the reconnecting pty. If the context ends -// the reconnecting pty will be closed. -func (rpty *ReconnectingPTY) lifecycle(ctx context.Context, logger slog.Logger) { - err := rpty.backend.start(ctx, logger) - if err != nil { - rpty.setState(StateDone, xerrors.Errorf("reconnecting pty start: %w", err)) - return - } - - // The initial timeout for attaching will probably be far shorter than the - // reconnect timeout in most cases; in tests it might be longer. It should be - // at least long enough for the first screen attach to be able to start up the - // daemon. - rpty.timer = time.AfterFunc(attachTimeout, func() { - rpty.Close("reconnecting pty timeout") - }) +// State represents the current state of the reconnecting pty. States are +// sequential and will only move forward. +type State int - logger.Debug(ctx, "reconnecting pty ready") - rpty.setState(StateReady, nil) +const ( + // StateStarting is the default/start state. 
Attaching will block until the + // reconnecting pty becomes ready. + StateStarting = iota + // StateReady means the reconnecting pty is ready to be attached. + StateReady + // StateClosing means the reconnecting pty has begun closing. The underlying + // process may still be exiting. Attaching will result in an error. + StateClosing + // StateDone means the reconnecting pty has completely shut down and the + // process has exited. Attaching will result in an error. + StateDone +) - state, reasonErr := rpty.waitForStateOrContext(ctx, StateClosing) - if state < StateClosing { - // If we have not closed yet then the context is what unblocked us (which - // means the agent is shutting down) so move into the closing phase. - rpty.setState(StateClosing, xerrors.Errorf("reconnecting pty closing: %w", reasonErr)) - } - rpty.timer.Stop() - rpty.backend.close(ctx, logger) - logger.Debug(ctx, "closed reconnecting pty") - rpty.setState(StateDone, xerrors.Errorf("reconnecting pty closed: %w", reasonErr)) +// ptyState is a helper for tracking the reconnecting PTY's state. +type ptyState struct { + // cond broadcasts state changes and any accompanying errors. + cond *sync.Cond + // error describes the error that caused the state change, if there was one. + // It is not safe to access outside of cond.L. + error error + // state holds the current reconnecting pty state. It is not safe to access + // this outside of cond.L. + state State } -// heartbeat keeps the pty alive while the provided context is not done. -func (rpty *ReconnectingPTY) heartbeat(ctx context.Context) { - // We just connected so reset the timer now in case it is near the end. - rpty.timer.Reset(rpty.options.Timeout) - - // Reset when the connection closes to ensure the pty stays up for the full - // timeout. - defer rpty.timer.Reset(rpty.options.Timeout) - - heartbeat := time.NewTicker(rpty.options.Timeout / 2) - defer heartbeat.Stop() - - for { - select { - case <-ctx.Done(): - return - case <-heartbeat.C: - rpty.timer.Reset(rpty.options.Timeout) - } +func newState() *ptyState { + return &ptyState{ + cond: sync.NewCond(&sync.Mutex{}), + state: StateStarting, } } // setState sets and broadcasts the provided state if it is greater than the // current state and the error if one has not already been set. -func (rpty *ReconnectingPTY) setState(state State, err error) { - rpty.cond.L.Lock() - defer rpty.cond.L.Unlock() +func (s *ptyState) setState(state State, err error) { + s.cond.L.Lock() + defer s.cond.L.Unlock() // Cannot regress states. For example, trying to close after the process is // done should leave us in the done state and not the closing state. - if state <= rpty.state { + if state <= s.state { return } - rpty.error = err - rpty.state = state - rpty.cond.Broadcast() + s.error = err + s.state = state + s.cond.Broadcast() } // waitForState blocks until the state or a greater one is reached. -func (rpty *ReconnectingPTY) waitForState(state State) (State, error) { - rpty.cond.L.Lock() - defer rpty.cond.L.Unlock() - for state > rpty.state { - rpty.cond.Wait() +func (s *ptyState) waitForState(state State) (State, error) { + s.cond.L.Lock() + defer s.cond.L.Unlock() + for state > s.state { + s.cond.Wait() } - return rpty.state, rpty.error + return s.state, s.error } // waitForStateOrContext blocks until the state or a greater one is reached or // the provided context ends. 
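A distilled, standalone version of the wake-on-context pattern this helper uses: a short-lived goroutine broadcasts when the context ends and is released through a done channel so it cannot leak. In this stripped-down form the broadcast is taken under the lock to rule out a missed wakeup.

package main

import (
	"context"
	"sync"
	"time"
)

// waitOrCtx blocks until ready flips to true or ctx ends.
func waitOrCtx(ctx context.Context, cond *sync.Cond, ready *bool) {
	done := make(chan struct{})
	defer close(done)
	go func() {
		select {
		case <-ctx.Done():
			cond.L.Lock()
			cond.Broadcast()
			cond.L.Unlock()
		case <-done:
		}
	}()
	cond.L.Lock()
	defer cond.L.Unlock()
	for ctx.Err() == nil && !*ready {
		cond.Wait()
	}
}

func main() {
	cond := sync.NewCond(&sync.Mutex{})
	ready := false
	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()
	waitOrCtx(ctx, cond, &ready) // returns once the timeout fires
}
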
-func (rpty *ReconnectingPTY) waitForStateOrContext(ctx context.Context, state State) (State, error) { +func (s *ptyState) waitForStateOrContext(ctx context.Context, state State) (State, error) { nevermind := make(chan struct{}) defer close(nevermind) go func() { select { case <-ctx.Done(): // Wake up when the context ends. - rpty.cond.Broadcast() + s.cond.Broadcast() case <-nevermind: } }() - rpty.cond.L.Lock() - defer rpty.cond.L.Unlock() - for ctx.Err() == nil && state > rpty.state { - rpty.cond.Wait() + s.cond.L.Lock() + defer s.cond.L.Unlock() + for ctx.Err() == nil && state > s.state { + s.cond.Wait() } if ctx.Err() != nil { - return rpty.state, ctx.Err() + return s.state, ctx.Err() + } + return s.state, s.error +} + +// readConnLoop reads messages from conn and writes to ptty as needed. Blocks +// until EOF or an error writing to ptty or reading from conn. +func readConnLoop(ctx context.Context, conn net.Conn, ptty pty.PTYCmd, metrics *prometheus.CounterVec, logger slog.Logger) { + decoder := json.NewDecoder(conn) + var req codersdk.ReconnectingPTYRequest + for { + err := decoder.Decode(&req) + if xerrors.Is(err, io.EOF) { + return + } + if err != nil { + logger.Warn(ctx, "reconnecting pty failed with read error", slog.Error(err)) + return + } + _, err = ptty.InputWriter().Write([]byte(req.Data)) + if err != nil { + logger.Warn(ctx, "reconnecting pty failed with write error", slog.Error(err)) + metrics.WithLabelValues("input_writer").Add(1) + return + } + // Check if a resize needs to happen! + if req.Height == 0 || req.Width == 0 { + continue + } + err = ptty.Resize(req.Height, req.Width) + if err != nil { + // We can continue after this, it's not fatal! + logger.Warn(ctx, "reconnecting pty resize failed, but will continue", slog.Error(err)) + metrics.WithLabelValues("resize").Add(1) + } } - return rpty.state, rpty.error } diff --git a/agent/reconnectingpty/screen.go b/agent/reconnectingpty/screen.go index c4fb8e634718e..03569ef8235b9 100644 --- a/agent/reconnectingpty/screen.go +++ b/agent/reconnectingpty/screen.go @@ -23,8 +23,8 @@ import ( "github.com/coder/coder/pty" ) -// screenBackend provides a reconnectable PTY via `screen`. -type screenBackend struct { +// screenReconnectingPTY provides a reconnectable PTY via `screen`. +type screenReconnectingPTY struct { command *pty.Cmd // id holds the id of the session for both creating and attaching. This will @@ -45,24 +45,40 @@ type screenBackend struct { configFile string metrics *prometheus.CounterVec + + state *ptyState + // timer will close the reconnecting pty when it expires. The timer will be + // reset as long as there are active connections. + timer *time.Timer + timeout time.Duration } -// start writes config settings and creates the socket directory. It must only -// be called once. If we could, we would want to spawn the daemon here and -// attach each connection to it but since doing that spawns the daemon with a -// hardcoded 24x80 size it is not a very good user experience. Instead we will -// let the attach command spawn the daemon on its own which causes it to spawn -// with the specified size. -func (b *screenBackend) start(_ context.Context, _ slog.Logger) error { +// newScreen creates a new screen-backed reconnecting PTY. It writes config +// settings and creates the socket directory. If we could, we would want to +// spawn the daemon here and attach each connection to it but since doing that +// spawns the daemon with a hardcoded 24x80 size it is not a very good user +// experience. 
Instead we will let the attach command spawn the daemon on its +// own which causes it to spawn with the specified size. +func newScreen(ctx context.Context, cmd *pty.Cmd, options *Options, logger slog.Logger) *screenReconnectingPTY { + rpty := &screenReconnectingPTY{ + command: cmd, + metrics: options.Metrics, + state: newState(), + timeout: options.Timeout, + } + + go rpty.lifecycle(ctx, logger) + // Socket paths are limited to around 100 characters on Linux and macOS which // depending on the temporary directory can be a problem. To give more leeway // use a short ID. buf := make([]byte, 4) _, err := rand.Read(buf) if err != nil { - return xerrors.Errorf("generate screen id: %w", err) + rpty.state.setState(StateDone, xerrors.Errorf("generate screen id: %w", err)) + return rpty } - b.id = hex.EncodeToString(buf) + rpty.id = hex.EncodeToString(buf) settings := []string{ // Tell screen not to handle motion for xterm* terminals which allows @@ -88,77 +104,84 @@ func (b *screenBackend) start(_ context.Context, _ slog.Logger) error { "escape ^Ss", } - b.configFile = filepath.Join(os.TempDir(), "coder-screen", "config") - err = os.MkdirAll(filepath.Dir(b.configFile), 0o700) + rpty.configFile = filepath.Join(os.TempDir(), "coder-screen", "config") + err = os.MkdirAll(filepath.Dir(rpty.configFile), 0o700) if err != nil { - return err + rpty.state.setState(StateDone, xerrors.Errorf("make screen config dir: %w", err)) + return rpty + } + + os.WriteFile(rpty.configFile, []byte(strings.Join(settings, "\n")), 0o600) + if err != nil { + rpty.state.setState(StateDone, xerrors.Errorf("create config file: %w", err)) + return rpty } - return os.WriteFile(b.configFile, []byte(strings.Join(settings, "\n")), 0o600) + return rpty } -// attach attaches to the screen session by ID (which will spawn the daemon if -// necessary) and hooks up the output to the connection. If the context ends it -// will kill the connection's process (not the daemon), detaching it. -func (b *screenBackend) attach(ctx context.Context, _ string, conn net.Conn, height, width uint16, logger slog.Logger) (pty.PTYCmd, error) { - // Ensure another attach does not come in and spawn a duplicate session. - b.mutex.Lock() - defer b.mutex.Unlock() +// lifecycle manages the lifecycle of the reconnecting pty. If the context ends +// the reconnecting pty will be closed. +func (rpty *screenReconnectingPTY) lifecycle(ctx context.Context, logger slog.Logger) { + rpty.timer = time.AfterFunc(attachTimeout, func() { + rpty.Close("reconnecting pty timeout") + }) - logger.Debug(ctx, "spawning screen client", slog.F("screen_id", b.id)) + logger.Debug(ctx, "reconnecting pty ready") + rpty.state.setState(StateReady, nil) - // Wrap the command with screen and tie it to the connection's context. - cmd := pty.CommandContext(ctx, "screen", append([]string{ - // -S is for setting the session's name. - "-S", b.id, - // -x allows attaching to an already attached session. - // -RR reattaches to the daemon or creates the session daemon if missing. - // -q disables the "New screen..." message that appears for five seconds - // when creating a new session with -RR. - // -c is the flag for the config file. - "-xRRqc", b.configFile, - b.command.Path, - // pty.Cmd duplicates Path as the first argument so remove it. - }, b.command.Args[1:]...)...) 
- cmd.Env = append(b.command.Env, "TERM=xterm-256color") - cmd.Dir = b.command.Dir - ptty, process, err := pty.Start(cmd, pty.WithPTYOption( - pty.WithSSHRequest(ssh.Pty{ - Window: ssh.Window{ - // Make sure to spawn at the right size because if we resize afterward it - // leaves confusing padding (screen will resize such that the screen - // contents are aligned to the bottom). - Height: int(height), - Width: int(width), - }, - }), - )) + state, reasonErr := rpty.state.waitForStateOrContext(ctx, StateClosing) + if state < StateClosing { + // If we have not closed yet then the context is what unblocked us (which + // means the agent is shutting down) so move into the closing phase. + rpty.Close(reasonErr.Error()) + } + rpty.timer.Stop() + + // If the command errors that the session is already gone that is fine. + err := rpty.sendCommand(context.Background(), "quit", []string{"No screen session found"}) if err != nil { - b.metrics.WithLabelValues("screen_spawn").Add(1) - return nil, err + logger.Error(ctx, "close screen session", slog.Error(err)) } - cleanup := func() { - pttyErr := ptty.Close() - if pttyErr != nil { - logger.Debug(ctx, "closed ptty with error", slog.Error(pttyErr)) - } - procErr := process.Kill() - if procErr != nil { - logger.Debug(ctx, "killed process with error", slog.Error(procErr)) - } + logger.Debug(ctx, "closed reconnecting pty") + rpty.state.setState(StateDone, xerrors.Errorf("reconnecting pty closed: %w", reasonErr)) +} + +func (rpty *screenReconnectingPTY) Attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) error { + // This will kill the heartbeat once we hit EOF or an error. + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + defer func() { connErr := conn.Close() if connErr != nil { logger.Debug(ctx, "closed connection with error", slog.Error(connErr)) } + }() + + logger.Debug(ctx, "reconnecting pty attach") + ptty, process, err := rpty.attach(ctx, connID, conn, height, width, logger) + if err != nil { + return err } - // Pipe the process's output to the connection. + defer func() { + err := ptty.Close() + if err != nil { + logger.Debug(ctx, "closed ptty with error", slog.Error(err)) + } + err = process.Kill() + if err != nil { + logger.Debug(ctx, "killed process with error", slog.Error(err)) + } + }() + + // Pipe pty -> conn. // We do not need to separately monitor for the process exiting. When it // exits, our ptty.OutputReader() will return EOF after reading all process // output. go func() { - defer cleanup() buffer := make([]byte, 1024) for { read, err := ptty.OutputReader().Read(buffer) @@ -169,11 +192,15 @@ func (b *screenBackend) attach(ctx context.Context, _ string, conn net.Conn, hei logger.Debug(ctx, "unable to read pty output; screen might have exited", slog.Error(err)) } else { logger.Warn(ctx, "unable to read pty output; screen might have exited", slog.Error(err)) - b.metrics.WithLabelValues("screen_output_reader").Add(1) + rpty.metrics.WithLabelValues("screen_output_reader").Add(1) } // The process might have died because the session itself died or it // might have been separately killed and the session is still up ( - // example `exit` or we killed it when the connection closed). + // example `exit` or we killed it when the connection closed). 
If the + // session is still up we might leave the reconnecting pty in memory + // around longer than it needs to be but it will eventually clean up + // with the timer or context, or the next attach will respawn the screen + // daemon which is fine too. break } part := buffer[:read] @@ -182,40 +209,85 @@ func (b *screenBackend) attach(ctx context.Context, _ string, conn net.Conn, hei // Connection might have been closed. if errors.Unwrap(err).Error() != "endpoint is closed for send" { logger.Warn(ctx, "error writing to active conn", slog.Error(err)) - b.metrics.WithLabelValues("screen_write").Add(1) + rpty.metrics.WithLabelValues("screen_write").Add(1) } break } } }() - // Clean up the process when the connection or reconnecting pty closes. - go func() { - defer cleanup() - <-ctx.Done() - }() + // Pipe conn -> pty and block. + readConnLoop(ctx, conn, ptty, rpty.metrics, logger) + return nil +} + +// attach spawns the screen client and starts the hearbeat. It exists +// separately only so we can defer the mutex unlock which is not possible in +// Attach since it blocks. +func (rpty *screenReconnectingPTY) attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) (pty.PTYCmd, pty.Process, error) { + // Ensure another attach does not come in and spawn a duplicate session. + rpty.mutex.Lock() + defer rpty.mutex.Unlock() + + state, err := rpty.state.waitForStateOrContext(ctx, StateReady) + if state != StateReady { + return nil, nil, xerrors.Errorf("reconnecting pty ready wait: %w", err) + } + + go heartbeat(ctx, rpty.timer, rpty.timeout) + + logger.Debug(ctx, "spawning screen client", slog.F("screen_id", rpty.id)) + + // Wrap the command with screen and tie it to the connection's context. + cmd := pty.CommandContext(ctx, "screen", append([]string{ + // -S is for setting the session's name. + "-S", rpty.id, + // -x allows attaching to an already attached session. + // -RR reattaches to the daemon or creates the session daemon if missing. + // -q disables the "New screen..." message that appears for five seconds + // when creating a new session with -RR. + // -c is the flag for the config file. + "-xRRqc", rpty.configFile, + rpty.command.Path, + // pty.Cmd duplicates Path as the first argument so remove it. + }, rpty.command.Args[1:]...)...) + cmd.Env = append(rpty.command.Env, "TERM=xterm-256color") + cmd.Dir = rpty.command.Dir + ptty, process, err := pty.Start(cmd, pty.WithPTYOption( + pty.WithSSHRequest(ssh.Pty{ + Window: ssh.Window{ + // Make sure to spawn at the right size because if we resize afterward it + // leaves confusing padding (screen will resize such that the screen + // contents are aligned to the bottom). + Height: int(height), + Width: int(width), + }, + }), + )) + if err != nil { + rpty.metrics.WithLabelValues("screen_spawn").Add(1) + return nil, nil, err + } // Version seems to be the only command without a side effect (other than // making the version pop up briefly) so use it to wait for the session to // come up. If we do not wait we could end up spawning multiple sessions with // the same name. 
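A rough standalone sketch of that wait: poll a side-effect-free screen command until the session daemon answers or the deadline passes. The session ID and config path in main are placeholders, and screen must be installed for the poll to ever succeed.

package main

import (
	"context"
	"fmt"
	"os/exec"
	"time"
)

// waitForScreen retries `screen -x <id> -X version` until it exits cleanly,
// which means the daemon is up and accepting commands.
func waitForScreen(ctx context.Context, id, configFile string) error {
	ticker := time.NewTicker(250 * time.Millisecond)
	defer ticker.Stop()
	for {
		err := exec.CommandContext(ctx, "screen", "-x", id, "-c", configFile, "-X", "version").Run()
		if err == nil {
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	fmt.Println(waitForScreen(ctx, "example-session", "/tmp/coder-screen-config"))
}
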
- err = b.sendCommand(ctx, "version", nil) + err = rpty.sendCommand(ctx, "version", nil) if err != nil { - cleanup() - b.metrics.WithLabelValues("screen_wait").Add(1) - return nil, err + err := ptty.Close() + if err != nil { + logger.Debug(ctx, "closed ptty with error", slog.Error(err)) + } + err = process.Kill() + if err != nil { + logger.Debug(ctx, "killed process with error", slog.Error(err)) + } + rpty.metrics.WithLabelValues("screen_wait").Add(1) + return nil, nil, err } - return ptty, nil -} - -// close asks screen to kill the session by its ID. -func (b *screenBackend) close(ctx context.Context, logger slog.Logger) { - // If the command errors that the session is already gone that is fine. - err := b.sendCommand(context.Background(), "quit", []string{"No screen session found"}) - if err != nil { - logger.Error(ctx, "close screen session", slog.Error(err)) - } + return ptty, process, nil } // sendCommand runs a screen command against a running screen session. If the @@ -224,7 +296,7 @@ func (b *screenBackend) close(ctx context.Context, logger slog.Logger) { // session is already dead). The command will be retried until successful, the // timeout is reached, or the context ends in which case the context error is // returned together with the last error from the command. -func (b *screenBackend) sendCommand(ctx context.Context, command string, successErrors []string) error { +func (rpty *screenReconnectingPTY) sendCommand(ctx context.Context, command string, successErrors []string) error { ctx, cancel := context.WithTimeout(ctx, attachTimeout) defer cancel() @@ -234,14 +306,14 @@ func (b *screenBackend) sendCommand(ctx context.Context, command string, success //nolint:gosec cmd := exec.CommandContext(ctx, "screen", // -x targets an attached session. - "-x", b.id, + "-x", rpty.id, // -c is the flag for the config file. - "-c", b.configFile, + "-c", rpty.configFile, // -X runs a command in the matching session. "-X", command, ) - cmd.Env = append(b.command.Env, "TERM=xterm-256color") - cmd.Dir = b.command.Dir + cmd.Env = append(rpty.command.Env, "TERM=xterm-256color") + cmd.Dir = rpty.command.Dir cmd.Stdout = &stdout err := cmd.Run() if err == nil { @@ -258,7 +330,7 @@ func (b *screenBackend) sendCommand(ctx context.Context, command string, success // Things like "exit status 1" are imprecise so include stdout as it may // contain more information ("no screen session found" for example). if !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) { - lastErr = xerrors.Errorf("`screen -x %s -X %s`: %w: %s", b.id, command, err, stdoutStr) + lastErr = xerrors.Errorf("`screen -x %s -X %s`: %w: %s", rpty.id, command, err, stdoutStr) } return false @@ -284,3 +356,12 @@ func (b *screenBackend) sendCommand(ctx context.Context, command string, success } } } + +func (rpty *screenReconnectingPTY) Wait() { + _, _ = rpty.state.waitForState(StateClosing) +} + +func (rpty *screenReconnectingPTY) Close(reason string) { + // The closing state change will be handled by the lifecycle. 
+ rpty.state.setState(StateClosing, xerrors.Errorf("reconnecting pty closing: %s", reason)) +} From a2149fc17ccd1c307337207496d51ae19d2c366e Mon Sep 17 00:00:00 2001 From: Asher Date: Tue, 8 Aug 2023 13:32:17 -0800 Subject: [PATCH 06/28] Remove extra mutex unlock --- agent/reconnectingpty/buffered.go | 1 - 1 file changed, 1 deletion(-) diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index 1e472db2ccc0a..d0d015cdeeb76 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -219,7 +219,6 @@ func (rpty *bufferedReconnectingPTY) attach(ctx context.Context, connID string, _, err = conn.Write(prevBuf) if err != nil { rpty.metrics.WithLabelValues("write").Add(1) - rpty.mutex.Unlock() return xerrors.Errorf("write buffer to conn: %w", err) } rpty.activeConns[connID] = conn From e3c808b867b8be477d3ad9aaf78a7759b56f2dcc Mon Sep 17 00:00:00 2001 From: Asher Date: Tue, 8 Aug 2023 13:41:04 -0800 Subject: [PATCH 07/28] Fix heartbeat typo in comment --- agent/reconnectingpty/screen.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent/reconnectingpty/screen.go b/agent/reconnectingpty/screen.go index 03569ef8235b9..cb391dee5cf48 100644 --- a/agent/reconnectingpty/screen.go +++ b/agent/reconnectingpty/screen.go @@ -221,7 +221,7 @@ func (rpty *screenReconnectingPTY) Attach(ctx context.Context, connID string, co return nil } -// attach spawns the screen client and starts the hearbeat. It exists +// attach spawns the screen client and starts the heartbeat. It exists // separately only so we can defer the mutex unlock which is not possible in // Attach since it blocks. func (rpty *screenReconnectingPTY) attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) (pty.PTYCmd, pty.Process, error) { From 369a36e03090ba7f4f7e32d59dd6893eb707927b Mon Sep 17 00:00:00 2001 From: Asher Date: Tue, 8 Aug 2023 13:56:22 -0800 Subject: [PATCH 08/28] Tweak connection close We need to close the connection when the process exits otherwise the read will keep blocking. No need to close it where we were before since the agent already closes the connection once Attach yields. --- agent/reconnectingpty/buffered.go | 2 ++ agent/reconnectingpty/screen.go | 19 +++++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index d0d015cdeeb76..929e62a50a977 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -147,6 +147,8 @@ func (rpty *bufferedReconnectingPTY) lifecycle(ctx context.Context, logger slog. rpty.circularBuffer.Reset() + // Log these closes only for debugging since the connections or processes + // might have already closed on their own. 
for _, conn := range rpty.activeConns { err := conn.Close() if err != nil { diff --git a/agent/reconnectingpty/screen.go b/agent/reconnectingpty/screen.go index cb391dee5cf48..24d024181dd28 100644 --- a/agent/reconnectingpty/screen.go +++ b/agent/reconnectingpty/screen.go @@ -153,13 +153,6 @@ func (rpty *screenReconnectingPTY) Attach(ctx context.Context, connID string, co ctx, cancel := context.WithCancel(ctx) defer cancel() - defer func() { - connErr := conn.Close() - if connErr != nil { - logger.Debug(ctx, "closed connection with error", slog.Error(connErr)) - } - }() - logger.Debug(ctx, "reconnecting pty attach") ptty, process, err := rpty.attach(ctx, connID, conn, height, width, logger) if err != nil { @@ -167,6 +160,8 @@ func (rpty *screenReconnectingPTY) Attach(ctx context.Context, connID string, co } defer func() { + // Log only for debugging since the process might have already exited on its + // own. err := ptty.Close() if err != nil { logger.Debug(ctx, "closed ptty with error", slog.Error(err)) @@ -182,6 +177,14 @@ func (rpty *screenReconnectingPTY) Attach(ctx context.Context, connID string, co // exits, our ptty.OutputReader() will return EOF after reading all process // output. go func() { + // Close the connection when the process exits. Log only for debugging + // since the connection might have already closed on its own. + defer func() { + err := conn.Close() + if err != nil { + logger.Debug(ctx, "closed connection with error", slog.Error(err)) + } + }() buffer := make([]byte, 1024) for { read, err := ptty.OutputReader().Read(buffer) @@ -195,7 +198,7 @@ func (rpty *screenReconnectingPTY) Attach(ctx context.Context, connID string, co rpty.metrics.WithLabelValues("screen_output_reader").Add(1) } // The process might have died because the session itself died or it - // might have been separately killed and the session is still up ( + // might have been separately killed and the session is still up (for // example `exit` or we killed it when the connection closed). If the // session is still up we might leave the reconnecting pty in memory // around longer than it needs to be but it will eventually clean up From 72d405c6764427ebb4add7d636822ec1a29cd5d0 Mon Sep 17 00:00:00 2001 From: Asher Date: Tue, 8 Aug 2023 14:34:03 -0800 Subject: [PATCH 09/28] Linter fixes I missed an err, it dislikes the attach/Attach naming, and some unused variables. --- agent/reconnectingpty/buffered.go | 6 +++--- agent/reconnectingpty/screen.go | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index 929e62a50a977..1df21e78f8b13 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -176,7 +176,7 @@ func (rpty *bufferedReconnectingPTY) Attach(ctx context.Context, connID string, defer cancel() logger.Debug(ctx, "reconnecting pty attach") - err := rpty.attach(ctx, connID, conn, height, width, logger) + err := rpty.doAttach(ctx, connID, conn, height, width, logger) if err != nil { return err } @@ -192,10 +192,10 @@ func (rpty *bufferedReconnectingPTY) Attach(ctx context.Context, connID string, return nil } -// attach adds the connection to the map, replays the buffer, and starts the +// doAttach adds the connection to the map, replays the buffer, and starts the // heartbeat. It exists separately only so we can defer the mutex unlock which // is not possible in Attach since it blocks. 
-func (rpty *bufferedReconnectingPTY) attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) error { +func (rpty *bufferedReconnectingPTY) doAttach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) error { // Ensure we do not write to or close connections while we attach. rpty.mutex.Lock() defer rpty.mutex.Unlock() diff --git a/agent/reconnectingpty/screen.go b/agent/reconnectingpty/screen.go index 24d024181dd28..f95730d841561 100644 --- a/agent/reconnectingpty/screen.go +++ b/agent/reconnectingpty/screen.go @@ -111,7 +111,7 @@ func newScreen(ctx context.Context, cmd *pty.Cmd, options *Options, logger slog. return rpty } - os.WriteFile(rpty.configFile, []byte(strings.Join(settings, "\n")), 0o600) + err = os.WriteFile(rpty.configFile, []byte(strings.Join(settings, "\n")), 0o600) if err != nil { rpty.state.setState(StateDone, xerrors.Errorf("create config file: %w", err)) return rpty @@ -148,13 +148,13 @@ func (rpty *screenReconnectingPTY) lifecycle(ctx context.Context, logger slog.Lo rpty.state.setState(StateDone, xerrors.Errorf("reconnecting pty closed: %w", reasonErr)) } -func (rpty *screenReconnectingPTY) Attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) error { +func (rpty *screenReconnectingPTY) Attach(ctx context.Context, _ string, conn net.Conn, height, width uint16, logger slog.Logger) error { // This will kill the heartbeat once we hit EOF or an error. ctx, cancel := context.WithCancel(ctx) defer cancel() logger.Debug(ctx, "reconnecting pty attach") - ptty, process, err := rpty.attach(ctx, connID, conn, height, width, logger) + ptty, process, err := rpty.doAttach(ctx, height, width, logger) if err != nil { return err } @@ -224,10 +224,10 @@ func (rpty *screenReconnectingPTY) Attach(ctx context.Context, connID string, co return nil } -// attach spawns the screen client and starts the heartbeat. It exists +// doAttach spawns the screen client and starts the heartbeat. It exists // separately only so we can defer the mutex unlock which is not possible in // Attach since it blocks. -func (rpty *screenReconnectingPTY) attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) (pty.PTYCmd, pty.Process, error) { +func (rpty *screenReconnectingPTY) doAttach(ctx context.Context, height, width uint16, logger slog.Logger) (pty.PTYCmd, pty.Process, error) { // Ensure another attach does not come in and spawn a duplicate session. rpty.mutex.Lock() defer rpty.mutex.Unlock() From e37fc0f18fcf439bb1e8c0863b326cd1b696530f Mon Sep 17 00:00:00 2001 From: Asher Date: Wed, 9 Aug 2023 10:29:09 -0800 Subject: [PATCH 10/28] Clear active conns on close --- agent/reconnectingpty/buffered.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index 1df21e78f8b13..40264d05cd2ea 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -155,6 +155,10 @@ func (rpty *bufferedReconnectingPTY) lifecycle(ctx context.Context, logger slog. logger.Debug(ctx, "closed conn with error", slog.Error(err)) } } + // Connections get removed once the pty closes but it is possible there is + // still some data that needs to be written so clear the map now to avoid + // writing to closed connections. 
+ rpty.activeConns = map[string]net.Conn{} err := rpty.ptty.Close() if err != nil { From 61a4253cee8ec36c222d28bc647c5ae31114c852 Mon Sep 17 00:00:00 2001 From: Asher Date: Wed, 9 Aug 2023 10:29:45 -0800 Subject: [PATCH 11/28] Avoid useless buffer reset on close --- agent/reconnectingpty/buffered.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index 40264d05cd2ea..679d772f4c475 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -145,8 +145,6 @@ func (rpty *bufferedReconnectingPTY) lifecycle(ctx context.Context, logger slog. rpty.mutex.Lock() defer rpty.mutex.Unlock() - rpty.circularBuffer.Reset() - // Log these closes only for debugging since the connections or processes // might have already closed on their own. for _, conn := range rpty.activeConns { From c7978db5670598c7a8c9a8406dfe3c22cb483f55 Mon Sep 17 00:00:00 2001 From: Asher Date: Wed, 9 Aug 2023 10:31:24 -0800 Subject: [PATCH 12/28] Move lifecycle after buffer and process are set The lifecycle will try to close the process (and before it would clear the buffer as well) so they need to be set. --- agent/reconnectingpty/buffered.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index 679d772f4c475..753e979c376cf 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -52,8 +52,6 @@ func newBuffered(ctx context.Context, cmd *pty.Cmd, options *Options, logger slo timeout: options.Timeout, } - go rpty.lifecycle(ctx, logger) - // Default to buffer 64KiB. circularBuffer, err := circbuf.NewBuffer(64 << 10) if err != nil { @@ -75,6 +73,8 @@ func newBuffered(ctx context.Context, cmd *pty.Cmd, options *Options, logger slo rpty.ptty = ptty rpty.process = process + go rpty.lifecycle(ctx, logger) + // Multiplex the output onto the circular buffer and each active connection. // We do not need to separately monitor for the process exiting. When it // exits, our ptty.OutputReader() will return EOF after reading all process @@ -125,7 +125,7 @@ func newBuffered(ctx context.Context, cmd *pty.Cmd, options *Options, logger slo } // lifecycle manages the lifecycle of the reconnecting pty. If the context ends -// the reconnecting pty will be closed. +// the reconnecting pty and every connection will be closed. func (rpty *bufferedReconnectingPTY) lifecycle(ctx context.Context, logger slog.Logger) { rpty.timer = time.AfterFunc(attachTimeout, func() { rpty.Close("reconnecting pty timeout") From a083b31abc2af55ac4a4cd184d254b14c2010b18 Mon Sep 17 00:00:00 2001 From: Asher Date: Wed, 9 Aug 2023 10:34:44 -0800 Subject: [PATCH 13/28] Add info logs for starting, stopping, and attaching --- agent/reconnectingpty/buffered.go | 5 +++-- agent/reconnectingpty/reconnectingpty.go | 3 +++ agent/reconnectingpty/screen.go | 5 +++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index 753e979c376cf..f0c7d63863c67 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -168,16 +168,17 @@ func (rpty *bufferedReconnectingPTY) lifecycle(ctx context.Context, logger slog. 
logger.Debug(ctx, "killed process with error", slog.Error(err)) } - logger.Debug(ctx, "closed reconnecting pty") + logger.Info(ctx, "closed reconnecting pty") rpty.state.setState(StateDone, xerrors.Errorf("reconnecting pty closed: %w", reasonErr)) } func (rpty *bufferedReconnectingPTY) Attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) error { + logger.Info(ctx, "attach to reconnecting pty") + // This will kill the heartbeat once we hit EOF or an error. ctx, cancel := context.WithCancel(ctx) defer cancel() - logger.Debug(ctx, "reconnecting pty attach") err := rpty.doAttach(ctx, connID, conn, height, width, logger) if err != nil { return err diff --git a/agent/reconnectingpty/reconnectingpty.go b/agent/reconnectingpty/reconnectingpty.go index 9000ad851d1c8..29fa171e29dc2 100644 --- a/agent/reconnectingpty/reconnectingpty.go +++ b/agent/reconnectingpty/reconnectingpty.go @@ -76,6 +76,9 @@ func New(ctx context.Context, cmd *pty.Cmd, options *Options, logger slog.Logger } logger.Debug(ctx, "auto backend selection", slog.F("backend", options.BackendType)) } + + logger.Info(ctx, "start reconnecting pty") + switch options.BackendType { case codersdk.ReconnectingPTYBackendTypeScreen: return newScreen(ctx, cmd, options, logger) diff --git a/agent/reconnectingpty/screen.go b/agent/reconnectingpty/screen.go index f95730d841561..afc3584420ec6 100644 --- a/agent/reconnectingpty/screen.go +++ b/agent/reconnectingpty/screen.go @@ -144,16 +144,17 @@ func (rpty *screenReconnectingPTY) lifecycle(ctx context.Context, logger slog.Lo logger.Error(ctx, "close screen session", slog.Error(err)) } - logger.Debug(ctx, "closed reconnecting pty") + logger.Info(ctx, "closed reconnecting pty") rpty.state.setState(StateDone, xerrors.Errorf("reconnecting pty closed: %w", reasonErr)) } func (rpty *screenReconnectingPTY) Attach(ctx context.Context, _ string, conn net.Conn, height, width uint16, logger slog.Logger) error { + logger.Info(ctx, "attach to reconnecting pty") + // This will kill the heartbeat once we hit EOF or an error. ctx, cancel := context.WithCancel(ctx) defer cancel() - logger.Debug(ctx, "reconnecting pty attach") ptty, process, err := rpty.doAttach(ctx, height, width, logger) if err != nil { return err From bb40f7897254697978059d64ad3589b14d295477 Mon Sep 17 00:00:00 2001 From: Asher Date: Wed, 9 Aug 2023 10:40:03 -0800 Subject: [PATCH 14/28] Do not hold mutex while waiting for state in screen Holding off on changing this for the buffered version until a potential issue with the pty closing before we add to the map is sorted. I moved the heartbeat start out as well since while we want to wait to start that until after ready, it does not need to be behind the mutex. 
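For reference, a rough sketch of the heartbeat pattern being moved out from
behind the mutex (a simplified stand-in based on how the timer is used in this
series, not the exact helper in agent/reconnectingpty): a time.AfterFunc timer
closes the pty after a period of inactivity, and each attached connection keeps
resetting that timer for as long as its context is alive. The reset itself
touches no pty state, which is why starting the heartbeat does not have to wait
for the lock.

    package main

    import (
        "context"
        "fmt"
        "time"
    )

    // heartbeat keeps resetting the inactivity timer until ctx ends. Resetting
    // at half the timeout interval (an assumed detail) means the close callback
    // only fires after a full timeout with no active connection.
    func heartbeat(ctx context.Context, timer *time.Timer, timeout time.Duration) {
        timer.Reset(timeout)
        ticker := time.NewTicker(timeout / 2)
        defer ticker.Stop()
        for {
            select {
            case <-ctx.Done():
                return
            case <-ticker.C:
                timer.Reset(timeout)
            }
        }
    }

    func main() {
        timeout := 250 * time.Millisecond
        timer := time.AfterFunc(timeout, func() {
            fmt.Println("inactivity timeout: closing reconnecting pty")
        })

        // Simulate a single attached connection that goes away after 600ms.
        ctx, cancel := context.WithTimeout(context.Background(), 600*time.Millisecond)
        defer cancel()
        heartbeat(ctx, timer, timeout) // blocks until the connection's context ends

        // With no more resets, the close callback fires once the timeout elapses.
        time.Sleep(500 * time.Millisecond)
    }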
--- agent/reconnectingpty/screen.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/agent/reconnectingpty/screen.go b/agent/reconnectingpty/screen.go index afc3584420ec6..84c79ea33db82 100644 --- a/agent/reconnectingpty/screen.go +++ b/agent/reconnectingpty/screen.go @@ -155,6 +155,13 @@ func (rpty *screenReconnectingPTY) Attach(ctx context.Context, _ string, conn ne ctx, cancel := context.WithCancel(ctx) defer cancel() + state, err := rpty.state.waitForStateOrContext(ctx, StateReady) + if state != StateReady { + return xerrors.Errorf("reconnecting pty ready wait: %w", err) + } + + go heartbeat(ctx, rpty.timer, rpty.timeout) + ptty, process, err := rpty.doAttach(ctx, height, width, logger) if err != nil { return err @@ -233,13 +240,6 @@ func (rpty *screenReconnectingPTY) doAttach(ctx context.Context, height, width u rpty.mutex.Lock() defer rpty.mutex.Unlock() - state, err := rpty.state.waitForStateOrContext(ctx, StateReady) - if state != StateReady { - return nil, nil, xerrors.Errorf("reconnecting pty ready wait: %w", err) - } - - go heartbeat(ctx, rpty.timer, rpty.timeout) - logger.Debug(ctx, "spawning screen client", slog.F("screen_id", rpty.id)) // Wrap the command with screen and tie it to the connection's context. From 089e1f998f4e381589dd85d27358828455211041 Mon Sep 17 00:00:00 2001 From: Asher Date: Wed, 9 Aug 2023 10:46:09 -0800 Subject: [PATCH 15/28] Remove incorrect statement about closing on Attach In some cases it might get closed but it is not a guarantee like the comment claims it is. While we could implement this behavior properly we already close the connection in the caller so it seems unnecessary. --- agent/reconnectingpty/reconnectingpty.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent/reconnectingpty/reconnectingpty.go b/agent/reconnectingpty/reconnectingpty.go index 29fa171e29dc2..75e623488417c 100644 --- a/agent/reconnectingpty/reconnectingpty.go +++ b/agent/reconnectingpty/reconnectingpty.go @@ -44,7 +44,7 @@ type ReconnectingPTY interface { // history, then blocks until EOF, an error, or the context's end. The // connection is expected to send JSON-encoded messages and accept raw output // from the ptty. If the context ends or the process dies the connection will - // be detached and closed. + // be detached. Attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) error // Wait waits for the reconnecting pty to close. The underlying process might // still be exiting. From ee670454caf2159e56c6b5908eadd1f5638263c7 Mon Sep 17 00:00:00 2001 From: Asher Date: Wed, 9 Aug 2023 11:05:41 -0800 Subject: [PATCH 16/28] Remove backend type from SDK/API Still need to figure out a way to test both, but we would rather not add the ability to select the backend through the API. 
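For reference, the selection that replaces the removed API field now happens
entirely inside the agent and boils down to roughly the following (a simplified
restatement of the logic in reconnectingpty.New in the diff below, not the
exact code):

    package main

    import (
        "fmt"
        "os/exec"
        "runtime"
    )

    // pickBackend mirrors the agent-side choice: screen is used only on Linux
    // and only when the binary is on PATH; everything else falls back to the
    // in-memory buffered backend.
    func pickBackend() string {
        if runtime.GOOS == "linux" {
            if _, err := exec.LookPath("screen"); err == nil {
                return "screen"
            }
        }
        return "buffered"
    }

    func main() {
        fmt.Println("reconnecting pty backend:", pickBackend())
    }

Because the choice now depends only on the platform and PATH, the tests later
in this series exercise the buffered backend by pointing PATH at a directory
that contains bash but not screen.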
--- agent/agent.go | 7 +- agent/agent_test.go | 17 ++-- agent/reconnectingpty/reconnectingpty.go | 21 ++--- cli/exp_scaletest.go | 9 +- coderd/httpapi/queryparams.go | 2 +- coderd/insights_test.go | 9 +- coderd/workspaceapps/apptest/apptest.go | 113 +++++++++-------------- coderd/workspaceapps/proxy.go | 3 +- codersdk/workspaceagentconn.go | 38 ++------ codersdk/workspaceagents.go | 12 +-- scaletest/createworkspaces/run_test.go | 14 ++- scaletest/reconnectingpty/config_test.go | 9 +- scaletest/reconnectingpty/run.go | 12 +-- scaletest/reconnectingpty/run_test.go | 24 ++--- scaletest/workspacetraffic/conn.go | 11 +-- site/src/api/typesGenerated.ts | 8 -- 16 files changed, 115 insertions(+), 194 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 05d9291b8b6a5..9e49e53177958 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1108,7 +1108,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m } c <- rpty // Put it back for the next reconnect. } else { - logger.Debug(ctx, "creating new reconnecting pty", slog.F("backend", msg.BackendType)) + logger.Debug(ctx, "creating new reconnecting pty") connected := false defer func() { @@ -1126,9 +1126,8 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m } rpty = reconnectingpty.New(ctx, cmd, &reconnectingpty.Options{ - BackendType: msg.BackendType, - Timeout: a.reconnectingPTYTimeout, - Metrics: a.metrics.reconnectingPTYErrors, + Timeout: a.reconnectingPTYTimeout, + Metrics: a.metrics.reconnectingPTYErrors, }, logger.With(slog.F("message_id", msg.ID))) if err = a.trackConnGoroutine(func() { diff --git a/agent/agent_test.go b/agent/agent_test.go index 42b5d4ebdef26..5b0cca5afded7 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -103,7 +103,7 @@ func TestAgent_Stats_ReconnectingPTY(t *testing.T) { //nolint:dogsled conn, _, stats, _, _ := setupAgent(t, agentsdk.Manifest{}, 0) - ptyConn, err := conn.ReconnectingPTY(ctx, uuid.New(), 128, 128, "bash", codersdk.ReconnectingPTYBackendTypeBuffered) + ptyConn, err := conn.ReconnectingPTY(ctx, uuid.New(), 128, 128, "bash") require.NoError(t, err) defer ptyConn.Close() @@ -1601,18 +1601,15 @@ func TestAgent_ReconnectingPTY(t *testing.T) { t.Skip("ConPTY appears to be inconsistent on Windows.") } - backends := []codersdk.ReconnectingPTYBackendType{ - codersdk.ReconnectingPTYBackendTypeBuffered, - codersdk.ReconnectingPTYBackendTypeScreen, - } + backends := []string{"Buffered", "Screen"} for _, backendType := range backends { backendType := backendType - t.Run(string(backendType), func(t *testing.T) { + t.Run(backendType, func(t *testing.T) { t.Parallel() if runtime.GOOS == "darwin" { t.Skip("`screen` is flaky on darwin") - } else if backendType == codersdk.ReconnectingPTYBackendTypeScreen { + } else if backendType == "Screen" { _, err := exec.LookPath("screen") if err != nil { t.Skip("`screen` not found") @@ -1625,14 +1622,14 @@ func TestAgent_ReconnectingPTY(t *testing.T) { //nolint:dogsled conn, _, _, _, _ := setupAgent(t, agentsdk.Manifest{}, 0) id := uuid.New() - netConn1, err := conn.ReconnectingPTY(ctx, id, 100, 100, "bash", backendType) + netConn1, err := conn.ReconnectingPTY(ctx, id, 100, 100, "bash") require.NoError(t, err) defer netConn1.Close() scanner1 := bufio.NewScanner(netConn1) // A second simultaneous connection. 
- netConn2, err := conn.ReconnectingPTY(ctx, id, 100, 100, "bash", backendType) + netConn2, err := conn.ReconnectingPTY(ctx, id, 100, 100, "bash") require.NoError(t, err) defer netConn2.Close() scanner2 := bufio.NewScanner(netConn2) @@ -1683,7 +1680,7 @@ func TestAgent_ReconnectingPTY(t *testing.T) { _ = netConn1.Close() _ = netConn2.Close() - netConn3, err := conn.ReconnectingPTY(ctx, id, 100, 100, "bash", backendType) + netConn3, err := conn.ReconnectingPTY(ctx, id, 100, 100, "bash") require.NoError(t, err) defer netConn3.Close() diff --git a/agent/reconnectingpty/reconnectingpty.go b/agent/reconnectingpty/reconnectingpty.go index 75e623488417c..d8f2ee2229d1f 100644 --- a/agent/reconnectingpty/reconnectingpty.go +++ b/agent/reconnectingpty/reconnectingpty.go @@ -32,8 +32,6 @@ type Options struct { Timeout time.Duration // Metrics tracks various error counters. Metrics *prometheus.CounterVec - // BackendType indicates which backend to use for reconnections. - BackendType codersdk.ReconnectingPTYBackendType } // ReconnectingPTY is a pty that can be reconnected within a timeout and to @@ -64,23 +62,20 @@ func New(ctx context.Context, cmd *pty.Cmd, options *Options, logger slog.Logger } // Screen seems flaky on Darwin. Locally the tests pass 100% of the time (100 // runs) but in CI screen often incorrectly claims the session name does not - // exist even though screen -list shows it. - if runtime.GOOS == "darwin" { - options.BackendType = codersdk.ReconnectingPTYBackendTypeBuffered - } else if options.BackendType == "" || options.BackendType == codersdk.ReconnectingPTYBackendTypeAuto { + // exist even though screen -list shows it. For now, restrict screen to + // Linux. + backendType := "buffered" + if runtime.GOOS == "linux" { _, err := exec.LookPath("screen") if err == nil { - options.BackendType = codersdk.ReconnectingPTYBackendTypeScreen - } else { - options.BackendType = codersdk.ReconnectingPTYBackendTypeBuffered + backendType = "screen" } - logger.Debug(ctx, "auto backend selection", slog.F("backend", options.BackendType)) } - logger.Info(ctx, "start reconnecting pty") + logger.Info(ctx, "start reconnecting pty", slog.F("backend_type", backendType)) - switch options.BackendType { - case codersdk.ReconnectingPTYBackendTypeScreen: + switch backendType { + case "screen": return newScreen(ctx, cmd, options, logger) default: return newBuffered(ctx, cmd, options, logger) diff --git a/cli/exp_scaletest.go b/cli/exp_scaletest.go index eb5e4e6d60817..d2ee36c1819eb 100644 --- a/cli/exp_scaletest.go +++ b/cli/exp_scaletest.go @@ -676,11 +676,10 @@ func (r *RootCmd) scaletestCreateWorkspaces() *clibase.Cmd { config.ReconnectingPTY = &reconnectingpty.Config{ // AgentID is set by the test automatically. Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - ID: uuid.Nil, - Height: 24, - Width: 80, - Command: runCommand, - BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, + ID: uuid.Nil, + Height: 24, + Width: 80, + Command: runCommand, }, Timeout: httpapi.Duration(runTimeout), ExpectTimeout: runExpectTimeout, diff --git a/coderd/httpapi/queryparams.go b/coderd/httpapi/queryparams.go index 1ca9824a0d998..3f16565e1dd20 100644 --- a/coderd/httpapi/queryparams.go +++ b/coderd/httpapi/queryparams.go @@ -161,7 +161,7 @@ func (p *QueryParamParser) Strings(vals url.Values, def []string, queryParam str // ValidEnum parses enum query params. Add more to the list as needed. 
type ValidEnum interface { database.ResourceType | database.AuditAction | database.BuildReason | database.UserStatus | - database.WorkspaceStatus | codersdk.ReconnectingPTYBackendType + database.WorkspaceStatus // Valid is required on the enum type to be used with ParseEnum. Valid() bool diff --git a/coderd/insights_test.go b/coderd/insights_test.go index c7a655be4c752..be75b31f5e80e 100644 --- a/coderd/insights_test.go +++ b/coderd/insights_test.go @@ -348,11 +348,10 @@ func TestTemplateInsights(t *testing.T) { // Start an rpty session to generate rpty usage stats. rpty, err := client.WorkspaceAgentReconnectingPTY(ctx, codersdk.WorkspaceAgentReconnectingPTYOpts{ - AgentID: resources[0].Agents[0].ID, - Reconnect: uuid.New(), - Width: 80, - Height: 24, - BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, + AgentID: resources[0].Agents[0].ID, + Reconnect: uuid.New(), + Width: 80, + Height: 24, }) require.NoError(t, err) defer rpty.Close() diff --git a/coderd/workspaceapps/apptest/apptest.go b/coderd/workspaceapps/apptest/apptest.go index 512e91eeb70a4..1c20fd19f5b28 100644 --- a/coderd/workspaceapps/apptest/apptest.go +++ b/coderd/workspaceapps/apptest/apptest.go @@ -11,7 +11,6 @@ import ( "net/http/cookiejar" "net/http/httputil" "net/url" - "os/exec" "path" "regexp" "runtime" @@ -57,81 +56,59 @@ func Run(t *testing.T, appHostIsPrimary bool, factory DeploymentFactory) { t.Skip("ConPTY appears to be inconsistent on Windows.") } - backends := []codersdk.ReconnectingPTYBackendType{ - codersdk.ReconnectingPTYBackendTypeBuffered, - codersdk.ReconnectingPTYBackendTypeScreen, - } - - for _, backendType := range backends { - backendType := backendType - t.Run(string(backendType), func(t *testing.T) { - t.Parallel() - if runtime.GOOS == "darwin" { - t.Skip("`screen` is flaky on darwin") - } else if backendType == codersdk.ReconnectingPTYBackendTypeScreen { - _, err := exec.LookPath("screen") - if err != nil { - t.Skip("`screen` not found") - } - } - - t.Run("OK", func(t *testing.T) { - t.Parallel() - appDetails := setupProxyTest(t, nil) + t.Run("OK", func(t *testing.T) { + t.Parallel() + appDetails := setupProxyTest(t, nil) - ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) - defer cancel() + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) + defer cancel() - // Run the test against the path app hostname since that's where the - // reconnecting-pty proxy server we want to test is mounted. - client := appDetails.AppClient(t) - testReconnectingPTY(ctx, t, client, codersdk.WorkspaceAgentReconnectingPTYOpts{ - AgentID: appDetails.Agent.ID, - Reconnect: uuid.New(), - Height: 80, - Width: 80, - Command: "bash", - BackendType: backendType, - }) - }) + // Run the test against the path app hostname since that's where the + // reconnecting-pty proxy server we want to test is mounted. 
+ client := appDetails.AppClient(t) + testReconnectingPTY(ctx, t, client, codersdk.WorkspaceAgentReconnectingPTYOpts{ + AgentID: appDetails.Agent.ID, + Reconnect: uuid.New(), + Height: 80, + Width: 80, + Command: "bash", + }) + }) - t.Run("SignedTokenQueryParameter", func(t *testing.T) { - t.Parallel() - if appHostIsPrimary { - t.Skip("Tickets are not used for terminal requests on the primary.") - } + t.Run("SignedTokenQueryParameter", func(t *testing.T) { + t.Parallel() + if appHostIsPrimary { + t.Skip("Tickets are not used for terminal requests on the primary.") + } - appDetails := setupProxyTest(t, nil) + appDetails := setupProxyTest(t, nil) - u := *appDetails.PathAppBaseURL - if u.Scheme == "http" { - u.Scheme = "ws" - } else { - u.Scheme = "wss" - } - u.Path = fmt.Sprintf("/api/v2/workspaceagents/%s/pty", appDetails.Agent.ID.String()) + u := *appDetails.PathAppBaseURL + if u.Scheme == "http" { + u.Scheme = "ws" + } else { + u.Scheme = "wss" + } + u.Path = fmt.Sprintf("/api/v2/workspaceagents/%s/pty", appDetails.Agent.ID.String()) - ctx := testutil.Context(t, testutil.WaitLong) - issueRes, err := appDetails.SDKClient.IssueReconnectingPTYSignedToken(ctx, codersdk.IssueReconnectingPTYSignedTokenRequest{ - URL: u.String(), - AgentID: appDetails.Agent.ID, - }) - require.NoError(t, err) + ctx := testutil.Context(t, testutil.WaitLong) + issueRes, err := appDetails.SDKClient.IssueReconnectingPTYSignedToken(ctx, codersdk.IssueReconnectingPTYSignedTokenRequest{ + URL: u.String(), + AgentID: appDetails.Agent.ID, + }) + require.NoError(t, err) - // Make an unauthenticated client. - unauthedAppClient := codersdk.New(appDetails.AppClient(t).URL) - testReconnectingPTY(ctx, t, unauthedAppClient, codersdk.WorkspaceAgentReconnectingPTYOpts{ - AgentID: appDetails.Agent.ID, - Reconnect: uuid.New(), - Height: 80, - Width: 80, - Command: "bash", - SignedToken: issueRes.SignedToken, - BackendType: backendType, - }) - }) + // Make an unauthenticated client. 
+ unauthedAppClient := codersdk.New(appDetails.AppClient(t).URL) + testReconnectingPTY(ctx, t, unauthedAppClient, codersdk.WorkspaceAgentReconnectingPTYOpts{ + AgentID: appDetails.Agent.ID, + Reconnect: uuid.New(), + Height: 80, + Width: 80, + Command: "bash", + SignedToken: issueRes.SignedToken, }) - } + }) }) t.Run("WorkspaceAppsProxyPath", func(t *testing.T) { diff --git a/coderd/workspaceapps/proxy.go b/coderd/workspaceapps/proxy.go index 8e49ef9480342..d75970bcfa8d9 100644 --- a/coderd/workspaceapps/proxy.go +++ b/coderd/workspaceapps/proxy.go @@ -632,7 +632,6 @@ func (s *Server) workspaceAgentPTY(rw http.ResponseWriter, r *http.Request) { reconnect := parser.Required("reconnect").UUID(values, uuid.New(), "reconnect") height := parser.UInt(values, 80, "height") width := parser.UInt(values, 80, "width") - backendType := httpapi.ParseCustom(parser, values, codersdk.ReconnectingPTYBackendTypeAuto, "backend", httpapi.ParseEnum[codersdk.ReconnectingPTYBackendType]) if len(parser.Errors) > 0 { httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{ Message: "Invalid query parameters.", @@ -671,7 +670,7 @@ func (s *Server) workspaceAgentPTY(rw http.ResponseWriter, r *http.Request) { } defer release() log.Debug(ctx, "dialed workspace agent") - ptNetConn, err := agentConn.ReconnectingPTY(ctx, reconnect, uint16(height), uint16(width), r.URL.Query().Get("command"), backendType) + ptNetConn, err := agentConn.ReconnectingPTY(ctx, reconnect, uint16(height), uint16(width), r.URL.Query().Get("command")) if err != nil { log.Debug(ctx, "dial reconnecting pty server in workspace agent", slog.Error(err)) _ = conn.Close(websocket.StatusInternalError, httpapi.WebsocketCloseSprintf("dial: %s", err)) diff --git a/codersdk/workspaceagentconn.go b/codersdk/workspaceagentconn.go index 259fe2713375c..6b9b6f0d33f44 100644 --- a/codersdk/workspaceagentconn.go +++ b/codersdk/workspaceagentconn.go @@ -193,32 +193,13 @@ func (c *WorkspaceAgentConn) Close() error { return c.Conn.Close() } -type ReconnectingPTYBackendType string - -const ( - ReconnectingPTYBackendTypeAuto ReconnectingPTYBackendType = "auto" - ReconnectingPTYBackendTypeScreen ReconnectingPTYBackendType = "screen" - ReconnectingPTYBackendTypeBuffered ReconnectingPTYBackendType = "buffered" -) - -func (s ReconnectingPTYBackendType) Valid() bool { - switch s { - case ReconnectingPTYBackendTypeAuto, ReconnectingPTYBackendTypeBuffered, - ReconnectingPTYBackendTypeScreen: - return true - default: - return false - } -} - // WorkspaceAgentReconnectingPTYInit initializes a new reconnecting PTY session. // @typescript-ignore WorkspaceAgentReconnectingPTYInit type WorkspaceAgentReconnectingPTYInit struct { - ID uuid.UUID - Height uint16 - Width uint16 - Command string - BackendType ReconnectingPTYBackendType + ID uuid.UUID + Height uint16 + Width uint16 + Command string } // ReconnectingPTYRequest is sent from the client to the server @@ -233,7 +214,7 @@ type ReconnectingPTYRequest struct { // ReconnectingPTY spawns a new reconnecting terminal session. // `ReconnectingPTYRequest` should be JSON marshaled and written to the returned net.Conn. // Raw terminal output will be read from the returned net.Conn. 
-func (c *WorkspaceAgentConn) ReconnectingPTY(ctx context.Context, id uuid.UUID, height, width uint16, command string, backendType ReconnectingPTYBackendType) (net.Conn, error) { +func (c *WorkspaceAgentConn) ReconnectingPTY(ctx context.Context, id uuid.UUID, height, width uint16, command string) (net.Conn, error) { ctx, span := tracing.StartSpan(ctx) defer span.End() @@ -246,11 +227,10 @@ func (c *WorkspaceAgentConn) ReconnectingPTY(ctx context.Context, id uuid.UUID, return nil, err } data, err := json.Marshal(WorkspaceAgentReconnectingPTYInit{ - ID: id, - Height: height, - Width: width, - Command: command, - BackendType: backendType, + ID: id, + Height: height, + Width: width, + Command: command, }) if err != nil { _ = conn.Close() diff --git a/codersdk/workspaceagents.go b/codersdk/workspaceagents.go index e03028df64e7f..e9aad8421e36a 100644 --- a/codersdk/workspaceagents.go +++ b/codersdk/workspaceagents.go @@ -552,12 +552,11 @@ func (c *Client) IssueReconnectingPTYSignedToken(ctx context.Context, req IssueR // @typescript-ignore:WorkspaceAgentReconnectingPTYOpts type WorkspaceAgentReconnectingPTYOpts struct { - AgentID uuid.UUID - Reconnect uuid.UUID - Width uint16 - Height uint16 - Command string - BackendType ReconnectingPTYBackendType + AgentID uuid.UUID + Reconnect uuid.UUID + Width uint16 + Height uint16 + Command string // SignedToken is an optional signed token from the // issue-reconnecting-pty-signed-token endpoint. If set, the session token @@ -578,7 +577,6 @@ func (c *Client) WorkspaceAgentReconnectingPTY(ctx context.Context, opts Workspa q.Set("width", strconv.Itoa(int(opts.Width))) q.Set("height", strconv.Itoa(int(opts.Height))) q.Set("command", opts.Command) - q.Set("backend", string(opts.BackendType)) // If we're using a signed token, set the query parameter. 
if opts.SignedToken != "" { q.Set(SignedAppTokenQueryParameter, opts.SignedToken) diff --git a/scaletest/createworkspaces/run_test.go b/scaletest/createworkspaces/run_test.go index 039cdbbb8708e..ac9812dccf3ce 100644 --- a/scaletest/createworkspaces/run_test.go +++ b/scaletest/createworkspaces/run_test.go @@ -106,10 +106,9 @@ func Test_Runner(t *testing.T) { }, ReconnectingPTY: &reconnectingpty.Config{ Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Height: 24, - Width: 80, - Command: "echo hello", - BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, + Height: 24, + Width: 80, + Command: "echo hello", }, Timeout: httpapi.Duration(testutil.WaitLong), }, @@ -231,10 +230,9 @@ func Test_Runner(t *testing.T) { }, ReconnectingPTY: &reconnectingpty.Config{ Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Height: 24, - Width: 80, - Command: "echo hello", - BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, + Height: 24, + Width: 80, + Command: "echo hello", }, Timeout: httpapi.Duration(testutil.WaitLong), }, diff --git a/scaletest/reconnectingpty/config_test.go b/scaletest/reconnectingpty/config_test.go index 5b7e3feef6501..0c5200bfd7fe6 100644 --- a/scaletest/reconnectingpty/config_test.go +++ b/scaletest/reconnectingpty/config_test.go @@ -32,11 +32,10 @@ func Test_Config(t *testing.T) { config: reconnectingpty.Config{ AgentID: id, Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - ID: id, - Width: 80, - Height: 24, - Command: "echo 'hello world'", - BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, + ID: id, + Width: 80, + Height: 24, + Command: "echo 'hello world'", }, Timeout: httpapi.Duration(time.Minute), ExpectTimeout: false, diff --git a/scaletest/reconnectingpty/run.go b/scaletest/reconnectingpty/run.go index 929e8bf1c612c..afded88eb0412 100644 --- a/scaletest/reconnectingpty/run.go +++ b/scaletest/reconnectingpty/run.go @@ -62,16 +62,14 @@ func (r *Runner) Run(ctx context.Context, _ string, logs io.Writer) error { _, _ = fmt.Fprintf(logs, "\tID: %s\n", id.String()) _, _ = fmt.Fprintf(logs, "\tWidth: %d\n", width) _, _ = fmt.Fprintf(logs, "\tHeight: %d\n", height) - _, _ = fmt.Fprintf(logs, "\tBackend: %s\n", r.cfg.Init.BackendType) _, _ = fmt.Fprintf(logs, "\tCommand: %q\n\n", r.cfg.Init.Command) conn, err := r.client.WorkspaceAgentReconnectingPTY(ctx, codersdk.WorkspaceAgentReconnectingPTYOpts{ - AgentID: r.cfg.AgentID, - Reconnect: id, - Width: width, - Height: height, - Command: r.cfg.Init.Command, - BackendType: r.cfg.Init.BackendType, + AgentID: r.cfg.AgentID, + Reconnect: id, + Width: width, + Height: height, + Command: r.cfg.Init.Command, }) if err != nil { return xerrors.Errorf("open reconnecting PTY: %w", err) diff --git a/scaletest/reconnectingpty/run_test.go b/scaletest/reconnectingpty/run_test.go index be92313eda5c4..382a3718436f9 100644 --- a/scaletest/reconnectingpty/run_test.go +++ b/scaletest/reconnectingpty/run_test.go @@ -34,8 +34,7 @@ func Test_Runner(t *testing.T) { AgentID: agentID, Init: codersdk.WorkspaceAgentReconnectingPTYInit{ // Use ; here because it's powershell compatible (vs &&). 
- Command: "echo 'hello world'; sleep 1", - BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, + Command: "echo 'hello world'; sleep 1", }, LogOutput: true, }) @@ -63,8 +62,7 @@ func Test_Runner(t *testing.T) { runner := reconnectingpty.NewRunner(client, reconnectingpty.Config{ AgentID: agentID, Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Command: "echo 'hello world'", - BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, + Command: "echo 'hello world'", }, LogOutput: false, }) @@ -92,8 +90,7 @@ func Test_Runner(t *testing.T) { runner := reconnectingpty.NewRunner(client, reconnectingpty.Config{ AgentID: agentID, Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Command: "echo 'hello world'", - BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, + Command: "echo 'hello world'", }, Timeout: httpapi.Duration(2 * testutil.WaitSuperLong), LogOutput: true, @@ -117,8 +114,7 @@ func Test_Runner(t *testing.T) { runner := reconnectingpty.NewRunner(client, reconnectingpty.Config{ AgentID: agentID, Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Command: "sleep 120", - BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, + Command: "sleep 120", }, Timeout: httpapi.Duration(2 * time.Second), LogOutput: true, @@ -147,8 +143,7 @@ func Test_Runner(t *testing.T) { runner := reconnectingpty.NewRunner(client, reconnectingpty.Config{ AgentID: agentID, Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Command: "sleep 120", - BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, + Command: "sleep 120", }, Timeout: httpapi.Duration(2 * time.Second), ExpectTimeout: true, @@ -173,8 +168,7 @@ func Test_Runner(t *testing.T) { runner := reconnectingpty.NewRunner(client, reconnectingpty.Config{ AgentID: agentID, Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Command: "echo 'hello world'", - BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, + Command: "echo 'hello world'", }, Timeout: httpapi.Duration(2 * testutil.WaitSuperLong), ExpectTimeout: true, @@ -204,8 +198,7 @@ func Test_Runner(t *testing.T) { runner := reconnectingpty.NewRunner(client, reconnectingpty.Config{ AgentID: agentID, Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Command: "echo 'hello world'; sleep 1", - BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, + Command: "echo 'hello world'; sleep 1", }, ExpectOutput: "hello world", LogOutput: false, @@ -229,8 +222,7 @@ func Test_Runner(t *testing.T) { runner := reconnectingpty.NewRunner(client, reconnectingpty.Config{ AgentID: agentID, Init: codersdk.WorkspaceAgentReconnectingPTYInit{ - Command: "echo 'hello world'; sleep 1", - BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, + Command: "echo 'hello world'; sleep 1", }, ExpectOutput: "bello borld", LogOutput: false, diff --git a/scaletest/workspacetraffic/conn.go b/scaletest/workspacetraffic/conn.go index aced56e6e0576..4be38a02c6abf 100644 --- a/scaletest/workspacetraffic/conn.go +++ b/scaletest/workspacetraffic/conn.go @@ -15,12 +15,11 @@ import ( func connectPTY(ctx context.Context, client *codersdk.Client, agentID, reconnect uuid.UUID) (*countReadWriteCloser, error) { conn, err := client.WorkspaceAgentReconnectingPTY(ctx, codersdk.WorkspaceAgentReconnectingPTYOpts{ - AgentID: agentID, - Reconnect: reconnect, - Height: 25, - Width: 80, - Command: "sh", - BackendType: codersdk.ReconnectingPTYBackendTypeBuffered, + AgentID: agentID, + Reconnect: reconnect, + Height: 25, + Width: 80, + Command: "sh", }) if err != nil { return nil, xerrors.Errorf("connect pty: %w", err) diff 
--git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index b880d02982de3..3e5acc5671299 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -1770,14 +1770,6 @@ export const RBACResources: RBACResource[] = [ "workspace_proxy", ] -// From codersdk/workspaceagentconn.go -export type ReconnectingPTYBackendType = "auto" | "buffered" | "screen" -export const ReconnectingPTYBackendTypes: ReconnectingPTYBackendType[] = [ - "auto", - "buffered", - "screen", -] - // From codersdk/audit.go export type ResourceType = | "api_key" From a6bcdd281bd25335c835518570149948d3505085 Mon Sep 17 00:00:00 2001 From: Asher Date: Wed, 9 Aug 2023 12:08:39 -0800 Subject: [PATCH 17/28] Use PATH to test buffered reconnecting pty --- agent/agent_test.go | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/agent/agent_test.go b/agent/agent_test.go index 5b0cca5afded7..ccc4b0910dbe8 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -1592,8 +1592,8 @@ const ansi = "[\u001B\u009B][[\\]()#;?]*(?:(?:(?:[a-zA-Z\\d]*(?:;[a-zA-Z\\d]*)*) var re = regexp.MustCompile(ansi) +//nolint:paralleltest // This test sets an environment variable. func TestAgent_ReconnectingPTY(t *testing.T) { - t.Parallel() if runtime.GOOS == "windows" { // This might be our implementation, or ConPTY itself. // It's difficult to find extensive tests for it, so @@ -1603,17 +1603,30 @@ func TestAgent_ReconnectingPTY(t *testing.T) { backends := []string{"Buffered", "Screen"} + _, err := exec.LookPath("screen") + hasScreen := err == nil + for _, backendType := range backends { backendType := backendType t.Run(backendType, func(t *testing.T) { - t.Parallel() - if runtime.GOOS == "darwin" { - t.Skip("`screen` is flaky on darwin") - } else if backendType == "Screen" { - _, err := exec.LookPath("screen") - if err != nil { + if backendType == "Screen" { + t.Parallel() + if runtime.GOOS != "linux" { + t.Skipf("`screen` is not supported on %s", runtime.GOOS) + } else if !hasScreen { t.Skip("`screen` not found") } + } else if hasScreen && runtime.GOOS == "linux" { + // Set up a PATH that does not have screen in it. + bashPath, err := exec.LookPath("bash") + require.NoError(t, err) + dir, err := os.MkdirTemp("/tmp", "coder-test-reconnecting-pty-PATH") + require.NoError(t, err, "create temp dir for reconnecting pty PATH") + err = os.Symlink(bashPath, filepath.Join(dir, "bash")) + require.NoError(t, err, "symlink bash into reconnecting pty PATH") + t.Setenv("PATH", dir) + } else { + t.Parallel() } ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) From 3ff1510fa0ce86cfcc449851adef5fb81ded760d Mon Sep 17 00:00:00 2001 From: Asher Date: Wed, 9 Aug 2023 12:55:00 -0800 Subject: [PATCH 18/28] Do not hold mutex while waiting for state Since we might add to the map after it closes, tweak the delete to also run when the pty closes. I think this also fixes that issue where if you canceled the context you pass into Attach it would not do anything since readConnLoop does not use the context for anything but logging. 
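For reference, the wait that now happens before taking the mutex is built on a
condition variable. A minimal sketch of that mechanism (simplified and with
assumed names; the real ptyState in this package differs in detail): a
sync.Cond cannot observe ctx.Done() on its own, so a helper goroutine
broadcasts when the context ends and the waiter re-checks the context after
every wakeup.

    package main

    import (
        "context"
        "fmt"
        "sync"
        "time"
    )

    type lifecycleState int

    const (
        stateStarting lifecycleState = iota
        stateReady
        stateClosing
    )

    type ptyState struct {
        cond  *sync.Cond
        state lifecycleState
    }

    func newPtyState() *ptyState {
        return &ptyState{cond: sync.NewCond(&sync.Mutex{})}
    }

    func (s *ptyState) setState(v lifecycleState) {
        s.cond.L.Lock()
        defer s.cond.L.Unlock()
        s.state = v
        s.cond.Broadcast()
    }

    // waitForStateOrContext blocks until the state reaches min or ctx ends.
    func (s *ptyState) waitForStateOrContext(ctx context.Context, min lifecycleState) lifecycleState {
        done := make(chan struct{})
        defer close(done)
        go func() {
            select {
            case <-ctx.Done():
                // Hold the lock while broadcasting so the wakeup cannot be
                // missed between the waiter's context check and its Wait call.
                s.cond.L.Lock()
                s.cond.Broadcast()
                s.cond.L.Unlock()
            case <-done:
            }
        }()

        s.cond.L.Lock()
        defer s.cond.L.Unlock()
        for s.state < min && ctx.Err() == nil {
            s.cond.Wait()
        }
        return s.state
    }

    func main() {
        s := newPtyState()
        go func() {
            time.Sleep(50 * time.Millisecond)
            s.setState(stateReady)
        }()
        ctx, cancel := context.WithTimeout(context.Background(), time.Second)
        defer cancel()
        fmt.Println("reached state:", s.waitForStateOrContext(ctx, stateReady))
    }

Doing this wait before grabbing the pty's own lock keeps a slow startup from
blocking other attach and close paths, which is the point of this change and
of the earlier screen-side change.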
--- agent/reconnectingpty/buffered.go | 47 +++++++++++++------------------ 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index f0c7d63863c67..760fb06b36a07 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -125,7 +125,7 @@ func newBuffered(ctx context.Context, cmd *pty.Cmd, options *Options, logger slo } // lifecycle manages the lifecycle of the reconnecting pty. If the context ends -// the reconnecting pty and every connection will be closed. +// or the reconnecting pty closes the pty will be shut down. func (rpty *bufferedReconnectingPTY) lifecycle(ctx context.Context, logger slog.Logger) { rpty.timer = time.AfterFunc(attachTimeout, func() { rpty.Close("reconnecting pty timeout") @@ -142,22 +142,8 @@ func (rpty *bufferedReconnectingPTY) lifecycle(ctx context.Context, logger slog. } rpty.timer.Stop() - rpty.mutex.Lock() - defer rpty.mutex.Unlock() - - // Log these closes only for debugging since the connections or processes - // might have already closed on their own. - for _, conn := range rpty.activeConns { - err := conn.Close() - if err != nil { - logger.Debug(ctx, "closed conn with error", slog.Error(err)) - } - } - // Connections get removed once the pty closes but it is possible there is - // still some data that needs to be written so clear the map now to avoid - // writing to closed connections. - rpty.activeConns = map[string]net.Conn{} - + // Log close/kill only for debugging since the process might have already + // closed on its own. err := rpty.ptty.Close() if err != nil { logger.Debug(ctx, "closed ptty with error", slog.Error(err)) @@ -179,15 +165,29 @@ func (rpty *bufferedReconnectingPTY) Attach(ctx context.Context, connID string, ctx, cancel := context.WithCancel(ctx) defer cancel() - err := rpty.doAttach(ctx, connID, conn, height, width, logger) + state, err := rpty.state.waitForStateOrContext(ctx, StateReady) + if state != StateReady { + return xerrors.Errorf("reconnecting pty ready wait: %w", err) + } + + go heartbeat(ctx, rpty.timer, rpty.timeout) + + err = rpty.doAttach(ctx, connID, conn, height, width, logger) if err != nil { return err } - defer func() { + go func() { + _, _ = rpty.state.waitForStateOrContext(ctx, StateClosing) rpty.mutex.Lock() defer rpty.mutex.Unlock() delete(rpty.activeConns, connID) + // Log closes only for debugging since the connection might have already + // closed on its own. + err := conn.Close() + if err != nil { + logger.Debug(ctx, "closed conn with error", slog.Error(err)) + } }() // Pipe conn -> pty and block. pty -> conn is handled in newBuffered(). @@ -203,15 +203,8 @@ func (rpty *bufferedReconnectingPTY) doAttach(ctx context.Context, connID string rpty.mutex.Lock() defer rpty.mutex.Unlock() - state, err := rpty.state.waitForStateOrContext(ctx, StateReady) - if state != StateReady { - return xerrors.Errorf("reconnecting pty ready wait: %w", err) - } - - go heartbeat(ctx, rpty.timer, rpty.timeout) - // Resize the PTY to initial height + width. - err = rpty.ptty.Resize(height, width) + err := rpty.ptty.Resize(height, width) if err != nil { // We can continue after this, it's not fatal! 
logger.Warn(ctx, "reconnecting PTY initial resize failed, but will continue", slog.Error(err)) From a7811733ca8e6082bb30487f3e3ed7cb0ebd3ec7 Mon Sep 17 00:00:00 2001 From: Asher Date: Wed, 9 Aug 2023 16:12:13 -0800 Subject: [PATCH 19/28] Avoid clobbering attach error with close errors We want to send the error from failing to wait for the version command, not the error from closing/killing the pty or process. --- agent/reconnectingpty/screen.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/agent/reconnectingpty/screen.go b/agent/reconnectingpty/screen.go index 84c79ea33db82..1186af9f309d2 100644 --- a/agent/reconnectingpty/screen.go +++ b/agent/reconnectingpty/screen.go @@ -279,13 +279,13 @@ func (rpty *screenReconnectingPTY) doAttach(ctx context.Context, height, width u // the same name. err = rpty.sendCommand(ctx, "version", nil) if err != nil { - err := ptty.Close() - if err != nil { - logger.Debug(ctx, "closed ptty with error", slog.Error(err)) + closeErr := ptty.Close() + if closeErr != nil { + logger.Debug(ctx, "closed ptty with error", slog.Error(closeErr)) } - err = process.Kill() - if err != nil { - logger.Debug(ctx, "killed process with error", slog.Error(err)) + closeErr = process.Kill() + if closeErr != nil { + logger.Debug(ctx, "killed process with error", slog.Error(closeErr)) } rpty.metrics.WithLabelValues("screen_wait").Add(1) return nil, nil, err From 7a8ec2e582f03363d2c03c8d7e0c71481be37cd5 Mon Sep 17 00:00:00 2001 From: Asher Date: Wed, 9 Aug 2023 16:40:01 -0800 Subject: [PATCH 20/28] Immediately read screen process It is possible for the process to immediately exit (for example if you run `echo hello`), then we would wait for the `version` command to succeed but it never will because the session is already gone. If we immediately read to the process then we can tell when it has gone away and we can abort. --- agent/agent_test.go | 12 +++ agent/reconnectingpty/buffered.go | 2 + agent/reconnectingpty/screen.go | 121 +++++++++++++++++------------- 3 files changed, 83 insertions(+), 52 deletions(-) diff --git a/agent/agent_test.go b/agent/agent_test.go index ccc4b0910dbe8..6d7e5fd1da47a 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -1720,6 +1720,18 @@ func TestAgent_ReconnectingPTY(t *testing.T) { line := scanner3.Text() t.Logf("bash tty stdout = %s", re.ReplaceAllString(line, "")) } + + // Try a non-shell command. It should output then immediately exit. + netConn4, err := conn.ReconnectingPTY(ctx, uuid.New(), 100, 100, "echo test") + require.NoError(t, err) + defer netConn4.Close() + + scanner4 := bufio.NewScanner(netConn4) + require.True(t, hasLine(scanner4, matchEchoOutput), "find exit command") + for scanner4.Scan() { + line := scanner4.Text() + t.Logf("bash tty stdout = %s", re.ReplaceAllString(line, "")) + } }) } } diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index 760fb06b36a07..e5b1307ef9b88 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -94,6 +94,8 @@ func newBuffered(ctx context.Context, cmd *pty.Cmd, options *Options, logger slo } // Could have been killed externally or failed to start at all (command // not found for example). + // TODO: Should we check the process's exit code in case the command was + // invalid? 
rpty.Close("unable to read pty output, command might have exited") break } diff --git a/agent/reconnectingpty/screen.go b/agent/reconnectingpty/screen.go index 1186af9f309d2..0203154f83335 100644 --- a/agent/reconnectingpty/screen.go +++ b/agent/reconnectingpty/screen.go @@ -162,8 +162,15 @@ func (rpty *screenReconnectingPTY) Attach(ctx context.Context, _ string, conn ne go heartbeat(ctx, rpty.timer, rpty.timeout) - ptty, process, err := rpty.doAttach(ctx, height, width, logger) + ptty, process, err := rpty.doAttach(ctx, conn, height, width, logger) if err != nil { + if errors.Is(err, context.Canceled) { + // Likely the process was too short-lived and canceled the version command. + // TODO: Is it worth distinguishing between that and a cancel from the + // Attach() caller? Additionally, since this could also happen if + // the command was invalid, should we check the process's exit code? + return nil + } return err } @@ -180,53 +187,6 @@ func (rpty *screenReconnectingPTY) Attach(ctx context.Context, _ string, conn ne } }() - // Pipe pty -> conn. - // We do not need to separately monitor for the process exiting. When it - // exits, our ptty.OutputReader() will return EOF after reading all process - // output. - go func() { - // Close the connection when the process exits. Log only for debugging - // since the connection might have already closed on its own. - defer func() { - err := conn.Close() - if err != nil { - logger.Debug(ctx, "closed connection with error", slog.Error(err)) - } - }() - buffer := make([]byte, 1024) - for { - read, err := ptty.OutputReader().Read(buffer) - if err != nil { - // When the PTY is closed, this is triggered. - // Error is typically a benign EOF, so only log for debugging. - if errors.Is(err, io.EOF) { - logger.Debug(ctx, "unable to read pty output; screen might have exited", slog.Error(err)) - } else { - logger.Warn(ctx, "unable to read pty output; screen might have exited", slog.Error(err)) - rpty.metrics.WithLabelValues("screen_output_reader").Add(1) - } - // The process might have died because the session itself died or it - // might have been separately killed and the session is still up (for - // example `exit` or we killed it when the connection closed). If the - // session is still up we might leave the reconnecting pty in memory - // around longer than it needs to be but it will eventually clean up - // with the timer or context, or the next attach will respawn the screen - // daemon which is fine too. - break - } - part := buffer[:read] - _, err = conn.Write(part) - if err != nil { - // Connection might have been closed. - if errors.Unwrap(err).Error() != "endpoint is closed for send" { - logger.Warn(ctx, "error writing to active conn", slog.Error(err)) - rpty.metrics.WithLabelValues("screen_write").Add(1) - } - break - } - } - }() - // Pipe conn -> pty and block. readConnLoop(ctx, conn, ptty, rpty.metrics, logger) return nil @@ -235,7 +195,7 @@ func (rpty *screenReconnectingPTY) Attach(ctx context.Context, _ string, conn ne // doAttach spawns the screen client and starts the heartbeat. It exists // separately only so we can defer the mutex unlock which is not possible in // Attach since it blocks. 
-func (rpty *screenReconnectingPTY) doAttach(ctx context.Context, height, width uint16, logger slog.Logger) (pty.PTYCmd, pty.Process, error) { +func (rpty *screenReconnectingPTY) doAttach(ctx context.Context, conn net.Conn, height, width uint16, logger slog.Logger) (pty.PTYCmd, pty.Process, error) { // Ensure another attach does not come in and spawn a duplicate session. rpty.mutex.Lock() defer rpty.mutex.Unlock() @@ -273,12 +233,65 @@ func (rpty *screenReconnectingPTY) doAttach(ctx context.Context, height, width u return nil, nil, err } + // This context lets us abort the version command if the process dies. + versionCtx, versionCancel := context.WithCancel(ctx) + defer versionCancel() + + // Pipe pty -> conn and close the connection when the process exits. + // We do not need to separately monitor for the process exiting. When it + // exits, our ptty.OutputReader() will return EOF after reading all process + // output. + go func() { + defer versionCancel() + defer func() { + err := conn.Close() + if err != nil { + // Log only for debugging since the connection might have already closed + // on its own. + logger.Debug(ctx, "closed connection with error", slog.Error(err)) + } + }() + buffer := make([]byte, 1024) + for { + read, err := ptty.OutputReader().Read(buffer) + if err != nil { + // When the PTY is closed, this is triggered. + // Error is typically a benign EOF, so only log for debugging. + if errors.Is(err, io.EOF) { + logger.Debug(ctx, "unable to read pty output; screen might have exited", slog.Error(err)) + } else { + logger.Warn(ctx, "unable to read pty output; screen might have exited", slog.Error(err)) + rpty.metrics.WithLabelValues("screen_output_reader").Add(1) + } + // The process might have died because the session itself died or it + // might have been separately killed and the session is still up (for + // example `exit` or we killed it when the connection closed). If the + // session is still up we might leave the reconnecting pty in memory + // around longer than it needs to be but it will eventually clean up + // with the timer or context, or the next attach will respawn the screen + // daemon which is fine too. + break + } + part := buffer[:read] + _, err = conn.Write(part) + if err != nil { + // Connection might have been closed. + if errors.Unwrap(err).Error() != "endpoint is closed for send" { + logger.Warn(ctx, "error writing to active conn", slog.Error(err)) + rpty.metrics.WithLabelValues("screen_write").Add(1) + } + break + } + } + }() + // Version seems to be the only command without a side effect (other than // making the version pop up briefly) so use it to wait for the session to // come up. If we do not wait we could end up spawning multiple sessions with // the same name. - err = rpty.sendCommand(ctx, "version", nil) + err = rpty.sendCommand(versionCtx, "version", nil) if err != nil { + // Log only for debugging since the process might already have closed. closeErr := ptty.Close() if closeErr != nil { logger.Debug(ctx, "closed ptty with error", slog.Error(closeErr)) @@ -298,8 +311,9 @@ func (rpty *screenReconnectingPTY) doAttach(ctx context.Context, height, width u // command fails with an error matching anything in successErrors it will be // considered a success state (for example "no session" when quitting and the // session is already dead). The command will be retried until successful, the -// timeout is reached, or the context ends in which case the context error is -// returned together with the last error from the command. 
+// timeout is reached, or the context ends. A canceled context will return the +// canceled context's error as-is while a timed-out context returns together +// with the last error from the command. func (rpty *screenReconnectingPTY) sendCommand(ctx context.Context, command string, successErrors []string) error { ctx, cancel := context.WithTimeout(ctx, attachTimeout) defer cancel() @@ -352,6 +366,9 @@ func (rpty *screenReconnectingPTY) sendCommand(ctx context.Context, command stri for { select { case <-ctx.Done(): + if errors.Is(ctx.Err(), context.Canceled) { + return ctx.Err() + } return errors.Join(ctx.Err(), lastErr) case <-ticker.C: if done := run(); done { From 56ca7ac18d7cc64cbaedc5cf93ff0961c674722b Mon Sep 17 00:00:00 2001 From: Asher Date: Wed, 9 Aug 2023 17:54:55 -0800 Subject: [PATCH 21/28] Fix incorrect logger context on reconnecting PTY Since we used the same var all I did with my other change was use the same logger anyway. Now it will not have the connection ID, which is correct since the pty is not tied to a single connection. --- agent/agent.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 9e49e53177958..730cec875856f 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1071,8 +1071,8 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m defer a.connCountReconnectingPTY.Add(-1) connectionID := uuid.NewString() - logger = logger.With(slog.F("message_id", msg.ID), slog.F("connection_id", connectionID)) - logger.Debug(ctx, "starting handler") + connLogger := logger.With(slog.F("message_id", msg.ID), slog.F("connection_id", connectionID)) + connLogger.Debug(ctx, "starting handler") defer func() { if err := retErr; err != nil { @@ -1083,12 +1083,12 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m // If the agent is closed, we don't want to // log this as an error since it's expected. if closed { - logger.Debug(ctx, "reconnecting pty failed with attach error (agent closed)", slog.Error(err)) + connLogger.Debug(ctx, "reconnecting pty failed with attach error (agent closed)", slog.Error(err)) } else { - logger.Error(ctx, "reconnecting pty failed with attach error", slog.Error(err)) + connLogger.Error(ctx, "reconnecting pty failed with attach error", slog.Error(err)) } } - logger.Debug(ctx, "reconnecting pty connection closed") + connLogger.Debug(ctx, "reconnecting pty connection closed") }() var rpty reconnectingpty.ReconnectingPTY @@ -1097,7 +1097,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m waitReady, ok := a.reconnectingPTYs.LoadOrStore(msg.ID, sendConnected) if ok { close(sendConnected) // Unused. - logger.Debug(ctx, "connecting to existing reconnecting pty") + connLogger.Debug(ctx, "connecting to existing reconnecting pty") c, ok := waitReady.(chan reconnectingpty.ReconnectingPTY) if !ok { return xerrors.Errorf("found invalid type in reconnecting pty map: %T", waitReady) @@ -1108,7 +1108,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m } c <- rpty // Put it back for the next reconnect. 
} else { - logger.Debug(ctx, "creating new reconnecting pty") + connLogger.Debug(ctx, "creating new reconnecting pty") connected := false defer func() { @@ -1141,7 +1141,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m connected = true sendConnected <- rpty } - return rpty.Attach(ctx, connectionID, conn, msg.Height, msg.Width, logger) + return rpty.Attach(ctx, connectionID, conn, msg.Height, msg.Width, connLogger) } // startReportingConnectionStats runs the connection stats reporting goroutine. From 9b88a68b19243c9b467485126a1c1f2b0c1dc698 Mon Sep 17 00:00:00 2001 From: Asher Date: Thu, 10 Aug 2023 15:57:45 -0800 Subject: [PATCH 22/28] Protect map and state with the same mutex I moved the conn closes back to the lifecycle, too. --- agent/reconnectingpty/buffered.go | 99 +++++++++++------------- agent/reconnectingpty/reconnectingpty.go | 13 +++- agent/reconnectingpty/screen.go | 4 +- 3 files changed, 58 insertions(+), 58 deletions(-) diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index e5b1307ef9b88..37447e816501c 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -5,7 +5,6 @@ import ( "errors" "io" "net" - "sync" "time" "github.com/armon/circbuf" @@ -23,9 +22,6 @@ import ( type bufferedReconnectingPTY struct { command *pty.Cmd - // mutex protects writing to the circular buffer and connections. - mutex sync.RWMutex - activeConns map[string]net.Conn circularBuffer *circbuf.Buffer @@ -100,7 +96,7 @@ func newBuffered(ctx context.Context, cmd *pty.Cmd, options *Options, logger slo break } part := buffer[:read] - rpty.mutex.Lock() + rpty.state.cond.L.Lock() _, err = rpty.circularBuffer.Write(part) if err != nil { logger.Error(ctx, "write to circular buffer", slog.Error(err)) @@ -119,7 +115,7 @@ func newBuffered(ctx context.Context, cmd *pty.Cmd, options *Options, logger slo rpty.metrics.WithLabelValues("write").Add(1) } } - rpty.mutex.Unlock() + rpty.state.cond.L.Unlock() } }() @@ -136,7 +132,7 @@ func (rpty *bufferedReconnectingPTY) lifecycle(ctx context.Context, logger slog. logger.Debug(ctx, "reconnecting pty ready") rpty.state.setState(StateReady, nil) - state, reasonErr := rpty.state.waitForStateOrContext(ctx, StateClosing) + state, reasonErr := rpty.state.waitForStateOrContext(ctx, StateClosing, nil) if state < StateClosing { // If we have not closed yet then the context is what unblocked us (which // means the agent is shutting down) so move into the closing phase. @@ -144,6 +140,21 @@ func (rpty *bufferedReconnectingPTY) lifecycle(ctx context.Context, logger slog. } rpty.timer.Stop() + rpty.state.cond.L.Lock() + // Log these closes only for debugging since the connections or processes + // might have already closed on their own. + for _, conn := range rpty.activeConns { + err := conn.Close() + if err != nil { + logger.Debug(ctx, "closed conn with error", slog.Error(err)) + } + } + // Connections get removed once they close but it is possible there is still + // some data that will be written before that happens so clear the map now to + // avoid writing to closed connections. + rpty.activeConns = map[string]net.Conn{} + rpty.state.cond.L.Unlock() + // Log close/kill only for debugging since the process might have already // closed on its own. 
err := rpty.ptty.Close() @@ -167,29 +178,42 @@ func (rpty *bufferedReconnectingPTY) Attach(ctx context.Context, connID string, ctx, cancel := context.WithCancel(ctx) defer cancel() - state, err := rpty.state.waitForStateOrContext(ctx, StateReady) - if state != StateReady { - return xerrors.Errorf("reconnecting pty ready wait: %w", err) - } + // Once we are ready, attach the active connection while we hold the mutex. + _, err := rpty.state.waitForStateOrContext(ctx, StateReady, func(state State, err error) error { + if state != StateReady { + return xerrors.Errorf("reconnecting pty ready wait: %w", err) + } + + go heartbeat(ctx, rpty.timer, rpty.timeout) + + // Resize the PTY to initial height + width. + err = rpty.ptty.Resize(height, width) + if err != nil { + // We can continue after this, it's not fatal! + logger.Warn(ctx, "reconnecting PTY initial resize failed, but will continue", slog.Error(err)) + rpty.metrics.WithLabelValues("resize").Add(1) + } - go heartbeat(ctx, rpty.timer, rpty.timeout) + // Write any previously stored data for the TTY and store the connection for + // future writes. + prevBuf := slices.Clone(rpty.circularBuffer.Bytes()) + _, err = conn.Write(prevBuf) + if err != nil { + rpty.metrics.WithLabelValues("write").Add(1) + return xerrors.Errorf("write buffer to conn: %w", err) + } + rpty.activeConns[connID] = conn - err = rpty.doAttach(ctx, connID, conn, height, width, logger) + return nil + }) if err != nil { return err } - go func() { - _, _ = rpty.state.waitForStateOrContext(ctx, StateClosing) - rpty.mutex.Lock() - defer rpty.mutex.Unlock() + defer func() { + rpty.state.cond.L.Lock() + defer rpty.state.cond.L.Unlock() delete(rpty.activeConns, connID) - // Log closes only for debugging since the connection might have already - // closed on its own. - err := conn.Close() - if err != nil { - logger.Debug(ctx, "closed conn with error", slog.Error(err)) - } }() // Pipe conn -> pty and block. pty -> conn is handled in newBuffered(). @@ -197,35 +221,6 @@ func (rpty *bufferedReconnectingPTY) Attach(ctx context.Context, connID string, return nil } -// doAttach adds the connection to the map, replays the buffer, and starts the -// heartbeat. It exists separately only so we can defer the mutex unlock which -// is not possible in Attach since it blocks. -func (rpty *bufferedReconnectingPTY) doAttach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) error { - // Ensure we do not write to or close connections while we attach. - rpty.mutex.Lock() - defer rpty.mutex.Unlock() - - // Resize the PTY to initial height + width. - err := rpty.ptty.Resize(height, width) - if err != nil { - // We can continue after this, it's not fatal! - logger.Warn(ctx, "reconnecting PTY initial resize failed, but will continue", slog.Error(err)) - rpty.metrics.WithLabelValues("resize").Add(1) - } - - // Write any previously stored data for the TTY and store the connection for - // future writes. 
- prevBuf := slices.Clone(rpty.circularBuffer.Bytes()) - _, err = conn.Write(prevBuf) - if err != nil { - rpty.metrics.WithLabelValues("write").Add(1) - return xerrors.Errorf("write buffer to conn: %w", err) - } - rpty.activeConns[connID] = conn - - return nil -} - func (rpty *bufferedReconnectingPTY) Wait() { _, _ = rpty.state.waitForState(StateClosing) } diff --git a/agent/reconnectingpty/reconnectingpty.go b/agent/reconnectingpty/reconnectingpty.go index d8f2ee2229d1f..7c030549cbcf0 100644 --- a/agent/reconnectingpty/reconnectingpty.go +++ b/agent/reconnectingpty/reconnectingpty.go @@ -167,8 +167,9 @@ func (s *ptyState) waitForState(state State) (State, error) { } // waitForStateOrContext blocks until the state or a greater one is reached or -// the provided context ends. -func (s *ptyState) waitForStateOrContext(ctx context.Context, state State) (State, error) { +// the provided context ends. If fn is non-nil it will be ran while the lock is +// held and fn's error will replace waitForStateOrContext's error. +func (s *ptyState) waitForStateOrContext(ctx context.Context, state State, fn func(state State, err error) error) (State, error) { nevermind := make(chan struct{}) defer close(nevermind) go func() { @@ -185,10 +186,14 @@ func (s *ptyState) waitForStateOrContext(ctx context.Context, state State) (Stat for ctx.Err() == nil && state > s.state { s.cond.Wait() } + err := s.error if ctx.Err() != nil { - return s.state, ctx.Err() + err = ctx.Err() } - return s.state, s.error + if fn != nil { + return s.state, fn(s.state, err) + } + return s.state, err } // readConnLoop reads messages from conn and writes to ptty as needed. Blocks diff --git a/agent/reconnectingpty/screen.go b/agent/reconnectingpty/screen.go index 0203154f83335..396c17e6b4faa 100644 --- a/agent/reconnectingpty/screen.go +++ b/agent/reconnectingpty/screen.go @@ -130,7 +130,7 @@ func (rpty *screenReconnectingPTY) lifecycle(ctx context.Context, logger slog.Lo logger.Debug(ctx, "reconnecting pty ready") rpty.state.setState(StateReady, nil) - state, reasonErr := rpty.state.waitForStateOrContext(ctx, StateClosing) + state, reasonErr := rpty.state.waitForStateOrContext(ctx, StateClosing, nil) if state < StateClosing { // If we have not closed yet then the context is what unblocked us (which // means the agent is shutting down) so move into the closing phase. 
@@ -155,7 +155,7 @@ func (rpty *screenReconnectingPTY) Attach(ctx context.Context, _ string, conn ne ctx, cancel := context.WithCancel(ctx) defer cancel() - state, err := rpty.state.waitForStateOrContext(ctx, StateReady) + state, err := rpty.state.waitForStateOrContext(ctx, StateReady, nil) if state != StateReady { return xerrors.Errorf("reconnecting pty ready wait: %w", err) } From 56e71c99e9be183f17a9f065e39e2617afd2efaa Mon Sep 17 00:00:00 2001 From: Asher Date: Thu, 10 Aug 2023 16:32:25 -0800 Subject: [PATCH 23/28] Fix incorrect test comment --- agent/agent_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent/agent_test.go b/agent/agent_test.go index 6d7e5fd1da47a..e9f8bc772bb0f 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -1727,7 +1727,7 @@ func TestAgent_ReconnectingPTY(t *testing.T) { defer netConn4.Close() scanner4 := bufio.NewScanner(netConn4) - require.True(t, hasLine(scanner4, matchEchoOutput), "find exit command") + require.True(t, hasLine(scanner4, matchEchoOutput), "find echo output") for scanner4.Scan() { line := scanner4.Text() t.Logf("bash tty stdout = %s", re.ReplaceAllString(line, "")) From d4170ca13b0348f03ae27dcb90f0727f743308a6 Mon Sep 17 00:00:00 2001 From: Asher Date: Thu, 10 Aug 2023 17:39:31 -0800 Subject: [PATCH 24/28] Attempt fixing flake with 'echo test' command --- agent/reconnectingpty/buffered.go | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index 37447e816501c..85c6886ec911e 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -180,6 +180,16 @@ func (rpty *bufferedReconnectingPTY) Attach(ctx context.Context, connID string, // Once we are ready, attach the active connection while we hold the mutex. _, err := rpty.state.waitForStateOrContext(ctx, StateReady, func(state State, err error) error { + // Write any previously stored data for the TTY. Since the command might be + // short-lived and have already exited, make sure we always at least output + // the buffer before returning. + prevBuf := slices.Clone(rpty.circularBuffer.Bytes()) + _, writeErr := conn.Write(prevBuf) + if writeErr != nil { + rpty.metrics.WithLabelValues("write").Add(1) + return xerrors.Errorf("write buffer to conn: %w", writeErr) + } + if state != StateReady { return xerrors.Errorf("reconnecting pty ready wait: %w", err) } @@ -194,14 +204,7 @@ func (rpty *bufferedReconnectingPTY) Attach(ctx context.Context, connID string, rpty.metrics.WithLabelValues("resize").Add(1) } - // Write any previously stored data for the TTY and store the connection for - // future writes. - prevBuf := slices.Clone(rpty.circularBuffer.Bytes()) - _, err = conn.Write(prevBuf) - if err != nil { - rpty.metrics.WithLabelValues("write").Add(1) - return xerrors.Errorf("write buffer to conn: %w", err) - } + // Store the connection for future writes. 
rpty.activeConns[connID] = conn return nil From 34c5c1a6a71d4fc5802749d607314cf6bd03961d Mon Sep 17 00:00:00 2001 From: Asher Date: Fri, 11 Aug 2023 10:28:30 -0800 Subject: [PATCH 25/28] Avoid wait callback when context expires --- agent/reconnectingpty/buffered.go | 4 ++-- agent/reconnectingpty/reconnectingpty.go | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index 85c6886ec911e..109a2d1a68f08 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -191,7 +191,7 @@ func (rpty *bufferedReconnectingPTY) Attach(ctx context.Context, connID string, } if state != StateReady { - return xerrors.Errorf("reconnecting pty ready wait: %w", err) + return err } go heartbeat(ctx, rpty.timer, rpty.timeout) @@ -210,7 +210,7 @@ func (rpty *bufferedReconnectingPTY) Attach(ctx context.Context, connID string, return nil }) if err != nil { - return err + return xerrors.Errorf("reconnecting pty ready wait: %w", err) } defer func() { diff --git a/agent/reconnectingpty/reconnectingpty.go b/agent/reconnectingpty/reconnectingpty.go index 7c030549cbcf0..4b154293dd3f6 100644 --- a/agent/reconnectingpty/reconnectingpty.go +++ b/agent/reconnectingpty/reconnectingpty.go @@ -168,7 +168,8 @@ func (s *ptyState) waitForState(state State) (State, error) { // waitForStateOrContext blocks until the state or a greater one is reached or // the provided context ends. If fn is non-nil it will be ran while the lock is -// held and fn's error will replace waitForStateOrContext's error. +// held unless the context ends, and fn's error will replace +// waitForStateOrContext's error. func (s *ptyState) waitForStateOrContext(ctx context.Context, state State, fn func(state State, err error) error) (State, error) { nevermind := make(chan struct{}) defer close(nevermind) @@ -186,14 +187,13 @@ func (s *ptyState) waitForStateOrContext(ctx context.Context, state State, fn fu for ctx.Err() == nil && state > s.state { s.cond.Wait() } - err := s.error if ctx.Err() != nil { - err = ctx.Err() + return s.state, ctx.Err() } if fn != nil { - return s.state, fn(s.state, err) + return s.state, fn(s.state, s.error) } - return s.state, err + return s.state, s.error } // readConnLoop reads messages from conn and writes to ptty as needed. Blocks From 88a6b9690ee0a5f4fa5155cc72b60e05623dfe0e Mon Sep 17 00:00:00 2001 From: Asher Date: Fri, 11 Aug 2023 10:38:44 -0800 Subject: [PATCH 26/28] Remove err from wait callback We do not really do anything with it other than just return it, and if we do need to do something with it we can just do it on the return from `waitForStateOrContext`, we probably would not need to use it behind the mutex. Also we should check the state outside since there is technically no guarantee an error is set on a state change (although in practice for close/done there always is, maybe this should be enforced in some way). --- agent/reconnectingpty/buffered.go | 12 ++++++------ agent/reconnectingpty/reconnectingpty.go | 11 +++++++---- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index 109a2d1a68f08..2dca5bb5e6562 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -179,19 +179,19 @@ func (rpty *bufferedReconnectingPTY) Attach(ctx context.Context, connID string, defer cancel() // Once we are ready, attach the active connection while we hold the mutex. 
- _, err := rpty.state.waitForStateOrContext(ctx, StateReady, func(state State, err error) error { + state, err := rpty.state.waitForStateOrContext(ctx, StateReady, func(state State) error { // Write any previously stored data for the TTY. Since the command might be // short-lived and have already exited, make sure we always at least output // the buffer before returning. prevBuf := slices.Clone(rpty.circularBuffer.Bytes()) - _, writeErr := conn.Write(prevBuf) - if writeErr != nil { + _, err := conn.Write(prevBuf) + if err != nil { rpty.metrics.WithLabelValues("write").Add(1) - return xerrors.Errorf("write buffer to conn: %w", writeErr) + return xerrors.Errorf("write buffer to conn: %w", err) } if state != StateReady { - return err + return nil } go heartbeat(ctx, rpty.timer, rpty.timeout) @@ -209,7 +209,7 @@ func (rpty *bufferedReconnectingPTY) Attach(ctx context.Context, connID string, return nil }) - if err != nil { + if state != StateReady || err != nil { return xerrors.Errorf("reconnecting pty ready wait: %w", err) } diff --git a/agent/reconnectingpty/reconnectingpty.go b/agent/reconnectingpty/reconnectingpty.go index 4b154293dd3f6..5df3d7727bdf8 100644 --- a/agent/reconnectingpty/reconnectingpty.go +++ b/agent/reconnectingpty/reconnectingpty.go @@ -168,9 +168,9 @@ func (s *ptyState) waitForState(state State) (State, error) { // waitForStateOrContext blocks until the state or a greater one is reached or // the provided context ends. If fn is non-nil it will be ran while the lock is -// held unless the context ends, and fn's error will replace -// waitForStateOrContext's error. -func (s *ptyState) waitForStateOrContext(ctx context.Context, state State, fn func(state State, err error) error) (State, error) { +// held unless the context ends. If fn returns an error then fn's error will +// replace waitForStateOrContext's error. +func (s *ptyState) waitForStateOrContext(ctx context.Context, state State, fn func(state State) error) (State, error) { nevermind := make(chan struct{}) defer close(nevermind) go func() { @@ -191,7 +191,10 @@ func (s *ptyState) waitForStateOrContext(ctx context.Context, state State, fn fu return s.state, ctx.Err() } if fn != nil { - return s.state, fn(s.state, s.error) + err := fn(s.state) + if err != nil { + return s.state, err + } } return s.state, s.error } From 1fd4e9ab50b32c8c281440a05994f8907bcb2c95 Mon Sep 17 00:00:00 2001 From: Asher Date: Mon, 14 Aug 2023 08:13:10 -0800 Subject: [PATCH 27/28] Add state wait func where caller holds the lock --- agent/reconnectingpty/buffered.go | 73 +++++++++++++----------- agent/reconnectingpty/reconnectingpty.go | 20 +++---- 2 files changed, 49 insertions(+), 44 deletions(-) diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index 2dca5bb5e6562..e0267e71c0241 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -178,39 +178,9 @@ func (rpty *bufferedReconnectingPTY) Attach(ctx context.Context, connID string, ctx, cancel := context.WithCancel(ctx) defer cancel() - // Once we are ready, attach the active connection while we hold the mutex. - state, err := rpty.state.waitForStateOrContext(ctx, StateReady, func(state State) error { - // Write any previously stored data for the TTY. Since the command might be - // short-lived and have already exited, make sure we always at least output - // the buffer before returning. 
- prevBuf := slices.Clone(rpty.circularBuffer.Bytes()) - _, err := conn.Write(prevBuf) - if err != nil { - rpty.metrics.WithLabelValues("write").Add(1) - return xerrors.Errorf("write buffer to conn: %w", err) - } - - if state != StateReady { - return nil - } - - go heartbeat(ctx, rpty.timer, rpty.timeout) - - // Resize the PTY to initial height + width. - err = rpty.ptty.Resize(height, width) - if err != nil { - // We can continue after this, it's not fatal! - logger.Warn(ctx, "reconnecting PTY initial resize failed, but will continue", slog.Error(err)) - rpty.metrics.WithLabelValues("resize").Add(1) - } - - // Store the connection for future writes. - rpty.activeConns[connID] = conn - - return nil - }) - if state != StateReady || err != nil { - return xerrors.Errorf("reconnecting pty ready wait: %w", err) + err := rpty.doAttach(ctx, connID, conn, height, width, logger) + if err != nil { + return err } defer func() { @@ -224,6 +194,43 @@ func (rpty *bufferedReconnectingPTY) Attach(ctx context.Context, connID string, return nil } +// doAttach adds the connection to the map, replays the buffer, and starts the +// heartbeat. It exists separately only so we can defer the mutex unlock which +// is not possible in Attach since it blocks. +func (rpty *bufferedReconnectingPTY) doAttach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) error { + rpty.state.cond.L.Lock() + defer rpty.state.cond.L.Unlock() + + // Write any previously stored data for the TTY. Since the command might be + // short-lived and have already exited, make sure we always at least output + // the buffer before returning, mostly just so tests pass. + prevBuf := slices.Clone(rpty.circularBuffer.Bytes()) + _, err := conn.Write(prevBuf) + if err != nil { + rpty.metrics.WithLabelValues("write").Add(1) + return xerrors.Errorf("write buffer to conn: %w", err) + } + + state, err := rpty.state.waitForStateOrContextLocked(ctx, StateReady) + if state != StateReady { + return xerrors.Errorf("reconnecting pty ready wait: %w", err) + } + + go heartbeat(ctx, rpty.timer, rpty.timeout) + + // Resize the PTY to initial height + width. + err = rpty.ptty.Resize(height, width) + if err != nil { + // We can continue after this, it's not fatal! + logger.Warn(ctx, "reconnecting PTY initial resize failed, but will continue", slog.Error(err)) + rpty.metrics.WithLabelValues("resize").Add(1) + } + + rpty.activeConns[connID] = conn + + return nil +} + func (rpty *bufferedReconnectingPTY) Wait() { _, _ = rpty.state.waitForState(StateClosing) } diff --git a/agent/reconnectingpty/reconnectingpty.go b/agent/reconnectingpty/reconnectingpty.go index 5df3d7727bdf8..abd5a900dae85 100644 --- a/agent/reconnectingpty/reconnectingpty.go +++ b/agent/reconnectingpty/reconnectingpty.go @@ -167,10 +167,16 @@ func (s *ptyState) waitForState(state State) (State, error) { } // waitForStateOrContext blocks until the state or a greater one is reached or -// the provided context ends. If fn is non-nil it will be ran while the lock is -// held unless the context ends. If fn returns an error then fn's error will -// replace waitForStateOrContext's error. +// the provided context ends. func (s *ptyState) waitForStateOrContext(ctx context.Context, state State, fn func(state State) error) (State, error) { + s.cond.L.Lock() + defer s.cond.L.Unlock() + return s.waitForStateOrContextLocked(ctx, state) +} + +// waitForStateOrContextLocked is the same as waitForStateOrContext except it +// assumes the caller has already locked cond. 
+func (s *ptyState) waitForStateOrContextLocked(ctx context.Context, state State) (State, error) { nevermind := make(chan struct{}) defer close(nevermind) go func() { @@ -182,20 +188,12 @@ func (s *ptyState) waitForStateOrContext(ctx context.Context, state State, fn fu } }() - s.cond.L.Lock() - defer s.cond.L.Unlock() for ctx.Err() == nil && state > s.state { s.cond.Wait() } if ctx.Err() != nil { return s.state, ctx.Err() } - if fn != nil { - err := fn(s.state) - if err != nil { - return s.state, err - } - } return s.state, s.error } From 968526d9b3832811abf3466a4027c74bf944fbc2 Mon Sep 17 00:00:00 2001 From: Asher Date: Mon, 14 Aug 2023 09:49:44 -0800 Subject: [PATCH 28/28] Remove unused fn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤦 --- agent/reconnectingpty/buffered.go | 2 +- agent/reconnectingpty/reconnectingpty.go | 2 +- agent/reconnectingpty/screen.go | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go index e0267e71c0241..93241ada29687 100644 --- a/agent/reconnectingpty/buffered.go +++ b/agent/reconnectingpty/buffered.go @@ -132,7 +132,7 @@ func (rpty *bufferedReconnectingPTY) lifecycle(ctx context.Context, logger slog. logger.Debug(ctx, "reconnecting pty ready") rpty.state.setState(StateReady, nil) - state, reasonErr := rpty.state.waitForStateOrContext(ctx, StateClosing, nil) + state, reasonErr := rpty.state.waitForStateOrContext(ctx, StateClosing) if state < StateClosing { // If we have not closed yet then the context is what unblocked us (which // means the agent is shutting down) so move into the closing phase. diff --git a/agent/reconnectingpty/reconnectingpty.go b/agent/reconnectingpty/reconnectingpty.go index abd5a900dae85..e3dbb9024b063 100644 --- a/agent/reconnectingpty/reconnectingpty.go +++ b/agent/reconnectingpty/reconnectingpty.go @@ -168,7 +168,7 @@ func (s *ptyState) waitForState(state State) (State, error) { // waitForStateOrContext blocks until the state or a greater one is reached or // the provided context ends. -func (s *ptyState) waitForStateOrContext(ctx context.Context, state State, fn func(state State) error) (State, error) { +func (s *ptyState) waitForStateOrContext(ctx context.Context, state State) (State, error) { s.cond.L.Lock() defer s.cond.L.Unlock() return s.waitForStateOrContextLocked(ctx, state) diff --git a/agent/reconnectingpty/screen.go b/agent/reconnectingpty/screen.go index 396c17e6b4faa..0203154f83335 100644 --- a/agent/reconnectingpty/screen.go +++ b/agent/reconnectingpty/screen.go @@ -130,7 +130,7 @@ func (rpty *screenReconnectingPTY) lifecycle(ctx context.Context, logger slog.Lo logger.Debug(ctx, "reconnecting pty ready") rpty.state.setState(StateReady, nil) - state, reasonErr := rpty.state.waitForStateOrContext(ctx, StateClosing, nil) + state, reasonErr := rpty.state.waitForStateOrContext(ctx, StateClosing) if state < StateClosing { // If we have not closed yet then the context is what unblocked us (which // means the agent is shutting down) so move into the closing phase. @@ -155,7 +155,7 @@ func (rpty *screenReconnectingPTY) Attach(ctx context.Context, _ string, conn ne ctx, cancel := context.WithCancel(ctx) defer cancel() - state, err := rpty.state.waitForStateOrContext(ctx, StateReady, nil) + state, err := rpty.state.waitForStateOrContext(ctx, StateReady) if state != StateReady { return xerrors.Errorf("reconnecting pty ready wait: %w", err) }
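
Note: the following is a minimal, standalone sketch (not part of any patch
above) of the cancel-versus-timeout handling that sendCommand's retry loop
uses. The retried function, timeout, and tick interval here are made up for
illustration; only the error-handling shape mirrors the patched code: a
canceled context is returned as-is, while a deadline error is joined with the
last command error so the caller can still see why the command never
succeeded.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

var errStillStarting = errors.New("session not up yet")

// retryUntilReady retries check on a short ticker until it succeeds, the
// context is canceled, or the timeout elapses.
func retryUntilReady(ctx context.Context, check func() error) error {
	ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()

	ticker := time.NewTicker(250 * time.Millisecond)
	defer ticker.Stop()

	var lastErr error
	for {
		select {
		case <-ctx.Done():
			// A cancellation is deliberate, so return it unwrapped.
			if errors.Is(ctx.Err(), context.Canceled) {
				return ctx.Err()
			}
			// A timeout means the command kept failing; include the last
			// failure alongside the deadline error.
			return errors.Join(ctx.Err(), lastErr)
		case <-ticker.C:
			if lastErr = check(); lastErr == nil {
				return nil
			}
		}
	}
}

func main() {
	attempts := 0
	err := retryUntilReady(context.Background(), func() error {
		attempts++
		if attempts < 3 {
			return errStillStarting
		}
		return nil
	})
	fmt.Println(err) // <nil> after the third attempt
}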
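
Note: below is a simplified, self-contained sketch of the cond-based state
wait that the later patches converge on: waitForStateOrContext plus a Locked
variant for callers that already hold the lock and want to act on the result
(for example replaying a buffer and registering a connection) before releasing
it. The State values, field names, and timings here are trimmed for
illustration and are not copied from the real agent/reconnectingpty package.

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

type State int

const (
	StateStarting State = iota
	StateReady
	StateClosing
	StateDone
)

type ptyState struct {
	cond  *sync.Cond
	state State
	err   error
}

func newState() *ptyState {
	return &ptyState{cond: sync.NewCond(&sync.Mutex{})}
}

// setState advances the state (never backwards) and wakes all waiters.
func (s *ptyState) setState(state State, err error) {
	s.cond.L.Lock()
	defer s.cond.L.Unlock()
	if state <= s.state {
		return
	}
	s.state = state
	s.err = err
	s.cond.Broadcast()
}

// waitForStateOrContext blocks until the state or a greater one is reached or
// the provided context ends.
func (s *ptyState) waitForStateOrContext(ctx context.Context, state State) (State, error) {
	s.cond.L.Lock()
	defer s.cond.L.Unlock()
	return s.waitForStateOrContextLocked(ctx, state)
}

// waitForStateOrContextLocked is the same but assumes the caller already holds
// cond's lock.
func (s *ptyState) waitForStateOrContextLocked(ctx context.Context, state State) (State, error) {
	// cond.Wait cannot be interrupted by a context, so wake waiters when the
	// context ends and let the loop condition notice ctx.Err().
	nevermind := make(chan struct{})
	defer close(nevermind)
	go func() {
		select {
		case <-ctx.Done():
			s.cond.Broadcast()
		case <-nevermind:
		}
	}()

	for ctx.Err() == nil && state > s.state {
		s.cond.Wait()
	}
	if ctx.Err() != nil {
		return s.state, ctx.Err()
	}
	return s.state, s.err
}

func main() {
	s := newState()
	go func() {
		time.Sleep(10 * time.Millisecond)
		s.setState(StateReady, nil)
	}()
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	state, err := s.waitForStateOrContext(ctx, StateReady)
	fmt.Println(state, err) // prints: 1 <nil> (StateReady, no error)
}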