Skip to content

chore: tailnet debug logging #7260

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Apr 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,7 @@ jobs:
echo "cover=false" >> $GITHUB_OUTPUT
fi

export TS_DEBUG_DISCO=true
gotestsum --junitfile="gotests.xml" --jsonfile="gotests.json" --packages="./..." -- -parallel=8 -timeout=7m -short -failfast $COVERAGE_FLAGS

- name: Print test stats
Expand Down Expand Up @@ -377,6 +378,7 @@ jobs:

- name: Test with PostgreSQL Database
run: |
export TS_DEBUG_DISCO=true
make test-postgres

- name: Print test stats
Expand Down
17 changes: 15 additions & 2 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,7 @@ func (a *agent) createTailnet(ctx context.Context, derpMap *tailcfg.DERPMap) (_
}
break
}
logger.Debug(ctx, "accepted conn", slog.F("remote", conn.RemoteAddr().String()))
wg.Add(1)
closed := make(chan struct{})
go func() {
Expand Down Expand Up @@ -676,6 +677,7 @@ func (a *agent) createTailnet(ctx context.Context, derpMap *tailcfg.DERPMap) (_
var msg codersdk.WorkspaceAgentReconnectingPTYInit
err = json.Unmarshal(data, &msg)
if err != nil {
logger.Warn(ctx, "failed to unmarshal init", slog.F("raw", data))
return
}
_ = a.handleReconnectingPTY(ctx, logger, msg, conn)
Expand Down Expand Up @@ -967,6 +969,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m

connectionID := uuid.NewString()
logger = logger.With(slog.F("id", msg.ID), slog.F("connection_id", connectionID))
logger.Debug(ctx, "starting handler")

defer func() {
if err := retErr; err != nil {
Expand Down Expand Up @@ -1034,6 +1037,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
// 1. The timeout completed.
// 2. The parent context was canceled.
<-ctx.Done()
logger.Debug(ctx, "context done", slog.Error(ctx.Err()))
_ = process.Kill()
}()
// We don't need to separately monitor for the process exiting.
Expand All @@ -1045,6 +1049,8 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
read, err := rpty.ptty.OutputReader().Read(buffer)
if err != nil {
// When the PTY is closed, this is triggered.
// Error is typically a benign EOF, so only log for debugging.
logger.Debug(ctx, "unable to read pty output, command exited?", slog.Error(err))
break
}
part := buffer[:read]
Expand All @@ -1056,8 +1062,15 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
break
}
rpty.activeConnsMutex.Lock()
for _, conn := range rpty.activeConns {
_, _ = conn.Write(part)
for cid, conn := range rpty.activeConns {
_, err = conn.Write(part)
if err != nil {
logger.Debug(ctx,
"error writing to active conn",
slog.F("other_conn_id", cid),
slog.Error(err),
)
}
}
rpty.activeConnsMutex.Unlock()
}
Expand Down
34 changes: 19 additions & 15 deletions agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -879,6 +879,7 @@ func TestAgent_StartupScript(t *testing.T) {
}
t.Run("Success", func(t *testing.T) {
t.Parallel()
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
client := &client{
t: t,
agentID: uuid.New(),
Expand All @@ -887,12 +888,12 @@ func TestAgent_StartupScript(t *testing.T) {
DERPMap: &tailcfg.DERPMap{},
},
statsChan: make(chan *agentsdk.Stats),
coordinator: tailnet.NewCoordinator(),
coordinator: tailnet.NewCoordinator(logger),
}
closer := agent.New(agent.Options{
Client: client,
Filesystem: afero.NewMemMapFs(),
Logger: slogtest.Make(t, nil).Named("agent").Leveled(slog.LevelDebug),
Logger: logger.Named("agent"),
ReconnectingPTYTimeout: 0,
})
t.Cleanup(func() {
Expand All @@ -910,6 +911,7 @@ func TestAgent_StartupScript(t *testing.T) {
// script has written too many lines it will still succeed!
t.Run("OverflowsAndSkips", func(t *testing.T) {
t.Parallel()
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
client := &client{
t: t,
agentID: uuid.New(),
Expand All @@ -927,12 +929,12 @@ func TestAgent_StartupScript(t *testing.T) {
return codersdk.ReadBodyAsError(res)
},
statsChan: make(chan *agentsdk.Stats),
coordinator: tailnet.NewCoordinator(),
coordinator: tailnet.NewCoordinator(logger),
}
closer := agent.New(agent.Options{
Client: client,
Filesystem: afero.NewMemMapFs(),
Logger: slogtest.Make(t, nil).Named("agent").Leveled(slog.LevelDebug),
Logger: logger.Named("agent"),
ReconnectingPTYTimeout: 0,
})
t.Cleanup(func() {
Expand Down Expand Up @@ -1282,7 +1284,7 @@ func TestAgent_Lifecycle(t *testing.T) {

t.Run("ShutdownScriptOnce", func(t *testing.T) {
t.Parallel()

logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
expected := "this-is-shutdown"
client := &client{
t: t,
Expand All @@ -1293,13 +1295,13 @@ func TestAgent_Lifecycle(t *testing.T) {
ShutdownScript: "echo " + expected,
},
statsChan: make(chan *agentsdk.Stats),
coordinator: tailnet.NewCoordinator(),
coordinator: tailnet.NewCoordinator(logger),
}

fs := afero.NewMemMapFs()
agent := agent.New(agent.Options{
Client: client,
Logger: slogtest.Make(t, nil).Leveled(slog.LevelInfo),
Logger: logger.Named("agent"),
Filesystem: fs,
})

Expand Down Expand Up @@ -1548,9 +1550,10 @@ func TestAgent_Speedtest(t *testing.T) {

func TestAgent_Reconnect(t *testing.T) {
t.Parallel()
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
// After the agent is disconnected from a coordinator, it's supposed
// to reconnect!
coordinator := tailnet.NewCoordinator()
coordinator := tailnet.NewCoordinator(logger)
defer coordinator.Close()

agentID := uuid.New()
Expand All @@ -1572,7 +1575,7 @@ func TestAgent_Reconnect(t *testing.T) {
return "", nil
},
Client: client,
Logger: slogtest.Make(t, nil).Leveled(slog.LevelInfo),
Logger: logger.Named("agent"),
})
defer closer.Close()

Expand All @@ -1587,8 +1590,8 @@ func TestAgent_Reconnect(t *testing.T) {

func TestAgent_WriteVSCodeConfigs(t *testing.T) {
t.Parallel()

coordinator := tailnet.NewCoordinator()
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
coordinator := tailnet.NewCoordinator(logger)
defer coordinator.Close()

client := &client{
Expand All @@ -1607,7 +1610,7 @@ func TestAgent_WriteVSCodeConfigs(t *testing.T) {
return "", nil
},
Client: client,
Logger: slogtest.Make(t, nil).Leveled(slog.LevelInfo),
Logger: logger.Named("agent"),
Filesystem: filesystem,
})
defer closer.Close()
Expand Down Expand Up @@ -1698,10 +1701,11 @@ func setupAgent(t *testing.T, metadata agentsdk.Manifest, ptyTimeout time.Durati
afero.Fs,
io.Closer,
) {
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
if metadata.DERPMap == nil {
metadata.DERPMap = tailnettest.RunDERPAndSTUN(t)
}
coordinator := tailnet.NewCoordinator()
coordinator := tailnet.NewCoordinator(logger)
t.Cleanup(func() {
_ = coordinator.Close()
})
Expand All @@ -1718,7 +1722,7 @@ func setupAgent(t *testing.T, metadata agentsdk.Manifest, ptyTimeout time.Durati
closer := agent.New(agent.Options{
Client: c,
Filesystem: fs,
Logger: slogtest.Make(t, nil).Named("agent").Leveled(slog.LevelDebug),
Logger: logger.Named("agent"),
ReconnectingPTYTimeout: ptyTimeout,
})
t.Cleanup(func() {
Expand All @@ -1727,7 +1731,7 @@ func setupAgent(t *testing.T, metadata agentsdk.Manifest, ptyTimeout time.Durati
conn, err := tailnet.NewConn(&tailnet.Options{
Addresses: []netip.Prefix{netip.PrefixFrom(tailnet.IP(), 128)},
DERPMap: metadata.DERPMap,
Logger: slogtest.Make(t, nil).Named("client").Leveled(slog.LevelDebug),
Logger: logger.Named("client"),
})
require.NoError(t, err)
clientConn, serverConn := net.Pipe()
Expand Down
2 changes: 1 addition & 1 deletion coderd/coderd.go
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ func New(options *Options) *API {
options.PrometheusRegistry = prometheus.NewRegistry()
}
if options.TailnetCoordinator == nil {
options.TailnetCoordinator = tailnet.NewCoordinator()
options.TailnetCoordinator = tailnet.NewCoordinator(options.Logger)
}
if options.DERPServer == nil {
options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger.Named("derp")))
Expand Down
3 changes: 2 additions & 1 deletion coderd/prometheusmetrics/prometheusmetrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"cdr.dev/slog"
"cdr.dev/slog/sloggers/slogtest"

"github.com/coder/coder/coderd/coderdtest"
Expand Down Expand Up @@ -298,7 +299,7 @@ func TestAgents(t *testing.T) {
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)

// given
coordinator := tailnet.NewCoordinator()
coordinator := tailnet.NewCoordinator(slogtest.Make(t, nil).Leveled(slog.LevelDebug))
coordinatorPtr := atomic.Pointer[tailnet.Coordinator]{}
coordinatorPtr.Store(&coordinator)
derpMap := tailnettest.RunDERPAndSTUN(t)
Expand Down
3 changes: 2 additions & 1 deletion coderd/workspaceapps/apptest/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"cdr.dev/slog"
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/coder/agent"
"github.com/coder/coder/coderd/coderdtest"
Expand Down Expand Up @@ -364,7 +365,7 @@ func createWorkspaceWithApps(t *testing.T, client *codersdk.Client, orgID uuid.U
}
agentCloser := agent.New(agent.Options{
Client: agentClient,
Logger: slogtest.Make(t, nil).Named("agent"),
Logger: slogtest.Make(t, nil).Named("agent").Leveled(slog.LevelDebug),
})
t.Cleanup(func() {
_ = agentCloser.Close()
Expand Down
9 changes: 7 additions & 2 deletions coderd/workspaceapps/proxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,8 @@ func (s *Server) workspaceAgentPTY(rw http.ResponseWriter, r *http.Request) {
if !ok {
return
}
log := s.Logger.With(slog.F("agent_id", appToken.AgentID))
log.Debug(ctx, "resolved PTY request")

values := r.URL.Query()
parser := httpapi.NewQueryParamParser()
Expand Down Expand Up @@ -632,19 +634,22 @@ func (s *Server) workspaceAgentPTY(rw http.ResponseWriter, r *http.Request) {

agentConn, release, err := s.WorkspaceConnCache.Acquire(appToken.AgentID)
if err != nil {
s.Logger.Debug(ctx, "dial workspace agent", slog.Error(err))
log.Debug(ctx, "dial workspace agent", slog.Error(err))
_ = conn.Close(websocket.StatusInternalError, httpapi.WebsocketCloseSprintf("dial workspace agent: %s", err))
return
}
defer release()
log.Debug(ctx, "dialed workspace agent")
ptNetConn, err := agentConn.ReconnectingPTY(ctx, reconnect, uint16(height), uint16(width), r.URL.Query().Get("command"))
if err != nil {
s.Logger.Debug(ctx, "dial reconnecting pty server in workspace agent", slog.Error(err))
log.Debug(ctx, "dial reconnecting pty server in workspace agent", slog.Error(err))
_ = conn.Close(websocket.StatusInternalError, httpapi.WebsocketCloseSprintf("dial: %s", err))
return
}
defer ptNetConn.Close()
log.Debug(ctx, "obtained PTY")
agentssh.Bicopy(ctx, wsNetConn, ptNetConn)
log.Debug(ctx, "pty Bicopy finished")
}

// wsNetConn wraps net.Conn created by websocket.NetConn(). Cancel func
Expand Down
6 changes: 3 additions & 3 deletions coderd/wsconncache/wsconncache_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,10 @@ func TestCache(t *testing.T) {

func setupAgent(t *testing.T, manifest agentsdk.Manifest, ptyTimeout time.Duration) *codersdk.WorkspaceAgentConn {
t.Helper()

logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
manifest.DERPMap = tailnettest.RunDERPAndSTUN(t)

coordinator := tailnet.NewCoordinator()
coordinator := tailnet.NewCoordinator(logger)
t.Cleanup(func() {
_ = coordinator.Close()
})
Expand All @@ -171,7 +171,7 @@ func setupAgent(t *testing.T, manifest agentsdk.Manifest, ptyTimeout time.Durati
manifest: manifest,
coordinator: coordinator,
},
Logger: slogtest.Make(t, nil).Named("agent").Leveled(slog.LevelInfo),
Logger: logger.Named("agent"),
ReconnectingPTYTimeout: ptyTimeout,
})
t.Cleanup(func() {
Expand Down
2 changes: 1 addition & 1 deletion enterprise/coderd/coderd.go
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ func (api *API) updateEntitlements(ctx context.Context) error {
}

if changed, enabled := featureChanged(codersdk.FeatureHighAvailability); changed {
coordinator := agpltailnet.NewCoordinator()
coordinator := agpltailnet.NewCoordinator(api.Logger)
if enabled {
haCoordinator, err := tailnet.NewCoordinator(api.Logger, api.Pubsub)
if err != nil {
Expand Down
Loading