Skip to content

Commit b6666cf

Browse files
chore: tailnet debug logging (#7260)
* Enable discovery (disco) debug Signed-off-by: Spike Curtis <spike@coder.com> * Better debug on reconnectingPTY Signed-off-by: Spike Curtis <spike@coder.com> * Agent logging in appstest Signed-off-by: Spike Curtis <spike@coder.com> * More reconnectingPTY logging Signed-off-by: Spike Curtis <spike@coder.com> * Add logging to coordinator Signed-off-by: Spike Curtis <spike@coder.com> * Update agent/agent.go Co-authored-by: Mathias Fredriksson <mafredri@gmail.com> * Update agent/agent.go Co-authored-by: Mathias Fredriksson <mafredri@gmail.com> * Update agent/agent.go Co-authored-by: Mathias Fredriksson <mafredri@gmail.com> * Update agent/agent.go Co-authored-by: Mathias Fredriksson <mafredri@gmail.com> * Clarify logs; remove unrelated changes Signed-off-by: Spike Curtis <spike@coder.com> --------- Signed-off-by: Spike Curtis <spike@coder.com> Co-authored-by: Mathias Fredriksson <mafredri@gmail.com>
1 parent 7aa07cf commit b6666cf

File tree

11 files changed

+94
-33
lines changed

11 files changed

+94
-33
lines changed

.github/workflows/ci.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,7 @@ jobs:
301301
echo "cover=false" >> $GITHUB_OUTPUT
302302
fi
303303
304+
export TS_DEBUG_DISCO=true
304305
gotestsum --junitfile="gotests.xml" --jsonfile="gotests.json" --packages="./..." -- -parallel=8 -timeout=7m -short -failfast $COVERAGE_FLAGS
305306
306307
- name: Print test stats
@@ -377,6 +378,7 @@ jobs:
377378

378379
- name: Test with PostgreSQL Database
379380
run: |
381+
export TS_DEBUG_DISCO=true
380382
make test-postgres
381383
382384
- name: Print test stats

agent/agent.go

+15-2
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,7 @@ func (a *agent) createTailnet(ctx context.Context, derpMap *tailcfg.DERPMap) (_
653653
}
654654
break
655655
}
656+
logger.Debug(ctx, "accepted conn", slog.F("remote", conn.RemoteAddr().String()))
656657
wg.Add(1)
657658
closed := make(chan struct{})
658659
go func() {
@@ -681,6 +682,7 @@ func (a *agent) createTailnet(ctx context.Context, derpMap *tailcfg.DERPMap) (_
681682
var msg codersdk.WorkspaceAgentReconnectingPTYInit
682683
err = json.Unmarshal(data, &msg)
683684
if err != nil {
685+
logger.Warn(ctx, "failed to unmarshal init", slog.F("raw", data))
684686
return
685687
}
686688
_ = a.handleReconnectingPTY(ctx, logger, msg, conn)
@@ -972,6 +974,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
972974

973975
connectionID := uuid.NewString()
974976
logger = logger.With(slog.F("id", msg.ID), slog.F("connection_id", connectionID))
977+
logger.Debug(ctx, "starting handler")
975978

976979
defer func() {
977980
if err := retErr; err != nil {
@@ -1039,6 +1042,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10391042
// 1. The timeout completed.
10401043
// 2. The parent context was canceled.
10411044
<-ctx.Done()
1045+
logger.Debug(ctx, "context done", slog.Error(ctx.Err()))
10421046
_ = process.Kill()
10431047
}()
10441048
// We don't need to separately monitor for the process exiting.
@@ -1050,6 +1054,8 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10501054
read, err := rpty.ptty.OutputReader().Read(buffer)
10511055
if err != nil {
10521056
// When the PTY is closed, this is triggered.
1057+
// Error is typically a benign EOF, so only log for debugging.
1058+
logger.Debug(ctx, "unable to read pty output, command exited?", slog.Error(err))
10531059
break
10541060
}
10551061
part := buffer[:read]
@@ -1061,8 +1067,15 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10611067
break
10621068
}
10631069
rpty.activeConnsMutex.Lock()
1064-
for _, conn := range rpty.activeConns {
1065-
_, _ = conn.Write(part)
1070+
for cid, conn := range rpty.activeConns {
1071+
_, err = conn.Write(part)
1072+
if err != nil {
1073+
logger.Debug(ctx,
1074+
"error writing to active conn",
1075+
slog.F("other_conn_id", cid),
1076+
slog.Error(err),
1077+
)
1078+
}
10661079
}
10671080
rpty.activeConnsMutex.Unlock()
10681081
}

agent/agent_test.go

+19-15
Original file line numberDiff line numberDiff line change
@@ -879,6 +879,7 @@ func TestAgent_StartupScript(t *testing.T) {
879879
}
880880
t.Run("Success", func(t *testing.T) {
881881
t.Parallel()
882+
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
882883
client := &client{
883884
t: t,
884885
agentID: uuid.New(),
@@ -887,12 +888,12 @@ func TestAgent_StartupScript(t *testing.T) {
887888
DERPMap: &tailcfg.DERPMap{},
888889
},
889890
statsChan: make(chan *agentsdk.Stats),
890-
coordinator: tailnet.NewCoordinator(),
891+
coordinator: tailnet.NewCoordinator(logger),
891892
}
892893
closer := agent.New(agent.Options{
893894
Client: client,
894895
Filesystem: afero.NewMemMapFs(),
895-
Logger: slogtest.Make(t, nil).Named("agent").Leveled(slog.LevelDebug),
896+
Logger: logger.Named("agent"),
896897
ReconnectingPTYTimeout: 0,
897898
})
898899
t.Cleanup(func() {
@@ -910,6 +911,7 @@ func TestAgent_StartupScript(t *testing.T) {
910911
// script has written too many lines it will still succeed!
911912
t.Run("OverflowsAndSkips", func(t *testing.T) {
912913
t.Parallel()
914+
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
913915
client := &client{
914916
t: t,
915917
agentID: uuid.New(),
@@ -927,12 +929,12 @@ func TestAgent_StartupScript(t *testing.T) {
927929
return codersdk.ReadBodyAsError(res)
928930
},
929931
statsChan: make(chan *agentsdk.Stats),
930-
coordinator: tailnet.NewCoordinator(),
932+
coordinator: tailnet.NewCoordinator(logger),
931933
}
932934
closer := agent.New(agent.Options{
933935
Client: client,
934936
Filesystem: afero.NewMemMapFs(),
935-
Logger: slogtest.Make(t, nil).Named("agent").Leveled(slog.LevelDebug),
937+
Logger: logger.Named("agent"),
936938
ReconnectingPTYTimeout: 0,
937939
})
938940
t.Cleanup(func() {
@@ -1282,7 +1284,7 @@ func TestAgent_Lifecycle(t *testing.T) {
12821284

12831285
t.Run("ShutdownScriptOnce", func(t *testing.T) {
12841286
t.Parallel()
1285-
1287+
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
12861288
expected := "this-is-shutdown"
12871289
client := &client{
12881290
t: t,
@@ -1293,13 +1295,13 @@ func TestAgent_Lifecycle(t *testing.T) {
12931295
ShutdownScript: "echo " + expected,
12941296
},
12951297
statsChan: make(chan *agentsdk.Stats),
1296-
coordinator: tailnet.NewCoordinator(),
1298+
coordinator: tailnet.NewCoordinator(logger),
12971299
}
12981300

12991301
fs := afero.NewMemMapFs()
13001302
agent := agent.New(agent.Options{
13011303
Client: client,
1302-
Logger: slogtest.Make(t, nil).Leveled(slog.LevelInfo),
1304+
Logger: logger.Named("agent"),
13031305
Filesystem: fs,
13041306
})
13051307

@@ -1548,9 +1550,10 @@ func TestAgent_Speedtest(t *testing.T) {
15481550

15491551
func TestAgent_Reconnect(t *testing.T) {
15501552
t.Parallel()
1553+
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
15511554
// After the agent is disconnected from a coordinator, it's supposed
15521555
// to reconnect!
1553-
coordinator := tailnet.NewCoordinator()
1556+
coordinator := tailnet.NewCoordinator(logger)
15541557
defer coordinator.Close()
15551558

15561559
agentID := uuid.New()
@@ -1572,7 +1575,7 @@ func TestAgent_Reconnect(t *testing.T) {
15721575
return "", nil
15731576
},
15741577
Client: client,
1575-
Logger: slogtest.Make(t, nil).Leveled(slog.LevelInfo),
1578+
Logger: logger.Named("agent"),
15761579
})
15771580
defer closer.Close()
15781581

@@ -1587,8 +1590,8 @@ func TestAgent_Reconnect(t *testing.T) {
15871590

15881591
func TestAgent_WriteVSCodeConfigs(t *testing.T) {
15891592
t.Parallel()
1590-
1591-
coordinator := tailnet.NewCoordinator()
1593+
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
1594+
coordinator := tailnet.NewCoordinator(logger)
15921595
defer coordinator.Close()
15931596

15941597
client := &client{
@@ -1607,7 +1610,7 @@ func TestAgent_WriteVSCodeConfigs(t *testing.T) {
16071610
return "", nil
16081611
},
16091612
Client: client,
1610-
Logger: slogtest.Make(t, nil).Leveled(slog.LevelInfo),
1613+
Logger: logger.Named("agent"),
16111614
Filesystem: filesystem,
16121615
})
16131616
defer closer.Close()
@@ -1698,10 +1701,11 @@ func setupAgent(t *testing.T, metadata agentsdk.Manifest, ptyTimeout time.Durati
16981701
afero.Fs,
16991702
io.Closer,
17001703
) {
1704+
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
17011705
if metadata.DERPMap == nil {
17021706
metadata.DERPMap = tailnettest.RunDERPAndSTUN(t)
17031707
}
1704-
coordinator := tailnet.NewCoordinator()
1708+
coordinator := tailnet.NewCoordinator(logger)
17051709
t.Cleanup(func() {
17061710
_ = coordinator.Close()
17071711
})
@@ -1718,7 +1722,7 @@ func setupAgent(t *testing.T, metadata agentsdk.Manifest, ptyTimeout time.Durati
17181722
closer := agent.New(agent.Options{
17191723
Client: c,
17201724
Filesystem: fs,
1721-
Logger: slogtest.Make(t, nil).Named("agent").Leveled(slog.LevelDebug),
1725+
Logger: logger.Named("agent"),
17221726
ReconnectingPTYTimeout: ptyTimeout,
17231727
})
17241728
t.Cleanup(func() {
@@ -1727,7 +1731,7 @@ func setupAgent(t *testing.T, metadata agentsdk.Manifest, ptyTimeout time.Durati
17271731
conn, err := tailnet.NewConn(&tailnet.Options{
17281732
Addresses: []netip.Prefix{netip.PrefixFrom(tailnet.IP(), 128)},
17291733
DERPMap: metadata.DERPMap,
1730-
Logger: slogtest.Make(t, nil).Named("client").Leveled(slog.LevelDebug),
1734+
Logger: logger.Named("client"),
17311735
})
17321736
require.NoError(t, err)
17331737
clientConn, serverConn := net.Pipe()

coderd/coderd.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ func New(options *Options) *API {
221221
options.PrometheusRegistry = prometheus.NewRegistry()
222222
}
223223
if options.TailnetCoordinator == nil {
224-
options.TailnetCoordinator = tailnet.NewCoordinator()
224+
options.TailnetCoordinator = tailnet.NewCoordinator(options.Logger)
225225
}
226226
if options.DERPServer == nil {
227227
options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger.Named("derp")))

coderd/prometheusmetrics/prometheusmetrics_test.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"github.com/stretchr/testify/assert"
1717
"github.com/stretchr/testify/require"
1818

19+
"cdr.dev/slog"
1920
"cdr.dev/slog/sloggers/slogtest"
2021

2122
"github.com/coder/coder/coderd/coderdtest"
@@ -298,7 +299,7 @@ func TestAgents(t *testing.T) {
298299
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)
299300

300301
// given
301-
coordinator := tailnet.NewCoordinator()
302+
coordinator := tailnet.NewCoordinator(slogtest.Make(t, nil).Leveled(slog.LevelDebug))
302303
coordinatorPtr := atomic.Pointer[tailnet.Coordinator]{}
303304
coordinatorPtr.Store(&coordinator)
304305
derpMap := tailnettest.RunDERPAndSTUN(t)

coderd/workspaceapps/apptest/setup.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"github.com/stretchr/testify/assert"
1717
"github.com/stretchr/testify/require"
1818

19+
"cdr.dev/slog"
1920
"cdr.dev/slog/sloggers/slogtest"
2021
"github.com/coder/coder/agent"
2122
"github.com/coder/coder/coderd/coderdtest"
@@ -364,7 +365,7 @@ func createWorkspaceWithApps(t *testing.T, client *codersdk.Client, orgID uuid.U
364365
}
365366
agentCloser := agent.New(agent.Options{
366367
Client: agentClient,
367-
Logger: slogtest.Make(t, nil).Named("agent"),
368+
Logger: slogtest.Make(t, nil).Named("agent").Leveled(slog.LevelDebug),
368369
})
369370
t.Cleanup(func() {
370371
_ = agentCloser.Close()

coderd/workspaceapps/proxy.go

+7-2
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,8 @@ func (s *Server) workspaceAgentPTY(rw http.ResponseWriter, r *http.Request) {
600600
if !ok {
601601
return
602602
}
603+
log := s.Logger.With(slog.F("agent_id", appToken.AgentID))
604+
log.Debug(ctx, "resolved PTY request")
603605

604606
values := r.URL.Query()
605607
parser := httpapi.NewQueryParamParser()
@@ -632,19 +634,22 @@ func (s *Server) workspaceAgentPTY(rw http.ResponseWriter, r *http.Request) {
632634

633635
agentConn, release, err := s.WorkspaceConnCache.Acquire(appToken.AgentID)
634636
if err != nil {
635-
s.Logger.Debug(ctx, "dial workspace agent", slog.Error(err))
637+
log.Debug(ctx, "dial workspace agent", slog.Error(err))
636638
_ = conn.Close(websocket.StatusInternalError, httpapi.WebsocketCloseSprintf("dial workspace agent: %s", err))
637639
return
638640
}
639641
defer release()
642+
log.Debug(ctx, "dialed workspace agent")
640643
ptNetConn, err := agentConn.ReconnectingPTY(ctx, reconnect, uint16(height), uint16(width), r.URL.Query().Get("command"))
641644
if err != nil {
642-
s.Logger.Debug(ctx, "dial reconnecting pty server in workspace agent", slog.Error(err))
645+
log.Debug(ctx, "dial reconnecting pty server in workspace agent", slog.Error(err))
643646
_ = conn.Close(websocket.StatusInternalError, httpapi.WebsocketCloseSprintf("dial: %s", err))
644647
return
645648
}
646649
defer ptNetConn.Close()
650+
log.Debug(ctx, "obtained PTY")
647651
agentssh.Bicopy(ctx, wsNetConn, ptNetConn)
652+
log.Debug(ctx, "pty Bicopy finished")
648653
}
649654

650655
// wsNetConn wraps net.Conn created by websocket.NetConn(). Cancel func

coderd/wsconncache/wsconncache_test.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -156,10 +156,10 @@ func TestCache(t *testing.T) {
156156

157157
func setupAgent(t *testing.T, manifest agentsdk.Manifest, ptyTimeout time.Duration) *codersdk.WorkspaceAgentConn {
158158
t.Helper()
159-
159+
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
160160
manifest.DERPMap = tailnettest.RunDERPAndSTUN(t)
161161

162-
coordinator := tailnet.NewCoordinator()
162+
coordinator := tailnet.NewCoordinator(logger)
163163
t.Cleanup(func() {
164164
_ = coordinator.Close()
165165
})
@@ -171,7 +171,7 @@ func setupAgent(t *testing.T, manifest agentsdk.Manifest, ptyTimeout time.Durati
171171
manifest: manifest,
172172
coordinator: coordinator,
173173
},
174-
Logger: slogtest.Make(t, nil).Named("agent").Leveled(slog.LevelInfo),
174+
Logger: logger.Named("agent"),
175175
ReconnectingPTYTimeout: ptyTimeout,
176176
})
177177
t.Cleanup(func() {

enterprise/coderd/coderd.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ func (api *API) updateEntitlements(ctx context.Context) error {
390390
}
391391

392392
if changed, enabled := featureChanged(codersdk.FeatureHighAvailability); changed {
393-
coordinator := agpltailnet.NewCoordinator()
393+
coordinator := agpltailnet.NewCoordinator(api.Logger)
394394
if enabled {
395395
haCoordinator, err := tailnet.NewCoordinator(api.Logger, api.Pubsub)
396396
if err != nil {

0 commit comments

Comments
 (0)