Skip to content

Commit 179d9e0

Browse files
authored
fix(coderd): Detect agent disconnect via inactivity (coder#6528)
Fixes coder#5901
1 parent 7fa6483 commit 179d9e0

File tree

1 file changed

+66
-9
lines changed

1 file changed

+66
-9
lines changed

coderd/workspaceagents.go

Lines changed: 66 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@ import (
1111
"net/http"
1212
"net/netip"
1313
"net/url"
14+
"runtime/pprof"
1415
"strconv"
1516
"strings"
17+
"sync"
1618
"time"
1719

1820
"github.com/google/uuid"
@@ -291,11 +293,12 @@ func (api *API) workspaceAgentPTY(rw http.ResponseWriter, r *http.Request) {
291293
})
292294
return
293295
}
294-
go httpapi.Heartbeat(ctx, conn)
295296

296-
_, wsNetConn := websocketNetConn(ctx, conn, websocket.MessageBinary)
297+
ctx, wsNetConn := websocketNetConn(ctx, conn, websocket.MessageBinary)
297298
defer wsNetConn.Close() // Also closes conn.
298299

300+
go httpapi.Heartbeat(ctx, conn)
301+
299302
agentConn, release, err := api.workspaceAgentCache.Acquire(r, workspaceAgent.ID)
300303
if err != nil {
301304
_ = conn.Close(websocket.StatusInternalError, httpapi.WebsocketCloseSprintf("dial workspace agent: %s", err))
@@ -606,11 +609,40 @@ func (api *API) workspaceAgentCoordinate(rw http.ResponseWriter, r *http.Request
606609
})
607610
return
608611
}
609-
go httpapi.Heartbeat(ctx, conn)
610612

611613
ctx, wsNetConn := websocketNetConn(ctx, conn, websocket.MessageBinary)
612614
defer wsNetConn.Close()
613615

616+
// We use a custom heartbeat routine here instead of `httpapi.Heartbeat`
617+
// because we want to log the agent's last ping time.
618+
var lastPing time.Time
619+
var pingMu sync.Mutex
620+
go pprof.Do(ctx, pprof.Labels("agent", workspaceAgent.ID.String()), func(ctx context.Context) {
621+
// TODO(mafredri): Is this too frequent? Use separate ping disconnect timeout?
622+
t := time.NewTicker(api.AgentConnectionUpdateFrequency)
623+
defer t.Stop()
624+
625+
for {
626+
select {
627+
case <-t.C:
628+
case <-ctx.Done():
629+
return
630+
}
631+
632+
// We don't need a context that times out here because the ping will
633+
// eventually go through. If the context times out, then other
634+
// websocket read operations will receive an error, obfuscating the
635+
// actual problem.
636+
err := conn.Ping(ctx)
637+
if err != nil {
638+
return
639+
}
640+
pingMu.Lock()
641+
lastPing = time.Now()
642+
pingMu.Unlock()
643+
}
644+
})
645+
614646
firstConnectedAt := workspaceAgent.FirstConnectedAt
615647
if !firstConnectedAt.Valid {
616648
firstConnectedAt = sql.NullTime{
@@ -654,9 +686,12 @@ func (api *API) workspaceAgentCoordinate(rw http.ResponseWriter, r *http.Request
654686
ctx, cancel := context.WithTimeout(dbauthz.AsSystemRestricted(api.ctx), api.AgentInactiveDisconnectTimeout)
655687
defer cancel()
656688

657-
disconnectedAt = sql.NullTime{
658-
Time: database.Now(),
659-
Valid: true,
689+
// Only update timestamp if the disconnect is new.
690+
if !disconnectedAt.Valid {
691+
disconnectedAt = sql.NullTime{
692+
Time: database.Now(),
693+
Valid: true,
694+
}
660695
}
661696
err := updateConnectionTimes(ctx)
662697
if err != nil {
@@ -711,15 +746,37 @@ func (api *API) workspaceAgentCoordinate(rw http.ResponseWriter, r *http.Request
711746
return
712747
case <-ticker.C:
713748
}
714-
lastConnectedAt = sql.NullTime{
715-
Time: database.Now(),
716-
Valid: true,
749+
750+
pingMu.Lock()
751+
lastPing := lastPing
752+
pingMu.Unlock()
753+
754+
var connectionStatusChanged bool
755+
if time.Since(lastPing) > api.AgentInactiveDisconnectTimeout {
756+
if !disconnectedAt.Valid {
757+
connectionStatusChanged = true
758+
disconnectedAt = sql.NullTime{
759+
Time: database.Now(),
760+
Valid: true,
761+
}
762+
}
763+
} else {
764+
connectionStatusChanged = disconnectedAt.Valid
765+
// TODO(mafredri): Should we update it here or allow lastConnectedAt to shadow it?
766+
disconnectedAt = sql.NullTime{}
767+
lastConnectedAt = sql.NullTime{
768+
Time: database.Now(),
769+
Valid: true,
770+
}
717771
}
718772
err = updateConnectionTimes(ctx)
719773
if err != nil {
720774
_ = conn.Close(websocket.StatusGoingAway, err.Error())
721775
return
722776
}
777+
if connectionStatusChanged {
778+
api.publishWorkspaceUpdate(ctx, build.WorkspaceID)
779+
}
723780
err := ensureLatestBuild()
724781
if err != nil {
725782
// Disconnect agents that are no longer valid.

0 commit comments

Comments
 (0)