Skip to content

Commit a8ac205

Browse files
committed
fix(enterprise): mark nodes from unhealthy coordinators as lost
Instead of removing the mappings of unhealthy coordinators entirely, mark them as lost instead. This prevents peers from disappearing from other peers if a coordinator misses a heartbeat.
1 parent 3ff9cef commit a8ac205

File tree

2 files changed

+48
-5
lines changed

2 files changed

+48
-5
lines changed

enterprise/tailnet/pgcoord.go

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1485,10 +1485,17 @@ func (h *heartbeats) filter(mappings []mapping) []mapping {
14851485
ok := m.coordinator == h.self
14861486
if !ok {
14871487
_, ok = h.coordinators[m.coordinator]
1488+
if !ok {
1489+
// If a mapping exists to a coordinator lost to heartbeats,
1490+
// still add the mapping as LOST. If a coordinator misses
1491+
// heartbeats but a client is still connected to it, this may be
1492+
// the only mapping available for it. Newer mappings will take
1493+
// precedence.
1494+
m.kind = proto.CoordinateResponse_PeerUpdate_LOST
1495+
}
14881496
}
1489-
if ok {
1490-
out = append(out, m)
1491-
}
1497+
1498+
out = append(out, m)
14921499
}
14931500
return out
14941501
}

enterprise/tailnet/pgcoord_internal_test.go

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"time"
1212

1313
"github.com/google/uuid"
14+
"github.com/stretchr/testify/assert"
1415
"github.com/stretchr/testify/require"
1516
"go.uber.org/mock/gomock"
1617
"golang.org/x/xerrors"
@@ -33,9 +34,9 @@ import (
3334
// make update-golden-files
3435
var UpdateGoldenFiles = flag.Bool("update", false, "update .golden files")
3536

36-
// TestHeartbeat_Cleanup is internal so that we can overwrite the cleanup period and not wait an hour for the timed
37+
// TestHeartbeats_Cleanup is internal so that we can overwrite the cleanup period and not wait an hour for the timed
3738
// cleanup.
38-
func TestHeartbeat_Cleanup(t *testing.T) {
39+
func TestHeartbeats_Cleanup(t *testing.T) {
3940
t.Parallel()
4041

4142
ctrl := gomock.NewController(t)
@@ -78,6 +79,41 @@ func TestHeartbeat_Cleanup(t *testing.T) {
7879
close(waitForCleanup)
7980
}
8081

82+
func TestHeartbeats_LostCoordinator_MarkLost(t *testing.T) {
83+
t.Parallel()
84+
85+
ctrl := gomock.NewController(t)
86+
mStore := dbmock.NewMockStore(ctrl)
87+
88+
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
89+
defer cancel()
90+
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
91+
92+
uut := &heartbeats{
93+
ctx: ctx,
94+
logger: logger,
95+
store: mStore,
96+
cleanupPeriod: time.Millisecond,
97+
coordinators: map[uuid.UUID]time.Time{
98+
uuid.New(): time.Now(),
99+
},
100+
}
101+
102+
mpngs := []mapping{{
103+
peer: uuid.New(),
104+
coordinator: uuid.New(),
105+
updatedAt: time.Now(),
106+
node: &proto.Node{},
107+
kind: proto.CoordinateResponse_PeerUpdate_NODE,
108+
}}
109+
110+
// Filter should still return the mapping without a coordinator, but marked
111+
// as LOST.
112+
got := uut.filter(mpngs)
113+
require.Len(t, got, 1)
114+
assert.Equal(t, proto.CoordinateResponse_PeerUpdate_LOST, got[0].kind)
115+
}
116+
81117
// TestLostPeerCleanupQueries tests that our SQL queries to clean up lost peers do what we expect,
82118
// that is, clean up peers and associated tunnels that have been lost for over 24 hours.
83119
func TestLostPeerCleanupQueries(t *testing.T) {

0 commit comments

Comments
 (0)