Skip to content

feat: add cleanup of lost tailnet peers and tunnels to PGCoordinator #10939

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
feat: add cleanup of lost tailnet peers and tunnels to PGCoordinator
  • Loading branch information
spikecurtis committed Dec 1, 2023
commit 484a54c91511fd3aeee0de24b5cdb331d1374e93
19 changes: 13 additions & 6 deletions enterprise/tailnet/pgcoord.go
Original file line number Diff line number Diff line change
Expand Up @@ -1723,15 +1723,22 @@ func (h *heartbeats) cleanupLoop() {
}
}

// cleanup issues a DB command to clean out any old expired coordinators state. The cleanup is idempotent, so no need
// to synchronize with other coordinators.
// cleanup issues a DB command to clean out any old expired coordinators or lost peer state. The
// cleanup is idempotent, so no need to synchronize with other coordinators.
func (h *heartbeats) cleanup() {
// the records we are attempting to clean up do no serious harm other than
// accumulating in the tables, so we don't bother retrying if it fails.
err := h.store.CleanTailnetCoordinators(h.ctx)
if err != nil {
// the records we are attempting to clean up do no serious harm other than
// accumulating in the tables, so we don't bother retrying if it fails.
h.logger.Error(h.ctx, "failed to cleanup old coordinators", slog.Error(err))
return
}
h.logger.Debug(h.ctx, "cleaned up old coordinators")
err = h.store.CleanTailnetLostPeers(h.ctx)
if err != nil {
h.logger.Error(h.ctx, "failed to cleanup lost peers", slog.Error(err))
}
err = h.store.CleanTailnetTunnels(h.ctx)
if err != nil {
h.logger.Error(h.ctx, "failed to cleanup abandoned tunnels", slog.Error(err))
}
h.logger.Debug(h.ctx, "completed cleanup")
}
108 changes: 107 additions & 1 deletion enterprise/tailnet/pgcoord_internal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ func TestHeartbeat_Cleanup(t *testing.T) {
<-waitForCleanup
return nil
})
mStore.EXPECT().CleanTailnetLostPeers(gomock.Any()).MinTimes(2).DoAndReturn(func(_ context.Context) error {
<-waitForCleanup
return nil
})
mStore.EXPECT().CleanTailnetTunnels(gomock.Any()).MinTimes(2).DoAndReturn(func(_ context.Context) error {
<-waitForCleanup
return nil
})

uut := &heartbeats{
ctx: ctx,
Expand All @@ -56,7 +64,7 @@ func TestHeartbeat_Cleanup(t *testing.T) {
}
go uut.cleanupLoop()

for i := 0; i < 2; i++ {
for i := 0; i < 6; i++ {
select {
case <-ctx.Done():
t.Fatal("timeout")
Expand All @@ -67,6 +75,104 @@ func TestHeartbeat_Cleanup(t *testing.T) {
close(waitForCleanup)
}

// TestLostPeerCleanupQueries tests that our SQL queries to clean up lost peers do what we expect,
// that is, clean up peers and associated tunnels that have been lost for over 24 hours.
func TestLostPeerCleanupQueries(t *testing.T) {
t.Parallel()
if !dbtestutil.WillUsePostgres() {
t.Skip("test only with postgres")
}
store, _, sqlDB := dbtestutil.NewDBWithSQLDB(t, dbtestutil.WithDumpOnFailure())
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()

coordID := uuid.New()
_, err := store.UpsertTailnetCoordinator(ctx, coordID)
require.NoError(t, err)

peerID := uuid.New()
_, err = store.UpsertTailnetPeer(ctx, database.UpsertTailnetPeerParams{
ID: peerID,
CoordinatorID: coordID,
Node: []byte("test"),
Status: database.TailnetStatusLost,
})
require.NoError(t, err)

otherID := uuid.New()
_, err = store.UpsertTailnetTunnel(ctx, database.UpsertTailnetTunnelParams{
CoordinatorID: coordID,
SrcID: peerID,
DstID: otherID,
})
require.NoError(t, err)

peers, err := store.GetAllTailnetPeers(ctx)
require.NoError(t, err)
require.Len(t, peers, 1)
require.Equal(t, peerID, peers[0].ID)

tunnels, err := store.GetAllTailnetTunnels(ctx)
require.NoError(t, err)
require.Len(t, tunnels, 1)
require.Equal(t, peerID, tunnels[0].SrcID)
require.Equal(t, otherID, tunnels[0].DstID)

// this clean is a noop since the peer and tunnel are less than 24h old
err = store.CleanTailnetLostPeers(ctx)
require.NoError(t, err)
err = store.CleanTailnetTunnels(ctx)
require.NoError(t, err)

peers, err = store.GetAllTailnetPeers(ctx)
require.NoError(t, err)
require.Len(t, peers, 1)
require.Equal(t, peerID, peers[0].ID)

tunnels, err = store.GetAllTailnetTunnels(ctx)
require.NoError(t, err)
require.Len(t, tunnels, 1)
require.Equal(t, peerID, tunnels[0].SrcID)
require.Equal(t, otherID, tunnels[0].DstID)

// set the age of the tunnel to >24h
sqlDB.Exec("UPDATE tailnet_tunnels SET updated_at = $1", time.Now().Add(-25*time.Hour))

// this clean is still a noop since the peer hasn't been lost for 24 hours
err = store.CleanTailnetLostPeers(ctx)
require.NoError(t, err)
err = store.CleanTailnetTunnels(ctx)
require.NoError(t, err)

peers, err = store.GetAllTailnetPeers(ctx)
require.NoError(t, err)
require.Len(t, peers, 1)
require.Equal(t, peerID, peers[0].ID)

tunnels, err = store.GetAllTailnetTunnels(ctx)
require.NoError(t, err)
require.Len(t, tunnels, 1)
require.Equal(t, peerID, tunnels[0].SrcID)
require.Equal(t, otherID, tunnels[0].DstID)

// set the age of the tunnel to >24h
sqlDB.Exec("UPDATE tailnet_peers SET updated_at = $1", time.Now().Add(-25*time.Hour))

// this clean removes the peer and the associated tunnel
err = store.CleanTailnetLostPeers(ctx)
require.NoError(t, err)
err = store.CleanTailnetTunnels(ctx)
require.NoError(t, err)

peers, err = store.GetAllTailnetPeers(ctx)
require.NoError(t, err)
require.Len(t, peers, 0)

tunnels, err = store.GetAllTailnetTunnels(ctx)
require.NoError(t, err)
require.Len(t, tunnels, 0)
}

func TestDebugTemplate(t *testing.T) {
t.Parallel()
if runtime.GOOS == "windows" {
Expand Down
2 changes: 2 additions & 0 deletions enterprise/tailnet/pgcoord_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,8 @@ func TestPGCoordinator_Unhealthy(t *testing.T) {
Return(database.TailnetCoordinator{}, nil)
// extra calls we don't particularly care about for this test
mStore.EXPECT().CleanTailnetCoordinators(gomock.Any()).AnyTimes().Return(nil)
mStore.EXPECT().CleanTailnetLostPeers(gomock.Any()).AnyTimes().Return(nil)
mStore.EXPECT().CleanTailnetTunnels(gomock.Any()).AnyTimes().Return(nil)
mStore.EXPECT().GetTailnetTunnelPeerIDs(gomock.Any(), gomock.Any()).AnyTimes().Return(nil, nil)
mStore.EXPECT().GetTailnetTunnelPeerBindings(gomock.Any(), gomock.Any()).
AnyTimes().Return(nil, nil)
Expand Down