diff --git a/agent/agent.go b/agent/agent.go index 18243ee788789..16badf1034d2c 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -272,6 +272,15 @@ func (a *agent) runTailnet(ctx context.Context, derpMap *tailcfg.DERPMap) { // runCoordinator listens for nodes and updates the self-node as it changes. func (a *agent) runCoordinator(ctx context.Context) { + for { + reconnect := a.runCoordinatorWithRetry(ctx) + if !reconnect { + return + } + } +} + +func (a *agent) runCoordinatorWithRetry(ctx context.Context) (reconnect bool) { var coordinator net.Conn var err error // An exponential back-off occurs when the connection is failing to dial. @@ -280,38 +289,38 @@ func (a *agent) runCoordinator(ctx context.Context) { coordinator, err = a.coordinatorDialer(ctx) if err != nil { if errors.Is(err, context.Canceled) { - return + return false } if a.isClosed() { - return + return false } a.logger.Warn(context.Background(), "failed to dial", slog.Error(err)) continue } + //nolint:revive // Defer is ok because we're exiting this loop. + defer coordinator.Close() a.logger.Info(context.Background(), "connected to coordination server") break } select { case <-ctx.Done(): - return + return false default: } - defer coordinator.Close() sendNodes, errChan := tailnet.ServeCoordinator(coordinator, a.network.UpdateNodes) a.network.SetNodeCallback(sendNodes) select { case <-ctx.Done(): - return + return false case err := <-errChan: if a.isClosed() { - return + return false } if errors.Is(err, context.Canceled) { - return + return false } a.logger.Debug(ctx, "node broker accept exited; restarting connection", slog.Error(err)) - a.runCoordinator(ctx) - return + return true } }