@@ -186,7 +186,7 @@ func (c *configMaps) close() {
186
186
c .L .Lock ()
187
187
defer c .L .Unlock ()
188
188
for _ , lc := range c .peers {
189
- lc .resetTimer ()
189
+ lc .resetLostTimer ()
190
190
}
191
191
c .closing = true
192
192
c .Broadcast ()
@@ -398,17 +398,7 @@ func (c *configMaps) updatePeerLocked(update *proto.CoordinateResponse_PeerUpdat
398
398
return false
399
399
}
400
400
logger = logger .With (slog .F ("key_id" , node .Key .ShortString ()), slog .F ("node" , node ))
401
- peerStatus , statusOk := status .Peer [node .Key ]
402
- // Starting KeepAlive messages at the initialization of a connection
403
- // causes a race condition. If we send the handshake before the peer has
404
- // our node, we'll have to wait for 5 seconds before trying again.
405
- // Ideally, the first handshake starts when the user first initiates a
406
- // connection to the peer. After a successful connection we enable
407
- // keep alives to persist the connection and keep it from becoming idle.
408
- // SSH connections don't send packets while idle, so we use keep alives
409
- // to avoid random hangs while we set up the connection again after
410
- // inactivity.
411
- node .KeepAlive = (statusOk && peerStatus .Active ) || (peerOk && lc .node != nil && lc .node .KeepAlive )
401
+ node .KeepAlive = c .nodeKeepalive (lc , status , node )
412
402
}
413
403
switch {
414
404
case ! peerOk && update .Kind == proto .CoordinateResponse_PeerUpdate_NODE :
@@ -422,31 +412,27 @@ func (c *configMaps) updatePeerLocked(update *proto.CoordinateResponse_PeerUpdat
422
412
node : node ,
423
413
lastHandshake : lastHandshake ,
424
414
lost : false ,
425
- // If we're receiving a NODE update for a peer we don't already have
426
- // a lifecycle for, it's likely the source of a tunnel. We don't
427
- // need to wait for a READY_FOR_HANDSHAKE.
428
- readyForHandshake : true ,
429
415
}
430
416
c .peers [id ] = lc
431
417
logger .Debug (context .Background (), "adding new peer" )
432
- // since we just got this node, we don't know if it's ready for
433
- // handshakes yet.
434
- return lc .readyForHandshake
418
+ return lc .validForWireguard ()
435
419
case peerOk && update .Kind == proto .CoordinateResponse_PeerUpdate_NODE :
436
420
// update
437
421
if lc .node != nil {
438
422
node .Created = lc .node .Created
439
423
}
440
- dirty = ! lc .node .Equal (node ) && lc . readyForHandshake
424
+ dirty = ! lc .node .Equal (node )
441
425
lc .node = node
426
+ // validForWireguard checks that the node is non-nil, so should be
427
+ // called after we update the node.
428
+ dirty = dirty && lc .validForWireguard ()
442
429
lc .lost = false
443
- lc .resetTimer ()
444
- if lc .isDestination {
445
- if ! lc .readyForHandshake {
446
- lc .setReadyForHandshakeTimer (c )
447
- } else {
448
- lc .node .KeepAlive = true
449
- }
430
+ lc .resetLostTimer ()
431
+ if lc .isDestination && ! lc .readyForHandshake {
432
+ // We received the node of a destination peer before we've received
433
+ // their READY_FOR_HANDSHAKE. Set a timer
434
+ lc .setReadyForHandshakeTimer (c )
435
+ logger .Debug (context .Background (), "setting ready for handshake timeout" )
450
436
}
451
437
logger .Debug (context .Background (), "node update to existing peer" , slog .F ("dirty" , dirty ))
452
438
return dirty
@@ -455,7 +441,6 @@ func (c *configMaps) updatePeerLocked(update *proto.CoordinateResponse_PeerUpdat
455
441
lc .readyForHandshake = true
456
442
if lc .readyForHandshakeTimer != nil {
457
443
lc .readyForHandshakeTimer .Stop ()
458
- lc .readyForHandshakeTimer = nil
459
444
}
460
445
if lc .node != nil {
461
446
dirty = dirty || ! lc .node .KeepAlive
@@ -475,16 +460,14 @@ func (c *configMaps) updatePeerLocked(update *proto.CoordinateResponse_PeerUpdat
475
460
lost : true ,
476
461
readyForHandshake : true ,
477
462
}
478
- // Timeout the peer in case we never receive a NODE update for it.
479
- lc .setLostTimer (c )
480
463
c .peers [id ] = lc
481
464
return false
482
465
case ! peerOk :
483
466
// disconnected or lost, but we don't have the node. No op
484
467
logger .Debug (context .Background (), "skipping update for peer we don't recognize" )
485
468
return false
486
469
case update .Kind == proto .CoordinateResponse_PeerUpdate_DISCONNECTED :
487
- lc .resetTimer ()
470
+ lc .resetLostTimer ()
488
471
delete (c .peers , id )
489
472
logger .Debug (context .Background (), "disconnected peer" )
490
473
return true
@@ -607,39 +590,58 @@ func (c *configMaps) fillPeerDiagnostics(d *PeerDiagnostics, peerID uuid.UUID) {
607
590
}
608
591
}
609
592
lc , ok := c .peers [peerID ]
610
- if ! ok {
593
+ if ! ok || lc . node == nil {
611
594
return
612
595
}
613
596
614
597
d .ReceivedNode = lc .node
615
- if lc .node != nil {
616
- ps , ok := status .Peer [lc .node .Key ]
617
- if ! ok {
618
- return
619
- }
620
- d .LastWireguardHandshake = ps .LastHandshake
598
+ ps , ok := status .Peer [lc .node .Key ]
599
+ if ! ok {
600
+ return
621
601
}
622
- return
602
+ d . LastWireguardHandshake = ps . LastHandshake
623
603
}
624
604
625
605
func (c * configMaps ) peerReadyForHandshakeTimeout (peerID uuid.UUID ) {
606
+ logger := c .logger .With (slog .F ("peer_id" , peerID ))
607
+ logger .Debug (context .Background (), "peer ready for handshake timeout" )
626
608
c .L .Lock ()
627
609
defer c .L .Unlock ()
628
610
lc , ok := c .peers [peerID ]
629
611
if ! ok {
612
+ logger .Debug (context .Background (),
613
+ "ready for handshake timeout triggered for peer that is removed from the map" )
630
614
return
631
615
}
632
- if lc .readyForHandshakeTimer != nil {
633
- wasReady := lc .readyForHandshake
634
- lc .readyForHandshakeTimer = nil
635
- lc .readyForHandshake = true
636
- if ! wasReady {
637
- c .netmapDirty = true
638
- c .Broadcast ()
639
- }
616
+
617
+ wasReady := lc .readyForHandshake
618
+ lc .readyForHandshake = true
619
+ if ! wasReady {
620
+ logger .Info (context .Background (), "setting peer ready for handshake after timeout" )
621
+ c .netmapDirty = true
622
+ c .Broadcast ()
640
623
}
641
624
}
642
625
626
+ func (* configMaps ) nodeKeepalive (lc * peerLifecycle , status * ipnstate.Status , node * tailcfg.Node ) bool {
627
+ // If the peer is already active, keepalives should be enabled.
628
+ if peerStatus , statusOk := status .Peer [node .Key ]; statusOk && peerStatus .Active {
629
+ return true
630
+ }
631
+ // If the peer is a destination, we should only enable keepalives if we've
632
+ // received the READY_FOR_HANDSHAKE.
633
+ if lc != nil && lc .isDestination && lc .readyForHandshake {
634
+ return true
635
+ }
636
+ // If keepalives are already enabled on the node, keep them enabled.
637
+ if lc != nil && lc .node != nil && lc .node .KeepAlive {
638
+ return true
639
+ }
640
+
641
+ // If none of the above are true, keepalives should not be enabled.
642
+ return false
643
+ }
644
+
643
645
type peerLifecycle struct {
644
646
peerID uuid.UUID
645
647
// isDestination specifies if the peer is a destination, meaning we
@@ -658,7 +660,7 @@ type peerLifecycle struct {
658
660
readyForHandshakeTimer * clock.Timer
659
661
}
660
662
661
- func (l * peerLifecycle ) resetTimer () {
663
+ func (l * peerLifecycle ) resetLostTimer () {
662
664
if l .lostTimer != nil {
663
665
l .lostTimer .Stop ()
664
666
l .lostTimer = nil
@@ -678,13 +680,28 @@ func (l *peerLifecycle) setLostTimer(c *configMaps) {
678
680
})
679
681
}
680
682
683
+ const readyForHandshakeTimeout = 5 * time .Second
684
+
681
685
func (l * peerLifecycle ) setReadyForHandshakeTimer (c * configMaps ) {
682
- l .readyForHandshakeTimer = c .clock .AfterFunc (5 * time .Second , func () {
686
+ if l .readyForHandshakeTimer != nil {
687
+ l .readyForHandshakeTimer .Stop ()
688
+ }
689
+ l .readyForHandshakeTimer = c .clock .AfterFunc (readyForHandshakeTimeout , func () {
683
690
c .logger .Debug (context .Background (), "ready for handshake timeout" , slog .F ("peer_id" , l .peerID ))
684
691
c .peerReadyForHandshakeTimeout (l .peerID )
685
692
})
686
693
}
687
694
695
+ // validForWireguard returns true if the peer is ready to be programmed into
696
+ // wireguard.
697
+ func (l * peerLifecycle ) validForWireguard () bool {
698
+ valid := l .node != nil
699
+ if l .isDestination {
700
+ return valid && l .readyForHandshake
701
+ }
702
+ return valid
703
+ }
704
+
688
705
// prefixesDifferent returns true if the two slices contain different prefixes
689
706
// where order doesn't matter.
690
707
func prefixesDifferent (a , b []netip.Prefix ) bool {
0 commit comments