@@ -186,7 +186,7 @@ func (c *configMaps) close() {
186
186
c .L .Lock ()
187
187
defer c .L .Unlock ()
188
188
for _ , lc := range c .peers {
189
- lc .resetTimer ()
189
+ lc .resetLostTimer ()
190
190
}
191
191
c .closing = true
192
192
c .Broadcast ()
@@ -216,6 +216,12 @@ func (c *configMaps) netMapLocked() *netmap.NetworkMap {
216
216
func (c * configMaps ) peerConfigLocked () []* tailcfg.Node {
217
217
out := make ([]* tailcfg.Node , 0 , len (c .peers ))
218
218
for _ , p := range c .peers {
219
+ // Don't add nodes that we haven't received a READY_FOR_HANDSHAKE for
220
+ // yet, if they're a destination. If we received a READY_FOR_HANDSHAKE
221
+ // for a peer before we receive their node, the node will be nil.
222
+ if (! p .readyForHandshake && p .isDestination ) || p .node == nil {
223
+ continue
224
+ }
219
225
n := p .node .Clone ()
220
226
if c .blockEndpoints {
221
227
n .Endpoints = nil
@@ -225,6 +231,19 @@ func (c *configMaps) peerConfigLocked() []*tailcfg.Node {
225
231
return out
226
232
}
227
233
234
+ func (c * configMaps ) setTunnelDestination (id uuid.UUID ) {
235
+ c .L .Lock ()
236
+ defer c .L .Unlock ()
237
+ lc , ok := c .peers [id ]
238
+ if ! ok {
239
+ lc = & peerLifecycle {
240
+ peerID : id ,
241
+ }
242
+ c .peers [id ] = lc
243
+ }
244
+ lc .isDestination = true
245
+ }
246
+
228
247
// setAddresses sets the addresses belonging to this node to the given slice. It
229
248
// triggers configuration of the engine if the addresses have changed.
230
249
// c.L MUST NOT be held.
@@ -331,8 +350,10 @@ func (c *configMaps) updatePeers(updates []*proto.CoordinateResponse_PeerUpdate)
331
350
// worry about them being up-to-date when handling updates below, and it covers
332
351
// all peers, not just the ones we got updates about.
333
352
for _ , lc := range c .peers {
334
- if peerStatus , ok := status .Peer [lc .node .Key ]; ok {
335
- lc .lastHandshake = peerStatus .LastHandshake
353
+ if lc .node != nil {
354
+ if peerStatus , ok := status .Peer [lc .node .Key ]; ok {
355
+ lc .lastHandshake = peerStatus .LastHandshake
356
+ }
336
357
}
337
358
}
338
359
@@ -363,7 +384,7 @@ func (c *configMaps) updatePeerLocked(update *proto.CoordinateResponse_PeerUpdat
363
384
return false
364
385
}
365
386
logger := c .logger .With (slog .F ("peer_id" , id ))
366
- lc , ok := c .peers [id ]
387
+ lc , peerOk := c .peers [id ]
367
388
var node * tailcfg.Node
368
389
if update .Kind == proto .CoordinateResponse_PeerUpdate_NODE {
369
390
// If no preferred DERP is provided, we can't reach the node.
@@ -377,48 +398,76 @@ func (c *configMaps) updatePeerLocked(update *proto.CoordinateResponse_PeerUpdat
377
398
return false
378
399
}
379
400
logger = logger .With (slog .F ("key_id" , node .Key .ShortString ()), slog .F ("node" , node ))
380
- peerStatus , ok := status .Peer [node .Key ]
381
- // Starting KeepAlive messages at the initialization of a connection
382
- // causes a race condition. If we send the handshake before the peer has
383
- // our node, we'll have to wait for 5 seconds before trying again.
384
- // Ideally, the first handshake starts when the user first initiates a
385
- // connection to the peer. After a successful connection we enable
386
- // keep alives to persist the connection and keep it from becoming idle.
387
- // SSH connections don't send packets while idle, so we use keep alives
388
- // to avoid random hangs while we set up the connection again after
389
- // inactivity.
390
- node .KeepAlive = ok && peerStatus .Active
401
+ node .KeepAlive = c .nodeKeepalive (lc , status , node )
391
402
}
392
403
switch {
393
- case ! ok && update .Kind == proto .CoordinateResponse_PeerUpdate_NODE :
404
+ case ! peerOk && update .Kind == proto .CoordinateResponse_PeerUpdate_NODE :
394
405
// new!
395
406
var lastHandshake time.Time
396
407
if ps , ok := status .Peer [node .Key ]; ok {
397
408
lastHandshake = ps .LastHandshake
398
409
}
399
- c . peers [ id ] = & peerLifecycle {
410
+ lc = & peerLifecycle {
400
411
peerID : id ,
401
412
node : node ,
402
413
lastHandshake : lastHandshake ,
403
414
lost : false ,
404
415
}
416
+ c .peers [id ] = lc
405
417
logger .Debug (context .Background (), "adding new peer" )
406
- return true
407
- case ok && update .Kind == proto .CoordinateResponse_PeerUpdate_NODE :
418
+ return lc . validForWireguard ()
419
+ case peerOk && update .Kind == proto .CoordinateResponse_PeerUpdate_NODE :
408
420
// update
409
- node .Created = lc .node .Created
421
+ if lc .node != nil {
422
+ node .Created = lc .node .Created
423
+ }
410
424
dirty = ! lc .node .Equal (node )
411
425
lc .node = node
426
+ // validForWireguard checks that the node is non-nil, so should be
427
+ // called after we update the node.
428
+ dirty = dirty && lc .validForWireguard ()
412
429
lc .lost = false
413
- lc .resetTimer ()
430
+ lc .resetLostTimer ()
431
+ if lc .isDestination && ! lc .readyForHandshake {
432
+ // We received the node of a destination peer before we've received
433
+ // their READY_FOR_HANDSHAKE. Set a timer so the peer is marked ready anyway if it never arrives.
434
+ lc .setReadyForHandshakeTimer (c )
435
+ logger .Debug (context .Background (), "setting ready for handshake timeout" )
436
+ }
414
437
logger .Debug (context .Background (), "node update to existing peer" , slog .F ("dirty" , dirty ))
415
438
return dirty
416
- case ! ok :
439
+ case peerOk && update .Kind == proto .CoordinateResponse_PeerUpdate_READY_FOR_HANDSHAKE :
440
+ dirty := ! lc .readyForHandshake
441
+ lc .readyForHandshake = true
442
+ if lc .readyForHandshakeTimer != nil {
443
+ lc .readyForHandshakeTimer .Stop ()
444
+ }
445
+ if lc .node != nil {
446
+ old := lc .node .KeepAlive
447
+ lc .node .KeepAlive = c .nodeKeepalive (lc , status , lc .node )
448
+ dirty = dirty || (old != lc .node .KeepAlive )
449
+ }
450
+ logger .Debug (context .Background (), "peer ready for handshake" )
451
+ // only force a reconfig if the node is populated
452
+ return dirty && lc .node != nil
453
+ case ! peerOk && update .Kind == proto .CoordinateResponse_PeerUpdate_READY_FOR_HANDSHAKE :
454
+ // When we receive a READY_FOR_HANDSHAKE for a peer we don't know about,
455
+ // we create a peerLifecycle with the peerID and set readyForHandshake
456
+ // to true. Eventually we should receive a NODE update for this peer,
457
+ // and it'll be programmed into wireguard.
458
+ logger .Debug (context .Background (), "got peer ready for handshake for unknown peer" )
459
+ lc = & peerLifecycle {
460
+ peerID : id ,
461
+ readyForHandshake : true ,
462
+ }
463
+ c .peers [id ] = lc
464
+ return false
465
+ case ! peerOk :
417
466
// disconnected or lost, but we don't have the node. No op
418
467
logger .Debug (context .Background (), "skipping update for peer we don't recognize" )
419
468
return false
420
469
case update .Kind == proto .CoordinateResponse_PeerUpdate_DISCONNECTED :
421
- lc .resetTimer ()
470
+ lc .resetLostTimer ()
422
471
delete (c .peers , id )
423
472
logger .Debug (context .Background (), "disconnected peer" )
424
473
return true
@@ -476,10 +525,12 @@ func (c *configMaps) peerLostTimeout(id uuid.UUID) {
476
525
"timeout triggered for peer that is removed from the map" )
477
526
return
478
527
}
479
- if peerStatus , ok := status .Peer [lc .node .Key ]; ok {
480
- lc .lastHandshake = peerStatus .LastHandshake
528
+ if lc .node != nil {
529
+ if peerStatus , ok := status .Peer [lc .node .Key ]; ok {
530
+ lc .lastHandshake = peerStatus .LastHandshake
531
+ }
532
+ logger = logger .With (slog .F ("key_id" , lc .node .Key .ShortString ()))
481
533
}
482
- logger = logger .With (slog .F ("key_id" , lc .node .Key .ShortString ()))
483
534
if ! lc .lost {
484
535
logger .Debug (context .Background (),
485
536
"timeout triggered for peer that is no longer lost" )
@@ -522,7 +573,7 @@ func (c *configMaps) nodeAddresses(publicKey key.NodePublic) ([]netip.Prefix, bo
522
573
c .L .Lock ()
523
574
defer c .L .Unlock ()
524
575
for _ , lc := range c .peers {
525
- if lc .node .Key == publicKey {
576
+ if lc .node != nil && lc . node .Key == publicKey {
526
577
return lc .node .Addresses , true
527
578
}
528
579
}
@@ -539,9 +590,10 @@ func (c *configMaps) fillPeerDiagnostics(d *PeerDiagnostics, peerID uuid.UUID) {
539
590
}
540
591
}
541
592
lc , ok := c .peers [peerID ]
542
- if ! ok {
593
+ if ! ok || lc . node == nil {
543
594
return
544
595
}
596
+
545
597
d .ReceivedNode = lc .node
546
598
ps , ok := status .Peer [lc .node .Key ]
547
599
if ! ok {
@@ -550,34 +602,102 @@ func (c *configMaps) fillPeerDiagnostics(d *PeerDiagnostics, peerID uuid.UUID) {
550
602
d .LastWireguardHandshake = ps .LastHandshake
551
603
}
552
604
605
+ func (c * configMaps ) peerReadyForHandshakeTimeout (peerID uuid.UUID ) {
606
+ logger := c .logger .With (slog .F ("peer_id" , peerID ))
607
+ logger .Debug (context .Background (), "peer ready for handshake timeout" )
608
+ c .L .Lock ()
609
+ defer c .L .Unlock ()
610
+ lc , ok := c .peers [peerID ]
611
+ if ! ok {
612
+ logger .Debug (context .Background (),
613
+ "ready for handshake timeout triggered for peer that is removed from the map" )
614
+ return
615
+ }
616
+
617
+ wasReady := lc .readyForHandshake
618
+ lc .readyForHandshake = true
619
+ if ! wasReady {
620
+ logger .Info (context .Background (), "setting peer ready for handshake after timeout" )
621
+ c .netmapDirty = true
622
+ c .Broadcast ()
623
+ }
624
+ }
625
+
626
+ func (* configMaps ) nodeKeepalive (lc * peerLifecycle , status * ipnstate.Status , node * tailcfg.Node ) bool {
627
+ // If the peer is already active, keepalives should be enabled.
628
+ if peerStatus , statusOk := status .Peer [node .Key ]; statusOk && peerStatus .Active {
629
+ return true
630
+ }
631
+ // If the peer is a destination, we should only enable keepalives if we've
632
+ // received the READY_FOR_HANDSHAKE.
633
+ if lc != nil && lc .isDestination && lc .readyForHandshake {
634
+ return true
635
+ }
636
+
637
+ // If none of the above are true, keepalives should not be enabled.
638
+ return false
639
+ }
640
+
553
641
type peerLifecycle struct {
554
- peerID uuid.UUID
555
- node * tailcfg.Node
556
- lost bool
557
- lastHandshake time.Time
558
- timer * clock.Timer
642
+ peerID uuid.UUID
643
+ // isDestination specifies if the peer is a destination, meaning we
644
+ // initiated a tunnel to the peer. When the peer is a destination, we do not
645
+ // respond to node updates with `READY_FOR_HANDSHAKE`s, and we wait to
646
+ // program the peer into wireguard until we receive a READY_FOR_HANDSHAKE
647
+ // from the peer or the timeout is reached.
648
+ isDestination bool
649
+ // node is the tailcfg.Node for the peer. It may be nil until we receive a
650
+ // NODE update for it.
651
+ node * tailcfg.Node
652
+ lost bool
653
+ lastHandshake time.Time
654
+ lostTimer * clock.Timer
655
+ readyForHandshake bool
656
+ readyForHandshakeTimer * clock.Timer
559
657
}
560
658
561
- func (l * peerLifecycle ) resetTimer () {
562
- if l .timer != nil {
563
- l .timer .Stop ()
564
- l .timer = nil
659
+ func (l * peerLifecycle ) resetLostTimer () {
660
+ if l .lostTimer != nil {
661
+ l .lostTimer .Stop ()
662
+ l .lostTimer = nil
565
663
}
566
664
}
567
665
568
666
func (l * peerLifecycle ) setLostTimer (c * configMaps ) {
569
- if l .timer != nil {
570
- l .timer .Stop ()
667
+ if l .lostTimer != nil {
668
+ l .lostTimer .Stop ()
571
669
}
572
670
ttl := lostTimeout - c .clock .Since (l .lastHandshake )
573
671
if ttl <= 0 {
574
672
ttl = time .Nanosecond
575
673
}
576
- l .timer = c .clock .AfterFunc (ttl , func () {
674
+ l .lostTimer = c .clock .AfterFunc (ttl , func () {
577
675
c .peerLostTimeout (l .peerID )
578
676
})
579
677
}
580
678
679
+ const readyForHandshakeTimeout = 5 * time .Second
680
+
681
+ func (l * peerLifecycle ) setReadyForHandshakeTimer (c * configMaps ) {
682
+ if l .readyForHandshakeTimer != nil {
683
+ l .readyForHandshakeTimer .Stop ()
684
+ }
685
+ l .readyForHandshakeTimer = c .clock .AfterFunc (readyForHandshakeTimeout , func () {
686
+ c .logger .Debug (context .Background (), "ready for handshake timeout" , slog .F ("peer_id" , l .peerID ))
687
+ c .peerReadyForHandshakeTimeout (l .peerID )
688
+ })
689
+ }
690
+
691
+ // validForWireguard returns true if the peer is ready to be programmed into
692
+ // wireguard.
693
+ func (l * peerLifecycle ) validForWireguard () bool {
694
+ valid := l .node != nil
695
+ if l .isDestination {
696
+ return valid && l .readyForHandshake
697
+ }
698
+ return valid
699
+ }
700
+
581
701
// prefixesDifferent returns true if the two slices contain different prefixes
582
702
// where order doesn't matter.
583
703
func prefixesDifferent (a , b []netip.Prefix ) bool {
0 commit comments