Skip to content

Commit 1724cbf

Browse files
authored
feat: automatically use websockets if DERP upgrade is unavailable (#6381)
* feat: automatically use websockets if DERP upgrade is unavailable This might be our biggest hangup for deployments at the moment... Load balancers by default do not support the DERP protocol, so many of our prospects and customers run into failing workspace connections. This automatically swaps to use WebSockets, and reports the reason to coderd. In a future contribution, a warning will appear by the agent if it was forced to use WebSockets instead of DERP. * Fix nil pointer type in Tailscale dep * Fix requested changes
1 parent ce11400 commit 1724cbf

File tree

9 files changed

+300
-29
lines changed

9 files changed

+300
-29
lines changed

coderd/coderd.go

+6-1
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,9 @@ func New(options *Options) *API {
284284
// replicas or instances of this middleware.
285285
apiRateLimiter := httpmw.RateLimit(options.APIRateLimit, time.Minute)
286286

287+
derpHandler := derphttp.Handler(api.DERPServer)
288+
derpHandler, api.derpCloseFunc = tailnet.WithWebsocketSupport(api.DERPServer, derpHandler)
289+
287290
r.Use(
288291
httpmw.Recover(api.Logger),
289292
tracing.StatusWriterMiddleware,
@@ -363,7 +366,7 @@ func New(options *Options) *API {
363366
r.Route("/%40{user}/{workspace_and_agent}/apps/{workspaceapp}", apps)
364367
r.Route("/@{user}/{workspace_and_agent}/apps/{workspaceapp}", apps)
365368
r.Route("/derp", func(r chi.Router) {
366-
r.Get("/", derphttp.Handler(api.DERPServer).ServeHTTP)
369+
r.Get("/", derpHandler.ServeHTTP)
367370
// This is used when UDP is blocked, and latency must be checked via HTTP(s).
368371
r.Get("/latency-check", func(w http.ResponseWriter, r *http.Request) {
369372
w.WriteHeader(http.StatusOK)
@@ -726,6 +729,7 @@ type API struct {
726729

727730
WebsocketWaitMutex sync.Mutex
728731
WebsocketWaitGroup sync.WaitGroup
732+
derpCloseFunc func()
729733

730734
metricsCache *metricscache.Cache
731735
workspaceAgentCache *wsconncache.Cache
@@ -739,6 +743,7 @@ type API struct {
739743
// Close waits for all WebSocket connections to drain before returning.
740744
func (api *API) Close() error {
741745
api.cancel()
746+
api.derpCloseFunc()
742747

743748
api.WebsocketWaitMutex.Lock()
744749
api.WebsocketWaitGroup.Wait()

go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ replace github.com/tcnksm/go-httpstat => github.com/kylecarbs/go-httpstat v0.0.0
4040

4141
// There are a few minor changes we make to Tailscale that we're slowly upstreaming. Compare here:
4242
// https://github.com/tailscale/tailscale/compare/main...coder:tailscale:main
43-
replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20230210022917-446fc10e755a
43+
replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20230301203426-fb16ae7c5bba
4444

4545
// Switch to our fork that imports fixes from http://github.com/tailscale/ssh.
4646
// See: https://github.com/coder/coder/issues/3371

go.sum

+28
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,34 @@ github.com/coder/ssh v0.0.0-20220811105153-fcea99919338 h1:tN5GKFT68YLVzJoA8AHui
379379
github.com/coder/ssh v0.0.0-20220811105153-fcea99919338/go.mod h1:ZSS+CUoKHDrqVakTfTWUlKSr9MtMFkC4UvtQKD7O914=
380380
github.com/coder/tailscale v1.1.1-0.20230210022917-446fc10e755a h1:fTXoK+Yikz8Rl0v0nHxO+lTV0y8Q7wdmGFq1CqLgznE=
381381
github.com/coder/tailscale v1.1.1-0.20230210022917-446fc10e755a/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
382+
github.com/coder/tailscale v1.1.1-0.20230228183907-71829ca8456b h1:i8R1RwDqLihavxsaTovKCax45AJLy2dtXI0PcLm+BnM=
383+
github.com/coder/tailscale v1.1.1-0.20230228183907-71829ca8456b/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
384+
github.com/coder/tailscale v1.1.1-0.20230228184228-8e7815556b85 h1:tg89NZBDe91wWTeII4jFThHdB6USE3mBNXzK0/CWODM=
385+
github.com/coder/tailscale v1.1.1-0.20230228184228-8e7815556b85/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
386+
github.com/coder/tailscale v1.1.1-0.20230228184330-c2cd7d4f5a70 h1:gGevbQLqz4fWk5HytFodNe/Bl8rqgHb2+YsRoaZPf1g=
387+
github.com/coder/tailscale v1.1.1-0.20230228184330-c2cd7d4f5a70/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
388+
github.com/coder/tailscale v1.1.1-0.20230228184508-acb075f559ce h1:oAYRHRM8/rPrQhcPiXBkLw/YU/NyPhhN+HypRJWSq2k=
389+
github.com/coder/tailscale v1.1.1-0.20230228184508-acb075f559ce/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
390+
github.com/coder/tailscale v1.1.1-0.20230228184618-0e3e4965c416 h1:25uoFv5YNvve9r0BEGJAJHVWulyPmPPvXp6S3b0085o=
391+
github.com/coder/tailscale v1.1.1-0.20230228184618-0e3e4965c416/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
392+
github.com/coder/tailscale v1.1.1-0.20230228185337-873afd546fb2 h1:yrKQPYAadmtrj5J33Fs/Ldx3FgXCfd0gUbcFqjpsq+I=
393+
github.com/coder/tailscale v1.1.1-0.20230228185337-873afd546fb2/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
394+
github.com/coder/tailscale v1.1.1-0.20230228185607-b6042b55609a h1:aqp8CUtVBcCLDI369ORzbVt/pfOt0mTZBxIAejgZkgY=
395+
github.com/coder/tailscale v1.1.1-0.20230228185607-b6042b55609a/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
396+
github.com/coder/tailscale v1.1.1-0.20230228194335-31045cf53c85 h1:MHJDOehTwolaeT8FQrBQ5pNEdWXO+8pWTy5ODIYOZUU=
397+
github.com/coder/tailscale v1.1.1-0.20230228194335-31045cf53c85/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
398+
github.com/coder/tailscale v1.1.1-0.20230228194714-66eac1a43e5c h1:WylIWZ+8UpiW2NPTu6z2sgFHwqo9/XmT/QTNWT47qic=
399+
github.com/coder/tailscale v1.1.1-0.20230228194714-66eac1a43e5c/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
400+
github.com/coder/tailscale v1.1.1-0.20230228200853-ddedcafad3a1 h1:XgOpzrEiDR4GzoKE4dhqopZAg9MrG1cXPMc9/PXA6kk=
401+
github.com/coder/tailscale v1.1.1-0.20230228200853-ddedcafad3a1/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
402+
github.com/coder/tailscale v1.1.1-0.20230228204054-70dbfd7be018 h1:pv80tzqexcjrP/4CxFpalrk6bS2XJ2xOGA00u1WUy+g=
403+
github.com/coder/tailscale v1.1.1-0.20230228204054-70dbfd7be018/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
404+
github.com/coder/tailscale v1.1.1-0.20230228215233-ba13d5ceee2a h1:eXE/5hMkcHfa3dcJJ4WkseSbJNU4thMfC8cUMIAgi2s=
405+
github.com/coder/tailscale v1.1.1-0.20230228215233-ba13d5ceee2a/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
406+
github.com/coder/tailscale v1.1.1-0.20230228220447-d27e5d3056e9 h1:bFcFXLUUi+cwgOqrjbXN+XmI7QvB1up/UZoNF1+9nuM=
407+
github.com/coder/tailscale v1.1.1-0.20230228220447-d27e5d3056e9/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
408+
github.com/coder/tailscale v1.1.1-0.20230301203426-fb16ae7c5bba h1:JOD5pqNtiz9lkSX74PY2BJOyNqsBmvGUjFGIuECtG+o=
409+
github.com/coder/tailscale v1.1.1-0.20230301203426-fb16ae7c5bba/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
382410
github.com/coder/terraform-provider-coder v0.6.14 h1:NsJ1mo0MN1x/VyNLYmsgPUYn2JgzdVNZBqnj9OSIDgY=
383411
github.com/coder/terraform-provider-coder v0.6.14/go.mod h1:UIfU3bYNeSzJJvHyJ30tEKjD6Z9utloI+HUM/7n94CY=
384412
github.com/containerd/aufs v0.0.0-20200908144142-dab0cbea06f4/go.mod h1:nukgQABAEopAHvB6j7cnP5zJ+/3aVcE7hCYqvIwAHyE=

tailnet/conn.go

+42-26
Original file line numberDiff line numberDiff line change
@@ -192,19 +192,20 @@ func NewConn(options *Options) (conn *Conn, err error) {
192192
wireguardEngine.SetFilter(filter.New(netMap.PacketFilter, localIPs, logIPs, nil, Logger(options.Logger.Named("packet-filter"))))
193193
dialContext, dialCancel := context.WithCancel(context.Background())
194194
server := &Conn{
195-
blockEndpoints: options.BlockEndpoints,
196-
dialContext: dialContext,
197-
dialCancel: dialCancel,
198-
closed: make(chan struct{}),
199-
logger: options.Logger,
200-
magicConn: magicConn,
201-
dialer: dialer,
202-
listeners: map[listenKey]*listener{},
203-
peerMap: map[tailcfg.NodeID]*tailcfg.Node{},
204-
tunDevice: tunDevice,
205-
netMap: netMap,
206-
netStack: netStack,
207-
wireguardMonitor: wireguardMonitor,
195+
blockEndpoints: options.BlockEndpoints,
196+
dialContext: dialContext,
197+
dialCancel: dialCancel,
198+
closed: make(chan struct{}),
199+
logger: options.Logger,
200+
magicConn: magicConn,
201+
dialer: dialer,
202+
listeners: map[listenKey]*listener{},
203+
peerMap: map[tailcfg.NodeID]*tailcfg.Node{},
204+
lastDERPForcedWebsockets: map[int]string{},
205+
tunDevice: tunDevice,
206+
netMap: netMap,
207+
netStack: netStack,
208+
wireguardMonitor: wireguardMonitor,
208209
wireguardRouter: &router.Config{
209210
LocalAddrs: netMap.Addresses,
210211
},
@@ -247,6 +248,17 @@ func NewConn(options *Options) (conn *Conn, err error) {
247248
server.lastMutex.Unlock()
248249
server.sendNode()
249250
})
251+
magicConn.SetDERPForcedWebsocketCallback(func(region int, reason string) {
252+
server.logger.Debug(context.Background(), "derp forced websocket", slog.F("region", region), slog.F("reason", reason))
253+
server.lastMutex.Lock()
254+
if server.lastDERPForcedWebsockets[region] == reason {
255+
server.lastMutex.Unlock()
256+
return
257+
}
258+
server.lastDERPForcedWebsockets[region] = reason
259+
server.lastMutex.Unlock()
260+
server.sendNode()
261+
})
250262
netStack.ForwardTCPIn = server.forwardTCP
251263

252264
err = netStack.Start(nil)
@@ -299,10 +311,11 @@ type Conn struct {
299311
nodeChanged bool
300312
// It's only possible to store these values via status functions,
301313
// so the values must be stored for retrieval later on.
302-
lastStatus time.Time
303-
lastEndpoints []tailcfg.Endpoint
304-
lastNetInfo *tailcfg.NetInfo
305-
nodeCallback func(node *Node)
314+
lastStatus time.Time
315+
lastEndpoints []tailcfg.Endpoint
316+
lastDERPForcedWebsockets map[int]string
317+
lastNetInfo *tailcfg.NetInfo
318+
nodeCallback func(node *Node)
306319

307320
trafficStats *connstats.Statistics
308321
}
@@ -632,21 +645,24 @@ func (c *Conn) selfNode() *Node {
632645
}
633646
var preferredDERP int
634647
var derpLatency map[string]float64
648+
var derpForcedWebsocket map[int]string
635649
if c.lastNetInfo != nil {
636650
preferredDERP = c.lastNetInfo.PreferredDERP
637651
derpLatency = c.lastNetInfo.DERPLatency
652+
derpForcedWebsocket = c.lastDERPForcedWebsockets
638653
}
639654

640655
node := &Node{
641-
ID: c.netMap.SelfNode.ID,
642-
AsOf: database.Now(),
643-
Key: c.netMap.SelfNode.Key,
644-
Addresses: c.netMap.SelfNode.Addresses,
645-
AllowedIPs: c.netMap.SelfNode.AllowedIPs,
646-
DiscoKey: c.magicConn.DiscoPublicKey(),
647-
Endpoints: endpoints,
648-
PreferredDERP: preferredDERP,
649-
DERPLatency: derpLatency,
656+
ID: c.netMap.SelfNode.ID,
657+
AsOf: database.Now(),
658+
Key: c.netMap.SelfNode.Key,
659+
Addresses: c.netMap.SelfNode.Addresses,
660+
AllowedIPs: c.netMap.SelfNode.AllowedIPs,
661+
DiscoKey: c.magicConn.DiscoPublicKey(),
662+
Endpoints: endpoints,
663+
PreferredDERP: preferredDERP,
664+
DERPLatency: derpLatency,
665+
DERPForcedWebsocket: derpForcedWebsocket,
650666
}
651667
if c.blockEndpoints {
652668
node.Endpoints = nil

tailnet/conn_test.go

+82-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"cdr.dev/slog/sloggers/slogtest"
1414
"github.com/coder/coder/tailnet"
1515
"github.com/coder/coder/tailnet/tailnettest"
16+
"github.com/coder/coder/testutil"
1617
)
1718

1819
func TestMain(m *testing.M) {
@@ -63,7 +64,7 @@ func TestTailnet(t *testing.T) {
6364
assert.NoError(t, err)
6465
})
6566
require.True(t, w2.AwaitReachable(context.Background(), w1IP))
66-
conn := make(chan struct{})
67+
conn := make(chan struct{}, 1)
6768
go func() {
6869
listener, err := w1.Listen("tcp", ":35565")
6970
assert.NoError(t, err)
@@ -81,6 +82,86 @@ func TestTailnet(t *testing.T) {
8182
_ = nc.Close()
8283
<-conn
8384

85+
nodes := make(chan *tailnet.Node, 1)
86+
w2.SetNodeCallback(func(node *tailnet.Node) {
87+
select {
88+
case nodes <- node:
89+
default:
90+
}
91+
})
92+
node := <-nodes
93+
// Ensure this connected over DERP!
94+
require.Len(t, node.DERPForcedWebsocket, 0)
95+
96+
w1.Close()
97+
w2.Close()
98+
})
99+
100+
t.Run("ForcesWebSockets", func(t *testing.T) {
101+
t.Parallel()
102+
ctx, cancelFunc := testutil.Context(t)
103+
defer cancelFunc()
104+
105+
w1IP := tailnet.IP()
106+
derpMap := tailnettest.RunDERPOnlyWebSockets(t)
107+
w1, err := tailnet.NewConn(&tailnet.Options{
108+
Addresses: []netip.Prefix{netip.PrefixFrom(w1IP, 128)},
109+
Logger: logger.Named("w1"),
110+
DERPMap: derpMap,
111+
BlockEndpoints: true,
112+
})
113+
require.NoError(t, err)
114+
115+
w2, err := tailnet.NewConn(&tailnet.Options{
116+
Addresses: []netip.Prefix{netip.PrefixFrom(tailnet.IP(), 128)},
117+
Logger: logger.Named("w2"),
118+
DERPMap: derpMap,
119+
BlockEndpoints: true,
120+
})
121+
require.NoError(t, err)
122+
t.Cleanup(func() {
123+
_ = w1.Close()
124+
_ = w2.Close()
125+
})
126+
w1.SetNodeCallback(func(node *tailnet.Node) {
127+
err := w2.UpdateNodes([]*tailnet.Node{node}, false)
128+
assert.NoError(t, err)
129+
})
130+
w2.SetNodeCallback(func(node *tailnet.Node) {
131+
err := w1.UpdateNodes([]*tailnet.Node{node}, false)
132+
assert.NoError(t, err)
133+
})
134+
require.True(t, w2.AwaitReachable(ctx, w1IP))
135+
conn := make(chan struct{}, 1)
136+
go func() {
137+
listener, err := w1.Listen("tcp", ":35565")
138+
assert.NoError(t, err)
139+
defer listener.Close()
140+
nc, err := listener.Accept()
141+
if !assert.NoError(t, err) {
142+
return
143+
}
144+
_ = nc.Close()
145+
conn <- struct{}{}
146+
}()
147+
148+
nc, err := w2.DialContextTCP(ctx, netip.AddrPortFrom(w1IP, 35565))
149+
require.NoError(t, err)
150+
_ = nc.Close()
151+
<-conn
152+
153+
nodes := make(chan *tailnet.Node, 1)
154+
w2.SetNodeCallback(func(node *tailnet.Node) {
155+
select {
156+
case nodes <- node:
157+
default:
158+
}
159+
})
160+
node := <-nodes
161+
require.Len(t, node.DERPForcedWebsocket, 1)
162+
// Ensure the reason is valid!
163+
require.Equal(t, "GET failed with status code 400: Invalid \"Upgrade\" header: DERP", node.DERPForcedWebsocket[derpMap.RegionIDs()[0]])
164+
84165
w1.Close()
85166
w2.Close()
86167
})

tailnet/coordinator.go

+5
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ type Node struct {
5858
PreferredDERP int `json:"preferred_derp"`
5959
// DERPLatency is the latency in seconds to each DERP server.
6060
DERPLatency map[string]float64 `json:"derp_latency"`
61+
// DERPForcedWebsocket contains a mapping of DERP regions to
62+
// error messages that caused the connection to be forced to
63+
// use WebSockets. We don't use WebSockets by default because
64+
// they are less performant.
65+
DERPForcedWebsocket map[int]string `json:"derp_forced_websockets"`
6166
// Addresses are the IP address ranges this connection exposes.
6267
Addresses []netip.Prefix `json:"addresses"`
6368
// AllowedIPs specify what addresses can dial the connection.

tailnet/derp.go

+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
package tailnet
2+
3+
import (
4+
"bufio"
5+
"context"
6+
"log"
7+
"net/http"
8+
"strings"
9+
"sync"
10+
11+
"nhooyr.io/websocket"
12+
"tailscale.com/derp"
13+
"tailscale.com/net/wsconn"
14+
)
15+
16+
// WithWebsocketSupport returns an http.Handler that upgrades
17+
// connections to the "derp" subprotocol to WebSockets and
18+
// passes them to the DERP server.
19+
// Taken from: https://github.com/tailscale/tailscale/blob/e3211ff88ba85435f70984cf67d9b353f3d650d8/cmd/derper/websocket.go#L21
20+
func WithWebsocketSupport(s *derp.Server, base http.Handler) (http.Handler, func()) {
21+
var mu sync.Mutex
22+
var waitGroup sync.WaitGroup
23+
ctx, cancelFunc := context.WithCancel(context.Background())
24+
25+
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
26+
up := strings.ToLower(r.Header.Get("Upgrade"))
27+
28+
// Very early versions of Tailscale set "Upgrade: WebSocket" but didn't actually
29+
// speak WebSockets (they still assumed DERP's binary framing). So to distinguish
30+
// clients that actually want WebSockets, look for an explicit "derp" subprotocol.
31+
if up != "websocket" || !strings.Contains(r.Header.Get("Sec-Websocket-Protocol"), "derp") {
32+
base.ServeHTTP(w, r)
33+
return
34+
}
35+
36+
mu.Lock()
37+
if ctx.Err() != nil {
38+
mu.Unlock()
39+
return
40+
}
41+
waitGroup.Add(1)
42+
mu.Unlock()
43+
defer waitGroup.Done()
44+
c, err := websocket.Accept(w, r, &websocket.AcceptOptions{
45+
Subprotocols: []string{"derp"},
46+
OriginPatterns: []string{"*"},
47+
// Disable compression because we transmit WireGuard messages that
48+
// are not compressible.
49+
// Additionally, Safari has a broken implementation of compression
50+
// (see https://github.com/nhooyr/websocket/issues/218) that makes
51+
// enabling it actively harmful.
52+
CompressionMode: websocket.CompressionDisabled,
53+
})
54+
if err != nil {
55+
log.Printf("websocket.Accept: %v", err)
56+
return
57+
}
58+
defer c.Close(websocket.StatusInternalError, "closing")
59+
if c.Subprotocol() != "derp" {
60+
c.Close(websocket.StatusPolicyViolation, "client must speak the derp subprotocol")
61+
return
62+
}
63+
wc := wsconn.NetConn(ctx, c, websocket.MessageBinary)
64+
brw := bufio.NewReadWriter(bufio.NewReader(wc), bufio.NewWriter(wc))
65+
s.Accept(ctx, wc, brw, r.RemoteAddr)
66+
}), func() {
67+
cancelFunc()
68+
mu.Lock()
69+
waitGroup.Wait()
70+
mu.Unlock()
71+
}
72+
}

0 commit comments

Comments
 (0)