Skip to content

chore: add additional network telemetry stats & events #13800

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cli/ssh.go
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@ func (r *RootCmd) ssh() *serpent.Command {
}

err = sshSession.Wait()
conn.SendDisconnectedTelemetry("ssh")
conn.SendDisconnectedTelemetry()
if err != nil {
if exitErr := (&gossh.ExitError{}); errors.As(err, &exitErr) {
// Clear the error since it's not useful beyond
Expand Down
4 changes: 2 additions & 2 deletions coderd/telemetry/telemetry.go
Original file line number Diff line number Diff line change
Expand Up @@ -1240,7 +1240,7 @@ type NetworkEvent struct {
NodeIDSelf uint64 `json:"node_id_self"`
NodeIDRemote uint64 `json:"node_id_remote"`
P2PEndpoint NetworkEventP2PEndpoint `json:"p2p_endpoint"`
HomeDERP string `json:"home_derp"`
HomeDERP int `json:"home_derp"`
DERPMap DERPMap `json:"derp_map"`
LatestNetcheck Netcheck `json:"latest_netcheck"`

Expand Down Expand Up @@ -1286,7 +1286,7 @@ func NetworkEventFromProto(proto *tailnetproto.TelemetryEvent) (NetworkEvent, er
NodeIDSelf: proto.NodeIdSelf,
NodeIDRemote: proto.NodeIdRemote,
P2PEndpoint: p2pEndpointFromProto(proto.P2PEndpoint),
HomeDERP: proto.HomeDerp,
HomeDERP: int(proto.HomeDerp),
DERPMap: derpMapFromProto(proto.DerpMap),
LatestNetcheck: netcheckFromProto(proto.LatestNetcheck),

Expand Down
4 changes: 0 additions & 4 deletions codersdk/workspacesdk/agentconn.go
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,3 @@ func (c *AgentConn) apiClient() *http.Client {
func (c *AgentConn) GetPeerDiagnostics() tailnet.PeerDiagnostics {
return c.Conn.GetPeerDiagnostics(c.opts.AgentID)
}

func (c *AgentConn) SendDisconnectedTelemetry(application string) {
c.Conn.SendDisconnectedTelemetry(c.agentAddress(), application)
}
12 changes: 6 additions & 6 deletions codersdk/workspacesdk/connector_internal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,12 +228,12 @@ func TestTailnetAPIConnector_TelemetryUnimplemented(t *testing.T) {
return uut.client != nil
}, testutil.WaitShort, testutil.IntervalFast)

fakeDRPCClient.telemeteryErorr = drpcerr.WithCode(xerrors.New("Unimplemented"), 0)
fakeDRPCClient.telemetryError = drpcerr.WithCode(xerrors.New("Unimplemented"), 0)
uut.SendTelemetryEvent(&proto.TelemetryEvent{})
require.False(t, uut.telemetryUnavailable.Load())
require.Equal(t, int64(1), atomic.LoadInt64(&fakeDRPCClient.postTelemetryCalls))

fakeDRPCClient.telemeteryErorr = drpcerr.WithCode(xerrors.New("Unimplemented"), drpcerr.Unimplemented)
fakeDRPCClient.telemetryError = drpcerr.WithCode(xerrors.New("Unimplemented"), drpcerr.Unimplemented)
uut.SendTelemetryEvent(&proto.TelemetryEvent{})
require.True(t, uut.telemetryUnavailable.Load())
uut.SendTelemetryEvent(&proto.TelemetryEvent{})
Expand Down Expand Up @@ -268,12 +268,12 @@ func TestTailnetAPIConnector_TelemetryNotRecognised(t *testing.T) {
return uut.client != nil
}, testutil.WaitShort, testutil.IntervalFast)

fakeDRPCClient.telemeteryErorr = drpc.ProtocolError.New("Protocol Error")
fakeDRPCClient.telemetryError = drpc.ProtocolError.New("Protocol Error")
uut.SendTelemetryEvent(&proto.TelemetryEvent{})
require.False(t, uut.telemetryUnavailable.Load())
require.Equal(t, int64(1), atomic.LoadInt64(&fakeDRPCClient.postTelemetryCalls))

fakeDRPCClient.telemeteryErorr = drpc.ProtocolError.New("unknown rpc: /coder.tailnet.v2.Tailnet/PostTelemetry")
fakeDRPCClient.telemetryError = drpc.ProtocolError.New("unknown rpc: /coder.tailnet.v2.Tailnet/PostTelemetry")
uut.SendTelemetryEvent(&proto.TelemetryEvent{})
require.True(t, uut.telemetryUnavailable.Load())
uut.SendTelemetryEvent(&proto.TelemetryEvent{})
Expand Down Expand Up @@ -301,7 +301,7 @@ func newFakeTailnetConn() *fakeTailnetConn {

type fakeDRPCClient struct {
postTelemetryCalls int64
telemeteryErorr error
telemetryError error
fakeDRPPCMapStream
}

Expand Down Expand Up @@ -331,7 +331,7 @@ func (*fakeDRPCClient) DRPCConn() drpc.Conn {
// PostTelemetry implements proto.DRPCTailnetClient.
func (f *fakeDRPCClient) PostTelemetry(_ context.Context, _ *proto.TelemetryRequest) (*proto.TelemetryResponse, error) {
atomic.AddInt64(&f.postTelemetryCalls, 1)
return nil, f.telemeteryErorr
return nil, f.telemetryError
}

// StreamDERPMaps implements proto.DRPCTailnetClient.
Expand Down
130 changes: 77 additions & 53 deletions tailnet/conn.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"tailscale.com/types/key"
tslogger "tailscale.com/types/logger"
"tailscale.com/types/netlogtype"
"tailscale.com/types/netmap"
"tailscale.com/wgengine"
"tailscale.com/wgengine/capture"
"tailscale.com/wgengine/magicsock"
Expand Down Expand Up @@ -262,17 +263,8 @@ func NewConn(options *Options) (conn *Conn, err error) {
)
nodeUp.setAddresses(options.Addresses)
nodeUp.setBlockEndpoints(options.BlockEndpoints)
wireguardEngine.SetStatusCallback(nodeUp.setStatus)
magicConn.SetDERPForcedWebsocketCallback(nodeUp.setDERPForcedWebsocket)
if telemetryStore != nil {
wireguardEngine.SetNetInfoCallback(func(ni *tailcfg.NetInfo) {
nodeUp.setNetInfo(ni)
telemetryStore.setNetInfo(ni)
})
} else {
wireguardEngine.SetNetInfoCallback(nodeUp.setNetInfo)
}

ctx, ctxCancel := context.WithCancel(context.Background())
server := &Conn{
id: uuid.New(),
closed: make(chan struct{}),
Expand All @@ -290,13 +282,32 @@ func NewConn(options *Options) (conn *Conn, err error) {
configMaps: cfgMaps,
nodeUpdater: nodeUp,
telemetrySink: options.TelemetrySink,
telemeteryStore: telemetryStore,
telemetryStore: telemetryStore,
createdAt: time.Now(),
watchCtx: ctx,
watchCancel: ctxCancel,
}
defer func() {
if err != nil {
_ = server.Close()
}
}()
if server.telemetryStore != nil {
server.wireguardEngine.SetNetInfoCallback(func(ni *tailcfg.NetInfo) {
server.telemetryStore.setNetInfo(ni)
nodeUp.setNetInfo(ni)
server.telemetryStore.pingPeer(server)
})
server.wireguardEngine.AddNetworkMapCallback(func(nm *netmap.NetworkMap) {
Copy link
Member Author

@ethanndickson ethanndickson Jul 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We use this callback to ensure we always have the latest Tailscale node data for a given peer.

server.telemetryStore.updateNetworkMap(nm)
server.telemetryStore.pingPeer(server)
})
go server.watchConnChange()
} else {
server.wireguardEngine.SetNetInfoCallback(nodeUp.setNetInfo)
}
server.wireguardEngine.SetStatusCallback(nodeUp.setStatus)
server.magicConn.SetDERPForcedWebsocketCallback(nodeUp.setDERPForcedWebsocket)

netStack.GetTCPHandlerForFlow = server.forwardTCP

Expand Down Expand Up @@ -351,11 +362,15 @@ type Conn struct {
wireguardEngine wgengine.Engine
listeners map[listenKey]*listener
clientType proto.TelemetryEvent_ClientType
createdAt time.Time

telemetrySink TelemetrySink
// telemeteryStore will be nil if telemetrySink is nil.
telemeteryStore *TelemetryStore
telemetryWg sync.WaitGroup
// telemetryStore will be nil if telemetrySink is nil.
telemetryStore *TelemetryStore
telemetryWg sync.WaitGroup

watchCtx context.Context
watchCancel func()

trafficStats *connstats.Statistics
}
Expand Down Expand Up @@ -390,8 +405,8 @@ func (c *Conn) SetNodeCallback(callback func(node *Node)) {

// SetDERPMap updates the DERPMap of a connection.
func (c *Conn) SetDERPMap(derpMap *tailcfg.DERPMap) {
if c.configMaps.setDERPMap(derpMap) && c.telemeteryStore != nil {
c.telemeteryStore.updateDerpMap(derpMap)
if c.configMaps.setDERPMap(derpMap) && c.telemetryStore != nil {
c.telemetryStore.updateDerpMap(derpMap)
}
}

Expand Down Expand Up @@ -540,6 +555,7 @@ func (c *Conn) Closed() <-chan struct{} {
// Close shuts down the Wireguard connection.
func (c *Conn) Close() error {
c.logger.Info(context.Background(), "closing tailnet Conn")
c.watchCancel()
c.telemetryWg.Wait()
c.configMaps.close()
c.nodeUpdater.close()
Expand Down Expand Up @@ -709,54 +725,34 @@ func (c *Conn) MagicsockServeHTTPDebug(w http.ResponseWriter, r *http.Request) {
c.magicConn.ServeHTTPDebug(w, r)
}

// SendConnectedTelemetry should be called when connection to a peer with the given IP is established.
func (c *Conn) SendConnectedTelemetry(ip netip.Addr, application string) {
if c.telemetrySink == nil {
return
}
c.telemetryStore.markConnected(&ip, application)
e := c.newTelemetryEvent()
e.Status = proto.TelemetryEvent_CONNECTED
e.Application = application
pip, ok := c.wireguardEngine.PeerForIP(ip)
if ok {
e.NodeIdRemote = uint64(pip.Node.ID)
}
c.telemetryWg.Add(1)
go func() {
defer c.telemetryWg.Done()
c.telemetrySink.SendTelemetryEvent(e)
}()
c.sendTelemetryBackground(e)
}

func (c *Conn) SendDisconnectedTelemetry(ip netip.Addr, application string) {
func (c *Conn) SendDisconnectedTelemetry() {
if c.telemetrySink == nil {
return
}
e := c.newTelemetryEvent()
e.Status = proto.TelemetryEvent_DISCONNECTED
e.Application = application
pip, ok := c.wireguardEngine.PeerForIP(ip)
if ok {
e.NodeIdRemote = uint64(pip.Node.ID)
}
c.telemetryWg.Add(1)
go func() {
defer c.telemetryWg.Done()
c.telemetrySink.SendTelemetryEvent(e)
}()
c.sendTelemetryBackground(e)
}

func (c *Conn) SendSpeedtestTelemetry(throughputMbits float64) {
if c.telemetrySink == nil {
return
}
e := c.newTelemetryEvent()
e.Status = proto.TelemetryEvent_CONNECTED
e.ThroughputMbits = wrapperspb.Float(float32(throughputMbits))
c.telemetryWg.Add(1)
go func() {
defer c.telemetryWg.Done()
c.telemetrySink.SendTelemetryEvent(e)
}()
e.Status = proto.TelemetryEvent_CONNECTED
c.sendTelemetryBackground(e)
}

// nolint:revive
Expand All @@ -769,31 +765,59 @@ func (c *Conn) sendPingTelemetry(pr *ipnstate.PingResult) {
latency := durationpb.New(time.Duration(pr.LatencySeconds * float64(time.Second)))
if pr.Endpoint != "" {
e.P2PLatency = latency
e.P2PEndpoint = c.telemeteryStore.toEndpoint(pr.Endpoint)
e.P2PEndpoint = c.telemetryStore.toEndpoint(pr.Endpoint)
} else {
e.DerpLatency = latency
}
e.Status = proto.TelemetryEvent_CONNECTED
c.telemetryWg.Add(1)
go func() {
defer c.telemetryWg.Done()
c.telemetrySink.SendTelemetryEvent(e)
}()
c.sendTelemetryBackground(e)
}

// The returned telemetry event will not have it's status set.
func (c *Conn) newTelemetryEvent() *proto.TelemetryEvent {
// Infallible
id, _ := c.id.MarshalBinary()
event := c.telemeteryStore.newEvent()
event := c.telemetryStore.newEvent()
event.ClientType = c.clientType
event.Id = id
selfNode := c.Node()
event.NodeIdSelf = uint64(selfNode.ID)
event.HomeDerp = strconv.Itoa(selfNode.PreferredDERP)
event.ConnectionAge = durationpb.New(time.Since(c.createdAt))
return event
}

func (c *Conn) sendTelemetryBackground(e *proto.TelemetryEvent) {
c.telemetryWg.Add(1)
go func() {
defer c.telemetryWg.Done()
c.telemetrySink.SendTelemetryEvent(e)
}()
}

// Watch for changes in the connection type (P2P<->DERP) and send telemetry events.
func (c *Conn) watchConnChange() {
ticker := time.NewTicker(time.Millisecond * 50)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because we have to manually check if the connection has changed like this, P2PSetup can be ~50ms off it's true value.

defer ticker.Stop()
for {
select {
case <-c.watchCtx.Done():
return
case <-ticker.C:
}
status := c.Status()
peers := status.Peers()
if len(peers) > 1 {
// Not a CLI<->agent connection, stop watching
return
} else if len(peers) == 0 {
continue
}
peer := status.Peer[peers[0]]
// If the connection type has changed, send a telemetry event with the latest ping stats
if c.telemetryStore.changedConntype(peer.CurAddr) {
c.telemetryStore.pingPeer(c)
}
}
}

// PeerDiagnostics is a checklist of human-readable conditions necessary to establish an encrypted
// tunnel to a peer via a Conn
type PeerDiagnostics struct {
Expand Down
8 changes: 4 additions & 4 deletions tailnet/proto/tailnet.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tailnet/proto/tailnet.proto
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ message TelemetryEvent {
uint64 node_id_self = 7;
uint64 node_id_remote = 8;
P2PEndpoint p2p_endpoint = 9;
string home_derp = 10;
int32 home_derp = 10;
DERPMap derp_map = 11;
Netcheck latest_netcheck = 12;

Expand Down
Loading
Loading