From 35b2fed6860da12707266979f14286f73fe1468a Mon Sep 17 00:00:00 2001
From: Colin Adler
Date: Thu, 22 Sep 2022 17:40:59 -0500
Subject: [PATCH 01/79] feat: HA tailnet coordinator

---
 agent/agent_test.go                    |   2 +-
 coderd/coderd.go                       |   4 +-
 coderd/database/pubsub_memory.go       |   3 +-
 coderd/workspaceagents.go              |   2 +-
 coderd/wsconncache/wsconncache_test.go |   2 +-
 codersdk/workspaceagents.go            |   1 -
 enterprise/tailnet/coordinator.go      | 426 +++++++++++++++++++++++++
 enterprise/tailnet/coordinator_test.go | 267 ++++++++++++++++
 tailnet/coordinator.go                 | 203 +++++++-----
 tailnet/coordinator_test.go            |  10 +-
 10 files changed, 834 insertions(+), 86 deletions(-)
 create mode 100644 enterprise/tailnet/coordinator.go
 create mode 100644 enterprise/tailnet/coordinator_test.go

diff --git a/agent/agent_test.go b/agent/agent_test.go
index afed644f78e5e..d6ff21cdcd33d 100644
--- a/agent/agent_test.go
+++ b/agent/agent_test.go
@@ -572,7 +572,7 @@ func setupAgent(t *testing.T, metadata agent.Metadata, ptyTimeout time.Duration)
 	if metadata.DERPMap == nil {
 		metadata.DERPMap = tailnettest.RunDERPAndSTUN(t)
 	}
-	coordinator := tailnet.NewCoordinator()
+	coordinator := tailnet.NewMemoryCoordinator()
 	agentID := uuid.New()
 	statsCh := make(chan *agent.Stats)
 	closer := agent.New(agent.Options{
diff --git a/coderd/coderd.go b/coderd/coderd.go
index 25ac1afec2f36..f183e4d9b9ab7 100644
--- a/coderd/coderd.go
+++ b/coderd/coderd.go
@@ -74,7 +74,7 @@ type Options struct {
 	TracerProvider trace.TracerProvider
 	AutoImportTemplates []AutoImportTemplate
-	TailnetCoordinator *tailnet.Coordinator
+	TailnetCoordinator tailnet.Coordinator
 	DERPMap *tailcfg.DERPMap
 	MetricsCacheRefreshInterval time.Duration
@@ -121,7 +121,7 @@ func New(options *Options) *API {
 		options.PrometheusRegistry = prometheus.NewRegistry()
 	}
 	if options.TailnetCoordinator == nil {
-		options.TailnetCoordinator = tailnet.NewCoordinator()
+		options.TailnetCoordinator = tailnet.NewMemoryCoordinator()
 	}
 	if options.Auditor == nil {
 		options.Auditor = audit.NewNop()
 	}
diff --git a/coderd/database/pubsub_memory.go b/coderd/database/pubsub_memory.go
index 148d2f57b129f..de5a940414d6c 100644
--- a/coderd/database/pubsub_memory.go
+++ b/coderd/database/pubsub_memory.go
@@ -47,8 +47,9 @@ func (m *memoryPubsub) Publish(event string, message []byte) error {
 		return nil
 	}
 	for _, listener := range listeners {
-		listener(context.Background(), message)
+		go listener(context.Background(), message)
 	}
+
 	return nil
 }
diff --git a/coderd/workspaceagents.go b/coderd/workspaceagents.go
index 6167790fb8bb7..dd777913c452d 100644
--- a/coderd/workspaceagents.go
+++ b/coderd/workspaceagents.go
@@ -447,7 +447,7 @@ func convertApps(dbApps []database.WorkspaceApp) []codersdk.WorkspaceApp {
 	return apps
 }
-func convertWorkspaceAgent(derpMap *tailcfg.DERPMap, coordinator *tailnet.Coordinator, dbAgent database.WorkspaceAgent, apps []codersdk.WorkspaceApp, agentInactiveDisconnectTimeout time.Duration) (codersdk.WorkspaceAgent, error) {
+func convertWorkspaceAgent(derpMap *tailcfg.DERPMap, coordinator tailnet.Coordinator, dbAgent database.WorkspaceAgent, apps []codersdk.WorkspaceApp, agentInactiveDisconnectTimeout time.Duration) (codersdk.WorkspaceAgent, error) {
 	var envs map[string]string
 	if dbAgent.EnvironmentVariables.Valid {
 		err := json.Unmarshal(dbAgent.EnvironmentVariables.RawMessage, &envs)
diff --git a/coderd/wsconncache/wsconncache_test.go b/coderd/wsconncache/wsconncache_test.go
index a9ea85a2492ac..e4c7d58413110 100644
--- a/coderd/wsconncache/wsconncache_test.go
+++ b/coderd/wsconncache/wsconncache_test.go
@@
-142,7 +142,7 @@ func TestCache(t *testing.T) { func setupAgent(t *testing.T, metadata agent.Metadata, ptyTimeout time.Duration) *agent.Conn { metadata.DERPMap = tailnettest.RunDERPAndSTUN(t) - coordinator := tailnet.NewCoordinator() + coordinator := tailnet.NewMemoryCoordinator() agentID := uuid.New() closer := agent.New(agent.Options{ FetchMetadata: func(ctx context.Context) (agent.Metadata, error) { diff --git a/codersdk/workspaceagents.go b/codersdk/workspaceagents.go index 46d8ead8d2d6d..72e9767713c7c 100644 --- a/codersdk/workspaceagents.go +++ b/codersdk/workspaceagents.go @@ -20,7 +20,6 @@ import ( "tailscale.com/tailcfg" "cdr.dev/slog" - "github.com/coder/coder/agent" "github.com/coder/coder/tailnet" "github.com/coder/retry" diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go new file mode 100644 index 0000000000000..8824f584d60da --- /dev/null +++ b/enterprise/tailnet/coordinator.go @@ -0,0 +1,426 @@ +package tailnet + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net" + "sync" + "time" + + "github.com/google/uuid" + "golang.org/x/xerrors" + + "cdr.dev/slog" + + "github.com/coder/coder/coderd/database" + agpl "github.com/coder/coder/tailnet" +) + +func NewHACoordinator(logger slog.Logger, pubsub database.Pubsub) (agpl.Coordinator, error) { + coord := &haCoordinator{ + id: uuid.New(), + log: logger, + pubsub: pubsub, + close: make(chan struct{}), + nodes: map[uuid.UUID]*agpl.Node{}, + agentSockets: map[uuid.UUID]net.Conn{}, + agentToConnectionSockets: map[uuid.UUID]map[uuid.UUID]net.Conn{}, + } + + if err := coord.runPubsub(); err != nil { + return nil, xerrors.Errorf("run coordinator pubsub: %w", err) + } + + return coord, nil +} + +type haCoordinator struct { + id uuid.UUID + log slog.Logger + mutex sync.RWMutex + pubsub database.Pubsub + close chan struct{} + + // nodes maps agent and connection IDs their respective node. + nodes map[uuid.UUID]*agpl.Node + // agentSockets maps agent IDs to their open websocket. + agentSockets map[uuid.UUID]net.Conn + // agentToConnectionSockets maps agent IDs to connection IDs of conns that + // are subscribed to updates for that agent. + agentToConnectionSockets map[uuid.UUID]map[uuid.UUID]net.Conn +} + +// Node returns an in-memory node by ID. +func (c *haCoordinator) Node(id uuid.UUID) *agpl.Node { + c.mutex.RLock() + defer c.mutex.RUnlock() + node := c.nodes[id] + return node +} + +// ServeClient accepts a WebSocket connection that wants to connect to an agent +// with the specified ID. +func (c *haCoordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error { + c.mutex.Lock() + // When a new connection is requested, we update it with the latest + // node of the agent. This allows the connection to establish. + node, ok := c.nodes[agent] + if ok { + data, err := json.Marshal([]*agpl.Node{node}) + if err != nil { + c.mutex.Unlock() + return xerrors.Errorf("marshal node: %w", err) + } + _, err = conn.Write(data) + if err != nil { + c.mutex.Unlock() + return xerrors.Errorf("write nodes: %w", err) + } + } + + connectionSockets, ok := c.agentToConnectionSockets[agent] + if !ok { + connectionSockets = map[uuid.UUID]net.Conn{} + c.agentToConnectionSockets[agent] = connectionSockets + } + + // Insert this connection into a map so the agent can publish node updates. + connectionSockets[id] = conn + c.mutex.Unlock() + + defer func() { + c.mutex.Lock() + defer c.mutex.Unlock() + // Clean all traces of this connection from the map. 
+ delete(c.nodes, id) + connectionSockets, ok := c.agentToConnectionSockets[agent] + if !ok { + return + } + delete(connectionSockets, id) + if len(connectionSockets) != 0 { + return + } + delete(c.agentToConnectionSockets, agent) + }() + + decoder := json.NewDecoder(conn) + // Indefinitely handle messages from the client websocket. + for { + err := c.handleNextClientMessage(id, agent, decoder) + if err != nil { + if errors.Is(err, io.EOF) { + return nil + } + return xerrors.Errorf("handle next client message: %w", err) + } + } +} + +func (c *haCoordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *json.Decoder) error { + var node agpl.Node + err := decoder.Decode(&node) + if err != nil { + return xerrors.Errorf("read json: %w", err) + } + + c.mutex.Lock() + defer c.mutex.Unlock() + + // Update the node of this client in our in-memory map. If an agent entirely + // shuts down and reconnects, it needs to be aware of all clients attempting + // to establish connections. + c.nodes[id] = &node + + // Write the new node from this client to the actively connected agent. + err = c.writeNodeToAgent(agent, &node) + if err != nil { + return xerrors.Errorf("write node to agent: %w", err) + } + + return nil +} + +func (c *haCoordinator) writeNodeToAgent(agent uuid.UUID, node *agpl.Node) error { + agentSocket, ok := c.agentSockets[agent] + if !ok { + // If we don't own the agent locally, send it over pubsub to a node that + // owns the agent. + err := c.publishNodeToAgent(agent, node) + if err != nil { + return xerrors.Errorf("publish node to agent") + } + return nil + } + + // Write the new node from this client to the actively + // connected agent. + data, err := json.Marshal([]*agpl.Node{node}) + if err != nil { + return xerrors.Errorf("marshal nodes: %w", err) + } + + _, err = agentSocket.Write(data) + if err != nil { + if errors.Is(err, io.EOF) { + return nil + } + return xerrors.Errorf("write json: %w", err) + } + return nil +} + +// ServeAgent accepts a WebSocket connection to an agent that listens to +// incoming connections and publishes node updates. +func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { + c.mutex.Lock() + sockets, ok := c.agentToConnectionSockets[id] + if ok { + // Publish all nodes that want to connect to the + // desired agent ID. + nodes := make([]*agpl.Node, 0, len(sockets)) + for targetID := range sockets { + node, ok := c.nodes[targetID] + if !ok { + continue + } + nodes = append(nodes, node) + } + data, err := json.Marshal(nodes) + if err != nil { + c.mutex.Unlock() + return xerrors.Errorf("marshal json: %w", err) + } + _, err = conn.Write(data) + if err != nil { + c.mutex.Unlock() + return xerrors.Errorf("write nodes: %w", err) + } + } + + // If an old agent socket is connected, we close it + // to avoid any leaks. This shouldn't ever occur because + // we expect one agent to be running. 
+ oldAgentSocket, ok := c.agentSockets[id] + if ok { + _ = oldAgentSocket.Close() + } + c.agentSockets[id] = conn + c.mutex.Unlock() + defer func() { + c.mutex.Lock() + defer c.mutex.Unlock() + delete(c.agentSockets, id) + delete(c.nodes, id) + }() + + decoder := json.NewDecoder(conn) + for { + err := c.hangleAgentUpdate(id, decoder, false) + if err != nil { + if errors.Is(err, io.EOF) { + return nil + } + return xerrors.Errorf("handle next agent message: %w", err) + } + } +} + +func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder, fromPubsub bool) error { + var node agpl.Node + err := decoder.Decode(&node) + if err != nil { + return xerrors.Errorf("read json: %w", err) + } + + c.mutex.Lock() + defer c.mutex.Unlock() + + c.nodes[id] = &node + + // Don't send the agent back over pubsub if that's where we received it from! + if !fromPubsub { + err = c.publishAgentToNodes(id, &node) + if err != nil { + return xerrors.Errorf("publish agent to nodes: %w", err) + } + } + + connectionSockets, ok := c.agentToConnectionSockets[id] + if !ok { + return nil + } + + data, err := json.Marshal([]*agpl.Node{&node}) + if err != nil { + return xerrors.Errorf("marshal nodes: %w", err) + } + + // Publish the new node to every listening socket. + var wg sync.WaitGroup + wg.Add(len(connectionSockets)) + for _, connectionSocket := range connectionSockets { + connectionSocket := connectionSocket + go func() { + _ = connectionSocket.SetWriteDeadline(time.Now().Add(5 * time.Second)) + _, _ = connectionSocket.Write(data) + wg.Done() + }() + } + + wg.Wait() + return nil +} + +func (c *haCoordinator) Close() error { + close(c.close) + return nil +} + +func (c *haCoordinator) publishNodeToAgent(recipient uuid.UUID, node *agpl.Node) error { + msg, err := c.formatCallMeMaybe(recipient, node) + if err != nil { + return xerrors.Errorf("format publish message: %w", err) + } + + fmt.Println("publishing callmemaybe", c.id.String()) + err = c.pubsub.Publish("wireguard_peers", msg) + if err != nil { + return xerrors.Errorf("publish message: %w", err) + } + + return nil +} + +func (c *haCoordinator) publishAgentToNodes(id uuid.UUID, node *agpl.Node) error { + msg, err := c.formatAgentUpdate(id, node) + if err != nil { + return xerrors.Errorf("format publish message: %w", err) + } + + fmt.Println("publishing agentupdate", c.id.String()) + err = c.pubsub.Publish("wireguard_peers", msg) + if err != nil { + return xerrors.Errorf("publish message: %w", err) + } + + return nil +} + +func (c *haCoordinator) runPubsub() error { + cancelSub, err := c.pubsub.Subscribe("wireguard_peers", func(ctx context.Context, message []byte) { + sp := bytes.Split(message, []byte("|")) + if len(sp) != 4 { + c.log.Error(ctx, "invalid wireguard peer message", slog.F("msg", string(message))) + return + } + + var ( + coordinatorID = sp[0] + eventType = sp[1] + agentID = sp[2] + nodeJSON = sp[3] + ) + + sender, err := uuid.ParseBytes(coordinatorID) + if err != nil { + c.log.Error(ctx, "invalid sender id", slog.F("id", string(coordinatorID)), slog.F("msg", string(message))) + return + } + + // We sent this message! 
+ if sender == c.id { + return + } + + switch string(eventType) { + case "callmemaybe": + agentUUID, err := uuid.ParseBytes(agentID) + if err != nil { + c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + return + } + + fmt.Println("got callmemaybe", agentUUID.String()) + c.mutex.Lock() + defer c.mutex.Unlock() + + fmt.Println("process callmemaybe", agentUUID.String()) + agentSocket, ok := c.agentSockets[agentUUID] + if !ok { + fmt.Println("no socket") + return + } + + // We get a single node over pubsub, so turn into an array. + _, err = agentSocket.Write(bytes.Join([][]byte{[]byte("["), nodeJSON, []byte("]")}, []byte{})) + if err != nil { + if errors.Is(err, io.EOF) { + return + } + c.log.Error(ctx, "send callmemaybe to agent", slog.Error(err)) + return + } + fmt.Println("success callmemaybe", agentUUID.String()) + + case "agentupdate": + agentUUID, err := uuid.ParseBytes(agentID) + if err != nil { + c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + } + + decoder := json.NewDecoder(bytes.NewReader(nodeJSON)) + err = c.hangleAgentUpdate(agentUUID, decoder, true) + if err != nil { + c.log.Error(ctx, "handle agent update", slog.Error(err)) + } + + default: + c.log.Error(ctx, "unknown peer event", slog.F("name", string(eventType))) + } + }) + if err != nil { + return xerrors.Errorf("subscribe wireguard peers") + } + + go func() { + defer cancelSub() + <-c.close + }() + + return nil +} + +// format: |callmemaybe|| +func (c *haCoordinator) formatCallMeMaybe(recipient uuid.UUID, node *agpl.Node) ([]byte, error) { + buf := bytes.Buffer{} + + buf.WriteString(c.id.String() + "|") + buf.WriteString("callmemaybe|") + buf.WriteString(recipient.String() + "|") + err := json.NewEncoder(&buf).Encode(node) + if err != nil { + return nil, xerrors.Errorf("encode node: %w", err) + } + + return buf.Bytes(), nil +} + +// format: |agentupdate|| +func (c *haCoordinator) formatAgentUpdate(id uuid.UUID, node *agpl.Node) ([]byte, error) { + buf := bytes.Buffer{} + + buf.WriteString(c.id.String() + "|") + buf.WriteString("agentupdate|") + buf.WriteString(id.String() + "|") + err := json.NewEncoder(&buf).Encode(node) + if err != nil { + return nil, xerrors.Errorf("encode node: %w", err) + } + + return buf.Bytes(), nil +} diff --git a/enterprise/tailnet/coordinator_test.go b/enterprise/tailnet/coordinator_test.go new file mode 100644 index 0000000000000..48fce5bfd0f6f --- /dev/null +++ b/enterprise/tailnet/coordinator_test.go @@ -0,0 +1,267 @@ +package tailnet_test + +import ( + "fmt" + "net" + "testing" + + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "cdr.dev/slog/sloggers/slogtest" + + "github.com/coder/coder/coderd/database" + "github.com/coder/coder/enterprise/tailnet" + agpl "github.com/coder/coder/tailnet" + "github.com/coder/coder/testutil" +) + +func TestCoordinatorSingle(t *testing.T) { + t.Parallel() + t.Run("ClientWithoutAgent", func(t *testing.T) { + t.Parallel() + coordinator, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + require.NoError(t, err) + defer coordinator.Close() + + client, server := net.Pipe() + sendNode, errChan := agpl.ServeCoordinator(client, func(node []*agpl.Node) error { + return nil + }) + id := uuid.New() + closeChan := make(chan struct{}) + go func() { + err := coordinator.ServeClient(server, id, uuid.New()) + assert.NoError(t, err) + close(closeChan) + }() + sendNode(&agpl.Node{}) + require.Eventually(t, func() bool { + return 
coordinator.Node(id) != nil + }, testutil.WaitShort, testutil.IntervalFast) + + err = client.Close() + require.NoError(t, err) + <-errChan + <-closeChan + }) + + t.Run("AgentWithoutClients", func(t *testing.T) { + t.Parallel() + coordinator, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + require.NoError(t, err) + defer coordinator.Close() + + client, server := net.Pipe() + sendNode, errChan := agpl.ServeCoordinator(client, func(node []*agpl.Node) error { + return nil + }) + id := uuid.New() + closeChan := make(chan struct{}) + go func() { + err := coordinator.ServeAgent(server, id) + assert.NoError(t, err) + close(closeChan) + }() + sendNode(&agpl.Node{}) + require.Eventually(t, func() bool { + return coordinator.Node(id) != nil + }, testutil.WaitShort, testutil.IntervalFast) + err = client.Close() + require.NoError(t, err) + <-errChan + <-closeChan + }) + + t.Run("AgentWithClient", func(t *testing.T) { + t.Parallel() + + coordinator, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + require.NoError(t, err) + defer coordinator.Close() + + agentWS, agentServerWS := net.Pipe() + defer agentWS.Close() + agentNodeChan := make(chan []*agpl.Node) + sendAgentNode, agentErrChan := agpl.ServeCoordinator(agentWS, func(nodes []*agpl.Node) error { + agentNodeChan <- nodes + return nil + }) + agentID := uuid.New() + closeAgentChan := make(chan struct{}) + go func() { + err := coordinator.ServeAgent(agentServerWS, agentID) + assert.NoError(t, err) + close(closeAgentChan) + }() + sendAgentNode(&agpl.Node{}) + require.Eventually(t, func() bool { + return coordinator.Node(agentID) != nil + }, testutil.WaitShort, testutil.IntervalFast) + + clientWS, clientServerWS := net.Pipe() + defer clientWS.Close() + defer clientServerWS.Close() + clientNodeChan := make(chan []*agpl.Node) + sendClientNode, clientErrChan := agpl.ServeCoordinator(clientWS, func(nodes []*agpl.Node) error { + clientNodeChan <- nodes + return nil + }) + clientID := uuid.New() + closeClientChan := make(chan struct{}) + go func() { + err := coordinator.ServeClient(clientServerWS, clientID, agentID) + assert.NoError(t, err) + close(closeClientChan) + }() + agentNodes := <-clientNodeChan + require.Len(t, agentNodes, 1) + sendClientNode(&agpl.Node{}) + clientNodes := <-agentNodeChan + require.Len(t, clientNodes, 1) + + // Ensure an update to the agent node reaches the client! + sendAgentNode(&agpl.Node{}) + agentNodes = <-clientNodeChan + require.Len(t, agentNodes, 1) + + // Close the agent WebSocket so a new one can connect. + err = agentWS.Close() + require.NoError(t, err) + <-agentErrChan + <-closeAgentChan + + // Create a new agent connection. This is to simulate a reconnect! + agentWS, agentServerWS = net.Pipe() + defer agentWS.Close() + agentNodeChan = make(chan []*agpl.Node) + _, agentErrChan = agpl.ServeCoordinator(agentWS, func(nodes []*agpl.Node) error { + agentNodeChan <- nodes + return nil + }) + closeAgentChan = make(chan struct{}) + go func() { + err := coordinator.ServeAgent(agentServerWS, agentID) + assert.NoError(t, err) + close(closeAgentChan) + }() + // Ensure the existing listening client sends it's node immediately! 
+ clientNodes = <-agentNodeChan + require.Len(t, clientNodes, 1) + + err = agentWS.Close() + require.NoError(t, err) + <-agentErrChan + <-closeAgentChan + + err = clientWS.Close() + require.NoError(t, err) + <-clientErrChan + <-closeClientChan + }) +} + +func TestCoordinatorHA(t *testing.T) { + t.Parallel() + + t.Run("AgentWithClient", func(t *testing.T) { + t.Parallel() + + pubsub := database.NewPubsubInMemory() + + coordinator1, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), pubsub) + require.NoError(t, err) + defer coordinator1.Close() + + coordinator2, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), pubsub) + require.NoError(t, err) + defer coordinator2.Close() + + agentWS, agentServerWS := net.Pipe() + defer agentWS.Close() + agentNodeChan := make(chan []*agpl.Node) + sendAgentNode, agentErrChan := agpl.ServeCoordinator(agentWS, func(nodes []*agpl.Node) error { + fmt.Println("got agent node") + agentNodeChan <- nodes + fmt.Println("sent agent node") + return nil + }) + agentID := uuid.New() + closeAgentChan := make(chan struct{}) + go func() { + err := coordinator1.ServeAgent(agentServerWS, agentID) + assert.NoError(t, err) + close(closeAgentChan) + }() + sendAgentNode(&agpl.Node{}) + require.Eventually(t, func() bool { + return coordinator1.Node(agentID) != nil + }, testutil.WaitShort, testutil.IntervalFast) + + clientWS, clientServerWS := net.Pipe() + defer clientWS.Close() + defer clientServerWS.Close() + clientNodeChan := make(chan []*agpl.Node) + sendClientNode, clientErrChan := agpl.ServeCoordinator(clientWS, func(nodes []*agpl.Node) error { + fmt.Println("got client node") + clientNodeChan <- nodes + fmt.Println("sent client node") + return nil + }) + clientID := uuid.New() + closeClientChan := make(chan struct{}) + go func() { + err := coordinator2.ServeClient(clientServerWS, clientID, agentID) + assert.NoError(t, err) + close(closeClientChan) + }() + agentNodes := <-clientNodeChan + require.Len(t, agentNodes, 1) + sendClientNode(&agpl.Node{}) + _ = sendClientNode + clientNodes := <-agentNodeChan + require.Len(t, clientNodes, 1) + + // Ensure an update to the agent node reaches the client! + sendAgentNode(&agpl.Node{}) + agentNodes = <-clientNodeChan + require.Len(t, agentNodes, 1) + + // Close the agent WebSocket so a new one can connect. + require.NoError(t, agentWS.Close()) + require.NoError(t, agentServerWS.Close()) + <-agentErrChan + <-closeAgentChan + + // Create a new agent connection. This is to simulate a reconnect! + agentWS, agentServerWS = net.Pipe() + defer agentWS.Close() + agentNodeChan = make(chan []*agpl.Node) + _, agentErrChan = agpl.ServeCoordinator(agentWS, func(nodes []*agpl.Node) error { + fmt.Println("got agent node") + agentNodeChan <- nodes + fmt.Println("sent agent node") + return nil + }) + closeAgentChan = make(chan struct{}) + go func() { + err := coordinator1.ServeAgent(agentServerWS, agentID) + assert.NoError(t, err) + close(closeAgentChan) + }() + // Ensure the existing listening client sends it's node immediately! 
+ clientNodes = <-agentNodeChan + require.Len(t, clientNodes, 1) + + err = agentWS.Close() + require.NoError(t, err) + <-agentErrChan + <-closeAgentChan + + err = clientWS.Close() + require.NoError(t, err) + <-clientErrChan + <-closeClientChan + }) +} diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index 95209d56559ff..af6a5fee58288 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -7,6 +7,7 @@ import ( "net" "net/netip" "sync" + "time" "github.com/google/uuid" "golang.org/x/xerrors" @@ -14,6 +15,24 @@ import ( "tailscale.com/types/key" ) +// Coordinator exchanges nodes with agents to establish connections. +// ┌──────────────────┐ ┌────────────────────┐ ┌───────────────────┐ ┌──────────────────┐ +// │tailnet.Coordinate├──►│tailnet.AcceptClient│◄─►│tailnet.AcceptAgent│◄──┤tailnet.Coordinate│ +// └──────────────────┘ └────────────────────┘ └───────────────────┘ └──────────────────┘ +// Coordinators have different guarantees for HA support. +type Coordinator interface { + // Node returns an in-memory node by ID. + Node(id uuid.UUID) *Node + // ServeClient accepts a WebSocket connection that wants to connect to an agent + // with the specified ID. + ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error + // ServeAgent accepts a WebSocket connection to an agent that listens to + // incoming connections and publishes node updates. + ServeAgent(conn net.Conn, id uuid.UUID) error + // Close closes the coordinator. + Close() error +} + // Node represents a node in the network. type Node struct { ID tailcfg.NodeID `json:"id"` @@ -64,44 +83,46 @@ func ServeCoordinator(conn net.Conn, updateNodes func(node []*Node) error) (func }, errChan } -// NewCoordinator constructs a new in-memory connection coordinator. -func NewCoordinator() *Coordinator { - return &Coordinator{ +// NewMemoryCoordinator constructs a new in-memory connection coordinator. This +// coordinator is incompatible with multiple Coder replicas as all node data is +// in-memory. +func NewMemoryCoordinator() Coordinator { + return &memoryCoordinator{ nodes: map[uuid.UUID]*Node{}, agentSockets: map[uuid.UUID]net.Conn{}, agentToConnectionSockets: map[uuid.UUID]map[uuid.UUID]net.Conn{}, } } -// Coordinator exchanges nodes with agents to establish connections. +// MemoryCoordinator exchanges nodes with agents to establish connections. // ┌──────────────────┐ ┌────────────────────┐ ┌───────────────────┐ ┌──────────────────┐ // │tailnet.Coordinate├──►│tailnet.AcceptClient│◄─►│tailnet.AcceptAgent│◄──┤tailnet.Coordinate│ // └──────────────────┘ └────────────────────┘ └───────────────────┘ └──────────────────┘ // This coordinator is incompatible with multiple Coder // replicas as all node data is in-memory. -type Coordinator struct { +type memoryCoordinator struct { mutex sync.Mutex - // Maps agent and connection IDs to a node. + // nodes maps agent and connection IDs their respective node. nodes map[uuid.UUID]*Node - // Maps agent ID to an open socket. + // agentSockets maps agent IDs to their open websocket. agentSockets map[uuid.UUID]net.Conn - // Maps agent ID to connection ID for sending - // new node data as it comes in! + // agentToConnectionSockets maps agent IDs to connection IDs of conns that + // are subscribed to updates for that agent. agentToConnectionSockets map[uuid.UUID]map[uuid.UUID]net.Conn } // Node returns an in-memory node by ID. 
-func (c *Coordinator) Node(id uuid.UUID) *Node { +func (c *memoryCoordinator) Node(id uuid.UUID) *Node { c.mutex.Lock() defer c.mutex.Unlock() node := c.nodes[id] return node } -// ServeClient accepts a WebSocket connection that wants to -// connect to an agent with the specified ID. -func (c *Coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error { +// ServeClient accepts a WebSocket connection that wants to connect to an agent +// with the specified ID. +func (c *memoryCoordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error { c.mutex.Lock() // When a new connection is requested, we update it with the latest // node of the agent. This allows the connection to establish. @@ -145,48 +166,67 @@ func (c *Coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) decoder := json.NewDecoder(conn) for { - var node Node - err := decoder.Decode(&node) - if errors.Is(err, io.EOF) { - return nil - } - if err != nil { - return xerrors.Errorf("read json: %w", err) - } - c.mutex.Lock() - // Update the node of this client in our in-memory map. - // If an agent entirely shuts down and reconnects, it - // needs to be aware of all clients attempting to - // establish connections. - c.nodes[id] = &node - agentSocket, ok := c.agentSockets[agent] - if !ok { - c.mutex.Unlock() - continue - } - // Write the new node from this client to the actively - // connected agent. - data, err := json.Marshal([]*Node{&node}) + err := c.handleNextClientMessage(id, agent, decoder) if err != nil { - c.mutex.Unlock() - return xerrors.Errorf("marshal nodes: %w", err) + if errors.Is(err, io.EOF) { + return nil + } + return xerrors.Errorf("handle next client message: %w", err) } - _, err = agentSocket.Write(data) + } +} + +func (c *memoryCoordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *json.Decoder) error { + var node Node + err := decoder.Decode(&node) + if err != nil { + return xerrors.Errorf("read json: %w", err) + } + + c.mutex.Lock() + defer c.mutex.Unlock() + + // Update the node of this client in our in-memory map. If an agent + // entirely shuts down and reconnects, it needs to be aware of all clients + // attempting to establish connections. + c.nodes[id] = &node + + // Write the new node from this client to the actively + // connected agent. + err = c.writeNodeToAgent(agent, &node) + if err != nil { + return xerrors.Errorf("write node to agent: %w", err) + } + + return nil +} + +func (c *memoryCoordinator) writeNodeToAgent(agent uuid.UUID, node *Node) error { + agentSocket, ok := c.agentSockets[agent] + if !ok { + return nil + } + + // Write the new node from this client to the actively + // connected agent. + data, err := json.Marshal([]*Node{node}) + if err != nil { + return xerrors.Errorf("marshal nodes: %w", err) + } + + _, err = agentSocket.Write(data) + if err != nil { if errors.Is(err, io.EOF) { - c.mutex.Unlock() return nil } - if err != nil { - c.mutex.Unlock() - return xerrors.Errorf("write json: %w", err) - } - c.mutex.Unlock() + return xerrors.Errorf("write json: %w", err) } + return nil } // ServeAgent accepts a WebSocket connection to an agent that // listens to incoming connections and publishes node updates. 
-func (c *Coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { +func (c *memoryCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { c.mutex.Lock() sockets, ok := c.agentToConnectionSockets[id] if ok { @@ -230,36 +270,51 @@ func (c *Coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { decoder := json.NewDecoder(conn) for { - var node Node - err := decoder.Decode(&node) - if errors.Is(err, io.EOF) { - return nil - } - if err != nil { - return xerrors.Errorf("read json: %w", err) - } - c.mutex.Lock() - c.nodes[id] = &node - connectionSockets, ok := c.agentToConnectionSockets[id] - if !ok { - c.mutex.Unlock() - continue - } - data, err := json.Marshal([]*Node{&node}) + err := c.handleNextAgentMessage(id, decoder) if err != nil { - return xerrors.Errorf("marshal nodes: %w", err) - } - // Publish the new node to every listening socket. - var wg sync.WaitGroup - wg.Add(len(connectionSockets)) - for _, connectionSocket := range connectionSockets { - connectionSocket := connectionSocket - go func() { - _, _ = connectionSocket.Write(data) - wg.Done() - }() + if errors.Is(err, io.EOF) { + return nil + } + return xerrors.Errorf("handle next agent message: %w", err) } - wg.Wait() - c.mutex.Unlock() } } + +func (c *memoryCoordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder) error { + var node Node + err := decoder.Decode(&node) + if err != nil { + return xerrors.Errorf("read json: %w", err) + } + + c.mutex.Lock() + defer c.mutex.Unlock() + + c.nodes[id] = &node + connectionSockets, ok := c.agentToConnectionSockets[id] + if !ok { + return nil + } + + data, err := json.Marshal([]*Node{&node}) + if err != nil { + return xerrors.Errorf("marshal nodes: %w", err) + } + + // Publish the new node to every listening socket. + var wg sync.WaitGroup + wg.Add(len(connectionSockets)) + for _, connectionSocket := range connectionSockets { + connectionSocket := connectionSocket + go func() { + _ = connectionSocket.SetWriteDeadline(time.Now().Add(5 * time.Second)) + _, _ = connectionSocket.Write(data) + wg.Done() + }() + } + + wg.Wait() + return nil +} + +func (*memoryCoordinator) Close() error { return nil } diff --git a/tailnet/coordinator_test.go b/tailnet/coordinator_test.go index f3fdab88d5ef8..e0ed44420ede2 100644 --- a/tailnet/coordinator_test.go +++ b/tailnet/coordinator_test.go @@ -16,7 +16,7 @@ func TestCoordinator(t *testing.T) { t.Parallel() t.Run("ClientWithoutAgent", func(t *testing.T) { t.Parallel() - coordinator := tailnet.NewCoordinator() + coordinator := tailnet.NewMemoryCoordinator() client, server := net.Pipe() sendNode, errChan := tailnet.ServeCoordinator(client, func(node []*tailnet.Node) error { return nil @@ -32,15 +32,15 @@ func TestCoordinator(t *testing.T) { require.Eventually(t, func() bool { return coordinator.Node(id) != nil }, testutil.WaitShort, testutil.IntervalFast) - err := client.Close() - require.NoError(t, err) + require.NoError(t, client.Close()) + require.NoError(t, server.Close()) <-errChan <-closeChan }) t.Run("AgentWithoutClients", func(t *testing.T) { t.Parallel() - coordinator := tailnet.NewCoordinator() + coordinator := tailnet.NewMemoryCoordinator() client, server := net.Pipe() sendNode, errChan := tailnet.ServeCoordinator(client, func(node []*tailnet.Node) error { return nil @@ -64,7 +64,7 @@ func TestCoordinator(t *testing.T) { t.Run("AgentWithClient", func(t *testing.T) { t.Parallel() - coordinator := tailnet.NewCoordinator() + coordinator := tailnet.NewMemoryCoordinator() agentWS, agentServerWS := net.Pipe() defer 
agentWS.Close() From 68a812b134d43b3777d7173fdeadc503eea9ad4e Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 23 Sep 2022 13:26:25 -0500 Subject: [PATCH 02/79] fixup! feat: HA tailnet coordinator --- enterprise/tailnet/coordinator.go | 132 +++++++++++++++++++++--------- 1 file changed, 92 insertions(+), 40 deletions(-) diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index 8824f584d60da..6999fa7157d48 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -5,7 +5,6 @@ import ( "context" "encoding/json" "errors" - "fmt" "io" "net" "sync" @@ -150,7 +149,7 @@ func (c *haCoordinator) writeNodeToAgent(agent uuid.UUID, node *agpl.Node) error if !ok { // If we don't own the agent locally, send it over pubsub to a node that // owns the agent. - err := c.publishNodeToAgent(agent, node) + err := c.publishNodesToAgent(agent, []*agpl.Node{node}) if err != nil { return xerrors.Errorf("publish node to agent") } @@ -178,18 +177,15 @@ func (c *haCoordinator) writeNodeToAgent(agent uuid.UUID, node *agpl.Node) error // incoming connections and publishes node updates. func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { c.mutex.Lock() - sockets, ok := c.agentToConnectionSockets[id] - if ok { - // Publish all nodes that want to connect to the - // desired agent ID. - nodes := make([]*agpl.Node, 0, len(sockets)) - for targetID := range sockets { - node, ok := c.nodes[targetID] - if !ok { - continue - } - nodes = append(nodes, node) - } + + // Tell clients on other instances to send a callmemaybe to us. + err := c.publishAgentHello(id) + if err != nil { + return xerrors.Errorf("publish agent hello: %w", err) + } + + nodes := c.nodesSubscribedToAgent(id) + if len(nodes) > 0 { data, err := json.Marshal(nodes) if err != nil { c.mutex.Unlock() @@ -220,21 +216,46 @@ func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { decoder := json.NewDecoder(conn) for { - err := c.hangleAgentUpdate(id, decoder, false) + node, err := c.hangleAgentUpdate(id, decoder) if err != nil { if errors.Is(err, io.EOF) { return nil } return xerrors.Errorf("handle next agent message: %w", err) } + + err = c.publishAgentToNodes(id, node) + if err != nil { + return xerrors.Errorf("publish agent to nodes: %w", err) + } } } -func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder, fromPubsub bool) error { +func (c *haCoordinator) nodesSubscribedToAgent(agentID uuid.UUID) []*agpl.Node { + sockets, ok := c.agentToConnectionSockets[agentID] + if !ok { + return nil + } + + // Publish all nodes that want to connect to the + // desired agent ID. + nodes := make([]*agpl.Node, 0, len(sockets)) + for targetID := range sockets { + node, ok := c.nodes[targetID] + if !ok { + continue + } + nodes = append(nodes, node) + } + + return nodes +} + +func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder) (*agpl.Node, error) { var node agpl.Node err := decoder.Decode(&node) if err != nil { - return xerrors.Errorf("read json: %w", err) + return nil, xerrors.Errorf("read json: %w", err) } c.mutex.Lock() @@ -242,22 +263,14 @@ func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder, f c.nodes[id] = &node - // Don't send the agent back over pubsub if that's where we received it from! 
- if !fromPubsub { - err = c.publishAgentToNodes(id, &node) - if err != nil { - return xerrors.Errorf("publish agent to nodes: %w", err) - } - } - connectionSockets, ok := c.agentToConnectionSockets[id] if !ok { - return nil + return &node, nil } data, err := json.Marshal([]*agpl.Node{&node}) if err != nil { - return xerrors.Errorf("marshal nodes: %w", err) + return nil, xerrors.Errorf("marshal nodes: %w", err) } // Publish the new node to every listening socket. @@ -273,7 +286,7 @@ func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder, f } wg.Wait() - return nil + return &node, nil } func (c *haCoordinator) Close() error { @@ -281,13 +294,26 @@ func (c *haCoordinator) Close() error { return nil } -func (c *haCoordinator) publishNodeToAgent(recipient uuid.UUID, node *agpl.Node) error { - msg, err := c.formatCallMeMaybe(recipient, node) +func (c *haCoordinator) publishNodesToAgent(recipient uuid.UUID, nodes []*agpl.Node) error { + msg, err := c.formatCallMeMaybe(recipient, nodes) + if err != nil { + return xerrors.Errorf("format publish message: %w", err) + } + + err = c.pubsub.Publish("wireguard_peers", msg) + if err != nil { + return xerrors.Errorf("publish message: %w", err) + } + + return nil +} + +func (c *haCoordinator) publishAgentHello(id uuid.UUID) error { + msg, err := c.formatAgentHello(id) if err != nil { return xerrors.Errorf("format publish message: %w", err) } - fmt.Println("publishing callmemaybe", c.id.String()) err = c.pubsub.Publish("wireguard_peers", msg) if err != nil { return xerrors.Errorf("publish message: %w", err) @@ -302,7 +328,6 @@ func (c *haCoordinator) publishAgentToNodes(id uuid.UUID, node *agpl.Node) error return xerrors.Errorf("format publish message: %w", err) } - fmt.Println("publishing agentupdate", c.id.String()) err = c.pubsub.Publish("wireguard_peers", msg) if err != nil { return xerrors.Errorf("publish message: %w", err) @@ -345,19 +370,16 @@ func (c *haCoordinator) runPubsub() error { return } - fmt.Println("got callmemaybe", agentUUID.String()) c.mutex.Lock() defer c.mutex.Unlock() - fmt.Println("process callmemaybe", agentUUID.String()) agentSocket, ok := c.agentSockets[agentUUID] if !ok { - fmt.Println("no socket") return } // We get a single node over pubsub, so turn into an array. 
- _, err = agentSocket.Write(bytes.Join([][]byte{[]byte("["), nodeJSON, []byte("]")}, []byte{})) + _, err = agentSocket.Write(nodeJSON) if err != nil { if errors.Is(err, io.EOF) { return @@ -365,18 +387,37 @@ func (c *haCoordinator) runPubsub() error { c.log.Error(ctx, "send callmemaybe to agent", slog.Error(err)) return } - fmt.Println("success callmemaybe", agentUUID.String()) + + case "agenthello": + agentUUID, err := uuid.ParseBytes(agentID) + if err != nil { + c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + return + } + + c.mutex.Lock() + nodes := c.nodesSubscribedToAgent(agentUUID) + c.mutex.Unlock() + if len(nodes) > 0 { + err := c.publishNodesToAgent(agentUUID, nodes) + if err != nil { + c.log.Error(ctx, "publish nodes to agent", slog.Error(err)) + return + } + } case "agentupdate": agentUUID, err := uuid.ParseBytes(agentID) if err != nil { c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + return } decoder := json.NewDecoder(bytes.NewReader(nodeJSON)) - err = c.hangleAgentUpdate(agentUUID, decoder, true) + _, err = c.hangleAgentUpdate(agentUUID, decoder) if err != nil { c.log.Error(ctx, "handle agent update", slog.Error(err)) + return } default: @@ -396,13 +437,13 @@ func (c *haCoordinator) runPubsub() error { } // format: |callmemaybe|| -func (c *haCoordinator) formatCallMeMaybe(recipient uuid.UUID, node *agpl.Node) ([]byte, error) { +func (c *haCoordinator) formatCallMeMaybe(recipient uuid.UUID, nodes []*agpl.Node) ([]byte, error) { buf := bytes.Buffer{} buf.WriteString(c.id.String() + "|") buf.WriteString("callmemaybe|") buf.WriteString(recipient.String() + "|") - err := json.NewEncoder(&buf).Encode(node) + err := json.NewEncoder(&buf).Encode(nodes) if err != nil { return nil, xerrors.Errorf("encode node: %w", err) } @@ -410,6 +451,17 @@ func (c *haCoordinator) formatCallMeMaybe(recipient uuid.UUID, node *agpl.Node) return buf.Bytes(), nil } +// format: |agenthello|| +func (c *haCoordinator) formatAgentHello(id uuid.UUID) ([]byte, error) { + buf := bytes.Buffer{} + + buf.WriteString(c.id.String() + "|") + buf.WriteString("agenthello|") + buf.WriteString(id.String() + "|") + + return buf.Bytes(), nil +} + // format: |agentupdate|| func (c *haCoordinator) formatAgentUpdate(id uuid.UUID, node *agpl.Node) ([]byte, error) { buf := bytes.Buffer{} From 774c5dafe3cb41a9eca1819531f073fd8ff9c9b9 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 23 Sep 2022 13:29:40 -0500 Subject: [PATCH 03/79] fixup! feat: HA tailnet coordinator --- enterprise/tailnet/coordinator.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index 6999fa7157d48..61b4bd5759ace 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -184,6 +184,7 @@ func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { return xerrors.Errorf("publish agent hello: %w", err) } + // Publish all nodes on this instance that want to connect to this agent. nodes := c.nodesSubscribedToAgent(id) if len(nodes) > 0 { data, err := json.Marshal(nodes) @@ -237,8 +238,6 @@ func (c *haCoordinator) nodesSubscribedToAgent(agentID uuid.UUID) []*agpl.Node { return nil } - // Publish all nodes that want to connect to the - // desired agent ID. 
nodes := make([]*agpl.Node, 0, len(sockets)) for targetID := range sockets { node, ok := c.nodes[targetID] From bd82c5e36c79c080954b38255c8d198a0f0b925f Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 23 Sep 2022 13:30:58 -0500 Subject: [PATCH 04/79] remove printlns --- enterprise/tailnet/coordinator_test.go | 7 ------- 1 file changed, 7 deletions(-) diff --git a/enterprise/tailnet/coordinator_test.go b/enterprise/tailnet/coordinator_test.go index 48fce5bfd0f6f..4889cd1c8ba60 100644 --- a/enterprise/tailnet/coordinator_test.go +++ b/enterprise/tailnet/coordinator_test.go @@ -1,7 +1,6 @@ package tailnet_test import ( - "fmt" "net" "testing" @@ -182,9 +181,7 @@ func TestCoordinatorHA(t *testing.T) { defer agentWS.Close() agentNodeChan := make(chan []*agpl.Node) sendAgentNode, agentErrChan := agpl.ServeCoordinator(agentWS, func(nodes []*agpl.Node) error { - fmt.Println("got agent node") agentNodeChan <- nodes - fmt.Println("sent agent node") return nil }) agentID := uuid.New() @@ -204,9 +201,7 @@ func TestCoordinatorHA(t *testing.T) { defer clientServerWS.Close() clientNodeChan := make(chan []*agpl.Node) sendClientNode, clientErrChan := agpl.ServeCoordinator(clientWS, func(nodes []*agpl.Node) error { - fmt.Println("got client node") clientNodeChan <- nodes - fmt.Println("sent client node") return nil }) clientID := uuid.New() @@ -239,9 +234,7 @@ func TestCoordinatorHA(t *testing.T) { defer agentWS.Close() agentNodeChan = make(chan []*agpl.Node) _, agentErrChan = agpl.ServeCoordinator(agentWS, func(nodes []*agpl.Node) error { - fmt.Println("got agent node") agentNodeChan <- nodes - fmt.Println("sent agent node") return nil }) closeAgentChan = make(chan struct{}) From fbad8d075ddfb47d99c9cd7f2d1696ded78266ed Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 7 Oct 2022 11:49:19 -0500 Subject: [PATCH 05/79] close all connections on coordinator --- codersdk/features.go | 12 ++++--- enterprise/coderd/coderd.go | 8 +++++ enterprise/coderd/license/license.go | 25 +++++++++++---- enterprise/tailnet/coordinator.go | 30 ++++++++++++++++- tailnet/coordinator.go | 48 ++++++++++++++++++++++++++-- 5 files changed, 109 insertions(+), 14 deletions(-) diff --git a/codersdk/features.go b/codersdk/features.go index fe8673ef028fd..6884f44087629 100644 --- a/codersdk/features.go +++ b/codersdk/features.go @@ -15,11 +15,12 @@ const ( ) const ( - FeatureUserLimit = "user_limit" - FeatureAuditLog = "audit_log" - FeatureBrowserOnly = "browser_only" - FeatureSCIM = "scim" - FeatureWorkspaceQuota = "workspace_quota" + FeatureUserLimit = "user_limit" + FeatureAuditLog = "audit_log" + FeatureBrowserOnly = "browser_only" + FeatureSCIM = "scim" + FeatureWorkspaceQuota = "workspace_quota" + FeatureHighAvailability = "high_availability" ) var FeatureNames = []string{ @@ -28,6 +29,7 @@ var FeatureNames = []string{ FeatureBrowserOnly, FeatureSCIM, FeatureWorkspaceQuota, + FeatureHighAvailability, } type Feature struct { diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 11cceef98f0db..a6595e8bd6554 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -170,6 +170,14 @@ func (api *API) updateEntitlements(ctx context.Context) error { api.AGPL.WorkspaceQuotaEnforcer.Store(&enforcer) } + if changed, enabled := featureChanged(codersdk.FeatureHighAvailability); changed { + enforcer := workspacequota.NewNop() + if enabled { + enforcer = NewEnforcer(api.Options.UserWorkspaceQuota) + } + api.AGPL.WorkspaceQuotaEnforcer.Store(&enforcer) + } + api.entitlements = entitlements 
return nil diff --git a/enterprise/coderd/license/license.go b/enterprise/coderd/license/license.go index 55a62eee17eee..84d28dfcccb21 100644 --- a/enterprise/coderd/license/license.go +++ b/enterprise/coderd/license/license.go @@ -17,7 +17,13 @@ import ( ) // Entitlements processes licenses to return whether features are enabled or not. -func Entitlements(ctx context.Context, db database.Store, logger slog.Logger, keys map[string]ed25519.PublicKey, enablements map[string]bool) (codersdk.Entitlements, error) { +func Entitlements( + ctx context.Context, + db database.Store, + logger slog.Logger, + keys map[string]ed25519.PublicKey, + enablements map[string]bool, +) (codersdk.Entitlements, error) { now := time.Now() // Default all entitlements to be disabled. entitlements := codersdk.Entitlements{ @@ -96,6 +102,12 @@ func Entitlements(ctx context.Context, db database.Store, logger slog.Logger, ke Enabled: enablements[codersdk.FeatureWorkspaceQuota], } } + if claims.Features.HighAvailability > 0 { + entitlements.Features[codersdk.FeatureHighAvailability] = codersdk.Feature{ + Entitlement: entitlement, + Enabled: enablements[codersdk.FeatureHighAvailability], + } + } if claims.AllFeatures { allFeatures = true } @@ -165,11 +177,12 @@ var ( ) type Features struct { - UserLimit int64 `json:"user_limit"` - AuditLog int64 `json:"audit_log"` - BrowserOnly int64 `json:"browser_only"` - SCIM int64 `json:"scim"` - WorkspaceQuota int64 `json:"workspace_quota"` + UserLimit int64 `json:"user_limit"` + AuditLog int64 `json:"audit_log"` + BrowserOnly int64 `json:"browser_only"` + SCIM int64 `json:"scim"` + WorkspaceQuota int64 `json:"workspace_quota"` + HighAvailability int64 `json:"high_availability"` } type Claims struct { diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index 61b4bd5759ace..6bf2327507165 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -14,7 +14,6 @@ import ( "golang.org/x/xerrors" "cdr.dev/slog" - "github.com/coder/coder/coderd/database" agpl "github.com/coder/coder/tailnet" ) @@ -288,8 +287,37 @@ func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder) ( return &node, nil } +// Close closes all of the open connections in the coordinator and stops the +// coordinator from accepting new connections. func (c *haCoordinator) Close() error { + c.mutex.Lock() + defer c.mutex.Unlock() + close(c.close) + + wg := sync.WaitGroup{} + + wg.Add(len(c.agentSockets)) + for _, socket := range c.agentSockets { + socket := socket + go func() { + _ = socket.Close() + wg.Done() + }() + } + + for _, connMap := range c.agentToConnectionSockets { + wg.Add(len(connMap)) + for _, socket := range connMap { + socket := socket + go func() { + _ = socket.Close() + wg.Done() + }() + } + } + + wg.Wait() return nil } diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index d79ffa34a5a3b..150a323bcfe52 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -99,6 +99,7 @@ func ServeCoordinator(conn net.Conn, updateNodes func(node []*Node) error) (func // in-memory. func NewMemoryCoordinator() Coordinator { return &memoryCoordinator{ + closed: false, nodes: map[uuid.UUID]*Node{}, agentSockets: map[uuid.UUID]net.Conn{}, agentToConnectionSockets: map[uuid.UUID]map[uuid.UUID]net.Conn{}, @@ -112,7 +113,8 @@ func NewMemoryCoordinator() Coordinator { // This coordinator is incompatible with multiple Coder // replicas as all node data is in-memory. 
type memoryCoordinator struct { - mutex sync.Mutex + mutex sync.Mutex + closed bool // nodes maps agent and connection IDs their respective node. nodes map[uuid.UUID]*Node @@ -135,6 +137,11 @@ func (c *memoryCoordinator) Node(id uuid.UUID) *Node { // with the specified ID. func (c *memoryCoordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error { c.mutex.Lock() + + if c.closed { + return xerrors.New("coordinator is closed") + } + // When a new connection is requested, we update it with the latest // node of the agent. This allows the connection to establish. node, ok := c.nodes[agent] @@ -229,6 +236,11 @@ func (c *memoryCoordinator) handleNextClientMessage(id, agent uuid.UUID, decoder // listens to incoming connections and publishes node updates. func (c *memoryCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { c.mutex.Lock() + + if c.closed { + return xerrors.New("coordinator is closed") + } + sockets, ok := c.agentToConnectionSockets[id] if ok { // Publish all nodes that want to connect to the @@ -320,4 +332,36 @@ func (c *memoryCoordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.D return nil } -func (*memoryCoordinator) Close() error { return nil } +// Close closes all of the open connections in the coordinator and stops the +// coordinator from accepting new connections. +func (c *memoryCoordinator) Close() error { + c.mutex.Lock() + defer c.mutex.Unlock() + + c.closed = true + + wg := sync.WaitGroup{} + + wg.Add(len(c.agentSockets)) + for _, socket := range c.agentSockets { + socket := socket + go func() { + _ = socket.Close() + wg.Done() + }() + } + + for _, connMap := range c.agentToConnectionSockets { + wg.Add(len(connMap)) + for _, socket := range connMap { + socket := socket + go func() { + _ = socket.Close() + wg.Done() + }() + } + } + + wg.Wait() + return nil +} From 46803aa38ba2d4189f687bda248f01bf933bf18e Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 7 Oct 2022 12:22:44 -0500 Subject: [PATCH 06/79] impelement high availability feature --- coderd/coderd.go | 2 ++ coderd/provisionerjobs.go | 2 +- coderd/workspaceagents.go | 16 ++++++++-------- coderd/workspacebuilds.go | 2 +- enterprise/coderd/coderd.go | 24 +++++++++++++++++++++--- 5 files changed, 33 insertions(+), 13 deletions(-) diff --git a/coderd/coderd.go b/coderd/coderd.go index 58686ae66fbcd..f3cdab0caea04 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -158,6 +158,7 @@ func New(options *Options) *API { api.Auditor.Store(&options.Auditor) api.WorkspaceQuotaEnforcer.Store(&options.WorkspaceQuotaEnforcer) api.workspaceAgentCache = wsconncache.New(api.dialWorkspaceAgentTailnet, 0) + api.TailnetCoordinator.Store(&options.TailnetCoordinator) api.derpServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger)) oauthConfigs := &httpmw.OAuth2Configs{ Github: options.GithubOAuth2Config, @@ -525,6 +526,7 @@ type API struct { Auditor atomic.Pointer[audit.Auditor] WorkspaceClientCoordinateOverride atomic.Pointer[func(rw http.ResponseWriter) bool] WorkspaceQuotaEnforcer atomic.Pointer[workspacequota.Enforcer] + TailnetCoordinator atomic.Pointer[tailnet.Coordinator] HTTPAuth *HTTPAuthorizer // APIHandler serves "/api/v2" diff --git a/coderd/provisionerjobs.go b/coderd/provisionerjobs.go index 56a825ea09a3a..68802df04e5ec 100644 --- a/coderd/provisionerjobs.go +++ b/coderd/provisionerjobs.go @@ -270,7 +270,7 @@ func (api *API) provisionerJobResources(rw http.ResponseWriter, r *http.Request, } } - apiAgent, err := convertWorkspaceAgent(api.DERPMap, 
api.TailnetCoordinator, agent, convertApps(dbApps), api.AgentInactiveDisconnectTimeout) + apiAgent, err := convertWorkspaceAgent(api.DERPMap, *api.TailnetCoordinator.Load(), agent, convertApps(dbApps), api.AgentInactiveDisconnectTimeout) if err != nil { httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ Message: "Internal error reading job agent.", diff --git a/coderd/workspaceagents.go b/coderd/workspaceagents.go index 247915db99592..29943c8701ec8 100644 --- a/coderd/workspaceagents.go +++ b/coderd/workspaceagents.go @@ -48,7 +48,7 @@ func (api *API) workspaceAgent(rw http.ResponseWriter, r *http.Request) { }) return } - apiAgent, err := convertWorkspaceAgent(api.DERPMap, api.TailnetCoordinator, workspaceAgent, convertApps(dbApps), api.AgentInactiveDisconnectTimeout) + apiAgent, err := convertWorkspaceAgent(api.DERPMap, *api.TailnetCoordinator.Load(), workspaceAgent, convertApps(dbApps), api.AgentInactiveDisconnectTimeout) if err != nil { httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ Message: "Internal error reading workspace agent.", @@ -77,7 +77,7 @@ func (api *API) workspaceAgentApps(rw http.ResponseWriter, r *http.Request) { func (api *API) workspaceAgentMetadata(rw http.ResponseWriter, r *http.Request) { ctx := r.Context() workspaceAgent := httpmw.WorkspaceAgent(r) - apiAgent, err := convertWorkspaceAgent(api.DERPMap, api.TailnetCoordinator, workspaceAgent, nil, api.AgentInactiveDisconnectTimeout) + apiAgent, err := convertWorkspaceAgent(api.DERPMap, *api.TailnetCoordinator.Load(), workspaceAgent, nil, api.AgentInactiveDisconnectTimeout) if err != nil { httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ Message: "Internal error reading workspace agent.", @@ -97,7 +97,7 @@ func (api *API) workspaceAgentMetadata(rw http.ResponseWriter, r *http.Request) func (api *API) postWorkspaceAgentVersion(rw http.ResponseWriter, r *http.Request) { ctx := r.Context() workspaceAgent := httpmw.WorkspaceAgent(r) - apiAgent, err := convertWorkspaceAgent(api.DERPMap, api.TailnetCoordinator, workspaceAgent, nil, api.AgentInactiveDisconnectTimeout) + apiAgent, err := convertWorkspaceAgent(api.DERPMap, *api.TailnetCoordinator.Load(), workspaceAgent, nil, api.AgentInactiveDisconnectTimeout) if err != nil { httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ Message: "Internal error reading workspace agent.", @@ -151,7 +151,7 @@ func (api *API) workspaceAgentPTY(rw http.ResponseWriter, r *http.Request) { httpapi.ResourceNotFound(rw) return } - apiAgent, err := convertWorkspaceAgent(api.DERPMap, api.TailnetCoordinator, workspaceAgent, nil, api.AgentInactiveDisconnectTimeout) + apiAgent, err := convertWorkspaceAgent(api.DERPMap, *api.TailnetCoordinator.Load(), workspaceAgent, nil, api.AgentInactiveDisconnectTimeout) if err != nil { httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ Message: "Internal error reading workspace agent.", @@ -228,7 +228,7 @@ func (api *API) workspaceAgentListeningPorts(rw http.ResponseWriter, r *http.Req return } - apiAgent, err := convertWorkspaceAgent(api.DERPMap, api.TailnetCoordinator, workspaceAgent, nil, api.AgentInactiveDisconnectTimeout) + apiAgent, err := convertWorkspaceAgent(api.DERPMap, *api.TailnetCoordinator.Load(), workspaceAgent, nil, api.AgentInactiveDisconnectTimeout) if err != nil { httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ Message: "Internal error reading workspace agent.", @@ -322,7 +322,7 @@ func (api *API) 
dialWorkspaceAgentTailnet(r *http.Request, agentID uuid.UUID) (* }) conn.SetNodeCallback(sendNodes) go func() { - err := api.TailnetCoordinator.ServeClient(serverConn, uuid.New(), agentID) + err := (*api.TailnetCoordinator.Load()).ServeClient(serverConn, uuid.New(), agentID) if err != nil { _ = conn.Close() } @@ -460,7 +460,7 @@ func (api *API) workspaceAgentCoordinate(rw http.ResponseWriter, r *http.Request closeChan := make(chan struct{}) go func() { defer close(closeChan) - err := api.TailnetCoordinator.ServeAgent(wsNetConn, workspaceAgent.ID) + err := (*api.TailnetCoordinator.Load()).ServeAgent(wsNetConn, workspaceAgent.ID) if err != nil { _ = conn.Close(websocket.StatusInternalError, err.Error()) return @@ -529,7 +529,7 @@ func (api *API) workspaceAgentClientCoordinate(rw http.ResponseWriter, r *http.R go httpapi.Heartbeat(ctx, conn) defer conn.Close(websocket.StatusNormalClosure, "") - err = api.TailnetCoordinator.ServeClient(websocket.NetConn(ctx, conn, websocket.MessageBinary), uuid.New(), workspaceAgent.ID) + err = (*api.TailnetCoordinator.Load()).ServeClient(websocket.NetConn(ctx, conn, websocket.MessageBinary), uuid.New(), workspaceAgent.ID) if err != nil { _ = conn.Close(websocket.StatusInternalError, err.Error()) return diff --git a/coderd/workspacebuilds.go b/coderd/workspacebuilds.go index 6ece8d379b153..88e162fa7db94 100644 --- a/coderd/workspacebuilds.go +++ b/coderd/workspacebuilds.go @@ -831,7 +831,7 @@ func (api *API) convertWorkspaceBuild( apiAgents := make([]codersdk.WorkspaceAgent, 0) for _, agent := range agents { apps := appsByAgentID[agent.ID] - apiAgent, err := convertWorkspaceAgent(api.DERPMap, api.TailnetCoordinator, agent, convertApps(apps), api.AgentInactiveDisconnectTimeout) + apiAgent, err := convertWorkspaceAgent(api.DERPMap, *api.TailnetCoordinator.Load(), agent, convertApps(apps), api.AgentInactiveDisconnectTimeout) if err != nil { return codersdk.WorkspaceBuild{}, xerrors.Errorf("converting workspace agent: %w", err) } diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index a6595e8bd6554..8eddcf42e325b 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -22,6 +22,8 @@ import ( "github.com/coder/coder/enterprise/audit" "github.com/coder/coder/enterprise/audit/backends" "github.com/coder/coder/enterprise/coderd/license" + "github.com/coder/coder/enterprise/tailnet" + agpltailnet "github.com/coder/coder/tailnet" ) // New constructs an Enterprise coderd API instance. @@ -171,11 +173,27 @@ func (api *API) updateEntitlements(ctx context.Context) error { } if changed, enabled := featureChanged(codersdk.FeatureHighAvailability); changed { - enforcer := workspacequota.NewNop() + coordinator := agpltailnet.NewMemoryCoordinator() if enabled { - enforcer = NewEnforcer(api.Options.UserWorkspaceQuota) + haCoordinator, err := tailnet.NewHACoordinator(api.Logger, api.Pubsub) + if err != nil { + api.Logger.Error(ctx, "unable to setup HA tailnet coordinator", slog.Error(err)) + // If we try to setup the HA coordinator and it fails, nothing + // is actually changing. + changed = false + } else { + coordinator = haCoordinator + } + } + + // Recheck changed in case the HA coordinator failed to set up. 
+ if changed { + oldCoordinator := *api.AGPL.TailnetCoordinator.Swap(&coordinator) + err := oldCoordinator.Close() + if err != nil { + api.Logger.Error(ctx, "unable to setup HA tailnet coordinator", slog.Error(err)) + } } - api.AGPL.WorkspaceQuotaEnforcer.Store(&enforcer) } api.entitlements = entitlements From d38391e9f6ff27351e33017540efcc21f3dcc7d8 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 7 Oct 2022 12:45:29 -0500 Subject: [PATCH 07/79] fixup! impelement high availability feature --- enterprise/coderd/coderd.go | 2 +- .../coderd/coderdenttest/coderdenttest.go | 23 ++++++++++--------- enterprise/coderd/license/license_test.go | 9 ++++---- enterprise/coderd/licenses_test.go | 22 ++++++++++-------- 4 files changed, 30 insertions(+), 26 deletions(-) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 8eddcf42e325b..d52596c547027 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -191,7 +191,7 @@ func (api *API) updateEntitlements(ctx context.Context) error { oldCoordinator := *api.AGPL.TailnetCoordinator.Swap(&coordinator) err := oldCoordinator.Close() if err != nil { - api.Logger.Error(ctx, "unable to setup HA tailnet coordinator", slog.Error(err)) + api.Logger.Error(ctx, "close old tailnet coordinator", slog.Error(err)) } } } diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index 90d09fd5c9c85..a9e08b4aac088 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -85,17 +85,18 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c } type LicenseOptions struct { - AccountType string - AccountID string - Trial bool - AllFeatures bool - GraceAt time.Time - ExpiresAt time.Time - UserLimit int64 - AuditLog bool - BrowserOnly bool - SCIM bool - WorkspaceQuota bool + AccountType string + AccountID string + Trial bool + AllFeatures bool + GraceAt time.Time + ExpiresAt time.Time + UserLimit int64 + AuditLog bool + BrowserOnly bool + SCIM bool + WorkspaceQuota bool + HighAvailability bool } // AddLicense generates a new license with the options provided and inserts it. 
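The repeated *api.TailnetCoordinator.Load() calls above work because the coordinator is held behind an atomic pointer, so the enterprise code can swap in the HA implementation at runtime and every later request picks it up. A minimal sketch of that pattern, assuming TailnetCoordinator is an *atomic.Pointer[tailnet.Coordinator]; the names below are illustrative, only the Load/Swap usage is taken from the patch:

package example

import (
	"sync/atomic"

	"github.com/coder/coder/tailnet"
)

type api struct {
	// Handlers always Load the current coordinator, so a Swap takes
	// effect for every subsequent request without a restart.
	TailnetCoordinator *atomic.Pointer[tailnet.Coordinator]
}

// swapCoordinator installs next and closes the previous coordinator so its
// in-flight agent and client connections are torn down.
func swapCoordinator(a *api, next tailnet.Coordinator) error {
	old := *a.TailnetCoordinator.Swap(&next)
	return old.Close()
}

func currentCoordinator(a *api) tailnet.Coordinator {
	return *a.TailnetCoordinator.Load()
}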
diff --git a/enterprise/coderd/license/license_test.go b/enterprise/coderd/license/license_test.go index 85958fbf4f60d..39d6e05fb50d3 100644 --- a/enterprise/coderd/license/license_test.go +++ b/enterprise/coderd/license/license_test.go @@ -20,10 +20,11 @@ import ( func TestEntitlements(t *testing.T) { t.Parallel() all := map[string]bool{ - codersdk.FeatureAuditLog: true, - codersdk.FeatureBrowserOnly: true, - codersdk.FeatureSCIM: true, - codersdk.FeatureWorkspaceQuota: true, + codersdk.FeatureAuditLog: true, + codersdk.FeatureBrowserOnly: true, + codersdk.FeatureSCIM: true, + codersdk.FeatureWorkspaceQuota: true, + codersdk.FeatureHighAvailability: true, } t.Run("Defaults", func(t *testing.T) { diff --git a/enterprise/coderd/licenses_test.go b/enterprise/coderd/licenses_test.go index 59d36cc9157a6..5b4c89212578d 100644 --- a/enterprise/coderd/licenses_test.go +++ b/enterprise/coderd/licenses_test.go @@ -99,21 +99,23 @@ func TestGetLicense(t *testing.T) { assert.Equal(t, int32(1), licenses[0].ID) assert.Equal(t, "testing", licenses[0].Claims["account_id"]) assert.Equal(t, map[string]interface{}{ - codersdk.FeatureUserLimit: json.Number("0"), - codersdk.FeatureAuditLog: json.Number("1"), - codersdk.FeatureSCIM: json.Number("1"), - codersdk.FeatureBrowserOnly: json.Number("1"), - codersdk.FeatureWorkspaceQuota: json.Number("0"), + codersdk.FeatureUserLimit: json.Number("0"), + codersdk.FeatureAuditLog: json.Number("1"), + codersdk.FeatureSCIM: json.Number("1"), + codersdk.FeatureBrowserOnly: json.Number("1"), + codersdk.FeatureWorkspaceQuota: json.Number("0"), + codersdk.FeatureHighAvailability: json.Number("0"), }, licenses[0].Claims["features"]) assert.Equal(t, int32(2), licenses[1].ID) assert.Equal(t, "testing2", licenses[1].Claims["account_id"]) assert.Equal(t, true, licenses[1].Claims["trial"]) assert.Equal(t, map[string]interface{}{ - codersdk.FeatureUserLimit: json.Number("200"), - codersdk.FeatureAuditLog: json.Number("1"), - codersdk.FeatureSCIM: json.Number("1"), - codersdk.FeatureBrowserOnly: json.Number("1"), - codersdk.FeatureWorkspaceQuota: json.Number("0"), + codersdk.FeatureUserLimit: json.Number("200"), + codersdk.FeatureAuditLog: json.Number("1"), + codersdk.FeatureSCIM: json.Number("1"), + codersdk.FeatureBrowserOnly: json.Number("1"), + codersdk.FeatureWorkspaceQuota: json.Number("0"), + codersdk.FeatureHighAvailability: json.Number("0"), }, licenses[1].Claims["features"]) }) } From a0bcd6464f16483c9524a69137de1dcc7d309095 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 7 Oct 2022 12:53:18 -0500 Subject: [PATCH 08/79] fixup! 
impelement high availability feature --- enterprise/coderd/license/license_test.go | 26 ++++++++++++----------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/enterprise/coderd/license/license_test.go b/enterprise/coderd/license/license_test.go index 39d6e05fb50d3..204c6e7c3f5a2 100644 --- a/enterprise/coderd/license/license_test.go +++ b/enterprise/coderd/license/license_test.go @@ -60,11 +60,12 @@ func TestEntitlements(t *testing.T) { db := databasefake.New() db.InsertLicense(context.Background(), database.InsertLicenseParams{ JWT: coderdenttest.GenerateLicense(t, coderdenttest.LicenseOptions{ - UserLimit: 100, - AuditLog: true, - BrowserOnly: true, - SCIM: true, - WorkspaceQuota: true, + UserLimit: 100, + AuditLog: true, + BrowserOnly: true, + SCIM: true, + WorkspaceQuota: true, + HighAvailability: true, }), Exp: time.Now().Add(time.Hour), }) @@ -81,13 +82,14 @@ func TestEntitlements(t *testing.T) { db := databasefake.New() db.InsertLicense(context.Background(), database.InsertLicenseParams{ JWT: coderdenttest.GenerateLicense(t, coderdenttest.LicenseOptions{ - UserLimit: 100, - AuditLog: true, - BrowserOnly: true, - SCIM: true, - WorkspaceQuota: true, - GraceAt: time.Now().Add(-time.Hour), - ExpiresAt: time.Now().Add(time.Hour), + UserLimit: 100, + AuditLog: true, + BrowserOnly: true, + SCIM: true, + WorkspaceQuota: true, + HighAvailability: true, + GraceAt: time.Now().Add(-time.Hour), + ExpiresAt: time.Now().Add(time.Hour), }), Exp: time.Now().Add(time.Hour), }) From 1f33018bd1c586956c748e65c08e2049fcfdee78 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 7 Oct 2022 13:02:40 -0500 Subject: [PATCH 09/79] fixup! impelement high availability feature --- enterprise/coderd/coderdenttest/coderdenttest.go | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index a9e08b4aac088..2c4250325b567 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -132,6 +132,10 @@ func GenerateLicense(t *testing.T, options LicenseOptions) string { if options.WorkspaceQuota { workspaceQuota = 1 } + highAvailability := int64(0) + if options.HighAvailability { + highAvailability = 1 + } c := &license.Claims{ RegisteredClaims: jwt.RegisteredClaims{ @@ -147,11 +151,12 @@ func GenerateLicense(t *testing.T, options LicenseOptions) string { Version: license.CurrentVersion, AllFeatures: options.AllFeatures, Features: license.Features{ - UserLimit: options.UserLimit, - AuditLog: auditLog, - BrowserOnly: browserOnly, - SCIM: scim, - WorkspaceQuota: workspaceQuota, + UserLimit: options.UserLimit, + AuditLog: auditLog, + BrowserOnly: browserOnly, + SCIM: scim, + WorkspaceQuota: workspaceQuota, + HighAvailability: highAvailability, }, } tok := jwt.NewWithClaims(jwt.SigningMethodEdDSA, c) From b6a507020417a5704d7d1336336cb5b961fa42eb Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 7 Oct 2022 13:11:20 -0500 Subject: [PATCH 10/79] fixup! 
impelement high availability feature --- enterprise/cli/features_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/enterprise/cli/features_test.go b/enterprise/cli/features_test.go index f5e7b1ff3520a..f892182f164fe 100644 --- a/enterprise/cli/features_test.go +++ b/enterprise/cli/features_test.go @@ -57,7 +57,7 @@ func TestFeaturesList(t *testing.T) { var entitlements codersdk.Entitlements err := json.Unmarshal(buf.Bytes(), &entitlements) require.NoError(t, err, "unmarshal JSON output") - assert.Len(t, entitlements.Features, 5) + assert.Len(t, entitlements.Features, 6) assert.Empty(t, entitlements.Warnings) assert.Equal(t, codersdk.EntitlementNotEntitled, entitlements.Features[codersdk.FeatureUserLimit].Entitlement) @@ -69,6 +69,8 @@ func TestFeaturesList(t *testing.T) { entitlements.Features[codersdk.FeatureWorkspaceQuota].Entitlement) assert.Equal(t, codersdk.EntitlementNotEntitled, entitlements.Features[codersdk.FeatureSCIM].Entitlement) + assert.Equal(t, codersdk.EntitlementNotEntitled, + entitlements.Features[codersdk.FeatureHighAvailability].Entitlement) assert.False(t, entitlements.HasLicense) assert.False(t, entitlements.Experimental) }) From 1883430c952607d28b188ca41c51fee7006e2250 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Wed, 12 Oct 2022 04:22:37 +0000 Subject: [PATCH 11/79] Add replicas --- coderd/database/databasefake/databasefake.go | 78 ++++ coderd/database/dump.sql | 16 +- .../migrations/000059_replicas.down.sql | 2 + .../migrations/000059_replicas.up.sql | 26 ++ coderd/database/models.go | 14 + coderd/database/querier.go | 5 + coderd/database/queries.sql.go | 183 ++++++++- coderd/database/queries/replicas.sql | 33 ++ enterprise/replica/replica.go | 371 ++++++++++++++++++ enterprise/replica/replica_test.go | 193 +++++++++ enterprise/tailmesh/tailmesh.go | 32 ++ 11 files changed, 949 insertions(+), 4 deletions(-) create mode 100644 coderd/database/migrations/000059_replicas.down.sql create mode 100644 coderd/database/migrations/000059_replicas.up.sql create mode 100644 coderd/database/queries/replicas.sql create mode 100644 enterprise/replica/replica.go create mode 100644 enterprise/replica/replica_test.go create mode 100644 enterprise/tailmesh/tailmesh.go diff --git a/coderd/database/databasefake/databasefake.go b/coderd/database/databasefake/databasefake.go index 1a2a919925ec2..ae41d9c23620b 100644 --- a/coderd/database/databasefake/databasefake.go +++ b/coderd/database/databasefake/databasefake.go @@ -107,6 +107,7 @@ type data struct { workspaceApps []database.WorkspaceApp workspaces []database.Workspace licenses []database.License + replicas []database.Replica deploymentID string lastLicenseID int32 @@ -3025,3 +3026,80 @@ func (q *fakeQuerier) DeleteGroupByID(_ context.Context, id uuid.UUID) error { return sql.ErrNoRows } + +func (q *fakeQuerier) DeleteReplicasUpdatedBefore(ctx context.Context, before time.Time) error { + q.mutex.Lock() + defer q.mutex.Unlock() + + for i, replica := range q.replicas { + if replica.UpdatedAt.Before(before) { + q.replicas = append(q.replicas[:i], q.replicas[i+1:]...) 
+ } + } + + return nil +} + +func (q *fakeQuerier) InsertReplica(_ context.Context, arg database.InsertReplicaParams) (database.Replica, error) { + q.mutex.Lock() + defer q.mutex.Unlock() + + replica := database.Replica{ + ID: arg.ID, + CreatedAt: arg.CreatedAt, + StartedAt: arg.StartedAt, + UpdatedAt: arg.UpdatedAt, + Hostname: arg.Hostname, + RegionID: arg.RegionID, + RelayAddress: arg.RelayAddress, + Version: arg.Version, + } + q.replicas = append(q.replicas, replica) + return replica, nil +} + +func (q *fakeQuerier) UpdateReplica(_ context.Context, arg database.UpdateReplicaParams) (database.Replica, error) { + q.mutex.Lock() + defer q.mutex.Unlock() + + for index, replica := range q.replicas { + if replica.ID != arg.ID { + continue + } + replica.Hostname = arg.Hostname + replica.StartedAt = arg.StartedAt + replica.StoppedAt = arg.StoppedAt + replica.UpdatedAt = arg.UpdatedAt + replica.RelayAddress = arg.RelayAddress + replica.RegionID = arg.RegionID + replica.Version = arg.Version + replica.Error = arg.Error + q.replicas[index] = replica + return replica, nil + } + return database.Replica{}, sql.ErrNoRows +} + +func (q *fakeQuerier) GetReplicasUpdatedAfter(_ context.Context, updatedAt time.Time) ([]database.Replica, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + replicas := make([]database.Replica, 0) + for _, replica := range q.replicas { + if replica.UpdatedAt.After(updatedAt) && !replica.StoppedAt.Valid { + replicas = append(replicas, replica) + } + } + return replicas, nil +} + +func (q *fakeQuerier) GetReplicaByID(_ context.Context, id uuid.UUID) (database.Replica, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + + for _, replica := range q.replicas { + if replica.ID == id { + return replica, nil + } + } + return database.Replica{}, sql.ErrNoRows +} diff --git a/coderd/database/dump.sql b/coderd/database/dump.sql index eb16074e90525..4b956fb64f10e 100644 --- a/coderd/database/dump.sql +++ b/coderd/database/dump.sql @@ -245,7 +245,8 @@ CREATE TABLE provisioner_daemons ( created_at timestamp with time zone NOT NULL, updated_at timestamp with time zone, name character varying(64) NOT NULL, - provisioners provisioner_type[] NOT NULL + provisioners provisioner_type[] NOT NULL, + replica_id uuid ); CREATE TABLE provisioner_job_logs ( @@ -276,6 +277,19 @@ CREATE TABLE provisioner_jobs ( worker_id uuid ); +CREATE TABLE replicas ( + id uuid NOT NULL, + created_at timestamp with time zone NOT NULL, + started_at timestamp with time zone NOT NULL, + stopped_at timestamp with time zone, + updated_at timestamp with time zone NOT NULL, + hostname text NOT NULL, + region_id integer NOT NULL, + relay_address text NOT NULL, + version text NOT NULL, + error text +); + CREATE TABLE site_configs ( key character varying(256) NOT NULL, value character varying(8192) NOT NULL diff --git a/coderd/database/migrations/000059_replicas.down.sql b/coderd/database/migrations/000059_replicas.down.sql new file mode 100644 index 0000000000000..4cca6615d4213 --- /dev/null +++ b/coderd/database/migrations/000059_replicas.down.sql @@ -0,0 +1,2 @@ +DROP TABLE replicas; +ALTER TABLE provisioner_daemons DROP COLUMN replica_id; diff --git a/coderd/database/migrations/000059_replicas.up.sql b/coderd/database/migrations/000059_replicas.up.sql new file mode 100644 index 0000000000000..a07587f35a234 --- /dev/null +++ b/coderd/database/migrations/000059_replicas.up.sql @@ -0,0 +1,26 @@ +CREATE TABLE IF NOT EXISTS replicas ( + -- A unique identifier for the replica that is stored on disk. 
+ -- For persistent replicas, this will be reused. + -- For ephemeral replicas, this will be a new UUID for each one. + id uuid NOT NULL, + created_at timestamp with time zone NOT NULL, + -- The time the replica was created. + started_at timestamp with time zone NOT NULL, + -- The time the replica was last seen. + stopped_at timestamp with time zone, + -- Updated periodically to ensure the replica is still alive. + updated_at timestamp with time zone NOT NULL, + -- Hostname is the hostname of the replica. + hostname text NOT NULL, + -- Region is the region the replica is in. + -- We only DERP mesh to the same region ID of a running replica. + region_id integer NOT NULL, + -- An address that should be accessible to other replicas. + relay_address text NOT NULL, + -- Version is the Coder version of the replica. + version text NOT NULL, + error text +); + +-- Associates a provisioner daemon with a replica. +ALTER TABLE provisioner_daemons ADD COLUMN replica_id uuid; diff --git a/coderd/database/models.go b/coderd/database/models.go index f669b5e618138..9d73e097bfe0f 100644 --- a/coderd/database/models.go +++ b/coderd/database/models.go @@ -487,6 +487,7 @@ type ProvisionerDaemon struct { UpdatedAt sql.NullTime `db:"updated_at" json:"updated_at"` Name string `db:"name" json:"name"` Provisioners []ProvisionerType `db:"provisioners" json:"provisioners"` + ReplicaID uuid.NullUUID `db:"replica_id" json:"replica_id"` } type ProvisionerJob struct { @@ -517,6 +518,19 @@ type ProvisionerJobLog struct { Output string `db:"output" json:"output"` } +type Replica struct { + ID uuid.UUID `db:"id" json:"id"` + CreatedAt time.Time `db:"created_at" json:"created_at"` + StartedAt time.Time `db:"started_at" json:"started_at"` + StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + Hostname string `db:"hostname" json:"hostname"` + RegionID int32 `db:"region_id" json:"region_id"` + RelayAddress string `db:"relay_address" json:"relay_address"` + Version string `db:"version" json:"version"` + Error sql.NullString `db:"error" json:"error"` +} + type SiteConfig struct { Key string `db:"key" json:"key"` Value string `db:"value" json:"value"` diff --git a/coderd/database/querier.go b/coderd/database/querier.go index b58f6abbccfb8..db789e3399939 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -26,6 +26,7 @@ type sqlcQuerier interface { DeleteLicense(ctx context.Context, id int32) (int32, error) DeleteOldAgentStats(ctx context.Context) error DeleteParameterValueByID(ctx context.Context, id uuid.UUID) error + DeleteReplicasUpdatedBefore(ctx context.Context, updatedAt time.Time) error GetAPIKeyByID(ctx context.Context, id string) (APIKey, error) GetAPIKeysByLoginType(ctx context.Context, loginType LoginType) ([]APIKey, error) GetAPIKeysLastUsedAfter(ctx context.Context, lastUsed time.Time) ([]APIKey, error) @@ -66,6 +67,8 @@ type sqlcQuerier interface { GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) ([]ProvisionerJob, error) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt time.Time) ([]ProvisionerJob, error) GetProvisionerLogsByIDBetween(ctx context.Context, arg GetProvisionerLogsByIDBetweenParams) ([]ProvisionerJobLog, error) + GetReplicaByID(ctx context.Context, id uuid.UUID) (Replica, error) + GetReplicasUpdatedAfter(ctx context.Context, updatedAt time.Time) ([]Replica, error) GetTemplateByID(ctx context.Context, id uuid.UUID) (Template, error) GetTemplateByOrganizationAndName(ctx context.Context, arg 
GetTemplateByOrganizationAndNameParams) (Template, error) GetTemplateDAUs(ctx context.Context, templateID uuid.UUID) ([]GetTemplateDAUsRow, error) @@ -134,6 +137,7 @@ type sqlcQuerier interface { InsertProvisionerDaemon(ctx context.Context, arg InsertProvisionerDaemonParams) (ProvisionerDaemon, error) InsertProvisionerJob(ctx context.Context, arg InsertProvisionerJobParams) (ProvisionerJob, error) InsertProvisionerJobLogs(ctx context.Context, arg InsertProvisionerJobLogsParams) ([]ProvisionerJobLog, error) + InsertReplica(ctx context.Context, arg InsertReplicaParams) (Replica, error) InsertTemplate(ctx context.Context, arg InsertTemplateParams) (Template, error) InsertTemplateVersion(ctx context.Context, arg InsertTemplateVersionParams) (TemplateVersion, error) InsertUser(ctx context.Context, arg InsertUserParams) (User, error) @@ -154,6 +158,7 @@ type sqlcQuerier interface { UpdateProvisionerJobByID(ctx context.Context, arg UpdateProvisionerJobByIDParams) error UpdateProvisionerJobWithCancelByID(ctx context.Context, arg UpdateProvisionerJobWithCancelByIDParams) error UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg UpdateProvisionerJobWithCompleteByIDParams) error + UpdateReplica(ctx context.Context, arg UpdateReplicaParams) (Replica, error) UpdateTemplateActiveVersionByID(ctx context.Context, arg UpdateTemplateActiveVersionByIDParams) error UpdateTemplateDeletedByID(ctx context.Context, arg UpdateTemplateDeletedByIDParams) error UpdateTemplateMetaByID(ctx context.Context, arg UpdateTemplateMetaByIDParams) (Template, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index ba90e102b819a..adfe532446a4e 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -1985,7 +1985,7 @@ func (q *sqlQuerier) ParameterValues(ctx context.Context, arg ParameterValuesPar const getProvisionerDaemonByID = `-- name: GetProvisionerDaemonByID :one SELECT - id, created_at, updated_at, name, provisioners + id, created_at, updated_at, name, provisioners, replica_id FROM provisioner_daemons WHERE @@ -2001,13 +2001,14 @@ func (q *sqlQuerier) GetProvisionerDaemonByID(ctx context.Context, id uuid.UUID) &i.UpdatedAt, &i.Name, pq.Array(&i.Provisioners), + &i.ReplicaID, ) return i, err } const getProvisionerDaemons = `-- name: GetProvisionerDaemons :many SELECT - id, created_at, updated_at, name, provisioners + id, created_at, updated_at, name, provisioners, replica_id FROM provisioner_daemons ` @@ -2027,6 +2028,7 @@ func (q *sqlQuerier) GetProvisionerDaemons(ctx context.Context) ([]ProvisionerDa &i.UpdatedAt, &i.Name, pq.Array(&i.Provisioners), + &i.ReplicaID, ); err != nil { return nil, err } @@ -2050,7 +2052,7 @@ INSERT INTO provisioners ) VALUES - ($1, $2, $3, $4) RETURNING id, created_at, updated_at, name, provisioners + ($1, $2, $3, $4) RETURNING id, created_at, updated_at, name, provisioners, replica_id ` type InsertProvisionerDaemonParams struct { @@ -2074,6 +2076,7 @@ func (q *sqlQuerier) InsertProvisionerDaemon(ctx context.Context, arg InsertProv &i.UpdatedAt, &i.Name, pq.Array(&i.Provisioners), + &i.ReplicaID, ) return i, err } @@ -2531,6 +2534,180 @@ func (q *sqlQuerier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, a return err } +const deleteReplicasUpdatedBefore = `-- name: DeleteReplicasUpdatedBefore :exec +DELETE FROM replicas WHERE updated_at < $1 +` + +func (q *sqlQuerier) DeleteReplicasUpdatedBefore(ctx context.Context, updatedAt time.Time) error { + _, err := q.db.ExecContext(ctx, deleteReplicasUpdatedBefore, 
updatedAt) + return err +} + +const getReplicaByID = `-- name: GetReplicaByID :one +SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, version, error FROM replicas WHERE id = $1 +` + +func (q *sqlQuerier) GetReplicaByID(ctx context.Context, id uuid.UUID) (Replica, error) { + row := q.db.QueryRowContext(ctx, getReplicaByID, id) + var i Replica + err := row.Scan( + &i.ID, + &i.CreatedAt, + &i.StartedAt, + &i.StoppedAt, + &i.UpdatedAt, + &i.Hostname, + &i.RegionID, + &i.RelayAddress, + &i.Version, + &i.Error, + ) + return i, err +} + +const getReplicasUpdatedAfter = `-- name: GetReplicasUpdatedAfter :many +SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, version, error FROM replicas WHERE updated_at > $1 AND stopped_at IS NULL +` + +func (q *sqlQuerier) GetReplicasUpdatedAfter(ctx context.Context, updatedAt time.Time) ([]Replica, error) { + rows, err := q.db.QueryContext(ctx, getReplicasUpdatedAfter, updatedAt) + if err != nil { + return nil, err + } + defer rows.Close() + var items []Replica + for rows.Next() { + var i Replica + if err := rows.Scan( + &i.ID, + &i.CreatedAt, + &i.StartedAt, + &i.StoppedAt, + &i.UpdatedAt, + &i.Hostname, + &i.RegionID, + &i.RelayAddress, + &i.Version, + &i.Error, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const insertReplica = `-- name: InsertReplica :one +INSERT INTO replicas ( + id, + created_at, + started_at, + updated_at, + hostname, + region_id, + relay_address, + version + +) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) RETURNING id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, version, error +` + +type InsertReplicaParams struct { + ID uuid.UUID `db:"id" json:"id"` + CreatedAt time.Time `db:"created_at" json:"created_at"` + StartedAt time.Time `db:"started_at" json:"started_at"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + Hostname string `db:"hostname" json:"hostname"` + RegionID int32 `db:"region_id" json:"region_id"` + RelayAddress string `db:"relay_address" json:"relay_address"` + Version string `db:"version" json:"version"` +} + +func (q *sqlQuerier) InsertReplica(ctx context.Context, arg InsertReplicaParams) (Replica, error) { + row := q.db.QueryRowContext(ctx, insertReplica, + arg.ID, + arg.CreatedAt, + arg.StartedAt, + arg.UpdatedAt, + arg.Hostname, + arg.RegionID, + arg.RelayAddress, + arg.Version, + ) + var i Replica + err := row.Scan( + &i.ID, + &i.CreatedAt, + &i.StartedAt, + &i.StoppedAt, + &i.UpdatedAt, + &i.Hostname, + &i.RegionID, + &i.RelayAddress, + &i.Version, + &i.Error, + ) + return i, err +} + +const updateReplica = `-- name: UpdateReplica :one +UPDATE replicas SET + updated_at = $2, + started_at = $3, + stopped_at = $4, + relay_address = $5, + region_id = $6, + hostname = $7, + version = $8, + error = $9 +WHERE id = $1 RETURNING id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, version, error +` + +type UpdateReplicaParams struct { + ID uuid.UUID `db:"id" json:"id"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + StartedAt time.Time `db:"started_at" json:"started_at"` + StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` + RelayAddress string `db:"relay_address" json:"relay_address"` + RegionID int32 `db:"region_id" json:"region_id"` + Hostname string `db:"hostname" 
json:"hostname"` + Version string `db:"version" json:"version"` + Error sql.NullString `db:"error" json:"error"` +} + +func (q *sqlQuerier) UpdateReplica(ctx context.Context, arg UpdateReplicaParams) (Replica, error) { + row := q.db.QueryRowContext(ctx, updateReplica, + arg.ID, + arg.UpdatedAt, + arg.StartedAt, + arg.StoppedAt, + arg.RelayAddress, + arg.RegionID, + arg.Hostname, + arg.Version, + arg.Error, + ) + var i Replica + err := row.Scan( + &i.ID, + &i.CreatedAt, + &i.StartedAt, + &i.StoppedAt, + &i.UpdatedAt, + &i.Hostname, + &i.RegionID, + &i.RelayAddress, + &i.Version, + &i.Error, + ) + return i, err +} + const getDeploymentID = `-- name: GetDeploymentID :one SELECT value FROM site_configs WHERE key = 'deployment_id' ` diff --git a/coderd/database/queries/replicas.sql b/coderd/database/queries/replicas.sql new file mode 100644 index 0000000000000..a7aa5b0aa1dee --- /dev/null +++ b/coderd/database/queries/replicas.sql @@ -0,0 +1,33 @@ +-- name: GetReplicasUpdatedAfter :many +SELECT * FROM replicas WHERE updated_at > $1 AND stopped_at IS NULL; + +-- name: GetReplicaByID :one +SELECT * FROM replicas WHERE id = $1; + +-- name: InsertReplica :one +INSERT INTO replicas ( + id, + created_at, + started_at, + updated_at, + hostname, + region_id, + relay_address, + version + +) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) RETURNING *; + +-- name: UpdateReplica :one +UPDATE replicas SET + updated_at = $2, + started_at = $3, + stopped_at = $4, + relay_address = $5, + region_id = $6, + hostname = $7, + version = $8, + error = $9 +WHERE id = $1 RETURNING *; + +-- name: DeleteReplicasUpdatedBefore :exec +DELETE FROM replicas WHERE updated_at < $1; diff --git a/enterprise/replica/replica.go b/enterprise/replica/replica.go new file mode 100644 index 0000000000000..ca0c450651e64 --- /dev/null +++ b/enterprise/replica/replica.go @@ -0,0 +1,371 @@ +package replica + +import ( + "context" + "database/sql" + "errors" + "fmt" + "net/http" + "os" + "strings" + "sync" + "time" + + "github.com/google/uuid" + "golang.org/x/xerrors" + + "cdr.dev/slog" + + "github.com/coder/coder/buildinfo" + "github.com/coder/coder/coderd/database" +) + +var ( + PubsubEvent = "replica" +) + +type Options struct { + ID uuid.UUID + UpdateInterval time.Duration + PeerTimeout time.Duration + // Mesh will dial active replicas with the same region ID to ensure + // they are reachable. If not, an error will be updated on the replica. + Mesh bool + RelayAddress string + RegionID int32 +} + +// New registers the replica with the database and periodically updates to ensure +// it's healthy. It contacts all other alive replicas to ensure they are reachable. 
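+// If Options.ID is unset, New panics. A zero PeerTimeout defaults to three
+// seconds and a zero UpdateInterval defaults to five seconds.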
+func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub database.Pubsub, options Options) (*Server, error) { + if options.ID == uuid.Nil { + panic("An ID must be provided!") + } + if options.PeerTimeout == 0 { + options.PeerTimeout = 3 * time.Second + } + if options.UpdateInterval == 0 { + options.UpdateInterval = 5 * time.Second + } + hostname, err := os.Hostname() + if err != nil { + return nil, xerrors.Errorf("get hostname: %w", err) + } + var replica database.Replica + _, err = db.GetReplicaByID(ctx, options.ID) + if err != nil { + if !errors.Is(err, sql.ErrNoRows) { + return nil, xerrors.Errorf("get replica: %w", err) + } + replica, err = db.InsertReplica(ctx, database.InsertReplicaParams{ + ID: options.ID, + CreatedAt: database.Now(), + StartedAt: database.Now(), + UpdatedAt: database.Now(), + Hostname: hostname, + RegionID: options.RegionID, + RelayAddress: options.RelayAddress, + Version: buildinfo.Version(), + }) + if err != nil { + return nil, xerrors.Errorf("insert replica: %w", err) + } + } else { + replica, err = db.UpdateReplica(ctx, database.UpdateReplicaParams{ + ID: options.ID, + UpdatedAt: database.Now(), + StartedAt: database.Now(), + StoppedAt: sql.NullTime{}, + RelayAddress: options.RelayAddress, + RegionID: options.RegionID, + Hostname: hostname, + Version: buildinfo.Version(), + Error: sql.NullString{}, + }) + if err != nil { + return nil, xerrors.Errorf("update replica: %w", err) + } + } + err = pubsub.Publish(PubsubEvent, []byte(options.ID.String())) + if err != nil { + return nil, xerrors.Errorf("publish new replica: %w", err) + } + ctx, cancelFunc := context.WithCancel(ctx) + server := &Server{ + options: &options, + db: db, + pubsub: pubsub, + self: replica, + logger: logger, + closed: make(chan struct{}), + closeCancel: cancelFunc, + } + err = server.run(ctx) + if err != nil { + return nil, xerrors.Errorf("run replica: %w", err) + } + err = server.subscribe(ctx) + if err != nil { + return nil, xerrors.Errorf("subscribe: %w", err) + } + server.closeWait.Add(1) + go server.loop(ctx) + return server, nil +} + +type Server struct { + options *Options + db database.Store + pubsub database.Pubsub + logger slog.Logger + + closeWait sync.WaitGroup + closeMutex sync.Mutex + closed chan (struct{}) + closeCancel context.CancelFunc + + self database.Replica + mutex sync.Mutex + peers []database.Replica + callback func() +} + +// loop runs the replica update sequence on an update interval. +func (s *Server) loop(ctx context.Context) { + defer s.closeWait.Done() + ticker := time.NewTicker(s.options.UpdateInterval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + err := s.run(ctx) + if err != nil && !errors.Is(err, context.Canceled) { + s.logger.Warn(ctx, "run replica update loop", slog.Error(err)) + } + } +} + +// subscribe listens for new replica information! +func (s *Server) subscribe(ctx context.Context) error { + needsUpdate := false + updating := false + updateMutex := sync.Mutex{} + + // This loop will continually update nodes as updates are processed. + // The intent is to always be up to date without spamming the run + // function, so if a new update comes in while one is being processed, + // it will reprocess afterwards. 
+ var update func() + update = func() { + err := s.run(ctx) + if err != nil && !errors.Is(err, context.Canceled) { + s.logger.Error(ctx, "run replica from subscribe", slog.Error(err)) + } + updateMutex.Lock() + if needsUpdate { + needsUpdate = false + updateMutex.Unlock() + update() + return + } + updating = false + updateMutex.Unlock() + } + cancelFunc, err := s.pubsub.Subscribe(PubsubEvent, func(ctx context.Context, message []byte) { + updateMutex.Lock() + defer updateMutex.Unlock() + id, err := uuid.Parse(string(message)) + if err != nil { + return + } + // Don't process updates for ourself! + if id == s.options.ID { + return + } + if updating { + needsUpdate = true + return + } + updating = true + go update() + }) + if err != nil { + return err + } + go func() { + <-ctx.Done() + cancelFunc() + }() + return nil +} + +func (s *Server) run(ctx context.Context) error { + s.closeMutex.Lock() + s.closeWait.Add(1) + s.closeMutex.Unlock() + go func() { + s.closeWait.Done() + }() + // Expect replicas to update once every three times the interval... + // If they don't, assume death! + replicas, err := s.db.GetReplicasUpdatedAfter(ctx, database.Now().Add(-3*s.options.UpdateInterval)) + if err != nil { + return xerrors.Errorf("get replicas: %w", err) + } + + s.mutex.Lock() + s.peers = make([]database.Replica, 0, len(replicas)) + for _, replica := range replicas { + if replica.ID == s.options.ID { + continue + } + s.peers = append(s.peers, replica) + } + s.mutex.Unlock() + + var wg sync.WaitGroup + var mu sync.Mutex + failed := make([]string, 0) + for _, peer := range s.Regional() { + wg.Add(1) + peer := peer + go func() { + defer wg.Done() + req, err := http.NewRequestWithContext(ctx, http.MethodGet, peer.RelayAddress, nil) + if err != nil { + s.logger.Error(ctx, "create http request for relay probe", + slog.F("relay_address", peer.RelayAddress), slog.Error(err)) + return + } + client := http.Client{ + Timeout: s.options.PeerTimeout, + } + res, err := client.Do(req) + if err != nil { + mu.Lock() + failed = append(failed, fmt.Sprintf("relay %s (%s): %s", peer.Hostname, peer.RelayAddress, err)) + mu.Unlock() + return + } + _ = res.Body.Close() + }() + } + wg.Wait() + replicaError := sql.NullString{} + if len(failed) > 0 { + replicaError = sql.NullString{ + Valid: true, + String: fmt.Sprintf("Failed to dial peers: %s", strings.Join(failed, ", ")), + } + } + + replica, err := s.db.UpdateReplica(ctx, database.UpdateReplicaParams{ + ID: s.self.ID, + UpdatedAt: database.Now(), + StartedAt: s.self.StartedAt, + StoppedAt: s.self.StoppedAt, + RelayAddress: s.self.RelayAddress, + RegionID: s.self.RegionID, + Hostname: s.self.Hostname, + Version: s.self.Version, + Error: replicaError, + }) + if err != nil { + return xerrors.Errorf("update replica: %w", err) + } + s.mutex.Lock() + if s.self.Error.String != replica.Error.String { + // Publish an update occurred! + err = s.pubsub.Publish(PubsubEvent, []byte(s.self.ID.String())) + if err != nil { + s.mutex.Unlock() + return xerrors.Errorf("publish replica update: %w", err) + } + } + s.self = replica + if s.callback != nil { + go s.callback() + } + s.mutex.Unlock() + return nil +} + +// Self represents the current replica. +func (s *Server) Self() database.Replica { + s.mutex.Lock() + defer s.mutex.Unlock() + return s.self +} + +// All returns every replica, including itself. 
+func (s *Server) All() []database.Replica { + s.mutex.Lock() + defer s.mutex.Unlock() + return append(s.peers, s.self) +} + +// Regional returns all replicas in the same region excluding itself. +func (s *Server) Regional() []database.Replica { + s.mutex.Lock() + defer s.mutex.Unlock() + replicas := make([]database.Replica, 0) + for _, replica := range s.peers { + if replica.RegionID != s.self.RegionID { + continue + } + replicas = append(replicas, replica) + } + return replicas +} + +// SetCallback sets a function to execute whenever new peers +// are refreshed or updated. +func (s *Server) SetCallback(callback func()) { + s.mutex.Lock() + defer s.mutex.Unlock() + s.callback = callback + // Instantly call the callback to inform replicas! + go callback() +} + +func (s *Server) Close() error { + s.closeMutex.Lock() + select { + case <-s.closed: + s.closeMutex.Unlock() + return nil + default: + } + close(s.closed) + s.closeCancel() + s.closeWait.Wait() + s.closeMutex.Unlock() + + ctx, cancelFunc := context.WithTimeout(context.Background(), 5*time.Second) + defer cancelFunc() + _, err := s.db.UpdateReplica(ctx, database.UpdateReplicaParams{ + ID: s.self.ID, + UpdatedAt: database.Now(), + StartedAt: s.self.StartedAt, + StoppedAt: sql.NullTime{ + Time: database.Now(), + Valid: true, + }, + RelayAddress: s.self.RelayAddress, + RegionID: s.self.RegionID, + Hostname: s.self.Hostname, + Version: s.self.Version, + Error: s.self.Error, + }) + if err != nil { + return xerrors.Errorf("update replica: %w", err) + } + err = s.pubsub.Publish(PubsubEvent, []byte(s.self.ID.String())) + if err != nil { + return xerrors.Errorf("publish replica update: %w", err) + } + return nil +} diff --git a/enterprise/replica/replica_test.go b/enterprise/replica/replica_test.go new file mode 100644 index 0000000000000..74efb3b40470e --- /dev/null +++ b/enterprise/replica/replica_test.go @@ -0,0 +1,193 @@ +package replica_test + +import ( + "context" + "net/http" + "net/http/httptest" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/goleak" + + "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/coderd/database" + "github.com/coder/coder/coderd/database/dbtestutil" + "github.com/coder/coder/enterprise/replica" + "github.com/coder/coder/testutil" +) + +func TestMain(m *testing.M) { + goleak.VerifyTestMain(m) +} + +func TestReplica(t *testing.T) { + t.Parallel() + t.Run("CreateOnNew", func(t *testing.T) { + // This ensures that a new replica is created on New. + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + id := uuid.New() + cancel, err := pubsub.Subscribe(replica.PubsubEvent, func(ctx context.Context, message []byte) { + assert.Equal(t, []byte(id.String()), message) + }) + require.NoError(t, err) + defer cancel() + server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + ID: id, + }) + require.NoError(t, err) + _ = server.Close() + require.NoError(t, err) + }) + t.Run("UpdatesOnNew", func(t *testing.T) { + // This ensures that a replica is updated when it initially connects + // and immediately publishes it's existence! 
+ t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + id := uuid.New() + _, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ + ID: id, + }) + require.NoError(t, err) + cancel, err := pubsub.Subscribe(replica.PubsubEvent, func(ctx context.Context, message []byte) { + assert.Equal(t, []byte(id.String()), message) + }) + require.NoError(t, err) + defer cancel() + server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + ID: id, + }) + require.NoError(t, err) + _ = server.Close() + require.NoError(t, err) + }) + t.Run("ConnectsToPeerReplica", func(t *testing.T) { + // Ensures that the replica reports a successful status for + // accessing all of its peers. + t.Parallel() + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + db, pubsub := dbtestutil.NewDB(t) + peer, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ + ID: uuid.New(), + CreatedAt: database.Now(), + StartedAt: database.Now(), + UpdatedAt: database.Now(), + Hostname: "something", + RelayAddress: srv.URL, + }) + require.NoError(t, err) + server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + ID: uuid.New(), + }) + require.NoError(t, err) + require.Len(t, server.Regional(), 1) + require.Equal(t, peer.ID, server.Regional()[0].ID) + require.False(t, server.Self().Error.Valid) + _ = server.Close() + }) + t.Run("ConnectsToFakePeerWithError", func(t *testing.T) { + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + var count atomic.Int32 + cancel, err := pubsub.Subscribe(replica.PubsubEvent, func(ctx context.Context, message []byte) { + count.Add(1) + }) + require.NoError(t, err) + defer cancel() + peer, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ + ID: uuid.New(), + CreatedAt: database.Now(), + StartedAt: database.Now(), + UpdatedAt: database.Now(), + Hostname: "something", + // Fake address to hit! + RelayAddress: "http://169.254.169.254", + }) + require.NoError(t, err) + server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + ID: uuid.New(), + PeerTimeout: 1 * time.Millisecond, + }) + require.NoError(t, err) + require.Len(t, server.Regional(), 1) + require.Equal(t, peer.ID, server.Regional()[0].ID) + require.True(t, server.Self().Error.Valid) + require.Contains(t, server.Self().Error.String, "Failed to dial peers") + // Once for the initial creation of a replica, and another time for the error. + require.Equal(t, int32(2), count.Load()) + _ = server.Close() + }) + t.Run("RefreshOnPublish", func(t *testing.T) { + // Refresh when a new replica appears! + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + id := uuid.New() + server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + ID: id, + }) + require.NoError(t, err) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + peer, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ + ID: uuid.New(), + RelayAddress: srv.URL, + UpdatedAt: database.Now(), + }) + require.NoError(t, err) + // Publish multiple times to ensure it can handle that case. 
+ err = pubsub.Publish(replica.PubsubEvent, []byte(peer.ID.String())) + require.NoError(t, err) + err = pubsub.Publish(replica.PubsubEvent, []byte(peer.ID.String())) + require.NoError(t, err) + require.Eventually(t, func() bool { + return len(server.Regional()) == 1 + }, testutil.WaitShort, testutil.IntervalFast) + _ = server.Close() + }) + t.Run("TwentyConcurrent", func(t *testing.T) { + // Ensures that twenty concurrent replicas can spawn and all + // discover each other in parallel! + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + logger := slogtest.Make(t, nil) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + var wg sync.WaitGroup + count := 20 + wg.Add(count) + for i := 0; i < count; i++ { + server, err := replica.New(context.Background(), logger, db, pubsub, replica.Options{ + ID: uuid.New(), + RelayAddress: srv.URL, + }) + require.NoError(t, err) + t.Cleanup(func() { + _ = server.Close() + }) + done := false + server.SetCallback(func() { + if len(server.All()) != count { + return + } + if done { + return + } + done = true + wg.Done() + }) + } + wg.Wait() + }) +} diff --git a/enterprise/tailmesh/tailmesh.go b/enterprise/tailmesh/tailmesh.go new file mode 100644 index 0000000000000..46e1c97fffcc9 --- /dev/null +++ b/enterprise/tailmesh/tailmesh.go @@ -0,0 +1,32 @@ +package tailmesh + +import ( + "context" + + "cdr.dev/slog" + "github.com/coder/coder/tailnet" + "tailscale.com/derp" + "tailscale.com/derp/derphttp" +) + +func New(logger slog.Logger, server *derp.Server) *Mesh { + +} + +type Mesh struct { + logger slog.Logger + server *derp.Server + ctx context.Context + + active map[string]context.CancelFunc +} + +func (m *Mesh) SetAddresses(addresses []string) { + for _, address := range addresses { + client, err := derphttp.NewClient(m.server.PrivateKey(), address, tailnet.Logger(m.logger)) + if err != nil { + + } + go client.RunWatchConnectionLoop() + } +} From 7dc968c52313e1dbe485ea5b0b2d9f8edc01c0b3 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Wed, 12 Oct 2022 17:53:00 +0000 Subject: [PATCH 12/79] Add DERP meshing to arbitrary addresses --- enterprise/derpmesh/derpmesh.go | 124 +++++++++++++++++++++++ enterprise/derpmesh/derpmesh_test.go | 146 +++++++++++++++++++++++++++ enterprise/tailmesh/tailmesh.go | 32 ------ 3 files changed, 270 insertions(+), 32 deletions(-) create mode 100644 enterprise/derpmesh/derpmesh.go create mode 100644 enterprise/derpmesh/derpmesh_test.go delete mode 100644 enterprise/tailmesh/tailmesh.go diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go new file mode 100644 index 0000000000000..610fd749132cc --- /dev/null +++ b/enterprise/derpmesh/derpmesh.go @@ -0,0 +1,124 @@ +package derpmesh + +import ( + "context" + "sync" + + "golang.org/x/xerrors" + "tailscale.com/derp" + "tailscale.com/derp/derphttp" + "tailscale.com/types/key" + + "github.com/coder/coder/tailnet" + + "cdr.dev/slog" +) + +func New(logger slog.Logger, server *derp.Server) *Mesh { + return &Mesh{ + logger: logger, + server: server, + ctx: context.Background(), + closed: make(chan struct{}), + active: make(map[string]context.CancelFunc), + } +} + +type Mesh struct { + logger slog.Logger + server *derp.Server + ctx context.Context + + mutex sync.Mutex + closed chan struct{} + active map[string]context.CancelFunc +} + +// SetAddresses performs a diff of the incoming addresses and adds +// or removes DERP clients from the mesh. 
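Taken together with the replica package above, SetAddresses is the hook a caller can drive from the replica callback so the mesh always dials the healthy replicas in its own region. A sketch of that wiring, using the package paths as introduced in this patch; the connection between the two packages is assumed here and is not part of the diff:

package example

import (
	"github.com/coder/coder/enterprise/derpmesh"
	"github.com/coder/coder/enterprise/replica"
)

// meshRegionalReplicas re-points the DERP mesh at every healthy replica in
// the same region whenever the replica set changes.
func meshRegionalReplicas(replicaServer *replica.Server, mesh *derpmesh.Mesh) {
	replicaServer.SetCallback(func() {
		addresses := make([]string, 0)
		for _, peer := range replicaServer.Regional() {
			addresses = append(addresses, peer.RelayAddress)
		}
		mesh.SetAddresses(addresses)
	})
}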
+func (m *Mesh) SetAddresses(addresses []string) { + total := make(map[string]struct{}, 0) + for _, address := range addresses { + total[address] = struct{}{} + added, err := m.addAddress(address) + if err != nil { + m.logger.Error(m.ctx, "failed to add address", slog.F("address", address), slog.Error(err)) + continue + } + if added { + m.logger.Debug(m.ctx, "added mesh address", slog.F("address", address)) + } + } + + m.mutex.Lock() + for address := range m.active { + _, found := total[address] + if found { + continue + } + removed := m.removeAddress(address) + if removed { + m.logger.Debug(m.ctx, "removed mesh address", slog.F("address", address)) + } + } + m.mutex.Unlock() +} + +// addAddress begins meshing with a new address. +// It's expected that this is a full HTTP address with a path. +// e.g. http://127.0.0.1:8080/derp +func (m *Mesh) addAddress(address string) (bool, error) { + m.mutex.Lock() + defer m.mutex.Unlock() + _, isActive := m.active[address] + if isActive { + return false, nil + } + client, err := derphttp.NewClient(m.server.PrivateKey(), address, tailnet.Logger(m.logger)) + if err != nil { + return false, xerrors.Errorf("create derp client: %w", err) + } + client.MeshKey = m.server.MeshKey() + ctx, cancelFunc := context.WithCancel(m.ctx) + closed := make(chan struct{}) + closeFunc := func() { + cancelFunc() + _ = client.Close() + <-closed + } + m.active[address] = closeFunc + go func() { + defer close(closed) + client.RunWatchConnectionLoop(ctx, m.server.PublicKey(), tailnet.Logger(m.logger), func(np key.NodePublic) { + m.server.AddPacketForwarder(np, client) + }, func(np key.NodePublic) { + m.server.RemovePacketForwarder(np, client) + }) + }() + return true, nil +} + +// removeAddress stops meshing with a given address. +func (m *Mesh) removeAddress(address string) bool { + cancelFunc, isActive := m.active[address] + if isActive { + cancelFunc() + } + return isActive +} + +// Close ends all active meshes with the DERP server. +func (m *Mesh) Close() error { + m.mutex.Lock() + defer m.mutex.Unlock() + select { + case <-m.closed: + return nil + default: + } + close(m.closed) + for _, cancelFunc := range m.active { + cancelFunc() + } + return nil +} diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go new file mode 100644 index 0000000000000..313c33da99bad --- /dev/null +++ b/enterprise/derpmesh/derpmesh_test.go @@ -0,0 +1,146 @@ +package derpmesh_test + +import ( + "context" + "errors" + "io" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/goleak" + "tailscale.com/derp" + "tailscale.com/derp/derphttp" + "tailscale.com/types/key" + + "cdr.dev/slog" + "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/enterprise/derpmesh" + "github.com/coder/coder/tailnet" +) + +func TestMain(m *testing.M) { + goleak.VerifyTestMain(m) +} + +func TestDERPMesh(t *testing.T) { + t.Parallel() + t.Run("ExchangeMessages", func(t *testing.T) { + // This tests messages passing through multiple DERP servers. 
+ t.Parallel() + firstServer, firstServerURL := startDERP(t) + defer firstServer.Close() + secondServer, secondServerURL := startDERP(t) + firstMesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), firstServer) + firstMesh.SetAddresses([]string{secondServerURL}) + secondMesh := derpmesh.New(slogtest.Make(t, nil).Named("second").Leveled(slog.LevelDebug), secondServer) + secondMesh.SetAddresses([]string{firstServerURL}) + defer firstMesh.Close() + defer secondMesh.Close() + + first := key.NewNode() + second := key.NewNode() + firstClient, err := derphttp.NewClient(first, secondServerURL, tailnet.Logger(slogtest.Make(t, nil))) + require.NoError(t, err) + secondClient, err := derphttp.NewClient(second, firstServerURL, tailnet.Logger(slogtest.Make(t, nil))) + require.NoError(t, err) + err = secondClient.Connect(context.Background()) + require.NoError(t, err) + + sent := []byte("hello world") + err = firstClient.Send(second.Public(), sent) + require.NoError(t, err) + + got := recvData(t, secondClient) + require.Equal(t, sent, got) + }) + t.Run("RemoveAddress", func(t *testing.T) { + // This tests messages passing through multiple DERP servers. + t.Parallel() + server, serverURL := startDERP(t) + mesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), server) + mesh.SetAddresses([]string{"http://fake.com"}) + // This should trigger a removal... + mesh.SetAddresses([]string{}) + defer mesh.Close() + + first := key.NewNode() + second := key.NewNode() + firstClient, err := derphttp.NewClient(first, serverURL, tailnet.Logger(slogtest.Make(t, nil))) + require.NoError(t, err) + secondClient, err := derphttp.NewClient(second, serverURL, tailnet.Logger(slogtest.Make(t, nil))) + require.NoError(t, err) + err = secondClient.Connect(context.Background()) + require.NoError(t, err) + sent := []byte("hello world") + err = firstClient.Send(second.Public(), sent) + require.NoError(t, err) + got := recvData(t, secondClient) + require.Equal(t, sent, got) + }) + t.Run("TwentyMeshes", func(t *testing.T) { + t.Parallel() + meshes := make([]*derpmesh.Mesh, 0, 20) + serverURLs := make([]string, 0, 20) + for i := 0; i < 20; i++ { + server, url := startDERP(t) + mesh := derpmesh.New(slogtest.Make(t, nil).Named("mesh").Leveled(slog.LevelDebug), server) + t.Cleanup(func() { + _ = server.Close() + _ = mesh.Close() + }) + serverURLs = append(serverURLs, url) + meshes = append(meshes, mesh) + } + for _, mesh := range meshes { + mesh.SetAddresses(serverURLs) + } + + first := key.NewNode() + second := key.NewNode() + firstClient, err := derphttp.NewClient(first, serverURLs[9], tailnet.Logger(slogtest.Make(t, nil))) + require.NoError(t, err) + secondClient, err := derphttp.NewClient(second, serverURLs[16], tailnet.Logger(slogtest.Make(t, nil))) + require.NoError(t, err) + err = secondClient.Connect(context.Background()) + require.NoError(t, err) + + sent := []byte("hello world") + err = firstClient.Send(second.Public(), sent) + require.NoError(t, err) + + got := recvData(t, secondClient) + require.Equal(t, sent, got) + }) +} + +func recvData(t *testing.T, client *derphttp.Client) []byte { + for { + msg, err := client.Recv() + if errors.Is(err, io.EOF) { + return nil + } + assert.NoError(t, err) + t.Logf("derp: %T", msg) + switch msg := msg.(type) { + case derp.ReceivedPacket: + return msg.Data + default: + // Drop all others! 
+ } + } +} + +func startDERP(t *testing.T) (*derp.Server, string) { + logf := tailnet.Logger(slogtest.Make(t, nil)) + d := derp.NewServer(key.NewNode(), logf) + d.SetMeshKey("some-key") + server := httptest.NewUnstartedServer(derphttp.Handler(d)) + server.Start() + t.Cleanup(func() { + _ = d.Close() + }) + t.Cleanup(server.Close) + return d, server.URL +} diff --git a/enterprise/tailmesh/tailmesh.go b/enterprise/tailmesh/tailmesh.go deleted file mode 100644 index 46e1c97fffcc9..0000000000000 --- a/enterprise/tailmesh/tailmesh.go +++ /dev/null @@ -1,32 +0,0 @@ -package tailmesh - -import ( - "context" - - "cdr.dev/slog" - "github.com/coder/coder/tailnet" - "tailscale.com/derp" - "tailscale.com/derp/derphttp" -) - -func New(logger slog.Logger, server *derp.Server) *Mesh { - -} - -type Mesh struct { - logger slog.Logger - server *derp.Server - ctx context.Context - - active map[string]context.CancelFunc -} - -func (m *Mesh) SetAddresses(addresses []string) { - for _, address := range addresses { - client, err := derphttp.NewClient(m.server.PrivateKey(), address, tailnet.Logger(m.logger)) - if err != nil { - - } - go client.RunWatchConnectionLoop() - } -} From 1dcf0d01899a2544cbf0e3a1134b0a0c6d0e4bce Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Wed, 12 Oct 2022 17:55:22 +0000 Subject: [PATCH 13/79] Move packages to highavailability folder --- enterprise/{ => highavailability}/derpmesh/derpmesh.go | 0 enterprise/{ => highavailability}/derpmesh/derpmesh_test.go | 2 +- enterprise/{ => highavailability}/replica/replica.go | 0 enterprise/{ => highavailability}/replica/replica_test.go | 2 +- 4 files changed, 2 insertions(+), 2 deletions(-) rename enterprise/{ => highavailability}/derpmesh/derpmesh.go (100%) rename enterprise/{ => highavailability}/derpmesh/derpmesh_test.go (98%) rename enterprise/{ => highavailability}/replica/replica.go (100%) rename enterprise/{ => highavailability}/replica/replica_test.go (98%) diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/highavailability/derpmesh/derpmesh.go similarity index 100% rename from enterprise/derpmesh/derpmesh.go rename to enterprise/highavailability/derpmesh/derpmesh.go diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/highavailability/derpmesh/derpmesh_test.go similarity index 98% rename from enterprise/derpmesh/derpmesh_test.go rename to enterprise/highavailability/derpmesh/derpmesh_test.go index 313c33da99bad..6e1154fc3d6a8 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/highavailability/derpmesh/derpmesh_test.go @@ -16,7 +16,7 @@ import ( "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" - "github.com/coder/coder/enterprise/derpmesh" + "github.com/coder/coder/enterprise/highavailability/derpmesh" "github.com/coder/coder/tailnet" ) diff --git a/enterprise/replica/replica.go b/enterprise/highavailability/replica/replica.go similarity index 100% rename from enterprise/replica/replica.go rename to enterprise/highavailability/replica/replica.go diff --git a/enterprise/replica/replica_test.go b/enterprise/highavailability/replica/replica_test.go similarity index 98% rename from enterprise/replica/replica_test.go rename to enterprise/highavailability/replica/replica_test.go index 74efb3b40470e..a5bda874ea166 100644 --- a/enterprise/replica/replica_test.go +++ b/enterprise/highavailability/replica/replica_test.go @@ -17,7 +17,7 @@ import ( "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbtestutil" - 
"github.com/coder/coder/enterprise/replica" + "github.com/coder/coder/enterprise/highavailability/replica" "github.com/coder/coder/testutil" ) From 289e13913a8586f2e160d56406e5e2a3e4d47254 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Wed, 12 Oct 2022 18:03:51 +0000 Subject: [PATCH 14/79] Move coordinator to high availability package --- agent/agent_test.go | 2 +- coderd/coderd.go | 2 +- coderd/wsconncache/wsconncache_test.go | 2 +- enterprise/coderd/coderd.go | 8 +++---- .../coordinator.go | 6 +++-- .../coordinator_test.go | 14 +++++------ tailnet/coordinator.go | 23 ++++++++++--------- tailnet/coordinator_test.go | 6 ++--- 8 files changed, 33 insertions(+), 30 deletions(-) rename enterprise/{tailnet => highavailability}/coordinator.go (98%) rename enterprise/{tailnet => highavailability}/coordinator_test.go (92%) diff --git a/agent/agent_test.go b/agent/agent_test.go index 38d70846dfd8b..06a33598b755f 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -560,7 +560,7 @@ func setupAgent(t *testing.T, metadata codersdk.WorkspaceAgentMetadata, ptyTimeo if metadata.DERPMap == nil { metadata.DERPMap = tailnettest.RunDERPAndSTUN(t) } - coordinator := tailnet.NewMemoryCoordinator() + coordinator := tailnet.NewCoordinator() agentID := uuid.New() statsCh := make(chan *codersdk.AgentStats) closer := agent.New(agent.Options{ diff --git a/coderd/coderd.go b/coderd/coderd.go index d6dba70b9f80b..4976b3a58cde2 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -119,7 +119,7 @@ func New(options *Options) *API { options.PrometheusRegistry = prometheus.NewRegistry() } if options.TailnetCoordinator == nil { - options.TailnetCoordinator = tailnet.NewMemoryCoordinator() + options.TailnetCoordinator = tailnet.NewCoordinator() } if options.Auditor == nil { options.Auditor = audit.NewNop() diff --git a/coderd/wsconncache/wsconncache_test.go b/coderd/wsconncache/wsconncache_test.go index 2b5ed06b45784..003d3cddb8b7a 100644 --- a/coderd/wsconncache/wsconncache_test.go +++ b/coderd/wsconncache/wsconncache_test.go @@ -143,7 +143,7 @@ func TestCache(t *testing.T) { func setupAgent(t *testing.T, metadata codersdk.WorkspaceAgentMetadata, ptyTimeout time.Duration) *codersdk.AgentConn { metadata.DERPMap = tailnettest.RunDERPAndSTUN(t) - coordinator := tailnet.NewMemoryCoordinator() + coordinator := tailnet.NewCoordinator() agentID := uuid.New() closer := agent.New(agent.Options{ FetchMetadata: func(ctx context.Context) (codersdk.WorkspaceAgentMetadata, error) { diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index d49a6cd2c8a9d..dfa9c25e4cf77 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -23,8 +23,8 @@ import ( "github.com/coder/coder/enterprise/audit" "github.com/coder/coder/enterprise/audit/backends" "github.com/coder/coder/enterprise/coderd/license" - "github.com/coder/coder/enterprise/tailnet" - agpltailnet "github.com/coder/coder/tailnet" + "github.com/coder/coder/enterprise/highavailability" + "github.com/coder/coder/tailnet" ) // New constructs an Enterprise coderd API instance. 
@@ -206,9 +206,9 @@ func (api *API) updateEntitlements(ctx context.Context) error { } if changed, enabled := featureChanged(codersdk.FeatureHighAvailability); changed { - coordinator := agpltailnet.NewMemoryCoordinator() + coordinator := tailnet.NewCoordinator() if enabled { - haCoordinator, err := tailnet.NewHACoordinator(api.Logger, api.Pubsub) + haCoordinator, err := highavailability.NewCoordinator(api.Logger, api.Pubsub) if err != nil { api.Logger.Error(ctx, "unable to setup HA tailnet coordinator", slog.Error(err)) // If we try to setup the HA coordinator and it fails, nothing diff --git a/enterprise/tailnet/coordinator.go b/enterprise/highavailability/coordinator.go similarity index 98% rename from enterprise/tailnet/coordinator.go rename to enterprise/highavailability/coordinator.go index 6bf2327507165..7c41e47d44f1d 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/highavailability/coordinator.go @@ -1,4 +1,4 @@ -package tailnet +package highavailability import ( "bytes" @@ -18,7 +18,9 @@ import ( agpl "github.com/coder/coder/tailnet" ) -func NewHACoordinator(logger slog.Logger, pubsub database.Pubsub) (agpl.Coordinator, error) { +// NewCoordinator creates a new high availability coordinator +// that uses PostgreSQL pubsub to exchange handshakes. +func NewCoordinator(logger slog.Logger, pubsub database.Pubsub) (agpl.Coordinator, error) { coord := &haCoordinator{ id: uuid.New(), log: logger, diff --git a/enterprise/tailnet/coordinator_test.go b/enterprise/highavailability/coordinator_test.go similarity index 92% rename from enterprise/tailnet/coordinator_test.go rename to enterprise/highavailability/coordinator_test.go index 4889cd1c8ba60..1e86c08f1b1ed 100644 --- a/enterprise/tailnet/coordinator_test.go +++ b/enterprise/highavailability/coordinator_test.go @@ -1,4 +1,4 @@ -package tailnet_test +package highavailability_test import ( "net" @@ -11,7 +11,7 @@ import ( "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/coderd/database" - "github.com/coder/coder/enterprise/tailnet" + "github.com/coder/coder/enterprise/highavailability" agpl "github.com/coder/coder/tailnet" "github.com/coder/coder/testutil" ) @@ -20,7 +20,7 @@ func TestCoordinatorSingle(t *testing.T) { t.Parallel() t.Run("ClientWithoutAgent", func(t *testing.T) { t.Parallel() - coordinator, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + coordinator, err := highavailability.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) require.NoError(t, err) defer coordinator.Close() @@ -48,7 +48,7 @@ func TestCoordinatorSingle(t *testing.T) { t.Run("AgentWithoutClients", func(t *testing.T) { t.Parallel() - coordinator, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + coordinator, err := highavailability.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) require.NoError(t, err) defer coordinator.Close() @@ -76,7 +76,7 @@ func TestCoordinatorSingle(t *testing.T) { t.Run("AgentWithClient", func(t *testing.T) { t.Parallel() - coordinator, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + coordinator, err := highavailability.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) require.NoError(t, err) defer coordinator.Close() @@ -169,11 +169,11 @@ func TestCoordinatorHA(t *testing.T) { pubsub := database.NewPubsubInMemory() - coordinator1, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), pubsub) + coordinator1, err := 
highavailability.NewCoordinator(slogtest.Make(t, nil), pubsub) require.NoError(t, err) defer coordinator1.Close() - coordinator2, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), pubsub) + coordinator2, err := highavailability.NewCoordinator(slogtest.Make(t, nil), pubsub) require.NoError(t, err) defer coordinator2.Close() diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index 150a323bcfe52..96de8d295162e 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -94,11 +94,11 @@ func ServeCoordinator(conn net.Conn, updateNodes func(node []*Node) error) (func }, errChan } -// NewMemoryCoordinator constructs a new in-memory connection coordinator. This +// NewCoordinator constructs a new in-memory connection coordinator. This // coordinator is incompatible with multiple Coder replicas as all node data is // in-memory. -func NewMemoryCoordinator() Coordinator { - return &memoryCoordinator{ +func NewCoordinator() Coordinator { + return &coordinator{ closed: false, nodes: map[uuid.UUID]*Node{}, agentSockets: map[uuid.UUID]net.Conn{}, @@ -106,13 +106,14 @@ func NewMemoryCoordinator() Coordinator { } } -// MemoryCoordinator exchanges nodes with agents to establish connections. +// coordinator exchanges nodes with agents to establish connections entirely in-memory. +// The Enterprise implementation provides this for high-availability. // ┌──────────────────┐ ┌────────────────────┐ ┌───────────────────┐ ┌──────────────────┐ // │tailnet.Coordinate├──►│tailnet.AcceptClient│◄─►│tailnet.AcceptAgent│◄──┤tailnet.Coordinate│ // └──────────────────┘ └────────────────────┘ └───────────────────┘ └──────────────────┘ // This coordinator is incompatible with multiple Coder // replicas as all node data is in-memory. -type memoryCoordinator struct { +type coordinator struct { mutex sync.Mutex closed bool @@ -126,7 +127,7 @@ type memoryCoordinator struct { } // Node returns an in-memory node by ID. -func (c *memoryCoordinator) Node(id uuid.UUID) *Node { +func (c *coordinator) Node(id uuid.UUID) *Node { c.mutex.Lock() defer c.mutex.Unlock() node := c.nodes[id] @@ -135,7 +136,7 @@ func (c *memoryCoordinator) Node(id uuid.UUID) *Node { // ServeClient accepts a WebSocket connection that wants to connect to an agent // with the specified ID. -func (c *memoryCoordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error { +func (c *coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error { c.mutex.Lock() if c.closed { @@ -194,7 +195,7 @@ func (c *memoryCoordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid. } } -func (c *memoryCoordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *json.Decoder) error { +func (c *coordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *json.Decoder) error { var node Node err := decoder.Decode(&node) if err != nil { @@ -234,7 +235,7 @@ func (c *memoryCoordinator) handleNextClientMessage(id, agent uuid.UUID, decoder // ServeAgent accepts a WebSocket connection to an agent that // listens to incoming connections and publishes node updates. 
-func (c *memoryCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { +func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { c.mutex.Lock() if c.closed { @@ -293,7 +294,7 @@ func (c *memoryCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { } } -func (c *memoryCoordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder) error { +func (c *coordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder) error { var node Node err := decoder.Decode(&node) if err != nil { @@ -334,7 +335,7 @@ func (c *memoryCoordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.D // Close closes all of the open connections in the coordinator and stops the // coordinator from accepting new connections. -func (c *memoryCoordinator) Close() error { +func (c *coordinator) Close() error { c.mutex.Lock() defer c.mutex.Unlock() diff --git a/tailnet/coordinator_test.go b/tailnet/coordinator_test.go index e0ed44420ede2..a4a020deadf93 100644 --- a/tailnet/coordinator_test.go +++ b/tailnet/coordinator_test.go @@ -16,7 +16,7 @@ func TestCoordinator(t *testing.T) { t.Parallel() t.Run("ClientWithoutAgent", func(t *testing.T) { t.Parallel() - coordinator := tailnet.NewMemoryCoordinator() + coordinator := tailnet.NewCoordinator() client, server := net.Pipe() sendNode, errChan := tailnet.ServeCoordinator(client, func(node []*tailnet.Node) error { return nil @@ -40,7 +40,7 @@ func TestCoordinator(t *testing.T) { t.Run("AgentWithoutClients", func(t *testing.T) { t.Parallel() - coordinator := tailnet.NewMemoryCoordinator() + coordinator := tailnet.NewCoordinator() client, server := net.Pipe() sendNode, errChan := tailnet.ServeCoordinator(client, func(node []*tailnet.Node) error { return nil @@ -64,7 +64,7 @@ func TestCoordinator(t *testing.T) { t.Run("AgentWithClient", func(t *testing.T) { t.Parallel() - coordinator := tailnet.NewMemoryCoordinator() + coordinator := tailnet.NewCoordinator() agentWS, agentServerWS := net.Pipe() defer agentWS.Close() From 585bc1dfc81996b9967475de7f4249c93b24aa46 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Wed, 12 Oct 2022 22:36:05 +0000 Subject: [PATCH 15/79] Add flags for HA --- cli/config/file.go | 5 ++ cli/deployment/flags.go | 15 +++++ cli/root.go | 2 +- cli/server.go | 4 +- coderd/coderd.go | 8 ++- coderd/coderdtest/coderdtest.go | 17 +++-- codersdk/flags.go | 2 + codersdk/replicas.go | 22 +++++++ enterprise/cli/server.go | 25 +++++++- enterprise/coderd/coderd.go | 64 ++++++++++++++++--- .../coderd/coderdenttest/coderdenttest.go | 7 +- enterprise/coderd/replicas.go | 1 + enterprise/coderd/replicas_test.go | 38 +++++++++++ .../highavailability/derpmesh/derpmesh.go | 1 + .../highavailability/replica/replica.go | 7 +- 15 files changed, 190 insertions(+), 28 deletions(-) create mode 100644 codersdk/replicas.go create mode 100644 enterprise/coderd/replicas.go create mode 100644 enterprise/coderd/replicas_test.go diff --git a/cli/config/file.go b/cli/config/file.go index a98237afed22b..388ce0881f304 100644 --- a/cli/config/file.go +++ b/cli/config/file.go @@ -13,6 +13,11 @@ func (r Root) Session() File { return File(filepath.Join(string(r), "session")) } +// ReplicaID is a unique identifier for the Coder server. 
+func (r Root) ReplicaID() File { + return File(filepath.Join(string(r), "replica_id")) +} + func (r Root) URL() File { return File(filepath.Join(string(r), "url")) } diff --git a/cli/deployment/flags.go b/cli/deployment/flags.go index 3a03bea762b1c..35ae248a0a722 100644 --- a/cli/deployment/flags.go +++ b/cli/deployment/flags.go @@ -85,6 +85,13 @@ func Flags() *codersdk.DeploymentFlags { Description: "Addresses for STUN servers to establish P2P connections. Set empty to disable P2P connections.", Default: []string{"stun.l.google.com:19302"}, }, + DerpServerRelayAddress: &codersdk.StringFlag{ + Name: "DERP Server Relay Address", + Flag: "derp-server-relay-address", + EnvVar: "CODER_DERP_SERVER_RELAY_ADDRESS", + Description: "An HTTP address that is accessible by other replicas to relay DERP traffic. Required for high availability.", + Enterprise: true, + }, DerpConfigURL: &codersdk.StringFlag{ Name: "DERP Config URL", Flag: "derp-config-url", @@ -123,6 +130,14 @@ func Flags() *codersdk.DeploymentFlags { Description: "The bind address to serve pprof.", Default: "127.0.0.1:6060", }, + HighAvailability: &codersdk.BoolFlag{ + Name: "High Availability", + Flag: "high-availability", + EnvVar: "CODER_HIGH_AVAILABILITY", + Description: "Specifies whether high availability is enabled.", + Default: true, + Enterprise: true, + }, CacheDir: &codersdk.StringFlag{ Name: "Cache Directory", Flag: "cache-dir", diff --git a/cli/root.go b/cli/root.go index e7104e64284eb..e29aa534da0a8 100644 --- a/cli/root.go +++ b/cli/root.go @@ -100,7 +100,7 @@ func Core() []*cobra.Command { } func AGPL() []*cobra.Command { - all := append(Core(), Server(deployment.Flags(), func(_ context.Context, o *coderd.Options) (*coderd.API, error) { + all := append(Core(), Server(deployment.Flags(), func(_ context.Context, _ config.Root, o *coderd.Options) (*coderd.API, error) { return coderd.New(o), nil })) return all diff --git a/cli/server.go b/cli/server.go index e3cad09ca27ff..fc5f131da3d7b 100644 --- a/cli/server.go +++ b/cli/server.go @@ -67,7 +67,7 @@ import ( ) // nolint:gocyclo -func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *coderd.Options) (*coderd.API, error)) *cobra.Command { +func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, config.Root, *coderd.Options) (*coderd.API, error)) *cobra.Command { root := &cobra.Command{ Use: "server", Short: "Start a Coder server", @@ -463,7 +463,7 @@ func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *code ), dflags.PromAddress.Value, "prometheus")() } - coderAPI, err := newAPI(ctx, options) + coderAPI, err := newAPI(ctx, config, options) if err != nil { return err } diff --git a/coderd/coderd.go b/coderd/coderd.go index 4976b3a58cde2..57b78520d1b50 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -77,6 +77,7 @@ type Options struct { AutoImportTemplates []AutoImportTemplate TailnetCoordinator tailnet.Coordinator + DERPServer *derp.Server DERPMap *tailcfg.DERPMap MetricsCacheRefreshInterval time.Duration @@ -121,6 +122,9 @@ func New(options *Options) *API { if options.TailnetCoordinator == nil { options.TailnetCoordinator = tailnet.NewCoordinator() } + if options.DERPServer == nil { + options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger)) + } if options.Auditor == nil { options.Auditor = audit.NewNop() } @@ -160,7 +164,6 @@ func New(options *Options) *API { api.WorkspaceQuotaEnforcer.Store(&options.WorkspaceQuotaEnforcer) api.workspaceAgentCache = 
wsconncache.New(api.dialWorkspaceAgentTailnet, 0) api.TailnetCoordinator.Store(&options.TailnetCoordinator) - api.derpServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger)) oauthConfigs := &httpmw.OAuth2Configs{ Github: options.GithubOAuth2Config, OIDC: options.OIDCConfig, @@ -228,7 +231,7 @@ func New(options *Options) *API { r.Route("/%40{user}/{workspace_and_agent}/apps/{workspaceapp}", apps) r.Route("/@{user}/{workspace_and_agent}/apps/{workspaceapp}", apps) r.Route("/derp", func(r chi.Router) { - r.Get("/", derphttp.Handler(api.derpServer).ServeHTTP) + r.Get("/", derphttp.Handler(api.DERPServer).ServeHTTP) // This is used when UDP is blocked, and latency must be checked via HTTP(s). r.Get("/latency-check", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) @@ -540,7 +543,6 @@ type API struct { // RootHandler serves "/" RootHandler chi.Router - derpServer *derp.Server metricsCache *metricscache.Cache siteHandler http.Handler websocketWaitMutex sync.Mutex diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index d7ac4eb14be97..23305cbcbab36 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -81,6 +81,12 @@ type Options struct { MetricsCacheRefreshInterval time.Duration AgentStatsRefreshInterval time.Duration DeploymentFlags *codersdk.DeploymentFlags + + // Overriding the database is heavily discouraged. + // It should only be used in cases where multiple Coder + // test instances are running against the same database. + Database database.Store + Pubsub database.Pubsub } // New constructs a codersdk client connected to an in-memory API instance. @@ -135,13 +141,14 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance close(options.AutobuildStats) }) } - - db, pubsub := dbtestutil.NewDB(t) + if options.Database == nil { + options.Database, options.Pubsub = dbtestutil.NewDB(t) + } ctx, cancelFunc := context.WithCancel(context.Background()) lifecycleExecutor := executor.New( ctx, - db, + options.Database, slogtest.Make(t, nil).Named("autobuild.executor").Leveled(slog.LevelDebug), options.AutobuildTicker, ).WithStatsChannel(options.AutobuildStats) @@ -181,8 +188,8 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance AppHostname: options.AppHostname, Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug), CacheDir: t.TempDir(), - Database: db, - Pubsub: pubsub, + Database: options.Database, + Pubsub: options.Pubsub, Auditor: options.Auditor, AWSCertificates: options.AWSCertificates, diff --git a/codersdk/flags.go b/codersdk/flags.go index 92f02941a57f8..2dd1323a1fddc 100644 --- a/codersdk/flags.go +++ b/codersdk/flags.go @@ -19,6 +19,7 @@ type DeploymentFlags struct { DerpServerRegionCode *StringFlag `json:"derp_server_region_code" typescript:",notnull"` DerpServerRegionName *StringFlag `json:"derp_server_region_name" typescript:",notnull"` DerpServerSTUNAddresses *StringArrayFlag `json:"derp_server_stun_address" typescript:",notnull"` + DerpServerRelayAddress *StringFlag `json:"derp_server_relay_address" typescript:",notnull"` DerpConfigURL *StringFlag `json:"derp_config_url" typescript:",notnull"` DerpConfigPath *StringFlag `json:"derp_config_path" typescript:",notnull"` PromEnabled *BoolFlag `json:"prom_enabled" typescript:",notnull"` @@ -59,6 +60,7 @@ type DeploymentFlags struct { Verbose *BoolFlag `json:"verbose" typescript:",notnull"` AuditLogging *BoolFlag `json:"audit_logging" typescript:",notnull"` BrowserOnly *BoolFlag 
`json:"browser_only" typescript:",notnull"` + HighAvailability *BoolFlag `json:"high_availability" typescript:",notnull"` SCIMAuthHeader *StringFlag `json:"scim_auth_header" typescript:",notnull"` UserWorkspaceQuota *IntFlag `json:"user_workspace_quota" typescript:",notnull"` } diff --git a/codersdk/replicas.go b/codersdk/replicas.go new file mode 100644 index 0000000000000..341b460792ddd --- /dev/null +++ b/codersdk/replicas.go @@ -0,0 +1,22 @@ +package codersdk + +import ( + "time" + + "github.com/google/uuid" +) + +type Replica struct { + // ID is the unique identifier for the replica. + ID uuid.UUID `json:"id"` + // Hostname is the hostname of the replica. + Hostname string `json:"hostname"` + // CreatedAt is when the replica was first seen. + CreatedAt time.Time `json:"created_at"` + // Active determines whether the replica is online. + Active bool `json:"active"` + // RelayAddress is the accessible address to relay DERP connections. + RelayAddress string `json:"relay_address"` + // Error is the error. + Error string `json:"error"` +} diff --git a/enterprise/cli/server.go b/enterprise/cli/server.go index 62af6f2888373..e34bdaccfd342 100644 --- a/enterprise/cli/server.go +++ b/enterprise/cli/server.go @@ -3,8 +3,12 @@ package cli import ( "context" + "github.com/google/uuid" "github.com/spf13/cobra" + "cdr.dev/slog" + + "github.com/coder/coder/cli/config" "github.com/coder/coder/cli/deployment" "github.com/coder/coder/enterprise/coderd" @@ -14,14 +18,29 @@ import ( func server() *cobra.Command { dflags := deployment.Flags() - cmd := agpl.Server(dflags, func(ctx context.Context, options *agplcoderd.Options) (*agplcoderd.API, error) { + cmd := agpl.Server(dflags, func(ctx context.Context, cfg config.Root, options *agplcoderd.Options) (*agplcoderd.API, error) { + replicaIDRaw, err := cfg.ReplicaID().Read() + if err != nil { + replicaIDRaw = uuid.NewString() + } + replicaID, err := uuid.Parse(replicaIDRaw) + if err != nil { + options.Logger.Warn(ctx, "failed to parse replica id", slog.Error(err), slog.F("replica_id", replicaIDRaw)) + replicaID = uuid.New() + } o := &coderd.Options{ AuditLogging: dflags.AuditLogging.Value, BrowserOnly: dflags.BrowserOnly.Value, SCIMAPIKey: []byte(dflags.SCIMAuthHeader.Value), UserWorkspaceQuota: dflags.UserWorkspaceQuota.Value, - RBACEnabled: true, - Options: options, + RBAC: true, + HighAvailability: dflags.HighAvailability.Value, + + ReplicaID: replicaID, + DERPServerRelayAddress: dflags.DerpServerRelayAddress.Value, + DERPServerRegionID: dflags.DerpServerRegionID.Value, + + Options: options, } api, err := coderd.New(ctx, o) if err != nil { diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index dfa9c25e4cf77..f18776ade2c61 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -11,6 +11,7 @@ import ( "github.com/cenkalti/backoff/v4" "github.com/go-chi/chi/v5" + "github.com/google/uuid" "cdr.dev/slog" "github.com/coder/coder/coderd" @@ -24,6 +25,8 @@ import ( "github.com/coder/coder/enterprise/audit/backends" "github.com/coder/coder/enterprise/coderd/license" "github.com/coder/coder/enterprise/highavailability" + "github.com/coder/coder/enterprise/highavailability/derpmesh" + "github.com/coder/coder/enterprise/highavailability/replica" "github.com/coder/coder/tailnet" ) @@ -43,6 +46,7 @@ func New(ctx context.Context, options *Options) (*API, error) { Options: options, cancelEntitlementsLoop: cancelFunc, } + oauthConfigs := &httpmw.OAuth2Configs{ Github: options.GithubOAuth2Config, OIDC: options.OIDCConfig, @@ 
-113,7 +117,27 @@ func New(ctx context.Context, options *Options) (*API, error) { }) } - err := api.updateEntitlements(ctx) + // If high availability is disabled and multiple replicas appear, show an error. + // If high availability is enabled and the built-in DERP is but the DERP relay isn't set, show an error. + // We need to block meshing if high availability is disabled, because the meshing code would just work. + // SetAddresses([]string{}) + + api.AGPL.RootHandler.Route("/replicas", func(r chi.Router) { + + }) + + var err error + api.replica, err = replica.New(ctx, options.Logger, options.Database, options.Pubsub, replica.Options{ + ID: options.ReplicaID, + RelayAddress: options.DERPServerRelayAddress, + RegionID: int32(options.DERPServerRegionID), + }) + if err != nil { + return nil, xerrors.Errorf("initialize replica: %w", err) + } + api.derpMesh = derpmesh.New(options.Logger, api.DERPServer) + + err = api.updateEntitlements(ctx) if err != nil { return nil, xerrors.Errorf("update entitlements: %w", err) } @@ -125,12 +149,18 @@ func New(ctx context.Context, options *Options) (*API, error) { type Options struct { *coderd.Options - RBACEnabled bool + RBAC bool AuditLogging bool // Whether to block non-browser connections. BrowserOnly bool SCIMAPIKey []byte UserWorkspaceQuota int + HighAvailability bool + + // Used for high availability. + DERPServerRelayAddress string + DERPServerRegionID int + ReplicaID uuid.UUID EntitlementsUpdateInterval time.Duration Keys map[string]ed25519.PublicKey @@ -140,6 +170,11 @@ type API struct { AGPL *coderd.API *Options + // Detects multiple Coder replicas running at the same time. + replica *replica.Server + // Meshes DERP connections from multiple replicas. + derpMesh *derpmesh.Mesh + cancelEntitlementsLoop func() entitlementsMu sync.RWMutex entitlements codersdk.Entitlements @@ -147,6 +182,8 @@ type API struct { func (api *API) Close() error { api.cancelEntitlementsLoop() + _ = api.replica.Close() + _ = api.derpMesh.Close() return api.AGPL.Close() } @@ -155,11 +192,12 @@ func (api *API) updateEntitlements(ctx context.Context) error { defer api.entitlementsMu.Unlock() entitlements, err := license.Entitlements(ctx, api.Database, api.Logger, api.Keys, map[string]bool{ - codersdk.FeatureAuditLog: api.AuditLogging, - codersdk.FeatureBrowserOnly: api.BrowserOnly, - codersdk.FeatureSCIM: len(api.SCIMAPIKey) != 0, - codersdk.FeatureWorkspaceQuota: api.UserWorkspaceQuota != 0, - codersdk.FeatureTemplateRBAC: api.RBACEnabled, + codersdk.FeatureAuditLog: api.AuditLogging, + codersdk.FeatureBrowserOnly: api.BrowserOnly, + codersdk.FeatureSCIM: len(api.SCIMAPIKey) != 0, + codersdk.FeatureWorkspaceQuota: api.UserWorkspaceQuota != 0, + codersdk.FeatureHighAvailability: api.HighAvailability, + codersdk.FeatureTemplateRBAC: api.RBAC, }) if err != nil { return err @@ -210,13 +248,23 @@ func (api *API) updateEntitlements(ctx context.Context) error { if enabled { haCoordinator, err := highavailability.NewCoordinator(api.Logger, api.Pubsub) if err != nil { - api.Logger.Error(ctx, "unable to setup HA tailnet coordinator", slog.Error(err)) + api.Logger.Error(ctx, "unable to set up high availability coordinator", slog.Error(err)) // If we try to setup the HA coordinator and it fails, nothing // is actually changing. 
changed = false } else { coordinator = haCoordinator } + + api.replica.SetCallback(func() { + addresses := make([]string, 0) + for _, replica := range api.replica.Regional() { + addresses = append(addresses, replica.RelayAddress) + } + api.derpMesh.SetAddresses(addresses) + }) + } else { + api.derpMesh.SetAddresses([]string{}) } // Recheck changed in case the HA coordinator failed to set up. diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index bc6b0375df638..c5ec2391d97bf 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -9,6 +9,7 @@ import ( "time" "github.com/golang-jwt/jwt/v4" + "github.com/google/uuid" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -62,10 +63,14 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c } srv, cancelFunc, oop := coderdtest.NewOptions(t, options.Options) coderAPI, err := coderd.New(context.Background(), &coderd.Options{ - RBACEnabled: true, + RBAC: true, AuditLogging: options.AuditLogging, BrowserOnly: options.BrowserOnly, SCIMAPIKey: options.SCIMAPIKey, + DERPServerRelayAddress: oop.AccessURL.String(), + DERPServerRegionID: 1, + HighAvailability: true, + ReplicaID: uuid.New(), UserWorkspaceQuota: options.UserWorkspaceQuota, Options: oop, EntitlementsUpdateInterval: options.EntitlementsUpdateInterval, diff --git a/enterprise/coderd/replicas.go b/enterprise/coderd/replicas.go new file mode 100644 index 0000000000000..ddb2b8b672186 --- /dev/null +++ b/enterprise/coderd/replicas.go @@ -0,0 +1 @@ +package coderd diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go new file mode 100644 index 0000000000000..1a5a3ed5f4eee --- /dev/null +++ b/enterprise/coderd/replicas_test.go @@ -0,0 +1,38 @@ +package coderd_test + +import ( + "context" + "fmt" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/coder/coder/coderd/coderdtest" + "github.com/coder/coder/coderd/database/dbtestutil" + "github.com/coder/coder/codersdk" + "github.com/coder/coder/enterprise/coderd/coderdenttest" +) + +func TestReplicas(t *testing.T) { + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + firstClient := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + Database: db, + Pubsub: pubsub, + }, + }) + _ = coderdtest.CreateFirstUser(t, firstClient) + + secondClient := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + Database: db, + Pubsub: pubsub, + }, + }) + secondClient.SessionToken = firstClient.SessionToken + + user, err := secondClient.User(context.Background(), codersdk.Me) + require.NoError(t, err) + fmt.Printf("%+v\n", user) +} diff --git a/enterprise/highavailability/derpmesh/derpmesh.go b/enterprise/highavailability/derpmesh/derpmesh.go index 610fd749132cc..94341079cd43f 100644 --- a/enterprise/highavailability/derpmesh/derpmesh.go +++ b/enterprise/highavailability/derpmesh/derpmesh.go @@ -14,6 +14,7 @@ import ( "cdr.dev/slog" ) +// New constructs a new mesh for DERP servers. 
func New(logger slog.Logger, server *derp.Server) *Mesh { return &Mesh{ logger: logger, diff --git a/enterprise/highavailability/replica/replica.go b/enterprise/highavailability/replica/replica.go index ca0c450651e64..6855b32852e3e 100644 --- a/enterprise/highavailability/replica/replica.go +++ b/enterprise/highavailability/replica/replica.go @@ -28,11 +28,8 @@ type Options struct { ID uuid.UUID UpdateInterval time.Duration PeerTimeout time.Duration - // Mesh will dial active replicas with the same region ID to ensure - // they are reachable. If not, an error will be updated on the replica. - Mesh bool - RelayAddress string - RegionID int32 + RelayAddress string + RegionID int32 } // New registers the replica with the database and periodically updates to ensure From fdb3557f7fe4599c77ccd06689712cc6856a6319 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 13 Oct 2022 00:35:43 +0000 Subject: [PATCH 16/79] Rename to replicasync --- .vscode/settings.json | 1 + enterprise/coderd/coderd.go | 12 ++++----- .../replica.go => replicasync/replicasync.go} | 25 +++++++++--------- .../replicasync_test.go} | 26 +++++++++---------- 4 files changed, 33 insertions(+), 31 deletions(-) rename enterprise/highavailability/{replica/replica.go => replicasync/replicasync.go} (93%) rename enterprise/highavailability/{replica/replica_test.go => replicasync/replicasync_test.go} (80%) diff --git a/.vscode/settings.json b/.vscode/settings.json index e9a32e850c980..f556563596bc0 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -85,6 +85,7 @@ "ptytest", "quickstart", "reconfig", + "replicasync", "retrier", "rpty", "sdkproto", diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index f18776ade2c61..19812ea1f8b42 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -26,7 +26,7 @@ import ( "github.com/coder/coder/enterprise/coderd/license" "github.com/coder/coder/enterprise/highavailability" "github.com/coder/coder/enterprise/highavailability/derpmesh" - "github.com/coder/coder/enterprise/highavailability/replica" + "github.com/coder/coder/enterprise/highavailability/replicasync" "github.com/coder/coder/tailnet" ) @@ -127,7 +127,7 @@ func New(ctx context.Context, options *Options) (*API, error) { }) var err error - api.replica, err = replica.New(ctx, options.Logger, options.Database, options.Pubsub, replica.Options{ + api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, replicasync.Options{ ID: options.ReplicaID, RelayAddress: options.DERPServerRelayAddress, RegionID: int32(options.DERPServerRegionID), @@ -171,7 +171,7 @@ type API struct { *Options // Detects multiple Coder replicas running at the same time. - replica *replica.Server + replicaManager *replicasync.Manager // Meshes DERP connections from multiple replicas. 
derpMesh *derpmesh.Mesh @@ -182,7 +182,7 @@ type API struct { func (api *API) Close() error { api.cancelEntitlementsLoop() - _ = api.replica.Close() + _ = api.replicaManager.Close() _ = api.derpMesh.Close() return api.AGPL.Close() } @@ -256,9 +256,9 @@ func (api *API) updateEntitlements(ctx context.Context) error { coordinator = haCoordinator } - api.replica.SetCallback(func() { + api.replicaManager.SetCallback(func() { addresses := make([]string, 0) - for _, replica := range api.replica.Regional() { + for _, replica := range api.replicaManager.Regional() { addresses = append(addresses, replica.RelayAddress) } api.derpMesh.SetAddresses(addresses) diff --git a/enterprise/highavailability/replica/replica.go b/enterprise/highavailability/replicasync/replicasync.go similarity index 93% rename from enterprise/highavailability/replica/replica.go rename to enterprise/highavailability/replicasync/replicasync.go index 6855b32852e3e..c632f8df2462b 100644 --- a/enterprise/highavailability/replica/replica.go +++ b/enterprise/highavailability/replicasync/replicasync.go @@ -1,4 +1,4 @@ -package replica +package replicasync import ( "context" @@ -34,7 +34,7 @@ type Options struct { // New registers the replica with the database and periodically updates to ensure // it's healthy. It contacts all other alive replicas to ensure they are reachable. -func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub database.Pubsub, options Options) (*Server, error) { +func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub database.Pubsub, options Options) (*Manager, error) { if options.ID == uuid.Nil { panic("An ID must be provided!") } @@ -88,7 +88,7 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data return nil, xerrors.Errorf("publish new replica: %w", err) } ctx, cancelFunc := context.WithCancel(ctx) - server := &Server{ + server := &Manager{ options: &options, db: db, pubsub: pubsub, @@ -110,7 +110,8 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data return server, nil } -type Server struct { +// Manager keeps the replica up to date and in sync with other replicas. +type Manager struct { options *Options db database.Store pubsub database.Pubsub @@ -128,7 +129,7 @@ type Server struct { } // loop runs the replica update sequence on an update interval. -func (s *Server) loop(ctx context.Context) { +func (s *Manager) loop(ctx context.Context) { defer s.closeWait.Done() ticker := time.NewTicker(s.options.UpdateInterval) defer ticker.Stop() @@ -146,7 +147,7 @@ func (s *Server) loop(ctx context.Context) { } // subscribe listens for new replica information! -func (s *Server) subscribe(ctx context.Context) error { +func (s *Manager) subscribe(ctx context.Context) error { needsUpdate := false updating := false updateMutex := sync.Mutex{} @@ -199,7 +200,7 @@ func (s *Server) subscribe(ctx context.Context) error { return nil } -func (s *Server) run(ctx context.Context) error { +func (s *Manager) run(ctx context.Context) error { s.closeMutex.Lock() s.closeWait.Add(1) s.closeMutex.Unlock() @@ -291,21 +292,21 @@ func (s *Server) run(ctx context.Context) error { } // Self represents the current replica. -func (s *Server) Self() database.Replica { +func (s *Manager) Self() database.Replica { s.mutex.Lock() defer s.mutex.Unlock() return s.self } // All returns every replica, including itself. 
-func (s *Server) All() []database.Replica { +func (s *Manager) All() []database.Replica { s.mutex.Lock() defer s.mutex.Unlock() return append(s.peers, s.self) } // Regional returns all replicas in the same region excluding itself. -func (s *Server) Regional() []database.Replica { +func (s *Manager) Regional() []database.Replica { s.mutex.Lock() defer s.mutex.Unlock() replicas := make([]database.Replica, 0) @@ -320,7 +321,7 @@ func (s *Server) Regional() []database.Replica { // SetCallback sets a function to execute whenever new peers // are refreshed or updated. -func (s *Server) SetCallback(callback func()) { +func (s *Manager) SetCallback(callback func()) { s.mutex.Lock() defer s.mutex.Unlock() s.callback = callback @@ -328,7 +329,7 @@ func (s *Server) SetCallback(callback func()) { go callback() } -func (s *Server) Close() error { +func (s *Manager) Close() error { s.closeMutex.Lock() select { case <-s.closed: diff --git a/enterprise/highavailability/replica/replica_test.go b/enterprise/highavailability/replicasync/replicasync_test.go similarity index 80% rename from enterprise/highavailability/replica/replica_test.go rename to enterprise/highavailability/replicasync/replicasync_test.go index a5bda874ea166..f4d800650f939 100644 --- a/enterprise/highavailability/replica/replica_test.go +++ b/enterprise/highavailability/replicasync/replicasync_test.go @@ -1,4 +1,4 @@ -package replica_test +package replicasync_test import ( "context" @@ -17,7 +17,7 @@ import ( "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbtestutil" - "github.com/coder/coder/enterprise/highavailability/replica" + "github.com/coder/coder/enterprise/highavailability/replicasync" "github.com/coder/coder/testutil" ) @@ -32,12 +32,12 @@ func TestReplica(t *testing.T) { t.Parallel() db, pubsub := dbtestutil.NewDB(t) id := uuid.New() - cancel, err := pubsub.Subscribe(replica.PubsubEvent, func(ctx context.Context, message []byte) { + cancel, err := pubsub.Subscribe(replicasync.PubsubEvent, func(ctx context.Context, message []byte) { assert.Equal(t, []byte(id.String()), message) }) require.NoError(t, err) defer cancel() - server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ ID: id, }) require.NoError(t, err) @@ -54,12 +54,12 @@ func TestReplica(t *testing.T) { ID: id, }) require.NoError(t, err) - cancel, err := pubsub.Subscribe(replica.PubsubEvent, func(ctx context.Context, message []byte) { + cancel, err := pubsub.Subscribe(replicasync.PubsubEvent, func(ctx context.Context, message []byte) { assert.Equal(t, []byte(id.String()), message) }) require.NoError(t, err) defer cancel() - server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ ID: id, }) require.NoError(t, err) @@ -84,7 +84,7 @@ func TestReplica(t *testing.T) { RelayAddress: srv.URL, }) require.NoError(t, err) - server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ ID: uuid.New(), }) require.NoError(t, err) @@ -97,7 +97,7 @@ func TestReplica(t *testing.T) { t.Parallel() db, pubsub := dbtestutil.NewDB(t) var count atomic.Int32 - cancel, err 
:= pubsub.Subscribe(replica.PubsubEvent, func(ctx context.Context, message []byte) { + cancel, err := pubsub.Subscribe(replicasync.PubsubEvent, func(ctx context.Context, message []byte) { count.Add(1) }) require.NoError(t, err) @@ -112,7 +112,7 @@ func TestReplica(t *testing.T) { RelayAddress: "http://169.254.169.254", }) require.NoError(t, err) - server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ ID: uuid.New(), PeerTimeout: 1 * time.Millisecond, }) @@ -130,7 +130,7 @@ func TestReplica(t *testing.T) { t.Parallel() db, pubsub := dbtestutil.NewDB(t) id := uuid.New() - server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ ID: id, }) require.NoError(t, err) @@ -145,9 +145,9 @@ func TestReplica(t *testing.T) { }) require.NoError(t, err) // Publish multiple times to ensure it can handle that case. - err = pubsub.Publish(replica.PubsubEvent, []byte(peer.ID.String())) + err = pubsub.Publish(replicasync.PubsubEvent, []byte(peer.ID.String())) require.NoError(t, err) - err = pubsub.Publish(replica.PubsubEvent, []byte(peer.ID.String())) + err = pubsub.Publish(replicasync.PubsubEvent, []byte(peer.ID.String())) require.NoError(t, err) require.Eventually(t, func() bool { return len(server.Regional()) == 1 @@ -168,7 +168,7 @@ func TestReplica(t *testing.T) { count := 20 wg.Add(count) for i := 0; i < count; i++ { - server, err := replica.New(context.Background(), logger, db, pubsub, replica.Options{ + server, err := replicasync.New(context.Background(), logger, db, pubsub, replicasync.Options{ ID: uuid.New(), RelayAddress: srv.URL, }) From 9124b0045cacf0b518a7299f9a8eb48842d2ecc7 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 13 Oct 2022 03:44:47 +0000 Subject: [PATCH 17/79] Denest packages for replicas --- enterprise/coderd/coderd.go | 22 ++++++------------- .../derpmesh/derpmesh.go | 0 .../derpmesh/derpmesh_test.go | 2 +- .../replicasync/replicasync.go | 0 .../replicasync/replicasync_test.go | 2 +- .../coordinator.go | 2 +- .../coordinator_test.go | 14 ++++++------ 7 files changed, 17 insertions(+), 25 deletions(-) rename enterprise/{highavailability => }/derpmesh/derpmesh.go (100%) rename enterprise/{highavailability => }/derpmesh/derpmesh_test.go (98%) rename enterprise/{highavailability => }/replicasync/replicasync.go (100%) rename enterprise/{highavailability => }/replicasync/replicasync_test.go (98%) rename enterprise/{highavailability => tailnet}/coordinator.go (99%) rename enterprise/{highavailability => tailnet}/coordinator_test.go (92%) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 19812ea1f8b42..342b992c8076f 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -24,10 +24,10 @@ import ( "github.com/coder/coder/enterprise/audit" "github.com/coder/coder/enterprise/audit/backends" "github.com/coder/coder/enterprise/coderd/license" - "github.com/coder/coder/enterprise/highavailability" - "github.com/coder/coder/enterprise/highavailability/derpmesh" - "github.com/coder/coder/enterprise/highavailability/replicasync" - "github.com/coder/coder/tailnet" + "github.com/coder/coder/enterprise/derpmesh" + "github.com/coder/coder/enterprise/replicasync" + "github.com/coder/coder/enterprise/tailnet" + agpltailnet 
"github.com/coder/coder/tailnet" ) // New constructs an Enterprise coderd API instance. @@ -117,15 +117,6 @@ func New(ctx context.Context, options *Options) (*API, error) { }) } - // If high availability is disabled and multiple replicas appear, show an error. - // If high availability is enabled and the built-in DERP is but the DERP relay isn't set, show an error. - // We need to block meshing if high availability is disabled, because the meshing code would just work. - // SetAddresses([]string{}) - - api.AGPL.RootHandler.Route("/replicas", func(r chi.Router) { - - }) - var err error api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, replicasync.Options{ ID: options.ReplicaID, @@ -244,9 +235,9 @@ func (api *API) updateEntitlements(ctx context.Context) error { } if changed, enabled := featureChanged(codersdk.FeatureHighAvailability); changed { - coordinator := tailnet.NewCoordinator() + coordinator := agpltailnet.NewCoordinator() if enabled { - haCoordinator, err := highavailability.NewCoordinator(api.Logger, api.Pubsub) + haCoordinator, err := tailnet.NewCoordinator(api.Logger, api.Pubsub) if err != nil { api.Logger.Error(ctx, "unable to set up high availability coordinator", slog.Error(err)) // If we try to setup the HA coordinator and it fails, nothing @@ -265,6 +256,7 @@ func (api *API) updateEntitlements(ctx context.Context) error { }) } else { api.derpMesh.SetAddresses([]string{}) + api.replicaManager.SetCallback(func() {}) } // Recheck changed in case the HA coordinator failed to set up. diff --git a/enterprise/highavailability/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go similarity index 100% rename from enterprise/highavailability/derpmesh/derpmesh.go rename to enterprise/derpmesh/derpmesh.go diff --git a/enterprise/highavailability/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go similarity index 98% rename from enterprise/highavailability/derpmesh/derpmesh_test.go rename to enterprise/derpmesh/derpmesh_test.go index 6e1154fc3d6a8..313c33da99bad 100644 --- a/enterprise/highavailability/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -16,7 +16,7 @@ import ( "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" - "github.com/coder/coder/enterprise/highavailability/derpmesh" + "github.com/coder/coder/enterprise/derpmesh" "github.com/coder/coder/tailnet" ) diff --git a/enterprise/highavailability/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go similarity index 100% rename from enterprise/highavailability/replicasync/replicasync.go rename to enterprise/replicasync/replicasync.go diff --git a/enterprise/highavailability/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go similarity index 98% rename from enterprise/highavailability/replicasync/replicasync_test.go rename to enterprise/replicasync/replicasync_test.go index f4d800650f939..5ce774ea5f29a 100644 --- a/enterprise/highavailability/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -17,7 +17,7 @@ import ( "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbtestutil" - "github.com/coder/coder/enterprise/highavailability/replicasync" + "github.com/coder/coder/enterprise/replicasync" "github.com/coder/coder/testutil" ) diff --git a/enterprise/highavailability/coordinator.go b/enterprise/tailnet/coordinator.go similarity index 99% rename from enterprise/highavailability/coordinator.go rename to 
enterprise/tailnet/coordinator.go index 7c41e47d44f1d..0643f7a259719 100644 --- a/enterprise/highavailability/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -1,4 +1,4 @@ -package highavailability +package tailnet import ( "bytes" diff --git a/enterprise/highavailability/coordinator_test.go b/enterprise/tailnet/coordinator_test.go similarity index 92% rename from enterprise/highavailability/coordinator_test.go rename to enterprise/tailnet/coordinator_test.go index 1e86c08f1b1ed..83fac250b2916 100644 --- a/enterprise/highavailability/coordinator_test.go +++ b/enterprise/tailnet/coordinator_test.go @@ -1,4 +1,4 @@ -package highavailability_test +package tailnet_test import ( "net" @@ -11,7 +11,7 @@ import ( "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/coderd/database" - "github.com/coder/coder/enterprise/highavailability" + "github.com/coder/coder/enterprise/tailnet" agpl "github.com/coder/coder/tailnet" "github.com/coder/coder/testutil" ) @@ -20,7 +20,7 @@ func TestCoordinatorSingle(t *testing.T) { t.Parallel() t.Run("ClientWithoutAgent", func(t *testing.T) { t.Parallel() - coordinator, err := highavailability.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + coordinator, err := tailnet.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) require.NoError(t, err) defer coordinator.Close() @@ -48,7 +48,7 @@ func TestCoordinatorSingle(t *testing.T) { t.Run("AgentWithoutClients", func(t *testing.T) { t.Parallel() - coordinator, err := highavailability.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + coordinator, err := tailnet.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) require.NoError(t, err) defer coordinator.Close() @@ -76,7 +76,7 @@ func TestCoordinatorSingle(t *testing.T) { t.Run("AgentWithClient", func(t *testing.T) { t.Parallel() - coordinator, err := highavailability.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + coordinator, err := tailnet.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) require.NoError(t, err) defer coordinator.Close() @@ -169,11 +169,11 @@ func TestCoordinatorHA(t *testing.T) { pubsub := database.NewPubsubInMemory() - coordinator1, err := highavailability.NewCoordinator(slogtest.Make(t, nil), pubsub) + coordinator1, err := tailnet.NewCoordinator(slogtest.Make(t, nil), pubsub) require.NoError(t, err) defer coordinator1.Close() - coordinator2, err := highavailability.NewCoordinator(slogtest.Make(t, nil), pubsub) + coordinator2, err := tailnet.NewCoordinator(slogtest.Make(t, nil), pubsub) require.NoError(t, err) defer coordinator2.Close() From d5555f6938978c38906aaf4ae6518c290ff0b983 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 13 Oct 2022 17:53:54 +0000 Subject: [PATCH 18/79] Add test for multiple replicas --- coderd/coderd.go | 1 + .../coderd/coderdenttest/coderdenttest.go | 11 ++++---- enterprise/coderd/license/license.go | 11 +++++--- enterprise/coderd/license/license_test.go | 2 +- enterprise/coderd/replicas_test.go | 26 ++++++++++++++----- 5 files changed, 35 insertions(+), 16 deletions(-) diff --git a/coderd/coderd.go b/coderd/coderd.go index 57b78520d1b50..1b4e674de11cb 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -124,6 +124,7 @@ func New(options *Options) *API { } if options.DERPServer == nil { options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger)) + options.DERPServer.SetMeshKey("todo-kyle-change-this") } if options.Auditor == nil { options.Auditor = audit.NewNop() 
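[Editor's note, not part of the patch: a minimal Go sketch of the embedded DERP setup this change relies on, combining the SetMeshKey call above with the /derp route added earlier in the series. The "shared-mesh-key" value, logger, and mux are placeholders for illustration; the intent is that every replica presents the same mesh key so their built-in DERP servers can mesh.]

    d := derp.NewServer(key.NewNode(), tailnet.Logger(logger))
    d.SetMeshKey("shared-mesh-key") // placeholder; all replicas must agree on this value
    mux := http.NewServeMux()
    mux.Handle("/derp", derphttp.Handler(d))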
diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index c5ec2391d97bf..57440ac37082e 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -63,11 +63,12 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c } srv, cancelFunc, oop := coderdtest.NewOptions(t, options.Options) coderAPI, err := coderd.New(context.Background(), &coderd.Options{ - RBAC: true, - AuditLogging: options.AuditLogging, - BrowserOnly: options.BrowserOnly, - SCIMAPIKey: options.SCIMAPIKey, - DERPServerRelayAddress: oop.AccessURL.String(), + RBAC: true, + AuditLogging: options.AuditLogging, + BrowserOnly: options.BrowserOnly, + SCIMAPIKey: options.SCIMAPIKey, + // TODO: Kyle change this before merge! + DERPServerRelayAddress: oop.AccessURL.String() + "/derp", DERPServerRegionID: 1, HighAvailability: true, ReplicaID: uuid.New(), diff --git a/enterprise/coderd/license/license.go b/enterprise/coderd/license/license.go index d7643683a6d2f..43f8b53094c7c 100644 --- a/enterprise/coderd/license/license.go +++ b/enterprise/coderd/license/license.go @@ -153,9 +153,6 @@ func Entitlements( case codersdk.EntitlementNotEntitled: entitlements.Warnings = append(entitlements.Warnings, fmt.Sprintf("%s is enabled but your license is not entitled to this feature.", niceName)) - // Disable the feature and add a warning... - feature.Enabled = false - entitlements.Features[featureName] = feature case codersdk.EntitlementGracePeriod: entitlements.Warnings = append(entitlements.Warnings, fmt.Sprintf("%s is enabled but your license for this feature is expired.", niceName)) @@ -164,6 +161,14 @@ func Entitlements( } } + for _, featureName := range codersdk.FeatureNames { + feature := entitlements.Features[featureName] + if feature.Entitlement == codersdk.EntitlementNotEntitled { + feature.Enabled = false + entitlements.Features[featureName] = feature + } + } + return entitlements, nil } diff --git a/enterprise/coderd/license/license_test.go b/enterprise/coderd/license/license_test.go index e1fbdc6d3d9fa..f1318e26bae47 100644 --- a/enterprise/coderd/license/license_test.go +++ b/enterprise/coderd/license/license_test.go @@ -31,7 +31,7 @@ func TestEntitlements(t *testing.T) { t.Run("Defaults", func(t *testing.T) { t.Parallel() db := databasefake.New() - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, map[string]bool{}) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, all) require.NoError(t, err) require.False(t, entitlements.HasLicense) require.False(t, entitlements.Trial) diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index 1a5a3ed5f4eee..52836a720f623 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -2,14 +2,16 @@ package coderd_test import ( "context" - "fmt" "testing" + "time" "github.com/stretchr/testify/require" + "cdr.dev/slog" + "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/coderd/coderdtest" "github.com/coder/coder/coderd/database/dbtestutil" - "github.com/coder/coder/codersdk" "github.com/coder/coder/enterprise/coderd/coderdenttest" ) @@ -18,11 +20,15 @@ func TestReplicas(t *testing.T) { db, pubsub := dbtestutil.NewDB(t) firstClient := coderdenttest.New(t, &coderdenttest.Options{ Options: &coderdtest.Options{ - Database: db, - Pubsub: pubsub, + IncludeProvisionerDaemon: true, + 
Database: db, + Pubsub: pubsub, }, }) - _ = coderdtest.CreateFirstUser(t, firstClient) + firstUser := coderdtest.CreateFirstUser(t, firstClient) + coderdenttest.AddLicense(t, firstClient, coderdenttest.LicenseOptions{ + HighAvailability: true, + }) secondClient := coderdenttest.New(t, &coderdenttest.Options{ Options: &coderdtest.Options{ @@ -32,7 +38,13 @@ func TestReplicas(t *testing.T) { }) secondClient.SessionToken = firstClient.SessionToken - user, err := secondClient.User(context.Background(), codersdk.Me) + agentID := setupWorkspaceAgent(t, firstClient, firstUser) + conn, err := secondClient.DialWorkspaceAgentTailnet(context.Background(), slogtest.Make(t, nil).Leveled(slog.LevelDebug), agentID) require.NoError(t, err) - fmt.Printf("%+v\n", user) + require.Eventually(t, func() bool { + _, err = conn.Ping() + return err == nil + }, 10*time.Second, 250*time.Millisecond) + + _ = conn.Close() } From 8dfc261c7bb6bfa440fa62389b647414fdc57ddb Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 13 Oct 2022 19:26:36 +0000 Subject: [PATCH 19/79] Fix coordination test --- coderd/coderd.go | 5 ++++- enterprise/coderd/workspaceagents_test.go | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/coderd/coderd.go b/coderd/coderd.go index 1b4e674de11cb..6b4b335161c32 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -558,7 +558,10 @@ func (api *API) Close() error { api.websocketWaitMutex.Unlock() api.metricsCache.Close() - + coordinator := api.TailnetCoordinator.Load() + if coordinator != nil { + _ = (*coordinator).Close() + } return api.workspaceAgentCache.Close() } diff --git a/enterprise/coderd/workspaceagents_test.go b/enterprise/coderd/workspaceagents_test.go index 3bb40b75b00f8..24e24e3f5f540 100644 --- a/enterprise/coderd/workspaceagents_test.go +++ b/enterprise/coderd/workspaceagents_test.go @@ -89,9 +89,9 @@ func setupWorkspaceAgent(t *testing.T, client *codersdk.Client, user codersdk.Cr CoordinatorDialer: agentClient.ListenWorkspaceAgentTailnet, Logger: slogtest.Make(t, nil).Named("agent"), }) - defer func() { + t.Cleanup(func() { _ = agentCloser.Close() - }() + }) resources := coderdtest.AwaitWorkspaceAgents(t, client, workspace.ID) return resources[0].Agents[0].ID } From ff5968bd9c5386dea16ed68ed9684870cf6bb0bd Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 13 Oct 2022 19:46:40 +0000 Subject: [PATCH 20/79] Add HA to the helm chart --- helm/templates/coder.yaml | 12 ++++++++---- helm/values.yaml | 8 ++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/helm/templates/coder.yaml b/helm/templates/coder.yaml index 45f3f6e29a32e..1165251fc885b 100644 --- a/helm/templates/coder.yaml +++ b/helm/templates/coder.yaml @@ -14,10 +14,7 @@ metadata: {{- include "coder.labels" . | nindent 4 }} annotations: {{ toYaml .Values.coder.annotations | nindent 4}} spec: - # NOTE: this is currently not used as coder v2 does not support high - # availability yet. - # replicas: {{ .Values.coder.replicaCount }} - replicas: 1 + replicas: {{ .Values.coder.replicaCount }} selector: matchLabels: {{- include "coder.selectorLabels" . | nindent 6 }} @@ -38,6 +35,13 @@ spec: env: - name: CODER_ADDRESS value: "0.0.0.0:{{ include "coder.port" . }}" + # Used for inter-pod communication with high-availability. + - name: KUBE_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: CODER_DERP_SERVER_RELAY_ADDRESS + value: "{{ include "coder.portName" . }}://$(KUBE_POD_IP):{{ include "coder.port" . }}" {{- include "coder.tlsEnv" . 
| nindent 12 }} {{- with .Values.coder.env -}} {{ toYaml . | nindent 12 }} diff --git a/helm/values.yaml b/helm/values.yaml index cfba214ee6028..3beebdd3fc3b9 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -1,9 +1,9 @@ # coder -- Primary configuration for `coder server`. coder: - # NOTE: this is currently not used as coder v2 does not support high - # availability yet. - # # coder.replicaCount -- The number of Kubernetes deployment replicas. - # replicaCount: 1 + # coder.replicaCount -- The number of Kubernetes deployment replicas. + # This should only be increased if High Availability is enabled. + # This is an Enterprise feature. Contact sales@coder.com. + replicaCount: 1 # coder.image -- The image to use for Coder. image: From 557b390f62d2eee07e0131238c1b25a4e130078d Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 13 Oct 2022 19:49:28 +0000 Subject: [PATCH 21/79] Rename function pointer --- codersdk/replicas.go | 22 ---- enterprise/replicasync/replicasync.go | 148 +++++++++++++------------- 2 files changed, 74 insertions(+), 96 deletions(-) delete mode 100644 codersdk/replicas.go diff --git a/codersdk/replicas.go b/codersdk/replicas.go deleted file mode 100644 index 341b460792ddd..0000000000000 --- a/codersdk/replicas.go +++ /dev/null @@ -1,22 +0,0 @@ -package codersdk - -import ( - "time" - - "github.com/google/uuid" -) - -type Replica struct { - // ID is the unique identifier for the replica. - ID uuid.UUID `json:"id"` - // Hostname is the hostname of the replica. - Hostname string `json:"hostname"` - // CreatedAt is when the replica was first seen. - CreatedAt time.Time `json:"created_at"` - // Active determines whether the replica is online. - Active bool `json:"active"` - // RelayAddress is the accessible address to relay DERP connections. - RelayAddress string `json:"relay_address"` - // Error is the error. - Error string `json:"error"` -} diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index c632f8df2462b..4d6038a694940 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -129,9 +129,9 @@ type Manager struct { } // loop runs the replica update sequence on an update interval. -func (s *Manager) loop(ctx context.Context) { - defer s.closeWait.Done() - ticker := time.NewTicker(s.options.UpdateInterval) +func (m *Manager) loop(ctx context.Context) { + defer m.closeWait.Done() + ticker := time.NewTicker(m.options.UpdateInterval) defer ticker.Stop() for { select { @@ -139,15 +139,15 @@ func (s *Manager) loop(ctx context.Context) { return case <-ticker.C: } - err := s.run(ctx) + err := m.run(ctx) if err != nil && !errors.Is(err, context.Canceled) { - s.logger.Warn(ctx, "run replica update loop", slog.Error(err)) + m.logger.Warn(ctx, "run replica update loop", slog.Error(err)) } } } // subscribe listens for new replica information! -func (s *Manager) subscribe(ctx context.Context) error { +func (m *Manager) subscribe(ctx context.Context) error { needsUpdate := false updating := false updateMutex := sync.Mutex{} @@ -158,9 +158,9 @@ func (s *Manager) subscribe(ctx context.Context) error { // it will reprocess afterwards. 
var update func() update = func() { - err := s.run(ctx) + err := m.run(ctx) if err != nil && !errors.Is(err, context.Canceled) { - s.logger.Error(ctx, "run replica from subscribe", slog.Error(err)) + m.logger.Error(ctx, "run replica from subscribe", slog.Error(err)) } updateMutex.Lock() if needsUpdate { @@ -172,7 +172,7 @@ func (s *Manager) subscribe(ctx context.Context) error { updating = false updateMutex.Unlock() } - cancelFunc, err := s.pubsub.Subscribe(PubsubEvent, func(ctx context.Context, message []byte) { + cancelFunc, err := m.pubsub.Subscribe(PubsubEvent, func(ctx context.Context, message []byte) { updateMutex.Lock() defer updateMutex.Unlock() id, err := uuid.Parse(string(message)) @@ -180,7 +180,7 @@ func (s *Manager) subscribe(ctx context.Context) error { return } // Don't process updates for ourself! - if id == s.options.ID { + if id == m.options.ID { return } if updating { @@ -200,46 +200,46 @@ func (s *Manager) subscribe(ctx context.Context) error { return nil } -func (s *Manager) run(ctx context.Context) error { - s.closeMutex.Lock() - s.closeWait.Add(1) - s.closeMutex.Unlock() +func (m *Manager) run(ctx context.Context) error { + m.closeMutex.Lock() + m.closeWait.Add(1) + m.closeMutex.Unlock() go func() { - s.closeWait.Done() + m.closeWait.Done() }() // Expect replicas to update once every three times the interval... // If they don't, assume death! - replicas, err := s.db.GetReplicasUpdatedAfter(ctx, database.Now().Add(-3*s.options.UpdateInterval)) + replicas, err := m.db.GetReplicasUpdatedAfter(ctx, database.Now().Add(-3*m.options.UpdateInterval)) if err != nil { return xerrors.Errorf("get replicas: %w", err) } - s.mutex.Lock() - s.peers = make([]database.Replica, 0, len(replicas)) + m.mutex.Lock() + m.peers = make([]database.Replica, 0, len(replicas)) for _, replica := range replicas { - if replica.ID == s.options.ID { + if replica.ID == m.options.ID { continue } - s.peers = append(s.peers, replica) + m.peers = append(m.peers, replica) } - s.mutex.Unlock() + m.mutex.Unlock() var wg sync.WaitGroup var mu sync.Mutex failed := make([]string, 0) - for _, peer := range s.Regional() { + for _, peer := range m.Regional() { wg.Add(1) peer := peer go func() { defer wg.Done() req, err := http.NewRequestWithContext(ctx, http.MethodGet, peer.RelayAddress, nil) if err != nil { - s.logger.Error(ctx, "create http request for relay probe", + m.logger.Error(ctx, "create http request for relay probe", slog.F("relay_address", peer.RelayAddress), slog.Error(err)) return } client := http.Client{ - Timeout: s.options.PeerTimeout, + Timeout: m.options.PeerTimeout, } res, err := client.Do(req) if err != nil { @@ -260,58 +260,58 @@ func (s *Manager) run(ctx context.Context) error { } } - replica, err := s.db.UpdateReplica(ctx, database.UpdateReplicaParams{ - ID: s.self.ID, + replica, err := m.db.UpdateReplica(ctx, database.UpdateReplicaParams{ + ID: m.self.ID, UpdatedAt: database.Now(), - StartedAt: s.self.StartedAt, - StoppedAt: s.self.StoppedAt, - RelayAddress: s.self.RelayAddress, - RegionID: s.self.RegionID, - Hostname: s.self.Hostname, - Version: s.self.Version, + StartedAt: m.self.StartedAt, + StoppedAt: m.self.StoppedAt, + RelayAddress: m.self.RelayAddress, + RegionID: m.self.RegionID, + Hostname: m.self.Hostname, + Version: m.self.Version, Error: replicaError, }) if err != nil { return xerrors.Errorf("update replica: %w", err) } - s.mutex.Lock() - if s.self.Error.String != replica.Error.String { + m.mutex.Lock() + if m.self.Error.String != replica.Error.String { // Publish an update 
occurred! - err = s.pubsub.Publish(PubsubEvent, []byte(s.self.ID.String())) + err = m.pubsub.Publish(PubsubEvent, []byte(m.self.ID.String())) if err != nil { - s.mutex.Unlock() + m.mutex.Unlock() return xerrors.Errorf("publish replica update: %w", err) } } - s.self = replica - if s.callback != nil { - go s.callback() + m.self = replica + if m.callback != nil { + go m.callback() } - s.mutex.Unlock() + m.mutex.Unlock() return nil } // Self represents the current replica. -func (s *Manager) Self() database.Replica { - s.mutex.Lock() - defer s.mutex.Unlock() - return s.self +func (m *Manager) Self() database.Replica { + m.mutex.Lock() + defer m.mutex.Unlock() + return m.self } // All returns every replica, including itself. -func (s *Manager) All() []database.Replica { - s.mutex.Lock() - defer s.mutex.Unlock() - return append(s.peers, s.self) +func (m *Manager) All() []database.Replica { + m.mutex.Lock() + defer m.mutex.Unlock() + return append(m.peers, m.self) } // Regional returns all replicas in the same region excluding itself. -func (s *Manager) Regional() []database.Replica { - s.mutex.Lock() - defer s.mutex.Unlock() +func (m *Manager) Regional() []database.Replica { + m.mutex.Lock() + defer m.mutex.Unlock() replicas := make([]database.Replica, 0) - for _, replica := range s.peers { - if replica.RegionID != s.self.RegionID { + for _, replica := range m.peers { + if replica.RegionID != m.self.RegionID { continue } replicas = append(replicas, replica) @@ -321,47 +321,47 @@ func (s *Manager) Regional() []database.Replica { // SetCallback sets a function to execute whenever new peers // are refreshed or updated. -func (s *Manager) SetCallback(callback func()) { - s.mutex.Lock() - defer s.mutex.Unlock() - s.callback = callback +func (m *Manager) SetCallback(callback func()) { + m.mutex.Lock() + defer m.mutex.Unlock() + m.callback = callback // Instantly call the callback to inform replicas! 
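    // The callback is started on its own goroutine, which keeps it from running
    // while m.mutex is held; callbacks such as the DERP mesh address updater call
    // back into Manager methods like Regional(), which take the same mutex.
    // (Descriptive note inferred from the surrounding code.)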
go callback() } -func (s *Manager) Close() error { - s.closeMutex.Lock() +func (m *Manager) Close() error { + m.closeMutex.Lock() select { - case <-s.closed: - s.closeMutex.Unlock() + case <-m.closed: + m.closeMutex.Unlock() return nil default: } - close(s.closed) - s.closeCancel() - s.closeWait.Wait() - s.closeMutex.Unlock() + close(m.closed) + m.closeCancel() + m.closeWait.Wait() + m.closeMutex.Unlock() ctx, cancelFunc := context.WithTimeout(context.Background(), 5*time.Second) defer cancelFunc() - _, err := s.db.UpdateReplica(ctx, database.UpdateReplicaParams{ - ID: s.self.ID, + _, err := m.db.UpdateReplica(ctx, database.UpdateReplicaParams{ + ID: m.self.ID, UpdatedAt: database.Now(), - StartedAt: s.self.StartedAt, + StartedAt: m.self.StartedAt, StoppedAt: sql.NullTime{ Time: database.Now(), Valid: true, }, - RelayAddress: s.self.RelayAddress, - RegionID: s.self.RegionID, - Hostname: s.self.Hostname, - Version: s.self.Version, - Error: s.self.Error, + RelayAddress: m.self.RelayAddress, + RegionID: m.self.RegionID, + Hostname: m.self.Hostname, + Version: m.self.Version, + Error: m.self.Error, }) if err != nil { return xerrors.Errorf("update replica: %w", err) } - err = s.pubsub.Publish(PubsubEvent, []byte(s.self.ID.String())) + err = m.pubsub.Publish(PubsubEvent, []byte(m.self.ID.String())) if err != nil { return xerrors.Errorf("publish replica update: %w", err) } From 186a5e2623d3aa57c7816a0fd6d9ff917e531298 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 13 Oct 2022 20:27:32 +0000 Subject: [PATCH 22/79] Add warnings for HA --- .vscode/settings.json | 2 + cli/deployment/flags.go | 8 --- codersdk/deployment.go | 26 +++++++ codersdk/flags.go | 1 - enterprise/cli/server.go | 30 +++++--- enterprise/coderd/coderd.go | 5 +- .../coderd/coderdenttest/coderdenttest.go | 12 ++-- enterprise/coderd/license/license.go | 23 ++++++ enterprise/coderd/license/license_test.go | 72 ++++++++++++++++--- enterprise/derpmesh/derpmesh.go | 13 ++++ 10 files changed, 156 insertions(+), 36 deletions(-) create mode 100644 codersdk/deployment.go diff --git a/.vscode/settings.json b/.vscode/settings.json index f556563596bc0..2e6ff3d23704c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -19,6 +19,7 @@ "derphttp", "derpmap", "devel", + "dflags", "drpc", "drpcconn", "drpcmux", @@ -88,6 +89,7 @@ "replicasync", "retrier", "rpty", + "SCIM", "sdkproto", "sdktrace", "Signup", diff --git a/cli/deployment/flags.go b/cli/deployment/flags.go index 35ae248a0a722..8c6608b552586 100644 --- a/cli/deployment/flags.go +++ b/cli/deployment/flags.go @@ -130,14 +130,6 @@ func Flags() *codersdk.DeploymentFlags { Description: "The bind address to serve pprof.", Default: "127.0.0.1:6060", }, - HighAvailability: &codersdk.BoolFlag{ - Name: "High Availability", - Flag: "high-availability", - EnvVar: "CODER_HIGH_AVAILABILITY", - Description: "Specifies whether high availability is enabled.", - Default: true, - Enterprise: true, - }, CacheDir: &codersdk.StringFlag{ Name: "Cache Directory", Flag: "cache-dir", diff --git a/codersdk/deployment.go b/codersdk/deployment.go new file mode 100644 index 0000000000000..a1227b09e3f63 --- /dev/null +++ b/codersdk/deployment.go @@ -0,0 +1,26 @@ +package codersdk + +import ( + "time" + + "github.com/google/uuid" +) + +type DeploymentInfo struct { + Replicas []Replica `json:"replicas"` +} + +type Replica struct { + // ID is the unique identifier for the replica. + ID uuid.UUID `json:"id"` + // Hostname is the hostname of the replica. 
+ Hostname string `json:"hostname"` + // CreatedAt is when the replica was first seen. + CreatedAt time.Time `json:"created_at"` + // Active determines whether the replica is online. + Active bool `json:"active"` + // RelayAddress is the accessible address to relay DERP connections. + RelayAddress string `json:"relay_address"` + // Error is the error. + Error string `json:"error"` +} diff --git a/codersdk/flags.go b/codersdk/flags.go index 2dd1323a1fddc..09ca65b1ea813 100644 --- a/codersdk/flags.go +++ b/codersdk/flags.go @@ -60,7 +60,6 @@ type DeploymentFlags struct { Verbose *BoolFlag `json:"verbose" typescript:",notnull"` AuditLogging *BoolFlag `json:"audit_logging" typescript:",notnull"` BrowserOnly *BoolFlag `json:"browser_only" typescript:",notnull"` - HighAvailability *BoolFlag `json:"high_availability" typescript:",notnull"` SCIMAuthHeader *StringFlag `json:"scim_auth_header" typescript:",notnull"` UserWorkspaceQuota *IntFlag `json:"user_workspace_quota" typescript:",notnull"` } diff --git a/enterprise/cli/server.go b/enterprise/cli/server.go index e34bdaccfd342..cc44985e0a4d4 100644 --- a/enterprise/cli/server.go +++ b/enterprise/cli/server.go @@ -2,9 +2,11 @@ package cli import ( "context" + "net/url" "github.com/google/uuid" "github.com/spf13/cobra" + "golang.org/x/xerrors" "cdr.dev/slog" @@ -20,22 +22,35 @@ func server() *cobra.Command { dflags := deployment.Flags() cmd := agpl.Server(dflags, func(ctx context.Context, cfg config.Root, options *agplcoderd.Options) (*agplcoderd.API, error) { replicaIDRaw, err := cfg.ReplicaID().Read() + generatedReplicaID := false if err != nil { replicaIDRaw = uuid.NewString() + generatedReplicaID = true } replicaID, err := uuid.Parse(replicaIDRaw) if err != nil { options.Logger.Warn(ctx, "failed to parse replica id", slog.Error(err), slog.F("replica_id", replicaIDRaw)) replicaID = uuid.New() + generatedReplicaID = true + } + if generatedReplicaID { + // Make sure we save it to be reused later! + _ = cfg.ReplicaID().Write(replicaID.String()) + } + + if dflags.DerpServerRelayAddress.Value != "" { + _, err := url.Parse(dflags.DerpServerRelayAddress.Value) + if err != nil { + return nil, xerrors.Errorf("derp-server-relay-address must be a valid HTTP URL: %w", err) + } } - o := &coderd.Options{ - AuditLogging: dflags.AuditLogging.Value, - BrowserOnly: dflags.BrowserOnly.Value, - SCIMAPIKey: []byte(dflags.SCIMAuthHeader.Value), - UserWorkspaceQuota: dflags.UserWorkspaceQuota.Value, - RBAC: true, - HighAvailability: dflags.HighAvailability.Value, + o := &coderd.Options{ + AuditLogging: dflags.AuditLogging.Value, + BrowserOnly: dflags.BrowserOnly.Value, + SCIMAPIKey: []byte(dflags.SCIMAuthHeader.Value), + UserWorkspaceQuota: dflags.UserWorkspaceQuota.Value, + RBAC: true, ReplicaID: replicaID, DERPServerRelayAddress: dflags.DerpServerRelayAddress.Value, DERPServerRegionID: dflags.DerpServerRegionID.Value, @@ -50,6 +65,5 @@ func server() *cobra.Command { }) deployment.AttachFlags(cmd.Flags(), dflags, true) - return cmd } diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 342b992c8076f..b06e843b658a5 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -146,7 +146,6 @@ type Options struct { BrowserOnly bool SCIMAPIKey []byte UserWorkspaceQuota int - HighAvailability bool // Used for high availability. 
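    // DERPServerRelayAddress is the URL other replicas use to reach this replica's
    // embedded DERP server; the Helm chart derives it from the pod IP, and the DERP
    // mesh appends /derp when dialing peers. (Descriptive note based on the Helm
    // template and derpmesh changes in this series.)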
DERPServerRelayAddress string @@ -182,12 +181,12 @@ func (api *API) updateEntitlements(ctx context.Context) error { api.entitlementsMu.Lock() defer api.entitlementsMu.Unlock() - entitlements, err := license.Entitlements(ctx, api.Database, api.Logger, api.Keys, map[string]bool{ + entitlements, err := license.Entitlements(ctx, api.Database, api.Logger, len(api.replicaManager.All()), api.Keys, map[string]bool{ codersdk.FeatureAuditLog: api.AuditLogging, codersdk.FeatureBrowserOnly: api.BrowserOnly, codersdk.FeatureSCIM: len(api.SCIMAPIKey) != 0, codersdk.FeatureWorkspaceQuota: api.UserWorkspaceQuota != 0, - codersdk.FeatureHighAvailability: api.HighAvailability, + codersdk.FeatureHighAvailability: api.DERPServerRelayAddress != "", codersdk.FeatureTemplateRBAC: api.RBAC, }) if err != nil { diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index 57440ac37082e..24f0bffd5017f 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -63,14 +63,12 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c } srv, cancelFunc, oop := coderdtest.NewOptions(t, options.Options) coderAPI, err := coderd.New(context.Background(), &coderd.Options{ - RBAC: true, - AuditLogging: options.AuditLogging, - BrowserOnly: options.BrowserOnly, - SCIMAPIKey: options.SCIMAPIKey, - // TODO: Kyle change this before merge! - DERPServerRelayAddress: oop.AccessURL.String() + "/derp", + RBAC: true, + AuditLogging: options.AuditLogging, + BrowserOnly: options.BrowserOnly, + SCIMAPIKey: options.SCIMAPIKey, + DERPServerRelayAddress: oop.AccessURL.String(), DERPServerRegionID: 1, - HighAvailability: true, ReplicaID: uuid.New(), UserWorkspaceQuota: options.UserWorkspaceQuota, Options: oop, diff --git a/enterprise/coderd/license/license.go b/enterprise/coderd/license/license.go index 43f8b53094c7c..633b1a5056cab 100644 --- a/enterprise/coderd/license/license.go +++ b/enterprise/coderd/license/license.go @@ -21,6 +21,7 @@ func Entitlements( ctx context.Context, db database.Store, logger slog.Logger, + replicaCount int, keys map[string]ed25519.PublicKey, enablements map[string]bool, ) (codersdk.Entitlements, error) { @@ -144,6 +145,10 @@ func Entitlements( if featureName == codersdk.FeatureUserLimit { continue } + // High availability has it's own warnings based on replica count! + if featureName == codersdk.FeatureHighAvailability { + continue + } feature := entitlements.Features[featureName] if !feature.Enabled { continue @@ -161,6 +166,24 @@ func Entitlements( } } + if replicaCount > 1 { + feature := entitlements.Features[codersdk.FeatureHighAvailability] + + switch feature.Entitlement { + case codersdk.EntitlementNotEntitled: + if entitlements.HasLicense { + entitlements.Warnings = append(entitlements.Warnings, + "You have multiple replicas but your license is not entitled to high availability.") + } else { + entitlements.Warnings = append(entitlements.Warnings, + "You have multiple replicas but high availability is an Enterprise feature. 
Contact sales to get a license.") + } + case codersdk.EntitlementGracePeriod: + entitlements.Warnings = append(entitlements.Warnings, + "You have multiple replicas but your license for high availability is expired.") + } + } + for _, featureName := range codersdk.FeatureNames { feature := entitlements.Features[featureName] if feature.Entitlement == codersdk.EntitlementNotEntitled { diff --git a/enterprise/coderd/license/license_test.go b/enterprise/coderd/license/license_test.go index f1318e26bae47..5b50bdb97cfe2 100644 --- a/enterprise/coderd/license/license_test.go +++ b/enterprise/coderd/license/license_test.go @@ -31,7 +31,7 @@ func TestEntitlements(t *testing.T) { t.Run("Defaults", func(t *testing.T) { t.Parallel() db := databasefake.New() - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, all) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, all) require.NoError(t, err) require.False(t, entitlements.HasLicense) require.False(t, entitlements.Trial) @@ -47,7 +47,7 @@ func TestEntitlements(t *testing.T) { JWT: coderdenttest.GenerateLicense(t, coderdenttest.LicenseOptions{}), Exp: time.Now().Add(time.Hour), }) - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, map[string]bool{}) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, map[string]bool{}) require.NoError(t, err) require.True(t, entitlements.HasLicense) require.False(t, entitlements.Trial) @@ -71,7 +71,7 @@ func TestEntitlements(t *testing.T) { }), Exp: time.Now().Add(time.Hour), }) - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, map[string]bool{}) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, map[string]bool{}) require.NoError(t, err) require.True(t, entitlements.HasLicense) require.False(t, entitlements.Trial) @@ -96,7 +96,7 @@ func TestEntitlements(t *testing.T) { }), Exp: time.Now().Add(time.Hour), }) - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, all) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, all) require.NoError(t, err) require.True(t, entitlements.HasLicense) require.False(t, entitlements.Trial) @@ -104,6 +104,9 @@ func TestEntitlements(t *testing.T) { if featureName == codersdk.FeatureUserLimit { continue } + if featureName == codersdk.FeatureHighAvailability { + continue + } niceName := strings.Title(strings.ReplaceAll(featureName, "_", " ")) require.Equal(t, codersdk.EntitlementGracePeriod, entitlements.Features[featureName].Entitlement) require.Contains(t, entitlements.Warnings, fmt.Sprintf("%s is enabled but your license for this feature is expired.", niceName)) @@ -116,7 +119,7 @@ func TestEntitlements(t *testing.T) { JWT: coderdenttest.GenerateLicense(t, coderdenttest.LicenseOptions{}), Exp: time.Now().Add(time.Hour), }) - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, all) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, all) require.NoError(t, err) require.True(t, entitlements.HasLicense) require.False(t, entitlements.Trial) @@ -124,6 +127,9 @@ func TestEntitlements(t *testing.T) { if featureName == codersdk.FeatureUserLimit { continue } + if 
featureName == codersdk.FeatureHighAvailability { + continue + } niceName := strings.Title(strings.ReplaceAll(featureName, "_", " ")) // Ensures features that are not entitled are properly disabled. require.False(t, entitlements.Features[featureName].Enabled) @@ -142,7 +148,7 @@ func TestEntitlements(t *testing.T) { }), Exp: time.Now().Add(time.Hour), }) - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, map[string]bool{}) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, map[string]bool{}) require.NoError(t, err) require.True(t, entitlements.HasLicense) require.Contains(t, entitlements.Warnings, "Your deployment has 2 active users but is only licensed for 1.") @@ -164,7 +170,7 @@ func TestEntitlements(t *testing.T) { }), Exp: time.Now().Add(time.Hour), }) - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, map[string]bool{}) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, map[string]bool{}) require.NoError(t, err) require.True(t, entitlements.HasLicense) require.Empty(t, entitlements.Warnings) @@ -187,7 +193,7 @@ func TestEntitlements(t *testing.T) { }), }) - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, map[string]bool{}) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, map[string]bool{}) require.NoError(t, err) require.True(t, entitlements.HasLicense) require.False(t, entitlements.Trial) @@ -202,7 +208,7 @@ func TestEntitlements(t *testing.T) { AllFeatures: true, }), }) - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, all) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, all) require.NoError(t, err) require.True(t, entitlements.HasLicense) require.False(t, entitlements.Trial) @@ -214,4 +220,52 @@ func TestEntitlements(t *testing.T) { require.Equal(t, codersdk.EntitlementEntitled, entitlements.Features[featureName].Entitlement) } }) + + t.Run("MultipleReplicasNoLicense", func(t *testing.T) { + t.Parallel() + db := databasefake.New() + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 2, coderdenttest.Keys, all) + require.NoError(t, err) + require.False(t, entitlements.HasLicense) + require.Len(t, entitlements.Warnings, 1) + require.Equal(t, "You have multiple replicas but high availability is an Enterprise feature. 
Contact sales to get a license.", entitlements.Warnings[0]) + }) + + t.Run("MultipleReplicasNotEntitled", func(t *testing.T) { + t.Parallel() + db := databasefake.New() + db.InsertLicense(context.Background(), database.InsertLicenseParams{ + Exp: time.Now().Add(time.Hour), + JWT: coderdenttest.GenerateLicense(t, coderdenttest.LicenseOptions{ + AuditLog: true, + }), + }) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 2, coderdenttest.Keys, map[string]bool{ + codersdk.FeatureHighAvailability: true, + }) + require.NoError(t, err) + require.True(t, entitlements.HasLicense) + require.Len(t, entitlements.Warnings, 1) + require.Equal(t, "You have multiple replicas but your license is not entitled to high availability.", entitlements.Warnings[0]) + }) + + t.Run("MultipleReplicasGrace", func(t *testing.T) { + t.Parallel() + db := databasefake.New() + db.InsertLicense(context.Background(), database.InsertLicenseParams{ + JWT: coderdenttest.GenerateLicense(t, coderdenttest.LicenseOptions{ + HighAvailability: true, + GraceAt: time.Now().Add(-time.Hour), + ExpiresAt: time.Now().Add(time.Hour), + }), + Exp: time.Now().Add(time.Hour), + }) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 2, coderdenttest.Keys, map[string]bool{ + codersdk.FeatureHighAvailability: true, + }) + require.NoError(t, err) + require.True(t, entitlements.HasLicense) + require.Len(t, entitlements.Warnings, 1) + require.Equal(t, "You have multiple replicas but your license for high availability is expired.", entitlements.Warnings[0]) + }) } diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go index 94341079cd43f..3ce22c1bd9a11 100644 --- a/enterprise/derpmesh/derpmesh.go +++ b/enterprise/derpmesh/derpmesh.go @@ -2,6 +2,7 @@ package derpmesh import ( "context" + "net/url" "sync" "golang.org/x/xerrors" @@ -40,6 +41,18 @@ type Mesh struct { func (m *Mesh) SetAddresses(addresses []string) { total := make(map[string]struct{}, 0) for _, address := range addresses { + addressURL, err := url.Parse(address) + if err != nil { + m.logger.Error(m.ctx, "invalid address", slog.F("address", err), slog.Error(err)) + continue + } + derpURL, err := addressURL.Parse("/derp") + if err != nil { + m.logger.Error(m.ctx, "parse derp", slog.F("address", err), slog.Error(err)) + continue + } + address = derpURL.String() + total[address] = struct{}{} added, err := m.addAddress(address) if err != nil { From de5b13b380795544cb38cd697c07cd0c63a39e51 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 13 Oct 2022 21:35:41 +0000 Subject: [PATCH 23/79] Add the ability to block endpoints --- .../coderd/coderdenttest/coderdenttest.go | 2 +- enterprise/coderd/replicas_test.go | 80 ++++++++++++------- tailnet/conn.go | 20 +++-- 3 files changed, 67 insertions(+), 35 deletions(-) diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index 24f0bffd5017f..ea172c43116e4 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -68,7 +68,7 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c BrowserOnly: options.BrowserOnly, SCIMAPIKey: options.SCIMAPIKey, DERPServerRelayAddress: oop.AccessURL.String(), - DERPServerRegionID: 1, + DERPServerRegionID: oop.DERPMap.RegionIDs()[0], ReplicaID: uuid.New(), UserWorkspaceQuota: options.UserWorkspaceQuota, Options: oop, diff --git a/enterprise/coderd/replicas_test.go 
b/enterprise/coderd/replicas_test.go index 52836a720f623..01c6be90199f0 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -3,7 +3,6 @@ package coderd_test import ( "context" "testing" - "time" "github.com/stretchr/testify/require" @@ -13,38 +12,63 @@ import ( "github.com/coder/coder/coderd/coderdtest" "github.com/coder/coder/coderd/database/dbtestutil" "github.com/coder/coder/enterprise/coderd/coderdenttest" + "github.com/coder/coder/testutil" ) func TestReplicas(t *testing.T) { t.Parallel() - db, pubsub := dbtestutil.NewDB(t) - firstClient := coderdenttest.New(t, &coderdenttest.Options{ - Options: &coderdtest.Options{ - IncludeProvisionerDaemon: true, - Database: db, - Pubsub: pubsub, - }, + t.Run("WarningsWithoutLicense", func(t *testing.T) { + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + firstClient := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + IncludeProvisionerDaemon: true, + Database: db, + Pubsub: pubsub, + }, + }) + _ = coderdtest.CreateFirstUser(t, firstClient) + secondClient := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + Database: db, + Pubsub: pubsub, + }, + }) + secondClient.SessionToken = firstClient.SessionToken + ents, err := secondClient.Entitlements(context.Background()) + require.NoError(t, err) + require.Len(t, ents.Warnings, 1) }) - firstUser := coderdtest.CreateFirstUser(t, firstClient) - coderdenttest.AddLicense(t, firstClient, coderdenttest.LicenseOptions{ - HighAvailability: true, - }) - - secondClient := coderdenttest.New(t, &coderdenttest.Options{ - Options: &coderdtest.Options{ - Database: db, - Pubsub: pubsub, - }, - }) - secondClient.SessionToken = firstClient.SessionToken + t.Run("ConnectAcrossMultiple", func(t *testing.T) { + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + firstClient := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + IncludeProvisionerDaemon: true, + Database: db, + Pubsub: pubsub, + }, + }) + firstUser := coderdtest.CreateFirstUser(t, firstClient) + coderdenttest.AddLicense(t, firstClient, coderdenttest.LicenseOptions{ + HighAvailability: true, + }) - agentID := setupWorkspaceAgent(t, firstClient, firstUser) - conn, err := secondClient.DialWorkspaceAgentTailnet(context.Background(), slogtest.Make(t, nil).Leveled(slog.LevelDebug), agentID) - require.NoError(t, err) - require.Eventually(t, func() bool { - _, err = conn.Ping() - return err == nil - }, 10*time.Second, 250*time.Millisecond) + secondClient := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + Database: db, + Pubsub: pubsub, + }, + }) + secondClient.SessionToken = firstClient.SessionToken - _ = conn.Close() + agentID := setupWorkspaceAgent(t, firstClient, firstUser) + conn, err := secondClient.DialWorkspaceAgentTailnet(context.Background(), slogtest.Make(t, nil).Leveled(slog.LevelDebug), agentID) + require.NoError(t, err) + require.Eventually(t, func() bool { + _, err = conn.Ping() + return err == nil + }, testutil.WaitShort, testutil.IntervalFast) + _ = conn.Close() + }) } diff --git a/tailnet/conn.go b/tailnet/conn.go index 1b454d6346b97..19a0cd50f49e6 100644 --- a/tailnet/conn.go +++ b/tailnet/conn.go @@ -48,7 +48,10 @@ type Options struct { Addresses []netip.Prefix DERPMap *tailcfg.DERPMap - Logger slog.Logger + // BlockEndpoints specifies whether P2P endpoints are blocked. + // If so, only DERPs can establish connections. 
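    // Illustrative sketch, not part of this diff: a relay-only connection could be
    // constructed roughly like the following, assuming derpMap and logger are
    // already available.
    //
    //  conn, err := tailnet.NewConn(&tailnet.Options{
    //      Addresses:      []netip.Prefix{netip.PrefixFrom(tailnet.IP(), 128)},
    //      DERPMap:        derpMap, // assumed to be fetched elsewhere
    //      BlockEndpoints: true,    // advertise no P2P endpoints; traffic stays on DERP
    //      Logger:         logger,
    //  })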
+ BlockEndpoints bool + Logger slog.Logger } // NewConn constructs a new Wireguard server that will accept connections from the addresses provided. @@ -175,6 +178,7 @@ func NewConn(options *Options) (*Conn, error) { wireguardEngine.SetFilter(filter.New(netMap.PacketFilter, localIPs, logIPs, nil, Logger(options.Logger.Named("packet-filter")))) dialContext, dialCancel := context.WithCancel(context.Background()) server := &Conn{ + blockEndpoints: options.BlockEndpoints, dialContext: dialContext, dialCancel: dialCancel, closed: make(chan struct{}), @@ -240,11 +244,12 @@ func IP() netip.Addr { // Conn is an actively listening Wireguard connection. type Conn struct { - dialContext context.Context - dialCancel context.CancelFunc - mutex sync.Mutex - closed chan struct{} - logger slog.Logger + dialContext context.Context + dialCancel context.CancelFunc + mutex sync.Mutex + closed chan struct{} + logger slog.Logger + blockEndpoints bool dialer *tsdial.Dialer tunDevice *tstun.Wrapper @@ -429,6 +434,9 @@ func (c *Conn) sendNode() { PreferredDERP: c.lastPreferredDERP, DERPLatency: c.lastDERPLatency, } + if c.blockEndpoints { + node.Endpoints = nil + } nodeCallback := c.nodeCallback if nodeCallback == nil { return From 9a50ac496ef1f0329f15add7ab4e58652f03b184 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Fri, 14 Oct 2022 15:37:38 +0000 Subject: [PATCH 24/79] Add flag to disable P2P connections --- cli/agent_test.go | 8 +++----- cli/configssh_test.go | 3 +-- cli/portforward.go | 3 +-- cli/speedtest.go | 4 +++- cli/ssh.go | 4 +--- coderd/activitybump_test.go | 4 +++- coderd/coderd.go | 2 +- coderd/templates_test.go | 4 +++- coderd/workspaceagents_test.go | 6 ++++-- codersdk/workspaceagents.go | 23 +++++++++++++++-------- enterprise/coderd/coderd.go | 4 +++- enterprise/coderd/replicas_test.go | 9 ++++++--- enterprise/coderd/workspaceagents_test.go | 5 ++--- enterprise/derpmesh/derpmesh.go | 9 +++++++-- tailnet/conn.go | 3 +++ 15 files changed, 56 insertions(+), 35 deletions(-) diff --git a/cli/agent_test.go b/cli/agent_test.go index dd0cb1d789349..8a90bb4cada3b 100644 --- a/cli/agent_test.go +++ b/cli/agent_test.go @@ -7,8 +7,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "cdr.dev/slog" - "github.com/coder/coder/cli/clitest" "github.com/coder/coder/coderd/coderdtest" "github.com/coder/coder/provisioner/echo" @@ -67,7 +65,7 @@ func TestWorkspaceAgent(t *testing.T) { if assert.NotEmpty(t, workspace.LatestBuild.Resources) && assert.NotEmpty(t, resources[0].Agents) { assert.NotEmpty(t, resources[0].Agents[0].Version) } - dialer, err := client.DialWorkspaceAgentTailnet(ctx, slog.Logger{}, resources[0].Agents[0].ID) + dialer, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, nil) require.NoError(t, err) defer dialer.Close() require.Eventually(t, func() bool { @@ -128,7 +126,7 @@ func TestWorkspaceAgent(t *testing.T) { if assert.NotEmpty(t, resources) && assert.NotEmpty(t, resources[0].Agents) { assert.NotEmpty(t, resources[0].Agents[0].Version) } - dialer, err := client.DialWorkspaceAgentTailnet(ctx, slog.Logger{}, resources[0].Agents[0].ID) + dialer, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, nil) require.NoError(t, err) defer dialer.Close() require.Eventually(t, func() bool { @@ -189,7 +187,7 @@ func TestWorkspaceAgent(t *testing.T) { if assert.NotEmpty(t, resources) && assert.NotEmpty(t, resources[0].Agents) { assert.NotEmpty(t, resources[0].Agents[0].Version) } - dialer, err := client.DialWorkspaceAgentTailnet(ctx, 
slog.Logger{}, resources[0].Agents[0].ID) + dialer, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, nil) require.NoError(t, err) defer dialer.Close() require.Eventually(t, func() bool { diff --git a/cli/configssh_test.go b/cli/configssh_test.go index 3e1512a0c3471..4553cbe431221 100644 --- a/cli/configssh_test.go +++ b/cli/configssh_test.go @@ -19,7 +19,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/agent" @@ -115,7 +114,7 @@ func TestConfigSSH(t *testing.T) { _ = agentCloser.Close() }() resources := coderdtest.AwaitWorkspaceAgents(t, client, workspace.ID) - agentConn, err := client.DialWorkspaceAgentTailnet(context.Background(), slog.Logger{}, resources[0].Agents[0].ID) + agentConn, err := client.DialWorkspaceAgent(context.Background(), resources[0].Agents[0].ID, nil) require.NoError(t, err) defer agentConn.Close() diff --git a/cli/portforward.go b/cli/portforward.go index 476809d601558..9cd3bc317c3b4 100644 --- a/cli/portforward.go +++ b/cli/portforward.go @@ -16,7 +16,6 @@ import ( "github.com/spf13/cobra" "golang.org/x/xerrors" - "cdr.dev/slog" "github.com/coder/coder/agent" "github.com/coder/coder/cli/cliflag" "github.com/coder/coder/cli/cliui" @@ -96,7 +95,7 @@ func portForward() *cobra.Command { return xerrors.Errorf("await agent: %w", err) } - conn, err := client.DialWorkspaceAgentTailnet(ctx, slog.Logger{}, workspaceAgent.ID) + conn, err := client.DialWorkspaceAgent(ctx, workspaceAgent.ID, nil) if err != nil { return err } diff --git a/cli/speedtest.go b/cli/speedtest.go index 357048f63ea34..cbb226b341342 100644 --- a/cli/speedtest.go +++ b/cli/speedtest.go @@ -55,7 +55,9 @@ func speedtest() *cobra.Command { if cliflag.IsSetBool(cmd, varVerbose) { logger = logger.Leveled(slog.LevelDebug) } - conn, err := client.DialWorkspaceAgentTailnet(ctx, logger, workspaceAgent.ID) + conn, err := client.DialWorkspaceAgent(ctx, workspaceAgent.ID, &codersdk.DialWorkspaceAgentOptions{ + Logger: logger, + }) if err != nil { return err } diff --git a/cli/ssh.go b/cli/ssh.go index ef8538764e3ac..b4d4f6420da78 100644 --- a/cli/ssh.go +++ b/cli/ssh.go @@ -20,8 +20,6 @@ import ( "golang.org/x/term" "golang.org/x/xerrors" - "cdr.dev/slog" - "github.com/coder/coder/cli/cliflag" "github.com/coder/coder/cli/cliui" "github.com/coder/coder/coderd/autobuild/notify" @@ -86,7 +84,7 @@ func ssh() *cobra.Command { return xerrors.Errorf("await agent: %w", err) } - conn, err := client.DialWorkspaceAgentTailnet(ctx, slog.Logger{}, workspaceAgent.ID) + conn, err := client.DialWorkspaceAgent(ctx, workspaceAgent.ID, nil) if err != nil { return err } diff --git a/coderd/activitybump_test.go b/coderd/activitybump_test.go index b12c8bc170a29..746bef0c9994e 100644 --- a/coderd/activitybump_test.go +++ b/coderd/activitybump_test.go @@ -74,7 +74,9 @@ func TestWorkspaceActivityBump(t *testing.T) { client, workspace, assertBumped := setupActivityTest(t) resources := coderdtest.AwaitWorkspaceAgents(t, client, workspace.ID) - conn, err := client.DialWorkspaceAgentTailnet(ctx, slogtest.Make(t, nil), resources[0].Agents[0].ID) + conn, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, &codersdk.DialWorkspaceAgentOptions{ + Logger: slogtest.Make(t, nil), + }) require.NoError(t, err) defer conn.Close() diff --git a/coderd/coderd.go b/coderd/coderd.go index 6b4b335161c32..da1fc0572ccc6 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -123,7 +123,7 @@ func New(options *Options) *API { 
options.TailnetCoordinator = tailnet.NewCoordinator() } if options.DERPServer == nil { - options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger)) + options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger.Named("derp").Leveled(slog.LevelDebug))) options.DERPServer.SetMeshKey("todo-kyle-change-this") } if options.Auditor == nil { diff --git a/coderd/templates_test.go b/coderd/templates_test.go index bf547c4d0eb9a..861ad6f459035 100644 --- a/coderd/templates_test.go +++ b/coderd/templates_test.go @@ -626,7 +626,9 @@ func TestTemplateDAUs(t *testing.T) { require.NoError(t, err) assert.Zero(t, workspaces[0].LastUsedAt) - conn, err := client.DialWorkspaceAgentTailnet(ctx, slogtest.Make(t, nil).Named("tailnet"), resources[0].Agents[0].ID) + conn, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, &codersdk.DialWorkspaceAgentOptions{ + Logger: slogtest.Make(t, nil).Named("tailnet"), + }) require.NoError(t, err) defer func() { _ = conn.Close() diff --git a/coderd/workspaceagents_test.go b/coderd/workspaceagents_test.go index 6bd569dde9f71..c5f3d9f16c0d8 100644 --- a/coderd/workspaceagents_test.go +++ b/coderd/workspaceagents_test.go @@ -123,7 +123,7 @@ func TestWorkspaceAgentListen(t *testing.T) { defer cancel() resources := coderdtest.AwaitWorkspaceAgents(t, client, workspace.ID) - conn, err := client.DialWorkspaceAgentTailnet(ctx, slog.Logger{}, resources[0].Agents[0].ID) + conn, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, nil) require.NoError(t, err) defer func() { _ = conn.Close() @@ -253,7 +253,9 @@ func TestWorkspaceAgentTailnet(t *testing.T) { ctx, cancelFunc := context.WithCancel(context.Background()) defer cancelFunc() - conn, err := client.DialWorkspaceAgentTailnet(ctx, slogtest.Make(t, nil).Named("client").Leveled(slog.LevelDebug), resources[0].Agents[0].ID) + conn, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, &codersdk.DialWorkspaceAgentOptions{ + Logger: slogtest.Make(t, nil).Named("client").Leveled(slog.LevelDebug), + }) require.NoError(t, err) defer conn.Close() sshClient, err := conn.SSHClient() diff --git a/codersdk/workspaceagents.go b/codersdk/workspaceagents.go index 81f82b08d3efa..97d225c3eebb3 100644 --- a/codersdk/workspaceagents.go +++ b/codersdk/workspaceagents.go @@ -331,7 +331,13 @@ func (c *Client) ListenWorkspaceAgentTailnet(ctx context.Context) (net.Conn, err return websocket.NetConn(ctx, conn, websocket.MessageBinary), nil } -func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logger, agentID uuid.UUID) (*AgentConn, error) { +type DialWorkspaceAgentOptions struct { + Logger slog.Logger + // BlockEndpoints forced a direct connection through DERP. 
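    // Illustrative sketch, not part of this diff: assuming an authenticated client
    // and a known agent ID, a caller could force a relayed connection with:
    //
    //  conn, err := client.DialWorkspaceAgent(ctx, agentID, &codersdk.DialWorkspaceAgentOptions{
    //      Logger:         logger,
    //      BlockEndpoints: true, // no P2P endpoints are exchanged, so traffic stays on DERP
    //  })
    //  if err != nil {
    //      return err
    //  }
    //  defer conn.Close()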
+ BlockEndpoints bool +} + +func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, options *DialWorkspaceAgentOptions) (*AgentConn, error) { res, err := c.Request(ctx, http.MethodGet, fmt.Sprintf("/api/v2/workspaceagents/%s/connection", agentID), nil) if err != nil { return nil, err @@ -348,9 +354,10 @@ func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logg ip := tailnet.IP() conn, err := tailnet.NewConn(&tailnet.Options{ - Addresses: []netip.Prefix{netip.PrefixFrom(ip, 128)}, - DERPMap: connInfo.DERPMap, - Logger: logger, + Addresses: []netip.Prefix{netip.PrefixFrom(ip, 128)}, + DERPMap: connInfo.DERPMap, + Logger: options.Logger, + BlockEndpoints: options.BlockEndpoints, }) if err != nil { return nil, xerrors.Errorf("create tailnet: %w", err) @@ -378,7 +385,7 @@ func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logg defer close(closed) isFirst := true for retrier := retry.New(50*time.Millisecond, 10*time.Second); retrier.Wait(ctx); { - logger.Debug(ctx, "connecting") + options.Logger.Debug(ctx, "connecting") // nolint:bodyclose ws, res, err := websocket.Dial(ctx, coordinateURL.String(), &websocket.DialOptions{ HTTPClient: httpClient, @@ -397,21 +404,21 @@ func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logg if errors.Is(err, context.Canceled) { return } - logger.Debug(ctx, "failed to dial", slog.Error(err)) + options.Logger.Debug(ctx, "failed to dial", slog.Error(err)) continue } sendNode, errChan := tailnet.ServeCoordinator(websocket.NetConn(ctx, ws, websocket.MessageBinary), func(node []*tailnet.Node) error { return conn.UpdateNodes(node) }) conn.SetNodeCallback(sendNode) - logger.Debug(ctx, "serving coordinator") + options.Logger.Debug(ctx, "serving coordinator") err = <-errChan if errors.Is(err, context.Canceled) { _ = ws.Close(websocket.StatusGoingAway, "") return } if err != nil { - logger.Debug(ctx, "error serving coordinator", slog.Error(err)) + options.Logger.Debug(ctx, "error serving coordinator", slog.Error(err)) _ = ws.Close(websocket.StatusGoingAway, "") continue } diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index b06e843b658a5..252da9ac6f01a 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -3,6 +3,7 @@ package coderd import ( "context" "crypto/ed25519" + "fmt" "net/http" "sync" "time" @@ -126,7 +127,7 @@ func New(ctx context.Context, options *Options) (*API, error) { if err != nil { return nil, xerrors.Errorf("initialize replica: %w", err) } - api.derpMesh = derpmesh.New(options.Logger, api.DERPServer) + api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer) err = api.updateEntitlements(ctx) if err != nil { @@ -246,6 +247,7 @@ func (api *API) updateEntitlements(ctx context.Context) error { coordinator = haCoordinator } + fmt.Printf("HA enabled\n") api.replicaManager.SetCallback(func() { addresses := make([]string, 0) for _, replica := range api.replicaManager.Regional() { diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index 01c6be90199f0..1d60b24e6e81a 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -11,6 +11,7 @@ import ( "github.com/coder/coder/coderd/coderdtest" "github.com/coder/coder/coderd/database/dbtestutil" + "github.com/coder/coder/codersdk" "github.com/coder/coder/enterprise/coderd/coderdenttest" "github.com/coder/coder/testutil" ) @@ -61,14 +62,16 @@ func TestReplicas(t *testing.T) { }, }) 
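    // Both replicas share the same database and pubsub, so the first client's
    // session token is also valid against the second replica.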
secondClient.SessionToken = firstClient.SessionToken - agentID := setupWorkspaceAgent(t, firstClient, firstUser) - conn, err := secondClient.DialWorkspaceAgentTailnet(context.Background(), slogtest.Make(t, nil).Leveled(slog.LevelDebug), agentID) + conn, err := secondClient.DialWorkspaceAgent(context.Background(), agentID, &codersdk.DialWorkspaceAgentOptions{ + BlockEndpoints: true, + Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug), + }) require.NoError(t, err) require.Eventually(t, func() bool { _, err = conn.Ping() return err == nil - }, testutil.WaitShort, testutil.IntervalFast) + }, testutil.WaitLong, testutil.IntervalFast) _ = conn.Close() }) } diff --git a/enterprise/coderd/workspaceagents_test.go b/enterprise/coderd/workspaceagents_test.go index 24e24e3f5f540..a5250b3b81b44 100644 --- a/enterprise/coderd/workspaceagents_test.go +++ b/enterprise/coderd/workspaceagents_test.go @@ -8,7 +8,6 @@ import ( "github.com/google/uuid" "github.com/stretchr/testify/require" - "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/agent" "github.com/coder/coder/coderd/coderdtest" @@ -33,7 +32,7 @@ func TestBlockNonBrowser(t *testing.T) { BrowserOnly: true, }) id := setupWorkspaceAgent(t, client, user) - _, err := client.DialWorkspaceAgentTailnet(context.Background(), slog.Logger{}, id) + _, err := client.DialWorkspaceAgent(context.Background(), id, nil) var apiErr *codersdk.Error require.ErrorAs(t, err, &apiErr) require.Equal(t, http.StatusConflict, apiErr.StatusCode()) @@ -50,7 +49,7 @@ func TestBlockNonBrowser(t *testing.T) { BrowserOnly: false, }) id := setupWorkspaceAgent(t, client, user) - conn, err := client.DialWorkspaceAgentTailnet(context.Background(), slog.Logger{}, id) + conn, err := client.DialWorkspaceAgent(context.Background(), id, nil) require.NoError(t, err) _ = conn.Close() }) diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go index 3ce22c1bd9a11..dbdf7bc1b1f3a 100644 --- a/enterprise/derpmesh/derpmesh.go +++ b/enterprise/derpmesh/derpmesh.go @@ -2,6 +2,7 @@ package derpmesh import ( "context" + "net" "net/url" "sync" @@ -88,11 +89,15 @@ func (m *Mesh) addAddress(address string) (bool, error) { if isActive { return false, nil } - client, err := derphttp.NewClient(m.server.PrivateKey(), address, tailnet.Logger(m.logger)) + client, err := derphttp.NewClient(m.server.PrivateKey(), address, tailnet.Logger(m.logger.Named("client"))) if err != nil { return false, xerrors.Errorf("create derp client: %w", err) } client.MeshKey = m.server.MeshKey() + client.SetURLDialer(func(ctx context.Context, network, addr string) (net.Conn, error) { + var dialer net.Dialer + return dialer.DialContext(ctx, network, addr) + }) ctx, cancelFunc := context.WithCancel(m.ctx) closed := make(chan struct{}) closeFunc := func() { @@ -103,7 +108,7 @@ func (m *Mesh) addAddress(address string) (bool, error) { m.active[address] = closeFunc go func() { defer close(closed) - client.RunWatchConnectionLoop(ctx, m.server.PublicKey(), tailnet.Logger(m.logger), func(np key.NodePublic) { + client.RunWatchConnectionLoop(ctx, m.server.PublicKey(), tailnet.Logger(m.logger.Named("loop")), func(np key.NodePublic) { m.server.AddPacketForwarder(np, client) }, func(np key.NodePublic) { m.server.RemovePacketForwarder(np, client) diff --git a/tailnet/conn.go b/tailnet/conn.go index 19a0cd50f49e6..e41ed60a527f3 100644 --- a/tailnet/conn.go +++ b/tailnet/conn.go @@ -344,6 +344,9 @@ func (c *Conn) UpdateNodes(nodes []*Node) error { // reason. TODO: @kylecarbs debug this! 
KeepAlive: ok && peerStatus.Active, } + if c.blockEndpoints { + peerNode.Endpoints = nil + } c.peerMap[node.ID] = peerNode } c.netMap.Peers = make([]*tailcfg.Node, 0, len(c.peerMap)) From 6fa941f958ab91be3b30ae2679f58bfdd33ec9b2 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Fri, 14 Oct 2022 16:16:08 +0000 Subject: [PATCH 25/79] Wow, I made the tests pass --- agent/agent_test.go | 2 +- cli/agent_test.go | 6 +++--- cli/portforward.go | 2 +- cli/speedtest.go | 2 +- coderd/workspaceagents_test.go | 2 +- codersdk/agentconn.go | 4 +++- enterprise/coderd/coderd.go | 2 -- enterprise/coderd/replicas_test.go | 5 ++++- 8 files changed, 14 insertions(+), 11 deletions(-) diff --git a/agent/agent_test.go b/agent/agent_test.go index 06a33598b755f..e1269d6003922 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -465,7 +465,7 @@ func TestAgent(t *testing.T) { conn, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0) require.Eventually(t, func() bool { - _, err := conn.Ping() + _, err := conn.Ping(context.Background()) return err == nil }, testutil.WaitMedium, testutil.IntervalFast) conn1, err := conn.DialContext(context.Background(), l.Addr().Network(), l.Addr().String()) diff --git a/cli/agent_test.go b/cli/agent_test.go index 8a90bb4cada3b..f487ebfc005ed 100644 --- a/cli/agent_test.go +++ b/cli/agent_test.go @@ -69,7 +69,7 @@ func TestWorkspaceAgent(t *testing.T) { require.NoError(t, err) defer dialer.Close() require.Eventually(t, func() bool { - _, err := dialer.Ping() + _, err := dialer.Ping(ctx) return err == nil }, testutil.WaitMedium, testutil.IntervalFast) cancelFunc() @@ -130,7 +130,7 @@ func TestWorkspaceAgent(t *testing.T) { require.NoError(t, err) defer dialer.Close() require.Eventually(t, func() bool { - _, err := dialer.Ping() + _, err := dialer.Ping(ctx) return err == nil }, testutil.WaitMedium, testutil.IntervalFast) cancelFunc() @@ -191,7 +191,7 @@ func TestWorkspaceAgent(t *testing.T) { require.NoError(t, err) defer dialer.Close() require.Eventually(t, func() bool { - _, err := dialer.Ping() + _, err := dialer.Ping(ctx) return err == nil }, testutil.WaitMedium, testutil.IntervalFast) cancelFunc() diff --git a/cli/portforward.go b/cli/portforward.go index 9cd3bc317c3b4..5a6f4391dd897 100644 --- a/cli/portforward.go +++ b/cli/portforward.go @@ -155,7 +155,7 @@ func portForward() *cobra.Command { case <-ticker.C: } - _, err = conn.Ping() + _, err = conn.Ping(ctx) if err != nil { continue } diff --git a/cli/speedtest.go b/cli/speedtest.go index cbb226b341342..f6c06641ec26f 100644 --- a/cli/speedtest.go +++ b/cli/speedtest.go @@ -70,7 +70,7 @@ func speedtest() *cobra.Command { return ctx.Err() case <-ticker.C: } - dur, err := conn.Ping() + dur, err := conn.Ping(ctx) if err != nil { continue } diff --git a/coderd/workspaceagents_test.go b/coderd/workspaceagents_test.go index c5f3d9f16c0d8..e8dd772095736 100644 --- a/coderd/workspaceagents_test.go +++ b/coderd/workspaceagents_test.go @@ -129,7 +129,7 @@ func TestWorkspaceAgentListen(t *testing.T) { _ = conn.Close() }() require.Eventually(t, func() bool { - _, err := conn.Ping() + _, err := conn.Ping(ctx) return err == nil }, testutil.WaitLong, testutil.IntervalFast) }) diff --git a/codersdk/agentconn.go b/codersdk/agentconn.go index b11c440ce3a65..e75edf1ca6bb0 100644 --- a/codersdk/agentconn.go +++ b/codersdk/agentconn.go @@ -132,7 +132,7 @@ type AgentConn struct { CloseFunc func() } -func (c *AgentConn) Ping() (time.Duration, error) { +func (c *AgentConn) Ping(ctx context.Context) (time.Duration, error) { errCh := 
make(chan error, 1) durCh := make(chan time.Duration, 1) c.Conn.Ping(TailnetIP, tailcfg.PingDisco, func(pr *ipnstate.PingResult) { @@ -145,6 +145,8 @@ func (c *AgentConn) Ping() (time.Duration, error) { select { case err := <-errCh: return 0, err + case <-ctx.Done(): + return 0, ctx.Err() case dur := <-durCh: return dur, nil } diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 252da9ac6f01a..21bc6f497ee1f 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -3,7 +3,6 @@ package coderd import ( "context" "crypto/ed25519" - "fmt" "net/http" "sync" "time" @@ -247,7 +246,6 @@ func (api *API) updateEntitlements(ctx context.Context) error { coordinator = haCoordinator } - fmt.Printf("HA enabled\n") api.replicaManager.SetCallback(func() { addresses := make([]string, 0) for _, replica := range api.replicaManager.Regional() { diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index 1d60b24e6e81a..0da4a05dbbb60 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -3,6 +3,7 @@ package coderd_test import ( "context" "testing" + "time" "github.com/stretchr/testify/require" @@ -69,7 +70,9 @@ func TestReplicas(t *testing.T) { }) require.NoError(t, err) require.Eventually(t, func() bool { - _, err = conn.Ping() + ctx, cancelFunc := context.WithTimeout(context.Background(), 3*time.Second) + defer cancelFunc() + _, err = conn.Ping(ctx) return err == nil }, testutil.WaitLong, testutil.IntervalFast) _ = conn.Close() From abff96b103bcc4d6a72697154d95477bd9b69aed Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Fri, 14 Oct 2022 16:54:21 +0000 Subject: [PATCH 26/79] Add replicas endpoint --- coderd/rbac/object.go | 4 ++ codersdk/deployment.go | 26 ------------ codersdk/replicas.go | 42 +++++++++++++++++++ codersdk/workspaceagents.go | 4 ++ enterprise/cli/features_test.go | 2 +- enterprise/coderd/coderd.go | 4 ++ enterprise/coderd/coderd_test.go | 2 +- .../coderdenttest/coderdenttest_test.go | 4 ++ enterprise/coderd/replicas.go | 35 ++++++++++++++++ enterprise/coderd/replicas_test.go | 5 +++ enterprise/replicasync/replicasync.go | 8 ++++ enterprise/replicasync/replicasync_test.go | 36 ++++++++++------ site/src/api/typesGenerated.ts | 11 +++++ 13 files changed, 143 insertions(+), 40 deletions(-) delete mode 100644 codersdk/deployment.go create mode 100644 codersdk/replicas.go diff --git a/coderd/rbac/object.go b/coderd/rbac/object.go index 5492e4397d5f7..1a8861c984ce9 100644 --- a/coderd/rbac/object.go +++ b/coderd/rbac/object.go @@ -146,6 +146,10 @@ var ( ResourceDeploymentFlags = Object{ Type: "deployment_flags", } + + ResourceReplicas = Object{ + Type: "replicas", + } ) // Object is used to create objects for authz checks when you have none in diff --git a/codersdk/deployment.go b/codersdk/deployment.go deleted file mode 100644 index a1227b09e3f63..0000000000000 --- a/codersdk/deployment.go +++ /dev/null @@ -1,26 +0,0 @@ -package codersdk - -import ( - "time" - - "github.com/google/uuid" -) - -type DeploymentInfo struct { - Replicas []Replica `json:"replicas"` -} - -type Replica struct { - // ID is the unique identifier for the replica. - ID uuid.UUID `json:"id"` - // Hostname is the hostname of the replica. - Hostname string `json:"hostname"` - // CreatedAt is when the replica was first seen. - CreatedAt time.Time `json:"created_at"` - // Active determines whether the replica is online. - Active bool `json:"active"` - // RelayAddress is the accessible address to relay DERP connections. 
- RelayAddress string `json:"relay_address"` - // Error is the error. - Error string `json:"error"` -} diff --git a/codersdk/replicas.go b/codersdk/replicas.go new file mode 100644 index 0000000000000..8e698fd3e6345 --- /dev/null +++ b/codersdk/replicas.go @@ -0,0 +1,42 @@ +package codersdk + +import ( + "context" + "encoding/json" + "net/http" + "time" + + "github.com/google/uuid" + "golang.org/x/xerrors" +) + +type Replica struct { + // ID is the unique identifier for the replica. + ID uuid.UUID `json:"id"` + // Hostname is the hostname of the replica. + Hostname string `json:"hostname"` + // CreatedAt is when the replica was first seen. + CreatedAt time.Time `json:"created_at"` + // RelayAddress is the accessible address to relay DERP connections. + RelayAddress string `json:"relay_address"` + // RegionID is the region of the replica. + RegionID int32 `json:"region_id"` + // Error is the error. + Error string `json:"error"` +} + +// Replicas fetches the list of replicas. +func (c *Client) Replicas(ctx context.Context) ([]Replica, error) { + res, err := c.Request(ctx, http.MethodGet, "/api/v2/replicas", nil) + if err != nil { + return nil, xerrors.Errorf("execute request: %w", err) + } + defer res.Body.Close() + + if res.StatusCode != http.StatusOK { + return nil, readBodyAsError(res) + } + + var replicas []Replica + return replicas, json.NewDecoder(res.Body).Decode(&replicas) +} diff --git a/codersdk/workspaceagents.go b/codersdk/workspaceagents.go index 97d225c3eebb3..c86b399e189ab 100644 --- a/codersdk/workspaceagents.go +++ b/codersdk/workspaceagents.go @@ -331,6 +331,7 @@ func (c *Client) ListenWorkspaceAgentTailnet(ctx context.Context) (net.Conn, err return websocket.NetConn(ctx, conn, websocket.MessageBinary), nil } +// @typescript-ignore DialWorkspaceAgentOptions type DialWorkspaceAgentOptions struct { Logger slog.Logger // BlockEndpoints forced a direct connection through DERP. 
@@ -338,6 +339,9 @@ type DialWorkspaceAgentOptions struct { } func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, options *DialWorkspaceAgentOptions) (*AgentConn, error) { + if options == nil { + options = &DialWorkspaceAgentOptions{} + } res, err := c.Request(ctx, http.MethodGet, fmt.Sprintf("/api/v2/workspaceagents/%s/connection", agentID), nil) if err != nil { return nil, err diff --git a/enterprise/cli/features_test.go b/enterprise/cli/features_test.go index 1a59e095c3594..78b94a6509526 100644 --- a/enterprise/cli/features_test.go +++ b/enterprise/cli/features_test.go @@ -57,7 +57,7 @@ func TestFeaturesList(t *testing.T) { var entitlements codersdk.Entitlements err := json.Unmarshal(buf.Bytes(), &entitlements) require.NoError(t, err, "unmarshal JSON output") - assert.Len(t, entitlements.Features, 6) + assert.Len(t, entitlements.Features, 7) assert.Empty(t, entitlements.Warnings) assert.Equal(t, codersdk.EntitlementNotEntitled, entitlements.Features[codersdk.FeatureUserLimit].Entitlement) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 21bc6f497ee1f..1634f82d45366 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -59,6 +59,10 @@ func New(ctx context.Context, options *Options) (*API, error) { api.AGPL.APIHandler.Group(func(r chi.Router) { r.Get("/entitlements", api.serveEntitlements) + r.Route("/replicas", func(r chi.Router) { + r.Use(apiKeyMiddleware) + r.Get("/", api.replicas) + }) r.Route("/licenses", func(r chi.Router) { r.Use(apiKeyMiddleware) r.Post("/", api.postLicense) diff --git a/enterprise/coderd/coderd_test.go b/enterprise/coderd/coderd_test.go index 40421450415a5..7b51845ff3986 100644 --- a/enterprise/coderd/coderd_test.go +++ b/enterprise/coderd/coderd_test.go @@ -85,7 +85,7 @@ func TestEntitlements(t *testing.T) { assert.False(t, res.HasLicense) al = res.Features[codersdk.FeatureAuditLog] assert.Equal(t, codersdk.EntitlementNotEntitled, al.Entitlement) - assert.True(t, al.Enabled) + assert.False(t, al.Enabled) }) t.Run("Pubsub", func(t *testing.T) { t.Parallel() diff --git a/enterprise/coderd/coderdenttest/coderdenttest_test.go b/enterprise/coderd/coderdenttest/coderdenttest_test.go index ef7657ee5301c..0c4e4b3568bf3 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest_test.go +++ b/enterprise/coderd/coderdenttest/coderdenttest_test.go @@ -58,6 +58,10 @@ func TestAuthorizeAllEndpoints(t *testing.T) { AssertAction: rbac.ActionRead, AssertObject: rbac.ResourceLicense, } + assertRoute["GET:/api/v2/replicas"] = coderdtest.RouteCheck{ + AssertAction: rbac.ActionRead, + AssertObject: rbac.ResourceReplicas, + } assertRoute["DELETE:/api/v2/licenses/{id}"] = coderdtest.RouteCheck{ AssertAction: rbac.ActionDelete, AssertObject: rbac.ResourceLicense, diff --git a/enterprise/coderd/replicas.go b/enterprise/coderd/replicas.go index ddb2b8b672186..f8cb64fe553dc 100644 --- a/enterprise/coderd/replicas.go +++ b/enterprise/coderd/replicas.go @@ -1 +1,36 @@ package coderd + +import ( + "net/http" + + "github.com/coder/coder/coderd/database" + "github.com/coder/coder/coderd/httpapi" + "github.com/coder/coder/coderd/rbac" + "github.com/coder/coder/codersdk" +) + +// replicas returns the number of replicas that are active in Coder. 
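    // Illustrative client-side usage, not part of this diff, assuming an
    // authenticated *codersdk.Client:
    //
    //  replicas, err := client.Replicas(ctx)
    //  if err != nil {
    //      return err
    //  }
    //  for _, r := range replicas {
    //      fmt.Printf("%s %s relay=%s error=%q\n", r.Hostname, r.ID, r.RelayAddress, r.Error)
    //  }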
+func (api *API) replicas(rw http.ResponseWriter, r *http.Request) { + if !api.AGPL.Authorize(r, rbac.ActionRead, rbac.ResourceReplicas) { + httpapi.ResourceNotFound(rw) + return + } + + replicas := api.replicaManager.All() + res := make([]codersdk.Replica, 0, len(replicas)) + for _, replica := range replicas { + res = append(res, convertReplica(replica)) + } + httpapi.Write(r.Context(), rw, http.StatusOK, res) +} + +func convertReplica(replica database.Replica) codersdk.Replica { + return codersdk.Replica{ + ID: replica.ID, + Hostname: replica.Hostname, + CreatedAt: replica.CreatedAt, + RelayAddress: replica.RelayAddress, + RegionID: replica.RegionID, + Error: replica.Error.String, + } +} diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index 0da4a05dbbb60..e51f9cc330dc8 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -63,6 +63,10 @@ func TestReplicas(t *testing.T) { }, }) secondClient.SessionToken = firstClient.SessionToken + replicas, err := secondClient.Replicas(context.Background()) + require.NoError(t, err) + require.Len(t, replicas, 2) + agentID := setupWorkspaceAgent(t, firstClient, firstUser) conn, err := secondClient.DialWorkspaceAgent(context.Background(), agentID, &codersdk.DialWorkspaceAgentOptions{ BlockEndpoints: true, @@ -76,5 +80,6 @@ func TestReplicas(t *testing.T) { return err == nil }, testutil.WaitLong, testutil.IntervalFast) _ = conn.Close() + }) } diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index 4d6038a694940..8b8327038e088 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -101,6 +101,14 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data if err != nil { return nil, xerrors.Errorf("run replica: %w", err) } + peers := server.Regional() + if len(peers) > 0 { + self := server.Self() + if self.RelayAddress == "" { + return nil, xerrors.Errorf("a relay address must be specified when running multiple replicas in the same region") + } + } + err = server.subscribe(ctx) if err != nil { return nil, xerrors.Errorf("subscribe: %w", err) diff --git a/enterprise/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go index 5ce774ea5f29a..ccacbeb310c23 100644 --- a/enterprise/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -5,7 +5,6 @@ import ( "net/http" "net/http/httptest" "sync" - "sync/atomic" "testing" "time" @@ -66,6 +65,25 @@ func TestReplica(t *testing.T) { _ = server.Close() require.NoError(t, err) }) + t.Run("ErrorsWithoutRelayAddress", func(t *testing.T) { + // Ensures an error is returned when a replica starts without a + // relay address while other replicas exist in the same region. + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + _, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ + ID: uuid.New(), + CreatedAt: database.Now(), + StartedAt: database.Now(), + UpdatedAt: database.Now(), + Hostname: "something", + }) + require.NoError(t, err) + _, err = replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ + ID: uuid.New(), + }) + require.Error(t, err) + require.Equal(t, "a relay address must be specified when running multiple replicas in the same region", err.Error()) + }) t.Run("ConnectsToPeerReplica", func(t *testing.T) { // Ensures that the replica reports a successful status for // accessing all of its peers.
@@ -85,7 +103,8 @@ func TestReplica(t *testing.T) { }) require.NoError(t, err) server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: uuid.New(), + ID: uuid.New(), + RelayAddress: "http://169.254.169.254", }) require.NoError(t, err) require.Len(t, server.Regional(), 1) @@ -96,12 +115,6 @@ func TestReplica(t *testing.T) { t.Run("ConnectsToFakePeerWithError", func(t *testing.T) { t.Parallel() db, pubsub := dbtestutil.NewDB(t) - var count atomic.Int32 - cancel, err := pubsub.Subscribe(replicasync.PubsubEvent, func(ctx context.Context, message []byte) { - count.Add(1) - }) - require.NoError(t, err) - defer cancel() peer, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ ID: uuid.New(), CreatedAt: database.Now(), @@ -113,16 +126,15 @@ func TestReplica(t *testing.T) { }) require.NoError(t, err) server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: uuid.New(), - PeerTimeout: 1 * time.Millisecond, + ID: uuid.New(), + PeerTimeout: 1 * time.Millisecond, + RelayAddress: "http://169.254.169.254", }) require.NoError(t, err) require.Len(t, server.Regional(), 1) require.Equal(t, peer.ID, server.Regional()[0].ID) require.True(t, server.Self().Error.Valid) require.Contains(t, server.Self().Error.String, "Failed to dial peers") - // Once for the initial creation of a replica, and another time for the error. - require.Equal(t, int32(2), count.Load()) _ = server.Close() }) t.Run("RefreshOnPublish", func(t *testing.T) { diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index 72abae519b469..2289d2100be92 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -268,6 +268,7 @@ export interface DeploymentFlags { readonly derp_server_region_code: StringFlag readonly derp_server_region_name: StringFlag readonly derp_server_stun_address: StringArrayFlag + readonly derp_server_relay_address: StringFlag readonly derp_config_url: StringFlag readonly derp_config_path: StringFlag readonly prom_enabled: BoolFlag @@ -522,6 +523,16 @@ export interface PutExtendWorkspaceRequest { readonly deadline: string } +// From codersdk/replicas.go +export interface Replica { + readonly id: string + readonly hostname: string + readonly created_at: string + readonly relay_address: string + readonly region_id: number + readonly error: string +} + // From codersdk/error.go export interface Response { readonly message: string From d6ce2167a243349472929136e0d1fab032c97ee0 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Fri, 14 Oct 2022 17:23:59 +0000 Subject: [PATCH 27/79] Ensure close kills replica --- cli/root.go | 6 ++-- cli/server.go | 9 ++++-- coderd/coderd.go | 2 +- enterprise/cli/server.go | 31 +++---------------- enterprise/coderd/coderd.go | 11 +++++-- .../coderd/coderdenttest/coderdenttest.go | 2 -- enterprise/coderd/license/license.go | 2 +- enterprise/coderd/license/license_test.go | 2 +- enterprise/coderd/replicas_test.go | 7 ++++- 9 files changed, 32 insertions(+), 40 deletions(-) diff --git a/cli/root.go b/cli/root.go index e29aa534da0a8..91d4551916cc0 100644 --- a/cli/root.go +++ b/cli/root.go @@ -4,6 +4,7 @@ import ( "context" "flag" "fmt" + "io" "net/http" "net/url" "os" @@ -100,8 +101,9 @@ func Core() []*cobra.Command { } func AGPL() []*cobra.Command { - all := append(Core(), Server(deployment.Flags(), func(_ context.Context, _ config.Root, o *coderd.Options) (*coderd.API, error) { - return coderd.New(o), nil + all := 
append(Core(), Server(deployment.Flags(), func(_ context.Context, o *coderd.Options) (*coderd.API, io.Closer, error) { + api := coderd.New(o) + return api, api, nil })) return all } diff --git a/cli/server.go b/cli/server.go index fc5f131da3d7b..3a94716be064d 100644 --- a/cli/server.go +++ b/cli/server.go @@ -67,7 +67,7 @@ import ( ) // nolint:gocyclo -func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, config.Root, *coderd.Options) (*coderd.API, error)) *cobra.Command { +func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *coderd.Options) (*coderd.API, io.Closer, error)) *cobra.Command { root := &cobra.Command{ Use: "server", Short: "Start a Coder server", @@ -463,11 +463,14 @@ func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, confi ), dflags.PromAddress.Value, "prometheus")() } - coderAPI, err := newAPI(ctx, config, options) + // We use a separate closer so the Enterprise API + // can have its own close functions. This is cleaner + // than abstracting the Coder API itself. + coderAPI, closer, err := newAPI(ctx, options) if err != nil { return err } - defer coderAPI.Close() + defer closer.Close() client := codersdk.New(localURL) if dflags.TLSEnable.Value { diff --git a/coderd/coderd.go b/coderd/coderd.go index da1fc0572ccc6..bb16553e47c66 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -123,7 +123,7 @@ func New(options *Options) *API { options.TailnetCoordinator = tailnet.NewCoordinator() } if options.DERPServer == nil { - options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger.Named("derp").Leveled(slog.LevelDebug))) + options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger.Named("derp"))) options.DERPServer.SetMeshKey("todo-kyle-change-this") } if options.Auditor == nil { diff --git a/enterprise/cli/server.go b/enterprise/cli/server.go index cc44985e0a4d4..f3e99c1613ab8 100644 --- a/enterprise/cli/server.go +++ b/enterprise/cli/server.go @@ -2,15 +2,12 @@ package cli import ( "context" + "io" "net/url" - "github.com/google/uuid" "github.com/spf13/cobra" "golang.org/x/xerrors" - "cdr.dev/slog" - - "github.com/coder/coder/cli/config" "github.com/coder/coder/cli/deployment" "github.com/coder/coder/enterprise/coderd" @@ -20,28 +17,11 @@ import ( func server() *cobra.Command { dflags := deployment.Flags() - cmd := agpl.Server(dflags, func(ctx context.Context, cfg config.Root, options *agplcoderd.Options) (*agplcoderd.API, error) { - replicaIDRaw, err := cfg.ReplicaID().Read() - generatedReplicaID := false - if err != nil { - replicaIDRaw = uuid.NewString() - generatedReplicaID = true - } - replicaID, err := uuid.Parse(replicaIDRaw) - if err != nil { - options.Logger.Warn(ctx, "failed to parse replica id", slog.Error(err), slog.F("replica_id", replicaIDRaw)) - replicaID = uuid.New() - generatedReplicaID = true - } - if generatedReplicaID { - // Make sure we save it to be reused later!
- _ = cfg.ReplicaID().Write(replicaID.String()) - } - + cmd := agpl.Server(dflags, func(ctx context.Context, options *agplcoderd.Options) (*agplcoderd.API, io.Closer, error) { if dflags.DerpServerRelayAddress.Value != "" { _, err := url.Parse(dflags.DerpServerRelayAddress.Value) if err != nil { - return nil, xerrors.Errorf("derp-server-relay-address must be a valid HTTP URL: %w", err) + return nil, nil, xerrors.Errorf("derp-server-relay-address must be a valid HTTP URL: %w", err) } } @@ -51,7 +31,6 @@ func server() *cobra.Command { SCIMAPIKey: []byte(dflags.SCIMAuthHeader.Value), UserWorkspaceQuota: dflags.UserWorkspaceQuota.Value, RBAC: true, - ReplicaID: replicaID, DERPServerRelayAddress: dflags.DerpServerRelayAddress.Value, DERPServerRegionID: dflags.DerpServerRegionID.Value, @@ -59,9 +38,9 @@ func server() *cobra.Command { } api, err := coderd.New(ctx, o) if err != nil { - return nil, err + return nil, nil, err } - return api.AGPL, nil + return api.AGPL, api, nil }) deployment.AttachFlags(cmd.Flags(), dflags, true) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 1634f82d45366..a25b432a16a7c 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -123,7 +123,8 @@ func New(ctx context.Context, options *Options) (*API, error) { var err error api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, replicasync.Options{ - ID: options.ReplicaID, + // Create a new replica ID for each Coder instance! + ID: uuid.New(), RelayAddress: options.DERPServerRelayAddress, RegionID: int32(options.DERPServerRegionID), }) @@ -154,7 +155,6 @@ type Options struct { // Used for high availability. DERPServerRelayAddress string DERPServerRegionID int - ReplicaID uuid.UUID EntitlementsUpdateInterval time.Duration Keys map[string]ed25519.PublicKey @@ -256,10 +256,15 @@ func (api *API) updateEntitlements(ctx context.Context) error { addresses = append(addresses, replica.RelayAddress) } api.derpMesh.SetAddresses(addresses) + _ = api.updateEntitlements(ctx) }) } else { api.derpMesh.SetAddresses([]string{}) - api.replicaManager.SetCallback(func() {}) + api.replicaManager.SetCallback(func() { + // If the number of replicas changes, so should our entitlements. + // This is to display a warning in the UI if the user is unlicensed. + _ = api.updateEntitlements(ctx) + }) } // Recheck changed in case the HA coordinator failed to set up.
diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index ea172c43116e4..fd1080a3ff30f 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -9,7 +9,6 @@ import ( "time" "github.com/golang-jwt/jwt/v4" - "github.com/google/uuid" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -69,7 +68,6 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c SCIMAPIKey: options.SCIMAPIKey, DERPServerRelayAddress: oop.AccessURL.String(), DERPServerRegionID: oop.DERPMap.RegionIDs()[0], - ReplicaID: uuid.New(), UserWorkspaceQuota: options.UserWorkspaceQuota, Options: oop, EntitlementsUpdateInterval: options.EntitlementsUpdateInterval, diff --git a/enterprise/coderd/license/license.go b/enterprise/coderd/license/license.go index 633b1a5056cab..f168f7472c80c 100644 --- a/enterprise/coderd/license/license.go +++ b/enterprise/coderd/license/license.go @@ -176,7 +176,7 @@ func Entitlements( "You have multiple replicas but your license is not entitled to high availability.") } else { entitlements.Warnings = append(entitlements.Warnings, - "You have multiple replicas but high availability is an Enterprise feature. Contact sales to get a license.") + "You have multiple replicas but high availability is an Enterprise feature.") } case codersdk.EntitlementGracePeriod: entitlements.Warnings = append(entitlements.Warnings, diff --git a/enterprise/coderd/license/license_test.go b/enterprise/coderd/license/license_test.go index 5b50bdb97cfe2..4d0f09913037d 100644 --- a/enterprise/coderd/license/license_test.go +++ b/enterprise/coderd/license/license_test.go @@ -228,7 +228,7 @@ func TestEntitlements(t *testing.T) { require.NoError(t, err) require.False(t, entitlements.HasLicense) require.Len(t, entitlements.Warnings, 1) - require.Equal(t, "You have multiple replicas but high availability is an Enterprise feature. 
Contact sales to get a license.", entitlements.Warnings[0]) + require.Equal(t, "You have multiple replicas but high availability is an Enterprise feature.", entitlements.Warnings[0]) }) t.Run("MultipleReplicasNotEntitled", func(t *testing.T) { diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index e51f9cc330dc8..3d41e83deb964 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -30,7 +30,7 @@ func TestReplicas(t *testing.T) { }, }) _ = coderdtest.CreateFirstUser(t, firstClient) - secondClient := coderdenttest.New(t, &coderdenttest.Options{ + secondClient, _, secondAPI := coderdenttest.NewWithAPI(t, &coderdenttest.Options{ Options: &coderdtest.Options{ Database: db, Pubsub: pubsub, @@ -40,6 +40,11 @@ func TestReplicas(t *testing.T) { ents, err := secondClient.Entitlements(context.Background()) require.NoError(t, err) require.Len(t, ents.Warnings, 1) + _ = secondAPI.Close() + + ents, err = firstClient.Entitlements(context.Background()) + require.NoError(t, err) + require.Len(t, ents.Warnings, 0) }) t.Run("ConnectAcrossMultiple", func(t *testing.T) { t.Parallel() From d7cc0ff9bb7255b150a84b4bb683ac90f87d4089 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Fri, 14 Oct 2022 22:36:51 +0000 Subject: [PATCH 28/79] Update sql --- cli/server.go | 10 +- ...icas.down.sql => 000061_replicas.down.sql} | 0 ...replicas.up.sql => 000061_replicas.up.sql} | 0 codersdk/features.go | 1 + enterprise/coderd/coderd.go | 2 +- enterprise/coderd/license/license.go | 11 ++- enterprise/coderd/license/license_test.go | 10 +- enterprise/derpmesh/derpmesh.go | 22 +++-- enterprise/derpmesh/derpmesh_test.go | 99 +++++++++++++++++-- go.mod | 2 +- go.sum | 4 +- 11 files changed, 124 insertions(+), 37 deletions(-) rename coderd/database/migrations/{000059_replicas.down.sql => 000061_replicas.down.sql} (100%) rename coderd/database/migrations/{000059_replicas.up.sql => 000061_replicas.up.sql} (100%) diff --git a/cli/server.go b/cli/server.go index 3a94716be064d..1ab1a6228f356 100644 --- a/cli/server.go +++ b/cli/server.go @@ -165,9 +165,10 @@ func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *code } defer listener.Close() + var tlsConfig *tls.Config if dflags.TLSEnable.Value { - listener, err = configureServerTLS( - listener, dflags.TLSMinVersion.Value, + tlsConfig, err = configureTLS( + dflags.TLSMinVersion.Value, dflags.TLSClientAuth.Value, dflags.TLSCertFiles.Value, dflags.TLSKeyFiles.Value, @@ -176,6 +177,7 @@ func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *code if err != nil { return xerrors.Errorf("configure tls: %w", err) } + listener = tls.NewListener(listener, tlsConfig) } tcpAddr, valid := listener.Addr().(*net.TCPAddr) @@ -888,7 +890,7 @@ func loadCertificates(tlsCertFiles, tlsKeyFiles []string) ([]tls.Certificate, er return certs, nil } -func configureServerTLS(listener net.Listener, tlsMinVersion, tlsClientAuth string, tlsCertFiles, tlsKeyFiles []string, tlsClientCAFile string) (net.Listener, error) { +func configureTLS(tlsMinVersion, tlsClientAuth string, tlsCertFiles, tlsKeyFiles []string, tlsClientCAFile string) (*tls.Config, error) { tlsConfig := &tls.Config{ MinVersion: tls.VersionTLS12, } @@ -958,7 +960,7 @@ func configureServerTLS(listener net.Listener, tlsMinVersion, tlsClientAuth stri tlsConfig.ClientCAs = caPool } - return tls.NewListener(listener, tlsConfig), nil + return tlsConfig, nil } func configureGithubOAuth2(accessURL *url.URL, clientID, clientSecret string, 
allowSignups bool, allowOrgs []string, rawTeams []string, enterpriseBaseURL string) (*coderd.GithubOAuth2Config, error) { diff --git a/coderd/database/migrations/000059_replicas.down.sql b/coderd/database/migrations/000061_replicas.down.sql similarity index 100% rename from coderd/database/migrations/000059_replicas.down.sql rename to coderd/database/migrations/000061_replicas.down.sql diff --git a/coderd/database/migrations/000059_replicas.up.sql b/coderd/database/migrations/000061_replicas.up.sql similarity index 100% rename from coderd/database/migrations/000059_replicas.up.sql rename to coderd/database/migrations/000061_replicas.up.sql diff --git a/codersdk/features.go b/codersdk/features.go index 799307e8fe898..862411de62872 100644 --- a/codersdk/features.go +++ b/codersdk/features.go @@ -44,6 +44,7 @@ type Feature struct { type Entitlements struct { Features map[string]Feature `json:"features"` Warnings []string `json:"warnings"` + Errors []string `json:"errors"` HasLicense bool `json:"has_license"` Experimental bool `json:"experimental"` Trial bool `json:"trial"` diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 294ff0eef1c71..612e710395722 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -137,7 +137,7 @@ func New(ctx context.Context, options *Options) (*API, error) { if err != nil { return nil, xerrors.Errorf("initialize replica: %w", err) } - api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer) + api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, nil) err = api.updateEntitlements(ctx) if err != nil { diff --git a/enterprise/coderd/license/license.go b/enterprise/coderd/license/license.go index f168f7472c80c..c5bb689db65a9 100644 --- a/enterprise/coderd/license/license.go +++ b/enterprise/coderd/license/license.go @@ -30,6 +30,7 @@ func Entitlements( entitlements := codersdk.Entitlements{ Features: map[string]codersdk.Feature{}, Warnings: []string{}, + Errors: []string{}, } for _, featureName := range codersdk.FeatureNames { entitlements.Features[featureName] = codersdk.Feature{ @@ -172,15 +173,15 @@ func Entitlements( switch feature.Entitlement { case codersdk.EntitlementNotEntitled: if entitlements.HasLicense { - entitlements.Warnings = append(entitlements.Warnings, - "You have multiple replicas but your license is not entitled to high availability.") + entitlements.Errors = append(entitlements.Errors, + "You have multiple replicas but your license is not entitled to high availability. You will be unable to connect to workspaces.") } else { - entitlements.Warnings = append(entitlements.Warnings, - "You have multiple replicas but high availability is an Enterprise feature.") + entitlements.Errors = append(entitlements.Errors, + "You have multiple replicas but high availability is an Enterprise feature. You will be unable to connect to workspaces.") } case codersdk.EntitlementGracePeriod: entitlements.Warnings = append(entitlements.Warnings, - "You have multiple replicas but your license for high availability is expired.") + "You have multiple replicas but your license for high availability is expired.
Reduce to one replica or workspace connections will stop working.") } } diff --git a/enterprise/coderd/license/license_test.go b/enterprise/coderd/license/license_test.go index 4d0f09913037d..6def291e3e24c 100644 --- a/enterprise/coderd/license/license_test.go +++ b/enterprise/coderd/license/license_test.go @@ -227,8 +227,8 @@ func TestEntitlements(t *testing.T) { entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 2, coderdenttest.Keys, all) require.NoError(t, err) require.False(t, entitlements.HasLicense) - require.Len(t, entitlements.Warnings, 1) - require.Equal(t, "You have multiple replicas but high availability is an Enterprise feature.", entitlements.Warnings[0]) + require.Len(t, entitlements.Errors, 1) + require.Equal(t, "You have multiple replicas but high availability is an Enterprise feature. You will be unable to connect to workspaces.", entitlements.Errors[0]) }) t.Run("MultipleReplicasNotEntitled", func(t *testing.T) { @@ -245,8 +245,8 @@ func TestEntitlements(t *testing.T) { }) require.NoError(t, err) require.True(t, entitlements.HasLicense) - require.Len(t, entitlements.Warnings, 1) - require.Equal(t, "You have multiple replicas but your license is not entitled to high availability.", entitlements.Warnings[0]) + require.Len(t, entitlements.Errors, 1) + require.Equal(t, "You have multiple replicas but your license is not entitled to high availability. You will be unable to connect to workspaces.", entitlements.Errors[0]) }) t.Run("MultipleReplicasGrace", func(t *testing.T) { @@ -266,6 +266,6 @@ func TestEntitlements(t *testing.T) { require.NoError(t, err) require.True(t, entitlements.HasLicense) require.Len(t, entitlements.Warnings, 1) - require.Equal(t, "You have multiple replicas but your license for high availability is expired.", entitlements.Warnings[0]) + require.Equal(t, "You have multiple replicas but your license for high availability is expired. Reduce to one replica or workspace connections will stop working.", entitlements.Warnings[0]) }) } diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go index dbdf7bc1b1f3a..8f51343017593 100644 --- a/enterprise/derpmesh/derpmesh.go +++ b/enterprise/derpmesh/derpmesh.go @@ -2,6 +2,7 @@ package derpmesh import ( "context" + "crypto/tls" "net" "net/url" "sync" @@ -17,20 +18,22 @@ import ( ) // New constructs a new mesh for DERP servers. 
-func New(logger slog.Logger, server *derp.Server) *Mesh { +func New(logger slog.Logger, server *derp.Server, tlsConfig *tls.Config) *Mesh { return &Mesh{ - logger: logger, - server: server, - ctx: context.Background(), - closed: make(chan struct{}), - active: make(map[string]context.CancelFunc), + logger: logger, + server: server, + tlsConfig: tlsConfig, + ctx: context.Background(), + closed: make(chan struct{}), + active: make(map[string]context.CancelFunc), } } type Mesh struct { - logger slog.Logger - server *derp.Server - ctx context.Context + logger slog.Logger + server *derp.Server + ctx context.Context + tlsConfig *tls.Config mutex sync.Mutex closed chan struct{} @@ -93,6 +96,7 @@ func (m *Mesh) addAddress(address string) (bool, error) { if err != nil { return false, xerrors.Errorf("create derp client: %w", err) } + client.TLSConfig = m.tlsConfig client.MeshKey = m.server.MeshKey() client.SetURLDialer(func(ctx context.Context, network, addr string) (net.Conn, error) { var dialer net.Dialer diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index 313c33da99bad..139e42566ffb1 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -1,11 +1,22 @@ package derpmesh_test import ( + "bytes" "context" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" "errors" "io" + "math/big" + "net" "net/http/httptest" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -29,12 +40,41 @@ func TestDERPMesh(t *testing.T) { t.Run("ExchangeMessages", func(t *testing.T) { // This tests messages passing through multiple DERP servers. t.Parallel() - firstServer, firstServerURL := startDERP(t) + firstServer, firstServerURL, firstTLSName := startDERP(t) defer firstServer.Close() - secondServer, secondServerURL := startDERP(t) - firstMesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), firstServer) + secondServer, secondServerURL, secondTLSName := startDERP(t) + firstMesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), firstServer, firstTLSName) firstMesh.SetAddresses([]string{secondServerURL}) - secondMesh := derpmesh.New(slogtest.Make(t, nil).Named("second").Leveled(slog.LevelDebug), secondServer) + secondMesh := derpmesh.New(slogtest.Make(t, nil).Named("second").Leveled(slog.LevelDebug), secondServer, secondTLSName) + secondMesh.SetAddresses([]string{firstServerURL}) + defer firstMesh.Close() + defer secondMesh.Close() + + first := key.NewNode() + second := key.NewNode() + firstClient, err := derphttp.NewClient(first, secondServerURL, tailnet.Logger(slogtest.Make(t, nil))) + require.NoError(t, err) + secondClient, err := derphttp.NewClient(second, firstServerURL, tailnet.Logger(slogtest.Make(t, nil))) + require.NoError(t, err) + err = secondClient.Connect(context.Background()) + require.NoError(t, err) + + sent := []byte("hello world") + err = firstClient.Send(second.Public(), sent) + require.NoError(t, err) + + got := recvData(t, secondClient) + require.Equal(t, sent, got) + }) + t.Run("ExchangeMessages", func(t *testing.T) { + // This tests messages passing through multiple DERP servers. 
+ t.Parallel() + firstServer, firstServerURL, firstTLSName := startDERP(t) + defer firstServer.Close() + secondServer, secondServerURL, secondTLSName := startDERP(t) + firstMesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), firstServer, firstTLSName) + firstMesh.SetAddresses([]string{secondServerURL}) + secondMesh := derpmesh.New(slogtest.Make(t, nil).Named("second").Leveled(slog.LevelDebug), secondServer, secondTLSName) secondMesh.SetAddresses([]string{firstServerURL}) defer firstMesh.Close() defer secondMesh.Close() @@ -58,8 +98,8 @@ func TestDERPMesh(t *testing.T) { t.Run("RemoveAddress", func(t *testing.T) { // This tests messages passing through multiple DERP servers. t.Parallel() - server, serverURL := startDERP(t) - mesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), server) + server, serverURL, tlsName := startDERP(t) + mesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), server, tlsName) mesh.SetAddresses([]string{"http://fake.com"}) // This should trigger a removal... mesh.SetAddresses([]string{}) @@ -84,8 +124,8 @@ func TestDERPMesh(t *testing.T) { meshes := make([]*derpmesh.Mesh, 0, 20) serverURLs := make([]string, 0, 20) for i := 0; i < 20; i++ { - server, url := startDERP(t) - mesh := derpmesh.New(slogtest.Make(t, nil).Named("mesh").Leveled(slog.LevelDebug), server) + server, url, tlsName := startDERP(t) + mesh := derpmesh.New(slogtest.Make(t, nil).Named("mesh").Leveled(slog.LevelDebug), server, tlsName) t.Cleanup(func() { _ = server.Close() _ = mesh.Close() @@ -132,15 +172,54 @@ func recvData(t *testing.T, client *derphttp.Client) []byte { } } -func startDERP(t *testing.T) (*derp.Server, string) { +func startDERP(t *testing.T) (*derp.Server, string, *tls.Config) { logf := tailnet.Logger(slogtest.Make(t, nil)) d := derp.NewServer(key.NewNode(), logf) d.SetMeshKey("some-key") server := httptest.NewUnstartedServer(derphttp.Handler(d)) + commonName := "something.org" + server.TLS = &tls.Config{ + Certificates: []tls.Certificate{generateTLSCertificate(t, commonName)}, + } server.Start() t.Cleanup(func() { _ = d.Close() }) t.Cleanup(server.Close) - return d, server.URL + return d, server.URL, server.TLS +} + +func generateTLSCertificate(t testing.TB, commonName string) tls.Certificate { + privateKey, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + require.NoError(t, err) + template := x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{ + Organization: []string{"Acme Co"}, + CommonName: commonName, + }, + DNSNames: []string{commonName}, + NotBefore: time.Now(), + NotAfter: time.Now().Add(time.Hour * 24 * 180), + + KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + BasicConstraintsValid: true, + IPAddresses: []net.IP{net.ParseIP("127.0.0.1")}, + } + + derBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, &privateKey.PublicKey, privateKey) + require.NoError(t, err) + var certFile bytes.Buffer + require.NoError(t, err) + _, err = certFile.Write(pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes})) + require.NoError(t, err) + privateKeyBytes, err := x509.MarshalPKCS8PrivateKey(privateKey) + require.NoError(t, err) + var keyFile bytes.Buffer + err = pem.Encode(&keyFile, &pem.Block{Type: "PRIVATE KEY", Bytes: privateKeyBytes}) + require.NoError(t, err) + cert, err := tls.X509KeyPair(certFile.Bytes(), keyFile.Bytes()) + require.NoError(t, err) + 
return cert } diff --git a/go.mod b/go.mod index 9834e27e5f39c..b33a438eb3d08 100644 --- a/go.mod +++ b/go.mod @@ -40,7 +40,7 @@ replace github.com/tcnksm/go-httpstat => github.com/kylecarbs/go-httpstat v0.0.0 // There are a few minor changes we make to Tailscale that we're slowly upstreaming. Compare here: // https://github.com/tailscale/tailscale/compare/main...coder:tailscale:main -replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20220926024748-50f068456c6c +replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20221014173742-9f1da7795630 // Switch to our fork that imports fixes from http://github.com/tailscale/ssh. // See: https://github.com/coder/coder/issues/3371 diff --git a/go.sum b/go.sum index 13fdc5724f6b6..5852582c26c4a 100644 --- a/go.sum +++ b/go.sum @@ -351,8 +351,8 @@ github.com/coder/retry v1.3.0 h1:5lAAwt/2Cm6lVmnfBY7sOMXcBOwcwJhmV5QGSELIVWY= github.com/coder/retry v1.3.0/go.mod h1:tXuRgZgWjUnU5LZPT4lJh4ew2elUhexhlnXzrJWdyFY= github.com/coder/ssh v0.0.0-20220811105153-fcea99919338 h1:tN5GKFT68YLVzJoA8AHuiMNJ0qlhoD3pGN3JY9gxSko= github.com/coder/ssh v0.0.0-20220811105153-fcea99919338/go.mod h1:ZSS+CUoKHDrqVakTfTWUlKSr9MtMFkC4UvtQKD7O914= -github.com/coder/tailscale v1.1.1-0.20220926024748-50f068456c6c h1:xa6lr5Pj87Is26tgpzwBsEGKL7aVz7/fRGgY9QIbf3E= -github.com/coder/tailscale v1.1.1-0.20220926024748-50f068456c6c/go.mod h1:5amxy08qijEa8bcTW2SeIy4MIqcmd7LMsuOxqOlj2Ak= +github.com/coder/tailscale v1.1.1-0.20221014173742-9f1da7795630 h1:FgWWdu0fnFEpUNjW0vOaCuOxOZ/GQzn6oo7p5IMlSA0= +github.com/coder/tailscale v1.1.1-0.20221014173742-9f1da7795630/go.mod h1:5amxy08qijEa8bcTW2SeIy4MIqcmd7LMsuOxqOlj2Ak= github.com/containerd/aufs v0.0.0-20200908144142-dab0cbea06f4/go.mod h1:nukgQABAEopAHvB6j7cnP5zJ+/3aVcE7hCYqvIwAHyE= github.com/containerd/aufs v0.0.0-20201003224125-76a6863f2989/go.mod h1:AkGGQs9NM2vtYHaUen+NljV0/baGCAPELGm2q9ZXpWU= github.com/containerd/aufs v0.0.0-20210316121734-20793ff83c97/go.mod h1:kL5kd6KM5TzQjR79jljyi4olc1Vrx6XBlcyj3gNv2PU= From 9914840133605ac75f90357ae3801355cd90c91d Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 00:23:22 +0000 Subject: [PATCH 29/79] Add database latency to high availability --- coderd/database/databasefake/databasefake.go | 22 ++++--- coderd/database/db.go | 9 +++ coderd/database/dump.sql | 1 + .../migrations/000061_replicas.up.sql | 2 + coderd/database/models.go | 21 +++--- coderd/database/queries.sql.go | 57 +++++++++------- coderd/database/queries/replicas.sql | 9 +-- codersdk/replicas.go | 2 + enterprise/coderd/replicas.go | 13 ++-- enterprise/replicasync/replicasync.go | 64 ++++++++++-------- site/src/api/typesGenerated.ts | 1 + .../LicenseBanner/LicenseBanner.tsx | 6 +- .../LicenseBannerView.stories.tsx | 10 +++ .../LicenseBanner/LicenseBannerView.tsx | 66 +++++++++++-------- site/src/testHelpers/entities.ts | 3 + .../entitlements/entitlementsXService.ts | 1 + 16 files changed, 180 insertions(+), 107 deletions(-) diff --git a/coderd/database/databasefake/databasefake.go b/coderd/database/databasefake/databasefake.go index e58fb990271ca..b4724a9afe0aa 100644 --- a/coderd/database/databasefake/databasefake.go +++ b/coderd/database/databasefake/databasefake.go @@ -113,6 +113,10 @@ type data struct { lastLicenseID int32 } +func (q *fakeQuerier) Ping(_ context.Context) (time.Duration, error) { + return 0, nil +} + // InTx doesn't rollback data properly for in-memory yet. 
func (q *fakeQuerier) InTx(fn func(database.Store) error) error { q.mutex.Lock() @@ -3170,14 +3174,15 @@ func (q *fakeQuerier) InsertReplica(_ context.Context, arg database.InsertReplic defer q.mutex.Unlock() replica := database.Replica{ - ID: arg.ID, - CreatedAt: arg.CreatedAt, - StartedAt: arg.StartedAt, - UpdatedAt: arg.UpdatedAt, - Hostname: arg.Hostname, - RegionID: arg.RegionID, - RelayAddress: arg.RelayAddress, - Version: arg.Version, + ID: arg.ID, + CreatedAt: arg.CreatedAt, + StartedAt: arg.StartedAt, + UpdatedAt: arg.UpdatedAt, + Hostname: arg.Hostname, + RegionID: arg.RegionID, + RelayAddress: arg.RelayAddress, + Version: arg.Version, + DatabaseLatency: arg.DatabaseLatency, } q.replicas = append(q.replicas, replica) return replica, nil @@ -3199,6 +3204,7 @@ func (q *fakeQuerier) UpdateReplica(_ context.Context, arg database.UpdateReplic replica.RegionID = arg.RegionID replica.Version = arg.Version replica.Error = arg.Error + replica.DatabaseLatency = arg.DatabaseLatency q.replicas[index] = replica return replica, nil } diff --git a/coderd/database/db.go b/coderd/database/db.go index 4cbbdb399f193..020000888f8eb 100644 --- a/coderd/database/db.go +++ b/coderd/database/db.go @@ -12,6 +12,7 @@ import ( "context" "database/sql" "errors" + "time" "github.com/jmoiron/sqlx" "golang.org/x/xerrors" @@ -24,6 +25,7 @@ type Store interface { // customQuerier contains custom queries that are not generated. customQuerier + Ping(ctx context.Context) (time.Duration, error) InTx(func(Store) error) error } @@ -58,6 +60,13 @@ type sqlQuerier struct { db DBTX } +// Ping returns the time it takes to ping the database. +func (q *sqlQuerier) Ping(ctx context.Context) (time.Duration, error) { + start := time.Now() + err := q.sdb.PingContext(ctx) + return time.Since(start), err +} + // InTx performs database operations inside a transaction. func (q *sqlQuerier) InTx(function func(Store) error) error { if _, ok := q.db.(*sqlx.Tx); ok { diff --git a/coderd/database/dump.sql b/coderd/database/dump.sql index ca301ac8504b7..1e0a18c1dafef 100644 --- a/coderd/database/dump.sql +++ b/coderd/database/dump.sql @@ -293,6 +293,7 @@ CREATE TABLE replicas ( hostname text NOT NULL, region_id integer NOT NULL, relay_address text NOT NULL, + database_latency integer NOT NULL, version text NOT NULL, error text ); diff --git a/coderd/database/migrations/000061_replicas.up.sql b/coderd/database/migrations/000061_replicas.up.sql index a07587f35a234..b1d1a1ab13ee0 100644 --- a/coderd/database/migrations/000061_replicas.up.sql +++ b/coderd/database/migrations/000061_replicas.up.sql @@ -17,6 +17,8 @@ CREATE TABLE IF NOT EXISTS replicas ( region_id integer NOT NULL, -- An address that should be accessible to other replicas. relay_address text NOT NULL, + -- The latency of the replica to the database in microseconds. + database_latency int NOT NULL, -- Version is the Coder version of the replica. 
version text NOT NULL, error text diff --git a/coderd/database/models.go b/coderd/database/models.go index 55867a164bd98..b4601ecadeb78 100644 --- a/coderd/database/models.go +++ b/coderd/database/models.go @@ -540,16 +540,17 @@ type ProvisionerJobLog struct { } type Replica struct { - ID uuid.UUID `db:"id" json:"id"` - CreatedAt time.Time `db:"created_at" json:"created_at"` - StartedAt time.Time `db:"started_at" json:"started_at"` - StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` - UpdatedAt time.Time `db:"updated_at" json:"updated_at"` - Hostname string `db:"hostname" json:"hostname"` - RegionID int32 `db:"region_id" json:"region_id"` - RelayAddress string `db:"relay_address" json:"relay_address"` - Version string `db:"version" json:"version"` - Error sql.NullString `db:"error" json:"error"` + ID uuid.UUID `db:"id" json:"id"` + CreatedAt time.Time `db:"created_at" json:"created_at"` + StartedAt time.Time `db:"started_at" json:"started_at"` + StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + Hostname string `db:"hostname" json:"hostname"` + RegionID int32 `db:"region_id" json:"region_id"` + RelayAddress string `db:"relay_address" json:"relay_address"` + DatabaseLatency int32 `db:"database_latency" json:"database_latency"` + Version string `db:"version" json:"version"` + Error sql.NullString `db:"error" json:"error"` } type SiteConfig struct { diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 8577903ecc0a2..241474e7e66bd 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -2580,7 +2580,7 @@ func (q *sqlQuerier) DeleteReplicasUpdatedBefore(ctx context.Context, updatedAt } const getReplicaByID = `-- name: GetReplicaByID :one -SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, version, error FROM replicas WHERE id = $1 +SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error FROM replicas WHERE id = $1 ` func (q *sqlQuerier) GetReplicaByID(ctx context.Context, id uuid.UUID) (Replica, error) { @@ -2595,6 +2595,7 @@ func (q *sqlQuerier) GetReplicaByID(ctx context.Context, id uuid.UUID) (Replica, &i.Hostname, &i.RegionID, &i.RelayAddress, + &i.DatabaseLatency, &i.Version, &i.Error, ) @@ -2602,7 +2603,7 @@ func (q *sqlQuerier) GetReplicaByID(ctx context.Context, id uuid.UUID) (Replica, } const getReplicasUpdatedAfter = `-- name: GetReplicasUpdatedAfter :many -SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, version, error FROM replicas WHERE updated_at > $1 AND stopped_at IS NULL +SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error FROM replicas WHERE updated_at > $1 AND stopped_at IS NULL ` func (q *sqlQuerier) GetReplicasUpdatedAfter(ctx context.Context, updatedAt time.Time) ([]Replica, error) { @@ -2623,6 +2624,7 @@ func (q *sqlQuerier) GetReplicasUpdatedAfter(ctx context.Context, updatedAt time &i.Hostname, &i.RegionID, &i.RelayAddress, + &i.DatabaseLatency, &i.Version, &i.Error, ); err != nil { @@ -2648,20 +2650,21 @@ INSERT INTO replicas ( hostname, region_id, relay_address, - version - -) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) RETURNING id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, version, error + version, + database_latency +) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) 
RETURNING id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error ` type InsertReplicaParams struct { - ID uuid.UUID `db:"id" json:"id"` - CreatedAt time.Time `db:"created_at" json:"created_at"` - StartedAt time.Time `db:"started_at" json:"started_at"` - UpdatedAt time.Time `db:"updated_at" json:"updated_at"` - Hostname string `db:"hostname" json:"hostname"` - RegionID int32 `db:"region_id" json:"region_id"` - RelayAddress string `db:"relay_address" json:"relay_address"` - Version string `db:"version" json:"version"` + ID uuid.UUID `db:"id" json:"id"` + CreatedAt time.Time `db:"created_at" json:"created_at"` + StartedAt time.Time `db:"started_at" json:"started_at"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + Hostname string `db:"hostname" json:"hostname"` + RegionID int32 `db:"region_id" json:"region_id"` + RelayAddress string `db:"relay_address" json:"relay_address"` + Version string `db:"version" json:"version"` + DatabaseLatency int32 `db:"database_latency" json:"database_latency"` } func (q *sqlQuerier) InsertReplica(ctx context.Context, arg InsertReplicaParams) (Replica, error) { @@ -2674,6 +2677,7 @@ func (q *sqlQuerier) InsertReplica(ctx context.Context, arg InsertReplicaParams) arg.RegionID, arg.RelayAddress, arg.Version, + arg.DatabaseLatency, ) var i Replica err := row.Scan( @@ -2685,6 +2689,7 @@ func (q *sqlQuerier) InsertReplica(ctx context.Context, arg InsertReplicaParams) &i.Hostname, &i.RegionID, &i.RelayAddress, + &i.DatabaseLatency, &i.Version, &i.Error, ) @@ -2700,20 +2705,22 @@ UPDATE replicas SET region_id = $6, hostname = $7, version = $8, - error = $9 -WHERE id = $1 RETURNING id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, version, error + error = $9, + database_latency = $10 +WHERE id = $1 RETURNING id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error ` type UpdateReplicaParams struct { - ID uuid.UUID `db:"id" json:"id"` - UpdatedAt time.Time `db:"updated_at" json:"updated_at"` - StartedAt time.Time `db:"started_at" json:"started_at"` - StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` - RelayAddress string `db:"relay_address" json:"relay_address"` - RegionID int32 `db:"region_id" json:"region_id"` - Hostname string `db:"hostname" json:"hostname"` - Version string `db:"version" json:"version"` - Error sql.NullString `db:"error" json:"error"` + ID uuid.UUID `db:"id" json:"id"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + StartedAt time.Time `db:"started_at" json:"started_at"` + StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` + RelayAddress string `db:"relay_address" json:"relay_address"` + RegionID int32 `db:"region_id" json:"region_id"` + Hostname string `db:"hostname" json:"hostname"` + Version string `db:"version" json:"version"` + Error sql.NullString `db:"error" json:"error"` + DatabaseLatency int32 `db:"database_latency" json:"database_latency"` } func (q *sqlQuerier) UpdateReplica(ctx context.Context, arg UpdateReplicaParams) (Replica, error) { @@ -2727,6 +2734,7 @@ func (q *sqlQuerier) UpdateReplica(ctx context.Context, arg UpdateReplicaParams) arg.Hostname, arg.Version, arg.Error, + arg.DatabaseLatency, ) var i Replica err := row.Scan( @@ -2738,6 +2746,7 @@ func (q *sqlQuerier) UpdateReplica(ctx context.Context, arg UpdateReplicaParams) &i.Hostname, &i.RegionID, &i.RelayAddress, + &i.DatabaseLatency, &i.Version, &i.Error, ) diff --git 
a/coderd/database/queries/replicas.sql b/coderd/database/queries/replicas.sql index a7aa5b0aa1dee..5a62527fac107 100644 --- a/coderd/database/queries/replicas.sql +++ b/coderd/database/queries/replicas.sql @@ -13,9 +13,9 @@ INSERT INTO replicas ( hostname, region_id, relay_address, - version - -) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) RETURNING *; + version, + database_latency +) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) RETURNING *; -- name: UpdateReplica :one UPDATE replicas SET @@ -26,7 +26,8 @@ UPDATE replicas SET region_id = $6, hostname = $7, version = $8, - error = $9 + error = $9, + database_latency = $10 WHERE id = $1 RETURNING *; -- name: DeleteReplicasUpdatedBefore :exec diff --git a/codersdk/replicas.go b/codersdk/replicas.go index 8e698fd3e6345..e74af021ee9a3 100644 --- a/codersdk/replicas.go +++ b/codersdk/replicas.go @@ -23,6 +23,8 @@ type Replica struct { RegionID int32 `json:"region_id"` // Error is the error. Error string `json:"error"` + // DatabaseLatency is the latency in microseconds to the database. + DatabaseLatency int32 `json:"database_latency"` } // Replicas fetches the list of replicas. diff --git a/enterprise/coderd/replicas.go b/enterprise/coderd/replicas.go index f8cb64fe553dc..c07c37243d0ca 100644 --- a/enterprise/coderd/replicas.go +++ b/enterprise/coderd/replicas.go @@ -26,11 +26,12 @@ func (api *API) replicas(rw http.ResponseWriter, r *http.Request) { func convertReplica(replica database.Replica) codersdk.Replica { return codersdk.Replica{ - ID: replica.ID, - Hostname: replica.Hostname, - CreatedAt: replica.CreatedAt, - RelayAddress: replica.RelayAddress, - RegionID: replica.RegionID, - Error: replica.Error.String, + ID: replica.ID, + Hostname: replica.Hostname, + CreatedAt: replica.CreatedAt, + RelayAddress: replica.RelayAddress, + RegionID: replica.RegionID, + Error: replica.Error.String, + DatabaseLatency: replica.DatabaseLatency, } } diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index 8b8327038e088..75ba041aaa6e1 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -48,6 +48,10 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data if err != nil { return nil, xerrors.Errorf("get hostname: %w", err) } + databaseLatency, err := db.Ping(ctx) + if err != nil { + return nil, xerrors.Errorf("ping database: %w", err) + } var replica database.Replica _, err = db.GetReplicaByID(ctx, options.ID) if err != nil { @@ -55,29 +59,31 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data return nil, xerrors.Errorf("get replica: %w", err) } replica, err = db.InsertReplica(ctx, database.InsertReplicaParams{ - ID: options.ID, - CreatedAt: database.Now(), - StartedAt: database.Now(), - UpdatedAt: database.Now(), - Hostname: hostname, - RegionID: options.RegionID, - RelayAddress: options.RelayAddress, - Version: buildinfo.Version(), + ID: options.ID, + CreatedAt: database.Now(), + StartedAt: database.Now(), + UpdatedAt: database.Now(), + Hostname: hostname, + RegionID: options.RegionID, + RelayAddress: options.RelayAddress, + Version: buildinfo.Version(), + DatabaseLatency: int32(databaseLatency.Microseconds()), }) if err != nil { return nil, xerrors.Errorf("insert replica: %w", err) } } else { replica, err = db.UpdateReplica(ctx, database.UpdateReplicaParams{ - ID: options.ID, - UpdatedAt: database.Now(), - StartedAt: database.Now(), - StoppedAt: sql.NullTime{}, - RelayAddress: options.RelayAddress, - RegionID: 
options.RegionID, - Hostname: hostname, - Version: buildinfo.Version(), - Error: sql.NullString{}, + ID: options.ID, + UpdatedAt: database.Now(), + StartedAt: database.Now(), + StoppedAt: sql.NullTime{}, + RelayAddress: options.RelayAddress, + RegionID: options.RegionID, + Hostname: hostname, + Version: buildinfo.Version(), + Error: sql.NullString{}, + DatabaseLatency: int32(databaseLatency.Microseconds()), }) if err != nil { return nil, xerrors.Errorf("update replica: %w", err) @@ -268,16 +274,22 @@ func (m *Manager) run(ctx context.Context) error { } } + databaseLatency, err := m.db.Ping(ctx) + if err != nil { + return xerrors.Errorf("ping database: %w", err) + } + replica, err := m.db.UpdateReplica(ctx, database.UpdateReplicaParams{ - ID: m.self.ID, - UpdatedAt: database.Now(), - StartedAt: m.self.StartedAt, - StoppedAt: m.self.StoppedAt, - RelayAddress: m.self.RelayAddress, - RegionID: m.self.RegionID, - Hostname: m.self.Hostname, - Version: m.self.Version, - Error: replicaError, + ID: m.self.ID, + UpdatedAt: database.Now(), + StartedAt: m.self.StartedAt, + StoppedAt: m.self.StoppedAt, + RelayAddress: m.self.RelayAddress, + RegionID: m.self.RegionID, + Hostname: m.self.Hostname, + Version: m.self.Version, + Error: replicaError, + DatabaseLatency: int32(databaseLatency.Microseconds()), }) if err != nil { return xerrors.Errorf("update replica: %w", err) diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index 7a8af9278b1eb..92db958074a68 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -338,6 +338,7 @@ export interface DurationFlag { export interface Entitlements { readonly features: Record readonly warnings: string[] + readonly errors: string[] readonly has_license: boolean readonly experimental: boolean readonly trial: boolean diff --git a/site/src/components/LicenseBanner/LicenseBanner.tsx b/site/src/components/LicenseBanner/LicenseBanner.tsx index 8532bfca2ecbe..7ecfc2a2a2fac 100644 --- a/site/src/components/LicenseBanner/LicenseBanner.tsx +++ b/site/src/components/LicenseBanner/LicenseBanner.tsx @@ -8,15 +8,15 @@ export const LicenseBanner: React.FC = () => { const [entitlementsState, entitlementsSend] = useActor( xServices.entitlementsXService, ) - const { warnings } = entitlementsState.context.entitlements + const { errors, warnings } = entitlementsState.context.entitlements /** Gets license data on app mount because LicenseBanner is mounted in App */ useEffect(() => { entitlementsSend("GET_ENTITLEMENTS") }, [entitlementsSend]) - if (warnings.length > 0) { - return + if (errors.length > 0 || warnings.length > 0) { + return } else { return null } diff --git a/site/src/components/LicenseBanner/LicenseBannerView.stories.tsx b/site/src/components/LicenseBanner/LicenseBannerView.stories.tsx index c37653eff7bd5..c7ee69c261e38 100644 --- a/site/src/components/LicenseBanner/LicenseBannerView.stories.tsx +++ b/site/src/components/LicenseBanner/LicenseBannerView.stories.tsx @@ -12,13 +12,23 @@ const Template: Story = (args) => ( export const OneWarning = Template.bind({}) OneWarning.args = { + errors: [], warnings: ["You have exceeded the number of seats in your license."], } export const TwoWarnings = Template.bind({}) TwoWarnings.args = { + errors: [], warnings: [ "You have exceeded the number of seats in your license.", "You are flying too close to the sun.", ], } + +export const OneError = Template.bind({}) +OneError.args = { + errors: [ + "You have multiple replicas but high availability is an Enterprise feature. 
You will be unable to connect to workspaces.", + ], + warnings: [], +} diff --git a/site/src/components/LicenseBanner/LicenseBannerView.tsx b/site/src/components/LicenseBanner/LicenseBannerView.tsx index 49276b1f0d5ed..792bc191a0a2a 100644 --- a/site/src/components/LicenseBanner/LicenseBannerView.tsx +++ b/site/src/components/LicenseBanner/LicenseBannerView.tsx @@ -2,47 +2,56 @@ import { makeStyles } from "@material-ui/core/styles" import { Expander } from "components/Expander/Expander" import { Pill } from "components/Pill/Pill" import { useState } from "react" +import { colors } from "theme/colors" export const Language = { licenseIssue: "License Issue", licenseIssues: (num: number): string => `${num} License Issues`, - upgrade: "Contact us to upgrade your license.", + upgrade: "Contact sales@coder.com.", exceeded: "It looks like you've exceeded some limits of your license.", lessDetails: "Less", moreDetails: "More", } export interface LicenseBannerViewProps { + errors: string[] warnings: string[] } export const LicenseBannerView: React.FC = ({ + errors, warnings, }) => { const styles = useStyles() const [showDetails, setShowDetails] = useState(false) - if (warnings.length === 1) { + const isError = errors.length > 0 + const messages = [...errors, ...warnings] + const type = isError ? "error" : "warning" + + if (messages.length === 1) { return ( -
- - {warnings[0]} -   - - {Language.upgrade} - +
+ +
+ {messages[0]} +   + + {Language.upgrade} + +
) } else { return ( -
-
-
- - {Language.exceeded} +
+ +
+
    - {warnings.map((warning) => ( -
  • - {warning} + {messages.map((message) => ( +
  • + {message}
  • ))}
@@ -67,14 +76,18 @@ const useStyles = makeStyles((theme) => ({ container: { padding: theme.spacing(1.5), backgroundColor: theme.palette.warning.main, + display: "flex", + alignItems: "center", + + "&.error": { + backgroundColor: colors.red[12], + }, }, flex: { - display: "flex", + display: "column", }, leftContent: { marginRight: theme.spacing(1), - }, - text: { marginLeft: theme.spacing(1), }, link: { @@ -83,9 +96,10 @@ const useStyles = makeStyles((theme) => ({ fontWeight: "bold", }, list: { - margin: theme.spacing(1.5), + padding: theme.spacing(1), + margin: 0, }, listItem: { - margin: theme.spacing(1), + margin: theme.spacing(0.5), }, })) diff --git a/site/src/testHelpers/entities.ts b/site/src/testHelpers/entities.ts index 6e26a4fee5944..7080d2a8d6002 100644 --- a/site/src/testHelpers/entities.ts +++ b/site/src/testHelpers/entities.ts @@ -816,6 +816,7 @@ export const makeMockApiError = ({ }) export const MockEntitlements: TypesGen.Entitlements = { + errors: [], warnings: [], has_license: false, features: {}, @@ -824,6 +825,7 @@ export const MockEntitlements: TypesGen.Entitlements = { } export const MockEntitlementsWithWarnings: TypesGen.Entitlements = { + errors: [], warnings: ["You are over your active user limit.", "And another thing."], has_license: true, experimental: false, @@ -847,6 +849,7 @@ export const MockEntitlementsWithWarnings: TypesGen.Entitlements = { } export const MockEntitlementsWithAuditLog: TypesGen.Entitlements = { + errors: [], warnings: [], has_license: true, experimental: false, diff --git a/site/src/xServices/entitlements/entitlementsXService.ts b/site/src/xServices/entitlements/entitlementsXService.ts index 83ed44d12052d..a1e8bb0d9b895 100644 --- a/site/src/xServices/entitlements/entitlementsXService.ts +++ b/site/src/xServices/entitlements/entitlementsXService.ts @@ -20,6 +20,7 @@ export type EntitlementsEvent = | { type: "HIDE_MOCK_BANNER" } const emptyEntitlements = { + errors: [], warnings: [], features: {}, has_license: false, From c1aa3d230740ab4e2388c14781131b0f76686b33 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 00:36:24 +0000 Subject: [PATCH 30/79] Pipe TLS to DERP mesh --- cli/server.go | 3 +++ coderd/coderd.go | 3 +++ enterprise/coderd/coderd.go | 7 ++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/cli/server.go b/cli/server.go index 1ab1a6228f356..de15b7c63c84b 100644 --- a/cli/server.go +++ b/cli/server.go @@ -322,6 +322,9 @@ func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *code Experimental: ExperimentalEnabled(cmd), DeploymentFlags: dflags, } + if tlsConfig != nil { + options.TLSCertificates = tlsConfig.Certificates + } if dflags.OAuth2GithubClientSecret.Value != "" { options.GithubOAuth2Config, err = configureGithubOAuth2(accessURLParsed, diff --git a/coderd/coderd.go b/coderd/coderd.go index df5c85f030d09..735190373a4f3 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -1,6 +1,7 @@ package coderd import ( + "crypto/tls" "crypto/x509" "io" "net/http" @@ -76,6 +77,8 @@ type Options struct { TracerProvider trace.TracerProvider AutoImportTemplates []AutoImportTemplate + // TLSCertificates is used to mesh DERP servers securely. 
+ TLSCertificates []tls.Certificate TailnetCoordinator tailnet.Coordinator DERPServer *derp.Server DERPMap *tailcfg.DERPMap diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 612e710395722..8a92c0b1c641a 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -3,6 +3,7 @@ package coderd import ( "context" "crypto/ed25519" + "crypto/tls" "net/http" "sync" "time" @@ -137,7 +138,11 @@ func New(ctx context.Context, options *Options) (*API, error) { if err != nil { return nil, xerrors.Errorf("initialize replica: %w", err) } - api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, nil) + // nolint:gosec + api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, &tls.Config{ + Certificates: options.TLSCertificates, + ServerName: options.AccessURL.Host, + }) err = api.updateEntitlements(ctx) if err != nil { From 0cc4263715d02cbf83d7ef8c853d85fa427cb8ad Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 03:41:53 +0000 Subject: [PATCH 31/79] Fix DERP mesh with TLS --- enterprise/coderd/coderd.go | 17 ++++++- enterprise/derpmesh/derpmesh_test.go | 72 +++++++++++----------------- enterprise/tailnet/coordinator.go | 6 ++- go.mod | 2 +- go.sum | 4 +- site/src/api/typesGenerated.ts | 1 + 6 files changed, 53 insertions(+), 49 deletions(-) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 8a92c0b1c641a..803d13b44b7c4 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -4,6 +4,7 @@ import ( "context" "crypto/ed25519" "crypto/tls" + "crypto/x509" "net/http" "sync" "time" @@ -138,10 +139,22 @@ func New(ctx context.Context, options *Options) (*API, error) { if err != nil { return nil, xerrors.Errorf("initialize replica: %w", err) } + + rootCA := x509.NewCertPool() + for _, certificate := range options.TLSCertificates { + for _, certificatePart := range certificate.Certificate { + certificate, err := x509.ParseCertificate(certificatePart) + if err != nil { + return nil, xerrors.Errorf("parse certificate %s: %w", certificate.Subject.CommonName, err) + } + rootCA.AddCert(certificate) + } + } + // nolint:gosec api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, &tls.Config{ - Certificates: options.TLSCertificates, - ServerName: options.AccessURL.Host, + ServerName: options.AccessURL.Host, + RootCAs: rootCA, }) err = api.updateEntitlements(ctx) diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index 139e42566ffb1..353e51dd2983f 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -37,44 +37,27 @@ func TestMain(m *testing.M) { func TestDERPMesh(t *testing.T) { t.Parallel() - t.Run("ExchangeMessages", func(t *testing.T) { - // This tests messages passing through multiple DERP servers. 
- t.Parallel() - firstServer, firstServerURL, firstTLSName := startDERP(t) - defer firstServer.Close() - secondServer, secondServerURL, secondTLSName := startDERP(t) - firstMesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), firstServer, firstTLSName) - firstMesh.SetAddresses([]string{secondServerURL}) - secondMesh := derpmesh.New(slogtest.Make(t, nil).Named("second").Leveled(slog.LevelDebug), secondServer, secondTLSName) - secondMesh.SetAddresses([]string{firstServerURL}) - defer firstMesh.Close() - defer secondMesh.Close() - - first := key.NewNode() - second := key.NewNode() - firstClient, err := derphttp.NewClient(first, secondServerURL, tailnet.Logger(slogtest.Make(t, nil))) - require.NoError(t, err) - secondClient, err := derphttp.NewClient(second, firstServerURL, tailnet.Logger(slogtest.Make(t, nil))) - require.NoError(t, err) - err = secondClient.Connect(context.Background()) - require.NoError(t, err) - - sent := []byte("hello world") - err = firstClient.Send(second.Public(), sent) - require.NoError(t, err) + commonName := "something.org" + rawCert := generateTLSCertificate(t, commonName) + certificate, err := x509.ParseCertificate(rawCert.Certificate[0]) + require.NoError(t, err) + pool := x509.NewCertPool() + pool.AddCert(certificate) + tlsConfig := &tls.Config{ + ServerName: commonName, + RootCAs: pool, + Certificates: []tls.Certificate{rawCert}, + } - got := recvData(t, secondClient) - require.Equal(t, sent, got) - }) t.Run("ExchangeMessages", func(t *testing.T) { // This tests messages passing through multiple DERP servers. t.Parallel() - firstServer, firstServerURL, firstTLSName := startDERP(t) + firstServer, firstServerURL := startDERP(t, tlsConfig) defer firstServer.Close() - secondServer, secondServerURL, secondTLSName := startDERP(t) - firstMesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), firstServer, firstTLSName) + secondServer, secondServerURL := startDERP(t, tlsConfig) + firstMesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), firstServer, tlsConfig) firstMesh.SetAddresses([]string{secondServerURL}) - secondMesh := derpmesh.New(slogtest.Make(t, nil).Named("second").Leveled(slog.LevelDebug), secondServer, secondTLSName) + secondMesh := derpmesh.New(slogtest.Make(t, nil).Named("second").Leveled(slog.LevelDebug), secondServer, tlsConfig) secondMesh.SetAddresses([]string{firstServerURL}) defer firstMesh.Close() defer secondMesh.Close() @@ -83,8 +66,10 @@ func TestDERPMesh(t *testing.T) { second := key.NewNode() firstClient, err := derphttp.NewClient(first, secondServerURL, tailnet.Logger(slogtest.Make(t, nil))) require.NoError(t, err) + firstClient.TLSConfig = tlsConfig secondClient, err := derphttp.NewClient(second, firstServerURL, tailnet.Logger(slogtest.Make(t, nil))) require.NoError(t, err) + secondClient.TLSConfig = tlsConfig err = secondClient.Connect(context.Background()) require.NoError(t, err) @@ -98,8 +83,8 @@ func TestDERPMesh(t *testing.T) { t.Run("RemoveAddress", func(t *testing.T) { // This tests messages passing through multiple DERP servers. t.Parallel() - server, serverURL, tlsName := startDERP(t) - mesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), server, tlsName) + server, serverURL := startDERP(t, tlsConfig) + mesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), server, tlsConfig) mesh.SetAddresses([]string{"http://fake.com"}) // This should trigger a removal... 
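The same pattern repeats for every client in these tests: the derphttp client is pointed at the TLS httptest server and given the shared tls.Config so it trusts the self-signed certificate. A condensed sketch, where serverURL, logf, tlsConfig, and t are placeholders taken from the surrounding test:

    client, err := derphttp.NewClient(key.NewNode(), serverURL, logf)
    require.NoError(t, err)
    client.TLSConfig = tlsConfig // trust the self-signed test certificate
    require.NoError(t, client.Connect(context.Background()))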
mesh.SetAddresses([]string{}) @@ -109,8 +94,10 @@ func TestDERPMesh(t *testing.T) { second := key.NewNode() firstClient, err := derphttp.NewClient(first, serverURL, tailnet.Logger(slogtest.Make(t, nil))) require.NoError(t, err) + firstClient.TLSConfig = tlsConfig secondClient, err := derphttp.NewClient(second, serverURL, tailnet.Logger(slogtest.Make(t, nil))) require.NoError(t, err) + secondClient.TLSConfig = tlsConfig err = secondClient.Connect(context.Background()) require.NoError(t, err) sent := []byte("hello world") @@ -124,8 +111,8 @@ func TestDERPMesh(t *testing.T) { meshes := make([]*derpmesh.Mesh, 0, 20) serverURLs := make([]string, 0, 20) for i := 0; i < 20; i++ { - server, url, tlsName := startDERP(t) - mesh := derpmesh.New(slogtest.Make(t, nil).Named("mesh").Leveled(slog.LevelDebug), server, tlsName) + server, url := startDERP(t, tlsConfig) + mesh := derpmesh.New(slogtest.Make(t, nil).Named("mesh").Leveled(slog.LevelDebug), server, tlsConfig) t.Cleanup(func() { _ = server.Close() _ = mesh.Close() @@ -141,8 +128,10 @@ func TestDERPMesh(t *testing.T) { second := key.NewNode() firstClient, err := derphttp.NewClient(first, serverURLs[9], tailnet.Logger(slogtest.Make(t, nil))) require.NoError(t, err) + firstClient.TLSConfig = tlsConfig secondClient, err := derphttp.NewClient(second, serverURLs[16], tailnet.Logger(slogtest.Make(t, nil))) require.NoError(t, err) + secondClient.TLSConfig = tlsConfig err = secondClient.Connect(context.Background()) require.NoError(t, err) @@ -172,21 +161,18 @@ func recvData(t *testing.T, client *derphttp.Client) []byte { } } -func startDERP(t *testing.T) (*derp.Server, string, *tls.Config) { +func startDERP(t *testing.T, tlsConfig *tls.Config) (*derp.Server, string) { logf := tailnet.Logger(slogtest.Make(t, nil)) d := derp.NewServer(key.NewNode(), logf) d.SetMeshKey("some-key") server := httptest.NewUnstartedServer(derphttp.Handler(d)) - commonName := "something.org" - server.TLS = &tls.Config{ - Certificates: []tls.Certificate{generateTLSCertificate(t, commonName)}, - } - server.Start() + server.TLS = tlsConfig + server.StartTLS() t.Cleanup(func() { _ = d.Close() }) t.Cleanup(server.Close) - return d, server.URL, server.TLS + return d, server.URL } func generateTLSCertificate(t testing.TB, commonName string) tls.Certificate { diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index 0643f7a259719..206dc68d6319c 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -294,7 +294,11 @@ func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder) ( func (c *haCoordinator) Close() error { c.mutex.Lock() defer c.mutex.Unlock() - + select { + case <-c.close: + return nil + default: + } close(c.close) wg := sync.WaitGroup{} diff --git a/go.mod b/go.mod index b33a438eb3d08..195a09ae2b8fd 100644 --- a/go.mod +++ b/go.mod @@ -40,7 +40,7 @@ replace github.com/tcnksm/go-httpstat => github.com/kylecarbs/go-httpstat v0.0.0 // There are a few minor changes we make to Tailscale that we're slowly upstreaming. Compare here: // https://github.com/tailscale/tailscale/compare/main...coder:tailscale:main -replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20221014173742-9f1da7795630 +replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20221015033036-5861cbbf7bf5 // Switch to our fork that imports fixes from http://github.com/tailscale/ssh. 
// See: https://github.com/coder/coder/issues/3371 diff --git a/go.sum b/go.sum index 5852582c26c4a..b80c0d4173a5f 100644 --- a/go.sum +++ b/go.sum @@ -351,8 +351,8 @@ github.com/coder/retry v1.3.0 h1:5lAAwt/2Cm6lVmnfBY7sOMXcBOwcwJhmV5QGSELIVWY= github.com/coder/retry v1.3.0/go.mod h1:tXuRgZgWjUnU5LZPT4lJh4ew2elUhexhlnXzrJWdyFY= github.com/coder/ssh v0.0.0-20220811105153-fcea99919338 h1:tN5GKFT68YLVzJoA8AHuiMNJ0qlhoD3pGN3JY9gxSko= github.com/coder/ssh v0.0.0-20220811105153-fcea99919338/go.mod h1:ZSS+CUoKHDrqVakTfTWUlKSr9MtMFkC4UvtQKD7O914= -github.com/coder/tailscale v1.1.1-0.20221014173742-9f1da7795630 h1:FgWWdu0fnFEpUNjW0vOaCuOxOZ/GQzn6oo7p5IMlSA0= -github.com/coder/tailscale v1.1.1-0.20221014173742-9f1da7795630/go.mod h1:5amxy08qijEa8bcTW2SeIy4MIqcmd7LMsuOxqOlj2Ak= +github.com/coder/tailscale v1.1.1-0.20221015033036-5861cbbf7bf5 h1:WVH6e/qK3Wpl0wbmpORD2oQ1qLJborF3fsFHyO1ps0Y= +github.com/coder/tailscale v1.1.1-0.20221015033036-5861cbbf7bf5/go.mod h1:5amxy08qijEa8bcTW2SeIy4MIqcmd7LMsuOxqOlj2Ak= github.com/containerd/aufs v0.0.0-20200908144142-dab0cbea06f4/go.mod h1:nukgQABAEopAHvB6j7cnP5zJ+/3aVcE7hCYqvIwAHyE= github.com/containerd/aufs v0.0.0-20201003224125-76a6863f2989/go.mod h1:AkGGQs9NM2vtYHaUen+NljV0/baGCAPELGm2q9ZXpWU= github.com/containerd/aufs v0.0.0-20210316121734-20793ff83c97/go.mod h1:kL5kd6KM5TzQjR79jljyi4olc1Vrx6XBlcyj3gNv2PU= diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index 92db958074a68..11b4c64e34786 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -538,6 +538,7 @@ export interface Replica { readonly relay_address: string readonly region_id: number readonly error: string + readonly database_latency: number } // From codersdk/error.go From f9177e40ecea7499f4c7d972b9a66f49fbe2689a Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 03:59:20 +0000 Subject: [PATCH 32/79] Add tests for TLS --- coderd/coderdtest/coderdtest.go | 14 ++++- codersdk/workspaceagents.go | 6 ++- enterprise/coderd/coderd.go | 3 +- .../coderd/coderdenttest/coderdenttest.go | 13 ++++- enterprise/coderd/replicas_test.go | 47 +++++++++++++++- enterprise/coderd/workspaceagents_test.go | 9 ++++ enterprise/derpmesh/derpmesh_test.go | 47 +--------------- testutil/certificate.go | 53 +++++++++++++++++++ 8 files changed, 138 insertions(+), 54 deletions(-) create mode 100644 testutil/certificate.go diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index 59b414cad8903..cbbcd7aaa493a 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -7,6 +7,7 @@ import ( "crypto/rand" "crypto/rsa" "crypto/sha256" + "crypto/tls" "crypto/x509" "crypto/x509/pkix" "encoding/base64" @@ -75,6 +76,7 @@ type Options struct { AutobuildTicker <-chan time.Time AutobuildStats chan<- executor.Stats Auditor audit.Auditor + TLSCertificates []tls.Certificate // IncludeProvisionerDaemon when true means to start an in-memory provisionerD IncludeProvisionerDaemon bool @@ -158,7 +160,14 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance srv.Config.BaseContext = func(_ net.Listener) context.Context { return ctx } - srv.Start() + if options.TLSCertificates != nil { + srv.TLS = &tls.Config{ + Certificates: options.TLSCertificates, + } + srv.StartTLS() + } else { + srv.Start() + } t.Cleanup(srv.Close) tcpAddr, ok := srv.Listener.Addr().(*net.TCPAddr) @@ -201,6 +210,7 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance APIRateLimit: 
options.APIRateLimit, Authorizer: options.Authorizer, Telemetry: telemetry.NewNoop(), + TLSCertificates: options.TLSCertificates, DERPMap: &tailcfg.DERPMap{ Regions: map[int]*tailcfg.DERPRegion{ 1: { @@ -215,7 +225,7 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance DERPPort: derpPort, STUNPort: stunAddr.Port, InsecureForTests: true, - ForceHTTP: true, + ForceHTTP: options.TLSCertificates == nil, }}, }, }, diff --git a/codersdk/workspaceagents.go b/codersdk/workspaceagents.go index c86b399e189ab..c86944ae2b629 100644 --- a/codersdk/workspaceagents.go +++ b/codersdk/workspaceagents.go @@ -315,7 +315,8 @@ func (c *Client) ListenWorkspaceAgentTailnet(ctx context.Context) (net.Conn, err Value: c.SessionToken, }}) httpClient := &http.Client{ - Jar: jar, + Jar: jar, + Transport: c.HTTPClient.Transport, } // nolint:bodyclose conn, res, err := websocket.Dial(ctx, coordinateURL.String(), &websocket.DialOptions{ @@ -380,7 +381,8 @@ func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, opti Value: c.SessionToken, }}) httpClient := &http.Client{ - Jar: jar, + Jar: jar, + Transport: c.HTTPClient.Transport, } ctx, cancelFunc := context.WithCancel(ctx) closed := make(chan struct{}) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 803d13b44b7c4..469205cfa2e96 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -150,10 +150,9 @@ func New(ctx context.Context, options *Options) (*API, error) { rootCA.AddCert(certificate) } } - // nolint:gosec api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, &tls.Config{ - ServerName: options.AccessURL.Host, + ServerName: options.AccessURL.Hostname(), RootCAs: rootCA, }) diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index a503a22ce459c..02eff4e2acf2e 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -4,7 +4,9 @@ import ( "context" "crypto/ed25519" "crypto/rand" + "crypto/tls" "io" + "net/http" "testing" "time" @@ -85,7 +87,16 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c _ = provisionerCloser.Close() _ = coderAPI.Close() }) - return codersdk.New(coderAPI.AccessURL), provisionerCloser, coderAPI + client := codersdk.New(coderAPI.AccessURL) + client.HTTPClient = &http.Client{ + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{ + //nolint:gosec + InsecureSkipVerify: true, + }, + }, + } + return client, provisionerCloser, coderAPI } type LicenseOptions struct { diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index 0272fb018f3d6..b66bcaef9f976 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -2,6 +2,7 @@ package coderd_test import ( "context" + "crypto/tls" "testing" "time" @@ -19,7 +20,7 @@ import ( func TestReplicas(t *testing.T) { t.Parallel() - t.Run("WarningsWithoutLicense", func(t *testing.T) { + t.Run("ErrorWithoutLicense", func(t *testing.T) { t.Parallel() db, pubsub := dbtestutil.NewDB(t) firstClient := coderdenttest.New(t, &coderdenttest.Options{ @@ -39,7 +40,7 @@ func TestReplicas(t *testing.T) { secondClient.SessionToken = firstClient.SessionToken ents, err := secondClient.Entitlements(context.Background()) require.NoError(t, err) - require.Len(t, ents.Warnings, 1) + require.Len(t, ents.Errors, 1) _ = secondAPI.Close() ents, err = 
firstClient.Entitlements(context.Background()) @@ -85,6 +86,48 @@ func TestReplicas(t *testing.T) { return err == nil }, testutil.WaitLong, testutil.IntervalFast) _ = conn.Close() + }) + t.Run("ConnectAcrossMultipleTLS", func(t *testing.T) { + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + certificates := []tls.Certificate{testutil.GenerateTLSCertificate(t, "localhost")} + firstClient := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + IncludeProvisionerDaemon: true, + Database: db, + Pubsub: pubsub, + TLSCertificates: certificates, + }, + }) + firstUser := coderdtest.CreateFirstUser(t, firstClient) + coderdenttest.AddLicense(t, firstClient, coderdenttest.LicenseOptions{ + HighAvailability: true, + }) + + secondClient := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + Database: db, + Pubsub: pubsub, + TLSCertificates: certificates, + }, + }) + secondClient.SessionToken = firstClient.SessionToken + replicas, err := secondClient.Replicas(context.Background()) + require.NoError(t, err) + require.Len(t, replicas, 2) + _, agent := setupWorkspaceAgent(t, firstClient, firstUser, 0) + conn, err := secondClient.DialWorkspaceAgent(context.Background(), agent.ID, &codersdk.DialWorkspaceAgentOptions{ + BlockEndpoints: true, + Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug), + }) + require.NoError(t, err) + require.Eventually(t, func() bool { + ctx, cancelFunc := context.WithTimeout(context.Background(), 3*time.Second) + defer cancelFunc() + _, err = conn.Ping(ctx) + return err == nil + }, testutil.WaitLong, testutil.IntervalFast) + _ = conn.Close() }) } diff --git a/enterprise/coderd/workspaceagents_test.go b/enterprise/coderd/workspaceagents_test.go index 097bab354ba74..18285bcb94317 100644 --- a/enterprise/coderd/workspaceagents_test.go +++ b/enterprise/coderd/workspaceagents_test.go @@ -2,6 +2,7 @@ package coderd_test import ( "context" + "crypto/tls" "fmt" "net/http" "testing" @@ -108,6 +109,14 @@ func setupWorkspaceAgent(t *testing.T, client *codersdk.Client, user codersdk.Cr workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID) coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) agentClient := codersdk.New(client.URL) + agentClient.HTTPClient = &http.Client{ + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{ + //nolint:gosec + InsecureSkipVerify: true, + }, + }, + } agentClient.SessionToken = authToken agentCloser := agent.New(agent.Options{ FetchMetadata: agentClient.WorkspaceAgentMetadata, diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index 353e51dd2983f..fcf410ac0e574 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -1,22 +1,13 @@ package derpmesh_test import ( - "bytes" "context" - "crypto/ecdsa" - "crypto/elliptic" - "crypto/rand" "crypto/tls" "crypto/x509" - "crypto/x509/pkix" - "encoding/pem" "errors" "io" - "math/big" - "net" "net/http/httptest" "testing" - "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -29,6 +20,7 @@ import ( "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/enterprise/derpmesh" "github.com/coder/coder/tailnet" + "github.com/coder/coder/testutil" ) func TestMain(m *testing.M) { @@ -38,7 +30,7 @@ func TestMain(m *testing.M) { func TestDERPMesh(t *testing.T) { t.Parallel() commonName := "something.org" - rawCert := generateTLSCertificate(t, commonName) + rawCert := testutil.GenerateTLSCertificate(t, 
commonName) certificate, err := x509.ParseCertificate(rawCert.Certificate[0]) require.NoError(t, err) pool := x509.NewCertPool() @@ -174,38 +166,3 @@ func startDERP(t *testing.T, tlsConfig *tls.Config) (*derp.Server, string) { t.Cleanup(server.Close) return d, server.URL } - -func generateTLSCertificate(t testing.TB, commonName string) tls.Certificate { - privateKey, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) - require.NoError(t, err) - template := x509.Certificate{ - SerialNumber: big.NewInt(1), - Subject: pkix.Name{ - Organization: []string{"Acme Co"}, - CommonName: commonName, - }, - DNSNames: []string{commonName}, - NotBefore: time.Now(), - NotAfter: time.Now().Add(time.Hour * 24 * 180), - - KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature, - ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, - BasicConstraintsValid: true, - IPAddresses: []net.IP{net.ParseIP("127.0.0.1")}, - } - - derBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, &privateKey.PublicKey, privateKey) - require.NoError(t, err) - var certFile bytes.Buffer - require.NoError(t, err) - _, err = certFile.Write(pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes})) - require.NoError(t, err) - privateKeyBytes, err := x509.MarshalPKCS8PrivateKey(privateKey) - require.NoError(t, err) - var keyFile bytes.Buffer - err = pem.Encode(&keyFile, &pem.Block{Type: "PRIVATE KEY", Bytes: privateKeyBytes}) - require.NoError(t, err) - cert, err := tls.X509KeyPair(certFile.Bytes(), keyFile.Bytes()) - require.NoError(t, err) - return cert -} diff --git a/testutil/certificate.go b/testutil/certificate.go new file mode 100644 index 0000000000000..1edc975746958 --- /dev/null +++ b/testutil/certificate.go @@ -0,0 +1,53 @@ +package testutil + +import ( + "bytes" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "math/big" + "net" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func GenerateTLSCertificate(t testing.TB, commonName string) tls.Certificate { + privateKey, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + require.NoError(t, err) + template := x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{ + Organization: []string{"Acme Co"}, + CommonName: commonName, + }, + DNSNames: []string{commonName}, + NotBefore: time.Now(), + NotAfter: time.Now().Add(time.Hour * 24 * 180), + + KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + BasicConstraintsValid: true, + IPAddresses: []net.IP{net.ParseIP("127.0.0.1")}, + } + + derBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, &privateKey.PublicKey, privateKey) + require.NoError(t, err) + var certFile bytes.Buffer + require.NoError(t, err) + _, err = certFile.Write(pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes})) + require.NoError(t, err) + privateKeyBytes, err := x509.MarshalPKCS8PrivateKey(privateKey) + require.NoError(t, err) + var keyFile bytes.Buffer + err = pem.Encode(&keyFile, &pem.Block{Type: "PRIVATE KEY", Bytes: privateKeyBytes}) + require.NoError(t, err) + cert, err := tls.X509KeyPair(certFile.Bytes(), keyFile.Bytes()) + require.NoError(t, err) + return cert +} From ee59d88a087408885a8297afb8eda41b0bd73a54 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 04:09:16 +0000 Subject: [PATCH 33/79] Fix replica sync TLS --- enterprise/coderd/coderd.go | 38 
++++++++++++++++----------- enterprise/coderd/replicas_test.go | 6 +++++ enterprise/replicasync/replicasync.go | 5 ++++ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 469205cfa2e96..6bbbcb16f33cf 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -129,32 +129,38 @@ func New(ctx context.Context, options *Options) (*API, error) { }) } - var err error - api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, replicasync.Options{ - // Create a new replica ID for each Coder instance! - ID: uuid.New(), - RelayAddress: options.DERPServerRelayAddress, - RegionID: int32(options.DERPServerRegionID), - }) - if err != nil { - return nil, xerrors.Errorf("initialize replica: %w", err) - } - - rootCA := x509.NewCertPool() + meshRootCA := x509.NewCertPool() for _, certificate := range options.TLSCertificates { for _, certificatePart := range certificate.Certificate { certificate, err := x509.ParseCertificate(certificatePart) if err != nil { return nil, xerrors.Errorf("parse certificate %s: %w", certificate.Subject.CommonName, err) } - rootCA.AddCert(certificate) + meshRootCA.AddCert(certificate) } } - // nolint:gosec - api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, &tls.Config{ + // This TLS configuration spoofs access from the access URL hostname + // assuming that the certificates provided will cover that hostname. + // + // Replica sync and DERP meshing require accessing replicas via their + // internal IP addresses, and if TLS is configured we use the same + // certificates. + meshTLSConfig := &tls.Config{ ServerName: options.AccessURL.Hostname(), - RootCAs: rootCA, + RootCAs: meshRootCA, + } + var err error + api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, replicasync.Options{ + // Create a new replica ID for each Coder instance! 
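Because deployments commonly run with self-signed or privately issued certificates, the system trust store cannot verify peer replicas. Instead, every configured leaf certificate is parsed into a pool that becomes the RootCAs for mesh and replica-sync connections, and the access URL's hostname is used as the expected ServerName. Condensed, the construction amounts to this sketch (variable names outside the diff are illustrative):

    meshRootCA := x509.NewCertPool()
    for _, c := range options.TLSCertificates {
        for _, der := range c.Certificate {
            parsed, err := x509.ParseCertificate(der)
            if err != nil {
                return nil, xerrors.Errorf("parse certificate: %w", err)
            }
            meshRootCA.AddCert(parsed)
        }
    }
    meshTLSConfig := &tls.Config{
        ServerName: options.AccessURL.Hostname(),
        RootCAs:    meshRootCA,
        // Certificates: options.TLSCertificates is added in a later hunk so
        // replicas also present these certs when dialing each other.
    }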
+ ID: uuid.New(), + RelayAddress: options.DERPServerRelayAddress, + RegionID: int32(options.DERPServerRegionID), + TLSConfig: meshTLSConfig, }) + if err != nil { + return nil, xerrors.Errorf("initialize replica: %w", err) + } + api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, meshTLSConfig) err = api.updateEntitlements(ctx) if err != nil { diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index b66bcaef9f976..63bae9ebce9e6 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -129,5 +129,11 @@ func TestReplicas(t *testing.T) { return err == nil }, testutil.WaitLong, testutil.IntervalFast) _ = conn.Close() + replicas, err = secondClient.Replicas(context.Background()) + require.NoError(t, err) + require.Len(t, replicas, 2) + for _, replica := range replicas { + require.Empty(t, replica.Error) + } }) } diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index 75ba041aaa6e1..758a11a84e842 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -2,6 +2,7 @@ package replicasync import ( "context" + "crypto/tls" "database/sql" "errors" "fmt" @@ -30,6 +31,7 @@ type Options struct { PeerTimeout time.Duration RelayAddress string RegionID int32 + TLSConfig *tls.Config } // New registers the replica with the database and periodically updates to ensure @@ -254,6 +256,9 @@ func (m *Manager) run(ctx context.Context) error { } client := http.Client{ Timeout: m.options.PeerTimeout, + Transport: &http.Transport{ + TLSClientConfig: m.options.TLSConfig, + }, } res, err := client.Do(req) if err != nil { From 8641e58790b067a70ae29fb4e67e89d952b1fff3 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 04:28:50 +0000 Subject: [PATCH 34/79] Fix RootCA for replica meshing --- enterprise/coderd/coderd.go | 5 ++- enterprise/replicasync/replicasync_test.go | 44 ++++++++++++++++++++++ helm/templates/service.yaml | 1 + 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 6bbbcb16f33cf..f836f786463d2 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -146,8 +146,9 @@ func New(ctx context.Context, options *Options) (*API, error) { // internal IP addresses, and if TLS is configured we use the same // certificates. meshTLSConfig := &tls.Config{ - ServerName: options.AccessURL.Hostname(), - RootCAs: meshRootCA, + Certificates: options.TLSCertificates, + RootCAs: meshRootCA, + ServerName: options.AccessURL.Hostname(), } var err error api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, replicasync.Options{ diff --git a/enterprise/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go index ccacbeb310c23..faba7345183ff 100644 --- a/enterprise/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -2,6 +2,8 @@ package replicasync_test import ( "context" + "crypto/tls" + "crypto/x509" "net/http" "net/http/httptest" "sync" @@ -112,6 +114,48 @@ func TestReplica(t *testing.T) { require.False(t, server.Self().Error.Valid) _ = server.Close() }) + t.Run("ConnectsToPeerReplicaTLS", func(t *testing.T) { + // Ensures that the replica reports a successful status for + // accessing all of its peers. 
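The test introduced here passes because the peer health probe in replicasync now honors the configured TLS settings: the periodic check is just an HTTP GET over a transport that trusts the mesh certificates. Roughly (peer, ctx, and options come from the surrounding Manager code; the error handling is abbreviated):

    client := &http.Client{
        Timeout: options.PeerTimeout,
        Transport: &http.Transport{
            // Trust the same roots (and present the same certs) as the DERP mesh.
            TLSClientConfig: options.TLSConfig,
        },
    }
    req, err := http.NewRequestWithContext(ctx, http.MethodGet, peer.RelayAddress, nil)
    if err != nil {
        return err
    }
    res, err := client.Do(req)
    if err != nil {
        return err // recorded on the replica row so operators can see it
    }
    _ = res.Body.Close()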
+ t.Parallel() + rawCert := testutil.GenerateTLSCertificate(t, "hello.org") + certificate, err := x509.ParseCertificate(rawCert.Certificate[0]) + require.NoError(t, err) + pool := x509.NewCertPool() + pool.AddCert(certificate) + // nolint:gosec + tlsConfig := &tls.Config{ + Certificates: []tls.Certificate{rawCert}, + ServerName: "hello.org", + RootCAs: pool, + } + srv := httptest.NewUnstartedServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + srv.TLS = tlsConfig + srv.StartTLS() + defer srv.Close() + db, pubsub := dbtestutil.NewDB(t) + peer, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ + ID: uuid.New(), + CreatedAt: database.Now(), + StartedAt: database.Now(), + UpdatedAt: database.Now(), + Hostname: "something", + RelayAddress: srv.URL, + }) + require.NoError(t, err) + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ + ID: uuid.New(), + RelayAddress: "http://169.254.169.254", + TLSConfig: tlsConfig, + }) + require.NoError(t, err) + require.Len(t, server.Regional(), 1) + require.Equal(t, peer.ID, server.Regional()[0].ID) + require.False(t, server.Self().Error.Valid) + _ = server.Close() + }) t.Run("ConnectsToFakePeerWithError", func(t *testing.T) { t.Parallel() db, pubsub := dbtestutil.NewDB(t) diff --git a/helm/templates/service.yaml b/helm/templates/service.yaml index 28fe0e9f9aa8c..b9a7e9a2f0886 100644 --- a/helm/templates/service.yaml +++ b/helm/templates/service.yaml @@ -10,6 +10,7 @@ metadata: {{- toYaml .Values.coder.service.annotations | nindent 4 }} spec: type: {{ .Values.coder.service.type }} + sessionAffinity: ClientIP ports: - name: {{ include "coder.portName" . | quote }} port: {{ include "coder.servicePort" . }} From 3dfb796c29ae142be30399ffb4b1dd279f567ca0 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 04:51:04 +0000 Subject: [PATCH 35/79] Remove ID from replicasync --- enterprise/coderd/coderd.go | 6 +- enterprise/coderd/replicas_test.go | 2 +- enterprise/replicasync/replicasync.go | 65 ++++++++-------------- enterprise/replicasync/replicasync_test.go | 53 ++++-------------- 4 files changed, 35 insertions(+), 91 deletions(-) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index f836f786463d2..371ac12fe21b8 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -13,7 +13,6 @@ import ( "github.com/cenkalti/backoff/v4" "github.com/go-chi/chi/v5" - "github.com/google/uuid" "cdr.dev/slog" "github.com/coder/coder/coderd" @@ -146,14 +145,13 @@ func New(ctx context.Context, options *Options) (*API, error) { // internal IP addresses, and if TLS is configured we use the same // certificates. meshTLSConfig := &tls.Config{ + MinVersion: tls.VersionTLS12, Certificates: options.TLSCertificates, RootCAs: meshRootCA, ServerName: options.AccessURL.Hostname(), } var err error - api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, replicasync.Options{ - // Create a new replica ID for each Coder instance! 
- ID: uuid.New(), + api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, &replicasync.Options{ RelayAddress: options.DERPServerRelayAddress, RegionID: int32(options.DERPServerRegionID), TLSConfig: meshTLSConfig, diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index 63bae9ebce9e6..9d6970823befb 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -123,7 +123,7 @@ func TestReplicas(t *testing.T) { }) require.NoError(t, err) require.Eventually(t, func() bool { - ctx, cancelFunc := context.WithTimeout(context.Background(), 3*time.Second) + ctx, cancelFunc := context.WithTimeout(context.Background(), testutil.IntervalMedium) defer cancelFunc() _, err = conn.Ping(ctx) return err == nil diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index 758a11a84e842..82e7d74273eeb 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -26,7 +26,6 @@ var ( ) type Options struct { - ID uuid.UUID UpdateInterval time.Duration PeerTimeout time.Duration RelayAddress string @@ -36,9 +35,9 @@ type Options struct { // New registers the replica with the database and periodically updates to ensure // it's healthy. It contacts all other alive replicas to ensure they are reachable. -func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub database.Pubsub, options Options) (*Manager, error) { - if options.ID == uuid.Nil { - panic("An ID must be provided!") +func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub database.Pubsub, options *Options) (*Manager, error) { + if options == nil { + options = &Options{} } if options.PeerTimeout == 0 { options.PeerTimeout = 3 * time.Second @@ -54,50 +53,29 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data if err != nil { return nil, xerrors.Errorf("ping database: %w", err) } - var replica database.Replica - _, err = db.GetReplicaByID(ctx, options.ID) + id := uuid.New() + replica, err := db.InsertReplica(ctx, database.InsertReplicaParams{ + ID: id, + CreatedAt: database.Now(), + StartedAt: database.Now(), + UpdatedAt: database.Now(), + Hostname: hostname, + RegionID: options.RegionID, + RelayAddress: options.RelayAddress, + Version: buildinfo.Version(), + DatabaseLatency: int32(databaseLatency.Microseconds()), + }) if err != nil { - if !errors.Is(err, sql.ErrNoRows) { - return nil, xerrors.Errorf("get replica: %w", err) - } - replica, err = db.InsertReplica(ctx, database.InsertReplicaParams{ - ID: options.ID, - CreatedAt: database.Now(), - StartedAt: database.Now(), - UpdatedAt: database.Now(), - Hostname: hostname, - RegionID: options.RegionID, - RelayAddress: options.RelayAddress, - Version: buildinfo.Version(), - DatabaseLatency: int32(databaseLatency.Microseconds()), - }) - if err != nil { - return nil, xerrors.Errorf("insert replica: %w", err) - } - } else { - replica, err = db.UpdateReplica(ctx, database.UpdateReplicaParams{ - ID: options.ID, - UpdatedAt: database.Now(), - StartedAt: database.Now(), - StoppedAt: sql.NullTime{}, - RelayAddress: options.RelayAddress, - RegionID: options.RegionID, - Hostname: hostname, - Version: buildinfo.Version(), - Error: sql.NullString{}, - DatabaseLatency: int32(databaseLatency.Microseconds()), - }) - if err != nil { - return nil, xerrors.Errorf("update replica: %w", err) - } + return nil, xerrors.Errorf("insert replica: %w", err) } - err = pubsub.Publish(PubsubEvent, 
[]byte(options.ID.String())) + err = pubsub.Publish(PubsubEvent, []byte(id.String())) if err != nil { return nil, xerrors.Errorf("publish new replica: %w", err) } ctx, cancelFunc := context.WithCancel(ctx) server := &Manager{ - options: &options, + id: id, + options: options, db: db, pubsub: pubsub, self: replica, @@ -128,6 +106,7 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data // Manager keeps the replica up to date and in sync with other replicas. type Manager struct { + id uuid.UUID options *Options db database.Store pubsub database.Pubsub @@ -196,7 +175,7 @@ func (m *Manager) subscribe(ctx context.Context) error { return } // Don't process updates for ourself! - if id == m.options.ID { + if id == m.id { return } if updating { @@ -233,7 +212,7 @@ func (m *Manager) run(ctx context.Context) error { m.mutex.Lock() m.peers = make([]database.Replica, 0, len(replicas)) for _, replica := range replicas { - if replica.ID == m.options.ID { + if replica.ID == m.id { continue } m.peers = append(m.peers, replica) diff --git a/enterprise/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go index faba7345183ff..0b42f44791df4 100644 --- a/enterprise/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -11,7 +11,6 @@ import ( "time" "github.com/google/uuid" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/goleak" @@ -32,38 +31,15 @@ func TestReplica(t *testing.T) { // This ensures that a new replica is created on New. t.Parallel() db, pubsub := dbtestutil.NewDB(t) - id := uuid.New() + closeChan := make(chan struct{}, 1) cancel, err := pubsub.Subscribe(replicasync.PubsubEvent, func(ctx context.Context, message []byte) { - assert.Equal(t, []byte(id.String()), message) + closeChan <- struct{}{} }) require.NoError(t, err) defer cancel() - server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: id, - }) - require.NoError(t, err) - _ = server.Close() - require.NoError(t, err) - }) - t.Run("UpdatesOnNew", func(t *testing.T) { - // This ensures that a replica is updated when it initially connects - // and immediately publishes it's existence! 
- t.Parallel() - db, pubsub := dbtestutil.NewDB(t) - id := uuid.New() - _, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ - ID: id, - }) - require.NoError(t, err) - cancel, err := pubsub.Subscribe(replicasync.PubsubEvent, func(ctx context.Context, message []byte) { - assert.Equal(t, []byte(id.String()), message) - }) - require.NoError(t, err) - defer cancel() - server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: id, - }) + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, nil) require.NoError(t, err) + <-closeChan _ = server.Close() require.NoError(t, err) }) @@ -80,9 +56,7 @@ func TestReplica(t *testing.T) { Hostname: "something", }) require.NoError(t, err) - _, err = replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: uuid.New(), - }) + _, err = replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, nil) require.Error(t, err) require.Equal(t, "a relay address must be specified when running multiple replicas in the same region", err.Error()) }) @@ -104,8 +78,7 @@ func TestReplica(t *testing.T) { RelayAddress: srv.URL, }) require.NoError(t, err) - server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: uuid.New(), + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, &replicasync.Options{ RelayAddress: "http://169.254.169.254", }) require.NoError(t, err) @@ -145,8 +118,7 @@ func TestReplica(t *testing.T) { RelayAddress: srv.URL, }) require.NoError(t, err) - server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: uuid.New(), + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, &replicasync.Options{ RelayAddress: "http://169.254.169.254", TLSConfig: tlsConfig, }) @@ -169,8 +141,7 @@ func TestReplica(t *testing.T) { RelayAddress: "http://169.254.169.254", }) require.NoError(t, err) - server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: uuid.New(), + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, &replicasync.Options{ PeerTimeout: 1 * time.Millisecond, RelayAddress: "http://169.254.169.254", }) @@ -185,10 +156,7 @@ func TestReplica(t *testing.T) { // Refresh when a new replica appears! 
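These tests exercise the pubsub contract: every replica publishes its ID on the replicasync event when it registers or changes, and peers re-fetch the replica table when they see a message that is not their own; the refresh test below relies on exactly this path. A rough sketch of the listener side, using the Subscribe signature shown above (the handler body is illustrative):

    cancel, err := pubsub.Subscribe(replicasync.PubsubEvent, func(ctx context.Context, message []byte) {
        peerID, err := uuid.Parse(string(message))
        if err != nil || peerID == selfID {
            return // malformed payload, or our own announcement
        }
        // Re-fetch the replica table / re-run the sync loop here.
    })
    if err != nil {
        return err
    }
    defer cancel()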
t.Parallel() db, pubsub := dbtestutil.NewDB(t) - id := uuid.New() - server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: id, - }) + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, nil) require.NoError(t, err) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) @@ -224,8 +192,7 @@ func TestReplica(t *testing.T) { count := 20 wg.Add(count) for i := 0; i < count; i++ { - server, err := replicasync.New(context.Background(), logger, db, pubsub, replicasync.Options{ - ID: uuid.New(), + server, err := replicasync.New(context.Background(), logger, db, pubsub, &replicasync.Options{ RelayAddress: srv.URL, }) require.NoError(t, err) From ec2c1f13403216460e6ab3104c3c5fe5b5355667 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 04:56:58 +0000 Subject: [PATCH 36/79] Fix getting certificates for meshing --- cli/server.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cli/server.go b/cli/server.go index de15b7c63c84b..0704aca10b07e 100644 --- a/cli/server.go +++ b/cli/server.go @@ -929,6 +929,7 @@ func configureTLS(tlsMinVersion, tlsClientAuth string, tlsCertFiles, tlsKeyFiles if err != nil { return nil, xerrors.Errorf("load certificates: %w", err) } + tlsConfig.Certificates = certs tlsConfig.GetCertificate = func(hi *tls.ClientHelloInfo) (*tls.Certificate, error) { // If there's only one certificate, return it. if len(certs) == 1 { From 590f0f896ae74d9765d7014150936bbc8ed6278d Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 05:22:04 +0000 Subject: [PATCH 37/79] Remove excessive locking --- enterprise/replicasync/replicasync.go | 22 +++++++++--------- enterprise/tailnet/coordinator.go | 32 +++++++++------------------ tailnet/coordinator.go | 24 +++++++++++--------- 3 files changed, 33 insertions(+), 45 deletions(-) diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index 82e7d74273eeb..b635a84991e24 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -143,9 +143,11 @@ func (m *Manager) loop(ctx context.Context) { // subscribe listens for new replica information! func (m *Manager) subscribe(ctx context.Context) error { - needsUpdate := false - updating := false - updateMutex := sync.Mutex{} + var ( + needsUpdate = false + updating = false + updateMutex = sync.Mutex{} + ) // This loop will continually update nodes as updates are processed. // The intent is to always be up to date without spamming the run @@ -199,9 +201,7 @@ func (m *Manager) run(ctx context.Context) error { m.closeMutex.Lock() m.closeWait.Add(1) m.closeMutex.Unlock() - go func() { - m.closeWait.Done() - }() + defer m.closeWait.Done() // Expect replicas to update once every three times the interval... // If they don't, assume death! 
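For callers, the net effect of the replicasync changes is a smaller surface: New now takes a *Options (nil is fine) and mints the replica ID itself. A hedged usage sketch, with the relay address and region purely illustrative:

    // nil options: defaults are applied and a fresh replica ID is generated.
    mgr, err := replicasync.New(ctx, logger, db, pubsub, nil)
    if err != nil {
        return err
    }
    defer mgr.Close()

    // Or, with explicit relay/TLS settings for multi-replica deployments:
    mgr, err = replicasync.New(ctx, logger, db, pubsub, &replicasync.Options{
        RelayAddress: "https://replica-internal:443",
        RegionID:     1,
        TLSConfig:    meshTLSConfig,
    })

Internally, run() then prunes peers that have not updated within three times the update interval, as the comment above and the query below describe.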
replicas, err := m.db.GetReplicasUpdatedAfter(ctx, database.Now().Add(-3*m.options.UpdateInterval)) @@ -224,8 +224,7 @@ func (m *Manager) run(ctx context.Context) error { failed := make([]string, 0) for _, peer := range m.Regional() { wg.Add(1) - peer := peer - go func() { + go func(peer database.Replica) { defer wg.Done() req, err := http.NewRequestWithContext(ctx, http.MethodGet, peer.RelayAddress, nil) if err != nil { @@ -247,7 +246,7 @@ func (m *Manager) run(ctx context.Context) error { return } _ = res.Body.Close() - }() + }(peer) } wg.Wait() replicaError := sql.NullString{} @@ -279,11 +278,11 @@ func (m *Manager) run(ctx context.Context) error { return xerrors.Errorf("update replica: %w", err) } m.mutex.Lock() + defer m.mutex.Unlock() if m.self.Error.String != replica.Error.String { // Publish an update occurred! err = m.pubsub.Publish(PubsubEvent, []byte(m.self.ID.String())) if err != nil { - m.mutex.Unlock() return xerrors.Errorf("publish replica update: %w", err) } } @@ -291,7 +290,6 @@ func (m *Manager) run(ctx context.Context) error { if m.callback != nil { go m.callback() } - m.mutex.Unlock() return nil } @@ -306,7 +304,7 @@ func (m *Manager) Self() database.Replica { func (m *Manager) All() []database.Replica { m.mutex.Lock() defer m.mutex.Unlock() - return append(m.peers, m.self) + return append(m.peers[:], m.self) } // Regional returns all replicas in the same region excluding itself. diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index 206dc68d6319c..1ccf56f50da11 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -69,19 +69,19 @@ func (c *haCoordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID // When a new connection is requested, we update it with the latest // node of the agent. This allows the connection to establish. node, ok := c.nodes[agent] + c.mutex.Unlock() if ok { data, err := json.Marshal([]*agpl.Node{node}) if err != nil { - c.mutex.Unlock() return xerrors.Errorf("marshal node: %w", err) } _, err = conn.Write(data) if err != nil { - c.mutex.Unlock() return xerrors.Errorf("write nodes: %w", err) } } + c.mutex.Lock() connectionSockets, ok := c.agentToConnectionSockets[agent] if !ok { connectionSockets = map[uuid.UUID]net.Conn{} @@ -129,28 +129,17 @@ func (c *haCoordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *js } c.mutex.Lock() - defer c.mutex.Unlock() - // Update the node of this client in our in-memory map. If an agent entirely // shuts down and reconnects, it needs to be aware of all clients attempting // to establish connections. c.nodes[id] = &node - // Write the new node from this client to the actively connected agent. - err = c.writeNodeToAgent(agent, &node) - if err != nil { - return xerrors.Errorf("write node to agent: %w", err) - } - - return nil -} - -func (c *haCoordinator) writeNodeToAgent(agent uuid.UUID, node *agpl.Node) error { agentSocket, ok := c.agentSockets[agent] + c.mutex.Unlock() if !ok { // If we don't own the agent locally, send it over pubsub to a node that // owns the agent. - err := c.publishNodesToAgent(agent, []*agpl.Node{node}) + err := c.publishNodesToAgent(agent, []*agpl.Node{&node}) if err != nil { return xerrors.Errorf("publish node to agent") } @@ -159,7 +148,7 @@ func (c *haCoordinator) writeNodeToAgent(agent uuid.UUID, node *agpl.Node) error // Write the new node from this client to the actively // connected agent. 
- data, err := json.Marshal([]*agpl.Node{node}) + data, err := json.Marshal([]*agpl.Node{&node}) if err != nil { return xerrors.Errorf("marshal nodes: %w", err) } @@ -171,14 +160,13 @@ func (c *haCoordinator) writeNodeToAgent(agent uuid.UUID, node *agpl.Node) error } return xerrors.Errorf("write json: %w", err) } + return nil } // ServeAgent accepts a WebSocket connection to an agent that listens to // incoming connections and publishes node updates. func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { - c.mutex.Lock() - // Tell clients on other instances to send a callmemaybe to us. err := c.publishAgentHello(id) if err != nil { @@ -203,6 +191,7 @@ func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { // If an old agent socket is connected, we close it // to avoid any leaks. This shouldn't ever occur because // we expect one agent to be running. + c.mutex.Lock() oldAgentSocket, ok := c.agentSockets[id] if ok { _ = oldAgentSocket.Close() @@ -234,6 +223,8 @@ func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { } func (c *haCoordinator) nodesSubscribedToAgent(agentID uuid.UUID) []*agpl.Node { + c.mutex.Lock() + defer c.mutex.Unlock() sockets, ok := c.agentToConnectionSockets[agentID] if !ok { return nil @@ -279,12 +270,11 @@ func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder) ( for _, connectionSocket := range connectionSockets { connectionSocket := connectionSocket go func() { + defer wg.Done() _ = connectionSocket.SetWriteDeadline(time.Now().Add(5 * time.Second)) _, _ = connectionSocket.Write(data) - wg.Done() }() } - wg.Wait() return &node, nil } @@ -428,9 +418,7 @@ func (c *haCoordinator) runPubsub() error { return } - c.mutex.Lock() nodes := c.nodesSubscribedToAgent(agentUUID) - c.mutex.Unlock() if len(nodes) > 0 { err := c.publishNodesToAgent(agentUUID, nodes) if err != nil { diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index 96de8d295162e..23531af1260f5 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -127,25 +127,26 @@ type coordinator struct { } // Node returns an in-memory node by ID. +// If the node does not exist, nil is returned. func (c *coordinator) Node(id uuid.UUID) *Node { c.mutex.Lock() defer c.mutex.Unlock() - node := c.nodes[id] - return node + return c.nodes[id] } // ServeClient accepts a WebSocket connection that wants to connect to an agent // with the specified ID. func (c *coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error { c.mutex.Lock() - if c.closed { + c.mutex.Unlock() return xerrors.New("coordinator is closed") } // When a new connection is requested, we update it with the latest // node of the agent. This allows the connection to establish. node, ok := c.nodes[agent] + c.mutex.Unlock() if ok { data, err := json.Marshal([]*Node{node}) if err != nil { @@ -158,6 +159,7 @@ func (c *coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) return xerrors.Errorf("write nodes: %w", err) } } + c.mutex.Lock() connectionSockets, ok := c.agentToConnectionSockets[agent] if !ok { connectionSockets = map[uuid.UUID]net.Conn{} @@ -203,7 +205,6 @@ func (c *coordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *json } c.mutex.Lock() - // Update the node of this client in our in-memory map. If an agent entirely // shuts down and reconnects, it needs to be aware of all clients attempting // to establish connections. 
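The pattern running through these coordinator hunks is consistent: hold the mutex only long enough to read or mutate the shared maps, release it before any marshaling or socket I/O, and make Close safe to call more than once. A compressed sketch of both ideas, with field names following the in-memory coordinator above:

    c.mutex.Lock()
    node, ok := c.nodes[agent] // copy what we need under the lock...
    c.mutex.Unlock()           // ...then release before any network I/O
    if ok {
        data, err := json.Marshal([]*Node{node})
        if err != nil {
            return xerrors.Errorf("marshal node: %w", err)
        }
        _, _ = conn.Write(data)
    }

    // Idempotent close: only the first caller proceeds past this check.
    c.mutex.Lock()
    if c.closed {
        c.mutex.Unlock()
        return nil
    }
    c.closed = true
    c.mutex.Unlock()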
@@ -237,12 +238,13 @@ func (c *coordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *json // listens to incoming connections and publishes node updates. func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { c.mutex.Lock() - if c.closed { + c.mutex.Unlock() return xerrors.New("coordinator is closed") } sockets, ok := c.agentToConnectionSockets[id] + c.mutex.Unlock() if ok { // Publish all nodes that want to connect to the // desired agent ID. @@ -269,6 +271,7 @@ func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { // If an old agent socket is connected, we close it // to avoid any leaks. This shouldn't ever occur because // we expect one agent to be running. + c.mutex.Lock() oldAgentSocket, ok := c.agentSockets[id] if ok { _ = oldAgentSocket.Close() @@ -302,17 +305,15 @@ func (c *coordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder } c.mutex.Lock() - c.nodes[id] = &node connectionSockets, ok := c.agentToConnectionSockets[id] if !ok { c.mutex.Unlock() return nil } - + c.mutex.Unlock() data, err := json.Marshal([]*Node{&node}) if err != nil { - c.mutex.Unlock() return xerrors.Errorf("marshal nodes: %w", err) } @@ -328,7 +329,6 @@ func (c *coordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder }() } - c.mutex.Unlock() wg.Wait() return nil } @@ -337,9 +337,11 @@ func (c *coordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder // coordinator from accepting new connections. func (c *coordinator) Close() error { c.mutex.Lock() - defer c.mutex.Unlock() - + if c.closed { + return nil + } c.closed = true + c.mutex.Unlock() wg := sync.WaitGroup{} From d8580d107a16c4802262b9f8919175512c8d6ec6 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 05:24:35 +0000 Subject: [PATCH 38/79] Fix linting --- coderd/coderdtest/coderdtest.go | 1 + enterprise/coderd/replicas_test.go | 3 +-- enterprise/derpmesh/derpmesh_test.go | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index cbbcd7aaa493a..4141e33cf8648 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -163,6 +163,7 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance if options.TLSCertificates != nil { srv.TLS = &tls.Config{ Certificates: options.TLSCertificates, + MinVersion: tls.VersionTLS12, } srv.StartTLS() } else { diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index 9d6970823befb..f9f6e138bd3cc 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -4,7 +4,6 @@ import ( "context" "crypto/tls" "testing" - "time" "github.com/stretchr/testify/require" @@ -80,7 +79,7 @@ func TestReplicas(t *testing.T) { }) require.NoError(t, err) require.Eventually(t, func() bool { - ctx, cancelFunc := context.WithTimeout(context.Background(), 3*time.Second) + ctx, cancelFunc := context.WithTimeout(context.Background(), testutil.IntervalSlow) defer cancelFunc() _, err = conn.Ping(ctx) return err == nil diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index fcf410ac0e574..d1131d59da25b 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -36,6 +36,7 @@ func TestDERPMesh(t *testing.T) { pool := x509.NewCertPool() pool.AddCert(certificate) tlsConfig := &tls.Config{ + MinVersion: tls.VersionTLS12, ServerName: commonName, RootCAs: pool, Certificates: 
[]tls.Certificate{rawCert}, From ae956fbc00df6bed796e3624b515849002b8bb21 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 05:44:04 +0000 Subject: [PATCH 39/79] Store mesh key in the database --- coderd/coderd.go | 1 - coderd/database/databasefake/databasefake.go | 20 +++++++++++++++-- coderd/database/querier.go | 2 ++ coderd/database/queries.sql.go | 20 +++++++++++++++++ coderd/database/queries/siteconfig.sql | 6 +++++ enterprise/cli/server.go | 23 ++++++++++++++++++++ 6 files changed, 69 insertions(+), 3 deletions(-) diff --git a/coderd/coderd.go b/coderd/coderd.go index 735190373a4f3..2fe0a5dc0d08e 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -127,7 +127,6 @@ func New(options *Options) *API { } if options.DERPServer == nil { options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger.Named("derp"))) - options.DERPServer.SetMeshKey("todo-kyle-change-this") } if options.Auditor == nil { options.Auditor = audit.NewNop() diff --git a/coderd/database/databasefake/databasefake.go b/coderd/database/databasefake/databasefake.go index b4724a9afe0aa..e860c69dd11cb 100644 --- a/coderd/database/databasefake/databasefake.go +++ b/coderd/database/databasefake/databasefake.go @@ -110,10 +110,11 @@ type data struct { replicas []database.Replica deploymentID string + derpMeshKey string lastLicenseID int32 } -func (q *fakeQuerier) Ping(_ context.Context) (time.Duration, error) { +func (*fakeQuerier) Ping(_ context.Context) (time.Duration, error) { return 0, nil } @@ -2890,6 +2891,21 @@ func (q *fakeQuerier) GetDeploymentID(_ context.Context) (string, error) { return q.deploymentID, nil } +func (q *fakeQuerier) InsertDERPMeshKey(_ context.Context, id string) error { + q.mutex.Lock() + defer q.mutex.Unlock() + + q.derpMeshKey = id + return nil +} + +func (q *fakeQuerier) GetDERPMeshKey(_ context.Context) (string, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + + return q.derpMeshKey, nil +} + func (q *fakeQuerier) InsertLicense( _ context.Context, arg database.InsertLicenseParams, ) (database.License, error) { @@ -3156,7 +3172,7 @@ func (q *fakeQuerier) DeleteGroupByID(_ context.Context, id uuid.UUID) error { return sql.ErrNoRows } -func (q *fakeQuerier) DeleteReplicasUpdatedBefore(ctx context.Context, before time.Time) error { +func (q *fakeQuerier) DeleteReplicasUpdatedBefore(_ context.Context, before time.Time) error { q.mutex.Lock() defer q.mutex.Unlock() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 957de26c89e05..7f2f0d942bb10 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -39,6 +39,7 @@ type sqlcQuerier interface { // This function returns roles for authorization purposes. Implied member roles // are included. GetAuthorizationUserRoles(ctx context.Context, userID uuid.UUID) (GetAuthorizationUserRolesRow, error) + GetDERPMeshKey(ctx context.Context) (string, error) GetDeploymentID(ctx context.Context) (string, error) GetFileByHashAndCreator(ctx context.Context, arg GetFileByHashAndCreatorParams) (File, error) GetFileByID(ctx context.Context, id uuid.UUID) (File, error) @@ -125,6 +126,7 @@ type sqlcQuerier interface { // every member of the org. 
InsertAllUsersGroup(ctx context.Context, organizationID uuid.UUID) (Group, error) InsertAuditLog(ctx context.Context, arg InsertAuditLogParams) (AuditLog, error) + InsertDERPMeshKey(ctx context.Context, value string) error InsertDeploymentID(ctx context.Context, value string) error InsertFile(ctx context.Context, arg InsertFileParams) (File, error) InsertGitSSHKey(ctx context.Context, arg InsertGitSSHKeyParams) (GitSSHKey, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 241474e7e66bd..c40b93426ddee 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -2753,6 +2753,17 @@ func (q *sqlQuerier) UpdateReplica(ctx context.Context, arg UpdateReplicaParams) return i, err } +const getDERPMeshKey = `-- name: GetDERPMeshKey :one +SELECT value FROM site_configs WHERE key = 'derp_mesh_key' +` + +func (q *sqlQuerier) GetDERPMeshKey(ctx context.Context) (string, error) { + row := q.db.QueryRowContext(ctx, getDERPMeshKey) + var value string + err := row.Scan(&value) + return value, err +} + const getDeploymentID = `-- name: GetDeploymentID :one SELECT value FROM site_configs WHERE key = 'deployment_id' ` @@ -2764,6 +2775,15 @@ func (q *sqlQuerier) GetDeploymentID(ctx context.Context) (string, error) { return value, err } +const insertDERPMeshKey = `-- name: InsertDERPMeshKey :exec +INSERT INTO site_configs (key, value) VALUES ('derp_mesh_key', $1) +` + +func (q *sqlQuerier) InsertDERPMeshKey(ctx context.Context, value string) error { + _, err := q.db.ExecContext(ctx, insertDERPMeshKey, value) + return err +} + const insertDeploymentID = `-- name: InsertDeploymentID :exec INSERT INTO site_configs (key, value) VALUES ('deployment_id', $1) ` diff --git a/coderd/database/queries/siteconfig.sql b/coderd/database/queries/siteconfig.sql index 9d3936e23886d..b975d2f68cc3c 100644 --- a/coderd/database/queries/siteconfig.sql +++ b/coderd/database/queries/siteconfig.sql @@ -3,3 +3,9 @@ INSERT INTO site_configs (key, value) VALUES ('deployment_id', $1); -- name: GetDeploymentID :one SELECT value FROM site_configs WHERE key = 'deployment_id'; + +-- name: InsertDERPMeshKey :exec +INSERT INTO site_configs (key, value) VALUES ('derp_mesh_key', $1); + +-- name: GetDERPMeshKey :one +SELECT value FROM site_configs WHERE key = 'derp_mesh_key'; diff --git a/enterprise/cli/server.go b/enterprise/cli/server.go index f3e99c1613ab8..a65b8e8faa6e0 100644 --- a/enterprise/cli/server.go +++ b/enterprise/cli/server.go @@ -2,14 +2,20 @@ package cli import ( "context" + "database/sql" + "errors" "io" "net/url" "github.com/spf13/cobra" "golang.org/x/xerrors" + "tailscale.com/derp" + "tailscale.com/types/key" "github.com/coder/coder/cli/deployment" + "github.com/coder/coder/cryptorand" "github.com/coder/coder/enterprise/coderd" + "github.com/coder/coder/tailnet" agpl "github.com/coder/coder/cli" agplcoderd "github.com/coder/coder/coderd" @@ -25,6 +31,23 @@ func server() *cobra.Command { } } + options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger.Named("derp"))) + meshKey, err := options.Database.GetDERPMeshKey(ctx) + if err != nil { + if !errors.Is(err, sql.ErrNoRows) { + return nil, nil, xerrors.Errorf("get mesh key: %w", err) + } + meshKey, err = cryptorand.String(32) + if err != nil { + return nil, nil, xerrors.Errorf("generate mesh key: %w", err) + } + err = options.Database.InsertDERPMeshKey(ctx, meshKey) + if err != nil { + return nil, nil, xerrors.Errorf("insert mesh key: %w", err) + } + } + options.DERPServer.SetMeshKey(meshKey) + 
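Every replica must present the same mesh key to join the DERP mesh, which is why the key is generated once, persisted in site_configs, and re-read on later startups instead of being hard-coded. A replica that starts after the key already exists effectively only exercises the read path; roughly (logger and db stand in for the configured options):

    meshKey, err := db.GetDERPMeshKey(ctx)
    if err != nil {
        return xerrors.Errorf("get mesh key: %w", err)
    }
    derpServer := derp.NewServer(key.NewNode(), tailnet.Logger(logger.Named("derp")))
    derpServer.SetMeshKey(meshKey)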
o := &coderd.Options{ AuditLogging: dflags.AuditLogging.Value, BrowserOnly: dflags.BrowserOnly.Value, From d703e2d08aeb82ac80e5adea594ec0951a1e80a6 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 06:37:32 +0000 Subject: [PATCH 40/79] Fix replica key for tests --- coderd/coderdtest/coderdtest.go | 7 +++++++ enterprise/coderd/replicas_test.go | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index 4141e33cf8648..c0361dfa8e2b8 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -37,8 +37,10 @@ import ( "golang.org/x/xerrors" "google.golang.org/api/idtoken" "google.golang.org/api/option" + "tailscale.com/derp" "tailscale.com/net/stun/stuntest" "tailscale.com/tailcfg" + "tailscale.com/types/key" "tailscale.com/types/nettype" "cdr.dev/slog" @@ -59,6 +61,7 @@ import ( "github.com/coder/coder/provisionerd" "github.com/coder/coder/provisionersdk" "github.com/coder/coder/provisionersdk/proto" + "github.com/coder/coder/tailnet" "github.com/coder/coder/testutil" ) @@ -184,6 +187,9 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance stunAddr, stunCleanup := stuntest.ServeWithPacketListener(t, nettype.Std{}) t.Cleanup(stunCleanup) + derpServer := derp.NewServer(key.NewNode(), tailnet.Logger(slogtest.Make(t, nil).Named("derp"))) + derpServer.SetMeshKey("test-key") + // match default with cli default if options.SSHKeygenAlgorithm == "" { options.SSHKeygenAlgorithm = gitsshkey.AlgorithmEd25519 @@ -208,6 +214,7 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance OIDCConfig: options.OIDCConfig, GoogleTokenValidator: options.GoogleTokenValidator, SSHKeygenAlgorithm: options.SSHKeygenAlgorithm, + DERPServer: derpServer, APIRateLimit: options.APIRateLimit, Authorizer: options.Authorizer, Telemetry: telemetry.NewNoop(), diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index f9f6e138bd3cc..fae418ab87261 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -79,7 +79,7 @@ func TestReplicas(t *testing.T) { }) require.NoError(t, err) require.Eventually(t, func() bool { - ctx, cancelFunc := context.WithTimeout(context.Background(), testutil.IntervalSlow) + ctx, cancelFunc := context.WithTimeout(context.Background(), testutil.WaitShort) defer cancelFunc() _, err = conn.Ping(ctx) return err == nil @@ -122,7 +122,7 @@ func TestReplicas(t *testing.T) { }) require.NoError(t, err) require.Eventually(t, func() bool { - ctx, cancelFunc := context.WithTimeout(context.Background(), testutil.IntervalMedium) + ctx, cancelFunc := context.WithTimeout(context.Background(), testutil.IntervalSlow) defer cancelFunc() _, err = conn.Ping(ctx) return err == nil From 9bb021c0e69f07efb4fb499eecad1359575432fc Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 06:38:08 +0000 Subject: [PATCH 41/79] Fix types gen --- site/src/api/api.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/site/src/api/api.ts b/site/src/api/api.ts index 2e60a88b8469c..fb12571fd91ae 100644 --- a/site/src/api/api.ts +++ b/site/src/api/api.ts @@ -28,6 +28,7 @@ export const defaultEntitlements = (): TypesGen.Entitlements => { return { features: features, has_license: false, + errors: [], warnings: [], experimental: false, trial: false, From 76c9e2c959bb7b260ebed3274ce525e5cac813a0 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 06:44:56 +0000 Subject: [PATCH 
42/79] Fix unlocking unlocked --- tailnet/coordinator.go | 1 - 1 file changed, 1 deletion(-) diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index 23531af1260f5..9d722ddeee117 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -263,7 +263,6 @@ func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { } _, err = conn.Write(data) if err != nil { - c.mutex.Unlock() return xerrors.Errorf("write nodes: %w", err) } } From 09e87b0aa06da43125ed9cc767c2fd7877657f9f Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 06:57:46 +0000 Subject: [PATCH 43/79] Fix race in tests --- tailnet/coordinator.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index 9d722ddeee117..491c0db885224 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -310,7 +310,6 @@ func (c *coordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder c.mutex.Unlock() return nil } - c.mutex.Unlock() data, err := json.Marshal([]*Node{&node}) if err != nil { return xerrors.Errorf("marshal nodes: %w", err) @@ -328,6 +327,7 @@ func (c *coordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder }() } + c.mutex.Unlock() wg.Wait() return nil } From 18c0464e7f106d81c3dbdb84f2f7ad48d75fc5e3 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 01:58:05 -0500 Subject: [PATCH 44/79] Update enterprise/derpmesh/derpmesh.go Co-authored-by: Colin Adler --- enterprise/derpmesh/derpmesh.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go index 8f51343017593..059eac5a107e7 100644 --- a/enterprise/derpmesh/derpmesh.go +++ b/enterprise/derpmesh/derpmesh.go @@ -82,7 +82,7 @@ func (m *Mesh) SetAddresses(addresses []string) { m.mutex.Unlock() } -// addAddress begins meshing with a new address. +// addAddress begins meshing with a new address. It returns false if the address is already being meshed with. // It's expected that this is a full HTTP address with a path. // e.g. 
http://127.0.0.1:8080/derp func (m *Mesh) addAddress(address string) (bool, error) { From 6f25b2d44b66b12c5c65c20c0095588c3b51347b Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 06:58:54 +0000 Subject: [PATCH 45/79] Rename to syncReplicas --- enterprise/replicasync/replicasync.go | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index b635a84991e24..46123953298de 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -73,7 +73,7 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data return nil, xerrors.Errorf("publish new replica: %w", err) } ctx, cancelFunc := context.WithCancel(ctx) - server := &Manager{ + manager := &Manager{ id: id, options: options, db: db, @@ -83,25 +83,25 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data closed: make(chan struct{}), closeCancel: cancelFunc, } - err = server.run(ctx) + err = manager.syncReplicas(ctx) if err != nil { return nil, xerrors.Errorf("run replica: %w", err) } - peers := server.Regional() + peers := manager.Regional() if len(peers) > 0 { - self := server.Self() + self := manager.Self() if self.RelayAddress == "" { return nil, xerrors.Errorf("a relay address must be specified when running multiple replicas in the same region") } } - err = server.subscribe(ctx) + err = manager.subscribe(ctx) if err != nil { return nil, xerrors.Errorf("subscribe: %w", err) } - server.closeWait.Add(1) - go server.loop(ctx) - return server, nil + manager.closeWait.Add(1) + go manager.loop(ctx) + return manager, nil } // Manager keeps the replica up to date and in sync with other replicas. @@ -134,7 +134,7 @@ func (m *Manager) loop(ctx context.Context) { return case <-ticker.C: } - err := m.run(ctx) + err := m.syncReplicas(ctx) if err != nil && !errors.Is(err, context.Canceled) { m.logger.Warn(ctx, "run replica update loop", slog.Error(err)) } @@ -155,7 +155,7 @@ func (m *Manager) subscribe(ctx context.Context) error { // it will reprocess afterwards. 
var update func() update = func() { - err := m.run(ctx) + err := m.syncReplicas(ctx) if err != nil && !errors.Is(err, context.Canceled) { m.logger.Error(ctx, "run replica from subscribe", slog.Error(err)) } @@ -197,7 +197,7 @@ func (m *Manager) subscribe(ctx context.Context) error { return nil } -func (m *Manager) run(ctx context.Context) error { +func (m *Manager) syncReplicas(ctx context.Context) error { m.closeMutex.Lock() m.closeWait.Add(1) m.closeMutex.Unlock() From 1e85039d346e5e143d5a3ae462681f11d5cff093 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 19:52:19 +0000 Subject: [PATCH 46/79] Reuse http client --- enterprise/replicasync/replicasync.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index 46123953298de..317f6dc274bdc 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -219,6 +219,13 @@ func (m *Manager) syncReplicas(ctx context.Context) error { } m.mutex.Unlock() + client := http.Client{ + Timeout: m.options.PeerTimeout, + Transport: &http.Transport{ + TLSClientConfig: m.options.TLSConfig, + }, + } + defer client.CloseIdleConnections() var wg sync.WaitGroup var mu sync.Mutex failed := make([]string, 0) @@ -232,12 +239,6 @@ func (m *Manager) syncReplicas(ctx context.Context) error { slog.F("relay_address", peer.RelayAddress), slog.Error(err)) return } - client := http.Client{ - Timeout: m.options.PeerTimeout, - Transport: &http.Transport{ - TLSClientConfig: m.options.TLSConfig, - }, - } res, err := client.Do(req) if err != nil { mu.Lock() From ae0aa5f226bdc2e0e692e09fcee05e6a45d0247e Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 20:55:11 +0000 Subject: [PATCH 47/79] Delete old replicas on a CRON --- coderd/database/databasefake/databasefake.go | 12 ------ coderd/database/querier.go | 1 - coderd/database/queries.sql.go | 23 ------------ coderd/database/queries/replicas.sql | 3 -- enterprise/replicasync/replicasync.go | 39 +++++++++++++++----- enterprise/replicasync/replicasync_test.go | 18 +++++++++ 6 files changed, 48 insertions(+), 48 deletions(-) diff --git a/coderd/database/databasefake/databasefake.go b/coderd/database/databasefake/databasefake.go index e860c69dd11cb..d95499147066b 100644 --- a/coderd/database/databasefake/databasefake.go +++ b/coderd/database/databasefake/databasefake.go @@ -3238,15 +3238,3 @@ func (q *fakeQuerier) GetReplicasUpdatedAfter(_ context.Context, updatedAt time. 
} return replicas, nil } - -func (q *fakeQuerier) GetReplicaByID(_ context.Context, id uuid.UUID) (database.Replica, error) { - q.mutex.RLock() - defer q.mutex.RUnlock() - - for _, replica := range q.replicas { - if replica.ID == id { - return replica, nil - } - } - return database.Replica{}, sql.ErrNoRows -} diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 7f2f0d942bb10..8d1ea946ff5b6 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -69,7 +69,6 @@ type sqlcQuerier interface { GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) ([]ProvisionerJob, error) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt time.Time) ([]ProvisionerJob, error) GetProvisionerLogsByIDBetween(ctx context.Context, arg GetProvisionerLogsByIDBetweenParams) ([]ProvisionerJobLog, error) - GetReplicaByID(ctx context.Context, id uuid.UUID) (Replica, error) GetReplicasUpdatedAfter(ctx context.Context, updatedAt time.Time) ([]Replica, error) GetTemplateByID(ctx context.Context, id uuid.UUID) (Template, error) GetTemplateByOrganizationAndName(ctx context.Context, arg GetTemplateByOrganizationAndNameParams) (Template, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index c40b93426ddee..ff72247ad1e0f 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -2579,29 +2579,6 @@ func (q *sqlQuerier) DeleteReplicasUpdatedBefore(ctx context.Context, updatedAt return err } -const getReplicaByID = `-- name: GetReplicaByID :one -SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error FROM replicas WHERE id = $1 -` - -func (q *sqlQuerier) GetReplicaByID(ctx context.Context, id uuid.UUID) (Replica, error) { - row := q.db.QueryRowContext(ctx, getReplicaByID, id) - var i Replica - err := row.Scan( - &i.ID, - &i.CreatedAt, - &i.StartedAt, - &i.StoppedAt, - &i.UpdatedAt, - &i.Hostname, - &i.RegionID, - &i.RelayAddress, - &i.DatabaseLatency, - &i.Version, - &i.Error, - ) - return i, err -} - const getReplicasUpdatedAfter = `-- name: GetReplicasUpdatedAfter :many SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error FROM replicas WHERE updated_at > $1 AND stopped_at IS NULL ` diff --git a/coderd/database/queries/replicas.sql b/coderd/database/queries/replicas.sql index 5a62527fac107..e87c1f46432f2 100644 --- a/coderd/database/queries/replicas.sql +++ b/coderd/database/queries/replicas.sql @@ -1,9 +1,6 @@ -- name: GetReplicasUpdatedAfter :many SELECT * FROM replicas WHERE updated_at > $1 AND stopped_at IS NULL; --- name: GetReplicaByID :one -SELECT * FROM replicas WHERE id = $1; - -- name: InsertReplica :one INSERT INTO replicas ( id, diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index 317f6dc274bdc..d6cd846d6c96f 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -26,11 +26,12 @@ var ( ) type Options struct { - UpdateInterval time.Duration - PeerTimeout time.Duration - RelayAddress string - RegionID int32 - TLSConfig *tls.Config + CleanupInterval time.Duration + UpdateInterval time.Duration + PeerTimeout time.Duration + RelayAddress string + RegionID int32 + TLSConfig *tls.Config } // New registers the replica with the database and periodically updates to ensure @@ -45,6 +46,11 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data if 
options.UpdateInterval == 0 { options.UpdateInterval = 5 * time.Second } + if options.CleanupInterval == 0 { + // The cleanup interval can be quite long, because its + // primary purpose is to clean up dead replicas. + options.CleanupInterval = 30 * time.Minute + } hostname, err := os.Hostname() if err != nil { return nil, xerrors.Errorf("get hostname: %w", err) } @@ -123,16 +129,31 @@ type Manager struct { callback func() } +// updateInterval is used to determine a replica's state. +// If the replica was updated > the time, it's considered healthy. +// If the replica was updated < the time, it's considered stale. +func (m *Manager) updateInterval() time.Time { + return database.Now().Add(-3 * m.options.UpdateInterval) +} + // loop runs the replica update sequence on an update interval. func (m *Manager) loop(ctx context.Context) { defer m.closeWait.Done() - ticker := time.NewTicker(m.options.UpdateInterval) - defer ticker.Stop() + updateTicker := time.NewTicker(m.options.UpdateInterval) + defer updateTicker.Stop() + deleteTicker := time.NewTicker(m.options.CleanupInterval) + defer deleteTicker.Stop() for { select { case <-ctx.Done(): return - case <-ticker.C: + case <-deleteTicker.C: + err := m.db.DeleteReplicasUpdatedBefore(ctx, m.updateInterval()) + if err != nil { + m.logger.Warn(ctx, "delete old replicas", slog.Error(err)) + } + continue + case <-updateTicker.C: } err := m.syncReplicas(ctx) if err != nil && !errors.Is(err, context.Canceled) { @@ -204,7 +225,7 @@ func (m *Manager) syncReplicas(ctx context.Context) error { defer m.closeWait.Done() // Expect replicas to update once every three times the interval... // If they don't, assume death! - replicas, err := m.db.GetReplicasUpdatedAfter(ctx, database.Now().Add(-3*m.options.UpdateInterval)) + replicas, err := m.db.GetReplicasUpdatedAfter(ctx, m.updateInterval()) if err != nil { return xerrors.Errorf("get replicas: %w", err) } diff --git a/enterprise/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go index 0b42f44791df4..79acf86865839 100644 --- a/enterprise/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -178,6 +178,24 @@ func TestReplica(t *testing.T) { }, testutil.WaitShort, testutil.IntervalFast) _ = server.Close() }) + t.Run("DeletesOld", func(t *testing.T) { + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + _, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ + ID: uuid.New(), + UpdatedAt: database.Now().Add(-time.Hour), + }) + require.NoError(t, err) + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, &replicasync.Options{ + RelayAddress: "google.com", + CleanupInterval: time.Millisecond, + }) + require.NoError(t, err) + defer server.Close() + require.Eventually(t, func() bool { + return len(server.Regional()) == 0 + }, testutil.WaitShort, testutil.IntervalFast) + }) t.Run("TwentyConcurrent", func(t *testing.T) { // Ensures that twenty concurrent replicas can spawn and all // discover each other in parallel!
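For context on the patch above: with the default UpdateInterval of 5s, updateInterval() evaluates to "now minus three update intervals" (15s), so a replica that misses three heartbeats is treated as dead; the new CleanupInterval ticker (default 30m) only controls how often those dead rows are purged with DeleteReplicasUpdatedBefore. A rough sketch of wiring the new options together follows — the helper name, relay URL, and the surrounding db/pubsub/logger values are assumptions for illustration, not part of the patch:

    // startReplicaSync is a hypothetical helper showing the new Options knobs.
    // db, ps, and logger are assumed to be constructed elsewhere (e.g. dbtestutil in tests).
    func startReplicaSync(ctx context.Context, logger slog.Logger, db database.Store, ps database.Pubsub) (*replicasync.Manager, error) {
        return replicasync.New(ctx, logger, db, ps, &replicasync.Options{
            RelayAddress:    "http://127.0.0.1:3000", // hypothetical relay address for this sketch
            UpdateInterval:  5 * time.Second,         // heartbeat cadence; 3x this is the staleness cutoff
            CleanupInterval: 30 * time.Minute,        // how often dead replica rows are deleted
        })
    }

Callers are expected to defer Close() on the returned Manager, as the tests above do.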
From bd7fb1314256227c578d659911f3f7ba0d37743f Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 23:18:43 +0000 Subject: [PATCH 48/79] Fix race condition in connection tests --- enterprise/coderd/coderd.go | 4 ++-- enterprise/derpmesh/derpmesh.go | 12 +++++++++--- enterprise/derpmesh/derpmesh_test.go | 10 +++++----- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 371ac12fe21b8..1250e6ae129da 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -283,11 +283,11 @@ func (api *API) updateEntitlements(ctx context.Context) error { for _, replica := range api.replicaManager.Regional() { addresses = append(addresses, replica.RelayAddress) } - api.derpMesh.SetAddresses(addresses) + api.derpMesh.SetAddresses(addresses, false) _ = api.updateEntitlements(ctx) }) } else { - api.derpMesh.SetAddresses([]string{}) + api.derpMesh.SetAddresses([]string{}, false) api.replicaManager.SetCallback(func() { // If the amount of replicas change, so should our entitlements. // This is to display a warning in the UI if the user is unlicensed. diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go index 059eac5a107e7..530c799908fca 100644 --- a/enterprise/derpmesh/derpmesh.go +++ b/enterprise/derpmesh/derpmesh.go @@ -42,7 +42,10 @@ type Mesh struct { // SetAddresses performs a diff of the incoming addresses and adds // or removes DERP clients from the mesh. -func (m *Mesh) SetAddresses(addresses []string) { +// +// Connect is only used for testing to ensure DERPs are meshed before +// exchanging messages. +func (m *Mesh) SetAddresses(addresses []string, connect bool) { total := make(map[string]struct{}, 0) for _, address := range addresses { addressURL, err := url.Parse(address) @@ -58,7 +61,7 @@ func (m *Mesh) SetAddresses(addresses []string) { address = derpURL.String() total[address] = struct{}{} - added, err := m.addAddress(address) + added, err := m.addAddress(address, connect) if err != nil { m.logger.Error(m.ctx, "failed to add address", slog.F("address", address), slog.Error(err)) continue @@ -85,7 +88,7 @@ func (m *Mesh) SetAddresses(addresses []string) { // addAddress begins meshing with a new address. It returns false if the address is already being meshed with. // It's expected that this is a full HTTP address with a path. // e.g. 
http://127.0.0.1:8080/derp -func (m *Mesh) addAddress(address string) (bool, error) { +func (m *Mesh) addAddress(address string, connect bool) (bool, error) { m.mutex.Lock() defer m.mutex.Unlock() _, isActive := m.active[address] @@ -102,6 +105,9 @@ func (m *Mesh) addAddress(address string) (bool, error) { var dialer net.Dialer return dialer.DialContext(ctx, network, addr) }) + if connect { + _ = client.Connect(m.ctx) + } ctx, cancelFunc := context.WithCancel(m.ctx) closed := make(chan struct{}) closeFunc := func() { diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index d1131d59da25b..84875f106c7f2 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -49,9 +49,9 @@ func TestDERPMesh(t *testing.T) { defer firstServer.Close() secondServer, secondServerURL := startDERP(t, tlsConfig) firstMesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), firstServer, tlsConfig) - firstMesh.SetAddresses([]string{secondServerURL}) + firstMesh.SetAddresses([]string{secondServerURL}, true) secondMesh := derpmesh.New(slogtest.Make(t, nil).Named("second").Leveled(slog.LevelDebug), secondServer, tlsConfig) - secondMesh.SetAddresses([]string{firstServerURL}) + secondMesh.SetAddresses([]string{firstServerURL}, true) defer firstMesh.Close() defer secondMesh.Close() @@ -78,9 +78,9 @@ func TestDERPMesh(t *testing.T) { t.Parallel() server, serverURL := startDERP(t, tlsConfig) mesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), server, tlsConfig) - mesh.SetAddresses([]string{"http://fake.com"}) + mesh.SetAddresses([]string{"http://fake.com"}, false) // This should trigger a removal... - mesh.SetAddresses([]string{}) + mesh.SetAddresses([]string{}, false) defer mesh.Close() first := key.NewNode() @@ -114,7 +114,7 @@ func TestDERPMesh(t *testing.T) { meshes = append(meshes, mesh) } for _, mesh := range meshes { - mesh.SetAddresses(serverURLs) + mesh.SetAddresses(serverURLs, true) } first := key.NewNode() From bb5b347ada043f7ed0a30eb9f6754c8ae25416d4 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 23:26:50 +0000 Subject: [PATCH 49/79] Fix linting --- enterprise/derpmesh/derpmesh.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go index 530c799908fca..5de7799aa74eb 100644 --- a/enterprise/derpmesh/derpmesh.go +++ b/enterprise/derpmesh/derpmesh.go @@ -45,6 +45,7 @@ type Mesh struct { // // Connect is only used for testing to ensure DERPs are meshed before // exchanging messages. +// nolint:revive func (m *Mesh) SetAddresses(addresses []string, connect bool) { total := make(map[string]struct{}, 0) for _, address := range addresses { @@ -88,6 +89,7 @@ func (m *Mesh) SetAddresses(addresses []string, connect bool) { // addAddress begins meshing with a new address. It returns false if the address is already being meshed with. // It's expected that this is a full HTTP address with a path. // e.g. 
http://127.0.0.1:8080/derp +// nolint:revive func (m *Mesh) addAddress(address string, connect bool) (bool, error) { m.mutex.Lock() defer m.mutex.Unlock() From 76e0511efef918d121b7834c11bedf3cea0a4771 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 23:38:15 +0000 Subject: [PATCH 50/79] Fix nil type --- coderd/database/dump.sql | 2 +- .../migrations/000061_replicas.up.sql | 2 +- coderd/database/models.go | 22 +++++++++---------- coderd/database/queries.sql.go | 20 ++++++++--------- enterprise/coderd/replicas.go | 2 +- enterprise/replicasync/replicasync.go | 9 +++----- enterprise/replicasync/replicasync_test.go | 8 +++---- 7 files changed, 31 insertions(+), 34 deletions(-) diff --git a/coderd/database/dump.sql b/coderd/database/dump.sql index 1e0a18c1dafef..8e31a990a8925 100644 --- a/coderd/database/dump.sql +++ b/coderd/database/dump.sql @@ -295,7 +295,7 @@ CREATE TABLE replicas ( relay_address text NOT NULL, database_latency integer NOT NULL, version text NOT NULL, - error text + error text DEFAULT ''::text NOT NULL ); CREATE TABLE site_configs ( diff --git a/coderd/database/migrations/000061_replicas.up.sql b/coderd/database/migrations/000061_replicas.up.sql index b1d1a1ab13ee0..1400662e30582 100644 --- a/coderd/database/migrations/000061_replicas.up.sql +++ b/coderd/database/migrations/000061_replicas.up.sql @@ -21,7 +21,7 @@ CREATE TABLE IF NOT EXISTS replicas ( database_latency int NOT NULL, -- Version is the Coder version of the replica. version text NOT NULL, - error text + error text NOT NULL DEFAULT '' ); -- Associates a provisioner daemon with a replica. diff --git a/coderd/database/models.go b/coderd/database/models.go index b4601ecadeb78..53e074984ac11 100644 --- a/coderd/database/models.go +++ b/coderd/database/models.go @@ -540,17 +540,17 @@ type ProvisionerJobLog struct { } type Replica struct { - ID uuid.UUID `db:"id" json:"id"` - CreatedAt time.Time `db:"created_at" json:"created_at"` - StartedAt time.Time `db:"started_at" json:"started_at"` - StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` - UpdatedAt time.Time `db:"updated_at" json:"updated_at"` - Hostname string `db:"hostname" json:"hostname"` - RegionID int32 `db:"region_id" json:"region_id"` - RelayAddress string `db:"relay_address" json:"relay_address"` - DatabaseLatency int32 `db:"database_latency" json:"database_latency"` - Version string `db:"version" json:"version"` - Error sql.NullString `db:"error" json:"error"` + ID uuid.UUID `db:"id" json:"id"` + CreatedAt time.Time `db:"created_at" json:"created_at"` + StartedAt time.Time `db:"started_at" json:"started_at"` + StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + Hostname string `db:"hostname" json:"hostname"` + RegionID int32 `db:"region_id" json:"region_id"` + RelayAddress string `db:"relay_address" json:"relay_address"` + DatabaseLatency int32 `db:"database_latency" json:"database_latency"` + Version string `db:"version" json:"version"` + Error string `db:"error" json:"error"` } type SiteConfig struct { diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 1c10bc259c72c..aa76ddfec52a3 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -2698,16 +2698,16 @@ WHERE id = $1 RETURNING id, created_at, started_at, stopped_at, updated_at, host ` type UpdateReplicaParams struct { - ID uuid.UUID `db:"id" json:"id"` - UpdatedAt time.Time `db:"updated_at" json:"updated_at"` - StartedAt time.Time `db:"started_at" 
json:"started_at"` - StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` - RelayAddress string `db:"relay_address" json:"relay_address"` - RegionID int32 `db:"region_id" json:"region_id"` - Hostname string `db:"hostname" json:"hostname"` - Version string `db:"version" json:"version"` - Error sql.NullString `db:"error" json:"error"` - DatabaseLatency int32 `db:"database_latency" json:"database_latency"` + ID uuid.UUID `db:"id" json:"id"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + StartedAt time.Time `db:"started_at" json:"started_at"` + StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` + RelayAddress string `db:"relay_address" json:"relay_address"` + RegionID int32 `db:"region_id" json:"region_id"` + Hostname string `db:"hostname" json:"hostname"` + Version string `db:"version" json:"version"` + Error string `db:"error" json:"error"` + DatabaseLatency int32 `db:"database_latency" json:"database_latency"` } func (q *sqlQuerier) UpdateReplica(ctx context.Context, arg UpdateReplicaParams) (Replica, error) { diff --git a/enterprise/coderd/replicas.go b/enterprise/coderd/replicas.go index c07c37243d0ca..906597f257f04 100644 --- a/enterprise/coderd/replicas.go +++ b/enterprise/coderd/replicas.go @@ -31,7 +31,7 @@ func convertReplica(replica database.Replica) codersdk.Replica { CreatedAt: replica.CreatedAt, RelayAddress: replica.RelayAddress, RegionID: replica.RegionID, - Error: replica.Error.String, + Error: replica.Error, DatabaseLatency: replica.DatabaseLatency, } } diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index d6cd846d6c96f..aa8eba46613ff 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -271,12 +271,9 @@ func (m *Manager) syncReplicas(ctx context.Context) error { }(peer) } wg.Wait() - replicaError := sql.NullString{} + replicaError := "" if len(failed) > 0 { - replicaError = sql.NullString{ - Valid: true, - String: fmt.Sprintf("Failed to dial peers: %s", strings.Join(failed, ", ")), - } + replicaError = fmt.Sprintf("Failed to dial peers: %s", strings.Join(failed, ", ")) } databaseLatency, err := m.db.Ping(ctx) @@ -301,7 +298,7 @@ func (m *Manager) syncReplicas(ctx context.Context) error { } m.mutex.Lock() defer m.mutex.Unlock() - if m.self.Error.String != replica.Error.String { + if m.self.Error != replica.Error { // Publish an update occurred! 
err = m.pubsub.Publish(PubsubEvent, []byte(m.self.ID.String())) if err != nil { diff --git a/enterprise/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go index 79acf86865839..40e087a7616ce 100644 --- a/enterprise/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -84,7 +84,7 @@ func TestReplica(t *testing.T) { require.NoError(t, err) require.Len(t, server.Regional(), 1) require.Equal(t, peer.ID, server.Regional()[0].ID) - require.False(t, server.Self().Error.Valid) + require.Empty(t, server.Self().Error) _ = server.Close() }) t.Run("ConnectsToPeerReplicaTLS", func(t *testing.T) { @@ -125,7 +125,7 @@ func TestReplica(t *testing.T) { require.NoError(t, err) require.Len(t, server.Regional(), 1) require.Equal(t, peer.ID, server.Regional()[0].ID) - require.False(t, server.Self().Error.Valid) + require.Empty(t, server.Self().Error) _ = server.Close() }) t.Run("ConnectsToFakePeerWithError", func(t *testing.T) { @@ -148,8 +148,8 @@ func TestReplica(t *testing.T) { require.NoError(t, err) require.Len(t, server.Regional(), 1) require.Equal(t, peer.ID, server.Regional()[0].ID) - require.True(t, server.Self().Error.Valid) - require.Contains(t, server.Self().Error.String, "Failed to dial peers") + require.NotEmpty(t, server.Self().Error) + require.Contains(t, server.Self().Error, "Failed to dial peers") _ = server.Close() }) t.Run("RefreshOnPublish", func(t *testing.T) { From 1ff5f7d81cad45cf6c6346f6763df3dd8ad1bdb5 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 00:51:49 +0000 Subject: [PATCH 51/79] Move pubsub to in-memory for twenty test --- enterprise/replicasync/replicasync_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/enterprise/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go index 40e087a7616ce..2e3d1deafc68c 100644 --- a/enterprise/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -16,6 +16,7 @@ import ( "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/coderd/database" + "github.com/coder/coder/coderd/database/databasefake" "github.com/coder/coder/coderd/database/dbtestutil" "github.com/coder/coder/enterprise/replicasync" "github.com/coder/coder/testutil" ) @@ -200,7 +201,8 @@ func TestReplica(t *testing.T) { // Ensures that twenty concurrent replicas can spawn and all // discover each other in parallel! t.Parallel() - db, pubsub := dbtestutil.NewDB(t) + db := databasefake.New() + pubsub := database.NewPubsubInMemory() logger := slogtest.Make(t, nil) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) From b732184e0a52d7c121d5b4a1d696c7721510d57f Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 00:54:19 +0000 Subject: [PATCH 52/79] Add comment for configuration tweaking --- enterprise/replicasync/replicasync_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/enterprise/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go index 2e3d1deafc68c..b6eb45bb9d316 100644 --- a/enterprise/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -201,6 +201,9 @@ func TestReplica(t *testing.T) { // Ensures that twenty concurrent replicas can spawn and all // discover each other in parallel! t.Parallel() + // This uses the database fake because creating + // this many PostgreSQL connections takes some + // configuration tweaking.
db := databasefake.New() pubsub := database.NewPubsubInMemory() logger := slogtest.Make(t, nil) From 38465ac4f9306bac1669335c4155b2440838d361 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 01:12:32 +0000 Subject: [PATCH 53/79] Fix leak with transport --- coderd/wsconncache/wsconncache_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/coderd/wsconncache/wsconncache_test.go b/coderd/wsconncache/wsconncache_test.go index 003d3cddb8b7a..d4345ce9d5f05 100644 --- a/coderd/wsconncache/wsconncache_test.go +++ b/coderd/wsconncache/wsconncache_test.go @@ -128,7 +128,9 @@ func TestCache(t *testing.T) { return } defer release() - proxy.Transport = conn.HTTPTransport() + transport := conn.HTTPTransport() + defer transport.CloseIdleConnections() + proxy.Transport = transport res := httptest.NewRecorder() proxy.ServeHTTP(res, req) resp := res.Result() From 72555e2d8cd81cd7dd02498f13f59071d6ef1705 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 01:17:47 +0000 Subject: [PATCH 54/79] Fix close leak in derpmesh --- enterprise/derpmesh/derpmesh.go | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go index 5de7799aa74eb..3982542167073 100644 --- a/enterprise/derpmesh/derpmesh.go +++ b/enterprise/derpmesh/derpmesh.go @@ -93,6 +93,9 @@ func (m *Mesh) SetAddresses(addresses []string, connect bool) { func (m *Mesh) addAddress(address string, connect bool) (bool, error) { m.mutex.Lock() defer m.mutex.Unlock() + if m.isClosed() { + return false, nil + } _, isActive := m.active[address] if isActive { return false, nil @@ -142,10 +145,8 @@ func (m *Mesh) removeAddress(address string) bool { func (m *Mesh) Close() error { m.mutex.Lock() defer m.mutex.Unlock() - select { - case <-m.closed: + if m.isClosed() { return nil - default: } close(m.closed) for _, cancelFunc := range m.active { @@ -153,3 +154,12 @@ func (m *Mesh) Close() error { } return nil } + +func (m *Mesh) isClosed() bool { + select { + case <-m.closed: + return true + default: + } + return false +} From e54072a53a80d78c54ea82897301f8120d35ad15 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 01:30:09 +0000 Subject: [PATCH 55/79] Fix race when creating server --- coderd/coderdtest/coderdtest.go | 8 +++++--- enterprise/coderd/coderdenttest/coderdenttest.go | 5 +++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index de49afaa1c269..4a7c3e38b69e1 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -170,9 +170,6 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance Certificates: options.TLSCertificates, MinVersion: tls.VersionTLS12, } - srv.StartTLS() - } else { - srv.Start() } t.Cleanup(srv.Close) @@ -266,6 +263,11 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c // We set the handler after server creation for the access URL. 
coderAPI := coderd.New(newOptions) srv.Config.Handler = coderAPI.RootHandler + if newOptions.TLSCertificates != nil { + srv.StartTLS() + } else { + srv.Start() + } var provisionerCloser io.Closer = nopcloser{} if options.IncludeProvisionerDaemon { provisionerCloser = NewProvisionerDaemon(t, coderAPI) diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index 02eff4e2acf2e..fbffa683bea8b 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -77,6 +77,11 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c }) assert.NoError(t, err) srv.Config.Handler = coderAPI.AGPL.RootHandler + if oop.TLSCertificates != nil { + srv.StartTLS() + } else { + srv.Start() + } var provisionerCloser io.Closer = nopcloser{} if options.IncludeProvisionerDaemon { provisionerCloser = coderdtest.NewProvisionerDaemon(t, coderAPI.AGPL) From 27d5f40619e47cc5729dce114fba34b771daf612 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 02:48:52 +0000 Subject: [PATCH 56/79] Remove handler update --- coderd/coderdtest/coderdtest.go | 8 +++----- enterprise/coderd/coderdenttest/coderdenttest.go | 5 ----- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index 4a7c3e38b69e1..de49afaa1c269 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -170,6 +170,9 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance Certificates: options.TLSCertificates, MinVersion: tls.VersionTLS12, } + srv.StartTLS() + } else { + srv.Start() } t.Cleanup(srv.Close) @@ -263,11 +266,6 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c // We set the handler after server creation for the access URL. 
coderAPI := coderd.New(newOptions) srv.Config.Handler = coderAPI.RootHandler - if newOptions.TLSCertificates != nil { - srv.StartTLS() - } else { - srv.Start() - } var provisionerCloser io.Closer = nopcloser{} if options.IncludeProvisionerDaemon { provisionerCloser = NewProvisionerDaemon(t, coderAPI) diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index fbffa683bea8b..02eff4e2acf2e 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -77,11 +77,6 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c }) assert.NoError(t, err) srv.Config.Handler = coderAPI.AGPL.RootHandler - if oop.TLSCertificates != nil { - srv.StartTLS() - } else { - srv.Start() - } var provisionerCloser io.Closer = nopcloser{} if options.IncludeProvisionerDaemon { provisionerCloser = coderdtest.NewProvisionerDaemon(t, coderAPI.AGPL) From 4d0b1d86854a41a4616578c5655e00d65f2cfffa Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 03:23:53 +0000 Subject: [PATCH 57/79] Skip test on Windows --- enterprise/derpmesh/derpmesh_test.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index 84875f106c7f2..1c1d658bb03c2 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -7,6 +7,7 @@ import ( "errors" "io" "net/http/httptest" + "runtime" "testing" "github.com/stretchr/testify/assert" @@ -101,6 +102,9 @@ func TestDERPMesh(t *testing.T) { }) t.Run("TwentyMeshes", func(t *testing.T) { t.Parallel() + if runtime.GOOS == "windows" { + t.Skip("This test is races on Windows... I think because it's too slow.") + } meshes := make([]*derpmesh.Mesh, 0, 20) serverURLs := make([]string, 0, 20) for i := 0; i < 20; i++ { From 129f5ba6511615e0a8920164ce56b90efb41e214 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 03:39:45 +0000 Subject: [PATCH 58/79] Fix DERP mesh test --- enterprise/derpmesh/derpmesh_test.go | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index 1c1d658bb03c2..7ca844c57e6fc 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -7,8 +7,8 @@ import ( "errors" "io" "net/http/httptest" - "runtime" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -102,9 +102,6 @@ func TestDERPMesh(t *testing.T) { }) t.Run("TwentyMeshes", func(t *testing.T) { t.Parallel() - if runtime.GOOS == "windows" { - t.Skip("This test is races on Windows... 
I think because it's too slow.") - } meshes := make([]*derpmesh.Mesh, 0, 20) serverURLs := make([]string, 0, 20) for i := 0; i < 20; i++ { @@ -132,12 +129,28 @@ func TestDERPMesh(t *testing.T) { err = secondClient.Connect(context.Background()) require.NoError(t, err) + closed := make(chan struct{}) + ctx, cancelFunc := context.WithCancel(context.Background()) + defer cancelFunc() sent := []byte("hello world") - err = firstClient.Send(second.Public(), sent) - require.NoError(t, err) + go func() { + defer close(closed) + ticker := time.NewTicker(time.Second) + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + err = firstClient.Send(second.Public(), sent) + require.NoError(t, err) + } + }() got := recvData(t, secondClient) require.Equal(t, sent, got) + cancelFunc() + <-closed }) } From 4e5d30e6267ca70ea318ca2ffe84fea369725617 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 04:47:04 +0000 Subject: [PATCH 59/79] Wrap HTTP handler replacement in mutex --- coderd/coderdtest/coderdtest.go | 115 ++++++++++-------- .../coderd/coderdenttest/coderdenttest.go | 4 +- 2 files changed, 66 insertions(+), 53 deletions(-) diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index de49afaa1c269..31785ff7e7950 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -24,6 +24,7 @@ import ( "regexp" "strconv" "strings" + "sync" "testing" "time" @@ -127,7 +128,7 @@ func newWithCloser(t *testing.T, options *Options) (*codersdk.Client, io.Closer) return client, closer } -func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.CancelFunc, *coderd.Options) { +func NewOptions(t *testing.T, options *Options) (func(http.Handler), context.CancelFunc, *coderd.Options) { if options == nil { options = &Options{} } @@ -161,7 +162,15 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance ).WithStatsChannel(options.AutobuildStats) lifecycleExecutor.Run() - srv := httptest.NewUnstartedServer(nil) + var mutex sync.RWMutex + var handler http.Handler + srv := httptest.NewUnstartedServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + mutex.RLock() + defer mutex.RUnlock() + if handler != nil { + handler.ServeHTTP(w, r) + } + })) srv.Config.BaseContext = func(_ net.Listener) context.Context { return ctx } @@ -204,55 +213,59 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance require.NoError(t, err) } - return srv, cancelFunc, &coderd.Options{ - AgentConnectionUpdateFrequency: 150 * time.Millisecond, - // Force a long disconnection timeout to ensure - // agents are not marked as disconnected during slow tests. 
- AgentInactiveDisconnectTimeout: testutil.WaitShort, - AccessURL: serverURL, - AppHostname: options.AppHostname, - AppHostnameRegex: appHostnameRegex, - Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug), - CacheDir: t.TempDir(), - Database: options.Database, - Pubsub: options.Pubsub, - - Auditor: options.Auditor, - AWSCertificates: options.AWSCertificates, - AzureCertificates: options.AzureCertificates, - GithubOAuth2Config: options.GithubOAuth2Config, - OIDCConfig: options.OIDCConfig, - GoogleTokenValidator: options.GoogleTokenValidator, - SSHKeygenAlgorithm: options.SSHKeygenAlgorithm, - DERPServer: derpServer, - APIRateLimit: options.APIRateLimit, - Authorizer: options.Authorizer, - Telemetry: telemetry.NewNoop(), - TLSCertificates: options.TLSCertificates, - DERPMap: &tailcfg.DERPMap{ - Regions: map[int]*tailcfg.DERPRegion{ - 1: { - EmbeddedRelay: true, - RegionID: 1, - RegionCode: "coder", - RegionName: "Coder", - Nodes: []*tailcfg.DERPNode{{ - Name: "1a", - RegionID: 1, - IPv4: "127.0.0.1", - DERPPort: derpPort, - STUNPort: stunAddr.Port, - InsecureForTests: true, - ForceHTTP: options.TLSCertificates == nil, - }}, + return func(h http.Handler) { + mutex.Lock() + handler = h + mutex.Unlock() + }, cancelFunc, &coderd.Options{ + AgentConnectionUpdateFrequency: 150 * time.Millisecond, + // Force a long disconnection timeout to ensure + // agents are not marked as disconnected during slow tests. + AgentInactiveDisconnectTimeout: testutil.WaitShort, + AccessURL: serverURL, + AppHostname: options.AppHostname, + AppHostnameRegex: appHostnameRegex, + Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug), + CacheDir: t.TempDir(), + Database: options.Database, + Pubsub: options.Pubsub, + + Auditor: options.Auditor, + AWSCertificates: options.AWSCertificates, + AzureCertificates: options.AzureCertificates, + GithubOAuth2Config: options.GithubOAuth2Config, + OIDCConfig: options.OIDCConfig, + GoogleTokenValidator: options.GoogleTokenValidator, + SSHKeygenAlgorithm: options.SSHKeygenAlgorithm, + DERPServer: derpServer, + APIRateLimit: options.APIRateLimit, + Authorizer: options.Authorizer, + Telemetry: telemetry.NewNoop(), + TLSCertificates: options.TLSCertificates, + DERPMap: &tailcfg.DERPMap{ + Regions: map[int]*tailcfg.DERPRegion{ + 1: { + EmbeddedRelay: true, + RegionID: 1, + RegionCode: "coder", + RegionName: "Coder", + Nodes: []*tailcfg.DERPNode{{ + Name: "1a", + RegionID: 1, + IPv4: "127.0.0.1", + DERPPort: derpPort, + STUNPort: stunAddr.Port, + InsecureForTests: true, + ForceHTTP: options.TLSCertificates == nil, + }}, + }, }, }, - }, - AutoImportTemplates: options.AutoImportTemplates, - MetricsCacheRefreshInterval: options.MetricsCacheRefreshInterval, - AgentStatsRefreshInterval: options.AgentStatsRefreshInterval, - DeploymentFlags: options.DeploymentFlags, - } + AutoImportTemplates: options.AutoImportTemplates, + MetricsCacheRefreshInterval: options.MetricsCacheRefreshInterval, + AgentStatsRefreshInterval: options.AgentStatsRefreshInterval, + DeploymentFlags: options.DeploymentFlags, + } } // NewWithAPI constructs an in-memory API instance and returns a client to talk to it. @@ -262,10 +275,10 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c if options == nil { options = &Options{} } - srv, cancelFunc, newOptions := NewOptions(t, options) + setHandler, cancelFunc, newOptions := NewOptions(t, options) // We set the handler after server creation for the access URL. 
coderAPI := coderd.New(newOptions) - srv.Config.Handler = coderAPI.RootHandler + setHandler(coderAPI.APIHandler) var provisionerCloser io.Closer = nopcloser{} if options.IncludeProvisionerDaemon { provisionerCloser = NewProvisionerDaemon(t, coderAPI) diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index 02eff4e2acf2e..a8595b5bc6ede 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -62,7 +62,7 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c if options.Options == nil { options.Options = &coderdtest.Options{} } - srv, cancelFunc, oop := coderdtest.NewOptions(t, options.Options) + setHandler, cancelFunc, oop := coderdtest.NewOptions(t, options.Options) coderAPI, err := coderd.New(context.Background(), &coderd.Options{ RBAC: true, AuditLogging: options.AuditLogging, @@ -76,7 +76,7 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c Keys: Keys, }) assert.NoError(t, err) - srv.Config.Handler = coderAPI.AGPL.RootHandler + setHandler(coderAPI.AGPL.RootHandler) var provisionerCloser io.Closer = nopcloser{} if options.IncludeProvisionerDaemon { provisionerCloser = coderdtest.NewProvisionerDaemon(t, coderAPI.AGPL) From 0359a7e9a79debe4f4e28fdde81faa27094e153e Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 14:44:24 +0000 Subject: [PATCH 60/79] Fix error message for relay --- enterprise/replicasync/replicasync.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index aa8eba46613ff..4aeabd2a05742 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -178,7 +178,7 @@ func (m *Manager) subscribe(ctx context.Context) error { update = func() { err := m.syncReplicas(ctx) if err != nil && !errors.Is(err, context.Canceled) { - m.logger.Error(ctx, "run replica from subscribe", slog.Error(err)) + m.logger.Warn(ctx, "run replica from subscribe", slog.Error(err)) } updateMutex.Lock() if needsUpdate { @@ -256,7 +256,7 @@ func (m *Manager) syncReplicas(ctx context.Context) error { defer wg.Done() req, err := http.NewRequestWithContext(ctx, http.MethodGet, peer.RelayAddress, nil) if err != nil { - m.logger.Error(ctx, "create http request for relay probe", + m.logger.Warn(ctx, "create http request for relay probe", slog.F("relay_address", peer.RelayAddress), slog.Error(err)) return } From f364d1fe7523af406fb047828ba93fd6c2b139df Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 14:51:48 +0000 Subject: [PATCH 61/79] Fix API handler for normal tests --- coderd/coderdtest/coderdtest.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index 31785ff7e7950..2a7184e2ca05d 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -215,8 +215,8 @@ func NewOptions(t *testing.T, options *Options) (func(http.Handler), context.Can return func(h http.Handler) { mutex.Lock() + defer mutex.Unlock() handler = h - mutex.Unlock() }, cancelFunc, &coderd.Options{ AgentConnectionUpdateFrequency: 150 * time.Millisecond, // Force a long disconnection timeout to ensure @@ -278,7 +278,7 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c setHandler, cancelFunc, newOptions := NewOptions(t, options) // We set the handler 
after server creation for the access URL. coderAPI := coderd.New(newOptions) - setHandler(coderAPI.APIHandler) + setHandler(coderAPI.RootHandler) var provisionerCloser io.Closer = nopcloser{} if options.IncludeProvisionerDaemon { provisionerCloser = NewProvisionerDaemon(t, coderAPI) From 423a47e1dd9aa227a3a48ada5ac343a732bb9d01 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 15:25:34 +0000 Subject: [PATCH 62/79] Fix speedtest --- agent/agent_test.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/agent/agent_test.go b/agent/agent_test.go index e1269d6003922..e10eee7f111a0 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -483,9 +483,7 @@ func TestAgent(t *testing.T) { t.Run("Speedtest", func(t *testing.T) { t.Parallel() - if testing.Short() { - t.Skip("The minimum duration for a speedtest is hardcoded in Tailscale to 5s!") - } + t.Skip("This test is relatively flakey because of Tailscale's speedtest code...") derpMap := tailnettest.RunDERPAndSTUN(t) conn, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{ DERPMap: derpMap, From c3a77fe2d048b88d4e6e1f5d1ad8475735a18f18 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 16:21:13 +0000 Subject: [PATCH 63/79] Fix replica resend --- enterprise/derpmesh/derpmesh_test.go | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index 7ca844c57e6fc..2878346d4ee43 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -94,11 +94,28 @@ func TestDERPMesh(t *testing.T) { secondClient.TLSConfig = tlsConfig err = secondClient.Connect(context.Background()) require.NoError(t, err) + + closed := make(chan struct{}) + ctx, cancelFunc := context.WithCancel(context.Background()) + defer cancelFunc() sent := []byte("hello world") - err = firstClient.Send(second.Public(), sent) - require.NoError(t, err) + go func() { + defer close(closed) + ticker := time.NewTicker(50 * time.Millisecond) + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + err = firstClient.Send(second.Public(), sent) + require.NoError(t, err) + } + }() got := recvData(t, secondClient) require.Equal(t, sent, got) + cancelFunc() + <-closed }) t.Run("TwentyMeshes", func(t *testing.T) { t.Parallel() @@ -135,7 +152,7 @@ func TestDERPMesh(t *testing.T) { sent := []byte("hello world") go func() { defer close(closed) - ticker := time.NewTicker(time.Second) + ticker := time.NewTicker(50 * time.Millisecond) for { select { case <-ctx.Done(): From 729f8a07acda2301d0fa60dc350b3838da956a04 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 16:34:38 +0000 Subject: [PATCH 64/79] Fix derpmesh send --- enterprise/derpmesh/derpmesh_test.go | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index 2878346d4ee43..7fad141238442 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -67,12 +67,28 @@ func TestDERPMesh(t *testing.T) { err = secondClient.Connect(context.Background()) require.NoError(t, err) + closed := make(chan struct{}) + ctx, cancelFunc := context.WithCancel(context.Background()) + defer cancelFunc() sent := []byte("hello world") - err = firstClient.Send(second.Public(), sent) - require.NoError(t, err) + go func() { + defer close(closed) + ticker := time.NewTicker(50 * time.Millisecond) + for { 
+ select { + case <-ctx.Done(): + return + case <-ticker.C: + } + err = firstClient.Send(second.Public(), sent) + require.NoError(t, err) + } + }() got := recvData(t, secondClient) require.Equal(t, sent, got) + cancelFunc() + <-closed }) t.Run("RemoveAddress", func(t *testing.T) { // This tests messages passing through multiple DERP servers. From ae0bc5df1e0538fc4e4a0155044bd37724dc2b23 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 21:17:12 +0000 Subject: [PATCH 65/79] Ping async --- codersdk/agentconn.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codersdk/agentconn.go b/codersdk/agentconn.go index e75edf1ca6bb0..ddfb9541a186a 100644 --- a/codersdk/agentconn.go +++ b/codersdk/agentconn.go @@ -135,7 +135,7 @@ type AgentConn struct { func (c *AgentConn) Ping(ctx context.Context) (time.Duration, error) { errCh := make(chan error, 1) durCh := make(chan time.Duration, 1) - c.Conn.Ping(TailnetIP, tailcfg.PingDisco, func(pr *ipnstate.PingResult) { + go c.Conn.Ping(TailnetIP, tailcfg.PingDisco, func(pr *ipnstate.PingResult) { if pr.Err != "" { errCh <- xerrors.New(pr.Err) return From d7d50db6dd6cec7adefbf4153cc075a3588c4be3 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 21:23:26 +0000 Subject: [PATCH 66/79] Increase wait time of template version jobd --- coderd/coderdtest/coderdtest.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index 2a7184e2ca05d..5cf307d842e90 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -497,7 +497,7 @@ func AwaitTemplateVersionJob(t *testing.T, client *codersdk.Client, version uuid var err error templateVersion, err = client.TemplateVersion(context.Background(), version) return assert.NoError(t, err) && templateVersion.Job.CompletedAt != nil - }, testutil.WaitShort, testutil.IntervalFast) + }, testutil.WaitMedium, testutil.IntervalFast) return templateVersion } From 77d23dc113a1fef2670ec9a898a94c9d6eb0ec18 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 22:24:18 +0000 Subject: [PATCH 67/79] Fix race when closing replica sync --- enterprise/replicasync/replicasync.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index 4aeabd2a05742..0534c55246824 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -362,7 +362,8 @@ func (m *Manager) Close() error { m.closeCancel() m.closeWait.Wait() m.closeMutex.Unlock() - + m.mutex.Lock() + defer m.mutex.Unlock() ctx, cancelFunc := context.WithTimeout(context.Background(), 5*time.Second) defer cancelFunc() _, err := m.db.UpdateReplica(ctx, database.UpdateReplicaParams{ From 435bbbb364a47bffeada924e45772059a4250022 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 22:57:10 +0000 Subject: [PATCH 68/79] Add name to client --- enterprise/coderd/replicas_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index fae418ab87261..7a3e130cf7770 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -118,7 +118,7 @@ func TestReplicas(t *testing.T) { _, agent := setupWorkspaceAgent(t, firstClient, firstUser, 0) conn, err := secondClient.DialWorkspaceAgent(context.Background(), agent.ID, &codersdk.DialWorkspaceAgentOptions{ BlockEndpoints: true, - Logger: 
slogtest.Make(t, nil).Leveled(slog.LevelDebug), + Logger: slogtest.Make(t, nil).Named("client").Leveled(slog.LevelDebug), }) require.NoError(t, err) require.Eventually(t, func() bool { From 9b7c41afd462daf8f42d57d865b07e33d744f141 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 00:27:24 +0000 Subject: [PATCH 69/79] Log the derpmap being used --- agent/agent.go | 1 + 1 file changed, 1 insertion(+) diff --git a/agent/agent.go b/agent/agent.go index 6d0a9a952f44b..f7c5598b7b710 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -170,6 +170,7 @@ func (a *agent) runTailnet(ctx context.Context, derpMap *tailcfg.DERPMap) { if a.isClosed() { return } + a.logger.Debug(ctx, "running tailnet with derpmap", slog.F("derpmap", derpMap)) if a.network != nil { a.network.SetDERPMap(derpMap) return From 961540291ae4137848850dfd2c04bf78a3bd1079 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 00:30:43 +0000 Subject: [PATCH 70/79] Don't connect if DERP is empty --- tailnet/conn.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tailnet/conn.go b/tailnet/conn.go index e41ed60a527f3..7c572a55e1b66 100644 --- a/tailnet/conn.go +++ b/tailnet/conn.go @@ -344,9 +344,14 @@ func (c *Conn) UpdateNodes(nodes []*Node) error { // reason. TODO: @kylecarbs debug this! KeepAlive: ok && peerStatus.Active, } + // If no preferred DERP is provided, don't set an IP! + if node.PreferredDERP == 0 { + peerNode.DERP = "" + } if c.blockEndpoints { peerNode.Endpoints = nil } + c.logger.Debug(context.Background(), "adding node", slog.F("node", peerNode)) c.peerMap[node.ID] = peerNode } c.netMap.Peers = make([]*tailcfg.Node, 0, len(c.peerMap)) From bcb97ac7087b105b78c087ac89203024e51192f5 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 01:30:29 +0000 Subject: [PATCH 71/79] Improve agent coordinator logging --- coderd/workspaceagents.go | 2 ++ tailnet/conn.go | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/coderd/workspaceagents.go b/coderd/workspaceagents.go index b93369b0cb9cb..fb7f765cc7519 100644 --- a/coderd/workspaceagents.go +++ b/coderd/workspaceagents.go @@ -378,6 +378,7 @@ func (api *API) dialWorkspaceAgentTailnet(r *http.Request, agentID uuid.UUID) (* go func() { err := (*api.TailnetCoordinator.Load()).ServeClient(serverConn, uuid.New(), agentID) if err != nil { + api.Logger.Warn(r.Context(), "tailnet coordinator client error", slog.Error(err)) _ = conn.Close() } }() @@ -516,6 +517,7 @@ func (api *API) workspaceAgentCoordinate(rw http.ResponseWriter, r *http.Request defer close(closeChan) err := (*api.TailnetCoordinator.Load()).ServeAgent(wsNetConn, workspaceAgent.ID) if err != nil { + api.Logger.Warn(ctx, "tailnet coordinator agent error", slog.Error(err)) _ = conn.Close(websocket.StatusInternalError, err.Error()) return } diff --git a/tailnet/conn.go b/tailnet/conn.go index 7c572a55e1b66..2f2549718880d 100644 --- a/tailnet/conn.go +++ b/tailnet/conn.go @@ -328,6 +328,8 @@ func (c *Conn) UpdateNodes(nodes []*Node) error { delete(c.peerMap, peer.ID) } for _, node := range nodes { + c.logger.Debug(context.Background(), "adding node", slog.F("node", node)) + peerStatus, ok := status.Peer[node.Key] peerNode := &tailcfg.Node{ ID: node.ID, @@ -351,7 +353,6 @@ func (c *Conn) UpdateNodes(nodes []*Node) error { if c.blockEndpoints { peerNode.Endpoints = nil } - c.logger.Debug(context.Background(), "adding node", slog.F("node", peerNode)) c.peerMap[node.ID] = peerNode } c.netMap.Peers = make([]*tailcfg.Node, 0, len(c.peerMap)) From 
e2f6a1939f819feb3a1785c31e90a44e239b6f35 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 01:35:07 +0000 Subject: [PATCH 72/79] Fix lock in coordinator --- enterprise/tailnet/coordinator.go | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index 1ccf56f50da11..4bfae463e202a 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -178,12 +178,10 @@ func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { if len(nodes) > 0 { data, err := json.Marshal(nodes) if err != nil { - c.mutex.Unlock() return xerrors.Errorf("marshal json: %w", err) } _, err = conn.Write(data) if err != nil { - c.mutex.Unlock() return xerrors.Errorf("write nodes: %w", err) } } @@ -250,17 +248,16 @@ func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder) ( } c.mutex.Lock() - defer c.mutex.Unlock() - c.nodes[id] = &node - connectionSockets, ok := c.agentToConnectionSockets[id] if !ok { + c.mutex.Unlock() return &node, nil } data, err := json.Marshal([]*agpl.Node{&node}) if err != nil { + c.mutex.Unlock() return nil, xerrors.Errorf("marshal nodes: %w", err) } @@ -275,6 +272,7 @@ func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder) ( _, _ = connectionSocket.Write(data) }() } + c.mutex.Unlock() wg.Wait() return &node, nil } @@ -394,12 +392,12 @@ func (c *haCoordinator) runPubsub() error { } c.mutex.Lock() - defer c.mutex.Unlock() - agentSocket, ok := c.agentSockets[agentUUID] if !ok { + c.mutex.Unlock() return } + c.mutex.Unlock() // We get a single node over pubsub, so turn into an array. _, err = agentSocket.Write(nodeJSON) @@ -410,7 +408,6 @@ func (c *haCoordinator) runPubsub() error { c.log.Error(ctx, "send callmemaybe to agent", slog.Error(err)) return } - case "agenthello": agentUUID, err := uuid.ParseBytes(agentID) if err != nil { @@ -426,7 +423,6 @@ func (c *haCoordinator) runPubsub() error { return } } - case "agentupdate": agentUUID, err := uuid.ParseBytes(agentID) if err != nil { @@ -440,7 +436,6 @@ func (c *haCoordinator) runPubsub() error { c.log.Error(ctx, "handle agent update", slog.Error(err)) return } - default: c.log.Error(ctx, "unknown peer event", slog.F("name", string(eventType))) } From c855c9ba60d511153ea9d1f30f57f6e8ccca626b Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 01:53:50 +0000 Subject: [PATCH 73/79] Fix relay addr --- enterprise/replicasync/replicasync_test.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/enterprise/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go index b6eb45bb9d316..b7709c1f6f814 100644 --- a/enterprise/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -134,17 +134,17 @@ func TestReplica(t *testing.T) { db, pubsub := dbtestutil.NewDB(t) peer, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ ID: uuid.New(), - CreatedAt: database.Now(), - StartedAt: database.Now(), - UpdatedAt: database.Now(), + CreatedAt: database.Now().Add(time.Minute), + StartedAt: database.Now().Add(time.Minute), + UpdatedAt: database.Now().Add(time.Minute), Hostname: "something", - // Fake address to hit! - RelayAddress: "http://169.254.169.254", + // Fake address to dial! 
+ RelayAddress: "http://127.0.0.1:1", }) require.NoError(t, err) server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, &replicasync.Options{ PeerTimeout: 1 * time.Millisecond, - RelayAddress: "http://169.254.169.254", + RelayAddress: "http://127.0.0.1:1", }) require.NoError(t, err) require.Len(t, server.Regional(), 1) From a0e5cab653e6000e149404838fa90e8a27813ae6 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 02:04:41 +0000 Subject: [PATCH 74/79] Fix race when updating durations --- coderd/activitybump_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coderd/activitybump_test.go b/coderd/activitybump_test.go index dec5ec42f6556..e498b98fa0c80 100644 --- a/coderd/activitybump_test.go +++ b/coderd/activitybump_test.go @@ -72,7 +72,7 @@ func TestWorkspaceActivityBump(t *testing.T) { "deadline %v never updated", firstDeadline, ) - require.WithinDuration(t, database.Now().Add(time.Hour), workspace.LatestBuild.Deadline.Time, time.Second) + require.WithinDuration(t, database.Now().Add(time.Hour), workspace.LatestBuild.Deadline.Time, 3*time.Second) } } From 9878fc51d4b6a464e87bcb5f59f6891e1e39d5d9 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 02:37:28 +0000 Subject: [PATCH 75/79] Fix client publish race --- enterprise/tailnet/coordinator.go | 60 +++++++++++++++++++++++--- enterprise/tailnet/coordinator_test.go | 11 ++--- 2 files changed, 61 insertions(+), 10 deletions(-) diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index 4bfae463e202a..2f284cb00ff61 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -56,8 +56,8 @@ type haCoordinator struct { // Node returns an in-memory node by ID. 
func (c *haCoordinator) Node(id uuid.UUID) *agpl.Node { - c.mutex.RLock() - defer c.mutex.RUnlock() + c.mutex.Lock() + defer c.mutex.Unlock() node := c.nodes[id] return node } @@ -79,6 +79,11 @@ func (c *haCoordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID if err != nil { return xerrors.Errorf("write nodes: %w", err) } + } else { + err := c.publishClientHello(agent) + if err != nil { + return xerrors.Errorf("publish client hello: %w", err) + } } c.mutex.Lock() @@ -205,7 +210,7 @@ func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { decoder := json.NewDecoder(conn) for { - node, err := c.hangleAgentUpdate(id, decoder) + node, err := c.handleAgentUpdate(id, decoder) if err != nil { if errors.Is(err, io.EOF) { return nil @@ -240,7 +245,17 @@ func (c *haCoordinator) nodesSubscribedToAgent(agentID uuid.UUID) []*agpl.Node { return nodes } -func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder) (*agpl.Node, error) { +func (c *haCoordinator) handleClientHello(id uuid.UUID) error { + c.mutex.Lock() + node, ok := c.nodes[id] + c.mutex.Unlock() + if !ok { + return nil + } + return c.publishAgentToNodes(id, node) +} + +func (c *haCoordinator) handleAgentUpdate(id uuid.UUID, decoder *json.Decoder) (*agpl.Node, error) { var node agpl.Node err := decoder.Decode(&node) if err != nil { @@ -343,6 +358,18 @@ func (c *haCoordinator) publishAgentHello(id uuid.UUID) error { return nil } +func (c *haCoordinator) publishClientHello(id uuid.UUID) error { + msg, err := c.formatClientHello(id) + if err != nil { + return xerrors.Errorf("format client hello: %w", err) + } + err = c.pubsub.Publish("wireguard_peers", msg) + if err != nil { + return xerrors.Errorf("publish client hello: %w", err) + } + return nil +} + func (c *haCoordinator) publishAgentToNodes(id uuid.UUID, node *agpl.Node) error { msg, err := c.formatAgentUpdate(id, node) if err != nil { @@ -408,6 +435,18 @@ func (c *haCoordinator) runPubsub() error { c.log.Error(ctx, "send callmemaybe to agent", slog.Error(err)) return } + case "clienthello": + agentUUID, err := uuid.ParseBytes(agentID) + if err != nil { + c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + return + } + + err = c.handleClientHello(agentUUID) + if err != nil { + c.log.Error(ctx, "handle agent request node", slog.Error(err)) + return + } case "agenthello": agentUUID, err := uuid.ParseBytes(agentID) if err != nil { @@ -431,7 +470,7 @@ func (c *haCoordinator) runPubsub() error { } decoder := json.NewDecoder(bytes.NewReader(nodeJSON)) - _, err = c.hangleAgentUpdate(agentUUID, decoder) + _, err = c.handleAgentUpdate(agentUUID, decoder) if err != nil { c.log.Error(ctx, "handle agent update", slog.Error(err)) return @@ -478,6 +517,17 @@ func (c *haCoordinator) formatAgentHello(id uuid.UUID) ([]byte, error) { return buf.Bytes(), nil } +// format: |clienthello|| +func (c *haCoordinator) formatClientHello(id uuid.UUID) ([]byte, error) { + buf := bytes.Buffer{} + + buf.WriteString(c.id.String() + "|") + buf.WriteString("clienthello|") + buf.WriteString(id.String() + "|") + + return buf.Bytes(), nil +} + // format: |agentupdate|| func (c *haCoordinator) formatAgentUpdate(id uuid.UUID, node *agpl.Node) ([]byte, error) { buf := bytes.Buffer{} diff --git a/enterprise/tailnet/coordinator_test.go b/enterprise/tailnet/coordinator_test.go index 83fac250b2916..86cee94dbdf5b 100644 --- a/enterprise/tailnet/coordinator_test.go +++ b/enterprise/tailnet/coordinator_test.go @@ -11,6 +11,7 @@ import ( 
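// A minimal sketch (not part of the patch) of the pipe-delimited handshake
// layout used on the "wireguard_peers" channel above: the message is
// coordinator id | event type | agent id | payload, where the payload is
// empty for hello-style events and the subscriber rejects anything that does
// not split into exactly four fields. Helper names here are illustrative.
package main

import (
	"bytes"
	"fmt"

	"github.com/google/uuid"
)

func formatMessage(coordinatorID uuid.UUID, event string, agentID uuid.UUID, payload []byte) []byte {
	buf := bytes.Buffer{}
	buf.WriteString(coordinatorID.String() + "|")
	buf.WriteString(event + "|")
	buf.WriteString(agentID.String() + "|")
	buf.Write(payload)
	return buf.Bytes()
}

func parseMessage(msg []byte) (coordinator, event, agent, payload []byte, err error) {
	sp := bytes.Split(msg, []byte("|"))
	if len(sp) != 4 {
		return nil, nil, nil, nil, fmt.Errorf("invalid wireguard peer message: %q", msg)
	}
	return sp[0], sp[1], sp[2], sp[3], nil
}

func main() {
	msg := formatMessage(uuid.New(), "clienthello", uuid.New(), nil)
	_, event, _, _, err := parseMessage(msg)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(event)) // prints "clienthello"
}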
"cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/coderd/database" + "github.com/coder/coder/coderd/database/dbtestutil" "github.com/coder/coder/enterprise/tailnet" agpl "github.com/coder/coder/tailnet" "github.com/coder/coder/testutil" @@ -167,16 +168,12 @@ func TestCoordinatorHA(t *testing.T) { t.Run("AgentWithClient", func(t *testing.T) { t.Parallel() - pubsub := database.NewPubsubInMemory() + _, pubsub := dbtestutil.NewDB(t) coordinator1, err := tailnet.NewCoordinator(slogtest.Make(t, nil), pubsub) require.NoError(t, err) defer coordinator1.Close() - coordinator2, err := tailnet.NewCoordinator(slogtest.Make(t, nil), pubsub) - require.NoError(t, err) - defer coordinator2.Close() - agentWS, agentServerWS := net.Pipe() defer agentWS.Close() agentNodeChan := make(chan []*agpl.Node) @@ -196,6 +193,10 @@ func TestCoordinatorHA(t *testing.T) { return coordinator1.Node(agentID) != nil }, testutil.WaitShort, testutil.IntervalFast) + coordinator2, err := tailnet.NewCoordinator(slogtest.Make(t, nil), pubsub) + require.NoError(t, err) + defer coordinator2.Close() + clientWS, clientServerWS := net.Pipe() defer clientWS.Close() defer clientServerWS.Close() From 7a40bf801f89fc09d35fa1bc9716504f09595f2c Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 02:52:02 +0000 Subject: [PATCH 76/79] Run pubsub loop in a queue --- enterprise/tailnet/coordinator.go | 202 +++++++++++++++++------------- 1 file changed, 113 insertions(+), 89 deletions(-) diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index 2f284cb00ff61..f001d4a9643dd 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -21,17 +21,19 @@ import ( // NewCoordinator creates a new high availability coordinator // that uses PostgreSQL pubsub to exchange handshakes. func NewCoordinator(logger slog.Logger, pubsub database.Pubsub) (agpl.Coordinator, error) { + ctx, cancelFunc := context.WithCancel(context.Background()) coord := &haCoordinator{ id: uuid.New(), log: logger, pubsub: pubsub, + closeFunc: cancelFunc, close: make(chan struct{}), nodes: map[uuid.UUID]*agpl.Node{}, agentSockets: map[uuid.UUID]net.Conn{}, agentToConnectionSockets: map[uuid.UUID]map[uuid.UUID]net.Conn{}, } - if err := coord.runPubsub(); err != nil { + if err := coord.runPubsub(ctx); err != nil { return nil, xerrors.Errorf("run coordinator pubsub: %w", err) } @@ -39,11 +41,12 @@ func NewCoordinator(logger slog.Logger, pubsub database.Pubsub) (agpl.Coordinato } type haCoordinator struct { - id uuid.UUID - log slog.Logger - mutex sync.RWMutex - pubsub database.Pubsub - close chan struct{} + id uuid.UUID + log slog.Logger + mutex sync.RWMutex + pubsub database.Pubsub + close chan struct{} + closeFunc context.CancelFunc // nodes maps agent and connection IDs their respective node. 
nodes map[uuid.UUID]*agpl.Node @@ -303,6 +306,7 @@ func (c *haCoordinator) Close() error { default: } close(c.close) + c.closeFunc() wg := sync.WaitGroup{} @@ -384,111 +388,131 @@ func (c *haCoordinator) publishAgentToNodes(id uuid.UUID, node *agpl.Node) error return nil } -func (c *haCoordinator) runPubsub() error { +func (c *haCoordinator) runPubsub(ctx context.Context) error { + messageQueue := make(chan []byte, 64) cancelSub, err := c.pubsub.Subscribe("wireguard_peers", func(ctx context.Context, message []byte) { - sp := bytes.Split(message, []byte("|")) - if len(sp) != 4 { - c.log.Error(ctx, "invalid wireguard peer message", slog.F("msg", string(message))) + select { + case messageQueue <- message: + case <-ctx.Done(): return } + }) + if err != nil { + return xerrors.Errorf("subscribe wireguard peers") + } + go func() { + for { + var message []byte + select { + case <-ctx.Done(): + return + case message = <-messageQueue: + } + c.handlePubsubMessage(ctx, message) + } + }() + + go func() { + defer cancelSub() + <-c.close + }() + + return nil +} + +func (c *haCoordinator) handlePubsubMessage(ctx context.Context, message []byte) { + sp := bytes.Split(message, []byte("|")) + if len(sp) != 4 { + c.log.Error(ctx, "invalid wireguard peer message", slog.F("msg", string(message))) + return + } + + var ( + coordinatorID = sp[0] + eventType = sp[1] + agentID = sp[2] + nodeJSON = sp[3] + ) - var ( - coordinatorID = sp[0] - eventType = sp[1] - agentID = sp[2] - nodeJSON = sp[3] - ) + sender, err := uuid.ParseBytes(coordinatorID) + if err != nil { + c.log.Error(ctx, "invalid sender id", slog.F("id", string(coordinatorID)), slog.F("msg", string(message))) + return + } - sender, err := uuid.ParseBytes(coordinatorID) + // We sent this message! + if sender == c.id { + return + } + + switch string(eventType) { + case "callmemaybe": + agentUUID, err := uuid.ParseBytes(agentID) if err != nil { - c.log.Error(ctx, "invalid sender id", slog.F("id", string(coordinatorID)), slog.F("msg", string(message))) + c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) return } - // We sent this message! - if sender == c.id { + c.mutex.Lock() + agentSocket, ok := c.agentSockets[agentUUID] + if !ok { + c.mutex.Unlock() return } + c.mutex.Unlock() - switch string(eventType) { - case "callmemaybe": - agentUUID, err := uuid.ParseBytes(agentID) - if err != nil { - c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) - return - } - - c.mutex.Lock() - agentSocket, ok := c.agentSockets[agentUUID] - if !ok { - c.mutex.Unlock() - return - } - c.mutex.Unlock() - - // We get a single node over pubsub, so turn into an array. - _, err = agentSocket.Write(nodeJSON) - if err != nil { - if errors.Is(err, io.EOF) { - return - } - c.log.Error(ctx, "send callmemaybe to agent", slog.Error(err)) - return - } - case "clienthello": - agentUUID, err := uuid.ParseBytes(agentID) - if err != nil { - c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + // We get a single node over pubsub, so turn into an array. 
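// A minimal sketch (not part of the patch) of the queueing pattern used in
// runPubsub above: the subscription callback only enqueues into a buffered
// channel, and a single goroutine drains it, so messages are handled one at a
// time and in arrival order. The handler and buffer size are illustrative.
package main

import (
	"context"
	"fmt"
	"time"
)

func startQueue(ctx context.Context, handle func(context.Context, []byte)) func([]byte) {
	queue := make(chan []byte, 64)
	go func() {
		for {
			select {
			case <-ctx.Done():
				return
			case msg := <-queue:
				handle(ctx, msg)
			}
		}
	}()
	// The returned function stands in for the pubsub callback: it never
	// processes messages itself, it only hands them to the single consumer.
	return func(msg []byte) {
		select {
		case queue <- msg:
		case <-ctx.Done():
		}
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	enqueue := startQueue(ctx, func(_ context.Context, msg []byte) {
		fmt.Println("handled:", string(msg))
	})
	enqueue([]byte("agenthello"))
	enqueue([]byte("agentupdate"))
	time.Sleep(100 * time.Millisecond) // give the consumer time to drain
}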
+ _, err = agentSocket.Write(nodeJSON) + if err != nil { + if errors.Is(err, io.EOF) { return } + c.log.Error(ctx, "send callmemaybe to agent", slog.Error(err)) + return + } + case "clienthello": + agentUUID, err := uuid.ParseBytes(agentID) + if err != nil { + c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + return + } - err = c.handleClientHello(agentUUID) - if err != nil { - c.log.Error(ctx, "handle agent request node", slog.Error(err)) - return - } - case "agenthello": - agentUUID, err := uuid.ParseBytes(agentID) - if err != nil { - c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) - return - } + err = c.handleClientHello(agentUUID) + if err != nil { + c.log.Error(ctx, "handle agent request node", slog.Error(err)) + return + } + case "agenthello": + agentUUID, err := uuid.ParseBytes(agentID) + if err != nil { + c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + return + } - nodes := c.nodesSubscribedToAgent(agentUUID) - if len(nodes) > 0 { - err := c.publishNodesToAgent(agentUUID, nodes) - if err != nil { - c.log.Error(ctx, "publish nodes to agent", slog.Error(err)) - return - } - } - case "agentupdate": - agentUUID, err := uuid.ParseBytes(agentID) + nodes := c.nodesSubscribedToAgent(agentUUID) + if len(nodes) > 0 { + err := c.publishNodesToAgent(agentUUID, nodes) if err != nil { - c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + c.log.Error(ctx, "publish nodes to agent", slog.Error(err)) return } + } + case "agentupdate": + agentUUID, err := uuid.ParseBytes(agentID) + if err != nil { + c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + return + } - decoder := json.NewDecoder(bytes.NewReader(nodeJSON)) - _, err = c.handleAgentUpdate(agentUUID, decoder) - if err != nil { - c.log.Error(ctx, "handle agent update", slog.Error(err)) - return - } - default: - c.log.Error(ctx, "unknown peer event", slog.F("name", string(eventType))) + decoder := json.NewDecoder(bytes.NewReader(nodeJSON)) + _, err = c.handleAgentUpdate(agentUUID, decoder) + if err != nil { + c.log.Error(ctx, "handle agent update", slog.Error(err)) + return } - }) - if err != nil { - return xerrors.Errorf("subscribe wireguard peers") + default: + c.log.Error(ctx, "unknown peer event", slog.F("name", string(eventType))) } - - go func() { - defer cancelSub() - <-c.close - }() - - return nil } // format: |callmemaybe|| From 08b9681baac814f970b455432d845a6028a80779 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 03:03:16 +0000 Subject: [PATCH 77/79] Store agent nodes in order --- enterprise/tailnet/coordinator.go | 7 +++++++ tailnet/conn.go | 1 + tailnet/coordinator.go | 2 ++ 3 files changed, 10 insertions(+) diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index f001d4a9643dd..da3845f70b4c3 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -266,6 +266,13 @@ func (c *haCoordinator) handleAgentUpdate(id uuid.UUID, decoder *json.Decoder) ( } c.mutex.Lock() + oldNode := c.nodes[id] + if oldNode != nil { + if oldNode.AsOf.After(node.AsOf) { + c.mutex.Unlock() + return oldNode, nil + } + } c.nodes[id] = &node connectionSockets, ok := c.agentToConnectionSockets[id] if !ok { diff --git a/tailnet/conn.go b/tailnet/conn.go index 2f2549718880d..e3af3786ec92f 100644 --- a/tailnet/conn.go +++ b/tailnet/conn.go @@ -435,6 +435,7 @@ func (c *Conn) sendNode() { } node := &Node{ ID: c.netMap.SelfNode.ID, + AsOf: c.lastStatus, Key: c.netMap.SelfNode.Key, 
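// A minimal sketch (not part of the patch) of the AsOf ordering check added
// above: an incoming node only replaces the stored one if the stored one is
// not strictly newer, so late pubsub deliveries cannot roll a peer back to an
// older state. The node type here is a pared-down stand-in for tailnet.Node.
package main

import (
	"fmt"
	"time"
)

type node struct {
	ID   int64
	AsOf time.Time
}

func applyUpdate(store map[int64]*node, update *node) *node {
	old, ok := store[update.ID]
	if ok && old.AsOf.After(update.AsOf) {
		return old // stale update: keep the newer node we already have
	}
	store[update.ID] = update
	return update
}

func main() {
	store := map[int64]*node{}
	now := time.Now()
	applyUpdate(store, &node{ID: 1, AsOf: now})
	kept := applyUpdate(store, &node{ID: 1, AsOf: now.Add(-time.Minute)}) // arrives late
	fmt.Println(kept.AsOf.Equal(now)) // prints "true": the newer node wins
}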
Addresses: c.netMap.SelfNode.Addresses, AllowedIPs: c.netMap.SelfNode.AllowedIPs, diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index 491c0db885224..52c1fa1e66ec4 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -37,6 +37,8 @@ type Coordinator interface { type Node struct { // ID is used to identify the connection. ID tailcfg.NodeID `json:"id"` + // AsOf is the time the node was created. + AsOf time.Time `json:"as_of"` // Key is the Wireguard public key of the node. Key key.NodePublic `json:"key"` // DiscoKey is used for discovery messages over DERP to establish peer-to-peer connections. From 79991a939139c64d0b66582d32aeb7078befb43c Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 03:18:19 +0000 Subject: [PATCH 78/79] Fix coordinator locking --- tailnet/coordinator.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index 52c1fa1e66ec4..4216bbc624d48 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -246,7 +246,6 @@ func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { } sockets, ok := c.agentToConnectionSockets[id] - c.mutex.Unlock() if ok { // Publish all nodes that want to connect to the // desired agent ID. @@ -258,21 +257,21 @@ func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { } nodes = append(nodes, node) } + c.mutex.Unlock() data, err := json.Marshal(nodes) if err != nil { - c.mutex.Unlock() return xerrors.Errorf("marshal json: %w", err) } _, err = conn.Write(data) if err != nil { return xerrors.Errorf("write nodes: %w", err) } + c.mutex.Lock() } // If an old agent socket is connected, we close it // to avoid any leaks. This shouldn't ever occur because // we expect one agent to be running. - c.mutex.Lock() oldAgentSocket, ok := c.agentSockets[id] if ok { _ = oldAgentSocket.Close() From 020171b65e4a19faebc97f2983cf740c8d53755f Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 04:06:01 +0000 Subject: [PATCH 79/79] Check for closed pipe --- enterprise/tailnet/coordinator.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index da3845f70b4c3..5749d9ef47c7a 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -121,7 +121,7 @@ func (c *haCoordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID for { err := c.handleNextClientMessage(id, agent, decoder) if err != nil { - if errors.Is(err, io.EOF) { + if errors.Is(err, io.EOF) || errors.Is(err, io.ErrClosedPipe) { return nil } return xerrors.Errorf("handle next client message: %w", err) @@ -163,7 +163,7 @@ func (c *haCoordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *js _, err = agentSocket.Write(data) if err != nil { - if errors.Is(err, io.EOF) { + if errors.Is(err, io.EOF) || errors.Is(err, io.ErrClosedPipe) { return nil } return xerrors.Errorf("write json: %w", err) @@ -215,7 +215,7 @@ func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { for { node, err := c.handleAgentUpdate(id, decoder) if err != nil { - if errors.Is(err, io.EOF) { + if errors.Is(err, io.EOF) || errors.Is(err, io.ErrClosedPipe) { return nil } return xerrors.Errorf("handle next agent message: %w", err) @@ -471,7 +471,7 @@ func (c *haCoordinator) handlePubsubMessage(ctx context.Context, message []byte) // We get a single node over pubsub, so turn into an array. 
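// A minimal sketch (not part of the patch) of the locking change above:
// snapshot shared state while holding the mutex, then release it before the
// blocking conn.Write so one slow peer cannot stall every other caller. The
// broadcaster type and its fields are illustrative, not the coordinator's API.
package main

import (
	"encoding/json"
	"fmt"
	"net"
	"sync"
)

type broadcaster struct {
	mu    sync.Mutex
	nodes map[string]int
}

func (b *broadcaster) send(conn net.Conn) error {
	b.mu.Lock()
	snapshot := make(map[string]int, len(b.nodes))
	for k, v := range b.nodes {
		snapshot[k] = v
	}
	b.mu.Unlock() // release before marshalling and the blocking write

	data, err := json.Marshal(snapshot)
	if err != nil {
		return err
	}
	_, err = conn.Write(data)
	return err
}

func main() {
	b := &broadcaster{nodes: map[string]int{"agent": 1}}
	client, server := net.Pipe()
	go func() {
		_ = b.send(client)
	}()
	buf := make([]byte, 64)
	n, _ := server.Read(buf)
	fmt.Println(string(buf[:n])) // prints {"agent":1}
}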
_, err = agentSocket.Write(nodeJSON) if err != nil { - if errors.Is(err, io.EOF) { + if errors.Is(err, io.EOF) || errors.Is(err, io.ErrClosedPipe) { return } c.log.Error(ctx, "send callmemaybe to agent", slog.Error(err))
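The closed-pipe checks in this last patch matter because the coordinator tests above drive the serve loops over net.Pipe: closing one end surfaces io.EOF on reads and io.ErrClosedPipe on writes from the other end, and both should end the loops quietly rather than as errors. A minimal sketch of that classification, assuming the isClosedConn helper name is illustrative and not part of the patch:

package main

import (
	"errors"
	"fmt"
	"io"
	"net"
)

// isClosedConn mirrors the checks added in the patch: both io.EOF and
// io.ErrClosedPipe signal that the peer went away normally, so the serve
// loop returns nil instead of reporting a failure.
func isClosedConn(err error) bool {
	return errors.Is(err, io.EOF) || errors.Is(err, io.ErrClosedPipe)
}

func main() {
	a, b := net.Pipe()
	_ = a.Close()

	_, readErr := b.Read(make([]byte, 1))
	_, writeErr := b.Write([]byte("x"))

	fmt.Println(isClosedConn(readErr))  // true: reading from a closed pipe reports io.EOF
	fmt.Println(isClosedConn(writeErr)) // true: writing to a closed pipe reports io.ErrClosedPipe
}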