diff --git a/agent/agent.go b/agent/agent.go index fd816c8f3389f..e3cac00663c71 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -27,6 +27,7 @@ import ( "github.com/spf13/afero" "go.uber.org/atomic" "golang.org/x/exp/slices" + "golang.org/x/sync/errgroup" "golang.org/x/xerrors" "tailscale.com/net/speedtest" "tailscale.com/tailcfg" @@ -72,6 +73,7 @@ type Options struct { type Client interface { Manifest(ctx context.Context) (agentsdk.Manifest, error) Listen(ctx context.Context) (net.Conn, error) + DERPMapUpdates(ctx context.Context) (<-chan agentsdk.DERPMapUpdate, io.Closer, error) ReportStats(ctx context.Context, log slog.Logger, statsChan <-chan *agentsdk.Stats, setInterval func(time.Duration)) (io.Closer, error) PostLifecycle(ctx context.Context, state agentsdk.PostLifecycleRequest) error PostAppHealth(ctx context.Context, req agentsdk.PostAppHealthsRequest) error @@ -699,12 +701,26 @@ func (a *agent) run(ctx context.Context) error { network.SetBlockEndpoints(manifest.DisableDirectConnections) } - a.logger.Debug(ctx, "running tailnet connection coordinator") - err = a.runCoordinator(ctx, network) - if err != nil { - return xerrors.Errorf("run coordinator: %w", err) - } - return nil + eg, egCtx := errgroup.WithContext(ctx) + eg.Go(func() error { + a.logger.Debug(egCtx, "running tailnet connection coordinator") + err := a.runCoordinator(egCtx, network) + if err != nil { + return xerrors.Errorf("run coordinator: %w", err) + } + return nil + }) + + eg.Go(func() error { + a.logger.Debug(egCtx, "running derp map subscriber") + err := a.runDERPMapSubscriber(egCtx, network) + if err != nil { + return xerrors.Errorf("run derp map subscriber: %w", err) + } + return nil + }) + + return eg.Wait() } func (a *agent) wireguardAddresses(agentID uuid.UUID) []netip.Prefix { @@ -927,6 +943,34 @@ func (a *agent) runCoordinator(ctx context.Context, network *tailnet.Conn) error } } +// runDERPMapSubscriber subscribes to DERP map updates from the server and applies changed maps to the agent's tailnet connection, returning when the subscription fails or the context is canceled. 
+func (a *agent) runDERPMapSubscriber(ctx context.Context, network *tailnet.Conn) error { + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + updates, closer, err := a.client.DERPMapUpdates(ctx) + if err != nil { + return err + } + defer closer.Close() + + a.logger.Info(ctx, "connected to derp map endpoint") + for { + select { + case <-ctx.Done(): + return ctx.Err() + case update := <-updates: + if update.Err != nil { + return update.Err + } + if update.DERPMap != nil && !tailnet.CompareDERPMaps(network.DERPMap(), update.DERPMap) { + a.logger.Info(ctx, "updating derp map due to detected changes") + network.SetDERPMap(update.DERPMap) + } + } + } +} + func (a *agent) runStartupScript(ctx context.Context, script string) error { return a.runScript(ctx, "startup", script) } diff --git a/agent/agent_test.go b/agent/agent_test.go index 94a9c5326e950..d897951496896 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -1717,6 +1717,120 @@ func TestAgent_Dial(t *testing.T) { } } +// TestAgent_UpdatedDERP checks that agents can handle their DERP map being +// updated, and that clients can also handle it. +func TestAgent_UpdatedDERP(t *testing.T) { + t.Parallel() + + logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug) + + originalDerpMap, _ := tailnettest.RunDERPAndSTUN(t) + require.NotNil(t, originalDerpMap) + + coordinator := tailnet.NewCoordinator(logger) + defer func() { + _ = coordinator.Close() + }() + agentID := uuid.New() + statsCh := make(chan *agentsdk.Stats, 50) + fs := afero.NewMemMapFs() + client := agenttest.NewClient(t, + logger.Named("agent"), + agentID, + agentsdk.Manifest{ + DERPMap: originalDerpMap, + // Force DERP. + DisableDirectConnections: true, + }, + statsCh, + coordinator, + ) + closer := agent.New(agent.Options{ + Client: client, + Filesystem: fs, + Logger: logger.Named("agent"), + ReconnectingPTYTimeout: time.Minute, + }) + defer func() { + _ = closer.Close() + }() + + // Setup a client connection. 
+ newClientConn := func(derpMap *tailcfg.DERPMap) *codersdk.WorkspaceAgentConn { + conn, err := tailnet.NewConn(&tailnet.Options{ + Addresses: []netip.Prefix{netip.PrefixFrom(tailnet.IP(), 128)}, + DERPMap: derpMap, + Logger: logger.Named("client"), + }) + require.NoError(t, err) + clientConn, serverConn := net.Pipe() + serveClientDone := make(chan struct{}) + t.Cleanup(func() { + _ = clientConn.Close() + _ = serverConn.Close() + _ = conn.Close() + <-serveClientDone + }) + go func() { + defer close(serveClientDone) + err := coordinator.ServeClient(serverConn, uuid.New(), agentID) + assert.NoError(t, err) + }() + sendNode, _ := tailnet.ServeCoordinator(clientConn, func(nodes []*tailnet.Node) error { + return conn.UpdateNodes(nodes, false) + }) + conn.SetNodeCallback(sendNode) + // Force DERP. + conn.SetBlockEndpoints(true) + + sdkConn := codersdk.NewWorkspaceAgentConn(conn, codersdk.WorkspaceAgentConnOptions{ + AgentID: agentID, + CloseFunc: func() error { return codersdk.ErrSkipClose }, + }) + t.Cleanup(func() { + _ = sdkConn.Close() + }) + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) + defer cancel() + if !sdkConn.AwaitReachable(ctx) { + t.Fatal("agent not reachable") + } + + return sdkConn + } + conn1 := newClientConn(originalDerpMap) + + // Change the DERP map. + newDerpMap, _ := tailnettest.RunDERPAndSTUN(t) + require.NotNil(t, newDerpMap) + + // Change the region ID. + newDerpMap.Regions[2] = newDerpMap.Regions[1] + delete(newDerpMap.Regions, 1) + newDerpMap.Regions[2].RegionID = 2 + for _, node := range newDerpMap.Regions[2].Nodes { + node.RegionID = 2 + } + + // Push a new DERP map to the agent. + err := client.PushDERPMapUpdate(agentsdk.DERPMapUpdate{ + DERPMap: newDerpMap, + }) + require.NoError(t, err) + + // Connect from a second client and make sure it uses the new DERP map. 
+ conn2 := newClientConn(newDerpMap) + require.Equal(t, []int{2}, conn2.DERPMap().RegionIDs()) + + // If the first client gets a DERP map update, it should be able to + // reconnect just fine. + conn1.SetDERPMap(newDerpMap) + require.Equal(t, []int{2}, conn1.DERPMap().RegionIDs()) + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) + defer cancel() + require.True(t, conn1.AwaitReachable(ctx)) +} + func TestAgent_Speedtest(t *testing.T) { t.Parallel() t.Skip("This test is relatively flakey because of Tailscale's speedtest code...") @@ -1940,8 +2054,8 @@ func setupAgent(t *testing.T, metadata agentsdk.Manifest, ptyTimeout time.Durati defer close(serveClientDone) coordinator.ServeClient(serverConn, uuid.New(), metadata.AgentID) }() - sendNode, _ := tailnet.ServeCoordinator(clientConn, func(node []*tailnet.Node) error { - return conn.UpdateNodes(node, false) + sendNode, _ := tailnet.ServeCoordinator(clientConn, func(nodes []*tailnet.Node) error { + return conn.UpdateNodes(nodes, false) }) conn.SetNodeCallback(sendNode) agentConn := codersdk.NewWorkspaceAgentConn(conn, codersdk.WorkspaceAgentConnOptions{ diff --git a/agent/agenttest/client.go b/agent/agenttest/client.go index a87607cf2dab8..81b0fcecb8d22 100644 --- a/agent/agenttest/client.go +++ b/agent/agenttest/client.go @@ -10,11 +10,13 @@ import ( "github.com/google/uuid" "golang.org/x/exp/maps" + "golang.org/x/xerrors" "cdr.dev/slog" "github.com/coder/coder/codersdk" "github.com/coder/coder/codersdk/agentsdk" "github.com/coder/coder/tailnet" + "github.com/coder/coder/testutil" ) func NewClient(t testing.TB, @@ -28,12 +30,13 @@ func NewClient(t testing.TB, manifest.AgentID = agentID } return &Client{ - t: t, - logger: logger.Named("client"), - agentID: agentID, - manifest: manifest, - statsChan: statsChan, - coordinator: coordinator, + t: t, + logger: logger.Named("client"), + agentID: agentID, + manifest: manifest, + statsChan: statsChan, + coordinator: coordinator, + derpMapUpdates: 
make(chan agentsdk.DERPMapUpdate), } } @@ -53,6 +56,7 @@ type Client struct { lifecycleStates []codersdk.WorkspaceAgentLifecycle startup agentsdk.PostStartupRequest logs []agentsdk.StartupLog + derpMapUpdates chan agentsdk.DERPMapUpdate } func (c *Client) Manifest(_ context.Context) (agentsdk.Manifest, error) { @@ -191,6 +195,26 @@ func (c *Client) GetServiceBanner(ctx context.Context) (codersdk.ServiceBannerCo return codersdk.ServiceBannerConfig{}, nil } +func (c *Client) PushDERPMapUpdate(update agentsdk.DERPMapUpdate) error { + timer := time.NewTimer(testutil.WaitShort) + defer timer.Stop() + select { + case c.derpMapUpdates <- update: + case <-timer.C: + return xerrors.New("timeout waiting to push derp map update") + } + + return nil +} + +func (c *Client) DERPMapUpdates(_ context.Context) (<-chan agentsdk.DERPMapUpdate, io.Closer, error) { + closed := make(chan struct{}) + return c.derpMapUpdates, closeFunc(func() error { + close(closed) + return nil + }), nil +} + type closeFunc func() error func (c closeFunc) Close() error { diff --git a/cli/netcheck.go b/cli/netcheck.go index bd52a2c87f718..b670e9c12b8ed 100644 --- a/cli/netcheck.go +++ b/cli/netcheck.go @@ -26,7 +26,7 @@ func (r *RootCmd) netcheck() *clibase.Cmd { ctx, cancel := context.WithTimeout(inv.Context(), 30*time.Second) defer cancel() - connInfo, err := client.WorkspaceAgentConnectionInfo(ctx) + connInfo, err := client.WorkspaceAgentConnectionInfoGeneric(ctx) if err != nil { return err } diff --git a/cli/server.go b/cli/server.go index 4ae651853c2db..29de70def1466 100644 --- a/cli/server.go +++ b/cli/server.go @@ -477,7 +477,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. 
AppHostnameRegex: appHostnameRegex, Logger: logger.Named("coderd"), Database: dbfake.New(), - DERPMap: derpMap, + BaseDERPMap: derpMap, Pubsub: pubsub.NewInMemory(), CacheDir: cacheDir, GoogleTokenValidator: googleTokenValidator, @@ -822,7 +822,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. if cfg.Prometheus.Enable { // Agent metrics require reference to the tailnet coordinator, so must be initiated after Coder API. - closeAgentsFunc, err := prometheusmetrics.Agents(ctx, logger, options.PrometheusRegistry, coderAPI.Database, &coderAPI.TailnetCoordinator, options.DERPMap, coderAPI.Options.AgentInactiveDisconnectTimeout, 0) + closeAgentsFunc, err := prometheusmetrics.Agents(ctx, logger, options.PrometheusRegistry, coderAPI.Database, &coderAPI.TailnetCoordinator, coderAPI.DERPMap, coderAPI.Options.AgentInactiveDisconnectTimeout, 0) if err != nil { return xerrors.Errorf("register agents prometheus metric: %w", err) } diff --git a/coderd/apidoc/docs.go b/coderd/apidoc/docs.go index f6cff3d136c00..7612020709a89 100644 --- a/coderd/apidoc/docs.go +++ b/coderd/apidoc/docs.go @@ -530,6 +530,25 @@ const docTemplate = `{ } } }, + "/derp-map": { + "get": { + "security": [ + { + "CoderSessionToken": [] + } + ], + "tags": [ + "Agents" + ], + "summary": "Get DERP map updates", + "operationId": "get-derp-map-updates", + "responses": { + "101": { + "description": "Switching Protocols" + } + } + } + }, "/entitlements": { "get": { "security": [ @@ -5343,28 +5362,36 @@ const docTemplate = `{ } } }, - "/workspaceproxies/me/goingaway": { + "/workspaceproxies/me/deregister": { "post": { "security": [ { "CoderSessionToken": [] } ], - "produces": [ + "consumes": [ "application/json" ], "tags": [ "Enterprise" ], - "summary": "Workspace proxy going away", - "operationId": "workspace-proxy-going-away", - "responses": { - "201": { - "description": "Created", + "summary": "Deregister workspace proxy", + "operationId": "deregister-workspace-proxy", + 
"parameters": [ + { + "description": "Deregister workspace proxy request", + "name": "request", + "in": "body", + "required": true, "schema": { - "$ref": "#/definitions/codersdk.Response" + "$ref": "#/definitions/wsproxysdk.DeregisterWorkspaceProxyRequest" } } + ], + "responses": { + "204": { + "description": "No Content" + } }, "x-apidocgen": { "skip": true @@ -5433,7 +5460,7 @@ const docTemplate = `{ "operationId": "register-workspace-proxy", "parameters": [ { - "description": "Issue signed app token request", + "description": "Register workspace proxy request", "name": "request", "in": "body", "required": true, @@ -10713,6 +10740,9 @@ const docTemplate = `{ "deleted": { "type": "boolean" }, + "derp_enabled": { + "type": "boolean" + }, "display_name": { "type": "string" }, @@ -11412,6 +11442,15 @@ const docTemplate = `{ } } }, + "wsproxysdk.DeregisterWorkspaceProxyRequest": { + "type": "object", + "properties": { + "replica_id": { + "description": "ReplicaID is a unique identifier for the replica of the proxy that is\nderegistering. It should be generated by the client on startup and\nshould've already been passed to the register endpoint.", + "type": "string" + } + } + }, "wsproxysdk.IssueSignedAppTokenResponse": { "type": "object", "properties": { @@ -11428,6 +11467,30 @@ const docTemplate = `{ "description": "AccessURL that hits the workspace proxy api.", "type": "string" }, + "derp_enabled": { + "description": "DerpEnabled indicates whether the proxy should be included in the DERP\nmap or not.", + "type": "boolean" + }, + "hostname": { + "description": "ReplicaHostname is the OS hostname of the machine that the proxy is running\non. This is only used for tracking purposes in the replicas table.", + "type": "string" + }, + "replica_error": { + "description": "ReplicaError is the error that the replica encountered when trying to\ndial it's peers. 
This is stored in the replicas table for debugging\npurposes but does not affect the proxy's ability to register.\n\nThis value is only stored on subsequent requests to the register\nendpoint, not the first request.", + "type": "string" + }, + "replica_id": { + "description": "ReplicaID is a unique identifier for the replica of the proxy that is\nregistering. It should be generated by the client on startup and\npersisted (in memory only) until the process is restarted.", + "type": "string" + }, + "replica_relay_address": { + "description": "ReplicaRelayAddress is the DERP address of the replica that other\nreplicas may use to connect internally for DERP meshing.", + "type": "string" + }, + "version": { + "description": "Version is the Coder version of the proxy.", + "type": "string" + }, "wildcard_hostname": { "description": "WildcardHostname that the workspace proxy api is serving for subdomain apps.", "type": "string" @@ -11439,6 +11502,19 @@ const docTemplate = `{ "properties": { "app_security_key": { "type": "string" + }, + "derp_mesh_key": { + "type": "string" + }, + "derp_region_id": { + "type": "integer" + }, + "sibling_replicas": { + "description": "SiblingReplicas is a list of all other replicas of the proxy that have\nnot timed out.", + "type": "array", + "items": { + "$ref": "#/definitions/codersdk.Replica" + } } } } diff --git a/coderd/apidoc/swagger.json b/coderd/apidoc/swagger.json index d3bc648e764ec..2dbc5da3ed05e 100644 --- a/coderd/apidoc/swagger.json +++ b/coderd/apidoc/swagger.json @@ -450,6 +450,23 @@ } } }, + "/derp-map": { + "get": { + "security": [ + { + "CoderSessionToken": [] + } + ], + "tags": ["Agents"], + "summary": "Get DERP map updates", + "operationId": "get-derp-map-updates", + "responses": { + "101": { + "description": "Switching Protocols" + } + } + } + }, "/entitlements": { "get": { "security": [ @@ -4709,24 +4726,32 @@ } } }, - "/workspaceproxies/me/goingaway": { + "/workspaceproxies/me/deregister": { "post": { "security": [ { 
"CoderSessionToken": [] } ], - "produces": ["application/json"], + "consumes": ["application/json"], "tags": ["Enterprise"], - "summary": "Workspace proxy going away", - "operationId": "workspace-proxy-going-away", - "responses": { - "201": { - "description": "Created", + "summary": "Deregister workspace proxy", + "operationId": "deregister-workspace-proxy", + "parameters": [ + { + "description": "Deregister workspace proxy request", + "name": "request", + "in": "body", + "required": true, "schema": { - "$ref": "#/definitions/codersdk.Response" + "$ref": "#/definitions/wsproxysdk.DeregisterWorkspaceProxyRequest" } } + ], + "responses": { + "204": { + "description": "No Content" + } }, "x-apidocgen": { "skip": true @@ -4783,7 +4808,7 @@ "operationId": "register-workspace-proxy", "parameters": [ { - "description": "Issue signed app token request", + "description": "Register workspace proxy request", "name": "request", "in": "body", "required": true, @@ -9719,6 +9744,9 @@ "deleted": { "type": "boolean" }, + "derp_enabled": { + "type": "boolean" + }, "display_name": { "type": "string" }, @@ -10406,6 +10434,15 @@ } } }, + "wsproxysdk.DeregisterWorkspaceProxyRequest": { + "type": "object", + "properties": { + "replica_id": { + "description": "ReplicaID is a unique identifier for the replica of the proxy that is\nderegistering. It should be generated by the client on startup and\nshould've already been passed to the register endpoint.", + "type": "string" + } + } + }, "wsproxysdk.IssueSignedAppTokenResponse": { "type": "object", "properties": { @@ -10422,6 +10459,30 @@ "description": "AccessURL that hits the workspace proxy api.", "type": "string" }, + "derp_enabled": { + "description": "DerpEnabled indicates whether the proxy should be included in the DERP\nmap or not.", + "type": "boolean" + }, + "hostname": { + "description": "ReplicaHostname is the OS hostname of the machine that the proxy is running\non. 
This is only used for tracking purposes in the replicas table.", + "type": "string" + }, + "replica_error": { + "description": "ReplicaError is the error that the replica encountered when trying to\ndial it's peers. This is stored in the replicas table for debugging\npurposes but does not affect the proxy's ability to register.\n\nThis value is only stored on subsequent requests to the register\nendpoint, not the first request.", + "type": "string" + }, + "replica_id": { + "description": "ReplicaID is a unique identifier for the replica of the proxy that is\nregistering. It should be generated by the client on startup and\npersisted (in memory only) until the process is restarted.", + "type": "string" + }, + "replica_relay_address": { + "description": "ReplicaRelayAddress is the DERP address of the replica that other\nreplicas may use to connect internally for DERP meshing.", + "type": "string" + }, + "version": { + "description": "Version is the Coder version of the proxy.", + "type": "string" + }, "wildcard_hostname": { "description": "WildcardHostname that the workspace proxy api is serving for subdomain apps.", "type": "string" @@ -10433,6 +10494,19 @@ "properties": { "app_security_key": { "type": "string" + }, + "derp_mesh_key": { + "type": "string" + }, + "derp_region_id": { + "type": "integer" + }, + "sibling_replicas": { + "description": "SiblingReplicas is a list of all other replicas of the proxy that have\nnot timed out.", + "type": "array", + "items": { + "$ref": "#/definitions/codersdk.Replica" + } } } } diff --git a/coderd/coderd.go b/coderd/coderd.go index 8fdd0fed26fd4..853d874ce6fa0 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -118,10 +118,13 @@ type Options struct { RealIPConfig *httpmw.RealIPConfig TrialGenerator func(ctx context.Context, email string) error // TLSCertificates is used to mesh DERP servers securely. 
- TLSCertificates []tls.Certificate - TailnetCoordinator tailnet.Coordinator - DERPServer *derp.Server - DERPMap *tailcfg.DERPMap + TLSCertificates []tls.Certificate + TailnetCoordinator tailnet.Coordinator + DERPServer *derp.Server + // BaseDERPMap is used as the base DERP map for all clients and agents. + // Proxies are added to this list. + BaseDERPMap *tailcfg.DERPMap + DERPMapUpdateFrequency time.Duration SwaggerEndpoint bool SetUserGroups func(ctx context.Context, tx database.Store, userID uuid.UUID, groupNames []string) error SetUserSiteRoles func(ctx context.Context, tx database.Store, userID uuid.UUID, roles []string) error @@ -236,12 +239,15 @@ func New(options *Options) *API { if options.PrometheusRegistry == nil { options.PrometheusRegistry = prometheus.NewRegistry() } - if options.TailnetCoordinator == nil { - options.TailnetCoordinator = tailnet.NewCoordinator(options.Logger) - } if options.DERPServer == nil { options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger.Named("derp"))) } + if options.DERPMapUpdateFrequency == 0 { + options.DERPMapUpdateFrequency = 5 * time.Second + } + if options.TailnetCoordinator == nil { + options.TailnetCoordinator = tailnet.NewCoordinator(options.Logger) + } if options.Auditor == nil { options.Auditor = audit.NewNop() } @@ -281,22 +287,6 @@ func New(options *Options) *API { v := schedule.NewAGPLUserQuietHoursScheduleStore() options.UserQuietHoursScheduleStore.Store(&v) } - if options.HealthcheckFunc == nil { - options.HealthcheckFunc = func(ctx context.Context, apiKey string) *healthcheck.Report { - return healthcheck.Run(ctx, &healthcheck.ReportOptions{ - DB: options.Database, - AccessURL: options.AccessURL, - DERPMap: options.DERPMap.Clone(), - APIKey: apiKey, - }) - } - } - if options.HealthcheckTimeout == 0 { - options.HealthcheckTimeout = 30 * time.Second - } - if options.HealthcheckRefresh == 0 { - options.HealthcheckRefresh = 10 * time.Minute - } siteCacheDir := options.CacheDir if 
siteCacheDir != "" { @@ -376,6 +366,22 @@ func New(options *Options) *API { *options.UpdateCheckOptions, ) } + if options.HealthcheckFunc == nil { + options.HealthcheckFunc = func(ctx context.Context, apiKey string) *healthcheck.Report { + return healthcheck.Run(ctx, &healthcheck.ReportOptions{ + DB: options.Database, + AccessURL: options.AccessURL, + DERPMap: api.DERPMap(), + APIKey: apiKey, + }) + } + } + if options.HealthcheckTimeout == 0 { + options.HealthcheckTimeout = 30 * time.Second + } + if options.HealthcheckRefresh == 0 { + options.HealthcheckRefresh = 10 * time.Minute + } var oidcAuthURLParams map[string]string if options.OIDCConfig != nil { @@ -388,7 +394,7 @@ func New(options *Options) *API { api.agentProvider, err = NewServerTailnet(api.ctx, options.Logger, options.DERPServer, - options.DERPMap, + options.BaseDERPMap, func(context.Context) (tailnet.MultiAgentConn, error) { return (*api.TailnetCoordinator.Load()).ServeMultiAgent(uuid.New()), nil }, @@ -544,6 +550,10 @@ func New(options *Options) *API { r.Use(apiKeyMiddleware) r.Get("/regions", api.regions) }) + r.Route("/derp-map", func(r chi.Router) { + // r.Use(apiKeyMiddleware) + r.Get("/", api.derpMapUpdates) + }) r.Route("/deployment", func(r chi.Router) { r.Use(apiKeyMiddleware) r.Get("/config", api.deploymentValues) @@ -953,6 +963,8 @@ type API struct { // UserQuietHoursScheduleStore is a pointer to an atomic pointer for the // same reason as TemplateScheduleStore. UserQuietHoursScheduleStore *atomic.Pointer[schedule.UserQuietHoursScheduleStore] + // DERPMapper mutates the DERPMap to include workspace proxies. 
+ DERPMapper atomic.Pointer[func(derpMap *tailcfg.DERPMap) *tailcfg.DERPMap] HTTPAuth *HTTPAuthorizer @@ -1107,6 +1119,15 @@ func (api *API) CreateInMemoryProvisionerDaemon(ctx context.Context, debounce ti return proto.NewDRPCProvisionerDaemonClient(clientSession), nil } +func (api *API) DERPMap() *tailcfg.DERPMap { + fn := api.DERPMapper.Load() + if fn != nil { + return (*fn)(api.Options.BaseDERPMap) + } + + return api.Options.BaseDERPMap +} + // nolint:revive func ReadExperiments(log slog.Logger, raw []string) codersdk.Experiments { exps := make([]codersdk.Experiment, 0, len(raw)) diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index 71882acec4a10..546bb60b1c1bd 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -385,7 +385,8 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can TLSCertificates: options.TLSCertificates, TrialGenerator: options.TrialGenerator, TailnetCoordinator: options.Coordinator, - DERPMap: derpMap, + BaseDERPMap: derpMap, + DERPMapUpdateFrequency: 150 * time.Millisecond, MetricsCacheRefreshInterval: options.MetricsCacheRefreshInterval, AgentStatsRefreshInterval: options.AgentStatsRefreshInterval, DeploymentValues: options.DeploymentValues, diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 348edf2644dc2..bd0c234883f36 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -1123,6 +1123,13 @@ func (q *querier) GetQuotaConsumedForUser(ctx context.Context, userID uuid.UUID) return q.db.GetQuotaConsumedForUser(ctx, userID) } +func (q *querier) GetReplicaByID(ctx context.Context, id uuid.UUID) (database.Replica, error) { + if err := q.authorizeContext(ctx, rbac.ActionRead, rbac.ResourceSystem); err != nil { + return database.Replica{}, err + } + return q.db.GetReplicaByID(ctx, id) +} + func (q *querier) GetReplicasUpdatedAfter(ctx context.Context, updatedAt time.Time) 
([]database.Replica, error) { if err := q.authorizeContext(ctx, rbac.ActionRead, rbac.ResourceSystem); err != nil { return nil, err diff --git a/coderd/database/dbfake/dbfake.go b/coderd/database/dbfake/dbfake.go index 21b026bf2c782..07f479bf2fae0 100644 --- a/coderd/database/dbfake/dbfake.go +++ b/coderd/database/dbfake/dbfake.go @@ -1820,6 +1820,19 @@ func (q *FakeQuerier) GetQuotaConsumedForUser(_ context.Context, userID uuid.UUI return sum, nil } +func (q *FakeQuerier) GetReplicaByID(_ context.Context, id uuid.UUID) (database.Replica, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + + for _, replica := range q.replicas { + if replica.ID == id { + return replica, nil + } + } + + return database.Replica{}, sql.ErrNoRows +} + func (q *FakeQuerier) GetReplicasUpdatedAfter(_ context.Context, updatedAt time.Time) ([]database.Replica, error) { q.mutex.RLock() defer q.mutex.RUnlock() @@ -3684,6 +3697,7 @@ func (q *FakeQuerier) InsertReplica(_ context.Context, arg database.InsertReplic RelayAddress: arg.RelayAddress, Version: arg.Version, DatabaseLatency: arg.DatabaseLatency, + Primary: arg.Primary, } q.replicas = append(q.replicas, replica) return replica, nil @@ -4125,10 +4139,14 @@ func (q *FakeQuerier) InsertWorkspaceProxy(_ context.Context, arg database.Inser q.mutex.Lock() defer q.mutex.Unlock() + lastRegionID := int32(0) for _, p := range q.workspaceProxies { if !p.Deleted && p.Name == arg.Name { return database.WorkspaceProxy{}, errDuplicateKey } + if p.RegionID > lastRegionID { + lastRegionID = p.RegionID + } } p := database.WorkspaceProxy{ @@ -4136,7 +4154,9 @@ func (q *FakeQuerier) InsertWorkspaceProxy(_ context.Context, arg database.Inser Name: arg.Name, DisplayName: arg.DisplayName, Icon: arg.Icon, + DerpEnabled: arg.DerpEnabled, TokenHashedSecret: arg.TokenHashedSecret, + RegionID: lastRegionID + 1, CreatedAt: arg.CreatedAt, UpdatedAt: arg.UpdatedAt, Deleted: false, @@ -4208,6 +4228,7 @@ func (q *FakeQuerier) RegisterWorkspaceProxy(_ context.Context, 
arg database.Reg if p.ID == arg.ID { p.Url = arg.Url p.WildcardHostname = arg.WildcardHostname + p.DerpEnabled = arg.DerpEnabled p.UpdatedAt = database.Now() q.workspaceProxies[i] = p return p, nil @@ -4419,6 +4440,7 @@ func (q *FakeQuerier) UpdateReplica(_ context.Context, arg database.UpdateReplic replica.Version = arg.Version replica.Error = arg.Error replica.DatabaseLatency = arg.DatabaseLatency + replica.Primary = arg.Primary q.replicas[index] = replica return replica, nil } diff --git a/coderd/database/dbmetrics/dbmetrics.go b/coderd/database/dbmetrics/dbmetrics.go index aca079e80818e..0c30eb354ef65 100644 --- a/coderd/database/dbmetrics/dbmetrics.go +++ b/coderd/database/dbmetrics/dbmetrics.go @@ -545,6 +545,13 @@ func (m metricsStore) GetQuotaConsumedForUser(ctx context.Context, ownerID uuid. return consumed, err } +func (m metricsStore) GetReplicaByID(ctx context.Context, id uuid.UUID) (database.Replica, error) { + start := time.Now() + replica, err := m.s.GetReplicaByID(ctx, id) + m.queryLatencies.WithLabelValues("GetReplicaByID").Observe(time.Since(start).Seconds()) + return replica, err +} + func (m metricsStore) GetReplicasUpdatedAfter(ctx context.Context, updatedAt time.Time) ([]database.Replica, error) { start := time.Now() replicas, err := m.s.GetReplicasUpdatedAfter(ctx, updatedAt) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 2e9d5042adbca..1b285c7ed384f 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -1076,6 +1076,21 @@ func (mr *MockStoreMockRecorder) GetQuotaConsumedForUser(arg0, arg1 interface{}) return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetQuotaConsumedForUser", reflect.TypeOf((*MockStore)(nil).GetQuotaConsumedForUser), arg0, arg1) } +// GetReplicaByID mocks base method. 
+func (m *MockStore) GetReplicaByID(arg0 context.Context, arg1 uuid.UUID) (database.Replica, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetReplicaByID", arg0, arg1) + ret0, _ := ret[0].(database.Replica) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetReplicaByID indicates an expected call of GetReplicaByID. +func (mr *MockStoreMockRecorder) GetReplicaByID(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetReplicaByID", reflect.TypeOf((*MockStore)(nil).GetReplicaByID), arg0, arg1) +} + // GetReplicasUpdatedAfter mocks base method. func (m *MockStore) GetReplicasUpdatedAfter(arg0 context.Context, arg1 time.Time) ([]database.Replica, error) { m.ctrl.T.Helper() diff --git a/coderd/database/dump.sql b/coderd/database/dump.sql index 3dfe832c621ad..82f1fdf615818 100644 --- a/coderd/database/dump.sql +++ b/coderd/database/dump.sql @@ -418,7 +418,8 @@ CREATE TABLE replicas ( relay_address text NOT NULL, database_latency integer NOT NULL, version text NOT NULL, - error text DEFAULT ''::text NOT NULL + error text DEFAULT ''::text NOT NULL, + "primary" boolean DEFAULT true NOT NULL ); CREATE TABLE site_configs ( @@ -862,7 +863,9 @@ CREATE TABLE workspace_proxies ( created_at timestamp with time zone NOT NULL, updated_at timestamp with time zone NOT NULL, deleted boolean NOT NULL, - token_hashed_secret bytea NOT NULL + token_hashed_secret bytea NOT NULL, + region_id integer NOT NULL, + derp_enabled boolean DEFAULT true NOT NULL ); COMMENT ON COLUMN workspace_proxies.icon IS 'Expects an emoji character. 
(/emojis/1f1fa-1f1f8.png)'; @@ -875,6 +878,16 @@ COMMENT ON COLUMN workspace_proxies.deleted IS 'Boolean indicator of a deleted w COMMENT ON COLUMN workspace_proxies.token_hashed_secret IS 'Hashed secret is used to authenticate the workspace proxy using a session token.'; +CREATE SEQUENCE workspace_proxies_region_id_seq + AS integer + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1; + +ALTER SEQUENCE workspace_proxies_region_id_seq OWNED BY workspace_proxies.region_id; + CREATE TABLE workspace_resource_metadata ( workspace_resource_id uuid NOT NULL, key character varying(1024) NOT NULL, @@ -927,6 +940,8 @@ ALTER TABLE ONLY provisioner_job_logs ALTER COLUMN id SET DEFAULT nextval('provi ALTER TABLE ONLY workspace_agent_startup_logs ALTER COLUMN id SET DEFAULT nextval('workspace_agent_startup_logs_id_seq'::regclass); +ALTER TABLE ONLY workspace_proxies ALTER COLUMN region_id SET DEFAULT nextval('workspace_proxies_region_id_seq'::regclass); + ALTER TABLE ONLY workspace_resource_metadata ALTER COLUMN id SET DEFAULT nextval('workspace_resource_metadata_id_seq'::regclass); ALTER TABLE ONLY workspace_agent_stats @@ -1058,6 +1073,9 @@ ALTER TABLE ONLY workspace_builds ALTER TABLE ONLY workspace_proxies ADD CONSTRAINT workspace_proxies_pkey PRIMARY KEY (id); +ALTER TABLE ONLY workspace_proxies + ADD CONSTRAINT workspace_proxies_region_id_unique UNIQUE (region_id); + ALTER TABLE ONLY workspace_resource_metadata ADD CONSTRAINT workspace_resource_metadata_name UNIQUE (workspace_resource_id, key); diff --git a/coderd/database/migrations/000142_proxy_derp.down.sql b/coderd/database/migrations/000142_proxy_derp.down.sql new file mode 100644 index 0000000000000..9937e47591ce5 --- /dev/null +++ b/coderd/database/migrations/000142_proxy_derp.down.sql @@ -0,0 +1,15 @@ +BEGIN; + +-- drop any rows that aren't primary replicas +DELETE FROM replicas + WHERE "primary" = false; + +ALTER TABLE replicas + DROP COLUMN "primary"; + +ALTER TABLE workspace_proxies + DROP 
CONSTRAINT workspace_proxies_region_id_unique, + DROP COLUMN region_id, + DROP COLUMN derp_enabled; + +COMMIT; diff --git a/coderd/database/migrations/000142_proxy_derp.up.sql b/coderd/database/migrations/000142_proxy_derp.up.sql new file mode 100644 index 0000000000000..e214fe50fc366 --- /dev/null +++ b/coderd/database/migrations/000142_proxy_derp.up.sql @@ -0,0 +1,13 @@ +BEGIN; + +ALTER TABLE replicas + ADD COLUMN "primary" boolean NOT NULL DEFAULT true; + +ALTER TABLE workspace_proxies + -- Adding a serial to a table without a default value will be filled as you + -- would expect on versions of Postgres >= 9 AFAIK (which we require). + ADD COLUMN region_id serial NOT NULL, + ADD COLUMN derp_enabled boolean NOT NULL DEFAULT true, + ADD CONSTRAINT workspace_proxies_region_id_unique UNIQUE (region_id); + +COMMIT; diff --git a/coderd/database/models.go b/coderd/database/models.go index d8b044443ffe8..32ac5f626eff1 100644 --- a/coderd/database/models.go +++ b/coderd/database/models.go @@ -1539,6 +1539,7 @@ type Replica struct { DatabaseLatency int32 `db:"database_latency" json:"database_latency"` Version string `db:"version" json:"version"` Error string `db:"error" json:"error"` + Primary bool `db:"primary" json:"primary"` } type SiteConfig struct { @@ -1936,6 +1937,8 @@ type WorkspaceProxy struct { Deleted bool `db:"deleted" json:"deleted"` // Hashed secret is used to authenticate the workspace proxy using a session token. 
TokenHashedSecret []byte `db:"token_hashed_secret" json:"token_hashed_secret"` + RegionID int32 `db:"region_id" json:"region_id"` + DerpEnabled bool `db:"derp_enabled" json:"derp_enabled"` } type WorkspaceResource struct { diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 471b0e1619d2f..08f29ca1bc3ca 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -97,6 +97,7 @@ type sqlcQuerier interface { GetProvisionerLogsAfterID(ctx context.Context, arg GetProvisionerLogsAfterIDParams) ([]ProvisionerJobLog, error) GetQuotaAllowanceForUser(ctx context.Context, userID uuid.UUID) (int64, error) GetQuotaConsumedForUser(ctx context.Context, ownerID uuid.UUID) (int64, error) + GetReplicaByID(ctx context.Context, id uuid.UUID) (Replica, error) GetReplicasUpdatedAfter(ctx context.Context, updatedAt time.Time) ([]Replica, error) GetServiceBanner(ctx context.Context) (string, error) GetTailnetAgents(ctx context.Context, id uuid.UUID) ([]TailnetAgent, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 95ad36d9593ca..46502c402c618 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -2848,7 +2848,7 @@ func (q *sqlQuerier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, a const getWorkspaceProxies = `-- name: GetWorkspaceProxies :many SELECT - id, name, display_name, icon, url, wildcard_hostname, created_at, updated_at, deleted, token_hashed_secret + id, name, display_name, icon, url, wildcard_hostname, created_at, updated_at, deleted, token_hashed_secret, region_id, derp_enabled FROM workspace_proxies WHERE @@ -2875,6 +2875,8 @@ func (q *sqlQuerier) GetWorkspaceProxies(ctx context.Context) ([]WorkspaceProxy, &i.UpdatedAt, &i.Deleted, &i.TokenHashedSecret, + &i.RegionID, + &i.DerpEnabled, ); err != nil { return nil, err } @@ -2891,7 +2893,7 @@ func (q *sqlQuerier) GetWorkspaceProxies(ctx context.Context) ([]WorkspaceProxy, const getWorkspaceProxyByHostname = 
`-- name: GetWorkspaceProxyByHostname :one SELECT - id, name, display_name, icon, url, wildcard_hostname, created_at, updated_at, deleted, token_hashed_secret + id, name, display_name, icon, url, wildcard_hostname, created_at, updated_at, deleted, token_hashed_secret, region_id, derp_enabled FROM workspace_proxies WHERE @@ -2947,13 +2949,15 @@ func (q *sqlQuerier) GetWorkspaceProxyByHostname(ctx context.Context, arg GetWor &i.UpdatedAt, &i.Deleted, &i.TokenHashedSecret, + &i.RegionID, + &i.DerpEnabled, ) return i, err } const getWorkspaceProxyByID = `-- name: GetWorkspaceProxyByID :one SELECT - id, name, display_name, icon, url, wildcard_hostname, created_at, updated_at, deleted, token_hashed_secret + id, name, display_name, icon, url, wildcard_hostname, created_at, updated_at, deleted, token_hashed_secret, region_id, derp_enabled FROM workspace_proxies WHERE @@ -2976,13 +2980,15 @@ func (q *sqlQuerier) GetWorkspaceProxyByID(ctx context.Context, id uuid.UUID) (W &i.UpdatedAt, &i.Deleted, &i.TokenHashedSecret, + &i.RegionID, + &i.DerpEnabled, ) return i, err } const getWorkspaceProxyByName = `-- name: GetWorkspaceProxyByName :one SELECT - id, name, display_name, icon, url, wildcard_hostname, created_at, updated_at, deleted, token_hashed_secret + id, name, display_name, icon, url, wildcard_hostname, created_at, updated_at, deleted, token_hashed_secret, region_id, derp_enabled FROM workspace_proxies WHERE @@ -3006,6 +3012,8 @@ func (q *sqlQuerier) GetWorkspaceProxyByName(ctx context.Context, name string) ( &i.UpdatedAt, &i.Deleted, &i.TokenHashedSecret, + &i.RegionID, + &i.DerpEnabled, ) return i, err } @@ -3019,13 +3027,14 @@ INSERT INTO name, display_name, icon, + derp_enabled, token_hashed_secret, created_at, updated_at, deleted ) VALUES - ($1, '', '', $2, $3, $4, $5, $6, $7, false) RETURNING id, name, display_name, icon, url, wildcard_hostname, created_at, updated_at, deleted, token_hashed_secret + ($1, '', '', $2, $3, $4, $5, $6, $7, $8, false) RETURNING id, 
name, display_name, icon, url, wildcard_hostname, created_at, updated_at, deleted, token_hashed_secret, region_id, derp_enabled ` type InsertWorkspaceProxyParams struct { @@ -3033,6 +3042,7 @@ type InsertWorkspaceProxyParams struct { Name string `db:"name" json:"name"` DisplayName string `db:"display_name" json:"display_name"` Icon string `db:"icon" json:"icon"` + DerpEnabled bool `db:"derp_enabled" json:"derp_enabled"` TokenHashedSecret []byte `db:"token_hashed_secret" json:"token_hashed_secret"` CreatedAt time.Time `db:"created_at" json:"created_at"` UpdatedAt time.Time `db:"updated_at" json:"updated_at"` @@ -3044,6 +3054,7 @@ func (q *sqlQuerier) InsertWorkspaceProxy(ctx context.Context, arg InsertWorkspa arg.Name, arg.DisplayName, arg.Icon, + arg.DerpEnabled, arg.TokenHashedSecret, arg.CreatedAt, arg.UpdatedAt, @@ -3060,6 +3071,8 @@ func (q *sqlQuerier) InsertWorkspaceProxy(ctx context.Context, arg InsertWorkspa &i.UpdatedAt, &i.Deleted, &i.TokenHashedSecret, + &i.RegionID, + &i.DerpEnabled, ) return i, err } @@ -3068,22 +3081,29 @@ const registerWorkspaceProxy = `-- name: RegisterWorkspaceProxy :one UPDATE workspace_proxies SET - url = $1, - wildcard_hostname = $2, + url = $1 :: text, + wildcard_hostname = $2 :: text, + derp_enabled = $3 :: boolean, updated_at = Now() WHERE - id = $3 -RETURNING id, name, display_name, icon, url, wildcard_hostname, created_at, updated_at, deleted, token_hashed_secret + id = $4 +RETURNING id, name, display_name, icon, url, wildcard_hostname, created_at, updated_at, deleted, token_hashed_secret, region_id, derp_enabled ` type RegisterWorkspaceProxyParams struct { Url string `db:"url" json:"url"` WildcardHostname string `db:"wildcard_hostname" json:"wildcard_hostname"` + DerpEnabled bool `db:"derp_enabled" json:"derp_enabled"` ID uuid.UUID `db:"id" json:"id"` } func (q *sqlQuerier) RegisterWorkspaceProxy(ctx context.Context, arg RegisterWorkspaceProxyParams) (WorkspaceProxy, error) { - row := q.db.QueryRowContext(ctx, 
registerWorkspaceProxy, arg.Url, arg.WildcardHostname, arg.ID) + row := q.db.QueryRowContext(ctx, registerWorkspaceProxy, + arg.Url, + arg.WildcardHostname, + arg.DerpEnabled, + arg.ID, + ) var i WorkspaceProxy err := row.Scan( &i.ID, @@ -3096,6 +3116,8 @@ func (q *sqlQuerier) RegisterWorkspaceProxy(ctx context.Context, arg RegisterWor &i.UpdatedAt, &i.Deleted, &i.TokenHashedSecret, + &i.RegionID, + &i.DerpEnabled, ) return i, err } @@ -3118,7 +3140,7 @@ SET updated_at = Now() WHERE id = $5 -RETURNING id, name, display_name, icon, url, wildcard_hostname, created_at, updated_at, deleted, token_hashed_secret +RETURNING id, name, display_name, icon, url, wildcard_hostname, created_at, updated_at, deleted, token_hashed_secret, region_id, derp_enabled ` type UpdateWorkspaceProxyParams struct { @@ -3150,6 +3172,8 @@ func (q *sqlQuerier) UpdateWorkspaceProxy(ctx context.Context, arg UpdateWorkspa &i.UpdatedAt, &i.Deleted, &i.TokenHashedSecret, + &i.RegionID, + &i.DerpEnabled, ) return i, err } @@ -3230,8 +3254,32 @@ func (q *sqlQuerier) DeleteReplicasUpdatedBefore(ctx context.Context, updatedAt return err } +const getReplicaByID = `-- name: GetReplicaByID :one +SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error, "primary" FROM replicas WHERE id = $1 +` + +func (q *sqlQuerier) GetReplicaByID(ctx context.Context, id uuid.UUID) (Replica, error) { + row := q.db.QueryRowContext(ctx, getReplicaByID, id) + var i Replica + err := row.Scan( + &i.ID, + &i.CreatedAt, + &i.StartedAt, + &i.StoppedAt, + &i.UpdatedAt, + &i.Hostname, + &i.RegionID, + &i.RelayAddress, + &i.DatabaseLatency, + &i.Version, + &i.Error, + &i.Primary, + ) + return i, err +} + const getReplicasUpdatedAfter = `-- name: GetReplicasUpdatedAfter :many -SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error FROM replicas WHERE updated_at > $1 AND stopped_at IS NULL 
+SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error, "primary" FROM replicas WHERE updated_at > $1 AND stopped_at IS NULL ` func (q *sqlQuerier) GetReplicasUpdatedAfter(ctx context.Context, updatedAt time.Time) ([]Replica, error) { @@ -3255,6 +3303,7 @@ func (q *sqlQuerier) GetReplicasUpdatedAfter(ctx context.Context, updatedAt time &i.DatabaseLatency, &i.Version, &i.Error, + &i.Primary, ); err != nil { return nil, err } @@ -3279,8 +3328,9 @@ INSERT INTO replicas ( region_id, relay_address, version, - database_latency -) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) RETURNING id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error + database_latency, + "primary" +) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) RETURNING id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error, "primary" ` type InsertReplicaParams struct { @@ -3293,6 +3343,7 @@ type InsertReplicaParams struct { RelayAddress string `db:"relay_address" json:"relay_address"` Version string `db:"version" json:"version"` DatabaseLatency int32 `db:"database_latency" json:"database_latency"` + Primary bool `db:"primary" json:"primary"` } func (q *sqlQuerier) InsertReplica(ctx context.Context, arg InsertReplicaParams) (Replica, error) { @@ -3306,6 +3357,7 @@ func (q *sqlQuerier) InsertReplica(ctx context.Context, arg InsertReplicaParams) arg.RelayAddress, arg.Version, arg.DatabaseLatency, + arg.Primary, ) var i Replica err := row.Scan( @@ -3320,6 +3372,7 @@ func (q *sqlQuerier) InsertReplica(ctx context.Context, arg InsertReplicaParams) &i.DatabaseLatency, &i.Version, &i.Error, + &i.Primary, ) return i, err } @@ -3334,8 +3387,9 @@ UPDATE replicas SET hostname = $7, version = $8, error = $9, - database_latency = $10 -WHERE id = $1 RETURNING id, created_at, started_at, stopped_at, updated_at, hostname, 
region_id, relay_address, database_latency, version, error + database_latency = $10, + "primary" = $11 +WHERE id = $1 RETURNING id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error, "primary" ` type UpdateReplicaParams struct { @@ -3349,6 +3403,7 @@ type UpdateReplicaParams struct { Version string `db:"version" json:"version"` Error string `db:"error" json:"error"` DatabaseLatency int32 `db:"database_latency" json:"database_latency"` + Primary bool `db:"primary" json:"primary"` } func (q *sqlQuerier) UpdateReplica(ctx context.Context, arg UpdateReplicaParams) (Replica, error) { @@ -3363,6 +3418,7 @@ func (q *sqlQuerier) UpdateReplica(ctx context.Context, arg UpdateReplicaParams) arg.Version, arg.Error, arg.DatabaseLatency, + arg.Primary, ) var i Replica err := row.Scan( @@ -3377,6 +3433,7 @@ func (q *sqlQuerier) UpdateReplica(ctx context.Context, arg UpdateReplicaParams) &i.DatabaseLatency, &i.Version, &i.Error, + &i.Primary, ) return i, err } diff --git a/coderd/database/queries/proxies.sql b/coderd/database/queries/proxies.sql index c6be3333fb9c6..d283ef87fe936 100644 --- a/coderd/database/queries/proxies.sql +++ b/coderd/database/queries/proxies.sql @@ -7,20 +7,22 @@ INSERT INTO name, display_name, icon, + derp_enabled, token_hashed_secret, created_at, updated_at, deleted ) VALUES - ($1, '', '', $2, $3, $4, $5, $6, $7, false) RETURNING *; + ($1, '', '', $2, $3, $4, $5, $6, $7, $8, false) RETURNING *; -- name: RegisterWorkspaceProxy :one UPDATE workspace_proxies SET - url = @url, - wildcard_hostname = @wildcard_hostname, + url = @url :: text, + wildcard_hostname = @wildcard_hostname :: text, + derp_enabled = @derp_enabled :: boolean, updated_at = Now() WHERE id = @id diff --git a/coderd/database/queries/replicas.sql b/coderd/database/queries/replicas.sql index e87c1f46432f2..5a0b4ac0fe95e 100644 --- a/coderd/database/queries/replicas.sql +++ b/coderd/database/queries/replicas.sql @@ -1,6 +1,9 @@ 
-- name: GetReplicasUpdatedAfter :many SELECT * FROM replicas WHERE updated_at > $1 AND stopped_at IS NULL; +-- name: GetReplicaByID :one +SELECT * FROM replicas WHERE id = $1; + -- name: InsertReplica :one INSERT INTO replicas ( id, @@ -11,8 +14,9 @@ INSERT INTO replicas ( region_id, relay_address, version, - database_latency -) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) RETURNING *; + database_latency, + "primary" +) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) RETURNING *; -- name: UpdateReplica :one UPDATE replicas SET @@ -24,7 +28,8 @@ UPDATE replicas SET hostname = $7, version = $8, error = $9, - database_latency = $10 + database_latency = $10, + "primary" = $11 WHERE id = $1 RETURNING *; -- name: DeleteReplicasUpdatedBefore :exec diff --git a/coderd/database/unique_constraint.go b/coderd/database/unique_constraint.go index f0ba6c702ac93..c8dbc831e8651 100644 --- a/coderd/database/unique_constraint.go +++ b/coderd/database/unique_constraint.go @@ -22,6 +22,7 @@ const ( UniqueWorkspaceBuildParametersWorkspaceBuildIDNameKey UniqueConstraint = "workspace_build_parameters_workspace_build_id_name_key" // ALTER TABLE ONLY workspace_build_parameters ADD CONSTRAINT workspace_build_parameters_workspace_build_id_name_key UNIQUE (workspace_build_id, name); UniqueWorkspaceBuildsJobIDKey UniqueConstraint = "workspace_builds_job_id_key" // ALTER TABLE ONLY workspace_builds ADD CONSTRAINT workspace_builds_job_id_key UNIQUE (job_id); UniqueWorkspaceBuildsWorkspaceIDBuildNumberKey UniqueConstraint = "workspace_builds_workspace_id_build_number_key" // ALTER TABLE ONLY workspace_builds ADD CONSTRAINT workspace_builds_workspace_id_build_number_key UNIQUE (workspace_id, build_number); + UniqueWorkspaceProxiesRegionIDUnique UniqueConstraint = "workspace_proxies_region_id_unique" // ALTER TABLE ONLY workspace_proxies ADD CONSTRAINT workspace_proxies_region_id_unique UNIQUE (region_id); UniqueWorkspaceResourceMetadataName UniqueConstraint = "workspace_resource_metadata_name" // 
ALTER TABLE ONLY workspace_resource_metadata ADD CONSTRAINT workspace_resource_metadata_name UNIQUE (workspace_resource_id, key); UniqueIndexApiKeyName UniqueConstraint = "idx_api_key_name" // CREATE UNIQUE INDEX idx_api_key_name ON api_keys USING btree (user_id, token_name) WHERE (login_type = 'token'::login_type); UniqueIndexOrganizationName UniqueConstraint = "idx_organization_name" // CREATE UNIQUE INDEX idx_organization_name ON organizations USING btree (name); diff --git a/coderd/healthcheck/derp.go b/coderd/healthcheck/derp.go index 9472fcd98de50..0b77c42076254 100644 --- a/coderd/healthcheck/derp.go +++ b/coderd/healthcheck/derp.go @@ -170,7 +170,10 @@ func (r *DERPNodeReport) derpURL() *url.URL { derpURL.Scheme = "http" } if r.Node.HostName == "" { - derpURL.Host = fmt.Sprintf("%s:%d", r.Node.IPv4, r.Node.DERPPort) + derpURL.Host = r.Node.IPv4 + } + if r.Node.DERPPort != 0 { + derpURL.Host = fmt.Sprintf("%s:%d", derpURL.Host, r.Node.DERPPort) } return derpURL diff --git a/coderd/httpapi/httpapi.go b/coderd/httpapi/httpapi.go index 658c0cc39294b..b7559d5feeabe 100644 --- a/coderd/httpapi/httpapi.go +++ b/coderd/httpapi/httpapi.go @@ -151,11 +151,9 @@ func Write(ctx context.Context, rw http.ResponseWriter, status int, response int enc := json.NewEncoder(rw) enc.SetEscapeHTML(true) - err := enc.Encode(response) - if err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - return - } + // We can't really do much about these errors, it's probably due to a + // dropped connection. 
+ _ = enc.Encode(response) } func WriteIndent(ctx context.Context, rw http.ResponseWriter, status int, response interface{}) { @@ -169,11 +167,9 @@ func WriteIndent(ctx context.Context, rw http.ResponseWriter, status int, respon enc.SetEscapeHTML(true) enc.SetIndent("", "\t") - err := enc.Encode(response) - if err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - return - } + // We can't really do much about these errors, it's probably due to a + // dropped connection. + _ = enc.Encode(response) } // Read decodes JSON from the HTTP request into the value provided. It uses diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 454934f1dd7b4..c1f749622accc 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -142,7 +142,7 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa } // Agents tracks the total number of workspaces with labels on status. 
-func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (func(), error) { +func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMapFn func() *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (func(), error) { if duration == 0 { duration = 1 * time.Minute } @@ -223,6 +223,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis logger.Debug(ctx, "agent metrics collection is starting") timer := prometheus.NewTimer(metricsCollectorAgents) + derpMap := derpMapFn() workspaceRows, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{ AgentInactiveDisconnectTimeoutSeconds: int64(agentInactiveDisconnectTimeout.Seconds()), diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index e6170422aa78b..3ea774df1186d 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -15,6 +15,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "tailscale.com/tailcfg" "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" @@ -299,10 +300,13 @@ func TestAgents(t *testing.T) { coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) // given + derpMap, _ := tailnettest.RunDERPAndSTUN(t) + derpMapFn := func() *tailcfg.DERPMap { + return derpMap + } coordinator := tailnet.NewCoordinator(slogtest.Make(t, nil).Leveled(slog.LevelDebug)) coordinatorPtr := atomic.Pointer[tailnet.Coordinator]{} coordinatorPtr.Store(&coordinator) - derpMap, _ := tailnettest.RunDERPAndSTUN(t) agentInactiveDisconnectTimeout := 1 * time.Hour // don't need to 
focus on this value in tests registry := prometheus.NewRegistry() @@ -312,7 +316,7 @@ func TestAgents(t *testing.T) { // when closeFunc, err := prometheusmetrics.Agents(ctx, slogtest.Make(t, &slogtest.Options{ IgnoreErrors: true, - }), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, 50*time.Millisecond) + }), registry, db, &coordinatorPtr, derpMapFn, agentInactiveDisconnectTimeout, 50*time.Millisecond) require.NoError(t, err) t.Cleanup(closeFunc) diff --git a/coderd/provisionerjobs.go b/coderd/provisionerjobs.go index 6b68f2ebd3c32..1be1a56518d28 100644 --- a/coderd/provisionerjobs.go +++ b/coderd/provisionerjobs.go @@ -149,7 +149,7 @@ func (api *API) provisionerJobResources(rw http.ResponseWriter, r *http.Request, } apiAgent, err := convertWorkspaceAgent( - api.DERPMap, *api.TailnetCoordinator.Load(), agent, convertApps(dbApps), api.AgentInactiveDisconnectTimeout, + api.DERPMap(), *api.TailnetCoordinator.Load(), agent, convertApps(dbApps), api.AgentInactiveDisconnectTimeout, api.DeploymentValues.AgentFallbackTroubleshootingURL.String(), ) if err != nil { diff --git a/coderd/tracing/status_writer.go b/coderd/tracing/status_writer.go index 6f80ee4b1fc7d..9409c3adf5e69 100644 --- a/coderd/tracing/status_writer.go +++ b/coderd/tracing/status_writer.go @@ -2,10 +2,17 @@ package tracing import ( "bufio" + "flag" + "fmt" + "log" "net" "net/http" + "runtime" + "strings" "golang.org/x/xerrors" + + "github.com/coder/coder/buildinfo" ) var ( @@ -22,7 +29,8 @@ type StatusWriter struct { Hijacked bool responseBody []byte - wroteHeader bool + wroteHeader bool + wroteHeaderStack string } func StatusWriterMiddleware(next http.Handler) http.Handler { @@ -33,6 +41,20 @@ func StatusWriterMiddleware(next http.Handler) http.Handler { } func (w *StatusWriter) WriteHeader(status int) { + if buildinfo.IsDev() || flag.Lookup("test.v") != nil { + if w.wroteHeader { + stack := getStackString(2) + wroteHeaderStack := w.wroteHeaderStack + if wroteHeaderStack == "" { + 
wroteHeaderStack = "unknown" + } + // It's fine that this logs to stdlib logger since it only happens + // in dev builds and tests. + log.Printf("duplicate call to (*StatusWriter.).WriteHeader(%d):\n\nstack: %s\n\nheader written at: %s", status, stack, wroteHeaderStack) + } else { + w.wroteHeaderStack = getStackString(2) + } + } if !w.wroteHeader { w.Status = status w.wroteHeader = true @@ -89,3 +111,20 @@ func (w *StatusWriter) Flush() { } f.Flush() } + +func getStackString(skip int) string { + // Get up to 5 callers, skipping this one and the skip count. + pcs := make([]uintptr, 5) + got := runtime.Callers(skip+1, pcs) + frames := runtime.CallersFrames(pcs[:got]) + + callers := []string{} + for { + frame, more := frames.Next() + callers = append(callers, fmt.Sprintf("%s:%v", frame.File, frame.Line)) + if !more { + break + } + } + return strings.Join(callers, " -> ") +} diff --git a/coderd/workspaceagents.go b/coderd/workspaceagents.go index f8d5e10f62de3..2fd5aa49aeb44 100644 --- a/coderd/workspaceagents.go +++ b/coderd/workspaceagents.go @@ -63,7 +63,7 @@ func (api *API) workspaceAgent(rw http.ResponseWriter, r *http.Request) { return } apiAgent, err := convertWorkspaceAgent( - api.DERPMap, *api.TailnetCoordinator.Load(), workspaceAgent, convertApps(dbApps), api.AgentInactiveDisconnectTimeout, + api.DERPMap(), *api.TailnetCoordinator.Load(), workspaceAgent, convertApps(dbApps), api.AgentInactiveDisconnectTimeout, api.DeploymentValues.AgentFallbackTroubleshootingURL.String(), ) if err != nil { @@ -88,7 +88,7 @@ func (api *API) workspaceAgentManifest(rw http.ResponseWriter, r *http.Request) ctx := r.Context() workspaceAgent := httpmw.WorkspaceAgent(r) apiAgent, err := convertWorkspaceAgent( - api.DERPMap, *api.TailnetCoordinator.Load(), workspaceAgent, nil, api.AgentInactiveDisconnectTimeout, + api.DERPMap(), *api.TailnetCoordinator.Load(), workspaceAgent, nil, api.AgentInactiveDisconnectTimeout, api.DeploymentValues.AgentFallbackTroubleshootingURL.String(), ) if 
err != nil { @@ -163,7 +163,7 @@ func (api *API) workspaceAgentManifest(rw http.ResponseWriter, r *http.Request) httpapi.Write(ctx, rw, http.StatusOK, agentsdk.Manifest{ AgentID: apiAgent.ID, Apps: convertApps(dbApps), - DERPMap: api.DERPMap, + DERPMap: api.DERPMap(), GitAuthConfigs: len(api.GitAuthConfigs), EnvironmentVariables: apiAgent.EnvironmentVariables, StartupScript: apiAgent.StartupScript, @@ -192,7 +192,7 @@ func (api *API) postWorkspaceAgentStartup(rw http.ResponseWriter, r *http.Reques ctx := r.Context() workspaceAgent := httpmw.WorkspaceAgent(r) apiAgent, err := convertWorkspaceAgent( - api.DERPMap, *api.TailnetCoordinator.Load(), workspaceAgent, nil, api.AgentInactiveDisconnectTimeout, + api.DERPMap(), *api.TailnetCoordinator.Load(), workspaceAgent, nil, api.AgentInactiveDisconnectTimeout, api.DeploymentValues.AgentFallbackTroubleshootingURL.String(), ) if err != nil { @@ -590,7 +590,7 @@ func (api *API) workspaceAgentListeningPorts(rw http.ResponseWriter, r *http.Req workspaceAgent := httpmw.WorkspaceAgentParam(r) apiAgent, err := convertWorkspaceAgent( - api.DERPMap, *api.TailnetCoordinator.Load(), workspaceAgent, nil, api.AgentInactiveDisconnectTimeout, + api.DERPMap(), *api.TailnetCoordinator.Load(), workspaceAgent, nil, api.AgentInactiveDisconnectTimeout, api.DeploymentValues.AgentFallbackTroubleshootingURL.String(), ) if err != nil { @@ -686,9 +686,11 @@ func (api *API) workspaceAgentListeningPorts(rw http.ResponseWriter, r *http.Req // See: https://github.com/coder/coder/issues/8218 func (api *API) _dialWorkspaceAgentTailnet(agentID uuid.UUID) (*codersdk.WorkspaceAgentConn, error) { clientConn, serverConn := net.Pipe() + + derpMap := api.DERPMap() conn, err := tailnet.NewConn(&tailnet.Options{ Addresses: []netip.Prefix{netip.PrefixFrom(tailnet.IP(), 128)}, - DERPMap: api.DERPMap, + DERPMap: api.DERPMap(), Logger: api.Logger.Named("tailnet"), BlockEndpoints: api.DeploymentValues.DERP.Config.BlockDirect.Value(), }) @@ -712,14 +714,35 @@ func (api 
*API) _dialWorkspaceAgentTailnet(agentID uuid.UUID) (*codersdk.Workspa return left }) - sendNodes, _ := tailnet.ServeCoordinator(clientConn, func(node []*tailnet.Node) error { - err = conn.UpdateNodes(node, true) - if err != nil { - return xerrors.Errorf("update nodes: %w", err) - } - return nil + sendNodes, _ := tailnet.ServeCoordinator(clientConn, func(nodes []*tailnet.Node) error { + return conn.UpdateNodes(nodes, true) }) conn.SetNodeCallback(sendNodes) + + // Check for updated DERP map every 5 seconds. + go func() { + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + lastDERPMap := derpMap + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + + derpMap := api.DERPMap() + if lastDERPMap == nil || !tailnet.CompareDERPMaps(lastDERPMap, derpMap) { + conn.SetDERPMap(derpMap) + lastDERPMap = derpMap + } + ticker.Reset(5 * time.Second) + } + } + }() + agentConn := codersdk.NewWorkspaceAgentConn(conn, codersdk.WorkspaceAgentConnOptions{ AgentID: agentID, AgentIP: codersdk.WorkspaceAgentIP, @@ -743,6 +766,9 @@ func (api *API) _dialWorkspaceAgentTailnet(agentID uuid.UUID) (*codersdk.Workspa }() if !agentConn.AwaitReachable(ctx) { _ = agentConn.Close() + _ = serverConn.Close() + _ = clientConn.Close() + cancel() return nil, xerrors.Errorf("agent not reachable") } return agentConn, nil @@ -760,7 +786,7 @@ func (api *API) workspaceAgentConnection(rw http.ResponseWriter, r *http.Request ctx := r.Context() httpapi.Write(ctx, rw, http.StatusOK, codersdk.WorkspaceAgentConnectionInfo{ - DERPMap: api.DERPMap, + DERPMap: api.DERPMap(), DisableDirectConnections: api.DeploymentValues.DERP.Config.BlockDirect.Value(), }) } @@ -780,10 +806,63 @@ func (api *API) workspaceAgentConnectionGeneric(rw http.ResponseWriter, r *http.
ctx := r.Context() httpapi.Write(ctx, rw, http.StatusOK, codersdk.WorkspaceAgentConnectionInfo{ - DERPMap: api.DERPMap, + DERPMap: api.DERPMap(), + DisableDirectConnections: api.DeploymentValues.DERP.Config.BlockDirect.Value(), }) } +// @Summary Get DERP map updates +// @ID get-derp-map-updates +// @Security CoderSessionToken +// @Tags Agents +// @Success 101 +// @Router /derp-map [get] +func (api *API) derpMapUpdates(rw http.ResponseWriter, r *http.Request) { + ctx := r.Context() + + api.WebsocketWaitMutex.Lock() + api.WebsocketWaitGroup.Add(1) + api.WebsocketWaitMutex.Unlock() + defer api.WebsocketWaitGroup.Done() + + ws, err := websocket.Accept(rw, r, nil) + if err != nil { + httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{ + Message: "Failed to accept websocket.", + Detail: err.Error(), + }) + return + } + nconn := websocket.NetConn(ctx, ws, websocket.MessageBinary) + defer nconn.Close() + + ticker := time.NewTicker(api.Options.DERPMapUpdateFrequency) + defer ticker.Stop() + + var lastDERPMap *tailcfg.DERPMap + for { + derpMap := api.DERPMap() + if lastDERPMap == nil || !tailnet.CompareDERPMaps(lastDERPMap, derpMap) { + err := json.NewEncoder(nconn).Encode(derpMap) + if err != nil { + _ = ws.Close(websocket.StatusInternalError, err.Error()) + return + } + lastDERPMap = derpMap + } + + select { + case <-ctx.Done(): + return + case <-api.ctx.Done(): + return + case <-ticker.C: + } + + ticker.Reset(api.Options.DERPMapUpdateFrequency) + } +} + // @Summary Coordinate workspace agent via Tailnet // @Description It accepts a WebSocket connection to an agent that listens to // @Description incoming connections and publishes node updates. 
diff --git a/coderd/workspaceagents_test.go b/coderd/workspaceagents_test.go index 6afec803bbc49..87603361780e1 100644 --- a/coderd/workspaceagents_test.go +++ b/coderd/workspaceagents_test.go @@ -9,12 +9,14 @@ import ( "runtime" "strconv" "strings" + "sync/atomic" "testing" "time" "github.com/google/uuid" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "tailscale.com/tailcfg" "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" @@ -25,6 +27,7 @@ import ( "github.com/coder/coder/codersdk/agentsdk" "github.com/coder/coder/provisioner/echo" "github.com/coder/coder/provisionersdk/proto" + "github.com/coder/coder/tailnet/tailnettest" "github.com/coder/coder/testutil" ) @@ -1247,3 +1250,103 @@ func TestWorkspaceAgent_Startup(t *testing.T) { require.Equal(t, http.StatusBadRequest, cerr.StatusCode()) }) } + +// TestWorkspaceAgent_UpdatedDERP runs a real coderd server, with a real agent +// and a real client, and updates the DERP map live to ensure connections still +// work. +func TestWorkspaceAgent_UpdatedDERP(t *testing.T) { + t.Parallel() + + logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug) + + dv := coderdtest.DeploymentValues(t) + err := dv.DERP.Config.BlockDirect.Set("true") + require.NoError(t, err) + + client, closer, api := coderdtest.NewWithAPI(t, &coderdtest.Options{ + IncludeProvisionerDaemon: true, + DeploymentValues: dv, + }) + defer closer.Close() + user := coderdtest.CreateFirstUser(t, client) + + originalDerpMap := api.DERPMap() + require.NotNil(t, originalDerpMap) + + // Change the DERP mapper to our custom one. + var currentDerpMap atomic.Pointer[tailcfg.DERPMap] + currentDerpMap.Store(originalDerpMap) + derpMapFn := func(_ *tailcfg.DERPMap) *tailcfg.DERPMap { + return currentDerpMap.Load().Clone() + } + api.DERPMapper.Store(&derpMapFn) + + // Start a workspace agent. 
+ agentToken := uuid.NewString() + version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ + Parse: echo.ParseComplete, + ProvisionPlan: echo.ProvisionComplete, + ProvisionApply: echo.ProvisionApplyWithAgent(agentToken), + }) + template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + coderdtest.AwaitTemplateVersionJob(t, client, version.ID) + workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID) + coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) + agentClient := agentsdk.New(client.URL) + agentClient.SetSessionToken(agentToken) + agentCloser := agent.New(agent.Options{ + Client: agentClient, + Logger: logger.Named("agent"), + }) + defer func() { + _ = agentCloser.Close() + }() + resources := coderdtest.AwaitWorkspaceAgents(t, client, workspace.ID) + agentID := resources[0].Agents[0].ID + + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) + defer cancel() + + // Connect from a client. + conn1, err := client.DialWorkspaceAgent(ctx, agentID, &codersdk.DialWorkspaceAgentOptions{ + Logger: logger.Named("client1"), + }) + require.NoError(t, err) + defer conn1.Close() + ok := conn1.AwaitReachable(ctx) + require.True(t, ok) + + // Change the DERP map and change the region ID. + newDerpMap, _ := tailnettest.RunDERPAndSTUN(t) + require.NotNil(t, newDerpMap) + newDerpMap.Regions[2] = newDerpMap.Regions[1] + delete(newDerpMap.Regions, 1) + newDerpMap.Regions[2].RegionID = 2 + for _, node := range newDerpMap.Regions[2].Nodes { + node.RegionID = 2 + } + currentDerpMap.Store(newDerpMap) + + // Wait for the agent's DERP map to be updated. + // TODO: this + + // Wait for the DERP map to be updated on the existing client. 
+ require.Eventually(t, func() bool { + regionIDs := conn1.Conn.DERPMap().RegionIDs() + return len(regionIDs) == 1 && regionIDs[0] == 2 + }, testutil.WaitLong, testutil.IntervalFast) + + // The first client should still be able to reach the agent. + ok = conn1.AwaitReachable(ctx) + require.True(t, ok) + + // Connect from a second client. + conn2, err := client.DialWorkspaceAgent(ctx, agentID, &codersdk.DialWorkspaceAgentOptions{ + Logger: logger.Named("client2"), + }) + require.NoError(t, err) + defer conn2.Close() + ok = conn2.AwaitReachable(ctx) + require.True(t, ok) + require.Equal(t, []int{2}, conn2.DERPMap().RegionIDs()) +} diff --git a/coderd/workspacebuilds.go b/coderd/workspacebuilds.go index 726e07d8a88a9..a33ed16bcccf7 100644 --- a/coderd/workspacebuilds.go +++ b/coderd/workspacebuilds.go @@ -835,7 +835,7 @@ func (api *API) convertWorkspaceBuild( for _, agent := range agents { apps := appsByAgentID[agent.ID] apiAgent, err := convertWorkspaceAgent( - api.DERPMap, *api.TailnetCoordinator.Load(), agent, convertApps(apps), api.AgentInactiveDisconnectTimeout, + api.DERPMap(), *api.TailnetCoordinator.Load(), agent, convertApps(apps), api.AgentInactiveDisconnectTimeout, api.DeploymentValues.AgentFallbackTroubleshootingURL.String(), ) if err != nil { diff --git a/coderd/workspacebuilds_test.go b/coderd/workspacebuilds_test.go index 0fb5b03139224..b838e39e3b251 100644 --- a/coderd/workspacebuilds_test.go +++ b/coderd/workspacebuilds_test.go @@ -645,7 +645,8 @@ func TestWorkspaceBuildDebugMode(t *testing.T) { // Create user deploymentValues := coderdtest.DeploymentValues(t) - deploymentValues.EnableTerraformDebugMode = false + err := deploymentValues.EnableTerraformDebugMode.Set("false") + require.NoError(t, err) adminClient := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true, DeploymentValues: deploymentValues}) owner := coderdtest.CreateFirstUser(t, adminClient) @@ -663,7 +664,7 @@ func TestWorkspaceBuildDebugMode(t *testing.T) { ctx, cancel 
:= context.WithTimeout(context.Background(), testutil.WaitLong) defer cancel() - _, err := adminClient.CreateWorkspaceBuild(ctx, workspace.ID, codersdk.CreateWorkspaceBuildRequest{ + _, err = adminClient.CreateWorkspaceBuild(ctx, workspace.ID, codersdk.CreateWorkspaceBuildRequest{ TemplateVersionID: workspace.LatestBuild.TemplateVersionID, Transition: codersdk.WorkspaceTransitionStart, LogLevel: "debug", diff --git a/coderd/wsconncache/wsconncache_test.go b/coderd/wsconncache/wsconncache_test.go index 276e528313751..00526fa0d27fb 100644 --- a/coderd/wsconncache/wsconncache_test.go +++ b/coderd/wsconncache/wsconncache_test.go @@ -191,8 +191,8 @@ func setupAgent(t *testing.T, manifest agentsdk.Manifest, ptyTimeout time.Durati _ = conn.Close() }) go coordinator.ServeClient(serverConn, uuid.New(), manifest.AgentID) - sendNode, _ := tailnet.ServeCoordinator(clientConn, func(node []*tailnet.Node) error { - return conn.UpdateNodes(node, false) + sendNode, _ := tailnet.ServeCoordinator(clientConn, func(nodes []*tailnet.Node) error { + return conn.UpdateNodes(nodes, false) }) conn.SetNodeCallback(sendNode) agentConn := codersdk.NewWorkspaceAgentConn(conn, codersdk.WorkspaceAgentConnOptions{ @@ -221,6 +221,24 @@ func (c *client) Manifest(_ context.Context) (agentsdk.Manifest, error) { return c.manifest, nil } +type closer struct { + closeFunc func() error +} + +func (c *closer) Close() error { + return c.closeFunc() +} + +func (*client) DERPMapUpdates(_ context.Context) (<-chan agentsdk.DERPMapUpdate, io.Closer, error) { + closed := make(chan struct{}) + return make(<-chan agentsdk.DERPMapUpdate), &closer{ + closeFunc: func() error { + close(closed) + return nil + }, + }, nil +} + func (c *client) Listen(_ context.Context) (net.Conn, error) { clientConn, serverConn := net.Pipe() closed := make(chan struct{}) diff --git a/codersdk/agentsdk/agentsdk.go b/codersdk/agentsdk/agentsdk.go index 90f15eff649e3..9d3ad05aa1d79 100644 --- a/codersdk/agentsdk/agentsdk.go +++ 
b/codersdk/agentsdk/agentsdk.go @@ -115,6 +115,20 @@ func (c *Client) Manifest(ctx context.Context) (Manifest, error) { if err != nil { return Manifest{}, err } + err = c.rewriteDerpMap(agentMeta.DERPMap) + if err != nil { + return Manifest{}, err + } + return agentMeta, nil +} + +// rewriteDerpMap rewrites the DERP map to use the access URL of the SDK as the +// "embedded relay" access URL. The passed derp map is modified in place. +// +// Agents can provide an arbitrary access URL that may be different that the +// globally configured one. This breaks the built-in DERP, which would continue +// to reference the global access URL. +func (c *Client) rewriteDerpMap(derpMap *tailcfg.DERPMap) error { accessingPort := c.SDK.URL.Port() if accessingPort == "" { accessingPort = "80" @@ -124,15 +138,9 @@ func (c *Client) Manifest(ctx context.Context) (Manifest, error) { } accessPort, err := strconv.Atoi(accessingPort) if err != nil { - return Manifest{}, xerrors.Errorf("convert accessing port %q: %w", accessingPort, err) + return xerrors.Errorf("convert accessing port %q: %w", accessingPort, err) } - // Agents can provide an arbitrary access URL that may be different - // that the globally configured one. This breaks the built-in DERP, - // which would continue to reference the global access URL. - // - // This converts all built-in DERPs to use the access URL that the - // manifest request was performed with. - for _, region := range agentMeta.DERPMap.Regions { + for _, region := range derpMap.Regions { if !region.EmbeddedRelay { continue } @@ -146,7 +154,89 @@ func (c *Client) Manifest(ctx context.Context) (Manifest, error) { node.ForceHTTP = c.SDK.URL.Scheme == "http" } } - return agentMeta, nil + return nil +} + +type DERPMapUpdate struct { + Err error + DERPMap *tailcfg.DERPMap +} + +// DERPMapUpdates connects to the DERP map updates WebSocket. 
+func (c *Client) DERPMapUpdates(ctx context.Context) (<-chan DERPMapUpdate, io.Closer, error) { + derpMapURL, err := c.SDK.URL.Parse("/api/v2/derp-map") + if err != nil { + return nil, nil, xerrors.Errorf("parse url: %w", err) + } + jar, err := cookiejar.New(nil) + if err != nil { + return nil, nil, xerrors.Errorf("create cookie jar: %w", err) + } + jar.SetCookies(derpMapURL, []*http.Cookie{{ + Name: codersdk.SessionTokenCookie, + Value: c.SDK.SessionToken(), + }}) + httpClient := &http.Client{ + Jar: jar, + Transport: c.SDK.HTTPClient.Transport, + } + // nolint:bodyclose + conn, res, err := websocket.Dial(ctx, derpMapURL.String(), &websocket.DialOptions{ + HTTPClient: httpClient, + }) + if err != nil { + if res == nil { + return nil, nil, err + } + return nil, nil, codersdk.ReadBodyAsError(res) + } + + ctx, cancelFunc := context.WithCancel(ctx) + ctx, wsNetConn := websocketNetConn(ctx, conn, websocket.MessageBinary) + pingClosed := pingWebSocket(ctx, c.SDK.Logger(), conn, "derp map") + + var ( + updates = make(chan DERPMapUpdate) + updatesClosed = make(chan struct{}) + dec = json.NewDecoder(wsNetConn) + ) + go func() { + defer close(updates) + defer close(updatesClosed) + defer cancelFunc() + defer conn.Close(websocket.StatusGoingAway, "Listen closed") + for { + var update DERPMapUpdate + err := dec.Decode(&update.DERPMap) + if err != nil { + update.Err = err + update.DERPMap = nil + return + } + err = c.rewriteDerpMap(update.DERPMap) + if err != nil { + update.Err = err + update.DERPMap = nil + return + } + + select { + case updates <- update: + case <-ctx.Done(): + return + } + } + }() + + return updates, &closer{ + closeFunc: func() error { + cancelFunc() + _ = wsNetConn.Close() + <-pingClosed + <-updatesClosed + return nil + }, + }, nil } // Listen connects to the workspace agent coordinate WebSocket @@ -181,50 +271,14 @@ func (c *Client) Listen(ctx context.Context) (net.Conn, error) { ctx, cancelFunc := context.WithCancel(ctx) ctx, wsNetConn := 
websocketNetConn(ctx, conn, websocket.MessageBinary) - - // Ping once every 30 seconds to ensure that the websocket is alive. If we - // don't get a response within 30s we kill the websocket and reconnect. - // See: https://github.com/coder/coder/pull/5824 - closed := make(chan struct{}) - go func() { - defer close(closed) - tick := 30 * time.Second - ticker := time.NewTicker(tick) - defer ticker.Stop() - defer func() { - c.SDK.Logger().Debug(ctx, "coordinate pinger exited") - }() - for { - select { - case <-ctx.Done(): - return - case start := <-ticker.C: - ctx, cancel := context.WithTimeout(ctx, tick) - - err := conn.Ping(ctx) - if err != nil { - c.SDK.Logger().Error(ctx, "workspace agent coordinate ping", slog.Error(err)) - - err := conn.Close(websocket.StatusGoingAway, "Ping failed") - if err != nil { - c.SDK.Logger().Error(ctx, "close workspace agent coordinate websocket", slog.Error(err)) - } - - cancel() - return - } - - c.SDK.Logger().Debug(ctx, "got coordinate pong", slog.F("took", time.Since(start))) - cancel() - } - } - }() + pingClosed := pingWebSocket(ctx, c.SDK.Logger(), conn, "coordinate") return &closeNetConn{ Conn: wsNetConn, closeFunc: func() { cancelFunc() - <-closed + _ = conn.Close(websocket.StatusGoingAway, "Listen closed") + <-pingClosed }, }, nil } @@ -702,3 +756,53 @@ func (c *closeNetConn) Close() error { c.closeFunc() return c.Conn.Close() } + +func pingWebSocket(ctx context.Context, logger slog.Logger, conn *websocket.Conn, name string) <-chan struct{} { + // Ping once every 30 seconds to ensure that the websocket is alive. If we + // don't get a response within 30s we kill the websocket and reconnect. 
+ // See: https://github.com/coder/coder/pull/5824 + closed := make(chan struct{}) + go func() { + defer close(closed) + tick := 30 * time.Second + ticker := time.NewTicker(tick) + defer ticker.Stop() + defer func() { + logger.Debug(ctx, fmt.Sprintf("%s pinger exited", name)) + }() + for { + select { + case <-ctx.Done(): + return + case start := <-ticker.C: + ctx, cancel := context.WithTimeout(ctx, tick) + + err := conn.Ping(ctx) + if err != nil { + logger.Error(ctx, fmt.Sprintf("workspace agent %s ping", name), slog.Error(err)) + + err := conn.Close(websocket.StatusGoingAway, "Ping failed") + if err != nil { + logger.Error(ctx, fmt.Sprintf("close workspace agent %s websocket", name), slog.Error(err)) + } + + cancel() + return + } + + logger.Debug(ctx, fmt.Sprintf("got %s ping", name), slog.F("took", time.Since(start))) + cancel() + } + } + }() + + return closed +} + +type closer struct { + closeFunc func() error +} + +func (c *closer) Close() error { + return c.closeFunc() +} diff --git a/codersdk/deployment.go b/codersdk/deployment.go index 9e6acc9d91580..810143cd1e8fb 100644 --- a/codersdk/deployment.go +++ b/codersdk/deployment.go @@ -708,6 +708,7 @@ when required by your organization's security policy.`, Value: &c.DERP.Server.Enable, Group: &deploymentGroupNetworkingDERP, YAML: "enable", + Annotations: clibase.Annotations{}.Mark(annotationExternalProxies, "true"), }, { Name: "DERP Server Region ID", @@ -718,6 +719,7 @@ when required by your organization's security policy.`, Value: &c.DERP.Server.RegionID, Group: &deploymentGroupNetworkingDERP, YAML: "regionID", + // Does not apply to external proxies as this value is generated. }, { Name: "DERP Server Region Code", @@ -728,6 +730,7 @@ when required by your organization's security policy.`, Value: &c.DERP.Server.RegionCode, Group: &deploymentGroupNetworkingDERP, YAML: "regionCode", + // Does not apply to external proxies as we use the proxy name. 
}, { Name: "DERP Server Region Name", @@ -738,6 +741,7 @@ when required by your organization's security policy.`, Value: &c.DERP.Server.RegionName, Group: &deploymentGroupNetworkingDERP, YAML: "regionName", + // Does not apply to external proxies as we use the proxy name. }, { Name: "DERP Server STUN Addresses", @@ -754,10 +758,12 @@ when required by your organization's security policy.`, Description: "An HTTP URL that is accessible by other replicas to relay DERP traffic. Required for high availability.", Flag: "derp-server-relay-url", Env: "CODER_DERP_SERVER_RELAY_URL", - Annotations: clibase.Annotations{}.Mark(annotationEnterpriseKey, "true"), Value: &c.DERP.Server.RelayURL, Group: &deploymentGroupNetworkingDERP, YAML: "relayURL", + Annotations: clibase.Annotations{}. + Mark(annotationEnterpriseKey, "true"). + Mark(annotationExternalProxies, "true"), }, { Name: "Block Direct Connections", diff --git a/codersdk/workspaceagents.go b/codersdk/workspaceagents.go index b76ebba9344f5..994f32026b61a 100644 --- a/codersdk/workspaceagents.go +++ b/codersdk/workspaceagents.go @@ -189,24 +189,32 @@ type WorkspaceAgentConnectionInfo struct { DisableDirectConnections bool `json:"disable_direct_connections"` } -func (c *Client) WorkspaceAgentConnectionInfo(ctx context.Context) (*WorkspaceAgentConnectionInfo, error) { +func (c *Client) WorkspaceAgentConnectionInfoGeneric(ctx context.Context) (WorkspaceAgentConnectionInfo, error) { res, err := c.Request(ctx, http.MethodGet, "/api/v2/workspaceagents/connection", nil) if err != nil { - return nil, err + return WorkspaceAgentConnectionInfo{}, err } defer res.Body.Close() - if res.StatusCode != http.StatusOK { - return nil, ReadBodyAsError(res) + return WorkspaceAgentConnectionInfo{}, ReadBodyAsError(res) } - var info WorkspaceAgentConnectionInfo - err = json.NewDecoder(res.Body).Decode(&info) + var connInfo WorkspaceAgentConnectionInfo + return connInfo, json.NewDecoder(res.Body).Decode(&connInfo) +} + +func (c *Client) 
WorkspaceAgentConnectionInfo(ctx context.Context, agentID uuid.UUID) (WorkspaceAgentConnectionInfo, error) { + res, err := c.Request(ctx, http.MethodGet, fmt.Sprintf("/api/v2/workspaceagents/%s/connection", agentID), nil) if err != nil { - return nil, xerrors.Errorf("decode connection info: %w", err) + return WorkspaceAgentConnectionInfo{}, err + } + defer res.Body.Close() + if res.StatusCode != http.StatusOK { + return WorkspaceAgentConnectionInfo{}, ReadBodyAsError(res) } - return &info, nil + var connInfo WorkspaceAgentConnectionInfo + return connInfo, json.NewDecoder(res.Body).Decode(&connInfo) } // @typescript-ignore DialWorkspaceAgentOptions @@ -221,18 +229,10 @@ func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, opti if options == nil { options = &DialWorkspaceAgentOptions{} } - res, err := c.Request(ctx, http.MethodGet, fmt.Sprintf("/api/v2/workspaceagents/%s/connection", agentID), nil) - if err != nil { - return nil, err - } - defer res.Body.Close() - if res.StatusCode != http.StatusOK { - return nil, ReadBodyAsError(res) - } - var connInfo WorkspaceAgentConnectionInfo - err = json.NewDecoder(res.Body).Decode(&connInfo) + + connInfo, err := c.WorkspaceAgentConnectionInfo(ctx, agentID) if err != nil { - return nil, xerrors.Errorf("decode conn info: %w", err) + return nil, xerrors.Errorf("get connection info: %w", err) } if connInfo.DisableDirectConnections { options.BlockEndpoints = true @@ -262,43 +262,44 @@ func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, opti } }() - coordinateURL, err := c.URL.Parse(fmt.Sprintf("/api/v2/workspaceagents/%s/coordinate", agentID)) - if err != nil { - return nil, xerrors.Errorf("parse url: %w", err) - } - coordinateHeaders := make(http.Header) + headers := make(http.Header) tokenHeader := SessionTokenHeader if c.SessionTokenHeader != "" { tokenHeader = c.SessionTokenHeader } - coordinateHeaders.Set(tokenHeader, c.SessionToken()) + headers.Set(tokenHeader, c.SessionToken()) 
ctx, cancel := context.WithCancel(ctx) defer func() { if err != nil { cancel() } }() - closed := make(chan struct{}) - first := make(chan error) + + coordinateURL, err := c.URL.Parse(fmt.Sprintf("/api/v2/workspaceagents/%s/coordinate", agentID)) + if err != nil { + return nil, xerrors.Errorf("parse url: %w", err) + } + closedCoordinator := make(chan struct{}) + firstCoordinator := make(chan error) go func() { - defer close(closed) + defer close(closedCoordinator) isFirst := true for retrier := retry.New(50*time.Millisecond, 10*time.Second); retrier.Wait(ctx); { options.Logger.Debug(ctx, "connecting") // nolint:bodyclose ws, res, err := websocket.Dial(ctx, coordinateURL.String(), &websocket.DialOptions{ HTTPClient: c.HTTPClient, - HTTPHeader: coordinateHeaders, + HTTPHeader: headers, // Need to disable compression to avoid a data-race. CompressionMode: websocket.CompressionDisabled, }) if isFirst { if res != nil && res.StatusCode == http.StatusConflict { - first <- ReadBodyAsError(res) + firstCoordinator <- ReadBodyAsError(res) return } isFirst = false - close(first) + close(firstCoordinator) } if err != nil { if errors.Is(err, context.Canceled) { @@ -325,7 +326,71 @@ func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, opti _ = ws.Close(websocket.StatusGoingAway, "") } }() - err = <-first + + derpMapURL, err := c.URL.Parse("/api/v2/derp-map") + if err != nil { + return nil, xerrors.Errorf("parse url: %w", err) + } + closedDerpMap := make(chan struct{}) + firstDerpMap := make(chan error) + go func() { + defer close(closedDerpMap) + isFirst := true + for retrier := retry.New(50*time.Millisecond, 10*time.Second); retrier.Wait(ctx); { + options.Logger.Debug(ctx, "connecting to server for derp map updates") + // nolint:bodyclose + ws, res, err := websocket.Dial(ctx, derpMapURL.String(), &websocket.DialOptions{ + HTTPClient: c.HTTPClient, + HTTPHeader: headers, + // Need to disable compression to avoid a data-race. 
+ CompressionMode: websocket.CompressionDisabled, + }) + if isFirst { + if res != nil && res.StatusCode == http.StatusConflict { + firstDerpMap <- ReadBodyAsError(res) + return + } + isFirst = false + close(firstDerpMap) + } + if err != nil { + if errors.Is(err, context.Canceled) { + return + } + options.Logger.Debug(ctx, "failed to dial", slog.Error(err)) + continue + } + + var ( + nconn = websocket.NetConn(ctx, ws, websocket.MessageBinary) + dec = json.NewDecoder(nconn) + ) + for { + var derpMap tailcfg.DERPMap + err := dec.Decode(&derpMap) + if xerrors.Is(err, context.Canceled) { + _ = ws.Close(websocket.StatusGoingAway, "") + return + } + if err != nil { + options.Logger.Debug(ctx, "failed to decode derp map", slog.Error(err)) + _ = ws.Close(websocket.StatusGoingAway, "") + return + } + + if !tailnet.CompareDERPMaps(conn.DERPMap(), &derpMap) { + options.Logger.Debug(ctx, "updating derp map due to detected changes") + conn.SetDERPMap(&derpMap) + } + } + } + }() + + err = <-firstCoordinator + if err != nil { + return nil, err + } + err = <-firstDerpMap if err != nil { return nil, err } @@ -334,7 +399,8 @@ func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, opti AgentID: agentID, CloseFunc: func() error { cancel() - <-closed + <-closedCoordinator + <-closedDerpMap return conn.Close() }, }) diff --git a/codersdk/workspaceproxy.go b/codersdk/workspaceproxy.go index 2ce15b043e0fd..ef2ccd638c618 100644 --- a/codersdk/workspaceproxy.go +++ b/codersdk/workspaceproxy.go @@ -47,7 +47,8 @@ type ProxyHealthReport struct { type WorkspaceProxy struct { // Extends Region with extra information - Region `table:"region,recursive_inline"` + Region `table:"region,recursive_inline"` + DerpEnabled bool `json:"derp_enabled" table:"derp_enabled"` // Status is the latest status check of the proxy. This will be empty for deleted // proxies. 
This value can be used to determine if a workspace proxy is healthy diff --git a/docs/admin/audit-logs.md b/docs/admin/audit-logs.md index 214c96b2940f5..27ccbb763ab2a 100644 --- a/docs/admin/audit-logs.md +++ b/docs/admin/audit-logs.md @@ -21,7 +21,7 @@ We track the following resources: | User
create, write, delete |
FieldTracked
avatar_urlfalse
created_atfalse
deletedtrue
emailtrue
hashed_passwordtrue
idtrue
last_seen_atfalse
login_typetrue
quiet_hours_scheduletrue
rbac_rolestrue
statustrue
updated_atfalse
usernametrue
| | Workspace
create, write, delete |
FieldTracked
autostart_scheduletrue
created_atfalse
deletedfalse
deleting_attrue
idtrue
last_used_atfalse
locked_attrue
nametrue
organization_idfalse
owner_idtrue
template_idtrue
ttltrue
updated_atfalse
| | WorkspaceBuild
start, stop |
FieldTracked
build_numberfalse
created_atfalse
daily_costfalse
deadlinefalse
idfalse
initiator_by_avatar_urlfalse
initiator_by_usernamefalse
initiator_idfalse
job_idfalse
max_deadlinefalse
provisioner_statefalse
reasonfalse
template_version_idtrue
transitionfalse
updated_atfalse
workspace_idfalse
| -| WorkspaceProxy
|
FieldTracked
created_attrue
deletedfalse
display_nametrue
icontrue
idtrue
nametrue
token_hashed_secrettrue
updated_atfalse
urltrue
wildcard_hostnametrue
| +| WorkspaceProxy
|
FieldTracked
created_attrue
deletedfalse
derp_enabledtrue
display_nametrue
icontrue
idtrue
nametrue
region_idtrue
token_hashed_secrettrue
updated_atfalse
urltrue
wildcard_hostnametrue
| diff --git a/docs/api/agents.md b/docs/api/agents.md index 3726196d26857..919fa06923c82 100644 --- a/docs/api/agents.md +++ b/docs/api/agents.md @@ -1,5 +1,25 @@ # Agents +## Get DERP map updates + +### Code samples + +```shell +# Example request using curl +curl -X GET http://coder-server:8080/api/v2/derp-map \ + -H 'Coder-Session-Token: API_KEY' +``` + +`GET /derp-map` + +### Responses + +| Status | Meaning | Description | Schema | +| ------ | ------------------------------------------------------------------------ | ------------------- | ------ | +| 101 | [Switching Protocols](https://tools.ietf.org/html/rfc7231#section-6.2.2) | Switching Protocols | | + +To perform this operation, you must be authenticated. [Learn more](authentication.md). + ## Authenticate agent on AWS instance ### Code samples diff --git a/docs/api/enterprise.md b/docs/api/enterprise.md index da03774b433e7..b6cac3df2850a 100644 --- a/docs/api/enterprise.md +++ b/docs/api/enterprise.md @@ -1326,6 +1326,7 @@ curl -X GET http://coder-server:8080/api/v2/workspaceproxies \ { "created_at": "2019-08-24T14:15:22Z", "deleted": true, + "derp_enabled": true, "display_name": "string", "healthy": true, "icon_url": "string", @@ -1364,6 +1365,7 @@ Status Code **200** | `» regions` | array | false | | | | `»» created_at` | string(date-time) | false | | | | `»» deleted` | boolean | false | | | +| `»» derp_enabled` | boolean | false | | | | `»» display_name` | string | false | | | | `»» healthy` | boolean | false | | | | `»» icon_url` | string | false | | | @@ -1428,6 +1430,7 @@ curl -X POST http://coder-server:8080/api/v2/workspaceproxies \ { "created_at": "2019-08-24T14:15:22Z", "deleted": true, + "derp_enabled": true, "display_name": "string", "healthy": true, "icon_url": "string", @@ -1482,6 +1485,7 @@ curl -X GET http://coder-server:8080/api/v2/workspaceproxies/{workspaceproxy} \ { "created_at": "2019-08-24T14:15:22Z", "deleted": true, + "derp_enabled": true, "display_name": "string", "healthy": true, 
"icon_url": "string", @@ -1594,6 +1598,7 @@ curl -X PATCH http://coder-server:8080/api/v2/workspaceproxies/{workspaceproxy} { "created_at": "2019-08-24T14:15:22Z", "deleted": true, + "derp_enabled": true, "display_name": "string", "healthy": true, "icon_url": "string", diff --git a/docs/api/schemas.md b/docs/api/schemas.md index d01275456b639..652aea46b2c74 100644 --- a/docs/api/schemas.md +++ b/docs/api/schemas.md @@ -3753,6 +3753,7 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in { "created_at": "2019-08-24T14:15:22Z", "deleted": true, + "derp_enabled": true, "display_name": "string", "healthy": true, "icon_url": "string", @@ -5933,6 +5934,7 @@ If the schedule is empty, the user will be updated to use the default schedule.| { "created_at": "2019-08-24T14:15:22Z", "deleted": true, + "derp_enabled": true, "display_name": "string", "healthy": true, "icon_url": "string", @@ -5958,6 +5960,7 @@ If the schedule is empty, the user will be updated to use the default schedule.| | ------------------- | -------------------------------------------------------------- | -------- | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `created_at` | string | false | | | | `deleted` | boolean | false | | | +| `derp_enabled` | boolean | false | | | | `display_name` | string | false | | | | `healthy` | boolean | false | | | | `icon_url` | string | false | | | @@ -7295,6 +7298,20 @@ _None_ | `found` | boolean | false | | | | `legacy` | boolean | false | | | +## wsproxysdk.DeregisterWorkspaceProxyRequest + +```json +{ + "replica_id": "string" +} +``` + +### Properties + +| Name | Type | Required | Restrictions | Description | +| ------------ | ------ | -------- | ------------ | 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `replica_id` | string | false | | Replica ID is a unique identifier for the replica of the proxy that is deregistering. It should be generated by the client on startup and should've already been passed to the register endpoint. | + ## wsproxysdk.IssueSignedAppTokenResponse ```json @@ -7314,27 +7331,56 @@ _None_ ```json { "access_url": "string", + "derp_enabled": true, + "hostname": "string", + "replica_error": "string", + "replica_id": "string", + "replica_relay_address": "string", + "version": "string", "wildcard_hostname": "string" } ``` ### Properties -| Name | Type | Required | Restrictions | Description | -| ------------------- | ------ | -------- | ------------ | ----------------------------------------------------------------------------- | -| `access_url` | string | false | | Access URL that hits the workspace proxy api. | -| `wildcard_hostname` | string | false | | Wildcard hostname that the workspace proxy api is serving for subdomain apps. | +| Name | Type | Required | Restrictions | Description | +| ------------------------------------------------------------------------------------------------- | ------- | -------- | ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `access_url` | string | false | | Access URL that hits the workspace proxy api. | +| `derp_enabled` | boolean | false | | Derp enabled indicates whether the proxy should be included in the DERP map or not. | +| `hostname` | string | false | | Hostname is the OS hostname of the machine that the proxy is running on. This is only used for tracking purposes in the replicas table. 
| +| `replica_error` | string | false | | Replica error is the error that the replica encountered when trying to dial it's peers. This is stored in the replicas table for debugging purposes but does not affect the proxy's ability to register. | +| This value is only stored on subsequent requests to the register endpoint, not the first request. | +| `replica_id` | string | false | | Replica ID is a unique identifier for the replica of the proxy that is registering. It should be generated by the client on startup and persisted (in memory only) until the process is restarted. | +| `replica_relay_address` | string | false | | Replica relay address is the DERP address of the replica that other replicas may use to connect internally for DERP meshing. | +| `version` | string | false | | Version is the Coder version of the proxy. | +| `wildcard_hostname` | string | false | | Wildcard hostname that the workspace proxy api is serving for subdomain apps. | ## wsproxysdk.RegisterWorkspaceProxyResponse ```json { - "app_security_key": "string" + "app_security_key": "string", + "derp_mesh_key": "string", + "derp_region_id": 0, + "sibling_replicas": [ + { + "created_at": "2019-08-24T14:15:22Z", + "database_latency": 0, + "error": "string", + "hostname": "string", + "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08", + "region_id": 0, + "relay_address": "string" + } + ] } ``` ### Properties -| Name | Type | Required | Restrictions | Description | -| ------------------ | ------ | -------- | ------------ | ----------- | -| `app_security_key` | string | false | | | +| Name | Type | Required | Restrictions | Description | +| ------------------ | --------------------------------------------- | -------- | ------------ | -------------------------------------------------------------------------------------- | +| `app_security_key` | string | false | | | +| `derp_mesh_key` | string | false | | | +| `derp_region_id` | integer | false | | | +| `sibling_replicas` | array of 
[codersdk.Replica](#codersdkreplica) | false | | Sibling replicas is a list of all other replicas of the proxy that have not timed out. | diff --git a/enterprise/audit/table.go b/enterprise/audit/table.go index 27bcb48081fd5..cea72f7c703cb 100644 --- a/enterprise/audit/table.go +++ b/enterprise/audit/table.go @@ -197,6 +197,8 @@ var auditableResourcesTypes = map[any]map[string]Action{ "updated_at": ActionIgnore, "deleted": ActionIgnore, "token_hashed_secret": ActionSecret, + "derp_enabled": ActionTrack, + "region_id": ActionTrack, }, } diff --git a/enterprise/cli/proxyserver.go b/enterprise/cli/proxyserver.go index 822bebc699940..3fb706cb84143 100644 --- a/enterprise/cli/proxyserver.go +++ b/enterprise/cli/proxyserver.go @@ -220,21 +220,23 @@ func (*RootCmd) proxyServer() *clibase.Cmd { } proxy, err := wsproxy.New(ctx, &wsproxy.Options{ - Logger: logger, - Experiments: coderd.ReadExperiments(logger, cfg.Experiments.Value()), - HTTPClient: httpClient, - DashboardURL: primaryAccessURL.Value(), - AccessURL: cfg.AccessURL.Value(), - AppHostname: appHostname, - AppHostnameRegex: appHostnameRegex, - RealIPConfig: realIPConfig, - Tracing: tracer, - PrometheusRegistry: prometheusRegistry, - APIRateLimit: int(cfg.RateLimit.API.Value()), - SecureAuthCookie: cfg.SecureAuthCookie.Value(), - DisablePathApps: cfg.DisablePathApps.Value(), - ProxySessionToken: proxySessionToken.Value(), - AllowAllCors: cfg.Dangerous.AllowAllCors.Value(), + Logger: logger, + Experiments: coderd.ReadExperiments(logger, cfg.Experiments.Value()), + HTTPClient: httpClient, + DashboardURL: primaryAccessURL.Value(), + AccessURL: cfg.AccessURL.Value(), + AppHostname: appHostname, + AppHostnameRegex: appHostnameRegex, + RealIPConfig: realIPConfig, + Tracing: tracer, + PrometheusRegistry: prometheusRegistry, + APIRateLimit: int(cfg.RateLimit.API.Value()), + SecureAuthCookie: cfg.SecureAuthCookie.Value(), + DisablePathApps: cfg.DisablePathApps.Value(), + ProxySessionToken: proxySessionToken.Value(), + 
AllowAllCors: cfg.Dangerous.AllowAllCors.Value(), + DERPEnabled: cfg.DERP.Server.Enable.Value(), + DERPServerRelayAddress: cfg.DERP.Server.RelayURL.String(), }) if err != nil { return xerrors.Errorf("create workspace proxy: %w", err) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 1d22e668c6e84..25e4dc7b89c99 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -5,11 +5,17 @@ import ( "crypto/ed25519" "crypto/tls" "crypto/x509" + "fmt" + "math" "net/http" + "net/url" + "strconv" + "strings" "sync" "time" "golang.org/x/xerrors" + "tailscale.com/tailcfg" "github.com/cenkalti/backoff/v4" "github.com/go-chi/chi/v5" @@ -158,7 +164,7 @@ func New(ctx context.Context, options *Options) (_ *API, err error) { r.Get("/coordinate", api.workspaceProxyCoordinate) r.Post("/issue-signed-app-token", api.workspaceProxyIssueSignedAppToken) r.Post("/register", api.workspaceProxyRegister) - r.Post("/goingaway", api.workspaceProxyGoingAway) + r.Post("/deregister", api.workspaceProxyDeregister) }) r.Route("/{workspaceproxy}", func(r chi.Router) { r.Use( @@ -293,10 +299,11 @@ func New(ctx context.Context, options *Options) (_ *API, err error) { ServerName: options.AccessURL.Hostname(), } api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, &replicasync.Options{ - ID: api.AGPL.ID, - RelayAddress: options.DERPServerRelayAddress, - RegionID: int32(options.DERPServerRegionID), - TLSConfig: meshTLSConfig, + ID: api.AGPL.ID, + RelayAddress: options.DERPServerRelayAddress, + RegionID: int32(options.DERPServerRegionID), + TLSConfig: meshTLSConfig, + UpdateInterval: options.ReplicaSyncUpdateInterval, }) if err != nil { return nil, xerrors.Errorf("initialize replica: %w", err) @@ -344,8 +351,9 @@ type Options struct { SCIMAPIKey []byte // Used for high availability. 
- DERPServerRelayAddress string - DERPServerRegionID int + ReplicaSyncUpdateInterval time.Duration + DERPServerRelayAddress string + DERPServerRegionID int // Used for user quiet hours schedules. DefaultQuietHoursSchedule string // cron schedule, if empty user quiet hours schedules are disabled @@ -393,7 +401,7 @@ func (api *API) updateEntitlements(ctx context.Context) error { entitlements, err := license.Entitlements( ctx, api.Database, - api.Logger, len(api.replicaManager.All()), len(api.GitAuthConfigs), api.Keys, map[codersdk.FeatureName]bool{ + api.Logger, len(api.replicaManager.AllPrimary()), len(api.GitAuthConfigs), api.Keys, map[codersdk.FeatureName]bool{ codersdk.FeatureAuditLog: api.AuditLogging, codersdk.FeatureBrowserOnly: api.BrowserOnly, codersdk.FeatureSCIM: len(api.SCIMAPIKey) != 0, @@ -567,6 +575,15 @@ func (api *API) updateEntitlements(ctx context.Context) error { } } + if initial, changed, enabled := featureChanged(codersdk.FeatureWorkspaceProxy); shouldUpdate(initial, changed, enabled) { + if enabled { + fn := derpMapper(api.Logger, api.ProxyHealth) + api.AGPL.DERPMapper.Store(&fn) + } else { + api.AGPL.DERPMapper.Store(nil) + } + } + api.entitlementsMu.Lock() defer api.entitlementsMu.Unlock() api.entitlements = entitlements @@ -575,6 +592,167 @@ func (api *API) updateEntitlements(ctx context.Context) error { return nil } +// getProxyDERPStartingRegionID returns the starting region ID that should be +// used for workspace proxies. A proxy's actual region ID is the return value +// from this function + its RegionID field. +// +// Two ints are returned, the first is the starting region ID for proxies, and +// the second is the maximum region ID that already exists in the DERP map.
+func getProxyDERPStartingRegionID(derpMap *tailcfg.DERPMap) (sID int64, mID int64) { + var maxRegionID int64 + for _, region := range derpMap.Regions { + rid := int64(region.RegionID) + if rid > maxRegionID { + maxRegionID = rid + } + } + if maxRegionID < 0 { + maxRegionID = 0 + } + + // Round to the nearest 10,000 with a sufficient buffer of at least 2,000. + // The buffer allows for future "fixed" regions to be added to the base DERP + // map without conflicting with proxy region IDs (standard DERP maps usually + // use incrementing IDs for new regions). + // + // Example: + // maxRegionID = -2_000 -> startingRegionID = 10_000 + // maxRegionID = 8_000 -> startingRegionID = 10_000 + // maxRegionID = 8_500 -> startingRegionID = 20_000 + // maxRegionID = 12_000 -> startingRegionID = 20_000 + // maxRegionID = 20_000 -> startingRegionID = 30_000 + const roundStartingRegionID = 10_000 + const startingRegionIDBuffer = 2_000 + // Add the buffer first. + startingRegionID := maxRegionID + startingRegionIDBuffer + // Round UP to the nearest 10,000. Go's math.Ceil rounds up to the nearest + // integer, so we need to divide by 10,000 first and then multiply by + // 10,000. + startingRegionID = int64(math.Ceil(float64(startingRegionID)/roundStartingRegionID) * roundStartingRegionID) + // This should never be hit but it's here just in case. + if startingRegionID < roundStartingRegionID { + startingRegionID = roundStartingRegionID + } + + return startingRegionID, maxRegionID +} + +var ( + lastDerpConflictMutex sync.Mutex + lastDerpConflictLog time.Time +) + +func derpMapper(logger slog.Logger, proxyHealth *proxyhealth.ProxyHealth) func(*tailcfg.DERPMap) *tailcfg.DERPMap { + return func(derpMap *tailcfg.DERPMap) *tailcfg.DERPMap { + derpMap = derpMap.Clone() + + // Find the starting region ID that we'll use for proxies. This must be + // deterministic based on the derp map. 
+ startingRegionID, largestRegionID := getProxyDERPStartingRegionID(derpMap) + if largestRegionID >= 1<<32 { + // Enforce an upper bound on the region ID. This shouldn't be hit in + // practice, but it's a good sanity check. + lastDerpConflictMutex.Lock() + shouldLog := lastDerpConflictLog.IsZero() || time.Since(lastDerpConflictLog) > time.Minute + if shouldLog { + lastDerpConflictLog = time.Now() + } + lastDerpConflictMutex.Unlock() + if shouldLog { + logger.Warn( + context.Background(), + "existing DERP region IDs are too large, proxy region IDs will not be populated in the derp map. Please ensure that all DERP region IDs are less than 2^32", + slog.F("largest_region_id", largestRegionID), + slog.F("max_region_id", 1<<32-1), + ) + return derpMap + } + } + + // Add all healthy proxies to the DERP map. + statusMap := proxyHealth.HealthStatus() + statusLoop: + for _, status := range statusMap { + if status.Status != proxyhealth.Healthy || !status.Proxy.DerpEnabled { + // Only add healthy proxies with DERP enabled to the DERP map. + continue + } + + u, err := url.Parse(status.Proxy.Url) + if err != nil { + // Not really any need to log, the proxy should be unreachable + // anyways and filtered out by the above condition. + continue + } + port := u.Port() + if port == "" { + port = "80" + if u.Scheme == "https" { + port = "443" + } + } + portInt, err := strconv.Atoi(port) + if err != nil { + // Not really any need to log, the proxy should be unreachable + // anyways and filtered out by the above condition. + continue + } + + // Sanity check that the region ID and code is unique. + // + // This should be impossible to hit as the IDs are enforced to be + // unique by the database and the computed ID is greater than any + // existing ID in the DERP map. 
+ regionID := int(startingRegionID) + int(status.Proxy.RegionID) + regionCode := fmt.Sprintf("coder_%s", strings.ToLower(status.Proxy.Name)) + for _, r := range derpMap.Regions { + if r.RegionID == regionID || r.RegionCode == regionCode { + // Log a warning if we haven't logged one in the last + // minute. + lastDerpConflictMutex.Lock() + shouldLog := lastDerpConflictLog.IsZero() || time.Since(lastDerpConflictLog) > time.Minute + if shouldLog { + lastDerpConflictLog = time.Now() + } + lastDerpConflictMutex.Unlock() + if shouldLog { + logger.Warn(context.Background(), + "proxy region ID or code conflict, ignoring workspace proxy for DERP map. Please change the flags on the affected proxy to use a different region ID and code", + slog.F("proxy_id", status.Proxy.ID), + slog.F("proxy_name", status.Proxy.Name), + slog.F("proxy_display_name", status.Proxy.DisplayName), + slog.F("proxy_url", status.Proxy.Url), + slog.F("proxy_region_id", status.Proxy.RegionID), + slog.F("proxy_computed_region_id", regionID), + slog.F("proxy_computed_region_code", regionCode), + ) + } + + continue statusLoop + } + } + + derpMap.Regions[regionID] = &tailcfg.DERPRegion{ + // EmbeddedRelay ONLY applies to the primary. 
+ EmbeddedRelay: false, + RegionID: regionID, + RegionCode: regionCode, + RegionName: status.Proxy.Name, + Nodes: []*tailcfg.DERPNode{{ + Name: fmt.Sprintf("%da", regionID), + RegionID: regionID, + HostName: u.Hostname(), + DERPPort: portInt, + STUNPort: -1, + ForceHTTP: u.Scheme == "http", + }}, + } + } + + return derpMap + } +} + // @Summary Get entitlements // @ID get-entitlements // @Security CoderSessionToken diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index e8eed329a29d0..92e0b627d60ae 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -55,6 +55,7 @@ type Options struct { NoDefaultQuietHoursSchedule bool DontAddLicense bool DontAddFirstUser bool + ReplicaSyncUpdateInterval time.Duration } // New constructs a codersdk client connected to an in-memory Enterprise API instance. @@ -86,7 +87,8 @@ func NewWithAPI(t *testing.T, options *Options) ( BrowserOnly: options.BrowserOnly, SCIMAPIKey: options.SCIMAPIKey, DERPServerRelayAddress: oop.AccessURL.String(), - DERPServerRegionID: oop.DERPMap.RegionIDs()[0], + DERPServerRegionID: oop.BaseDERPMap.RegionIDs()[0], + ReplicaSyncUpdateInterval: options.ReplicaSyncUpdateInterval, Options: oop, EntitlementsUpdateInterval: options.EntitlementsUpdateInterval, Keys: Keys, diff --git a/enterprise/coderd/coderdenttest/proxytest.go b/enterprise/coderd/coderdenttest/proxytest.go index baaa9a308b89a..2918659ac57c9 100644 --- a/enterprise/coderd/coderdenttest/proxytest.go +++ b/enterprise/coderd/coderdenttest/proxytest.go @@ -14,6 +14,7 @@ import ( "github.com/moby/moby/pkg/namesgenerator" "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "cdr.dev/slog" @@ -132,9 +133,15 @@ func NewWorkspaceProxy(t *testing.T, coderdAPI *coderd.API, owner *codersdk.Clie DisablePathApps: options.DisablePathApps, // We need a new registry 
to not conflict with the coderd internal // proxy metrics. - PrometheusRegistry: prometheus.NewRegistry(), + PrometheusRegistry: prometheus.NewRegistry(), + DERPEnabled: true, + DERPServerRelayAddress: accessURL.String(), }) require.NoError(t, err) + t.Cleanup(func() { + err := wssrv.Close() + assert.NoError(t, err) + }) mutex.Lock() handler = wssrv.Handler diff --git a/enterprise/coderd/replicas.go b/enterprise/coderd/replicas.go index b5c5af2743385..77e0c45aeff2c 100644 --- a/enterprise/coderd/replicas.go +++ b/enterprise/coderd/replicas.go @@ -24,7 +24,7 @@ func (api *API) replicas(rw http.ResponseWriter, r *http.Request) { return } - replicas := api.replicaManager.All() + replicas := api.replicaManager.AllPrimary() res := make([]codersdk.Replica, 0, len(replicas)) for _, replica := range replicas { res = append(res, convertReplica(replica)) diff --git a/enterprise/coderd/workspaceproxy.go b/enterprise/coderd/workspaceproxy.go index 8de4a95f02d93..cce631b18d844 100644 --- a/enterprise/coderd/workspaceproxy.go +++ b/enterprise/coderd/workspaceproxy.go @@ -4,6 +4,7 @@ import ( "context" "crypto/sha256" "database/sql" + "flag" "fmt" "net/http" "net/url" @@ -14,6 +15,7 @@ import ( "golang.org/x/xerrors" "cdr.dev/slog" + "github.com/coder/coder/buildinfo" agpl "github.com/coder/coder/coderd" "github.com/coder/coder/coderd/audit" "github.com/coder/coder/coderd/database" @@ -25,6 +27,7 @@ import ( "github.com/coder/coder/codersdk" "github.com/coder/coder/cryptorand" "github.com/coder/coder/enterprise/coderd/proxyhealth" + "github.com/coder/coder/enterprise/replicasync" "github.com/coder/coder/enterprise/wsproxy/wsproxysdk" ) @@ -347,10 +350,13 @@ func (api *API) postWorkspaceProxy(rw http.ResponseWriter, r *http.Request) { DisplayName: req.DisplayName, Icon: req.Icon, TokenHashedSecret: hashedSecret[:], - CreatedAt: database.Now(), - UpdatedAt: database.Now(), + // Enabled by default, but will be disabled on register if the proxy has + // it disabled. 
+ DerpEnabled: true, + CreatedAt: database.Now(), + UpdatedAt: database.Now(), }) - if database.IsUniqueViolation(err) { + if database.IsUniqueViolation(err, database.UniqueWorkspaceProxiesLowerNameIndex) { httpapi.Write(ctx, rw, http.StatusConflict, codersdk.Response{ Message: fmt.Sprintf("Workspace proxy with name %q already exists.", req.Name), }) @@ -489,13 +495,17 @@ func (api *API) workspaceProxyIssueSignedAppToken(rw http.ResponseWriter, r *htt // in the database and returns a signed token that can be used to authenticate // tokens. // +// This is called periodically by the proxy in the background (every 30s per +// replica) to ensure that the proxy is still registered and the corresponding +// replica table entry is refreshed. +// // @Summary Register workspace proxy // @ID register-workspace-proxy // @Security CoderSessionToken // @Accept json // @Produce json // @Tags Enterprise -// @Param request body wsproxysdk.RegisterWorkspaceProxyRequest true "Issue signed app token request" +// @Param request body wsproxysdk.RegisterWorkspaceProxyRequest true "Register workspace proxy request" // @Success 201 {object} wsproxysdk.RegisterWorkspaceProxyResponse // @Router /workspaceproxies/me/register [post] // @x-apidocgen {"skip": true} @@ -523,6 +533,17 @@ func (api *API) workspaceProxyRegister(rw http.ResponseWriter, r *http.Request) return } + // Version check should be forced in non-dev builds and when running in + // tests. 
+ shouldForceVersion := !buildinfo.IsDev() || flag.Lookup("test.v") != nil + if shouldForceVersion && req.Version != buildinfo.Version() { + httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{ + Message: "Version mismatch.", + Detail: fmt.Sprintf("Proxy version %q does not match primary server version %q", req.Version, buildinfo.Version()), + }) + return + } + if err := validateProxyURL(req.AccessURL); err != nil { httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{ Message: "URL is invalid.", @@ -541,11 +562,80 @@ func (api *API) workspaceProxyRegister(rw http.ResponseWriter, r *http.Request) } } - _, err := api.Database.RegisterWorkspaceProxy(ctx, database.RegisterWorkspaceProxyParams{ - ID: proxy.ID, - Url: req.AccessURL, - WildcardHostname: req.WildcardHostname, - }) + if req.ReplicaID == uuid.Nil { + httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{ + Message: "Replica ID is invalid.", + }) + return + } + + startingRegionID, _ := getProxyDERPStartingRegionID(api.Options.BaseDERPMap) + regionID := int32(startingRegionID) + proxy.RegionID + + err := api.Database.InTx(func(db database.Store) error { + // First, update the proxy's values in the database. + _, err := db.RegisterWorkspaceProxy(ctx, database.RegisterWorkspaceProxyParams{ + ID: proxy.ID, + Url: req.AccessURL, + DerpEnabled: req.DerpEnabled, + WildcardHostname: req.WildcardHostname, + }) + if err != nil { + return xerrors.Errorf("register workspace proxy: %w", err) + } + + // Second, find the replica that corresponds to this proxy and refresh + // it if it exists. If it doesn't exist, create it. + now := time.Now() + replica, err := db.GetReplicaByID(ctx, req.ReplicaID) + if err == nil { + // Replica exists, update it. + if replica.StoppedAt.Valid && !replica.StartedAt.IsZero() { + // If the replica deregistered, it shouldn't be able to + // re-register before restarting. 
+ // TODO: sadly this results in 500 when it should be 400 + return xerrors.Errorf("replica %s is marked stopped", replica.ID) + } + + replica, err = db.UpdateReplica(ctx, database.UpdateReplicaParams{ + ID: replica.ID, + UpdatedAt: now, + StartedAt: replica.StartedAt, + StoppedAt: replica.StoppedAt, + RelayAddress: req.ReplicaRelayAddress, + RegionID: regionID, + Hostname: req.ReplicaHostname, + Version: req.Version, + Error: req.ReplicaError, + DatabaseLatency: 0, + Primary: false, + }) + if err != nil { + return xerrors.Errorf("update replica: %w", err) + } + } else if xerrors.Is(err, sql.ErrNoRows) { + // Replica doesn't exist, create it. + replica, err = db.InsertReplica(ctx, database.InsertReplicaParams{ + ID: req.ReplicaID, + CreatedAt: now, + StartedAt: now, + UpdatedAt: now, + Hostname: req.ReplicaHostname, + RegionID: regionID, + RelayAddress: req.ReplicaRelayAddress, + Version: req.Version, + DatabaseLatency: 0, + Primary: false, + }) + if err != nil { + return xerrors.Errorf("insert replica: %w", err) + } + } else if err != nil { + return xerrors.Errorf("get replica: %w", err) + } + + return nil + }, nil) if httpapi.Is404Error(err) { httpapi.ResourceNotFound(rw) return @@ -555,39 +645,112 @@ func (api *API) workspaceProxyRegister(rw http.ResponseWriter, r *http.Request) return } + // Update replica sync and notify all other replicas to update their + // replica list. + err = api.replicaManager.PublishUpdate() + if err != nil { + httpapi.InternalServerError(rw, err) + return + } + replicaUpdateCtx, replicaUpdateCancel := context.WithTimeout(ctx, 5*time.Second) + defer replicaUpdateCancel() + err = api.replicaManager.UpdateNow(replicaUpdateCtx) + if err != nil { + httpapi.InternalServerError(rw, err) + return + } + + // Find sibling regions to respond with for derpmesh. 
+ siblings := api.replicaManager.InRegion(regionID) + siblingsRes := make([]codersdk.Replica, 0, len(siblings)) + for _, replica := range siblings { + if replica.ID == req.ReplicaID { + continue + } + siblingsRes = append(siblingsRes, convertReplica(replica)) + } + // aReq.New = updatedProxy httpapi.Write(ctx, rw, http.StatusCreated, wsproxysdk.RegisterWorkspaceProxyResponse{ - AppSecurityKey: api.AppSecurityKey.String(), + AppSecurityKey: api.AppSecurityKey.String(), + DERPMeshKey: api.DERPServer.MeshKey(), + DERPRegionID: regionID, + SiblingReplicas: siblingsRes, }) go api.forceWorkspaceProxyHealthUpdate(api.ctx) } -// workspaceProxyGoingAway is used to tell coderd that the workspace proxy is -// shutting down and going away. The main purpose of this function is for the -// health status of the workspace proxy to be more quickly updated when we know -// that the proxy is going to be unhealthy. This does not delete the workspace -// or cause any other side effects. -// If the workspace proxy comes back online, even without a register, it will -// be found healthy again by the normal checks. -// @Summary Workspace proxy going away -// @ID workspace-proxy-going-away +// @Summary Deregister workspace proxy +// @ID deregister-workspace-proxy // @Security CoderSessionToken -// @Produce json +// @Accept json // @Tags Enterprise -// @Success 201 {object} codersdk.Response -// @Router /workspaceproxies/me/goingaway [post] +// @Param request body wsproxysdk.DeregisterWorkspaceProxyRequest true "Deregister workspace proxy request" +// @Success 204 +// @Router /workspaceproxies/me/deregister [post] // @x-apidocgen {"skip": true} -func (api *API) workspaceProxyGoingAway(rw http.ResponseWriter, r *http.Request) { +func (api *API) workspaceProxyDeregister(rw http.ResponseWriter, r *http.Request) { ctx := r.Context() - // Force a health update to happen immediately. The proxy should - // not return a successful response if it is going away. 
- go api.forceWorkspaceProxyHealthUpdate(api.ctx) + var req wsproxysdk.DeregisterWorkspaceProxyRequest + if !httpapi.Read(ctx, rw, r, &req) { + return + } - httpapi.Write(ctx, rw, http.StatusOK, codersdk.Response{ - Message: "OK", - }) + err := api.Database.InTx(func(db database.Store) error { + now := time.Now() + replica, err := db.GetReplicaByID(ctx, req.ReplicaID) + if err != nil { + return xerrors.Errorf("get replica: %w", err) + } + + if replica.StoppedAt.Valid && !replica.StartedAt.IsZero() { + // TODO: sadly this results in 500 when it should be 400 + return xerrors.Errorf("replica %s is already marked stopped", replica.ID) + } + + replica, err = db.UpdateReplica(ctx, database.UpdateReplicaParams{ + ID: replica.ID, + UpdatedAt: now, + StartedAt: replica.StartedAt, + StoppedAt: sql.NullTime{ + Valid: true, + Time: now, + }, + RelayAddress: replica.RelayAddress, + RegionID: replica.RegionID, + Hostname: replica.Hostname, + Version: replica.Version, + Error: replica.Error, + DatabaseLatency: replica.DatabaseLatency, + Primary: replica.Primary, + }) + if err != nil { + return xerrors.Errorf("update replica: %w", err) + } + + return nil + }, nil) + if httpapi.Is404Error(err) { + httpapi.ResourceNotFound(rw) + return + } + if err != nil { + httpapi.InternalServerError(rw, err) + return + } + + // Publish a replicasync event with a nil ID so every replica (yes, even the + // current replica) will refresh its replicas list. 
+ err = api.Pubsub.Publish(replicasync.PubsubEvent, []byte(uuid.Nil.String())) + if err != nil { + httpapi.InternalServerError(rw, err) + return + } + + rw.WriteHeader(http.StatusNoContent) + go api.forceWorkspaceProxyHealthUpdate(api.ctx) } // reconnectingPTYSignedToken issues a signed app token for use when connecting @@ -670,7 +833,8 @@ func (api *API) reconnectingPTYSignedToken(rw http.ResponseWriter, r *http.Reque }, SessionToken: httpmw.APITokenFromRequest(r), // The following fields aren't required as long as the request is authed - // with a valid API key. + // with a valid API key, which we know since this endpoint is protected + // by auth middleware already. PathAppBaseURL: "", AppHostname: "", // The following fields are empty for terminal apps. @@ -733,10 +897,11 @@ func convertProxy(p database.WorkspaceProxy, status proxyhealth.ProxyStatus) cod status.Status = proxyhealth.Unknown } return codersdk.WorkspaceProxy{ - Region: convertRegion(p, status), - CreatedAt: p.CreatedAt, - UpdatedAt: p.UpdatedAt, - Deleted: p.Deleted, + Region: convertRegion(p, status), + DerpEnabled: p.DerpEnabled, + CreatedAt: p.CreatedAt, + UpdatedAt: p.UpdatedAt, + Deleted: p.Deleted, Status: codersdk.WorkspaceProxyStatus{ Status: codersdk.ProxyHealthStatus(status.Status), Report: status.Report, diff --git a/enterprise/coderd/workspaceproxy_test.go b/enterprise/coderd/workspaceproxy_test.go index 0f0de8dfe9e65..781ef3974ed15 100644 --- a/enterprise/coderd/workspaceproxy_test.go +++ b/enterprise/coderd/workspaceproxy_test.go @@ -17,7 +17,9 @@ import ( "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/agent" + "github.com/coder/coder/buildinfo" "github.com/coder/coder/coderd/coderdtest" + "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbtestutil" "github.com/coder/coder/coderd/workspaceapps" "github.com/coder/coder/codersdk" @@ -167,69 +169,6 @@ func TestRegions(t *testing.T) { require.Error(t, err) require.Empty(t, regions) 
}) - - t.Run("GoingAway", func(t *testing.T) { - t.Skip("This is flakey in CI because it relies on internal go routine timing. Should refactor.") - t.Parallel() - - dv := coderdtest.DeploymentValues(t) - dv.Experiments = []string{ - string(codersdk.ExperimentMoons), - "*", - } - - db, pubsub := dbtestutil.NewDB(t) - - ctx := testutil.Context(t, testutil.WaitLong) - - client, closer, api, _ := coderdenttest.NewWithAPI(t, &coderdenttest.Options{ - Options: &coderdtest.Options{ - AppHostname: appHostname, - Database: db, - Pubsub: pubsub, - DeploymentValues: dv, - }, - // The interval is set to 1 hour so the proxy health - // check will never happen manually. All checks will be - // forced updates. - ProxyHealthInterval: time.Hour, - LicenseOptions: &coderdenttest.LicenseOptions{ - Features: license.Features{ - codersdk.FeatureWorkspaceProxy: 1, - }, - }, - }) - t.Cleanup(func() { - _ = closer.Close() - }) - - const proxyName = "testproxy" - proxy := coderdenttest.NewWorkspaceProxy(t, api, client, &coderdenttest.ProxyOptions{ - Name: proxyName, - }) - _ = proxy - - require.Eventuallyf(t, func() bool { - proxy, err := client.WorkspaceProxyByName(ctx, proxyName) - if err != nil { - // We are testing the going away, not the initial healthy. - // Just force an update to change this to healthy. - _ = api.ProxyHealth.ForceUpdate(ctx) - return false - } - return proxy.Status.Status == codersdk.ProxyHealthy - }, testutil.WaitShort, testutil.IntervalFast, "proxy never became healthy") - - _ = proxy.Close() - // The proxy should tell the primary on close that is is no longer healthy. 
- require.Eventuallyf(t, func() bool { - proxy, err := client.WorkspaceProxyByName(ctx, proxyName) - if err != nil { - return false - } - return proxy.Status.Status == codersdk.ProxyUnhealthy - }, testutil.WaitShort, testutil.IntervalFast, "proxy never became unhealthy after close") - }) } func TestWorkspaceProxyCRUD(t *testing.T) { @@ -321,6 +260,386 @@ func TestWorkspaceProxyCRUD(t *testing.T) { }) } +func TestProxyRegisterDeregister(t *testing.T) { + t.Parallel() + + setup := func(t *testing.T) (*codersdk.Client, database.Store) { + dv := coderdtest.DeploymentValues(t) + dv.Experiments = []string{ + string(codersdk.ExperimentMoons), + "*", + } + + db, pubsub := dbtestutil.NewDB(t) + client, _ := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + DeploymentValues: dv, + Database: db, + Pubsub: pubsub, + IncludeProvisionerDaemon: true, + }, + ReplicaSyncUpdateInterval: time.Minute, + LicenseOptions: &coderdenttest.LicenseOptions{ + Features: license.Features{ + codersdk.FeatureWorkspaceProxy: 1, + }, + }, + }) + + return client, db + } + + t.Run("OK", func(t *testing.T) { + t.Parallel() + + client, db := setup(t) + + ctx := testutil.Context(t, testutil.WaitLong) + const ( + proxyName = "hello" + proxyDisplayName = "Hello World" + proxyIcon = "/emojis/flag.png" + ) + createRes, err := client.CreateWorkspaceProxy(ctx, codersdk.CreateWorkspaceProxyRequest{ + Name: proxyName, + DisplayName: proxyDisplayName, + Icon: proxyIcon, + }) + require.NoError(t, err) + + proxyClient := wsproxysdk.New(client.URL) + proxyClient.SetSessionToken(createRes.ProxyToken) + + // Register + req := wsproxysdk.RegisterWorkspaceProxyRequest{ + AccessURL: "https://proxy.coder.test", + WildcardHostname: "*.proxy.coder.test", + DerpEnabled: true, + ReplicaID: uuid.New(), + ReplicaHostname: "mars", + ReplicaError: "", + ReplicaRelayAddress: "http://127.0.0.1:8080", + Version: buildinfo.Version(), + } + registerRes1, err := proxyClient.RegisterWorkspaceProxy(ctx, req) 
+ require.NoError(t, err) + require.NotEmpty(t, registerRes1.AppSecurityKey) + require.NotEmpty(t, registerRes1.DERPMeshKey) + require.EqualValues(t, 10001, registerRes1.DERPRegionID) + require.Empty(t, registerRes1.SiblingReplicas) + + proxy, err := client.WorkspaceProxyByID(ctx, createRes.Proxy.ID) + require.NoError(t, err) + require.Equal(t, createRes.Proxy.ID, proxy.ID) + require.Equal(t, proxyName, proxy.Name) + require.Equal(t, proxyDisplayName, proxy.DisplayName) + require.Equal(t, proxyIcon, proxy.IconURL) + require.Equal(t, req.AccessURL, proxy.PathAppURL) + require.Equal(t, req.AccessURL, proxy.PathAppURL) + require.Equal(t, req.WildcardHostname, proxy.WildcardHostname) + require.Equal(t, req.DerpEnabled, proxy.DerpEnabled) + require.False(t, proxy.Deleted) + + // Get the replica from the DB. + replica, err := db.GetReplicaByID(ctx, req.ReplicaID) + require.NoError(t, err) + require.Equal(t, req.ReplicaID, replica.ID) + require.Equal(t, req.ReplicaHostname, replica.Hostname) + require.Equal(t, req.ReplicaError, replica.Error) + require.Equal(t, req.ReplicaRelayAddress, replica.RelayAddress) + require.Equal(t, req.Version, replica.Version) + require.EqualValues(t, 10001, replica.RegionID) + require.False(t, replica.StoppedAt.Valid) + require.Zero(t, replica.DatabaseLatency) + require.False(t, replica.Primary) + + // Re-register with most fields changed. + req = wsproxysdk.RegisterWorkspaceProxyRequest{ + AccessURL: "https://cool.proxy.coder.test", + WildcardHostname: "*.cool.proxy.coder.test", + DerpEnabled: false, + ReplicaID: req.ReplicaID, + ReplicaHostname: "venus", + ReplicaError: "error", + ReplicaRelayAddress: "http://127.0.0.1:9090", + Version: buildinfo.Version(), + } + registerRes2, err := proxyClient.RegisterWorkspaceProxy(ctx, req) + require.NoError(t, err) + require.Equal(t, registerRes1, registerRes2) + + // Get the proxy to ensure nothing has changed except updated_at. 
+ proxyNew, err := client.WorkspaceProxyByID(ctx, createRes.Proxy.ID) + require.NoError(t, err) + require.Equal(t, createRes.Proxy.ID, proxyNew.ID) + require.Equal(t, proxyName, proxyNew.Name) + require.Equal(t, proxyDisplayName, proxyNew.DisplayName) + require.Equal(t, proxyIcon, proxyNew.IconURL) + require.Equal(t, req.AccessURL, proxyNew.PathAppURL) + require.Equal(t, req.AccessURL, proxyNew.PathAppURL) + require.Equal(t, req.WildcardHostname, proxyNew.WildcardHostname) + require.Equal(t, req.DerpEnabled, proxyNew.DerpEnabled) + require.False(t, proxyNew.Deleted) + + // Get the replica from the DB and ensure the fields have been updated, + // especially the updated_at. + replica, err = db.GetReplicaByID(ctx, req.ReplicaID) + require.NoError(t, err) + require.Equal(t, req.ReplicaID, replica.ID) + require.Equal(t, req.ReplicaHostname, replica.Hostname) + require.Equal(t, req.ReplicaError, replica.Error) + require.Equal(t, req.ReplicaRelayAddress, replica.RelayAddress) + require.Equal(t, req.Version, replica.Version) + require.EqualValues(t, 10001, replica.RegionID) + require.False(t, replica.StoppedAt.Valid) + require.Zero(t, replica.DatabaseLatency) + require.False(t, replica.Primary) + + // Deregister + err = proxyClient.DeregisterWorkspaceProxy(ctx, wsproxysdk.DeregisterWorkspaceProxyRequest{ + ReplicaID: req.ReplicaID, + }) + require.NoError(t, err) + + // Ensure the replica has been fully stopped. 
+ replica, err = db.GetReplicaByID(ctx, req.ReplicaID) + require.NoError(t, err) + require.Equal(t, req.ReplicaID, replica.ID) + require.True(t, replica.StoppedAt.Valid) + + // Re-register should fail + _, err = proxyClient.RegisterWorkspaceProxy(ctx, wsproxysdk.RegisterWorkspaceProxyRequest{}) + require.Error(t, err) + }) + + t.Run("BlockMismatchingVersion", func(t *testing.T) { + t.Parallel() + + client, _ := setup(t) + + ctx := testutil.Context(t, testutil.WaitLong) + createRes, err := client.CreateWorkspaceProxy(ctx, codersdk.CreateWorkspaceProxyRequest{ + Name: "hi", + }) + require.NoError(t, err) + + proxyClient := wsproxysdk.New(client.URL) + proxyClient.SetSessionToken(createRes.ProxyToken) + + _, err = proxyClient.RegisterWorkspaceProxy(ctx, wsproxysdk.RegisterWorkspaceProxyRequest{ + AccessURL: "https://proxy.coder.test", + WildcardHostname: "*.proxy.coder.test", + DerpEnabled: true, + ReplicaID: uuid.New(), + ReplicaHostname: "mars", + ReplicaError: "", + ReplicaRelayAddress: "http://127.0.0.1:8080", + Version: "v0.0.0", + }) + require.Error(t, err) + var sdkErr *codersdk.Error + require.ErrorAs(t, err, &sdkErr) + require.Equal(t, http.StatusBadRequest, sdkErr.StatusCode()) + require.Contains(t, sdkErr.Response.Message, "Version mismatch") + }) + + t.Run("ReregisterUpdateReplica", func(t *testing.T) { + t.Parallel() + + client, db := setup(t) + + ctx := testutil.Context(t, testutil.WaitLong) + createRes, err := client.CreateWorkspaceProxy(ctx, codersdk.CreateWorkspaceProxyRequest{ + Name: "hi", + }) + require.NoError(t, err) + + proxyClient := wsproxysdk.New(client.URL) + proxyClient.SetSessionToken(createRes.ProxyToken) + + req := wsproxysdk.RegisterWorkspaceProxyRequest{ + AccessURL: "https://proxy.coder.test", + WildcardHostname: "*.proxy.coder.test", + DerpEnabled: true, + ReplicaID: uuid.New(), + ReplicaHostname: "mars", + ReplicaError: "", + ReplicaRelayAddress: "http://127.0.0.1:8080", + Version: buildinfo.Version(), + } + _, err = 
proxyClient.RegisterWorkspaceProxy(ctx, req) + require.NoError(t, err) + + // Get the replica from the DB. + replica, err := db.GetReplicaByID(ctx, req.ReplicaID) + require.NoError(t, err) + require.Equal(t, req.ReplicaID, replica.ID) + + time.Sleep(time.Millisecond) + + // Re-register with no changed fields. + _, err = proxyClient.RegisterWorkspaceProxy(ctx, req) + require.NoError(t, err) + + // Get the replica from the DB and make sure updated_at has changed. + replica, err = db.GetReplicaByID(ctx, req.ReplicaID) + require.NoError(t, err) + require.Equal(t, req.ReplicaID, replica.ID) + require.Greater(t, replica.UpdatedAt.UnixNano(), replica.CreatedAt.UnixNano()) + }) + + t.Run("DeregisterNonExistentReplica", func(t *testing.T) { + t.Parallel() + + client, _ := setup(t) + + ctx := testutil.Context(t, testutil.WaitLong) + createRes, err := client.CreateWorkspaceProxy(ctx, codersdk.CreateWorkspaceProxyRequest{ + Name: "hi", + }) + require.NoError(t, err) + + proxyClient := wsproxysdk.New(client.URL) + proxyClient.SetSessionToken(createRes.ProxyToken) + + err = proxyClient.DeregisterWorkspaceProxy(ctx, wsproxysdk.DeregisterWorkspaceProxyRequest{ + ReplicaID: uuid.New(), + }) + require.Error(t, err) + var sdkErr *codersdk.Error + require.ErrorAs(t, err, &sdkErr) + require.Equal(t, http.StatusNotFound, sdkErr.StatusCode()) + }) + + t.Run("ReturnSiblings", func(t *testing.T) { + t.Parallel() + + client, _ := setup(t) + + ctx := testutil.Context(t, testutil.WaitLong) + createRes1, err := client.CreateWorkspaceProxy(ctx, codersdk.CreateWorkspaceProxyRequest{ + Name: "one", + }) + require.NoError(t, err) + createRes2, err := client.CreateWorkspaceProxy(ctx, codersdk.CreateWorkspaceProxyRequest{ + Name: "two", + }) + require.NoError(t, err) + + // Register a replica on proxy 2. This shouldn't be returned by replicas + // for proxy 1. 
+ proxyClient2 := wsproxysdk.New(client.URL) + proxyClient2.SetSessionToken(createRes2.ProxyToken) + _, err = proxyClient2.RegisterWorkspaceProxy(ctx, wsproxysdk.RegisterWorkspaceProxyRequest{ + AccessURL: "https://other.proxy.coder.test", + WildcardHostname: "*.other.proxy.coder.test", + DerpEnabled: true, + ReplicaID: uuid.New(), + ReplicaHostname: "venus", + ReplicaError: "", + ReplicaRelayAddress: "http://127.0.0.1:9090", + Version: buildinfo.Version(), + }) + require.NoError(t, err) + + // Register replica 1. + proxyClient1 := wsproxysdk.New(client.URL) + proxyClient1.SetSessionToken(createRes1.ProxyToken) + req1 := wsproxysdk.RegisterWorkspaceProxyRequest{ + AccessURL: "https://one.proxy.coder.test", + WildcardHostname: "*.one.proxy.coder.test", + DerpEnabled: true, + ReplicaID: uuid.New(), + ReplicaHostname: "mars1", + ReplicaError: "", + ReplicaRelayAddress: "http://127.0.0.1:8081", + Version: buildinfo.Version(), + } + registerRes1, err := proxyClient1.RegisterWorkspaceProxy(ctx, req1) + require.NoError(t, err) + require.Empty(t, registerRes1.SiblingReplicas) + + // Register replica 2 and expect to get replica 1 as a sibling. 
+ req2 := wsproxysdk.RegisterWorkspaceProxyRequest{ + AccessURL: "https://two.proxy.coder.test", + WildcardHostname: "*.two.proxy.coder.test", + DerpEnabled: true, + ReplicaID: uuid.New(), + ReplicaHostname: "mars2", + ReplicaError: "", + ReplicaRelayAddress: "http://127.0.0.1:8082", + Version: buildinfo.Version(), + } + registerRes2, err := proxyClient1.RegisterWorkspaceProxy(ctx, req2) + require.NoError(t, err) + require.Len(t, registerRes2.SiblingReplicas, 1) + require.Equal(t, req1.ReplicaID, registerRes2.SiblingReplicas[0].ID) + require.Equal(t, req1.ReplicaHostname, registerRes2.SiblingReplicas[0].Hostname) + require.Equal(t, req1.ReplicaRelayAddress, registerRes2.SiblingReplicas[0].RelayAddress) + require.EqualValues(t, 10001, registerRes2.SiblingReplicas[0].RegionID) + + // Re-register replica 1 and expect to get replica 2 as a sibling. + registerRes1, err = proxyClient1.RegisterWorkspaceProxy(ctx, req1) + require.NoError(t, err) + require.Len(t, registerRes1.SiblingReplicas, 1) + require.Equal(t, req2.ReplicaID, registerRes1.SiblingReplicas[0].ID) + require.Equal(t, req2.ReplicaHostname, registerRes1.SiblingReplicas[0].Hostname) + require.Equal(t, req2.ReplicaRelayAddress, registerRes1.SiblingReplicas[0].RelayAddress) + require.EqualValues(t, 10001, registerRes1.SiblingReplicas[0].RegionID) + }) + + // ReturnSiblings2 tries to create 100 proxy replicas and ensures that they + // all return the correct number of siblings. 
+ t.Run("ReturnSiblings2", func(t *testing.T) { + t.Parallel() + + client, _ := setup(t) + ctx := testutil.Context(t, testutil.WaitLong) + + createRes, err := client.CreateWorkspaceProxy(ctx, codersdk.CreateWorkspaceProxyRequest{ + Name: "proxy", + }) + require.NoError(t, err) + + proxyClient := wsproxysdk.New(client.URL) + proxyClient.SetSessionToken(createRes.ProxyToken) + + for i := 0; i < 100; i++ { + ok := false + for j := 0; j < 2; j++ { + registerRes, err := proxyClient.RegisterWorkspaceProxy(ctx, wsproxysdk.RegisterWorkspaceProxyRequest{ + AccessURL: "https://proxy.coder.test", + WildcardHostname: "*.proxy.coder.test", + DerpEnabled: true, + ReplicaID: uuid.New(), + ReplicaHostname: "venus", + ReplicaError: "", + ReplicaRelayAddress: fmt.Sprintf("http://127.0.0.1:%d", 8080+i), + Version: buildinfo.Version(), + }) + require.NoErrorf(t, err, "register proxy %d", i) + + // If the sibling replica count is wrong, try again. The impact + // of this not being immediate is that proxies may not function + // as DERP relays until they register again in 30 seconds. + // + // In the real world, replicas will not be registering this + // quickly. Kubernetes rolls out gradually in practice. 
+ if len(registerRes.SiblingReplicas) != i { + t.Logf("%d: expected %d siblings, got %d", i, i, len(registerRes.SiblingReplicas)) + time.Sleep(100 * time.Millisecond) + continue + } + + ok = true + break + } + + require.True(t, ok, "expected to register replica %d", i) + } + }) +} + func TestIssueSignedAppToken(t *testing.T) { t.Parallel() diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index 3e1f7d866c9b0..42bf402a6682e 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -74,6 +74,7 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, ps pubsub.P RelayAddress: options.RelayAddress, Version: buildinfo.Version(), DatabaseLatency: int32(databaseLatency.Microseconds()), + Primary: true, }) if err != nil { return nil, xerrors.Errorf("insert replica: %w", err) @@ -125,6 +126,20 @@ type Manager struct { callback func() } +func (m *Manager) ID() uuid.UUID { + return m.id +} + +// UpdateNow synchronously updates replicas. +func (m *Manager) UpdateNow(ctx context.Context) error { + return m.syncReplicas(ctx) +} + +// PublishUpdate notifies all other replicas to update. +func (m *Manager) PublishUpdate() error { + return m.pubsub.Publish(PubsubEvent, []byte(m.id.String())) +} + // updateInterval is used to determine a replicas state. // If the replica was updated > the time, it's considered healthy. // If the replica was updated < the time, it's considered stale. @@ -299,13 +314,14 @@ func (m *Manager) syncReplicas(ctx context.Context) error { Version: m.self.Version, Error: replicaError, DatabaseLatency: int32(databaseLatency.Microseconds()), + Primary: m.self.Primary, }) if err != nil { return xerrors.Errorf("update replica: %w", err) } if m.self.Error != replica.Error { // Publish an update occurred! 
- err = m.pubsub.Publish(PubsubEvent, []byte(m.self.ID.String())) + err = m.PublishUpdate() if err != nil { return xerrors.Errorf("publish replica update: %w", err) } @@ -324,12 +340,17 @@ func (m *Manager) Self() database.Replica { return m.self } -// All returns every replica, including itself. -func (m *Manager) All() []database.Replica { +// AllPrimary returns every primary replica (not workspace proxy replicas), +// including itself. +func (m *Manager) AllPrimary() []database.Replica { m.mutex.Lock() defer m.mutex.Unlock() replicas := make([]database.Replica, 0, len(m.peers)) for _, replica := range append(m.peers, m.self) { + if !replica.Primary { + continue + } + // When we assign the non-pointer to a // variable it loses the reference. replica := replica @@ -338,13 +359,13 @@ func (m *Manager) All() []database.Replica { return replicas } -// Regional returns all replicas in the same region excluding itself. -func (m *Manager) Regional() []database.Replica { +// InRegion returns every replica in the given DERP region excluding itself. +func (m *Manager) InRegion(regionID int32) []database.Replica { m.mutex.Lock() defer m.mutex.Unlock() replicas := make([]database.Replica, 0) for _, replica := range m.peers { - if replica.RegionID != m.self.RegionID { + if replica.RegionID != regionID { continue } replicas = append(replicas, replica) @@ -352,6 +373,11 @@ func (m *Manager) Regional() []database.Replica { return replicas } +// Regional returns all replicas in the same region excluding itself. +func (m *Manager) Regional() []database.Replica { + return m.InRegion(m.self.RegionID) +} + // SetCallback sets a function to execute whenever new peers // are refreshed or updated. 
func (m *Manager) SetCallback(callback func()) { diff --git a/enterprise/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go index 741be64fa12cc..1f33075fd44b3 100644 --- a/enterprise/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -66,6 +66,7 @@ func TestReplica(t *testing.T) { UpdatedAt: database.Now(), Hostname: "something", RelayAddress: srv.URL, + Primary: true, }) require.NoError(t, err) ctx, cancelCtx := context.WithCancel(context.Background()) @@ -110,6 +111,7 @@ func TestReplica(t *testing.T) { UpdatedAt: database.Now(), Hostname: "something", RelayAddress: srv.URL, + Primary: true, }) require.NoError(t, err) ctx, cancelCtx := context.WithCancel(context.Background()) @@ -137,6 +139,7 @@ func TestReplica(t *testing.T) { Hostname: "something", // Fake address to dial! RelayAddress: "http://127.0.0.1:1", + Primary: true, }) require.NoError(t, err) ctx, cancelCtx := context.WithCancel(context.Background()) @@ -171,6 +174,7 @@ func TestReplica(t *testing.T) { ID: uuid.New(), RelayAddress: srv.URL, UpdatedAt: database.Now(), + Primary: true, }) require.NoError(t, err) // Publish multiple times to ensure it can handle that case. 
@@ -189,6 +193,7 @@ func TestReplica(t *testing.T) { _, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ ID: uuid.New(), UpdatedAt: database.Now().Add(-time.Hour), + Primary: true, }) require.NoError(t, err) ctx, cancelCtx := context.WithCancel(context.Background()) @@ -236,8 +241,7 @@ func TestReplica(t *testing.T) { server.SetCallback(func() { m.Lock() defer m.Unlock() - - if len(server.All()) != count { + if len(server.AllPrimary()) != count { return } if done { diff --git a/enterprise/wsproxy/wsproxy.go b/enterprise/wsproxy/wsproxy.go index b949290fe84c7..4536fe2bab7dc 100644 --- a/enterprise/wsproxy/wsproxy.go +++ b/enterprise/wsproxy/wsproxy.go @@ -2,9 +2,12 @@ package wsproxy import ( "context" + "crypto/tls" + "crypto/x509" "fmt" "net/http" "net/url" + "os" "reflect" "regexp" "strings" @@ -12,9 +15,13 @@ import ( "github.com/go-chi/chi/v5" "github.com/google/uuid" + "github.com/hashicorp/go-multierror" "github.com/prometheus/client_golang/prometheus" "go.opentelemetry.io/otel/trace" "golang.org/x/xerrors" + "tailscale.com/derp" + "tailscale.com/derp/derphttp" + "tailscale.com/types/key" "cdr.dev/slog" "github.com/coder/coder/buildinfo" @@ -25,9 +32,10 @@ import ( "github.com/coder/coder/coderd/workspaceapps" "github.com/coder/coder/coderd/wsconncache" "github.com/coder/coder/codersdk" + "github.com/coder/coder/enterprise/derpmesh" "github.com/coder/coder/enterprise/wsproxy/wsproxysdk" "github.com/coder/coder/site" - agpl "github.com/coder/coder/tailnet" + "github.com/coder/coder/tailnet" ) type Options struct { @@ -52,14 +60,16 @@ type Options struct { // options.AppHostname is set. 
AppHostnameRegex *regexp.Regexp - RealIPConfig *httpmw.RealIPConfig - + RealIPConfig *httpmw.RealIPConfig Tracing trace.TracerProvider PrometheusRegistry *prometheus.Registry + TLSCertificates []tls.Certificate - APIRateLimit int - SecureAuthCookie bool - DisablePathApps bool + APIRateLimit int + SecureAuthCookie bool + DisablePathApps bool + DERPEnabled bool + DERPServerRelayAddress string ProxySessionToken string // AllowAllCors will set all CORs headers to '*'. @@ -103,12 +113,14 @@ type Server struct { // the moon's token. SDKClient *wsproxysdk.Client - // TODO: Missing: - // - derpserver + // DERP + derpMesh *derpmesh.Mesh // Used for graceful shutdown. Required for the dialer. - ctx context.Context - cancel context.CancelFunc + ctx context.Context + cancel context.CancelFunc + derpCloseFunc func() + registerDone <-chan struct{} } // New creates a new workspace proxy server. This requires a primary coderd @@ -143,21 +155,33 @@ func New(ctx context.Context, opts *Options) (*Server, error) { return nil, xerrors.Errorf("%q is a workspace proxy, not a primary coderd instance", opts.DashboardURL) } - regResp, err := client.RegisterWorkspaceProxy(ctx, wsproxysdk.RegisterWorkspaceProxyRequest{ - AccessURL: opts.AccessURL.String(), - WildcardHostname: opts.AppHostname, - }) - if err != nil { - return nil, xerrors.Errorf("register proxy: %w", err) + meshRootCA := x509.NewCertPool() + for _, certificate := range opts.TLSCertificates { + for _, certificatePart := range certificate.Certificate { + certificate, err := x509.ParseCertificate(certificatePart) + if err != nil { + return nil, xerrors.Errorf("parse certificate %s: %w", certificate.Subject.CommonName, err) + } + meshRootCA.AddCert(certificate) + } } - - secKey, err := workspaceapps.KeyFromString(regResp.AppSecurityKey) - if err != nil { - return nil, xerrors.Errorf("parse app security key: %w", err) + // This TLS configuration spoofs access from the access URL hostname + // assuming that the certificates provided 
will cover that hostname. + // + // Replica sync and DERP meshing require accessing replicas via their + // internal IP addresses, and if TLS is configured we use the same + // certificates. + meshTLSConfig := &tls.Config{ + MinVersion: tls.VersionTLS12, + Certificates: opts.TLSCertificates, + RootCAs: meshRootCA, + ServerName: opts.AccessURL.Hostname(), } - r := chi.NewRouter() + derpServer := derp.NewServer(key.NewNode(), tailnet.Logger(opts.Logger.Named("derp"))) + ctx, cancel := context.WithCancel(context.Background()) + r := chi.NewRouter() s := &Server{ Options: opts, Handler: r, @@ -166,11 +190,50 @@ func New(ctx context.Context, opts *Options) (*Server, error) { TracerProvider: opts.Tracing, PrometheusRegistry: opts.PrometheusRegistry, SDKClient: client, + derpMesh: derpmesh.New(opts.Logger.Named("derpmesh"), derpServer, meshTLSConfig), ctx: ctx, cancel: cancel, } - connInfo, err := client.SDKClient.WorkspaceAgentConnectionInfo(ctx) + // Register the workspace proxy with the primary coderd instance and start a + // goroutine to periodically re-register. 
+ replicaID := uuid.New() + osHostname, err := os.Hostname() + if err != nil { + return nil, xerrors.Errorf("get OS hostname: %w", err) + } + regResp, registerDone, err := client.RegisterWorkspaceProxyLoop(ctx, wsproxysdk.RegisterWorkspaceProxyLoopOpts{ + Logger: opts.Logger, + Request: wsproxysdk.RegisterWorkspaceProxyRequest{ + AccessURL: opts.AccessURL.String(), + WildcardHostname: opts.AppHostname, + DerpEnabled: opts.DERPEnabled, + ReplicaID: replicaID, + ReplicaHostname: osHostname, + ReplicaError: "", + ReplicaRelayAddress: opts.DERPServerRelayAddress, + Version: buildinfo.Version(), + }, + MutateFn: s.mutateRegister, + CallbackFn: s.handleRegister, + FailureFn: s.handleRegisterFailure, + }) + if err != nil { + return nil, xerrors.Errorf("register proxy: %w", err) + } + s.registerDone = registerDone + err = s.handleRegister(ctx, regResp) + if err != nil { + return nil, xerrors.Errorf("handle register: %w", err) + } + derpServer.SetMeshKey(regResp.DERPMeshKey) + + secKey, err := workspaceapps.KeyFromString(regResp.AppSecurityKey) + if err != nil { + return nil, xerrors.Errorf("parse app security key: %w", err) + } + + connInfo, err := client.SDKClient.WorkspaceAgentConnectionInfoGeneric(ctx) if err != nil { return nil, xerrors.Errorf("get derpmap: %w", err) } @@ -216,6 +279,9 @@ func New(ctx context.Context, opts *Options) (*Server, error) { SecureAuthCookie: opts.SecureAuthCookie, } + derpHandler := derphttp.Handler(derpServer) + derpHandler, s.derpCloseFunc = tailnet.WithWebsocketSupport(derpServer, derpHandler) + // The primary coderd dashboard needs to make some GET requests to // the workspace proxies to check latency. corsMW := httpmw.Cors(opts.AllowAllCors, opts.DashboardURL.String()) @@ -266,6 +332,14 @@ func New(ctx context.Context, opts *Options) (*Server, error) { s.AppServer.Attach(r) }) + r.Route("/derp", func(r chi.Router) { + r.Get("/", derpHandler.ServeHTTP) + // This is used when UDP is blocked, and latency must be checked via HTTP(s). 
+ r.Get("/latency-check", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + }) + }) + r.Get("/api/v2/buildinfo", s.buildInfo) r.Get("/healthz", func(w http.ResponseWriter, r *http.Request) { _, _ = w.Write([]byte("OK")) }) // TODO: @emyrk should this be authenticated or debounced? @@ -295,20 +369,56 @@ func New(ctx context.Context, opts *Options) (*Server, error) { func (s *Server) Close() error { s.cancel() - // A timeout to prevent the SDK from blocking the server shutdown. - tmp, cancel := context.WithTimeout(context.Background(), 3*time.Second) - defer cancel() - _ = s.SDKClient.WorkspaceProxyGoingAway(tmp) - _ = s.AppServer.AgentProvider.Close() - - return s.AppServer.Close() + var err error + registerDoneWaitTicker := time.NewTicker(11 * time.Second) // the attempt timeout is 10s + select { + case <-registerDoneWaitTicker.C: + err = multierror.Append(err, xerrors.New("timed out waiting for registerDone")) + case <-s.registerDone: + } + s.derpCloseFunc() + appServerErr := s.AppServer.Close() + if appServerErr != nil { + err = multierror.Append(err, appServerErr) + } + agentProviderErr := s.AppServer.AgentProvider.Close() + if agentProviderErr != nil { + err = multierror.Append(err, agentProviderErr) + } + s.SDKClient.SDKClient.HTTPClient.CloseIdleConnections() + return err } func (s *Server) DialWorkspaceAgent(id uuid.UUID) (*codersdk.WorkspaceAgentConn, error) { return s.SDKClient.DialWorkspaceAgent(s.ctx, id, nil) } -func (s *Server) DialCoordinator(ctx context.Context) (agpl.MultiAgentConn, error) { +func (*Server) mutateRegister(_ *wsproxysdk.RegisterWorkspaceProxyRequest) { + // TODO: we should probably ping replicas similarly to the replicasync + // package in the primary and update req.ReplicaError accordingly. 
+} + +func (s *Server) handleRegister(_ context.Context, res wsproxysdk.RegisterWorkspaceProxyResponse) error { + addresses := make([]string, len(res.SiblingReplicas)) + for i, replica := range res.SiblingReplicas { + addresses[i] = replica.RelayAddress + } + s.derpMesh.SetAddresses(addresses, false) + + return nil +} + +func (s *Server) handleRegisterFailure(err error) { + if s.ctx.Err() != nil { + return + } + s.Logger.Fatal(s.ctx, + "failed to periodically re-register workspace proxy with primary Coder deployment", + slog.Error(err), + ) +} + +func (s *Server) DialCoordinator(ctx context.Context) (tailnet.MultiAgentConn, error) { return s.SDKClient.DialCoordinator(ctx) } diff --git a/enterprise/wsproxy/wsproxy_test.go b/enterprise/wsproxy/wsproxy_test.go index f918daa82736a..26c6fe418eb43 100644 --- a/enterprise/wsproxy/wsproxy_test.go +++ b/enterprise/wsproxy/wsproxy_test.go @@ -1,19 +1,376 @@ package wsproxy_test import ( + "fmt" "net" "testing" + "github.com/davecgh/go-spew/spew" + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "tailscale.com/tailcfg" + + "cdr.dev/slog" + "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/agent" "github.com/coder/coder/cli/clibase" "github.com/coder/coder/coderd" "github.com/coder/coder/coderd/coderdtest" + "github.com/coder/coder/coderd/healthcheck" "github.com/coder/coder/coderd/httpmw" "github.com/coder/coder/coderd/workspaceapps/apptest" "github.com/coder/coder/codersdk" + "github.com/coder/coder/codersdk/agentsdk" "github.com/coder/coder/enterprise/coderd/coderdenttest" "github.com/coder/coder/enterprise/coderd/license" + "github.com/coder/coder/provisioner/echo" + "github.com/coder/coder/testutil" ) +func TestDERP(t *testing.T) { + t.Parallel() + + deploymentValues := coderdtest.DeploymentValues(t) + deploymentValues.Experiments = []string{ + string(codersdk.ExperimentMoons), + "*", + } + + client, closer, api, user := coderdenttest.NewWithAPI(t, 
&coderdenttest.Options{ + Options: &coderdtest.Options{ + DeploymentValues: deploymentValues, + AppHostname: "*.primary.test.coder.com", + IncludeProvisionerDaemon: true, + RealIPConfig: &httpmw.RealIPConfig{ + TrustedOrigins: []*net.IPNet{{ + IP: net.ParseIP("127.0.0.1"), + Mask: net.CIDRMask(8, 32), + }}, + TrustedHeaders: []string{ + "CF-Connecting-IP", + }, + }, + }, + LicenseOptions: &coderdenttest.LicenseOptions{ + Features: license.Features{ + codersdk.FeatureWorkspaceProxy: 1, + }, + }, + }) + t.Cleanup(func() { + _ = closer.Close() + }) + + // Create two running external proxies. + proxyAPI1 := coderdenttest.NewWorkspaceProxy(t, api, client, &coderdenttest.ProxyOptions{ + Name: "best-proxy", + }) + proxyAPI2 := coderdenttest.NewWorkspaceProxy(t, api, client, &coderdenttest.ProxyOptions{ + Name: "worst-proxy", + }) + + // Create a proxy that is never started. + createProxyCtx := testutil.Context(t, testutil.WaitLong) + _, err := client.CreateWorkspaceProxy(createProxyCtx, codersdk.CreateWorkspaceProxyRequest{ + Name: "never-started-proxy", + }) + require.NoError(t, err) + + // Wait for both running proxies to become healthy. + require.Eventually(t, func() bool { + healthCtx := testutil.Context(t, testutil.WaitLong) + err := api.ProxyHealth.ForceUpdate(healthCtx) + if !assert.NoError(t, err) { + return false + } + + regions, err := client.Regions(healthCtx) + if !assert.NoError(t, err) { + return false + } + if !assert.Len(t, regions, 4) { + return false + } + + // The first 3 regions should be healthy. + for _, r := range regions[:3] { + if !r.Healthy { + return false + } + } + + // The last region should never be healthy. 
+ assert.False(t, regions[3].Healthy) + return true + }, testutil.WaitLong, testutil.IntervalMedium) + + // Create a workspace + apps + authToken := uuid.NewString() + version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ + Parse: echo.ParseComplete, + ProvisionApply: echo.ProvisionApplyWithAgent(authToken), + }) + template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + coderdtest.AwaitTemplateVersionJob(t, client, version.ID) + workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID) + build := coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) + workspace.LatestBuild = build + + agentID := uuid.Nil +resourceLoop: + for _, res := range build.Resources { + for _, agnt := range res.Agents { + agentID = agnt.ID + break resourceLoop + } + } + require.NotEqual(t, uuid.Nil, agentID) + + // Connect an agent to the workspace + agentClient := agentsdk.New(client.URL) + agentClient.SetSessionToken(authToken) + agentCloser := agent.New(agent.Options{ + Client: agentClient, + Logger: slogtest.Make(t, nil).Named("agent").Leveled(slog.LevelDebug), + }) + t.Cleanup(func() { + _ = agentCloser.Close() + }) + coderdtest.AwaitWorkspaceAgents(t, client, workspace.ID) + + t.Run("ReturnedInDERPMap", func(t *testing.T) { + t.Parallel() + + ctx := testutil.Context(t, testutil.WaitLong) + connInfo, err := client.WorkspaceAgentConnectionInfo(ctx, agentID) + require.NoError(t, err) + + // There should be three DERP servers in the map: the primary, and each + // of the two running proxies. 
+ require.NotNil(t, connInfo.DERPMap) + require.Len(t, connInfo.DERPMap.Regions, 3) + + var ( + primaryRegion *tailcfg.DERPRegion + proxy1Region *tailcfg.DERPRegion + proxy2Region *tailcfg.DERPRegion + ) + for _, r := range connInfo.DERPMap.Regions { + if r.EmbeddedRelay { + primaryRegion = r + continue + } + if r.RegionName == "best-proxy" { + proxy1Region = r + continue + } + if r.RegionName == "worst-proxy" { + proxy2Region = r + continue + } + + t.Fatalf("unexpected region: %+v", r) + } + + // The primary region: + require.Equal(t, "Coder Embedded Relay", primaryRegion.RegionName) + require.Equal(t, "coder", primaryRegion.RegionCode) + require.Equal(t, 999, primaryRegion.RegionID) + require.True(t, primaryRegion.EmbeddedRelay) + + // The first proxy region: + require.Equal(t, "best-proxy", proxy1Region.RegionName) + require.Equal(t, "coder_best-proxy", proxy1Region.RegionCode) + require.Equal(t, 10001, proxy1Region.RegionID) + require.False(t, proxy1Region.EmbeddedRelay) + require.Len(t, proxy1Region.Nodes, 1) + require.Equal(t, "10001a", proxy1Region.Nodes[0].Name) + require.Equal(t, 10001, proxy1Region.Nodes[0].RegionID) + require.Equal(t, proxyAPI1.Options.AccessURL.Hostname(), proxy1Region.Nodes[0].HostName) + require.Equal(t, proxyAPI1.Options.AccessURL.Port(), fmt.Sprint(proxy1Region.Nodes[0].DERPPort)) + require.Equal(t, proxyAPI1.Options.AccessURL.Scheme == "http", proxy1Region.Nodes[0].ForceHTTP) + + // The second proxy region: + require.Equal(t, "worst-proxy", proxy2Region.RegionName) + require.Equal(t, "coder_worst-proxy", proxy2Region.RegionCode) + require.Equal(t, 10002, proxy2Region.RegionID) + require.False(t, proxy2Region.EmbeddedRelay) + require.Len(t, proxy2Region.Nodes, 1) + require.Equal(t, "10002a", proxy2Region.Nodes[0].Name) + require.Equal(t, 10002, proxy2Region.Nodes[0].RegionID) + require.Equal(t, proxyAPI2.Options.AccessURL.Hostname(), proxy2Region.Nodes[0].HostName) + require.Equal(t, proxyAPI2.Options.AccessURL.Port(), 
fmt.Sprint(proxy2Region.Nodes[0].DERPPort)) + require.Equal(t, proxyAPI2.Options.AccessURL.Scheme == "http", proxy2Region.Nodes[0].ForceHTTP) + }) + + t.Run("ConnectDERP", func(t *testing.T) { + t.Parallel() + + connInfo, err := client.WorkspaceAgentConnectionInfo(testutil.Context(t, testutil.WaitLong), agentID) + require.NoError(t, err) + require.NotNil(t, connInfo.DERPMap) + require.Len(t, connInfo.DERPMap.Regions, 3) + + // Connect to each region. + for _, r := range connInfo.DERPMap.Regions { + r := r + + t.Run(r.RegionName, func(t *testing.T) { + t.Parallel() + + derpMap := &tailcfg.DERPMap{ + Regions: map[int]*tailcfg.DERPRegion{ + r.RegionID: r, + }, + OmitDefaultRegions: true, + } + + ctx := testutil.Context(t, testutil.WaitLong) + report := healthcheck.DERPReport{} + report.Run(ctx, &healthcheck.DERPReportOptions{ + DERPMap: derpMap, + }) + + t.Log("healthcheck report: " + spew.Sdump(&report)) + require.True(t, report.Healthy, "healthcheck failed, see report dump") + }) + } + }) +} + +func TestDERPEndToEnd(t *testing.T) { + t.Parallel() + + deploymentValues := coderdtest.DeploymentValues(t) + deploymentValues.Experiments = []string{ + string(codersdk.ExperimentMoons), + "*", + } + + client, closer, api, user := coderdenttest.NewWithAPI(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + DeploymentValues: deploymentValues, + AppHostname: "*.primary.test.coder.com", + IncludeProvisionerDaemon: true, + RealIPConfig: &httpmw.RealIPConfig{ + TrustedOrigins: []*net.IPNet{{ + IP: net.ParseIP("127.0.0.1"), + Mask: net.CIDRMask(8, 32), + }}, + TrustedHeaders: []string{ + "CF-Connecting-IP", + }, + }, + }, + LicenseOptions: &coderdenttest.LicenseOptions{ + Features: license.Features{ + codersdk.FeatureWorkspaceProxy: 1, + }, + }, + }) + t.Cleanup(func() { + _ = closer.Close() + }) + + coderdenttest.NewWorkspaceProxy(t, api, client, &coderdenttest.ProxyOptions{ + Name: "best-proxy", + }) + + // Wait for the proxy to become healthy. 
+ require.Eventually(t, func() bool { + healthCtx := testutil.Context(t, testutil.WaitLong) + err := api.ProxyHealth.ForceUpdate(healthCtx) + if !assert.NoError(t, err) { + return false + } + + regions, err := client.Regions(healthCtx) + if !assert.NoError(t, err) { + return false + } + if !assert.Len(t, regions, 2) { + return false + } + for _, r := range regions { + if !r.Healthy { + return false + } + } + return true + }, testutil.WaitLong, testutil.IntervalMedium) + + // Swap out the DERPMapper for a fake one that only returns the proxy. This + // allows us to force the agent to pick the proxy as its preferred region. + oldDERPMapper := *api.AGPL.DERPMapper.Load() + newDERPMapper := func(derpMap *tailcfg.DERPMap) *tailcfg.DERPMap { + derpMap = oldDERPMapper(derpMap) + // Strip everything but the proxy, which is region ID 10001. + derpMap.Regions = map[int]*tailcfg.DERPRegion{ + 10001: derpMap.Regions[10001], + } + derpMap.OmitDefaultRegions = true + return derpMap + } + api.AGPL.DERPMapper.Store(&newDERPMapper) + + // Create a workspace + apps + authToken := uuid.NewString() + version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ + Parse: echo.ParseComplete, + ProvisionApply: echo.ProvisionApplyWithAgent(authToken), + }) + template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + coderdtest.AwaitTemplateVersionJob(t, client, version.ID) + workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID) + build := coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) + workspace.LatestBuild = build + + agentID := uuid.Nil +resourceLoop: + for _, res := range build.Resources { + for _, agnt := range res.Agents { + agentID = agnt.ID + break resourceLoop + } + } + require.NotEqual(t, uuid.Nil, agentID) + + // Connect an agent to the workspace + agentClient := agentsdk.New(client.URL) + agentClient.SetSessionToken(authToken) + agentCloser := agent.New(agent.Options{ 
+ Client: agentClient, + Logger: slogtest.Make(t, nil).Named("agent").Leveled(slog.LevelDebug), + }) + t.Cleanup(func() { + _ = agentCloser.Close() + }) + coderdtest.AwaitWorkspaceAgents(t, client, workspace.ID) + + // Connect to the workspace agent. + ctx := testutil.Context(t, testutil.WaitLong) + conn, err := client.DialWorkspaceAgent(ctx, agentID, &codersdk.DialWorkspaceAgentOptions{ + Logger: slogtest.Make(t, &slogtest.Options{ + IgnoreErrors: true, + }).Named("client").Leveled(slog.LevelDebug), + // Force DERP. + BlockEndpoints: true, + }) + require.NoError(t, err) + t.Cleanup(func() { + err := conn.Close() + assert.NoError(t, err) + }) + + ok := conn.AwaitReachable(ctx) + require.True(t, ok) + + _, p2p, _, err := conn.Ping(ctx) + require.NoError(t, err) + require.False(t, p2p) +} + func TestWorkspaceProxyWorkspaceApps_Wsconncache(t *testing.T) { t.Parallel() @@ -27,7 +384,7 @@ func TestWorkspaceProxyWorkspaceApps_Wsconncache(t *testing.T) { "*", } - client, _, api, user := coderdenttest.NewWithAPI(t, &coderdenttest.Options{ + client, closer, api, user := coderdenttest.NewWithAPI(t, &coderdenttest.Options{ Options: &coderdtest.Options{ DeploymentValues: deploymentValues, AppHostname: "*.primary.test.coder.com", @@ -48,6 +405,9 @@ func TestWorkspaceProxyWorkspaceApps_Wsconncache(t *testing.T) { }, }, }) + t.Cleanup(func() { + _ = closer.Close() + }) // Create the external proxy if opts.DisableSubdomainApps { diff --git a/enterprise/wsproxy/wsproxysdk/wsproxysdk.go b/enterprise/wsproxy/wsproxysdk/wsproxysdk.go index 5703114281096..f98ab3673eadd 100644 --- a/enterprise/wsproxy/wsproxysdk/wsproxysdk.go +++ b/enterprise/wsproxy/wsproxysdk/wsproxysdk.go @@ -21,7 +21,7 @@ import ( "github.com/coder/coder/coderd/httpmw" "github.com/coder/coder/coderd/workspaceapps" "github.com/coder/coder/codersdk" - agpl "github.com/coder/coder/tailnet" + "github.com/coder/coder/tailnet" ) // Client is a HTTP client for a subset of Coder API routes that external @@ -157,10 +157,39 
@@ type RegisterWorkspaceProxyRequest struct { AccessURL string `json:"access_url"` // WildcardHostname that the workspace proxy api is serving for subdomain apps. WildcardHostname string `json:"wildcard_hostname"` + // DerpEnabled indicates whether the proxy should be included in the DERP + // map or not. + DerpEnabled bool `json:"derp_enabled"` + + // ReplicaID is a unique identifier for the replica of the proxy that is + // registering. It should be generated by the client on startup and + // persisted (in memory only) until the process is restarted. + ReplicaID uuid.UUID `json:"replica_id"` + // ReplicaHostname is the OS hostname of the machine that the proxy is running + // on. This is only used for tracking purposes in the replicas table. + ReplicaHostname string `json:"hostname"` + // ReplicaError is the error that the replica encountered when trying to + // dial it's peers. This is stored in the replicas table for debugging + // purposes but does not affect the proxy's ability to register. + // + // This value is only stored on subsequent requests to the register + // endpoint, not the first request. + ReplicaError string `json:"replica_error"` + // ReplicaRelayAddress is the DERP address of the replica that other + // replicas may use to connect internally for DERP meshing. + ReplicaRelayAddress string `json:"replica_relay_address"` + + // Version is the Coder version of the proxy. + Version string `json:"version"` } type RegisterWorkspaceProxyResponse struct { AppSecurityKey string `json:"app_security_key"` + DERPMeshKey string `json:"derp_mesh_key"` + DERPRegionID int32 `json:"derp_region_id"` + // SiblingReplicas is a list of all other replicas of the proxy that have + // not timed out. 
+ SiblingReplicas []codersdk.Replica `json:"sibling_replicas"` } func (c *Client) RegisterWorkspaceProxy(ctx context.Context, req RegisterWorkspaceProxyRequest) (RegisterWorkspaceProxyResponse, error) { @@ -180,22 +209,183 @@ func (c *Client) RegisterWorkspaceProxy(ctx context.Context, req RegisterWorkspa return resp, json.NewDecoder(res.Body).Decode(&resp) } -func (c *Client) WorkspaceProxyGoingAway(ctx context.Context) error { +type DeregisterWorkspaceProxyRequest struct { + // ReplicaID is a unique identifier for the replica of the proxy that is + // deregistering. It should be generated by the client on startup and + // should've already been passed to the register endpoint. + ReplicaID uuid.UUID `json:"replica_id"` +} + +func (c *Client) DeregisterWorkspaceProxy(ctx context.Context, req DeregisterWorkspaceProxyRequest) error { res, err := c.Request(ctx, http.MethodPost, - "/api/v2/workspaceproxies/me/goingaway", - nil, + "/api/v2/workspaceproxies/me/deregister", + req, ) if err != nil { return xerrors.Errorf("make request: %w", err) } defer res.Body.Close() - if res.StatusCode != http.StatusOK { + if res.StatusCode != http.StatusNoContent { return codersdk.ReadBodyAsError(res) } return nil } +type RegisterWorkspaceProxyLoopOpts struct { + Logger slog.Logger + Request RegisterWorkspaceProxyRequest + + // Interval between registration attempts. Defaults to 30 seconds. Note that + // the initial registration is not delayed by this interval. + Interval time.Duration + // MaxFailureCount is the maximum amount of attempts that the loop will + // retry registration before giving up. Defaults to 10 (for ~5 minutes). + MaxFailureCount int + // AttemptTimeout is the maximum amount of time that the loop will wait for + // a response from the server before considering the attempt a failure. + // Defaults to 10 seconds. + AttemptTimeout time.Duration + + // MutateFn is called before each request to mutate the request struct. 
This + // can be used to update fields like ReplicaError. + MutateFn func(req *RegisterWorkspaceProxyRequest) + // CallbackFn is called with the response from the server after each + // successful registration, except the first. The callback function is + // called in a blocking manner, so it should avoid blocking for too long. If + // the callback returns an error, the loop will stop immediately and the + // error will be returned to the FailureFn. + CallbackFn func(ctx context.Context, res RegisterWorkspaceProxyResponse) error + // FailureFn is called with the last error returned from the server if the + // context is canceled, registration fails for more than MaxFailureCount, + // or if any permanent values in the response change. + FailureFn func(err error) +} + +// RegisterWorkspaceProxyLoop will register the workspace proxy and then start a +// goroutine to keep registering periodically in the background. +// +// The first response is returned immediately, and subsequent responses will be +// notified to the given CallbackFn. When the context is canceled the loop will +// stop immediately and the context error will be returned to the FailureFn. +// +// The returned channel will be closed when the loop stops and can be used to +// ensure the loop is dead before continuing. When a fatal error is encountered, +// the proxy will be deregistered (with the same ReplicaID and AttemptTimeout) +// before calling the FailureFn. 
+func (c *Client) RegisterWorkspaceProxyLoop(ctx context.Context, opts RegisterWorkspaceProxyLoopOpts) (RegisterWorkspaceProxyResponse, <-chan struct{}, error) { + if opts.Interval == 0 { + opts.Interval = 30 * time.Second + } + if opts.MaxFailureCount == 0 { + opts.MaxFailureCount = 10 + } + if opts.AttemptTimeout == 0 { + opts.AttemptTimeout = 10 * time.Second + } + if opts.MutateFn == nil { + opts.MutateFn = func(_ *RegisterWorkspaceProxyRequest) {} + } + if opts.CallbackFn == nil { + opts.CallbackFn = func(_ context.Context, _ RegisterWorkspaceProxyResponse) error { + return nil + } + } + + failureFn := func(err error) { + // We have to use background context here because the original context + // may be canceled. + deregisterCtx, cancel := context.WithTimeout(context.Background(), opts.AttemptTimeout) + defer cancel() + deregisterErr := c.DeregisterWorkspaceProxy(deregisterCtx, DeregisterWorkspaceProxyRequest{ + ReplicaID: opts.Request.ReplicaID, + }) + if deregisterErr != nil { + opts.Logger.Error(ctx, + "failed to deregister workspace proxy with Coder primary (it will be automatically deregistered shortly)", + slog.Error(deregisterErr), + ) + } + + if opts.FailureFn != nil { + opts.FailureFn(err) + } + } + + originalRes, err := c.RegisterWorkspaceProxy(ctx, opts.Request) + if err != nil { + return RegisterWorkspaceProxyResponse{}, nil, xerrors.Errorf("register workspace proxy: %w", err) + } + + done := make(chan struct{}) + go func() { + defer close(done) + + var ( + failedAttempts = 0 + ticker = time.NewTicker(opts.Interval) + ) + for { + select { + case <-ctx.Done(): + failureFn(ctx.Err()) + return + case <-ticker.C: + } + + opts.Logger.Debug(ctx, + "re-registering workspace proxy with Coder primary", + slog.F("req", opts.Request), + slog.F("timeout", opts.AttemptTimeout), + slog.F("failed_attempts", failedAttempts), + ) + opts.MutateFn(&opts.Request) + registerCtx, cancel := context.WithTimeout(ctx, opts.AttemptTimeout) + res, err := 
c.RegisterWorkspaceProxy(registerCtx, opts.Request) + cancel() + if err != nil { + failedAttempts++ + opts.Logger.Warn(ctx, + "failed to re-register workspace proxy with Coder primary", + slog.F("req", opts.Request), + slog.F("timeout", opts.AttemptTimeout), + slog.F("failed_attempts", failedAttempts), + slog.Error(err), + ) + + if failedAttempts > opts.MaxFailureCount { + failureFn(xerrors.Errorf("exceeded re-registration failure count of %d: last error: %w", opts.MaxFailureCount, err)) + return + } + continue + } + failedAttempts = 0 + + if res.AppSecurityKey != originalRes.AppSecurityKey { + failureFn(xerrors.New("app security key has changed, proxy must be restarted")) + return + } + if res.DERPMeshKey != originalRes.DERPMeshKey { + failureFn(xerrors.New("DERP mesh key has changed, proxy must be restarted")) + return + } + if res.DERPRegionID != originalRes.DERPRegionID { + failureFn(xerrors.New("DERP region ID has changed, proxy must be restarted")) + } + + err = opts.CallbackFn(ctx, res) + if err != nil { + failureFn(xerrors.Errorf("callback fn returned error: %w", err)) + return + } + + ticker.Reset(opts.Interval) + } + }() + + return originalRes, done, nil +} + type CoordinateMessageType int const ( @@ -207,14 +397,14 @@ const ( type CoordinateMessage struct { Type CoordinateMessageType `json:"type"` AgentID uuid.UUID `json:"agent_id"` - Node *agpl.Node `json:"node"` + Node *tailnet.Node `json:"node"` } type CoordinateNodes struct { - Nodes []*agpl.Node + Nodes []*tailnet.Node } -func (c *Client) DialCoordinator(ctx context.Context) (agpl.MultiAgentConn, error) { +func (c *Client) DialCoordinator(ctx context.Context) (tailnet.MultiAgentConn, error) { ctx, cancel := context.WithCancel(ctx) coordinateURL, err := c.SDKClient.URL.Parse("/api/v2/workspaceproxies/me/coordinate") @@ -248,7 +438,7 @@ func (c *Client) DialCoordinator(ctx context.Context) (agpl.MultiAgentConn, erro legacyAgentCache: map[uuid.UUID]bool{}, } - ma := (&agpl.MultiAgent{ + ma := 
(&tailnet.MultiAgent{ ID: uuid.New(), AgentIsLegacyFunc: rma.AgentIsLegacy, OnSubscribe: rma.OnSubscribe, @@ -300,7 +490,7 @@ func (a *remoteMultiAgentHandler) writeJSON(v interface{}) error { // Set a deadline so that hung connections don't put back pressure on the system. // Node updates are tiny, so even the dinkiest connection can handle them if it's not hung. - err = a.nc.SetWriteDeadline(time.Now().Add(agpl.WriteTimeout)) + err = a.nc.SetWriteDeadline(time.Now().Add(tailnet.WriteTimeout)) if err != nil { return xerrors.Errorf("set write deadline: %w", err) } @@ -322,21 +512,21 @@ func (a *remoteMultiAgentHandler) writeJSON(v interface{}) error { return nil } -func (a *remoteMultiAgentHandler) OnNodeUpdate(_ uuid.UUID, node *agpl.Node) error { +func (a *remoteMultiAgentHandler) OnNodeUpdate(_ uuid.UUID, node *tailnet.Node) error { return a.writeJSON(CoordinateMessage{ Type: CoordinateMessageTypeNodeUpdate, Node: node, }) } -func (a *remoteMultiAgentHandler) OnSubscribe(_ agpl.Queue, agentID uuid.UUID) (*agpl.Node, error) { +func (a *remoteMultiAgentHandler) OnSubscribe(_ tailnet.Queue, agentID uuid.UUID) (*tailnet.Node, error) { return nil, a.writeJSON(CoordinateMessage{ Type: CoordinateMessageTypeSubscribe, AgentID: agentID, }) } -func (a *remoteMultiAgentHandler) OnUnsubscribe(_ agpl.Queue, agentID uuid.UUID) error { +func (a *remoteMultiAgentHandler) OnUnsubscribe(_ tailnet.Queue, agentID uuid.UUID) error { return a.writeJSON(CoordinateMessage{ Type: CoordinateMessageTypeUnsubscribe, AgentID: agentID, diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index d5f60f4b19ed9..5cb0ab071d500 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -1446,6 +1446,7 @@ export interface WorkspaceOptions { // From codersdk/workspaceproxy.go export interface WorkspaceProxy extends Region { + readonly derp_enabled: boolean readonly status?: WorkspaceProxyStatus readonly created_at: string readonly updated_at: string 
diff --git a/site/src/testHelpers/entities.ts b/site/src/testHelpers/entities.ts index ad7259c05a76b..a8e1baaf0056f 100644 --- a/site/src/testHelpers/entities.ts +++ b/site/src/testHelpers/entities.ts @@ -79,6 +79,7 @@ export const MockPrimaryWorkspaceProxy: TypesGen.WorkspaceProxy = { healthy: true, path_app_url: "https://coder.com", wildcard_hostname: "*.coder.com", + derp_enabled: true, created_at: new Date().toISOString(), updated_at: new Date().toISOString(), deleted: false, @@ -96,6 +97,7 @@ export const MockHealthyWildWorkspaceProxy: TypesGen.WorkspaceProxy = { healthy: true, path_app_url: "https://external.com", wildcard_hostname: "*.external.com", + derp_enabled: true, created_at: new Date().toISOString(), updated_at: new Date().toISOString(), deleted: false, @@ -113,6 +115,7 @@ export const MockUnhealthyWildWorkspaceProxy: TypesGen.WorkspaceProxy = { healthy: false, path_app_url: "https://unhealthy.coder.com", wildcard_hostname: "*unhealthy..coder.com", + derp_enabled: true, created_at: new Date().toISOString(), updated_at: new Date().toISOString(), deleted: false, @@ -138,6 +141,7 @@ export const MockWorkspaceProxies: TypesGen.WorkspaceProxy[] = [ healthy: true, path_app_url: "https://cowboy.coder.com", wildcard_hostname: "", + derp_enabled: false, created_at: new Date().toISOString(), updated_at: new Date().toISOString(), deleted: false, diff --git a/tailnet/conn_test.go b/tailnet/conn_test.go index 0dd0812b94777..99a88fabb2263 100644 --- a/tailnet/conn_test.go +++ b/tailnet/conn_test.go @@ -195,3 +195,124 @@ func TestConn_PreferredDERP(t *testing.T) { t.Fatal("timed out waiting for node") } } + +// TestConn_UpdateDERP tests that when update the DERP map we pick a new +// preferred DERP server and new connections can be made from clients. 
+func TestConn_UpdateDERP(t *testing.T) { + t.Parallel() + logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug) + + derpMap1, _ := tailnettest.RunDERPAndSTUN(t) + ip := tailnet.IP() + conn, err := tailnet.NewConn(&tailnet.Options{ + Addresses: []netip.Prefix{netip.PrefixFrom(ip, 128)}, + Logger: logger.Named("w1"), + DERPMap: derpMap1, + BlockEndpoints: true, + }) + require.NoError(t, err) + defer func() { + err := conn.Close() + assert.NoError(t, err) + }() + + // Buffer channel so callback doesn't block + nodes := make(chan *tailnet.Node, 50) + conn.SetNodeCallback(func(node *tailnet.Node) { + nodes <- node + }) + + ctx1, cancel1 := context.WithTimeout(context.Background(), testutil.WaitShort) + defer cancel1() + select { + case node := <-nodes: + require.Equal(t, 1, node.PreferredDERP) + case <-ctx1.Done(): + t.Fatal("timed out waiting for node") + } + + // Connect from a different client. + client1, err := tailnet.NewConn(&tailnet.Options{ + Addresses: []netip.Prefix{netip.PrefixFrom(tailnet.IP(), 128)}, + Logger: logger.Named("client1"), + DERPMap: derpMap1, + BlockEndpoints: true, + }) + require.NoError(t, err) + defer func() { + err := client1.Close() + assert.NoError(t, err) + }() + client1.SetNodeCallback(func(node *tailnet.Node) { + err := conn.UpdateNodes([]*tailnet.Node{node}, false) + assert.NoError(t, err) + }) + client1.UpdateNodes([]*tailnet.Node{conn.Node()}, false) + + awaitReachableCtx1, awaitReachableCancel1 := context.WithTimeout(context.Background(), testutil.WaitShort) + defer awaitReachableCancel1() + require.True(t, client1.AwaitReachable(awaitReachableCtx1, ip)) + + // Update the DERP map and wait for the preferred DERP server to change. + derpMap2, _ := tailnettest.RunDERPAndSTUN(t) + // Change the region ID. 
+ derpMap2.Regions[2] = derpMap2.Regions[1] + delete(derpMap2.Regions, 1) + derpMap2.Regions[2].RegionID = 2 + for _, node := range derpMap2.Regions[2].Nodes { + node.RegionID = 2 + } + conn.SetDERPMap(derpMap2) + + ctx2, cancel2 := context.WithTimeout(context.Background(), testutil.WaitShort) + defer cancel2() +parentLoop: + for { + select { + case node := <-nodes: + if node.PreferredDERP != 2 { + t.Logf("waiting for preferred DERP server to change, got %v", node.PreferredDERP) + continue + } + t.Log("preferred DERP server changed!") + break parentLoop + case <-ctx2.Done(): + t.Fatal("timed out waiting for preferred DERP server to change") + } + } + + // Client1 should be dropped... + awaitReachableCtx2, awaitReachableCancel2 := context.WithTimeout(context.Background(), testutil.WaitShort) + defer awaitReachableCancel2() + require.False(t, client1.AwaitReachable(awaitReachableCtx2, ip)) + + // ... unless the client updates it's derp map and nodes. + client1.SetDERPMap(derpMap2) + client1.UpdateNodes([]*tailnet.Node{conn.Node()}, false) + awaitReachableCtx3, awaitReachableCancel3 := context.WithTimeout(context.Background(), testutil.WaitShort) + defer awaitReachableCancel3() + require.True(t, client1.AwaitReachable(awaitReachableCtx3, ip)) + + // Connect from a different different client with up-to-date derp map and + // nodes. 
+ client2, err := tailnet.NewConn(&tailnet.Options{ + Addresses: []netip.Prefix{netip.PrefixFrom(tailnet.IP(), 128)}, + Logger: logger.Named("client2"), + DERPMap: derpMap2, + BlockEndpoints: true, + }) + require.NoError(t, err) + defer func() { + err := client2.Close() + assert.NoError(t, err) + }() + client2.SetNodeCallback(func(node *tailnet.Node) { + err := conn.UpdateNodes([]*tailnet.Node{node}, false) + assert.NoError(t, err) + }) + client2.UpdateNodes([]*tailnet.Node{conn.Node()}, false) + + awaitReachableCtx4, awaitReachableCancel4 := context.WithTimeout(context.Background(), testutil.WaitShort) + defer awaitReachableCancel4() + require.True(t, client2.AwaitReachable(awaitReachableCtx4, ip)) +} diff --git a/tailnet/derpmap.go b/tailnet/derpmap.go index 37092886540dd..b4bbd17d5879b 100644 --- a/tailnet/derpmap.go +++ b/tailnet/derpmap.go @@ -97,3 +97,115 @@ func NewDERPMap(ctx context.Context, region *tailcfg.DERPRegion, stunAddrs []str return derpMap, nil } + +// CompareDERPMaps returns true if the given DERPMaps are equivalent. Ordering +// of slices is ignored. +// +// If the first map is nil, the second map must also be nil for them to be +// considered equivalent. If the second map is nil, the first map can be any +// value and the function will return true. 
+func CompareDERPMaps(a *tailcfg.DERPMap, b *tailcfg.DERPMap) bool { + if a == nil { + return b == nil + } + if b == nil { + return true + } + if len(a.Regions) != len(b.Regions) { + return false + } + if a.OmitDefaultRegions != b.OmitDefaultRegions { + return false + } + + for id, region := range a.Regions { + other, ok := b.Regions[id] + if !ok { + return false + } + if !compareDERPRegions(region, other) { + return false + } + } + return true +} + +func compareDERPRegions(a *tailcfg.DERPRegion, b *tailcfg.DERPRegion) bool { + if a == nil || b == nil { + return false + } + if a.EmbeddedRelay != b.EmbeddedRelay { + return false + } + if a.RegionID != b.RegionID { + return false + } + if a.RegionCode != b.RegionCode { + return false + } + if a.RegionName != b.RegionName { + return false + } + if a.Avoid != b.Avoid { + return false + } + if len(a.Nodes) != len(b.Nodes) { + return false + } + + // Convert both slices to maps so ordering can be ignored easier. + aNodes := map[string]*tailcfg.DERPNode{} + for _, node := range a.Nodes { + aNodes[node.Name] = node + } + bNodes := map[string]*tailcfg.DERPNode{} + for _, node := range b.Nodes { + bNodes[node.Name] = node + } + + for name, aNode := range aNodes { + bNode, ok := bNodes[name] + if !ok { + return false + } + + if aNode.Name != bNode.Name { + return false + } + if aNode.RegionID != bNode.RegionID { + return false + } + if aNode.HostName != bNode.HostName { + return false + } + if aNode.CertName != bNode.CertName { + return false + } + if aNode.IPv4 != bNode.IPv4 { + return false + } + if aNode.IPv6 != bNode.IPv6 { + return false + } + if aNode.STUNPort != bNode.STUNPort { + return false + } + if aNode.STUNOnly != bNode.STUNOnly { + return false + } + if aNode.DERPPort != bNode.DERPPort { + return false + } + if aNode.InsecureForTests != bNode.InsecureForTests { + return false + } + if aNode.ForceHTTP != bNode.ForceHTTP { + return false + } + if aNode.STUNTestIP != bNode.STUNTestIP { + return false + } + } + + 
return true +}