Skip to content

fix: cache disconnected agent names in tailnet coordinator debug #5870

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
fix: cache disconnected agent names in tailnet coordinator debug
  • Loading branch information
coadler committed Jan 26, 2023
commit 1cf24d412a06e87a9d25c3348603bcd16c8ca4a0
13 changes: 12 additions & 1 deletion coderd/workspaceagents.go
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,15 @@ func (api *API) workspaceAgentCoordinate(rw http.ResponseWriter, r *http.Request
return
}

owner, err := api.Database.GetUserByID(ctx, workspace.OwnerID)
if err != nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: "Internal error fetching user.",
Detail: err.Error(),
})
return
}

// Ensure the resource is still valid!
// We only accept agents for resources on the latest build.
ensureLatestBuild := func() error {
Expand Down Expand Up @@ -628,7 +637,9 @@ func (api *API) workspaceAgentCoordinate(rw http.ResponseWriter, r *http.Request
closeChan := make(chan struct{})
go func() {
defer close(closeChan)
err := (*api.TailnetCoordinator.Load()).ServeAgent(wsNetConn, workspaceAgent.ID, fmt.Sprintf("%s-%s", workspace.Name, workspaceAgent.Name))
err := (*api.TailnetCoordinator.Load()).ServeAgent(wsNetConn, workspaceAgent.ID,
fmt.Sprintf("%s-%s-%s", owner.Username, workspace.Name, workspaceAgent.Name),
)
if err != nil {
api.Logger.Warn(ctx, "tailnet coordinator agent error", slog.Error(err))
_ = conn.Close(websocket.StatusInternalError, err.Error())
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ require (
github.com/google/uuid v1.3.0
github.com/hashicorp/go-reap v0.0.0-20170704170343-bf58d8a43e7b
github.com/hashicorp/go-version v1.6.0
github.com/hashicorp/golang-lru/v2 v2.0.1
github.com/hashicorp/hc-install v0.4.1-0.20220912074615-4487b02cbcbb
github.com/hashicorp/hcl/v2 v2.14.0
github.com/hashicorp/terraform-config-inspect v0.0.0-20211115214459-90acf1ca460f
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -1020,6 +1020,8 @@ github.com/hashicorp/go.net v0.0.1/go.mod h1:hjKkEWcCURg++eb33jQU7oqQcI9XDCnUzHA
github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hashicorp/golang-lru v0.5.4/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4=
github.com/hashicorp/golang-lru/v2 v2.0.1 h1:5pv5N1lT1fjLg2VQ5KWc7kmucp2x/kvFOnxuVTqZ6x4=
github.com/hashicorp/golang-lru/v2 v2.0.1/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
github.com/hashicorp/hc-install v0.4.1-0.20220912074615-4487b02cbcbb h1:0AmumMAu6gi5zXEyXvLKDu/HALK+rIcVBZU5XJNyjRM=
github.com/hashicorp/hc-install v0.4.1-0.20220912074615-4487b02cbcbb/go.mod h1:b3vG+IG40BBISnWiQb9/nHqZI/N3oiunwTtyTDaMGOA=
github.com/hashicorp/hcl v0.0.0-20170504190234-a4b07c25de5f/go.mod h1:oZtUIOe8dh44I2q6ScRibXws4Ajl+d+nod3AaR9vL5w=
Expand Down
21 changes: 20 additions & 1 deletion tailnet/coordinator.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"time"

"github.com/google/uuid"
lru "github.com/hashicorp/golang-lru/v2"
"golang.org/x/exp/slices"
"golang.org/x/xerrors"
"tailscale.com/tailcfg"
Expand Down Expand Up @@ -109,11 +110,17 @@ func ServeCoordinator(conn net.Conn, updateNodes func(node []*Node) error) (func
// coordinator is incompatible with multiple Coder replicas as all node data is
// in-memory.
func NewCoordinator() Coordinator {
cache, err := lru.New[uuid.UUID, string](512)
if err != nil {
panic("make lru cache: " + err.Error())
}

return &coordinator{
closed: false,
nodes: map[uuid.UUID]*Node{},
agentSockets: map[uuid.UUID]*trackedConn{},
agentToConnectionSockets: map[uuid.UUID]map[uuid.UUID]*trackedConn{},
agentNameCache: cache,
}
}

Expand All @@ -135,6 +142,10 @@ type coordinator struct {
// agentToConnectionSockets maps agent IDs to connection IDs of conns that
// are subscribed to updates for that agent.
agentToConnectionSockets map[uuid.UUID]map[uuid.UUID]*trackedConn

// agentNameCache holds a cache of agent names. If one of them disappears,
// it's helpful to have a name cached for debugging.
agentNameCache *lru.Cache[uuid.UUID, string]
}

type trackedConn struct {
Expand Down Expand Up @@ -288,6 +299,8 @@ func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID, name string) error
return xerrors.New("coordinator is closed")
}

c.agentNameCache.Add(id, name)

sockets, ok := c.agentToConnectionSockets[id]
if ok {
// Publish all nodes that want to connect to the
Expand Down Expand Up @@ -532,7 +545,13 @@ func (c *coordinator) ServeHTTPDebug(w http.ResponseWriter, _ *http.Request) {
fmt.Fprintln(w, "<ul>")

for _, agentConns := range missingAgents {
fmt.Fprintf(w, "<li style=\"margin-top:4px\"><b>unknown</b> (<code>%s</code>): created ? ago, write ? ago, overwrites ? </li>\n",
agentName, ok := c.agentNameCache.Get(agentConns.id)
if !ok {
agentName = "unknown"
}

fmt.Fprintf(w, "<li style=\"margin-top:4px\"><b>%s</b> (<code>%s</code>): created ? ago, write ? ago, overwrites ? </li>\n",
agentName,
agentConns.id.String(),
)

Expand Down