Skip to content

feat: add health check monitoring to workspace apps #4114

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 64 commits into from
Sep 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
64 commits
Select commit Hold shift + click to select a range
79c761e
add db types
f0ssel Sep 19, 2022
511be13
add sdk types
f0ssel Sep 19, 2022
b034b06
add postWorkspaceAppHealth route
f0ssel Sep 19, 2022
419f8e7
Add more healthcheck fields to db schema
f0ssel Sep 19, 2022
8d0517e
healthcheck threshold
f0ssel Sep 19, 2022
719eb4d
add storybooks
f0ssel Sep 19, 2022
1bcac73
typo
f0ssel Sep 19, 2022
ae77f1c
change to warning icon
f0ssel Sep 19, 2022
467a715
fix missing err check
f0ssel Sep 19, 2022
22e275e
gosec
f0ssel Sep 19, 2022
9f84cf2
make fmt
f0ssel Sep 19, 2022
7793799
fix js tests
f0ssel Sep 19, 2022
349116c
add authtest skip
f0ssel Sep 19, 2022
66a6146
rebase
f0ssel Sep 19, 2022
342cbb0
fix insert
f0ssel Sep 19, 2022
c8534d7
whitespace
f0ssel Sep 19, 2022
48c9c76
whitespace
f0ssel Sep 19, 2022
f08718e
healthcheck url
f0ssel Sep 19, 2022
737209f
add proto
f0ssel Sep 19, 2022
466340a
connect proto
f0ssel Sep 20, 2022
25fc5d8
whitespace
f0ssel Sep 20, 2022
e28c366
lint
f0ssel Sep 20, 2022
1c179a4
add workspace agent apps route
f0ssel Sep 20, 2022
6df6998
add myWorkspaceAgent
f0ssel Sep 20, 2022
18fb1a5
noauthorize
f0ssel Sep 20, 2022
dea8070
add postworkspaceagentapphealth
f0ssel Sep 20, 2022
c098980
docs
f0ssel Sep 20, 2022
84c3cf8
add reportAppHealth
f0ssel Sep 20, 2022
7028377
add retry loop
f0ssel Sep 20, 2022
947ff9c
gosimp
f0ssel Sep 20, 2022
047a2e6
fix
f0ssel Sep 20, 2022
26d902a
authorizer
f0ssel Sep 20, 2022
4e65229
workspace app health reporter
f0ssel Sep 20, 2022
9129027
health
f0ssel Sep 20, 2022
e87b48a
fix types
f0ssel Sep 20, 2022
2d5d27a
handle context
f0ssel Sep 20, 2022
fec256d
handle nil interface
f0ssel Sep 21, 2022
a3330c7
add test for agent app health routes
f0ssel Sep 21, 2022
18d05a9
fix test
f0ssel Sep 21, 2022
e6dc742
fix json
f0ssel Sep 21, 2022
1947adc
remove healthcheck_enabled
f0ssel Sep 21, 2022
bb5aa3e
add healthcheck type
f0ssel Sep 22, 2022
d7c2ef2
fix merge
f0ssel Sep 22, 2022
e7a2798
fix nil
f0ssel Sep 22, 2022
b774aee
fix js
f0ssel Sep 22, 2022
8cfef1a
update tf provider
f0ssel Sep 22, 2022
aaabc5a
make fmt
f0ssel Sep 22, 2022
7c70495
add to example
f0ssel Sep 22, 2022
5aedcdc
fix agent logic
f0ssel Sep 22, 2022
2654c1a
fix cast
f0ssel Sep 22, 2022
8b293fe
add apphealth_test.go
f0ssel Sep 23, 2022
6b95ddd
lint
f0ssel Sep 23, 2022
cf53ce6
lint
f0ssel Sep 23, 2022
2f17c5a
lint
f0ssel Sep 23, 2022
e63769b
make tests more reliable
f0ssel Sep 23, 2022
0fbd251
fix migration number
f0ssel Sep 23, 2022
1cde12b
fix migration number
f0ssel Sep 23, 2022
e7f93a9
fix goleak
f0ssel Sep 23, 2022
d304f64
simplify goroutines
f0ssel Sep 23, 2022
634fb64
pr comments
f0ssel Sep 23, 2022
7f3f45a
fix datarace in test
f0ssel Sep 23, 2022
7caea9a
fix another datarace
f0ssel Sep 23, 2022
52ab3dc
dont wait twice
f0ssel Sep 23, 2022
f1ca9c5
cleanup
f0ssel Sep 23, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 42 additions & 49 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (

"cdr.dev/slog"
"github.com/coder/coder/agent/usershell"
"github.com/coder/coder/codersdk"
"github.com/coder/coder/pty"
"github.com/coder/coder/tailnet"
"github.com/coder/retry"
Expand All @@ -49,55 +50,41 @@ const (
MagicSessionErrorCode = 229
)

var (
// tailnetIP is a static IPv6 address with the Tailscale prefix that is used to route
// connections from clients to this node. A dynamic address is not required because a Tailnet
// client only dials a single agent at a time.
tailnetIP = netip.MustParseAddr("fd7a:115c:a1e0:49d6:b259:b7ac:b1b2:48f4")
tailnetSSHPort = 1
tailnetReconnectingPTYPort = 2
tailnetSpeedtestPort = 3
)

type Options struct {
CoordinatorDialer CoordinatorDialer
FetchMetadata FetchMetadata

StatsReporter StatsReporter
ReconnectingPTYTimeout time.Duration
EnvironmentVariables map[string]string
Logger slog.Logger
}

type Metadata struct {
DERPMap *tailcfg.DERPMap `json:"derpmap"`
EnvironmentVariables map[string]string `json:"environment_variables"`
StartupScript string `json:"startup_script"`
Directory string `json:"directory"`
CoordinatorDialer CoordinatorDialer
FetchMetadata FetchMetadata
StatsReporter StatsReporter
WorkspaceAgentApps WorkspaceAgentApps
PostWorkspaceAgentAppHealth PostWorkspaceAgentAppHealth
ReconnectingPTYTimeout time.Duration
EnvironmentVariables map[string]string
Logger slog.Logger
}

// CoordinatorDialer is a function that dials a new connection to the coordinator (broker).
// A dialer, rather than a single connection, must be passed in to allow for reconnects.
type CoordinatorDialer func(ctx context.Context) (net.Conn, error)
type CoordinatorDialer func(context.Context) (net.Conn, error)

// FetchMetadata is a function to obtain metadata for the agent.
type FetchMetadata func(ctx context.Context) (Metadata, error)
type FetchMetadata func(context.Context) (codersdk.WorkspaceAgentMetadata, error)

func New(options Options) io.Closer {
if options.ReconnectingPTYTimeout == 0 {
options.ReconnectingPTYTimeout = 5 * time.Minute
}
ctx, cancelFunc := context.WithCancel(context.Background())
server := &agent{
reconnectingPTYTimeout: options.ReconnectingPTYTimeout,
logger: options.Logger,
closeCancel: cancelFunc,
closed: make(chan struct{}),
envVars: options.EnvironmentVariables,
coordinatorDialer: options.CoordinatorDialer,
fetchMetadata: options.FetchMetadata,
stats: &Stats{},
statsReporter: options.StatsReporter,
reconnectingPTYTimeout: options.ReconnectingPTYTimeout,
logger: options.Logger,
closeCancel: cancelFunc,
closed: make(chan struct{}),
envVars: options.EnvironmentVariables,
coordinatorDialer: options.CoordinatorDialer,
fetchMetadata: options.FetchMetadata,
stats: &Stats{},
statsReporter: options.StatsReporter,
workspaceAgentApps: options.WorkspaceAgentApps,
postWorkspaceAgentAppHealth: options.PostWorkspaceAgentAppHealth,
}
server.init(ctx)
return server
Expand All @@ -120,14 +107,16 @@ type agent struct {
fetchMetadata FetchMetadata
sshServer *ssh.Server

network *tailnet.Conn
coordinatorDialer CoordinatorDialer
stats *Stats
statsReporter StatsReporter
network *tailnet.Conn
coordinatorDialer CoordinatorDialer
stats *Stats
statsReporter StatsReporter
workspaceAgentApps WorkspaceAgentApps
postWorkspaceAgentAppHealth PostWorkspaceAgentAppHealth
}

func (a *agent) run(ctx context.Context) {
var metadata Metadata
var metadata codersdk.WorkspaceAgentMetadata
var err error
// An exponential back-off occurs when the connection is failing to dial.
// This is to prevent server spam in case of a coderd outage.
Expand Down Expand Up @@ -168,6 +157,10 @@ func (a *agent) run(ctx context.Context) {
if metadata.DERPMap != nil {
go a.runTailnet(ctx, metadata.DERPMap)
}

if a.workspaceAgentApps != nil && a.postWorkspaceAgentAppHealth != nil {
go NewWorkspaceAppHealthReporter(a.logger, a.workspaceAgentApps, a.postWorkspaceAgentAppHealth)(ctx)
}
}

func (a *agent) runTailnet(ctx context.Context, derpMap *tailcfg.DERPMap) {
Expand All @@ -182,7 +175,7 @@ func (a *agent) runTailnet(ctx context.Context, derpMap *tailcfg.DERPMap) {
}
var err error
a.network, err = tailnet.NewConn(&tailnet.Options{
Addresses: []netip.Prefix{netip.PrefixFrom(tailnetIP, 128)},
Addresses: []netip.Prefix{netip.PrefixFrom(codersdk.TailnetIP, 128)},
DERPMap: derpMap,
Logger: a.logger.Named("tailnet"),
})
Expand All @@ -199,7 +192,7 @@ func (a *agent) runTailnet(ctx context.Context, derpMap *tailcfg.DERPMap) {
})
go a.runCoordinator(ctx)

sshListener, err := a.network.Listen("tcp", ":"+strconv.Itoa(tailnetSSHPort))
sshListener, err := a.network.Listen("tcp", ":"+strconv.Itoa(codersdk.TailnetSSHPort))
if err != nil {
a.logger.Critical(ctx, "listen for ssh", slog.Error(err))
return
Expand All @@ -213,7 +206,7 @@ func (a *agent) runTailnet(ctx context.Context, derpMap *tailcfg.DERPMap) {
go a.sshServer.HandleConn(a.stats.wrapConn(conn))
}
}()
reconnectingPTYListener, err := a.network.Listen("tcp", ":"+strconv.Itoa(tailnetReconnectingPTYPort))
reconnectingPTYListener, err := a.network.Listen("tcp", ":"+strconv.Itoa(codersdk.TailnetReconnectingPTYPort))
if err != nil {
a.logger.Critical(ctx, "listen for reconnecting pty", slog.Error(err))
return
Expand All @@ -239,15 +232,15 @@ func (a *agent) runTailnet(ctx context.Context, derpMap *tailcfg.DERPMap) {
if err != nil {
continue
}
var msg reconnectingPTYInit
var msg codersdk.ReconnectingPTYInit
err = json.Unmarshal(data, &msg)
if err != nil {
continue
}
go a.handleReconnectingPTY(ctx, msg, conn)
}
}()
speedtestListener, err := a.network.Listen("tcp", ":"+strconv.Itoa(tailnetSpeedtestPort))
speedtestListener, err := a.network.Listen("tcp", ":"+strconv.Itoa(codersdk.TailnetSpeedtestPort))
if err != nil {
a.logger.Critical(ctx, "listen for speedtest", slog.Error(err))
return
Expand Down Expand Up @@ -434,7 +427,7 @@ func (a *agent) init(ctx context.Context) {

go a.run(ctx)
if a.statsReporter != nil {
cl, err := a.statsReporter(ctx, a.logger, func() *Stats {
cl, err := a.statsReporter(ctx, a.logger, func() *codersdk.AgentStats {
return a.stats.Copy()
})
if err != nil {
Expand Down Expand Up @@ -469,7 +462,7 @@ func (a *agent) createCommand(ctx context.Context, rawCommand string, env []stri
if rawMetadata == nil {
return nil, xerrors.Errorf("no metadata was provided: %w", err)
}
metadata, valid := rawMetadata.(Metadata)
metadata, valid := rawMetadata.(codersdk.WorkspaceAgentMetadata)
if !valid {
return nil, xerrors.Errorf("metadata is the wrong type: %T", metadata)
}
Expand Down Expand Up @@ -625,7 +618,7 @@ func (a *agent) handleSSHSession(session ssh.Session) (retErr error) {
return cmd.Wait()
}

func (a *agent) handleReconnectingPTY(ctx context.Context, msg reconnectingPTYInit, conn net.Conn) {
func (a *agent) handleReconnectingPTY(ctx context.Context, msg codersdk.ReconnectingPTYInit, conn net.Conn) {
defer conn.Close()

var rpty *reconnectingPTY
Expand Down Expand Up @@ -766,7 +759,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, msg reconnectingPTYIn
rpty.activeConnsMutex.Unlock()
}()
decoder := json.NewDecoder(conn)
var req ReconnectingPTYRequest
var req codersdk.ReconnectingPTYRequest
for {
err = decoder.Decode(&req)
if xerrors.Is(err, io.EOF) {
Expand Down
Loading