Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
64 commits
Select commit Hold shift + click to select a range
79c761e
add db types
f0ssel Sep 19, 2022
511be13
add sdk types
f0ssel Sep 19, 2022
b034b06
add postWorkspaceAppHealth route
f0ssel Sep 19, 2022
419f8e7
Add more healthcheck fields to db schema
f0ssel Sep 19, 2022
8d0517e
healthcheck threshold
f0ssel Sep 19, 2022
719eb4d
add storybooks
f0ssel Sep 19, 2022
1bcac73
typo
f0ssel Sep 19, 2022
ae77f1c
change to warning icon
f0ssel Sep 19, 2022
467a715
fix missing err check
f0ssel Sep 19, 2022
22e275e
gosec
f0ssel Sep 19, 2022
9f84cf2
make fmt
f0ssel Sep 19, 2022
7793799
fix js tests
f0ssel Sep 19, 2022
349116c
add authtest skip
f0ssel Sep 19, 2022
66a6146
rebase
f0ssel Sep 19, 2022
342cbb0
fix insert
f0ssel Sep 19, 2022
c8534d7
whitespace
f0ssel Sep 19, 2022
48c9c76
whitespace
f0ssel Sep 19, 2022
f08718e
healthcheck url
f0ssel Sep 19, 2022
737209f
add proto
f0ssel Sep 19, 2022
466340a
connect proto
f0ssel Sep 20, 2022
25fc5d8
whitespace
f0ssel Sep 20, 2022
e28c366
lint
f0ssel Sep 20, 2022
1c179a4
add workspace agent apps route
f0ssel Sep 20, 2022
6df6998
add myWorkspaceAgent
f0ssel Sep 20, 2022
18fb1a5
noauthorize
f0ssel Sep 20, 2022
dea8070
add postworkspaceagentapphealth
f0ssel Sep 20, 2022
c098980
docs
f0ssel Sep 20, 2022
84c3cf8
add reportAppHealth
f0ssel Sep 20, 2022
7028377
add retry loop
f0ssel Sep 20, 2022
947ff9c
gosimp
f0ssel Sep 20, 2022
047a2e6
fix
f0ssel Sep 20, 2022
26d902a
authorizer
f0ssel Sep 20, 2022
4e65229
workspace app health reporter
f0ssel Sep 20, 2022
9129027
health
f0ssel Sep 20, 2022
e87b48a
fix types
f0ssel Sep 20, 2022
2d5d27a
handle context
f0ssel Sep 20, 2022
fec256d
handle nil interface
f0ssel Sep 21, 2022
a3330c7
add test for agent app health routes
f0ssel Sep 21, 2022
18d05a9
fix test
f0ssel Sep 21, 2022
e6dc742
fix json
f0ssel Sep 21, 2022
1947adc
remove healthcheck_enabled
f0ssel Sep 21, 2022
bb5aa3e
add healthcheck type
f0ssel Sep 22, 2022
d7c2ef2
fix merge
f0ssel Sep 22, 2022
e7a2798
fix nil
f0ssel Sep 22, 2022
b774aee
fix js
f0ssel Sep 22, 2022
8cfef1a
update tf provider
f0ssel Sep 22, 2022
aaabc5a
make fmt
f0ssel Sep 22, 2022
7c70495
add to example
f0ssel Sep 22, 2022
5aedcdc
fix agent logic
f0ssel Sep 22, 2022
2654c1a
fix cast
f0ssel Sep 22, 2022
8b293fe
add apphealth_test.go
f0ssel Sep 23, 2022
6b95ddd
lint
f0ssel Sep 23, 2022
cf53ce6
lint
f0ssel Sep 23, 2022
2f17c5a
lint
f0ssel Sep 23, 2022
e63769b
make tests more reliable
f0ssel Sep 23, 2022
0fbd251
fix migration number
f0ssel Sep 23, 2022
1cde12b
fix migration number
f0ssel Sep 23, 2022
e7f93a9
fix goleak
f0ssel Sep 23, 2022
d304f64
simplify goroutines
f0ssel Sep 23, 2022
634fb64
pr comments
f0ssel Sep 23, 2022
7f3f45a
fix datarace in test
f0ssel Sep 23, 2022
7caea9a
fix another datarace
f0ssel Sep 23, 2022
52ab3dc
dont wait twice
f0ssel Sep 23, 2022
f1ca9c5
cleanup
f0ssel Sep 23, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
pr comments
  • Loading branch information
f0ssel committed Sep 23, 2022
commit 634fb64cdca97c324fcb1b5de4a2bc9059053fe5
221 changes: 113 additions & 108 deletions agent/apphealth.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,125 +24,130 @@ type WorkspaceAppHealthReporter func(ctx context.Context)

// NewWorkspaceAppHealthReporter creates a WorkspaceAppHealthReporter that reports app health to coderd.
func NewWorkspaceAppHealthReporter(logger slog.Logger, workspaceAgentApps WorkspaceAgentApps, postWorkspaceAgentAppHealth PostWorkspaceAgentAppHealth) WorkspaceAppHealthReporter {
return func(ctx context.Context) {
r := retry.New(time.Second, 30*time.Second)
for {
err := func() error {
apps, err := workspaceAgentApps(ctx)
if err != nil {
if xerrors.Is(err, context.Canceled) {
return nil
}
return xerrors.Errorf("getting workspace apps: %w", err)
}

// no need to run this loop if no apps for this workspace.
if len(apps) == 0 {
return nil
}
runHealthcheckLoop := func(ctx context.Context) error {
apps, err := workspaceAgentApps(ctx)
if err != nil {
if xerrors.Is(err, context.Canceled) {
return nil
}
return xerrors.Errorf("getting workspace apps: %w", err)
}

hasHealthchecksEnabled := false
health := make(map[string]codersdk.WorkspaceAppHealth, 0)
for _, app := range apps {
health[app.Name] = app.Health
if !hasHealthchecksEnabled && app.Health != codersdk.WorkspaceAppHealthDisabled {
hasHealthchecksEnabled = true
}
}
// no need to run this loop if no apps for this workspace.
if len(apps) == 0 {
return nil
}

// no need to run this loop if no health checks are configured.
if !hasHealthchecksEnabled {
return nil
}
hasHealthchecksEnabled := false
health := make(map[string]codersdk.WorkspaceAppHealth, 0)
for _, app := range apps {
health[app.Name] = app.Health
if !hasHealthchecksEnabled && app.Health != codersdk.WorkspaceAppHealthDisabled {
hasHealthchecksEnabled = true
}
}

// run a ticker for each app health check.
var mu sync.RWMutex
failures := make(map[string]int, 0)
for _, app := range apps {
if shouldStartTicker(app) {
t := time.NewTicker(time.Duration(app.Healthcheck.Interval) * time.Second)
go func() {
for {
select {
case <-ctx.Done():
return
case <-t.C:
// we set the http timeout to the healthcheck interval to prevent getting too backed up.
client := &http.Client{
Timeout: time.Duration(app.Healthcheck.Interval) * time.Second,
}
err := func() error {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, app.Healthcheck.URL, nil)
if err != nil {
return err
}
res, err := client.Do(req)
if err != nil {
return err
}
// successful healthcheck is a non-5XX status code
res.Body.Close()
if res.StatusCode >= http.StatusInternalServerError {
return xerrors.Errorf("error status code: %d", res.StatusCode)
}

return nil
}()
if err != nil {
mu.Lock()
if failures[app.Name] < int(app.Healthcheck.Threshold) {
// increment the failure count and keep status the same.
// we will change it when we hit the threshold.
failures[app.Name]++
} else {
// set to unhealthy if we hit the failure threshold.
// we stop incrementing at the threshold to prevent the failure value from increasing forever.
health[app.Name] = codersdk.WorkspaceAppHealthUnhealthy
}
mu.Unlock()
} else {
mu.Lock()
// we only need one successful health check to be considered healthy.
health[app.Name] = codersdk.WorkspaceAppHealthHealthy
failures[app.Name] = 0
mu.Unlock()
}

t.Reset(time.Duration(app.Healthcheck.Interval))
}
}
}()
}
}
// no need to run this loop if no health checks are configured.
if !hasHealthchecksEnabled {
return nil
}

mu.Lock()
lastHealth := copyHealth(health)
mu.Unlock()
reportTicker := time.NewTicker(time.Second)
// every second we check if the health values of the apps have changed
// and if there is a change we will report the new values.
// run a ticker for each app health check.
var mu sync.RWMutex
failures := make(map[string]int, 0)
for _, nextApp := range apps {
if !shouldStartTicker(nextApp) {
continue
}
app := nextApp
t := time.NewTicker(time.Duration(app.Healthcheck.Interval) * time.Second)
go func() {
for {
select {
case <-ctx.Done():
return
case <-t.C:
}
// we set the http timeout to the healthcheck interval to prevent getting too backed up.
client := &http.Client{
Timeout: time.Duration(app.Healthcheck.Interval) * time.Second,
}
err := func() error {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, app.Healthcheck.URL, nil)
if err != nil {
return err
}
res, err := client.Do(req)
if err != nil {
return err
}
// successful healthcheck is a non-5XX status code
res.Body.Close()
if res.StatusCode >= http.StatusInternalServerError {
return xerrors.Errorf("error status code: %d", res.StatusCode)
}

return nil
case <-reportTicker.C:
mu.RLock()
changed := healthChanged(lastHealth, health)
mu.RUnlock()
if changed {
mu.Lock()
lastHealth = copyHealth(health)
mu.Unlock()
err := postWorkspaceAgentAppHealth(ctx, codersdk.PostWorkspaceAppHealthsRequest{
Healths: health,
})
if err != nil {
logger.Error(ctx, "failed to report workspace app stat", slog.Error(err))
}
}()
if err != nil {
mu.Lock()
if failures[app.Name] < int(app.Healthcheck.Threshold) {
// increment the failure count and keep status the same.
// we will change it when we hit the threshold.
failures[app.Name]++
} else {
// set to unhealthy if we hit the failure threshold.
// we stop incrementing at the threshold to prevent the failure value from increasing forever.
health[app.Name] = codersdk.WorkspaceAppHealthUnhealthy
}
mu.Unlock()
} else {
mu.Lock()
// we only need one successful health check to be considered healthy.
health[app.Name] = codersdk.WorkspaceAppHealthHealthy
failures[app.Name] = 0
mu.Unlock()
}

t.Reset(time.Duration(app.Healthcheck.Interval))
}
}()
}

mu.Lock()
lastHealth := copyHealth(health)
mu.Unlock()
reportTicker := time.NewTicker(time.Second)
// every second we check if the health values of the apps have changed
// and if there is a change we will report the new values.
for {
select {
case <-ctx.Done():
return nil
case <-reportTicker.C:
mu.RLock()
changed := healthChanged(lastHealth, health)
mu.RUnlock()
if !changed {
continue
}

mu.Lock()
lastHealth = copyHealth(health)
mu.Unlock()
err := postWorkspaceAgentAppHealth(ctx, codersdk.PostWorkspaceAppHealthsRequest{
Healths: health,
})
if err != nil {
logger.Error(ctx, "failed to report workspace app stat", slog.Error(err))
}
}
}
}

return func(ctx context.Context) {
for r := retry.New(time.Second, 30*time.Second); r.Wait(ctx); {
err := runHealthcheckLoop(ctx)
if err != nil {
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
return
Expand All @@ -167,7 +172,7 @@ func healthChanged(old map[string]codersdk.WorkspaceAppHealth, new map[string]co
for name, newValue := range new {
oldValue, found := old[name]
if !found {
panic("workspace app lengths are not equal")
return true
}
if newValue != oldValue {
return true
Expand Down
3 changes: 2 additions & 1 deletion agent/apphealth_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ func setupAppReporter(ctx context.Context, t *testing.T, apps []codersdk.Workspa

var mu sync.Mutex
workspaceAgentApps := func(context.Context) ([]codersdk.WorkspaceApp, error) {
return apps, nil
var newApps []codersdk.WorkspaceApp
return append(newApps, apps...), nil
}
postWorkspaceAgentAppHealth := func(_ context.Context, req codersdk.PostWorkspaceAppHealthsRequest) error {
for name, health := range req.Healths {
Expand Down
2 changes: 1 addition & 1 deletion coderd/workspaceagents_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@ func TestWorkspaceAgentAppHealth(t *testing.T) {
},
})
require.Error(t, err)
// app.HealthEnabled == false
// healthcheck disabled
err = agentClient.PostWorkspaceAgentAppHealth(ctx, codersdk.PostWorkspaceAppHealthsRequest{
Healths: map[string]codersdk.WorkspaceAppHealth{
"code-server": codersdk.WorkspaceAppHealthInitializing,
Expand Down