Skip to content

Commit 18973a6

Browse files
authored
fix: Add reaper to coder agent (#2441)
* fix: Add reaper to coder agent - The coder agent runs as PID 1 in some of our Docker workspaces. In such cases it is the responsibility of the init process to reap dead processes. Failing to do so can result in an inability to create new processes by running out of PIDs. This PR adds a reaper to our agent that is only spawned if it detects that it is PID1.
1 parent 6c1208e commit 18973a6

File tree

7 files changed

+191
-2
lines changed

7 files changed

+191
-2
lines changed

agent/reaper/doc.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
// Package reaper contains logic for reaping subprocesses. It is
2+
// specifically used in the agent to avoid the accumulation of
3+
// zombie processes.
4+
package reaper

agent/reaper/reaper_stub.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
//go:build !linux
2+
3+
package reaper
4+
5+
import "github.com/hashicorp/go-reap"
6+
7+
// IsChild returns true if we're the forked process.
8+
func IsChild() bool {
9+
return false
10+
}
11+
12+
// IsInitProcess returns true if the current process's PID is 1.
13+
func IsInitProcess() bool {
14+
return false
15+
}
16+
17+
func ForkReap(pids reap.PidCh) error {
18+
return nil
19+
}

agent/reaper/reaper_test.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
//go:build linux
2+
3+
package reaper_test
4+
5+
import (
6+
"os"
7+
"os/exec"
8+
"testing"
9+
"time"
10+
11+
"github.com/hashicorp/go-reap"
12+
"github.com/stretchr/testify/require"
13+
14+
"github.com/coder/coder/agent/reaper"
15+
)
16+
17+
func TestReap(t *testing.T) {
18+
t.Parallel()
19+
20+
// Don't run the reaper test in CI. It does weird
21+
// things like forkexecing which may have unintended
22+
// consequences in CI.
23+
if _, ok := os.LookupEnv("CI"); ok {
24+
t.Skip("Detected CI, skipping reaper tests")
25+
}
26+
27+
// Because we're forkexecing these tests will try to run twice...
28+
if reaper.IsChild() {
29+
t.Skip("I'm a child!")
30+
}
31+
32+
// OK checks that's the reaper is successfully reaping
33+
// exited processes and passing the PIDs through the shared
34+
// channel.
35+
t.Run("OK", func(t *testing.T) {
36+
pids := make(reap.PidCh, 1)
37+
err := reaper.ForkReap(pids)
38+
require.NoError(t, err)
39+
40+
cmd := exec.Command("tail", "-f", "/dev/null")
41+
err = cmd.Start()
42+
require.NoError(t, err)
43+
44+
cmd2 := exec.Command("tail", "-f", "/dev/null")
45+
err = cmd2.Start()
46+
require.NoError(t, err)
47+
48+
err = cmd.Process.Kill()
49+
require.NoError(t, err)
50+
51+
err = cmd2.Process.Kill()
52+
require.NoError(t, err)
53+
54+
expectedPIDs := []int{cmd.Process.Pid, cmd2.Process.Pid}
55+
56+
deadline := time.NewTimer(time.Second * 5)
57+
for i := 0; i < len(expectedPIDs); i++ {
58+
select {
59+
case <-deadline.C:
60+
t.Fatalf("Timed out waiting for process")
61+
case pid := <-pids:
62+
require.Contains(t, expectedPIDs, pid)
63+
}
64+
}
65+
})
66+
}

agent/reaper/reaper_unix.go

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
//go:build linux
2+
3+
package reaper
4+
5+
import (
6+
"fmt"
7+
"os"
8+
"syscall"
9+
10+
"github.com/hashicorp/go-reap"
11+
"golang.org/x/xerrors"
12+
)
13+
14+
// agentEnvMark is a simple environment variable that we use as a marker
15+
// to indicated that the process is a child as opposed to the reaper.
16+
// Since we are forkexec'ing we need to be able to differentiate between
17+
// the two to avoid fork bombing ourselves.
18+
const agentEnvMark = "CODER_DO_NOT_REAP"
19+
20+
// IsChild returns true if we're the forked process.
21+
func IsChild() bool {
22+
return os.Getenv(agentEnvMark) != ""
23+
}
24+
25+
// IsInitProcess returns true if the current process's PID is 1.
26+
func IsInitProcess() bool {
27+
return os.Getpid() == 1
28+
}
29+
30+
// ForkReap spawns a goroutine that reaps children. In order to avoid
31+
// complications with spawning `exec.Commands` in the same process that
32+
// is reaping, we forkexec a child process. This prevents a race between
33+
// the reaper and an exec.Command waiting for its process to complete.
34+
// The provided 'pids' channel may be nil if the caller does not care about the
35+
// reaped children PIDs.
36+
func ForkReap(pids reap.PidCh) error {
37+
// Check if the process is the parent or the child.
38+
// If it's the child we want to skip attempting to reap.
39+
if IsChild() {
40+
return nil
41+
}
42+
43+
go reap.ReapChildren(pids, nil, nil, nil)
44+
45+
args := os.Args
46+
// This is simply done to help identify the real agent process
47+
// when viewing in something like 'ps'.
48+
args = append(args, "#Agent")
49+
50+
pwd, err := os.Getwd()
51+
if err != nil {
52+
return xerrors.Errorf("get wd: %w", err)
53+
}
54+
55+
pattrs := &syscall.ProcAttr{
56+
Dir: pwd,
57+
// Add our marker for identifying the child process.
58+
Env: append(os.Environ(), fmt.Sprintf("%s=true", agentEnvMark)),
59+
Sys: &syscall.SysProcAttr{
60+
Setsid: true,
61+
},
62+
Files: []uintptr{
63+
uintptr(syscall.Stdin),
64+
uintptr(syscall.Stdout),
65+
uintptr(syscall.Stderr),
66+
},
67+
}
68+
69+
//#nosec G204
70+
pid, _ := syscall.ForkExec(args[0], args, pattrs)
71+
72+
var wstatus syscall.WaitStatus
73+
_, err = syscall.Wait4(pid, &wstatus, 0, nil)
74+
for xerrors.Is(err, syscall.EINTR) {
75+
_, err = syscall.Wait4(pid, &wstatus, 0, nil)
76+
}
77+
78+
return nil
79+
}

cli/agent.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"net/url"
99
"os"
1010
"path/filepath"
11+
"runtime"
1112
"time"
1213

1314
"cloud.google.com/go/compute/metadata"
@@ -18,6 +19,7 @@ import (
1819
"cdr.dev/slog/sloggers/sloghuman"
1920

2021
"github.com/coder/coder/agent"
22+
"github.com/coder/coder/agent/reaper"
2123
"github.com/coder/coder/cli/cliflag"
2224
"github.com/coder/coder/codersdk"
2325
"github.com/coder/retry"
@@ -51,6 +53,23 @@ func workspaceAgent() *cobra.Command {
5153
}
5254
defer logWriter.Close()
5355
logger := slog.Make(sloghuman.Sink(cmd.ErrOrStderr()), sloghuman.Sink(logWriter)).Leveled(slog.LevelDebug)
56+
57+
isLinux := runtime.GOOS == "linux"
58+
59+
// Spawn a reaper so that we don't accumulate a ton
60+
// of zombie processes.
61+
if reaper.IsInitProcess() && !reaper.IsChild() && isLinux {
62+
logger.Info(cmd.Context(), "spawning reaper process")
63+
err := reaper.ForkReap(nil)
64+
if err != nil {
65+
logger.Error(cmd.Context(), "failed to reap", slog.Error(err))
66+
return xerrors.Errorf("fork reap: %w", err)
67+
}
68+
69+
logger.Info(cmd.Context(), "reaper process exiting")
70+
return nil
71+
}
72+
5473
client := codersdk.New(coderURL)
5574

5675
if pprofEnabled {

go.mod

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ require (
7171
github.com/golang-migrate/migrate/v4 v4.15.2
7272
github.com/google/go-github/v43 v43.0.1-0.20220414155304-00e42332e405
7373
github.com/google/uuid v1.3.0
74+
github.com/hashicorp/go-reap v0.0.0-20170704170343-bf58d8a43e7b
7475
github.com/hashicorp/go-version v1.5.0
7576
github.com/hashicorp/hc-install v0.3.2
7677
github.com/hashicorp/hcl/v2 v2.12.0
@@ -131,8 +132,6 @@ require (
131132
storj.io/drpc v0.0.30
132133
)
133134

134-
require github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect
135-
136135
require (
137136
github.com/agnivade/levenshtein v1.0.1 // indirect
138137
github.com/elastic/go-windows v1.0.0 // indirect
@@ -240,6 +239,7 @@ require (
240239
github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect
241240
github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect
242241
github.com/xeipuuv/gojsonschema v1.2.0 // indirect
242+
github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect
243243
github.com/yashtewari/glob-intersection v0.1.0 // indirect
244244
github.com/zclconf/go-cty v1.10.0 // indirect
245245
github.com/zeebo/errs v1.2.2 // indirect

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -896,6 +896,8 @@ github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHh
896896
github.com/hashicorp/go-multierror v1.1.0/go.mod h1:spPvp8C1qA32ftKqdAHm4hHTbPw+vmowP0z+KUhOZdA=
897897
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
898898
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
899+
github.com/hashicorp/go-reap v0.0.0-20170704170343-bf58d8a43e7b h1:3GrpnZQBxcMj1gCXQLelfjCT1D5MPGTuGMKHVzSIH6A=
900+
github.com/hashicorp/go-reap v0.0.0-20170704170343-bf58d8a43e7b/go.mod h1:qIFzeFcJU3OIFk/7JreWXcUjFmcCaeHTH9KoNyHYVCs=
899901
github.com/hashicorp/go-rootcerts v1.0.0/go.mod h1:K6zTfqpRlCUIjkwsN4Z+hiSfzSTQa6eBIzfwKfwNnHU=
900902
github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerXegt+ozgdvDeDU=
901903
github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4=

0 commit comments

Comments
 (0)