From 02eb7476561b440850b8566bd0199f32403fa687 Mon Sep 17 00:00:00 2001 From: Jon Ayers Date: Tue, 19 Mar 2024 01:08:42 +0000 Subject: [PATCH 1/9] fix: adjust process oom_score_adj to 0 --- agent/agent.go | 29 ++++++++++++++++++++++++++--- agent/agent_test.go | 15 +++++++++++++-- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 6f601cbcd7742..eb36bd60ab08b 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1569,6 +1569,20 @@ func (a *agent) manageProcessPriorityUntilGracefulShutdown() { return } + const agentOOMScore = "-1000" + + err := afero.WriteFile(a.filesystem, "/proc/self/oom_score_adj", []byte(agentOOMScore), 0600) + if err != nil { + a.logger.Error(ctx, "error adjusting agent oom_score_adj", + slog.F("score", agentOOMScore), + slog.Error(err), + ) + } else { + a.logger.Debug(ctx, "adjusted agent oom_score_adj to avoid OOM Killer", + slog.F("score", agentOOMScore), + ) + } + if a.processManagementTick == nil { ticker := time.NewTicker(time.Second) defer ticker.Stop() @@ -1622,12 +1636,12 @@ func (a *agent) manageProcessPriority(ctx context.Context) ([]*agentproc.Process // If the process is prioritized we should adjust // it's oom_score_adj and avoid lowering its niceness. - if slices.ContainsFunc[[]string, string](prioritizedProcs, containsFn) { + if slices.ContainsFunc(prioritizedProcs, containsFn) { continue } score, err := proc.Niceness(a.syscaller) - if err != nil { + if err != nil && !xerrors.Is(err, os.ErrPermission) { logger.Warn(ctx, "unable to get proc niceness", slog.Error(err), ) @@ -1644,7 +1658,7 @@ func (a *agent) manageProcessPriority(ctx context.Context) ([]*agentproc.Process } err = proc.SetNiceness(a.syscaller, niceness) - if err != nil { + if err != nil && !xerrors.Is(err, os.ErrPermission) { logger.Warn(ctx, "unable to set proc niceness", slog.F("niceness", niceness), slog.Error(err), @@ -1652,6 +1666,15 @@ func (a *agent) manageProcessPriority(ctx context.Context) ([]*agentproc.Process continue } + err = afero.WriteFile(a.filesystem, fmt.Sprintf("/proc/%d/oom_score_adj", proc.PID), []byte("0"), 0600) + if err != nil && !xerrors.Is(err, os.ErrPermission) { + logger.Warn(ctx, "unable to set oom_score_adj", + slog.F("score", "0"), + slog.Error(err), + ) + continue + } + modProcs = append(modProcs, proc) } return modProcs, nil diff --git a/agent/agent_test.go b/agent/agent_test.go index 07258fee5eef3..eb78f894ab3d2 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -2589,8 +2589,16 @@ func TestAgent_ManageProcessPriority(t *testing.T) { logger = slog.Make(sloghuman.Sink(io.Discard)) ) + requireScore := func(t *testing.T, p *agentproc.Process, score string) { + t.Helper() + + actual, err := afero.ReadFile(fs, fmt.Sprintf("/proc/%d/oom_score_adj", p.PID)) + require.NoError(t, err) + require.Equal(t, score, string(actual)) + } + // Create some processes. - for i := 0; i < 2; i++ { + for i := 0; i < 3; i++ { proc := agentproctest.GenerateProcess(t, fs) syscaller.EXPECT(). Kill(proc.PID, syscall.Signal(0)). @@ -2618,7 +2626,10 @@ func TestAgent_ManageProcessPriority(t *testing.T) { }) actualProcs := <-modProcs // We should ignore the process with a custom nice score. - require.Len(t, actualProcs, 1) + require.Len(t, actualProcs, 2) + for _, proc := range actualProcs { + requireScore(t, proc, "0") + } }) t.Run("DisabledByDefault", func(t *testing.T) { From b3a787a5441c64f20ba0ae6472d86dc704339ace Mon Sep 17 00:00:00 2001 From: Jon Ayers Date: Tue, 19 Mar 2024 01:12:39 +0000 Subject: [PATCH 2/9] fmt --- agent/agent.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index eb36bd60ab08b..d20041f7e78b8 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1571,7 +1571,7 @@ func (a *agent) manageProcessPriorityUntilGracefulShutdown() { const agentOOMScore = "-1000" - err := afero.WriteFile(a.filesystem, "/proc/self/oom_score_adj", []byte(agentOOMScore), 0600) + err := afero.WriteFile(a.filesystem, "/proc/self/oom_score_adj", []byte(agentOOMScore), 0o600) if err != nil { a.logger.Error(ctx, "error adjusting agent oom_score_adj", slog.F("score", agentOOMScore), @@ -1666,7 +1666,7 @@ func (a *agent) manageProcessPriority(ctx context.Context) ([]*agentproc.Process continue } - err = afero.WriteFile(a.filesystem, fmt.Sprintf("/proc/%d/oom_score_adj", proc.PID), []byte("0"), 0600) + err = afero.WriteFile(a.filesystem, fmt.Sprintf("/proc/%d/oom_score_adj", proc.PID), []byte("0"), 0o600) if err != nil && !xerrors.Is(err, os.ErrPermission) { logger.Warn(ctx, "unable to set oom_score_adj", slog.F("score", "0"), From f37242b781fa7d17285da6b1064136f3b0f2eb37 Mon Sep 17 00:00:00 2001 From: Jon Ayers Date: Wed, 20 Mar 2024 23:55:39 +0000 Subject: [PATCH 3/9] oom score --- agent/agent.go | 121 ++++++++++++++++++-------- agent/agent_test.go | 86 +++++++++++++++--- agent/agentproc/agentproctest/proc.go | 10 ++- agent/agentproc/proc_unix.go | 23 ++++- agent/agentproc/syscaller.go | 7 +- 5 files changed, 192 insertions(+), 55 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index d20041f7e78b8..aa73e4850aba0 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -63,6 +63,7 @@ const ( // EnvProcPrioMgmt determines whether we attempt to manage // process CPU and OOM Killer priority. const EnvProcPrioMgmt = "CODER_PROC_PRIO_MGMT" +const EnvProcOOMScore = "CODER_PROC_OOM_SCORE" type Options struct { Filesystem afero.Fs @@ -1569,28 +1570,22 @@ func (a *agent) manageProcessPriorityUntilGracefulShutdown() { return } - const agentOOMScore = "-1000" - - err := afero.WriteFile(a.filesystem, "/proc/self/oom_score_adj", []byte(agentOOMScore), 0o600) - if err != nil { - a.logger.Error(ctx, "error adjusting agent oom_score_adj", - slog.F("score", agentOOMScore), - slog.Error(err), - ) - } else { - a.logger.Debug(ctx, "adjusted agent oom_score_adj to avoid OOM Killer", - slog.F("score", agentOOMScore), - ) - } - if a.processManagementTick == nil { ticker := time.NewTicker(time.Second) defer ticker.Stop() a.processManagementTick = ticker.C } + oomScore := unsetOOMScore + if scoreStr, ok := a.environmentVariables[EnvProcOOMScore]; ok { + score, err := strconv.Atoi(strings.TrimSpace(scoreStr)) + if err == nil { + oomScore = score + } + } + for { - procs, err := a.manageProcessPriority(ctx) + procs, err := a.manageProcessPriority(ctx, oomScore) if err != nil { a.logger.Error(ctx, "manage process priority", slog.Error(err), @@ -1608,11 +1603,23 @@ func (a *agent) manageProcessPriorityUntilGracefulShutdown() { } } -func (a *agent) manageProcessPriority(ctx context.Context) ([]*agentproc.Process, error) { +const unsetOOMScore = 1001 + +func (a *agent) manageProcessPriority(ctx context.Context, oomScore int) ([]*agentproc.Process, error) { const ( niceness = 10 ) + agentScore, err := a.getAgentOOMScore() + if err != nil { + agentScore = unsetOOMScore + } + if oomScore == unsetOOMScore && agentScore != unsetOOMScore { + // If the child score has not been explicitly specified we should + // set it to a score relative to the agent score. + oomScore = childOOMScore(agentScore) + } + procs, err := agentproc.List(a.filesystem, a.syscaller) if err != nil { return nil, xerrors.Errorf("list: %w", err) @@ -1640,10 +1647,10 @@ func (a *agent) manageProcessPriority(ctx context.Context) ([]*agentproc.Process continue } - score, err := proc.Niceness(a.syscaller) - if err != nil && !xerrors.Is(err, os.ErrPermission) { + score, niceErr := proc.Niceness(a.syscaller) + if niceErr != nil && !xerrors.Is(niceErr, os.ErrPermission) { logger.Warn(ctx, "unable to get proc niceness", - slog.Error(err), + slog.Error(niceErr), ) continue } @@ -1653,28 +1660,31 @@ func (a *agent) manageProcessPriority(ctx context.Context) ([]*agentproc.Process // Getpriority actually returns priority for the nice value // which is niceness + 20, so here 20 = a niceness of 0 (aka unset). if score != 20 { - // We don't log here since it can get spammy continue } - err = proc.SetNiceness(a.syscaller, niceness) - if err != nil && !xerrors.Is(err, os.ErrPermission) { - logger.Warn(ctx, "unable to set proc niceness", - slog.F("niceness", niceness), - slog.Error(err), - ) - continue + if niceErr == nil { + err := proc.SetNiceness(a.syscaller, niceness) + if err != nil && !xerrors.Is(err, os.ErrPermission) { + logger.Warn(ctx, "unable to set proc niceness", + slog.F("niceness", niceness), + slog.Error(err), + ) + } } - err = afero.WriteFile(a.filesystem, fmt.Sprintf("/proc/%d/oom_score_adj", proc.PID), []byte("0"), 0o600) - if err != nil && !xerrors.Is(err, os.ErrPermission) { - logger.Warn(ctx, "unable to set oom_score_adj", - slog.F("score", "0"), - slog.Error(err), - ) - continue + // If the oom score is valid and it's not already set and isn't a custom value set by another process + // then it's ok to update it. + if oomScore != unsetOOMScore && oomScore != proc.OOMScoreAdj && !isCustomOOMScore(agentScore, proc) { + oomScoreStr := strconv.Itoa(oomScore) + err := afero.WriteFile(a.filesystem, fmt.Sprintf("/proc/%d/oom_score_adj", proc.PID), []byte(oomScoreStr), 0o644) + if err != nil && !xerrors.Is(err, os.ErrPermission) { + logger.Warn(ctx, "unable to set oom_score_adj", + slog.F("score", "0"), + slog.Error(err), + ) + } } - modProcs = append(modProcs, proc) } return modProcs, nil @@ -2028,3 +2038,44 @@ func PrometheusMetricsHandler(prometheusRegistry *prometheus.Registry, logger sl } }) } + +// childOOMScore returns the oom_score_adj for a child process. It is based +// on the oom_score_adj of the agent process. +func childOOMScore(agentScore int) int { + // If the agent has a negative oom_score_adj, we set the child to 0 + // so it's treated like every other process. + if agentScore < 0 { + return 0 + } + + // If the agent is already almost at the maximum then set it to the max. + if agentScore >= 998 { + return 1000 + } + + // If the agent oom_score_adj is >=0, we set the child to slightly + // less than the maximum. If users want a different score they set it + // directly. + return 998 +} + +func (a *agent) getAgentOOMScore() (int, error) { + scoreStr, err := afero.ReadFile(a.filesystem, "/proc/self/oom_score_adj") + if err != nil { + return 0, xerrors.Errorf("read file: %w", err) + } + + score, err := strconv.Atoi(strings.TrimSpace(string(scoreStr))) + if err != nil { + return 0, xerrors.Errorf("parse int: %w", err) + } + + return score, nil +} + +// isCustomOOMScore checks to see if the oom_score_adj is not a value that would +// originate from an agent-spawned process. +func isCustomOOMScore(agentScore int, process *agentproc.Process) bool { + score := process.OOMScoreAdj + return agentScore != score && score != 1000 && score != 0 && score != 998 +} diff --git a/agent/agent_test.go b/agent/agent_test.go index eb78f894ab3d2..c8ca43c4d8e50 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -2529,11 +2529,11 @@ func TestAgent_ManageProcessPriority(t *testing.T) { logger = slog.Make(sloghuman.Sink(io.Discard)) ) + requireFileWrite(t, fs, "/proc/self/oom_score_adj", "-500") + // Create some processes. for i := 0; i < 4; i++ { - // Create a prioritized process. This process should - // have it's oom_score_adj set to -500 and its nice - // score should be untouched. + // Create a prioritized process. var proc agentproc.Process if i == 0 { proc = agentproctest.GenerateProcess(t, fs, @@ -2551,8 +2551,8 @@ func TestAgent_ManageProcessPriority(t *testing.T) { }, ) - syscaller.EXPECT().SetPriority(proc.PID, 10).Return(nil) syscaller.EXPECT().GetPriority(proc.PID).Return(20, nil) + syscaller.EXPECT().SetPriority(proc.PID, 10).Return(nil) } syscaller.EXPECT(). Kill(proc.PID, syscall.Signal(0)). @@ -2571,6 +2571,9 @@ func TestAgent_ManageProcessPriority(t *testing.T) { }) actualProcs := <-modProcs require.Len(t, actualProcs, len(expectedProcs)-1) + for _, proc := range actualProcs { + requireFileEquals(t, fs, fmt.Sprintf("/proc/%d/oom_score_adj", proc.PID), "0") + } }) t.Run("IgnoreCustomNice", func(t *testing.T) { @@ -2589,13 +2592,8 @@ func TestAgent_ManageProcessPriority(t *testing.T) { logger = slog.Make(sloghuman.Sink(io.Discard)) ) - requireScore := func(t *testing.T, p *agentproc.Process, score string) { - t.Helper() - - actual, err := afero.ReadFile(fs, fmt.Sprintf("/proc/%d/oom_score_adj", p.PID)) - require.NoError(t, err) - require.Equal(t, score, string(actual)) - } + err := afero.WriteFile(fs, "/proc/self/oom_score_adj", []byte("0"), 0o644) + require.NoError(t, err) // Create some processes. for i := 0; i < 3; i++ { @@ -2628,8 +2626,58 @@ func TestAgent_ManageProcessPriority(t *testing.T) { // We should ignore the process with a custom nice score. require.Len(t, actualProcs, 2) for _, proc := range actualProcs { - requireScore(t, proc, "0") + _, ok := expectedProcs[proc.PID] + require.True(t, ok) + requireFileEquals(t, fs, fmt.Sprintf("/proc/%d/oom_score_adj", proc.PID), "998") + } + }) + + t.Run("CustomOOMScore", func(t *testing.T) { + t.Parallel() + + if runtime.GOOS != "linux" { + t.Skip("Skipping non-linux environment") + } + + var ( + fs = afero.NewMemMapFs() + ticker = make(chan time.Time) + syscaller = agentproctest.NewMockSyscaller(gomock.NewController(t)) + modProcs = make(chan []*agentproc.Process) + logger = slog.Make(sloghuman.Sink(io.Discard)) + ) + + err := afero.WriteFile(fs, "/proc/self/oom_score_adj", []byte("0"), 0o644) + require.NoError(t, err) + + // Create some processes. + for i := 0; i < 3; i++ { + proc := agentproctest.GenerateProcess(t, fs) + syscaller.EXPECT(). + Kill(proc.PID, syscall.Signal(0)). + Return(nil) + syscaller.EXPECT().GetPriority(proc.PID).Return(20, nil) + syscaller.EXPECT().SetPriority(proc.PID, 10).Return(nil) + } + + _, _, _, _, _ = setupAgent(t, agentsdk.Manifest{}, 0, func(c *agenttest.Client, o *agent.Options) { + o.Syscaller = syscaller + o.ModifiedProcesses = modProcs + o.EnvironmentVariables = map[string]string{ + agent.EnvProcPrioMgmt: "1", + agent.EnvProcOOMScore: "-567", + } + o.Filesystem = fs + o.Logger = logger + o.ProcessManagementTick = ticker + }) + actualProcs := <-modProcs + // We should ignore the process with a custom nice score. + require.Len(t, actualProcs, 3) + for _, proc := range actualProcs { + requireFileEquals(t, fs, fmt.Sprintf("/proc/%d/oom_score_adj", proc.PID), "-567") } + }) t.Run("DisabledByDefault", func(t *testing.T) { @@ -2750,3 +2798,17 @@ func requireEcho(t *testing.T, conn net.Conn) { require.NoError(t, err) require.Equal(t, "test", string(b)) } + +func requireFileWrite(t testing.TB, fs afero.Fs, path, data string) { + t.Helper() + err := afero.WriteFile(fs, path, []byte(data), 0o600) + require.NoError(t, err) +} + +func requireFileEquals(t testing.TB, fs afero.Fs, path, expect string) { + t.Helper() + actual, err := afero.ReadFile(fs, path) + require.NoError(t, err) + + require.Equal(t, expect, string(actual)) +} diff --git a/agent/agentproc/agentproctest/proc.go b/agent/agentproc/agentproctest/proc.go index c36e04ec1cdc3..4fa1c698b50bc 100644 --- a/agent/agentproc/agentproctest/proc.go +++ b/agent/agentproc/agentproctest/proc.go @@ -2,6 +2,7 @@ package agentproctest import ( "fmt" + "strconv" "testing" "github.com/spf13/afero" @@ -29,8 +30,9 @@ func GenerateProcess(t *testing.T, fs afero.Fs, muts ...func(*agentproc.Process) cmdline := fmt.Sprintf("%s\x00%s\x00%s", arg1, arg2, arg3) process := agentproc.Process{ - CmdLine: cmdline, - PID: int32(pid), + CmdLine: cmdline, + PID: int32(pid), + OOMScoreAdj: 0, } for _, mut := range muts { @@ -45,5 +47,9 @@ func GenerateProcess(t *testing.T, fs afero.Fs, muts ...func(*agentproc.Process) err = afero.WriteFile(fs, fmt.Sprintf("%s/cmdline", process.Dir), []byte(process.CmdLine), 0o444) require.NoError(t, err) + score := strconv.Itoa(process.OOMScoreAdj) + err = afero.WriteFile(fs, fmt.Sprintf("%s/oom_score_adj", process.Dir), []byte(score), 0o444) + require.NoError(t, err) + return process } diff --git a/agent/agentproc/proc_unix.go b/agent/agentproc/proc_unix.go index f52caed52ee33..2eeb7d5a2253f 100644 --- a/agent/agentproc/proc_unix.go +++ b/agent/agentproc/proc_unix.go @@ -5,6 +5,7 @@ package agentproc import ( "errors" + "os" "path/filepath" "strconv" "strings" @@ -50,10 +51,26 @@ func List(fs afero.Fs, syscaller Syscaller) ([]*Process, error) { } return nil, xerrors.Errorf("read cmdline: %w", err) } + + oomScore, err := afero.ReadFile(fs, filepath.Join(defaultProcDir, entry, "oom_score_adj")) + if err != nil { + if xerrors.Is(err, os.ErrPermission) { + continue + } + + return nil, xerrors.Errorf("read oom_score_adj: %w", err) + } + + oom, err := strconv.Atoi(strings.TrimSpace(string(oomScore))) + if err != nil { + return nil, xerrors.Errorf("convert oom score: %w", err) + } + processes = append(processes, &Process{ - PID: int32(pid), - CmdLine: string(cmdline), - Dir: filepath.Join(defaultProcDir, entry), + PID: int32(pid), + CmdLine: string(cmdline), + Dir: filepath.Join(defaultProcDir, entry), + OOMScoreAdj: oom, }) } diff --git a/agent/agentproc/syscaller.go b/agent/agentproc/syscaller.go index 25dc6cfd54c36..fba3bf32ce5c9 100644 --- a/agent/agentproc/syscaller.go +++ b/agent/agentproc/syscaller.go @@ -14,7 +14,8 @@ type Syscaller interface { const defaultProcDir = "/proc" type Process struct { - Dir string - CmdLine string - PID int32 + Dir string + CmdLine string + PID int32 + OOMScoreAdj int } From 403a63719d20b36ea156f5b48121d3c3c94914b5 Mon Sep 17 00:00:00 2001 From: Jon Ayers Date: Wed, 20 Mar 2024 23:59:24 +0000 Subject: [PATCH 4/9] fmt --- agent/agent.go | 6 ++++-- agent/agent_test.go | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index aa73e4850aba0..76e27d426b47d 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -62,8 +62,10 @@ const ( // EnvProcPrioMgmt determines whether we attempt to manage // process CPU and OOM Killer priority. -const EnvProcPrioMgmt = "CODER_PROC_PRIO_MGMT" -const EnvProcOOMScore = "CODER_PROC_OOM_SCORE" +const ( + EnvProcPrioMgmt = "CODER_PROC_PRIO_MGMT" + EnvProcOOMScore = "CODER_PROC_OOM_SCORE" +) type Options struct { Filesystem afero.Fs diff --git a/agent/agent_test.go b/agent/agent_test.go index c8ca43c4d8e50..fe0431bb260ab 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -2677,7 +2677,6 @@ func TestAgent_ManageProcessPriority(t *testing.T) { for _, proc := range actualProcs { requireFileEquals(t, fs, fmt.Sprintf("/proc/%d/oom_score_adj", proc.PID), "-567") } - }) t.Run("DisabledByDefault", func(t *testing.T) { From a21497444929c948d6730185d9202d8f3cf26c65 Mon Sep 17 00:00:00 2001 From: Jon Ayers Date: Thu, 21 Mar 2024 00:04:54 +0000 Subject: [PATCH 5/9] lint --- agent/agent_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/agent/agent_test.go b/agent/agent_test.go index fe0431bb260ab..945e16d58fcff 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -2798,15 +2798,15 @@ func requireEcho(t *testing.T, conn net.Conn) { require.Equal(t, "test", string(b)) } -func requireFileWrite(t testing.TB, fs afero.Fs, path, data string) { +func requireFileWrite(t testing.TB, fs afero.Fs, fp, data string) { t.Helper() - err := afero.WriteFile(fs, path, []byte(data), 0o600) + err := afero.WriteFile(fs, fp, []byte(data), 0o600) require.NoError(t, err) } -func requireFileEquals(t testing.TB, fs afero.Fs, path, expect string) { +func requireFileEquals(t testing.TB, fs afero.Fs, fp, expect string) { t.Helper() - actual, err := afero.ReadFile(fs, path) + actual, err := afero.ReadFile(fs, fp) require.NoError(t, err) require.Equal(t, expect, string(actual)) From 17714eda6fd9af4c57d8669d2bf2e2627af52cb4 Mon Sep 17 00:00:00 2001 From: Jon Ayers Date: Tue, 2 Apr 2024 22:51:28 +0000 Subject: [PATCH 6/9] debounce logs --- agent/agent.go | 81 +++++++++++++++++++++++++++++------- agent/stats_internal_test.go | 59 ++++++++++++++++++++++++++ 2 files changed, 125 insertions(+), 15 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 6404b34d4b6ac..2c93b7e3693aa 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1581,15 +1581,28 @@ func (a *agent) manageProcessPriorityUntilGracefulShutdown() { oomScore := unsetOOMScore if scoreStr, ok := a.environmentVariables[EnvProcOOMScore]; ok { score, err := strconv.Atoi(strings.TrimSpace(scoreStr)) - if err == nil { + if err == nil && score >= -1000 && score <= 1000 { oomScore = score + } else { + a.logger.Error(ctx, "invalid oom score", + slog.F("min_value", -1000), + slog.F("max_value", 1000), + slog.F("value", scoreStr), + ) } } + debouncer := &logDebouncer{ + logger: a.logger, + messages: map[string]time.Time{}, + interval: time.Minute, + } + for { - procs, err := a.manageProcessPriority(ctx, oomScore) + procs, err := a.manageProcessPriority(ctx, debouncer, oomScore) + // Avoid spamming the logs too often. if err != nil { - a.logger.Error(ctx, "manage process priority", + debouncer.Error(ctx, "manage process priority", slog.Error(err), ) } @@ -1605,13 +1618,16 @@ func (a *agent) manageProcessPriorityUntilGracefulShutdown() { } } +// unsetOOMScore is set to an invalid OOM score to imply an unset value. const unsetOOMScore = 1001 -func (a *agent) manageProcessPriority(ctx context.Context, oomScore int) ([]*agentproc.Process, error) { +func (a *agent) manageProcessPriority(ctx context.Context, debouncer *logDebouncer, oomScore int) ([]*agentproc.Process, error) { const ( niceness = 10 ) + // We fetch the agent score each time because it's possible someone updates the + // value after it is started. agentScore, err := a.getAgentOOMScore() if err != nil { agentScore = unsetOOMScore @@ -1629,14 +1645,9 @@ func (a *agent) manageProcessPriority(ctx context.Context, oomScore int) ([]*age var ( modProcs = []*agentproc.Process{} - logger slog.Logger ) for _, proc := range procs { - logger = a.logger.With( - slog.F("cmd", proc.Cmd()), - slog.F("pid", proc.PID), - ) containsFn := func(e string) bool { contains := strings.Contains(proc.Cmd(), e) @@ -1651,7 +1662,9 @@ func (a *agent) manageProcessPriority(ctx context.Context, oomScore int) ([]*age score, niceErr := proc.Niceness(a.syscaller) if niceErr != nil && !xerrors.Is(niceErr, os.ErrPermission) { - logger.Warn(ctx, "unable to get proc niceness", + debouncer.Warn(ctx, "unable to get proc niceness", + slog.F("cmd", proc.Cmd()), + slog.F("pid", proc.PID), slog.Error(niceErr), ) continue @@ -1662,27 +1675,31 @@ func (a *agent) manageProcessPriority(ctx context.Context, oomScore int) ([]*age // Getpriority actually returns priority for the nice value // which is niceness + 20, so here 20 = a niceness of 0 (aka unset). if score != 20 { + // We don't log here since it can get spammy continue } if niceErr == nil { err := proc.SetNiceness(a.syscaller, niceness) if err != nil && !xerrors.Is(err, os.ErrPermission) { - logger.Warn(ctx, "unable to set proc niceness", + debouncer.Warn(ctx, "unable to set proc niceness", + slog.F("cmd", proc.Cmd()), + slog.F("pid", proc.PID), slog.F("niceness", niceness), slog.Error(err), ) } } - // If the oom score is valid and it's not already set and isn't a custom value set by another process - // then it's ok to update it. + // If the oom score is valid and it's not already set and isn't a custom value set by another process then it's ok to update it. if oomScore != unsetOOMScore && oomScore != proc.OOMScoreAdj && !isCustomOOMScore(agentScore, proc) { oomScoreStr := strconv.Itoa(oomScore) err := afero.WriteFile(a.filesystem, fmt.Sprintf("/proc/%d/oom_score_adj", proc.PID), []byte(oomScoreStr), 0o644) if err != nil && !xerrors.Is(err, os.ErrPermission) { - logger.Warn(ctx, "unable to set oom_score_adj", - slog.F("score", "0"), + debouncer.Warn(ctx, "unable to set oom_score_adj", + slog.F("cmd", proc.Cmd()), + slog.F("pid", proc.PID), + slog.F("score", oomScoreStr), slog.Error(err), ) } @@ -2081,3 +2098,37 @@ func isCustomOOMScore(agentScore int, process *agentproc.Process) bool { score := process.OOMScoreAdj return agentScore != score && score != 1000 && score != 0 && score != 998 } + +// logDebouncer prevents generating a log for a particular message if +// it's been emitted within the given interval duration. +// It's a shoddy implementation use in one spot that should be replaced at +// some point. +type logDebouncer struct { + logger slog.Logger + messages map[string]time.Time + interval time.Duration +} + +func (l *logDebouncer) Warn(ctx context.Context, msg string, fields ...any) { + l.log(ctx, slog.LevelWarn, msg, fields...) +} + +func (l *logDebouncer) Error(ctx context.Context, msg string, fields ...any) { + l.log(ctx, slog.LevelError, msg, fields...) +} + +func (l *logDebouncer) log(ctx context.Context, level slog.Level, msg string, fields ...any) { + // This (bad) implementation assumes you wouldn't reuse the same msg + // for different levels. + last, ok := l.messages[msg] + if ok && time.Since(last) < l.interval { + return + } + switch level { + case slog.LevelWarn: + l.logger.Warn(ctx, msg, fields...) + case slog.LevelError: + l.logger.Error(ctx, msg, fields...) + } + l.messages[msg] = time.Now() +} diff --git a/agent/stats_internal_test.go b/agent/stats_internal_test.go index bfd6a3436d499..57b21a655a493 100644 --- a/agent/stats_internal_test.go +++ b/agent/stats_internal_test.go @@ -1,7 +1,10 @@ package agent import ( + "bytes" "context" + "encoding/json" + "io" "net/netip" "sync" "testing" @@ -14,6 +17,7 @@ import ( "tailscale.com/types/netlogtype" "cdr.dev/slog" + "cdr.dev/slog/sloggers/slogjson" "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/v2/agent/proto" "github.com/coder/coder/v2/testutil" @@ -210,3 +214,58 @@ func newFakeStatsDest() *fakeStatsDest { resps: make(chan *proto.UpdateStatsResponse), } } + +func Test_logDebouncer(t *testing.T) { + t.Parallel() + + var ( + buf bytes.Buffer + logger = slog.Make(slogjson.Sink(&buf)) + ctx = context.Background() + ) + + debouncer := &logDebouncer{ + logger: logger, + messages: map[string]time.Time{}, + interval: time.Minute, + } + + fields := map[string]interface{}{ + "field_1": float64(1), + "field_2": "2", + } + + debouncer.Error(ctx, "my message", "field_1", 1, "field_2", "2") + debouncer.Warn(ctx, "another message", "field_1", 1, "field_2", "2") + // Shouldn't log this. + debouncer.Warn(ctx, "another message", "field_1", 1, "field_2", "2") + + require.Len(t, debouncer.messages, 2) + + type entry struct { + Msg string `json:"msg"` + Level string `json:"level"` + Fields map[string]interface{} `json:"fields"` + } + + assertLog := func(msg string, level string, fields map[string]interface{}) { + line, err := buf.ReadString('\n') + require.NoError(t, err) + + var e entry + err = json.Unmarshal([]byte(line), &e) + require.NoError(t, err) + require.Equal(t, msg, e.Msg) + require.Equal(t, level, e.Level) + require.Equal(t, fields, e.Fields) + } + assertLog("my message", "ERROR", fields) + assertLog("another message", "WARN", fields) + + debouncer.messages["another message"] = time.Now().Add(-2 * time.Minute) + debouncer.Warn(ctx, "another message", "field_1", 1, "field_2", "2") + assertLog("another message", "WARN", fields) + // Assert nothing else was written. + _, err := buf.ReadString('\n') + require.ErrorIs(t, err, io.EOF) +} From 561279b095a27279ddde99411098fc54a3ba2d20 Mon Sep 17 00:00:00 2001 From: Jon Ayers Date: Tue, 2 Apr 2024 22:58:42 +0000 Subject: [PATCH 7/9] lint --- agent/agent.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 2c93b7e3693aa..b25c8bf78f058 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1648,7 +1648,6 @@ func (a *agent) manageProcessPriority(ctx context.Context, debouncer *logDebounc ) for _, proc := range procs { - containsFn := func(e string) bool { contains := strings.Contains(proc.Cmd(), e) return contains @@ -2099,9 +2098,9 @@ func isCustomOOMScore(agentScore int, process *agentproc.Process) bool { return agentScore != score && score != 1000 && score != 0 && score != 998 } -// logDebouncer prevents generating a log for a particular message if +// logDebouncer skips writing a log for a particular message if // it's been emitted within the given interval duration. -// It's a shoddy implementation use in one spot that should be replaced at +// It's a shoddy implementation used in one spot that should be replaced at // some point. type logDebouncer struct { logger slog.Logger @@ -2120,8 +2119,7 @@ func (l *logDebouncer) Error(ctx context.Context, msg string, fields ...any) { func (l *logDebouncer) log(ctx context.Context, level slog.Level, msg string, fields ...any) { // This (bad) implementation assumes you wouldn't reuse the same msg // for different levels. - last, ok := l.messages[msg] - if ok && time.Since(last) < l.interval { + if last, ok := l.messages[msg]; ok && time.Since(last) < l.interval { return } switch level { From 2ddeb8e562f8d19d9c2dd8a0809ef01b6c5ff309 Mon Sep 17 00:00:00 2001 From: Jon Ayers Date: Tue, 2 Apr 2024 22:59:18 +0000 Subject: [PATCH 8/9] fmt --- agent/agent.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index b25c8bf78f058..abbe9c8ea44e6 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1643,9 +1643,7 @@ func (a *agent) manageProcessPriority(ctx context.Context, debouncer *logDebounc return nil, xerrors.Errorf("list: %w", err) } - var ( - modProcs = []*agentproc.Process{} - ) + modProcs := []*agentproc.Process{} for _, proc := range procs { containsFn := func(e string) bool { From 9f5f63c44b67e8ddd12583d38935ca04ee29af39 Mon Sep 17 00:00:00 2001 From: Jon Ayers Date: Wed, 3 Apr 2024 14:32:43 +0000 Subject: [PATCH 9/9] pass through env var --- cli/agent.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cli/agent.go b/cli/agent.go index aaef3805e63dd..1f91f1c98bb8d 100644 --- a/cli/agent.go +++ b/cli/agent.go @@ -283,6 +283,9 @@ func (r *RootCmd) workspaceAgent() *serpent.Command { if v, ok := os.LookupEnv(agent.EnvProcPrioMgmt); ok { environmentVariables[agent.EnvProcPrioMgmt] = v } + if v, ok := os.LookupEnv(agent.EnvProcOOMScore); ok { + environmentVariables[agent.EnvProcOOMScore] = v + } agnt := agent.New(agent.Options{ Client: client,