Skip to content

Commit 17714ed

Browse files
committed
debounce logs
1 parent 54b7f4c commit 17714ed

File tree

2 files changed

+125
-15
lines changed

2 files changed

+125
-15
lines changed

agent/agent.go

Lines changed: 66 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1581,15 +1581,28 @@ func (a *agent) manageProcessPriorityUntilGracefulShutdown() {
15811581
oomScore := unsetOOMScore
15821582
if scoreStr, ok := a.environmentVariables[EnvProcOOMScore]; ok {
15831583
score, err := strconv.Atoi(strings.TrimSpace(scoreStr))
1584-
if err == nil {
1584+
if err == nil && score >= -1000 && score <= 1000 {
15851585
oomScore = score
1586+
} else {
1587+
a.logger.Error(ctx, "invalid oom score",
1588+
slog.F("min_value", -1000),
1589+
slog.F("max_value", 1000),
1590+
slog.F("value", scoreStr),
1591+
)
15861592
}
15871593
}
15881594

1595+
debouncer := &logDebouncer{
1596+
logger: a.logger,
1597+
messages: map[string]time.Time{},
1598+
interval: time.Minute,
1599+
}
1600+
15891601
for {
1590-
procs, err := a.manageProcessPriority(ctx, oomScore)
1602+
procs, err := a.manageProcessPriority(ctx, debouncer, oomScore)
1603+
// Avoid spamming the logs too often.
15911604
if err != nil {
1592-
a.logger.Error(ctx, "manage process priority",
1605+
debouncer.Error(ctx, "manage process priority",
15931606
slog.Error(err),
15941607
)
15951608
}
@@ -1605,13 +1618,16 @@ func (a *agent) manageProcessPriorityUntilGracefulShutdown() {
16051618
}
16061619
}
16071620

1621+
// unsetOOMScore is set to an invalid OOM score to imply an unset value.
16081622
const unsetOOMScore = 1001
16091623

1610-
func (a *agent) manageProcessPriority(ctx context.Context, oomScore int) ([]*agentproc.Process, error) {
1624+
func (a *agent) manageProcessPriority(ctx context.Context, debouncer *logDebouncer, oomScore int) ([]*agentproc.Process, error) {
16111625
const (
16121626
niceness = 10
16131627
)
16141628

1629+
// We fetch the agent score each time because it's possible someone updates the
1630+
// value after it is started.
16151631
agentScore, err := a.getAgentOOMScore()
16161632
if err != nil {
16171633
agentScore = unsetOOMScore
@@ -1629,14 +1645,9 @@ func (a *agent) manageProcessPriority(ctx context.Context, oomScore int) ([]*age
16291645

16301646
var (
16311647
modProcs = []*agentproc.Process{}
1632-
logger slog.Logger
16331648
)
16341649

16351650
for _, proc := range procs {
1636-
logger = a.logger.With(
1637-
slog.F("cmd", proc.Cmd()),
1638-
slog.F("pid", proc.PID),
1639-
)
16401651

16411652
containsFn := func(e string) bool {
16421653
contains := strings.Contains(proc.Cmd(), e)
@@ -1651,7 +1662,9 @@ func (a *agent) manageProcessPriority(ctx context.Context, oomScore int) ([]*age
16511662

16521663
score, niceErr := proc.Niceness(a.syscaller)
16531664
if niceErr != nil && !xerrors.Is(niceErr, os.ErrPermission) {
1654-
logger.Warn(ctx, "unable to get proc niceness",
1665+
debouncer.Warn(ctx, "unable to get proc niceness",
1666+
slog.F("cmd", proc.Cmd()),
1667+
slog.F("pid", proc.PID),
16551668
slog.Error(niceErr),
16561669
)
16571670
continue
@@ -1662,27 +1675,31 @@ func (a *agent) manageProcessPriority(ctx context.Context, oomScore int) ([]*age
16621675
// Getpriority actually returns priority for the nice value
16631676
// which is niceness + 20, so here 20 = a niceness of 0 (aka unset).
16641677
if score != 20 {
1678+
// We don't log here since it can get spammy
16651679
continue
16661680
}
16671681

16681682
if niceErr == nil {
16691683
err := proc.SetNiceness(a.syscaller, niceness)
16701684
if err != nil && !xerrors.Is(err, os.ErrPermission) {
1671-
logger.Warn(ctx, "unable to set proc niceness",
1685+
debouncer.Warn(ctx, "unable to set proc niceness",
1686+
slog.F("cmd", proc.Cmd()),
1687+
slog.F("pid", proc.PID),
16721688
slog.F("niceness", niceness),
16731689
slog.Error(err),
16741690
)
16751691
}
16761692
}
16771693

1678-
// If the oom score is valid and it's not already set and isn't a custom value set by another process
1679-
// then it's ok to update it.
1694+
// If the oom score is valid and it's not already set and isn't a custom value set by another process then it's ok to update it.
16801695
if oomScore != unsetOOMScore && oomScore != proc.OOMScoreAdj && !isCustomOOMScore(agentScore, proc) {
16811696
oomScoreStr := strconv.Itoa(oomScore)
16821697
err := afero.WriteFile(a.filesystem, fmt.Sprintf("/proc/%d/oom_score_adj", proc.PID), []byte(oomScoreStr), 0o644)
16831698
if err != nil && !xerrors.Is(err, os.ErrPermission) {
1684-
logger.Warn(ctx, "unable to set oom_score_adj",
1685-
slog.F("score", "0"),
1699+
debouncer.Warn(ctx, "unable to set oom_score_adj",
1700+
slog.F("cmd", proc.Cmd()),
1701+
slog.F("pid", proc.PID),
1702+
slog.F("score", oomScoreStr),
16861703
slog.Error(err),
16871704
)
16881705
}
@@ -2081,3 +2098,37 @@ func isCustomOOMScore(agentScore int, process *agentproc.Process) bool {
20812098
score := process.OOMScoreAdj
20822099
return agentScore != score && score != 1000 && score != 0 && score != 998
20832100
}
2101+
2102+
// logDebouncer prevents generating a log for a particular message if
2103+
// it's been emitted within the given interval duration.
2104+
// It's a shoddy implementation use in one spot that should be replaced at
2105+
// some point.
2106+
type logDebouncer struct {
2107+
logger slog.Logger
2108+
messages map[string]time.Time
2109+
interval time.Duration
2110+
}
2111+
2112+
func (l *logDebouncer) Warn(ctx context.Context, msg string, fields ...any) {
2113+
l.log(ctx, slog.LevelWarn, msg, fields...)
2114+
}
2115+
2116+
func (l *logDebouncer) Error(ctx context.Context, msg string, fields ...any) {
2117+
l.log(ctx, slog.LevelError, msg, fields...)
2118+
}
2119+
2120+
func (l *logDebouncer) log(ctx context.Context, level slog.Level, msg string, fields ...any) {
2121+
// This (bad) implementation assumes you wouldn't reuse the same msg
2122+
// for different levels.
2123+
last, ok := l.messages[msg]
2124+
if ok && time.Since(last) < l.interval {
2125+
return
2126+
}
2127+
switch level {
2128+
case slog.LevelWarn:
2129+
l.logger.Warn(ctx, msg, fields...)
2130+
case slog.LevelError:
2131+
l.logger.Error(ctx, msg, fields...)
2132+
}
2133+
l.messages[msg] = time.Now()
2134+
}

agent/stats_internal_test.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
package agent
22

33
import (
4+
"bytes"
45
"context"
6+
"encoding/json"
7+
"io"
58
"net/netip"
69
"sync"
710
"testing"
@@ -14,6 +17,7 @@ import (
1417
"tailscale.com/types/netlogtype"
1518

1619
"cdr.dev/slog"
20+
"cdr.dev/slog/sloggers/slogjson"
1721
"cdr.dev/slog/sloggers/slogtest"
1822
"github.com/coder/coder/v2/agent/proto"
1923
"github.com/coder/coder/v2/testutil"
@@ -210,3 +214,58 @@ func newFakeStatsDest() *fakeStatsDest {
210214
resps: make(chan *proto.UpdateStatsResponse),
211215
}
212216
}
217+
218+
func Test_logDebouncer(t *testing.T) {
219+
t.Parallel()
220+
221+
var (
222+
buf bytes.Buffer
223+
logger = slog.Make(slogjson.Sink(&buf))
224+
ctx = context.Background()
225+
)
226+
227+
debouncer := &logDebouncer{
228+
logger: logger,
229+
messages: map[string]time.Time{},
230+
interval: time.Minute,
231+
}
232+
233+
fields := map[string]interface{}{
234+
"field_1": float64(1),
235+
"field_2": "2",
236+
}
237+
238+
debouncer.Error(ctx, "my message", "field_1", 1, "field_2", "2")
239+
debouncer.Warn(ctx, "another message", "field_1", 1, "field_2", "2")
240+
// Shouldn't log this.
241+
debouncer.Warn(ctx, "another message", "field_1", 1, "field_2", "2")
242+
243+
require.Len(t, debouncer.messages, 2)
244+
245+
type entry struct {
246+
Msg string `json:"msg"`
247+
Level string `json:"level"`
248+
Fields map[string]interface{} `json:"fields"`
249+
}
250+
251+
assertLog := func(msg string, level string, fields map[string]interface{}) {
252+
line, err := buf.ReadString('\n')
253+
require.NoError(t, err)
254+
255+
var e entry
256+
err = json.Unmarshal([]byte(line), &e)
257+
require.NoError(t, err)
258+
require.Equal(t, msg, e.Msg)
259+
require.Equal(t, level, e.Level)
260+
require.Equal(t, fields, e.Fields)
261+
}
262+
assertLog("my message", "ERROR", fields)
263+
assertLog("another message", "WARN", fields)
264+
265+
debouncer.messages["another message"] = time.Now().Add(-2 * time.Minute)
266+
debouncer.Warn(ctx, "another message", "field_1", 1, "field_2", "2")
267+
assertLog("another message", "WARN", fields)
268+
// Assert nothing else was written.
269+
_, err := buf.ReadString('\n')
270+
require.ErrorIs(t, err, io.EOF)
271+
}

0 commit comments

Comments
 (0)