@@ -14,8 +14,6 @@ import (
14
14
"os"
15
15
"os/user"
16
16
"path/filepath"
17
- "runtime"
18
- "runtime/debug"
19
17
"sort"
20
18
"strconv"
21
19
"strings"
@@ -305,8 +303,6 @@ func (a *agent) init() {
305
303
// may be happening, but regardless after the intermittent
306
304
// failure, you'll want the agent to reconnect.
307
305
func (a * agent ) runLoop () {
308
- go a .manageProcessPriorityUntilGracefulShutdown ()
309
-
310
306
// need to keep retrying up to the hardCtx so that we can send graceful shutdown-related
311
307
// messages.
312
308
ctx := a .hardCtx
@@ -1564,162 +1560,6 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect
1564
1560
return stats
1565
1561
}
1566
1562
1567
// prioritizedProcs lists command-line substrings. Any process whose command
// contains one of these substrings is skipped entirely by
// manageProcessPriority.
var prioritizedProcs = []string{
	"coder agent",
}
1568
-
1569
- func (a * agent ) manageProcessPriorityUntilGracefulShutdown () {
1570
- // process priority can stop as soon as we are gracefully shutting down
1571
- ctx := a .gracefulCtx
1572
- defer func () {
1573
- if r := recover (); r != nil {
1574
- a .logger .Critical (ctx , "recovered from panic" ,
1575
- slog .F ("panic" , r ),
1576
- slog .F ("stack" , string (debug .Stack ())),
1577
- )
1578
- }
1579
- }()
1580
-
1581
- if val := a .environmentVariables [EnvProcPrioMgmt ]; val == "" || runtime .GOOS != "linux" {
1582
- a .logger .Debug (ctx , "process priority not enabled, agent will not manage process niceness/oom_score_adj " ,
1583
- slog .F ("env_var" , EnvProcPrioMgmt ),
1584
- slog .F ("value" , val ),
1585
- slog .F ("goos" , runtime .GOOS ),
1586
- )
1587
- return
1588
- }
1589
-
1590
- if a .processManagementTick == nil {
1591
- ticker := time .NewTicker (time .Second )
1592
- defer ticker .Stop ()
1593
- a .processManagementTick = ticker .C
1594
- }
1595
-
1596
- oomScore := unsetOOMScore
1597
- if scoreStr , ok := a .environmentVariables [EnvProcOOMScore ]; ok {
1598
- score , err := strconv .Atoi (strings .TrimSpace (scoreStr ))
1599
- if err == nil && score >= - 1000 && score <= 1000 {
1600
- oomScore = score
1601
- } else {
1602
- a .logger .Error (ctx , "invalid oom score" ,
1603
- slog .F ("min_value" , - 1000 ),
1604
- slog .F ("max_value" , 1000 ),
1605
- slog .F ("value" , scoreStr ),
1606
- )
1607
- }
1608
- }
1609
-
1610
- debouncer := & logDebouncer {
1611
- logger : a .logger ,
1612
- messages : map [string ]time.Time {},
1613
- interval : time .Minute ,
1614
- }
1615
-
1616
- for {
1617
- procs , err := a .manageProcessPriority (ctx , debouncer , oomScore )
1618
- // Avoid spamming the logs too often.
1619
- if err != nil {
1620
- debouncer .Error (ctx , "manage process priority" ,
1621
- slog .Error (err ),
1622
- )
1623
- }
1624
- if a .modifiedProcs != nil {
1625
- a .modifiedProcs <- procs
1626
- }
1627
-
1628
- select {
1629
- case <- a .processManagementTick :
1630
- case <- ctx .Done ():
1631
- return
1632
- }
1633
- }
1634
- }
1635
-
1636
// unsetOOMScore is a sentinel meaning "no OOM score has been chosen". It is
// deliberately an invalid oom_score_adj value so that it cannot collide with
// a real one.
const unsetOOMScore = 1001
1638
-
1639
- func (a * agent ) manageProcessPriority (ctx context.Context , debouncer * logDebouncer , oomScore int ) ([]* agentproc.Process , error ) {
1640
- const (
1641
- niceness = 10
1642
- )
1643
-
1644
- // We fetch the agent score each time because it's possible someone updates the
1645
- // value after it is started.
1646
- agentScore , err := a .getAgentOOMScore ()
1647
- if err != nil {
1648
- agentScore = unsetOOMScore
1649
- }
1650
- if oomScore == unsetOOMScore && agentScore != unsetOOMScore {
1651
- // If the child score has not been explicitly specified we should
1652
- // set it to a score relative to the agent score.
1653
- oomScore = childOOMScore (agentScore )
1654
- }
1655
-
1656
- procs , err := agentproc .List (a .filesystem , a .syscaller )
1657
- if err != nil {
1658
- return nil , xerrors .Errorf ("list: %w" , err )
1659
- }
1660
-
1661
- modProcs := []* agentproc.Process {}
1662
-
1663
- for _ , proc := range procs {
1664
- containsFn := func (e string ) bool {
1665
- contains := strings .Contains (proc .Cmd (), e )
1666
- return contains
1667
- }
1668
-
1669
- // If the process is prioritized we should adjust
1670
- // it's oom_score_adj and avoid lowering its niceness.
1671
- if slices .ContainsFunc (prioritizedProcs , containsFn ) {
1672
- continue
1673
- }
1674
-
1675
- score , niceErr := proc .Niceness (a .syscaller )
1676
- if niceErr != nil && ! isBenignProcessErr (niceErr ) {
1677
- debouncer .Warn (ctx , "unable to get proc niceness" ,
1678
- slog .F ("cmd" , proc .Cmd ()),
1679
- slog .F ("pid" , proc .PID ),
1680
- slog .Error (niceErr ),
1681
- )
1682
- }
1683
-
1684
- // We only want processes that don't have a nice value set
1685
- // so we don't override user nice values.
1686
- // Getpriority actually returns priority for the nice value
1687
- // which is niceness + 20, so here 20 = a niceness of 0 (aka unset).
1688
- if score != 20 {
1689
- // We don't log here since it can get spammy
1690
- continue
1691
- }
1692
-
1693
- if niceErr == nil {
1694
- err := proc .SetNiceness (a .syscaller , niceness )
1695
- if err != nil && ! isBenignProcessErr (err ) {
1696
- debouncer .Warn (ctx , "unable to set proc niceness" ,
1697
- slog .F ("cmd" , proc .Cmd ()),
1698
- slog .F ("pid" , proc .PID ),
1699
- slog .F ("niceness" , niceness ),
1700
- slog .Error (err ),
1701
- )
1702
- }
1703
- }
1704
-
1705
- // If the oom score is valid and it's not already set and isn't a custom value set by another process then it's ok to update it.
1706
- if oomScore != unsetOOMScore && oomScore != proc .OOMScoreAdj && ! isCustomOOMScore (agentScore , proc ) {
1707
- oomScoreStr := strconv .Itoa (oomScore )
1708
- err := afero .WriteFile (a .filesystem , fmt .Sprintf ("/proc/%d/oom_score_adj" , proc .PID ), []byte (oomScoreStr ), 0o644 )
1709
- if err != nil && ! isBenignProcessErr (err ) {
1710
- debouncer .Warn (ctx , "unable to set oom_score_adj" ,
1711
- slog .F ("cmd" , proc .Cmd ()),
1712
- slog .F ("pid" , proc .PID ),
1713
- slog .F ("score" , oomScoreStr ),
1714
- slog .Error (err ),
1715
- )
1716
- }
1717
- }
1718
- modProcs = append (modProcs , proc )
1719
- }
1720
- return modProcs , nil
1721
- }
1722
-
1723
1563
// isClosed returns whether the API is closed or not.
1724
1564
func (a * agent ) isClosed () bool {
1725
1565
return a .hardCtx .Err () != nil
@@ -2113,88 +1953,3 @@ func PrometheusMetricsHandler(prometheusRegistry *prometheus.Registry, logger sl
2113
1953
}
2114
1954
})
2115
1955
}
2116
-
2117
// childOOMScore maps the agent's own oom_score_adj to the oom_score_adj that
// should be given to a child process:
//
//   - a negative agent score yields 0, so the child is treated like every
//     other process;
//   - an agent score of 998 or more yields the maximum, 1000;
//   - anything else yields 998, slightly below the maximum. Users who want a
//     different score are expected to set it directly.
func childOOMScore(agentScore int) int {
	switch {
	case agentScore < 0:
		return 0
	case agentScore >= 998:
		return 1000
	default:
		return 998
	}
}
2136
-
2137
- func (a * agent ) getAgentOOMScore () (int , error ) {
2138
- scoreStr , err := afero .ReadFile (a .filesystem , "/proc/self/oom_score_adj" )
2139
- if err != nil {
2140
- return 0 , xerrors .Errorf ("read file: %w" , err )
2141
- }
2142
-
2143
- score , err := strconv .Atoi (strings .TrimSpace (string (scoreStr )))
2144
- if err != nil {
2145
- return 0 , xerrors .Errorf ("parse int: %w" , err )
2146
- }
2147
-
2148
- return score , nil
2149
- }
2150
-
2151
- // isCustomOOMScore checks to see if the oom_score_adj is not a value that would
2152
- // originate from an agent-spawned process.
2153
- func isCustomOOMScore (agentScore int , process * agentproc.Process ) bool {
2154
- score := process .OOMScoreAdj
2155
- return agentScore != score && score != 1000 && score != 0 && score != 998
2156
- }
2157
-
2158
- // logDebouncer skips writing a log for a particular message if
2159
- // it's been emitted within the given interval duration.
2160
- // It's a shoddy implementation used in one spot that should be replaced at
2161
- // some point.
2162
- type logDebouncer struct {
2163
- logger slog.Logger
2164
- messages map [string ]time.Time
2165
- interval time.Duration
2166
- }
2167
-
2168
- func (l * logDebouncer ) Warn (ctx context.Context , msg string , fields ... any ) {
2169
- l .log (ctx , slog .LevelWarn , msg , fields ... )
2170
- }
2171
-
2172
- func (l * logDebouncer ) Error (ctx context.Context , msg string , fields ... any ) {
2173
- l .log (ctx , slog .LevelError , msg , fields ... )
2174
- }
2175
-
2176
- func (l * logDebouncer ) log (ctx context.Context , level slog.Level , msg string , fields ... any ) {
2177
- // This (bad) implementation assumes you wouldn't reuse the same msg
2178
- // for different levels.
2179
- if last , ok := l .messages [msg ]; ok && time .Since (last ) < l .interval {
2180
- return
2181
- }
2182
- switch level {
2183
- case slog .LevelWarn :
2184
- l .logger .Warn (ctx , msg , fields ... )
2185
- case slog .LevelError :
2186
- l .logger .Error (ctx , msg , fields ... )
2187
- }
2188
- l .messages [msg ] = time .Now ()
2189
- }
2190
-
2191
- func isBenignProcessErr (err error ) bool {
2192
- return err != nil &&
2193
- (xerrors .Is (err , os .ErrNotExist ) ||
2194
- xerrors .Is (err , os .ErrPermission ) ||
2195
- isNoSuchProcessErr (err ))
2196
- }
2197
-
2198
// isNoSuchProcessErr reports whether err is non-nil and its message contains
// the text "no such process".
func isNoSuchProcessErr(err error) bool {
	if err == nil {
		return false
	}
	return strings.Contains(err.Error(), "no such process")
}
0 commit comments