Skip to content

Commit bb22d9b

Browse files
committed
Merge branch 'main' of https://github.com/coder/coder into spike/9428-pt2-integrate-acquirer
2 parents 23d0e3c + 5de5d20 commit bb22d9b

File tree

267 files changed

+6779
-2556
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

267 files changed

+6779
-2556
lines changed

.github/actions/upload-datadog/action.yaml

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Upload tests to datadog
1+
name: Upload tests to Datadog
22
if: always()
33
inputs:
44
api-key:
@@ -7,8 +7,26 @@ inputs:
77
runs:
88
using: "composite"
99
steps:
10+
- name: Set work dir
11+
shell: bash
12+
run: |
13+
WORK_DIR=${{ runner.temp }}/datadog-ci
14+
mkdir -p $WORK_DIR
15+
echo "WORK_DIR=$WORK_DIR" >> $GITHUB_ENV
16+
# The npm install was taking 30s to 1m, accounting for 20+% of the total
17+
# job time.
18+
- name: Cache datadog-ci
19+
uses: buildjet/cache@v3
20+
with:
21+
path: |
22+
${{ env.WORK_DIR }}
23+
key: datadog-ci-${{ runner.os }}
24+
restore-keys: |
25+
datadog-ci-${{ runner.os }}-
26+
datadog-ci-
1027
- shell: bash
1128
run: |
29+
cd ${{ env.WORK_DIR }}
1230
owner=${{ github.repository_owner }}
1331
echo "owner: $owner"
1432
if [[ $owner != "coder" ]]; then
@@ -20,8 +38,8 @@ runs:
2038
echo "No API key provided, skipping..."
2139
exit 0
2240
fi
23-
npm install -g @datadog/datadog-ci@2.10.0
24-
datadog-ci junit upload --service coder ./gotests.xml \
41+
npm install @datadog/datadog-ci@2.10.0
42+
npm x -- datadog-ci junit upload --service coder ./gotests.xml \
2543
--tags os:${{runner.os}} --tags runner_name:${{runner.name}}
2644
env:
2745
DATADOG_API_KEY: ${{ inputs.api-key }}

.github/workflows/ci.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ jobs:
235235
run: ./scripts/check_unstaged.sh
236236

237237
test-go:
238-
runs-on: ${{ matrix.os == 'ubuntu-latest' && github.repository_owner == 'coder' && 'buildjet-4vcpu-ubuntu-2204' || matrix.os == 'macos-latest' && github.repository_owner == 'coder' && 'macos-latest-xl' || matrix.os == 'windows-2019' && github.repository_owner == 'coder' && 'windows-latest-8-cores' || matrix.os }}
238+
runs-on: ${{ matrix.os == 'ubuntu-latest' && github.repository_owner == 'coder' && 'buildjet-4vcpu-ubuntu-2204' || matrix.os == 'macos-latest' && github.repository_owner == 'coder' && 'macos-latest-xl' || matrix.os == 'windows-2022' && github.repository_owner == 'coder' && 'windows-latest-16-cores' || matrix.os }}
239239
needs: changes
240240
if: needs.changes.outputs.go == 'true' || needs.changes.outputs.ci == 'true' || github.ref == 'refs/heads/main'
241241
timeout-minutes: 20
@@ -245,7 +245,7 @@ jobs:
245245
os:
246246
- ubuntu-latest
247247
- macos-latest
248-
- windows-2019
248+
- windows-2022
249249
steps:
250250
- name: Checkout
251251
uses: actions/checkout@v4
@@ -387,7 +387,7 @@ jobs:
387387

388388
deploy:
389389
name: "deploy"
390-
runs-on: ${{ github.repository_owner == 'coder' && 'buildjet-8vcpu-ubuntu-2204' || 'ubuntu-latest' }}
390+
runs-on: ${{ github.repository_owner == 'coder' && 'buildjet-16vcpu-ubuntu-2204' || 'ubuntu-latest' }}
391391
timeout-minutes: 30
392392
needs: changes
393393
if: |
@@ -510,7 +510,7 @@ jobs:
510510
flags: unittest-js
511511

512512
test-e2e:
513-
runs-on: ${{ github.repository_owner == 'coder' && 'buildjet-8vcpu-ubuntu-2204' || 'ubuntu-latest' }}
513+
runs-on: ${{ github.repository_owner == 'coder' && 'buildjet-16vcpu-ubuntu-2204' || 'ubuntu-latest' }}
514514
needs: changes
515515
if: needs.changes.outputs.go == 'true' || needs.changes.outputs.ts == 'true' || needs.changes.outputs.ci == 'true' || github.ref == 'refs/heads/main'
516516
timeout-minutes: 20

agent/agent.go

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ import (
1515
"os/exec"
1616
"os/user"
1717
"path/filepath"
18+
"runtime"
19+
"runtime/debug"
1820
"sort"
1921
"strconv"
2022
"strings"
@@ -34,6 +36,7 @@ import (
3436
"tailscale.com/types/netlogtype"
3537

3638
"cdr.dev/slog"
39+
"github.com/coder/coder/v2/agent/agentproc"
3740
"github.com/coder/coder/v2/agent/agentssh"
3841
"github.com/coder/coder/v2/agent/reconnectingpty"
3942
"github.com/coder/coder/v2/buildinfo"
@@ -51,6 +54,10 @@ const (
5154
ProtocolDial = "dial"
5255
)
5356

57+
// EnvProcPrioMgmt determines whether we attempt to manage
58+
// process CPU and OOM Killer priority.
// Management is skipped when the agent's envVars entry for this
// name is empty, or when the agent is not running on Linux.
59+
const EnvProcPrioMgmt = "CODER_PROC_PRIO_MGMT"
60+
5461
type Options struct {
5562
Filesystem afero.Fs
5663
LogDir string
@@ -68,6 +75,11 @@ type Options struct {
6875
PrometheusRegistry *prometheus.Registry
6976
ReportMetadataInterval time.Duration
7077
ServiceBannerRefreshInterval time.Duration
78+
Syscaller agentproc.Syscaller
79+
// ModifiedProcesses is used for testing process priority management.
80+
ModifiedProcesses chan []*agentproc.Process
81+
// ProcessManagementTick is used for testing process priority management.
82+
ProcessManagementTick <-chan time.Time
7183
}
7284

7385
type Client interface {
@@ -120,6 +132,10 @@ func New(options Options) Agent {
120132
prometheusRegistry = prometheus.NewRegistry()
121133
}
122134

135+
if options.Syscaller == nil {
136+
options.Syscaller = agentproc.NewSyscaller()
137+
}
138+
123139
ctx, cancelFunc := context.WithCancel(context.Background())
124140
a := &agent{
125141
tailnetListenPort: options.TailnetListenPort,
@@ -143,6 +159,9 @@ func New(options Options) Agent {
143159
sshMaxTimeout: options.SSHMaxTimeout,
144160
subsystems: options.Subsystems,
145161
addresses: options.Addresses,
162+
syscaller: options.Syscaller,
163+
modifiedProcs: options.ModifiedProcesses,
164+
processManagementTick: options.ProcessManagementTick,
146165

147166
prometheusRegistry: prometheusRegistry,
148167
metrics: newAgentMetrics(prometheusRegistry),
@@ -197,6 +216,12 @@ type agent struct {
197216

198217
prometheusRegistry *prometheus.Registry
199218
metrics *agentMetrics
219+
syscaller agentproc.Syscaller
220+
221+
// modifiedProcs is used for testing process priority management.
222+
modifiedProcs chan []*agentproc.Process
223+
// processManagementTick is used for testing process priority management.
224+
processManagementTick <-chan time.Time
200225
}
201226

202227
func (a *agent) TailnetConn() *tailnet.Conn {
@@ -225,6 +250,7 @@ func (a *agent) runLoop(ctx context.Context) {
225250
go a.reportLifecycleLoop(ctx)
226251
go a.reportMetadataLoop(ctx)
227252
go a.fetchServiceBannerLoop(ctx)
253+
go a.manageProcessPriorityLoop(ctx)
228254

229255
for retrier := retry.New(100*time.Millisecond, 10*time.Second); retrier.Wait(ctx); {
230256
a.logger.Info(ctx, "connecting to coderd")
@@ -1253,6 +1279,119 @@ func (a *agent) startReportingConnectionStats(ctx context.Context) {
12531279
}
12541280
}
12551281

1282+
// prioritizedProcs lists command substrings identifying processes that
// must not be reniced: any process whose command contains one of these
// entries is skipped by manageProcessPriority.
var prioritizedProcs = []string{"coder agent"}
1283+
1284+
// manageProcessPriorityLoop repeatedly calls manageProcessPriority until ctx
// is done. It returns immediately, without doing any work, unless the
// EnvProcPrioMgmt entry in a.envVars is non-empty and the agent is running on
// Linux. The first pass runs right away; each later pass waits for a tick.
func (a *agent) manageProcessPriorityLoop(ctx context.Context) {
1285+
// A panic in this loop is logged together with its stack trace instead of
// propagating out of the goroutine.
defer func() {
1286+
if r := recover(); r != nil {
1287+
a.logger.Critical(ctx, "recovered from panic",
1288+
slog.F("panic", r),
1289+
slog.F("stack", string(debug.Stack())),
1290+
)
1291+
}
1292+
}()
1293+
1294+
if val := a.envVars[EnvProcPrioMgmt]; val == "" || runtime.GOOS != "linux" {
1295+
a.logger.Debug(ctx, "process priority not enabled, agent will not manage process niceness/oom_score_adj ",
1296+
slog.F("env_var", EnvProcPrioMgmt),
1297+
slog.F("value", val),
1298+
slog.F("goos", runtime.GOOS),
1299+
)
1300+
return
1301+
}
1302+
1303+
// No tick channel was injected (tests supply one through Options), so
// fall back to a one-second ticker that is stopped when the loop exits.
if a.processManagementTick == nil {
1304+
ticker := time.NewTicker(time.Second)
1305+
defer ticker.Stop()
1306+
a.processManagementTick = ticker.C
1307+
}
1308+
1309+
for {
1310+
procs, err := a.manageProcessPriority(ctx)
1311+
// A failed pass is only logged; the loop keeps running.
if err != nil {
1312+
a.logger.Error(ctx, "manage process priority",
1313+
slog.Error(err),
1314+
)
1315+
}
1316+
// Publish the result of this pass for tests. procs is nil when the pass
// failed to list processes.
// NOTE(review): this send does not select on ctx.Done(), so it blocks
// until a receiver takes the value, even after ctx is cancelled.
if a.modifiedProcs != nil {
1317+
a.modifiedProcs <- procs
1318+
}
1319+
1320+
// Wait for the next tick, or stop once ctx is done.
select {
1321+
case <-a.processManagementTick:
1322+
case <-ctx.Done():
1323+
return
1324+
}
1325+
}
1326+
}
1327+
1328+
// manageProcessPriority performs one renicing pass over the processes
// returned by agentproc.List. Processes whose command matches an entry in
// prioritizedProcs, and processes whose reported priority is not 20, are left
// untouched; every other process has its niceness set to 10.
//
// It returns the processes whose niceness was changed in this pass. An error
// is returned only when listing processes fails; per-process failures are
// logged as warnings and that process is skipped.
func (a *agent) manageProcessPriority(ctx context.Context) ([]*agentproc.Process, error) {
1329+
const (
1330+
// niceness is the value passed to SetNiceness for eligible processes.
niceness = 10
1331+
)
1332+
1333+
procs, err := agentproc.List(a.filesystem, a.syscaller)
1334+
if err != nil {
1335+
return nil, xerrors.Errorf("list: %w", err)
1336+
}
1337+
1338+
var (
1339+
// modProcs starts as an empty, non-nil slice, so a pass that changes
// nothing still returns a non-nil result.
modProcs = []*agentproc.Process{}
1340+
logger slog.Logger
1341+
)
1342+
1343+
for _, proc := range procs {
1344+
// Tag every log line for this process with its command and PID.
logger = a.logger.With(
1345+
slog.F("cmd", proc.Cmd()),
1346+
slog.F("pid", proc.PID),
1347+
)
1348+
1349+
// containsFn reports whether the process command contains the given
// substring.
containsFn := func(e string) bool {
1350+
contains := strings.Contains(proc.Cmd(), e)
1351+
return contains
1352+
}
1353+
1354+
// If the process is prioritized we should adjust
1355+
// its oom_score_adj and avoid lowering its niceness.
// No oom_score_adj adjustment happens in this function; a prioritized
// process is simply skipped.
1356+
if slices.ContainsFunc[[]string, string](prioritizedProcs, containsFn) {
1357+
continue
1358+
}
1359+
1360+
score, err := proc.Niceness(a.syscaller)
1361+
if err != nil {
1362+
logger.Warn(ctx, "unable to get proc niceness",
1363+
slog.Error(err),
1364+
)
1365+
continue
1366+
}
1367+
1368+
// We only want processes that don't have a nice value set
1369+
// so we don't override user nice values.
1370+
// Getpriority actually returns priority for the nice value
1371+
// which is niceness + 20, so here 20 = a niceness of 0 (aka unset).
// NOTE(review): the raw Linux getpriority syscall reports 20 - nice, not
// nice + 20; both give 20 for a nice value of 0. Confirm which form
// Niceness returns.
1372+
if score != 20 {
1373+
// A score equal to the niceness constant is skipped without logging;
// presumably it marks a process reniced by an earlier pass - TODO confirm.
if score != niceness {
1374+
logger.Debug(ctx, "skipping process due to custom niceness",
1375+
slog.F("niceness", score),
1376+
)
1377+
}
1378+
continue
1379+
}
1380+
1381+
err = proc.SetNiceness(a.syscaller, niceness)
1382+
if err != nil {
1383+
logger.Warn(ctx, "unable to set proc niceness",
1384+
slog.F("niceness", niceness),
1385+
slog.Error(err),
1386+
)
1387+
continue
1388+
}
1389+
1390+
// Only processes that were successfully reniced are reported.
modProcs = append(modProcs, proc)
1391+
}
1392+
return modProcs, nil
1393+
}
1394+
12561395
// isClosed returns whether the API is closed or not.
12571396
func (a *agent) isClosed() bool {
12581397
select {

0 commit comments

Comments
 (0)