coder
diff --git a/‎.github/actions/upload-datadog/action.yaml
Lines changed: 21 additions & 3 deletions b/‎.github/actions/upload-datadog/action.yaml
Lines changed: 21 additions & 3 deletions
diff --git a/‎.github/workflows/ci.yaml
Lines changed: 4 additions & 4 deletions b/‎.github/workflows/ci.yaml
Lines changed: 4 additions & 4 deletions
diff --git a/‎agent/agent.go
Lines changed: 139 additions & 0 deletions b/‎agent/agent.go
Lines changed: 139 additions & 0 deletions
@@ -1,4 +1,4 @@
-name: Upload tests to datadog
+name: Upload tests to Datadog
 if: always()
 inputs:
   api-key:
@@ -7,8 +7,26 @@ inputs:
 runs:
   using: "composite"
   steps:
+    - name: Set work dir
+      shell: bash
+      run: |
+        WORK_DIR=${{ runner.temp }}/datadog-ci
+        mkdir -p $WORK_DIR
+        echo "WORK_DIR=$WORK_DIR" >> $GITHUB_ENV
+    # The npm install was taking 30s to 1m, accounting for 20+% of the total
+    # job time.
+    - name: Cache datadog-ci
+      uses: buildjet/cache@v3
+      with:
+        path: |
+          ${{ env.WORK_DIR }}
+        key: datadog-ci-${{ runner.os }}
+        restore-keys: |
+          datadog-ci-${{ runner.os }}-
+          datadog-ci-
     - shell: bash
       run: |
+        cd ${{ env.WORK_DIR }}
         owner=${{ github.repository_owner	 }}
         echo "owner: $owner"
         if [[  $owner != "coder" ]]; then
@@ -20,8 +38,8 @@ runs:
           echo "No API key provided, skipping..."
           exit 0
         fi
-        npm install -g @datadog/datadog-ci@2.10.0
-        datadog-ci junit upload --service coder ./gotests.xml \
+        npm install @datadog/datadog-ci@2.10.0
+        npm x -- datadog-ci junit upload --service coder ./gotests.xml \
           --tags os:${{runner.os}} --tags runner_name:${{runner.name}}
       env:
         DATADOG_API_KEY: ${{ inputs.api-key }}
@@ -235,7 +235,7 @@ jobs:
         run: ./scripts/check_unstaged.sh
 
   test-go:
-    runs-on: ${{ matrix.os == 'ubuntu-latest' && github.repository_owner == 'coder' && 'buildjet-4vcpu-ubuntu-2204' || matrix.os == 'macos-latest' && github.repository_owner == 'coder' && 'macos-latest-xl' || matrix.os == 'windows-2019' && github.repository_owner == 'coder' && 'windows-latest-8-cores' || matrix.os }}
+    runs-on: ${{ matrix.os == 'ubuntu-latest' && github.repository_owner == 'coder' && 'buildjet-4vcpu-ubuntu-2204' || matrix.os == 'macos-latest' && github.repository_owner == 'coder' && 'macos-latest-xl' || matrix.os == 'windows-2022' && github.repository_owner == 'coder' && 'windows-latest-16-cores' || matrix.os }}
     needs: changes
     if: needs.changes.outputs.go == 'true' || needs.changes.outputs.ci == 'true' || github.ref == 'refs/heads/main'
     timeout-minutes: 20
@@ -245,7 +245,7 @@ jobs:
         os:
           - ubuntu-latest
           - macos-latest
-          - windows-2019
+          - windows-2022
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -387,7 +387,7 @@ jobs:
 
   deploy:
     name: "deploy"
-    runs-on: ${{ github.repository_owner == 'coder' && 'buildjet-8vcpu-ubuntu-2204' || 'ubuntu-latest' }}
+    runs-on: ${{ github.repository_owner == 'coder' && 'buildjet-16vcpu-ubuntu-2204' || 'ubuntu-latest' }}
     timeout-minutes: 30
     needs: changes
     if: |
@@ -510,7 +510,7 @@ jobs:
           flags: unittest-js
 
   test-e2e:
-    runs-on: ${{ github.repository_owner == 'coder' && 'buildjet-8vcpu-ubuntu-2204' || 'ubuntu-latest' }}
+    runs-on: ${{ github.repository_owner == 'coder' && 'buildjet-16vcpu-ubuntu-2204' || 'ubuntu-latest' }}
     needs: changes
     if: needs.changes.outputs.go == 'true' || needs.changes.outputs.ts == 'true' || needs.changes.outputs.ci == 'true' || github.ref == 'refs/heads/main'
     timeout-minutes: 20
 
@@ -15,6 +15,8 @@ import (
 	"os/exec"
 	"os/user"
 	"path/filepath"
+	"runtime"
+	"runtime/debug"
 	"sort"
 	"strconv"
 	"strings"
@@ -34,6 +36,7 @@ import (
 	"tailscale.com/types/netlogtype"
 
 	"cdr.dev/slog"
+	"github.com/coder/coder/v2/agent/agentproc"
 	"github.com/coder/coder/v2/agent/agentssh"
 	"github.com/coder/coder/v2/agent/reconnectingpty"
 	"github.com/coder/coder/v2/buildinfo"
@@ -51,6 +54,10 @@ const (
 	ProtocolDial            = "dial"
 )
 
+// EnvProcPrioMgmt is the environment variable that determines whether we
+// attempt to manage process CPU and OOM Killer priority (non-empty enables).
+const EnvProcPrioMgmt = "CODER_PROC_PRIO_MGMT"
+
 type Options struct {
 	Filesystem                   afero.Fs
 	LogDir                       string
@@ -68,6 +75,11 @@ type Options struct {
 	PrometheusRegistry           *prometheus.Registry
 	ReportMetadataInterval       time.Duration
 	ServiceBannerRefreshInterval time.Duration
+	Syscaller                    agentproc.Syscaller
+	// ModifiedProcesses is used for testing process priority management.
+	ModifiedProcesses chan []*agentproc.Process
+	// ProcessManagementTick is used for testing process priority management.
+	ProcessManagementTick <-chan time.Time
 }
 
 type Client interface {
@@ -120,6 +132,10 @@ func New(options Options) Agent {
 		prometheusRegistry = prometheus.NewRegistry()
 	}
 
+	if options.Syscaller == nil {
+		options.Syscaller = agentproc.NewSyscaller()
+	}
+
 	ctx, cancelFunc := context.WithCancel(context.Background())
 	a := &agent{
 		tailnetListenPort:            options.TailnetListenPort,
@@ -143,6 +159,9 @@ func New(options Options) Agent {
 		sshMaxTimeout:                options.SSHMaxTimeout,
 		subsystems:                   options.Subsystems,
 		addresses:                    options.Addresses,
+		syscaller:                    options.Syscaller,
+		modifiedProcs:                options.ModifiedProcesses,
+		processManagementTick:        options.ProcessManagementTick,
 
 		prometheusRegistry: prometheusRegistry,
 		metrics:            newAgentMetrics(prometheusRegistry),
@@ -197,6 +216,12 @@ type agent struct {
 
 	prometheusRegistry *prometheus.Registry
 	metrics            *agentMetrics
+	syscaller          agentproc.Syscaller
+
+	// modifiedProcs is used for testing process priority management.
+	modifiedProcs chan []*agentproc.Process
+	// processManagementTick is used for testing process priority management.
+	processManagementTick <-chan time.Time
 }
 
 func (a *agent) TailnetConn() *tailnet.Conn {
@@ -225,6 +250,7 @@ func (a *agent) runLoop(ctx context.Context) {
 	go a.reportLifecycleLoop(ctx)
 	go a.reportMetadataLoop(ctx)
 	go a.fetchServiceBannerLoop(ctx)
+	go a.manageProcessPriorityLoop(ctx)
 
 	for retrier := retry.New(100*time.Millisecond, 10*time.Second); retrier.Wait(ctx); {
 		a.logger.Info(ctx, "connecting to coderd")
@@ -1253,6 +1279,119 @@ func (a *agent) startReportingConnectionStats(ctx context.Context) {
 	}
 }
 
+// prioritizedProcs lists command-line substrings of processes whose
+// niceness manageProcessPriority leaves untouched.
+var prioritizedProcs = []string{"coder agent"}
+
+// manageProcessPriorityLoop repeatedly calls manageProcessPriority until ctx
+// is done. It returns immediately unless EnvProcPrioMgmt has a non-empty
+// value in the agent's environment and the agent is running on Linux.
+//
+// When no processManagementTick was supplied, a one-second ticker paces the
+// loop. When modifiedProcs is non-nil, the result of every pass (nil if the
+// pass failed) is sent on it; both hooks exist for tests.
+func (a *agent) manageProcessPriorityLoop(ctx context.Context) {
+	// This runs in its own goroutine; log a panic instead of letting it
+	// propagate out of the goroutine.
+	defer func() {
+		if r := recover(); r != nil {
+			a.logger.Critical(ctx, "recovered from panic",
+				slog.F("panic", r),
+				slog.F("stack", string(debug.Stack())),
+			)
+		}
+	}()
+
+	if val := a.envVars[EnvProcPrioMgmt]; val == "" || runtime.GOOS != "linux" {
+		a.logger.Debug(ctx, "process priority not enabled, agent will not manage process niceness/oom_score_adj ",
+			slog.F("env_var", EnvProcPrioMgmt),
+			slog.F("value", val),
+			slog.F("goos", runtime.GOOS),
+		)
+		return
+	}
+
+	// Fall back to a real ticker when the caller did not inject one.
+	if a.processManagementTick == nil {
+		ticker := time.NewTicker(time.Second)
+		defer ticker.Stop()
+		a.processManagementTick = ticker.C
+	}
+
+	for {
+		procs, err := a.manageProcessPriority(ctx)
+		if err != nil {
+			a.logger.Error(ctx, "manage process priority",
+				slog.Error(err),
+			)
+		}
+		if a.modifiedProcs != nil {
+			// Give up on the send once ctx is done so this goroutine cannot
+			// hang forever when nobody is receiving anymore.
+			select {
+			case a.modifiedProcs <- procs:
+			case <-ctx.Done():
+				return
+			}
+		}
+
+		select {
+		case <-a.processManagementTick:
+		case <-ctx.Done():
+			return
+		}
+	}
+}
+
+// manageProcessPriority makes a single pass over the processes returned by
+// agentproc.List and sets the niceness of every process that still reports
+// the default priority. Processes whose command line contains an entry of
+// prioritizedProcs, and processes that already carry a different priority,
+// are left alone. Per-process failures are logged and skipped.
+//
+// It returns the processes whose niceness was changed during this pass, or
+// an error if the process list could not be obtained.
+func (a *agent) manageProcessPriority(ctx context.Context) ([]*agentproc.Process, error) {
+	const (
+		// niceness is the nice value applied to unprioritized processes.
+		niceness = 10
+		// defaultPriority is the value proc.Niceness reports for a process
+		// that has no nice value set.
+		// NOTE(review): this matches the raw Linux getpriority syscall, which
+		// returns 20 - nice (nice 0 -> 20, nice 10 -> 10); confirm that
+		// agentproc does not translate the value.
+		defaultPriority = 20
+	)
+
+	procs, err := agentproc.List(a.filesystem, a.syscaller)
+	if err != nil {
+		return nil, xerrors.Errorf("list: %w", err)
+	}
+
+	// Start from an empty, non-nil slice so the result is never nil on success.
+	modProcs := []*agentproc.Process{}
+
+	for _, proc := range procs {
+		// Resolve the command line once; it is used for logging and matching.
+		cmd := proc.Cmd()
+		logger := a.logger.With(
+			slog.F("cmd", cmd),
+			slog.F("pid", proc.PID),
+		)
+
+		// Prioritized processes keep their current niceness.
+		prioritized := slices.ContainsFunc(prioritizedProcs, func(e string) bool {
+			return strings.Contains(cmd, e)
+		})
+		if prioritized {
+			continue
+		}
+
+		score, err := proc.Niceness(a.syscaller)
+		if err != nil {
+			logger.Warn(ctx, "unable to get proc niceness",
+				slog.Error(err),
+			)
+			continue
+		}
+
+		// Only touch processes that are still at the default priority so
+		// that user-chosen nice values are never overridden.
+		if score != defaultPriority {
+			// A score equal to niceness is presumably a process adjusted on
+			// an earlier pass, so it is skipped without logging.
+			if score != niceness {
+				logger.Debug(ctx, "skipping process due to custom niceness",
+					slog.F("niceness", score),
+				)
+			}
+			continue
+		}
+
+		err = proc.SetNiceness(a.syscaller, niceness)
+		if err != nil {
+			logger.Warn(ctx, "unable to set proc niceness",
+				slog.F("niceness", niceness),
+				slog.Error(err),
+			)
+			continue
+		}
+
+		modProcs = append(modProcs, proc)
+	}
+	return modProcs, nil
+}
+
 // isClosed returns whether the API is closed or not.
 func (a *agent) isClosed() bool {
 	select {