Skip to content

Commit 269fe66

Browse files
committed
Merge branch 'main' into provisioner-fixes
2 parents 718e918 + 0c2b432 commit 269fe66

File tree

87 files changed

+3021
-1152
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

87 files changed

+3021
-1152
lines changed

.github/workflows/security.yaml

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,11 @@ permissions:
66
security-events: write
77

88
on:
9-
push:
10-
branches: ["main"]
11-
12-
pull_request:
13-
branches: ["main"]
14-
159
workflow_dispatch:
1610

1711
schedule:
18-
# Run every week at 10:24 on Thursday.
19-
- cron: "24 10 * * 4"
12+
# Run every 6 hours Monday-Friday!
13+
- cron: "0 0,6,12,18 * * 1-5"
2014

2115
# Cancel in-progress runs for pull requests when developers push
2216
# additional changes
@@ -59,6 +53,17 @@ jobs:
5953
- name: Perform CodeQL Analysis
6054
uses: github/codeql-action/analyze@v2
6155

56+
- name: Send Slack notification on failure
57+
if: ${{ failure() }}
58+
run: |
59+
msg="❌ CodeQL Failed\n\nhttps://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
60+
curl \
61+
-qfsSL \
62+
-X POST \
63+
-H "Content-Type: application/json" \
64+
--data "{\"content\": \"$msg\"}" \
65+
"${{ secrets.SLACK_SECURITY_FAILURE_WEBHOOK_URL }}"
66+
6267
trivy:
6368
runs-on: ${{ github.repository_owner == 'coder' && 'ubuntu-latest-8-cores' || 'ubuntu-latest' }}
6469
steps:
@@ -135,3 +140,14 @@ jobs:
135140
name: trivy
136141
path: trivy-results.sarif
137142
retention-days: 7
143+
144+
- name: Send Slack notification on failure
145+
if: ${{ failure() }}
146+
run: |
147+
msg="❌ CodeQL Failed\n\nhttps://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
148+
curl \
149+
-qfsSL \
150+
-X POST \
151+
-H "Content-Type: application/json" \
152+
--data "{\"content\": \"$msg\"}" \
153+
"${{ secrets.SLACK_SECURITY_FAILURE_WEBHOOK_URL }}"

agent/agent.go

Lines changed: 111 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,10 @@ func New(options Options) io.Closer {
121121
logDir: options.LogDir,
122122
tempDir: options.TempDir,
123123
lifecycleUpdate: make(chan struct{}, 1),
124-
connStatsChan: make(chan *agentsdk.Stats, 1),
124+
lifecycleReported: make(chan codersdk.WorkspaceAgentLifecycle, 1),
125+
// TODO: This is a temporary hack to make tests not flake.
126+
// @kylecarbs has a better solution in here: https://github.com/coder/coder/pull/6469
127+
connStatsChan: make(chan *agentsdk.Stats, 8),
125128
}
126129
a.init(ctx)
127130
return a
@@ -149,9 +152,10 @@ type agent struct {
149152
sessionToken atomic.Pointer[string]
150153
sshServer *ssh.Server
151154

152-
lifecycleUpdate chan struct{}
153-
lifecycleMu sync.Mutex // Protects following.
154-
lifecycleState codersdk.WorkspaceAgentLifecycle
155+
lifecycleUpdate chan struct{}
156+
lifecycleReported chan codersdk.WorkspaceAgentLifecycle
157+
lifecycleMu sync.RWMutex // Protects following.
158+
lifecycleState codersdk.WorkspaceAgentLifecycle
155159

156160
network *tailnet.Conn
157161
connStatsChan chan *agentsdk.Stats
@@ -207,9 +211,9 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
207211
}
208212

209213
for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
210-
a.lifecycleMu.Lock()
214+
a.lifecycleMu.RLock()
211215
state := a.lifecycleState
212-
a.lifecycleMu.Unlock()
216+
a.lifecycleMu.RUnlock()
213217

214218
if state == lastReported {
215219
break
@@ -222,6 +226,11 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
222226
})
223227
if err == nil {
224228
lastReported = state
229+
select {
230+
case a.lifecycleReported <- state:
231+
case <-a.lifecycleReported:
232+
a.lifecycleReported <- state
233+
}
225234
break
226235
}
227236
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
@@ -233,13 +242,20 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
233242
}
234243
}
235244

245+
// setLifecycle sets the lifecycle state and notifies the lifecycle loop.
246+
// The state is only updated if it's a valid state transition.
236247
func (a *agent) setLifecycle(ctx context.Context, state codersdk.WorkspaceAgentLifecycle) {
237248
a.lifecycleMu.Lock()
238-
defer a.lifecycleMu.Unlock()
239-
240-
a.logger.Debug(ctx, "set lifecycle state", slog.F("state", state), slog.F("previous", a.lifecycleState))
241-
249+
lastState := a.lifecycleState
250+
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastState) > slices.Index(codersdk.WorkspaceAgentLifecycleOrder, state) {
251+
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last", lastState), slog.F("state", state))
252+
a.lifecycleMu.Unlock()
253+
return
254+
}
242255
a.lifecycleState = state
256+
a.logger.Debug(ctx, "set lifecycle state", slog.F("state", state), slog.F("last", lastState))
257+
a.lifecycleMu.Unlock()
258+
243259
select {
244260
case a.lifecycleUpdate <- struct{}{}:
245261
default:
@@ -299,9 +315,10 @@ func (a *agent) run(ctx context.Context) error {
299315
}
300316
}
301317

318+
lifecycleState := codersdk.WorkspaceAgentLifecycleReady
302319
scriptDone := make(chan error, 1)
303320
scriptStart := time.Now()
304-
err := a.trackConnGoroutine(func() {
321+
err = a.trackConnGoroutine(func() {
305322
defer close(scriptDone)
306323
scriptDone <- a.runStartupScript(ctx, metadata.StartupScript)
307324
})
@@ -329,16 +346,17 @@ func (a *agent) run(ctx context.Context) error {
329346
if errors.Is(err, context.Canceled) {
330347
return
331348
}
332-
execTime := time.Since(scriptStart)
333-
lifecycleStatus := codersdk.WorkspaceAgentLifecycleReady
334-
if err != nil {
335-
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err))
336-
lifecycleStatus = codersdk.WorkspaceAgentLifecycleStartError
337-
} else {
338-
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime))
349+
// Only log if there was a startup script.
350+
if metadata.StartupScript != "" {
351+
execTime := time.Since(scriptStart)
352+
if err != nil {
353+
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err))
354+
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
355+
} else {
356+
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime))
357+
}
339358
}
340-
341-
a.setLifecycle(ctx, lifecycleStatus)
359+
a.setLifecycle(ctx, lifecycleState)
342360
}()
343361
}
344362

@@ -606,14 +624,22 @@ func (a *agent) runCoordinator(ctx context.Context, network *tailnet.Conn) error
606624
}
607625

608626
func (a *agent) runStartupScript(ctx context.Context, script string) error {
627+
return a.runScript(ctx, "startup", script)
628+
}
629+
630+
func (a *agent) runShutdownScript(ctx context.Context, script string) error {
631+
return a.runScript(ctx, "shutdown", script)
632+
}
633+
634+
func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
609635
if script == "" {
610636
return nil
611637
}
612638

613-
a.logger.Info(ctx, "running startup script", slog.F("script", script))
614-
writer, err := a.filesystem.OpenFile(filepath.Join(a.logDir, "coder-startup-script.log"), os.O_CREATE|os.O_RDWR, 0o600)
639+
a.logger.Info(ctx, "running script", slog.F("lifecycle", lifecycle), slog.F("script", script))
640+
writer, err := a.filesystem.OpenFile(filepath.Join(a.logDir, fmt.Sprintf("coder-%s-script.log", lifecycle)), os.O_CREATE|os.O_RDWR, 0o600)
615641
if err != nil {
616-
return xerrors.Errorf("open startup script log file: %w", err)
642+
return xerrors.Errorf("open %s script log file: %w", lifecycle, err)
617643
}
618644
defer func() {
619645
_ = writer.Close()
@@ -774,7 +800,7 @@ func (a *agent) createCommand(ctx context.Context, rawCommand string, env []stri
774800

775801
rawMetadata := a.metadata.Load()
776802
if rawMetadata == nil {
777-
return nil, xerrors.Errorf("no metadata was provided: %w", err)
803+
return nil, xerrors.Errorf("no metadata was provided")
778804
}
779805
metadata, valid := rawMetadata.(agentsdk.Metadata)
780806
if !valid {
@@ -1290,13 +1316,73 @@ func (a *agent) Close() error {
12901316
if a.isClosed() {
12911317
return nil
12921318
}
1319+
1320+
ctx := context.Background()
1321+
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShuttingDown)
1322+
1323+
lifecycleState := codersdk.WorkspaceAgentLifecycleOff
1324+
if metadata, ok := a.metadata.Load().(agentsdk.Metadata); ok && metadata.ShutdownScript != "" {
1325+
scriptDone := make(chan error, 1)
1326+
scriptStart := time.Now()
1327+
go func() {
1328+
defer close(scriptDone)
1329+
scriptDone <- a.runShutdownScript(ctx, metadata.ShutdownScript)
1330+
}()
1331+
1332+
var timeout <-chan time.Time
1333+
// If timeout is zero, an older version of the coder
1334+
// provider was used. Otherwise a timeout is always > 0.
1335+
if metadata.ShutdownScriptTimeout > 0 {
1336+
t := time.NewTimer(metadata.ShutdownScriptTimeout)
1337+
defer t.Stop()
1338+
timeout = t.C
1339+
}
1340+
1341+
var err error
1342+
select {
1343+
case err = <-scriptDone:
1344+
case <-timeout:
1345+
a.logger.Warn(ctx, "shutdown script timed out")
1346+
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShutdownTimeout)
1347+
err = <-scriptDone // The script can still complete after a timeout.
1348+
}
1349+
execTime := time.Since(scriptStart)
1350+
if err != nil {
1351+
a.logger.Warn(ctx, "shutdown script failed", slog.F("execution_time", execTime), slog.Error(err))
1352+
lifecycleState = codersdk.WorkspaceAgentLifecycleShutdownError
1353+
} else {
1354+
a.logger.Info(ctx, "shutdown script completed", slog.F("execution_time", execTime))
1355+
}
1356+
}
1357+
1358+
// Set final state and wait for it to be reported because context
1359+
// cancellation will stop the report loop.
1360+
a.setLifecycle(ctx, lifecycleState)
1361+
1362+
// Wait for the lifecycle to be reported, but don't wait forever so
1363+
// that we don't break user expectations.
1364+
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
1365+
defer cancel()
1366+
lifecycleWaitLoop:
1367+
for {
1368+
select {
1369+
case <-ctx.Done():
1370+
break lifecycleWaitLoop
1371+
case s := <-a.lifecycleReported:
1372+
if s == lifecycleState {
1373+
break lifecycleWaitLoop
1374+
}
1375+
}
1376+
}
1377+
12931378
close(a.closed)
12941379
a.closeCancel()
1380+
_ = a.sshServer.Close()
12951381
if a.network != nil {
12961382
_ = a.network.Close()
12971383
}
1298-
_ = a.sshServer.Close()
12991384
a.connCloseWait.Wait()
1385+
13001386
return nil
13011387
}
13021388

0 commit comments

Comments
 (0)