Skip to content

Commit 465fe86

Browse files
authored
chore: skip timing-sensistive AgentMetadata test in the standard suite (#7237)
* chore: skip timing-sensistive AgentMetadata test in the standard suite * Add test-timing target * fix windows? * Works on my Windows desktop? * Use tag system * fixup! Use tag system
1 parent 398d08a commit 465fe86

File tree

4 files changed

+136
-67
lines changed

4 files changed

+136
-67
lines changed

agent/agent.go

+17-7
Original file line numberDiff line numberDiff line change
@@ -210,25 +210,31 @@ func (a *agent) collectMetadata(ctx context.Context, md codersdk.WorkspaceAgentM
210210
var out bytes.Buffer
211211
result := &codersdk.WorkspaceAgentMetadataResult{
212212
// CollectedAt is set here for testing purposes and overrode by
213-
// the server to the time the server received the result to protect
214-
// against clock skew.
213+
// coderd to the time of server receipt to solve clock skew.
215214
//
216215
// In the future, the server may accept the timestamp from the agent
217-
// if it is certain the clocks are in sync.
216+
// if it can guarantee the clocks are synchronized.
218217
CollectedAt: time.Now(),
219218
}
220219
cmd, err := a.sshServer.CreateCommand(ctx, md.Script, nil)
221220
if err != nil {
222-
result.Error = err.Error()
221+
result.Error = fmt.Sprintf("create cmd: %+v", err)
223222
return result
224223
}
225224

226225
cmd.Stdout = &out
227226
cmd.Stderr = &out
227+
cmd.Stdin = io.LimitReader(nil, 0)
228228

229-
// The error isn't mutually exclusive with useful output.
230-
err = cmd.Run()
229+
// We split up Start and Wait instead of calling Run so that we can return a more precise error.
230+
err = cmd.Start()
231+
if err != nil {
232+
result.Error = fmt.Sprintf("start cmd: %+v", err)
233+
return result
234+
}
231235

236+
// This error isn't mutually exclusive with useful output.
237+
err = cmd.Wait()
232238
const bufLimit = 10 << 10
233239
if out.Len() > bufLimit {
234240
err = errors.Join(
@@ -238,8 +244,12 @@ func (a *agent) collectMetadata(ctx context.Context, md codersdk.WorkspaceAgentM
238244
out.Truncate(bufLimit)
239245
}
240246

247+
// Important: if the command times out, we may see a misleading error like
248+
// "exit status 1", so it's important to include the context error.
249+
err = errors.Join(err, ctx.Err())
250+
241251
if err != nil {
242-
result.Error = err.Error()
252+
result.Error = fmt.Sprintf("run cmd: %+v", err)
243253
}
244254
result.Value = out.String()
245255
return result

agent/agent_test.go

+91-60
Original file line numberDiff line numberDiff line change
@@ -951,19 +951,17 @@ func TestAgent_StartupScript(t *testing.T) {
951951
func TestAgent_Metadata(t *testing.T) {
952952
t.Parallel()
953953

954+
echoHello := "echo 'hello'"
955+
954956
t.Run("Once", func(t *testing.T) {
955957
t.Parallel()
956-
script := "echo -n hello"
957-
if runtime.GOOS == "windows" {
958-
script = "powershell " + script
959-
}
960958
//nolint:dogsled
961959
_, client, _, _, _ := setupAgent(t, agentsdk.Manifest{
962960
Metadata: []codersdk.WorkspaceAgentMetadataDescription{
963961
{
964962
Key: "greeting",
965963
Interval: 0,
966-
Script: script,
964+
Script: echoHello,
967965
},
968966
},
969967
}, 0)
@@ -986,78 +984,111 @@ func TestAgent_Metadata(t *testing.T) {
986984
})
987985

988986
t.Run("Many", func(t *testing.T) {
989-
if runtime.GOOS == "windows" {
990-
// Shell scripting in Windows is a pain, and we have already tested
991-
// that the OS logic works in the simpler "Once" test above.
992-
t.Skip()
993-
}
994987
t.Parallel()
995-
996-
dir := t.TempDir()
997-
998-
const reportInterval = 2
999-
const intervalUnit = 100 * time.Millisecond
1000-
var (
1001-
greetingPath = filepath.Join(dir, "greeting")
1002-
script = "echo hello | tee -a " + greetingPath
1003-
)
988+
//nolint:dogsled
1004989
_, client, _, _, _ := setupAgent(t, agentsdk.Manifest{
1005990
Metadata: []codersdk.WorkspaceAgentMetadataDescription{
1006991
{
1007992
Key: "greeting",
1008-
Interval: reportInterval,
1009-
Script: script,
1010-
},
1011-
{
1012-
Key: "bad",
1013-
Interval: reportInterval,
1014-
Script: "exit 1",
993+
Interval: 1,
994+
Timeout: 100,
995+
Script: echoHello,
1015996
},
1016997
},
1017998
}, 0)
1018999

1000+
var gotMd map[string]agentsdk.PostMetadataRequest
10191001
require.Eventually(t, func() bool {
1020-
return len(client.getMetadata()) == 2
1002+
gotMd = client.getMetadata()
1003+
return len(gotMd) == 1
10211004
}, testutil.WaitShort, testutil.IntervalMedium)
10221005

1023-
for start := time.Now(); time.Since(start) < testutil.WaitMedium; time.Sleep(testutil.IntervalMedium) {
1024-
md := client.getMetadata()
1025-
if len(md) != 2 {
1026-
panic("unexpected number of metadata entries")
1027-
}
1006+
collectedAt1 := gotMd["greeting"].CollectedAt
1007+
if !assert.Equal(t, "hello", strings.TrimSpace(gotMd["greeting"].Value)) {
1008+
t.Errorf("got: %+v", gotMd)
1009+
}
10281010

1029-
require.Equal(t, "hello\n", md["greeting"].Value)
1030-
require.Equal(t, "exit status 1", md["bad"].Error)
1011+
if !assert.Eventually(t, func() bool {
1012+
gotMd = client.getMetadata()
1013+
return gotMd["greeting"].CollectedAt.After(collectedAt1)
1014+
}, testutil.WaitShort, testutil.IntervalMedium) {
1015+
t.Fatalf("expected metadata to be collected again")
1016+
}
1017+
})
1018+
}
10311019

1032-
greetingByt, err := os.ReadFile(greetingPath)
1033-
require.NoError(t, err)
1020+
func TestAgentMetadata_Timing(t *testing.T) {
1021+
if runtime.GOOS == "windows" {
1022+
// Shell scripting in Windows is a pain, and we have already tested
1023+
// that the OS logic works in the simpler tests.
1024+
t.Skip()
1025+
}
1026+
testutil.SkipIfNotTiming(t)
1027+
t.Parallel()
10341028

1035-
var (
1036-
numGreetings = bytes.Count(greetingByt, []byte("hello"))
1037-
idealNumGreetings = time.Since(start) / (reportInterval * intervalUnit)
1038-
// We allow a 50% error margin because the report loop may backlog
1039-
// in CI and other toasters. In production, there is no hard
1040-
// guarantee on timing either, and the frontend gives similar
1041-
// wiggle room to the staleness of the value.
1042-
upperBound = int(idealNumGreetings) + 1
1043-
lowerBound = (int(idealNumGreetings) / 2)
1044-
)
1045-
1046-
if idealNumGreetings < 50 {
1047-
// There is an insufficient sample size.
1048-
continue
1049-
}
1029+
dir := t.TempDir()
10501030

1051-
t.Logf("numGreetings: %d, idealNumGreetings: %d", numGreetings, idealNumGreetings)
1052-
// The report loop may slow down on load, but it should never, ever
1053-
// speed up.
1054-
if numGreetings > upperBound {
1055-
t.Fatalf("too many greetings: %d > %d in %v", numGreetings, upperBound, time.Since(start))
1056-
} else if numGreetings < lowerBound {
1057-
t.Fatalf("too few greetings: %d < %d", numGreetings, lowerBound)
1058-
}
1031+
const reportInterval = 2
1032+
const intervalUnit = 100 * time.Millisecond
1033+
var (
1034+
greetingPath = filepath.Join(dir, "greeting")
1035+
script = "echo hello | tee -a " + greetingPath
1036+
)
1037+
//nolint:dogsled
1038+
_, client, _, _, _ := setupAgent(t, agentsdk.Manifest{
1039+
Metadata: []codersdk.WorkspaceAgentMetadataDescription{
1040+
{
1041+
Key: "greeting",
1042+
Interval: reportInterval,
1043+
Script: script,
1044+
},
1045+
{
1046+
Key: "bad",
1047+
Interval: reportInterval,
1048+
Script: "exit 1",
1049+
},
1050+
},
1051+
}, 0)
1052+
1053+
require.Eventually(t, func() bool {
1054+
return len(client.getMetadata()) == 2
1055+
}, testutil.WaitShort, testutil.IntervalMedium)
1056+
1057+
for start := time.Now(); time.Since(start) < testutil.WaitMedium; time.Sleep(testutil.IntervalMedium) {
1058+
md := client.getMetadata()
1059+
require.Len(t, md, 2, "got: %+v", md)
1060+
1061+
require.Equal(t, "hello\n", md["greeting"].Value)
1062+
require.Equal(t, "run cmd: exit status 1", md["bad"].Error)
1063+
1064+
greetingByt, err := os.ReadFile(greetingPath)
1065+
require.NoError(t, err)
1066+
1067+
var (
1068+
numGreetings = bytes.Count(greetingByt, []byte("hello"))
1069+
idealNumGreetings = time.Since(start) / (reportInterval * intervalUnit)
1070+
// We allow a 50% error margin because the report loop may backlog
1071+
// in CI and other toasters. In production, there is no hard
1072+
// guarantee on timing either, and the frontend gives similar
1073+
// wiggle room to the staleness of the value.
1074+
upperBound = int(idealNumGreetings) + 1
1075+
lowerBound = (int(idealNumGreetings) / 2)
1076+
)
1077+
1078+
if idealNumGreetings < 50 {
1079+
// There is an insufficient sample size.
1080+
continue
10591081
}
1060-
})
1082+
1083+
t.Logf("numGreetings: %d, idealNumGreetings: %d", numGreetings, idealNumGreetings)
1084+
// The report loop may slow down on load, but it should never, ever
1085+
// speed up.
1086+
if numGreetings > upperBound {
1087+
t.Fatalf("too many greetings: %d > %d in %v", numGreetings, upperBound, time.Since(start))
1088+
} else if numGreetings < lowerBound {
1089+
t.Fatalf("too few greetings: %d < %d", numGreetings, lowerBound)
1090+
}
1091+
}
10611092
}
10621093

10631094
func TestAgent_Lifecycle(t *testing.T) {

testutil/enable_timing.go

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
//go:build timing
2+
3+
package testutil
4+
5+
var _ = func() any {
6+
timing = true
7+
return nil
8+
}()

testutil/timing.go

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package testutil
2+
3+
import (
4+
"testing"
5+
)
6+
7+
// We can't run timing-sensitive tests in CI because of the
8+
// great variance in runner performance. Instead of not testing timing at all,
9+
// we relegate it to humans manually running certain tests with the "-timing"
10+
// flag from time to time.
11+
//
12+
// Eventually, we should run all timing tests in a self-hosted runner.
13+
14+
var timing bool
15+
16+
func SkipIfNotTiming(t *testing.T) {
17+
if !timing {
18+
t.Skip("skipping timing-sensitive test")
19+
}
20+
}

0 commit comments

Comments
 (0)