Skip to content

chore: skip timing-sensistive AgentMetadata test in the standard suite #7237

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
May 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 17 additions & 7 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,25 +210,31 @@ func (a *agent) collectMetadata(ctx context.Context, md codersdk.WorkspaceAgentM
var out bytes.Buffer
result := &codersdk.WorkspaceAgentMetadataResult{
// CollectedAt is set here for testing purposes and overrode by
// the server to the time the server received the result to protect
// against clock skew.
// coderd to the time of server receipt to solve clock skew.
//
// In the future, the server may accept the timestamp from the agent
// if it is certain the clocks are in sync.
// if it can guarantee the clocks are synchronized.
CollectedAt: time.Now(),
}
cmd, err := a.sshServer.CreateCommand(ctx, md.Script, nil)
if err != nil {
result.Error = err.Error()
result.Error = fmt.Sprintf("create cmd: %+v", err)
return result
}

cmd.Stdout = &out
cmd.Stderr = &out
cmd.Stdin = io.LimitReader(nil, 0)

// The error isn't mutually exclusive with useful output.
err = cmd.Run()
// We split up Start and Wait instead of calling Run so that we can return a more precise error.
err = cmd.Start()
if err != nil {
result.Error = fmt.Sprintf("start cmd: %+v", err)
return result
}

// This error isn't mutually exclusive with useful output.
err = cmd.Wait()
const bufLimit = 10 << 10
if out.Len() > bufLimit {
err = errors.Join(
Expand All @@ -238,8 +244,12 @@ func (a *agent) collectMetadata(ctx context.Context, md codersdk.WorkspaceAgentM
out.Truncate(bufLimit)
}

// Important: if the command times out, we may see a misleading error like
// "exit status 1", so it's important to include the context error.
err = errors.Join(err, ctx.Err())

if err != nil {
result.Error = err.Error()
result.Error = fmt.Sprintf("run cmd: %+v", err)
}
result.Value = out.String()
return result
Expand Down
151 changes: 91 additions & 60 deletions agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -951,19 +951,17 @@ func TestAgent_StartupScript(t *testing.T) {
func TestAgent_Metadata(t *testing.T) {
t.Parallel()

echoHello := "echo 'hello'"

t.Run("Once", func(t *testing.T) {
t.Parallel()
script := "echo -n hello"
if runtime.GOOS == "windows" {
script = "powershell " + script
}
//nolint:dogsled
_, client, _, _, _ := setupAgent(t, agentsdk.Manifest{
Metadata: []codersdk.WorkspaceAgentMetadataDescription{
{
Key: "greeting",
Interval: 0,
Script: script,
Script: echoHello,
},
},
}, 0)
Expand All @@ -986,78 +984,111 @@ func TestAgent_Metadata(t *testing.T) {
})

t.Run("Many", func(t *testing.T) {
if runtime.GOOS == "windows" {
// Shell scripting in Windows is a pain, and we have already tested
// that the OS logic works in the simpler "Once" test above.
t.Skip()
}
t.Parallel()

dir := t.TempDir()

const reportInterval = 2
const intervalUnit = 100 * time.Millisecond
var (
greetingPath = filepath.Join(dir, "greeting")
script = "echo hello | tee -a " + greetingPath
)
//nolint:dogsled
_, client, _, _, _ := setupAgent(t, agentsdk.Manifest{
Metadata: []codersdk.WorkspaceAgentMetadataDescription{
{
Key: "greeting",
Interval: reportInterval,
Script: script,
},
{
Key: "bad",
Interval: reportInterval,
Script: "exit 1",
Interval: 1,
Timeout: 100,
Script: echoHello,
},
},
}, 0)

var gotMd map[string]agentsdk.PostMetadataRequest
require.Eventually(t, func() bool {
return len(client.getMetadata()) == 2
gotMd = client.getMetadata()
return len(gotMd) == 1
}, testutil.WaitShort, testutil.IntervalMedium)

for start := time.Now(); time.Since(start) < testutil.WaitMedium; time.Sleep(testutil.IntervalMedium) {
md := client.getMetadata()
if len(md) != 2 {
panic("unexpected number of metadata entries")
}
collectedAt1 := gotMd["greeting"].CollectedAt
if !assert.Equal(t, "hello", strings.TrimSpace(gotMd["greeting"].Value)) {
t.Errorf("got: %+v", gotMd)
}

require.Equal(t, "hello\n", md["greeting"].Value)
require.Equal(t, "exit status 1", md["bad"].Error)
if !assert.Eventually(t, func() bool {
gotMd = client.getMetadata()
return gotMd["greeting"].CollectedAt.After(collectedAt1)
}, testutil.WaitShort, testutil.IntervalMedium) {
t.Fatalf("expected metadata to be collected again")
}
})
}

greetingByt, err := os.ReadFile(greetingPath)
require.NoError(t, err)
func TestAgentMetadata_Timing(t *testing.T) {
if runtime.GOOS == "windows" {
// Shell scripting in Windows is a pain, and we have already tested
// that the OS logic works in the simpler tests.
t.Skip()
}
testutil.SkipIfNotTiming(t)
t.Parallel()

var (
numGreetings = bytes.Count(greetingByt, []byte("hello"))
idealNumGreetings = time.Since(start) / (reportInterval * intervalUnit)
// We allow a 50% error margin because the report loop may backlog
// in CI and other toasters. In production, there is no hard
// guarantee on timing either, and the frontend gives similar
// wiggle room to the staleness of the value.
upperBound = int(idealNumGreetings) + 1
lowerBound = (int(idealNumGreetings) / 2)
)

if idealNumGreetings < 50 {
// There is an insufficient sample size.
continue
}
dir := t.TempDir()

t.Logf("numGreetings: %d, idealNumGreetings: %d", numGreetings, idealNumGreetings)
// The report loop may slow down on load, but it should never, ever
// speed up.
if numGreetings > upperBound {
t.Fatalf("too many greetings: %d > %d in %v", numGreetings, upperBound, time.Since(start))
} else if numGreetings < lowerBound {
t.Fatalf("too few greetings: %d < %d", numGreetings, lowerBound)
}
const reportInterval = 2
const intervalUnit = 100 * time.Millisecond
var (
greetingPath = filepath.Join(dir, "greeting")
script = "echo hello | tee -a " + greetingPath
)
//nolint:dogsled
_, client, _, _, _ := setupAgent(t, agentsdk.Manifest{
Metadata: []codersdk.WorkspaceAgentMetadataDescription{
{
Key: "greeting",
Interval: reportInterval,
Script: script,
},
{
Key: "bad",
Interval: reportInterval,
Script: "exit 1",
},
},
}, 0)

require.Eventually(t, func() bool {
return len(client.getMetadata()) == 2
}, testutil.WaitShort, testutil.IntervalMedium)

for start := time.Now(); time.Since(start) < testutil.WaitMedium; time.Sleep(testutil.IntervalMedium) {
md := client.getMetadata()
require.Len(t, md, 2, "got: %+v", md)

require.Equal(t, "hello\n", md["greeting"].Value)
require.Equal(t, "run cmd: exit status 1", md["bad"].Error)

greetingByt, err := os.ReadFile(greetingPath)
require.NoError(t, err)

var (
numGreetings = bytes.Count(greetingByt, []byte("hello"))
idealNumGreetings = time.Since(start) / (reportInterval * intervalUnit)
// We allow a 50% error margin because the report loop may backlog
// in CI and other toasters. In production, there is no hard
// guarantee on timing either, and the frontend gives similar
// wiggle room to the staleness of the value.
upperBound = int(idealNumGreetings) + 1
lowerBound = (int(idealNumGreetings) / 2)
)

if idealNumGreetings < 50 {
// There is an insufficient sample size.
continue
}
})

t.Logf("numGreetings: %d, idealNumGreetings: %d", numGreetings, idealNumGreetings)
// The report loop may slow down on load, but it should never, ever
// speed up.
if numGreetings > upperBound {
t.Fatalf("too many greetings: %d > %d in %v", numGreetings, upperBound, time.Since(start))
} else if numGreetings < lowerBound {
t.Fatalf("too few greetings: %d < %d", numGreetings, lowerBound)
}
}
}

func TestAgent_Lifecycle(t *testing.T) {
Expand Down
8 changes: 8 additions & 0 deletions testutil/enable_timing.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
//go:build timing

package testutil

var _ = func() any {
timing = true
return nil
}()
20 changes: 20 additions & 0 deletions testutil/timing.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package testutil

import (
"testing"
)

// We can't run timing-sensitive tests in CI because of the
// great variance in runner performance. Instead of not testing timing at all,
// we relegate it to humans manually running certain tests with the "-timing"
// flag from time to time.
//
// Eventually, we should run all timing tests in a self-hosted runner.

var timing bool

func SkipIfNotTiming(t *testing.T) {
if !timing {
t.Skip("skipping timing-sensitive test")
}
}