From 3b0b1f368bb73af50e6cbc7af98e09bfe52e2b2b Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Wed, 19 Jul 2023 23:45:16 +0000 Subject: [PATCH 01/11] flake(agent) fix `TestAgent_Metadata/Once` flake There's a race condition between checking `lastCollectedAts` and spawning the singleflight that could cause a metadata to execute multiple times, even if specified to run once. This moves the `lastCollectedAts` check to inside the singleflight to actually ensure the validity of the checks. --- agent/agent.go | 51 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 4b573f56ee5b8..33abe9fc43084 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -316,9 +316,10 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { const metadataLimit = 128 var ( - baseTicker = time.NewTicker(a.reportMetadataInterval) - lastCollectedAts = make(map[string]time.Time) - metadataResults = make(chan metadataResultAndKey, metadataLimit) + baseTicker = time.NewTicker(a.reportMetadataInterval) + lastCollectedAtMu sync.RWMutex + lastCollectedAts = make(map[string]time.Time) + metadataResults = make(chan metadataResultAndKey, metadataLimit) ) defer baseTicker.Stop() @@ -367,6 +368,7 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { // If the manifest changes (e.g. on agent reconnect) we need to // purge old cache values to prevent lastCollectedAt from growing // boundlessly. + lastCollectedAtMu.Lock() for key := range lastCollectedAts { if slices.IndexFunc(manifest.Metadata, func(md codersdk.WorkspaceAgentMetadataDescription) bool { return md.Key == key @@ -374,36 +376,45 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { delete(lastCollectedAts, key) } } + lastCollectedAtMu.Unlock() // Spawn a goroutine for each metadata collection, and use a // channel to synchronize the results and avoid both messy // mutex logic and overloading the API. for _, md := range manifest.Metadata { - collectedAt, ok := lastCollectedAts[md.Key] - if ok { - // If the interval is zero, we assume the user just wants - // a single collection at startup, not a spinning loop. - if md.Interval == 0 { - continue - } - // The last collected value isn't quite stale yet, so we skip it. - if collectedAt.Add(a.reportMetadataInterval).After(time.Now()) { - continue - } - } - md := md // We send the result to the channel in the goroutine to avoid // sending the same result multiple times. So, we don't care about // the return values. go flight.Do(md.Key, func() { + lastCollectedAtMu.RLock() + collectedAt, ok := lastCollectedAts[md.Key] + lastCollectedAtMu.RUnlock() + if ok { + // If the interval is zero, we assume the user just wants + // a single collection at startup, not a spinning loop. + if md.Interval == 0 { + return + } + // The last collected value isn't quite stale yet, so we skip it. + if collectedAt.Add(a.reportMetadataInterval).After(time.Now()) { + return + } + } + timeout := md.Timeout if timeout == 0 { - timeout = md.Interval + if md.Interval != 0 { + timeout = md.Interval + } else if interval := int64(a.reportMetadataInterval.Seconds()); interval != 0 { + // Fallback to the report interval + timeout = interval + } else { + // If the interval is still 0, default to 5. + timeout = 5 + } } - ctx, cancel := context.WithTimeout(ctx, - time.Duration(timeout)*time.Second, - ) + ctx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second) defer cancel() select { From cf2f7dcbade9e69266439becde3c0aa47f41e832 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Wed, 19 Jul 2023 23:53:14 +0000 Subject: [PATCH 02/11] fixup! flake(agent) fix `TestAgent_Metadata/Once` flake --- agent/agent.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/agent/agent.go b/agent/agent.go index 33abe9fc43084..67eb939acde06 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -410,7 +410,9 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { // Fallback to the report interval timeout = interval } else { - // If the interval is still 0, default to 5. + // If the interval is still 0 (possible if the interval + // is less than a second), default to 5. This was + // randomly picked. timeout = 5 } } From 3f73d16e669b4aad9f0b17d694db4c5bdb0727c6 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Thu, 20 Jul 2023 00:08:05 +0000 Subject: [PATCH 03/11] fixup! flake(agent) fix `TestAgent_Metadata/Once` flake --- agent/agent.go | 1 + 1 file changed, 1 insertion(+) diff --git a/agent/agent.go b/agent/agent.go index 67eb939acde06..00dadf3a4e4d8 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -339,6 +339,7 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { if err != nil { a.logger.Error(ctx, "agent failed to report metadata", slog.Error(err)) } + continue case <-baseTicker.C: } From 7b4097aed5089eb61157f28544614f5362018403 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Thu, 20 Jul 2023 04:19:37 +0000 Subject: [PATCH 04/11] fixup! flake(agent) fix `TestAgent_Metadata/Once` flake --- agent/agent_test.go | 7 ++++++- agent/agenttest/client.go | 21 +++++++++++++++------ coderd/tailnet_test.go | 2 +- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/agent/agent_test.go b/agent/agent_test.go index 65a2abb11f87c..2ccee3154046a 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -1066,6 +1066,7 @@ func TestAgent_StartupScript(t *testing.T) { t.Parallel() logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug) client := agenttest.NewClient(t, + logger, uuid.New(), agentsdk.Manifest{ StartupScript: command, @@ -1097,6 +1098,7 @@ func TestAgent_StartupScript(t *testing.T) { t.Parallel() logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug) client := agenttest.NewClient(t, + logger, uuid.New(), agentsdk.Manifest{ StartupScript: command, @@ -1470,6 +1472,7 @@ func TestAgent_Lifecycle(t *testing.T) { derpMap, _ := tailnettest.RunDERPAndSTUN(t) client := agenttest.NewClient(t, + logger, uuid.New(), agentsdk.Manifest{ DERPMap: derpMap, @@ -1742,6 +1745,7 @@ func TestAgent_Reconnect(t *testing.T) { statsCh := make(chan *agentsdk.Stats, 50) derpMap, _ := tailnettest.RunDERPAndSTUN(t) client := agenttest.NewClient(t, + logger, agentID, agentsdk.Manifest{ DERPMap: derpMap, @@ -1776,6 +1780,7 @@ func TestAgent_WriteVSCodeConfigs(t *testing.T) { defer coordinator.Close() client := agenttest.NewClient(t, + logger, uuid.New(), agentsdk.Manifest{ GitAuthConfigs: 1, @@ -1900,7 +1905,7 @@ func setupAgent(t *testing.T, metadata agentsdk.Manifest, ptyTimeout time.Durati }) statsCh := make(chan *agentsdk.Stats, 50) fs := afero.NewMemMapFs() - c := agenttest.NewClient(t, metadata.AgentID, metadata, statsCh, coordinator) + c := agenttest.NewClient(t, logger, metadata.AgentID, metadata, statsCh, coordinator) options := agent.Options{ Client: c, diff --git a/agent/agenttest/client.go b/agent/agenttest/client.go index c69ff59eb730b..a87607cf2dab8 100644 --- a/agent/agenttest/client.go +++ b/agent/agenttest/client.go @@ -18,6 +18,7 @@ import ( ) func NewClient(t testing.TB, + logger slog.Logger, agentID uuid.UUID, manifest agentsdk.Manifest, statsChan chan *agentsdk.Stats, @@ -28,6 +29,7 @@ func NewClient(t testing.TB, } return &Client{ t: t, + logger: logger.Named("client"), agentID: agentID, manifest: manifest, statsChan: statsChan, @@ -37,6 +39,7 @@ func NewClient(t testing.TB, type Client struct { t testing.TB + logger slog.Logger agentID uuid.UUID manifest agentsdk.Manifest metadata map[string]agentsdk.PostMetadataRequest @@ -110,14 +113,16 @@ func (c *Client) GetLifecycleStates() []codersdk.WorkspaceAgentLifecycle { return c.lifecycleStates } -func (c *Client) PostLifecycle(_ context.Context, req agentsdk.PostLifecycleRequest) error { +func (c *Client) PostLifecycle(ctx context.Context, req agentsdk.PostLifecycleRequest) error { c.mu.Lock() defer c.mu.Unlock() c.lifecycleStates = append(c.lifecycleStates, req.State) + c.logger.Debug(ctx, "post lifecycle", slog.F("req", req)) return nil } -func (*Client) PostAppHealth(_ context.Context, _ agentsdk.PostAppHealthsRequest) error { +func (c *Client) PostAppHealth(ctx context.Context, req agentsdk.PostAppHealthsRequest) error { + c.logger.Debug(ctx, "post app health", slog.F("req", req)) return nil } @@ -133,20 +138,22 @@ func (c *Client) GetMetadata() map[string]agentsdk.PostMetadataRequest { return maps.Clone(c.metadata) } -func (c *Client) PostMetadata(_ context.Context, key string, req agentsdk.PostMetadataRequest) error { +func (c *Client) PostMetadata(ctx context.Context, key string, req agentsdk.PostMetadataRequest) error { c.mu.Lock() defer c.mu.Unlock() if c.metadata == nil { c.metadata = make(map[string]agentsdk.PostMetadataRequest) } c.metadata[key] = req + c.logger.Debug(ctx, "post metadata", slog.F("key", key), slog.F("req", req)) return nil } -func (c *Client) PostStartup(_ context.Context, startup agentsdk.PostStartupRequest) error { +func (c *Client) PostStartup(ctx context.Context, startup agentsdk.PostStartupRequest) error { c.mu.Lock() defer c.mu.Unlock() c.startup = startup + c.logger.Debug(ctx, "post startup", slog.F("req", startup)) return nil } @@ -156,13 +163,14 @@ func (c *Client) GetStartupLogs() []agentsdk.StartupLog { return c.logs } -func (c *Client) PatchStartupLogs(_ context.Context, logs agentsdk.PatchStartupLogs) error { +func (c *Client) PatchStartupLogs(ctx context.Context, logs agentsdk.PatchStartupLogs) error { c.mu.Lock() defer c.mu.Unlock() if c.PatchWorkspaceLogs != nil { return c.PatchWorkspaceLogs() } c.logs = append(c.logs, logs.Logs...) + c.logger.Debug(ctx, "patch startup logs", slog.F("req", logs)) return nil } @@ -173,9 +181,10 @@ func (c *Client) SetServiceBannerFunc(f func() (codersdk.ServiceBannerConfig, er c.GetServiceBannerFunc = f } -func (c *Client) GetServiceBanner(_ context.Context) (codersdk.ServiceBannerConfig, error) { +func (c *Client) GetServiceBanner(ctx context.Context) (codersdk.ServiceBannerConfig, error) { c.mu.Lock() defer c.mu.Unlock() + c.logger.Debug(ctx, "get service banner") if c.GetServiceBannerFunc != nil { return c.GetServiceBannerFunc() } diff --git a/coderd/tailnet_test.go b/coderd/tailnet_test.go index d6341391934a7..4daf71f30a62c 100644 --- a/coderd/tailnet_test.go +++ b/coderd/tailnet_test.go @@ -137,7 +137,7 @@ func setupAgent(t *testing.T, agentAddresses []netip.Prefix) (uuid.UUID, agent.A _ = coord.Close() }) - c := agenttest.NewClient(t, manifest.AgentID, manifest, make(chan *agentsdk.Stats, 50), coord) + c := agenttest.NewClient(t, logger, manifest.AgentID, manifest, make(chan *agentsdk.Stats, 50), coord) options := agent.Options{ Client: c, From 76b3bb8c9819ac096d157314004e3ab1a6f0473b Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Thu, 20 Jul 2023 17:20:33 +0000 Subject: [PATCH 05/11] more debug logging --- agent/agent.go | 23 ++++++++++++++++++----- agent/agent_test.go | 2 +- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 00dadf3a4e4d8..5a869a2122b79 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -320,6 +320,7 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { lastCollectedAtMu sync.RWMutex lastCollectedAts = make(map[string]time.Time) metadataResults = make(chan metadataResultAndKey, metadataLimit) + logger = a.logger.Named("metadata") ) defer baseTicker.Stop() @@ -330,6 +331,8 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { var flight trySingleflight for { + var delay time.Duration + select { case <-ctx.Done(): return @@ -340,14 +343,16 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { a.logger.Error(ctx, "agent failed to report metadata", slog.Error(err)) } continue - case <-baseTicker.C: + case t := <-baseTicker.C: + delay = time.Since(t) } + logger.Debug(ctx, "report metadata ticking", slog.F("delay", delay.String())) + if len(metadataResults) > 0 { // The inner collection loop expects the channel is empty before spinning up // all the collection goroutines. - a.logger.Debug( - ctx, "metadata collection backpressured", + logger.Debug(ctx, "metadata collection backpressured", slog.F("queue_len", len(metadataResults)), ) continue @@ -359,7 +364,7 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { } if len(manifest.Metadata) > metadataLimit { - a.logger.Error( + logger.Error( ctx, "metadata limit exceeded", slog.F("limit", metadataLimit), slog.F("got", len(manifest.Metadata)), ) @@ -374,6 +379,9 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { if slices.IndexFunc(manifest.Metadata, func(md codersdk.WorkspaceAgentMetadataDescription) bool { return md.Key == key }) < 0 { + logger.Debug(ctx, "deleting lastCollected key, missing from manifest", + slog.F("key", key), + ) delete(lastCollectedAts, key) } } @@ -388,6 +396,7 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { // sending the same result multiple times. So, we don't care about // the return values. go flight.Do(md.Key, func() { + ctx := slog.With(ctx, slog.F("key", md.Key)) lastCollectedAtMu.RLock() collectedAt, ok := lastCollectedAts[md.Key] lastCollectedAtMu.RUnlock() @@ -395,10 +404,12 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { // If the interval is zero, we assume the user just wants // a single collection at startup, not a spinning loop. if md.Interval == 0 { + logger.Debug(ctx, "skipping metadata, interval 0 and alredy collected once") return } // The last collected value isn't quite stale yet, so we skip it. if collectedAt.Add(a.reportMetadataInterval).After(time.Now()) { + logger.Debug(ctx, "skipping metadata, ") return } } @@ -417,11 +428,13 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { timeout = 5 } } - ctx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second) + ctxTimeout := time.Duration(timeout) * time.Second + ctx, cancel := context.WithTimeout(ctx, ctxTimeout) defer cancel() select { case <-ctx.Done(): + logger.Debug(ctx, "metadata collection timed out", slog.F("timeout", ctxTimeout)) case metadataResults <- metadataResultAndKey{ key: md.Key, result: a.collectMetadata(ctx, md), diff --git a/agent/agent_test.go b/agent/agent_test.go index 2ccee3154046a..94a9c5326e950 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -1905,7 +1905,7 @@ func setupAgent(t *testing.T, metadata agentsdk.Manifest, ptyTimeout time.Durati }) statsCh := make(chan *agentsdk.Stats, 50) fs := afero.NewMemMapFs() - c := agenttest.NewClient(t, logger, metadata.AgentID, metadata, statsCh, coordinator) + c := agenttest.NewClient(t, logger.Named("agent"), metadata.AgentID, metadata, statsCh, coordinator) options := agent.Options{ Client: c, From 4b08319cb6b046072f29bd522985d36d12b1bc50 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Thu, 20 Jul 2023 17:40:16 +0000 Subject: [PATCH 06/11] fixup! more debug logging --- agent/agent.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/agent/agent.go b/agent/agent.go index 5a869a2122b79..0547c31bc3216 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -337,7 +337,9 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { case <-ctx.Done(): return case mr := <-metadataResults: + lastCollectedAtMu.Lock() lastCollectedAts[mr.key] = mr.result.CollectedAt + lastCollectedAtMu.Unlock() err := a.client.PostMetadata(ctx, mr.key, *mr.result) if err != nil { a.logger.Error(ctx, "agent failed to report metadata", slog.Error(err)) @@ -439,6 +441,7 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { key: md.Key, result: a.collectMetadata(ctx, md), }: + logger.Debug(ctx, "sent metadata") } }) } From 2ae25c5f9652550d73b15520b3370e10684c2a27 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Thu, 20 Jul 2023 18:01:56 +0000 Subject: [PATCH 07/11] fixup! more debug logging --- agent/agent.go | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 0547c31bc3216..ed4295f3285df 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -242,7 +242,7 @@ func (a *agent) runLoop(ctx context.Context) { } } -func (a *agent) collectMetadata(ctx context.Context, md codersdk.WorkspaceAgentMetadataDescription) *codersdk.WorkspaceAgentMetadataResult { +func (a *agent) collectMetadata(ctx context.Context, md codersdk.WorkspaceAgentMetadataDescription, now time.Time) *codersdk.WorkspaceAgentMetadataResult { var out bytes.Buffer result := &codersdk.WorkspaceAgentMetadataResult{ // CollectedAt is set here for testing purposes and overrode by @@ -250,7 +250,7 @@ func (a *agent) collectMetadata(ctx context.Context, md codersdk.WorkspaceAgentM // // In the future, the server may accept the timestamp from the agent // if it can guarantee the clocks are synchronized. - CollectedAt: time.Now(), + CollectedAt: now, } cmdPty, err := a.sshServer.CreateCommand(ctx, md.Script, nil) if err != nil { @@ -337,9 +337,7 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { case <-ctx.Done(): return case mr := <-metadataResults: - lastCollectedAtMu.Lock() - lastCollectedAts[mr.key] = mr.result.CollectedAt - lastCollectedAtMu.Unlock() + err := a.client.PostMetadata(ctx, mr.key, *mr.result) if err != nil { a.logger.Error(ctx, "agent failed to report metadata", slog.Error(err)) @@ -406,7 +404,7 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { // If the interval is zero, we assume the user just wants // a single collection at startup, not a spinning loop. if md.Interval == 0 { - logger.Debug(ctx, "skipping metadata, interval 0 and alredy collected once") + logger.Debug(ctx, "skipping metadata, interval 0 and already collected once") return } // The last collected value isn't quite stale yet, so we skip it. @@ -434,14 +432,18 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { ctx, cancel := context.WithTimeout(ctx, ctxTimeout) defer cancel() + now := time.Now() select { case <-ctx.Done(): logger.Debug(ctx, "metadata collection timed out", slog.F("timeout", ctxTimeout)) case metadataResults <- metadataResultAndKey{ key: md.Key, - result: a.collectMetadata(ctx, md), + result: a.collectMetadata(ctx, md, now), }: - logger.Debug(ctx, "sent metadata") + logger.Debug(ctx, "sent metadata", slog.F("took", time.Since(now).String())) + lastCollectedAtMu.Lock() + lastCollectedAts[md.Key] = now + lastCollectedAtMu.Unlock() } }) } From 07a5b9ab940ea7e813f2e866d1fc23c40290b5f5 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Thu, 20 Jul 2023 18:16:31 +0000 Subject: [PATCH 08/11] fixup! more debug logging --- agent/agent.go | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index ed4295f3285df..108883fbc52dd 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -298,17 +298,26 @@ type metadataResultAndKey struct { } type trySingleflight struct { - m sync.Map + mu sync.Mutex + m map[string]struct{} } func (t *trySingleflight) Do(key string, fn func()) { - _, loaded := t.m.LoadOrStore(key, struct{}{}) - if !loaded { - // There is already a goroutine running for this key. + t.mu.Lock() + _, ok := t.m[key] + if ok { + t.mu.Unlock() return } - defer t.m.Delete(key) + t.m[key] = struct{}{} + t.mu.Unlock() + defer func() { + t.mu.Lock() + delete(t.m, key) + t.mu.Unlock() + }() + fn() } @@ -328,7 +337,7 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { // a goroutine running for a given key. This is to prevent a build-up of // goroutines waiting on Do when the script takes many multiples of // baseInterval to run. - var flight trySingleflight + var flight = trySingleflight{m: map[string]struct{}{}} for { var delay time.Duration From d19564060c7a6dcb30e0fe3d80ca44dca2d35195 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Thu, 20 Jul 2023 18:32:10 +0000 Subject: [PATCH 09/11] fixup! more debug logging --- agent/agent.go | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 108883fbc52dd..7c4a358d90f04 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -340,8 +340,6 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { var flight = trySingleflight{m: map[string]struct{}{}} for { - var delay time.Duration - select { case <-ctx.Done(): return @@ -352,12 +350,9 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { a.logger.Error(ctx, "agent failed to report metadata", slog.Error(err)) } continue - case t := <-baseTicker.C: - delay = time.Since(t) + case <-baseTicker.C: } - logger.Debug(ctx, "report metadata ticking", slog.F("delay", delay.String())) - if len(metadataResults) > 0 { // The inner collection loop expects the channel is empty before spinning up // all the collection goroutines. @@ -413,12 +408,10 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { // If the interval is zero, we assume the user just wants // a single collection at startup, not a spinning loop. if md.Interval == 0 { - logger.Debug(ctx, "skipping metadata, interval 0 and already collected once") return } // The last collected value isn't quite stale yet, so we skip it. if collectedAt.Add(a.reportMetadataInterval).After(time.Now()) { - logger.Debug(ctx, "skipping metadata, ") return } } @@ -429,7 +422,7 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { timeout = md.Interval } else if interval := int64(a.reportMetadataInterval.Seconds()); interval != 0 { // Fallback to the report interval - timeout = interval + timeout = interval * 3 } else { // If the interval is still 0 (possible if the interval // is less than a second), default to 5. This was @@ -444,12 +437,11 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { now := time.Now() select { case <-ctx.Done(): - logger.Debug(ctx, "metadata collection timed out", slog.F("timeout", ctxTimeout)) + logger.Warn(ctx, "metadata collection timed out", slog.F("timeout", ctxTimeout)) case metadataResults <- metadataResultAndKey{ key: md.Key, result: a.collectMetadata(ctx, md, now), }: - logger.Debug(ctx, "sent metadata", slog.F("took", time.Since(now).String())) lastCollectedAtMu.Lock() lastCollectedAts[md.Key] = now lastCollectedAtMu.Unlock() From 14eb5a709f1de106eec32c2ec82af3a4fa64dee3 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Thu, 20 Jul 2023 18:34:51 +0000 Subject: [PATCH 10/11] fixup! more debug logging --- agent/agent.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent/agent.go b/agent/agent.go index 7c4a358d90f04..68bb9eed59ae8 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -337,7 +337,7 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { // a goroutine running for a given key. This is to prevent a build-up of // goroutines waiting on Do when the script takes many multiples of // baseInterval to run. - var flight = trySingleflight{m: map[string]struct{}{}} + flight := trySingleflight{m: map[string]struct{}{}} for { select { From b55e04bc8a1af05675bddbcca4087441a5244517 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Thu, 20 Jul 2023 18:39:07 +0000 Subject: [PATCH 11/11] fixup! more debug logging --- agent/agent.go | 1 - 1 file changed, 1 deletion(-) diff --git a/agent/agent.go b/agent/agent.go index 68bb9eed59ae8..2e89960f60105 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -344,7 +344,6 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { case <-ctx.Done(): return case mr := <-metadataResults: - err := a.client.PostMetadata(ctx, mr.key, *mr.result) if err != nil { a.logger.Error(ctx, "agent failed to report metadata", slog.Error(err))