Skip to content

Commit d3b79c6

Browse files
committed
fix(agent): check agent metadata every second instead of minute
I believe this is the fix for #8577. Watching the websocket shows metadata keys going more than 2 minutes without an update, even when their configured interval is much lower. This fixes `reportMetadataLoop` to tick every second and to calculate staleness from each key's own interval. A previous refactor had set the loop to tick once per minute instead of the earlier behavior of once per second.
1 parent 3988917 commit d3b79c6

File tree

1 file changed

+55
-11
lines changed

1 file changed

+55
-11
lines changed

agent/agent.go

+55-11
Original file line number | Diff line number | Diff line change
@@ -108,7 +108,7 @@ func New(options Options) Agent {
108108
}
109109
}
110110
if options.ReportMetadataInterval == 0 {
111-
options.ReportMetadataInterval = 1 * time.Minute
111+
options.ReportMetadataInterval = time.Second
112112
}
113113
if options.ServiceBannerRefreshInterval == 0 {
114114
options.ServiceBannerRefreshInterval = 2 * time.Minute
@@ -328,16 +328,45 @@ func (a *agent) reportMetadataLoop(ctx context.Context) {
328328
// baseInterval to run.
329329
var flight trySingleflight
330330

331+
postMetadata := func(mr metadataResultAndKey) {
332+
lastCollectedAts[mr.key] = mr.result.CollectedAt
333+
err := a.client.PostMetadata(ctx, mr.key, *mr.result)
334+
if err != nil {
335+
a.logger.Error(ctx, "agent failed to report metadata", slog.Error(err))
336+
}
337+
}
338+
flushAllMetadata := func() {
339+
wg := sync.WaitGroup{}
340+
defer wg.Wait()
341+
for {
342+
select {
343+
case <-ctx.Done():
344+
return
345+
case mr := <-metadataResults:
346+
wg.Add(1)
347+
go func() {
348+
defer wg.Done()
349+
postMetadata(mr)
350+
}()
351+
continue
352+
default:
353+
return
354+
}
355+
}
356+
}
357+
331358
for {
359+
// Ensure all backpressured metadata is posted.
360+
if len(metadataResults) > 1 {
361+
flushAllMetadata()
362+
}
363+
332364
select {
333365
case <-ctx.Done():
334366
return
335367
case mr := <-metadataResults:
336-
lastCollectedAts[mr.key] = mr.result.CollectedAt
337-
err := a.client.PostMetadata(ctx, mr.key, *mr.result)
338-
if err != nil {
339-
a.logger.Error(ctx, "agent failed to report metadata", slog.Error(err))
340-
}
368+
postMetadata(mr)
369+
continue
341370
case <-baseTicker.C:
342371
}
343372

@@ -386,8 +415,15 @@ func (a *agent) reportMetadataLoop(ctx context.Context) {
386415
if md.Interval == 0 {
387416
continue
388417
}
418+
419+
intervalUnit := time.Second
420+
// reportMetadataInterval is only less than a second in tests,
421+
// so adjust the interval unit for them.
422+
if a.reportMetadataInterval < time.Second {
423+
intervalUnit = 100 * time.Millisecond
424+
}
389425
// The last collected value isn't quite stale yet, so we skip it.
390-
if collectedAt.Add(a.reportMetadataInterval).After(time.Now()) {
426+
if collectedAt.Add(time.Duration(md.Interval) * intervalUnit).After(time.Now()) {
391427
continue
392428
}
393429
}
@@ -399,11 +435,19 @@ func (a *agent) reportMetadataLoop(ctx context.Context) {
399435
go flight.Do(md.Key, func() {
400436
timeout := md.Timeout
401437
if timeout == 0 {
402-
timeout = md.Interval
438+
if md.Interval != 0 {
439+
timeout = md.Interval
440+
} else if interval := int64(a.reportMetadataInterval.Seconds()); interval != 0 {
441+
// Fallback to the report interval
442+
timeout = interval
443+
} else {
444+
// If the interval is still 0 (possible if the interval
445+
// is less than a second), default to 5. This was
446+
// randomly picked.
447+
timeout = 5
448+
}
403449
}
404-
ctx, cancel := context.WithTimeout(ctx,
405-
time.Duration(timeout)*time.Second,
406-
)
450+
ctx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second)
407451
defer cancel()
408452

409453
select {

0 commit comments

Comments
 (0)