Skip to content

fix(agent): check agent metadata every second instead of minute #8614

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 20, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 55 additions & 11 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ func New(options Options) Agent {
}
}
if options.ReportMetadataInterval == 0 {
options.ReportMetadataInterval = 1 * time.Minute
options.ReportMetadataInterval = time.Second
}
if options.ServiceBannerRefreshInterval == 0 {
options.ServiceBannerRefreshInterval = 2 * time.Minute
Expand Down Expand Up @@ -328,16 +328,45 @@ func (a *agent) reportMetadataLoop(ctx context.Context) {
// baseInterval to run.
var flight trySingleflight

postMetadata := func(mr metadataResultAndKey) {
lastCollectedAts[mr.key] = mr.result.CollectedAt
err := a.client.PostMetadata(ctx, mr.key, *mr.result)
if err != nil {
a.logger.Error(ctx, "agent failed to report metadata", slog.Error(err))
}
}
flushAllMetadata := func() {
wg := sync.WaitGroup{}
defer wg.Wait()
for {
select {
case <-ctx.Done():
return
case mr := <-metadataResults:
wg.Add(1)
go func() {
defer wg.Done()
postMetadata(mr)
}()
continue
default:
return
}
}
}

for {
// Ensure all backpressured metadata is posted.
if len(metadataResults) > 1 {
flushAllMetadata()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could perhaps be moved to after the select below (i.e. when baseTicker.C is triggered). For now it seems to be doing the exact same thing as the select, except when baseTicker.C is triggered. (We probably don't need the len check either.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it doesn't really seem to be necessary, so i've just removed it for now

}

select {
case <-ctx.Done():
return
case mr := <-metadataResults:
lastCollectedAts[mr.key] = mr.result.CollectedAt
err := a.client.PostMetadata(ctx, mr.key, *mr.result)
if err != nil {
a.logger.Error(ctx, "agent failed to report metadata", slog.Error(err))
}
postMetadata(mr)
continue
case <-baseTicker.C:
}

Expand Down Expand Up @@ -386,8 +415,15 @@ func (a *agent) reportMetadataLoop(ctx context.Context) {
if md.Interval == 0 {
continue
}

intervalUnit := time.Second
// reportMetadataInterval is only less than a second in tests,
// so adjust the interval unit for them.
if a.reportMetadataInterval < time.Second {
intervalUnit = 100 * time.Millisecond
}
// The last collected value isn't quite stale yet, so we skip it.
if collectedAt.Add(a.reportMetadataInterval).After(time.Now()) {
if collectedAt.Add(time.Duration(md.Interval) * intervalUnit).After(time.Now()) {
continue
}
}
Expand All @@ -399,11 +435,19 @@ func (a *agent) reportMetadataLoop(ctx context.Context) {
go flight.Do(md.Key, func() {
timeout := md.Timeout
if timeout == 0 {
timeout = md.Interval
if md.Interval != 0 {
timeout = md.Interval
} else if interval := int64(a.reportMetadataInterval.Seconds()); interval != 0 {
// Fallback to the report interval
timeout = interval
} else {
// If the interval is still 0 (possible if the interval
// is less than a second), default to 5. This was
// randomly picked.
timeout = 5
}
}
ctx, cancel := context.WithTimeout(ctx,
time.Duration(timeout)*time.Second,
)
ctx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second)
defer cancel()

select {
Expand Down