Skip to content

chore: zero out session stats from agent with experiment enabled #13579

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
Prev Previous commit
Next Next commit
stop stats reporting at agent level
  • Loading branch information
f0ssel committed Jun 17, 2024
commit f90bed23f1361a6fce8f41cf5b8abdda0472f684
47 changes: 47 additions & 0 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ type Options struct {
PrometheusRegistry *prometheus.Registry
ReportMetadataInterval time.Duration
ServiceBannerRefreshInterval time.Duration
ExperimentRefreshInterval time.Duration
FetchExperiments func(ctx context.Context) (codersdk.Experiments, error)
Syscaller agentproc.Syscaller
// ModifiedProcesses is used for testing process priority management.
ModifiedProcesses chan []*agentproc.Process
Expand Down Expand Up @@ -134,6 +136,14 @@ func New(options Options) Agent {
return "", nil
}
}
if options.FetchExperiments == nil {
options.FetchExperiments = func(ctx context.Context) (codersdk.Experiments, error) {
return codersdk.Experiments{}, nil
}
}
if options.ExperimentRefreshInterval == 0 {
options.ExperimentRefreshInterval = 5 * time.Minute
}
if options.ReportMetadataInterval == 0 {
options.ReportMetadataInterval = time.Second
}
Expand Down Expand Up @@ -167,6 +177,7 @@ func New(options Options) Agent {
environmentVariables: options.EnvironmentVariables,
client: options.Client,
exchangeToken: options.ExchangeToken,
fetchExperiments: options.FetchExperiments,
filesystem: options.Filesystem,
logDir: options.LogDir,
tempDir: options.TempDir,
Expand Down Expand Up @@ -249,6 +260,10 @@ type agent struct {
lifecycleStates []agentsdk.PostLifecycleRequest
lifecycleLastReportedIndex int // Keeps track of the last lifecycle state we successfully reported.

fetchExperiments func(ctx context.Context) (codersdk.Experiments, error)
fetchExperimentsInterval time.Duration
experiments atomic.Pointer[codersdk.Experiments]

network *tailnet.Conn
addresses []netip.Prefix
statsReporter *statsReporter
Expand Down Expand Up @@ -737,6 +752,28 @@ func (a *agent) fetchServiceBannerLoop(ctx context.Context, conn drpc.Conn) erro
}
}

// fetchExperimentsLoop fetches experiments on an interval.
func (a *agent) fetchExperimentsLoop(ctx context.Context) error {
ticker := time.NewTicker(a.fetchExperimentsInterval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return ctx.Err()
case <-ticker.C:
experiments, err := a.fetchExperiments(ctx)
if err != nil {
if ctx.Err() != nil {
return ctx.Err()
}
a.logger.Error(ctx, "failed to update experiments", slog.Error(err))
return err
}
a.experiments.Store(&experiments)
}
}
}

func (a *agent) run() (retErr error) {
// This allows the agent to refresh it's token if necessary.
// For instance identity this is required, since the instance
Expand All @@ -747,6 +784,12 @@ func (a *agent) run() (retErr error) {
}
a.sessionToken.Store(&sessionToken)

exp, err := a.fetchExperiments(a.hardCtx)
if err != nil {
return xerrors.Errorf("fetch experiments: %w", err)
}
a.experiments.Store(&exp)

// ConnectRPC returns the dRPC connection we use for the Agent and Tailnet v2+ APIs
conn, err := a.client.ConnectRPC(a.hardCtx)
if err != nil {
Expand Down Expand Up @@ -856,6 +899,10 @@ func (a *agent) run() (retErr error) {

connMan.start("fetch service banner loop", gracefulShutdownBehaviorStop, a.fetchServiceBannerLoop)

connMan.start("fetch experiments loop", gracefulShutdownBehaviorStop, func(ctx context.Context, _ drpc.Conn) error {
return a.fetchExperimentsLoop(ctx)
})

connMan.start("stats report loop", gracefulShutdownBehaviorStop, func(ctx context.Context, conn drpc.Conn) error {
if err := networkOK.wait(ctx); err != nil {
return xerrors.Errorf("no network: %w", err)
Expand Down
20 changes: 17 additions & 3 deletions agent/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@ import (
"sync"
"time"

"go.uber.org/atomic"
"golang.org/x/xerrors"
"tailscale.com/types/netlogtype"

"cdr.dev/slog"
"github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/codersdk"
)

const maxConns = 2048
Expand All @@ -36,9 +38,10 @@ type statsReporter struct {
unreported bool
lastInterval time.Duration

source networkStatsSource
collector statsCollector
logger slog.Logger
source networkStatsSource
collector statsCollector
logger slog.Logger
experiments atomic.Pointer[codersdk.Experiments]
}

func newStatsReporter(logger slog.Logger, source networkStatsSource, collector statsCollector) *statsReporter {
Expand Down Expand Up @@ -112,6 +115,17 @@ func (s *statsReporter) reportLocked(
s.L.Unlock()
defer s.L.Lock()
stats := s.collector.Collect(ctx, networkStats)

// if the experiment is enabled we zero out certain session stats
// as we migrate to the client reporting these stats instead.
Comment on lines +128 to +129
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this get implemented serverside instead? It seems like a lot of changes in the agent for something that could be zeroed out in the stats API on the server.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1, this would keep the change in 'one place' and remove the need for plumbing this through AgentAPI

Copy link
Contributor Author

@f0ssel f0ssel Jun 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not really and here's why - If customers update coder but do not rebuild the workspace, we will have an old agent client and old cli version in the workspace. If the experiment is enabled and we zero out the value server side, we will lose data for that workspace because the old cli will not be reporting the new data via the usage endpoint. So the graphs will go to zero and slowly come back up over time as each workspace is restarted.

So we need the endpoint to still save this data for old workspaces, but also allow new workspaces to report it in the new way. Or we accept downtime on the stats on upgrade until all workspaces are updated. Thoughts?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the experiment is enabled and we zero out the value server side, we will lose data for that workspace because the old cli will not be reporting the new data via the usage endpoint.

Won't the same thing happen even if the feature was GA? Old agent/CLI versions need to be supported on new servers for multiple months.

This will also affect stats collection for older CLIs being used outside of workspaces, which arguably is where more people use the CLI anyways. I don't think anyone is using the CLI in a workspace to connect to a workspace.

So we need the endpoint to still save this data for old workspaces, but also allow new workspaces to report it in the new way. Or we accept downtime on the stats on upgrade until all workspaces are updated.

I think either way there will be a "downtime" of stats with or without the agent changes when an older CLI is being used on the local machine.

I think this PR should change into just the CLI stats upload portion and have both sides report stats for now. Then in a few months you can remove stats reporting from the API by deprecating the field and ignoring it serverside.

if s.experiments.Load().Enabled(codersdk.ExperimentWorkspaceUsage) {
stats.SessionCountSsh = 0
// TODO: More session types will be enabled as we migrate over.
// stats.SessionCountVscode = 0
// stats.SessionCountJetbrains = 0
// stats.SessionCountReconnectingPty = 0
}

resp, err := dest.UpdateStats(ctx, &proto.UpdateStatsRequest{Stats: stats})
if err != nil {
return err
Expand Down
3 changes: 3 additions & 0 deletions cli/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,9 @@ func (r *RootCmd) workspaceAgent() *serpent.Command {
client.SetSessionToken(resp.SessionToken)
return resp.SessionToken, nil
},
FetchExperiments: func(ctx context.Context) (codersdk.Experiments, error) {
return client.SDK.Experiments(ctx)
},
EnvironmentVariables: environmentVariables,
IgnorePorts: ignorePorts,
SSHMaxTimeout: sshMaxTimeout,
Expand Down
Loading