-
Notifications
You must be signed in to change notification settings - Fork 936
fix: reduce excessive logging when database is unreachable #17363
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
3f95841
0448a74
0136b70
8d94c3c
f92e852
4b0fd6a
68867da
6f60cbc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
import (
	"sync/atomic"

	"github.com/coder/coder/v2/coderd/database/dbfake"
	"github.com/coder/coder/v2/coderd/database/dbgen"
	"github.com/coder/coder/v2/coderd/database/dbmem"
	"github.com/coder/coder/v2/coderd/database/dbtestutil"
	"github.com/coder/coder/v2/coderd/database/dbtime"
	"github.com/coder/coder/v2/coderd/database/pubsub"
	"github.com/coder/coder/v2/coderd/externalauth"
)
@@ -537,6 +538,46 @@ func TestWorkspaceAgentTailnet(t *testing.T) { | |
require.Equal(t, "test", strings.TrimSpace(string(output))) | ||
} | ||
|
||
// TestWorkspaceAgentDialFailure validates that dialing a workspace agent
// fails with codersdk.DatabaseNotReachable when the control plane's
// database healthcheck (Ping) fails.
// NOTE(review): an earlier version of this comment claimed the tailnet
// controller continually retries until its context times out — confirm;
// per the DialAgent assertions below, the dial fails with the error
// directly rather than demonstrably retrying.
func TestWorkspaceAgentDialFailure(t *testing.T) {
	t.Parallel()

	store, ps := dbtestutil.NewDB(t)

	// Given: a database which will fail its Ping(ctx) call.
	// NOTE: The Ping(ctx) call is made by the Dialer.
	pdb := &pingFailingDB{
		Store: store,
	}
	client := coderdtest.New(t, &coderdtest.Options{
		Database:                 pdb,
		Pubsub:                   ps,
		IncludeProvisionerDaemon: true,
	})
	user := coderdtest.CreateFirstUser(t, client)

	// When: a workspace agent is setup and we try dial it.
	r := dbfake.WorkspaceBuild(t, pdb, database.WorkspaceTable{
		OrganizationID: user.OrganizationID,
		OwnerID:        user.UserID,
	}).WithAgent().Do()
	_ = agenttest.New(t, client.URL, r.AgentToken)
	resources := coderdtest.AwaitWorkspaceAgents(t, client, r.Workspace.ID)
	require.Len(t, resources, 1)
	require.Len(t, resources[0].Agents, 1)

	// When: the db is marked as unhealthy (i.e. will fail its Ping).
	// This needs to be done *after* the server "starts" otherwise it'll
	// fail straight away when trying to initialize.
	pdb.MarkUnhealthy()

	// Then: dialing the agent surfaces DatabaseNotReachable once the
	// dialer's healthcheck hits the failing Ping.
	ctx := testutil.Context(t, testutil.WaitMedium)
	conn, err := workspacesdk.New(client).DialAgent(ctx, resources[0].Agents[0].ID, nil)
	require.ErrorContains(t, err, codersdk.DatabaseNotReachable)
	require.Nil(t, conn)
}
|
||
func TestWorkspaceAgentClientCoordinate_BadVersion(t *testing.T) { | ||
t.Parallel() | ||
client, db := coderdtest.NewWithDatabase(t, nil) | ||
|
@@ -2591,3 +2632,22 @@ func TestAgentConnectionInfo(t *testing.T) { | |
require.True(t, info.DisableDirectConnections) | ||
require.True(t, info.DERPForceWebSockets) | ||
} | ||
|
||
type pingFailingDB struct { | ||
database.Store | ||
|
||
unhealthy bool | ||
} | ||
|
||
func (p *pingFailingDB) Ping(context.Context) (time.Duration, error) { | ||
if !p.unhealthy { | ||
return time.Nanosecond, nil | ||
} | ||
|
||
// Simulate a database connection error. | ||
return 0, xerrors.New("oops") | ||
} | ||
|
||
func (p *pingFailingDB) MarkUnhealthy() { | ||
p.unhealthy = true | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
package codersdk

import "golang.org/x/xerrors"

// DatabaseNotReachable is the message used to signal that the control
// plane's database cannot be reached; it is matched by string in API
// error responses (see the ErrorContains assertion in the agent tests).
const DatabaseNotReachable = "database not reachable"

// ErrDatabaseNotReachable is the sentinel error counterpart of
// DatabaseNotReachable, for callers that compare with errors.Is.
var ErrDatabaseNotReachable = xerrors.New(DatabaseNotReachable)
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,12 +20,13 @@ import ( | |
"golang.org/x/xerrors" | ||
|
||
"cdr.dev/slog" | ||
"github.com/coder/retry" | ||
|
||
"github.com/coder/coder/v2/coderd/tracing" | ||
"github.com/coder/coder/v2/codersdk" | ||
"github.com/coder/coder/v2/provisionerd/proto" | ||
"github.com/coder/coder/v2/provisionerd/runner" | ||
sdkproto "github.com/coder/coder/v2/provisionersdk/proto" | ||
"github.com/coder/retry" | ||
) | ||
|
||
// Dialer represents the function to create a daemon client connection. | ||
|
@@ -290,7 +291,7 @@ func (p *Server) acquireLoop() { | |
defer p.wg.Done() | ||
defer func() { close(p.acquireDoneCh) }() | ||
ctx := p.closeContext | ||
for { | ||
for retrier := retry.New(10*time.Millisecond, 1*time.Second); retrier.Wait(ctx); { | ||
dannykopping marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if p.acquireExit() { | ||
return | ||
} | ||
|
@@ -299,7 +300,10 @@ func (p *Server) acquireLoop() { | |
p.opts.Logger.Debug(ctx, "shut down before client (re) connected") | ||
return | ||
} | ||
p.acquireAndRunOne(client) | ||
err := p.acquireAndRunOne(client) | ||
if err != nil && ctx.Err() == nil { // Only log if context is not done. | ||
p.opts.Logger.Debug(ctx, "retrying to acquire job", slog.F("retry_in_ms", retrier.Delay.Milliseconds()), slog.Error(err)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Self-review: |
||
} | ||
dannykopping marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
} | ||
|
||
|
@@ -318,7 +322,7 @@ func (p *Server) acquireExit() bool { | |
return false | ||
} | ||
|
||
func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) { | ||
func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) error { | ||
ctx := p.closeContext | ||
p.opts.Logger.Debug(ctx, "start of acquireAndRunOne") | ||
job, err := p.acquireGraceful(client) | ||
|
@@ -327,15 +331,15 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) { | |
if errors.Is(err, context.Canceled) || | ||
errors.Is(err, yamux.ErrSessionShutdown) || | ||
errors.Is(err, fasthttputil.ErrInmemoryListenerClosed) { | ||
return | ||
return err | ||
} | ||
|
||
p.opts.Logger.Warn(ctx, "provisionerd was unable to acquire job", slog.Error(err)) | ||
return | ||
return xerrors.Errorf("failed to acquire job: %w", err) | ||
} | ||
if job.JobId == "" { | ||
p.opts.Logger.Debug(ctx, "acquire job successfully canceled") | ||
return | ||
return xerrors.New("canceled") | ||
dannykopping marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
if len(job.TraceMetadata) > 0 { | ||
|
@@ -392,7 +396,7 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) { | |
if err != nil { | ||
p.opts.Logger.Error(ctx, "provisioner job failed", slog.F("job_id", job.JobId), slog.Error(err)) | ||
} | ||
return | ||
return xerrors.Errorf("provisioner job failed: %w", err) | ||
dannykopping marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
p.mutex.Lock() | ||
|
@@ -416,6 +420,7 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) { | |
p.mutex.Lock() | ||
p.activeJob = nil | ||
p.mutex.Unlock() | ||
return nil | ||
} | ||
|
||
// acquireGraceful attempts to acquire a job from the server, handling canceling the acquisition if we gracefully shut | ||
|
Uh oh!
There was an error while loading. Please reload this page.