Skip to content

Commit 9ea3910

Browse files
fix: reduce excessive logging when database is unreachable (cherry-pick #17363) (#17411)
Co-authored-by: Danny Kopping <danny@coder.com> Fixes #17045
1 parent 8b5adaa commit 9ea3910

File tree

10 files changed

+160
-20
lines changed

10 files changed

+160
-20
lines changed

coderd/coderd.go

+5-4
Original file line numberDiff line numberDiff line change
@@ -665,10 +665,11 @@ func New(options *Options) *API {
665665
api.Auditor.Store(&options.Auditor)
666666
api.TailnetCoordinator.Store(&options.TailnetCoordinator)
667667
dialer := &InmemTailnetDialer{
668-
CoordPtr: &api.TailnetCoordinator,
669-
DERPFn: api.DERPMap,
670-
Logger: options.Logger,
671-
ClientID: uuid.New(),
668+
CoordPtr: &api.TailnetCoordinator,
669+
DERPFn: api.DERPMap,
670+
Logger: options.Logger,
671+
ClientID: uuid.New(),
672+
DatabaseHealthCheck: api.Database,
672673
}
673674
stn, err := NewServerTailnet(api.ctx,
674675
options.Logger,

coderd/tailnet.go

+15-1
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@ import (
2424
"tailscale.com/tailcfg"
2525

2626
"cdr.dev/slog"
27+
2728
"github.com/coder/coder/v2/coderd/tracing"
2829
"github.com/coder/coder/v2/coderd/workspaceapps"
2930
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
31+
"github.com/coder/coder/v2/codersdk"
3032
"github.com/coder/coder/v2/codersdk/workspacesdk"
3133
"github.com/coder/coder/v2/site"
3234
"github.com/coder/coder/v2/tailnet"
@@ -534,16 +536,28 @@ func NewMultiAgentController(ctx context.Context, logger slog.Logger, tracer tra
534536
return m
535537
}
536538

539+
type Pinger interface {
540+
Ping(context.Context) (time.Duration, error)
541+
}
542+
537543
// InmemTailnetDialer is a tailnet.ControlProtocolDialer that connects to a Coordinator and DERPMap
538544
// service running in the same memory space.
539545
type InmemTailnetDialer struct {
540546
CoordPtr *atomic.Pointer[tailnet.Coordinator]
541547
DERPFn func() *tailcfg.DERPMap
542548
Logger slog.Logger
543549
ClientID uuid.UUID
550+
// DatabaseHealthCheck is used to validate that the store is reachable.
551+
DatabaseHealthCheck Pinger
544552
}
545553

546-
func (a *InmemTailnetDialer) Dial(_ context.Context, _ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients, error) {
554+
func (a *InmemTailnetDialer) Dial(ctx context.Context, _ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients, error) {
555+
if a.DatabaseHealthCheck != nil {
556+
if _, err := a.DatabaseHealthCheck.Ping(ctx); err != nil {
557+
return tailnet.ControlProtocolClients{}, xerrors.Errorf("%w: %v", codersdk.ErrDatabaseNotReachable, err)
558+
}
559+
}
560+
547561
coord := a.CoordPtr.Load()
548562
if coord == nil {
549563
return tailnet.ControlProtocolClients{}, xerrors.Errorf("tailnet coordinator not initialized")

coderd/tailnet_test.go

+41
Original file line numberDiff line numberDiff line change
@@ -11,20 +11,23 @@ import (
1111
"strconv"
1212
"sync/atomic"
1313
"testing"
14+
"time"
1415

1516
"github.com/google/uuid"
1617
"github.com/prometheus/client_golang/prometheus"
1718
"github.com/spf13/afero"
1819
"github.com/stretchr/testify/assert"
1920
"github.com/stretchr/testify/require"
2021
"go.opentelemetry.io/otel/trace"
22+
"golang.org/x/xerrors"
2123
"tailscale.com/tailcfg"
2224

2325
"github.com/coder/coder/v2/agent"
2426
"github.com/coder/coder/v2/agent/agenttest"
2527
"github.com/coder/coder/v2/agent/proto"
2628
"github.com/coder/coder/v2/coderd"
2729
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
30+
"github.com/coder/coder/v2/codersdk"
2831
"github.com/coder/coder/v2/codersdk/agentsdk"
2932
"github.com/coder/coder/v2/codersdk/workspacesdk"
3033
"github.com/coder/coder/v2/tailnet"
@@ -365,6 +368,44 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
365368
})
366369
}
367370

371+
func TestDialFailure(t *testing.T) {
372+
t.Parallel()
373+
374+
// Setup.
375+
ctx := testutil.Context(t, testutil.WaitShort)
376+
logger := testutil.Logger(t)
377+
378+
// Given: a tailnet coordinator.
379+
coord := tailnet.NewCoordinator(logger)
380+
t.Cleanup(func() {
381+
_ = coord.Close()
382+
})
383+
coordPtr := atomic.Pointer[tailnet.Coordinator]{}
384+
coordPtr.Store(&coord)
385+
386+
// Given: a fake DB healthchecker which will always fail.
387+
fch := &failingHealthcheck{}
388+
389+
// When: dialing the in-memory coordinator.
390+
dialer := &coderd.InmemTailnetDialer{
391+
CoordPtr: &coordPtr,
392+
Logger: logger,
393+
ClientID: uuid.UUID{5},
394+
DatabaseHealthCheck: fch,
395+
}
396+
_, err := dialer.Dial(ctx, nil)
397+
398+
// Then: the error returned reflects the database has failed its healthcheck.
399+
require.ErrorIs(t, err, codersdk.ErrDatabaseNotReachable)
400+
}
401+
402+
type failingHealthcheck struct{}
403+
404+
func (failingHealthcheck) Ping(context.Context) (time.Duration, error) {
405+
// Simulate a database connection error.
406+
return 0, xerrors.New("oops")
407+
}
408+
368409
type wrappedListener struct {
369410
net.Listener
370411
dials int32

coderd/workspaceagents.go

+10
Original file line numberDiff line numberDiff line change
@@ -989,6 +989,16 @@ func (api *API) derpMapUpdates(rw http.ResponseWriter, r *http.Request) {
989989
func (api *API) workspaceAgentClientCoordinate(rw http.ResponseWriter, r *http.Request) {
990990
ctx := r.Context()
991991

992+
// Ensure the database is reachable before proceeding.
993+
_, err := api.Database.Ping(ctx)
994+
if err != nil {
995+
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
996+
Message: codersdk.DatabaseNotReachable,
997+
Detail: err.Error(),
998+
})
999+
return
1000+
}
1001+
9921002
// This route accepts user API key auth and workspace proxy auth. The moon actor has
9931003
// full permissions so should be able to pass this authz check.
9941004
workspace := httpmw.WorkspaceParam(r)

codersdk/database.go

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
package codersdk
2+
3+
import "golang.org/x/xerrors"
4+
5+
const DatabaseNotReachable = "database not reachable"
6+
7+
var ErrDatabaseNotReachable = xerrors.New(DatabaseNotReachable)

codersdk/workspacesdk/dialer.go

+11-4
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,19 @@ import (
1111
"golang.org/x/xerrors"
1212

1313
"cdr.dev/slog"
14+
"github.com/coder/websocket"
15+
1416
"github.com/coder/coder/v2/buildinfo"
1517
"github.com/coder/coder/v2/codersdk"
1618
"github.com/coder/coder/v2/tailnet"
1719
"github.com/coder/coder/v2/tailnet/proto"
18-
"github.com/coder/websocket"
1920
)
2021

2122
var permanentErrorStatuses = []int{
22-
http.StatusConflict, // returned if client/agent connections disabled (browser only)
23-
http.StatusBadRequest, // returned if API mismatch
24-
http.StatusNotFound, // returned if user doesn't have permission or agent doesn't exist
23+
http.StatusConflict, // returned if client/agent connections disabled (browser only)
24+
http.StatusBadRequest, // returned if API mismatch
25+
http.StatusNotFound, // returned if user doesn't have permission or agent doesn't exist
26+
http.StatusInternalServerError, // returned if database is not reachable,
2527
}
2628

2729
type WebsocketDialer struct {
@@ -89,6 +91,11 @@ func (w *WebsocketDialer) Dial(ctx context.Context, r tailnet.ResumeTokenControl
8991
"Ensure your client release version (%s, different than the API version) matches the server release version",
9092
buildinfo.Version())
9193
}
94+
95+
if sdkErr.Message == codersdk.DatabaseNotReachable &&
96+
sdkErr.StatusCode() == http.StatusInternalServerError {
97+
err = xerrors.Errorf("%w: %v", codersdk.ErrDatabaseNotReachable, err)
98+
}
9299
}
93100
w.connected <- err
94101
return tailnet.ControlProtocolClients{}, err

codersdk/workspacesdk/workspacesdk_test.go

+35
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,21 @@
11
package workspacesdk_test
22

33
import (
4+
"net/http"
5+
"net/http/httptest"
46
"net/url"
57
"testing"
68

79
"github.com/stretchr/testify/require"
810
"tailscale.com/tailcfg"
911

12+
"github.com/coder/websocket"
13+
14+
"github.com/coder/coder/v2/coderd/httpapi"
15+
"github.com/coder/coder/v2/codersdk"
1016
"github.com/coder/coder/v2/codersdk/agentsdk"
17+
"github.com/coder/coder/v2/codersdk/workspacesdk"
18+
"github.com/coder/coder/v2/testutil"
1119
)
1220

1321
func TestWorkspaceRewriteDERPMap(t *testing.T) {
@@ -37,3 +45,30 @@ func TestWorkspaceRewriteDERPMap(t *testing.T) {
3745
require.Equal(t, "coconuts.org", node.HostName)
3846
require.Equal(t, 44558, node.DERPPort)
3947
}
48+
49+
func TestWorkspaceDialerFailure(t *testing.T) {
50+
t.Parallel()
51+
52+
// Setup.
53+
ctx := testutil.Context(t, testutil.WaitShort)
54+
logger := testutil.Logger(t)
55+
56+
// Given: a mock HTTP server which mimicks an unreachable database when calling the coordination endpoint.
57+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
58+
httpapi.Write(ctx, w, http.StatusInternalServerError, codersdk.Response{
59+
Message: codersdk.DatabaseNotReachable,
60+
Detail: "oops",
61+
})
62+
}))
63+
t.Cleanup(srv.Close)
64+
65+
u, err := url.Parse(srv.URL)
66+
require.NoError(t, err)
67+
68+
// When: calling the coordination endpoint.
69+
dialer := workspacesdk.NewWebsocketDialer(logger, u, &websocket.DialOptions{})
70+
_, err = dialer.Dial(ctx, nil)
71+
72+
// Then: an error indicating a database issue is returned, to conditionalize the behavior of the caller.
73+
require.ErrorIs(t, err, codersdk.ErrDatabaseNotReachable)
74+
}

provisionerd/provisionerd.go

+21-9
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,13 @@ import (
2020
"golang.org/x/xerrors"
2121

2222
"cdr.dev/slog"
23+
"github.com/coder/retry"
24+
2325
"github.com/coder/coder/v2/coderd/tracing"
2426
"github.com/coder/coder/v2/codersdk"
2527
"github.com/coder/coder/v2/provisionerd/proto"
2628
"github.com/coder/coder/v2/provisionerd/runner"
2729
sdkproto "github.com/coder/coder/v2/provisionersdk/proto"
28-
"github.com/coder/retry"
2930
)
3031

3132
// Dialer represents the function to create a daemon client connection.
@@ -290,7 +291,7 @@ func (p *Server) acquireLoop() {
290291
defer p.wg.Done()
291292
defer func() { close(p.acquireDoneCh) }()
292293
ctx := p.closeContext
293-
for {
294+
for retrier := retry.New(10*time.Millisecond, 1*time.Second); retrier.Wait(ctx); {
294295
if p.acquireExit() {
295296
return
296297
}
@@ -299,7 +300,17 @@ func (p *Server) acquireLoop() {
299300
p.opts.Logger.Debug(ctx, "shut down before client (re) connected")
300301
return
301302
}
302-
p.acquireAndRunOne(client)
303+
err := p.acquireAndRunOne(client)
304+
if err != nil && ctx.Err() == nil { // Only log if context is not done.
305+
// Short-circuit: don't wait for the retry delay to exit, if required.
306+
if p.acquireExit() {
307+
return
308+
}
309+
p.opts.Logger.Warn(ctx, "failed to acquire job, retrying", slog.F("delay", fmt.Sprintf("%vms", retrier.Delay.Milliseconds())), slog.Error(err))
310+
} else {
311+
// Reset the retrier after each successful acquisition.
312+
retrier.Reset()
313+
}
303314
}
304315
}
305316

@@ -318,7 +329,7 @@ func (p *Server) acquireExit() bool {
318329
return false
319330
}
320331

321-
func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
332+
func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) error {
322333
ctx := p.closeContext
323334
p.opts.Logger.Debug(ctx, "start of acquireAndRunOne")
324335
job, err := p.acquireGraceful(client)
@@ -327,15 +338,15 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
327338
if errors.Is(err, context.Canceled) ||
328339
errors.Is(err, yamux.ErrSessionShutdown) ||
329340
errors.Is(err, fasthttputil.ErrInmemoryListenerClosed) {
330-
return
341+
return err
331342
}
332343

333344
p.opts.Logger.Warn(ctx, "provisionerd was unable to acquire job", slog.Error(err))
334-
return
345+
return xerrors.Errorf("failed to acquire job: %w", err)
335346
}
336347
if job.JobId == "" {
337348
p.opts.Logger.Debug(ctx, "acquire job successfully canceled")
338-
return
349+
return nil
339350
}
340351

341352
if len(job.TraceMetadata) > 0 {
@@ -390,9 +401,9 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
390401
Error: fmt.Sprintf("failed to connect to provisioner: %s", resp.Error),
391402
})
392403
if err != nil {
393-
p.opts.Logger.Error(ctx, "provisioner job failed", slog.F("job_id", job.JobId), slog.Error(err))
404+
p.opts.Logger.Error(ctx, "failed to report provisioner job failed", slog.F("job_id", job.JobId), slog.Error(err))
394405
}
395-
return
406+
return xerrors.Errorf("failed to report provisioner job failed: %w", err)
396407
}
397408

398409
p.mutex.Lock()
@@ -416,6 +427,7 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
416427
p.mutex.Lock()
417428
p.activeJob = nil
418429
p.mutex.Unlock()
430+
return nil
419431
}
420432

421433
// acquireGraceful attempts to acquire a job from the server, handling canceling the acquisition if we gracefully shut

site/src/api/typesGenerated.ts

+3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tailnet/controllers.go

+12-2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package tailnet
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
67
"io"
78
"maps"
@@ -20,11 +21,12 @@ import (
2021
"tailscale.com/util/dnsname"
2122

2223
"cdr.dev/slog"
24+
"github.com/coder/quartz"
25+
"github.com/coder/retry"
26+
2327
"github.com/coder/coder/v2/coderd/util/ptr"
2428
"github.com/coder/coder/v2/codersdk"
2529
"github.com/coder/coder/v2/tailnet/proto"
26-
"github.com/coder/quartz"
27-
"github.com/coder/retry"
2830
)
2931

3032
// A Controller connects to the tailnet control plane, and then uses the control protocols to
@@ -1362,6 +1364,14 @@ func (c *Controller) Run(ctx context.Context) {
13621364
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
13631365
return
13641366
}
1367+
1368+
// If the database is unreachable by the control plane, there's not much we can do, so we'll just retry later.
1369+
if errors.Is(err, codersdk.ErrDatabaseNotReachable) {
1370+
c.logger.Warn(c.ctx, "control plane lost connection to database, retrying",
1371+
slog.Error(err), slog.F("delay", fmt.Sprintf("%vms", retrier.Delay.Milliseconds())))
1372+
continue
1373+
}
1374+
13651375
errF := slog.Error(err)
13661376
var sdkErr *codersdk.Error
13671377
if xerrors.As(err, &sdkErr) {

0 commit comments

Comments
 (0)