Skip to content

fix: reduce excessive logging when database is unreachable #17363

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 15, 2025
9 changes: 5 additions & 4 deletions coderd/coderd.go
Original file line number Diff line number Diff line change
Expand Up @@ -675,10 +675,11 @@ func New(options *Options) *API {
api.Auditor.Store(&options.Auditor)
api.TailnetCoordinator.Store(&options.TailnetCoordinator)
dialer := &InmemTailnetDialer{
CoordPtr: &api.TailnetCoordinator,
DERPFn: api.DERPMap,
Logger: options.Logger,
ClientID: uuid.New(),
CoordPtr: &api.TailnetCoordinator,
DERPFn: api.DERPMap,
Logger: options.Logger,
ClientID: uuid.New(),
DatabaseHealthCheck: api.Database,
}
stn, err := NewServerTailnet(api.ctx,
options.Logger,
Expand Down
16 changes: 15 additions & 1 deletion coderd/tailnet.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,11 @@ import (
"tailscale.com/tailcfg"

"cdr.dev/slog"

"github.com/coder/coder/v2/coderd/tracing"
"github.com/coder/coder/v2/coderd/workspaceapps"
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/workspacesdk"
"github.com/coder/coder/v2/site"
"github.com/coder/coder/v2/tailnet"
Expand Down Expand Up @@ -534,16 +536,28 @@ func NewMultiAgentController(ctx context.Context, logger slog.Logger, tracer tra
return m
}

type Pinger interface {
Ping(context.Context) (time.Duration, error)
}

// InmemTailnetDialer is a tailnet.ControlProtocolDialer that connects to a Coordinator and DERPMap
// service running in the same memory space.
type InmemTailnetDialer struct {
CoordPtr *atomic.Pointer[tailnet.Coordinator]
DERPFn func() *tailcfg.DERPMap
Logger slog.Logger
ClientID uuid.UUID
// DatabaseHealthCheck is used to validate that the store is reachable.
DatabaseHealthCheck Pinger
}

func (a *InmemTailnetDialer) Dial(_ context.Context, _ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients, error) {
func (a *InmemTailnetDialer) Dial(ctx context.Context, _ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients, error) {
if a.DatabaseHealthCheck != nil {
if _, err := a.DatabaseHealthCheck.Ping(ctx); err != nil {
return tailnet.ControlProtocolClients{}, xerrors.Errorf("%w: %v", codersdk.ErrDatabaseNotReachable, err)
}
}

coord := a.CoordPtr.Load()
if coord == nil {
return tailnet.ControlProtocolClients{}, xerrors.Errorf("tailnet coordinator not initialized")
Expand Down
41 changes: 41 additions & 0 deletions coderd/tailnet_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,23 @@ import (
"strconv"
"sync/atomic"
"testing"
"time"

"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"github.com/spf13/afero"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.opentelemetry.io/otel/trace"
"golang.org/x/xerrors"
"tailscale.com/tailcfg"

"github.com/coder/coder/v2/agent"
"github.com/coder/coder/v2/agent/agenttest"
"github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd"
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/agentsdk"
"github.com/coder/coder/v2/codersdk/workspacesdk"
"github.com/coder/coder/v2/tailnet"
Expand Down Expand Up @@ -365,6 +368,44 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
})
}

func TestDialFailure(t *testing.T) {
t.Parallel()

// Setup.
ctx := testutil.Context(t, testutil.WaitShort)
logger := testutil.Logger(t)

// Given: a tailnet coordinator.
coord := tailnet.NewCoordinator(logger)
t.Cleanup(func() {
_ = coord.Close()
})
coordPtr := atomic.Pointer[tailnet.Coordinator]{}
coordPtr.Store(&coord)

// Given: a fake DB healthchecker which will always fail.
fch := &failingHealthcheck{}

// When: dialing the in-memory coordinator.
dialer := &coderd.InmemTailnetDialer{
CoordPtr: &coordPtr,
Logger: logger,
ClientID: uuid.UUID{5},
DatabaseHealthCheck: fch,
}
_, err := dialer.Dial(ctx, nil)

// Then: the error returned reflects the database has failed its healthcheck.
require.ErrorIs(t, err, codersdk.ErrDatabaseNotReachable)
}

type failingHealthcheck struct{}

func (failingHealthcheck) Ping(context.Context) (time.Duration, error) {
// Simulate a database connection error.
return 0, xerrors.New("oops")
}

type wrappedListener struct {
net.Listener
dials int32
Expand Down
10 changes: 10 additions & 0 deletions coderd/workspaceagents.go
Original file line number Diff line number Diff line change
Expand Up @@ -997,6 +997,16 @@ func (api *API) derpMapUpdates(rw http.ResponseWriter, r *http.Request) {
func (api *API) workspaceAgentClientCoordinate(rw http.ResponseWriter, r *http.Request) {
ctx := r.Context()

// Ensure the database is reachable before proceeding.
_, err := api.Database.Ping(ctx)
if err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: codersdk.DatabaseNotReachable,
Detail: err.Error(),
})
return
}

// This route accepts user API key auth and workspace proxy auth. The moon actor has
// full permissions so should be able to pass this authz check.
workspace := httpmw.WorkspaceParam(r)
Expand Down
7 changes: 7 additions & 0 deletions codersdk/database.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package codersdk

import "golang.org/x/xerrors"

const DatabaseNotReachable = "database not reachable"

var ErrDatabaseNotReachable = xerrors.New(DatabaseNotReachable)
15 changes: 11 additions & 4 deletions codersdk/workspacesdk/dialer.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,19 @@ import (
"golang.org/x/xerrors"

"cdr.dev/slog"
"github.com/coder/websocket"

"github.com/coder/coder/v2/buildinfo"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/tailnet"
"github.com/coder/coder/v2/tailnet/proto"
"github.com/coder/websocket"
)

var permanentErrorStatuses = []int{
http.StatusConflict, // returned if client/agent connections disabled (browser only)
http.StatusBadRequest, // returned if API mismatch
http.StatusNotFound, // returned if user doesn't have permission or agent doesn't exist
http.StatusConflict, // returned if client/agent connections disabled (browser only)
http.StatusBadRequest, // returned if API mismatch
http.StatusNotFound, // returned if user doesn't have permission or agent doesn't exist
http.StatusInternalServerError, // returned if database is not reachable,
}

type WebsocketDialer struct {
Expand Down Expand Up @@ -89,6 +91,11 @@ func (w *WebsocketDialer) Dial(ctx context.Context, r tailnet.ResumeTokenControl
"Ensure your client release version (%s, different than the API version) matches the server release version",
buildinfo.Version())
}

if sdkErr.Message == codersdk.DatabaseNotReachable &&
sdkErr.StatusCode() == http.StatusInternalServerError {
err = xerrors.Errorf("%w: %v", codersdk.ErrDatabaseNotReachable, err)
}
}
w.connected <- err
return tailnet.ControlProtocolClients{}, err
Expand Down
35 changes: 35 additions & 0 deletions codersdk/workspacesdk/workspacesdk_test.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
package workspacesdk_test

import (
"net/http"
"net/http/httptest"
"net/url"
"testing"

"github.com/stretchr/testify/require"
"tailscale.com/tailcfg"

"github.com/coder/websocket"

"github.com/coder/coder/v2/coderd/httpapi"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/agentsdk"
"github.com/coder/coder/v2/codersdk/workspacesdk"
"github.com/coder/coder/v2/testutil"
)

func TestWorkspaceRewriteDERPMap(t *testing.T) {
Expand Down Expand Up @@ -37,3 +45,30 @@ func TestWorkspaceRewriteDERPMap(t *testing.T) {
require.Equal(t, "coconuts.org", node.HostName)
require.Equal(t, 44558, node.DERPPort)
}

func TestWorkspaceDialerFailure(t *testing.T) {
t.Parallel()

// Setup.
ctx := testutil.Context(t, testutil.WaitShort)
logger := testutil.Logger(t)

// Given: a mock HTTP server which mimicks an unreachable database when calling the coordination endpoint.
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
httpapi.Write(ctx, w, http.StatusInternalServerError, codersdk.Response{
Message: codersdk.DatabaseNotReachable,
Detail: "oops",
})
}))
t.Cleanup(srv.Close)

u, err := url.Parse(srv.URL)
require.NoError(t, err)

// When: calling the coordination endpoint.
dialer := workspacesdk.NewWebsocketDialer(logger, u, &websocket.DialOptions{})
_, err = dialer.Dial(ctx, nil)

// Then: an error indicating a database issue is returned, to conditionalize the behavior of the caller.
require.ErrorIs(t, err, codersdk.ErrDatabaseNotReachable)
}
30 changes: 21 additions & 9 deletions provisionerd/provisionerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,13 @@ import (
"golang.org/x/xerrors"

"cdr.dev/slog"
"github.com/coder/retry"

"github.com/coder/coder/v2/coderd/tracing"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/provisionerd/proto"
"github.com/coder/coder/v2/provisionerd/runner"
sdkproto "github.com/coder/coder/v2/provisionersdk/proto"
"github.com/coder/retry"
)

// Dialer represents the function to create a daemon client connection.
Expand Down Expand Up @@ -290,7 +291,7 @@ func (p *Server) acquireLoop() {
defer p.wg.Done()
defer func() { close(p.acquireDoneCh) }()
ctx := p.closeContext
for {
for retrier := retry.New(10*time.Millisecond, 1*time.Second); retrier.Wait(ctx); {
if p.acquireExit() {
return
}
Expand All @@ -299,7 +300,17 @@ func (p *Server) acquireLoop() {
p.opts.Logger.Debug(ctx, "shut down before client (re) connected")
return
}
p.acquireAndRunOne(client)
err := p.acquireAndRunOne(client)
if err != nil && ctx.Err() == nil { // Only log if context is not done.
// Short-circuit: don't wait for the retry delay to exit, if required.
if p.acquireExit() {
return
}
p.opts.Logger.Warn(ctx, "failed to acquire job, retrying", slog.F("delay", fmt.Sprintf("%vms", retrier.Delay.Milliseconds())), slog.Error(err))
} else {
// Reset the retrier after each successful acquisition.
retrier.Reset()
}
}
}

Expand All @@ -318,7 +329,7 @@ func (p *Server) acquireExit() bool {
return false
}

func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) error {
ctx := p.closeContext
p.opts.Logger.Debug(ctx, "start of acquireAndRunOne")
job, err := p.acquireGraceful(client)
Expand All @@ -327,15 +338,15 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
if errors.Is(err, context.Canceled) ||
errors.Is(err, yamux.ErrSessionShutdown) ||
errors.Is(err, fasthttputil.ErrInmemoryListenerClosed) {
return
return err
}

p.opts.Logger.Warn(ctx, "provisionerd was unable to acquire job", slog.Error(err))
return
return xerrors.Errorf("failed to acquire job: %w", err)
}
if job.JobId == "" {
p.opts.Logger.Debug(ctx, "acquire job successfully canceled")
return
return nil
}

if len(job.TraceMetadata) > 0 {
Expand Down Expand Up @@ -390,9 +401,9 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
Error: fmt.Sprintf("failed to connect to provisioner: %s", resp.Error),
})
if err != nil {
p.opts.Logger.Error(ctx, "provisioner job failed", slog.F("job_id", job.JobId), slog.Error(err))
p.opts.Logger.Error(ctx, "failed to report provisioner job failed", slog.F("job_id", job.JobId), slog.Error(err))
}
return
return xerrors.Errorf("failed to report provisioner job failed: %w", err)
}

p.mutex.Lock()
Expand All @@ -416,6 +427,7 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
p.mutex.Lock()
p.activeJob = nil
p.mutex.Unlock()
return nil
}

// acquireGraceful attempts to acquire a job from the server, handling canceling the acquisition if we gracefully shut
Expand Down
3 changes: 3 additions & 0 deletions site/src/api/typesGenerated.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 12 additions & 2 deletions tailnet/controllers.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package tailnet

import (
"context"
"errors"
"fmt"
"io"
"maps"
Expand All @@ -21,11 +22,12 @@ import (
"tailscale.com/util/dnsname"

"cdr.dev/slog"
"github.com/coder/quartz"
"github.com/coder/retry"

"github.com/coder/coder/v2/coderd/util/ptr"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/tailnet/proto"
"github.com/coder/quartz"
"github.com/coder/retry"
)

// A Controller connects to the tailnet control plane, and then uses the control protocols to
Expand Down Expand Up @@ -1381,6 +1383,14 @@ func (c *Controller) Run(ctx context.Context) {
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
return
}

// If the database is unreachable by the control plane, there's not much we can do, so we'll just retry later.
if errors.Is(err, codersdk.ErrDatabaseNotReachable) {
c.logger.Warn(c.ctx, "control plane lost connection to database, retrying",
slog.Error(err), slog.F("delay", fmt.Sprintf("%vms", retrier.Delay.Milliseconds())))
continue
}

errF := slog.Error(err)
var sdkErr *codersdk.Error
if xerrors.As(err, &sdkErr) {
Expand Down
Loading