Skip to content

Commit 374127e

Browse files
committed
fix: prevent single replica proxies from staying unhealthy (#12641)
In the peer healthcheck code, when an error pinging peers is detected we write a "replicaErr" string with the error reason. However, if there are no peer replicas to ping we returned early without setting the string to empty. This would cause replicas that had peers (which were failing) and then the peers left to permanently show an error until a new peer appeared. Also demotes DERP replica checking to a "warning" rather than an "error" which should prevent the primary from removing the proxy from the region map if DERP meshing is non-functional. This can happen without causing problems if the peer is shutting down so we don't want to disrupt everything if there isn't an issue. (cherry picked from commit cf50461)
1 parent 793df2e commit 374127e

File tree

2 files changed

+145
-30
lines changed

2 files changed

+145
-30
lines changed

enterprise/wsproxy/wsproxy.go

+26-25
Original file line numberDiff line numberDiff line change
@@ -449,8 +449,21 @@ func (s *Server) handleRegister(res wsproxysdk.RegisterWorkspaceProxyResponse) e
449449
}
450450

451451
func (s *Server) pingSiblingReplicas(replicas []codersdk.Replica) {
452+
ctx, cancel := context.WithTimeout(s.ctx, 10*time.Second)
453+
defer cancel()
454+
455+
errStr := pingSiblingReplicas(ctx, s.Logger, &s.replicaPingSingleflight, s.derpMeshTLSConfig, replicas)
456+
s.replicaErrMut.Lock()
457+
s.replicaErr = errStr
458+
s.replicaErrMut.Unlock()
459+
if s.Options.ReplicaErrCallback != nil {
460+
s.Options.ReplicaErrCallback(replicas, s.replicaErr)
461+
}
462+
}
463+
464+
func pingSiblingReplicas(ctx context.Context, logger slog.Logger, sf *singleflight.Group, derpMeshTLSConfig *tls.Config, replicas []codersdk.Replica) string {
452465
if len(replicas) == 0 {
453-
return
466+
return ""
454467
}
455468

456469
// Avoid pinging multiple times at once if the list hasn't changed.
@@ -462,18 +475,11 @@ func (s *Server) pingSiblingReplicas(replicas []codersdk.Replica) {
462475
singleflightStr := strings.Join(relayURLs, " ") // URLs can't contain spaces.
463476

464477
//nolint:dogsled
465-
_, _, _ = s.replicaPingSingleflight.Do(singleflightStr, func() (any, error) {
466-
const (
467-
perReplicaTimeout = 3 * time.Second
468-
fullTimeout = 10 * time.Second
469-
)
470-
ctx, cancel := context.WithTimeout(s.ctx, fullTimeout)
471-
defer cancel()
472-
478+
errStrInterface, _, _ := sf.Do(singleflightStr, func() (any, error) {
473479
client := http.Client{
474-
Timeout: perReplicaTimeout,
480+
Timeout: 3 * time.Second,
475481
Transport: &http.Transport{
476-
TLSClientConfig: s.derpMeshTLSConfig,
482+
TLSClientConfig: derpMeshTLSConfig,
477483
DisableKeepAlives: true,
478484
},
479485
}
@@ -485,7 +491,7 @@ func (s *Server) pingSiblingReplicas(replicas []codersdk.Replica) {
485491
err := replicasync.PingPeerReplica(ctx, client, peer.RelayAddress)
486492
if err != nil {
487493
errs <- xerrors.Errorf("ping sibling replica %s (%s): %w", peer.Hostname, peer.RelayAddress, err)
488-
s.Logger.Warn(ctx, "failed to ping sibling replica, this could happen if the replica has shutdown",
494+
logger.Warn(ctx, "failed to ping sibling replica, this could happen if the replica has shutdown",
489495
slog.F("replica_hostname", peer.Hostname),
490496
slog.F("replica_relay_address", peer.RelayAddress),
491497
slog.Error(err),
@@ -504,20 +510,14 @@ func (s *Server) pingSiblingReplicas(replicas []codersdk.Replica) {
504510
}
505511
}
506512

507-
s.replicaErrMut.Lock()
508-
defer s.replicaErrMut.Unlock()
509-
s.replicaErr = ""
510-
if len(replicaErrs) > 0 {
511-
s.replicaErr = fmt.Sprintf("Failed to dial peers: %s", strings.Join(replicaErrs, ", "))
512-
}
513-
if s.Options.ReplicaErrCallback != nil {
514-
s.Options.ReplicaErrCallback(replicas, s.replicaErr)
513+
if len(replicaErrs) == 0 {
514+
return "", nil
515515
}
516-
517-
//nolint:nilnil // we don't actually use the return value of the
518-
// singleflight here
519-
return nil, nil
516+
return fmt.Sprintf("Failed to dial peers: %s", strings.Join(replicaErrs, ", ")), nil
520517
})
518+
519+
//nolint:forcetypeassert
520+
return errStrInterface.(string)
521521
}
522522

523523
func (s *Server) handleRegisterFailure(err error) {
@@ -590,7 +590,8 @@ func (s *Server) healthReport(rw http.ResponseWriter, r *http.Request) {
590590

591591
s.replicaErrMut.Lock()
592592
if s.replicaErr != "" {
593-
report.Errors = append(report.Errors, "High availability networking: it appears you are running more than one replica of the proxy, but the replicas are unable to establish a mesh for networking: "+s.replicaErr)
593+
report.Warnings = append(report.Warnings,
594+
"High availability networking: it appears you are running more than one replica of the proxy, but the replicas are unable to establish a mesh for networking: "+s.replicaErr)
594595
}
595596
s.replicaErrMut.Unlock()
596597

enterprise/wsproxy/wsproxy_test.go

+119-5
Original file line numberDiff line numberDiff line change
@@ -563,7 +563,7 @@ func TestWorkspaceProxyDERPMeshProbe(t *testing.T) {
563563
return proxyRes
564564
}
565565

566-
registerBrokenProxy := func(ctx context.Context, t *testing.T, primaryAccessURL *url.URL, accessURL, token string) {
566+
registerBrokenProxy := func(ctx context.Context, t *testing.T, primaryAccessURL *url.URL, accessURL, token string) uuid.UUID {
567567
t.Helper()
568568
// Create a HTTP server that always replies with 500.
569569
srv := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
@@ -574,21 +574,23 @@ func TestWorkspaceProxyDERPMeshProbe(t *testing.T) {
574574
// Register a proxy.
575575
wsproxyClient := wsproxysdk.New(primaryAccessURL)
576576
wsproxyClient.SetSessionToken(token)
577-
578577
hostname, err := cryptorand.String(6)
579578
require.NoError(t, err)
579+
replicaID := uuid.New()
580580
_, err = wsproxyClient.RegisterWorkspaceProxy(ctx, wsproxysdk.RegisterWorkspaceProxyRequest{
581581
AccessURL: accessURL,
582582
WildcardHostname: "",
583583
DerpEnabled: true,
584584
DerpOnly: false,
585-
ReplicaID: uuid.New(),
585+
ReplicaID: replicaID,
586586
ReplicaHostname: hostname,
587587
ReplicaError: "",
588588
ReplicaRelayAddress: srv.URL,
589589
Version: buildinfo.Version(),
590590
})
591591
require.NoError(t, err)
592+
593+
return replicaID
592594
}
593595

594596
t.Run("ProbeOK", func(t *testing.T) {
@@ -781,8 +783,120 @@ func TestWorkspaceProxyDERPMeshProbe(t *testing.T) {
781783
resp.Body.Close()
782784
require.NoError(t, err)
783785

784-
require.Len(t, respJSON.Errors, 1, "proxy is healthy")
785-
require.Contains(t, respJSON.Errors[0], "High availability networking")
786+
require.Len(t, respJSON.Warnings, 1, "proxy is healthy")
787+
require.Contains(t, respJSON.Warnings[0], "High availability networking")
788+
})
789+
790+
// This test catches a regression we detected on dogfood which caused
791+
// proxies to remain unhealthy after a mesh failure if they dropped to zero
792+
// siblings after the failure.
793+
t.Run("HealthyZero", func(t *testing.T) {
794+
t.Parallel()
795+
796+
deploymentValues := coderdtest.DeploymentValues(t)
797+
deploymentValues.Experiments = []string{
798+
"*",
799+
}
800+
801+
client, closer, api, _ := coderdenttest.NewWithAPI(t, &coderdenttest.Options{
802+
Options: &coderdtest.Options{
803+
DeploymentValues: deploymentValues,
804+
AppHostname: "*.primary.test.coder.com",
805+
IncludeProvisionerDaemon: true,
806+
RealIPConfig: &httpmw.RealIPConfig{
807+
TrustedOrigins: []*net.IPNet{{
808+
IP: net.ParseIP("127.0.0.1"),
809+
Mask: net.CIDRMask(8, 32),
810+
}},
811+
TrustedHeaders: []string{
812+
"CF-Connecting-IP",
813+
},
814+
},
815+
},
816+
LicenseOptions: &coderdenttest.LicenseOptions{
817+
Features: license.Features{
818+
codersdk.FeatureWorkspaceProxy: 1,
819+
},
820+
},
821+
})
822+
t.Cleanup(func() {
823+
_ = closer.Close()
824+
})
825+
826+
proxyURL, err := url.Parse("https://proxy2.test.coder.com")
827+
require.NoError(t, err)
828+
829+
// Create 1 real proxy replica.
830+
replicaPingErr := make(chan string, 4)
831+
proxy := coderdenttest.NewWorkspaceProxyReplica(t, api, client, &coderdenttest.ProxyOptions{
832+
Name: "proxy-2",
833+
ProxyURL: proxyURL,
834+
ReplicaPingCallback: func(replicas []codersdk.Replica, err string) {
835+
replicaPingErr <- err
836+
},
837+
})
838+
839+
ctx := testutil.Context(t, testutil.WaitLong)
840+
otherReplicaID := registerBrokenProxy(ctx, t, api.AccessURL, proxyURL.String(), proxy.Options.ProxySessionToken)
841+
842+
// Force the proxy to re-register immediately.
843+
err = proxy.RegisterNow()
844+
require.NoError(t, err, "failed to force proxy to re-register")
845+
846+
// Wait for the ping to fail.
847+
for {
848+
replicaErr := testutil.RequireRecvCtx(ctx, t, replicaPingErr)
849+
t.Log("replica ping error:", replicaErr)
850+
if replicaErr != "" {
851+
break
852+
}
853+
}
854+
855+
// GET /healthz-report
856+
u := proxy.ServerURL.ResolveReference(&url.URL{Path: "/healthz-report"})
857+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
858+
require.NoError(t, err)
859+
resp, err := http.DefaultClient.Do(req)
860+
require.NoError(t, err)
861+
var respJSON codersdk.ProxyHealthReport
862+
err = json.NewDecoder(resp.Body).Decode(&respJSON)
863+
resp.Body.Close()
864+
require.NoError(t, err)
865+
require.Len(t, respJSON.Warnings, 1, "proxy is healthy")
866+
require.Contains(t, respJSON.Warnings[0], "High availability networking")
867+
868+
// Deregister the other replica.
869+
wsproxyClient := wsproxysdk.New(api.AccessURL)
870+
wsproxyClient.SetSessionToken(proxy.Options.ProxySessionToken)
871+
err = wsproxyClient.DeregisterWorkspaceProxy(ctx, wsproxysdk.DeregisterWorkspaceProxyRequest{
872+
ReplicaID: otherReplicaID,
873+
})
874+
require.NoError(t, err)
875+
876+
// Force the proxy to re-register immediately.
877+
err = proxy.RegisterNow()
878+
require.NoError(t, err, "failed to force proxy to re-register")
879+
880+
// Wait for the ping to be skipped.
881+
for {
882+
replicaErr := testutil.RequireRecvCtx(ctx, t, replicaPingErr)
883+
t.Log("replica ping error:", replicaErr)
884+
// Should be empty because there are no more peers. This was where
885+
// the regression was.
886+
if replicaErr == "" {
887+
break
888+
}
889+
}
890+
891+
// GET /healthz-report
892+
req, err = http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
893+
require.NoError(t, err)
894+
resp, err = http.DefaultClient.Do(req)
895+
require.NoError(t, err)
896+
err = json.NewDecoder(resp.Body).Decode(&respJSON)
897+
resp.Body.Close()
898+
require.NoError(t, err)
899+
require.Len(t, respJSON.Warnings, 0, "proxy is unhealthy")
786900
})
787901
}
788902

0 commit comments

Comments
 (0)