Commit 627232e

fix: fix pgcoord to delete coordinator row last (coder#12155)
Fixes coder#12141
Fixes coder#11750

PGCoord shutdown was uncoordinated, so an update sent at an inopportune moment during shutdown could be rejected because the coordinator row had already been deleted. This PR ensures that the PGCoord subcomponents that write updates are shut down before we take down the heartbeats subcomponent, which is responsible for deleting the coordinator row.
Parent: 7a45360
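The ordering the commit message describes boils down to: components that write rows referencing the coordinator must drain before the component whose shutdown deletes the coordinator row. Below is a minimal, self-contained Go sketch of that pattern; the `writer`/deleter names are hypothetical stand-ins, not the actual pgcoord types.

```go
package main

import (
	"context"
	"fmt"
	"sync"
)

// writer stands in for the binder/tunneler: worker goroutines that write rows
// which reference the coordinator row via foreign keys.
type writer struct {
	wg sync.WaitGroup
}

func newWriter(ctx context.Context, name string, workers int) *writer {
	w := &writer{}
	// Register workers up front so a concurrent Wait cannot return before
	// the workers have even started.
	w.wg.Add(workers)
	for i := 0; i < workers; i++ {
		go func(n int) {
			defer w.wg.Done()
			<-ctx.Done() // keep writing updates until shutdown begins
			fmt.Printf("%s worker %d stopped\n", name, n)
		}(i)
	}
	return w
}

func main() {
	// Main context: canceled when the coordinator is closed.
	ctx, cancel := context.WithCancel(context.Background())

	// The deleter gets its own context that is NOT derived from ctx, so it
	// keeps running (and keeps the coordinator row alive) until we decide
	// the writers are finished.
	deleterCtx, deleterCancel := context.WithCancel(context.Background())
	deleterDone := make(chan struct{})
	go func() {
		defer close(deleterDone)
		<-deleterCtx.Done()
		fmt.Println("deleter stopped: coordinator row deleted last")
	}()

	binder := newWriter(ctx, "binder", 2)
	tunneler := newWriter(ctx, "tunneler", 2)

	go func() {
		// Only after every writer goroutine has exited is it safe to cancel
		// the deleter, whose shutdown removes the coordinator row.
		binder.wg.Wait()
		tunneler.wg.Wait()
		deleterCancel()
	}()

	cancel()      // begin shutdown
	<-deleterDone // the deleter is guaranteed to stop after all writers
}
```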

File tree

1 file changed: enterprise/tailnet/pgcoord.go (+25, -1)
@@ -129,6 +129,10 @@ func newPGCoordInternal(
 	// signals when first heartbeat has been sent, so it's safe to start binding.
 	fHB := make(chan struct{})
 
+	// we need to arrange for the querier to stop _after_ the tunneler and binder, since we delete
+	// the coordinator when the querier stops (via the heartbeats). If the tunneler and binder are
+	// still running, they could run afoul of foreign key constraints.
+	querierCtx, querierCancel := context.WithCancel(dbauthz.As(context.Background(), pgCoordSubject))
 	c := &pgCoord{
 		ctx:    ctx,
 		cancel: cancel,
@@ -142,9 +146,17 @@
 		tunneler:   newTunneler(ctx, logger, id, store, sCh, fHB),
 		tunnelerCh: sCh,
 		id:         id,
-		querier:    newQuerier(ctx, logger, id, ps, store, id, cCh, ccCh, numQuerierWorkers, fHB),
+		querier:    newQuerier(querierCtx, logger, id, ps, store, id, cCh, ccCh, numQuerierWorkers, fHB),
 		closed:     make(chan struct{}),
 	}
+	go func() {
+		// when the main context is canceled, or the coordinator closed, the binder and tunneler
+		// always eventually stop. Once they stop it's safe to cancel the querier context, which
+		// has the effect of deleting the coordinator from the database and ceasing heartbeats.
+		c.binder.workerWG.Wait()
+		c.tunneler.workerWG.Wait()
+		querierCancel()
+	}()
 	logger.Info(ctx, "starting coordinator")
 	return c, nil
 }
@@ -255,6 +267,8 @@ type tunneler struct {
 	mu     sync.Mutex
 	latest map[uuid.UUID]map[uuid.UUID]tunnel
 	workQ  *workQ[tKey]
+
+	workerWG sync.WaitGroup
 }
 
 func newTunneler(ctx context.Context,
@@ -274,6 +288,9 @@ func newTunneler(ctx context.Context,
 		workQ: newWorkQ[tKey](ctx),
 	}
 	go s.handle()
+	// add to the waitgroup immediately to avoid any races waiting for it before
+	// the workers start.
+	s.workerWG.Add(numTunnelerWorkers)
 	go func() {
 		<-startWorkers
 		for i := 0; i < numTunnelerWorkers; i++ {
@@ -297,6 +314,7 @@ func (t *tunneler) handle() {
 }
 
 func (t *tunneler) worker() {
+	defer t.workerWG.Done()
 	eb := backoff.NewExponentialBackOff()
 	eb.MaxElapsedTime = 0 // retry indefinitely
 	eb.MaxInterval = dbMaxBackoff
@@ -435,6 +453,8 @@ type binder struct {
 	mu     sync.Mutex
 	latest map[bKey]binding
 	workQ  *workQ[bKey]
+
+	workerWG sync.WaitGroup
 }
 
 func newBinder(ctx context.Context,
@@ -454,6 +474,9 @@ func newBinder(ctx context.Context,
 		workQ: newWorkQ[bKey](ctx),
 	}
 	go b.handleBindings()
+	// add to the waitgroup immediately to avoid any races waiting for it before
+	// the workers start.
+	b.workerWG.Add(numBinderWorkers)
 	go func() {
 		<-startWorkers
 		for i := 0; i < numBinderWorkers; i++ {
@@ -477,6 +500,7 @@ func (b *binder) handleBindings() {
 }
 
 func (b *binder) worker() {
+	defer b.workerWG.Done()
 	eb := backoff.NewExponentialBackOff()
 	eb.MaxElapsedTime = 0 // retry indefinitely
 	eb.MaxInterval = dbMaxBackoff
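A note on the `workerWG.Add(...)` placement above: the counter is incremented before the goroutine that waits on `startWorkers` launches anything, so a `Wait` racing with startup cannot observe a zero counter and return early. A rough sketch of that shape, using placeholder names (`pool`, `newPool`) rather than the real pgcoord constructors:

```go
package main

import (
	"context"
	"sync"
)

type pool struct {
	workerWG sync.WaitGroup
}

// newPool mirrors the shape of newTunneler/newBinder: workers only start once
// startWorkers is closed, but they are registered with the WaitGroup first.
func newPool(ctx context.Context, startWorkers <-chan struct{}, n int) *pool {
	p := &pool{}
	// If this Add ran inside the goroutine below (after <-startWorkers), a
	// caller invoking p.workerWG.Wait() during startup could see a zero
	// counter and return before any worker existed.
	p.workerWG.Add(n)
	go func() {
		<-startWorkers
		for i := 0; i < n; i++ {
			go func() {
				defer p.workerWG.Done()
				<-ctx.Done() // placeholder for the real worker loop
			}()
		}
	}()
	return p
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	start := make(chan struct{})

	p := newPool(ctx, start, 3)

	close(start) // allow workers to start
	cancel()     // stop the workers
	p.workerWG.Wait()
}
```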
