Skip to content

Commit e90a010

Browse files
Switchover must wait for the inner goroutine before it returns. (zalando#343)
* Switchover must wait for the inner goroutine before it returns. Otherwise, two corner cases may happen: - waitForPodLabel writes to the podLabelErr channel that has been already closed by the outer routine - the outer routine exists and the caller subscribes to the pod the inner goroutine has already subscribed to, resulting in panic. The previous commit zalando@fe47f9e that touched that code added the cancellation channel, but didn't bother to actually wait for the goroutine to be cancelled. Per report and review from @valer-cara. Original issue: zalando#342
1 parent b7b950e commit e90a010

File tree

1 file changed

+25
-14
lines changed

1 file changed

+25
-14
lines changed

pkg/cluster/cluster.go

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -867,41 +867,52 @@ func (c *Cluster) GetStatus() *spec.ClusterStatus {
867867
}
868868

869869
// Switchover does a switchover (via Patroni) to a candidate pod
870-
func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) error {
870+
func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) (err error) {
871+
871872
c.logger.Debugf("failing over from %q to %q", curMaster.Name, candidate)
872873

874+
var wg sync.WaitGroup
875+
873876
podLabelErr := make(chan error)
874877
stopCh := make(chan struct{})
875-
defer close(podLabelErr)
878+
879+
wg.Add(1)
876880

877881
go func() {
882+
defer wg.Done()
878883
ch := c.registerPodSubscriber(candidate)
879884
defer c.unregisterPodSubscriber(candidate)
880885

881886
role := Master
882887

883888
select {
884889
case <-stopCh:
885-
case podLabelErr <- func() error {
886-
_, err := c.waitForPodLabel(ch, stopCh, &role)
887-
return err
890+
case podLabelErr <- func() (err error) {
891+
_, err = c.waitForPodLabel(ch, stopCh, &role)
892+
return
888893
}():
889894
}
890895
}()
891896

892-
if err := c.patroni.Switchover(curMaster, candidate.Name); err != nil {
893-
close(stopCh)
894-
return fmt.Errorf("could not failover: %v", err)
897+
if err = c.patroni.Switchover(curMaster, candidate.Name); err == nil {
898+
c.logger.Debugf("successfully failed over from %q to %q", curMaster.Name, candidate)
899+
if err = <-podLabelErr; err != nil {
900+
err = fmt.Errorf("could not get master pod label: %v", err)
901+
}
902+
} else {
903+
err = fmt.Errorf("could not failover: %v", err)
895904
}
896-
c.logger.Debugf("successfully failed over from %q to %q", curMaster.Name, candidate)
897905

898-
defer close(stopCh)
906+
// signal the role label waiting goroutine to close the shop and go home
907+
close(stopCh)
908+
// wait until the goroutine terminates, since unregisterPodSubscriber
909+
// must be called before the outer return; otherwsise we risk subscribing to the same pod twice.
910+
wg.Wait()
911+
// close the label waiting channel no sooner than the waiting goroutine terminates.
912+
close(podLabelErr)
899913

900-
if err := <-podLabelErr; err != nil {
901-
return fmt.Errorf("could not get master pod label: %v", err)
902-
}
914+
return
903915

904-
return nil
905916
}
906917

907918
// Lock locks the cluster

0 commit comments

Comments
 (0)