Skip to content

Commit 6f3d8d5

Browse files
author
Amit Kapila
committed
Fix the intermittent buildfarm failures in 040_standby_failover_slots_sync.
It is possible that even if the primary waits for the subscriber to catch up and then disables the subscription, an XLOG_RUNNING_XACTS record gets inserted by bgwriter between the two steps and the walsender processes it. This can move the restart_lsn of the corresponding slot in an unpredictable way, which further leads to slot sync failure. To ensure predictable behaviour, we drop the subscription and manually create the slot before the test. The other idea we discussed to write a predictable test is to use injection points to control the bgwriter logging XLOG_RUNNING_XACTS, but that needs more analysis. We can add a separate test using injection points. Per buildfarm. Author: Hou Zhijie. Reviewed-by: Amit Kapila, Shveta Malik. Discussion: https://postgr.es/m/CAA4eK1JD8h_XLRsK_o_Xh=5MhTzm+6d4Cb4_uPgFJ2wSQDah=g@mail.gmail.com
1 parent 8a1b31e commit 6f3d8d5

File tree

1 file changed

+24
-38
lines changed

1 file changed

+24
-38
lines changed

src/test/recovery/t/040_standby_failover_slots_sync.pl

Lines changed: 24 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -123,12 +123,14 @@
123123
"cannot sync slots on a non-standby server");
124124

125125
##################################################
126-
# Test logical failover slots on the standby
126+
# Test logical failover slots corresponding to different plugins can be
127+
# synced to the standby.
128+
#
127129
# Configure standby1 to replicate and synchronize logical slots configured
128130
# for failover on the primary
129131
#
130-
# failover slot lsub1_slot ->| ----> subscriber1 (connected via logical replication)
131-
# failover slot lsub2_slot | inactive
132+
# failover slot lsub1_slot | output_plugin: pgoutput
133+
# failover slot lsub2_slot | output_plugin: test_decoding
132134
# primary ---> |
133135
# physical slot sb1_slot --->| ----> standby1 (connected via streaming replication)
134136
# | lsub1_slot, lsub2_slot (synced_slot)
@@ -159,6 +161,16 @@
159161
$primary->append_conf('postgresql.conf', "log_min_messages = 'debug2'");
160162
$primary->reload;
161163

164+
# Drop the subscription to prevent further advancement of the restart_lsn for
165+
# the lsub1_slot.
166+
$subscriber1->safe_psql('postgres', "DROP SUBSCRIPTION regress_mysub1;");
167+
168+
# To ensure that restart_lsn has moved to a recent WAL position, we re-create
169+
# the lsub1_slot.
170+
$primary->psql('postgres',
171+
q{SELECT pg_create_logical_replication_slot('lsub1_slot', 'pgoutput', false, false, true);}
172+
);
173+
162174
$primary->psql('postgres',
163175
q{SELECT pg_create_logical_replication_slot('lsub2_slot', 'test_decoding', false, false, true);}
164176
);
@@ -169,25 +181,13 @@
169181
# Start the standby so that slot syncing can begin
170182
$standby1->start;
171183

172-
$primary->wait_for_catchup('regress_mysub1');
173-
174-
# Do not allow any further advancement of the restart_lsn for the lsub1_slot.
175-
$subscriber1->safe_psql('postgres',
176-
"ALTER SUBSCRIPTION regress_mysub1 DISABLE");
177-
178-
# Wait for the replication slot to become inactive on the publisher
179-
$primary->poll_query_until(
180-
'postgres',
181-
"SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE slot_name = 'lsub1_slot' AND active = 'f'",
182-
1);
183-
184184
# Capture the inactive_since of the slot from the primary. Note that the slot
185-
# will be inactive since the corresponding subscription is disabled.
185+
# will be inactive since the corresponding subscription was dropped.
186186
my $inactive_since_on_primary =
187187
$primary->validate_slot_inactive_since('lsub1_slot', $slot_creation_time_on_primary);
188188

189189
# Wait for the standby to catch up so that the standby is not lagging behind
190-
# the subscriber.
190+
# the failover slots.
191191
$primary->wait_for_replay_catchup($standby1);
192192

193193
# Synchronize the primary server slots to the standby.
@@ -262,39 +262,26 @@
262262
$standby1->reload;
263263

264264
# Capture the time before the logical failover slot is created on the primary.
265-
# Note that the subscription creates the slot again on the primary.
266265
$slot_creation_time_on_primary = $publisher->safe_psql(
267266
'postgres', qq[
268267
SELECT current_timestamp;
269268
]);
270269

271270
# To ensure that restart_lsn has moved to a recent WAL position, we re-create
272-
# the subscription and the logical slot.
273-
$subscriber1->safe_psql(
271+
# the lsub1_slot.
272+
$primary->safe_psql(
274273
'postgres', qq[
275-
DROP SUBSCRIPTION regress_mysub1;
276-
CREATE SUBSCRIPTION regress_mysub1 CONNECTION '$publisher_connstr' PUBLICATION regress_mypub WITH (slot_name = lsub1_slot, copy_data = false, failover = true);
274+
SELECT pg_drop_replication_slot('lsub1_slot');
275+
SELECT pg_create_logical_replication_slot('lsub1_slot', 'pgoutput', false, false, true);
277276
]);
278277

279-
$primary->wait_for_catchup('regress_mysub1');
280-
281-
# Do not allow any further advancement of the restart_lsn for the lsub1_slot.
282-
$subscriber1->safe_psql('postgres',
283-
"ALTER SUBSCRIPTION regress_mysub1 DISABLE");
284-
285-
# Wait for the replication slot to become inactive on the publisher
286-
$primary->poll_query_until(
287-
'postgres',
288-
"SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE slot_name = 'lsub1_slot' AND active = 'f'",
289-
1);
290-
291278
# Capture the inactive_since of the slot from the primary. Note that the slot
292-
# will be inactive since the corresponding subscription is disabled.
279+
# will be inactive since the corresponding subscription was dropped.
293280
$inactive_since_on_primary =
294281
$primary->validate_slot_inactive_since('lsub1_slot', $slot_creation_time_on_primary);
295282

296283
# Wait for the standby to catch up so that the standby is not lagging behind
297-
# the subscriber.
284+
# the failover slots.
298285
$primary->wait_for_replay_catchup($standby1);
299286

300287
my $log_offset = -s $standby1->logfile;
@@ -571,8 +558,7 @@
571558
$subscriber1->safe_psql(
572559
'postgres', qq[
573560
CREATE TABLE tab_int (a int PRIMARY KEY);
574-
ALTER SUBSCRIPTION regress_mysub1 ENABLE;
575-
ALTER SUBSCRIPTION regress_mysub1 REFRESH PUBLICATION;
561+
CREATE SUBSCRIPTION regress_mysub1 CONNECTION '$publisher_connstr' PUBLICATION regress_mypub WITH (slot_name = lsub1_slot, failover = true, create_slot = false);
576562
]);
577563

578564
$subscriber1->wait_for_subscription_sync;

0 commit comments

Comments
 (0)