Skip to content

Commit 0e16281

Browse files
committed
Fix test race between primary XLOG_RUNNING_XACTS and standby logical slot.
Before the previous commit, the test could hang until LOG_SNAPSHOT_INTERVAL_MS (15s), until checkpoint_timeout (300s), or indefinitely. An indefinite hang was awfully improbable. It entailed the test reaching checkpoint_timeout before the DecodingContextFindStartpoint() of a CREATE SUBSCRIPTION, yet after the preceding WAL record. Back-patch to v16, which introduced the test. Bertrand Drouvot, reported by Noah Misch. Discussion: https://postgr.es/m/20240211010227.a2.nmisch@google.com
1 parent 4791f87 commit 0e16281

File tree

2 files changed

+34
-16
lines changed

2 files changed

+34
-16
lines changed

src/test/perl/PostgreSQL/Test/Cluster.pm

+32-14
Original file line numberDiff line numberDiff line change
@@ -3181,6 +3181,36 @@ $SIG{TERM} = $SIG{INT} = sub {
31813181

31823182
=pod
31833183
3184+
=item $node->log_standby_snapshot(self, standby, slot_name)
3185+
3186+
Log a standby snapshot on the primary once the slot's restart_lsn is determined on
3187+
the standby.
3188+
3189+
=cut
3190+
3191+
sub log_standby_snapshot
3192+
{
3193+
my ($self, $standby, $slot_name) = @_;
3194+
3195+
# Once the slot's restart_lsn is determined, the standby looks for
3196+
# xl_running_xacts WAL record from the restart_lsn onwards. First wait
3197+
# until the slot restart_lsn is determined.
3198+
3199+
$standby->poll_query_until(
3200+
'postgres', qq[
3201+
SELECT restart_lsn IS NOT NULL
3202+
FROM pg_catalog.pg_replication_slots WHERE slot_name = '$slot_name'
3203+
])
3204+
or die
3205+
"timed out waiting for logical slot to calculate its restart_lsn";
3206+
3207+
# Then arrange for the xl_running_xacts record for which the standby is
3208+
# waiting.
3209+
$self->safe_psql('postgres', 'SELECT pg_log_standby_snapshot()');
3210+
}
3211+
3212+
=pod
3213+
31843214
=item $node->create_logical_slot_on_standby(self, primary, slot_name, dbname)
31853215
31863216
Create logical replication slot on given standby
@@ -3206,21 +3236,9 @@ sub create_logical_slot_on_standby
32063236
'2>',
32073237
\$stderr);
32083238

3209-
# Once the slot's restart_lsn is determined, the standby looks for
3210-
# xl_running_xacts WAL record from the restart_lsn onwards. First wait
3211-
# until the slot restart_lsn is determined.
3212-
3213-
$self->poll_query_until(
3214-
'postgres', qq[
3215-
SELECT restart_lsn IS NOT NULL
3216-
FROM pg_catalog.pg_replication_slots WHERE slot_name = '$slot_name'
3217-
])
3218-
or die
3219-
"timed out waiting for logical slot to calculate its restart_lsn";
3220-
3221-
# Then arrange for the xl_running_xacts record for which pg_recvlogical is
3239+
# Arrange for the xl_running_xacts record for which pg_recvlogical is
32223240
# waiting.
3223-
$primary->safe_psql('postgres', 'SELECT pg_log_standby_snapshot()');
3241+
$primary->log_standby_snapshot($self, $slot_name);
32243242

32253243
$handle->finish();
32263244

src/test/recovery/t/035_standby_logical_decoding.pl

+2-2
Original file line numberDiff line numberDiff line change
@@ -465,8 +465,8 @@ sub wait_until_vacuum_can_remove
465465

466466
$psql_subscriber{run}->pump_nb();
467467

468-
# Speed up the subscription creation
469-
$node_primary->safe_psql('postgres', "SELECT pg_log_standby_snapshot()");
468+
# Log the standby snapshot to speed up the subscription creation
469+
$node_primary->log_standby_snapshot($node_standby, 'tap_sub');
470470

471471
# Explicitly shut down psql instance gracefully - to avoid hangs
472472
# or worse on windows

0 commit comments

Comments
 (0)