Skip to content

Commit 46d8587

Browse files
committed
Improve stability of recovery test 035_standby_logical_decoding
This commit tweaks a couple of things in 035_standby_logical_decoding to hopefully stabilize it: - Autovacuum is now disabled, as it could hold a global xmin with a transaction. - Conflicts are generated with command sequences that removed rows (on catalogs, shared or non-shared, or just plain relations) followed by a VACUUM. This was unstable because these did not check that the horizon moved between the SQL commands and the VACUUM. The logic is refactored as follows, to ensure that VACUUM removes dead rows before testing for slot invalidation on a standby (idea suggested by Andres Freund): -- Grab the current horizon. -- Launch SQL commands removing rows. -- Check that the snapshot horizon has been updated. -- Launch VACUUM on the relation whose rows have been removed by the first step. Note that there are still some issues because of standby snapshot WAL records generated by the bgwriter, but this makes the test much more stable. Per reports from buildfarm members dikkop and skink, with analysis and tests from Alexander Lakhin. While on it, fix a couple of incorrect comments. Author: Bertrand Drouvot Reviewed-by: Alexander Lakhin, Michael Paquier Discussion: https://postgr.es/m/OSZPR01MB6310ED3CEDB531BCEDBC6AF2FD479@OSZPR01MB6310.jpnprd01.prod.outlook.com Discussion: https://postgr.es/m/bf67e076-b163-9ba3-4ade-b9fc51a3a8f6@gmail.com Backpatch-through: 16
1 parent b262ad4 commit 46d8587

File tree

1 file changed

+56
-34
lines changed

1 file changed

+56
-34
lines changed

src/test/recovery/t/035_standby_logical_decoding.pl

+56-34
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,40 @@ sub check_for_invalidation
238238
) or die "Timed out waiting confl_active_logicalslot to be updated";
239239
}
240240

241+
# Launch $sql query, wait for a new snapshot that has a newer horizon and
242+
# launch a VACUUM. $vac_option is the set of options to be passed to the
243+
# VACUUM command, $sql the sql to launch before triggering the vacuum and
244+
# $to_vac the relation to vacuum.
245+
#
246+
# Note that pg_current_snapshot() is used to get the horizon. It does
247+
# not generate a Transaction/COMMIT WAL record, decreasing the risk of
248+
# seeing a xl_running_xacts that would advance an active replication slot's
249+
# catalog_xmin. Advancing the active replication slot's catalog_xmin
250+
# would break some tests that expect the active slot to conflict with
251+
# the catalog xmin horizon.
252+
sub wait_until_vacuum_can_remove
253+
{
254+
my ($vac_option, $sql, $to_vac) = @_;
255+
256+
# Get the current xid horizon,
257+
my $xid_horizon = $node_primary->safe_psql('testdb',
258+
qq[select pg_snapshot_xmin(pg_current_snapshot());]);
259+
260+
# Launch our sql.
261+
$node_primary->safe_psql('testdb', qq[$sql]);
262+
263+
# Wait until we get a newer horizon.
264+
$node_primary->poll_query_until('testdb',
265+
"SELECT (select pg_snapshot_xmin(pg_current_snapshot())::text::int - $xid_horizon) > 0"
266+
) or die "new snapshot does not have a newer horizon";
267+
268+
# Launch the vacuum command and insert into flush_wal (see CREATE
269+
# TABLE flush_wal for the reason).
270+
$node_primary->safe_psql(
271+
'testdb', qq[VACUUM $vac_option verbose $to_vac;
272+
INSERT INTO flush_wal DEFAULT VALUES;]);
273+
}
274+
241275
########################
242276
# Initialize primary node
243277
########################
@@ -248,6 +282,7 @@ sub check_for_invalidation
248282
wal_level = 'logical'
249283
max_replication_slots = 4
250284
max_wal_senders = 4
285+
autovacuum = off
251286
});
252287
$node_primary->dump_info;
253288
$node_primary->start;
@@ -468,13 +503,9 @@ sub check_for_invalidation
468503
0, 1);
469504

470505
# This should trigger the conflict
471-
$node_primary->safe_psql(
472-
'testdb', qq[
473-
CREATE TABLE conflict_test(x integer, y text);
474-
DROP TABLE conflict_test;
475-
VACUUM full pg_class;
476-
INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
477-
]);
506+
wait_until_vacuum_can_remove(
507+
'full', 'CREATE TABLE conflict_test(x integer, y text);
508+
DROP TABLE conflict_test;', 'pg_class');
478509

479510
$node_primary->wait_for_replay_catchup($node_standby);
480511

@@ -545,18 +576,15 @@ sub check_for_invalidation
545576
my $logstart = -s $node_standby->logfile;
546577

547578
# One way to produce recovery conflict is to create/drop a relation and
548-
# launch a vacuum on pg_class with hot_standby_feedback turned off on the standby.
579+
# launch a vacuum on pg_class with hot_standby_feedback turned off on the
580+
# standby.
549581
reactive_slots_change_hfs_and_wait_for_xmins('vacuum_full_', 'row_removal_',
550582
0, 1);
551583

552584
# This should trigger the conflict
553-
$node_primary->safe_psql(
554-
'testdb', qq[
555-
CREATE TABLE conflict_test(x integer, y text);
556-
DROP TABLE conflict_test;
557-
VACUUM pg_class;
558-
INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
559-
]);
585+
wait_until_vacuum_can_remove(
586+
'', 'CREATE TABLE conflict_test(x integer, y text);
587+
DROP TABLE conflict_test;', 'pg_class');
560588

561589
$node_primary->wait_for_replay_catchup($node_standby);
562590

@@ -582,19 +610,16 @@ sub check_for_invalidation
582610
# get the position to search from in the standby logfile
583611
$logstart = -s $node_standby->logfile;
584612

585-
# One way to produce recovery conflict is to create/drop a relation and
586-
# launch a vacuum on pg_class with hot_standby_feedback turned off on the standby.
613+
# One way to produce recovery conflict on a shared catalog table is to
614+
# create/drop a role and launch a vacuum on pg_authid with
615+
# hot_standby_feedback turned off on the standby.
587616
reactive_slots_change_hfs_and_wait_for_xmins('row_removal_',
588617
'shared_row_removal_', 0, 1);
589618

590619
# Trigger the conflict
591-
$node_primary->safe_psql(
592-
'testdb', qq[
593-
CREATE ROLE create_trash;
594-
DROP ROLE create_trash;
595-
VACUUM pg_authid;
596-
INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
597-
]);
620+
wait_until_vacuum_can_remove(
621+
'', 'CREATE ROLE create_trash;
622+
DROP ROLE create_trash;', 'pg_authid');
598623

599624
$node_primary->wait_for_replay_catchup($node_standby);
600625

@@ -625,14 +650,11 @@ sub check_for_invalidation
625650
'no_conflict_', 0, 1);
626651

627652
# This should not trigger a conflict
628-
$node_primary->safe_psql(
629-
'testdb', qq[
630-
CREATE TABLE conflict_test(x integer, y text);
631-
INSERT INTO conflict_test(x,y) SELECT s, s::text FROM generate_series(1,4) s;
632-
UPDATE conflict_test set x=1, y=1;
633-
VACUUM conflict_test;
634-
INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
635-
]);
653+
wait_until_vacuum_can_remove(
654+
'', 'CREATE TABLE conflict_test(x integer, y text);
655+
INSERT INTO conflict_test(x,y) SELECT s, s::text FROM generate_series(1,4) s;
656+
UPDATE conflict_test set x=1, y=1;', 'conflict_test');
657+
636658
$node_primary->wait_for_replay_catchup($node_standby);
637659

638660
# message should not be issued
@@ -671,7 +693,7 @@ sub check_for_invalidation
671693

672694
##################################################
673695
# Recovery conflict: Invalidate conflicting slots, including in-use slots
674-
# Scenario 4: conflict due to on-access pruning.
696+
# Scenario 5: conflict due to on-access pruning.
675697
##################################################
676698

677699
# get the position to search from in the standby logfile
@@ -711,7 +733,7 @@ sub check_for_invalidation
711733

712734
##################################################
713735
# Recovery conflict: Invalidate conflicting slots, including in-use slots
714-
# Scenario 5: incorrect wal_level on primary.
736+
# Scenario 6: incorrect wal_level on primary.
715737
##################################################
716738

717739
# get the position to search from in the standby logfile

0 commit comments

Comments
 (0)