Skip to content

Commit 42bb4fb

Browse files
committed
Fix test races between syscache-update-pruned.spec and autovacuum.
This spec fails ~3% of my Valgrind runs, and the spec has failed on Valgrind buildfarm member skink at a similar rate. Two problems contributed to that: - A competing buffer pin triggered VACUUM's lazy_scan_noprune() path, causing "tuples missed: 1 dead from 1 pages not removed due to cleanup lock contention". FREEZE fixes that. - The spec ran lazy VACUUM immediately after VACUUM FULL. The spec implicitly assumed lazy VACUUM prunes the one tuple that VACUUM FULL made dead. First wait for old snapshots, making that assumption reliable. This also adds two forms of defense in depth: - Wait for snapshots using shared catalog pruning rules (VISHORIZON_SHARED). This avoids the removable cutoff moving backward when an XID-bearing autoanalyze process runs in another database. That may never happen in this test, but it's cheap insurance. - Use lazy VACUUM option DISABLE_PAGE_SKIPPING. Commit c2dc1a7 did this for a related requirement in other tests, but I suspect FREEZE is necessary and sufficient in all these tests. Back-patch to v17, where the test first appeared. Reported-by: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/sv3taq4e6ea4qckimien3nxp3sz4b6cw6sfcy4nhwl52zpur4g@h6i6tohxmizu Backpatch-through: 17
1 parent 17a165d commit 42bb4fb

File tree

4 files changed

+41
-16
lines changed

4 files changed

+41
-16
lines changed

src/test/modules/injection_points/expected/syscache-update-pruned.out

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ step at2:
77
FOR EACH ROW EXECUTE PROCEDURE suppress_redundant_updates_trigger();
88
<waiting ...>
99
step waitprunable4: CALL vactest.wait_prunable();
10-
step vac4: VACUUM pg_class;
10+
step vac4: VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) pg_class;
1111
step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; <waiting ...>
1212
step wakeinval4:
1313
SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo');
@@ -30,7 +30,7 @@ step at2:
3030
FOR EACH ROW EXECUTE PROCEDURE suppress_redundant_updates_trigger();
3131
<waiting ...>
3232
step waitprunable4: CALL vactest.wait_prunable();
33-
step vac4: VACUUM pg_class;
33+
step vac4: VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) pg_class;
3434
step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; <waiting ...>
3535
step wakeinval4:
3636
SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo');
@@ -61,7 +61,7 @@ step mkrels4:
6161

6262
step r3: ROLLBACK;
6363
step waitprunable4: CALL vactest.wait_prunable();
64-
step vac4: VACUUM pg_class;
64+
step vac4: VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) pg_class;
6565
step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; <waiting ...>
6666
step wakeinval4:
6767
SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo');

src/test/modules/injection_points/expected/syscache-update-pruned_1.out

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ step at2:
77
FOR EACH ROW EXECUTE PROCEDURE suppress_redundant_updates_trigger();
88
<waiting ...>
99
step waitprunable4: CALL vactest.wait_prunable();
10-
step vac4: VACUUM pg_class;
10+
step vac4: VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) pg_class;
1111
step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; <waiting ...>
1212
step wakeinval4:
1313
SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo');
@@ -29,7 +29,7 @@ step at2:
2929
FOR EACH ROW EXECUTE PROCEDURE suppress_redundant_updates_trigger();
3030
<waiting ...>
3131
step waitprunable4: CALL vactest.wait_prunable();
32-
step vac4: VACUUM pg_class;
32+
step vac4: VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) pg_class;
3333
step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; <waiting ...>
3434
step wakeinval4:
3535
SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo');
@@ -59,7 +59,7 @@ step mkrels4:
5959

6060
step r3: ROLLBACK;
6161
step waitprunable4: CALL vactest.wait_prunable();
62-
step vac4: VACUUM pg_class;
62+
step vac4: VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) pg_class;
6363
step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; <waiting ...>
6464
step wakeinval4:
6565
SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo');

src/test/modules/injection_points/regress_injection.c

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717
#include "access/table.h"
1818
#include "fmgr.h"
1919
#include "miscadmin.h"
20+
#include "postmaster/autovacuum.h"
2021
#include "storage/procarray.h"
22+
#include "utils/rel.h"
2123
#include "utils/xid8.h"
2224

2325
/*
@@ -28,11 +30,12 @@
2830
* that. For the causes of backward movement, see
2931
* postgr.es/m/CAEze2Wj%2BV0kTx86xB_YbyaqTr5hnE_igdWAwuhSyjXBYscf5-Q%40mail.gmail.com
3032
* and the header comment for ComputeXidHorizons(). One can assume this
31-
* doesn't move backward if one arranges for concurrent activity not to reach
32-
* AbortTransaction() and not to allocate an XID while connected to another
33-
* database. Non-runningcheck tests can control most concurrent activity,
34-
* except autovacuum and the isolationtester control connection. Neither
35-
* allocates XIDs, and AbortTransaction() in those would justify test failure.
33+
* doesn't move backward if one (a) passes a shared catalog as the argument
34+
* and (b) arranges for concurrent activity not to reach AbortTransaction().
35+
* Non-runningcheck tests can control most concurrent activity, except
36+
* autovacuum and the isolationtester control connection. AbortTransaction()
37+
* in those would justify test failure. Seeing autoanalyze can allocate an
38+
* XID in any database, (a) ensures we'll consistently not ignore those XIDs.
3639
*/
3740
PG_FUNCTION_INFO_V1(removable_cutoff);
3841
Datum
@@ -47,6 +50,10 @@ removable_cutoff(PG_FUNCTION_ARGS)
4750
if (!PG_ARGISNULL(0))
4851
rel = table_open(PG_GETARG_OID(0), AccessShareLock);
4952

53+
if (!rel->rd_rel->relisshared && autovacuum_start_daemon)
54+
elog(WARNING,
55+
"removable_cutoff(non-shared-rel) can move backward under autovacuum=on");
56+
5057
/*
5158
* No lock or snapshot necessarily prevents oldestXid from advancing past
5259
* "xid" while this function runs. That concerns us only in that we must

src/test/modules/injection_points/specs/syscache-update-pruned.spec

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,20 +53,37 @@ setup
5353
barrier := pg_current_xact_id();
5454
-- autovacuum worker RelationCacheInitializePhase3() or the
5555
-- isolationtester control connection might hold a snapshot that
56-
-- limits pruning. Sleep until that clears.
56+
-- limits pruning. Sleep until that clears. See comments at
57+
-- removable_cutoff() for why we pass a shared catalog rather than
58+
-- pg_class, the table we'll prune.
5759
LOOP
5860
ROLLBACK; -- release MyProc->xmin, which could be the oldest
59-
cutoff := removable_cutoff('pg_class');
61+
cutoff := removable_cutoff('pg_database');
6062
EXIT WHEN cutoff >= barrier;
6163
RAISE LOG 'removable cutoff %; waiting for %', cutoff, barrier;
6264
PERFORM pg_sleep(.1);
6365
END LOOP;
6466
END
6567
$$;
6668
}
67-
setup { CALL vactest.wait_prunable(); -- maximize next two VACUUMs }
69+
# Eliminate HEAPTUPLE_DEAD and HEAPTUPLE_RECENTLY_DEAD from pg_class.
70+
# Minimize free space.
71+
#
72+
# If we kept HEAPTUPLE_RECENTLY_DEAD, step vac4 could prune what we missed,
73+
# breaking some permutation assumptions. Specifically, the next pg_class
74+
# tuple could end up in free space we failed to liberate here, instead of
75+
# going in the specific free space vac4 intended to liberate for it.
76+
setup { CALL vactest.wait_prunable(); -- maximize VACUUM FULL }
6877
setup { VACUUM FULL pg_class; -- reduce free space }
69-
setup { VACUUM FREEZE pg_class; -- populate fsm etc. }
78+
# Remove the one tuple that VACUUM FULL makes dead, a tuple pertaining to
79+
# pg_class itself. Populate the FSM for pg_class.
80+
#
81+
# wait_prunable waits for snapshots that would thwart pruning, while FREEZE
82+
# waits for buffer pins that would thwart pruning. DISABLE_PAGE_SKIPPING
83+
# isn't actually needed, but other pruning-dependent tests use it. If those
84+
# tests remove it, remove it here.
85+
setup { CALL vactest.wait_prunable(); -- maximize lazy VACUUM }
86+
setup { VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) pg_class; -- fill fsm etc. }
7087
setup
7188
{
7289
SELECT FROM vactest.mkrels('orig', 1, 49);
@@ -115,7 +132,8 @@ step r3 { ROLLBACK; }
115132
# Non-blocking actions.
116133
session s4
117134
step waitprunable4 { CALL vactest.wait_prunable(); }
118-
step vac4 { VACUUM pg_class; }
135+
# Eliminate HEAPTUPLE_DEAD. See above discussion of FREEZE.
136+
step vac4 { VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) pg_class; }
119137
# Reuse the lp that s1 is waiting to change. I've observed reuse at the 1st
120138
# or 18th CREATE, so create excess.
121139
step mkrels4 {

0 commit comments

Comments
 (0)