Skip to content

Commit fdd965d

Browse files
committed
Avoid race in RelationBuildDesc() affecting CREATE INDEX CONCURRENTLY.
CIC and REINDEX CONCURRENTLY assume backends see their catalog changes no later than each backend's next transaction start. That failed to hold when a backend absorbed a relevant invalidation in the middle of running RelationBuildDesc() on the CIC index. Queries that use the resulting index can silently fail to find rows. Fix this for future index builds by making RelationBuildDesc() loop until it finishes without accepting a relevant invalidation. It may be necessary to reindex to recover from past occurrences; REINDEX CONCURRENTLY suffices. Back-patch to 9.6 (all supported versions). Noah Misch and Andrey Borodin, reviewed (in earlier versions) by Andres Freund. Discussion: https://postgr.es/m/20210730022548.GA1940096@gust.leadboat.com
1 parent 1e94756 commit fdd965d

File tree

8 files changed

+368
-93
lines changed

8 files changed

+368
-93
lines changed

contrib/amcheck/t/002_cic.pl

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
2+
# Copyright (c) 2021, PostgreSQL Global Development Group
3+
4+
# Test CREATE INDEX CONCURRENTLY with concurrent modifications
5+
use strict;
6+
use warnings;
7+
8+
use Config;
9+
use PostgresNode;
10+
use TestLib;
11+
12+
use Test::More tests => 4;
13+
14+
my ($node, $result);
15+
16+
#
17+
# Test set-up
18+
#
19+
$node = PostgresNode->new('CIC_test');
20+
$node->init;
21+
$node->append_conf('postgresql.conf', 'lock_timeout = 180000');
22+
$node->start;
23+
$node->safe_psql('postgres', q(CREATE EXTENSION amcheck));
24+
$node->safe_psql('postgres', q(CREATE TABLE tbl(i int)));
25+
$node->safe_psql('postgres', q(CREATE INDEX idx ON tbl(i)));
26+
27+
#
28+
# Stress CIC with pgbench
29+
#
30+
31+
# Run background pgbench with CIC. We cannot mix-in this script into single
32+
# pgbench: CIC will deadlock with itself occasionally.
33+
my $pgbench_out = '';
34+
my $pgbench_timer = IPC::Run::timeout(180);
35+
my $pgbench_h = $node->background_pgbench(
36+
'--no-vacuum --client=1 --transactions=200',
37+
{
38+
'002_pgbench_concurrent_cic' => q(
39+
DROP INDEX CONCURRENTLY idx;
40+
CREATE INDEX CONCURRENTLY idx ON tbl(i);
41+
SELECT bt_index_check('idx',true);
42+
)
43+
},
44+
\$pgbench_out,
45+
$pgbench_timer);
46+
47+
# Run pgbench.
48+
$node->pgbench(
49+
'--no-vacuum --client=5 --transactions=200',
50+
0,
51+
[qr{actually processed}],
52+
[qr{^$}],
53+
'concurrent INSERTs',
54+
{
55+
'002_pgbench_concurrent_transaction' => q(
56+
BEGIN;
57+
INSERT INTO tbl VALUES(0);
58+
COMMIT;
59+
),
60+
'002_pgbench_concurrent_transaction_savepoints' => q(
61+
BEGIN;
62+
SAVEPOINT s1;
63+
INSERT INTO tbl VALUES(0);
64+
COMMIT;
65+
)
66+
});
67+
68+
$pgbench_h->pump_nb;
69+
$pgbench_h->finish();
70+
$result =
71+
($Config{osname} eq "MSWin32")
72+
? ($pgbench_h->full_results)[0]
73+
: $pgbench_h->result(0);
74+
is($result, 0, "pgbench with CIC works");
75+
76+
# done
77+
$node->stop;
78+
done_testing();

src/backend/utils/cache/inval.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -628,7 +628,7 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
628628
int i;
629629

630630
if (msg->rc.relId == InvalidOid)
631-
RelationCacheInvalidate();
631+
RelationCacheInvalidate(false);
632632
else
633633
RelationCacheInvalidateEntry(msg->rc.relId);
634634

@@ -685,12 +685,18 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
685685
*/
686686
void
687687
InvalidateSystemCaches(void)
688+
{
689+
InvalidateSystemCachesExtended(false);
690+
}
691+
692+
void
693+
InvalidateSystemCachesExtended(bool debug_discard)
688694
{
689695
int i;
690696

691697
InvalidateCatalogSnapshot();
692698
ResetCatalogCaches();
693-
RelationCacheInvalidate(); /* gets smgr and relmap too */
699+
RelationCacheInvalidate(debug_discard); /* gets smgr and relmap too */
694700

695701
for (i = 0; i < syscache_callback_count; i++)
696702
{
@@ -759,7 +765,7 @@ AcceptInvalidationMessages(void)
759765
if (recursion_depth < debug_discard_caches)
760766
{
761767
recursion_depth++;
762-
InvalidateSystemCaches();
768+
InvalidateSystemCachesExtended(true);
763769
recursion_depth--;
764770
}
765771
}

src/backend/utils/cache/relcache.c

Lines changed: 109 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,24 @@ bool criticalSharedRelcachesBuilt = false;
150150
*/
151151
static long relcacheInvalsReceived = 0L;
152152

153+
/*
154+
* in_progress_list is a stack of ongoing RelationBuildDesc() calls. CREATE
155+
* INDEX CONCURRENTLY makes catalog changes under ShareUpdateExclusiveLock.
156+
* It critically relies on each backend absorbing those changes no later than
157+
* next transaction start. Hence, RelationBuildDesc() loops until it finishes
158+
* without accepting a relevant invalidation. (Most invalidation consumers
159+
* don't do this.)
160+
*/
161+
typedef struct inprogressent
162+
{
163+
Oid reloid; /* OID of relation being built */
164+
bool invalidated; /* whether an invalidation arrived for it */
165+
} InProgressEnt;
166+
167+
static InProgressEnt *in_progress_list;
168+
static int in_progress_list_len;
169+
static int in_progress_list_maxlen;
170+
153171
/*
154172
* eoxact_list[] stores the OIDs of relations that (might) need AtEOXact
155173
* cleanup work. This list intentionally has limited size; if it overflows,
@@ -1000,6 +1018,7 @@ equalRSDesc(RowSecurityDesc *rsdesc1, RowSecurityDesc *rsdesc2)
10001018
static Relation
10011019
RelationBuildDesc(Oid targetRelId, bool insertIt)
10021020
{
1021+
int in_progress_offset;
10031022
Relation relation;
10041023
Oid relid;
10051024
HeapTuple pg_class_tuple;
@@ -1033,6 +1052,21 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
10331052
}
10341053
#endif
10351054

1055+
/* Register to catch invalidation messages */
1056+
if (in_progress_list_len >= in_progress_list_maxlen)
1057+
{
1058+
int allocsize;
1059+
1060+
allocsize = in_progress_list_maxlen * 2;
1061+
in_progress_list = repalloc(in_progress_list,
1062+
allocsize * sizeof(*in_progress_list));
1063+
in_progress_list_maxlen = allocsize;
1064+
}
1065+
in_progress_offset = in_progress_list_len++;
1066+
in_progress_list[in_progress_offset].reloid = targetRelId;
1067+
retry:
1068+
in_progress_list[in_progress_offset].invalidated = false;
1069+
10361070
/*
10371071
* find the tuple in pg_class corresponding to the given relation id
10381072
*/
@@ -1051,6 +1085,8 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
10511085
MemoryContextDelete(tmpcxt);
10521086
}
10531087
#endif
1088+
Assert(in_progress_offset + 1 == in_progress_list_len);
1089+
in_progress_list_len--;
10541090
return NULL;
10551091
}
10561092

@@ -1213,6 +1249,21 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
12131249
*/
12141250
heap_freetuple(pg_class_tuple);
12151251

1252+
/*
1253+
* If an invalidation arrived mid-build, start over. Between here and the
1254+
* end of this function, don't add code that does or reasonably could read
1255+
* system catalogs. That range must be free from invalidation processing
1256+
* for the !insertIt case. For the insertIt case, RelationCacheInsert()
1257+
* will enroll this relation in ordinary relcache invalidation processing,
1258+
*/
1259+
if (in_progress_list[in_progress_offset].invalidated)
1260+
{
1261+
RelationDestroyRelation(relation, false);
1262+
goto retry;
1263+
}
1264+
Assert(in_progress_offset + 1 == in_progress_list_len);
1265+
in_progress_list_len--;
1266+
12161267
/*
12171268
* Insert newly created relation into relcache hash table, if requested.
12181269
*
@@ -2566,6 +2617,14 @@ RelationClearRelation(Relation relation, bool rebuild)
25662617

25672618
/* Build temporary entry, but don't link it into hashtable */
25682619
newrel = RelationBuildDesc(save_relid, false);
2620+
2621+
/*
2622+
* Between here and the end of the swap, don't add code that does or
2623+
* reasonably could read system catalogs. That range must be free
2624+
* from invalidation processing. See RelationBuildDesc() manipulation
2625+
* of in_progress_list.
2626+
*/
2627+
25692628
if (newrel == NULL)
25702629
{
25712630
/*
@@ -2805,6 +2864,14 @@ RelationCacheInvalidateEntry(Oid relationId)
28052864
relcacheInvalsReceived++;
28062865
RelationFlushRelation(relation);
28072866
}
2867+
else
2868+
{
2869+
int i;
2870+
2871+
for (i = 0; i < in_progress_list_len; i++)
2872+
if (in_progress_list[i].reloid == relationId)
2873+
in_progress_list[i].invalidated = true;
2874+
}
28082875
}
28092876

28102877
/*
@@ -2813,11 +2880,11 @@ RelationCacheInvalidateEntry(Oid relationId)
28132880
* and rebuild those with positive reference counts. Also reset the smgr
28142881
* relation cache and re-read relation mapping data.
28152882
*
2816-
* This is currently used only to recover from SI message buffer overflow,
2817-
* so we do not touch relations having new-in-transaction relfilenodes; they
2818-
* cannot be targets of cross-backend SI updates (and our own updates now go
2819-
* through a separate linked list that isn't limited by the SI message
2820-
* buffer size).
2883+
* Apart from debug_discard_caches, this is currently used only to recover
2884+
* from SI message buffer overflow, so we do not touch relations having
2885+
* new-in-transaction relfilenodes; they cannot be targets of cross-backend
2886+
* SI updates (and our own updates now go through a separate linked list
2887+
* that isn't limited by the SI message buffer size).
28212888
*
28222889
* We do this in two phases: the first pass deletes deletable items, and
28232890
* the second one rebuilds the rebuildable items. This is essential for
@@ -2835,16 +2902,22 @@ RelationCacheInvalidateEntry(Oid relationId)
28352902
* second pass processes nailed-in-cache items before other nondeletable
28362903
* items. This should ensure that system catalogs are up to date before
28372904
* we attempt to use them to reload information about other open relations.
2905+
*
2906+
* After those two phases of work having immediate effects, we normally
2907+
* signal any RelationBuildDesc() on the stack to start over. However, we
2908+
* don't do this if called as part of debug_discard_caches. Otherwise,
2909+
* RelationBuildDesc() would become an infinite loop.
28382910
*/
28392911
void
2840-
RelationCacheInvalidate(void)
2912+
RelationCacheInvalidate(bool debug_discard)
28412913
{
28422914
HASH_SEQ_STATUS status;
28432915
RelIdCacheEnt *idhentry;
28442916
Relation relation;
28452917
List *rebuildFirstList = NIL;
28462918
List *rebuildList = NIL;
28472919
ListCell *l;
2920+
int i;
28482921

28492922
/*
28502923
* Reload relation mapping data before starting to reconstruct cache.
@@ -2931,6 +3004,11 @@ RelationCacheInvalidate(void)
29313004
RelationClearRelation(relation, true);
29323005
}
29333006
list_free(rebuildList);
3007+
3008+
if (!debug_discard)
3009+
/* Any RelationBuildDesc() on the stack must start over. */
3010+
for (i = 0; i < in_progress_list_len; i++)
3011+
in_progress_list[i].invalidated = true;
29343012
}
29353013

29363014
/*
@@ -3081,6 +3159,13 @@ AtEOXact_RelationCache(bool isCommit)
30813159
RelIdCacheEnt *idhentry;
30823160
int i;
30833161

3162+
/*
3163+
* Forget in_progress_list. This is relevant when we're aborting due to
3164+
* an error during RelationBuildDesc().
3165+
*/
3166+
Assert(in_progress_list_len == 0 || !isCommit);
3167+
in_progress_list_len = 0;
3168+
30843169
/*
30853170
* Unless the eoxact_list[] overflowed, we only need to examine the rels
30863171
* listed in it. Otherwise fall back on a hash_seq_search scan.
@@ -3227,6 +3312,14 @@ AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid,
32273312
RelIdCacheEnt *idhentry;
32283313
int i;
32293314

3315+
/*
3316+
* Forget in_progress_list. This is relevant when we're aborting due to
3317+
* an error during RelationBuildDesc(). We don't commit subtransactions
3318+
* during RelationBuildDesc().
3319+
*/
3320+
Assert(in_progress_list_len == 0 || !isCommit);
3321+
in_progress_list_len = 0;
3322+
32303323
/*
32313324
* Unless the eoxact_list[] overflowed, we only need to examine the rels
32323325
* listed in it. Otherwise fall back on a hash_seq_search scan. Same
@@ -3775,6 +3868,7 @@ void
37753868
RelationCacheInitialize(void)
37763869
{
37773870
HASHCTL ctl;
3871+
int allocsize;
37783872

37793873
/*
37803874
* make sure cache memory context exists
@@ -3790,6 +3884,15 @@ RelationCacheInitialize(void)
37903884
RelationIdCache = hash_create("Relcache by OID", INITRELCACHESIZE,
37913885
&ctl, HASH_ELEM | HASH_BLOBS);
37923886

3887+
/*
3888+
* reserve enough in_progress_list slots for many cases
3889+
*/
3890+
allocsize = 4;
3891+
in_progress_list =
3892+
MemoryContextAlloc(CacheMemoryContext,
3893+
allocsize * sizeof(*in_progress_list));
3894+
in_progress_list_maxlen = allocsize;
3895+
37933896
/*
37943897
* relation mapper needs to be initialized too
37953898
*/

0 commit comments

Comments
 (0)