Skip to content

Commit db86746

Browse files
committed
Avoid race in RelationBuildDesc() affecting CREATE INDEX CONCURRENTLY.
CIC and REINDEX CONCURRENTLY assume backends see their catalog changes no later than each backend's next transaction start. That failed to hold when a backend absorbed a relevant invalidation in the middle of running RelationBuildDesc() on the CIC index. Queries that use the resulting index can silently fail to find rows. Fix this for future index builds by making RelationBuildDesc() loop until it finishes without accepting a relevant invalidation. It may be necessary to reindex to recover from past occurrences; REINDEX CONCURRENTLY suffices. Back-patch to 9.6 (all supported versions). Noah Misch and Andrey Borodin, reviewed (in earlier versions) by Andres Freund. Discussion: https://postgr.es/m/20210730022548.GA1940096@gust.leadboat.com
1 parent 438d467 commit db86746

File tree

8 files changed

+409
-5
lines changed

8 files changed

+409
-5
lines changed

src/backend/utils/cache/inval.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -584,7 +584,7 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
584584
int i;
585585

586586
if (msg->rc.relId == InvalidOid)
587-
RelationCacheInvalidate();
587+
RelationCacheInvalidate(false);
588588
else
589589
RelationCacheInvalidateEntry(msg->rc.relId);
590590

@@ -641,12 +641,18 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
641641
*/
642642
void
643643
InvalidateSystemCaches(void)
644+
{
645+
InvalidateSystemCachesExtended(false);
646+
}
647+
648+
void
649+
InvalidateSystemCachesExtended(bool debug_discard)
644650
{
645651
int i;
646652

647653
InvalidateCatalogSnapshot();
648654
ResetCatalogCaches();
649-
RelationCacheInvalidate(); /* gets smgr and relmap too */
655+
RelationCacheInvalidate(debug_discard); /* gets smgr and relmap too */
650656

651657
for (i = 0; i < syscache_callback_count; i++)
652658
{
@@ -717,7 +723,7 @@ AcceptInvalidationMessages(void)
717723
if (recursion_depth < 3)
718724
{
719725
recursion_depth++;
720-
InvalidateSystemCaches();
726+
InvalidateSystemCachesExtended(true);
721727
recursion_depth--;
722728
}
723729
}

src/backend/utils/cache/relcache.c

Lines changed: 106 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,24 @@ bool criticalSharedRelcachesBuilt = false;
142142
*/
143143
static long relcacheInvalsReceived = 0L;
144144

145+
/*
146+
* in_progress_list is a stack of ongoing RelationBuildDesc() calls. CREATE
147+
* INDEX CONCURRENTLY makes catalog changes under ShareUpdateExclusiveLock.
148+
* It critically relies on each backend absorbing those changes no later than
149+
* next transaction start. Hence, RelationBuildDesc() loops until it finishes
150+
* without accepting a relevant invalidation. (Most invalidation consumers
151+
* don't do this.)
152+
*/
153+
typedef struct inprogressent
154+
{
155+
Oid reloid; /* OID of relation being built */
156+
bool invalidated; /* whether an invalidation arrived for it */
157+
} InProgressEnt;
158+
159+
static InProgressEnt *in_progress_list;
160+
static int in_progress_list_len;
161+
static int in_progress_list_maxlen;
162+
145163
/*
146164
* eoxact_list[] stores the OIDs of relations that (might) need AtEOXact
147165
* cleanup work. This list intentionally has limited size; if it overflows,
@@ -1197,11 +1215,27 @@ equalPartitionDescs(PartitionKey key, PartitionDesc partdesc1,
11971215
static Relation
11981216
RelationBuildDesc(Oid targetRelId, bool insertIt)
11991217
{
1218+
int in_progress_offset;
12001219
Relation relation;
12011220
Oid relid;
12021221
HeapTuple pg_class_tuple;
12031222
Form_pg_class relp;
12041223

1224+
/* Register to catch invalidation messages */
1225+
if (in_progress_list_len >= in_progress_list_maxlen)
1226+
{
1227+
int allocsize;
1228+
1229+
allocsize = in_progress_list_maxlen * 2;
1230+
in_progress_list = repalloc(in_progress_list,
1231+
allocsize * sizeof(*in_progress_list));
1232+
in_progress_list_maxlen = allocsize;
1233+
}
1234+
in_progress_offset = in_progress_list_len++;
1235+
in_progress_list[in_progress_offset].reloid = targetRelId;
1236+
retry:
1237+
in_progress_list[in_progress_offset].invalidated = false;
1238+
12051239
/*
12061240
* find the tuple in pg_class corresponding to the given relation id
12071241
*/
@@ -1211,7 +1245,11 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
12111245
* if no such tuple exists, return NULL
12121246
*/
12131247
if (!HeapTupleIsValid(pg_class_tuple))
1248+
{
1249+
Assert(in_progress_offset + 1 == in_progress_list_len);
1250+
in_progress_list_len--;
12141251
return NULL;
1252+
}
12151253

12161254
/*
12171255
* get information from the pg_class_tuple
@@ -1355,6 +1393,21 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
13551393
*/
13561394
heap_freetuple(pg_class_tuple);
13571395

1396+
/*
1397+
* If an invalidation arrived mid-build, start over. Between here and the
1398+
* end of this function, don't add code that does or reasonably could read
1399+
* system catalogs. That range must be free from invalidation processing
1400+
* for the !insertIt case. For the insertIt case, RelationCacheInsert()
1401+
* will enroll this relation in ordinary relcache invalidation processing,
1402+
*/
1403+
if (in_progress_list[in_progress_offset].invalidated)
1404+
{
1405+
RelationDestroyRelation(relation, false);
1406+
goto retry;
1407+
}
1408+
Assert(in_progress_offset + 1 == in_progress_list_len);
1409+
in_progress_list_len--;
1410+
13581411
/*
13591412
* Insert newly created relation into relcache hash table, if requested.
13601413
*
@@ -2572,6 +2625,14 @@ RelationClearRelation(Relation relation, bool rebuild)
25722625

25732626
/* Build temporary entry, but don't link it into hashtable */
25742627
newrel = RelationBuildDesc(save_relid, false);
2628+
2629+
/*
2630+
* Between here and the end of the swap, don't add code that does or
2631+
* reasonably could read system catalogs. That range must be free
2632+
* from invalidation processing. See RelationBuildDesc() manipulation
2633+
* of in_progress_list.
2634+
*/
2635+
25752636
if (newrel == NULL)
25762637
{
25772638
/*
@@ -2763,6 +2824,14 @@ RelationCacheInvalidateEntry(Oid relationId)
27632824
relcacheInvalsReceived++;
27642825
RelationFlushRelation(relation);
27652826
}
2827+
else
2828+
{
2829+
int i;
2830+
2831+
for (i = 0; i < in_progress_list_len; i++)
2832+
if (in_progress_list[i].reloid == relationId)
2833+
in_progress_list[i].invalidated = true;
2834+
}
27662835
}
27672836

27682837
/*
@@ -2794,16 +2863,22 @@ RelationCacheInvalidateEntry(Oid relationId)
27942863
* second pass processes nailed-in-cache items before other nondeletable
27952864
* items. This should ensure that system catalogs are up to date before
27962865
* we attempt to use them to reload information about other open relations.
2866+
*
2867+
* After those two phases of work having immediate effects, we normally
2868+
* signal any RelationBuildDesc() on the stack to start over. However, we
2869+
* don't do this if called as part of debug_discard_caches. Otherwise,
2870+
* RelationBuildDesc() would become an infinite loop.
27972871
*/
27982872
void
2799-
RelationCacheInvalidate(void)
2873+
RelationCacheInvalidate(bool debug_discard)
28002874
{
28012875
HASH_SEQ_STATUS status;
28022876
RelIdCacheEnt *idhentry;
28032877
Relation relation;
28042878
List *rebuildFirstList = NIL;
28052879
List *rebuildList = NIL;
28062880
ListCell *l;
2881+
int i;
28072882

28082883
/*
28092884
* Reload relation mapping data before starting to reconstruct cache.
@@ -2890,6 +2965,11 @@ RelationCacheInvalidate(void)
28902965
RelationClearRelation(relation, true);
28912966
}
28922967
list_free(rebuildList);
2968+
2969+
if (!debug_discard)
2970+
/* Any RelationBuildDesc() on the stack must start over. */
2971+
for (i = 0; i < in_progress_list_len; i++)
2972+
in_progress_list[i].invalidated = true;
28932973
}
28942974

28952975
/*
@@ -2962,6 +3042,13 @@ AtEOXact_RelationCache(bool isCommit)
29623042
RelIdCacheEnt *idhentry;
29633043
int i;
29643044

3045+
/*
3046+
* Forget in_progress_list. This is relevant when we're aborting due to
3047+
* an error during RelationBuildDesc().
3048+
*/
3049+
Assert(in_progress_list_len == 0 || !isCommit);
3050+
in_progress_list_len = 0;
3051+
29653052
/*
29663053
* Unless the eoxact_list[] overflowed, we only need to examine the rels
29673054
* listed in it. Otherwise fall back on a hash_seq_search scan.
@@ -3111,6 +3198,14 @@ AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid,
31113198
RelIdCacheEnt *idhentry;
31123199
int i;
31133200

3201+
/*
3202+
* Forget in_progress_list. This is relevant when we're aborting due to
3203+
* an error during RelationBuildDesc(). We don't commit subtransactions
3204+
* during RelationBuildDesc().
3205+
*/
3206+
Assert(in_progress_list_len == 0 || !isCommit);
3207+
in_progress_list_len = 0;
3208+
31143209
/*
31153210
* Unless the eoxact_list[] overflowed, we only need to examine the rels
31163211
* listed in it. Otherwise fall back on a hash_seq_search scan. Same
@@ -3597,6 +3692,7 @@ void
35973692
RelationCacheInitialize(void)
35983693
{
35993694
HASHCTL ctl;
3695+
int allocsize;
36003696

36013697
/*
36023698
* make sure cache memory context exists
@@ -3613,6 +3709,15 @@ RelationCacheInitialize(void)
36133709
RelationIdCache = hash_create("Relcache by OID", INITRELCACHESIZE,
36143710
&ctl, HASH_ELEM | HASH_BLOBS);
36153711

3712+
/*
3713+
* reserve enough in_progress_list slots for many cases
3714+
*/
3715+
allocsize = 4;
3716+
in_progress_list =
3717+
MemoryContextAlloc(CacheMemoryContext,
3718+
allocsize * sizeof(*in_progress_list));
3719+
in_progress_list_maxlen = allocsize;
3720+
36163721
/*
36173722
* relation mapper needs to be initialized too
36183723
*/

src/bin/pgbench/t/022_cic.pl

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
2+
# Copyright (c) 2021, PostgreSQL Global Development Group
3+
4+
# Test CREATE INDEX CONCURRENTLY with concurrent modifications
5+
use strict;
6+
use warnings;
7+
8+
use Config;
9+
use PostgresNode;
10+
use TestLib;
11+
12+
use Test::More tests => 4;
13+
14+
my ($node, $result);
15+
16+
#
17+
# Test set-up
18+
#
19+
$node = get_new_node('CIC_test');
20+
$node->init;
21+
$node->append_conf('postgresql.conf', 'lock_timeout = 180000');
22+
$node->start;
23+
$node->safe_psql('postgres', q(CREATE TABLE tbl(i int)));
24+
$node->safe_psql('postgres', q(CREATE INDEX idx ON tbl(i)));
25+
$node->safe_psql(
26+
'postgres', q(
27+
CREATE FUNCTION heapallindexed() RETURNS void AS $$
28+
DECLARE
29+
count_seqscan int;
30+
count_idxscan int;
31+
BEGIN
32+
count_seqscan := (SELECT count(*) FROM tbl);
33+
SET enable_seqscan = off;
34+
count_idxscan := (SELECT count(*) FROM tbl);
35+
RESET enable_seqscan;
36+
IF count_seqscan <> count_idxscan THEN
37+
RAISE 'seqscan found % rows, but idxscan found % rows',
38+
count_seqscan, count_idxscan;
39+
END IF;
40+
END
41+
$$ LANGUAGE plpgsql;
42+
));
43+
44+
#
45+
# Stress CIC with pgbench
46+
#
47+
48+
# Run background pgbench with CIC. We cannot mix-in this script into single
49+
# pgbench: CIC will deadlock with itself occasionally.
50+
my $pgbench_out = '';
51+
my $pgbench_timer = IPC::Run::timeout(180);
52+
my $pgbench_h = $node->background_pgbench(
53+
'--no-vacuum --client=1 --transactions=200',
54+
{
55+
'002_pgbench_concurrent_cic' => q(
56+
DROP INDEX CONCURRENTLY idx;
57+
CREATE INDEX CONCURRENTLY idx ON tbl(i);
58+
BEGIN ISOLATION LEVEL REPEATABLE READ;
59+
SELECT heapallindexed();
60+
ROLLBACK;
61+
)
62+
},
63+
\$pgbench_out,
64+
$pgbench_timer);
65+
66+
# Run pgbench.
67+
$node->pgbench(
68+
'--no-vacuum --client=5 --transactions=200',
69+
0,
70+
[qr{actually processed}],
71+
[qr{^$}],
72+
'concurrent INSERTs',
73+
{
74+
'002_pgbench_concurrent_transaction' => q(
75+
BEGIN;
76+
INSERT INTO tbl VALUES(0);
77+
COMMIT;
78+
),
79+
'002_pgbench_concurrent_transaction_savepoints' => q(
80+
BEGIN;
81+
SAVEPOINT s1;
82+
INSERT INTO tbl VALUES(0);
83+
COMMIT;
84+
)
85+
});
86+
87+
$pgbench_h->pump_nb;
88+
$pgbench_h->finish();
89+
unlike($pgbench_out, qr/aborted in command/, "pgbench with CIC works");
90+
91+
# done
92+
$node->stop;
93+
done_testing();

src/include/utils/inval.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,5 @@ extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
6363
extern void CallSyscacheCallbacks(int cacheid, uint32 hashvalue);
6464

6565
extern void InvalidateSystemCaches(void);
66+
extern void InvalidateSystemCachesExtended(bool debug_discard);
6667
#endif /* INVAL_H */

src/include/utils/relcache.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ extern void RelationForgetRelation(Oid rid);
114114

115115
extern void RelationCacheInvalidateEntry(Oid relationId);
116116

117-
extern void RelationCacheInvalidate(void);
117+
extern void RelationCacheInvalidate(bool debug_discard);
118118

119119
extern void RelationCloseSmgrByOid(Oid relationId);
120120

0 commit comments

Comments
 (0)