Skip to content

Commit 5141e47

Browse files
committed
Avoid race in RelationBuildDesc() affecting CREATE INDEX CONCURRENTLY.
CIC and REINDEX CONCURRENTLY assume backends see their catalog changes no later than each backend's next transaction start. That failed to hold when a backend absorbed a relevant invalidation in the middle of running RelationBuildDesc() on the CIC index. Queries that use the resulting index can silently fail to find rows. Fix this for future index builds by making RelationBuildDesc() loop until it finishes without accepting a relevant invalidation. It may be necessary to reindex to recover from past occurrences; REINDEX CONCURRENTLY suffices. Back-patch to 9.6 (all supported versions). Noah Misch and Andrey Borodin, reviewed (in earlier versions) by Andres Freund. Discussion: https://postgr.es/m/20210730022548.GA1940096@gust.leadboat.com
1 parent e6c5f1b commit 5141e47

File tree

7 files changed

+346
-5
lines changed

7 files changed

+346
-5
lines changed

src/backend/utils/cache/inval.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -584,7 +584,7 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
584584
int i;
585585

586586
if (msg->rc.relId == InvalidOid)
587-
RelationCacheInvalidate();
587+
RelationCacheInvalidate(false);
588588
else
589589
RelationCacheInvalidateEntry(msg->rc.relId);
590590

@@ -641,12 +641,18 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
641641
*/
642642
void
643643
InvalidateSystemCaches(void)
644+
{
645+
InvalidateSystemCachesExtended(false);
646+
}
647+
648+
void
649+
InvalidateSystemCachesExtended(bool debug_discard)
644650
{
645651
int i;
646652

647653
InvalidateCatalogSnapshot();
648654
ResetCatalogCaches();
649-
RelationCacheInvalidate(); /* gets smgr and relmap too */
655+
RelationCacheInvalidate(debug_discard); /* gets smgr and relmap too */
650656

651657
for (i = 0; i < syscache_callback_count; i++)
652658
{
@@ -717,7 +723,7 @@ AcceptInvalidationMessages(void)
717723
if (recursion_depth < 3)
718724
{
719725
recursion_depth++;
720-
InvalidateSystemCaches();
726+
InvalidateSystemCachesExtended(true);
721727
recursion_depth--;
722728
}
723729
}

src/backend/utils/cache/relcache.c

Lines changed: 106 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,24 @@ bool criticalSharedRelcachesBuilt = false;
144144
*/
145145
static long relcacheInvalsReceived = 0L;
146146

147+
/*
148+
* in_progress_list is a stack of ongoing RelationBuildDesc() calls. CREATE
149+
* INDEX CONCURRENTLY makes catalog changes under ShareUpdateExclusiveLock.
150+
* It critically relies on each backend absorbing those changes no later than
151+
* next transaction start. Hence, RelationBuildDesc() loops until it finishes
152+
* without accepting a relevant invalidation. (Most invalidation consumers
153+
* don't do this.)
154+
*/
155+
typedef struct inprogressent
156+
{
157+
Oid reloid; /* OID of relation being built */
158+
bool invalidated; /* whether an invalidation arrived for it */
159+
} InProgressEnt;
160+
161+
static InProgressEnt *in_progress_list;
162+
static int in_progress_list_len;
163+
static int in_progress_list_maxlen;
164+
147165
/*
148166
* eoxact_list[] stores the OIDs of relations that (might) need AtEOXact
149167
* cleanup work. This list intentionally has limited size; if it overflows,
@@ -1088,11 +1106,27 @@ equalPartitionDescs(PartitionKey key, PartitionDesc partdesc1,
10881106
static Relation
10891107
RelationBuildDesc(Oid targetRelId, bool insertIt)
10901108
{
1109+
int in_progress_offset;
10911110
Relation relation;
10921111
Oid relid;
10931112
HeapTuple pg_class_tuple;
10941113
Form_pg_class relp;
10951114

1115+
/* Register to catch invalidation messages */
1116+
if (in_progress_list_len >= in_progress_list_maxlen)
1117+
{
1118+
int allocsize;
1119+
1120+
allocsize = in_progress_list_maxlen * 2;
1121+
in_progress_list = repalloc(in_progress_list,
1122+
allocsize * sizeof(*in_progress_list));
1123+
in_progress_list_maxlen = allocsize;
1124+
}
1125+
in_progress_offset = in_progress_list_len++;
1126+
in_progress_list[in_progress_offset].reloid = targetRelId;
1127+
retry:
1128+
in_progress_list[in_progress_offset].invalidated = false;
1129+
10961130
/*
10971131
* find the tuple in pg_class corresponding to the given relation id
10981132
*/
@@ -1102,7 +1136,11 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
11021136
* if no such tuple exists, return NULL
11031137
*/
11041138
if (!HeapTupleIsValid(pg_class_tuple))
1139+
{
1140+
Assert(in_progress_offset + 1 == in_progress_list_len);
1141+
in_progress_list_len--;
11051142
return NULL;
1143+
}
11061144

11071145
/*
11081146
* get information from the pg_class_tuple
@@ -1246,6 +1284,21 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
12461284
*/
12471285
heap_freetuple(pg_class_tuple);
12481286

1287+
/*
1288+
* If an invalidation arrived mid-build, start over. Between here and the
1289+
* end of this function, don't add code that does or reasonably could read
1290+
* system catalogs. That range must be free from invalidation processing
1291+
* for the !insertIt case. For the insertIt case, RelationCacheInsert()
1292+
* will enroll this relation in ordinary relcache invalidation processing,
1293+
*/
1294+
if (in_progress_list[in_progress_offset].invalidated)
1295+
{
1296+
RelationDestroyRelation(relation, false);
1297+
goto retry;
1298+
}
1299+
Assert(in_progress_offset + 1 == in_progress_list_len);
1300+
in_progress_list_len--;
1301+
12491302
/*
12501303
* Insert newly created relation into relcache hash table, if requested.
12511304
*
@@ -2469,6 +2522,14 @@ RelationClearRelation(Relation relation, bool rebuild)
24692522

24702523
/* Build temporary entry, but don't link it into hashtable */
24712524
newrel = RelationBuildDesc(save_relid, false);
2525+
2526+
/*
2527+
* Between here and the end of the swap, don't add code that does or
2528+
* reasonably could read system catalogs. That range must be free
2529+
* from invalidation processing. See RelationBuildDesc() manipulation
2530+
* of in_progress_list.
2531+
*/
2532+
24722533
if (newrel == NULL)
24732534
{
24742535
/*
@@ -2660,6 +2721,14 @@ RelationCacheInvalidateEntry(Oid relationId)
26602721
relcacheInvalsReceived++;
26612722
RelationFlushRelation(relation);
26622723
}
2724+
else
2725+
{
2726+
int i;
2727+
2728+
for (i = 0; i < in_progress_list_len; i++)
2729+
if (in_progress_list[i].reloid == relationId)
2730+
in_progress_list[i].invalidated = true;
2731+
}
26632732
}
26642733

26652734
/*
@@ -2691,16 +2760,22 @@ RelationCacheInvalidateEntry(Oid relationId)
26912760
* second pass processes nailed-in-cache items before other nondeletable
26922761
* items. This should ensure that system catalogs are up to date before
26932762
* we attempt to use them to reload information about other open relations.
2763+
*
2764+
* After those two phases of work having immediate effects, we normally
2765+
* signal any RelationBuildDesc() on the stack to start over. However, we
2766+
* don't do this if called as part of debug_discard_caches. Otherwise,
2767+
* RelationBuildDesc() would become an infinite loop.
26942768
*/
26952769
void
2696-
RelationCacheInvalidate(void)
2770+
RelationCacheInvalidate(bool debug_discard)
26972771
{
26982772
HASH_SEQ_STATUS status;
26992773
RelIdCacheEnt *idhentry;
27002774
Relation relation;
27012775
List *rebuildFirstList = NIL;
27022776
List *rebuildList = NIL;
27032777
ListCell *l;
2778+
int i;
27042779

27052780
/*
27062781
* Reload relation mapping data before starting to reconstruct cache.
@@ -2787,6 +2862,11 @@ RelationCacheInvalidate(void)
27872862
RelationClearRelation(relation, true);
27882863
}
27892864
list_free(rebuildList);
2865+
2866+
if (!debug_discard)
2867+
/* Any RelationBuildDesc() on the stack must start over. */
2868+
for (i = 0; i < in_progress_list_len; i++)
2869+
in_progress_list[i].invalidated = true;
27902870
}
27912871

27922872
/*
@@ -2859,6 +2939,13 @@ AtEOXact_RelationCache(bool isCommit)
28592939
RelIdCacheEnt *idhentry;
28602940
int i;
28612941

2942+
/*
2943+
* Forget in_progress_list. This is relevant when we're aborting due to
2944+
* an error during RelationBuildDesc().
2945+
*/
2946+
Assert(in_progress_list_len == 0 || !isCommit);
2947+
in_progress_list_len = 0;
2948+
28622949
/*
28632950
* Unless the eoxact_list[] overflowed, we only need to examine the rels
28642951
* listed in it. Otherwise fall back on a hash_seq_search scan.
@@ -3008,6 +3095,14 @@ AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid,
30083095
RelIdCacheEnt *idhentry;
30093096
int i;
30103097

3098+
/*
3099+
* Forget in_progress_list. This is relevant when we're aborting due to
3100+
* an error during RelationBuildDesc(). We don't commit subtransactions
3101+
* during RelationBuildDesc().
3102+
*/
3103+
Assert(in_progress_list_len == 0 || !isCommit);
3104+
in_progress_list_len = 0;
3105+
30113106
/*
30123107
* Unless the eoxact_list[] overflowed, we only need to examine the rels
30133108
* listed in it. Otherwise fall back on a hash_seq_search scan. Same
@@ -3497,6 +3592,7 @@ void
34973592
RelationCacheInitialize(void)
34983593
{
34993594
HASHCTL ctl;
3595+
int allocsize;
35003596

35013597
/*
35023598
* make sure cache memory context exists
@@ -3513,6 +3609,15 @@ RelationCacheInitialize(void)
35133609
RelationIdCache = hash_create("Relcache by OID", INITRELCACHESIZE,
35143610
&ctl, HASH_ELEM | HASH_BLOBS);
35153611

3612+
/*
3613+
* reserve enough in_progress_list slots for many cases
3614+
*/
3615+
allocsize = 4;
3616+
in_progress_list =
3617+
MemoryContextAlloc(CacheMemoryContext,
3618+
allocsize * sizeof(*in_progress_list));
3619+
in_progress_list_maxlen = allocsize;
3620+
35163621
/*
35173622
* relation mapper needs to be initialized too
35183623
*/

src/bin/pgbench/t/022_cic.pl

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
2+
# Copyright (c) 2021, PostgreSQL Global Development Group
3+
4+
# Test CREATE INDEX CONCURRENTLY with concurrent modifications
5+
use strict;
6+
use warnings;
7+
8+
use Config;
9+
use PostgresNode;
10+
use TestLib;
11+
12+
use Test::More tests => 4;
13+
14+
my ($node, $result);
15+
16+
#
17+
# Test set-up
18+
#
19+
$node = get_new_node('CIC_test');
20+
$node->init;
21+
$node->append_conf('postgresql.conf', 'lock_timeout = 180000');
22+
$node->start;
23+
$node->safe_psql('postgres', q(CREATE TABLE tbl(i int)));
24+
$node->safe_psql('postgres', q(CREATE INDEX idx ON tbl(i)));
25+
$node->safe_psql(
26+
'postgres', q(
27+
CREATE FUNCTION heapallindexed() RETURNS void AS $$
28+
DECLARE
29+
count_seqscan int;
30+
count_idxscan int;
31+
BEGIN
32+
count_seqscan := (SELECT count(*) FROM tbl);
33+
SET enable_seqscan = off;
34+
count_idxscan := (SELECT count(*) FROM tbl);
35+
RESET enable_seqscan;
36+
IF count_seqscan <> count_idxscan THEN
37+
RAISE 'seqscan found % rows, but idxscan found % rows',
38+
count_seqscan, count_idxscan;
39+
END IF;
40+
END
41+
$$ LANGUAGE plpgsql;
42+
));
43+
44+
#
45+
# Stress CIC with pgbench
46+
#
47+
48+
# Run background pgbench with CIC. We cannot mix-in this script into single
49+
# pgbench: CIC will deadlock with itself occasionally.
50+
my $pgbench_out = '';
51+
my $pgbench_timer = IPC::Run::timeout(180);
52+
my $pgbench_h = $node->background_pgbench(
53+
'--no-vacuum --client=1 --transactions=200',
54+
{
55+
'002_pgbench_concurrent_cic' => q(
56+
DROP INDEX CONCURRENTLY idx;
57+
CREATE INDEX CONCURRENTLY idx ON tbl(i);
58+
BEGIN ISOLATION LEVEL REPEATABLE READ;
59+
SELECT heapallindexed();
60+
ROLLBACK;
61+
)
62+
},
63+
\$pgbench_out,
64+
$pgbench_timer);
65+
66+
# Run pgbench.
67+
$node->pgbench(
68+
'--no-vacuum --client=5 --transactions=200',
69+
0,
70+
[qr{actually processed}],
71+
[qr{^$}],
72+
'concurrent INSERTs',
73+
{
74+
'002_pgbench_concurrent_transaction' => q(
75+
BEGIN;
76+
INSERT INTO tbl VALUES(0);
77+
COMMIT;
78+
),
79+
'002_pgbench_concurrent_transaction_savepoints' => q(
80+
BEGIN;
81+
SAVEPOINT s1;
82+
INSERT INTO tbl VALUES(0);
83+
COMMIT;
84+
)
85+
});
86+
87+
$pgbench_h->pump_nb;
88+
$pgbench_h->finish();
89+
unlike($pgbench_out, qr/aborted in command/, "pgbench with CIC works");
90+
91+
# done
92+
$node->stop;
93+
done_testing();

src/include/utils/inval.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,5 @@ extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
6363
extern void CallSyscacheCallbacks(int cacheid, uint32 hashvalue);
6464

6565
extern void InvalidateSystemCaches(void);
66+
extern void InvalidateSystemCachesExtended(bool debug_discard);
6667
#endif /* INVAL_H */

src/include/utils/relcache.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ extern void RelationForgetRelation(Oid rid);
120120

121121
extern void RelationCacheInvalidateEntry(Oid relationId);
122122

123-
extern void RelationCacheInvalidate(void);
123+
extern void RelationCacheInvalidate(bool debug_discard);
124124

125125
extern void RelationCloseSmgrByOid(Oid relationId);
126126

0 commit comments

Comments
 (0)