Skip to content

Commit 0869e53

Browse files
committed
Avoid race in RelationBuildDesc() affecting CREATE INDEX CONCURRENTLY.
CIC and REINDEX CONCURRENTLY assume backends see their catalog changes no later than each backend's next transaction start. That failed to hold when a backend absorbed a relevant invalidation in the middle of running RelationBuildDesc() on the CIC index. Queries that use the resulting index can silently fail to find rows. Fix this for future index builds by making RelationBuildDesc() loop until it finishes without accepting a relevant invalidation. It may be necessary to reindex to recover from past occurrences; REINDEX CONCURRENTLY suffices. Back-patch to 9.6 (all supported versions). Noah Misch and Andrey Borodin, reviewed (in earlier versions) by Andres Freund. Discussion: https://postgr.es/m/20210730022548.GA1940096@gust.leadboat.com
1 parent fb1aa48 commit 0869e53

File tree

8 files changed

+331
-5
lines changed

8 files changed

+331
-5
lines changed

contrib/amcheck/Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ PGFILEDESC = "amcheck - function for verifying relation integrity"
99

1010
REGRESS = check check_btree
1111

12+
TAP_TESTS = 1
13+
1214
ifdef USE_PGXS
1315
PG_CONFIG = pg_config
1416
PGXS := $(shell $(PG_CONFIG) --pgxs)

contrib/amcheck/t/002_cic.pl

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
2+
# Copyright (c) 2021, PostgreSQL Global Development Group
3+
4+
# Test CREATE INDEX CONCURRENTLY with concurrent modifications
5+
use strict;
6+
use warnings;
7+
8+
use Config;
9+
use PostgresNode;
10+
use TestLib;
11+
12+
use Test::More tests => 4;
13+
14+
my ($node, $result);
15+
16+
#
17+
# Test set-up
18+
#
19+
$node = get_new_node('CIC_test');
20+
$node->init;
21+
$node->append_conf('postgresql.conf', 'lock_timeout = 180000');
22+
$node->start;
23+
$node->safe_psql('postgres', q(CREATE EXTENSION amcheck));
24+
$node->safe_psql('postgres', q(CREATE TABLE tbl(i int)));
25+
$node->safe_psql('postgres', q(CREATE INDEX idx ON tbl(i)));
26+
27+
#
28+
# Stress CIC with pgbench
29+
#
30+
31+
# Run background pgbench with CIC. We cannot mix-in this script into single
32+
# pgbench: CIC will deadlock with itself occasionally.
33+
my $pgbench_out = '';
34+
my $pgbench_timer = IPC::Run::timeout(180);
35+
my $pgbench_h = $node->background_pgbench(
36+
'--no-vacuum --client=1 --transactions=200',
37+
{
38+
'002_pgbench_concurrent_cic' => q(
39+
DROP INDEX CONCURRENTLY idx;
40+
CREATE INDEX CONCURRENTLY idx ON tbl(i);
41+
SELECT bt_index_check('idx',true);
42+
)
43+
},
44+
\$pgbench_out,
45+
$pgbench_timer);
46+
47+
# Run pgbench.
48+
$node->pgbench(
49+
'--no-vacuum --client=5 --transactions=200',
50+
0,
51+
[qr{actually processed}],
52+
[qr{^$}],
53+
'concurrent INSERTs',
54+
{
55+
'002_pgbench_concurrent_transaction' => q(
56+
BEGIN;
57+
INSERT INTO tbl VALUES(0);
58+
COMMIT;
59+
),
60+
'002_pgbench_concurrent_transaction_savepoints' => q(
61+
BEGIN;
62+
SAVEPOINT s1;
63+
INSERT INTO tbl VALUES(0);
64+
COMMIT;
65+
)
66+
});
67+
68+
$pgbench_h->pump_nb;
69+
$pgbench_h->finish();
70+
$result =
71+
($Config{osname} eq "MSWin32")
72+
? ($pgbench_h->full_results)[0]
73+
: $pgbench_h->result(0);
74+
is($result, 0, "pgbench with CIC works");
75+
76+
# done
77+
$node->stop;
78+
done_testing();

src/backend/utils/cache/inval.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -584,7 +584,7 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
584584
int i;
585585

586586
if (msg->rc.relId == InvalidOid)
587-
RelationCacheInvalidate();
587+
RelationCacheInvalidate(false);
588588
else
589589
RelationCacheInvalidateEntry(msg->rc.relId);
590590

@@ -641,12 +641,18 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
641641
*/
642642
void
643643
InvalidateSystemCaches(void)
644+
{
645+
InvalidateSystemCachesExtended(false);
646+
}
647+
648+
void
649+
InvalidateSystemCachesExtended(bool debug_discard)
644650
{
645651
int i;
646652

647653
InvalidateCatalogSnapshot();
648654
ResetCatalogCaches();
649-
RelationCacheInvalidate(); /* gets smgr and relmap too */
655+
RelationCacheInvalidate(debug_discard); /* gets smgr and relmap too */
650656

651657
for (i = 0; i < syscache_callback_count; i++)
652658
{
@@ -717,7 +723,7 @@ AcceptInvalidationMessages(void)
717723
if (recursion_depth < 3)
718724
{
719725
recursion_depth++;
720-
InvalidateSystemCaches();
726+
InvalidateSystemCachesExtended(true);
721727
recursion_depth--;
722728
}
723729
}

src/backend/utils/cache/relcache.c

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,24 @@ bool criticalSharedRelcachesBuilt = false;
154154
*/
155155
static long relcacheInvalsReceived = 0L;
156156

157+
/*
158+
* in_progress_list is a stack of ongoing RelationBuildDesc() calls. CREATE
159+
* INDEX CONCURRENTLY makes catalog changes under ShareUpdateExclusiveLock.
160+
* It critically relies on each backend absorbing those changes no later than
161+
* next transaction start. Hence, RelationBuildDesc() loops until it finishes
162+
* without accepting a relevant invalidation. (Most invalidation consumers
163+
* don't do this.)
164+
*/
165+
typedef struct inprogressent
166+
{
167+
Oid reloid; /* OID of relation being built */
168+
bool invalidated; /* whether an invalidation arrived for it */
169+
} InProgressEnt;
170+
171+
static InProgressEnt *in_progress_list;
172+
static int in_progress_list_len;
173+
static int in_progress_list_maxlen;
174+
157175
/*
158176
* eoxact_list[] stores the OIDs of relations that (might) need AtEOXact
159177
* cleanup work. This list intentionally has limited size; if it overflows,
@@ -1042,6 +1060,7 @@ equalRSDesc(RowSecurityDesc *rsdesc1, RowSecurityDesc *rsdesc2)
10421060
static Relation
10431061
RelationBuildDesc(Oid targetRelId, bool insertIt)
10441062
{
1063+
int in_progress_offset;
10451064
Relation relation;
10461065
Oid relid;
10471066
HeapTuple pg_class_tuple;
@@ -1069,6 +1088,21 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
10691088
oldcxt = MemoryContextSwitchTo(tmpcxt);
10701089
#endif
10711090

1091+
/* Register to catch invalidation messages */
1092+
if (in_progress_list_len >= in_progress_list_maxlen)
1093+
{
1094+
int allocsize;
1095+
1096+
allocsize = in_progress_list_maxlen * 2;
1097+
in_progress_list = repalloc(in_progress_list,
1098+
allocsize * sizeof(*in_progress_list));
1099+
in_progress_list_maxlen = allocsize;
1100+
}
1101+
in_progress_offset = in_progress_list_len++;
1102+
in_progress_list[in_progress_offset].reloid = targetRelId;
1103+
retry:
1104+
in_progress_list[in_progress_offset].invalidated = false;
1105+
10721106
/*
10731107
* find the tuple in pg_class corresponding to the given relation id
10741108
*/
@@ -1084,6 +1118,8 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
10841118
MemoryContextSwitchTo(oldcxt);
10851119
MemoryContextDelete(tmpcxt);
10861120
#endif
1121+
Assert(in_progress_offset + 1 == in_progress_list_len);
1122+
in_progress_list_len--;
10871123
return NULL;
10881124
}
10891125

@@ -1251,6 +1287,21 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
12511287
*/
12521288
heap_freetuple(pg_class_tuple);
12531289

1290+
/*
1291+
* If an invalidation arrived mid-build, start over. Between here and the
1292+
* end of this function, don't add code that does or reasonably could read
1293+
* system catalogs. That range must be free from invalidation processing
1294+
* for the !insertIt case. For the insertIt case, RelationCacheInsert()
1295+
* will enroll this relation in ordinary relcache invalidation processing,
1296+
*/
1297+
if (in_progress_list[in_progress_offset].invalidated)
1298+
{
1299+
RelationDestroyRelation(relation, false);
1300+
goto retry;
1301+
}
1302+
Assert(in_progress_offset + 1 == in_progress_list_len);
1303+
in_progress_list_len--;
1304+
12541305
/*
12551306
* Insert newly created relation into relcache hash table, if requested.
12561307
*
@@ -2554,6 +2605,14 @@ RelationClearRelation(Relation relation, bool rebuild)
25542605

25552606
/* Build temporary entry, but don't link it into hashtable */
25562607
newrel = RelationBuildDesc(save_relid, false);
2608+
2609+
/*
2610+
* Between here and the end of the swap, don't add code that does or
2611+
* reasonably could read system catalogs. That range must be free
2612+
* from invalidation processing. See RelationBuildDesc() manipulation
2613+
* of in_progress_list.
2614+
*/
2615+
25572616
if (newrel == NULL)
25582617
{
25592618
/*
@@ -2765,6 +2824,14 @@ RelationCacheInvalidateEntry(Oid relationId)
27652824
relcacheInvalsReceived++;
27662825
RelationFlushRelation(relation);
27672826
}
2827+
else
2828+
{
2829+
int i;
2830+
2831+
for (i = 0; i < in_progress_list_len; i++)
2832+
if (in_progress_list[i].reloid == relationId)
2833+
in_progress_list[i].invalidated = true;
2834+
}
27682835
}
27692836

27702837
/*
@@ -2796,16 +2863,22 @@ RelationCacheInvalidateEntry(Oid relationId)
27962863
* second pass processes nailed-in-cache items before other nondeletable
27972864
* items. This should ensure that system catalogs are up to date before
27982865
* we attempt to use them to reload information about other open relations.
2866+
*
2867+
* After those two phases of work having immediate effects, we normally
2868+
* signal any RelationBuildDesc() on the stack to start over. However, we
2869+
* don't do this if called as part of debug_discard_caches. Otherwise,
2870+
* RelationBuildDesc() would become an infinite loop.
27992871
*/
28002872
void
2801-
RelationCacheInvalidate(void)
2873+
RelationCacheInvalidate(bool debug_discard)
28022874
{
28032875
HASH_SEQ_STATUS status;
28042876
RelIdCacheEnt *idhentry;
28052877
Relation relation;
28062878
List *rebuildFirstList = NIL;
28072879
List *rebuildList = NIL;
28082880
ListCell *l;
2881+
int i;
28092882

28102883
/*
28112884
* Reload relation mapping data before starting to reconstruct cache.
@@ -2892,6 +2965,11 @@ RelationCacheInvalidate(void)
28922965
RelationClearRelation(relation, true);
28932966
}
28942967
list_free(rebuildList);
2968+
2969+
if (!debug_discard)
2970+
/* Any RelationBuildDesc() on the stack must start over. */
2971+
for (i = 0; i < in_progress_list_len; i++)
2972+
in_progress_list[i].invalidated = true;
28952973
}
28962974

28972975
/*
@@ -2964,6 +3042,13 @@ AtEOXact_RelationCache(bool isCommit)
29643042
RelIdCacheEnt *idhentry;
29653043
int i;
29663044

3045+
/*
3046+
* Forget in_progress_list. This is relevant when we're aborting due to
3047+
* an error during RelationBuildDesc().
3048+
*/
3049+
Assert(in_progress_list_len == 0 || !isCommit);
3050+
in_progress_list_len = 0;
3051+
29673052
/*
29683053
* Unless the eoxact_list[] overflowed, we only need to examine the rels
29693054
* listed in it. Otherwise fall back on a hash_seq_search scan.
@@ -3100,6 +3185,14 @@ AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid,
31003185
RelIdCacheEnt *idhentry;
31013186
int i;
31023187

3188+
/*
3189+
* Forget in_progress_list. This is relevant when we're aborting due to
3190+
* an error during RelationBuildDesc(). We don't commit subtransactions
3191+
* during RelationBuildDesc().
3192+
*/
3193+
Assert(in_progress_list_len == 0 || !isCommit);
3194+
in_progress_list_len = 0;
3195+
31033196
/*
31043197
* Unless the eoxact_list[] overflowed, we only need to examine the rels
31053198
* listed in it. Otherwise fall back on a hash_seq_search scan. Same
@@ -3601,6 +3694,7 @@ void
36013694
RelationCacheInitialize(void)
36023695
{
36033696
HASHCTL ctl;
3697+
int allocsize;
36043698

36053699
/*
36063700
* make sure cache memory context exists
@@ -3617,6 +3711,15 @@ RelationCacheInitialize(void)
36173711
RelationIdCache = hash_create("Relcache by OID", INITRELCACHESIZE,
36183712
&ctl, HASH_ELEM | HASH_BLOBS);
36193713

3714+
/*
3715+
* reserve enough in_progress_list slots for many cases
3716+
*/
3717+
allocsize = 4;
3718+
in_progress_list =
3719+
MemoryContextAlloc(CacheMemoryContext,
3720+
allocsize * sizeof(*in_progress_list));
3721+
in_progress_list_maxlen = allocsize;
3722+
36203723
/*
36213724
* relation mapper needs to be initialized too
36223725
*/

src/include/utils/inval.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,4 +61,5 @@ extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
6161
extern void CallSyscacheCallbacks(int cacheid, uint32 hashvalue);
6262

6363
extern void InvalidateSystemCaches(void);
64+
extern void InvalidateSystemCachesExtended(bool debug_discard);
6465
#endif /* INVAL_H */

src/include/utils/relcache.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ extern void RelationForgetRelation(Oid rid);
117117

118118
extern void RelationCacheInvalidateEntry(Oid relationId);
119119

120-
extern void RelationCacheInvalidate(void);
120+
extern void RelationCacheInvalidate(bool debug_discard);
121121

122122
extern void RelationCloseSmgrByOid(Oid relationId);
123123

0 commit comments

Comments
 (0)