Skip to content

Commit 2e33b43

Browse files
committed
Avoid race in RelationBuildDesc() affecting CREATE INDEX CONCURRENTLY.
CIC and REINDEX CONCURRENTLY assume backends see their catalog changes no later than each backend's next transaction start. That failed to hold when a backend absorbed a relevant invalidation in the middle of running RelationBuildDesc() on the CIC index. Queries that use the resulting index can silently fail to find rows. Fix this for future index builds by making RelationBuildDesc() loop until it finishes without accepting a relevant invalidation. It may be necessary to reindex to recover from past occurrences; REINDEX CONCURRENTLY suffices. Back-patch to 9.6 (all supported versions). Noah Misch and Andrey Borodin, reviewed (in earlier versions) by Andres Freund. Discussion: https://postgr.es/m/20210730022548.GA1940096@gust.leadboat.com
1 parent 7c949f1 commit 2e33b43

File tree

8 files changed

+336
-10
lines changed

8 files changed

+336
-10
lines changed

contrib/amcheck/Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ PGFILEDESC = "amcheck - function for verifying relation integrity"
1111

1212
REGRESS = check check_btree
1313

14+
TAP_TESTS = 1
15+
1416
ifdef USE_PGXS
1517
PG_CONFIG = pg_config
1618
PGXS := $(shell $(PG_CONFIG) --pgxs)

contrib/amcheck/t/002_cic.pl

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
2+
# Copyright (c) 2021, PostgreSQL Global Development Group
3+
4+
# Test CREATE INDEX CONCURRENTLY with concurrent modifications
5+
use strict;
6+
use warnings;
7+
8+
use Config;
9+
use PostgresNode;
10+
use TestLib;
11+
12+
use Test::More tests => 4;
13+
14+
my ($node, $result);
15+
16+
#
17+
# Test set-up
18+
#
19+
$node = get_new_node('CIC_test');
20+
$node->init;
21+
$node->append_conf('postgresql.conf', 'lock_timeout = 180000');
22+
$node->start;
23+
$node->safe_psql('postgres', q(CREATE EXTENSION amcheck));
24+
$node->safe_psql('postgres', q(CREATE TABLE tbl(i int)));
25+
$node->safe_psql('postgres', q(CREATE INDEX idx ON tbl(i)));
26+
27+
#
28+
# Stress CIC with pgbench
29+
#
30+
31+
# Run background pgbench with CIC. We cannot mix-in this script into single
32+
# pgbench: CIC will deadlock with itself occasionally.
33+
my $pgbench_out = '';
34+
my $pgbench_timer = IPC::Run::timeout(180);
35+
my $pgbench_h = $node->background_pgbench(
36+
'--no-vacuum --client=1 --transactions=200',
37+
{
38+
'002_pgbench_concurrent_cic' => q(
39+
DROP INDEX CONCURRENTLY idx;
40+
CREATE INDEX CONCURRENTLY idx ON tbl(i);
41+
SELECT bt_index_check('idx',true);
42+
)
43+
},
44+
\$pgbench_out,
45+
$pgbench_timer);
46+
47+
# Run pgbench.
48+
$node->pgbench(
49+
'--no-vacuum --client=5 --transactions=200',
50+
0,
51+
[qr{actually processed}],
52+
[qr{^$}],
53+
'concurrent INSERTs',
54+
{
55+
'002_pgbench_concurrent_transaction' => q(
56+
BEGIN;
57+
INSERT INTO tbl VALUES(0);
58+
COMMIT;
59+
),
60+
'002_pgbench_concurrent_transaction_savepoints' => q(
61+
BEGIN;
62+
SAVEPOINT s1;
63+
INSERT INTO tbl VALUES(0);
64+
COMMIT;
65+
)
66+
});
67+
68+
$pgbench_h->pump_nb;
69+
$pgbench_h->finish();
70+
$result =
71+
($Config{osname} eq "MSWin32")
72+
? ($pgbench_h->full_results)[0]
73+
: $pgbench_h->result(0);
74+
is($result, 0, "pgbench with CIC works");
75+
76+
# done
77+
$node->stop;
78+
done_testing();

src/backend/utils/cache/inval.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -584,7 +584,7 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
584584
int i;
585585

586586
if (msg->rc.relId == InvalidOid)
587-
RelationCacheInvalidate();
587+
RelationCacheInvalidate(false);
588588
else
589589
RelationCacheInvalidateEntry(msg->rc.relId);
590590

@@ -641,12 +641,18 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
641641
*/
642642
void
643643
InvalidateSystemCaches(void)
644+
{
645+
InvalidateSystemCachesExtended(false);
646+
}
647+
648+
void
649+
InvalidateSystemCachesExtended(bool debug_discard)
644650
{
645651
int i;
646652

647653
InvalidateCatalogSnapshot();
648654
ResetCatalogCaches();
649-
RelationCacheInvalidate(); /* gets smgr and relmap too */
655+
RelationCacheInvalidate(debug_discard); /* gets smgr and relmap too */
650656

651657
for (i = 0; i < syscache_callback_count; i++)
652658
{
@@ -717,7 +723,7 @@ AcceptInvalidationMessages(void)
717723
if (recursion_depth < 3)
718724
{
719725
recursion_depth++;
720-
InvalidateSystemCaches();
726+
InvalidateSystemCachesExtended(true);
721727
recursion_depth--;
722728
}
723729
}

src/backend/utils/cache/relcache.c

Lines changed: 109 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,24 @@ bool criticalSharedRelcachesBuilt = false;
150150
*/
151151
static long relcacheInvalsReceived = 0L;
152152

153+
/*
154+
* in_progress_list is a stack of ongoing RelationBuildDesc() calls. CREATE
155+
* INDEX CONCURRENTLY makes catalog changes under ShareUpdateExclusiveLock.
156+
* It critically relies on each backend absorbing those changes no later than
157+
* next transaction start. Hence, RelationBuildDesc() loops until it finishes
158+
* without accepting a relevant invalidation. (Most invalidation consumers
159+
* don't do this.)
160+
*/
161+
typedef struct inprogressent
162+
{
163+
Oid reloid; /* OID of relation being built */
164+
bool invalidated; /* whether an invalidation arrived for it */
165+
} InProgressEnt;
166+
167+
static InProgressEnt *in_progress_list;
168+
static int in_progress_list_len;
169+
static int in_progress_list_maxlen;
170+
153171
/*
154172
* eoxact_list[] stores the OIDs of relations that (might) need AtEOXact
155173
* cleanup work. This list intentionally has limited size; if it overflows,
@@ -1043,6 +1061,7 @@ equalRSDesc(RowSecurityDesc *rsdesc1, RowSecurityDesc *rsdesc2)
10431061
static Relation
10441062
RelationBuildDesc(Oid targetRelId, bool insertIt)
10451063
{
1064+
int in_progress_offset;
10461065
Relation relation;
10471066
Oid relid;
10481067
HeapTuple pg_class_tuple;
@@ -1070,6 +1089,21 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
10701089
oldcxt = MemoryContextSwitchTo(tmpcxt);
10711090
#endif
10721091

1092+
/* Register to catch invalidation messages */
1093+
if (in_progress_list_len >= in_progress_list_maxlen)
1094+
{
1095+
int allocsize;
1096+
1097+
allocsize = in_progress_list_maxlen * 2;
1098+
in_progress_list = repalloc(in_progress_list,
1099+
allocsize * sizeof(*in_progress_list));
1100+
in_progress_list_maxlen = allocsize;
1101+
}
1102+
in_progress_offset = in_progress_list_len++;
1103+
in_progress_list[in_progress_offset].reloid = targetRelId;
1104+
retry:
1105+
in_progress_list[in_progress_offset].invalidated = false;
1106+
10731107
/*
10741108
* find the tuple in pg_class corresponding to the given relation id
10751109
*/
@@ -1085,6 +1119,8 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
10851119
MemoryContextSwitchTo(oldcxt);
10861120
MemoryContextDelete(tmpcxt);
10871121
#endif
1122+
Assert(in_progress_offset + 1 == in_progress_list_len);
1123+
in_progress_list_len--;
10881124
return NULL;
10891125
}
10901126

@@ -1244,6 +1280,21 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
12441280
*/
12451281
heap_freetuple(pg_class_tuple);
12461282

1283+
/*
1284+
* If an invalidation arrived mid-build, start over. Between here and the
1285+
* end of this function, don't add code that does or reasonably could read
1286+
* system catalogs. That range must be free from invalidation processing
1287+
* for the !insertIt case. For the insertIt case, RelationCacheInsert()
1288+
* will enroll this relation in ordinary relcache invalidation processing,
1289+
*/
1290+
if (in_progress_list[in_progress_offset].invalidated)
1291+
{
1292+
RelationDestroyRelation(relation, false);
1293+
goto retry;
1294+
}
1295+
Assert(in_progress_offset + 1 == in_progress_list_len);
1296+
in_progress_list_len--;
1297+
12471298
/*
12481299
* Insert newly created relation into relcache hash table, if requested.
12491300
*
@@ -2586,6 +2637,14 @@ RelationClearRelation(Relation relation, bool rebuild)
25862637

25872638
/* Build temporary entry, but don't link it into hashtable */
25882639
newrel = RelationBuildDesc(save_relid, false);
2640+
2641+
/*
2642+
* Between here and the end of the swap, don't add code that does or
2643+
* reasonably could read system catalogs. That range must be free
2644+
* from invalidation processing. See RelationBuildDesc() manipulation
2645+
* of in_progress_list.
2646+
*/
2647+
25892648
if (newrel == NULL)
25902649
{
25912650
/*
@@ -2816,6 +2875,14 @@ RelationCacheInvalidateEntry(Oid relationId)
28162875
relcacheInvalsReceived++;
28172876
RelationFlushRelation(relation);
28182877
}
2878+
else
2879+
{
2880+
int i;
2881+
2882+
for (i = 0; i < in_progress_list_len; i++)
2883+
if (in_progress_list[i].reloid == relationId)
2884+
in_progress_list[i].invalidated = true;
2885+
}
28192886
}
28202887

28212888
/*
@@ -2824,11 +2891,11 @@ RelationCacheInvalidateEntry(Oid relationId)
28242891
* and rebuild those with positive reference counts. Also reset the smgr
28252892
* relation cache and re-read relation mapping data.
28262893
*
2827-
* This is currently used only to recover from SI message buffer overflow,
2828-
* so we do not touch relations having new-in-transaction relfilenodes; they
2829-
* cannot be targets of cross-backend SI updates (and our own updates now go
2830-
* through a separate linked list that isn't limited by the SI message
2831-
* buffer size).
2894+
* Apart from debug_discard_caches, this is currently used only to recover
2895+
* from SI message buffer overflow, so we do not touch relations having
2896+
* new-in-transaction relfilenodes; they cannot be targets of cross-backend
2897+
* SI updates (and our own updates now go through a separate linked list
2898+
* that isn't limited by the SI message buffer size).
28322899
*
28332900
* We do this in two phases: the first pass deletes deletable items, and
28342901
* the second one rebuilds the rebuildable items. This is essential for
@@ -2846,16 +2913,22 @@ RelationCacheInvalidateEntry(Oid relationId)
28462913
* second pass processes nailed-in-cache items before other nondeletable
28472914
* items. This should ensure that system catalogs are up to date before
28482915
* we attempt to use them to reload information about other open relations.
2916+
*
2917+
* After those two phases of work having immediate effects, we normally
2918+
* signal any RelationBuildDesc() on the stack to start over. However, we
2919+
* don't do this if called as part of debug_discard_caches. Otherwise,
2920+
* RelationBuildDesc() would become an infinite loop.
28492921
*/
28502922
void
2851-
RelationCacheInvalidate(void)
2923+
RelationCacheInvalidate(bool debug_discard)
28522924
{
28532925
HASH_SEQ_STATUS status;
28542926
RelIdCacheEnt *idhentry;
28552927
Relation relation;
28562928
List *rebuildFirstList = NIL;
28572929
List *rebuildList = NIL;
28582930
ListCell *l;
2931+
int i;
28592932

28602933
/*
28612934
* Reload relation mapping data before starting to reconstruct cache.
@@ -2942,6 +3015,11 @@ RelationCacheInvalidate(void)
29423015
RelationClearRelation(relation, true);
29433016
}
29443017
list_free(rebuildList);
3018+
3019+
if (!debug_discard)
3020+
/* Any RelationBuildDesc() on the stack must start over. */
3021+
for (i = 0; i < in_progress_list_len; i++)
3022+
in_progress_list[i].invalidated = true;
29453023
}
29463024

29473025
/*
@@ -3092,6 +3170,13 @@ AtEOXact_RelationCache(bool isCommit)
30923170
RelIdCacheEnt *idhentry;
30933171
int i;
30943172

3173+
/*
3174+
* Forget in_progress_list. This is relevant when we're aborting due to
3175+
* an error during RelationBuildDesc().
3176+
*/
3177+
Assert(in_progress_list_len == 0 || !isCommit);
3178+
in_progress_list_len = 0;
3179+
30953180
/*
30963181
* Unless the eoxact_list[] overflowed, we only need to examine the rels
30973182
* listed in it. Otherwise fall back on a hash_seq_search scan.
@@ -3238,6 +3323,14 @@ AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid,
32383323
RelIdCacheEnt *idhentry;
32393324
int i;
32403325

3326+
/*
3327+
* Forget in_progress_list. This is relevant when we're aborting due to
3328+
* an error during RelationBuildDesc(). We don't commit subtransactions
3329+
* during RelationBuildDesc().
3330+
*/
3331+
Assert(in_progress_list_len == 0 || !isCommit);
3332+
in_progress_list_len = 0;
3333+
32413334
/*
32423335
* Unless the eoxact_list[] overflowed, we only need to examine the rels
32433336
* listed in it. Otherwise fall back on a hash_seq_search scan. Same
@@ -3786,6 +3879,7 @@ void
37863879
RelationCacheInitialize(void)
37873880
{
37883881
HASHCTL ctl;
3882+
int allocsize;
37893883

37903884
/*
37913885
* make sure cache memory context exists
@@ -3802,6 +3896,15 @@ RelationCacheInitialize(void)
38023896
RelationIdCache = hash_create("Relcache by OID", INITRELCACHESIZE,
38033897
&ctl, HASH_ELEM | HASH_BLOBS);
38043898

3899+
/*
3900+
* reserve enough in_progress_list slots for many cases
3901+
*/
3902+
allocsize = 4;
3903+
in_progress_list =
3904+
MemoryContextAlloc(CacheMemoryContext,
3905+
allocsize * sizeof(*in_progress_list));
3906+
in_progress_list_maxlen = allocsize;
3907+
38053908
/*
38063909
* relation mapper needs to be initialized too
38073910
*/

src/include/utils/inval.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,4 +61,5 @@ extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
6161
extern void CallSyscacheCallbacks(int cacheid, uint32 hashvalue);
6262

6363
extern void InvalidateSystemCaches(void);
64+
extern void InvalidateSystemCachesExtended(bool debug_discard);
6465
#endif /* INVAL_H */

src/include/utils/relcache.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ extern void RelationForgetRelation(Oid rid);
120120

121121
extern void RelationCacheInvalidateEntry(Oid relationId);
122122

123-
extern void RelationCacheInvalidate(void);
123+
extern void RelationCacheInvalidate(bool debug_discard);
124124

125125
extern void RelationCloseSmgrByOid(Oid relationId);
126126

0 commit comments

Comments
 (0)