Skip to content

Commit c5c239e

Browse files
Use streaming read I/O in btree vacuuming
Btree vacuum processes all index pages in physical order. Now it uses
the read stream API to get the next buffer instead of explicitly
invoking ReadBuffer().

It is possible for concurrent insertions to cause page splits during
index vacuuming. This can lead to index entries that have yet to be
vacuumed being moved to pages that have already been vacuumed. Btree
vacuum code handles this by backtracking to reprocess those pages. So,
while sequentially encountered pages are now read through the read
stream API, backtracked pages are still read with explicit ReadBuffer()
calls.

Author: Andrey Borodin <x4mmm@yandex-team.ru>
Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Junwang Zhao <zhjwpku@gmail.com>
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Discussion: https://postgr.es/m/flat/CAAKRu_bW1UOyup%3DjdFw%2BkOF9bCaAm%3D9UpiyZtbPMn8n_vnP%2Big%40mail.gmail.com#3b3a84132fc683b3ee5b40bc4c2ea2a5
1 parent 1d617a2 commit c5c239e

File tree

1 file changed

+66
-25
lines changed

1 file changed

+66
-25
lines changed

src/backend/access/nbtree/nbtree.c

Lines changed: 66 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc;
8686
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
8787
IndexBulkDeleteCallback callback, void *callback_state,
8888
BTCycleId cycleid);
89-
static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
89+
static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf);
9090
static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
9191
IndexTuple posting,
9292
OffsetNumber updatedoffset,
@@ -991,8 +991,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
991991
Relation rel = info->index;
992992
BTVacState vstate;
993993
BlockNumber num_pages;
994-
BlockNumber scanblkno;
995994
bool needLock;
995+
BlockRangeReadStreamPrivate p;
996+
ReadStream *stream = NULL;
996997

997998
/*
998999
* Reset fields that track information about the entire index now. This
@@ -1061,9 +1062,18 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
10611062
*/
10621063
needLock = !RELATION_IS_LOCAL(rel);
10631064

1064-
scanblkno = BTREE_METAPAGE + 1;
1065+
p.current_blocknum = BTREE_METAPAGE + 1;
1066+
stream = read_stream_begin_relation(READ_STREAM_FULL,
1067+
info->strategy,
1068+
rel,
1069+
MAIN_FORKNUM,
1070+
block_range_read_stream_cb,
1071+
&p,
1072+
0);
10651073
for (;;)
10661074
{
1075+
Buffer buf;
1076+
10671077
/* Get the current relation length */
10681078
if (needLock)
10691079
LockRelationForExtension(rel, ExclusiveLock);
@@ -1076,18 +1086,44 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
10761086
num_pages);
10771087

10781088
/* Quit if we've scanned the whole relation */
1079-
if (scanblkno >= num_pages)
1089+
if (p.current_blocknum >= num_pages)
10801090
break;
1081-
/* Iterate over pages, then loop back to recheck length */
1082-
for (; scanblkno < num_pages; scanblkno++)
1091+
1092+
1093+
p.last_exclusive = num_pages;
1094+
1095+
/* Iterate over pages, then loop back to recheck relation length */
1096+
while (true)
10831097
{
1084-
btvacuumpage(&vstate, scanblkno);
1098+
BlockNumber current_block;
1099+
1100+
/* call vacuum_delay_point while not holding any buffer lock */
1101+
vacuum_delay_point(false);
1102+
1103+
buf = read_stream_next_buffer(stream, NULL);
1104+
1105+
if (!BufferIsValid(buf))
1106+
break;
1107+
1108+
current_block = btvacuumpage(&vstate, buf);
1109+
10851110
if (info->report_progress)
10861111
pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1087-
scanblkno);
1112+
current_block);
10881113
}
1114+
1115+
Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
1116+
1117+
/*
1118+
* We have to reset the read stream to use it again. After returning
1119+
* InvalidBuffer, the read stream API won't invoke our callback again
1120+
* until the stream has been reset.
1121+
*/
1122+
read_stream_reset(stream);
10891123
}
10901124

1125+
read_stream_end(stream);
1126+
10911127
/* Set statistics num_pages field to final size of index */
10921128
stats->num_pages = num_pages;
10931129

@@ -1111,14 +1147,16 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
11111147
* btvacuumpage --- VACUUM one page
11121148
*
11131149
* This processes a single page for btvacuumscan(). In some cases we must
1114-
* backtrack to re-examine and VACUUM pages that were the scanblkno during
1150+
* backtrack to re-examine and VACUUM pages that were passed in as buf during
11151151
* a previous call here. This is how we handle page splits (that happened
11161152
* after our cycleid was acquired) whose right half page happened to reuse
11171153
* a block that we might have processed at some point before it was
11181154
* recycled (i.e. before the page split).
1155+
*
1156+
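* Returns the block number of the scanned page, i.e. the block that buf
* referred to on entry -- never the block number of a page that was only
* reached by backtracking.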
11191157
*/
1120-
static void
1121-
btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
1158+
static BlockNumber
1159+
btvacuumpage(BTVacState *vstate, Buffer buf)
11221160
{
11231161
IndexVacuumInfo *info = vstate->info;
11241162
IndexBulkDeleteResult *stats = vstate->stats;
@@ -1129,7 +1167,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
11291167
bool attempt_pagedel;
11301168
BlockNumber blkno,
11311169
backtrack_to;
1132-
Buffer buf;
1170+
BlockNumber scanblkno = BufferGetBlockNumber(buf);
11331171
Page page;
11341172
BTPageOpaque opaque;
11351173

@@ -1140,17 +1178,6 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
11401178
attempt_pagedel = false;
11411179
backtrack_to = P_NONE;
11421180

1143-
/* call vacuum_delay_point while not holding any buffer lock */
1144-
vacuum_delay_point(false);
1145-
1146-
/*
1147-
* We can't use _bt_getbuf() here because it always applies
1148-
* _bt_checkpage(), which will barf on an all-zero page. We want to
1149-
* recycle all-zero pages, not fail. Also, we want to use a nondefault
1150-
* buffer access strategy.
1151-
*/
1152-
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1153-
info->strategy);
11541181
_bt_lockbuf(rel, buf, BT_READ);
11551182
page = BufferGetPage(buf);
11561183
opaque = NULL;
@@ -1186,7 +1213,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
11861213
errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
11871214
blkno, scanblkno, RelationGetRelationName(rel))));
11881215
_bt_relbuf(rel, buf);
1189-
return;
1216+
return scanblkno;
11901217
}
11911218

11921219
/*
@@ -1206,7 +1233,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
12061233
{
12071234
/* Done with current scanblkno (and all lower split pages) */
12081235
_bt_relbuf(rel, buf);
1209-
return;
1236+
return scanblkno;
12101237
}
12111238
}
12121239

@@ -1437,8 +1464,22 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
14371464
if (backtrack_to != P_NONE)
14381465
{
14391466
blkno = backtrack_to;
1467+
1468+
/* check for vacuum delay while not holding any buffer lock */
1469+
vacuum_delay_point(false);
1470+
1471+
/*
1472+
* We can't use _bt_getbuf() here because it always applies
1473+
* _bt_checkpage(), which will barf on an all-zero page. We want to
1474+
* recycle all-zero pages, not fail. Also, we want to use a
1475+
* nondefault buffer access strategy.
1476+
*/
1477+
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1478+
info->strategy);
14401479
goto backtrack;
14411480
}
1481+
1482+
return scanblkno;
14421483
}
14431484

14441485
/*

0 commit comments

Comments
 (0)