
Commit d526575

Make large sequential scans and VACUUMs work in a limited-size "ring" of
buffers, rather than blowing out the whole shared-buffer arena.  Aside from
avoiding cache spoliation, this fixes the problem that VACUUM formerly tended
to cause a WAL flush for every page it modified, because we had it hacked to
use only a single buffer.  Those flushes will now occur only once per
ring-ful.  The exact ring size, and the threshold for seqscans to switch into
the ring usage pattern, remain under debate; but the infrastructure seems
done.  The key bit of infrastructure is a new optional BufferAccessStrategy
object that can be passed to ReadBuffer operations; this replaces the former
StrategyHintVacuum API.

This patch also changes the buffer usage-count methodology a bit: we now
advance usage_count when first pinning a buffer, rather than when last
unpinning it.  To preserve the behavior that a buffer's lifetime starts to
decrease when it's released, the clock sweep code is modified to not
decrement usage_count of pinned buffers.

Work not done in this commit: teach GiST and GIN indexes to use the vacuum
BufferAccessStrategy for vacuum-driven fetches.

Original patch by Simon, reworked by Heikki and again by Tom.
1 parent 0a6f2ee commit d526575
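
For orientation, a minimal sketch of the new API surface the message
describes, using only calls that appear in this commit's diffs
(GetAccessStrategy, ReadBufferWithStrategy, ReleaseBuffer,
FreeAccessStrategy); the relation and loop are placeholders, not code from
the commit:

    /* Read a large relation through a small buffer ring instead of
     * cycling the entire shared-buffer arena. */
    static void
    bulk_read_example(Relation rel, BlockNumber nblocks)
    {
        BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
        BlockNumber blkno;

        for (blkno = 0; blkno < nblocks; blkno++)
        {
            /* buffers claimed via the strategy are recycled within the ring */
            Buffer      buf = ReadBufferWithStrategy(rel, blkno, strategy);

            /* ... inspect the page here ... */
            ReleaseBuffer(buf);
        }

        FreeAccessStrategy(strategy);
    }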

File tree

24 files changed: +723 −263 lines

src/backend/access/hash/hash.c

Lines changed: 6 additions & 4 deletions
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.94 2007/05/03 16:45:58 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.95 2007/05/30 20:11:51 tgl Exp $
  *
  * NOTES
  *	  This file contains only the public interface routines.
@@ -547,8 +547,9 @@ hashbulkdelete(PG_FUNCTION_ARGS)

 			vacuum_delay_point();

-			buf = _hash_getbuf(rel, blkno, HASH_WRITE,
-							   LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+			buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
+											 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
+											 info->strategy);
 			page = BufferGetPage(buf);
 			opaque = (HashPageOpaque) PageGetSpecialPointer(page);
 			Assert(opaque->hasho_bucket == cur_bucket);
@@ -596,7 +597,8 @@ hashbulkdelete(PG_FUNCTION_ARGS)

 		/* If we deleted anything, try to compact free space */
 		if (bucket_dirty)
-			_hash_squeezebucket(rel, cur_bucket, bucket_blkno);
+			_hash_squeezebucket(rel, cur_bucket, bucket_blkno,
+								info->strategy);

 		/* Release bucket lock */
 		_hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);

src/backend/access/hash/hashovfl.c

Lines changed: 49 additions & 15 deletions
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.57 2007/05/03 16:45:58 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.58 2007/05/30 20:11:51 tgl Exp $
  *
  * NOTES
  *	  Overflow pages look like ordinary relation pages.
@@ -362,6 +362,9 @@ _hash_firstfreebit(uint32 map)
  *	Remove this overflow page from its bucket's chain, and mark the page as
  *	free.  On entry, ovflbuf is write-locked; it is released before exiting.
  *
+ *	Since this function is invoked in VACUUM, we provide an access strategy
+ *	parameter that controls fetches of the bucket pages.
+ *
  *	Returns the block number of the page that followed the given page
  *	in the bucket, or InvalidBlockNumber if no following page.
  *
@@ -370,7 +373,8 @@ _hash_firstfreebit(uint32 map)
  *	on the bucket, too.
  */
 BlockNumber
-_hash_freeovflpage(Relation rel, Buffer ovflbuf)
+_hash_freeovflpage(Relation rel, Buffer ovflbuf,
+				   BufferAccessStrategy bstrategy)
 {
 	HashMetaPage metap;
 	Buffer		metabuf;
@@ -413,8 +417,11 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
 	 */
 	if (BlockNumberIsValid(prevblkno))
 	{
-		Buffer		prevbuf = _hash_getbuf(rel, prevblkno, HASH_WRITE,
-										   LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+		Buffer		prevbuf = _hash_getbuf_with_strategy(rel,
+														 prevblkno,
+														 HASH_WRITE,
+														 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
+														 bstrategy);
 		Page		prevpage = BufferGetPage(prevbuf);
 		HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

@@ -424,8 +431,11 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
 	}
 	if (BlockNumberIsValid(nextblkno))
 	{
-		Buffer		nextbuf = _hash_getbuf(rel, nextblkno, HASH_WRITE,
-										   LH_OVERFLOW_PAGE);
+		Buffer		nextbuf = _hash_getbuf_with_strategy(rel,
+														 nextblkno,
+														 HASH_WRITE,
+														 LH_OVERFLOW_PAGE,
+														 bstrategy);
 		Page		nextpage = BufferGetPage(nextbuf);
 		HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

@@ -434,6 +444,8 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
 		_hash_wrtbuf(rel, nextbuf);
 	}

+	/* Note: bstrategy is intentionally not used for metapage and bitmap */
+
 	/* Read the metapage so we can determine which bitmap page to use */
 	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
 	metap = (HashMetaPage) BufferGetPage(metabuf);
@@ -558,11 +570,15 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
  *
  *	Caller must hold exclusive lock on the target bucket.  This allows
  *	us to safely lock multiple pages in the bucket.
+ *
+ *	Since this function is invoked in VACUUM, we provide an access strategy
+ *	parameter that controls fetches of the bucket pages.
  */
 void
 _hash_squeezebucket(Relation rel,
 					Bucket bucket,
-					BlockNumber bucket_blkno)
+					BlockNumber bucket_blkno,
+					BufferAccessStrategy bstrategy)
 {
 	Buffer		wbuf;
 	Buffer		rbuf = 0;
@@ -581,7 +597,11 @@ _hash_squeezebucket(Relation rel,
 	 * start squeezing into the base bucket page.
 	 */
 	wblkno = bucket_blkno;
-	wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE, LH_BUCKET_PAGE);
+	wbuf = _hash_getbuf_with_strategy(rel,
+									  wblkno,
+									  HASH_WRITE,
+									  LH_BUCKET_PAGE,
+									  bstrategy);
 	wpage = BufferGetPage(wbuf);
 	wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

@@ -595,16 +615,22 @@ _hash_squeezebucket(Relation rel,
 	}

 	/*
-	 * find the last page in the bucket chain by starting at the base bucket
-	 * page and working forward.
+	 * Find the last page in the bucket chain by starting at the base bucket
+	 * page and working forward.  Note: we assume that a hash bucket chain is
+	 * usually smaller than the buffer ring being used by VACUUM, else using
+	 * the access strategy here would be counterproductive.
 	 */
 	ropaque = wopaque;
 	do
 	{
 		rblkno = ropaque->hasho_nextblkno;
 		if (ropaque != wopaque)
 			_hash_relbuf(rel, rbuf);
-		rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
+		rbuf = _hash_getbuf_with_strategy(rel,
+										  rblkno,
+										  HASH_WRITE,
+										  LH_OVERFLOW_PAGE,
+										  bstrategy);
 		rpage = BufferGetPage(rbuf);
 		ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
 		Assert(ropaque->hasho_bucket == bucket);
@@ -644,7 +670,11 @@ _hash_squeezebucket(Relation rel,
 			return;
 		}

-		wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
+		wbuf = _hash_getbuf_with_strategy(rel,
+										  wblkno,
+										  HASH_WRITE,
+										  LH_OVERFLOW_PAGE,
+										  bstrategy);
 		wpage = BufferGetPage(wbuf);
 		wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
 		Assert(wopaque->hasho_bucket == bucket);
@@ -688,15 +718,19 @@ _hash_squeezebucket(Relation rel,
 			/* yes, so release wbuf lock first */
 			_hash_wrtbuf(rel, wbuf);
 			/* free this overflow page (releases rbuf) */
-			_hash_freeovflpage(rel, rbuf);
+			_hash_freeovflpage(rel, rbuf, bstrategy);
 			/* done */
 			return;
 		}

 		/* free this overflow page, then get the previous one */
-		_hash_freeovflpage(rel, rbuf);
+		_hash_freeovflpage(rel, rbuf, bstrategy);

-		rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
+		rbuf = _hash_getbuf_with_strategy(rel,
+										  rblkno,
+										  HASH_WRITE,
+										  LH_OVERFLOW_PAGE,
+										  bstrategy);
 		rpage = BufferGetPage(rbuf);
 		ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
 		Assert(ropaque->hasho_bucket == bucket);

src/backend/access/hash/hashpage.c

Lines changed: 30 additions & 2 deletions
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.67 2007/05/03 16:45:58 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.68 2007/05/30 20:11:51 tgl Exp $
  *
  * NOTES
  *	  Postgres hash pages look like ordinary relation pages.  The opaque
@@ -214,6 +214,34 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno)
 	return buf;
 }

+/*
+ *	_hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy.
+ *
+ *		This is identical to _hash_getbuf() but also allows a buffer access
+ *		strategy to be specified.  We use this for VACUUM operations.
+ */
+Buffer
+_hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
+						   int access, int flags,
+						   BufferAccessStrategy bstrategy)
+{
+	Buffer		buf;
+
+	if (blkno == P_NEW)
+		elog(ERROR, "hash AM does not use P_NEW");
+
+	buf = ReadBufferWithStrategy(rel, blkno, bstrategy);
+
+	if (access != HASH_NOLOCK)
+		LockBuffer(buf, access);
+
+	/* ref count and lock type are correct */
+
+	_hash_checkpage(rel, buf, flags);
+
+	return buf;
+}
+
 /*
  *	_hash_relbuf() -- release a locked buffer.
  *
@@ -840,5 +868,5 @@ _hash_splitbucket(Relation rel,
 	_hash_wrtbuf(rel, obuf);
 	_hash_wrtbuf(rel, nbuf);

-	_hash_squeezebucket(rel, obucket, start_oblkno);
+	_hash_squeezebucket(rel, obucket, start_oblkno, NULL);
 }
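
For illustration, a hypothetical caller of the new helper; the wrapper name
example_fetch_bucket_page and its IndexVacuumInfo argument are assumptions
for this sketch, not part of the commit, while the flag and strategy
arguments mirror the hashbulkdelete() call sites above:

    /* Fetch a bucket or overflow page, honoring VACUUM's buffer ring if any */
    static Buffer
    example_fetch_bucket_page(Relation rel, BlockNumber blkno,
                              IndexVacuumInfo *info)
    {
        /* a NULL strategy falls back to normal buffer access, just as the
         * _hash_splitbucket() call above passes NULL */
        return _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
                                          LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
                                          info ? info->strategy : NULL);
    }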

src/backend/access/heap/heapam.c

Lines changed: 34 additions & 4 deletions
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.233 2007/05/27 03:50:38 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.234 2007/05/30 20:11:53 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -83,6 +83,24 @@ initscan(HeapScanDesc scan, ScanKey key)
 	 */
 	scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);

+	/*
+	 * If the table is large relative to NBuffers, use a bulk-read access
+	 * strategy, else use the default random-access strategy.  During a
+	 * rescan, don't make a new strategy object if we don't have to.
+	 */
+	if (scan->rs_nblocks > NBuffers / 4 &&
+		!scan->rs_rd->rd_istemp)
+	{
+		if (scan->rs_strategy == NULL)
+			scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
+	}
+	else
+	{
+		if (scan->rs_strategy != NULL)
+			FreeAccessStrategy(scan->rs_strategy);
+		scan->rs_strategy = NULL;
+	}
+
 	scan->rs_inited = false;
 	scan->rs_ctup.t_data = NULL;
 	ItemPointerSetInvalid(&scan->rs_ctup.t_self);
@@ -123,9 +141,17 @@ heapgetpage(HeapScanDesc scan, BlockNumber page)

 	Assert(page < scan->rs_nblocks);

-	scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
-										 scan->rs_rd,
-										 page);
+	/* release previous scan buffer, if any */
+	if (BufferIsValid(scan->rs_cbuf))
+	{
+		ReleaseBuffer(scan->rs_cbuf);
+		scan->rs_cbuf = InvalidBuffer;
+	}
+
+	/* read page using selected strategy */
+	scan->rs_cbuf = ReadBufferWithStrategy(scan->rs_rd,
+										   page,
+										   scan->rs_strategy);
 	scan->rs_cblock = page;

 	if (!scan->rs_pageatatime)
@@ -938,6 +964,7 @@ heap_beginscan(Relation relation, Snapshot snapshot,
 	scan->rs_rd = relation;
 	scan->rs_snapshot = snapshot;
 	scan->rs_nkeys = nkeys;
+	scan->rs_strategy = NULL;	/* set in initscan */

 	/*
 	 * we can use page-at-a-time mode if it's an MVCC-safe snapshot
@@ -1007,6 +1034,9 @@ heap_endscan(HeapScanDesc scan)
 	if (scan->rs_key)
 		pfree(scan->rs_key);

+	if (scan->rs_strategy != NULL)
+		FreeAccessStrategy(scan->rs_strategy);
+
 	pfree(scan);
 }

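Read in isolation, the strategy-selection rule that initscan() now applies
amounts to the following sketch (a restatement of the hunk above; the helper
name choose_scan_strategy is hypothetical):

    static BufferAccessStrategy
    choose_scan_strategy(BlockNumber nblocks, bool is_temp,
                         BufferAccessStrategy existing)
    {
        /* big, non-temp tables: reuse or create a bulk-read ring */
        if (nblocks > NBuffers / 4 && !is_temp)
            return existing ? existing : GetAccessStrategy(BAS_BULKREAD);

        /* small or temp tables: drop any leftover ring from a prior scan */
        if (existing)
            FreeAccessStrategy(existing);
        return NULL;
    }
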
src/backend/access/nbtree/nbtree.c

Lines changed: 4 additions & 3 deletions
@@ -12,7 +12,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.154 2007/01/05 22:19:23 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.155 2007/05/30 20:11:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -786,9 +786,10 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
 	/*
 	 * We can't use _bt_getbuf() here because it always applies
 	 * _bt_checkpage(), which will barf on an all-zero page.  We want to
-	 * recycle all-zero pages, not fail.
+	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
+	 * buffer access strategy.
 	 */
-	buf = ReadBuffer(rel, blkno);
+	buf = ReadBufferWithStrategy(rel, blkno, info->strategy);
 	LockBuffer(buf, BT_READ);
 	page = BufferGetPage(buf);
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

src/backend/access/transam/xlog.c

Lines changed: 31 additions & 1 deletion
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.269 2007/05/20 21:08:19 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.270 2007/05/30 20:11:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1799,6 +1799,36 @@ XLogFlush(XLogRecPtr record)
 			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
 }

+/*
+ * Test whether XLOG data has been flushed up to (at least) the given position.
+ *
+ * Returns true if a flush is still needed.  (It may be that someone else
+ * is already in process of flushing that far, however.)
+ */
+bool
+XLogNeedsFlush(XLogRecPtr record)
+{
+	/* Quick exit if already known flushed */
+	if (XLByteLE(record, LogwrtResult.Flush))
+		return false;
+
+	/* read LogwrtResult and update local state */
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile XLogCtlData *xlogctl = XLogCtl;
+
+		SpinLockAcquire(&xlogctl->info_lck);
+		LogwrtResult = xlogctl->LogwrtResult;
+		SpinLockRelease(&xlogctl->info_lck);
+	}
+
+	/* check again */
+	if (XLByteLE(record, LogwrtResult.Flush))
+		return false;
+
+	return true;
+}
+
 /*
  * Create a new XLOG file segment, or open a pre-existing one.
  *
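
XLogNeedsFlush() is added for the benefit of the buffer-replacement logic
elsewhere in this commit (bufmgr files not excerpted on this page). A sketch
of the intended pattern, where BufferGetLSN and StrategyRejectBuffer are
assumed names for this illustration:

    /* Reusing a dirty ring buffer whose WAL record isn't flushed yet would
     * force exactly the per-page XLogFlush this commit tries to batch, so a
     * bulk-read strategy may decline such a buffer and fall back to the
     * main arena instead. */
    static bool
    ring_buffer_should_be_skipped(BufferAccessStrategy strategy, Buffer buf)
    {
        /* cheap to write: WAL already flushed, keep the buffer in the ring */
        if (strategy == NULL || !XLogNeedsFlush(BufferGetLSN(buf)))
            return false;

        /* otherwise let the strategy decide whether to reject it */
        return StrategyRejectBuffer(strategy, buf);
    }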

src/backend/catalog/index.c

Lines changed: 2 additions & 1 deletion
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.283 2007/05/16 17:28:20 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.284 2007/05/30 20:11:55 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -1658,6 +1658,7 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot)
 	ivinfo.vacuum_full = false;
 	ivinfo.message_level = DEBUG2;
 	ivinfo.num_heap_tuples = -1;
+	ivinfo.strategy = NULL;

 	state.tuplesort = tuplesort_begin_datum(TIDOID,
 											TIDLessOperator, false,
