Skip to content

Commit a04a423

Browse files
committed
Arrange for large sequential scans to synchronize with each other, so that when multiple backends are scanning the same relation concurrently, each page is (ideally) read only once.
Patch by Jeff Davis, with review by Heikki and Tom.
1 parent 6d6d14b commit a04a423

File tree

10 files changed

+485
-31
lines changed

10 files changed

+485
-31
lines changed

src/backend/access/heap/Makefile

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,15 +4,15 @@
44
# Makefile for access/heap
55
#
66
# IDENTIFICATION
7-
# $PostgreSQL: pgsql/src/backend/access/heap/Makefile,v 1.15 2007/04/08 01:26:27 tgl Exp $
7+
# $PostgreSQL: pgsql/src/backend/access/heap/Makefile,v 1.16 2007/06/08 18:23:52 tgl Exp $
88
#
99
#-------------------------------------------------------------------------
1010

1111
subdir = src/backend/access/heap
1212
top_builddir = ../../../..
1313
include $(top_builddir)/src/Makefile.global
1414

15-
OBJS = heapam.o hio.o rewriteheap.o tuptoaster.o
15+
OBJS = heapam.o hio.o rewriteheap.o syncscan.o tuptoaster.o
1616

1717
all: SUBSYS.o
1818

src/backend/access/heap/heapam.c

Lines changed: 112 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.234 2007/05/30 20:11:53 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.235 2007/06/08 18:23:52 tgl Exp $
1212
*
1313
*
1414
* INTERFACE ROUTINES
@@ -78,29 +78,44 @@ initscan(HeapScanDesc scan, ScanKey key)
7878
* Determine the number of blocks we have to scan.
7979
*
8080
* It is sufficient to do this once at scan start, since any tuples added
81-
* while the scan is in progress will be invisible to my transaction
82-
* anyway...
81+
* while the scan is in progress will be invisible to my snapshot
82+
* anyway. (That is not true when using a non-MVCC snapshot. However,
83+
* we couldn't guarantee to return tuples added after scan start anyway,
84+
* since they might go into pages we already scanned. To guarantee
85+
* consistent results for a non-MVCC snapshot, the caller must hold some
86+
* higher-level lock that ensures the interesting tuple(s) won't change.)
8387
*/
8488
scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
8589

8690
/*
8791
* If the table is large relative to NBuffers, use a bulk-read access
88-
* strategy, else use the default random-access strategy. During a
89-
* rescan, don't make a new strategy object if we don't have to.
92+
* strategy and enable synchronized scanning (see syncscan.c). Although
93+
* the thresholds for these features could be different, we make them the
94+
* same so that there are only two behaviors to tune rather than four.
95+
*
96+
* During a rescan, don't make a new strategy object if we don't have to.
9097
*/
9198
if (scan->rs_nblocks > NBuffers / 4 &&
9299
!scan->rs_rd->rd_istemp)
93100
{
94101
if (scan->rs_strategy == NULL)
95102
scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
103+
104+
scan->rs_syncscan = true;
105+
scan->rs_startblock = ss_get_location(scan->rs_rd, scan->rs_nblocks);
96106
}
97107
else
98108
{
99109
if (scan->rs_strategy != NULL)
100110
FreeAccessStrategy(scan->rs_strategy);
101111
scan->rs_strategy = NULL;
112+
113+
scan->rs_syncscan = false;
114+
scan->rs_startblock = 0;
102115
}
103116

117+
/* rs_pageatatime was set when the snapshot was filled in */
118+
104119
scan->rs_inited = false;
105120
scan->rs_ctup.t_data = NULL;
106121
ItemPointerSetInvalid(&scan->rs_ctup.t_self);
@@ -229,6 +244,7 @@ heapgettup(HeapScanDesc scan,
229244
Snapshot snapshot = scan->rs_snapshot;
230245
bool backward = ScanDirectionIsBackward(dir);
231246
BlockNumber page;
247+
bool finished;
232248
Page dp;
233249
int lines;
234250
OffsetNumber lineoff;
@@ -251,7 +267,7 @@ heapgettup(HeapScanDesc scan,
251267
tuple->t_data = NULL;
252268
return;
253269
}
254-
page = 0; /* first page */
270+
page = scan->rs_startblock; /* first page */
255271
heapgetpage(scan, page);
256272
lineoff = FirstOffsetNumber; /* first offnum */
257273
scan->rs_inited = true;
@@ -285,7 +301,18 @@ heapgettup(HeapScanDesc scan,
285301
tuple->t_data = NULL;
286302
return;
287303
}
288-
page = scan->rs_nblocks - 1; /* final page */
304+
/*
305+
* Disable reporting to syncscan logic in a backwards scan; it's
306+
* not very likely anyone else is doing the same thing at the same
307+
* time, and much more likely that we'll just bollix things for
308+
* forward scanners.
309+
*/
310+
scan->rs_syncscan = false;
311+
/* start from last page of the scan */
312+
if (scan->rs_startblock > 0)
313+
page = scan->rs_startblock - 1;
314+
else
315+
page = scan->rs_nblocks - 1;
289316
heapgetpage(scan, page);
290317
}
291318
else
@@ -397,10 +424,43 @@ heapgettup(HeapScanDesc scan,
397424
*/
398425
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
399426

427+
/*
428+
* advance to next/prior page and detect end of scan
429+
*/
430+
if (backward)
431+
{
432+
finished = (page == scan->rs_startblock);
433+
if (page == 0)
434+
page = scan->rs_nblocks;
435+
page--;
436+
}
437+
else
438+
{
439+
page++;
440+
if (page >= scan->rs_nblocks)
441+
page = 0;
442+
finished = (page == scan->rs_startblock);
443+
444+
/*
445+
* Report our new scan position for synchronization purposes.
446+
* We don't do that when moving backwards, however. That would
447+
* just mess up any other forward-moving scanners.
448+
*
449+
* Note: we do this before checking for end of scan so that the
450+
* final state of the position hint is back at the start of the
451+
* rel. That's not strictly necessary, but otherwise when you run
452+
* the same query multiple times the starting position would shift
453+
* a little bit backwards on every invocation, which is confusing.
454+
* We don't guarantee any specific ordering in general, though.
455+
*/
456+
if (scan->rs_syncscan)
457+
ss_report_location(scan->rs_rd, page);
458+
}
459+
400460
/*
401461
* return NULL if we've exhausted all the pages
402462
*/
403-
if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks))
463+
if (finished)
404464
{
405465
if (BufferIsValid(scan->rs_cbuf))
406466
ReleaseBuffer(scan->rs_cbuf);
@@ -411,8 +471,6 @@ heapgettup(HeapScanDesc scan,
411471
return;
412472
}
413473

414-
page = backward ? (page - 1) : (page + 1);
415-
416474
heapgetpage(scan, page);
417475

418476
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
@@ -455,6 +513,7 @@ heapgettup_pagemode(HeapScanDesc scan,
455513
HeapTuple tuple = &(scan->rs_ctup);
456514
bool backward = ScanDirectionIsBackward(dir);
457515
BlockNumber page;
516+
bool finished;
458517
Page dp;
459518
int lines;
460519
int lineindex;
@@ -478,7 +537,7 @@ heapgettup_pagemode(HeapScanDesc scan,
478537
tuple->t_data = NULL;
479538
return;
480539
}
481-
page = 0; /* first page */
540+
page = scan->rs_startblock; /* first page */
482541
heapgetpage(scan, page);
483542
lineindex = 0;
484543
scan->rs_inited = true;
@@ -509,7 +568,18 @@ heapgettup_pagemode(HeapScanDesc scan,
509568
tuple->t_data = NULL;
510569
return;
511570
}
512-
page = scan->rs_nblocks - 1; /* final page */
571+
/*
572+
* Disable reporting to syncscan logic in a backwards scan; it's
573+
* not very likely anyone else is doing the same thing at the same
574+
* time, and much more likely that we'll just bollix things for
575+
* forward scanners.
576+
*/
577+
scan->rs_syncscan = false;
578+
/* start from last page of the scan */
579+
if (scan->rs_startblock > 0)
580+
page = scan->rs_startblock - 1;
581+
else
582+
page = scan->rs_nblocks - 1;
513583
heapgetpage(scan, page);
514584
}
515585
else
@@ -616,11 +686,40 @@ heapgettup_pagemode(HeapScanDesc scan,
616686
* if we get here, it means we've exhausted the items on this page and
617687
* it's time to move to the next.
618688
*/
689+
if (backward)
690+
{
691+
finished = (page == scan->rs_startblock);
692+
if (page == 0)
693+
page = scan->rs_nblocks;
694+
page--;
695+
}
696+
else
697+
{
698+
page++;
699+
if (page >= scan->rs_nblocks)
700+
page = 0;
701+
finished = (page == scan->rs_startblock);
702+
703+
/*
704+
* Report our new scan position for synchronization purposes.
705+
* We don't do that when moving backwards, however. That would
706+
* just mess up any other forward-moving scanners.
707+
*
708+
* Note: we do this before checking for end of scan so that the
709+
* final state of the position hint is back at the start of the
710+
* rel. That's not strictly necessary, but otherwise when you run
711+
* the same query multiple times the starting position would shift
712+
* a little bit backwards on every invocation, which is confusing.
713+
* We don't guarantee any specific ordering in general, though.
714+
*/
715+
if (scan->rs_syncscan)
716+
ss_report_location(scan->rs_rd, page);
717+
}
619718

620719
/*
621720
* return NULL if we've exhausted all the pages
622721
*/
623-
if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks))
722+
if (finished)
624723
{
625724
if (BufferIsValid(scan->rs_cbuf))
626725
ReleaseBuffer(scan->rs_cbuf);
@@ -631,7 +730,6 @@ heapgettup_pagemode(HeapScanDesc scan,
631730
return;
632731
}
633732

634-
page = backward ? (page - 1) : (page + 1);
635733
heapgetpage(scan, page);
636734

637735
dp = (Page) BufferGetPage(scan->rs_cbuf);

0 commit comments

Comments
 (0)