Skip to content

Commit ae3df4b

Browse files
committed
read_stream: Introduce and use optional batchmode support
Submitting IO in larger batches can be more efficient than doing so one-by-one, particularly for many small reads. It does, however, require the ReadStreamBlockNumberCB callback to abide by the restrictions of AIO batching (c.f. pgaio_enter_batchmode()). Basically, the callback may not: a) block without first calling pgaio_submit_staged(), unless a to-be-waited-on lock cannot be part of a deadlock, e.g. because it is never held while waiting for IO. b) directly or indirectly start another batch pgaio_enter_batchmode() As this requires care and is nontrivial in some cases, batching is only used with explicit opt-in. This patch adds an explicit flag (READ_STREAM_USE_BATCHING) to read_stream and uses it where appropriate. There are two cases where batching would likely be beneficial, but where we aren't using it yet: 1) bitmap heap scans, because the callback reads the VM This should soon be solved, because we are planning to remove the use of the VM, due to that not being sound. 2) The first phase of heap vacuum This could be made to support batchmode, but would require some care. Reviewed-by: Noah Misch <noah@leadboat.com> Reviewed-by: Thomas Munro <thomas.munro@gmail.com> Discussion: https://postgr.es/m/uvrtrknj4kdytuboidbhwclo4gxhswwcpgadptsjvjqcluzmah%40brqs62irg4dt
1 parent f4d0730 commit ae3df4b

File tree

12 files changed

+129
-12
lines changed

12 files changed

+129
-12
lines changed

contrib/amcheck/verify_heapam.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -447,12 +447,23 @@ verify_heapam(PG_FUNCTION_ARGS)
447447

448448
if (skip_option == SKIP_PAGES_NONE)
449449
{
450+
/*
451+
* It is safe to use batchmode as block_range_read_stream_cb takes no
452+
* locks.
453+
*/
450454
stream_cb = block_range_read_stream_cb;
451-
stream_flags = READ_STREAM_SEQUENTIAL | READ_STREAM_FULL;
455+
stream_flags = READ_STREAM_SEQUENTIAL |
456+
READ_STREAM_FULL |
457+
READ_STREAM_USE_BATCHING;
452458
stream_data = &stream_skip_data.range;
453459
}
454460
else
455461
{
462+
/*
463+
* It would not be safe to naively use use batchmode, as
464+
* heapcheck_read_stream_next_unskippable takes locks. It shouldn't be
465+
* too hard to convert though.
466+
*/
456467
stream_cb = heapcheck_read_stream_next_unskippable;
457468
stream_flags = READ_STREAM_DEFAULT;
458469
stream_data = &stream_skip_data;

contrib/pg_prewarm/pg_prewarm.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,12 @@ pg_prewarm(PG_FUNCTION_ARGS)
198198
p.current_blocknum = first_block;
199199
p.last_exclusive = last_block + 1;
200200

201-
stream = read_stream_begin_relation(READ_STREAM_FULL,
201+
/*
202+
* It is safe to use batchmode as block_range_read_stream_cb takes no
203+
* locks.
204+
*/
205+
stream = read_stream_begin_relation(READ_STREAM_FULL |
206+
READ_STREAM_USE_BATCHING,
202207
NULL,
203208
rel,
204209
forkNumber,

contrib/pg_visibility/pg_visibility.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -526,7 +526,13 @@ collect_visibility_data(Oid relid, bool include_pd)
526526
{
527527
p.current_blocknum = 0;
528528
p.last_exclusive = nblocks;
529-
stream = read_stream_begin_relation(READ_STREAM_FULL,
529+
530+
/*
531+
* It is safe to use batchmode as block_range_read_stream_cb takes no
532+
* locks.
533+
*/
534+
stream = read_stream_begin_relation(READ_STREAM_FULL |
535+
READ_STREAM_USE_BATCHING,
530536
bstrategy,
531537
rel,
532538
MAIN_FORKNUM,

src/backend/access/gist/gistvacuum.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,13 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
210210
needLock = !RELATION_IS_LOCAL(rel);
211211

212212
p.current_blocknum = GIST_ROOT_BLKNO;
213-
stream = read_stream_begin_relation(READ_STREAM_FULL,
213+
214+
/*
215+
* It is safe to use batchmode as block_range_read_stream_cb takes no
216+
* locks.
217+
*/
218+
stream = read_stream_begin_relation(READ_STREAM_FULL |
219+
READ_STREAM_USE_BATCHING,
214220
info->strategy,
215221
rel,
216222
MAIN_FORKNUM,

src/backend/access/heap/heapam.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1206,7 +1206,15 @@ heap_beginscan(Relation relation, Snapshot snapshot,
12061206
else
12071207
cb = heap_scan_stream_read_next_serial;
12081208

1209-
scan->rs_read_stream = read_stream_begin_relation(READ_STREAM_SEQUENTIAL,
1209+
/* ---
1210+
* It is safe to use batchmode as the only locks taken by `cb`
1211+
* are never taken while waiting for IO:
1212+
* - SyncScanLock is used in the non-parallel case
1213+
* - in the parallel case, only spinlocks and atomics are used
1214+
* ---
1215+
*/
1216+
scan->rs_read_stream = read_stream_begin_relation(READ_STREAM_SEQUENTIAL |
1217+
READ_STREAM_USE_BATCHING,
12101218
scan->rs_strategy,
12111219
scan->rs_base.rs_rd,
12121220
MAIN_FORKNUM,
@@ -1216,6 +1224,12 @@ heap_beginscan(Relation relation, Snapshot snapshot,
12161224
}
12171225
else if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN)
12181226
{
1227+
/*
1228+
* Currently we can't trivially use batching, due to the
1229+
* VM_ALL_VISIBLE check in bitmapheap_stream_read_next. While that
1230+
* could be made safe, we are about to remove the all-visible logic
1231+
* from bitmap scans due to its unsoundness.
1232+
*/
12191233
scan->rs_read_stream = read_stream_begin_relation(READ_STREAM_DEFAULT,
12201234
scan->rs_strategy,
12211235
scan->rs_base.rs_rd,

src/backend/access/heap/vacuumlazy.c

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1225,7 +1225,12 @@ lazy_scan_heap(LVRelState *vacrel)
12251225
vacrel->next_unskippable_eager_scanned = false;
12261226
vacrel->next_unskippable_vmbuffer = InvalidBuffer;
12271227

1228-
/* Set up the read stream for vacuum's first pass through the heap */
1228+
/*
1229+
* Set up the read stream for vacuum's first pass through the heap.
1230+
*
1231+
* This could be made safe for READ_STREAM_USE_BATCHING, but only with
1232+
* explicit work in heap_vac_scan_next_block.
1233+
*/
12291234
stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE,
12301235
vacrel->bstrategy,
12311236
vacrel->rel,
@@ -2669,6 +2674,8 @@ lazy_vacuum_all_indexes(LVRelState *vacrel)
26692674
* Read stream callback for vacuum's third phase (second pass over the heap).
26702675
* Gets the next block from the TID store and returns it or InvalidBlockNumber
26712676
* if there are no further blocks to vacuum.
2677+
*
2678+
* NB: Assumed to be safe to use with READ_STREAM_USE_BATCHING.
26722679
*/
26732680
static BlockNumber
26742681
vacuum_reap_lp_read_stream_next(ReadStream *stream,
@@ -2732,8 +2739,16 @@ lazy_vacuum_heap_rel(LVRelState *vacrel)
27322739

27332740
iter = TidStoreBeginIterate(vacrel->dead_items);
27342741

2735-
/* Set up the read stream for vacuum's second pass through the heap */
2736-
stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE,
2742+
/*
2743+
* Set up the read stream for vacuum's second pass through the heap.
2744+
*
2745+
* It is safe to use batchmode, as vacuum_reap_lp_read_stream_next() does
2746+
* not need to wait for IO and does not perform locking. Once we support
2747+
* parallelism it should still be fine, as presumably the holder of locks
2748+
* would never be blocked by IO while holding the lock.
2749+
*/
2750+
stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
2751+
READ_STREAM_USE_BATCHING,
27372752
vacrel->bstrategy,
27382753
vacrel->rel,
27392754
MAIN_FORKNUM,

src/backend/access/nbtree/nbtree.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1064,7 +1064,13 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
10641064
needLock = !RELATION_IS_LOCAL(rel);
10651065

10661066
p.current_blocknum = BTREE_METAPAGE + 1;
1067-
stream = read_stream_begin_relation(READ_STREAM_FULL,
1067+
1068+
/*
1069+
* It is safe to use batchmode as block_range_read_stream_cb takes no
1070+
* locks.
1071+
*/
1072+
stream = read_stream_begin_relation(READ_STREAM_FULL |
1073+
READ_STREAM_USE_BATCHING,
10681074
info->strategy,
10691075
rel,
10701076
MAIN_FORKNUM,

src/backend/access/spgist/spgvacuum.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -822,7 +822,13 @@ spgvacuumscan(spgBulkDeleteState *bds)
822822
/* We can skip locking for new or temp relations */
823823
needLock = !RELATION_IS_LOCAL(index);
824824
p.current_blocknum = SPGIST_METAPAGE_BLKNO + 1;
825-
stream = read_stream_begin_relation(READ_STREAM_FULL,
825+
826+
/*
827+
* It is safe to use batchmode as block_range_read_stream_cb takes no
828+
* locks.
829+
*/
830+
stream = read_stream_begin_relation(READ_STREAM_FULL |
831+
READ_STREAM_USE_BATCHING,
826832
bds->info->strategy,
827833
index,
828834
MAIN_FORKNUM,

src/backend/commands/analyze.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1237,7 +1237,12 @@ acquire_sample_rows(Relation onerel, int elevel,
12371237
scan = table_beginscan_analyze(onerel);
12381238
slot = table_slot_create(onerel, NULL);
12391239

1240-
stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE,
1240+
/*
1241+
* It is safe to use batching, as block_sampling_read_stream_next never
1242+
* blocks.
1243+
*/
1244+
stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
1245+
READ_STREAM_USE_BATCHING,
12411246
vac_strategy,
12421247
scan->rs_rd,
12431248
MAIN_FORKNUM,

src/backend/storage/aio/read_stream.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ struct ReadStream
102102
int16 initialized_buffers;
103103
int read_buffers_flags;
104104
bool sync_mode; /* using io_method=sync */
105+
bool batch_mode; /* READ_STREAM_USE_BATCHING */
105106
bool advice_enabled;
106107
bool temporary;
107108

@@ -403,6 +404,15 @@ read_stream_start_pending_read(ReadStream *stream)
403404
static void
404405
read_stream_look_ahead(ReadStream *stream)
405406
{
407+
/*
408+
* Allow amortizing the cost of submitting IO over multiple IOs. This
409+
* requires that we don't do any operations that could lead to a deadlock
410+
* with staged-but-unsubmitted IO. The callback needs to opt-in to being
411+
* careful.
412+
*/
413+
if (stream->batch_mode)
414+
pgaio_enter_batchmode();
415+
406416
while (stream->ios_in_progress < stream->max_ios &&
407417
stream->pinned_buffers + stream->pending_read_nblocks < stream->distance)
408418
{
@@ -450,6 +460,8 @@ read_stream_look_ahead(ReadStream *stream)
450460
{
451461
/* We've hit the buffer or I/O limit. Rewind and stop here. */
452462
read_stream_unget_block(stream, blocknum);
463+
if (stream->batch_mode)
464+
pgaio_exit_batchmode();
453465
return;
454466
}
455467
}
@@ -484,6 +496,9 @@ read_stream_look_ahead(ReadStream *stream)
484496
* time.
485497
*/
486498
Assert(stream->pinned_buffers > 0 || stream->distance == 0);
499+
500+
if (stream->batch_mode)
501+
pgaio_exit_batchmode();
487502
}
488503

489504
/*
@@ -617,6 +632,7 @@ read_stream_begin_impl(int flags,
617632
MAXALIGN(&stream->ios[Max(1, max_ios)]);
618633

619634
stream->sync_mode = io_method == IOMETHOD_SYNC;
635+
stream->batch_mode = flags & READ_STREAM_USE_BATCHING;
620636

621637
#ifdef USE_PREFETCH
622638

src/backend/storage/buffer/bufmgr.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5100,7 +5100,13 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
51005100
p.current_blocknum = 0;
51015101
p.last_exclusive = nblocks;
51025102
src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
5103-
src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL,
5103+
5104+
/*
5105+
* It is safe to use batchmode as block_range_read_stream_cb takes no
5106+
* locks.
5107+
*/
5108+
src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL |
5109+
READ_STREAM_USE_BATCHING,
51045110
bstrategy_src,
51055111
src_smgr,
51065112
permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,

src/include/storage/read_stream.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,27 @@
4242
*/
4343
#define READ_STREAM_FULL 0x04
4444

45+
/* ---
46+
* Opt-in to using AIO batchmode.
47+
*
48+
* Submitting IO in larger batches can be more efficient than doing so
49+
* one-by-one, particularly for many small reads. It does, however, require
50+
* the ReadStreamBlockNumberCB callback to abide by the restrictions of AIO
51+
* batching (c.f. pgaio_enter_batchmode()). Basically, the callback may not:
52+
*
53+
* a) block without first calling pgaio_submit_staged(), unless a
54+
* to-be-waited-on lock cannot be part of a deadlock, e.g. because it is
55+
* never held while waiting for IO.
56+
*
57+
* b) start another batch (without first exiting batchmode and re-entering
58+
* before returning)
59+
*
60+
* As this requires care and is nontrivial in some cases, batching is only
61+
* used with explicit opt-in.
62+
* ---
63+
*/
64+
#define READ_STREAM_USE_BATCHING 0x08
65+
4566
struct ReadStream;
4667
typedef struct ReadStream ReadStream;
4768

0 commit comments

Comments
 (0)