Skip to content

Commit 7ea8cd1

Browse files
committed
Improve read_stream.c advice for dense streams.
read_stream.c tries not to issue read-ahead advice when it thinks the kernel's own read-ahead should be active, ie when using buffered I/O and reading sequential blocks. It previously gave up too easily, and issued advice only for the first read of up to io_combine_limit blocks in a larger range of sequential blocks after random jump. The following read could suffer an avoidable I/O stall. Fix, by continuing to issue advice until the corresponding preadv() calls catch up with the start of the region we're currently issuing advice for, if ever. That's when the kernel actually sees the sequential pattern. Advice is now disabled only when the stream is entirely sequential as far as we can see in the look-ahead window, or in other words, when a sequential region is larger than we can cover with the current io_concurrency and io_combine_limit settings. While refactoring the advice control logic, also get rid of the "suppress_advice" argument that was passed around between functions to skip useless posix_fadvise() calls immediately followed by preadv(). read_stream_start_pending_read() can figure that out, so let's concentrate knowledge of advice heuristics in fewer places (our goal being to make advice-based I/O concurrency a legacy mode soon). The problem cases were revealed by Tomas Vondra's extensive regression testing with many different disk access patterns using Melanie Plageman's streaming Bitmap Heap Scan patch, in a battle against the venerable always-issue-advice-and-always-one-block-at-a-time code. Reviewed-by: Andres Freund <andres@anarazel.de> (earlier version) Reported-by: Melanie Plageman <melanieplageman@gmail.com> Reported-by: Tomas Vondra <tomas@vondra.me> Reported-by: Andres Freund <andres@anarazel.de> Tested-by: Melanie Plageman <melanieplageman@gmail.com> Discussion: https://postgr.es/m/CA%2BhUKGK_%3D4CVmMHvsHjOVrK6t4F%3DLBpFzsrr3R%2BaJYN8kcTfWg%40mail.gmail.com Discussion: https://postgr.es/m/CA%2BhUKGJ3HSWciQCz8ekP1Zn7N213RfA4nbuotQawfpq23%2Bw-5Q%40mail.gmail.com
1 parent 11bd831 commit 7ea8cd1

File tree

1 file changed

+45
-20
lines changed

1 file changed

+45
-20
lines changed

src/backend/storage/aio/read_stream.c

Lines changed: 45 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ struct ReadStream
133133

134134
/* Next expected block, for detecting sequential access. */
135135
BlockNumber seq_blocknum;
136+
BlockNumber seq_until_processed;
136137

137138
/* The read operation we are currently preparing. */
138139
BlockNumber pending_read_blocknum;
@@ -238,7 +239,7 @@ read_stream_unget_block(ReadStream *stream, BlockNumber blocknum)
238239
* distance to a level that prevents look-ahead until buffers are released.
239240
*/
240241
static bool
241-
read_stream_start_pending_read(ReadStream *stream, bool suppress_advice)
242+
read_stream_start_pending_read(ReadStream *stream)
242243
{
243244
bool need_wait;
244245
int nblocks;
@@ -262,16 +263,32 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice)
262263
else
263264
Assert(stream->next_buffer_index == stream->oldest_buffer_index);
264265

265-
/*
266-
* If advice hasn't been suppressed, this system supports it, and this
267-
* isn't a strictly sequential pattern, then we'll issue advice.
268-
*/
269-
if (!suppress_advice &&
270-
stream->advice_enabled &&
271-
stream->pending_read_blocknum != stream->seq_blocknum)
272-
flags = READ_BUFFERS_ISSUE_ADVICE;
273-
else
274-
flags = 0;
266+
/* Do we need to issue read-ahead advice? */
267+
flags = 0;
268+
if (stream->advice_enabled)
269+
{
270+
if (stream->pending_read_blocknum == stream->seq_blocknum)
271+
{
272+
/*
273+
* Sequential: Issue advice until the preadv() calls have caught
274+
* up with the first advice issued for this sequential region, and
275+
* then stay of the way of the kernel's own read-ahead.
276+
*/
277+
if (stream->seq_until_processed != InvalidBlockNumber)
278+
flags = READ_BUFFERS_ISSUE_ADVICE;
279+
}
280+
else
281+
{
282+
/*
283+
* Random jump: Note the starting location of a new potential
284+
* sequential region and start issuing advice. Skip it this time
285+
* if the preadv() follows immediately, eg first block in stream.
286+
*/
287+
stream->seq_until_processed = stream->pending_read_blocknum;
288+
if (stream->pinned_buffers > 0)
289+
flags = READ_BUFFERS_ISSUE_ADVICE;
290+
}
291+
}
275292

276293
/* How many more buffers is this backend allowed? */
277294
if (stream->temporary)
@@ -360,7 +377,7 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice)
360377
}
361378

362379
static void
363-
read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
380+
read_stream_look_ahead(ReadStream *stream)
364381
{
365382
while (stream->ios_in_progress < stream->max_ios &&
366383
stream->pinned_buffers + stream->pending_read_nblocks < stream->distance)
@@ -371,8 +388,7 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
371388

372389
if (stream->pending_read_nblocks == stream->io_combine_limit)
373390
{
374-
read_stream_start_pending_read(stream, suppress_advice);
375-
suppress_advice = false;
391+
read_stream_start_pending_read(stream);
376392
continue;
377393
}
378394

@@ -405,15 +421,13 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
405421
/* We have to start the pending read before we can build another. */
406422
while (stream->pending_read_nblocks > 0)
407423
{
408-
if (!read_stream_start_pending_read(stream, suppress_advice) ||
424+
if (!read_stream_start_pending_read(stream) ||
409425
stream->ios_in_progress == stream->max_ios)
410426
{
411427
/* We've hit the buffer or I/O limit. Rewind and stop here. */
412428
read_stream_unget_block(stream, blocknum);
413429
return;
414430
}
415-
416-
suppress_advice = false;
417431
}
418432

419433
/* This is the start of a new pending read. */
@@ -437,7 +451,7 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
437451
stream->pinned_buffers == 0) ||
438452
stream->distance == 0) &&
439453
stream->ios_in_progress < stream->max_ios)
440-
read_stream_start_pending_read(stream, suppress_advice);
454+
read_stream_start_pending_read(stream);
441455

442456
/*
443457
* There should always be something pinned when we leave this function,
@@ -613,6 +627,8 @@ read_stream_begin_impl(int flags,
613627
stream->callback = callback;
614628
stream->callback_private_data = callback_private_data;
615629
stream->buffered_blocknum = InvalidBlockNumber;
630+
stream->seq_blocknum = InvalidBlockNumber;
631+
stream->seq_until_processed = InvalidBlockNumber;
616632
stream->temporary = SmgrIsTemp(smgr);
617633

618634
/*
@@ -793,7 +809,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
793809
* space for more, but if we're just starting up we'll need to crank
794810
* the handle to get started.
795811
*/
796-
read_stream_look_ahead(stream, true);
812+
read_stream_look_ahead(stream);
797813

798814
/* End of stream reached? */
799815
if (stream->pinned_buffers == 0)
@@ -854,6 +870,15 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
854870
stream->distance = distance;
855871
}
856872
}
873+
874+
/*
875+
* If we've reached the first block of a sequential region we're
876+
* issuing advice for, cancel that until the next jump. The kernel
877+
* will see the sequential preadv() pattern starting here.
878+
*/
879+
if (stream->advice_enabled &&
880+
stream->ios[io_index].op.blocknum == stream->seq_until_processed)
881+
stream->seq_until_processed = InvalidBlockNumber;
857882
}
858883

859884
#ifdef CLOBBER_FREED_MEMORY
@@ -899,7 +924,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
899924
stream->oldest_buffer_index = 0;
900925

901926
/* Prepare for the next call. */
902-
read_stream_look_ahead(stream, false);
927+
read_stream_look_ahead(stream);
903928

904929
#ifndef READ_STREAM_DISABLE_FAST_PATH
905930
/* See if we can take the fast path for all-cached scans next time. */

0 commit comments

Comments
 (0)