Skip to content

Commit faeedbc

Browse files
committed
Introduce PG_IO_ALIGN_SIZE and align all I/O buffers.
In order to have the option to use O_DIRECT/FILE_FLAG_NO_BUFFERING in a later commit, we need the addresses of user space buffers to be well aligned. The exact requirements vary by OS and file system (typically sectors and/or memory pages). The address alignment size is set to 4096, which is enough for currently known systems: it matches modern sectors and common memory page size. There is no standard governing O_DIRECT's requirements so we might eventually have to reconsider this with more information from the field or future systems. Aligning I/O buffers on memory pages is also known to improve regular buffered I/O performance. Three classes of I/O buffers for regular data pages are adjusted: (1) Heap buffers are now allocated with the new palloc_aligned() or MemoryContextAllocAligned() functions introduced by commit 439f617. (2) Stack buffers now use a new struct PGIOAlignedBlock to respect PG_IO_ALIGN_SIZE, if possible with this compiler. (3) The buffer pool is also aligned in shared memory. WAL buffers were already aligned on XLOG_BLCKSZ. It's possible for XLOG_BLCKSZ to be configured smaller than PG_IO_ALIGN_SIZE and thus for O_DIRECT WAL writes to fail to be well aligned, but that's a pre-existing condition and will be addressed by a later commit. BufFiles are not yet addressed (there's no current plan to use O_DIRECT for those, but they could potentially get some incidental speedup even in plain buffered I/O operations through better alignment). If we can't align stack objects suitably using the compiler extensions we know about, we disable the use of O_DIRECT by setting PG_O_DIRECT to 0. This avoids the need to consider systems that have O_DIRECT but can't align stack objects the way we want; such systems could in theory be supported with more work but we don't currently know of any such machines, so it's easier to pretend there is no O_DIRECT support instead. That's an existing and tested class of system. 
Add assertions that all buffers passed into smgrread(), smgrwrite() and smgrextend() are correctly aligned, unless PG_O_DIRECT is 0 (= stack alignment tricks may be unavailable) or the block size has been set too small to allow arrays of buffers to be all aligned. Author: Thomas Munro <thomas.munro@gmail.com> Author: Andres Freund <andres@anarazel.de> Reviewed-by: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CA+hUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg@mail.gmail.com
1 parent d73c285 commit faeedbc

File tree

26 files changed

+108
-45
lines changed

26 files changed

+108
-45
lines changed

contrib/bloom/blinsert.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ blbuildempty(Relation index)
166166
Page metapage;
167167

168168
/* Construct metapage. */
169-
metapage = (Page) palloc(BLCKSZ);
169+
metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
170170
BloomFillMetapage(index, metapage);
171171

172172
/*

contrib/pg_prewarm/pg_prewarm.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ typedef enum
3636
PREWARM_BUFFER
3737
} PrewarmType;
3838

39-
static PGAlignedBlock blockbuffer;
39+
static PGIOAlignedBlock blockbuffer;
4040

4141
/*
4242
* pg_prewarm(regclass, mode text, fork text,

src/backend/access/gist/gistbuild.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,7 @@ gist_indexsortbuild(GISTBuildState *state)
415415
* Write an empty page as a placeholder for the root page. It will be
416416
* replaced with the real root page at the end.
417417
*/
418-
page = palloc0(BLCKSZ);
418+
page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
419419
smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
420420
page, true);
421421
state->pages_allocated++;
@@ -509,7 +509,8 @@ gist_indexsortbuild_levelstate_add(GISTBuildState *state,
509509
levelstate->current_page++;
510510

511511
if (levelstate->pages[levelstate->current_page] == NULL)
512-
levelstate->pages[levelstate->current_page] = palloc(BLCKSZ);
512+
levelstate->pages[levelstate->current_page] =
513+
palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
513514

514515
newPage = levelstate->pages[levelstate->current_page];
515516
gistinitpage(newPage, old_page_flags);
@@ -579,7 +580,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
579580

580581
/* Create page and copy data */
581582
data = (char *) (dist->list);
582-
target = palloc0(BLCKSZ);
583+
target = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
583584
gistinitpage(target, isleaf ? F_LEAF : 0);
584585
for (int i = 0; i < dist->block.num; i++)
585586
{
@@ -630,7 +631,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
630631
if (parent == NULL)
631632
{
632633
parent = palloc0(sizeof(GistSortedBuildLevelState));
633-
parent->pages[0] = (Page) palloc(BLCKSZ);
634+
parent->pages[0] = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
634635
parent->parent = NULL;
635636
gistinitpage(parent->pages[0], 0);
636637

src/backend/access/hash/hashpage.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -992,7 +992,7 @@ static bool
992992
_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
993993
{
994994
BlockNumber lastblock;
995-
PGAlignedBlock zerobuf;
995+
PGIOAlignedBlock zerobuf;
996996
Page page;
997997
HashPageOpaque ovflopaque;
998998

src/backend/access/heap/rewriteheap.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
255255

256256
state->rs_old_rel = old_heap;
257257
state->rs_new_rel = new_heap;
258-
state->rs_buffer = (Page) palloc(BLCKSZ);
258+
state->rs_buffer = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
259259
/* new_heap needn't be empty, just locked */
260260
state->rs_blockno = RelationGetNumberOfBlocks(new_heap);
261261
state->rs_buffer_valid = false;

src/backend/access/nbtree/nbtree.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ btbuildempty(Relation index)
154154
Page metapage;
155155

156156
/* Construct metapage. */
157-
metapage = (Page) palloc(BLCKSZ);
157+
metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
158158
_bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
159159

160160
/*

src/backend/access/nbtree/nbtsort.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -619,7 +619,7 @@ _bt_blnewpage(uint32 level)
619619
Page page;
620620
BTPageOpaque opaque;
621621

622-
page = (Page) palloc(BLCKSZ);
622+
page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
623623

624624
/* Zero the page and set up standard page header info */
625625
_bt_pageinit(page, BLCKSZ);
@@ -660,7 +660,9 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
660660
while (blkno > wstate->btws_pages_written)
661661
{
662662
if (!wstate->btws_zeropage)
663-
wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
663+
wstate->btws_zeropage = (Page) palloc_aligned(BLCKSZ,
664+
PG_IO_ALIGN_SIZE,
665+
MCXT_ALLOC_ZERO);
664666
/* don't set checksum for all-zero page */
665667
smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM,
666668
wstate->btws_pages_written++,
@@ -1170,7 +1172,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
11701172
* set to point to "P_NONE"). This changes the index to the "valid" state
11711173
* by filling in a valid magic number in the metapage.
11721174
*/
1173-
metapage = (Page) palloc(BLCKSZ);
1175+
metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
11741176
_bt_initmetapage(metapage, rootblkno, rootlevel,
11751177
wstate->inskey->allequalimage);
11761178
_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);

src/backend/access/spgist/spginsert.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ spgbuildempty(Relation index)
158158
Page page;
159159

160160
/* Construct metapage. */
161-
page = (Page) palloc(BLCKSZ);
161+
page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
162162
SpGistInitMetapage(page);
163163

164164
/*

src/backend/access/transam/generic_xlog.c

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,14 +58,17 @@ typedef struct
5858
char delta[MAX_DELTA_SIZE]; /* delta between page images */
5959
} PageData;
6060

61-
/* State of generic xlog record construction */
61+
/*
62+
* State of generic xlog record construction. Must be allocated at an I/O
63+
* aligned address.
64+
*/
6265
struct GenericXLogState
6366
{
67+
/* Page images (properly aligned, must be first) */
68+
PGIOAlignedBlock images[MAX_GENERIC_XLOG_PAGES];
6469
/* Info about each page, see above */
6570
PageData pages[MAX_GENERIC_XLOG_PAGES];
6671
bool isLogged;
67-
/* Page images (properly aligned) */
68-
PGAlignedBlock images[MAX_GENERIC_XLOG_PAGES];
6972
};
7073

7174
static void writeFragment(PageData *pageData, OffsetNumber offset,
@@ -269,7 +272,9 @@ GenericXLogStart(Relation relation)
269272
GenericXLogState *state;
270273
int i;
271274

272-
state = (GenericXLogState *) palloc(sizeof(GenericXLogState));
275+
state = (GenericXLogState *) palloc_aligned(sizeof(GenericXLogState),
276+
PG_IO_ALIGN_SIZE,
277+
0);
273278
state->isLogged = RelationNeedsWAL(relation);
274279

275280
for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)

src/backend/access/transam/xlog.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4506,7 +4506,7 @@ XLOGShmemSize(void)
45064506
/* xlblocks array */
45074507
size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
45084508
/* extra alignment padding for XLOG I/O buffers */
4509-
size = add_size(size, XLOG_BLCKSZ);
4509+
size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE));
45104510
/* and the buffers themselves */
45114511
size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
45124512

0 commit comments

Comments
 (0)