Skip to content

Commit 4d330a6

Browse files
committed
Add smgrzeroextend(), FileZero(), FileFallocate()
smgrzeroextend() uses FileFallocate() to efficiently extend files by multiple blocks. When extending by a small number of blocks, use FileZero() instead, as using posix_fallocate() for small numbers of blocks is inefficient for some file systems / operating systems. FileZero() is also used as the fallback for FileFallocate() on platforms / filesystems that don't support fallocate. A big advantage of using posix_fallocate() is that it typically won't cause dirty buffers in the kernel pagecache. So far the most common pattern in our code is that we smgrextend() a page full of zeroes and put the corresponding page into shared buffers, from where we later write out the actual contents of the page. If the kernel, e.g. due to memory pressure or elapsed time, already wrote back the all-zeroes page, this can lead to doubling the amount of writes reaching storage. There are no users of smgrzeroextend() as of this commit. That will follow in future commits. Reviewed-by: Melanie Plageman <melanieplageman@gmail.com> Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reviewed-by: David Rowley <dgrowleyml@gmail.com> Reviewed-by: John Naylor <john.naylor@enterprisedb.com> Discussion: https://postgr.es/m/20221029025420.eplyow6k7tgu6he3@awork3.anarazel.de
1 parent 4766eef commit 4d330a6

File tree

6 files changed

+231
-0
lines changed

6 files changed

+231
-0
lines changed

src/backend/storage/file/fd.c

+88
Original file line numberDiff line numberDiff line change
@@ -2206,6 +2206,94 @@ FileSync(File file, uint32 wait_event_info)
22062206
return returnCode;
22072207
}
22082208

2209+
/*
2210+
* Zero a region of the file.
2211+
*
2212+
* Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2213+
* appropriate error.
2214+
*/
2215+
int
2216+
FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
2217+
{
2218+
int returnCode;
2219+
ssize_t written;
2220+
2221+
Assert(FileIsValid(file));
2222+
2223+
DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2224+
file, VfdCache[file].fileName,
2225+
(int64) offset, (int64) amount));
2226+
2227+
returnCode = FileAccess(file);
2228+
if (returnCode < 0)
2229+
return returnCode;
2230+
2231+
pgstat_report_wait_start(wait_event_info);
2232+
written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2233+
pgstat_report_wait_end();
2234+
2235+
if (written < 0)
2236+
return -1;
2237+
else if (written != amount)
2238+
{
2239+
/* if errno is unset, assume problem is no disk space */
2240+
if (errno == 0)
2241+
errno = ENOSPC;
2242+
return -1;
2243+
}
2244+
2245+
return 0;
2246+
}
2247+
2248+
/*
2249+
* Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2250+
* not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2251+
* use FileZero() instead.
2252+
*
2253+
* Note that at least glibc() implements posix_fallocate() in userspace if not
2254+
* implemented by the filesystem. That's not the case for all environments
2255+
* though.
2256+
*
2257+
* Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2258+
* appropriate error.
2259+
*/
2260+
int
2261+
FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
2262+
{
2263+
#ifdef HAVE_POSIX_FALLOCATE
2264+
int returnCode;
2265+
2266+
Assert(FileIsValid(file));
2267+
2268+
DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2269+
file, VfdCache[file].fileName,
2270+
(int64) offset, (int64) amount));
2271+
2272+
returnCode = FileAccess(file);
2273+
if (returnCode < 0)
2274+
return -1;
2275+
2276+
pgstat_report_wait_start(wait_event_info);
2277+
returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2278+
pgstat_report_wait_end();
2279+
2280+
if (returnCode == 0)
2281+
return 0;
2282+
2283+
/* for compatibility with %m printing etc */
2284+
errno = returnCode;
2285+
2286+
/*
2287+
* Return in cases of a "real" failure, if fallocate is not supported,
2288+
* fall through to the FileZero() backed implementation.
2289+
*/
2290+
if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
2291+
return -1;
2292+
#endif
2293+
2294+
return FileZero(file, offset, amount, wait_event_info);
2295+
}
2296+
22092297
off_t
22102298
FileSize(File file)
22112299
{

src/backend/storage/smgr/md.c

+108
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,114 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
500500
Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
501501
}
502502

503+
/*
504+
* mdzeroextend() -- Add new zeroed out blocks to the specified relation.
505+
*
506+
* Similar to mdextend(), except the relation can be extended by multiple
507+
* blocks at once and the added blocks will be filled with zeroes.
508+
*/
509+
void
510+
mdzeroextend(SMgrRelation reln, ForkNumber forknum,
511+
BlockNumber blocknum, int nblocks, bool skipFsync)
512+
{
513+
MdfdVec *v;
514+
BlockNumber curblocknum = blocknum;
515+
int remblocks = nblocks;
516+
517+
Assert(nblocks > 0);
518+
519+
/* This assert is too expensive to have on normally ... */
520+
#ifdef CHECK_WRITE_VS_EXTEND
521+
Assert(blocknum >= mdnblocks(reln, forknum));
522+
#endif
523+
524+
/*
525+
* If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
526+
* more --- we mustn't create a block whose number actually is
527+
* InvalidBlockNumber or larger.
528+
*/
529+
if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
530+
ereport(ERROR,
531+
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
532+
errmsg("cannot extend file \"%s\" beyond %u blocks",
533+
relpath(reln->smgr_rlocator, forknum),
534+
InvalidBlockNumber)));
535+
536+
while (remblocks > 0)
537+
{
538+
BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
539+
off_t seekpos = (off_t) BLCKSZ * segstartblock;
540+
int numblocks;
541+
542+
if (segstartblock + remblocks > RELSEG_SIZE)
543+
numblocks = RELSEG_SIZE - segstartblock;
544+
else
545+
numblocks = remblocks;
546+
547+
v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
548+
549+
Assert(segstartblock < RELSEG_SIZE);
550+
Assert(segstartblock + numblocks <= RELSEG_SIZE);
551+
552+
/*
553+
* If available and useful, use posix_fallocate() (via FileAllocate())
554+
* to extend the relation. That's often more efficient than using
555+
* write(), as it commonly won't cause the kernel to allocate page
556+
* cache space for the extended pages.
557+
*
558+
* However, we don't use FileAllocate() for small extensions, as it
559+
* defeats delayed allocation on some filesystems. Not clear where
560+
* that decision should be made though? For now just use a cutoff of
561+
* 8, anything between 4 and 8 worked OK in some local testing.
562+
*/
563+
if (numblocks > 8)
564+
{
565+
int ret;
566+
567+
ret = FileFallocate(v->mdfd_vfd,
568+
seekpos, (off_t) BLCKSZ * numblocks,
569+
WAIT_EVENT_DATA_FILE_EXTEND);
570+
if (ret != 0)
571+
{
572+
ereport(ERROR,
573+
errcode_for_file_access(),
574+
errmsg("could not extend file \"%s\" with FileFallocate(): %m",
575+
FilePathName(v->mdfd_vfd)),
576+
errhint("Check free disk space."));
577+
}
578+
}
579+
else
580+
{
581+
int ret;
582+
583+
/*
584+
* Even if we don't want to use fallocate, we can still extend a
585+
* bit more efficiently than writing each 8kB block individually.
586+
* pg_pwrite_zeroes() (via FileZero()) uses
587+
* pg_pwritev_with_retry() to avoid multiple writes or needing a
588+
* zeroed buffer for the whole length of the extension.
589+
*/
590+
ret = FileZero(v->mdfd_vfd,
591+
seekpos, (off_t) BLCKSZ * numblocks,
592+
WAIT_EVENT_DATA_FILE_EXTEND);
593+
if (ret < 0)
594+
ereport(ERROR,
595+
errcode_for_file_access(),
596+
errmsg("could not extend file \"%s\": %m",
597+
FilePathName(v->mdfd_vfd)),
598+
errhint("Check free disk space."));
599+
}
600+
601+
if (!skipFsync && !SmgrIsTemp(reln))
602+
register_dirty_segment(reln, forknum, v);
603+
604+
Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
605+
606+
remblocks -= numblocks;
607+
curblocknum += numblocks;
608+
}
609+
}
610+
503611
/*
504612
* mdopenfork() -- Open one fork of the specified relation.
505613
*

src/backend/storage/smgr/smgr.c

+28
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ typedef struct f_smgr
5050
bool isRedo);
5151
void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
5252
BlockNumber blocknum, const void *buffer, bool skipFsync);
53+
void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
54+
BlockNumber blocknum, int nblocks, bool skipFsync);
5355
bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
5456
BlockNumber blocknum);
5557
void (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
@@ -75,6 +77,7 @@ static const f_smgr smgrsw[] = {
7577
.smgr_exists = mdexists,
7678
.smgr_unlink = mdunlink,
7779
.smgr_extend = mdextend,
80+
.smgr_zeroextend = mdzeroextend,
7881
.smgr_prefetch = mdprefetch,
7982
.smgr_read = mdread,
8083
.smgr_write = mdwrite,
@@ -507,6 +510,31 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
507510
reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
508511
}
509512

513+
/*
514+
* smgrzeroextend() -- Add new zeroed out blocks to a file.
515+
*
516+
* Similar to smgrextend(), except the relation can be extended by
517+
* multiple blocks at once and the added blocks will be filled with
518+
* zeroes.
519+
*/
520+
void
521+
smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
522+
int nblocks, bool skipFsync)
523+
{
524+
smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum,
525+
nblocks, skipFsync);
526+
527+
/*
528+
* Normally we expect this to increase the fork size by nblocks, but if
529+
* the cached value isn't as expected, just invalidate it so the next call
530+
* asks the kernel.
531+
*/
532+
if (reln->smgr_cached_nblocks[forknum] == blocknum)
533+
reln->smgr_cached_nblocks[forknum] = blocknum + nblocks;
534+
else
535+
reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
536+
}
537+
510538
/*
511539
* smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
512540
*

src/include/storage/fd.h

+3
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,9 @@ extern int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event
106106
extern int FileRead(File file, void *buffer, size_t amount, off_t offset, uint32 wait_event_info);
107107
extern int FileWrite(File file, const void *buffer, size_t amount, off_t offset, uint32 wait_event_info);
108108
extern int FileSync(File file, uint32 wait_event_info);
109+
extern int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info);
110+
extern int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info);
111+
109112
extern off_t FileSize(File file);
110113
extern int FileTruncate(File file, off_t offset, uint32 wait_event_info);
111114
extern void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info);

src/include/storage/md.h

+2
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
2828
extern void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo);
2929
extern void mdextend(SMgrRelation reln, ForkNumber forknum,
3030
BlockNumber blocknum, const void *buffer, bool skipFsync);
31+
extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum,
32+
BlockNumber blocknum, int nblocks, bool skipFsync);
3133
extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum,
3234
BlockNumber blocknum);
3335
extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,

src/include/storage/smgr.h

+2
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ extern void smgrdosyncall(SMgrRelation *rels, int nrels);
9292
extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
9393
extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
9494
BlockNumber blocknum, const void *buffer, bool skipFsync);
95+
extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum,
96+
BlockNumber blocknum, int nblocks, bool skipFsync);
9597
extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum,
9698
BlockNumber blocknum);
9799
extern void smgrread(SMgrRelation reln, ForkNumber forknum,

0 commit comments

Comments
 (0)