Skip to content

Commit 4908c58

Browse files
committed
Provide vectored variants of smgrread() and smgrwrite().
smgrreadv() and smgrwritev() and their md.c implementations call FileReadV() and FileWriteV(). A range of disk blocks beginning at 'blocknum' and extending for 'nblocks' can be scattered to or gathered from multiple buffers with a single system call.

The traditional smgrread() and smgrwrite() functions are implemented in terms of the new functions.

Later commits will introduce calls with nblocks > 1, but the following behavioral changes can be seen already:

* After a short transfer we'll now retry until we eventually read 0 bytes (= EOF) or get ENOSPC, EDQUOT, EFBIG, etc., where previously we would infer the reason. Retrying is consistent with xlog.c's treatment of large WAL writes, and arguably also xlog.c and fd.c's treatment of EINTR. Arbitrary short returns for larger transfers have been observed on several OSes, and might in theory also happen for transient reasons with our own pg_p*v() fallback code.

* After unexpected EOF or -1, the error thrown now talks about a range even for the single block case, e.g. "blocks 42..42".

Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Discussion: https://postgr.es/m/CA+hUKGJkOiOCa+mag4BF+zHo7qo=o9CFheB8=g6uT5TUm2gkvA@mail.gmail.com
1 parent b7412e2 commit 4908c58

File tree

5 files changed

+279
-121
lines changed

5 files changed

+279
-121
lines changed

doc/src/sgml/monitoring.sgml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6868,7 +6868,7 @@ FROM pg_stat_get_backend_idset() AS backendid;
68686868
arg5 is the ID of the backend which created the temporary relation for a
68696869
local buffer, or <symbol>InvalidBackendId</symbol> (-1) for a shared buffer.
68706870
arg6 is the number of bytes actually read, while arg7 is the number
6871-
requested (if these are different it indicates trouble).</entry>
6871+
requested (if these are different it indicates a short read).</entry>
68726872
</row>
68736873
<row>
68746874
<entry><literal>smgr-md-write-start</literal></entry>
@@ -6890,7 +6890,7 @@ FROM pg_stat_get_backend_idset() AS backendid;
68906890
arg5 is the ID of the backend which created the temporary relation for a
68916891
local buffer, or <symbol>InvalidBackendId</symbol> (-1) for a shared buffer.
68926892
arg6 is the number of bytes actually written, while arg7 is the number
6893-
requested (if these are different it indicates trouble).</entry>
6893+
requested (if these are different it indicates a short write).</entry>
68946894
</row>
68956895
<row>
68966896
<entry><literal>sort-start</literal></entry>

src/backend/storage/smgr/md.c

Lines changed: 231 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "access/xlog.h"
2929
#include "access/xlogutils.h"
3030
#include "commands/tablespace.h"
31+
#include "common/file_utils.h"
3132
#include "miscadmin.h"
3233
#include "pg_trace.h"
3334
#include "pgstat.h"
@@ -754,138 +755,274 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
754755
}
755756

756757
/*
757-
* mdread() -- Read the specified block from a relation.
758+
* Convert an array of buffer addresses into an array of iovec objects, and
759+
* return the number that were required. 'iov' must have enough space for up
760+
* to 'nblocks' elements, but the number used may be less depending on
761+
* merging. In the case of a run of fully contiguous buffers, a single iovec
762+
* will be populated that can be handled as a plain non-vectored I/O.
758763
*/
759-
void
760-
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
761-
void *buffer)
764+
static int
765+
buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
762766
{
763-
off_t seekpos;
764-
int nbytes;
765-
MdfdVec *v;
767+
struct iovec *iovp;
768+
int iovcnt;
766769

767-
/* If this build supports direct I/O, the buffer must be I/O aligned. */
768-
if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
769-
Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
770+
Assert(nblocks >= 1);
770771

771-
TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
772-
reln->smgr_rlocator.locator.spcOid,
773-
reln->smgr_rlocator.locator.dbOid,
774-
reln->smgr_rlocator.locator.relNumber,
775-
reln->smgr_rlocator.backend);
776-
777-
v = _mdfd_getseg(reln, forknum, blocknum, false,
778-
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
772+
/* If this build supports direct I/O, buffers must be I/O aligned. */
773+
for (int i = 0; i < nblocks; ++i)
774+
{
775+
if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
776+
Assert((uintptr_t) buffers[i] ==
777+
TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[i]));
778+
}
779779

780-
seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
780+
/* Start the first iovec off with the first buffer. */
781+
iovp = &iov[0];
782+
iovp->iov_base = buffers[0];
783+
iovp->iov_len = BLCKSZ;
784+
iovcnt = 1;
781785

782-
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
786+
/* Try to merge the rest. */
787+
for (int i = 1; i < nblocks; ++i)
788+
{
789+
void *buffer = buffers[i];
783790

784-
nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ);
791+
if (((char *) iovp->iov_base + iovp->iov_len) == buffer)
792+
{
793+
/* Contiguous with the last iovec. */
794+
iovp->iov_len += BLCKSZ;
795+
}
796+
else
797+
{
798+
/* Need a new iovec. */
799+
iovp++;
800+
iovp->iov_base = buffer;
801+
iovp->iov_len = BLCKSZ;
802+
iovcnt++;
803+
}
804+
}
785805

786-
TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
787-
reln->smgr_rlocator.locator.spcOid,
788-
reln->smgr_rlocator.locator.dbOid,
789-
reln->smgr_rlocator.locator.relNumber,
790-
reln->smgr_rlocator.backend,
791-
nbytes,
792-
BLCKSZ);
806+
return iovcnt;
807+
}
793808

794-
if (nbytes != BLCKSZ)
809+
/*
810+
* mdreadv() -- Read the specified blocks from a relation.
811+
*/
812+
void
813+
mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
814+
void **buffers, BlockNumber nblocks)
815+
{
816+
while (nblocks > 0)
795817
{
796-
if (nbytes < 0)
797-
ereport(ERROR,
798-
(errcode_for_file_access(),
799-
errmsg("could not read block %u in file \"%s\": %m",
800-
blocknum, FilePathName(v->mdfd_vfd))));
818+
struct iovec iov[PG_IOV_MAX];
819+
int iovcnt;
820+
off_t seekpos;
821+
int nbytes;
822+
MdfdVec *v;
823+
BlockNumber nblocks_this_segment;
824+
size_t transferred_this_segment;
825+
size_t size_this_segment;
826+
827+
v = _mdfd_getseg(reln, forknum, blocknum, false,
828+
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
829+
830+
seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
831+
832+
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
833+
834+
nblocks_this_segment =
835+
Min(nblocks,
836+
RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
837+
nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));
838+
839+
iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment);
840+
size_this_segment = nblocks_this_segment * BLCKSZ;
841+
transferred_this_segment = 0;
801842

802843
/*
803-
* Short read: we are at or past EOF, or we read a partial block at
804-
* EOF. Normally this is an error; upper levels should never try to
805-
* read a nonexistent block. However, if zero_damaged_pages is ON or
806-
* we are InRecovery, we should instead return zeroes without
807-
* complaining. This allows, for example, the case of trying to
808-
* update a block that was later truncated away.
844+
* Inner loop to continue after a short read. We'll keep going until
845+
* we hit EOF rather than assuming that a short read means we hit the
846+
* end.
809847
*/
810-
if (zero_damaged_pages || InRecovery)
811-
MemSet(buffer, 0, BLCKSZ);
812-
else
813-
ereport(ERROR,
814-
(errcode(ERRCODE_DATA_CORRUPTED),
815-
errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
816-
blocknum, FilePathName(v->mdfd_vfd),
817-
nbytes, BLCKSZ)));
848+
for (;;)
849+
{
850+
TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
851+
reln->smgr_rlocator.locator.spcOid,
852+
reln->smgr_rlocator.locator.dbOid,
853+
reln->smgr_rlocator.locator.relNumber,
854+
reln->smgr_rlocator.backend);
855+
nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
856+
WAIT_EVENT_DATA_FILE_READ);
857+
TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
858+
reln->smgr_rlocator.locator.spcOid,
859+
reln->smgr_rlocator.locator.dbOid,
860+
reln->smgr_rlocator.locator.relNumber,
861+
reln->smgr_rlocator.backend,
862+
nbytes,
863+
size_this_segment - transferred_this_segment);
864+
865+
#ifdef SIMULATE_SHORT_READ
866+
nbytes = Min(nbytes, 4096);
867+
#endif
868+
869+
if (nbytes < 0)
870+
ereport(ERROR,
871+
(errcode_for_file_access(),
872+
errmsg("could not read blocks %u..%u in file \"%s\": %m",
873+
blocknum,
874+
blocknum + nblocks_this_segment - 1,
875+
FilePathName(v->mdfd_vfd))));
876+
877+
if (nbytes == 0)
878+
{
879+
/*
880+
* We are at or past EOF, or we read a partial block at EOF.
881+
* Normally this is an error; upper levels should never try to
882+
* read a nonexistent block. However, if zero_damaged_pages
883+
* is ON or we are InRecovery, we should instead return zeroes
884+
* without complaining. This allows, for example, the case of
885+
* trying to update a block that was later truncated away.
886+
*/
887+
if (zero_damaged_pages || InRecovery)
888+
{
889+
for (BlockNumber i = transferred_this_segment / BLCKSZ;
890+
i < nblocks_this_segment;
891+
++i)
892+
memset(buffers[i], 0, BLCKSZ);
893+
break;
894+
}
895+
else
896+
ereport(ERROR,
897+
(errcode(ERRCODE_DATA_CORRUPTED),
898+
errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
899+
blocknum,
900+
blocknum + nblocks_this_segment - 1,
901+
FilePathName(v->mdfd_vfd),
902+
transferred_this_segment,
903+
size_this_segment)));
904+
}
905+
906+
/* One loop should usually be enough. */
907+
transferred_this_segment += nbytes;
908+
Assert(transferred_this_segment <= size_this_segment);
909+
if (transferred_this_segment == size_this_segment)
910+
break;
911+
912+
/* Adjust position and vectors after a short read. */
913+
seekpos += nbytes;
914+
iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
915+
}
916+
917+
nblocks -= nblocks_this_segment;
918+
buffers += nblocks_this_segment;
919+
blocknum += nblocks_this_segment;
818920
}
819921
}
820922

821923
/*
822-
* mdwrite() -- Write the supplied block at the appropriate location.
924+
* mdwritev() -- Write the supplied blocks at the appropriate location.
823925
*
824926
* This is to be used only for updating already-existing blocks of a
825927
* relation (ie, those before the current EOF). To extend a relation,
826928
* use mdextend().
827929
*/
828930
void
829-
mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
830-
const void *buffer, bool skipFsync)
931+
mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
932+
const void **buffers, BlockNumber nblocks, bool skipFsync)
831933
{
832-
off_t seekpos;
833-
int nbytes;
834-
MdfdVec *v;
835-
836-
/* If this build supports direct I/O, the buffer must be I/O aligned. */
837-
if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
838-
Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
839-
840934
/* This assert is too expensive to have on normally ... */
841935
#ifdef CHECK_WRITE_VS_EXTEND
842936
Assert(blocknum < mdnblocks(reln, forknum));
843937
#endif
844938

845-
TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
846-
reln->smgr_rlocator.locator.spcOid,
847-
reln->smgr_rlocator.locator.dbOid,
848-
reln->smgr_rlocator.locator.relNumber,
849-
reln->smgr_rlocator.backend);
939+
while (nblocks > 0)
940+
{
941+
struct iovec iov[PG_IOV_MAX];
942+
int iovcnt;
943+
off_t seekpos;
944+
int nbytes;
945+
MdfdVec *v;
946+
BlockNumber nblocks_this_segment;
947+
size_t transferred_this_segment;
948+
size_t size_this_segment;
850949

851-
v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
852-
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
950+
v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
951+
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
853952

854-
seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
953+
seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
855954

856-
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
955+
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
857956

858-
nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE);
957+
nblocks_this_segment =
958+
Min(nblocks,
959+
RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
960+
nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));
859961

860-
TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
861-
reln->smgr_rlocator.locator.spcOid,
862-
reln->smgr_rlocator.locator.dbOid,
863-
reln->smgr_rlocator.locator.relNumber,
864-
reln->smgr_rlocator.backend,
865-
nbytes,
866-
BLCKSZ);
962+
iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
963+
size_this_segment = nblocks_this_segment * BLCKSZ;
964+
transferred_this_segment = 0;
867965

868-
if (nbytes != BLCKSZ)
869-
{
870-
if (nbytes < 0)
871-
ereport(ERROR,
872-
(errcode_for_file_access(),
873-
errmsg("could not write block %u in file \"%s\": %m",
874-
blocknum, FilePathName(v->mdfd_vfd))));
875-
/* short write: complain appropriately */
876-
ereport(ERROR,
877-
(errcode(ERRCODE_DISK_FULL),
878-
errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
879-
blocknum,
880-
FilePathName(v->mdfd_vfd),
881-
nbytes, BLCKSZ),
882-
errhint("Check free disk space.")));
883-
}
966+
/*
967+
* Inner loop to continue after a short write. If the reason is that
968+
* we're out of disk space, a future attempt should get an ENOSPC
969+
* error from the kernel.
970+
*/
971+
for (;;)
972+
{
973+
TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
974+
reln->smgr_rlocator.locator.spcOid,
975+
reln->smgr_rlocator.locator.dbOid,
976+
reln->smgr_rlocator.locator.relNumber,
977+
reln->smgr_rlocator.backend);
978+
nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos,
979+
WAIT_EVENT_DATA_FILE_WRITE);
980+
TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
981+
reln->smgr_rlocator.locator.spcOid,
982+
reln->smgr_rlocator.locator.dbOid,
983+
reln->smgr_rlocator.locator.relNumber,
984+
reln->smgr_rlocator.backend,
985+
nbytes,
986+
size_this_segment - transferred_this_segment);
987+
988+
#ifdef SIMULATE_SHORT_WRITE
989+
nbytes = Min(nbytes, 4096);
990+
#endif
884991

885-
if (!skipFsync && !SmgrIsTemp(reln))
886-
register_dirty_segment(reln, forknum, v);
992+
if (nbytes < 0)
993+
{
994+
bool enospc = errno == ENOSPC;
995+
996+
ereport(ERROR,
997+
(errcode_for_file_access(),
998+
errmsg("could not write blocks %u..%u in file \"%s\": %m",
999+
blocknum,
1000+
blocknum + nblocks_this_segment - 1,
1001+
FilePathName(v->mdfd_vfd)),
1002+
enospc ? errhint("Check free disk space.") : 0));
1003+
}
1004+
1005+
/* One loop should usually be enough. */
1006+
transferred_this_segment += nbytes;
1007+
Assert(transferred_this_segment <= size_this_segment);
1008+
if (transferred_this_segment == size_this_segment)
1009+
break;
1010+
1011+
/* Adjust position and iovecs after a short write. */
1012+
seekpos += nbytes;
1013+
iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
1014+
}
1015+
1016+
if (!skipFsync && !SmgrIsTemp(reln))
1017+
register_dirty_segment(reln, forknum, v);
1018+
1019+
nblocks -= nblocks_this_segment;
1020+
buffers += nblocks_this_segment;
1021+
blocknum += nblocks_this_segment;
1022+
}
8871023
}
8881024

1025+
8891026
/*
8901027
* mdwriteback() -- Tell the kernel to write pages back to storage.
8911028
*

0 commit comments

Comments
 (0)