|
28 | 28 | #include "access/xlog.h"
|
29 | 29 | #include "access/xlogutils.h"
|
30 | 30 | #include "commands/tablespace.h"
|
| 31 | +#include "common/file_utils.h" |
31 | 32 | #include "miscadmin.h"
|
32 | 33 | #include "pg_trace.h"
|
33 | 34 | #include "pgstat.h"
|
@@ -754,138 +755,274 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
754 | 755 | }
|
755 | 756 |
|
756 | 757 | /*
|
757 |
| - * mdread() -- Read the specified block from a relation. |
| 758 | + * Convert an array of buffer addresses into an array of iovec objects, and |
| 759 | + * return the number that were required. 'iov' must have enough space for up |
| 760 | + * to 'nblocks' elements, but the number used may be less depending on |
| 761 | + * merging. In the case of a run of fully contiguous buffers, a single iovec |
| 762 | + * will be populated that can be handled as a plain non-vectored I/O. |
758 | 763 | */
|
759 |
| -void |
760 |
| -mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, |
761 |
| - void *buffer) |
| 764 | +static int |
| 765 | +buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks) |
762 | 766 | {
|
763 |
| - off_t seekpos; |
764 |
| - int nbytes; |
765 |
| - MdfdVec *v; |
| 767 | + struct iovec *iovp; |
| 768 | + int iovcnt; |
766 | 769 |
|
767 |
| - /* If this build supports direct I/O, the buffer must be I/O aligned. */ |
768 |
| - if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) |
769 |
| - Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); |
| 770 | + Assert(nblocks >= 1); |
770 | 771 |
|
771 |
| - TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, |
772 |
| - reln->smgr_rlocator.locator.spcOid, |
773 |
| - reln->smgr_rlocator.locator.dbOid, |
774 |
| - reln->smgr_rlocator.locator.relNumber, |
775 |
| - reln->smgr_rlocator.backend); |
776 |
| - |
777 |
| - v = _mdfd_getseg(reln, forknum, blocknum, false, |
778 |
| - EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); |
| 772 | + /* If this build supports direct I/O, buffers must be I/O aligned. */ |
| 773 | + for (int i = 0; i < nblocks; ++i) |
| 774 | + { |
| 775 | + if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) |
| 776 | + Assert((uintptr_t) buffers[i] == |
| 777 | + TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[i])); |
| 778 | + } |
779 | 779 |
|
780 |
| - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); |
| 780 | + /* Start the first iovec off with the first buffer. */ |
| 781 | + iovp = &iov[0]; |
| 782 | + iovp->iov_base = buffers[0]; |
| 783 | + iovp->iov_len = BLCKSZ; |
| 784 | + iovcnt = 1; |
781 | 785 |
|
782 |
| - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); |
| 786 | + /* Try to merge the rest. */ |
| 787 | + for (int i = 1; i < nblocks; ++i) |
| 788 | + { |
| 789 | + void *buffer = buffers[i]; |
783 | 790 |
|
784 |
| - nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ); |
| 791 | + if (((char *) iovp->iov_base + iovp->iov_len) == buffer) |
| 792 | + { |
| 793 | + /* Contiguous with the last iovec. */ |
| 794 | + iovp->iov_len += BLCKSZ; |
| 795 | + } |
| 796 | + else |
| 797 | + { |
| 798 | + /* Need a new iovec. */ |
| 799 | + iovp++; |
| 800 | + iovp->iov_base = buffer; |
| 801 | + iovp->iov_len = BLCKSZ; |
| 802 | + iovcnt++; |
| 803 | + } |
| 804 | + } |
785 | 805 |
|
786 |
| - TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, |
787 |
| - reln->smgr_rlocator.locator.spcOid, |
788 |
| - reln->smgr_rlocator.locator.dbOid, |
789 |
| - reln->smgr_rlocator.locator.relNumber, |
790 |
| - reln->smgr_rlocator.backend, |
791 |
| - nbytes, |
792 |
| - BLCKSZ); |
| 806 | + return iovcnt; |
| 807 | +} |
793 | 808 |
|
794 |
| - if (nbytes != BLCKSZ) |
| 809 | +/* |
| 810 | + * mdreadv() -- Read the specified blocks from a relation. |
| 811 | + */ |
| 812 | +void |
| 813 | +mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, |
| 814 | + void **buffers, BlockNumber nblocks) |
| 815 | +{ |
| 816 | + while (nblocks > 0) |
795 | 817 | {
|
796 |
| - if (nbytes < 0) |
797 |
| - ereport(ERROR, |
798 |
| - (errcode_for_file_access(), |
799 |
| - errmsg("could not read block %u in file \"%s\": %m", |
800 |
| - blocknum, FilePathName(v->mdfd_vfd)))); |
| 818 | + struct iovec iov[PG_IOV_MAX]; |
| 819 | + int iovcnt; |
| 820 | + off_t seekpos; |
| 821 | + int nbytes; |
| 822 | + MdfdVec *v; |
| 823 | + BlockNumber nblocks_this_segment; |
| 824 | + size_t transferred_this_segment; |
| 825 | + size_t size_this_segment; |
| 826 | + |
| 827 | + v = _mdfd_getseg(reln, forknum, blocknum, false, |
| 828 | + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); |
| 829 | + |
| 830 | + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); |
| 831 | + |
| 832 | + Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); |
| 833 | + |
| 834 | + nblocks_this_segment = |
| 835 | + Min(nblocks, |
| 836 | + RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE))); |
| 837 | + nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov)); |
| 838 | + |
| 839 | + iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment); |
| 840 | + size_this_segment = nblocks_this_segment * BLCKSZ; |
| 841 | + transferred_this_segment = 0; |
801 | 842 |
|
802 | 843 | /*
|
803 |
| - * Short read: we are at or past EOF, or we read a partial block at |
804 |
| - * EOF. Normally this is an error; upper levels should never try to |
805 |
| - * read a nonexistent block. However, if zero_damaged_pages is ON or |
806 |
| - * we are InRecovery, we should instead return zeroes without |
807 |
| - * complaining. This allows, for example, the case of trying to |
808 |
| - * update a block that was later truncated away. |
| 844 | + * Inner loop to continue after a short read. We'll keep going until |
| 845 | + * we hit EOF rather than assuming that a short read means we hit the |
| 846 | + * end. |
809 | 847 | */
|
810 |
| - if (zero_damaged_pages || InRecovery) |
811 |
| - MemSet(buffer, 0, BLCKSZ); |
812 |
| - else |
813 |
| - ereport(ERROR, |
814 |
| - (errcode(ERRCODE_DATA_CORRUPTED), |
815 |
| - errmsg("could not read block %u in file \"%s\": read only %d of %d bytes", |
816 |
| - blocknum, FilePathName(v->mdfd_vfd), |
817 |
| - nbytes, BLCKSZ))); |
| 848 | + for (;;) |
| 849 | + { |
| 850 | + TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, |
| 851 | + reln->smgr_rlocator.locator.spcOid, |
| 852 | + reln->smgr_rlocator.locator.dbOid, |
| 853 | + reln->smgr_rlocator.locator.relNumber, |
| 854 | + reln->smgr_rlocator.backend); |
| 855 | + nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos, |
| 856 | + WAIT_EVENT_DATA_FILE_READ); |
| 857 | + TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, |
| 858 | + reln->smgr_rlocator.locator.spcOid, |
| 859 | + reln->smgr_rlocator.locator.dbOid, |
| 860 | + reln->smgr_rlocator.locator.relNumber, |
| 861 | + reln->smgr_rlocator.backend, |
| 862 | + nbytes, |
| 863 | + size_this_segment - transferred_this_segment); |
| 864 | + |
| 865 | +#ifdef SIMULATE_SHORT_READ |
| 866 | + nbytes = Min(nbytes, 4096); |
| 867 | +#endif |
| 868 | + |
| 869 | + if (nbytes < 0) |
| 870 | + ereport(ERROR, |
| 871 | + (errcode_for_file_access(), |
| 872 | + errmsg("could not read blocks %u..%u in file \"%s\": %m", |
| 873 | + blocknum, |
| 874 | + blocknum + nblocks_this_segment - 1, |
| 875 | + FilePathName(v->mdfd_vfd)))); |
| 876 | + |
| 877 | + if (nbytes == 0) |
| 878 | + { |
| 879 | + /* |
| 880 | + * We are at or past EOF, or we read a partial block at EOF. |
| 881 | + * Normally this is an error; upper levels should never try to |
| 882 | + * read a nonexistent block. However, if zero_damaged_pages |
| 883 | + * is ON or we are InRecovery, we should instead return zeroes |
| 884 | + * without complaining. This allows, for example, the case of |
| 885 | + * trying to update a block that was later truncated away. |
| 886 | + */ |
| 887 | + if (zero_damaged_pages || InRecovery) |
| 888 | + { |
| 889 | + for (BlockNumber i = transferred_this_segment / BLCKSZ; |
| 890 | + i < nblocks_this_segment; |
| 891 | + ++i) |
| 892 | + memset(buffers[i], 0, BLCKSZ); |
| 893 | + break; |
| 894 | + } |
| 895 | + else |
| 896 | + ereport(ERROR, |
| 897 | + (errcode(ERRCODE_DATA_CORRUPTED), |
| 898 | + errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes", |
| 899 | + blocknum, |
| 900 | + blocknum + nblocks_this_segment - 1, |
| 901 | + FilePathName(v->mdfd_vfd), |
| 902 | + transferred_this_segment, |
| 903 | + size_this_segment))); |
| 904 | + } |
| 905 | + |
| 906 | + /* One loop should usually be enough. */ |
| 907 | + transferred_this_segment += nbytes; |
| 908 | + Assert(transferred_this_segment <= size_this_segment); |
| 909 | + if (transferred_this_segment == size_this_segment) |
| 910 | + break; |
| 911 | + |
| 912 | + /* Adjust position and vectors after a short read. */ |
| 913 | + seekpos += nbytes; |
| 914 | + iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes); |
| 915 | + } |
| 916 | + |
| 917 | + nblocks -= nblocks_this_segment; |
| 918 | + buffers += nblocks_this_segment; |
| 919 | + blocknum += nblocks_this_segment; |
818 | 920 | }
|
819 | 921 | }
|
820 | 922 |
|
821 | 923 | /*
|
822 |
| - * mdwrite() -- Write the supplied block at the appropriate location. |
| 924 | + * mdwritev() -- Write the supplied blocks at the appropriate location. |
823 | 925 | *
|
824 | 926 | * This is to be used only for updating already-existing blocks of a
|
825 | 927 | * relation (ie, those before the current EOF). To extend a relation,
|
826 | 928 | * use mdextend().
|
827 | 929 | */
|
828 | 930 | void
|
829 |
| -mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, |
830 |
| - const void *buffer, bool skipFsync) |
| 931 | +mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, |
| 932 | + const void **buffers, BlockNumber nblocks, bool skipFsync) |
831 | 933 | {
|
832 |
| - off_t seekpos; |
833 |
| - int nbytes; |
834 |
| - MdfdVec *v; |
835 |
| - |
836 |
| - /* If this build supports direct I/O, the buffer must be I/O aligned. */ |
837 |
| - if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) |
838 |
| - Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); |
839 |
| - |
840 | 934 | /* This assert is too expensive to have on normally ... */
|
841 | 935 | #ifdef CHECK_WRITE_VS_EXTEND
|
842 | 936 | Assert(blocknum < mdnblocks(reln, forknum));
|
843 | 937 | #endif
|
844 | 938 |
|
845 |
| - TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum, |
846 |
| - reln->smgr_rlocator.locator.spcOid, |
847 |
| - reln->smgr_rlocator.locator.dbOid, |
848 |
| - reln->smgr_rlocator.locator.relNumber, |
849 |
| - reln->smgr_rlocator.backend); |
| 939 | + while (nblocks > 0) |
| 940 | + { |
| 941 | + struct iovec iov[PG_IOV_MAX]; |
| 942 | + int iovcnt; |
| 943 | + off_t seekpos; |
| 944 | + int nbytes; |
| 945 | + MdfdVec *v; |
| 946 | + BlockNumber nblocks_this_segment; |
| 947 | + size_t transferred_this_segment; |
| 948 | + size_t size_this_segment; |
850 | 949 |
|
851 |
| - v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, |
852 |
| - EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); |
| 950 | + v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, |
| 951 | + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); |
853 | 952 |
|
854 |
| - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); |
| 953 | + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); |
855 | 954 |
|
856 |
| - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); |
| 955 | + Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); |
857 | 956 |
|
858 |
| - nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE); |
| 957 | + nblocks_this_segment = |
| 958 | + Min(nblocks, |
| 959 | + RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE))); |
| 960 | + nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov)); |
859 | 961 |
|
860 |
| - TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, |
861 |
| - reln->smgr_rlocator.locator.spcOid, |
862 |
| - reln->smgr_rlocator.locator.dbOid, |
863 |
| - reln->smgr_rlocator.locator.relNumber, |
864 |
| - reln->smgr_rlocator.backend, |
865 |
| - nbytes, |
866 |
| - BLCKSZ); |
| 962 | + iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment); |
| 963 | + size_this_segment = nblocks_this_segment * BLCKSZ; |
| 964 | + transferred_this_segment = 0; |
867 | 965 |
|
868 |
| - if (nbytes != BLCKSZ) |
869 |
| - { |
870 |
| - if (nbytes < 0) |
871 |
| - ereport(ERROR, |
872 |
| - (errcode_for_file_access(), |
873 |
| - errmsg("could not write block %u in file \"%s\": %m", |
874 |
| - blocknum, FilePathName(v->mdfd_vfd)))); |
875 |
| - /* short write: complain appropriately */ |
876 |
| - ereport(ERROR, |
877 |
| - (errcode(ERRCODE_DISK_FULL), |
878 |
| - errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes", |
879 |
| - blocknum, |
880 |
| - FilePathName(v->mdfd_vfd), |
881 |
| - nbytes, BLCKSZ), |
882 |
| - errhint("Check free disk space."))); |
883 |
| - } |
| 966 | + /* |
| 967 | + * Inner loop to continue after a short write. If the reason is that |
| 968 | + * we're out of disk space, a future attempt should get an ENOSPC |
| 969 | + * error from the kernel. |
| 970 | + */ |
| 971 | + for (;;) |
| 972 | + { |
| 973 | + TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum, |
| 974 | + reln->smgr_rlocator.locator.spcOid, |
| 975 | + reln->smgr_rlocator.locator.dbOid, |
| 976 | + reln->smgr_rlocator.locator.relNumber, |
| 977 | + reln->smgr_rlocator.backend); |
| 978 | + nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos, |
| 979 | + WAIT_EVENT_DATA_FILE_WRITE); |
| 980 | + TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, |
| 981 | + reln->smgr_rlocator.locator.spcOid, |
| 982 | + reln->smgr_rlocator.locator.dbOid, |
| 983 | + reln->smgr_rlocator.locator.relNumber, |
| 984 | + reln->smgr_rlocator.backend, |
| 985 | + nbytes, |
| 986 | + size_this_segment - transferred_this_segment); |
| 987 | + |
| 988 | +#ifdef SIMULATE_SHORT_WRITE |
| 989 | + nbytes = Min(nbytes, 4096); |
| 990 | +#endif |
884 | 991 |
|
885 |
| - if (!skipFsync && !SmgrIsTemp(reln)) |
886 |
| - register_dirty_segment(reln, forknum, v); |
| 992 | + if (nbytes < 0) |
| 993 | + { |
| 994 | + bool enospc = errno == ENOSPC; |
| 995 | + |
| 996 | + ereport(ERROR, |
| 997 | + (errcode_for_file_access(), |
| 998 | + errmsg("could not write blocks %u..%u in file \"%s\": %m", |
| 999 | + blocknum, |
| 1000 | + blocknum + nblocks_this_segment - 1, |
| 1001 | + FilePathName(v->mdfd_vfd)), |
| 1002 | + enospc ? errhint("Check free disk space.") : 0)); |
| 1003 | + } |
| 1004 | + |
| 1005 | + /* One loop should usually be enough. */ |
| 1006 | + transferred_this_segment += nbytes; |
| 1007 | + Assert(transferred_this_segment <= size_this_segment); |
| 1008 | + if (transferred_this_segment == size_this_segment) |
| 1009 | + break; |
| 1010 | + |
| 1011 | + /* Adjust position and iovecs after a short write. */ |
| 1012 | + seekpos += nbytes; |
| 1013 | + iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes); |
| 1014 | + } |
| 1015 | + |
| 1016 | + if (!skipFsync && !SmgrIsTemp(reln)) |
| 1017 | + register_dirty_segment(reln, forknum, v); |
| 1018 | + |
| 1019 | + nblocks -= nblocks_this_segment; |
| 1020 | + buffers += nblocks_this_segment; |
| 1021 | + blocknum += nblocks_this_segment; |
| 1022 | + } |
887 | 1023 | }
|
888 | 1024 |
|
| 1025 | + |
889 | 1026 | /*
|
890 | 1027 | * mdwriteback() -- Tell the kernel to write pages back to storage.
|
891 | 1028 | *
|
|
0 commit comments