|
31 | 31 | #include "miscadmin.h"
|
32 | 32 | #include "pg_trace.h"
|
33 | 33 | #include "pgstat.h"
|
| 34 | +#include "storage/aio.h" |
34 | 35 | #include "storage/bufmgr.h"
|
35 | 36 | #include "storage/fd.h"
|
36 | 37 | #include "storage/md.h"
|
@@ -152,6 +153,15 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum,
|
152 | 153 | static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
|
153 | 154 | MdfdVec *seg);
|
154 | 155 |
|
| 156 | +static PgAioResult md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data); |
| 157 | +static void md_readv_report(PgAioResult result, const PgAioTargetData *target_data, int elevel); |
| 158 | + |
| 159 | +const PgAioHandleCallbacks aio_md_readv_cb = { |
| 160 | + .complete_shared = md_readv_complete, |
| 161 | + .report = md_readv_report, |
| 162 | +}; |
| 163 | + |
| 164 | + |
155 | 165 | static inline int
|
156 | 166 | _mdfd_open_flags(void)
|
157 | 167 | {
|
@@ -937,6 +947,69 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
937 | 947 | }
|
938 | 948 | }
|
939 | 949 |
|
| 950 | +/* |
| 951 | + * mdstartreadv() -- Asynchronous version of mdreadv(). |
| 952 | + */ |
| 953 | +void |
| 954 | +mdstartreadv(PgAioHandle *ioh, |
| 955 | + SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, |
| 956 | + void **buffers, BlockNumber nblocks) |
| 957 | +{ |
| 958 | + off_t seekpos; |
| 959 | + MdfdVec *v; |
| 960 | + BlockNumber nblocks_this_segment; |
| 961 | + struct iovec *iov; |
| 962 | + int iovcnt; |
| 963 | + int ret; |
| 964 | + |
| 965 | + v = _mdfd_getseg(reln, forknum, blocknum, false, |
| 966 | + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); |
| 967 | + |
| 968 | + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); |
| 969 | + |
| 970 | + Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); |
| 971 | + |
| 972 | + nblocks_this_segment = |
| 973 | + Min(nblocks, |
| 974 | + RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE))); |
| 975 | + |
| 976 | + if (nblocks_this_segment != nblocks) |
| 977 | + elog(ERROR, "read crossing segment boundary"); |
| 978 | + |
| 979 | + iovcnt = pgaio_io_get_iovec(ioh, &iov); |
| 980 | + |
| 981 | + Assert(nblocks <= iovcnt); |
| 982 | + |
| 983 | + iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment); |
| 984 | + |
| 985 | + Assert(iovcnt <= nblocks_this_segment); |
| 986 | + |
| 987 | + if (!(io_direct_flags & IO_DIRECT_DATA)) |
| 988 | + pgaio_io_set_flag(ioh, PGAIO_HF_BUFFERED); |
| 989 | + |
| 990 | + pgaio_io_set_target_smgr(ioh, |
| 991 | + reln, |
| 992 | + forknum, |
| 993 | + blocknum, |
| 994 | + nblocks, |
| 995 | + false); |
| 996 | + pgaio_io_register_callbacks(ioh, PGAIO_HCB_MD_READV, 0); |
| 997 | + |
| 998 | + ret = FileStartReadV(ioh, v->mdfd_vfd, iovcnt, seekpos, WAIT_EVENT_DATA_FILE_READ); |
| 999 | + if (ret != 0) |
| 1000 | + ereport(ERROR, |
| 1001 | + (errcode_for_file_access(), |
| 1002 | + errmsg("could not start reading blocks %u..%u in file \"%s\": %m", |
| 1003 | + blocknum, |
| 1004 | + blocknum + nblocks_this_segment - 1, |
| 1005 | + FilePathName(v->mdfd_vfd)))); |
| 1006 | + |
| 1007 | + /* |
| 1008 | + * The error checks corresponding to the post-read checks in mdreadv() are |
| 1009 | + * in md_readv_complete(). |
| 1010 | + */ |
| 1011 | +} |
| 1012 | + |
940 | 1013 | /*
|
941 | 1014 | * mdwritev() -- Write the supplied blocks at the appropriate location.
|
942 | 1015 | *
|
@@ -1365,6 +1438,21 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
|
1365 | 1438 | }
|
1366 | 1439 | }
|
1367 | 1440 |
|
| 1441 | +int |
| 1442 | +mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off) |
| 1443 | +{ |
| 1444 | + MdfdVec *v = mdopenfork(reln, forknum, EXTENSION_FAIL); |
| 1445 | + |
| 1446 | + v = _mdfd_getseg(reln, forknum, blocknum, false, |
| 1447 | + EXTENSION_FAIL); |
| 1448 | + |
| 1449 | + *off = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); |
| 1450 | + |
| 1451 | + Assert(*off < (off_t) BLCKSZ * RELSEG_SIZE); |
| 1452 | + |
| 1453 | + return FileGetRawDesc(v->mdfd_vfd); |
| 1454 | +} |
| 1455 | + |
1368 | 1456 | /*
|
1369 | 1457 | * register_dirty_segment() -- Mark a relation segment as needing fsync
|
1370 | 1458 | *
|
@@ -1841,3 +1929,111 @@ mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
|
1841 | 1929 | */
|
1842 | 1930 | return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
|
1843 | 1931 | }
|
| 1932 | + |
| 1933 | +/* |
| 1934 | + * AIO completion callback for mdstartreadv(). |
| 1935 | + */ |
| 1936 | +static PgAioResult |
| 1937 | +md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data) |
| 1938 | +{ |
| 1939 | + PgAioTargetData *td = pgaio_io_get_target_data(ioh); |
| 1940 | + PgAioResult result = prior_result; |
| 1941 | + |
| 1942 | + if (prior_result.result < 0) |
| 1943 | + { |
| 1944 | + result.status = PGAIO_RS_ERROR; |
| 1945 | + result.id = PGAIO_HCB_MD_READV; |
| 1946 | + /* For "hard" errors, track the error number in error_data */ |
| 1947 | + result.error_data = -prior_result.result; |
| 1948 | + result.result = 0; |
| 1949 | + |
| 1950 | + /* |
| 1951 | + * Immediately log a message about the IO error, but only to the |
| 1952 | + * server log. The reason to do so immediately is that the originator |
| 1953 | + * might not process the query result immediately (because it is busy |
| 1954 | + * doing another part of query processing) or at all (e.g. if it was |
| 1955 | + * cancelled or errored out due to another IO also failing). The |
| 1956 | + * issuer of the IO will emit an ERROR when processing the IO's |
| 1957 | + * results |
| 1958 | + */ |
| 1959 | + pgaio_result_report(result, td, LOG_SERVER_ONLY); |
| 1960 | + |
| 1961 | + return result; |
| 1962 | + } |
| 1963 | + |
| 1964 | + /* |
| 1965 | + * As explained above smgrstartreadv(), the smgr API operates on the level |
| 1966 | + * of blocks, rather than bytes. Convert. |
| 1967 | + */ |
| 1968 | + result.result /= BLCKSZ; |
| 1969 | + |
| 1970 | + Assert(result.result <= td->smgr.nblocks); |
| 1971 | + |
| 1972 | + if (result.result == 0) |
| 1973 | + { |
| 1974 | + /* consider 0 blocks read a failure */ |
| 1975 | + result.status = PGAIO_RS_ERROR; |
| 1976 | + result.id = PGAIO_HCB_MD_READV; |
| 1977 | + result.error_data = 0; |
| 1978 | + |
| 1979 | + /* see comment above the "hard error" case */ |
| 1980 | + pgaio_result_report(result, td, LOG_SERVER_ONLY); |
| 1981 | + |
| 1982 | + return result; |
| 1983 | + } |
| 1984 | + |
| 1985 | + if (result.status != PGAIO_RS_ERROR && |
| 1986 | + result.result < td->smgr.nblocks) |
| 1987 | + { |
| 1988 | + /* partial reads should be retried at upper level */ |
| 1989 | + result.status = PGAIO_RS_PARTIAL; |
| 1990 | + result.id = PGAIO_HCB_MD_READV; |
| 1991 | + } |
| 1992 | + |
| 1993 | + return result; |
| 1994 | +} |
| 1995 | + |
| 1996 | +/* |
| 1997 | + * AIO error reporting callback for mdstartreadv(). |
| 1998 | + * |
| 1999 | + * Errors are encoded as follows: |
| 2000 | + * - PgAioResult.error_data != 0 encodes IO that failed with that errno |
| 2001 | + * - PgAioResult.error_data == 0 encodes IO that didn't read all data |
| 2002 | + */ |
| 2003 | +static void |
| 2004 | +md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel) |
| 2005 | +{ |
| 2006 | + RelPathStr path; |
| 2007 | + |
| 2008 | + path = relpathbackend(td->smgr.rlocator, |
| 2009 | + td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER, |
| 2010 | + td->smgr.forkNum); |
| 2011 | + |
| 2012 | + if (result.error_data != 0) |
| 2013 | + { |
| 2014 | + /* for errcode_for_file_access() and %m */ |
| 2015 | + errno = result.error_data; |
| 2016 | + |
| 2017 | + ereport(elevel, |
| 2018 | + errcode_for_file_access(), |
| 2019 | + errmsg("could not read blocks %u..%u in file \"%s\": %m", |
| 2020 | + td->smgr.blockNum, |
| 2021 | + td->smgr.blockNum + td->smgr.nblocks - 1, |
| 2022 | + path.str)); |
| 2023 | + } |
| 2024 | + else |
| 2025 | + { |
| 2026 | + /* |
| 2027 | + * NB: This will typically only be output in debug messages, while |
| 2028 | + * retrying a partial IO. |
| 2029 | + */ |
| 2030 | + ereport(elevel, |
| 2031 | + errcode(ERRCODE_DATA_CORRUPTED), |
| 2032 | + errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes", |
| 2033 | + td->smgr.blockNum, |
| 2034 | + td->smgr.blockNum + td->smgr.nblocks - 1, |
| 2035 | + path.str, |
| 2036 | + result.result * (size_t) BLCKSZ, |
| 2037 | + td->smgr.nblocks * (size_t) BLCKSZ)); |
| 2038 | + } |
| 2039 | +} |
0 commit comments