Skip to content

Commit c2ba3fd

Browse files
committed
In basebackup.c, refactor to create read_file_data_into_buffer.
This further reduces the length and complexity of sendFile(), hopefully make it easier to understand and modify. In addition to moving some logic into a new function, I took this opportunity to make a few slight adjustments to sendFile() itself, including renaming the 'len' variable to 'bytes_done', since we use it to represent the number of bytes we've already handled so far, not the total length of the file. Patch by me, reviewed by David Steele. Discussion: http://postgr.es/m/CA+TgmoYt5jXH4U6cu1dm9Oe2FTn1aae6hBNhZzJJjyjbE_zYig@mail.gmail.com
1 parent 0531831 commit c2ba3fd

File tree

1 file changed

+133
-98
lines changed

1 file changed

+133
-98
lines changed

src/backend/backup/basebackup.c

Lines changed: 133 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,12 @@ static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeo
8383
static bool sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
8484
struct stat *statbuf, bool missing_ok, Oid dboid,
8585
backup_manifest_info *manifest, const char *spcoid);
86+
static off_t read_file_data_into_buffer(bbsink *sink,
87+
const char *readfilename, int fd,
88+
off_t offset, size_t length,
89+
BlockNumber blkno,
90+
bool verify_checksum,
91+
int *checksum_failures);
8692
static bool verify_page_checksum(Page page, XLogRecPtr start_lsn,
8793
BlockNumber blkno,
8894
uint16 *expected_checksum);
@@ -1490,9 +1496,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
14901496
BlockNumber blkno = 0;
14911497
int checksum_failures = 0;
14921498
off_t cnt;
1493-
int i;
1494-
pgoff_t len = 0;
1495-
char *page;
1499+
pgoff_t bytes_done = 0;
14961500
int segmentno = 0;
14971501
char *segmentpath;
14981502
bool verify_checksum = false;
@@ -1514,6 +1518,12 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
15141518

15151519
_tarWriteHeader(sink, tarfilename, NULL, statbuf, false);
15161520

1521+
/*
1522+
* Checksums are verified in multiples of BLCKSZ, so the buffer length
1523+
* should be a multiple of the block size as well.
1524+
*/
1525+
Assert((sink->bbs_buffer_length % BLCKSZ) == 0);
1526+
15171527
if (!noverify_checksums && DataChecksumsEnabled())
15181528
{
15191529
char *filename;
@@ -1551,23 +1561,21 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
15511561
* for a base backup we can ignore such extended data. It will be restored
15521562
* from WAL.
15531563
*/
1554-
while (len < statbuf->st_size)
1564+
while (bytes_done < statbuf->st_size)
15551565
{
1556-
size_t remaining = statbuf->st_size - len;
1566+
size_t remaining = statbuf->st_size - bytes_done;
15571567

15581568
/* Try to read some more data. */
1559-
cnt = basebackup_read_file(fd, sink->bbs_buffer,
1560-
Min(sink->bbs_buffer_length, remaining),
1561-
len, readfilename, true);
1569+
cnt = read_file_data_into_buffer(sink, readfilename, fd, bytes_done,
1570+
remaining,
1571+
blkno + segmentno * RELSEG_SIZE,
1572+
verify_checksum,
1573+
&checksum_failures);
15621574

15631575
/*
1564-
* The checksums are verified at block level, so we iterate over the
1565-
* buffer in chunks of BLCKSZ, after making sure that
1566-
* TAR_SEND_SIZE/buf is divisible by BLCKSZ and we read a multiple of
1567-
* BLCKSZ bytes.
1576+
* If the amount of data we were able to read was not a multiple of
1577+
* BLCKSZ, we cannot verify checksums, which are block-level.
15681578
*/
1569-
Assert((sink->bbs_buffer_length % BLCKSZ) == 0);
1570-
15711579
if (verify_checksum && (cnt % BLCKSZ != 0))
15721580
{
15731581
ereport(WARNING,
@@ -1578,84 +1586,6 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
15781586
verify_checksum = false;
15791587
}
15801588

1581-
if (verify_checksum)
1582-
{
1583-
for (i = 0; i < cnt / BLCKSZ; i++)
1584-
{
1585-
int reread_cnt;
1586-
uint16 expected_checksum;
1587-
1588-
page = sink->bbs_buffer + BLCKSZ * i;
1589-
1590-
/* If the page is OK, go on to the next one. */
1591-
if (verify_page_checksum(page, sink->bbs_state->startptr,
1592-
blkno + i + segmentno * RELSEG_SIZE,
1593-
&expected_checksum))
1594-
continue;
1595-
1596-
/*
1597-
* Retry the block on the first failure. It's possible that
1598-
* we read the first 4K page of the block just before postgres
1599-
* updated the entire block so it ends up looking torn to us.
1600-
* If, before we retry the read, the concurrent write of the
1601-
* block finishes, the page LSN will be updated and we'll
1602-
* realize that we should ignore this block.
1603-
*
1604-
* There's no guarantee that this will actually happen,
1605-
* though: the torn write could take an arbitrarily long time
1606-
* to complete. Retrying multiple times wouldn't fix this
1607-
* problem, either, though it would reduce the chances of it
1608-
* happening in practice. The only real fix here seems to be
1609-
* to have some kind of interlock that allows us to wait until
1610-
* we can be certain that no write to the block is in
1611-
* progress. Since we don't have any such thing right now, we
1612-
* just do this and hope for the best.
1613-
*/
1614-
reread_cnt =
1615-
basebackup_read_file(fd,
1616-
sink->bbs_buffer + BLCKSZ * i,
1617-
BLCKSZ, len + BLCKSZ * i,
1618-
readfilename,
1619-
false);
1620-
if (reread_cnt == 0)
1621-
{
1622-
/*
1623-
* If we hit end-of-file, a concurrent truncation must
1624-
* have occurred, so break out of this loop just as if the
1625-
* initial fread() returned 0. We'll drop through to the
1626-
* same code that handles that case. (We must fix up cnt
1627-
* first, though.)
1628-
*/
1629-
cnt = BLCKSZ * i;
1630-
break;
1631-
}
1632-
1633-
/* If the page now looks OK, go on to the next one. */
1634-
if (verify_page_checksum(page, sink->bbs_state->startptr,
1635-
blkno + i + segmentno * RELSEG_SIZE,
1636-
&expected_checksum))
1637-
continue;
1638-
1639-
/* Handle checksum failure. */
1640-
checksum_failures++;
1641-
if (checksum_failures <= 5)
1642-
ereport(WARNING,
1643-
(errmsg("checksum verification failed in "
1644-
"file \"%s\", block %u: calculated "
1645-
"%X but expected %X",
1646-
readfilename, blkno + i, expected_checksum,
1647-
((PageHeader) page)->pd_checksum)));
1648-
if (checksum_failures == 5)
1649-
ereport(WARNING,
1650-
(errmsg("further checksum verification "
1651-
"failures in file \"%s\" will not "
1652-
"be reported", readfilename)));
1653-
}
1654-
1655-
/* Update block number for next pass through the outer loop. */
1656-
blkno += i;
1657-
}
1658-
16591589
/*
16601590
* If we hit end-of-file, a concurrent truncation must have occurred.
16611591
* That's not an error condition, because WAL replay will fix things
@@ -1664,21 +1594,23 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
16641594
if (cnt == 0)
16651595
break;
16661596

1597+
/* Update block number and # of bytes done for next loop iteration. */
1598+
blkno += cnt / BLCKSZ;
1599+
bytes_done += cnt;
1600+
16671601
/* Archive the data we just read. */
16681602
bbsink_archive_contents(sink, cnt);
16691603

16701604
/* Also feed it to the checksum machinery. */
16711605
if (pg_checksum_update(&checksum_ctx,
16721606
(uint8 *) sink->bbs_buffer, cnt) < 0)
16731607
elog(ERROR, "could not update checksum of base backup");
1674-
1675-
len += cnt;
16761608
}
16771609

16781610
/* If the file was truncated while we were sending it, pad it with zeros */
1679-
while (len < statbuf->st_size)
1611+
while (bytes_done < statbuf->st_size)
16801612
{
1681-
size_t remaining = statbuf->st_size - len;
1613+
size_t remaining = statbuf->st_size - bytes_done;
16821614
size_t nbytes = Min(sink->bbs_buffer_length, remaining);
16831615

16841616
MemSet(sink->bbs_buffer, 0, nbytes);
@@ -1687,15 +1619,15 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
16871619
nbytes) < 0)
16881620
elog(ERROR, "could not update checksum of base backup");
16891621
bbsink_archive_contents(sink, nbytes);
1690-
len += nbytes;
1622+
bytes_done += nbytes;
16911623
}
16921624

16931625
/*
16941626
* Pad to a block boundary, per tar format requirements. (This small piece
16951627
* of data is probably not worth throttling, and is not checksummed
16961628
* because it's not actually part of the file.)
16971629
*/
1698-
_tarWritePadding(sink, len);
1630+
_tarWritePadding(sink, bytes_done);
16991631

17001632
CloseTransientFile(fd);
17011633

@@ -1718,6 +1650,109 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
17181650
return true;
17191651
}
17201652

1653+
/*
1654+
* Read some more data from the file into the bbsink's buffer, verifying
1655+
* checksums as required.
1656+
*
1657+
* 'offset' is the file offset from which we should begin to read, and
1658+
* 'length' is the amount of data that should be read. The actual amount
1659+
* of data read will be less than the requested amount if the bbsink's
1660+
* buffer isn't big enough to hold it all, or if the underlying file has
1661+
* been truncated. The return value is the number of bytes actually read.
1662+
*
1663+
* 'blkno' is the block number of the first page in the bbsink's buffer
1664+
* relative to the start of the relation.
1665+
*
1666+
* 'verify_checksum' indicates whether we should try to verify checksums
1667+
* for the blocks we read. If we do this, we'll update *checksum_failures
1668+
* and issue warnings as appropriate.
1669+
*/
1670+
static off_t
1671+
read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd,
1672+
off_t offset, size_t length, BlockNumber blkno,
1673+
bool verify_checksum, int *checksum_failures)
1674+
{
1675+
off_t cnt;
1676+
int i;
1677+
char *page;
1678+
1679+
/* Try to read some more data. */
1680+
cnt = basebackup_read_file(fd, sink->bbs_buffer,
1681+
Min(sink->bbs_buffer_length, length),
1682+
offset, readfilename, true);
1683+
1684+
/* Can't verify checksums if read length is not a multiple of BLCKSZ. */
1685+
if (!verify_checksum || (cnt % BLCKSZ) != 0)
1686+
return cnt;
1687+
1688+
/* Verify checksum for each block. */
1689+
for (i = 0; i < cnt / BLCKSZ; i++)
1690+
{
1691+
int reread_cnt;
1692+
uint16 expected_checksum;
1693+
1694+
page = sink->bbs_buffer + BLCKSZ * i;
1695+
1696+
/* If the page is OK, go on to the next one. */
1697+
if (verify_page_checksum(page, sink->bbs_state->startptr, blkno + i,
1698+
&expected_checksum))
1699+
continue;
1700+
1701+
/*
1702+
* Retry the block on the first failure. It's possible that we read
1703+
* the first 4K page of the block just before postgres updated the
1704+
* entire block so it ends up looking torn to us. If, before we retry
1705+
* the read, the concurrent write of the block finishes, the page LSN
1706+
* will be updated and we'll realize that we should ignore this block.
1707+
*
1708+
* There's no guarantee that this will actually happen, though: the
1709+
* torn write could take an arbitrarily long time to complete.
1710+
* Retrying multiple times wouldn't fix this problem, either, though
1711+
* it would reduce the chances of it happening in practice. The only
1712+
* real fix here seems to be to have some kind of interlock that
1713+
* allows us to wait until we can be certain that no write to the
1714+
* block is in progress. Since we don't have any such thing right now,
1715+
* we just do this and hope for the best.
1716+
*/
1717+
reread_cnt =
1718+
basebackup_read_file(fd, sink->bbs_buffer + BLCKSZ * i,
1719+
BLCKSZ, offset + BLCKSZ * i,
1720+
readfilename, false);
1721+
if (reread_cnt == 0)
1722+
{
1723+
/*
1724+
* If we hit end-of-file, a concurrent truncation must have
1725+
* occurred, so reduce cnt to reflect only the blocks already
1726+
* processed and break out of this loop.
1727+
*/
1728+
cnt = BLCKSZ * i;
1729+
break;
1730+
}
1731+
1732+
/* If the page now looks OK, go on to the next one. */
1733+
if (verify_page_checksum(page, sink->bbs_state->startptr, blkno + i,
1734+
&expected_checksum))
1735+
continue;
1736+
1737+
/* Handle checksum failure. */
1738+
(*checksum_failures)++;
1739+
if (*checksum_failures <= 5)
1740+
ereport(WARNING,
1741+
(errmsg("checksum verification failed in "
1742+
"file \"%s\", block %u: calculated "
1743+
"%X but expected %X",
1744+
readfilename, blkno + i, expected_checksum,
1745+
((PageHeader) page)->pd_checksum)));
1746+
if (*checksum_failures == 5)
1747+
ereport(WARNING,
1748+
(errmsg("further checksum verification "
1749+
"failures in file \"%s\" will not "
1750+
"be reported", readfilename)));
1751+
}
1752+
1753+
return cnt;
1754+
}
1755+
17211756
/*
17221757
* Try to verify the checksum for the provided page, if it seems appropriate
17231758
* to do so.

0 commit comments

Comments
 (0)