@@ -83,6 +83,12 @@ static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeo
83
83
static bool sendFile (bbsink * sink , const char * readfilename , const char * tarfilename ,
84
84
struct stat * statbuf , bool missing_ok , Oid dboid ,
85
85
backup_manifest_info * manifest , const char * spcoid );
86
+ static off_t read_file_data_into_buffer (bbsink * sink ,
87
+ const char * readfilename , int fd ,
88
+ off_t offset , size_t length ,
89
+ BlockNumber blkno ,
90
+ bool verify_checksum ,
91
+ int * checksum_failures );
86
92
static bool verify_page_checksum (Page page , XLogRecPtr start_lsn ,
87
93
BlockNumber blkno ,
88
94
uint16 * expected_checksum );
@@ -1490,9 +1496,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
1490
1496
BlockNumber blkno = 0 ;
1491
1497
int checksum_failures = 0 ;
1492
1498
off_t cnt ;
1493
- int i ;
1494
- pgoff_t len = 0 ;
1495
- char * page ;
1499
+ pgoff_t bytes_done = 0 ;
1496
1500
int segmentno = 0 ;
1497
1501
char * segmentpath ;
1498
1502
bool verify_checksum = false;
@@ -1514,6 +1518,12 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
1514
1518
1515
1519
_tarWriteHeader (sink , tarfilename , NULL , statbuf , false);
1516
1520
1521
+ /*
1522
+ * Checksums are verified in multiples of BLCKSZ, so the buffer length
1523
+ * should be a multiple of the block size as well.
1524
+ */
1525
+ Assert ((sink -> bbs_buffer_length % BLCKSZ ) == 0 );
1526
+
1517
1527
if (!noverify_checksums && DataChecksumsEnabled ())
1518
1528
{
1519
1529
char * filename ;
@@ -1551,23 +1561,21 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
1551
1561
* for a base backup we can ignore such extended data. It will be restored
1552
1562
* from WAL.
1553
1563
*/
1554
- while (len < statbuf -> st_size )
1564
+ while (bytes_done < statbuf -> st_size )
1555
1565
{
1556
- size_t remaining = statbuf -> st_size - len ;
1566
+ size_t remaining = statbuf -> st_size - bytes_done ;
1557
1567
1558
1568
/* Try to read some more data. */
1559
- cnt = basebackup_read_file (fd , sink -> bbs_buffer ,
1560
- Min (sink -> bbs_buffer_length , remaining ),
1561
- len , readfilename , true);
1569
+ cnt = read_file_data_into_buffer (sink , readfilename , fd , bytes_done ,
1570
+ remaining ,
1571
+ blkno + segmentno * RELSEG_SIZE ,
1572
+ verify_checksum ,
1573
+ & checksum_failures );
1562
1574
1563
1575
/*
1564
- * The checksums are verified at block level, so we iterate over the
1565
- * buffer in chunks of BLCKSZ, after making sure that
1566
- * TAR_SEND_SIZE/buf is divisible by BLCKSZ and we read a multiple of
1567
- * BLCKSZ bytes.
1576
+ * If the amount of data we were able to read was not a multiple of
1577
+ * BLCKSZ, we cannot verify checksums, which are block-level.
1568
1578
*/
1569
- Assert ((sink -> bbs_buffer_length % BLCKSZ ) == 0 );
1570
-
1571
1579
if (verify_checksum && (cnt % BLCKSZ != 0 ))
1572
1580
{
1573
1581
ereport (WARNING ,
@@ -1578,84 +1586,6 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
1578
1586
verify_checksum = false;
1579
1587
}
1580
1588
1581
- if (verify_checksum )
1582
- {
1583
- for (i = 0 ; i < cnt / BLCKSZ ; i ++ )
1584
- {
1585
- int reread_cnt ;
1586
- uint16 expected_checksum ;
1587
-
1588
- page = sink -> bbs_buffer + BLCKSZ * i ;
1589
-
1590
- /* If the page is OK, go on to the next one. */
1591
- if (verify_page_checksum (page , sink -> bbs_state -> startptr ,
1592
- blkno + i + segmentno * RELSEG_SIZE ,
1593
- & expected_checksum ))
1594
- continue ;
1595
-
1596
- /*
1597
- * Retry the block on the first failure. It's possible that
1598
- * we read the first 4K page of the block just before postgres
1599
- * updated the entire block so it ends up looking torn to us.
1600
- * If, before we retry the read, the concurrent write of the
1601
- * block finishes, the page LSN will be updated and we'll
1602
- * realize that we should ignore this block.
1603
- *
1604
- * There's no guarantee that this will actually happen,
1605
- * though: the torn write could take an arbitrarily long time
1606
- * to complete. Retrying multiple times wouldn't fix this
1607
- * problem, either, though it would reduce the chances of it
1608
- * happening in practice. The only real fix here seems to be
1609
- * to have some kind of interlock that allows us to wait until
1610
- * we can be certain that no write to the block is in
1611
- * progress. Since we don't have any such thing right now, we
1612
- * just do this and hope for the best.
1613
- */
1614
- reread_cnt =
1615
- basebackup_read_file (fd ,
1616
- sink -> bbs_buffer + BLCKSZ * i ,
1617
- BLCKSZ , len + BLCKSZ * i ,
1618
- readfilename ,
1619
- false);
1620
- if (reread_cnt == 0 )
1621
- {
1622
- /*
1623
- * If we hit end-of-file, a concurrent truncation must
1624
- * have occurred, so break out of this loop just as if the
1625
- * initial fread() returned 0. We'll drop through to the
1626
- * same code that handles that case. (We must fix up cnt
1627
- * first, though.)
1628
- */
1629
- cnt = BLCKSZ * i ;
1630
- break ;
1631
- }
1632
-
1633
- /* If the page now looks OK, go on to the next one. */
1634
- if (verify_page_checksum (page , sink -> bbs_state -> startptr ,
1635
- blkno + i + segmentno * RELSEG_SIZE ,
1636
- & expected_checksum ))
1637
- continue ;
1638
-
1639
- /* Handle checksum failure. */
1640
- checksum_failures ++ ;
1641
- if (checksum_failures <= 5 )
1642
- ereport (WARNING ,
1643
- (errmsg ("checksum verification failed in "
1644
- "file \"%s\", block %u: calculated "
1645
- "%X but expected %X" ,
1646
- readfilename , blkno + i , expected_checksum ,
1647
- ((PageHeader ) page )-> pd_checksum )));
1648
- if (checksum_failures == 5 )
1649
- ereport (WARNING ,
1650
- (errmsg ("further checksum verification "
1651
- "failures in file \"%s\" will not "
1652
- "be reported" , readfilename )));
1653
- }
1654
-
1655
- /* Update block number for next pass through the outer loop. */
1656
- blkno += i ;
1657
- }
1658
-
1659
1589
/*
1660
1590
* If we hit end-of-file, a concurrent truncation must have occurred.
1661
1591
* That's not an error condition, because WAL replay will fix things
@@ -1664,21 +1594,23 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
1664
1594
if (cnt == 0 )
1665
1595
break ;
1666
1596
1597
+ /* Update block number and # of bytes done for next loop iteration. */
1598
+ blkno += cnt / BLCKSZ ;
1599
+ bytes_done += cnt ;
1600
+
1667
1601
/* Archive the data we just read. */
1668
1602
bbsink_archive_contents (sink , cnt );
1669
1603
1670
1604
/* Also feed it to the checksum machinery. */
1671
1605
if (pg_checksum_update (& checksum_ctx ,
1672
1606
(uint8 * ) sink -> bbs_buffer , cnt ) < 0 )
1673
1607
elog (ERROR , "could not update checksum of base backup" );
1674
-
1675
- len += cnt ;
1676
1608
}
1677
1609
1678
1610
/* If the file was truncated while we were sending it, pad it with zeros */
1679
- while (len < statbuf -> st_size )
1611
+ while (bytes_done < statbuf -> st_size )
1680
1612
{
1681
- size_t remaining = statbuf -> st_size - len ;
1613
+ size_t remaining = statbuf -> st_size - bytes_done ;
1682
1614
size_t nbytes = Min (sink -> bbs_buffer_length , remaining );
1683
1615
1684
1616
MemSet (sink -> bbs_buffer , 0 , nbytes );
@@ -1687,15 +1619,15 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
1687
1619
nbytes ) < 0 )
1688
1620
elog (ERROR , "could not update checksum of base backup" );
1689
1621
bbsink_archive_contents (sink , nbytes );
1690
- len += nbytes ;
1622
+ bytes_done += nbytes ;
1691
1623
}
1692
1624
1693
1625
/*
1694
1626
* Pad to a block boundary, per tar format requirements. (This small piece
1695
1627
* of data is probably not worth throttling, and is not checksummed
1696
1628
* because it's not actually part of the file.)
1697
1629
*/
1698
- _tarWritePadding (sink , len );
1630
+ _tarWritePadding (sink , bytes_done );
1699
1631
1700
1632
CloseTransientFile (fd );
1701
1633
@@ -1718,6 +1650,109 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
1718
1650
return true;
1719
1651
}
1720
1652
1653
+ /*
1654
+ * Read some more data from the file into the bbsink's buffer, verifying
1655
+ * checksums as required.
1656
+ *
1657
+ * 'offset' is the file offset from which we should begin to read, and
1658
+ * 'length' is the amount of data that should be read. The actual amount
1659
+ * of data read will be less than the requested amount if the bbsink's
1660
+ * buffer isn't big enough to hold it all, or if the underlying file has
1661
+ * been truncated. The return value is the number of bytes actually read.
1662
+ *
1663
+ * 'blkno' is the block number of the first page in the bbsink's buffer
1664
+ * relative to the start of the relation.
1665
+ *
1666
+ * 'verify_checksum' indicates whether we should try to verify checksums
1667
+ * for the blocks we read. If we do this, we'll update *checksum_failures
1668
+ * and issue warnings as appropriate.
1669
+ */
1670
+ static off_t
1671
+ read_file_data_into_buffer (bbsink * sink , const char * readfilename , int fd ,
1672
+ off_t offset , size_t length , BlockNumber blkno ,
1673
+ bool verify_checksum , int * checksum_failures )
1674
+ {
1675
+ off_t cnt ;
1676
+ int i ;
1677
+ char * page ;
1678
+
1679
+ /* Try to read some more data. */
1680
+ cnt = basebackup_read_file (fd , sink -> bbs_buffer ,
1681
+ Min (sink -> bbs_buffer_length , length ),
1682
+ offset , readfilename , true);
1683
+
1684
+ /* Can't verify checksums if read length is not a multiple of BLCKSZ. */
1685
+ if (!verify_checksum || (cnt % BLCKSZ ) != 0 )
1686
+ return cnt ;
1687
+
1688
+ /* Verify checksum for each block. */
1689
+ for (i = 0 ; i < cnt / BLCKSZ ; i ++ )
1690
+ {
1691
+ int reread_cnt ;
1692
+ uint16 expected_checksum ;
1693
+
1694
+ page = sink -> bbs_buffer + BLCKSZ * i ;
1695
+
1696
+ /* If the page is OK, go on to the next one. */
1697
+ if (verify_page_checksum (page , sink -> bbs_state -> startptr , blkno + i ,
1698
+ & expected_checksum ))
1699
+ continue ;
1700
+
1701
+ /*
1702
+ * Retry the block on the first failure. It's possible that we read
1703
+ * the first 4K page of the block just before postgres updated the
1704
+ * entire block so it ends up looking torn to us. If, before we retry
1705
+ * the read, the concurrent write of the block finishes, the page LSN
1706
+ * will be updated and we'll realize that we should ignore this block.
1707
+ *
1708
+ * There's no guarantee that this will actually happen, though: the
1709
+ * torn write could take an arbitrarily long time to complete.
1710
+ * Retrying multiple times wouldn't fix this problem, either, though
1711
+ * it would reduce the chances of it happening in practice. The only
1712
+ * real fix here seems to be to have some kind of interlock that
1713
+ * allows us to wait until we can be certain that no write to the
1714
+ * block is in progress. Since we don't have any such thing right now,
1715
+ * we just do this and hope for the best.
1716
+ */
1717
+ reread_cnt =
1718
+ basebackup_read_file (fd , sink -> bbs_buffer + BLCKSZ * i ,
1719
+ BLCKSZ , offset + BLCKSZ * i ,
1720
+ readfilename , false);
1721
+ if (reread_cnt == 0 )
1722
+ {
1723
+ /*
1724
+ * If we hit end-of-file, a concurrent truncation must have
1725
+ * occurred, so reduce cnt to reflect only the blocks already
1726
+ * processed and break out of this loop.
1727
+ */
1728
+ cnt = BLCKSZ * i ;
1729
+ break ;
1730
+ }
1731
+
1732
+ /* If the page now looks OK, go on to the next one. */
1733
+ if (verify_page_checksum (page , sink -> bbs_state -> startptr , blkno + i ,
1734
+ & expected_checksum ))
1735
+ continue ;
1736
+
1737
+ /* Handle checksum failure. */
1738
+ (* checksum_failures )++ ;
1739
+ if (* checksum_failures <= 5 )
1740
+ ereport (WARNING ,
1741
+ (errmsg ("checksum verification failed in "
1742
+ "file \"%s\", block %u: calculated "
1743
+ "%X but expected %X" ,
1744
+ readfilename , blkno + i , expected_checksum ,
1745
+ ((PageHeader ) page )-> pd_checksum )));
1746
+ if (* checksum_failures == 5 )
1747
+ ereport (WARNING ,
1748
+ (errmsg ("further checksum verification "
1749
+ "failures in file \"%s\" will not "
1750
+ "be reported" , readfilename )));
1751
+ }
1752
+
1753
+ return cnt ;
1754
+ }
1755
+
1721
1756
/*
1722
1757
* Try to verify the checksum for the provided page, if it seems appropriate
1723
1758
* to do so.
0 commit comments