Skip to content

Commit dee8002

Browse files
committed
Fix mis-attribution of checksum failure stats to the wrong database
Checksum failure stats could be attributed to the wrong database in two cases: - when a read of a shared relation encountered a checksum error , it would be attributed to the current database, instead of the "database" representing shared relations - when using CREATE DATABASE ... STRATEGY WAL_LOG checksum errors in the source database would be attributed to the current database The checksum stats reporting via PageIsVerifiedExtended(PIV_REPORT_STAT) does not have access to the information about what database a page belongs to. This fixes the issue by removing PIV_REPORT_STAT and delegating the responsibility to report stats to the caller, which now can learn about the number of stats via a new optional argument. As this changes the signature of PageIsVerifiedExtended() and all callers should adapt to the new signature, use the occasion to rename the function to PageIsVerified() and remove the compatibility macro. We could instead have fixed this by adding information about the database to the args of PageIsVerified(), but there are soon-to-be-applied patches that need to separate the stats reporting from the PageIsVerified() call anyway. Those patches also include testing for the failure paths, something we inexplicably have not had. As there is no caller of pgstat_report_checksum_failure() left, remove it. It'd be possible, but awkward to fix this in the back branches. We considered doing the work not quite worth it, as mis-attributed stats should still elicit concern. The emitted error messages do allow to attribute the errors correctly. Discussion: https://postgr.es/m/5tyic6epvdlmd6eddgelv47syg2b5cpwffjam54axp25xyq2ga@ptwkinxqo3az Discussion: https://postgr.es/m/mglpvvbhighzuwudjxzu4br65qqcxsnyvio3nl4fbog3qknwhg@e4gt7npsohuz
1 parent 68f97ae commit dee8002

File tree

6 files changed

+48
-34
lines changed

6 files changed

+48
-34
lines changed

src/backend/catalog/storage.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "catalog/storage.h"
2828
#include "catalog/storage_xlog.h"
2929
#include "miscadmin.h"
30+
#include "pgstat.h"
3031
#include "storage/bulk_write.h"
3132
#include "storage/freespace.h"
3233
#include "storage/proc.h"
@@ -507,15 +508,26 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
507508
for (blkno = 0; blkno < nblocks; blkno++)
508509
{
509510
BulkWriteBuffer buf;
511+
bool checksum_failure;
512+
bool verified;
510513

511514
/* If we got a cancel signal during the copy of the data, quit */
512515
CHECK_FOR_INTERRUPTS();
513516

514517
buf = smgr_bulk_get_buf(bulkstate);
515518
smgrread(src, forkNum, blkno, (Page) buf);
516519

517-
if (!PageIsVerifiedExtended((Page) buf, blkno,
518-
PIV_LOG_WARNING | PIV_REPORT_STAT))
520+
verified = PageIsVerified((Page) buf, blkno, PIV_LOG_WARNING,
521+
&checksum_failure);
522+
523+
if (checksum_failure)
524+
{
525+
RelFileLocatorBackend rloc = src->smgr_rlocator;
526+
527+
pgstat_report_checksum_failures_in_db(rloc.locator.dbOid, 1);
528+
}
529+
530+
if (!verified)
519531
{
520532
/*
521533
* For paranoia's sake, capture the file path before invoking the

src/backend/storage/buffer/bufmgr.c

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -770,7 +770,7 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
770770
* In RBM_NORMAL mode, the page is read from disk, and the page header is
771771
* validated. An error is thrown if the page header is not valid. (But
772772
* note that an all-zero page is considered "valid"; see
773-
* PageIsVerifiedExtended().)
773+
* PageIsVerified().)
774774
*
775775
* RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
776776
* valid, the page is zeroed instead of throwing an error. This is intended
@@ -1569,6 +1569,8 @@ WaitReadBuffers(ReadBuffersOperation *operation)
15691569
{
15701570
BufferDesc *bufHdr;
15711571
Block bufBlock;
1572+
bool verified;
1573+
bool checksum_failure;
15721574

15731575
if (persistence == RELPERSISTENCE_TEMP)
15741576
{
@@ -1582,8 +1584,16 @@ WaitReadBuffers(ReadBuffersOperation *operation)
15821584
}
15831585

15841586
/* check for garbage data */
1585-
if (!PageIsVerifiedExtended((Page) bufBlock, io_first_block + j,
1586-
PIV_LOG_WARNING | PIV_REPORT_STAT))
1587+
verified = PageIsVerified((Page) bufBlock, io_first_block + j,
1588+
PIV_LOG_WARNING, &checksum_failure);
1589+
if (checksum_failure)
1590+
{
1591+
RelFileLocatorBackend rloc = operation->smgr->smgr_rlocator;
1592+
1593+
pgstat_report_checksum_failures_in_db(rloc.locator.dbOid, 1);
1594+
}
1595+
1596+
if (!verified)
15871597
{
15881598
if ((operation->flags & READ_BUFFERS_ZERO_ON_ERROR) || zero_damaged_pages)
15891599
{

src/backend/storage/page/bufpage.c

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ PageInit(Page page, Size pageSize, Size specialSize)
6161

6262

6363
/*
64-
* PageIsVerifiedExtended
64+
* PageIsVerified
6565
* Check that the page header and checksum (if any) appear valid.
6666
*
6767
* This is called when a page has just been read in from disk. The idea is
@@ -81,18 +81,23 @@ PageInit(Page page, Size pageSize, Size specialSize)
8181
* If flag PIV_LOG_WARNING is set, a WARNING is logged in the event of
8282
* a checksum failure.
8383
*
84-
* If flag PIV_REPORT_STAT is set, a checksum failure is reported directly
85-
* to pgstat.
84+
* To allow the caller to report statistics about checksum failures,
85+
* *checksum_failure_p can be passed in. Note that there may be checksum
86+
* failures even if this function returns true, due to
87+
* ignore_checksum_failure.
8688
*/
8789
bool
88-
PageIsVerifiedExtended(PageData *page, BlockNumber blkno, int flags)
90+
PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
8991
{
9092
const PageHeaderData *p = (const PageHeaderData *) page;
9193
size_t *pagebytes;
9294
bool checksum_failure = false;
9395
bool header_sane = false;
9496
uint16 checksum = 0;
9597

98+
if (checksum_failure_p)
99+
*checksum_failure_p = false;
100+
96101
/*
97102
* Don't verify page data unless the page passes basic non-zero test
98103
*/
@@ -103,7 +108,11 @@ PageIsVerifiedExtended(PageData *page, BlockNumber blkno, int flags)
103108
checksum = pg_checksum_page(page, blkno);
104109

105110
if (checksum != p->pd_checksum)
111+
{
106112
checksum_failure = true;
113+
if (checksum_failure_p)
114+
*checksum_failure_p = true;
115+
}
107116
}
108117

109118
/*
@@ -141,9 +150,6 @@ PageIsVerifiedExtended(PageData *page, BlockNumber blkno, int flags)
141150
errmsg("page verification failed, calculated checksum %u but expected %u",
142151
checksum, p->pd_checksum)));
143152

144-
if ((flags & PIV_REPORT_STAT) != 0)
145-
pgstat_report_checksum_failure();
146-
147153
if (header_sane && ignore_checksum_failure)
148154
return true;
149155
}

src/backend/utils/activity/pgstat_database.c

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -159,15 +159,6 @@ pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
159159
pgstat_unlock_entry(entry_ref);
160160
}
161161

162-
/*
163-
* Report one checksum failure in the current database.
164-
*/
165-
void
166-
pgstat_report_checksum_failure(void)
167-
{
168-
pgstat_report_checksum_failures_in_db(MyDatabaseId, 1);
169-
}
170-
171162
/*
172163
* Report creation of temporary file.
173164
*/

src/include/pgstat.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -612,7 +612,6 @@ extern void pgstat_report_autovac(Oid dboid);
612612
extern void pgstat_report_recovery_conflict(int reason);
613613
extern void pgstat_report_deadlock(void);
614614
extern void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount);
615-
extern void pgstat_report_checksum_failure(void);
616615
extern void pgstat_report_connect(Oid dboid);
617616
extern void pgstat_update_parallel_workers_stats(PgStat_Counter workers_to_launch,
618617
PgStat_Counter workers_launched);

src/include/storage/bufpage.h

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -465,31 +465,27 @@ do { \
465465
#define PAI_OVERWRITE (1 << 0)
466466
#define PAI_IS_HEAP (1 << 1)
467467

468-
/* flags for PageIsVerifiedExtended() */
468+
/* flags for PageIsVerified() */
469469
#define PIV_LOG_WARNING (1 << 0)
470-
#define PIV_REPORT_STAT (1 << 1)
471470

472471
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap) \
473472
PageAddItemExtended(page, item, size, offsetNumber, \
474473
((overwrite) ? PAI_OVERWRITE : 0) | \
475474
((is_heap) ? PAI_IS_HEAP : 0))
476475

477-
#define PageIsVerified(page, blkno) \
478-
PageIsVerifiedExtended(page, blkno, \
479-
PIV_LOG_WARNING | PIV_REPORT_STAT)
480-
481476
/*
482-
* Check that BLCKSZ is a multiple of sizeof(size_t). In
483-
* PageIsVerifiedExtended(), it is much faster to check if a page is
484-
* full of zeroes using the native word size. Note that this assertion
485-
* is kept within a header to make sure that StaticAssertDecl() works
486-
* across various combinations of platforms and compilers.
477+
* Check that BLCKSZ is a multiple of sizeof(size_t). In PageIsVerified(), it
478+
* is much faster to check if a page is full of zeroes using the native word
479+
* size. Note that this assertion is kept within a header to make sure that
480+
* StaticAssertDecl() works across various combinations of platforms and
481+
* compilers.
487482
*/
488483
StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)),
489484
"BLCKSZ has to be a multiple of sizeof(size_t)");
490485

491486
extern void PageInit(Page page, Size pageSize, Size specialSize);
492-
extern bool PageIsVerifiedExtended(PageData *page, BlockNumber blkno, int flags);
487+
extern bool PageIsVerified(PageData *page, BlockNumber blkno, int flags,
488+
bool *checksum_failure_p);
493489
extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size,
494490
OffsetNumber offsetNumber, int flags);
495491
extern Page PageGetTempPage(const PageData *page);

0 commit comments

Comments
 (0)