Skip to content

Commit b96d3c3

Browse files
committed
pgstat: Allow checksum errors to be reported in critical sections
For AIO we execute completion callbacks in critical sections (to ensure that AIO can in the future be used for WAL, which in turn requires that we can call completion callbacks in critical sections, to get the resources for WAL io). To report checksum errors a backend now has to call pgstat_prepare_report_checksum_failure(), before entering a critical section, which guarantees the relevant pgstats entry is in shared memory, the relevant DSM segment is mapped into the backend's memory and the address is known via a PgStat_EntryRef. Reviewed-by: Noah Misch <noah@leadboat.com> Discussion: https://postgr.es/m/wkjj4p2rmkevutkwc6tewoovdqznj6c6nvjmvii4oo5wmbh5sr@retq7d6uqs4j
1 parent 4244cf6 commit b96d3c3

File tree

5 files changed

+52
-3
lines changed

5 files changed

+52
-3
lines changed

src/backend/backup/basebackup.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1817,6 +1817,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
18171817
checksum_failures,
18181818
readfilename, checksum_failures)));
18191819

1820+
pgstat_prepare_report_checksum_failure(dboid);
18201821
pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
18211822
}
18221823

src/backend/catalog/storage.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
524524
{
525525
RelFileLocatorBackend rloc = src->smgr_rlocator;
526526

527+
pgstat_prepare_report_checksum_failure(rloc.locator.dbOid);
527528
pgstat_report_checksum_failures_in_db(rloc.locator.dbOid, 1);
528529
}
529530

src/backend/storage/buffer/bufmgr.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1590,6 +1590,7 @@ WaitReadBuffers(ReadBuffersOperation *operation)
15901590
{
15911591
RelFileLocatorBackend rloc = operation->smgr->smgr_rlocator;
15921592

1593+
pgstat_prepare_report_checksum_failure(rloc.locator.dbOid);
15931594
pgstat_report_checksum_failures_in_db(rloc.locator.dbOid, 1);
15941595
}
15951596

src/backend/utils/activity/pgstat_database.c

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,34 @@ pgstat_report_deadlock(void)
133133
dbent->deadlocks++;
134134
}
135135

136+
/*
137+
* Allow this backend to later report checksum failures for dboid, even if in
138+
* a critical section at the time of the report.
139+
*
140+
* Without this function having been called first, the backend might need to
141+
* allocate an EntryRef or might need to map in DSM segments. Neither should
142+
* happen in a critical section.
143+
*/
144+
void
145+
pgstat_prepare_report_checksum_failure(Oid dboid)
146+
{
147+
Assert(!CritSectionCount);
148+
149+
/*
150+
* Just need to ensure this backend has an entry ref for the database.
151+
* That will allows us to report checksum failures without e.g. needing to
152+
* map in DSM segments.
153+
*/
154+
pgstat_get_entry_ref(PGSTAT_KIND_DATABASE, dboid, InvalidOid,
155+
true, NULL);
156+
}
157+
136158
/*
137159
* Report one or more checksum failures.
160+
*
161+
* To be allowed to report checksum failures in critical sections, we require
162+
* pgstat_prepare_report_checksum_failure() to have been called before this
163+
* function is called.
138164
*/
139165
void
140166
pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
@@ -147,10 +173,29 @@ pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
147173

148174
/*
149175
* Update the shared stats directly - checksum failures should never be
150-
* common enough for that to be a problem.
176+
* common enough for that to be a problem. Note that we pass create=false
177+
* here, as we want to be sure to not require memory allocations, so this
178+
* can be called in critical sections.
179+
*/
180+
entry_ref = pgstat_get_entry_ref(PGSTAT_KIND_DATABASE, dboid, InvalidOid,
181+
false, NULL);
182+
183+
/*
184+
* Should always have been created by
185+
* pgstat_prepare_report_checksum_failure().
186+
*
187+
* When not using assertions, we don't want to crash should something have
188+
* gone wrong, so just return.
151189
*/
152-
entry_ref =
153-
pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, dboid, InvalidOid, false);
190+
Assert(entry_ref);
191+
if (!entry_ref)
192+
{
193+
elog(WARNING, "could not report %d conflicts for DB %u",
194+
failurecount, dboid);
195+
return;
196+
}
197+
198+
pgstat_lock_entry(entry_ref, false);
154199

155200
sharedent = (PgStatShared_Database *) entry_ref->shared_stats;
156201
sharedent->stats.checksum_failures += failurecount;

src/include/pgstat.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -611,6 +611,7 @@ extern void pgstat_drop_database(Oid databaseid);
611611
extern void pgstat_report_autovac(Oid dboid);
612612
extern void pgstat_report_recovery_conflict(int reason);
613613
extern void pgstat_report_deadlock(void);
614+
extern void pgstat_prepare_report_checksum_failure(Oid dboid);
614615
extern void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount);
615616
extern void pgstat_report_connect(Oid dboid);
616617
extern void pgstat_update_parallel_workers_stats(PgStat_Counter workers_to_launch,

0 commit comments

Comments
 (0)