Skip to content

Commit 2ce64ca

Browse files
committed
Fix bugs in vacuum of shared rels, by keeping their relcache entries current.
When vacuum processes a relation it uses the corresponding relcache entry's relfrozenxid / relminmxid as a cutoff for when to remove tuples etc. Unfortunately for nailed relations (i.e. critical system catalogs) bugs could frequently lead to the corresponding relcache entry being stale. This set of bugs could cause actual data corruption as vacuum would potentially not remove the correct row versions, potentially reviving them at a later point. After 699bf7d some corruptions in this vein were prevented, but the additional error checks could also trigger spuriously. Examples of such errors are: ERROR: found xmin ... from before relfrozenxid ... and ERROR: found multixact ... from before relminmxid ... To be caused by this bug the errors have to occur on system catalog tables. The two bugs are: 1) Invalidations for nailed relations were ignored, based on the theory that the relcache entry for such tables doesn't change. Which is largely true, except for fields like relfrozenxid etc. This means that changes to relations vacuumed in other sessions weren't picked up by already existing sessions. Luckily autovacuum doesn't have particularly longrunning sessions. 2) For shared *and* nailed relations, the shared relcache init file was never invalidated while running. That means that for such tables (e.g. pg_authid, pg_database) it's not just already existing sessions that are affected, but even new connections are as well. That explains why the reports usually were about pg_authid et. al. To fix 1), revalidate the rd_rel portion of a relcache entry when invalid. This implies a bit of extra complexity to deal with bootstrapping, but it's not too bad. The fix for 2) is simpler, simply always remove both the shared and local init files. Author: Andres Freund Reviewed-By: Alvaro Herrera Discussion: https://postgr.es/m/20180525203736.crkbg36muzxrjj5e@alap3.anarazel.de https://postgr.es/m/CAMa1XUhKSJd98JW4o9StWPrfS=11bPgG+_GDMxe25TvUY4Sugg@mail.gmail.com https://postgr.es/m/CAKMFJucqbuoDRfxPDX39WhA3vJyxweRg_zDVXzncr6+5wOguWA@mail.gmail.com https://postgr.es/m/CAGewt-ujGpMLQ09gXcUFMZaZsGJC98VXHEFbF-tpPB0fB13K+A@mail.gmail.com Backpatch: 9.3-
1 parent b10edaf commit 2ce64ca

File tree

3 files changed

+146
-74
lines changed

3 files changed

+146
-74
lines changed

src/backend/utils/cache/inval.c

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -521,12 +521,12 @@ RegisterRelcacheInvalidation(Oid dbId, Oid relId)
521521
(void) GetCurrentCommandId(true);
522522

523523
/*
524-
* If the relation being invalidated is one of those cached in the local
525-
* relcache init file, mark that we need to zap that file at commit. Same
526-
* is true when we are invalidating whole relcache.
524+
* If the relation being invalidated is one of those cached in a relcache
525+
* init file, mark that we need to zap that file at commit. For simplicity
526+
* invalidations for a specific database always invalidate the shared file
527+
* as well. Also zap when we are invalidating whole relcache.
527528
*/
528-
if (OidIsValid(dbId) &&
529-
(RelationIdIsInInitFile(relId) || relId == InvalidOid))
529+
if (relId == InvalidOid || RelationIdIsInInitFile(relId))
530530
transInvalInfo->RelcacheInitFileInval = true;
531531
}
532532

@@ -881,18 +881,26 @@ ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs,
881881

882882
if (RelcacheInitFileInval)
883883
{
884+
elog(trace_recovery(DEBUG4), "removing relcache init files for database %u",
885+
dbid);
886+
884887
/*
885-
* RelationCacheInitFilePreInvalidate requires DatabasePath to be set,
886-
* but we should not use SetDatabasePath during recovery, since it is
888+
* RelationCacheInitFilePreInvalidate, when the invalidation message
889+
* is for a specific database, requires DatabasePath to be set, but we
890+
* should not use SetDatabasePath during recovery, since it is
887891
* intended to be used only once by normal backends. Hence, a quick
888892
* hack: set DatabasePath directly then unset after use.
889893
*/
890-
DatabasePath = GetDatabasePath(dbid, tsid);
891-
elog(trace_recovery(DEBUG4), "removing relcache init file in \"%s\"",
892-
DatabasePath);
894+
if (OidIsValid(dbid))
895+
DatabasePath = GetDatabasePath(dbid, tsid);
896+
893897
RelationCacheInitFilePreInvalidate();
894-
pfree(DatabasePath);
895-
DatabasePath = NULL;
898+
899+
if (OidIsValid(dbid))
900+
{
901+
pfree(DatabasePath);
902+
DatabasePath = NULL;
903+
}
896904
}
897905

898906
SendSharedInvalidMessages(msgs, nmsgs);

src/backend/utils/cache/relcache.c

Lines changed: 125 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ static void RelationDestroyRelation(Relation relation, bool remember_tupdesc);
248248
static void RelationClearRelation(Relation relation, bool rebuild);
249249

250250
static void RelationReloadIndexInfo(Relation relation);
251+
static void RelationReloadNailed(Relation relation);
251252
static void RelationFlushRelation(Relation relation);
252253
static void RememberToFreeTupleDescAtEOX(TupleDesc td);
253254
static void AtEOXact_cleanup(Relation relation, bool isCommit);
@@ -285,7 +286,7 @@ static void IndexSupportInitialize(oidvector *indclass,
285286
static OpClassCacheEnt *LookupOpclassInfo(Oid operatorClassOid,
286287
StrategyNumber numSupport);
287288
static void RelationCacheInitFileRemoveInDir(const char *tblspcpath);
288-
static void unlink_initfile(const char *initfilename);
289+
static void unlink_initfile(const char *initfilename, int elevel);
289290
static bool equalPartitionDescs(PartitionKey key, PartitionDesc partdesc1,
290291
PartitionDesc partdesc2);
291292

@@ -2052,7 +2053,16 @@ RelationIdGetRelation(Oid relationId)
20522053
RelationReloadIndexInfo(rd);
20532054
else
20542055
RelationClearRelation(rd, true);
2055-
Assert(rd->rd_isvalid);
2056+
2057+
/*
2058+
* Normally entries need to be valid here, but before the relcache
2059+
* has been initialized, not enough infrastructure exists to
2060+
* perform pg_class lookups. The structure of such entries doesn't
2061+
* change, but we still want to update the rd_rel entry. So
2062+
* rd_isvalid = false is left in place for a later lookup.
2063+
*/
2064+
Assert(rd->rd_isvalid ||
2065+
(rd->rd_isnailed && !criticalRelcachesBuilt));
20562066
}
20572067
return rd;
20582068
}
@@ -2255,6 +2265,81 @@ RelationReloadIndexInfo(Relation relation)
22552265
relation->rd_isvalid = true;
22562266
}
22572267

2268+
/*
2269+
* RelationReloadNailed - reload minimal information for nailed relations.
2270+
*
2271+
* The structure of a nailed relation can never change (which is good, because
2272+
* we rely on knowing their structure to be able to read catalog content). But
2273+
* some parts, e.g. pg_class.relfrozenxid, are still important to have
2274+
* accurate content for. Therefore those need to be reloaded after the arrival
2275+
* of invalidations.
2276+
*/
2277+
static void
2278+
RelationReloadNailed(Relation relation)
2279+
{
2280+
Assert(relation->rd_isnailed);
2281+
2282+
/*
2283+
* Redo RelationInitPhysicalAddr in case it is a mapped relation whose
2284+
* mapping changed.
2285+
*/
2286+
RelationInitPhysicalAddr(relation);
2287+
2288+
/* flag as needing to be revalidated */
2289+
relation->rd_isvalid = false;
2290+
2291+
/*
2292+
* Can only reread catalog contents if in a transaction. If the relation
2293+
* is currently open (not counting the nailed refcount), do so
2294+
* immediately. Otherwise we've already marked the entry as possibly
2295+
* invalid, and it'll be fixed when next opened.
2296+
*/
2297+
if (!IsTransactionState() || relation->rd_refcnt <= 1)
2298+
return;
2299+
2300+
if (relation->rd_rel->relkind == RELKIND_INDEX)
2301+
{
2302+
/*
2303+
* If it's a nailed-but-not-mapped index, then we need to re-read the
2304+
* pg_class row to see if its relfilenode changed.
2305+
*/
2306+
RelationReloadIndexInfo(relation);
2307+
}
2308+
else
2309+
{
2310+
/*
2311+
* Reload a non-index entry. We can't easily do so if relcaches
2312+
* aren't yet built, but that's fine because at that stage the
2313+
* attributes that need to be current (like relfrozenxid) aren't yet
2314+
* accessed. To ensure the entry will later be revalidated, we leave
2315+
* it in invalid state, but allow use (cf. RelationIdGetRelation()).
2316+
*/
2317+
if (criticalRelcachesBuilt)
2318+
{
2319+
HeapTuple pg_class_tuple;
2320+
Form_pg_class relp;
2321+
2322+
/*
2323+
* NB: Mark the entry as valid before starting to scan, to avoid
2324+
* self-recursion when re-building pg_class.
2325+
*/
2326+
relation->rd_isvalid = true;
2327+
2328+
pg_class_tuple = ScanPgRelation(RelationGetRelid(relation),
2329+
true, false);
2330+
relp = (Form_pg_class) GETSTRUCT(pg_class_tuple);
2331+
memcpy(relation->rd_rel, relp, CLASS_TUPLE_SIZE);
2332+
heap_freetuple(pg_class_tuple);
2333+
2334+
/*
2335+
* Again mark as valid, to protect against concurrently arriving
2336+
* invalidations.
2337+
*/
2338+
relation->rd_isvalid = true;
2339+
}
2340+
}
2341+
}
2342+
22582343
/*
22592344
* RelationDestroyRelation
22602345
*
@@ -2368,26 +2453,12 @@ RelationClearRelation(Relation relation, bool rebuild)
23682453
RelationCloseSmgr(relation);
23692454

23702455
/*
2371-
* Never, never ever blow away a nailed-in system relation, because we'd
2372-
* be unable to recover. However, we must redo RelationInitPhysicalAddr
2373-
* in case it is a mapped relation whose mapping changed.
2374-
*
2375-
* If it's a nailed-but-not-mapped index, then we need to re-read the
2376-
* pg_class row to see if its relfilenode changed. We do that immediately
2377-
* if we're inside a valid transaction and the relation is open (not
2378-
* counting the nailed refcount). Otherwise just mark the entry as
2379-
* possibly invalid, and it'll be fixed when next opened.
2456+
* Treat nailed-in system relations separately, they always need to be
2457+
* accessible, so we can't blow them away.
23802458
*/
23812459
if (relation->rd_isnailed)
23822460
{
2383-
RelationInitPhysicalAddr(relation);
2384-
2385-
if (relation->rd_rel->relkind == RELKIND_INDEX)
2386-
{
2387-
relation->rd_isvalid = false; /* needs to be revalidated */
2388-
if (relation->rd_refcnt > 1 && IsTransactionState())
2389-
RelationReloadIndexInfo(relation);
2390-
}
2461+
RelationReloadNailed(relation);
23912462
return;
23922463
}
23932464

@@ -5904,24 +5975,26 @@ write_item(const void *data, Size len, FILE *fp)
59045975

59055976
/*
59065977
* Determine whether a given relation (identified by OID) is one of the ones
5907-
* we should store in the local relcache init file.
5978+
* we should store in a relcache init file.
59085979
*
59095980
* We must cache all nailed rels, and for efficiency we should cache every rel
59105981
* that supports a syscache. The former set is almost but not quite a subset
5911-
* of the latter. Currently, we must special-case TriggerRelidNameIndexId,
5912-
* which RelationCacheInitializePhase3 chooses to nail for efficiency reasons,
5913-
* but which does not support any syscache.
5914-
*
5915-
* Note: this function is currently never called for shared rels. If it were,
5916-
* we'd probably also need a special case for DatabaseNameIndexId, which is
5917-
* critical but does not support a syscache.
5982+
* of the latter. The special cases are relations where
5983+
* RelationCacheInitializePhase2/3 chooses to nail for efficiency reasons, but
5984+
* which do not support any syscache.
59185985
*/
59195986
bool
59205987
RelationIdIsInInitFile(Oid relationId)
59215988
{
5922-
if (relationId == TriggerRelidNameIndexId)
5989+
if (relationId == SharedSecLabelRelationId ||
5990+
relationId == TriggerRelidNameIndexId ||
5991+
relationId == DatabaseNameIndexId ||
5992+
relationId == SharedSecLabelObjectIndexId)
59235993
{
5924-
/* If this Assert fails, we don't need this special case anymore. */
5994+
/*
5995+
* If this Assert fails, we don't need the applicable special case
5996+
* anymore.
5997+
*/
59255998
Assert(!RelationSupportsSysCache(relationId));
59265999
return true;
59276000
}
@@ -5991,38 +6064,30 @@ RelationHasUnloggedIndex(Relation rel)
59916064
* We take the lock and do the unlink in RelationCacheInitFilePreInvalidate,
59926065
* then release the lock in RelationCacheInitFilePostInvalidate. Caller must
59936066
* send any pending SI messages between those calls.
5994-
*
5995-
* Notice this deals only with the local init file, not the shared init file.
5996-
* The reason is that there can never be a "significant" change to the
5997-
* relcache entry of a shared relation; the most that could happen is
5998-
* updates of noncritical fields such as relpages/reltuples. So, while
5999-
* it's worth updating the shared init file from time to time, it can never
6000-
* be invalid enough to make it necessary to remove it.
60016067
*/
60026068
void
60036069
RelationCacheInitFilePreInvalidate(void)
60046070
{
6005-
char initfilename[MAXPGPATH];
6071+
char localinitfname[MAXPGPATH];
6072+
char sharedinitfname[MAXPGPATH];
60066073

6007-
snprintf(initfilename, sizeof(initfilename), "%s/%s",
6008-
DatabasePath, RELCACHE_INIT_FILENAME);
6074+
if (DatabasePath)
6075+
snprintf(localinitfname, sizeof(localinitfname), "%s/%s",
6076+
DatabasePath, RELCACHE_INIT_FILENAME);
6077+
snprintf(sharedinitfname, sizeof(sharedinitfname), "global/%s",
6078+
RELCACHE_INIT_FILENAME);
60096079

60106080
LWLockAcquire(RelCacheInitLock, LW_EXCLUSIVE);
60116081

6012-
if (unlink(initfilename) < 0)
6013-
{
6014-
/*
6015-
* The file might not be there if no backend has been started since
6016-
* the last removal. But complain about failures other than ENOENT.
6017-
* Fortunately, it's not too late to abort the transaction if we can't
6018-
* get rid of the would-be-obsolete init file.
6019-
*/
6020-
if (errno != ENOENT)
6021-
ereport(ERROR,
6022-
(errcode_for_file_access(),
6023-
errmsg("could not remove cache file \"%s\": %m",
6024-
initfilename)));
6025-
}
6082+
/*
6083+
* The files might not be there if no backend has been started since the
6084+
* last removal. But complain about failures other than ENOENT with
6085+
* ERROR. Fortunately, it's not too late to abort the transaction if we
6086+
* can't get rid of the would-be-obsolete init file.
6087+
*/
6088+
if (DatabasePath)
6089+
unlink_initfile(localinitfname, ERROR);
6090+
unlink_initfile(sharedinitfname, ERROR);
60266091
}
60276092

60286093
void
@@ -6048,13 +6113,9 @@ RelationCacheInitFileRemove(void)
60486113
struct dirent *de;
60496114
char path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)];
60506115

6051-
/*
6052-
* We zap the shared cache file too. In theory it can't get out of sync
6053-
* enough to be a problem, but in data-corruption cases, who knows ...
6054-
*/
60556116
snprintf(path, sizeof(path), "global/%s",
60566117
RELCACHE_INIT_FILENAME);
6057-
unlink_initfile(path);
6118+
unlink_initfile(path, LOG);
60586119

60596120
/* Scan everything in the default tablespace */
60606121
RelationCacheInitFileRemoveInDir("base");
@@ -6106,20 +6167,23 @@ RelationCacheInitFileRemoveInDir(const char *tblspcpath)
61066167
/* Try to remove the init file in each database */
61076168
snprintf(initfilename, sizeof(initfilename), "%s/%s/%s",
61086169
tblspcpath, de->d_name, RELCACHE_INIT_FILENAME);
6109-
unlink_initfile(initfilename);
6170+
unlink_initfile(initfilename, LOG);
61106171
}
61116172
}
61126173

61136174
FreeDir(dir);
61146175
}
61156176

61166177
static void
6117-
unlink_initfile(const char *initfilename)
6178+
unlink_initfile(const char *initfilename, int elevel)
61186179
{
61196180
if (unlink(initfilename) < 0)
61206181
{
61216182
/* It might not be there, but log any error other than ENOENT */
61226183
if (errno != ENOENT)
6123-
elog(LOG, "could not remove cache file \"%s\": %m", initfilename);
6184+
ereport(elevel,
6185+
(errcode_for_file_access(),
6186+
errmsg("could not remove cache file \"%s\": %m",
6187+
initfilename)));
61246188
}
61256189
}

src/include/storage/standbydefs.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ typedef struct xl_invalidations
6464
{
6565
Oid dbId; /* MyDatabaseId */
6666
Oid tsId; /* MyDatabaseTableSpace */
67-
bool relcacheInitFileInval; /* invalidate relcache init file */
67+
bool relcacheInitFileInval; /* invalidate relcache init files */
6868
int nmsgs; /* number of shared inval msgs */
6969
SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER];
7070
} xl_invalidations;

0 commit comments

Comments
 (0)