Skip to content

Commit 817f9f9

Browse files
committed
Fix bugs in vacuum of shared rels, by keeping their relcache entries current.
When vacuum processes a relation it uses the corresponding relcache entry's relfrozenxid / relminmxid as a cutoff for when to remove tuples etc. Unfortunately for nailed relations (i.e. critical system catalogs) bugs could frequently lead to the corresponding relcache entry being stale. This set of bugs could cause actual data corruption as vacuum would potentially not remove the correct row versions, potentially reviving them at a later point. After 699bf7d some corruptions in this vein were prevented, but the additional error checks could also trigger spuriously. Examples of such errors are: ERROR: found xmin ... from before relfrozenxid ... and ERROR: found multixact ... from before relminmxid ... To be caused by this bug the errors have to occur on system catalog tables. The two bugs are: 1) Invalidations for nailed relations were ignored, based on the theory that the relcache entry for such tables doesn't change. Which is largely true, except for fields like relfrozenxid etc. This means that changes to relations vacuumed in other sessions weren't picked up by already existing sessions. Luckily autovacuum doesn't have particularly longrunning sessions. 2) For shared *and* nailed relations, the shared relcache init file was never invalidated while running. That means that for such tables (e.g. pg_authid, pg_database) it's not just already existing sessions that are affected, but even new connections are as well. That explains why the reports usually were about pg_authid et. al. To fix 1), revalidate the rd_rel portion of a relcache entry when invalid. This implies a bit of extra complexity to deal with bootstrapping, but it's not too bad. The fix for 2) is simpler, simply always remove both the shared and local init files. Author: Andres Freund Reviewed-By: Alvaro Herrera Discussion: https://postgr.es/m/20180525203736.crkbg36muzxrjj5e@alap3.anarazel.de https://postgr.es/m/CAMa1XUhKSJd98JW4o9StWPrfS=11bPgG+_GDMxe25TvUY4Sugg@mail.gmail.com https://postgr.es/m/CAKMFJucqbuoDRfxPDX39WhA3vJyxweRg_zDVXzncr6+5wOguWA@mail.gmail.com https://postgr.es/m/CAGewt-ujGpMLQ09gXcUFMZaZsGJC98VXHEFbF-tpPB0fB13K+A@mail.gmail.com Backpatch: 9.3-
1 parent 25a8561 commit 817f9f9

File tree

2 files changed

+143
-71
lines changed

2 files changed

+143
-71
lines changed

src/backend/utils/cache/inval.c

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -514,10 +514,12 @@ RegisterRelcacheInvalidation(Oid dbId, Oid relId)
514514
(void) GetCurrentCommandId(true);
515515

516516
/*
517-
* If the relation being invalidated is one of those cached in the local
518-
* relcache init file, mark that we need to zap that file at commit.
517+
* If the relation being invalidated is one of those cached in a relcache
518+
* init file, mark that we need to zap that file at commit. For simplicity
519+
* invalidations for a specific database always invalidate the shared file
520+
* as well.
519521
*/
520-
if (OidIsValid(dbId) && RelationIdIsInInitFile(relId))
522+
if (RelationIdIsInInitFile(relId))
521523
transInvalInfo->RelcacheInitFileInval = true;
522524
}
523525

@@ -865,18 +867,26 @@ ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs,
865867

866868
if (RelcacheInitFileInval)
867869
{
870+
elog(trace_recovery(DEBUG4), "removing relcache init files for database %u",
871+
dbid);
872+
868873
/*
869-
* RelationCacheInitFilePreInvalidate requires DatabasePath to be set,
870-
* but we should not use SetDatabasePath during recovery, since it is
874+
* RelationCacheInitFilePreInvalidate, when the invalidation message
875+
* is for a specific database, requires DatabasePath to be set, but we
876+
* should not use SetDatabasePath during recovery, since it is
871877
* intended to be used only once by normal backends. Hence, a quick
872878
* hack: set DatabasePath directly then unset after use.
873879
*/
874-
DatabasePath = GetDatabasePath(dbid, tsid);
875-
elog(trace_recovery(DEBUG4), "removing relcache init file in \"%s\"",
876-
DatabasePath);
880+
if (OidIsValid(dbid))
881+
DatabasePath = GetDatabasePath(dbid, tsid);
882+
877883
RelationCacheInitFilePreInvalidate();
878-
pfree(DatabasePath);
879-
DatabasePath = NULL;
884+
885+
if (OidIsValid(dbid))
886+
{
887+
pfree(DatabasePath);
888+
DatabasePath = NULL;
889+
}
880890
}
881891

882892
SendSharedInvalidMessages(msgs, nmsgs);

src/backend/utils/cache/relcache.c

Lines changed: 123 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ static void RelationDestroyRelation(Relation relation, bool remember_tupdesc);
236236
static void RelationClearRelation(Relation relation, bool rebuild);
237237

238238
static void RelationReloadIndexInfo(Relation relation);
239+
static void RelationReloadNailed(Relation relation);
239240
static void RelationFlushRelation(Relation relation);
240241
static void RememberToFreeTupleDescAtEOX(TupleDesc td);
241242
static void AtEOXact_cleanup(Relation relation, bool isCommit);
@@ -270,7 +271,7 @@ static void IndexSupportInitialize(oidvector *indclass,
270271
static OpClassCacheEnt *LookupOpclassInfo(Oid operatorClassOid,
271272
StrategyNumber numSupport);
272273
static void RelationCacheInitFileRemoveInDir(const char *tblspcpath);
273-
static void unlink_initfile(const char *initfilename);
274+
static void unlink_initfile(const char *initfilename, int elevel);
274275

275276

276277
/*
@@ -1669,7 +1670,16 @@ RelationIdGetRelation(Oid relationId)
16691670
RelationReloadIndexInfo(rd);
16701671
else
16711672
RelationClearRelation(rd, true);
1672-
Assert(rd->rd_isvalid);
1673+
1674+
/*
1675+
* Normally entries need to be valid here, but before the relcache
1676+
* has been initialized, not enough infrastructure exists to
1677+
* perform pg_class lookups. The structure of such entries doesn't
1678+
* change, but we still want to update the rd_rel entry. So
1679+
* rd_isvalid = false is left in place for a later lookup.
1680+
*/
1681+
Assert(rd->rd_isvalid ||
1682+
(rd->rd_isnailed && !criticalRelcachesBuilt));
16731683
}
16741684
return rd;
16751685
}
@@ -1872,6 +1882,81 @@ RelationReloadIndexInfo(Relation relation)
18721882
relation->rd_isvalid = true;
18731883
}
18741884

1885+
/*
1886+
* RelationReloadNailed - reload minimal information for nailed relations.
1887+
*
1888+
* The structure of a nailed relation can never change (which is good, because
1889+
* we rely on knowing their structure to be able to read catalog content). But
1890+
* some parts, e.g. pg_class.relfrozenxid, are still important to have
1891+
* accurate content for. Therefore those need to be reloaded after the arrival
1892+
* of invalidations.
1893+
*/
1894+
static void
1895+
RelationReloadNailed(Relation relation)
1896+
{
1897+
Assert(relation->rd_isnailed);
1898+
1899+
/*
1900+
* Redo RelationInitPhysicalAddr in case it is a mapped relation whose
1901+
* mapping changed.
1902+
*/
1903+
RelationInitPhysicalAddr(relation);
1904+
1905+
/* flag as needing to be revalidated */
1906+
relation->rd_isvalid = false;
1907+
1908+
/*
1909+
* Can only reread catalog contents if in a transaction. If the relation
1910+
* is currently open (not counting the nailed refcount), do so
1911+
* immediately. Otherwise we've already marked the entry as possibly
1912+
* invalid, and it'll be fixed when next opened.
1913+
*/
1914+
if (!IsTransactionState() || relation->rd_refcnt <= 1)
1915+
return;
1916+
1917+
if (relation->rd_rel->relkind == RELKIND_INDEX)
1918+
{
1919+
/*
1920+
* If it's a nailed-but-not-mapped index, then we need to re-read the
1921+
* pg_class row to see if its relfilenode changed.
1922+
*/
1923+
RelationReloadIndexInfo(relation);
1924+
}
1925+
else
1926+
{
1927+
/*
1928+
* Reload a non-index entry. We can't easily do so if relcaches
1929+
* aren't yet built, but that's fine because at that stage the
1930+
* attributes that need to be current (like relfrozenxid) aren't yet
1931+
* accessed. To ensure the entry will later be revalidated, we leave
1932+
* it in invalid state, but allow use (cf. RelationIdGetRelation()).
1933+
*/
1934+
if (criticalRelcachesBuilt)
1935+
{
1936+
HeapTuple pg_class_tuple;
1937+
Form_pg_class relp;
1938+
1939+
/*
1940+
* NB: Mark the entry as valid before starting to scan, to avoid
1941+
* self-recursion when re-building pg_class.
1942+
*/
1943+
relation->rd_isvalid = true;
1944+
1945+
pg_class_tuple = ScanPgRelation(RelationGetRelid(relation),
1946+
true, false);
1947+
relp = (Form_pg_class) GETSTRUCT(pg_class_tuple);
1948+
memcpy(relation->rd_rel, relp, CLASS_TUPLE_SIZE);
1949+
heap_freetuple(pg_class_tuple);
1950+
1951+
/*
1952+
* Again mark as valid, to protect against concurrently arriving
1953+
* invalidations.
1954+
*/
1955+
relation->rd_isvalid = true;
1956+
}
1957+
}
1958+
}
1959+
18751960
/*
18761961
* RelationDestroyRelation
18771962
*
@@ -1975,26 +2060,12 @@ RelationClearRelation(Relation relation, bool rebuild)
19752060
RelationCloseSmgr(relation);
19762061

19772062
/*
1978-
* Never, never ever blow away a nailed-in system relation, because we'd
1979-
* be unable to recover. However, we must redo RelationInitPhysicalAddr
1980-
* in case it is a mapped relation whose mapping changed.
1981-
*
1982-
* If it's a nailed-but-not-mapped index, then we need to re-read the
1983-
* pg_class row to see if its relfilenode changed. We do that immediately
1984-
* if we're inside a valid transaction and the relation is open (not
1985-
* counting the nailed refcount). Otherwise just mark the entry as
1986-
* possibly invalid, and it'll be fixed when next opened.
2063+
* Treat nailed-in system relations separately, they always need to be
2064+
* accessible, so we can't blow them away.
19872065
*/
19882066
if (relation->rd_isnailed)
19892067
{
1990-
RelationInitPhysicalAddr(relation);
1991-
1992-
if (relation->rd_rel->relkind == RELKIND_INDEX)
1993-
{
1994-
relation->rd_isvalid = false; /* needs to be revalidated */
1995-
if (relation->rd_refcnt > 1 && IsTransactionState())
1996-
RelationReloadIndexInfo(relation);
1997-
}
2068+
RelationReloadNailed(relation);
19982069
return;
19992070
}
20002071

@@ -5061,24 +5132,24 @@ write_item(const void *data, Size len, FILE *fp)
50615132

50625133
/*
50635134
* Determine whether a given relation (identified by OID) is one of the ones
5064-
* we should store in the local relcache init file.
5135+
* we should store in a relcache init file.
50655136
*
50665137
* We must cache all nailed rels, and for efficiency we should cache every rel
50675138
* that supports a syscache. The former set is almost but not quite a subset
5068-
* of the latter. Currently, we must special-case TriggerRelidNameIndexId,
5069-
* which RelationCacheInitializePhase3 chooses to nail for efficiency reasons,
5070-
* but which does not support any syscache.
5071-
*
5072-
* Note: this function is currently never called for shared rels. If it were,
5073-
* we'd probably also need a special case for DatabaseNameIndexId, which is
5074-
* critical but does not support a syscache.
5139+
* of the latter. The special cases are relations where
5140+
* RelationCacheInitializePhase2/3 chooses to nail for efficiency reasons, but
5141+
* which do not support any syscache.
50755142
*/
50765143
bool
50775144
RelationIdIsInInitFile(Oid relationId)
50785145
{
5079-
if (relationId == TriggerRelidNameIndexId)
5146+
if (relationId == TriggerRelidNameIndexId ||
5147+
relationId == DatabaseNameIndexId)
50805148
{
5081-
/* If this Assert fails, we don't need this special case anymore. */
5149+
/*
5150+
* If this Assert fails, we don't need the applicable special case
5151+
* anymore.
5152+
*/
50825153
Assert(!RelationSupportsSysCache(relationId));
50835154
return true;
50845155
}
@@ -5106,38 +5177,30 @@ RelationIdIsInInitFile(Oid relationId)
51065177
* We take the lock and do the unlink in RelationCacheInitFilePreInvalidate,
51075178
* then release the lock in RelationCacheInitFilePostInvalidate. Caller must
51085179
* send any pending SI messages between those calls.
5109-
*
5110-
* Notice this deals only with the local init file, not the shared init file.
5111-
* The reason is that there can never be a "significant" change to the
5112-
* relcache entry of a shared relation; the most that could happen is
5113-
* updates of noncritical fields such as relpages/reltuples. So, while
5114-
* it's worth updating the shared init file from time to time, it can never
5115-
* be invalid enough to make it necessary to remove it.
51165180
*/
51175181
void
51185182
RelationCacheInitFilePreInvalidate(void)
51195183
{
5120-
char initfilename[MAXPGPATH];
5184+
char localinitfname[MAXPGPATH];
5185+
char sharedinitfname[MAXPGPATH];
51215186

5122-
snprintf(initfilename, sizeof(initfilename), "%s/%s",
5123-
DatabasePath, RELCACHE_INIT_FILENAME);
5187+
if (DatabasePath)
5188+
snprintf(localinitfname, sizeof(localinitfname), "%s/%s",
5189+
DatabasePath, RELCACHE_INIT_FILENAME);
5190+
snprintf(sharedinitfname, sizeof(sharedinitfname), "global/%s",
5191+
RELCACHE_INIT_FILENAME);
51245192

51255193
LWLockAcquire(RelCacheInitLock, LW_EXCLUSIVE);
51265194

5127-
if (unlink(initfilename) < 0)
5128-
{
5129-
/*
5130-
* The file might not be there if no backend has been started since
5131-
* the last removal. But complain about failures other than ENOENT.
5132-
* Fortunately, it's not too late to abort the transaction if we can't
5133-
* get rid of the would-be-obsolete init file.
5134-
*/
5135-
if (errno != ENOENT)
5136-
ereport(ERROR,
5137-
(errcode_for_file_access(),
5138-
errmsg("could not remove cache file \"%s\": %m",
5139-
initfilename)));
5140-
}
5195+
/*
5196+
* The files might not be there if no backend has been started since the
5197+
* last removal. But complain about failures other than ENOENT with
5198+
* ERROR. Fortunately, it's not too late to abort the transaction if we
5199+
* can't get rid of the would-be-obsolete init file.
5200+
*/
5201+
if (DatabasePath)
5202+
unlink_initfile(localinitfname, ERROR);
5203+
unlink_initfile(sharedinitfname, ERROR);
51415204
}
51425205

51435206
void
@@ -5163,13 +5226,9 @@ RelationCacheInitFileRemove(void)
51635226
struct dirent *de;
51645227
char path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)];
51655228

5166-
/*
5167-
* We zap the shared cache file too. In theory it can't get out of sync
5168-
* enough to be a problem, but in data-corruption cases, who knows ...
5169-
*/
51705229
snprintf(path, sizeof(path), "global/%s",
51715230
RELCACHE_INIT_FILENAME);
5172-
unlink_initfile(path);
5231+
unlink_initfile(path, LOG);
51735232

51745233
/* Scan everything in the default tablespace */
51755234
RelationCacheInitFileRemoveInDir("base");
@@ -5221,20 +5280,23 @@ RelationCacheInitFileRemoveInDir(const char *tblspcpath)
52215280
/* Try to remove the init file in each database */
52225281
snprintf(initfilename, sizeof(initfilename), "%s/%s/%s",
52235282
tblspcpath, de->d_name, RELCACHE_INIT_FILENAME);
5224-
unlink_initfile(initfilename);
5283+
unlink_initfile(initfilename, LOG);
52255284
}
52265285
}
52275286

52285287
FreeDir(dir);
52295288
}
52305289

52315290
static void
5232-
unlink_initfile(const char *initfilename)
5291+
unlink_initfile(const char *initfilename, int elevel)
52335292
{
52345293
if (unlink(initfilename) < 0)
52355294
{
52365295
/* It might not be there, but log any error other than ENOENT */
52375296
if (errno != ENOENT)
5238-
elog(LOG, "could not remove cache file \"%s\": %m", initfilename);
5297+
ereport(elevel,
5298+
(errcode_for_file_access(),
5299+
errmsg("could not remove cache file \"%s\": %m",
5300+
initfilename)));
52395301
}
52405302
}

0 commit comments

Comments
 (0)