Fix bugs in vacuum of shared rels, by keeping their relcache entries current.

anarazel · anarazel · commit 817f9f9a8a19 · 2018-06-12T11:13:22.000-07:00
When vacuum processes a relation it uses the corresponding relcache entry's relfrozenxid / relminmxid as a cutoff for when to remove tuples etc. Unfortunately for nailed relations (i.e. critical system catalogs) bugs could frequently lead to the corresponding relcache entry being stale. This set of bugs could cause actual data corruption as vacuum would potentially not remove the correct row versions, potentially reviving them at a later point. After 699bf7d some corruptions in this vein were prevented, but the additional error checks could also trigger spuriously. Examples of such errors are: ERROR: found xmin ... from before relfrozenxid ... and ERROR: found multixact ... from before relminmxid ... To be caused by this bug the errors have to occur on system catalog tables. The two bugs are: 1) Invalidations for nailed relations were ignored, based on the theory that the relcache entry for such tables doesn't change. Which is largely true, except for fields like relfrozenxid etc. This means that changes to relations vacuumed in other sessions weren't picked up by already existing sessions. Luckily autovacuum doesn't have particularly longrunning sessions. 2) For shared *and* nailed relations, the shared relcache init file was never invalidated while running. That means that for such tables (e.g. pg_authid, pg_database) it's not just already existing sessions that are affected, but even new connections are as well. That explains why the reports usually were about pg_authid et. al. To fix 1), revalidate the rd_rel portion of a relcache entry when invalid. This implies a bit of extra complexity to deal with bootstrapping, but it's not too bad. The fix for 2) is simpler, simply always remove both the shared and local init files. Author: Andres Freund Reviewed-By: Alvaro Herrera Discussion: https://postgr.es/m/20180525203736.crkbg36muzxrjj5e@alap3.anarazel.de https://postgr.es/m/CAMa1XUhKSJd98JW4o9StWPrfS=11bPgG+_GDMxe25TvUY4Sugg@mail.gmail.com https://postgr.es/m/CAKMFJucqbuoDRfxPDX39WhA3vJyxweRg_zDVXzncr6+5wOguWA@mail.gmail.com https://postgr.es/m/CAGewt-ujGpMLQ09gXcUFMZaZsGJC98VXHEFbF-tpPB0fB13K+A@mail.gmail.com Backpatch: 9.3-
diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c
@@ -514,10 +514,12 @@ RegisterRelcacheInvalidation(Oid dbId, Oid relId)
 	(void) GetCurrentCommandId(true);
 
 	/*
-	 * If the relation being invalidated is one of those cached in the local
-	 * relcache init file, mark that we need to zap that file at commit.
+	 * If the relation being invalidated is one of those cached in a relcache
+	 * init file, mark that we need to zap that file at commit. For simplicity
+	 * invalidations for a specific database always invalidate the shared file
+	 * as well.
 	 */
-	if (OidIsValid(dbId) && RelationIdIsInInitFile(relId))
+	if (RelationIdIsInInitFile(relId))
 		transInvalInfo->RelcacheInitFileInval = true;
 }
 
@@ -865,18 +867,26 @@ ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs,
 
 	if (RelcacheInitFileInval)
 	{
+		elog(trace_recovery(DEBUG4), "removing relcache init files for database %u",
+			 dbid);
+
 		/*
-		 * RelationCacheInitFilePreInvalidate requires DatabasePath to be set,
-		 * but we should not use SetDatabasePath during recovery, since it is
+		 * RelationCacheInitFilePreInvalidate, when the invalidation message
+		 * is for a specific database, requires DatabasePath to be set, but we
+		 * should not use SetDatabasePath during recovery, since it is
 		 * intended to be used only once by normal backends.  Hence, a quick
 		 * hack: set DatabasePath directly then unset after use.
 		 */
-		DatabasePath = GetDatabasePath(dbid, tsid);
-		elog(trace_recovery(DEBUG4), "removing relcache init file in \"%s\"",
-			 DatabasePath);
+		if (OidIsValid(dbid))
+			DatabasePath = GetDatabasePath(dbid, tsid);
+
 		RelationCacheInitFilePreInvalidate();
-		pfree(DatabasePath);
-		DatabasePath = NULL;
+
+		if (OidIsValid(dbid))
+		{
+			pfree(DatabasePath);
+			DatabasePath = NULL;
+		}
 	}
 
 	SendSharedInvalidMessages(msgs, nmsgs);
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
@@ -236,6 +236,7 @@ static void RelationDestroyRelation(Relation relation, bool remember_tupdesc);
 static void RelationClearRelation(Relation relation, bool rebuild);
 
 static void RelationReloadIndexInfo(Relation relation);
+static void RelationReloadNailed(Relation relation);
 static void RelationFlushRelation(Relation relation);
 static void RememberToFreeTupleDescAtEOX(TupleDesc td);
 static void AtEOXact_cleanup(Relation relation, bool isCommit);
@@ -270,7 +271,7 @@ static void IndexSupportInitialize(oidvector *indclass,
 static OpClassCacheEnt *LookupOpclassInfo(Oid operatorClassOid,
 				  StrategyNumber numSupport);
 static void RelationCacheInitFileRemoveInDir(const char *tblspcpath);
-static void unlink_initfile(const char *initfilename);
+static void unlink_initfile(const char *initfilename, int elevel);
 
 
 /*
@@ -1669,7 +1670,16 @@ RelationIdGetRelation(Oid relationId)
 				RelationReloadIndexInfo(rd);
 			else
 				RelationClearRelation(rd, true);
-			Assert(rd->rd_isvalid);
+
+			/*
+			 * Normally entries need to be valid here, but before the relcache
+			 * has been initialized, not enough infrastructure exists to
+			 * perform pg_class lookups. The structure of such entries doesn't
+			 * change, but we still want to update the rd_rel entry. So
+			 * rd_isvalid = false is left in place for a later lookup.
+			 */
+			Assert(rd->rd_isvalid ||
+				   (rd->rd_isnailed && !criticalRelcachesBuilt));
 		}
 		return rd;
 	}
@@ -1872,6 +1882,81 @@ RelationReloadIndexInfo(Relation relation)
 	relation->rd_isvalid = true;
 }
 
+/*
+ * RelationReloadNailed - reload minimal information for nailed relations.
+ *
+ * The structure of a nailed relation can never change (which is good, because
+ * we rely on knowing their structure to be able to read catalog content). But
+ * some parts, e.g. pg_class.relfrozenxid, are still important to have
+ * accurate content for. Therefore those need to be reloaded after the arrival
+ * of invalidations.
+ */
+static void
+RelationReloadNailed(Relation relation)
+{
+	Assert(relation->rd_isnailed);
+
+	/*
+	 * Redo RelationInitPhysicalAddr in case it is a mapped relation whose
+	 * mapping changed.
+	 */
+	RelationInitPhysicalAddr(relation);
+
+	/* flag as needing to be revalidated */
+	relation->rd_isvalid = false;
+
+	/*
+	 * Can only reread catalog contents if in a transaction.  If the relation
+	 * is currently open (not counting the nailed refcount), do so
+	 * immediately. Otherwise we've already marked the entry as possibly
+	 * invalid, and it'll be fixed when next opened.
+	 */
+	if (!IsTransactionState() || relation->rd_refcnt <= 1)
+		return;
+
+	if (relation->rd_rel->relkind == RELKIND_INDEX)
+	{
+		/*
+		 * If it's a nailed-but-not-mapped index, then we need to re-read the
+		 * pg_class row to see if its relfilenode changed.
+		 */
+		RelationReloadIndexInfo(relation);
+	}
+	else
+	{
+		/*
+		 * Reload a non-index entry.  We can't easily do so if relcaches
+		 * aren't yet built, but that's fine because at that stage the
+		 * attributes that need to be current (like relfrozenxid) aren't yet
+		 * accessed.  To ensure the entry will later be revalidated, we leave
+		 * it in invalid state, but allow use (cf. RelationIdGetRelation()).
+		 */
+		if (criticalRelcachesBuilt)
+		{
+			HeapTuple	pg_class_tuple;
+			Form_pg_class relp;
+
+			/*
+			 * NB: Mark the entry as valid before starting to scan, to avoid
+			 * self-recursion when re-building pg_class.
+			 */
+			relation->rd_isvalid = true;
+
+			pg_class_tuple = ScanPgRelation(RelationGetRelid(relation),
+											true, false);
+			relp = (Form_pg_class) GETSTRUCT(pg_class_tuple);
+			memcpy(relation->rd_rel, relp, CLASS_TUPLE_SIZE);
+			heap_freetuple(pg_class_tuple);
+
+			/*
+			 * Again mark as valid, to protect against concurrently arriving
+			 * invalidations.
+			 */
+			relation->rd_isvalid = true;
+		}
+	}
+}
+
 /*
  * RelationDestroyRelation
  *
@@ -1975,26 +2060,12 @@ RelationClearRelation(Relation relation, bool rebuild)
 	RelationCloseSmgr(relation);
 
 	/*
-	 * Never, never ever blow away a nailed-in system relation, because we'd
-	 * be unable to recover.  However, we must redo RelationInitPhysicalAddr
-	 * in case it is a mapped relation whose mapping changed.
-	 *
-	 * If it's a nailed-but-not-mapped index, then we need to re-read the
-	 * pg_class row to see if its relfilenode changed. We do that immediately
-	 * if we're inside a valid transaction and the relation is open (not
-	 * counting the nailed refcount).  Otherwise just mark the entry as
-	 * possibly invalid, and it'll be fixed when next opened.
+	 * Treat nailed-in system relations separately, they always need to be
+	 * accessible, so we can't blow them away.
 	 */
 	if (relation->rd_isnailed)
 	{
-		RelationInitPhysicalAddr(relation);
-
-		if (relation->rd_rel->relkind == RELKIND_INDEX)
-		{
-			relation->rd_isvalid = false;		/* needs to be revalidated */
-			if (relation->rd_refcnt > 1 && IsTransactionState())
-				RelationReloadIndexInfo(relation);
-		}
+		RelationReloadNailed(relation);
 		return;
 	}
 
@@ -5061,24 +5132,24 @@ write_item(const void *data, Size len, FILE *fp)
 
 /*
  * Determine whether a given relation (identified by OID) is one of the ones
- * we should store in the local relcache init file.
+ * we should store in a relcache init file.
  *
  * We must cache all nailed rels, and for efficiency we should cache every rel
  * that supports a syscache.  The former set is almost but not quite a subset
- * of the latter.  Currently, we must special-case TriggerRelidNameIndexId,
- * which RelationCacheInitializePhase3 chooses to nail for efficiency reasons,
- * but which does not support any syscache.
- *
- * Note: this function is currently never called for shared rels.  If it were,
- * we'd probably also need a special case for DatabaseNameIndexId, which is
- * critical but does not support a syscache.
+ * of the latter. The special cases are relations where
+ * RelationCacheInitializePhase2/3 chooses to nail for efficiency reasons, but
+ * which do not support any syscache.
  */
 bool
 RelationIdIsInInitFile(Oid relationId)
 {
-	if (relationId == TriggerRelidNameIndexId)
+	if (relationId == TriggerRelidNameIndexId ||
+		relationId == DatabaseNameIndexId)
 	{
-		/* If this Assert fails, we don't need this special case anymore. */
+		/*
+		 * If this Assert fails, we don't need the applicable special case
+		 * anymore.
+		 */
 		Assert(!RelationSupportsSysCache(relationId));
 		return true;
 	}
@@ -5106,38 +5177,30 @@ RelationIdIsInInitFile(Oid relationId)
  * We take the lock and do the unlink in RelationCacheInitFilePreInvalidate,
  * then release the lock in RelationCacheInitFilePostInvalidate.  Caller must
  * send any pending SI messages between those calls.
- *
- * Notice this deals only with the local init file, not the shared init file.
- * The reason is that there can never be a "significant" change to the
- * relcache entry of a shared relation; the most that could happen is
- * updates of noncritical fields such as relpages/reltuples.  So, while
- * it's worth updating the shared init file from time to time, it can never
- * be invalid enough to make it necessary to remove it.
  */
 void
 RelationCacheInitFilePreInvalidate(void)
 {
-	char		initfilename[MAXPGPATH];
+	char		localinitfname[MAXPGPATH];
+	char		sharedinitfname[MAXPGPATH];
 
-	snprintf(initfilename, sizeof(initfilename), "%s/%s",
-			 DatabasePath, RELCACHE_INIT_FILENAME);
+	if (DatabasePath)
+		snprintf(localinitfname, sizeof(localinitfname), "%s/%s",
+				 DatabasePath, RELCACHE_INIT_FILENAME);
+	snprintf(sharedinitfname, sizeof(sharedinitfname), "global/%s",
+			 RELCACHE_INIT_FILENAME);
 
 	LWLockAcquire(RelCacheInitLock, LW_EXCLUSIVE);
 
-	if (unlink(initfilename) < 0)
-	{
-		/*
-		 * The file might not be there if no backend has been started since
-		 * the last removal.  But complain about failures other than ENOENT.
-		 * Fortunately, it's not too late to abort the transaction if we can't
-		 * get rid of the would-be-obsolete init file.
-		 */
-		if (errno != ENOENT)
-			ereport(ERROR,
-					(errcode_for_file_access(),
-					 errmsg("could not remove cache file \"%s\": %m",
-							initfilename)));
-	}
+	/*
+	 * The files might not be there if no backend has been started since the
+	 * last removal.  But complain about failures other than ENOENT with
+	 * ERROR.  Fortunately, it's not too late to abort the transaction if we
+	 * can't get rid of the would-be-obsolete init file.
+	 */
+	if (DatabasePath)
+		unlink_initfile(localinitfname, ERROR);
+	unlink_initfile(sharedinitfname, ERROR);
 }
 
 void
@@ -5163,13 +5226,9 @@ RelationCacheInitFileRemove(void)
 	struct dirent *de;
 	char		path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)];
 
-	/*
-	 * We zap the shared cache file too.  In theory it can't get out of sync
-	 * enough to be a problem, but in data-corruption cases, who knows ...
-	 */
 	snprintf(path, sizeof(path), "global/%s",
 			 RELCACHE_INIT_FILENAME);
-	unlink_initfile(path);
+	unlink_initfile(path, LOG);
 
 	/* Scan everything in the default tablespace */
 	RelationCacheInitFileRemoveInDir("base");
@@ -5221,20 +5280,23 @@ RelationCacheInitFileRemoveInDir(const char *tblspcpath)
 			/* Try to remove the init file in each database */
 			snprintf(initfilename, sizeof(initfilename), "%s/%s/%s",
 					 tblspcpath, de->d_name, RELCACHE_INIT_FILENAME);
-			unlink_initfile(initfilename);
+			unlink_initfile(initfilename, LOG);
 		}
 	}
 
 	FreeDir(dir);
 }
 
 static void
-unlink_initfile(const char *initfilename)
+unlink_initfile(const char *initfilename, int elevel)
 {
 	if (unlink(initfilename) < 0)
 	{
 		/* It might not be there, but log any error other than ENOENT */
 		if (errno != ENOENT)
-			elog(LOG, "could not remove cache file \"%s\": %m", initfilename);
+			ereport(elevel,
+					(errcode_for_file_access(),
+					 errmsg("could not remove cache file \"%s\": %m",
+							initfilename)));
 	}
 }