Skip to content

Commit f64b11f

Browse files
committed
Fix an assertion failure related to an exclusive backup.
Previously multiple sessions could execute pg_start_backup() and pg_stop_backup() to start and stop an exclusive backup at the same time. This could trigger the assertion failure of "FailedAssertion("!(XLogCtl->Insert.exclusiveBackup)". This happend because, even while pg_start_backup() was starting an exclusive backup, other session could run pg_stop_backup() concurrently and mark the backup as not-in-progress unconditionally. This patch introduces ExclusiveBackupState indicating the state of an exclusive backup. This state is used to ensure that there is only one session running pg_start_backup() or pg_stop_backup() at the same time, to avoid the assertion failure. Back-patch to all supported versions. Author: Michael Paquier Reviewed-By: Kyotaro Horiguchi and me Reported-By: Andreas Seltenreich Discussion: <87mvktojme.fsf@credativ.de>
1 parent 47d32a4 commit f64b11f

File tree

1 file changed

+147
-62
lines changed
  • src/backend/access/transam

1 file changed

+147
-62
lines changed

src/backend/access/transam/xlog.c

Lines changed: 147 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,29 @@ typedef struct XLogwrtResult
348348
XLogRecPtr Flush; /* last byte + 1 flushed */
349349
} XLogwrtResult;
350350

351+
/*
352+
* State of an exclusive backup, necessary to control concurrent activities
353+
* across sessions when working on exclusive backups.
354+
*
355+
* EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
356+
* running, to be more precise pg_start_backup() is not being executed for
357+
* an exclusive backup and there is no exclusive backup in progress.
358+
* EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
359+
* exclusive backup.
360+
* EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
361+
* running and an exclusive backup is in progress. pg_stop_backup() is
362+
* needed to finish it.
363+
* EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
364+
* exclusive backup.
365+
*/
366+
typedef enum ExclusiveBackupState
367+
{
368+
EXCLUSIVE_BACKUP_NONE = 0,
369+
EXCLUSIVE_BACKUP_STARTING,
370+
EXCLUSIVE_BACKUP_IN_PROGRESS,
371+
EXCLUSIVE_BACKUP_STOPPING
372+
} ExclusiveBackupState;
373+
351374
/*
352375
* Shared state data for XLogInsert.
353376
*/
@@ -370,13 +393,15 @@ typedef struct XLogCtlInsert
370393
bool fullPageWrites;
371394

372395
/*
373-
* exclusiveBackup is true if a backup started with pg_start_backup() is
374-
* in progress, and nonExclusiveBackups is a counter indicating the number
375-
* of streaming base backups currently in progress. forcePageWrites is set
376-
* to true when either of these is non-zero. lastBackupStart is the latest
377-
* checkpoint redo location used as a starting point for an online backup.
396+
* exclusiveBackupState indicates the state of an exclusive backup
397+
* (see comments of ExclusiveBackupState for more details).
398+
* nonExclusiveBackups is a counter indicating the number of streaming
399+
* base backups currently in progress. forcePageWrites is set to true
400+
* when either of these is non-zero. lastBackupStart is the latest
401+
* checkpoint redo location used as a starting point for an online
402+
* backup.
378403
*/
379-
bool exclusiveBackup;
404+
ExclusiveBackupState exclusiveBackupState;
380405
int nonExclusiveBackups;
381406
XLogRecPtr lastBackupStart;
382407
} XLogCtlInsert;
@@ -693,6 +718,7 @@ static bool CheckForStandbyTrigger(void);
693718
static void xlog_outrec(StringInfo buf, XLogRecord *record);
694719
#endif
695720
static void pg_start_backup_callback(int code, Datum arg);
721+
static void pg_stop_backup_callback(int code, Datum arg);
696722
static bool read_backup_label(XLogRecPtr *checkPointLoc,
697723
bool *backupEndRequired, bool *backupFromStandby);
698724
static void rm_redo_error_callback(void *arg);
@@ -8699,15 +8725,20 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
86998725
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
87008726
if (exclusive)
87018727
{
8702-
if (XLogCtl->Insert.exclusiveBackup)
8728+
/*
8729+
* At first, mark that we're now starting an exclusive backup,
8730+
* to ensure that there are no other sessions currently running
8731+
* pg_start_backup() or pg_stop_backup().
8732+
*/
8733+
if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
87038734
{
87048735
LWLockRelease(WALInsertLock);
87058736
ereport(ERROR,
87068737
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
87078738
errmsg("a backup is already in progress"),
87088739
errhint("Run pg_stop_backup() and try again.")));
87098740
}
8710-
XLogCtl->Insert.exclusiveBackup = true;
8741+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
87118742
}
87128743
else
87138744
XLogCtl->Insert.nonExclusiveBackups++;
@@ -8867,7 +8898,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
88678898
{
88688899
/*
88698900
* Check for existing backup label --- implies a backup is already
8870-
* running. (XXX given that we checked exclusiveBackup above,
8901+
* running. (XXX given that we checked exclusiveBackupState above,
88718902
* maybe it would be OK to just unlink any such label file?)
88728903
*/
88738904
if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
@@ -8908,6 +8939,16 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
89088939
}
89098940
PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
89108941

8942+
/*
8943+
* Mark that start phase has correctly finished for an exclusive backup.
8944+
*/
8945+
if (exclusive)
8946+
{
8947+
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8948+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
8949+
LWLockRelease(WALInsertLock);
8950+
}
8951+
89118952
/*
89128953
* We're done. As a convenience, return the starting WAL location.
89138954
*/
@@ -8926,23 +8967,41 @@ pg_start_backup_callback(int code, Datum arg)
89268967
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
89278968
if (exclusive)
89288969
{
8929-
Assert(XLogCtl->Insert.exclusiveBackup);
8930-
XLogCtl->Insert.exclusiveBackup = false;
8970+
Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
8971+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
89318972
}
89328973
else
89338974
{
89348975
Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
89358976
XLogCtl->Insert.nonExclusiveBackups--;
89368977
}
89378978

8938-
if (!XLogCtl->Insert.exclusiveBackup &&
8979+
if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
89398980
XLogCtl->Insert.nonExclusiveBackups == 0)
89408981
{
89418982
XLogCtl->Insert.forcePageWrites = false;
89428983
}
89438984
LWLockRelease(WALInsertLock);
89448985
}
89458986

8987+
/*
8988+
* Error cleanup callback for pg_stop_backup
8989+
*/
8990+
static void
8991+
pg_stop_backup_callback(int code, Datum arg)
8992+
{
8993+
bool exclusive = DatumGetBool(arg);
8994+
8995+
/* Update backup status on failure */
8996+
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8997+
if (exclusive)
8998+
{
8999+
Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
9000+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
9001+
}
9002+
LWLockRelease(WALInsertLock);
9003+
}
9004+
89469005
/*
89479006
* do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
89489007
* function.
@@ -9006,12 +9065,85 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
90069065
errmsg("WAL level not sufficient for making an online backup"),
90079066
errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
90089067

9068+
if (exclusive)
9069+
{
9070+
/*
9071+
* At first, mark that we're now stopping an exclusive backup,
9072+
* to ensure that there are no other sessions currently running
9073+
* pg_start_backup() or pg_stop_backup().
9074+
*/
9075+
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
9076+
if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
9077+
{
9078+
LWLockRelease(WALInsertLock);
9079+
ereport(ERROR,
9080+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9081+
errmsg("exclusive backup not in progress")));
9082+
}
9083+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
9084+
LWLockRelease(WALInsertLock);
9085+
9086+
/*
9087+
* Remove backup_label. In case of failure, the state for an exclusive
9088+
* backup is switched back to in-progress.
9089+
*/
9090+
PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
9091+
{
9092+
/*
9093+
* Read the existing label file into memory.
9094+
*/
9095+
struct stat statbuf;
9096+
int r;
9097+
9098+
if (stat(BACKUP_LABEL_FILE, &statbuf))
9099+
{
9100+
if (errno != ENOENT)
9101+
ereport(ERROR,
9102+
(errcode_for_file_access(),
9103+
errmsg("could not stat file \"%s\": %m",
9104+
BACKUP_LABEL_FILE)));
9105+
ereport(ERROR,
9106+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9107+
errmsg("a backup is not in progress")));
9108+
}
9109+
9110+
lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
9111+
if (!lfp)
9112+
{
9113+
ereport(ERROR,
9114+
(errcode_for_file_access(),
9115+
errmsg("could not read file \"%s\": %m",
9116+
BACKUP_LABEL_FILE)));
9117+
}
9118+
labelfile = palloc(statbuf.st_size + 1);
9119+
r = fread(labelfile, statbuf.st_size, 1, lfp);
9120+
labelfile[statbuf.st_size] = '\0';
9121+
9122+
/*
9123+
* Close and remove the backup label file
9124+
*/
9125+
if (r != 1 || ferror(lfp) || FreeFile(lfp))
9126+
ereport(ERROR,
9127+
(errcode_for_file_access(),
9128+
errmsg("could not read file \"%s\": %m",
9129+
BACKUP_LABEL_FILE)));
9130+
if (unlink(BACKUP_LABEL_FILE) != 0)
9131+
ereport(ERROR,
9132+
(errcode_for_file_access(),
9133+
errmsg("could not remove file \"%s\": %m",
9134+
BACKUP_LABEL_FILE)));
9135+
}
9136+
PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
9137+
}
9138+
90099139
/*
90109140
* OK to update backup counters and forcePageWrites
90119141
*/
90129142
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
90139143
if (exclusive)
9014-
XLogCtl->Insert.exclusiveBackup = false;
9144+
{
9145+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
9146+
}
90159147
else
90169148
{
90179149
/*
@@ -9024,60 +9156,13 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
90249156
XLogCtl->Insert.nonExclusiveBackups--;
90259157
}
90269158

9027-
if (!XLogCtl->Insert.exclusiveBackup &&
9159+
if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
90289160
XLogCtl->Insert.nonExclusiveBackups == 0)
90299161
{
90309162
XLogCtl->Insert.forcePageWrites = false;
90319163
}
90329164
LWLockRelease(WALInsertLock);
90339165

9034-
if (exclusive)
9035-
{
9036-
/*
9037-
* Read the existing label file into memory.
9038-
*/
9039-
struct stat statbuf;
9040-
int r;
9041-
9042-
if (stat(BACKUP_LABEL_FILE, &statbuf))
9043-
{
9044-
if (errno != ENOENT)
9045-
ereport(ERROR,
9046-
(errcode_for_file_access(),
9047-
errmsg("could not stat file \"%s\": %m",
9048-
BACKUP_LABEL_FILE)));
9049-
ereport(ERROR,
9050-
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9051-
errmsg("a backup is not in progress")));
9052-
}
9053-
9054-
lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
9055-
if (!lfp)
9056-
{
9057-
ereport(ERROR,
9058-
(errcode_for_file_access(),
9059-
errmsg("could not read file \"%s\": %m",
9060-
BACKUP_LABEL_FILE)));
9061-
}
9062-
labelfile = palloc(statbuf.st_size + 1);
9063-
r = fread(labelfile, statbuf.st_size, 1, lfp);
9064-
labelfile[statbuf.st_size] = '\0';
9065-
9066-
/*
9067-
* Close and remove the backup label file
9068-
*/
9069-
if (r != 1 || ferror(lfp) || FreeFile(lfp))
9070-
ereport(ERROR,
9071-
(errcode_for_file_access(),
9072-
errmsg("could not read file \"%s\": %m",
9073-
BACKUP_LABEL_FILE)));
9074-
if (unlink(BACKUP_LABEL_FILE) != 0)
9075-
ereport(ERROR,
9076-
(errcode_for_file_access(),
9077-
errmsg("could not remove file \"%s\": %m",
9078-
BACKUP_LABEL_FILE)));
9079-
}
9080-
90819166
/*
90829167
* Read and parse the START WAL LOCATION line (this code is pretty crude,
90839168
* but we are not expecting any variability in the file format).
@@ -9318,7 +9403,7 @@ do_pg_abort_backup(void)
93189403
Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
93199404
XLogCtl->Insert.nonExclusiveBackups--;
93209405

9321-
if (!XLogCtl->Insert.exclusiveBackup &&
9406+
if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
93229407
XLogCtl->Insert.nonExclusiveBackups == 0)
93239408
{
93249409
XLogCtl->Insert.forcePageWrites = false;

0 commit comments

Comments
 (0)