Skip to content

Commit 2a1cf97

Browse files
committed
Have pg_stop_backup() wait for all archive files to be sent, rather than
returning right away. This guarantees that when pg_stop_backup() returns, you have a valid backup. Simon Riggs
1 parent a0fad97 commit 2a1cf97

File tree

2 files changed

+59
-14
lines changed

2 files changed

+59
-14
lines changed

doc/src/sgml/backup.sgml

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/backup.sgml,v 2.116 2008/03/28 15:00:28 heikki Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/backup.sgml,v 2.117 2008/04/05 01:34:05 momjian Exp $ -->
22

33
<chapter id="backup">
44
<title>Backup and Restore</title>
@@ -761,12 +761,19 @@ SELECT pg_stop_backup();
761761
<para>
762762
Once the WAL segment files used during the backup are archived, you are
763763
done. The file identified by <function>pg_stop_backup</>'s result is
764-
the last segment that needs to be archived to complete the backup.
765-
Archival of these files will happen automatically, since you have
766-
already configured <varname>archive_command</>. In many cases, this
767-
happens fairly quickly, but you are advised to monitor your archival
768-
system to ensure this has taken place so that you can be certain you
769-
have a complete backup.
764+
the last segment that is required to form a complete set of backup files.
765+
<function>pg_stop_backup</> does not return until the last segment has
766+
been archived.
767+
Archiving of these files happens automatically since you have
768+
already configured <varname>archive_command</>. In most cases this
769+
happens quickly, but you are advised to monitor your archive
770+
system to ensure there are no delays.
771+
If the archive process has fallen behind
772+
because of failures of the archive command, it will keep retrying
773+
until the archive succeeds and the backup is complete.
774+
If you wish to place a time limit on the execution of
775+
<function>pg_stop_backup</>, set an appropriate
776+
<varname>statement_timeout</varname> value.
770777
</para>
771778
</listitem>
772779
</orderedlist>
@@ -1044,7 +1051,7 @@ restore_command = 'cp /mnt/server/archivedir/%f %p'
10441051
<note>
10451052
<para>
10461053
The stop point must be after the ending time of the base backup, i.e.,
1047-
the time of <function>pg_stop_backup</>. You cannot use a base backup
1054+
the end time of <function>pg_stop_backup</>. You cannot use a base backup
10481055
to recover to a time when that backup was still going on. (To
10491056
recover to such a time, you must go back to your previous base backup
10501057
and roll forward from there.)
@@ -1322,6 +1329,7 @@ tar -rf /var/lib/pgsql/backup.tar /var/lib/pgsql/archive/
13221329
After the backup the switch file is removed. Archived WAL files are
13231330
then added to the backup so that both base backup and all required
13241331
WAL files are part of the same <application>tar</> file.
1332+
Please remember to add error handling to your backup scripts.
13251333
</para>
13261334
</sect3>
13271335

src/backend/access/transam/xlog.c

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.295 2008/03/25 22:42:42 tgl Exp $
10+
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.296 2008/04/05 01:34:06 momjian Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -382,7 +382,7 @@ static bool InRedo = false;
382382

383383
static void XLogArchiveNotify(const char *xlog);
384384
static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
385-
static bool XLogArchiveCheckDone(const char *xlog);
385+
static bool XLogArchiveCheckDone(const char *xlog, bool create_if_missing);
386386
static void XLogArchiveCleanup(const char *xlog);
387387
static void readRecoveryCommandFile(void);
388388
static void exitArchiveRecovery(TimeLineID endTLI,
@@ -1128,7 +1128,7 @@ XLogArchiveNotifySeg(uint32 log, uint32 seg)
11281128
* create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
11291129
*/
11301130
static bool
1131-
XLogArchiveCheckDone(const char *xlog)
1131+
XLogArchiveCheckDone(const char *xlog, bool create_if_missing)
11321132
{
11331133
char archiveStatusPath[MAXPGPATH];
11341134
struct stat stat_buf;
@@ -1153,7 +1153,9 @@ XLogArchiveCheckDone(const char *xlog)
11531153
return true;
11541154

11551155
/* Retry creation of the .ready file */
1156-
XLogArchiveNotify(xlog);
1156+
if (create_if_missing)
1157+
XLogArchiveNotify(xlog);
1158+
11571159
return false;
11581160
}
11591161

@@ -2704,7 +2706,7 @@ RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
27042706
strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
27052707
strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
27062708
{
2707-
if (XLogArchiveCheckDone(xlde->d_name))
2709+
if (XLogArchiveCheckDone(xlde->d_name, true))
27082710
{
27092711
snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
27102712

@@ -2771,7 +2773,7 @@ CleanupBackupHistory(void)
27712773
strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
27722774
".backup") == 0)
27732775
{
2774-
if (XLogArchiveCheckDone(xlde->d_name))
2776+
if (XLogArchiveCheckDone(xlde->d_name, true))
27752777
{
27762778
ereport(DEBUG2,
27772779
(errmsg("removing transaction log backup history file \"%s\"",
@@ -6556,6 +6558,8 @@ pg_stop_backup(PG_FUNCTION_ARGS)
65566558
FILE *fp;
65576559
char ch;
65586560
int ich;
6561+
int seconds_before_warning;
6562+
int waits = 0;
65596563

65606564
if (!superuser())
65616565
ereport(ERROR,
@@ -6659,6 +6663,39 @@ pg_stop_backup(PG_FUNCTION_ARGS)
66596663
*/
66606664
CleanupBackupHistory();
66616665

6666+
/*
6667+
* Wait until the history file has been archived. We assume that the
6668+
* alphabetic sorting property of the WAL files ensures that the last WAL
6669+
* file has already been archived by the time the history file is archived.
6670+
*
6671+
* We wait forever, since archive_command is supposed to work and
6672+
* we assume the admin wanted his backup to work completely. If you
6673+
* don't wish to wait, you can SET statement_timeout = xx;
6674+
*
6675+
* If the status file is missing, we assume that is because it was
6676+
* set to .ready before we slept, then while asleep it has been set
6677+
* to .done and then removed by a concurrent checkpoint.
6678+
*/
6679+
BackupHistoryFileName(histfilepath, ThisTimeLineID, _logId, _logSeg,
6680+
startpoint.xrecoff % XLogSegSize);
6681+
6682+
seconds_before_warning = 60;
6683+
waits = 0;
6684+
6685+
while (!XLogArchiveCheckDone(histfilepath, false))
6686+
{
6687+
CHECK_FOR_INTERRUPTS();
6688+
6689+
pg_usleep(1000000L);
6690+
6691+
if (++waits >= seconds_before_warning)
6692+
{
6693+
seconds_before_warning *= 2; /* This wraps in >10 years... */
6694+
elog(WARNING, "pg_stop_backup() waiting for archive to complete "
6695+
"(%d seconds delay)", waits);
6696+
}
6697+
}
6698+
66626699
/*
66636700
* We're done. As a convenience, return the ending WAL location.
66646701
*/

0 commit comments

Comments
 (0)