Skip to content

Commit 6534d54

Browse files
committed
PANIC on fsync() failure.
On some operating systems, it doesn't make sense to retry fsync(), because dirty data cached by the kernel may have been dropped on write-back failure. In that case the only remaining copy of the data is in the WAL. A subsequent fsync() could appear to succeed, but not have flushed the data. That means that a future checkpoint could apparently complete successfully but have lost data. Therefore, violently prevent any future checkpoint attempts by panicking on the first fsync() failure. Note that we already did the same for WAL data; this change extends that behavior to non-temporary data files. Provide a GUC data_sync_retry to control this new behavior, for users of operating systems that don't eject dirty data, and possibly forensic/testing uses. If it is set to on and the write-back error was transient, a later checkpoint might genuinely succeed (on a system that does not throw away buffers on failure); if the error is permanent, later checkpoints will continue to fail. The GUC defaults to off, meaning that we panic. Back-patch to all supported releases. There is still a narrow window for error-loss on some operating systems: if the file is closed and later reopened and a write-back error occurs in the intervening time, but the inode has the bad luck to be evicted due to memory pressure before we reopen, we could miss the error. A later patch will address that with a scheme for keeping files with dirty data open at all times, but we judge that to be too complicated to back-patch. Author: Craig Ringer, with some adjustments by Thomas Munro Reported-by: Craig Ringer Reviewed-by: Robert Haas, Thomas Munro, Andres Freund Discussion: https://postgr.es/m/20180427222842.in2e4mibx45zdth5%40alap3.anarazel.de
1 parent 542e6f3 commit 6534d54

File tree

12 files changed

+99
-18
lines changed

12 files changed

+99
-18
lines changed

doc/src/sgml/config.sgml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8127,6 +8127,38 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
81278127
</listitem>
81288128
</varlistentry>
81298129

8130+
<varlistentry id="guc-data-sync-retry" xreflabel="data_sync_retry">
8131+
<term><varname>data_sync_retry</varname> (<type>boolean</type>)
8132+
<indexterm>
8133+
<primary><varname>data_sync_retry</varname> configuration parameter</primary>
8134+
</indexterm>
8135+
</term>
8136+
<listitem>
8137+
<para>
8138+
When set to false, which is the default, <productname>PostgreSQL</productname>
8139+
will raise a PANIC-level error on failure to flush modified data files
8140+
to the filesystem. This causes the database server to crash.
8141+
</para>
8142+
<para>
8143+
On some operating systems, the status of data in the kernel's page
8144+
cache is unknown after a write-back failure. In some cases it might
8145+
have been entirely forgotten, making it unsafe to retry; the second
8146+
attempt may be reported as successful, when in fact the data has been
8147+
lost. In these circumstances, the only way to avoid data loss is to
8148+
recover from the WAL after any failure is reported, preferably
8149+
after investigating the root cause of the failure and replacing any
8150+
faulty hardware.
8151+
</para>
8152+
<para>
8153+
If set to true, <productname>PostgreSQL</productname> will instead
8154+
report an error but continue to run so that the data flushing
8155+
operation can be retried in a later checkpoint. Only set it to true
8156+
after investigating the operating system's treatment of buffered data
8157+
in case of write-back failure.
8158+
</para>
8159+
</listitem>
8160+
</varlistentry>
8161+
81308162
</variablelist>
81318163

81328164
</sect1>

src/backend/access/heap/rewriteheap.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -978,7 +978,7 @@ logical_end_heap_rewrite(RewriteState state)
978978
while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL)
979979
{
980980
if (FileSync(src->vfd, WAIT_EVENT_LOGICAL_REWRITE_SYNC) != 0)
981-
ereport(ERROR,
981+
ereport(data_sync_elevel(ERROR),
982982
(errcode_for_file_access(),
983983
errmsg("could not fsync file \"%s\": %m", src->path)));
984984
FileClose(src->vfd);
@@ -1199,7 +1199,7 @@ heap_xlog_logical_rewrite(XLogReaderState *r)
11991199
*/
12001200
pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC);
12011201
if (pg_fsync(fd) != 0)
1202-
ereport(ERROR,
1202+
ereport(data_sync_elevel(ERROR),
12031203
(errcode_for_file_access(),
12041204
errmsg("could not fsync file \"%s\": %m", path)));
12051205
pgstat_report_wait_end();
@@ -1298,7 +1298,7 @@ CheckPointLogicalRewriteHeap(void)
12981298
*/
12991299
pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC);
13001300
if (pg_fsync(fd) != 0)
1301-
ereport(ERROR,
1301+
ereport(data_sync_elevel(ERROR),
13021302
(errcode_for_file_access(),
13031303
errmsg("could not fsync file \"%s\": %m", path)));
13041304
pgstat_report_wait_end();

src/backend/access/transam/slru.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -928,7 +928,7 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
928928
path, offset)));
929929
break;
930930
case SLRU_FSYNC_FAILED:
931-
ereport(ERROR,
931+
ereport(data_sync_elevel(ERROR),
932932
(errcode_for_file_access(),
933933
errmsg("could not access status of transaction %u", xid),
934934
errdetail("Could not fsync file \"%s\": %m.",

src/backend/access/transam/timeline.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
406406

407407
pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_SYNC);
408408
if (pg_fsync(fd) != 0)
409-
ereport(ERROR,
409+
ereport(data_sync_elevel(ERROR),
410410
(errcode_for_file_access(),
411411
errmsg("could not fsync file \"%s\": %m", tmppath)));
412412
pgstat_report_wait_end();
@@ -485,7 +485,7 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size)
485485

486486
pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC);
487487
if (pg_fsync(fd) != 0)
488-
ereport(ERROR,
488+
ereport(data_sync_elevel(ERROR),
489489
(errcode_for_file_access(),
490490
errmsg("could not fsync file \"%s\": %m", tmppath)));
491491
pgstat_report_wait_end();

src/backend/access/transam/xlog.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3468,7 +3468,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
34683468

34693469
pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
34703470
if (pg_fsync(fd) != 0)
3471-
ereport(ERROR,
3471+
ereport(data_sync_elevel(ERROR),
34723472
(errcode_for_file_access(),
34733473
errmsg("could not fsync file \"%s\": %m", tmppath)));
34743474
pgstat_report_wait_end();

src/backend/replication/logical/snapbuild.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1629,6 +1629,9 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
16291629
* fsync the file before renaming so that even if we crash after this we
16301630
* have either a fully valid file or nothing.
16311631
*
1632+
* It's safe to just ERROR on fsync() here because we'll retry the whole
1633+
* operation including the writes.
1634+
*
16321635
* TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
16331636
* some noticeable overhead since it's performed synchronously during
16341637
* decoding?

src/backend/storage/file/fd.c

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,8 @@ int max_files_per_process = 1000;
145145
*/
146146
int max_safe_fds = 32; /* default if not changed */
147147

148+
/* Whether it is safe to continue running after fsync() fails. */
149+
bool data_sync_retry = false;
148150

149151
/* Debugging.... */
150152

@@ -442,11 +444,9 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
442444
*/
443445
rc = sync_file_range(fd, offset, nbytes,
444446
SYNC_FILE_RANGE_WRITE);
445-
446-
/* don't error out, this is just a performance optimization */
447447
if (rc != 0)
448448
{
449-
ereport(WARNING,
449+
ereport(data_sync_elevel(WARNING),
450450
(errcode_for_file_access(),
451451
errmsg("could not flush dirty data: %m")));
452452
}
@@ -518,7 +518,7 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
518518
rc = msync(p, (size_t) nbytes, MS_ASYNC);
519519
if (rc != 0)
520520
{
521-
ereport(WARNING,
521+
ereport(data_sync_elevel(WARNING),
522522
(errcode_for_file_access(),
523523
errmsg("could not flush dirty data: %m")));
524524
/* NB: need to fall through to munmap()! */
@@ -574,7 +574,7 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
574574
void
575575
fsync_fname(const char *fname, bool isdir)
576576
{
577-
fsync_fname_ext(fname, isdir, false, ERROR);
577+
fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
578578
}
579579

580580
/*
@@ -1050,7 +1050,8 @@ LruDelete(File file)
10501050
* to leak the FD than to mess up our internal state.
10511051
*/
10521052
if (close(vfdP->fd))
1053-
elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
1053+
elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1054+
"could not close file \"%s\": %m", vfdP->fileName);
10541055
vfdP->fd = VFD_CLOSED;
10551056
--nfile;
10561057

@@ -1754,7 +1755,14 @@ FileClose(File file)
17541755
{
17551756
/* close the file */
17561757
if (close(vfdP->fd))
1757-
elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
1758+
{
1759+
/*
1760+
* We may need to panic on failure to close non-temporary files;
1761+
* see LruDelete.
1762+
*/
1763+
elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1764+
"could not close file \"%s\": %m", vfdP->fileName);
1765+
}
17581766

17591767
--nfile;
17601768
vfdP->fd = VFD_CLOSED;
@@ -3250,6 +3258,9 @@ looks_like_temp_rel_name(const char *name)
32503258
* harmless cases such as read-only files in the data directory, and that's
32513259
* not good either.
32523260
*
3261+
* Note that if we previously crashed due to a PANIC on fsync(), we'll be
3262+
* rewriting all changes again during recovery.
3263+
*
32533264
* Note we assume we're chdir'd into PGDATA to begin with.
32543265
*/
32553266
void
@@ -3572,3 +3583,26 @@ MakePGDirectory(const char *directoryName)
35723583
{
35733584
return mkdir(directoryName, pg_dir_create_mode);
35743585
}
3586+
3587+
/*
3588+
* Return the passed-in error level, or PANIC if data_sync_retry is off.
3589+
*
3590+
* Failure to fsync any data file is cause for immediate panic, unless
3591+
* data_sync_retry is enabled. Data may have been written to the operating
3592+
* system and removed from our buffer pool already, and if we are running on
3593+
* an operating system that forgets dirty data on write-back failure, there
3594+
* may be only one copy of the data remaining: in the WAL. A later attempt to
3595+
* fsync again might falsely report success. Therefore we must not allow any
3596+
* further checkpoints to be attempted. data_sync_retry can in theory be
3597+
* enabled on systems known not to drop dirty buffered data on write-back
3598+
* failure (with the likely outcome that checkpoints will continue to fail
3599+
* until the underlying problem is fixed).
3600+
*
3601+
* Any code that reports a failure from fsync() or related functions should
3602+
* filter the error level with this function.
3603+
*/
3604+
int
3605+
data_sync_elevel(int elevel)
3606+
{
3607+
return data_sync_retry ? elevel : PANIC;
3608+
}

src/backend/storage/smgr/md.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,7 +1039,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
10391039
MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
10401040

10411041
if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
1042-
ereport(ERROR,
1042+
ereport(data_sync_elevel(ERROR),
10431043
(errcode_for_file_access(),
10441044
errmsg("could not fsync file \"%s\": %m",
10451045
FilePathName(v->mdfd_vfd))));
@@ -1284,7 +1284,7 @@ mdsync(void)
12841284
bms_join(new_requests, requests);
12851285

12861286
errno = save_errno;
1287-
ereport(ERROR,
1287+
ereport(data_sync_elevel(ERROR),
12881288
(errcode_for_file_access(),
12891289
errmsg("could not fsync file \"%s\": %m",
12901290
path)));
@@ -1458,7 +1458,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
14581458
(errmsg("could not forward fsync request because request queue is full")));
14591459

14601460
if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
1461-
ereport(ERROR,
1461+
ereport(data_sync_elevel(ERROR),
14621462
(errcode_for_file_access(),
14631463
errmsg("could not fsync file \"%s\": %m",
14641464
FilePathName(seg->mdfd_vfd))));

src/backend/utils/cache/relmapper.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -799,7 +799,7 @@ write_relmap_file(bool shared, RelMapFile *newmap,
799799
*/
800800
pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_SYNC);
801801
if (pg_fsync(fd) != 0)
802-
ereport(ERROR,
802+
ereport(data_sync_elevel(ERROR),
803803
(errcode_for_file_access(),
804804
errmsg("could not fsync relation mapping file \"%s\": %m",
805805
mapfilename)));

src/backend/utils/misc/guc.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1824,6 +1824,15 @@ static struct config_bool ConfigureNamesBool[] =
18241824
NULL, NULL, NULL
18251825
},
18261826

1827+
{
1828+
{"data_sync_retry", PGC_POSTMASTER, ERROR_HANDLING_OPTIONS,
1829+
gettext_noop("Whether to continue running after a failure to sync data files."),
1830+
},
1831+
&data_sync_retry,
1832+
false,
1833+
NULL, NULL, NULL
1834+
},
1835+
18271836
/* End-of-list marker */
18281837
{
18291838
{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL

src/backend/utils/misc/postgresql.conf.sample

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -665,6 +665,7 @@
665665

666666
#exit_on_error = off # terminate session on any error?
667667
#restart_after_crash = on # reinitialize after backend crash?
668+
#data_sync_retry = off # retry or panic on failure to fsync data?
668669

669670

670671
#------------------------------------------------------------------------------

src/include/storage/fd.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ typedef int File;
5151

5252
/* GUC parameter */
5353
extern PGDLLIMPORT int max_files_per_process;
54+
extern PGDLLIMPORT bool data_sync_retry;
5455

5556
/*
5657
* This is private to fd.c, but exported for save/restore_backend_variables()
@@ -138,6 +139,7 @@ extern int durable_rename(const char *oldfile, const char *newfile, int loglevel
138139
extern int durable_unlink(const char *fname, int loglevel);
139140
extern int durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel);
140141
extern void SyncDataDirectory(void);
142+
extern int data_sync_elevel(int elevel);
141143

142144
/* Filename components */
143145
#define PG_TEMP_FILES_DIR "pgsql_tmp"

0 commit comments

Comments
 (0)