
Commit afbe03f

PANIC on fsync() failure.
On some operating systems, it doesn't make sense to retry fsync(), because dirty data cached by the kernel may have been dropped on write-back failure. In that case the only remaining copy of the data is in the WAL. A subsequent fsync() could appear to succeed, but not have flushed the data. That means that a future checkpoint could apparently complete successfully but have lost data.

Therefore, violently prevent any future checkpoint attempts by panicking on the first fsync() failure. Note that we already did the same for WAL data; this change extends that behavior to non-temporary data files.

Provide a GUC data_sync_retry to control this new behavior, for users of operating systems that don't eject dirty data, and possibly forensic/testing uses. If it is set to on and the write-back error was transient, a later checkpoint might genuinely succeed (on a system that does not throw away buffers on failure); if the error is permanent, later checkpoints will continue to fail. The GUC defaults to off, meaning that we panic.

Back-patch to all supported releases.

There is still a narrow window for error-loss on some operating systems: if the file is closed and later reopened and a write-back error occurs in the intervening time, but the inode has the bad luck to be evicted due to memory pressure before we reopen, we could miss the error. A later patch will address that with a scheme for keeping files with dirty data open at all times, but we judge that to be too complicated to back-patch.

Author: Craig Ringer, with some adjustments by Thomas Munro
Reported-by: Craig Ringer
Reviewed-by: Robert Haas, Thomas Munro, Andres Freund
Discussion: https://postgr.es/m/20180427222842.in2e4mibx45zdth5%40alap3.anarazel.de
1 parent 2811776 commit afbe03f

12 files changed: 99 additions (+), 18 deletions (-)
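For orientation before the per-file hunks: almost every change below routes an existing fsync()-failure report through one new error-level filter added to fd.c. Condensed from the fd.c hunks that follow (a sketch, not a verbatim excerpt):

/* fd.c: new GUC, declared in storage/fd.h; defaults to off. */
bool		data_sync_retry = false;

/*
 * fd.c: return the passed-in error level, or PANIC if data_sync_retry is
 * off.  Reports of fsync() failure on permanent data files pass their
 * level through here, so the first failure forces a crash-restart and WAL
 * replay instead of a possibly data-losing retry.
 */
int
data_sync_elevel(int elevel)
{
	return data_sync_retry ? elevel : PANIC;
}

The call sites left at plain ERROR are those that redo all of their writes on retry, such as the snapshot serialization in snapbuild.c, which gains a comment to that effect.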

doc/src/sgml/config.sgml  (32 additions, 0 deletions)

@@ -7765,6 +7765,38 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-data-sync-retry" xreflabel="data_sync_retry">
+      <term><varname>data_sync_retry</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>data_sync_retry</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        When set to false, which is the default, <productname>PostgreSQL</productname>
+        will raise a PANIC-level error on failure to flush modified data files
+        to the filesystem. This causes the database server to crash.
+       </para>
+       <para>
+        On some operating systems, the status of data in the kernel's page
+        cache is unknown after a write-back failure. In some cases it might
+        have been entirely forgotten, making it unsafe to retry; the second
+        attempt may be reported as successful, when in fact the data has been
+        lost. In these circumstances, the only way to avoid data loss is to
+        recover from the WAL after any failure is reported, preferably
+        after investigating the root cause of the failure and replacing any
+        faulty hardware.
+       </para>
+       <para>
+        If set to true, <productname>PostgreSQL</productname> will instead
+        report an error but continue to run so that the data flushing
+        operation can be retried in a later checkpoint. Only set it to true
+        after investigating the operating system's treatment of buffered data
+        in case of write-back failure.
+       </para>
+      </listitem>
+     </varlistentry>
+
      </variablelist>
 
     </sect1>
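A usage note (not part of the commit): data_sync_retry is registered with PGC_POSTMASTER context in the guc.c hunk below, so changing it means editing postgresql.conf and restarting the server. For example:

# postgresql.conf -- only consider this on an operating system known to keep
# dirty buffers in its page cache after a write-back failure; otherwise leave
# the default (off) so the first fsync() failure PANICs and forces WAL replay.
data_sync_retry = on			# change requires a server restart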

src/backend/access/heap/rewriteheap.c  (3 additions, 3 deletions)

@@ -977,7 +977,7 @@ logical_end_heap_rewrite(RewriteState state)
 	while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL)
 	{
 		if (FileSync(src->vfd, WAIT_EVENT_LOGICAL_REWRITE_SYNC) != 0)
-			ereport(ERROR,
+			ereport(data_sync_elevel(ERROR),
 					(errcode_for_file_access(),
 					 errmsg("could not fsync file \"%s\": %m", src->path)));
 		FileClose(src->vfd);
@@ -1200,7 +1200,7 @@ heap_xlog_logical_rewrite(XLogReaderState *r)
 	 */
 	pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC);
 	if (pg_fsync(fd) != 0)
-		ereport(ERROR,
+		ereport(data_sync_elevel(ERROR),
 				(errcode_for_file_access(),
 				 errmsg("could not fsync file \"%s\": %m", path)));
 	pgstat_report_wait_end();
@@ -1299,7 +1299,7 @@ CheckPointLogicalRewriteHeap(void)
 		 */
 		pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC);
 		if (pg_fsync(fd) != 0)
-			ereport(ERROR,
+			ereport(data_sync_elevel(ERROR),
 					(errcode_for_file_access(),
 					 errmsg("could not fsync file \"%s\": %m", path)));
 		pgstat_report_wait_end();

src/backend/access/transam/slru.c  (1 addition, 1 deletion)

@@ -929,7 +929,7 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
							path, offset)));
			break;
		case SLRU_FSYNC_FAILED:
-			ereport(ERROR,
+			ereport(data_sync_elevel(ERROR),
					(errcode_for_file_access(),
					 errmsg("could not access status of transaction %u", xid),
					 errdetail("Could not fsync file \"%s\": %m.",

src/backend/access/transam/timeline.c  (2 additions, 2 deletions)

@@ -407,7 +407,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 
 	pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_SYNC);
 	if (pg_fsync(fd) != 0)
-		ereport(ERROR,
+		ereport(data_sync_elevel(ERROR),
 				(errcode_for_file_access(),
 				 errmsg("could not fsync file \"%s\": %m", tmppath)));
 	pgstat_report_wait_end();
@@ -487,7 +487,7 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size)
 
 	pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC);
 	if (pg_fsync(fd) != 0)
-		ereport(ERROR,
+		ereport(data_sync_elevel(ERROR),
 				(errcode_for_file_access(),
 				 errmsg("could not fsync file \"%s\": %m", tmppath)));
 	pgstat_report_wait_end();

src/backend/access/transam/xlog.c  (1 addition, 1 deletion)

@@ -3445,7 +3445,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
 
 	pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
 	if (pg_fsync(fd) != 0)
-		ereport(ERROR,
+		ereport(data_sync_elevel(ERROR),
 				(errcode_for_file_access(),
 				 errmsg("could not fsync file \"%s\": %m", tmppath)));
 	pgstat_report_wait_end();

src/backend/replication/logical/snapbuild.c  (3 additions, 0 deletions)

@@ -1630,6 +1630,9 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
	 * fsync the file before renaming so that even if we crash after this we
	 * have either a fully valid file or nothing.
	 *
+	 * It's safe to just ERROR on fsync() here because we'll retry the whole
+	 * operation including the writes.
+	 *
	 * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
	 * some noticeable overhead since it's performed synchronously during
	 * decoding?

src/backend/storage/file/fd.c  (41 additions, 7 deletions)

@@ -138,6 +138,8 @@ int max_files_per_process = 1000;
  */
 int			max_safe_fds = 32;	/* default if not changed */
 
+/* Whether it is safe to continue running after fsync() fails. */
+bool		data_sync_retry = false;
 
 /* Debugging.... */
 
@@ -433,11 +435,9 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
 		 */
 		rc = sync_file_range(fd, offset, nbytes,
 							 SYNC_FILE_RANGE_WRITE);
-
-		/* don't error out, this is just a performance optimization */
 		if (rc != 0)
 		{
-			ereport(WARNING,
+			ereport(data_sync_elevel(WARNING),
 					(errcode_for_file_access(),
 					 errmsg("could not flush dirty data: %m")));
 		}
@@ -509,7 +509,7 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
 		rc = msync(p, (size_t) nbytes, MS_ASYNC);
 		if (rc != 0)
 		{
-			ereport(WARNING,
+			ereport(data_sync_elevel(WARNING),
 					(errcode_for_file_access(),
 					 errmsg("could not flush dirty data: %m")));
 			/* NB: need to fall through to munmap()! */
@@ -565,7 +565,7 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
 void
 fsync_fname(const char *fname, bool isdir)
 {
-	fsync_fname_ext(fname, isdir, false, ERROR);
+	fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
 }
 
 /*
@@ -1031,7 +1031,8 @@ LruDelete(File file)
	 * to leak the FD than to mess up our internal state.
	 */
	if (close(vfdP->fd))
-		elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
+		elog(vfdP->fdstate & FD_TEMPORARY ? LOG : data_sync_elevel(LOG),
+			 "could not close file \"%s\": %m", vfdP->fileName);
	vfdP->fd = VFD_CLOSED;
	--nfile;
 
@@ -1510,7 +1511,14 @@ FileClose(File file)
 	{
 		/* close the file */
 		if (close(vfdP->fd))
-			elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
+		{
+			/*
+			 * We may need to panic on failure to close non-temporary files;
+			 * see LruDelete.
+			 */
+			elog(vfdP->fdstate & FD_TEMPORARY ? LOG : data_sync_elevel(LOG),
+				 "could not close file \"%s\": %m", vfdP->fileName);
+		}
 
 		--nfile;
 		vfdP->fd = VFD_CLOSED;
@@ -2949,6 +2957,9 @@ looks_like_temp_rel_name(const char *name)
  * harmless cases such as read-only files in the data directory, and that's
  * not good either.
  *
+ * Note that if we previously crashed due to a PANIC on fsync(), we'll be
+ * rewriting all changes again during recovery.
+ *
  * Note we assume we're chdir'd into PGDATA to begin with.
  */
 void
@@ -3235,3 +3246,26 @@ fsync_parent_path(const char *fname, int elevel)
 
 	return 0;
 }
+
+/*
+ * Return the passed-in error level, or PANIC if data_sync_retry is off.
+ *
+ * Failure to fsync any data file is cause for immediate panic, unless
+ * data_sync_retry is enabled. Data may have been written to the operating
+ * system and removed from our buffer pool already, and if we are running on
+ * an operating system that forgets dirty data on write-back failure, there
+ * may be only one copy of the data remaining: in the WAL. A later attempt to
+ * fsync again might falsely report success. Therefore we must not allow any
+ * further checkpoints to be attempted. data_sync_retry can in theory be
+ * enabled on systems known not to drop dirty buffered data on write-back
+ * failure (with the likely outcome that checkpoints will continue to fail
+ * until the underlying problem is fixed).
+ *
+ * Any code that reports a failure from fsync() or related functions should
+ * filter the error level with this function.
+ */
+int
+data_sync_elevel(int elevel)
+{
+	return data_sync_retry ? elevel : PANIC;
+}
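The comment on data_sync_elevel() above sets the convention for callers: any report of an fsync()-related failure on permanent data files should filter its error level through it. A minimal call-site sketch mirroring the hunks in this commit — the helper name and the wait event chosen are illustrative, not part of the patch:

#include "postgres.h"

#include "pgstat.h"
#include "storage/fd.h"

/* Illustrative helper (hypothetical name): fsync a permanent data file. */
static void
sync_permanent_file(int fd, const char *path)
{
	pgstat_report_wait_start(WAIT_EVENT_DATA_FILE_SYNC);
	if (pg_fsync(fd) != 0)
		ereport(data_sync_elevel(ERROR),	/* PANIC unless data_sync_retry = on */
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", path)));
	pgstat_report_wait_end();
}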

src/backend/storage/smgr/md.c  (3 additions, 3 deletions)

@@ -1040,7 +1040,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
		MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];
 
		if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
-			ereport(ERROR,
+			ereport(data_sync_elevel(ERROR),
					(errcode_for_file_access(),
					 errmsg("could not fsync file \"%s\": %m",
							FilePathName(v->mdfd_vfd))));
@@ -1285,7 +1285,7 @@ mdsync(void)
						bms_join(new_requests, requests);
 
					errno = save_errno;
-					ereport(ERROR,
+					ereport(data_sync_elevel(ERROR),
							(errcode_for_file_access(),
							 errmsg("could not fsync file \"%s\": %m",
									path)));
@@ -1459,7 +1459,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
					(errmsg("could not forward fsync request because request queue is full")));
 
		if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
-			ereport(ERROR,
+			ereport(data_sync_elevel(ERROR),
					(errcode_for_file_access(),
					 errmsg("could not fsync file \"%s\": %m",
							FilePathName(seg->mdfd_vfd))));

src/backend/utils/cache/relmapper.c  (1 addition, 1 deletion)

@@ -798,7 +798,7 @@ write_relmap_file(bool shared, RelMapFile *newmap,
	 */
	pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_SYNC);
	if (pg_fsync(fd) != 0)
-		ereport(ERROR,
+		ereport(data_sync_elevel(ERROR),
				(errcode_for_file_access(),
				 errmsg("could not fsync relation mapping file \"%s\": %m",
						mapfilename)));

src/backend/utils/misc/guc.c  (9 additions, 0 deletions)

@@ -1667,6 +1667,15 @@ static struct config_bool ConfigureNamesBool[] =
		NULL, NULL, NULL
	},
 
+	{
+		{"data_sync_retry", PGC_POSTMASTER, ERROR_HANDLING_OPTIONS,
+			gettext_noop("Whether to continue running after a failure to sync data files."),
+		},
+		&data_sync_retry,
+		false,
+		NULL, NULL, NULL
+	},
+
	/* End-of-list marker */
	{
		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
src/backend/utils/misc/postgresql.conf.sample  (1 addition, 0 deletions)

@@ -635,6 +635,7 @@
 
 #exit_on_error = off			# terminate session on any error?
 #restart_after_crash = on		# reinitialize after backend crash?
+#data_sync_retry = off			# retry or panic on failure to fsync data?
 
 
 #------------------------------------------------------------------------------

src/include/storage/fd.h  (2 additions, 0 deletions)

@@ -53,6 +53,7 @@ typedef int File;
 
 /* GUC parameter */
 extern PGDLLIMPORT int max_files_per_process;
+extern PGDLLIMPORT bool data_sync_retry;
 
 /*
  * This is private to fd.c, but exported for save/restore_backend_variables()
@@ -124,6 +125,7 @@ extern int durable_rename(const char *oldfile, const char *newfile, int loglevel
 extern int	durable_unlink(const char *fname, int loglevel);
 extern int	durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel);
 extern void SyncDataDirectory(void);
+extern int	data_sync_elevel(int elevel);
 
 /* Filename components for OpenTemporaryFile */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"
