Skip to content

Commit 9ccdd7f

Browse files
committed
PANIC on fsync() failure.
On some operating systems, it doesn't make sense to retry fsync(), because dirty data cached by the kernel may have been dropped on write-back failure. In that case the only remaining copy of the data is in the WAL. A subsequent fsync() could appear to succeed, but not have flushed the data. That means that a future checkpoint could apparently complete successfully but have lost data. Therefore, violently prevent any future checkpoint attempts by panicking on the first fsync() failure. Note that we already did the same for WAL data; this change extends that behavior to non-temporary data files. Provide a GUC data_sync_retry to control this new behavior, for users of operating systems that don't eject dirty data, and possibly forensic/testing uses. If it is set to on and the write-back error was transient, a later checkpoint might genuinely succeed (on a system that does not throw away buffers on failure); if the error is permanent, later checkpoints will continue to fail. The GUC defaults to off, meaning that we panic. Back-patch to all supported releases. There is still a narrow window for error-loss on some operating systems: if the file is closed and later reopened and a write-back error occurs in the intervening time, but the inode has the bad luck to be evicted due to memory pressure before we reopen, we could miss the error. A later patch will address that with a scheme for keeping files with dirty data open at all times, but we judge that to be too complicated to back-patch. Author: Craig Ringer, with some adjustments by Thomas Munro Reported-by: Craig Ringer Reviewed-by: Robert Haas, Thomas Munro, Andres Freund Discussion: https://postgr.es/m/20180427222842.in2e4mibx45zdth5%40alap3.anarazel.de
1 parent 1556cb2 commit 9ccdd7f

File tree

12 files changed

+99
-18
lines changed

12 files changed

+99
-18
lines changed

doc/src/sgml/config.sgml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8161,6 +8161,38 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
81618161
</listitem>
81628162
</varlistentry>
81638163

8164+
<varlistentry id="guc-data-sync-retry" xreflabel="data_sync_retry">
8165+
<term><varname>data_sync_retry</varname> (<type>boolean</type>)
8166+
<indexterm>
8167+
<primary><varname>data_sync_retry</varname> configuration parameter</primary>
8168+
</indexterm>
8169+
</term>
8170+
<listitem>
8171+
<para>
8172+
When set to false, which is the default, <productname>PostgreSQL</productname>
8173+
will raise a PANIC-level error on failure to flush modified data files
8174+
to the filesystem. This causes the database server to crash.
8175+
</para>
8176+
<para>
8177+
On some operating systems, the status of data in the kernel's page
8178+
cache is unknown after a write-back failure. In some cases it might
8179+
have been entirely forgotten, making it unsafe to retry; the second
8180+
attempt may be reported as successful, when in fact the data has been
8181+
lost. In these circumstances, the only way to avoid data loss is to
8182+
recover from the WAL after any failure is reported, preferably
8183+
after investigating the root cause of the failure and replacing any
8184+
faulty hardware.
8185+
</para>
8186+
<para>
8187+
If set to true, <productname>PostgreSQL</productname> will instead
8188+
report an error but continue to run so that the data flushing
8189+
operation can be retried in a later checkpoint. Only set it to true
8190+
after investigating the operating system's treatment of buffered data
8191+
in case of write-back failure.
8192+
</para>
8193+
</listitem>
8194+
</varlistentry>
8195+
81648196
</variablelist>
81658197

81668198
</sect1>

src/backend/access/heap/rewriteheap.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -978,7 +978,7 @@ logical_end_heap_rewrite(RewriteState state)
978978
while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL)
979979
{
980980
if (FileSync(src->vfd, WAIT_EVENT_LOGICAL_REWRITE_SYNC) != 0)
981-
ereport(ERROR,
981+
ereport(data_sync_elevel(ERROR),
982982
(errcode_for_file_access(),
983983
errmsg("could not fsync file \"%s\": %m", src->path)));
984984
FileClose(src->vfd);
@@ -1199,7 +1199,7 @@ heap_xlog_logical_rewrite(XLogReaderState *r)
11991199
*/
12001200
pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC);
12011201
if (pg_fsync(fd) != 0)
1202-
ereport(ERROR,
1202+
ereport(data_sync_elevel(ERROR),
12031203
(errcode_for_file_access(),
12041204
errmsg("could not fsync file \"%s\": %m", path)));
12051205
pgstat_report_wait_end();
@@ -1298,7 +1298,7 @@ CheckPointLogicalRewriteHeap(void)
12981298
*/
12991299
pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC);
13001300
if (pg_fsync(fd) != 0)
1301-
ereport(ERROR,
1301+
ereport(data_sync_elevel(ERROR),
13021302
(errcode_for_file_access(),
13031303
errmsg("could not fsync file \"%s\": %m", path)));
13041304
pgstat_report_wait_end();

src/backend/access/transam/slru.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -928,7 +928,7 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
928928
path, offset)));
929929
break;
930930
case SLRU_FSYNC_FAILED:
931-
ereport(ERROR,
931+
ereport(data_sync_elevel(ERROR),
932932
(errcode_for_file_access(),
933933
errmsg("could not access status of transaction %u", xid),
934934
errdetail("Could not fsync file \"%s\": %m.",

src/backend/access/transam/timeline.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
406406

407407
pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_SYNC);
408408
if (pg_fsync(fd) != 0)
409-
ereport(ERROR,
409+
ereport(data_sync_elevel(ERROR),
410410
(errcode_for_file_access(),
411411
errmsg("could not fsync file \"%s\": %m", tmppath)));
412412
pgstat_report_wait_end();
@@ -485,7 +485,7 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size)
485485

486486
pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC);
487487
if (pg_fsync(fd) != 0)
488-
ereport(ERROR,
488+
ereport(data_sync_elevel(ERROR),
489489
(errcode_for_file_access(),
490490
errmsg("could not fsync file \"%s\": %m", tmppath)));
491491
pgstat_report_wait_end();

src/backend/access/transam/xlog.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3455,7 +3455,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
34553455

34563456
pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
34573457
if (pg_fsync(fd) != 0)
3458-
ereport(ERROR,
3458+
ereport(data_sync_elevel(ERROR),
34593459
(errcode_for_file_access(),
34603460
errmsg("could not fsync file \"%s\": %m", tmppath)));
34613461
pgstat_report_wait_end();

src/backend/replication/logical/snapbuild.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1629,6 +1629,9 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
16291629
* fsync the file before renaming so that even if we crash after this we
16301630
* have either a fully valid file or nothing.
16311631
*
1632+
* It's safe to just ERROR on fsync() here because we'll retry the whole
1633+
* operation including the writes.
1634+
*
16321635
* TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
16331636
* some noticeable overhead since it's performed synchronously during
16341637
* decoding?

src/backend/storage/file/fd.c

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,8 @@ int max_files_per_process = 1000;
145145
*/
146146
int max_safe_fds = 32; /* default if not changed */
147147

148+
/* Whether it is safe to continue running after fsync() fails. */
149+
bool data_sync_retry = false;
148150

149151
/* Debugging.... */
150152

@@ -430,11 +432,9 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
430432
*/
431433
rc = sync_file_range(fd, offset, nbytes,
432434
SYNC_FILE_RANGE_WRITE);
433-
434-
/* don't error out, this is just a performance optimization */
435435
if (rc != 0)
436436
{
437-
ereport(WARNING,
437+
ereport(data_sync_elevel(WARNING),
438438
(errcode_for_file_access(),
439439
errmsg("could not flush dirty data: %m")));
440440
}
@@ -506,7 +506,7 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
506506
rc = msync(p, (size_t) nbytes, MS_ASYNC);
507507
if (rc != 0)
508508
{
509-
ereport(WARNING,
509+
ereport(data_sync_elevel(WARNING),
510510
(errcode_for_file_access(),
511511
errmsg("could not flush dirty data: %m")));
512512
/* NB: need to fall through to munmap()! */
@@ -562,7 +562,7 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
562562
void
563563
fsync_fname(const char *fname, bool isdir)
564564
{
565-
fsync_fname_ext(fname, isdir, false, ERROR);
565+
fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
566566
}
567567

568568
/*
@@ -1022,7 +1022,8 @@ LruDelete(File file)
10221022
* to leak the FD than to mess up our internal state.
10231023
*/
10241024
if (close(vfdP->fd))
1025-
elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
1025+
elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1026+
"could not close file \"%s\": %m", vfdP->fileName);
10261027
vfdP->fd = VFD_CLOSED;
10271028
--nfile;
10281029

@@ -1698,7 +1699,14 @@ FileClose(File file)
16981699
{
16991700
/* close the file */
17001701
if (close(vfdP->fd))
1701-
elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
1702+
{
1703+
/*
1704+
* We may need to panic on failure to close non-temporary files;
1705+
* see LruDelete.
1706+
*/
1707+
elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1708+
"could not close file \"%s\": %m", vfdP->fileName);
1709+
}
17021710

17031711
--nfile;
17041712
vfdP->fd = VFD_CLOSED;
@@ -3091,6 +3099,9 @@ looks_like_temp_rel_name(const char *name)
30913099
* harmless cases such as read-only files in the data directory, and that's
30923100
* not good either.
30933101
*
3102+
* Note that if we previously crashed due to a PANIC on fsync(), we'll be
3103+
* rewriting all changes again during recovery.
3104+
*
30943105
* Note we assume we're chdir'd into PGDATA to begin with.
30953106
*/
30963107
void
@@ -3413,3 +3424,26 @@ MakePGDirectory(const char *directoryName)
34133424
{
34143425
return mkdir(directoryName, pg_dir_create_mode);
34153426
}
3427+
3428+
/*
3429+
* Return the passed-in error level, or PANIC if data_sync_retry is off.
3430+
*
3431+
* Failure to fsync any data file is cause for immediate panic, unless
3432+
* data_sync_retry is enabled. Data may have been written to the operating
3433+
* system and removed from our buffer pool already, and if we are running on
3434+
* an operating system that forgets dirty data on write-back failure, there
3435+
* may be only one copy of the data remaining: in the WAL. A later attempt to
3436+
* fsync again might falsely report success. Therefore we must not allow any
3437+
* further checkpoints to be attempted. data_sync_retry can in theory be
3438+
* enabled on systems known not to drop dirty buffered data on write-back
3439+
* failure (with the likely outcome that checkpoints will continue to fail
3440+
* until the underlying problem is fixed).
3441+
*
3442+
* Any code that reports a failure from fsync() or related functions should
3443+
* filter the error level with this function.
3444+
*/
3445+
int
3446+
data_sync_elevel(int elevel)
3447+
{
3448+
return data_sync_retry ? elevel : PANIC;
3449+
}

src/backend/storage/smgr/md.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1012,7 +1012,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
10121012
MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
10131013

10141014
if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
1015-
ereport(ERROR,
1015+
ereport(data_sync_elevel(ERROR),
10161016
(errcode_for_file_access(),
10171017
errmsg("could not fsync file \"%s\": %m",
10181018
FilePathName(v->mdfd_vfd))));
@@ -1257,7 +1257,7 @@ mdsync(void)
12571257
bms_join(new_requests, requests);
12581258

12591259
errno = save_errno;
1260-
ereport(ERROR,
1260+
ereport(data_sync_elevel(ERROR),
12611261
(errcode_for_file_access(),
12621262
errmsg("could not fsync file \"%s\": %m",
12631263
path)));
@@ -1431,7 +1431,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
14311431
(errmsg("could not forward fsync request because request queue is full")));
14321432

14331433
if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
1434-
ereport(ERROR,
1434+
ereport(data_sync_elevel(ERROR),
14351435
(errcode_for_file_access(),
14361436
errmsg("could not fsync file \"%s\": %m",
14371437
FilePathName(seg->mdfd_vfd))));

src/backend/utils/cache/relmapper.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -876,7 +876,7 @@ write_relmap_file(bool shared, RelMapFile *newmap,
876876
*/
877877
pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_SYNC);
878878
if (pg_fsync(fd) != 0)
879-
ereport(ERROR,
879+
ereport(data_sync_elevel(ERROR),
880880
(errcode_for_file_access(),
881881
errmsg("could not fsync file \"%s\": %m",
882882
mapfilename)));

src/backend/utils/misc/guc.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1830,6 +1830,15 @@ static struct config_bool ConfigureNamesBool[] =
18301830
NULL, NULL, NULL
18311831
},
18321832

1833+
{
1834+
{"data_sync_retry", PGC_POSTMASTER, ERROR_HANDLING_OPTIONS,
1835+
gettext_noop("Whether to continue running after a failure to sync data files."),
1836+
},
1837+
&data_sync_retry,
1838+
false,
1839+
NULL, NULL, NULL
1840+
},
1841+
18331842
/* End-of-list marker */
18341843
{
18351844
{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL

src/backend/utils/misc/postgresql.conf.sample

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -666,6 +666,7 @@
666666

667667
#exit_on_error = off # terminate session on any error?
668668
#restart_after_crash = on # reinitialize after backend crash?
669+
#data_sync_retry = off # retry or panic on failure to fsync data?
669670

670671

671672
#------------------------------------------------------------------------------

src/include/storage/fd.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ typedef int File;
4747

4848
/* GUC parameter */
4949
extern PGDLLIMPORT int max_files_per_process;
50+
extern PGDLLIMPORT bool data_sync_retry;
5051

5152
/*
5253
* This is private to fd.c, but exported for save/restore_backend_variables()
@@ -134,6 +135,7 @@ extern int durable_rename(const char *oldfile, const char *newfile, int loglevel
134135
extern int durable_unlink(const char *fname, int loglevel);
135136
extern int durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel);
136137
extern void SyncDataDirectory(void);
138+
extern int data_sync_elevel(int elevel);
137139

138140
/* Filename components */
139141
#define PG_TEMP_FILES_DIR "pgsql_tmp"

0 commit comments

Comments
 (0)