Skip to content

Commit f1ff5f5

Browse files
committed
PANIC on fsync() failure.
On some operating systems, it doesn't make sense to retry fsync(), because dirty data cached by the kernel may have been dropped on write-back failure. In that case the only remaining copy of the data is in the WAL. A subsequent fsync() could appear to succeed, but not have flushed the data. That means that a future checkpoint could apparently complete successfully but have lost data. Therefore, violently prevent any future checkpoint attempts by panicking on the first fsync() failure.

Note that we already did the same for WAL data; this change extends that behavior to non-temporary data files.

Provide a GUC data_sync_retry to control this new behavior, for users of operating systems that don't eject dirty data, and possibly forensic/testing uses. If it is set to on and the write-back error was transient, a later checkpoint might genuinely succeed (on a system that does not throw away buffers on failure); if the error is permanent, later checkpoints will continue to fail. The GUC defaults to off, meaning that we panic.

Back-patch to all supported releases.

There is still a narrow window for error-loss on some operating systems: if the file is closed and later reopened and a write-back error occurs in the intervening time, but the inode has the bad luck to be evicted due to memory pressure before we reopen, we could miss the error. A later patch will address that with a scheme for keeping files with dirty data open at all times, but we judge that to be too complicated to back-patch.

Author: Craig Ringer, with some adjustments by Thomas Munro
Reported-by: Craig Ringer
Reviewed-by: Robert Haas, Thomas Munro, Andres Freund
Discussion: https://postgr.es/m/20180427222842.in2e4mibx45zdth5%40alap3.anarazel.de
1 parent 2b2010d commit f1ff5f5

File tree

12 files changed

+97
-14
lines changed

12 files changed

+97
-14
lines changed

doc/src/sgml/config.sgml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6829,6 +6829,38 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
68296829
</listitem>
68306830
</varlistentry>
68316831

6832+
<varlistentry id="guc-data-sync-retry" xreflabel="data_sync_retry">
6833+
<term><varname>data_sync_retry</varname> (<type>boolean</type>)
6834+
<indexterm>
6835+
<primary><varname>data_sync_retry</varname> configuration parameter</primary>
6836+
</indexterm>
6837+
</term>
6838+
<listitem>
6839+
<para>
6840+
When set to false, which is the default, <productname>PostgreSQL</productname>
6841+
will raise a PANIC-level error on failure to flush modified data files
6842+
to the filesystem. This causes the database server to crash.
6843+
</para>
6844+
<para>
6845+
On some operating systems, the status of data in the kernel's page
6846+
cache is unknown after a write-back failure. In some cases it might
6847+
have been entirely forgotten, making it unsafe to retry; the second
6848+
attempt may be reported as successful, when in fact the data has been
6849+
lost. In these circumstances, the only way to avoid data loss is to
6850+
recover from the WAL after any failure is reported, preferably
6851+
after investigating the root cause of the failure and replacing any
6852+
faulty hardware.
6853+
</para>
6854+
<para>
6855+
If set to true, <productname>PostgreSQL</productname> will instead
6856+
report an error but continue to run so that the data flushing
6857+
operation can be retried in a later checkpoint. Only set it to true
6858+
after investigating the operating system's treatment of buffered data
6859+
in case of write-back failure.
6860+
</para>
6861+
</listitem>
6862+
</varlistentry>
6863+
68326864
</variablelist>
68336865

68346866
</sect1>

src/backend/access/heap/rewriteheap.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -984,7 +984,7 @@ logical_end_heap_rewrite(RewriteState state)
984984
while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL)
985985
{
986986
if (FileSync(src->vfd) != 0)
987-
ereport(ERROR,
987+
ereport(data_sync_elevel(ERROR),
988988
(errcode_for_file_access(),
989989
errmsg("could not fsync file \"%s\": %m", src->path)));
990990
FileClose(src->vfd);
@@ -1202,7 +1202,7 @@ heap_xlog_logical_rewrite(XLogRecPtr lsn, XLogRecord *r)
12021202
* doesn't seem worth the trouble.
12031203
*/
12041204
if (pg_fsync(fd) != 0)
1205-
ereport(ERROR,
1205+
ereport(data_sync_elevel(ERROR),
12061206
(errcode_for_file_access(),
12071207
errmsg("could not fsync file \"%s\": %m", path)));
12081208

@@ -1299,7 +1299,7 @@ CheckPointLogicalRewriteHeap(void)
12991299
* but it's currently not deemed worth the effort.
13001300
*/
13011301
else if (pg_fsync(fd) != 0)
1302-
ereport(ERROR,
1302+
ereport(data_sync_elevel(ERROR),
13031303
(errcode_for_file_access(),
13041304
errmsg("could not fsync file \"%s\": %m", path)));
13051305
CloseTransientFile(fd);

src/backend/access/transam/slru.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -903,7 +903,7 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
903903
path, offset)));
904904
break;
905905
case SLRU_FSYNC_FAILED:
906-
ereport(ERROR,
906+
ereport(data_sync_elevel(ERROR),
907907
(errcode_for_file_access(),
908908
errmsg("could not access status of transaction %u", xid),
909909
errdetail("Could not fsync file \"%s\": %m.",

src/backend/access/transam/timeline.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
402402
}
403403

404404
if (pg_fsync(fd) != 0)
405-
ereport(ERROR,
405+
ereport(data_sync_elevel(ERROR),
406406
(errcode_for_file_access(),
407407
errmsg("could not fsync file \"%s\": %m", tmppath)));
408408

@@ -478,7 +478,7 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size)
478478
}
479479

480480
if (pg_fsync(fd) != 0)
481-
ereport(ERROR,
481+
ereport(data_sync_elevel(ERROR),
482482
(errcode_for_file_access(),
483483
errmsg("could not fsync file \"%s\": %m", tmppath)));
484484

src/backend/access/transam/xlog.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3388,7 +3388,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno)
33883388
}
33893389

33903390
if (pg_fsync(fd) != 0)
3391-
ereport(ERROR,
3391+
ereport(data_sync_elevel(ERROR),
33923392
(errcode_for_file_access(),
33933393
errmsg("could not fsync file \"%s\": %m", tmppath)));
33943394

src/backend/replication/logical/snapbuild.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1589,6 +1589,9 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
15891589
* fsync the file before renaming so that even if we crash after this we
15901590
* have either a fully valid file or nothing.
15911591
*
1592+
* It's safe to just ERROR on fsync() here because we'll retry the whole
1593+
* operation including the writes.
1594+
*
15921595
* TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
15931596
* some noticeable overhead since it's performed synchronously during
15941597
* decoding?

src/backend/storage/file/fd.c

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,8 @@ int max_files_per_process = 1000;
130130
*/
131131
int max_safe_fds = 32; /* default if not changed */
132132

133+
/* Whether it is safe to continue running after fsync() fails. */
134+
bool data_sync_retry = false;
133135

134136
/* Debugging.... */
135137

@@ -423,7 +425,7 @@ pg_flush_data(int fd, off_t offset, off_t amount)
423425
void
424426
fsync_fname(const char *fname, bool isdir)
425427
{
426-
fsync_fname_ext(fname, isdir, false, ERROR);
428+
fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
427429
}
428430

429431
/*
@@ -852,7 +854,8 @@ LruDelete(File file)
852854
* to leak the FD than to mess up our internal state.
853855
*/
854856
if (close(vfdP->fd))
855-
elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
857+
elog(vfdP->fdstate & FD_TEMPORARY ? LOG : data_sync_elevel(LOG),
858+
"could not close file \"%s\": %m", vfdP->fileName);
856859
vfdP->fd = VFD_CLOSED;
857860
--nfile;
858861

@@ -1331,7 +1334,14 @@ FileClose(File file)
13311334
{
13321335
/* close the file */
13331336
if (close(vfdP->fd))
1334-
elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
1337+
{
1338+
/*
1339+
* We may need to panic on failure to close non-temporary files;
1340+
* see LruDelete.
1341+
*/
1342+
elog(vfdP->fdstate & FD_TEMPORARY ? LOG : data_sync_elevel(LOG),
1343+
"could not close file \"%s\": %m", vfdP->fileName);
1344+
}
13351345

13361346
--nfile;
13371347
vfdP->fd = VFD_CLOSED;
@@ -2697,6 +2707,9 @@ looks_like_temp_rel_name(const char *name)
26972707
* harmless cases such as read-only files in the data directory, and that's
26982708
* not good either.
26992709
*
2710+
* Note that if we previously crashed due to a PANIC on fsync(), we'll be
2711+
* rewriting all changes again during recovery.
2712+
*
27002713
* Note we assume we're chdir'd into PGDATA to begin with.
27012714
*/
27022715
void
@@ -2978,3 +2991,26 @@ fsync_parent_path(const char *fname, int elevel)
29782991

29792992
return 0;
29802993
}
2994+
2995+
/*
2996+
* Return the passed-in error level, or PANIC if data_sync_retry is off.
2997+
*
2998+
* Failure to fsync any data file is cause for immediate panic, unless
2999+
* data_sync_retry is enabled. Data may have been written to the operating
3000+
* system and removed from our buffer pool already, and if we are running on
3001+
* an operating system that forgets dirty data on write-back failure, there
3002+
* may be only one copy of the data remaining: in the WAL. A later attempt to
3003+
* fsync again might falsely report success. Therefore we must not allow any
3004+
* further checkpoints to be attempted. data_sync_retry can in theory be
3005+
* enabled on systems known not to drop dirty buffered data on write-back
3006+
* failure (with the likely outcome that checkpoints will continue to fail
3007+
* until the underlying problem is fixed).
3008+
*
3009+
* Any code that reports a failure from fsync() or related functions should
3010+
* filter the error level with this function.
3011+
*/
3012+
int
3013+
data_sync_elevel(int elevel)
3014+
{
3015+
return data_sync_retry ? elevel : PANIC;
3016+
}

src/backend/storage/smgr/md.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -963,7 +963,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
963963
while (v != NULL)
964964
{
965965
if (FileSync(v->mdfd_vfd) < 0)
966-
ereport(ERROR,
966+
ereport(data_sync_elevel(ERROR),
967967
(errcode_for_file_access(),
968968
errmsg("could not fsync file \"%s\": %m",
969969
FilePathName(v->mdfd_vfd))));
@@ -1206,7 +1206,7 @@ mdsync(void)
12061206
bms_join(new_requests, requests);
12071207

12081208
errno = save_errno;
1209-
ereport(ERROR,
1209+
ereport(data_sync_elevel(ERROR),
12101210
(errcode_for_file_access(),
12111211
errmsg("could not fsync file \"%s\": %m",
12121212
path)));
@@ -1380,7 +1380,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
13801380
(errmsg("could not forward fsync request because request queue is full")));
13811381

13821382
if (FileSync(seg->mdfd_vfd) < 0)
1383-
ereport(ERROR,
1383+
ereport(data_sync_elevel(ERROR),
13841384
(errcode_for_file_access(),
13851385
errmsg("could not fsync file \"%s\": %m",
13861386
FilePathName(seg->mdfd_vfd))));

src/backend/utils/cache/relmapper.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -796,7 +796,7 @@ write_relmap_file(bool shared, RelMapFile *newmap,
796796
* CheckPointRelationMap.
797797
*/
798798
if (pg_fsync(fd) != 0)
799-
ereport(ERROR,
799+
ereport(data_sync_elevel(ERROR),
800800
(errcode_for_file_access(),
801801
errmsg("could not fsync relation mapping file \"%s\": %m",
802802
mapfilename)));

src/backend/utils/misc/guc.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1512,6 +1512,15 @@ static struct config_bool ConfigureNamesBool[] =
15121512
NULL, NULL, NULL
15131513
},
15141514

1515+
{
1516+
{"data_sync_retry", PGC_POSTMASTER, ERROR_HANDLING_OPTIONS,
1517+
gettext_noop("Whether to continue running after a failure to sync data files."),
1518+
},
1519+
&data_sync_retry,
1520+
false,
1521+
NULL, NULL, NULL
1522+
},
1523+
15151524
/* End-of-list marker */
15161525
{
15171526
{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL

src/backend/utils/misc/postgresql.conf.sample

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -587,6 +587,7 @@
587587

588588
#exit_on_error = off # terminate session on any error?
589589
#restart_after_crash = on # reinitialize after backend crash?
590+
#data_sync_retry = off # retry or panic on failure to fsync data?
590591

591592

592593
#------------------------------------------------------------------------------

src/include/storage/fd.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ typedef int File;
5353

5454
/* GUC parameter */
5555
extern PGDLLIMPORT int max_files_per_process;
56+
extern PGDLLIMPORT bool data_sync_retry;
5657

5758
/*
5859
* This is private to fd.c, but exported for save/restore_backend_variables()
@@ -119,6 +120,7 @@ extern void fsync_fname(const char *fname, bool isdir);
119120
extern int durable_rename(const char *oldfile, const char *newfile, int loglevel);
120121
extern int durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel);
121122
extern void SyncDataDirectory(void);
123+
extern int data_sync_elevel(int elevel);
122124

123125
/* Filename components for OpenTemporaryFile */
124126
#define PG_TEMP_FILES_DIR "pgsql_tmp"

0 commit comments

Comments
 (0)