Skip to content

Commit 43b491a

Browse files
committed
Introduce durable_rename() and durable_link_or_rename().
Renaming a file using rename(2) is not guaranteed to be durable in the face of crashes, especially on filesystems like xfs and ext4 when mounted with data=writeback. To be certain that a rename() atomically replaces the previous file contents in the face of crashes and across different filesystems, one has to fsync the old filename, rename the file, fsync the new filename, and fsync the containing directory. This sequence is not generally adhered to currently, which exposes us to data loss risks. To avoid having to repeat this arduous sequence, introduce durable_rename(), which wraps all that. Also add durable_link_or_rename(). Several places use link() (with a fallback to rename()) to rename a file, trying to avoid replacing the target file out of paranoia. Some of those rename sequences need to be durable as well. There seems little reason to extend several copies of the same logic, so centralize the link() callers. This commit does not yet make use of the new functions; they're used in a followup commit. Author: Michael Paquier, Andres Freund Discussion: 56583BDD.9060302@2ndquadrant.com Backpatch: All supported branches
1 parent da93620 commit 43b491a

File tree

4 files changed

+214
-51
lines changed

4 files changed

+214
-51
lines changed

src/backend/replication/slot.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -1043,7 +1043,7 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
10431043
START_CRIT_SECTION();
10441044

10451045
fsync_fname(path, false);
1046-
fsync_fname((char *) dir, true);
1046+
fsync_fname(dir, true);
10471047
fsync_fname("pg_replslot", true);
10481048

10491049
END_CRIT_SECTION();

src/backend/storage/file/fd.c

+209-48
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,10 @@ static void walkdir(const char *path,
305305
#ifdef PG_FLUSH_DATA_WORKS
306306
static void pre_sync_fname(const char *fname, bool isdir, int elevel);
307307
#endif
308-
static void fsync_fname_ext(const char *fname, bool isdir, int elevel);
308+
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
309+
310+
static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
311+
static int fsync_parent_path(const char *fname, int elevel);
309312

310313

311314
/*
@@ -412,54 +415,158 @@ pg_flush_data(int fd, off_t offset, off_t amount)
412415
* indicate the OS just doesn't allow/require fsyncing directories.
413416
*/
414417
void
415-
fsync_fname(char *fname, bool isdir)
418+
fsync_fname(const char *fname, bool isdir)
419+
{
420+
fsync_fname_ext(fname, isdir, false, ERROR);
421+
}
422+
423+
/*
424+
* durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
425+
*
426+
* This routine ensures that, after returning, the effect of renaming file
427+
* persists in case of a crash. A crash while this routine is running will
428+
* leave you with either the pre-existing or the moved file in place of the
429+
* new file; no mixed state or truncated files are possible.
430+
*
431+
* It does so by using fsync on the old filename and the possibly existing
432+
* target filename before the rename, and the target file and directory after.
433+
*
434+
* Note that rename() cannot be used across arbitrary directories, as they
435+
* might not be on the same filesystem. Therefore this routine does not
436+
* support renaming across directories.
437+
*
438+
* Log errors with the caller specified severity.
439+
*
440+
* Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
441+
* valid upon return.
442+
*/
443+
int
444+
durable_rename(const char *oldfile, const char *newfile, int elevel)
416445
{
417446
int fd;
418-
int returncode;
419447

420448
/*
421-
* Some OSs require directories to be opened read-only whereas other
422-
* systems don't allow us to fsync files opened read-only; so we need both
423-
* cases here
449+
* First fsync the old and target path (if it exists), to ensure that they
450+
* are properly persistent on disk. Syncing the target file is not
451+
* strictly necessary, but it makes it easier to reason about crashes;
452+
* because it's then guaranteed that either source or target file exists
453+
* after a crash.
424454
*/
425-
if (!isdir)
426-
fd = OpenTransientFile(fname,
427-
O_RDWR | PG_BINARY,
428-
S_IRUSR | S_IWUSR);
455+
if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
456+
return -1;
457+
458+
fd = OpenTransientFile((char *) newfile, PG_BINARY | O_RDWR, 0);
459+
if (fd < 0)
460+
{
461+
if (errno != ENOENT)
462+
{
463+
ereport(elevel,
464+
(errcode_for_file_access(),
465+
errmsg("could not open file \"%s\": %m", newfile)));
466+
return -1;
467+
}
468+
}
429469
else
430-
fd = OpenTransientFile(fname,
431-
O_RDONLY | PG_BINARY,
432-
S_IRUSR | S_IWUSR);
470+
{
471+
if (pg_fsync(fd) != 0)
472+
{
473+
int save_errno;
474+
475+
/* close file upon error, might not be in transaction context */
476+
save_errno = errno;
477+
CloseTransientFile(fd);
478+
errno = save_errno;
479+
480+
ereport(elevel,
481+
(errcode_for_file_access(),
482+
errmsg("could not fsync file \"%s\": %m", newfile)));
483+
return -1;
484+
}
485+
CloseTransientFile(fd);
486+
}
487+
488+
/* Time to do the real deal... */
489+
if (rename(oldfile, newfile) < 0)
490+
{
491+
ereport(elevel,
492+
(errcode_for_file_access(),
493+
errmsg("could not rename file \"%s\" to \"%s\": %m",
494+
oldfile, newfile)));
495+
return -1;
496+
}
433497

434498
/*
435-
* Some OSs don't allow us to open directories at all (Windows returns
436-
* EACCES)
499+
* To guarantee renaming the file is persistent, fsync the file with its
500+
* new name, and its containing directory.
437501
*/
438-
if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
439-
return;
502+
if (fsync_fname_ext(newfile, false, false, elevel) != 0)
503+
return -1;
440504

441-
else if (fd < 0)
442-
ereport(ERROR,
443-
(errcode_for_file_access(),
444-
errmsg("could not open file \"%s\": %m", fname)));
505+
if (fsync_parent_path(newfile, elevel) != 0)
506+
return -1;
445507

446-
returncode = pg_fsync(fd);
508+
return 0;
509+
}
510+
511+
/*
512+
* durable_link_or_rename -- rename a file in a durable manner.
513+
*
514+
* Similar to durable_rename(), except that this routine tries (but does not
515+
* guarantee) not to overwrite the target file.
516+
*
517+
* Note that a crash in an unfortunate moment can leave you with two links to
518+
* the target file.
519+
*
520+
* Log errors with the caller specified severity.
521+
*
522+
* Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
523+
* valid upon return.
524+
*/
525+
int
526+
durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
527+
{
528+
/*
529+
* Ensure that, if we crash directly after the rename/link, a file with
530+
* valid contents is moved into place.
531+
*/
532+
if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
533+
return -1;
447534

448-
/* Some OSs don't allow us to fsync directories at all */
449-
if (returncode != 0 && isdir && errno == EBADF)
535+
#if HAVE_WORKING_LINK
536+
if (link(oldfile, newfile) < 0)
450537
{
451-
CloseTransientFile(fd);
452-
return;
538+
ereport(elevel,
539+
(errcode_for_file_access(),
540+
errmsg("could not link file \"%s\" to \"%s\": %m",
541+
oldfile, newfile)));
542+
return -1;
453543
}
454-
455-
if (returncode != 0)
456-
ereport(ERROR,
544+
unlink(oldfile);
545+
#else
546+
/* XXX: Add racy file existence check? */
547+
if (rename(oldfile, newfile) < 0)
548+
{
549+
ereport(elevel,
457550
(errcode_for_file_access(),
458-
errmsg("could not fsync file \"%s\": %m", fname)));
551+
errmsg("could not rename file \"%s\" to \"%s\": %m",
552+
oldfile, newfile)));
553+
return -1;
554+
}
555+
#endif
459556

460-
CloseTransientFile(fd);
461-
}
557+
/*
558+
* Make change persistent in case of an OS crash, both the new entry and
559+
* its parent directory need to be flushed.
560+
*/
561+
if (fsync_fname_ext(newfile, false, false, elevel) != 0)
562+
return -1;
563+
564+
/* Same for parent directory */
565+
if (fsync_parent_path(newfile, elevel) != 0)
566+
return -1;
462567

568+
return 0;
569+
}
463570

464571
/*
465572
* InitFileAccess --- initialize this module during backend startup
@@ -2546,10 +2653,10 @@ SyncDataDirectory(void)
25462653
* in pg_tblspc, they'll get fsync'd twice. That's not an expected case
25472654
* so we don't worry about optimizing it.
25482655
*/
2549-
walkdir(".", fsync_fname_ext, false, LOG);
2656+
walkdir(".", datadir_fsync_fname, false, LOG);
25502657
if (xlog_is_symlink)
2551-
walkdir("pg_xlog", fsync_fname_ext, false, LOG);
2552-
walkdir("pg_tblspc", fsync_fname_ext, true, LOG);
2658+
walkdir("pg_xlog", datadir_fsync_fname, false, LOG);
2659+
walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
25532660
}
25542661

25552662
/*
@@ -2663,15 +2770,26 @@ pre_sync_fname(const char *fname, bool isdir, int elevel)
26632770

26642771
#endif /* PG_FLUSH_DATA_WORKS */
26652772

2773+
static void
2774+
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
2775+
{
2776+
/*
2777+
* We want to silently ignore errors about unreadable files. Pass that
2778+
* desire on to fsync_fname_ext().
2779+
*/
2780+
fsync_fname_ext(fname, isdir, true, elevel);
2781+
}
2782+
26662783
/*
26672784
* fsync_fname_ext -- Try to fsync a file or directory
26682785
*
2669-
* Ignores errors trying to open unreadable files, or trying to fsync
2670-
* directories on systems where that isn't allowed/required, and logs other
2671-
* errors at a caller-specified level.
2786+
* If ignore_perm is true, ignore errors upon trying to open unreadable
2787+
* files. Logs other errors at a caller-specified level.
2788+
*
2789+
* Returns 0 if the operation succeeded, -1 otherwise.
26722790
*/
2673-
static void
2674-
fsync_fname_ext(const char *fname, bool isdir, int elevel)
2791+
static int
2792+
fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
26752793
{
26762794
int fd;
26772795
int flags;
@@ -2689,20 +2807,23 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
26892807
else
26902808
flags |= O_RDONLY;
26912809

2810+
fd = OpenTransientFile((char *) fname, flags, 0);
2811+
26922812
/*
2693-
* Open the file, silently ignoring errors about unreadable files (or
2694-
* unsupported operations, e.g. opening a directory under Windows), and
2695-
* logging others.
2813+
* Some OSs don't allow us to open directories at all (Windows returns
2814+
* EACCES), just ignore the error in that case. If desired, also silently
2815+
* ignore errors about unreadable files. Log others.
26962816
*/
2697-
fd = OpenTransientFile((char *) fname, flags, 0);
2698-
if (fd < 0)
2817+
if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
2818+
return 0;
2819+
else if (fd < 0 && ignore_perm && errno == EACCES)
2820+
return 0;
2821+
else if (fd < 0)
26992822
{
2700-
if (errno == EACCES || (isdir && errno == EISDIR))
2701-
return;
27022823
ereport(elevel,
27032824
(errcode_for_file_access(),
27042825
errmsg("could not open file \"%s\": %m", fname)));
2705-
return;
2826+
return -1;
27062827
}
27072828

27082829
returncode = pg_fsync(fd);
@@ -2712,9 +2833,49 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
27122833
* those errors. Anything else needs to be logged.
27132834
*/
27142835
if (returncode != 0 && !(isdir && errno == EBADF))
2836+
{
2837+
int save_errno;
2838+
2839+
/* close file upon error, might not be in transaction context */
2840+
save_errno = errno;
2841+
(void) CloseTransientFile(fd);
2842+
errno = save_errno;
2843+
27152844
ereport(elevel,
27162845
(errcode_for_file_access(),
27172846
errmsg("could not fsync file \"%s\": %m", fname)));
2847+
return -1;
2848+
}
27182849

27192850
(void) CloseTransientFile(fd);
2851+
2852+
return 0;
2853+
}
2854+
2855+
/*
2856+
* fsync_parent_path -- fsync the parent path of a file or directory
2857+
*
2858+
* This is aimed at making file operations persistent on disk in case of
2859+
* an OS crash or power failure.
2860+
*/
2861+
static int
2862+
fsync_parent_path(const char *fname, int elevel)
2863+
{
2864+
char parentpath[MAXPGPATH];
2865+
2866+
strlcpy(parentpath, fname, MAXPGPATH);
2867+
get_parent_directory(parentpath);
2868+
2869+
/*
2870+
* get_parent_directory() returns an empty string if the input argument is
2871+
* just a file name (see comments in path.c), so handle that as being the
2872+
* current directory.
2873+
*/
2874+
if (strlen(parentpath) == 0)
2875+
strlcpy(parentpath, ".", MAXPGPATH);
2876+
2877+
if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
2878+
return -1;
2879+
2880+
return 0;
27202881
}

src/backend/storage/file/reinit.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -385,7 +385,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
385385

386386
FreeDir(dbspace_dir);
387387

388-
fsync_fname((char *) dbspacedirname, true);
388+
fsync_fname(dbspacedirname, true);
389389
}
390390
}
391391

src/include/storage/fd.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,9 @@ extern int pg_fsync_no_writethrough(int fd);
113113
extern int pg_fsync_writethrough(int fd);
114114
extern int pg_fdatasync(int fd);
115115
extern int pg_flush_data(int fd, off_t offset, off_t amount);
116-
extern void fsync_fname(char *fname, bool isdir);
116+
extern void fsync_fname(const char *fname, bool isdir);
117+
extern int durable_rename(const char *oldfile, const char *newfile, int loglevel);
118+
extern int durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel);
117119
extern void SyncDataDirectory(void);
118120

119121
/* Filename components for OpenTemporaryFile */

0 commit comments

Comments
 (0)