Skip to content

Commit 61752af

Browse files
committed
Provide recovery_init_sync_method=syncfs.
Since commit 2ce439f we have opened every file in the data directory and called fsync() at the start of crash recovery. This can be very slow if there are many files, leading to field complaints of systems taking minutes or even hours to begin crash recovery. Provide an alternative method, for Linux only, where we call syncfs() on every possibly different filesystem under the data directory. This is equivalent, but avoids faulting in potentially many inodes from potentially slow storage. The new mode comes with some caveats, described in the documentation, so the default value for the new setting is "fsync", preserving the older behavior. Reported-by: Michael Brown <michael.brown@discourse.org> Reviewed-by: Fujii Masao <masao.fujii@oss.nttdata.com> Reviewed-by: Paul Guo <guopa@vmware.com> Reviewed-by: Bruce Momjian <bruce@momjian.us> Reviewed-by: Justin Pryzby <pryzby@telsasoft.com> Reviewed-by: David Steele <david@pgmasters.net> Discussion: https://postgr.es/m/11bc2bb7-ecb5-3ad0-b39f-df632734cd81%40discourse.org Discussion: https://postgr.es/m/CAEET0ZHGnbXmi8yF3ywsDZvb3m9CbdsGZgfTXscQ6agcbzcZAw%40mail.gmail.com
1 parent b822ae1 commit 61752af

File tree

9 files changed

+129
-2
lines changed

9 files changed

+129
-2
lines changed

configure

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15409,7 +15409,7 @@ fi
1540915409
LIBS_including_readline="$LIBS"
1541015410
LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
1541115411

15412-
for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink readv setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink sync_file_range uselocale wcstombs_l writev
15412+
for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink readv setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink syncfs sync_file_range uselocale wcstombs_l writev
1541315413
do :
1541415414
as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
1541515415
ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"

configure.ac

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1701,6 +1701,7 @@ AC_CHECK_FUNCS(m4_normalize([
17011701
strchrnul
17021702
strsignal
17031703
symlink
1704+
syncfs
17041705
sync_file_range
17051706
uselocale
17061707
wcstombs_l

doc/src/sgml/config.sgml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9721,6 +9721,41 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
97219721
</listitem>
97229722
</varlistentry>
97239723

9724+
<varlistentry id="guc-recovery-init-sync-method" xreflabel="recovery_init_sync_method">
9725+
<term><varname>recovery_init_sync_method</varname> (<type>enum</type>)
9726+
<indexterm>
9727+
<primary><varname>recovery_init_sync_method</varname> configuration parameter</primary>
9728+
</indexterm>
9729+
</term>
9730+
<listitem>
9731+
<para>
9732+
When set to <literal>fsync</literal>, which is the default,
9733+
<productname>PostgreSQL</productname> will recursively open and
9734+
synchronize all files in the data directory before crash recovery
9735+
begins. The search for files will follow symbolic links for the WAL
9736+
directory and each configured tablespace (but not any other symbolic
9737+
links). This is intended to make sure that all WAL and data files are
9738+
durably stored on disk before replaying changes. This applies whenever
9739+
starting a database cluster that did not shut down cleanly, including
9740+
copies created with <application>pg_basebackup</application>.
9741+
</para>
9742+
<para>
9743+
On Linux, <literal>syncfs</literal> may be used instead, to ask the
9744+
operating system to synchronize the whole file systems that contain the
9745+
data directory, the WAL files and each tablespace (but not any other
9746+
file systems that may be reachable through symbolic links). This may
9747+
be a lot faster than the <literal>fsync</literal> setting, because it
9748+
doesn't need to open each file one by one. On the other hand, it may
9749+
be slower if a file system is shared by other applications that
9750+
modify a lot of files, since those files will also be written to disk.
9751+
Furthermore, on versions of Linux before 5.8, I/O errors encountered
9752+
while writing data to disk may not be reported to
9753+
<productname>PostgreSQL</productname>, and relevant error messages may
9754+
appear only in kernel logs.
9755+
</para>
9756+
</listitem>
9757+
</varlistentry>
9758+
97249759
</variablelist>
97259760

97269761
</sect1>

src/backend/storage/file/fd.c

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,11 @@
7272

7373
#include "postgres.h"
7474

75+
#include <dirent.h>
7576
#include <sys/file.h>
7677
#include <sys/param.h>
7778
#include <sys/stat.h>
79+
#include <sys/types.h>
7880
#ifndef WIN32
7981
#include <sys/mman.h>
8082
#endif
@@ -158,6 +160,9 @@ int max_safe_fds = FD_MINFREE; /* default if not changed */
158160
/* Whether it is safe to continue running after fsync() fails. */
159161
bool data_sync_retry = false;
160162

163+
/* How SyncDataDirectory() should do its job. */
164+
int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
165+
161166
/* Debugging.... */
162167

163168
#ifdef FDDEBUG
@@ -3265,9 +3270,31 @@ looks_like_temp_rel_name(const char *name)
32653270
return true;
32663271
}
32673272

3273+
#ifdef HAVE_SYNCFS
3274+
static void
3275+
do_syncfs(const char *path)
3276+
{
3277+
int fd;
3278+
3279+
fd = OpenTransientFile(path, O_RDONLY);
3280+
if (fd < 0)
3281+
{
3282+
ereport(LOG,
3283+
(errcode_for_file_access(),
3284+
errmsg("could not open %s: %m", path)));
3285+
return;
3286+
}
3287+
if (syncfs(fd) < 0)
3288+
ereport(LOG,
3289+
(errcode_for_file_access(),
3290+
errmsg("could not sync filesystem for \"%s\": %m", path)));
3291+
CloseTransientFile(fd);
3292+
}
3293+
#endif
32683294

32693295
/*
3270-
* Issue fsync recursively on PGDATA and all its contents.
3296+
* Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3297+
* all potential filesystems, depending on the recovery_init_sync_method setting.
32713298
*
32723299
* We fsync regular files and directories wherever they are, but we
32733300
* follow symlinks only for pg_wal and immediately under pg_tblspc.
@@ -3319,6 +3346,42 @@ SyncDataDirectory(void)
33193346
xlog_is_symlink = true;
33203347
#endif
33213348

3349+
#ifdef HAVE_SYNCFS
3350+
if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS)
3351+
{
3352+
DIR *dir;
3353+
struct dirent *de;
3354+
3355+
/*
3356+
* On Linux, we don't have to open every single file one by one. We
3357+
* can use syncfs() to sync whole filesystems. We only expect
3358+
* filesystem boundaries to exist where we tolerate symlinks, namely
3359+
* pg_wal and the tablespaces, so we call syncfs() for each of those
3360+
* directories.
3361+
*/
3362+
3363+
/* Sync the top level pgdata directory. */
3364+
do_syncfs(".");
3365+
/* If any tablespaces are configured, sync each of those. */
3366+
dir = AllocateDir("pg_tblspc");
3367+
while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
3368+
{
3369+
char path[MAXPGPATH];
3370+
3371+
if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3372+
continue;
3373+
3374+
snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
3375+
do_syncfs(path);
3376+
}
3377+
FreeDir(dir);
3378+
/* If pg_wal is a symlink, process that too. */
3379+
if (xlog_is_symlink)
3380+
do_syncfs("pg_wal");
3381+
return;
3382+
}
3383+
#endif /* HAVE_SYNCFS */
3384+
33223385
/*
33233386
* If possible, hint to the kernel that we're soon going to fsync the data
33243387
* directory and its contents. Errors in this step are even less

src/backend/utils/misc/guc.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -488,6 +488,14 @@ const struct config_enum_entry ssl_protocol_versions_info[] = {
488488
StaticAssertDecl(lengthof(ssl_protocol_versions_info) == (PG_TLS1_3_VERSION + 2),
489489
"array length mismatch");
490490

491+
/*
 * Accepted values for the recovery_init_sync_method setting.  "syncfs" is
 * listed only in builds where HAVE_SYNCFS is defined; the {NULL, 0, false}
 * entry terminates the list.
 */
static struct config_enum_entry recovery_init_sync_method_options[] = {
492+
{"fsync", RECOVERY_INIT_SYNC_METHOD_FSYNC, false},
493+
#ifdef HAVE_SYNCFS
494+
{"syncfs", RECOVERY_INIT_SYNC_METHOD_SYNCFS, false},
495+
#endif
496+
{NULL, 0, false}
497+
};
498+
491499
static struct config_enum_entry shared_memory_options[] = {
492500
#ifndef WIN32
493501
{"sysv", SHMEM_TYPE_SYSV, false},
@@ -4871,6 +4879,15 @@ static struct config_enum ConfigureNamesEnum[] =
48714879
NULL, NULL, NULL
48724880
},
48734881

4882+
{
4883+
{"recovery_init_sync_method", PGC_POSTMASTER, ERROR_HANDLING_OPTIONS,
4884+
gettext_noop("Sets the method for synchronizing the data directory before crash recovery."),
4885+
},
4886+
&recovery_init_sync_method,
4887+
RECOVERY_INIT_SYNC_METHOD_FSYNC, recovery_init_sync_method_options,
4888+
NULL, NULL, NULL
4889+
},
4890+
48744891
/* End-of-list marker */
48754892
{
48764893
{NULL, 0, 0, NULL, NULL}, NULL, 0, NULL, NULL, NULL, NULL

src/backend/utils/misc/postgresql.conf.sample

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -761,6 +761,7 @@
761761
#restart_after_crash = on # reinitialize after backend crash?
762762
#remove_temp_files_after_crash = on # remove temporary files after
763763
# backend crash?
764+
#recovery_init_sync_method = fsync # fsync, syncfs (Linux 5.8+)
764765
#data_sync_retry = off # retry or panic on failure to fsync
765766
# data?
766767
# (change requires restart)

src/include/pg_config.h.in

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -590,6 +590,9 @@
590590
/* Define to 1 if you have the `symlink' function. */
591591
#undef HAVE_SYMLINK
592592

593+
/* Define to 1 if you have the `syncfs' function. */
594+
#undef HAVE_SYNCFS
595+
593596
/* Define to 1 if you have the `sync_file_range' function. */
594597
#undef HAVE_SYNC_FILE_RANGE
595598

src/include/storage/fd.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@
4545

4646
#include <dirent.h>
4747

48+
/*
 * Ways of synchronizing the data directory before crash recovery begins, as
 * selected by the recovery_init_sync_method setting:
 *
 * RECOVERY_INIT_SYNC_METHOD_FSYNC	- recursively open and fsync the files
 *									  in the data directory (the default).
 * RECOVERY_INIT_SYNC_METHOD_SYNCFS	- call syncfs() for the file systems
 *									  holding the data directory, WAL and
 *									  tablespaces.
 */
typedef enum RecoveryInitSyncMethod {
49+
RECOVERY_INIT_SYNC_METHOD_FSYNC,
50+
RECOVERY_INIT_SYNC_METHOD_SYNCFS
51+
} RecoveryInitSyncMethod;
52+
4853
struct iovec; /* avoid including port/pg_iovec.h here */
4954

5055
typedef int File;
@@ -53,6 +58,7 @@ typedef int File;
5358
/* GUC parameter */
5459
extern PGDLLIMPORT int max_files_per_process;
5560
extern PGDLLIMPORT bool data_sync_retry;
61+
extern int recovery_init_sync_method;
5662

5763
/*
5864
* This is private to fd.c, but exported for save/restore_backend_variables()

src/tools/msvc/Solution.pm

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,7 @@ sub GenerateFiles
388388
HAVE_STRUCT_TM_TM_ZONE => undef,
389389
HAVE_SYNC_FILE_RANGE => undef,
390390
HAVE_SYMLINK => 1,
391+
HAVE_SYNCFS => undef,
391392
HAVE_SYSLOG => undef,
392393
HAVE_SYS_EPOLL_H => undef,
393394
HAVE_SYS_EVENT_H => undef,

0 commit comments

Comments
 (0)