@@ -305,7 +305,10 @@ static void walkdir(const char *path,
305
305
#ifdef PG_FLUSH_DATA_WORKS
306
306
static void pre_sync_fname (const char * fname , bool isdir , int elevel );
307
307
#endif
308
- static void fsync_fname_ext (const char * fname , bool isdir , int elevel );
308
+ static void datadir_fsync_fname (const char * fname , bool isdir , int elevel );
309
+
310
+ static int fsync_fname_ext (const char * fname , bool isdir , bool ignore_perm , int elevel );
311
+ static int fsync_parent_path (const char * fname , int elevel );
309
312
310
313
311
314
/*
@@ -412,54 +415,158 @@ pg_flush_data(int fd, off_t offset, off_t amount)
412
415
* indicate the OS just doesn't allow/require fsyncing directories.
413
416
*/
414
417
void
415
- fsync_fname (char * fname , bool isdir )
418
+ fsync_fname (const char * fname , bool isdir )
419
+ {
420
+ fsync_fname_ext (fname , isdir , false, ERROR );
421
+ }
422
+
423
+ /*
424
+ * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
425
+ *
426
+ * This routine ensures that, after returning, the effect of renaming file
427
+ * persists in case of a crash. A crash while this routine is running will
428
+ * leave you with either the pre-existing or the moved file in place of the
429
+ * new file; no mixed state or truncated files are possible.
430
+ *
431
+ * It does so by using fsync on the old filename and the possibly existing
432
+ * target filename before the rename, and the target file and directory after.
433
+ *
434
+ * Note that rename() cannot be used across arbitrary directories, as they
435
+ * might not be on the same filesystem. Therefore this routine does not
436
+ * support renaming across directories.
437
+ *
438
+ * Log errors with the caller specified severity.
439
+ *
440
+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
441
+ * valid upon return.
442
+ */
443
+ int
444
+ durable_rename (const char * oldfile , const char * newfile , int elevel )
416
445
{
417
446
int fd ;
418
- int returncode ;
419
447
420
448
/*
421
- * Some OSs require directories to be opened read-only whereas other
422
- * systems don't allow us to fsync files opened read-only; so we need both
423
- * cases here
449
+ * First fsync the old and target path (if it exists), to ensure that they
450
+ * are properly persistent on disk. Syncing the target file is not
451
+ * strictly necessary, but it makes it easier to reason about crashes;
452
+ * because it's then guaranteed that either source or target file exists
453
+ * after a crash.
424
454
*/
425
- if (!isdir )
426
- fd = OpenTransientFile (fname ,
427
- O_RDWR | PG_BINARY ,
428
- S_IRUSR | S_IWUSR );
455
+ if (fsync_fname_ext (oldfile , false, false, elevel ) != 0 )
456
+ return -1 ;
457
+
458
+ fd = OpenTransientFile ((char * ) newfile , PG_BINARY | O_RDWR , 0 );
459
+ if (fd < 0 )
460
+ {
461
+ if (errno != ENOENT )
462
+ {
463
+ ereport (elevel ,
464
+ (errcode_for_file_access (),
465
+ errmsg ("could not open file \"%s\": %m" , newfile )));
466
+ return -1 ;
467
+ }
468
+ }
429
469
else
430
- fd = OpenTransientFile (fname ,
431
- O_RDONLY | PG_BINARY ,
432
- S_IRUSR | S_IWUSR );
470
+ {
471
+ if (pg_fsync (fd ) != 0 )
472
+ {
473
+ int save_errno ;
474
+
475
+ /* close file upon error, might not be in transaction context */
476
+ save_errno = errno ;
477
+ CloseTransientFile (fd );
478
+ errno = save_errno ;
479
+
480
+ ereport (elevel ,
481
+ (errcode_for_file_access (),
482
+ errmsg ("could not fsync file \"%s\": %m" , newfile )));
483
+ return -1 ;
484
+ }
485
+ CloseTransientFile (fd );
486
+ }
487
+
488
+ /* Time to do the real deal... */
489
+ if (rename (oldfile , newfile ) < 0 )
490
+ {
491
+ ereport (elevel ,
492
+ (errcode_for_file_access (),
493
+ errmsg ("could not rename file \"%s\" to \"%s\": %m" ,
494
+ oldfile , newfile )));
495
+ return -1 ;
496
+ }
433
497
434
498
/*
435
- * Some OSs don't allow us to open directories at all (Windows returns
436
- * EACCES)
499
+ * To guarantee renaming the file is persistent, fsync the file with its
500
+ * new name, and its containing directory.
437
501
*/
438
- if (fd < 0 && isdir && ( errno == EISDIR || errno == EACCES ) )
439
- return ;
502
+ if (fsync_fname_ext ( newfile , false, false, elevel ) != 0 )
503
+ return -1 ;
440
504
441
- else if (fd < 0 )
442
- ereport (ERROR ,
443
- (errcode_for_file_access (),
444
- errmsg ("could not open file \"%s\": %m" , fname )));
505
+ if (fsync_parent_path (newfile , elevel ) != 0 )
506
+ return -1 ;
445
507
446
- returncode = pg_fsync (fd );
508
+ return 0 ;
509
+ }
510
+
511
/*
 * durable_link_or_rename -- rename a file in a durable manner.
 *
 * Works like durable_rename(), but makes an effort (without any guarantee)
 * to avoid overwriting an already existing target file.
 *
 * When HAVE_WORKING_LINK is set, the file is moved by creating a new link
 * under the target name and then removing the old name; otherwise a plain
 * rename() is used.
 *
 * Note that a crash in an unfortunate moment can leave you with two links to
 * the target file.
 *
 * oldfile: current path of the file
 * newfile: path the file should be reachable under afterwards
 * elevel:  severity with which failures are logged
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Flush the source file first, so that a crash directly after the
	 * rename/link still finds a file with valid contents moved into place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#if HAVE_WORKING_LINK
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}

	/* Drop the old name; the outcome of this step is not examined. */
	(void) unlink(oldfile);
#else
	/* XXX: Add racy file existence check? */
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * Make the change persistent in case of an OS crash: flush the new entry
	 * first and, only if that worked, its parent directory as well.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0 ||
		fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
463
570
464
571
/*
465
572
* InitFileAccess --- initialize this module during backend startup
@@ -2546,10 +2653,10 @@ SyncDataDirectory(void)
2546
2653
* in pg_tblspc, they'll get fsync'd twice. That's not an expected case
2547
2654
* so we don't worry about optimizing it.
2548
2655
*/
2549
- walkdir ("." , fsync_fname_ext , false, LOG );
2656
+ walkdir ("." , datadir_fsync_fname , false, LOG );
2550
2657
if (xlog_is_symlink )
2551
- walkdir ("pg_xlog" , fsync_fname_ext , false, LOG );
2552
- walkdir ("pg_tblspc" , fsync_fname_ext , true, LOG );
2658
+ walkdir ("pg_xlog" , datadir_fsync_fname , false, LOG );
2659
+ walkdir ("pg_tblspc" , datadir_fsync_fname , true, LOG );
2553
2660
}
2554
2661
2555
2662
/*
@@ -2663,15 +2770,26 @@ pre_sync_fname(const char *fname, bool isdir, int elevel)
2663
2770
2664
2771
#endif /* PG_FLUSH_DATA_WORKS */
2665
2772
2773
/*
 * datadir_fsync_fname -- fsync one file or directory, tolerating unreadable
 * entries
 *
 * Has the callback signature handed to walkdir() by SyncDataDirectory().
 * Problems are logged at the given elevel by fsync_fname_ext(); nothing is
 * reported back to the caller.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * We want to silently ignore errors about unreadable files, so pass that
	 * wish on to fsync_fname_ext() via ignore_perm = true.  Its status result
	 * is intentionally discarded.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
2782
+
2666
2783
/*
2667
2784
* fsync_fname_ext -- Try to fsync a file or directory
2668
2785
*
2669
- * Ignores errors trying to open unreadable files, or trying to fsync
2670
- * directories on systems where that isn't allowed/required, and logs other
2671
- * errors at a caller-specified level.
2786
+ * If ignore_perm is true, ignore errors upon trying to open unreadable
2787
+ * files. Logs other errors at a caller-specified level.
2788
+ *
2789
+ * Returns 0 if the operation succeeded, -1 otherwise.
2672
2790
*/
2673
- static void
2674
- fsync_fname_ext (const char * fname , bool isdir , int elevel )
2791
+ static int
2792
+ fsync_fname_ext (const char * fname , bool isdir , bool ignore_perm , int elevel )
2675
2793
{
2676
2794
int fd ;
2677
2795
int flags ;
@@ -2689,20 +2807,23 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
2689
2807
else
2690
2808
flags |= O_RDONLY ;
2691
2809
2810
+ fd = OpenTransientFile ((char * ) fname , flags , 0 );
2811
+
2692
2812
/*
2693
- * Open the file, silently ignoring errors about unreadable files (or
2694
- * unsupported operations, e.g. opening a directory under Windows), and
2695
- * logging others.
2813
+ * Some OSs don't allow us to open directories at all (Windows returns
2814
+ * EACCES), just ignore the error in that case. If desired, also silently
2815
+ * ignore errors about unreadable files. Log others.
2696
2816
*/
2697
- fd = OpenTransientFile ((char * ) fname , flags , 0 );
2698
- if (fd < 0 )
2817
+ if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES ))
2818
+ return 0 ;
2819
+ else if (fd < 0 && ignore_perm && errno == EACCES )
2820
+ return 0 ;
2821
+ else if (fd < 0 )
2699
2822
{
2700
- if (errno == EACCES || (isdir && errno == EISDIR ))
2701
- return ;
2702
2823
ereport (elevel ,
2703
2824
(errcode_for_file_access (),
2704
2825
errmsg ("could not open file \"%s\": %m" , fname )));
2705
- return ;
2826
+ return -1 ;
2706
2827
}
2707
2828
2708
2829
returncode = pg_fsync (fd );
@@ -2712,9 +2833,49 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
2712
2833
* those errors. Anything else needs to be logged.
2713
2834
*/
2714
2835
if (returncode != 0 && !(isdir && errno == EBADF ))
2836
+ {
2837
+ int save_errno ;
2838
+
2839
+ /* close file upon error, might not be in transaction context */
2840
+ save_errno = errno ;
2841
+ (void ) CloseTransientFile (fd );
2842
+ errno = save_errno ;
2843
+
2715
2844
ereport (elevel ,
2716
2845
(errcode_for_file_access (),
2717
2846
errmsg ("could not fsync file \"%s\": %m" , fname )));
2847
+ return -1 ;
2848
+ }
2718
2849
2719
2850
(void ) CloseTransientFile (fd );
2851
+
2852
+ return 0 ;
2853
+ }
2854
+
2855
+ /*
2856
+ * fsync_parent_path -- fsync the parent path of a file or directory
2857
+ *
2858
+ * This is aimed at making file operations persistent on disk in case of
2859
+ * an OS crash or power failure.
2860
+ */
2861
+ static int
2862
+ fsync_parent_path (const char * fname , int elevel )
2863
+ {
2864
+ char parentpath [MAXPGPATH ];
2865
+
2866
+ strlcpy (parentpath , fname , MAXPGPATH );
2867
+ get_parent_directory (parentpath );
2868
+
2869
+ /*
2870
+ * get_parent_directory() returns an empty string if the input argument is
2871
+ * just a file name (see comments in path.c), so handle that as being the
2872
+ * current directory.
2873
+ */
2874
+ if (strlen (parentpath ) == 0 )
2875
+ strlcpy (parentpath , "." , MAXPGPATH );
2876
+
2877
+ if (fsync_fname_ext (parentpath , true, false, elevel ) != 0 )
2878
+ return -1 ;
2879
+
2880
+ return 0 ;
2720
2881
}
0 commit comments