60
60
/*
61
61
* The map file is critical data: we have no automatic method for recovering
62
62
* from loss or corruption of it. We use a CRC so that we can detect
63
- * corruption. To minimize the risk of failed updates, the map file should
64
- * be kept to no more than one standard-size disk sector (ie 512 bytes),
65
- * and we use overwrite-in-place rather than playing renaming games.
66
- * The struct layout below is designed to occupy exactly 512 bytes, which
67
- * might make filesystem updates a bit more efficient.
63
+ * corruption. Since the file might be more than one standard-size disk
64
+ * sector in size, we cannot rely on overwrite-in-place. Instead, we generate
65
+ * a new file and rename it into place, atomically replacing the original file.
68
66
*
69
67
* Entries in the mappings[] array are in no particular order. We could
70
68
* speed searching by insisting on OID order, but it really shouldn't be
71
69
* worth the trouble given the intended size of the mapping sets.
72
70
*/
73
71
#define RELMAPPER_FILENAME "pg_filenode.map"
72
+ #define RELMAPPER_TEMP_FILENAME "pg_filenode.map.tmp"
74
73
75
74
#define RELMAPPER_FILEMAGIC 0x592717 /* version ID value */
76
75
77
- #define MAX_MAPPINGS 62 /* 62 * 8 + 16 = 512 */
76
+ /*
77
+ * There's no need for this constant to have any particular value, and we
78
+ * can raise it as necessary if we end up with more mapped relations. For
79
+ * now, we just pick a round number that is modestly larger than the expected
80
+ * number of mappings.
81
+ */
82
+ #define MAX_MAPPINGS 64
78
83
79
84
typedef struct RelMapping
80
85
{
@@ -88,7 +93,6 @@ typedef struct RelMapFile
88
93
int32 num_mappings ; /* number of valid RelMapping entries */
89
94
RelMapping mappings [MAX_MAPPINGS ];
90
95
pg_crc32c crc ; /* CRC of all above */
91
- int32 pad ; /* to make the struct size be 512 exactly */
92
96
} RelMapFile ;
93
97
94
98
/*
@@ -877,6 +881,7 @@ write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
877
881
{
878
882
int fd ;
879
883
char mapfilename [MAXPGPATH ];
884
+ char maptempfilename [MAXPGPATH ];
880
885
881
886
/*
882
887
* Fill in the overhead fields and update CRC.
@@ -890,17 +895,47 @@ write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
890
895
FIN_CRC32C (newmap -> crc );
891
896
892
897
/*
893
- * Open the target file. We prefer to do this before entering the
894
- * critical section, so that an open() failure need not force PANIC.
898
+ * Construct filenames -- a temporary file that we'll create to write the
899
+ * data initially, and then the permanent name to which we will rename it.
895
900
*/
896
901
snprintf (mapfilename , sizeof (mapfilename ), "%s/%s" ,
897
902
dbpath , RELMAPPER_FILENAME );
898
- fd = OpenTransientFile (mapfilename , O_WRONLY | O_CREAT | PG_BINARY );
903
+ snprintf (maptempfilename , sizeof (maptempfilename ), "%s/%s" ,
904
+ dbpath , RELMAPPER_TEMP_FILENAME );
905
+
906
+ /*
907
+ * Open a temporary file. If a file already exists with this name, it must
908
+ * be left over from a previous crash, so we can overwrite it. Concurrent
909
+ * calls to this function are not allowed.
910
+ */
911
+ fd = OpenTransientFile (maptempfilename ,
912
+ O_WRONLY | O_CREAT | O_TRUNC | PG_BINARY );
899
913
if (fd < 0 )
900
914
ereport (ERROR ,
901
915
(errcode_for_file_access (),
902
916
errmsg ("could not open file \"%s\": %m" ,
903
- mapfilename )));
917
+ maptempfilename )));
918
+
919
+ /* Write new data to the file. */
920
+ pgstat_report_wait_start (WAIT_EVENT_RELATION_MAP_WRITE );
921
+ if (write (fd , newmap , sizeof (RelMapFile )) != sizeof (RelMapFile ))
922
+ {
923
+ /* if write didn't set errno, assume problem is no disk space */
924
+ if (errno == 0 )
925
+ errno = ENOSPC ;
926
+ ereport (ERROR ,
927
+ (errcode_for_file_access (),
928
+ errmsg ("could not write file \"%s\": %m" ,
929
+ maptempfilename )));
930
+ }
931
+ pgstat_report_wait_end ();
932
+
933
+ /* And close the file. */
934
+ if (CloseTransientFile (fd ) != 0 )
935
+ ereport (ERROR ,
936
+ (errcode_for_file_access (),
937
+ errmsg ("could not close file \"%s\": %m" ,
938
+ maptempfilename )));
904
939
905
940
if (write_wal )
906
941
{
@@ -924,40 +959,17 @@ write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
924
959
XLogFlush (lsn );
925
960
}
926
961
927
- errno = 0 ;
928
- pgstat_report_wait_start (WAIT_EVENT_RELATION_MAP_WRITE );
929
- if (write (fd , newmap , sizeof (RelMapFile )) != sizeof (RelMapFile ))
930
- {
931
- /* if write didn't set errno, assume problem is no disk space */
932
- if (errno == 0 )
933
- errno = ENOSPC ;
934
- ereport (ERROR ,
935
- (errcode_for_file_access (),
936
- errmsg ("could not write file \"%s\": %m" ,
937
- mapfilename )));
938
- }
939
- pgstat_report_wait_end ();
940
-
941
962
/*
942
- * We choose to fsync the data to disk before considering the task done.
943
- * It would be possible to relax this if it turns out to be a performance
944
- * issue, but it would complicate checkpointing --- see notes for
945
- * CheckPointRelationMap.
963
+ * durable_rename() does all the hard work of making sure that we rename
964
+ * the temporary file into place in a crash-safe manner.
965
+ *
966
+ * NB: Although we instruct durable_rename() to use ERROR, we will often
967
+ * be in a critical section at this point; if so, ERROR will become PANIC.
946
968
*/
947
- pgstat_report_wait_start (WAIT_EVENT_RELATION_MAP_SYNC );
948
- if (pg_fsync (fd ) != 0 )
949
- ereport (data_sync_elevel (ERROR ),
950
- (errcode_for_file_access (),
951
- errmsg ("could not fsync file \"%s\": %m" ,
952
- mapfilename )));
969
+ pgstat_report_wait_start (WAIT_EVENT_RELATION_MAP_REPLACE );
970
+ durable_rename (maptempfilename , mapfilename , ERROR );
953
971
pgstat_report_wait_end ();
954
972
955
- if (CloseTransientFile (fd ) != 0 )
956
- ereport (ERROR ,
957
- (errcode_for_file_access (),
958
- errmsg ("could not close file \"%s\": %m" ,
959
- mapfilename )));
960
-
961
973
/*
962
974
* Now that the file is safely on disk, send sinval message to let other
963
975
* backends know to re-read it. We must do this inside the critical
0 commit comments