Skip to content

Commit 99aeb84

Browse files
committed
pg_combinebackup: Add -k, --link option.
This is similar to pg_upgrade's --link option, except that here we won't typically be able to use it for every input file: sometimes we will need to reconstruct a complete backup from blocks stored in different files. However, when a whole file does need to be copied, we can use an optimized copying strategy: see the existing --clone and --copy-file-range options and the code to use CopyFile() on Windows. This commit adds a new strategy: add a hard link to an existing file. Making a hard link doesn't actually copy anything, but it makes sense for the code to treat it as doing so. This is useful when the input directories are merely staging directories that will be removed once the restore is complete. In such cases, there is no need to actually copy the data, and making a bunch of new hard links can be very quick. However, it would be quite dangerous to use it if the input directories might later be reused for any other purpose, since starting postgres on the output directory would destructively modify the input directories. For that reason, using this new option causes pg_combinebackup to emit a warning about the danger involved. Author: Israel Barth Rubio <barthisrael@gmail.com> Co-authored-by: Robert Haas <robertmhaas@gmail.com> (cosmetic changes) Reviewed-by: Vignesh C <vignesh21@gmail.com> Discussion: http://postgr.es/m/CA+TgmoaEFsYHsMefNaNkU=2SnMRufKE3eVJxvAaX=OWgcnPmPg@mail.gmail.com
1 parent ed762e9 commit 99aeb84

File tree

6 files changed

+245
-3
lines changed

6 files changed

+245
-3
lines changed

doc/src/sgml/ref/pg_combinebackup.sgml

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,35 @@ PostgreSQL documentation
137137
</listitem>
138138
</varlistentry>
139139

140+
<varlistentry>
141+
<term><option>-k</option></term>
142+
<term><option>--link</option></term>
143+
<listitem>
144+
<para>
145+
Use hard links instead of copying files to the synthetic backup.
146+
Reconstruction of the synthetic backup might be faster (no file copying)
147+
and use less disk space, but care must be taken when using the output
148+
directory, because any modifications to that directory (for example,
149+
starting the server) can also affect the input directories. Likewise,
150+
changes to the input directories (for example, starting the server on
151+
the full backup) could affect the output directory. Thus, this option
152+
is best used when the input directories are only copies that will be
153+
removed after <application>pg_combinebackup</application> has completed.
154+
</para>
155+
156+
<para>
157+
Requires that the input backups and the output directory are in the
158+
same file system.
159+
</para>
160+
161+
<para>
162+
If a backup manifest is not available or does not contain checksums of
163+
the right type, hard links will still be created, but the file will
164+
also be read block-by-block for the checksum calculation.
165+
</para>
166+
</listitem>
167+
</varlistentry>
168+
140169
<varlistentry>
141170
<term><option>--clone</option></term>
142171
<listitem>
@@ -167,7 +196,8 @@ PostgreSQL documentation
167196
<listitem>
168197
<para>
169198
Perform regular file copy. This is the default. (See also
170-
<option>--copy-file-range</option> and <option>--clone</option>.)
199+
<option>--copy-file-range</option>, <option>--clone</option>, and
200+
<option>-k</option>/<option>--link</option>.)
171201
</para>
172202
</listitem>
173203
</varlistentry>

src/bin/pg_combinebackup/copy_file.c

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ static void copy_file_copyfile(const char *src, const char *dst,
4040
pg_checksum_context *checksum_ctx);
4141
#endif
4242

43+
static void copy_file_link(const char *src, const char *dest,
44+
pg_checksum_context *checksum_ctx);
45+
4346
/*
4447
* Copy a regular file, optionally computing a checksum, and emitting
4548
* appropriate debug messages. But if we're in dry-run mode, then just emit
@@ -69,7 +72,13 @@ copy_file(const char *src, const char *dst,
6972
}
7073

7174
#ifdef WIN32
72-
copy_method = COPY_METHOD_COPYFILE;
75+
/*
76+
* We have no specific switch to enable CopyFile on Windows, because
77+
* it's supported (as far as we know) on all Windows machines. So,
78+
* automatically enable it unless some other strategy was selected.
79+
*/
80+
if (copy_method == COPY_METHOD_COPY)
81+
copy_method = COPY_METHOD_COPYFILE;
7382
#endif
7483

7584
/* Determine the name of the copy strategy for use in log messages. */
@@ -93,6 +102,10 @@ copy_file(const char *src, const char *dst,
93102
strategy_implementation = copy_file_copyfile;
94103
break;
95104
#endif
105+
case COPY_METHOD_LINK:
106+
strategy_name = "link";
107+
strategy_implementation = copy_file_link;
108+
break;
96109
}
97110

98111
if (dry_run)
@@ -304,3 +317,21 @@ copy_file_copyfile(const char *src, const char *dst,
304317
checksum_file(src, checksum_ctx);
305318
}
306319
#endif /* WIN32 */
320+
321+
/*
322+
* copy_file_link
323+
* Hard-links a file from src to dest.
324+
*
325+
* If needed, also reads the file and calculates the checksum.
326+
*/
327+
static void
328+
copy_file_link(const char *src, const char *dest,
329+
pg_checksum_context *checksum_ctx)
330+
{
331+
if (link(src, dest) < 0)
332+
pg_fatal("error while linking file from \"%s\" to \"%s\": %m",
333+
src, dest);
334+
335+
/* if needed, calculate checksum of the file */
336+
checksum_file(src, checksum_ctx);
337+
}

src/bin/pg_combinebackup/copy_file.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ typedef enum CopyMethod
2525
#ifdef WIN32
2626
COPY_METHOD_COPYFILE,
2727
#endif
28+
COPY_METHOD_LINK,
2829
} CopyMethod;
2930

3031
extern void copy_file(const char *src, const char *dst,

src/bin/pg_combinebackup/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ tests += {
3737
't/007_wal_level_minimal.pl',
3838
't/008_promote.pl',
3939
't/009_no_full_file.pl',
40+
't/010_hardlink.pl',
4041
],
4142
}
4243
}

src/bin/pg_combinebackup/pg_combinebackup.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ main(int argc, char *argv[])
135135
{"no-sync", no_argument, NULL, 'N'},
136136
{"output", required_argument, NULL, 'o'},
137137
{"tablespace-mapping", required_argument, NULL, 'T'},
138+
{"link", no_argument, NULL, 'k'},
138139
{"manifest-checksums", required_argument, NULL, 1},
139140
{"no-manifest", no_argument, NULL, 2},
140141
{"sync-method", required_argument, NULL, 3},
@@ -172,7 +173,7 @@ main(int argc, char *argv[])
172173
opt.copy_method = COPY_METHOD_COPY;
173174

174175
/* process command-line options */
175-
while ((c = getopt_long(argc, argv, "dnNo:T:",
176+
while ((c = getopt_long(argc, argv, "dknNo:T:",
176177
long_options, &optindex)) != -1)
177178
{
178179
switch (c)
@@ -181,6 +182,9 @@ main(int argc, char *argv[])
181182
opt.debug = true;
182183
pg_logging_increase_verbosity();
183184
break;
185+
case 'k':
186+
opt.copy_method = COPY_METHOD_LINK;
187+
break;
184188
case 'n':
185189
opt.dry_run = true;
186190
break;
@@ -424,6 +428,11 @@ main(int argc, char *argv[])
424428
}
425429
}
426430

431+
/* Warn about the possibility of compromising the backups when in link mode */
432+
if (opt.copy_method == COPY_METHOD_LINK)
433+
pg_log_warning("--link mode was used; any modifications to the output "
434+
"directory may destructively modify input directories");
435+
427436
/* It's a success, so don't remove the output directories. */
428437
reset_directory_cleanup_list();
429438
exit(0);
@@ -761,6 +770,7 @@ help(const char *progname)
761770
printf(_(" %s [OPTION]... DIRECTORY...\n"), progname);
762771
printf(_("\nOptions:\n"));
763772
printf(_(" -d, --debug generate lots of debugging output\n"));
773+
printf(_(" -k, --link link files instead of copying\n"));
764774
printf(_(" -n, --dry-run do not actually do anything\n"));
765775
printf(_(" -N, --no-sync do not wait for changes to be written safely to disk\n"));
766776
printf(_(" -o, --output=DIRECTORY output directory\n"));
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
# Copyright (c) 2025, PostgreSQL Global Development Group
2+
#
3+
# This test aims to validate that hard links are created as expected in the
4+
# output directory, when running pg_combinebackup with --link mode.
5+
6+
use strict;
7+
use warnings FATAL => 'all';
8+
use PostgreSQL::Test::Cluster;
9+
use PostgreSQL::Test::Utils;
10+
use Test::More;
11+
12+
# Set up a new database instance.
my $node = PostgreSQL::Test::Cluster->new('primary');
$node->init(has_archiving => 1, allows_streaming => 1);
$node->append_conf('postgresql.conf', 'summarize_wal = on');
# Keep autovacuum off so that nothing other than this test modifies the
# test tables.
$node->append_conf('postgresql.conf', 'autovacuum = off');
$node->start;

# Create two tables, each holding 100 rows with a 1600-character value.
#
# Two tables are needed because Cirrus CI runs some tests with a very small
# segment size; there, a single table would end up with both segments linked
# twice and a segment linked once. With the usual 1GB segment size each table
# has just one segment. So: every segment of test_1 is expected to have two
# hard links, whereas for test_2 only the last (or sole) segment is expected
# to have a single hard link, any earlier ones having two.
my $create_table_sql = <<'EOM';
CREATE TABLE test_%s AS
SELECT x.id::bigint,
repeat('a', 1600) AS value
FROM generate_series(1, 100) AS x(id);
EOM

foreach my $suffix ('1', '2')
{
	$node->safe_psql('postgres', sprintf($create_table_sql, $suffix));
}

# Look up where each table's data file lives, relative to the data directory.
my $filepath_sql = <<'EOM';
SELECT pg_relation_filepath(oid)
FROM pg_class
WHERE relname = 'test_%s';
EOM

my %relpath;
foreach my $suffix ('1', '2')
{
	$relpath{$suffix} =
	  $node->safe_psql('postgres', sprintf($filepath_sql, $suffix));
	note "test_$suffix path is $relpath{$suffix}";
}

# Take a full backup.
my $full_backup_dir = $node->backup_dir . '/backup1';
my @full_backup_cmd = (
	'pg_basebackup',
	'--pgdata' => $full_backup_dir,
	'--no-sync',
	'--checkpoint' => 'fast',
	'--wal-method' => 'none');
$node->command_ok(\@full_backup_cmd, "full backup");

# Add one row to test_2 so that a page in the last segment of its data file
# changes between the full and the incremental backup.
$node->safe_psql('postgres', <<'EOM');
INSERT INTO test_2 (id, value) VALUES (101, repeat('a', 1600));
EOM

# Take an incremental backup on top of the full one.
my $incr_backup_dir = $node->backup_dir . '/backup2';
my @incr_backup_cmd = (
	'pg_basebackup',
	'--pgdata' => $incr_backup_dir,
	'--no-sync',
	'--checkpoint' => 'fast',
	'--wal-method' => 'none',
	'--incremental' => $full_backup_dir . '/backup_manifest');
$node->command_ok(\@incr_backup_cmd, "incremental backup");

# Combine both backups in --link mode and build a new node from the result.
my $restored = PostgreSQL::Test::Cluster->new('restore');
$restored->init_from_backup(
	$node, 'backup2',
	combine_with_prior => ['backup1'],
	combine_mode => '--link');

# Verify the hard link counts. test_1 was untouched between the two backups,
# so all of its data files should have 2 hard links. The last data file of
# test_2 had a page modified, so it should have only 1.
my %expected_last_nlinks = ('1' => 2, '2' => 1);
foreach my $suffix ('1', '2')
{
	my $full_path = join('/', $restored->data_dir, $relpath{$suffix});
	check_data_file($full_path, $expected_last_nlinks{$suffix});
}

# OK, that's all.
done_testing();
101+
102+
103+
# Check the hard link counts of every segment of one relation's data file.
#
# Starting from the first segment, further segments are discovered by probing
# for "<data_file>.1", "<data_file>.2", ... in the same directory until one is
# not a regular file. Every segment except the last must have exactly 2 hard
# links; the last one must have the number of hard links given by the caller.
#
# The counts are compared with is() rather than ok(): on a mismatch the
# actual link count is reported, and an undefined count shows up as a failed
# test instead of a fatal "uninitialized value" warning.
#
# Parameters:
# * data_file: path to the first segment of a data file, as per the output of
#   pg_relation_filepath.
# * last_segment_nlinks: the number of hard links expected in the last segment
#   of the given data file.
sub check_data_file
{
	my ($data_file, $last_segment_nlinks) = @_;

	# The first segment has no numeric suffix; later ones are numbered
	# consecutively from 1, so stop at the first gap.
	my @data_file_segments = ($data_file);
	my $segment_number = 1;
	while (-f "$data_file.$segment_number")
	{
		push @data_file_segments, "$data_file.$segment_number";
		$segment_number++;
	}

	# Split off the last segment, which has its own expected link count.
	my $last_segment = pop @data_file_segments;

	for my $segment (@data_file_segments)
	{
		is(get_hard_link_count($segment),
			2, "File '$segment' has 2 hard links");
	}

	is(get_hard_link_count($last_segment),
		$last_segment_nlinks,
		"File '$last_segment' has $last_segment_nlinks hard link(s)");
}
155+
156+
157+
# Return the number of hard links of a file.
#
# Parameters:
# * file: path to the file to inspect.
#
# Returns the link count (field 3 of stat's result list). Dies with the
# path and the system error if the file cannot be stat'ed, rather than
# silently returning undef to the caller.
sub get_hard_link_count
{
	my ($file) = @_;

	# stat returns an empty list on failure, which makes the list
	# assignment evaluate to 0 here.
	my @stats = stat($file)
	  or die "could not stat file \"$file\": $!";

	return $stats[3];
}

0 commit comments

Comments
 (0)