Skip to content

Commit 5c99513

Browse files
committed
Fix various checksum check problems for pg_verify_checksums and base backups
Three issues are fixed in this patch: - Base backups forgot to ignore files specific to EXEC_BACKEND, leading to spurious warnings when checksums are enabled, per analysis from me. - pg_verify_checksums forgot about files specific to EXEC_BACKEND, leading to failures of the tool on any such build, particularly Windows. This error was originally found by newly-introduced TAP tests in various buildfarm members using EXEC_BACKEND. - pg_verify_checksums forgot to account for temporary files and temporary paths, which could be valid relation files, without checksums, per report from Andres Freund. More tests are added to cover this case. A new test case which emulates corruption for a file in a different tablespace is added, coming from Michael Banck, while I have coded the main code and refactored the test code. Author: Michael Banck, Michael Paquier Reviewed-by: Stephen Frost, David Steele Discussion: https://postgr.es/m/20181021134206.GA14282@paquier.xyz
1 parent a1c91dd commit 5c99513

File tree

3 files changed

+121
-46
lines changed

3 files changed

+121
-46
lines changed

src/backend/replication/basebackup.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,12 +189,19 @@ static const char *excludeFiles[] =
189189

190190
/*
191191
* List of files excluded from checksum validation.
192+
*
193+
* Note: this list should be kept in sync with what pg_verify_checksums.c
194+
* includes.
192195
*/
193196
static const char *const noChecksumFiles[] = {
194197
"pg_control",
195198
"pg_filenode.map",
196199
"pg_internal.init",
197200
"PG_VERSION",
201+
#ifdef EXEC_BACKEND
202+
"config_exec_params",
203+
"config_exec_params.new",
204+
#endif
198205
NULL,
199206
};
200207

src/bin/pg_verify_checksums/pg_verify_checksums.c

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "storage/bufpage.h"
2121
#include "storage/checksum.h"
2222
#include "storage/checksum_impl.h"
23+
#include "storage/fd.h"
2324

2425

2526
static int64 files = 0;
@@ -49,11 +50,20 @@ usage(void)
4950
printf(_("Report bugs to <pgsql-bugs@postgresql.org>.\n"));
5051
}
5152

53+
/*
54+
* List of files excluded from checksum validation.
55+
*
56+
* Note: this list should be kept in sync with what basebackup.c includes.
57+
*/
5258
static const char *const skip[] = {
5359
"pg_control",
5460
"pg_filenode.map",
5561
"pg_internal.init",
5662
"PG_VERSION",
63+
#ifdef EXEC_BACKEND
64+
"config_exec_params",
65+
"config_exec_params.new",
66+
#endif
5767
NULL,
5868
};
5969

@@ -62,13 +72,10 @@ skipfile(const char *fn)
6272
{
6373
const char *const *f;
6474

65-
if (strcmp(fn, ".") == 0 ||
66-
strcmp(fn, "..") == 0)
67-
return true;
68-
6975
for (f = skip; *f; f++)
7076
if (strcmp(*f, fn) == 0)
7177
return true;
78+
7279
return false;
7380
}
7481

@@ -146,9 +153,22 @@ scan_directory(const char *basedir, const char *subdir)
146153
char fn[MAXPGPATH];
147154
struct stat st;
148155

149-
if (skipfile(de->d_name))
156+
if (strcmp(de->d_name, ".") == 0 ||
157+
strcmp(de->d_name, "..") == 0)
150158
continue;
151159

160+
/* Skip temporary files */
161+
if (strncmp(de->d_name,
162+
PG_TEMP_FILE_PREFIX,
163+
strlen(PG_TEMP_FILE_PREFIX)) == 0)
164+
continue;
165+
166+
/* Skip temporary folders */
167+
if (strncmp(de->d_name,
168+
PG_TEMP_FILES_DIR,
169+
strlen(PG_TEMP_FILES_DIR)) == 0)
170+
return;
171+
152172
snprintf(fn, sizeof(fn), "%s/%s", path, de->d_name);
153173
if (lstat(fn, &st) < 0)
154174
{
@@ -163,6 +183,9 @@ scan_directory(const char *basedir, const char *subdir)
163183
*segmentpath;
164184
BlockNumber segmentno = 0;
165185

186+
if (skipfile(de->d_name))
187+
continue;
188+
166189
/*
167190
* Cut off at the segment boundary (".") to get the segment number
168191
* in order to mix it into the checksum. Then also cut off at the

src/bin/pg_verify_checksums/t/002_actions.pl

Lines changed: 86 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,74 @@
55
use warnings;
66
use PostgresNode;
77
use TestLib;
8-
use Test::More tests => 36;
8+
use Test::More tests => 45;
9+
10+
11+
# Utility routine to create and check a table with corrupted checksums
12+
# on a wanted tablespace. Note that this stops and starts the node
13+
# multiple times to perform the checks, leaving the node started
14+
# at the end.
15+
sub check_relation_corruption
16+
{
17+
my $node = shift;
18+
my $table = shift;
19+
my $tablespace = shift;
20+
my $pgdata = $node->data_dir;
21+
22+
$node->safe_psql('postgres',
23+
"SELECT a INTO $table FROM generate_series(1,10000) AS a;
24+
ALTER TABLE $table SET (autovacuum_enabled=false);");
25+
26+
$node->safe_psql('postgres',
27+
"ALTER TABLE ".$table." SET TABLESPACE ".$tablespace.";");
28+
29+
my $file_corrupted = $node->safe_psql('postgres',
30+
"SELECT pg_relation_filepath('$table');");
31+
my $relfilenode_corrupted = $node->safe_psql('postgres',
32+
"SELECT relfilenode FROM pg_class WHERE relname = '$table';");
33+
34+
# Set page header and block size
35+
my $pageheader_size = 24;
36+
my $block_size = $node->safe_psql('postgres', 'SHOW block_size;');
37+
$node->stop;
38+
39+
# Checksums are correct for single relfilenode as the table is not
40+
# corrupted yet.
41+
command_ok(['pg_verify_checksums', '-D', $pgdata,
42+
'-r', $relfilenode_corrupted],
43+
"succeeds for single relfilenode on tablespace $tablespace with offline cluster");
44+
45+
# Time to create some corruption
46+
open my $file, '+<', "$pgdata/$file_corrupted";
47+
seek($file, $pageheader_size, 0);
48+
syswrite($file, '\0\0\0\0\0\0\0\0\0');
49+
close $file;
50+
51+
# Checksum checks on single relfilenode fail
52+
$node->command_checks_all([ 'pg_verify_checksums', '-D', $pgdata, '-r',
53+
$relfilenode_corrupted],
54+
1,
55+
[qr/Bad checksums:.*1/],
56+
[qr/checksum verification failed/],
57+
"fails with corrupted data for single relfilenode on tablespace $tablespace");
58+
59+
# Global checksum checks fail as well
60+
$node->command_checks_all([ 'pg_verify_checksums', '-D', $pgdata],
61+
1,
62+
[qr/Bad checksums:.*1/],
63+
[qr/checksum verification failed/],
64+
"fails with corrupted data on tablespace $tablespace");
65+
66+
# Drop corrupted table again and make sure there is no more corruption.
67+
$node->start;
68+
$node->safe_psql('postgres', "DROP TABLE $table;");
69+
$node->stop;
70+
$node->command_ok(['pg_verify_checksums', '-D', $pgdata],
71+
"succeeds again after table drop on tablespace $tablespace");
72+
73+
$node->start;
74+
return;
75+
}
976

1077
# Initialize node with checksums enabled.
1178
my $node = get_new_node('node_checksum');
@@ -27,6 +94,12 @@
2794
append_to_file "$pgdata/global/99999_fsm.123", "";
2895
append_to_file "$pgdata/global/99999_vm.123", "";
2996

97+
# These are temporary files and folders with dummy contents, which
98+
# should be ignored by the scan.
99+
append_to_file "$pgdata/global/pgsql_tmp_123", "foo";
100+
mkdir "$pgdata/global/pgsql_tmp";
101+
append_to_file "$pgdata/global/pgsql_tmp/1.1", "foo";
102+
30103
# Checksums pass on a newly-created cluster
31104
command_ok(['pg_verify_checksums', '-D', $pgdata],
32105
"succeeds with offline cluster");
@@ -36,47 +109,16 @@
36109
command_fails(['pg_verify_checksums', '-D', $pgdata],
37110
"fails with online cluster");
38111

39-
# Create table to corrupt and get its relfilenode
40-
$node->safe_psql('postgres',
41-
"SELECT a INTO corrupt1 FROM generate_series(1,10000) AS a;
42-
ALTER TABLE corrupt1 SET (autovacuum_enabled=false);");
43-
44-
my $file_corrupted = $node->safe_psql('postgres',
45-
"SELECT pg_relation_filepath('corrupt1')");
46-
my $relfilenode_corrupted = $node->safe_psql('postgres',
47-
"SELECT relfilenode FROM pg_class WHERE relname = 'corrupt1';");
48-
49-
# Set page header and block size
50-
my $pageheader_size = 24;
51-
my $block_size = $node->safe_psql('postgres', 'SHOW block_size;');
52-
$node->stop;
53-
54-
# Checksums are correct for single relfilenode as the table is not
55-
# corrupted yet.
56-
command_ok(['pg_verify_checksums', '-D', $pgdata,
57-
'-r', $relfilenode_corrupted],
58-
"succeeds for single relfilenode with offline cluster");
59-
60-
# Time to create some corruption
61-
open my $file, '+<', "$pgdata/$file_corrupted";
62-
seek($file, $pageheader_size, 0);
63-
syswrite($file, '\0\0\0\0\0\0\0\0\0');
64-
close $file;
112+
# Check corruption of table on default tablespace.
113+
check_relation_corruption($node, 'corrupt1', 'pg_default');
65114

66-
# Global checksum checks fail
67-
$node->command_checks_all([ 'pg_verify_checksums', '-D', $pgdata],
68-
1,
69-
[qr/Bad checksums:.*1/],
70-
[qr/checksum verification failed/],
71-
'fails with corrupted data');
72-
73-
# Checksum checks on single relfilenode fail
74-
$node->command_checks_all([ 'pg_verify_checksums', '-D', $pgdata, '-r',
75-
$relfilenode_corrupted],
76-
1,
77-
[qr/Bad checksums:.*1/],
78-
[qr/checksum verification failed/],
79-
'fails for corrupted data on single relfilenode');
115+
# Create tablespace to check corruptions in a non-default tablespace.
116+
my $basedir = $node->basedir;
117+
my $tablespace_dir = "$basedir/ts_corrupt_dir";
118+
mkdir ($tablespace_dir);
119+
$node->safe_psql('postgres',
120+
"CREATE TABLESPACE ts_corrupt LOCATION '$tablespace_dir';");
121+
check_relation_corruption($node, 'corrupt2', 'ts_corrupt');
80122

81123
# Utility routine to check that pg_verify_checksums is able to detect
82124
# correctly-named relation files filled with some corrupted data.
@@ -101,6 +143,9 @@ sub fail_corrupt
101143
return;
102144
}
103145

146+
# Stop instance for the follow-up checks.
147+
$node->stop;
148+
104149
# Authorized relation files filled with corrupted data cause the
105150
# checksum checks to fail. Make sure to use file names different
106151
# than the previous ones.

0 commit comments

Comments
 (0)