Skip to content

Commit e2f65f4

Browse files
committed
Fix old-fd issues using global barriers everywhere.
Commits 4eb2176 and b74e94d introduced a way to force every backend to close all relation files, to fix an ancient Windows-only bug. This commit extends that behavior to all operating systems and adds a couple of extra barrier points, to fix a totally different class of bug: the reuse of relfilenodes in scenarios that have no other kind of cache invalidation to prevent file descriptor mix-ups. In all releases, data corruption could occur when you moved a database to another tablespace and then back again. Despite that, no back-patch for now as the infrastructure required is too new and invasive. In master only, since commit aa01051, it could also happen when using CREATE DATABASE with a user-supplied OID or via pg_upgrade. Author: Andres Freund <andres@anarazel.de> Reviewed-by: Robert Haas <robertmhaas@gmail.com> Reviewed-by: Thomas Munro <thomas.munro@gmail.com> Discussion: https://postgr.es/m/20220209220004.kb3dgtn2x2k2gtdm%40alap3.anarazel.de
1 parent b74e94d commit e2f65f4

File tree

5 files changed

+241
-25
lines changed

5 files changed

+241
-25
lines changed

src/backend/commands/dbcommands.c

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1687,10 +1687,8 @@ dropdb(const char *dbname, bool missing_ok, bool force)
16871687
*/
16881688
RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
16891689

1690-
#if defined(USE_BARRIER_SMGRRELEASE)
16911690
/* Close all smgr fds in all backends. */
16921691
WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
1693-
#endif
16941692

16951693
/*
16961694
* Remove all tablespace subdirs belonging to the database.
@@ -1940,10 +1938,8 @@ movedb(const char *dbname, const char *tblspcname)
19401938
RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT
19411939
| CHECKPOINT_FLUSH_ALL);
19421940

1943-
#if defined(USE_BARRIER_SMGRRELEASE)
19441941
/* Close all smgr fds in all backends. */
19451942
WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
1946-
#endif
19471943

19481944
/*
19491945
* Now drop all buffers holding data of the target database; they should
@@ -3054,6 +3050,9 @@ dbase_redo(XLogReaderState *record)
30543050
*/
30553051
FlushDatabaseBuffers(xlrec->src_db_id);
30563052

3053+
/* Close all sgmr fds in all backends. */
3054+
WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
3055+
30573056
/*
30583057
* Copy this subdirectory to the new location
30593058
*
@@ -3111,10 +3110,8 @@ dbase_redo(XLogReaderState *record)
31113110
/* Clean out the xlog relcache too */
31123111
XLogDropDatabase(xlrec->db_id);
31133112

3114-
#if defined(USE_BARRIER_SMGRRELEASE)
31153113
/* Close all sgmr fds in all backends. */
31163114
WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
3117-
#endif
31183115

31193116
for (i = 0; i < xlrec->ntablespaces; i++)
31203117
{

src/backend/commands/tablespace.c

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -548,11 +548,10 @@ DropTableSpace(DropTableSpaceStmt *stmt)
548548
* use a global barrier to ask all backends to close all files, and
549549
* wait until they're finished.
550550
*/
551-
#if defined(USE_BARRIER_SMGRRELEASE)
552551
LWLockRelease(TablespaceCreateLock);
553552
WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
554553
LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);
555-
#endif
554+
556555
/* And now try again. */
557556
if (!destroy_tablespace_directories(tablespaceoid, false))
558557
{
@@ -1574,6 +1573,9 @@ tblspc_redo(XLogReaderState *record)
15741573
{
15751574
xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);
15761575

1576+
/* Close all smgr fds in all backends. */
1577+
WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
1578+
15771579
/*
15781580
* If we issued a WAL record for a drop tablespace it implies that
15791581
* there were no files in it at all when the DROP was done. That means
@@ -1591,11 +1593,6 @@ tblspc_redo(XLogReaderState *record)
15911593
*/
15921594
if (!destroy_tablespace_directories(xlrec->ts_id, true))
15931595
{
1594-
#if defined(USE_BARRIER_SMGRRELEASE)
1595-
/* Close all smgr fds in all backends. */
1596-
WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
1597-
#endif
1598-
15991596
ResolveRecoveryConflictWithTablespace(xlrec->ts_id);
16001597

16011598
/*

src/include/pg_config_manual.h

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -152,17 +152,6 @@
152152
#define EXEC_BACKEND
153153
#endif
154154

155-
/*
156-
* If USE_BARRIER_SMGRRELEASE is defined, certain code paths that unlink
157-
* directories will ask other backends to close all smgr file descriptors.
158-
* This is enabled on Windows, because otherwise unlinked but still open files
159-
* can prevent rmdir(containing_directory) from succeeding. On other
160-
* platforms, it can be defined to exercise those code paths.
161-
*/
162-
#if defined(WIN32)
163-
#define USE_BARRIER_SMGRRELEASE
164-
#endif
165-
166155
/*
167156
* Define this if your operating system supports link()
168157
*/

src/test/recovery/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
#
1010
#-------------------------------------------------------------------------
1111

12-
EXTRA_INSTALL=contrib/test_decoding
12+
EXTRA_INSTALL=contrib/test_decoding contrib/pg_prewarm
1313

1414
subdir = src/test/recovery
1515
top_builddir = ../../..
Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
use strict;
2+
use warnings;
3+
use PostgreSQL::Test::Cluster;
4+
use PostgreSQL::Test::Utils;
5+
use Test::More;
6+
use File::Basename;
7+
8+
9+
my $node_primary = PostgreSQL::Test::Cluster->new('primary');
10+
$node_primary->init(allows_streaming => 1);
11+
$node_primary->append_conf('postgresql.conf', q[
12+
allow_in_place_tablespaces = true
13+
log_connections=on
14+
# to avoid "repairing" corruption
15+
full_page_writes=off
16+
log_min_messages=debug2
17+
autovacuum_naptime=1s
18+
shared_buffers=1MB
19+
]);
20+
$node_primary->start;
21+
22+
23+
# Create streaming standby linking to primary
24+
my $backup_name = 'my_backup';
25+
$node_primary->backup($backup_name);
26+
my $node_standby = PostgreSQL::Test::Cluster->new('standby');
27+
$node_standby->init_from_backup($node_primary, $backup_name,
28+
has_streaming => 1);
29+
$node_standby->start;
30+
31+
# To avoid hanging while expecting some specific input from a psql
32+
# instance being driven by us, add a timeout high enough that it
33+
# should never trigger even on very slow machines, unless something
34+
# is really wrong.
35+
my $psql_timeout = IPC::Run::timer(300);
36+
37+
my %psql_primary = (stdin => '', stdout => '', stderr => '');
38+
$psql_primary{run} = IPC::Run::start(
39+
[ 'psql', '-XA', '-f', '-', '-d', $node_primary->connstr('postgres') ],
40+
'<',
41+
\$psql_primary{stdin},
42+
'>',
43+
\$psql_primary{stdout},
44+
'2>',
45+
\$psql_primary{stderr},
46+
$psql_timeout);
47+
48+
my %psql_standby = ('stdin' => '', 'stdout' => '', 'stderr' => '');
49+
$psql_standby{run} = IPC::Run::start(
50+
[ 'psql', '-XA', '-f', '-', '-d', $node_standby->connstr('postgres') ],
51+
'<',
52+
\$psql_standby{stdin},
53+
'>',
54+
\$psql_standby{stdout},
55+
'2>',
56+
\$psql_standby{stderr},
57+
$psql_timeout);
58+
59+
60+
# Create template database with a table that we'll update, to trigger dirty
61+
# rows. Using a template database + preexisting rows makes it a bit easier to
62+
# reproduce, because there's no cache invalidations generated.
63+
64+
$node_primary->safe_psql('postgres', "CREATE DATABASE conflict_db_template OID = 50000;");
65+
$node_primary->safe_psql('conflict_db_template', q[
66+
CREATE TABLE large(id serial primary key, dataa text, datab text);
67+
INSERT INTO large(dataa, datab) SELECT g.i::text, 1 FROM generate_series(1, 4000) g(i);]);
68+
$node_primary->safe_psql('postgres', "CREATE DATABASE conflict_db TEMPLATE conflict_db_template OID = 50001;");
69+
70+
$node_primary->safe_psql('postgres', q[
71+
CREATE EXTENSION pg_prewarm;
72+
CREATE TABLE replace_sb(data text);
73+
INSERT INTO replace_sb(data) SELECT random()::text FROM generate_series(1, 15000);]);
74+
75+
# Use longrunning transactions, so that AtEOXact_SMgr doesn't close files
76+
send_query_and_wait(
77+
\%psql_primary,
78+
q[BEGIN;],
79+
qr/BEGIN/m);
80+
send_query_and_wait(
81+
\%psql_standby,
82+
q[BEGIN;],
83+
qr/BEGIN/m);
84+
85+
# Cause lots of dirty rows in shared_buffers
86+
$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 1;");
87+
88+
# Now do a bunch of work in another database. That will end up needing to
89+
# write back dirty data from the previous step, opening the relevant file
90+
# descriptors
91+
cause_eviction(\%psql_primary, \%psql_standby);
92+
93+
# drop and recreate database
94+
$node_primary->safe_psql('postgres', "DROP DATABASE conflict_db;");
95+
$node_primary->safe_psql('postgres', "CREATE DATABASE conflict_db TEMPLATE conflict_db_template OID = 50001;");
96+
97+
verify($node_primary, $node_standby, 1,
98+
"initial contents as expected");
99+
100+
# Again cause lots of dirty rows in shared_buffers, but use a different update
101+
# value so we can check everything is OK
102+
$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 2;");
103+
104+
# Again cause a lot of IO. That'll again write back dirty data, but uses (XXX
105+
# adjust after bugfix) the already opened file descriptor.
106+
# FIXME
107+
cause_eviction(\%psql_primary, \%psql_standby);
108+
109+
verify($node_primary, $node_standby, 2,
110+
"update to reused relfilenode (due to DB oid conflict) is not lost");
111+
112+
113+
$node_primary->safe_psql('conflict_db', "VACUUM FULL large;");
114+
$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 3;");
115+
116+
verify($node_primary, $node_standby, 3,
117+
"restored contents as expected");
118+
119+
# Test for old filehandles after moving a database in / out of tablespace
120+
$node_primary->safe_psql('postgres', q[CREATE TABLESPACE test_tablespace LOCATION '']);
121+
122+
# cause dirty buffers
123+
$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 4;");
124+
# cause files to be opened in backend in other database
125+
cause_eviction(\%psql_primary, \%psql_standby);
126+
127+
# move database back / forth
128+
$node_primary->safe_psql('postgres', 'ALTER DATABASE conflict_db SET TABLESPACE test_tablespace');
129+
$node_primary->safe_psql('postgres', 'ALTER DATABASE conflict_db SET TABLESPACE pg_default');
130+
131+
# cause dirty buffers
132+
$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 5;");
133+
cause_eviction(\%psql_primary, \%psql_standby);
134+
135+
verify($node_primary, $node_standby, 5,
136+
"post move contents as expected");
137+
138+
$node_primary->safe_psql('postgres', 'ALTER DATABASE conflict_db SET TABLESPACE test_tablespace');
139+
140+
$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 7;");
141+
cause_eviction(\%psql_primary, \%psql_standby);
142+
$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 8;");
143+
$node_primary->safe_psql('postgres', 'DROP DATABASE conflict_db');
144+
$node_primary->safe_psql('postgres', 'DROP TABLESPACE test_tablespace');
145+
146+
$node_primary->safe_psql('postgres', 'REINDEX TABLE pg_database');
147+
148+
149+
# explicitly shut down psql instances gracefully - to avoid hangs
150+
# or worse on windows
151+
$psql_primary{stdin} .= "\\q\n";
152+
$psql_primary{run}->finish;
153+
$psql_standby{stdin} .= "\\q\n";
154+
$psql_standby{run}->finish;
155+
156+
$node_primary->stop();
157+
$node_standby->stop();
158+
159+
# Make sure that there weren't crashes during shutdown
160+
161+
command_like([ 'pg_controldata', $node_primary->data_dir ],
162+
qr/Database cluster state:\s+shut down\n/, 'primary shut down ok');
163+
command_like([ 'pg_controldata', $node_standby->data_dir ],
164+
qr/Database cluster state:\s+shut down in recovery\n/, 'standby shut down ok');
165+
done_testing();
166+
167+
sub verify
168+
{
169+
my ($primary, $standby, $counter, $message) = @_;
170+
171+
my $query = "SELECT datab, count(*) FROM large GROUP BY 1 ORDER BY 1 LIMIT 10";
172+
is($primary->safe_psql('conflict_db', $query),
173+
"$counter|4000",
174+
"primary: $message");
175+
176+
$primary->wait_for_catchup($standby);
177+
is($standby->safe_psql('conflict_db', $query),
178+
"$counter|4000",
179+
"standby: $message");
180+
}
181+
182+
sub cause_eviction
183+
{
184+
my ($psql_primary, $psql_standby) = @_;
185+
186+
send_query_and_wait(
187+
$psql_primary,
188+
q[SELECT SUM(pg_prewarm(oid)) warmed_buffers FROM pg_class WHERE pg_relation_filenode(oid) != 0;],
189+
qr/warmed_buffers/m);
190+
191+
send_query_and_wait(
192+
$psql_standby,
193+
q[SELECT SUM(pg_prewarm(oid)) warmed_buffers FROM pg_class WHERE pg_relation_filenode(oid) != 0;],
194+
qr/warmed_buffers/m);
195+
}
196+
197+
# Send query, wait until string matches
198+
sub send_query_and_wait
199+
{
200+
my ($psql, $query, $untl) = @_;
201+
my $ret;
202+
203+
# send query
204+
$$psql{stdin} .= $query;
205+
$$psql{stdin} .= "\n";
206+
207+
# wait for query results
208+
$$psql{run}->pump_nb();
209+
while (1)
210+
{
211+
last if $$psql{stdout} =~ /$untl/;
212+
213+
if ($psql_timeout->is_expired)
214+
{
215+
BAIL_OUT("aborting wait: program timed out\n"
216+
. "stream contents: >>$$psql{stdout}<<\n"
217+
. "pattern searched for: $untl\n");
218+
return 0;
219+
}
220+
if (not $$psql{run}->pumpable())
221+
{
222+
BAIL_OUT("aborting wait: program died\n"
223+
. "stream contents: >>$$psql{stdout}<<\n"
224+
. "pattern searched for: $untl\n");
225+
return 0;
226+
}
227+
$$psql{run}->pump();
228+
}
229+
230+
$$psql{stdout} = '';
231+
232+
return 1;
233+
}

0 commit comments

Comments
 (0)