Skip to content

Commit 5ada1fc

Browse files
committed
Accept that server might not be able to send error in crash recovery test.
As it turns out we can't rely that the script's monitoring session is terminated with a proper error by the server, because the session might be terminated while already trying to send data. Also improve robustness and error reporting facilities of the test, developed while debugging this issue. Discussion: https://postgr.es/m/20170920020038.kllxgilo7xzwmtto@alap3.anarazel.de
1 parent 2d484f9 commit 5ada1fc

File tree

1 file changed

+74
-24
lines changed

1 file changed

+74
-24
lines changed

src/test/recovery/t/013_crash_restart.pl

Lines changed: 74 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,14 @@
2424
}
2525
else
2626
{
27-
plan tests => 12;
27+
plan tests => 18;
2828
}
2929

3030
# To avoid hanging while expecting some specific input from a psql
3131
# instance being driven by us, add a timeout high enough that it
3232
# should never trigger in a normal run, but low enough to actually see
3333
# failures in a realistic amount of time.
34-
my $psql_timeout = 180;
34+
my $psql_timeout = IPC::Run::timer(10);
3535

3636
my $node = get_new_node('master');
3737
$node->init(allows_streaming => 1);
@@ -54,7 +54,7 @@
5454
\$killme_stdout,
5555
'2>',
5656
\$killme_stderr,
57-
IPC::Run::timeout($psql_timeout));
57+
$psql_timeout);
5858

5959
# Need a second psql to check if crash-restart happened.
6060
my ($monitor_stdin, $monitor_stdout, $monitor_stderr) = ('', '', '');
@@ -67,26 +67,30 @@
6767
\$monitor_stdout,
6868
'2>',
6969
\$monitor_stderr,
70-
IPC::Run::timeout($psql_timeout));
70+
$psql_timeout);
7171

7272
#create table, insert row that should survive
7373
$killme_stdin .= q[
7474
CREATE TABLE alive(status text);
7575
INSERT INTO alive VALUES($$committed-before-sigquit$$);
7676
SELECT pg_backend_pid();
7777
];
78-
$killme->pump until $killme_stdout =~ /[[:digit:]]+[\r\n]$/;
78+
ok(pump_until($killme, \$killme_stdout, qr/[[:digit:]]+[\r\n]$/m),
79+
'acquired pid for SIGQUIT');
7980
my $pid = $killme_stdout;
8081
chomp($pid);
8182
$killme_stdout = '';
83+
$killme_stderr = '';
8284

8385
#insert a row that should *not* survive, due to in-progress xact
8486
$killme_stdin .= q[
8587
BEGIN;
8688
INSERT INTO alive VALUES($$in-progress-before-sigquit$$) RETURNING status;
8789
];
88-
$killme->pump until $killme_stdout =~ /in-progress-before-sigquit/;
90+
ok(pump_until($killme, \$killme_stdout, qr/in-progress-before-sigquit/m),
91+
'inserted in-progress-before-sigquit');
8992
$killme_stdout = '';
93+
$killme_stderr = '';
9094

9195

9296
# Start longrunning query in second session, it's failure will signal
@@ -96,31 +100,38 @@
96100
SELECT $$psql-connected$$;
97101
SELECT pg_sleep(3600);
98102
];
99-
$monitor->pump until $monitor_stdout =~ /psql-connected/;
100-
103+
ok(pump_until($monitor, \$monitor_stdout, qr/psql-connected/m),
104+
'monitor connected');
105+
$monitor_stdout = '';
106+
$monitor_stderr = '';
101107

102108
# kill once with QUIT - we expect psql to exit, while emitting error message first
103109
my $cnt = kill 'QUIT', $pid;
104110

105111
# Exactly process should have been alive to be killed
106112
is($cnt, 1, "exactly one process killed with SIGQUIT");
107113

108-
# Check that psql sees the killed backend as having been terminated
114+
# Check that psql sees the killed backend as having been terminated
109115
$killme_stdin .= q[
110116
SELECT 1;
111117
];
112-
$killme->pump until $killme_stderr =~ /WARNING: terminating connection because of crash of another server process/;
113-
114-
ok(1, "psql query died successfully after SIGQUIT");
118+
ok(pump_until($killme, \$killme_stderr, qr/WARNING: terminating connection because of crash of another server process/m),
119+
"psql query died successfully after SIGQUIT");
120+
$killme_stderr = '';
121+
$killme_stdout = '';
115122
$killme->kill_kill;
116123

117-
# Check if the crash-restart cycle has occurred
118-
$monitor->pump until $monitor_stderr =~ /WARNING: terminating connection because of crash of another server process/;
124+
# Wait till server restarts - we should get the WARNING here, but
125+
# sometimes the server is unable to send that, if interrupted while
126+
# sending.
127+
ok(pump_until($monitor, \$monitor_stderr, qr/WARNING: terminating connection because of crash of another server process|server closed the connection unexpectedly/m),
128+
"psql monitor died successfully after SIGQUIT");
119129
$monitor->kill_kill;
120-
ok(1, "psql monitor died successfully after SIGQUIT");
121130

122131
# Wait till server restarts
123-
is($node->poll_query_until('postgres', 'SELECT $$restarted$$;', 'restarted'), "1", "reconnected after SIGQUIT");
132+
is($node->poll_query_until('postgres', 'SELECT $$restarted after sigquit$$;', 'restarted after sigquit'),
133+
"1", "reconnected after SIGQUIT");
134+
124135

125136
# restart psql processes, now that the crash cycle finished
126137
($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', '');
@@ -133,19 +144,24 @@
133144
$killme_stdin .= q[
134145
SELECT pg_backend_pid();
135146
];
136-
$killme->pump until $killme_stdout =~ /[[:digit:]]+[\r\n]$/;
147+
ok(pump_until($killme, \$killme_stdout, qr/[[:digit:]]+[\r\n]$/m),
148+
"acquired pid for SIGKILL");
137149
$pid = $killme_stdout;
138150
chomp($pid);
139151
$pid = $killme_stdout;
152+
$killme_stdout = '';
153+
$killme_stderr = '';
140154

141155
# Insert test rows
142156
$killme_stdin .= q[
143157
INSERT INTO alive VALUES($$committed-before-sigkill$$) RETURNING status;
144158
BEGIN;
145159
INSERT INTO alive VALUES($$in-progress-before-sigkill$$) RETURNING status;
146160
];
147-
$killme->pump until $killme_stdout =~ /in-progress-before-sigkill/;
161+
ok(pump_until($killme, \$killme_stdout, qr/in-progress-before-sigkill/m),
162+
'inserted in-progress-before-sigkill');
148163
$killme_stdout = '';
164+
$killme_stderr = '';
149165

150166
# Re-start longrunning query in second session, it's failure will
151167
# signal that crash-restart has occurred. The initial wait for the
@@ -155,8 +171,10 @@
155171
SELECT $$psql-connected$$;
156172
SELECT pg_sleep(3600);
157173
];
158-
$monitor->pump until $monitor_stdout =~ /psql-connected/;
174+
ok(pump_until($monitor, \$monitor_stdout, qr/psql-connected/m),
175+
'monitor connected');
159176
$monitor_stdout = '';
177+
$monitor_stderr = '';
160178

161179

162180
# kill with SIGKILL this time - we expect the backend to exit, without
@@ -169,13 +187,15 @@
169187
$killme_stdin .= q[
170188
SELECT 1;
171189
];
172-
$killme->pump until $killme_stderr =~ /server closed the connection unexpectedly/;
190+
ok(pump_until($killme, \$killme_stderr, qr/server closed the connection unexpectedly/m),
191+
"psql query died successfully after SIGKILL");
173192
$killme->kill_kill;
174-
ok(1, "psql query died successfully after SIGKILL");
175193

176-
# Wait till server restarts (note that we should get the WARNING here)
177-
$monitor->pump until $monitor_stderr =~ /WARNING: terminating connection because of crash of another server process/;
178-
ok(1, "psql monitor died successfully after SIGKILL");
194+
# Wait till server restarts - we should get the WARNING here, but
195+
# sometimes the server is unable to send that, if interrupted while
196+
# sending.
197+
ok(pump_until($monitor, \$monitor_stderr, qr/WARNING: terminating connection because of crash of another server process|server closed the connection unexpectedly/m),
198+
"psql monitor died successfully after SIGKILL");
179199
$monitor->kill_kill;
180200

181201
# Wait till server restarts
@@ -198,3 +218,33 @@
198218
'after-orderly-restart', 'can still write after orderly restart');
199219

200220
$node->stop();
221+
222+
# Pump until string is matched, or timeout occurs
223+
sub pump_until
224+
{
225+
my ($proc, $stream, $untl) = @_;
226+
$proc->pump_nb();
227+
while (1)
228+
{
229+
if ($psql_timeout->is_expired)
230+
{
231+
diag("aborting wait: program timed out");
232+
diag("stream contents: >>", $$stream,"<<");
233+
diag("pattern searched for: ", $untl);
234+
235+
return 0;
236+
}
237+
if (not $proc->pumpable())
238+
{
239+
diag("aborting wait: program died");
240+
diag("stream contents: >>", $$stream,"<<");
241+
diag("pattern searched for: ", $untl);
242+
243+
return 0;
244+
}
245+
$proc->pump();
246+
last if $$stream =~ /$untl/;
247+
}
248+
return 1;
249+
250+
};

0 commit comments

Comments
 (0)