Commit 16acf7f

pgstat: add tests for handling of restarts, including crashes.
Test that stats are restored during normal restarts, discarded after a
crash / immediate restart, and that a corrupted stats file leads to
stats being reset.

Author: Melanie Plageman <melanieplageman@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/20220303021600.hs34ghqcw6zcokdh@alap3.anarazel.de
1 parent 99392cd commit 16acf7f
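
As a standalone illustration, here is a minimal sketch of the flush-then-check pattern the test below relies on. The node name 'sketch', the table some_table, and the use of the postgres database are illustrative assumptions; pg_stat_force_next_flush() and pg_stat_have_stats() are the functions the committed test itself calls.

use strict;
use warnings;
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;

# Hypothetical names: a throwaway node called 'sketch' and a table "some_table".
my $node = PostgreSQL::Test::Cluster->new('sketch');
$node->init;
$node->start;

my $db = 'postgres';
$node->safe_psql($db, 'CREATE TABLE some_table AS SELECT 1 AS a');

# Touch the table, then force pending backend-local stats to be flushed so
# they become visible immediately instead of at the next periodic report.
$node->safe_psql(
    $db, q[
    SELECT count(*) FROM some_table;
    SELECT pg_stat_force_next_flush();]);

# pg_stat_have_stats(kind, dboid, objoid) reports whether a stats entry exists.
my $dboid = $node->safe_psql($db,
    "SELECT oid FROM pg_database WHERE datname = current_database()");
my $tableoid = $node->safe_psql($db, "SELECT 'some_table'::regclass::oid");
is( $node->safe_psql(
        $db, "SELECT pg_stat_have_stats('relation', $dboid, $tableoid)"),
    't',
    'relation stats exist after forced flush');

$node->stop;
done_testing();

Forcing the flush avoids racing against the normal, periodic reporting of backend-local counters, which is why the test calls it before every existence check.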

1 file changed: 307 additions, 0 deletions

@@ -0,0 +1,307 @@
# Copyright (c) 2021-2022, PostgreSQL Global Development Group

# Tests statistics handling around restarts, including handling of crashes and
# invalid stats files, as well as restoring stats after "normal" restarts.

use strict;
use warnings;
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;
use File::Copy;

my $node = PostgreSQL::Test::Cluster->new('primary');
$node->init(allows_streaming => 1);
$node->append_conf('postgresql.conf', "track_functions = 'all'");
$node->start;

my $connect_db = 'postgres';
my $db_under_test = 'test';

# create test objects
$node->safe_psql($connect_db, "CREATE DATABASE $db_under_test");
$node->safe_psql($db_under_test,
    "CREATE TABLE tab_stats_crash_discard_test1 AS SELECT generate_series(1,100) AS a"
);
$node->safe_psql($db_under_test,
    "CREATE FUNCTION func_stats_crash_discard1() RETURNS VOID AS 'select 2;' LANGUAGE SQL IMMUTABLE"
);

# collect object oids
my $dboid = $node->safe_psql($db_under_test,
    "SELECT oid FROM pg_database WHERE datname = '$db_under_test'");
my $funcoid = $node->safe_psql($db_under_test,
    "SELECT 'func_stats_crash_discard1()'::regprocedure::oid");
my $tableoid = $node->safe_psql($db_under_test,
    "SELECT 'tab_stats_crash_discard_test1'::regclass::oid");

# generate stats and flush them
trigger_funcrel_stat();

# verify stats objects exist
my $sect = "initial";
is(have_stats('database', $dboid, 0), 't', "$sect: db stats do exist");
is(have_stats('function', $dboid, $funcoid),
    't', "$sect: function stats do exist");
is(have_stats('relation', $dboid, $tableoid),
    't', "$sect: relation stats do exist");

# regular shutdown
$node->stop();

# backup stats files
my $statsfile = $PostgreSQL::Test::Utils::tmp_check . '/' . "discard_stats1";
ok(!-f "$statsfile", "backup statsfile cannot already exist");

my $datadir = $node->data_dir();
my $og_stats = "$datadir/pg_stat/pgstat.stat";
ok(-f "$og_stats", "origin stats file must exist");
copy($og_stats, $statsfile) or die "Copy failed: $!";


## test discarding of stats file after crash etc

$node->start;

$sect = "copy";
is(have_stats('database', $dboid, 0), 't', "$sect: db stats do exist");
is(have_stats('function', $dboid, $funcoid),
    't', "$sect: function stats do exist");
is(have_stats('relation', $dboid, $tableoid),
    't', "$sect: relation stats do exist");

$node->stop('immediate');

ok(!-f "$og_stats", "no stats file should exist after immediate shutdown");

# copy the old stats back to test we discard stats after crash restart
copy($statsfile, $og_stats) or die "Copy failed: $!";

$node->start;

# stats should have been discarded
$sect = "post immediate";
is(have_stats('database', $dboid, 0), 'f', "$sect: db stats do not exist");
is(have_stats('function', $dboid, $funcoid),
    'f', "$sect: function stats do not exist");
is(have_stats('relation', $dboid, $tableoid),
    'f', "$sect: relation stats do not exist");

# get rid of backup statsfile
unlink $statsfile or die "cannot unlink $statsfile $!";


# generate new stats and flush them
trigger_funcrel_stat();

$sect = "post immediate, new";
is(have_stats('database', $dboid, 0), 't', "$sect: db stats do exist");
is(have_stats('function', $dboid, $funcoid),
    't', "$sect: function stats do exist");
is(have_stats('relation', $dboid, $tableoid),
    't', "$sect: relation stats do exist");

# regular shutdown
$node->stop();


## check an invalid stats file is handled

overwrite_file($og_stats, "ZZZZZZZZZZZZZ");

# normal startup and no issues despite invalid stats file
$node->start;

# no stats present due to invalid stats file
$sect = "invalid";
is(have_stats('database', $dboid, 0), 'f', "$sect: db stats do not exist");
is(have_stats('function', $dboid, $funcoid),
    'f', "$sect: function stats do not exist");
is(have_stats('relation', $dboid, $tableoid),
    'f', "$sect: relation stats do not exist");


## checks related to stats persistency around restarts and resets

# Ensure enough checkpoints to protect against races for test after reset,
# even on very slow machines.
$node->safe_psql($connect_db, "CHECKPOINT; CHECKPOINT;");


## check checkpoint and wal stats are incremented due to restart

my $ckpt_start = checkpoint_stats();
my $wal_start = wal_stats();
$node->restart;

$sect = "post restart";
my $ckpt_restart = checkpoint_stats();
my $wal_restart = wal_stats();

cmp_ok(
    $ckpt_start->{count}, '<',
    $ckpt_restart->{count},
    "$sect: increased checkpoint count");
cmp_ok(
    $wal_start->{records}, '<',
    $wal_restart->{records},
    "$sect: increased wal record count");
cmp_ok($wal_start->{bytes}, '<', $wal_restart->{bytes},
    "$sect: increased wal bytes");
is( $ckpt_start->{reset},
    $ckpt_restart->{reset},
    "$sect: checkpoint stats_reset equal");
is($wal_start->{reset}, $wal_restart->{reset},
    "$sect: wal stats_reset equal");


## Check that checkpoint stats are reset, WAL stats aren't affected

$node->safe_psql($connect_db, "SELECT pg_stat_reset_shared('bgwriter')");

$sect = "post ckpt reset";
my $ckpt_reset = checkpoint_stats();
my $wal_ckpt_reset = wal_stats();

cmp_ok($ckpt_restart->{count},
    '>', $ckpt_reset->{count}, "$sect: checkpoint count smaller");
cmp_ok($ckpt_start->{reset}, 'lt', $ckpt_reset->{reset},
    "$sect: stats_reset newer");

cmp_ok(
    $wal_restart->{records},
    '<=',
    $wal_ckpt_reset->{records},
    "$sect: wal record count not affected by reset");
is( $wal_start->{reset},
    $wal_ckpt_reset->{reset},
    "$sect: wal stats_reset equal");


## check that checkpoint stats stay reset after restart

$node->restart;

$sect = "post ckpt reset & restart";
my $ckpt_restart_reset = checkpoint_stats();
my $wal_restart2 = wal_stats();

# made sure above there's enough checkpoints that this will be stable even on
# slow machines
cmp_ok(
    $ckpt_restart_reset->{count},
    '<',
    $ckpt_restart->{count},
    "$sect: checkpoint still reset");
is($ckpt_restart_reset->{reset},
    $ckpt_reset->{reset}, "$sect: stats_reset same");

cmp_ok(
    $wal_ckpt_reset->{records},
    '<',
    $wal_restart2->{records},
    "$sect: increased wal record count");
cmp_ok(
    $wal_ckpt_reset->{bytes},
    '<',
    $wal_restart2->{bytes},
    "$sect: increased wal bytes");
is( $wal_start->{reset},
    $wal_restart2->{reset},
    "$sect: wal stats_reset equal");


## check WAL stats stay reset

$node->safe_psql($connect_db, "SELECT pg_stat_reset_shared('wal')");

$sect = "post wal reset";
my $wal_reset = wal_stats();

cmp_ok(
    $wal_reset->{records}, '<',
    $wal_restart2->{records},
    "$sect: smaller record count");
cmp_ok(
    $wal_reset->{bytes}, '<',
    $wal_restart2->{bytes},
    "$sect: smaller bytes");
cmp_ok(
    $wal_reset->{reset}, 'gt',
    $wal_restart2->{reset},
    "$sect: newer stats_reset");

$node->restart;

$sect = "post wal reset & restart";
my $wal_reset_restart = wal_stats();

# enough WAL generated during prior tests and initdb to make this not racy
cmp_ok(
    $wal_reset_restart->{records},
    '<',
    $wal_restart2->{records},
    "$sect: smaller record count");
cmp_ok(
    $wal_reset->{bytes}, '<',
    $wal_restart2->{bytes},
    "$sect: smaller bytes");
cmp_ok(
    $wal_reset->{reset}, 'gt',
    $wal_restart2->{reset},
    "$sect: newer stats_reset");


$node->stop;
done_testing();

sub trigger_funcrel_stat
{
    $node->safe_psql(
        $db_under_test, q[
    SELECT * FROM tab_stats_crash_discard_test1;
    SELECT func_stats_crash_discard1();
    SELECT pg_stat_force_next_flush();]);
}

sub have_stats
{
    my ($kind, $dboid, $objoid) = @_;

    return $node->safe_psql($connect_db,
        "SELECT pg_stat_have_stats('$kind', $dboid, $objoid)");
}

sub overwrite_file
{
    my ($filename, $str) = @_;
    open my $fh, ">", $filename
      or die "could not write \"$filename\": $!";
    print $fh $str;
    close $fh;
    return;
}

sub checkpoint_stats
{
    my %results;

    $results{count} = $node->safe_psql($connect_db,
        "SELECT checkpoints_timed + checkpoints_req FROM pg_stat_bgwriter");
    $results{reset} = $node->safe_psql($connect_db,
        "SELECT stats_reset FROM pg_stat_bgwriter");

    return \%results;
}

sub wal_stats
{
    my %results;
    $results{records} =
      $node->safe_psql($connect_db, "SELECT wal_records FROM pg_stat_wal");
    $results{bytes} =
      $node->safe_psql($connect_db, "SELECT wal_bytes FROM pg_stat_wal");
    $results{reset} =
      $node->safe_psql($connect_db, "SELECT stats_reset FROM pg_stat_wal");

    return \%results;
}
