Skip to content

Commit cd91de0

Browse files
committed
Remove temporary files after backend crash
After a crash of a backend using temporary files, the files used to be left behind, on the basis that it might be useful for debugging. But we don't have any reports of anyone actually doing that, and it means the disk usage may grow over time due to repeated backend failures (possibly even hitting ENOSPC). So this behavior is a bit unfortunate, and fixing it required either manual cleanup (deleting files, which is error-prone) or restart of the instance (i.e. service disruption). This implements automatic cleanup of temporary files, controled by a new GUC remove_temp_files_after_crash. By default the files are removed, but it can be disabled to restore the old behavior if needed. Author: Euler Taveira Reviewed-by: Tomas Vondra, Michael Paquier, Anastasia Lubennikova, Thomas Munro Discussion: https://postgr.es/m/CAH503wDKdYzyq7U-QJqGn%3DGm6XmoK%2B6_6xTJ-Yn5WSvoHLY1Ww%40mail.gmail.com
1 parent da18d82 commit cd91de0

File tree

7 files changed

+233
-5
lines changed

7 files changed

+233
-5
lines changed

doc/src/sgml/config.sgml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9671,6 +9671,23 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
96719671
</listitem>
96729672
</varlistentry>
96739673

9674+
<varlistentry id="guc-remove-temp-files-after-crash" xreflabel="remove_temp_files_after_crash">
9675+
<term><varname>remove_temp_files_after_crash</varname> (<type>boolean</type>)
9676+
<indexterm>
9677+
<primary><varname>remove_temp_files_after_crash</varname> configuration parameter</primary>
9678+
</indexterm>
9679+
</term>
9680+
<listitem>
9681+
<para>
9682+
When set to <literal>on</literal>, which is the default,
9683+
<productname>PostgreSQL</productname> will automatically remove
9684+
temporary files after a backend crash. If disabled, the files will be
9685+
retained and may be used for debugging, for example. Repeated crashes
9686+
may however result in accumulation of useless files.
9687+
</para>
9688+
</listitem>
9689+
</varlistentry>
9690+
96749691
<varlistentry id="guc-data-sync-retry" xreflabel="data_sync_retry">
96759692
<term><varname>data_sync_retry</varname> (<type>boolean</type>)
96769693
<indexterm>

src/backend/postmaster/postmaster.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@ bool Db_user_namespace = false;
242242
bool enable_bonjour = false;
243243
char *bonjour_name;
244244
bool restart_after_crash = true;
245+
bool remove_temp_files_after_crash = true;
245246

246247
/* PIDs of special child processes; 0 when not running */
247248
static pid_t StartupPID = 0,
@@ -3976,6 +3977,10 @@ PostmasterStateMachine(void)
39763977
ereport(LOG,
39773978
(errmsg("all server processes terminated; reinitializing")));
39783979

3980+
/* remove leftover temporary files after a crash */
3981+
if (remove_temp_files_after_crash)
3982+
RemovePgTempFiles();
3983+
39793984
/* allow background workers to immediately restart */
39803985
ResetBackgroundWorkerCrashTimes();
39813986

src/backend/storage/file/fd.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3024,11 +3024,13 @@ CleanupTempFiles(bool isCommit, bool isProcExit)
30243024
* remove any leftover files created by OpenTemporaryFile and any leftover
30253025
* temporary relation files created by mdcreate.
30263026
*
3027-
* NOTE: we could, but don't, call this during a post-backend-crash restart
3028-
* cycle. The argument for not doing it is that someone might want to examine
3029-
* the temp files for debugging purposes. This does however mean that
3030-
* OpenTemporaryFile had better allow for collision with an existing temp
3031-
* file name.
3027+
* During post-backend-crash restart cycle, this routine is called when
3028+
* remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3029+
* queries are using temp files could result in useless storage usage that can
3030+
* only be reclaimed by a service restart. The argument against enabling it is
3031+
* that someone might want to examine the temporary files for debugging
3032+
* purposes. This does however mean that OpenTemporaryFile had better allow for
3033+
* collision with an existing temp file name.
30323034
*
30333035
* NOTE: this function and its subroutines generally report syscall failures
30343036
* with ereport(LOG) and keep going. Removing temp files is not so critical

src/backend/utils/misc/guc.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1371,6 +1371,15 @@ static struct config_bool ConfigureNamesBool[] =
13711371
true,
13721372
NULL, NULL, NULL
13731373
},
1374+
{
1375+
{"remove_temp_files_after_crash", PGC_SIGHUP, ERROR_HANDLING_OPTIONS,
1376+
gettext_noop("Remove temporary files after backend crash."),
1377+
NULL
1378+
},
1379+
&remove_temp_files_after_crash,
1380+
true,
1381+
NULL, NULL, NULL
1382+
},
13741383

13751384
{
13761385
{"log_duration", PGC_SUSET, LOGGING_WHAT,

src/backend/utils/misc/postgresql.conf.sample

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,8 @@
758758

759759
#exit_on_error = off # terminate session on any error?
760760
#restart_after_crash = on # reinitialize after backend crash?
761+
#remove_temp_files_after_crash = on # remove temporary files after
762+
# backend crash?
761763
#data_sync_retry = off # retry or panic on failure to fsync
762764
# data?
763765
# (change requires restart)

src/include/postmaster/postmaster.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ extern bool log_hostname;
2929
extern bool enable_bonjour;
3030
extern char *bonjour_name;
3131
extern bool restart_after_crash;
32+
extern bool remove_temp_files_after_crash;
3233

3334
#ifdef WIN32
3435
extern HANDLE PostmasterHandle;
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
# Test remove of temporary files after a crash.
2+
use strict;
3+
use warnings;
4+
use PostgresNode;
5+
use TestLib;
6+
use Test::More;
7+
use Config;
8+
use Time::HiRes qw(usleep);
9+
10+
plan tests => 9;
11+
12+
13+
# To avoid hanging while expecting some specific input from a psql
14+
# instance being driven by us, add a timeout high enough that it
15+
# should never trigger even on very slow machines, unless something
16+
# is really wrong.
17+
my $psql_timeout = IPC::Run::timer(60);
18+
19+
my $node = get_new_node('node_crash');
20+
$node->init();
21+
$node->start();
22+
23+
# By default, PostgresNode doesn't restart after crash
24+
# Reduce work_mem to generate temporary file with a few number of rows
25+
$node->safe_psql(
26+
'postgres',
27+
q[ALTER SYSTEM SET remove_temp_files_after_crash = on;
28+
ALTER SYSTEM SET log_connections = 1;
29+
ALTER SYSTEM SET work_mem = '64kB';
30+
ALTER SYSTEM SET restart_after_crash = on;
31+
SELECT pg_reload_conf();]);
32+
33+
# create table, insert rows
34+
$node->safe_psql(
35+
'postgres',
36+
q[CREATE TABLE tab_crash (a text);
37+
INSERT INTO tab_crash (a) SELECT gen_random_uuid() FROM generate_series(1, 500);]);
38+
39+
# Run psql, keeping session alive, so we have an alive backend to kill.
40+
my ($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', '');
41+
my $killme = IPC::Run::start(
42+
[
43+
'psql', '-X', '-qAt', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d',
44+
$node->connstr('postgres')
45+
],
46+
'<',
47+
\$killme_stdin,
48+
'>',
49+
\$killme_stdout,
50+
'2>',
51+
\$killme_stderr,
52+
$psql_timeout);
53+
54+
# Get backend pid
55+
$killme_stdin .= q[
56+
SELECT pg_backend_pid();
57+
];
58+
ok(pump_until($killme, \$killme_stdout, qr/[[:digit:]]+[\r\n]$/m),
59+
'acquired pid for SIGKILL');
60+
my $pid = $killme_stdout;
61+
chomp($pid);
62+
$killme_stdout = '';
63+
$killme_stderr = '';
64+
65+
# Run the query that generates a temporary file and that will be killed before
66+
# it finishes. Since the query that generates the temporary file does not
67+
# return before the connection is killed, use a SELECT before to trigger
68+
# pump_until.
69+
$killme_stdin .= q[
70+
BEGIN;
71+
SELECT $$in-progress-before-sigkill$$;
72+
WITH foo AS (SELECT a FROM tab_crash ORDER BY a) SELECT a, pg_sleep(1) FROM foo;
73+
];
74+
ok(pump_until($killme, \$killme_stdout, qr/in-progress-before-sigkill/m),
75+
'select in-progress-before-sigkill');
76+
$killme_stdout = '';
77+
$killme_stderr = '';
78+
79+
# Wait some time so the temporary file is generated by SELECT
80+
usleep(10_000);
81+
82+
# Kill with SIGKILL
83+
my $ret = TestLib::system_log('pg_ctl', 'kill', 'KILL', $pid);
84+
is($ret, 0, 'killed process with KILL');
85+
86+
# Close psql session
87+
$killme->finish;
88+
89+
# Wait till server restarts
90+
$node->poll_query_until('postgres', 'SELECT 1', '1');
91+
92+
# Check for temporary files
93+
is($node->safe_psql(
94+
'postgres',
95+
'SELECT COUNT(1) FROM pg_ls_dir($$base/pgsql_tmp$$)'),
96+
qq(0), 'no temporary files');
97+
98+
#
99+
# Test old behavior (don't remove temporary files after crash)
100+
#
101+
$node->safe_psql(
102+
'postgres',
103+
q[ALTER SYSTEM SET remove_temp_files_after_crash = off;
104+
SELECT pg_reload_conf();]);
105+
106+
# Restart psql session
107+
($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', '');
108+
$killme->run();
109+
110+
# Get backend pid
111+
$killme_stdin .= q[
112+
SELECT pg_backend_pid();
113+
];
114+
ok(pump_until($killme, \$killme_stdout, qr/[[:digit:]]+[\r\n]$/m),
115+
'acquired pid for SIGKILL');
116+
$pid = $killme_stdout;
117+
chomp($pid);
118+
$killme_stdout = '';
119+
$killme_stderr = '';
120+
121+
# Run the query that generates a temporary file and that will be killed before
122+
# it finishes. Since the query that generates the temporary file does not
123+
# return before the connection is killed, use a SELECT before to trigger
124+
# pump_until.
125+
$killme_stdin .= q[
126+
BEGIN;
127+
SELECT $$in-progress-before-sigkill$$;
128+
WITH foo AS (SELECT a FROM tab_crash ORDER BY a) SELECT a, pg_sleep(1) FROM foo;
129+
];
130+
ok(pump_until($killme, \$killme_stdout, qr/in-progress-before-sigkill/m),
131+
'select in-progress-before-sigkill');
132+
$killme_stdout = '';
133+
$killme_stderr = '';
134+
135+
# Wait some time so the temporary file is generated by SELECT
136+
usleep(10_000);
137+
138+
# Kill with SIGKILL
139+
$ret = TestLib::system_log('pg_ctl', 'kill', 'KILL', $pid);
140+
is($ret, 0, 'killed process with KILL');
141+
142+
# Close psql session
143+
$killme->finish;
144+
145+
# Wait till server restarts
146+
$node->poll_query_until('postgres', 'SELECT 1', '1');
147+
148+
# Check for temporary files -- should be there
149+
is($node->safe_psql(
150+
'postgres',
151+
'SELECT COUNT(1) FROM pg_ls_dir($$base/pgsql_tmp$$)'),
152+
qq(1), 'one temporary file');
153+
154+
# Restart should remove the temporary files
155+
$node->restart();
156+
157+
# Check the temporary files -- should be gone
158+
is($node->safe_psql(
159+
'postgres',
160+
'SELECT COUNT(1) FROM pg_ls_dir($$base/pgsql_tmp$$)'),
161+
qq(0), 'temporary file was removed');
162+
163+
$node->stop();
164+
165+
# Pump until string is matched, or timeout occurs
166+
sub pump_until
167+
{
168+
my ($proc, $stream, $untl) = @_;
169+
$proc->pump_nb();
170+
while (1)
171+
{
172+
last if $$stream =~ /$untl/;
173+
if ($psql_timeout->is_expired)
174+
{
175+
diag("aborting wait: program timed out");
176+
diag("stream contents: >>", $$stream, "<<");
177+
diag("pattern searched for: ", $untl);
178+
179+
return 0;
180+
}
181+
if (not $proc->pumpable())
182+
{
183+
diag("aborting wait: program died");
184+
diag("stream contents: >>", $$stream, "<<");
185+
diag("pattern searched for: ", $untl);
186+
187+
return 0;
188+
}
189+
$proc->pump();
190+
}
191+
return 1;
192+
}

0 commit comments

Comments
 (0)