Skip to content

Commit c8e8b5a

Browse files
committed
Try to log current the query string when a backend crashes.
To avoid minimize risk inside the postmaster, we subject this feature to a number of significant limitations. We very much wish to avoid doing any complex processing inside the postmaster, due to the posssibility that the crashed backend has completely corrupted shared memory. To that end, no encoding conversion is done; instead, we just replace anything that doesn't look like an ASCII character with a question mark. We limit the amount of data copied to 1024 characters, and carefully sanity check the source of that data. While these restrictions would doubtless be unacceptable in a general-purpose logging facility, even this limited facility seems like an improvement over the status quo ante. Marti Raudsepp, reviewed by PDXPUG and myself
1 parent 9802619 commit c8e8b5a

File tree

5 files changed

+125
-7
lines changed

5 files changed

+125
-7
lines changed

src/backend/postmaster/pgstat.c

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
#include "storage/pg_shmem.h"
5959
#include "storage/pmsignal.h"
6060
#include "storage/procsignal.h"
61+
#include "utils/ascii.h"
6162
#include "utils/guc.h"
6263
#include "utils/memutils.h"
6364
#include "utils/ps_status.h"
@@ -2228,6 +2229,7 @@ static PgBackendStatus *MyBEEntry = NULL;
22282229
static char *BackendClientHostnameBuffer = NULL;
22292230
static char *BackendAppnameBuffer = NULL;
22302231
static char *BackendActivityBuffer = NULL;
2232+
static Size BackendActivityBufferSize = 0;
22312233

22322234

22332235
/*
@@ -2310,9 +2312,12 @@ CreateSharedBackendStatus(void)
23102312
}
23112313

23122314
/* Create or attach to the shared activity buffer */
2313-
size = mul_size(pgstat_track_activity_query_size, MaxBackends);
2315+
BackendActivityBufferSize = mul_size(pgstat_track_activity_query_size,
2316+
MaxBackends);
23142317
BackendActivityBuffer = (char *)
2315-
ShmemInitStruct("Backend Activity Buffer", size, &found);
2318+
ShmemInitStruct("Backend Activity Buffer",
2319+
BackendActivityBufferSize,
2320+
&found);
23162321

23172322
if (!found)
23182323
{
@@ -2751,6 +2756,70 @@ pgstat_get_backend_current_activity(int pid, bool checkUser)
27512756
return "<backend information not available>";
27522757
}
27532758

2759+
/* ----------
2760+
* pgstat_get_crashed_backend_activity() -
2761+
*
2762+
* Return a string representing the current activity of the backend with
2763+
* the specified PID. Like the function above, but reads shared memory with
2764+
* the expectation that it may be corrupt. Returns either a pointer to a
2765+
* constant string, or writes into the 'buffer' argument and returns it.
2766+
*
2767+
* This function is only intended to be used by postmaster to report the
2768+
* query that crashed the backend. In particular, no attempt is made to
2769+
* follow the correct concurrency protocol when accessing the
2770+
* BackendStatusArray. But that's OK, in the worst case we'll return a
2771+
* corrupted message. We also must take care not to trip on ereport(ERROR).
2772+
*
2773+
* Note: return strings for special cases match pg_stat_get_backend_activity.
2774+
* ----------
2775+
*/
2776+
const char *
2777+
pgstat_get_crashed_backend_activity(int pid, char *buffer,
2778+
int len)
2779+
{
2780+
volatile PgBackendStatus *beentry;
2781+
int i;
2782+
2783+
beentry = BackendStatusArray;
2784+
for (i = 1; i <= MaxBackends; i++)
2785+
{
2786+
if (beentry->st_procpid == pid)
2787+
{
2788+
/* Read pointer just once, so it can't change after validation */
2789+
const char *activity = beentry->st_activity;
2790+
const char *activity_last;
2791+
2792+
/*
2793+
* We can't access activity pointer before we verify that it
2794+
* falls into BackendActivityBuffer. To make sure that the entire
2795+
* string including its ending is contained within the buffer,
2796+
* we subtract one activity length from it.
2797+
*/
2798+
activity_last = BackendActivityBuffer + BackendActivityBufferSize
2799+
- pgstat_track_activity_query_size;
2800+
2801+
if (activity < BackendActivityBuffer ||
2802+
activity > activity_last)
2803+
return "<command string corrupt>";
2804+
2805+
if (*(activity) == '\0')
2806+
return "<command string empty>";
2807+
2808+
/*
2809+
* Copy only ASCII-safe characters so we don't run into encoding
2810+
* problems when reporting the message.
2811+
*/
2812+
ascii_safe_strncpy(buffer, activity, len);
2813+
2814+
return buffer;
2815+
}
2816+
2817+
beentry++;
2818+
}
2819+
2820+
/* PID not found */
2821+
return "<backend information not available>";
2822+
}
27542823

27552824
/* ------------------------------------------------------------
27562825
* Local support functions follow

src/backend/postmaster/postmaster.c

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2777,14 +2777,22 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
27772777
static void
27782778
LogChildExit(int lev, const char *procname, int pid, int exitstatus)
27792779
{
2780+
char activity_buffer[1024]; /* default track_activity_query_size */
2781+
const char *activity;
2782+
2783+
activity = pgstat_get_crashed_backend_activity(pid,
2784+
activity_buffer,
2785+
sizeof(activity_buffer));
2786+
27802787
if (WIFEXITED(exitstatus))
27812788
ereport(lev,
27822789

27832790
/*------
27842791
translator: %s is a noun phrase describing a child process, such as
27852792
"server process" */
27862793
(errmsg("%s (PID %d) exited with exit code %d",
2787-
procname, pid, WEXITSTATUS(exitstatus))));
2794+
procname, pid, WEXITSTATUS(exitstatus)),
2795+
errdetail("Running query: %s", activity)));
27882796
else if (WIFSIGNALED(exitstatus))
27892797
#if defined(WIN32)
27902798
ereport(lev,
@@ -2794,7 +2802,8 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus)
27942802
"server process" */
27952803
(errmsg("%s (PID %d) was terminated by exception 0x%X",
27962804
procname, pid, WTERMSIG(exitstatus)),
2797-
errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value.")));
2805+
errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value."),
2806+
errdetail("Running query: %s", activity)));
27982807
#elif defined(HAVE_DECL_SYS_SIGLIST) && HAVE_DECL_SYS_SIGLIST
27992808
ereport(lev,
28002809

@@ -2804,15 +2813,17 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus)
28042813
(errmsg("%s (PID %d) was terminated by signal %d: %s",
28052814
procname, pid, WTERMSIG(exitstatus),
28062815
WTERMSIG(exitstatus) < NSIG ?
2807-
sys_siglist[WTERMSIG(exitstatus)] : "(unknown)")));
2816+
sys_siglist[WTERMSIG(exitstatus)] : "(unknown)"),
2817+
errdetail("Running query: %s", activity)));
28082818
#else
28092819
ereport(lev,
28102820

28112821
/*------
28122822
translator: %s is a noun phrase describing a child process, such as
28132823
"server process" */
28142824
(errmsg("%s (PID %d) was terminated by signal %d",
2815-
procname, pid, WTERMSIG(exitstatus))));
2825+
procname, pid, WTERMSIG(exitstatus)),
2826+
errdetail("Running query: %s", activity)));
28162827
#endif
28172828
else
28182829
ereport(lev,
@@ -2821,7 +2832,8 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus)
28212832
translator: %s is a noun phrase describing a child process, such as
28222833
"server process" */
28232834
(errmsg("%s (PID %d) exited with unrecognized status %d",
2824-
procname, pid, exitstatus)));
2835+
procname, pid, exitstatus),
2836+
errdetail("Running query: %s", activity)));
28252837
}
28262838

28272839
/*

src/backend/utils/adt/ascii.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,37 @@ to_ascii_default(PG_FUNCTION_ARGS)
158158

159159
PG_RETURN_TEXT_P(encode_to_ascii(data, enc));
160160
}
161+
162+
/* ----------
163+
* "Escape" a string in unknown encoding to a valid ASCII string.
164+
* Replace non-ASCII bytes with '?'
165+
* This must not trigger ereport(ERROR), as it is called from postmaster.
166+
*
167+
* Unlike C strncpy(), the result is always terminated with exactly one null
168+
* byte.
169+
* ----------
170+
*/
171+
void
172+
ascii_safe_strncpy(char *dest, const char *src, int len)
173+
{
174+
int i;
175+
176+
for (i = 0; i < (len - 1); i++)
177+
{
178+
unsigned char ch = src[i]; /* use unsigned char here to avoid compiler warning */
179+
180+
if (ch == '\0')
181+
break;
182+
/* Keep printable ASCII characters */
183+
if (32 <= ch && ch <= 127)
184+
dest[i] = ch;
185+
/* White-space is also OK */
186+
else if (ch == '\n' || ch == '\r' || ch == '\t')
187+
dest[i] = ch;
188+
/* Everything else is replaced with '?' */
189+
else
190+
dest[i] = '?';
191+
}
192+
193+
dest[i] = '\0';
194+
}

src/include/pgstat.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -720,6 +720,8 @@ extern void pgstat_report_appname(const char *appname);
720720
extern void pgstat_report_xact_timestamp(TimestampTz tstamp);
721721
extern void pgstat_report_waiting(bool waiting);
722722
extern const char *pgstat_get_backend_current_activity(int pid, bool checkUser);
723+
extern const char *pgstat_get_crashed_backend_activity(int pid, char *buffer,
724+
int len);
723725

724726
extern PgStat_TableStatus *find_tabstat_entry(Oid rel_id);
725727
extern PgStat_BackendFunctionEntry *find_funcstat_entry(Oid func_id);

src/include/utils/ascii.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,6 @@
1616
extern Datum to_ascii_encname(PG_FUNCTION_ARGS);
1717
extern Datum to_ascii_enc(PG_FUNCTION_ARGS);
1818
extern Datum to_ascii_default(PG_FUNCTION_ARGS);
19+
extern void ascii_safe_strncpy(char *dest, const char *src, int len);
1920

2021
#endif /* _ASCII_H_ */

0 commit comments

Comments
 (0)