Skip to content

Commit 2cd4e83

Browse files
committed
Fix contrib/auto_explain to not cause problems in parallel workers.
A parallel worker process should not be making any decisions of its own about whether to auto-explain. If the parent session process passed down flags asking for instrumentation data, do that, otherwise not. Trying to enable instrumentation anyway leads to bugs like the "could not find key N in shm TOC" failure reported in bug #15821 from Christian Hofstaedtler. We can implement this cheaply by piggybacking on the existing logic for not doing anything when we've chosen not to sample a statement. While at it, clean up some tin-eared coding related to the sampling feature, including an off-by-one error that meant that asking for 1.0 sampling rate didn't actually result in sampling every statement. Although the specific case reported here only manifested in >= v11, I believe that related misbehaviors can be demonstrated in any version that has parallel query; and the off-by-one error is certainly there back to 9.6 where that feature was added. So back-patch to 9.6. Discussion: https://postgr.es/m/15821-5eb422e980594075@postgresql.org
1 parent eaf0292 commit 2cd4e83

File tree

1 file changed

+28
-15
lines changed

1 file changed

+28
-15
lines changed

contrib/auto_explain/auto_explain.c

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#include <limits.h>
1616

17+
#include "access/parallel.h"
1718
#include "commands/explain.h"
1819
#include "executor/instrument.h"
1920
#include "jit/jit.h"
@@ -59,19 +60,20 @@ static const struct config_enum_entry loglevel_options[] = {
5960
/* Current nesting depth of ExecutorRun calls */
6061
static int nesting_level = 0;
6162

63+
/* Is the current top-level query to be sampled? */
64+
static bool current_query_sampled = false;
65+
66+
#define auto_explain_enabled() \
67+
(auto_explain_log_min_duration >= 0 && \
68+
(nesting_level == 0 || auto_explain_log_nested_statements) && \
69+
current_query_sampled)
70+
6271
/* Saved hook values in case of unload */
6372
static ExecutorStart_hook_type prev_ExecutorStart = NULL;
6473
static ExecutorRun_hook_type prev_ExecutorRun = NULL;
6574
static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
6675
static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
6776

68-
/* Is the current query sampled, per backend */
69-
static bool current_query_sampled = true;
70-
71-
#define auto_explain_enabled() \
72-
(auto_explain_log_min_duration >= 0 && \
73-
(nesting_level == 0 || auto_explain_log_nested_statements))
74-
7577
void _PG_init(void);
7678
void _PG_fini(void);
7779

@@ -249,14 +251,25 @@ static void
249251
explain_ExecutorStart(QueryDesc *queryDesc, int eflags)
250252
{
251253
/*
252-
* For rate sampling, randomly choose top-level statement. Either all
253-
* nested statements will be explained or none will.
254+
* At the beginning of each top-level statement, decide whether we'll
255+
* sample this statement. If nested-statement explaining is enabled,
256+
* either all nested statements will be explained or none will.
257+
*
258+
* When in a parallel worker, we should do nothing, which we can implement
259+
* cheaply by pretending we decided not to sample the current statement.
260+
* If EXPLAIN is active in the parent session, data will be collected and
261+
* reported back to the parent, and it's no business of ours to interfere.
254262
*/
255-
if (auto_explain_log_min_duration >= 0 && nesting_level == 0)
256-
current_query_sampled = (random() < auto_explain_sample_rate *
257-
MAX_RANDOM_VALUE);
263+
if (nesting_level == 0)
264+
{
265+
if (auto_explain_log_min_duration >= 0 && !IsParallelWorker())
266+
current_query_sampled = (random() < auto_explain_sample_rate *
267+
((double) MAX_RANDOM_VALUE + 1));
268+
else
269+
current_query_sampled = false;
270+
}
258271

259-
if (auto_explain_enabled() && current_query_sampled)
272+
if (auto_explain_enabled())
260273
{
261274
/* Enable per-node instrumentation iff log_analyze is required. */
262275
if (auto_explain_log_analyze && (eflags & EXEC_FLAG_EXPLAIN_ONLY) == 0)
@@ -275,7 +288,7 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags)
275288
else
276289
standard_ExecutorStart(queryDesc, eflags);
277290

278-
if (auto_explain_enabled() && current_query_sampled)
291+
if (auto_explain_enabled())
279292
{
280293
/*
281294
* Set up to track total elapsed time in ExecutorRun. Make sure the
@@ -346,7 +359,7 @@ explain_ExecutorFinish(QueryDesc *queryDesc)
346359
static void
347360
explain_ExecutorEnd(QueryDesc *queryDesc)
348361
{
349-
if (queryDesc->totaltime && auto_explain_enabled() && current_query_sampled)
362+
if (queryDesc->totaltime && auto_explain_enabled())
350363
{
351364
double msec;
352365

0 commit comments

Comments
 (0)