Commit 93840f9

Improve contrib/pg_stat_statements' handling of garbage collection failure.
If we can't read the query texts file (whether because of out-of-memory, or for some other reason), give up and reset the file to empty, discarding all stored query texts, though not the statistics per se. We used to leave things alone and hope for better luck next time, but the problem is that the file is only going to get bigger and even harder to slurp into memory. Better to do something that will get us out of trouble.

Likewise, reset the file to empty for any other failure within gc_qtexts(). The previous behavior after a write error was to discard query texts but not do anything to truncate the file, which is just weird.

Also, increase the maximum supported file size from MaxAllocSize to MaxAllocHugeSize; this makes it more likely we'll be able to do a garbage collection successfully.

Also, fix the recalculation of mean_query_len within entry_dealloc() to match the calculation in gc_qtexts(). The previous coding overlooked the possibility of dropped texts (query_len == -1) and would underestimate the mean of the remaining entries in such cases, thus possibly causing excess garbage collection cycles.

In passing, add some errdetail to the log entry that complains about insufficient memory to read the query texts file, which after all was Jim Nasby's original complaint.

Back-patch to 9.4, where the current handling of query texts was introduced.

Peter Geoghegan, rather editorialized upon by me
1 parent 4075fc4 commit 93840f9
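
A quick worked illustration of the mean_query_len underestimate described above (this sketch is not part of the commit, and the query_len values are made-up sample data): the old entry_dealloc() averaging divided by the total number of entries, so entries whose text had already been dropped (query_len == -1) dragged the mean down, whereas the new code averages only over entries that still have a valid text.

    /*
     * Illustrative sketch only, not code from the extension.  Dropped texts
     * are marked with -1, as pgssEntry.query_len is after this patch.
     */
    #include <stdio.h>

    int
    main(void)
    {
        int     query_len[] = {120, -1, 300, 250, -1, 180};    /* sample data */
        int     n = sizeof(query_len) / sizeof(query_len[0]);
        size_t  totlen = 0;         /* old method: divides by every entry */
        size_t  tottextlen = 0;     /* new method: valid texts only */
        int     nvalidtexts = 0;
        int     i;

        for (i = 0; i < n; i++)
        {
            totlen += query_len[i] + 1;     /* a dropped text contributes 0 */
            if (query_len[i] >= 0)
            {
                tottextlen += query_len[i] + 1;
                nvalidtexts++;
            }
        }

        printf("old mean_query_len = %zu\n", totlen / n);                /* 142 */
        printf("new mean_query_len = %zu\n", tottextlen / nvalidtexts);  /* 213 */
        return 0;
    }

With a third of the texts dropped, the old calculation reports a mean of 142 bytes while the surviving texts actually average 213, which is exactly the kind of underestimate that can trigger the excess garbage-collection cycles mentioned above.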

File tree

1 file changed: +75 -18 lines changed

contrib/pg_stat_statements/pg_stat_statements.c (+75 -18)
@@ -165,7 +165,7 @@ typedef struct pgssEntry
 	pgssHashKey key;			/* hash key of entry - MUST BE FIRST */
 	Counters	counters;		/* the statistics for this query */
 	Size		query_offset;	/* query text offset in external file */
-	int			query_len;		/* # of valid bytes in query string */
+	int			query_len;		/* # of valid bytes in query string, or -1 */
 	int			encoding;		/* query text encoding */
 	slock_t		mutex;			/* protects the counters only */
 } pgssEntry;
@@ -1639,7 +1639,8 @@ entry_cmp(const void *lhs, const void *rhs)
 }
 
 /*
- * Deallocate least used entries.
+ * Deallocate least-used entries.
+ *
  * Caller must hold an exclusive lock on pgss->lock.
  */
 static void
@@ -1650,17 +1651,27 @@ entry_dealloc(void)
 	pgssEntry  *entry;
 	int			nvictims;
 	int			i;
-	Size		totlen = 0;
+	Size		tottextlen;
+	int			nvalidtexts;
 
 	/*
 	 * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
 	 * While we're scanning the table, apply the decay factor to the usage
-	 * values.
+	 * values, and update the mean query length.
+	 *
+	 * Note that the mean query length is almost immediately obsolete, since
+	 * we compute it before not after discarding the least-used entries.
+	 * Hopefully, that doesn't affect the mean too much; it doesn't seem worth
+	 * making two passes to get a more current result.  Likewise, the new
+	 * cur_median_usage includes the entries we're about to zap.
 	 */
 
 	entries = palloc(hash_get_num_entries(pgss_hash) * sizeof(pgssEntry *));
 
 	i = 0;
+	tottextlen = 0;
+	nvalidtexts = 0;
+
 	hash_seq_init(&hash_seq, pgss_hash);
 	while ((entry = hash_seq_search(&hash_seq)) != NULL)
 	{
@@ -1670,20 +1681,27 @@ entry_dealloc(void)
 			entry->counters.usage *= STICKY_DECREASE_FACTOR;
 		else
 			entry->counters.usage *= USAGE_DECREASE_FACTOR;
-		/* Accumulate total size, too. */
-		totlen += entry->query_len + 1;
+		/* In the mean length computation, ignore dropped texts. */
+		if (entry->query_len >= 0)
+		{
+			tottextlen += entry->query_len + 1;
+			nvalidtexts++;
+		}
 	}
 
+	/* Sort into increasing order by usage */
 	qsort(entries, i, sizeof(pgssEntry *), entry_cmp);
 
+	/* Record the (approximate) median usage */
 	if (i > 0)
-	{
-		/* Record the (approximate) median usage */
 		pgss->cur_median_usage = entries[i / 2]->counters.usage;
-		/* Record the mean query length */
-		pgss->mean_query_len = totlen / i;
-	}
+	/* Record the mean query length */
+	if (nvalidtexts > 0)
+		pgss->mean_query_len = tottextlen / nvalidtexts;
+	else
+		pgss->mean_query_len = ASSUMED_LENGTH_INIT;
 
+	/* Now zap an appropriate fraction of lowest-usage entries */
 	nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
 	nvictims = Min(nvictims, i);
 
@@ -1826,15 +1844,17 @@ qtext_load_file(Size *buffer_size)
 	}
 
 	/* Allocate buffer; beware that off_t might be wider than size_t */
-	if (stat.st_size <= MaxAllocSize)
+	if (stat.st_size <= MaxAllocHugeSize)
 		buf = (char *) malloc(stat.st_size);
 	else
 		buf = NULL;
 	if (buf == NULL)
 	{
 		ereport(LOG,
 				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory")));
+				 errmsg("out of memory"),
+				 errdetail("Could not allocate enough memory to read pg_stat_statement file \"%s\".",
+						   PGSS_TEXT_FILE)));
 		CloseTransientFile(fd);
 		return NULL;
 	}
@@ -1936,13 +1956,17 @@ need_gc_qtexts(void)
  * occur in the foreseeable future.
  *
  * The caller must hold an exclusive lock on pgss->lock.
+ *
+ * At the first sign of trouble we unlink the query text file to get a clean
+ * slate (although existing statistics are retained), rather than risk
+ * thrashing by allowing the same problem case to recur indefinitely.
  */
 static void
 gc_qtexts(void)
 {
 	char	   *qbuffer;
 	Size		qbuffer_size;
-	FILE	   *qfile;
+	FILE	   *qfile = NULL;
 	HASH_SEQ_STATUS hash_seq;
 	pgssEntry  *entry;
 	Size		extent;
@@ -1957,12 +1981,15 @@ gc_qtexts(void)
 		return;
 
 	/*
-	 * Load the old texts file.  If we fail (out of memory, for instance) just
-	 * skip the garbage collection.
+	 * Load the old texts file.  If we fail (out of memory, for instance),
+	 * invalidate query texts.  Hopefully this is rare.  It might seem better
+	 * to leave things alone on an OOM failure, but the problem is that the
+	 * file is only going to get bigger; hoping for a future non-OOM result is
+	 * risky and can easily lead to complete denial of service.
 	 */
 	qbuffer = qtext_load_file(&qbuffer_size);
 	if (qbuffer == NULL)
-		return;
+		goto gc_fail;
 
 	/*
 	 * We overwrite the query texts file in place, so as to reduce the risk of
@@ -1997,6 +2024,7 @@ gc_qtexts(void)
 			/* Trouble ... drop the text */
 			entry->query_offset = 0;
 			entry->query_len = -1;
+			/* entry will not be counted in mean query length computation */
 			continue;
 		}
 
@@ -2081,7 +2109,36 @@ gc_qtexts(void)
 		entry->query_len = -1;
 	}
 
-	/* Seems like a good idea to bump the GC count even though we failed */
+	/*
+	 * Destroy the query text file and create a new, empty one
+	 */
+	(void) unlink(PGSS_TEXT_FILE);
+	qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
+	if (qfile == NULL)
+		ereport(LOG,
+				(errcode_for_file_access(),
+				 errmsg("could not write new pg_stat_statement file \"%s\": %m",
+						PGSS_TEXT_FILE)));
+	else
+		FreeFile(qfile);
+
+	/* Reset the shared extent pointer */
+	pgss->extent = 0;
+
+	/* Reset mean_query_len to match the new state */
+	pgss->mean_query_len = ASSUMED_LENGTH_INIT;
+
+	/*
+	 * Bump the GC count even though we failed.
+	 *
+	 * This is needed to make concurrent readers of file without any lock on
+	 * pgss->lock notice existence of new version of file.  Once readers
+	 * subsequently observe a change in GC count with pgss->lock held, that
+	 * forces a safe reopen of file.  Writers also require that we bump here,
+	 * of course.  (As required by locking protocol, readers and writers don't
+	 * trust earlier file contents until gc_count is found unchanged after
+	 * pgss->lock acquired in shared or exclusive mode respectively.)
+	 */
 	record_gc_qtexts();
 }
 
