Skip to content

Commit 7577dd8

Browse files
committed
Properly detoast data in brin_form_tuple
brin_form_tuple failed to consider the values may be toasted, inserting the toast pointer into the index. This may easily result in index corruption, as the toast data may be deleted and cleaned up by vacuum. The cleanup however does not care about indexes, leaving invalid toast pointers behind, which triggers errors like this: ERROR: missing chunk number 0 for toast value 16433 in pg_toast_16426 A less severe consequence are inconsistent failures due to the index row being too large, depending on whether brin_form_tuple operated on plain or toasted version of the row. For example CREATE TABLE t (val TEXT); INSERT INTO t VALUES ('... long value ...') CREATE INDEX idx ON t USING brin (val); would likely succeed, as the row would likely include toast pointer. Switching the order of INSERT and CREATE INDEX would likely fail: ERROR: index row size 8712 exceeds maximum 8152 for index "idx" because this happens before the row values are toasted. The bug exists since PostgreSQL 9.5 where BRIN indexes were introduced. So backpatch all the way back. Author: Tomas Vondra Reviewed-by: Alvaro Herrera Backpatch-through: 9.5 Discussion: https://postgr.es/m/20201001184133.oq5uq75sb45pu3aw@development Discussion: https://postgr.es/m/20201104010544.zexj52mlldagzowv%40development
1 parent eeda7f6 commit 7577dd8

File tree

3 files changed

+181
-1
lines changed

3 files changed

+181
-1
lines changed

src/backend/access/brin/brin_tuple.c

Lines changed: 101 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,23 @@
3232
#include "postgres.h"
3333

3434
#include "access/brin_tuple.h"
35+
#include "access/detoast.h"
36+
#include "access/heaptoast.h"
3537
#include "access/htup_details.h"
38+
#include "access/toast_internals.h"
3639
#include "access/tupdesc.h"
3740
#include "access/tupmacs.h"
3841
#include "utils/datum.h"
3942
#include "utils/memutils.h"
4043

44+
45+
/*
46+
* This enables de-toasting of index entries. Needed until VACUUM is
47+
* smart enough to rebuild indexes from scratch.
48+
*/
49+
#define TOAST_INDEX_HACK
50+
51+
4152
static inline void brin_deconstruct_tuple(BrinDesc *brdesc,
4253
char *tp, bits8 *nullbits, bool nulls,
4354
Datum *values, bool *allnulls, bool *hasnulls);
@@ -99,6 +110,12 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
99110
Size len,
100111
hoff,
101112
data_len;
113+
int i;
114+
115+
#ifdef TOAST_INDEX_HACK
116+
Datum *untoasted_values;
117+
int nuntoasted = 0;
118+
#endif
102119

103120
Assert(brdesc->bd_totalstored > 0);
104121

@@ -107,6 +124,10 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
107124
phony_nullbitmap = (bits8 *)
108125
palloc(sizeof(bits8) * BITMAPLEN(brdesc->bd_totalstored));
109126

127+
#ifdef TOAST_INDEX_HACK
128+
untoasted_values = (Datum *) palloc(sizeof(Datum) * brdesc->bd_totalstored);
129+
#endif
130+
110131
/*
111132
* Set up the values/nulls arrays for heap_fill_tuple
112133
*/
@@ -138,10 +159,84 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
138159
if (tuple->bt_columns[keyno].bv_hasnulls)
139160
anynulls = true;
140161

162+
/*
163+
* Now obtain the values of each stored datum. Note that some values
164+
* might be toasted, and we cannot rely on the original heap values
165+
* sticking around forever, so we must detoast them. Also try to
166+
* compress them.
167+
*/
141168
for (datumno = 0;
142169
datumno < brdesc->bd_info[keyno]->oi_nstored;
143170
datumno++)
144-
values[idxattno++] = tuple->bt_columns[keyno].bv_values[datumno];
171+
{
172+
Datum value = tuple->bt_columns[keyno].bv_values[datumno];
173+
174+
#ifdef TOAST_INDEX_HACK
175+
176+
/* We must look at the stored type, not at the index descriptor. */
177+
TypeCacheEntry *atttype = brdesc->bd_info[keyno]->oi_typcache[datumno];
178+
179+
/* Do we need to free the value at the end? */
180+
bool free_value = false;
181+
182+
/* For non-varlena types we don't need to do anything special */
183+
if (atttype->typlen != -1)
184+
{
185+
values[idxattno++] = value;
186+
continue;
187+
}
188+
189+
/*
190+
* Do nothing if value is not of varlena type. We don't need to
191+
* care about NULL values here, thanks to bv_allnulls above.
192+
*
193+
* If value is stored EXTERNAL, must fetch it so we are not
194+
* depending on outside storage.
195+
*
196+
* XXX Is this actually true? Could it be that the summary is
197+
* NULL even for range with non-NULL data? E.g. degenerate bloom
198+
* filter may be thrown away, etc.
199+
*/
200+
if (VARATT_IS_EXTERNAL(DatumGetPointer(value)))
201+
{
202+
value = PointerGetDatum(detoast_external_attr((struct varlena *)
203+
DatumGetPointer(value)));
204+
free_value = true;
205+
}
206+
207+
/*
208+
* If value is above size target, and is of a compressible datatype,
209+
* try to compress it in-line.
210+
*/
211+
if (!VARATT_IS_EXTENDED(DatumGetPointer(value)) &&
212+
VARSIZE(DatumGetPointer(value)) > TOAST_INDEX_TARGET &&
213+
(atttype->typstorage == TYPSTORAGE_EXTENDED ||
214+
atttype->typstorage == TYPSTORAGE_MAIN))
215+
{
216+
Datum cvalue = toast_compress_datum(value);
217+
218+
if (DatumGetPointer(cvalue) != NULL)
219+
{
220+
/* successful compression */
221+
if (free_value)
222+
pfree(DatumGetPointer(value));
223+
224+
value = cvalue;
225+
free_value = true;
226+
}
227+
}
228+
229+
/*
230+
* If we untoasted / compressed the value, we need to free it
231+
* after forming the index tuple.
232+
*/
233+
if (free_value)
234+
untoasted_values[nuntoasted++] = value;
235+
236+
#endif
237+
238+
values[idxattno++] = value;
239+
}
145240
}
146241

147242
/* Assert we did not overrun temp arrays */
@@ -193,6 +288,11 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
193288
pfree(nulls);
194289
pfree(phony_nullbitmap);
195290

291+
#ifdef TOAST_INDEX_HACK
292+
for (i = 0; i < nuntoasted; i++)
293+
pfree(DatumGetPointer(untoasted_values[i]));
294+
#endif
295+
196296
/*
197297
* Now fill in the real null bitmasks. allnulls first.
198298
*/

src/test/regress/expected/brin.out

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -526,3 +526,44 @@ EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE b = 1;
526526
Filter: (b = 1)
527527
(2 rows)
528528

529+
-- make sure data are properly de-toasted in BRIN index
530+
CREATE TABLE brintest_3 (a text, b text, c text, d text);
531+
-- long random strings (~2000 chars each, so ~6kB for min/max on two
532+
-- columns) to trigger toasting
533+
WITH rand_value AS (SELECT string_agg(md5(i::text),'') AS val FROM generate_series(1,60) s(i))
534+
INSERT INTO brintest_3
535+
SELECT val, val, val, val FROM rand_value;
536+
CREATE INDEX brin_test_toast_idx ON brintest_3 USING brin (b, c);
537+
DELETE FROM brintest_3;
538+
-- We need to wait a bit for all transactions to complete, so that the
539+
-- vacuum actually removes the TOAST rows. Creating an index concurrently
540+
-- is a one way to achieve that, because it does exactly such wait.
541+
CREATE INDEX CONCURRENTLY brin_test_temp_idx ON brintest_3(a);
542+
DROP INDEX brin_test_temp_idx;
543+
-- vacuum the table, to discard TOAST data
544+
VACUUM brintest_3;
545+
-- retry insert with a different random-looking (but deterministic) value
546+
-- the value is different, and so should replace either min or max in the
547+
-- brin summary
548+
WITH rand_value AS (SELECT string_agg(md5((-i)::text),'') AS val FROM generate_series(1,60) s(i))
549+
INSERT INTO brintest_3
550+
SELECT val, val, val, val FROM rand_value;
551+
-- now try some queries, accessing the brin index
552+
SET enable_seqscan = off;
553+
EXPLAIN (COSTS OFF)
554+
SELECT * FROM brintest_3 WHERE b < '0';
555+
QUERY PLAN
556+
------------------------------------------------
557+
Bitmap Heap Scan on brintest_3
558+
Recheck Cond: (b < '0'::text)
559+
-> Bitmap Index Scan on brin_test_toast_idx
560+
Index Cond: (b < '0'::text)
561+
(4 rows)
562+
563+
SELECT * FROM brintest_3 WHERE b < '0';
564+
a | b | c | d
565+
---+---+---+---
566+
(0 rows)
567+
568+
DROP TABLE brintest_3;
569+
RESET enable_seqscan;

src/test/regress/sql/brin.sql

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -470,3 +470,42 @@ VACUUM ANALYZE brin_test;
470470
EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE a = 1;
471471
-- Ensure brin index is not used when values are not correlated
472472
EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE b = 1;
473+
474+
-- make sure data are properly de-toasted in BRIN index
475+
CREATE TABLE brintest_3 (a text, b text, c text, d text);
476+
477+
-- long random strings (~2000 chars each, so ~6kB for min/max on two
478+
-- columns) to trigger toasting
479+
WITH rand_value AS (SELECT string_agg(md5(i::text),'') AS val FROM generate_series(1,60) s(i))
480+
INSERT INTO brintest_3
481+
SELECT val, val, val, val FROM rand_value;
482+
483+
CREATE INDEX brin_test_toast_idx ON brintest_3 USING brin (b, c);
484+
DELETE FROM brintest_3;
485+
486+
-- We need to wait a bit for all transactions to complete, so that the
487+
-- vacuum actually removes the TOAST rows. Creating an index concurrently
488+
-- is a one way to achieve that, because it does exactly such wait.
489+
CREATE INDEX CONCURRENTLY brin_test_temp_idx ON brintest_3(a);
490+
DROP INDEX brin_test_temp_idx;
491+
492+
-- vacuum the table, to discard TOAST data
493+
VACUUM brintest_3;
494+
495+
-- retry insert with a different random-looking (but deterministic) value
496+
-- the value is different, and so should replace either min or max in the
497+
-- brin summary
498+
WITH rand_value AS (SELECT string_agg(md5((-i)::text),'') AS val FROM generate_series(1,60) s(i))
499+
INSERT INTO brintest_3
500+
SELECT val, val, val, val FROM rand_value;
501+
502+
-- now try some queries, accessing the brin index
503+
SET enable_seqscan = off;
504+
505+
EXPLAIN (COSTS OFF)
506+
SELECT * FROM brintest_3 WHERE b < '0';
507+
508+
SELECT * FROM brintest_3 WHERE b < '0';
509+
510+
DROP TABLE brintest_3;
511+
RESET enable_seqscan;

0 commit comments

Comments
 (0)