Skip to content

Commit 7fc7a7c

Browse files
committed
Fix a violation of WAL coding rules in the recent patch to include an
"all tuples visible" flag in heap page headers. The flag update *must* be applied before calling XLogInsert, but heap_update and the tuple moving routines in VACUUM FULL were ignoring this rule. A crash and replay could therefore leave the flag incorrectly set, causing rows to appear visible in seqscans when they should not be. This might explain recent reports of data corruption from Jeff Ross and others. In passing, do a bit of editorialization on comments in visibilitymap.c.
1 parent cab9a06 commit 7fc7a7c

File tree

6 files changed

+105
-83
lines changed

6 files changed

+105
-83
lines changed

src/backend/access/heap/heapam.c

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.277 2009/06/11 14:48:53 momjian Exp $
11+
* $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.278 2009/08/24 02:18:31 tgl Exp $
1212
*
1313
*
1414
* INTERFACE ROUTINES
@@ -78,7 +78,8 @@ static HeapScanDesc heap_beginscan_internal(Relation relation,
7878
bool allow_strat, bool allow_sync,
7979
bool is_bitmapscan);
8080
static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
81-
ItemPointerData from, Buffer newbuf, HeapTuple newtup, bool move);
81+
ItemPointerData from, Buffer newbuf, HeapTuple newtup, bool move,
82+
bool all_visible_cleared, bool new_all_visible_cleared);
8283
static bool HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
8384
HeapTuple oldtup, HeapTuple newtup);
8485

@@ -2760,21 +2761,29 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
27602761
/* record address of new tuple in t_ctid of old one */
27612762
oldtup.t_data->t_ctid = heaptup->t_self;
27622763

2764+
/* clear PD_ALL_VISIBLE flags */
2765+
if (PageIsAllVisible(BufferGetPage(buffer)))
2766+
{
2767+
all_visible_cleared = true;
2768+
PageClearAllVisible(BufferGetPage(buffer));
2769+
}
2770+
if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
2771+
{
2772+
all_visible_cleared_new = true;
2773+
PageClearAllVisible(BufferGetPage(newbuf));
2774+
}
2775+
27632776
if (newbuf != buffer)
27642777
MarkBufferDirty(newbuf);
27652778
MarkBufferDirty(buffer);
27662779

2767-
/*
2768-
* Note: we mustn't clear PD_ALL_VISIBLE flags before writing the WAL
2769-
* record, because log_heap_update looks at those flags to set the
2770-
* corresponding flags in the WAL record.
2771-
*/
2772-
27732780
/* XLOG stuff */
27742781
if (!relation->rd_istemp)
27752782
{
27762783
XLogRecPtr recptr = log_heap_update(relation, buffer, oldtup.t_self,
2777-
newbuf, heaptup, false);
2784+
newbuf, heaptup, false,
2785+
all_visible_cleared,
2786+
all_visible_cleared_new);
27782787

27792788
if (newbuf != buffer)
27802789
{
@@ -2785,18 +2794,6 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
27852794
PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
27862795
}
27872796

2788-
/* Clear PD_ALL_VISIBLE flags */
2789-
if (PageIsAllVisible(BufferGetPage(buffer)))
2790-
{
2791-
all_visible_cleared = true;
2792-
PageClearAllVisible(BufferGetPage(buffer));
2793-
}
2794-
if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
2795-
{
2796-
all_visible_cleared_new = true;
2797-
PageClearAllVisible(BufferGetPage(newbuf));
2798-
}
2799-
28002797
END_CRIT_SECTION();
28012798

28022799
if (newbuf != buffer)
@@ -3910,7 +3907,8 @@ log_heap_freeze(Relation reln, Buffer buffer,
39103907
*/
39113908
static XLogRecPtr
39123909
log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
3913-
Buffer newbuf, HeapTuple newtup, bool move)
3910+
Buffer newbuf, HeapTuple newtup, bool move,
3911+
bool all_visible_cleared, bool new_all_visible_cleared)
39143912
{
39153913
/*
39163914
* Note: xlhdr is declared to have adequate size and correct alignment for
@@ -3946,9 +3944,9 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
39463944

39473945
xlrec.target.node = reln->rd_node;
39483946
xlrec.target.tid = from;
3949-
xlrec.all_visible_cleared = PageIsAllVisible(BufferGetPage(oldbuf));
3947+
xlrec.all_visible_cleared = all_visible_cleared;
39503948
xlrec.newtid = newtup->t_self;
3951-
xlrec.new_all_visible_cleared = PageIsAllVisible(BufferGetPage(newbuf));
3949+
xlrec.new_all_visible_cleared = new_all_visible_cleared;
39523950

39533951
rdata[0].data = (char *) &xlrec;
39543952
rdata[0].len = SizeOfHeapUpdate;
@@ -4015,9 +4013,11 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
40154013
*/
40164014
XLogRecPtr
40174015
log_heap_move(Relation reln, Buffer oldbuf, ItemPointerData from,
4018-
Buffer newbuf, HeapTuple newtup)
4016+
Buffer newbuf, HeapTuple newtup,
4017+
bool all_visible_cleared, bool new_all_visible_cleared)
40194018
{
4020-
return log_heap_update(reln, oldbuf, from, newbuf, newtup, true);
4019+
return log_heap_update(reln, oldbuf, from, newbuf, newtup, true,
4020+
all_visible_cleared, new_all_visible_cleared);
40214021
}
40224022

40234023
/*
@@ -4222,7 +4222,7 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
42224222
blkno = ItemPointerGetBlockNumber(&(xlrec->target.tid));
42234223

42244224
/*
4225-
* The visibility map always needs to be updated, even if the heap page is
4225+
* The visibility map may need to be fixed even if the heap page is
42264226
* already up-to-date.
42274227
*/
42284228
if (xlrec->all_visible_cleared)
@@ -4300,7 +4300,7 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
43004300
blkno = ItemPointerGetBlockNumber(&(xlrec->target.tid));
43014301

43024302
/*
4303-
* The visibility map always needs to be updated, even if the heap page is
4303+
* The visibility map may need to be fixed even if the heap page is
43044304
* already up-to-date.
43054305
*/
43064306
if (xlrec->all_visible_cleared)
@@ -4412,7 +4412,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update)
44124412
Size freespace;
44134413

44144414
/*
4415-
* The visibility map always needs to be updated, even if the heap page is
4415+
* The visibility map may need to be fixed even if the heap page is
44164416
* already up-to-date.
44174417
*/
44184418
if (xlrec->all_visible_cleared)
@@ -4507,7 +4507,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update)
45074507
newt:;
45084508

45094509
/*
4510-
* The visibility map always needs to be updated, even if the heap page is
4510+
* The visibility map may need to be fixed even if the heap page is
45114511
* already up-to-date.
45124512
*/
45134513
if (xlrec->new_all_visible_cleared)

src/backend/access/heap/visibilitymap.c

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/access/heap/visibilitymap.c,v 1.5 2009/06/18 10:08:08 heikki Exp $
11+
* $PostgreSQL: pgsql/src/backend/access/heap/visibilitymap.c,v 1.6 2009/08/24 02:18:31 tgl Exp $
1212
*
1313
* INTERFACE ROUTINES
1414
* visibilitymap_clear - clear a bit in the visibility map
@@ -19,10 +19,10 @@
1919
* NOTES
2020
*
2121
* The visibility map is a bitmap with one bit per heap page. A set bit means
22-
* that all tuples on the page are visible to all transactions, and doesn't
23-
* therefore need to be vacuumed. The map is conservative in the sense that we
24-
* make sure that whenever a bit is set, we know the condition is true, but if
25-
* a bit is not set, it might or might not be.
22+
* that all tuples on the page are known visible to all transactions, and
23+
* therefore the page doesn't need to be vacuumed. The map is conservative in
24+
* the sense that we make sure that whenever a bit is set, we know the
25+
* condition is true, but if a bit is not set, it might or might not be true.
2626
*
2727
* There's no explicit WAL logging in the functions in this file. The callers
2828
* must make sure that whenever a bit is cleared, the bit is cleared on WAL
@@ -34,10 +34,11 @@
3434
* make VACUUM skip pages that need vacuuming, until the next anti-wraparound
3535
* vacuum. The visibility map is not used for anti-wraparound vacuums, because
3636
* an anti-wraparound vacuum needs to freeze tuples and observe the latest xid
37-
* present in the table, also on pages that don't have any dead tuples.
37+
* present in the table, even on pages that don't have any dead tuples.
3838
*
3939
* Although the visibility map is just a hint at the moment, the PD_ALL_VISIBLE
40-
* flag on heap pages *must* be correct.
40+
* flag on heap pages *must* be correct, because it is used to skip visibility
41+
* checking.
4142
*
4243
* LOCKING
4344
*
@@ -55,33 +56,34 @@
5556
* When a bit is set, the LSN of the visibility map page is updated to make
5657
* sure that the visibility map update doesn't get written to disk before the
5758
* WAL record of the changes that made it possible to set the bit is flushed.
58-
* But when a bit is cleared, we don't have to do that because it's always OK
59-
* to clear a bit in the map from correctness point of view.
59+
* But when a bit is cleared, we don't have to do that because it's always
60+
* safe to clear a bit in the map from correctness point of view.
6061
*
6162
* TODO
6263
*
63-
* It would be nice to use the visibility map to skip visibility checkes in
64+
* It would be nice to use the visibility map to skip visibility checks in
6465
* index scans.
6566
*
6667
* Currently, the visibility map is not 100% correct all the time.
6768
* During updates, the bit in the visibility map is cleared after releasing
68-
* the lock on the heap page. During the window after releasing the lock
69+
* the lock on the heap page. During the window between releasing the lock
6970
* and clearing the bit in the visibility map, the bit in the visibility map
7071
* is set, but the new insertion or deletion is not yet visible to other
7172
* backends.
7273
*
7374
* That might actually be OK for the index scans, though. The newly inserted
7475
* tuple wouldn't have an index pointer yet, so all tuples reachable from an
7576
* index would still be visible to all other backends, and deletions wouldn't
76-
* be visible to other backends yet.
77+
* be visible to other backends yet. (But HOT breaks that argument, no?)
7778
*
7879
* There's another hole in the way the PD_ALL_VISIBLE flag is set. When
7980
* vacuum observes that all tuples are visible to all, it sets the flag on
8081
* the heap page, and also sets the bit in the visibility map. If we then
8182
* crash, and only the visibility map page was flushed to disk, we'll have
8283
* a bit set in the visibility map, but the corresponding flag on the heap
8384
* page is not set. If the heap page is then updated, the updater won't
84-
* know to clear the bit in the visibility map.
85+
* know to clear the bit in the visibility map. (Isn't that prevented by
86+
* the LSN interlock?)
8587
*
8688
*-------------------------------------------------------------------------
8789
*/

src/backend/commands/vacuum.c

Lines changed: 45 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
*
1414
*
1515
* IDENTIFICATION
16-
* $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.389 2009/06/11 14:48:56 momjian Exp $
16+
* $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.390 2009/08/24 02:18:31 tgl Exp $
1717
*
1818
*-------------------------------------------------------------------------
1919
*/
@@ -2935,6 +2935,8 @@ move_chain_tuple(Relation rel,
29352935
OffsetNumber newoff;
29362936
ItemId newitemid;
29372937
Size tuple_len = old_tup->t_len;
2938+
bool all_visible_cleared = false;
2939+
bool all_visible_cleared_new = false;
29382940

29392941
/*
29402942
* make a modifiable copy of the source tuple.
@@ -3019,6 +3021,18 @@ move_chain_tuple(Relation rel,
30193021
newtup.t_data->t_ctid = *ctid;
30203022
*ctid = newtup.t_self;
30213023

3024+
/* clear PD_ALL_VISIBLE flags */
3025+
if (PageIsAllVisible(old_page))
3026+
{
3027+
all_visible_cleared = true;
3028+
PageClearAllVisible(old_page);
3029+
}
3030+
if (dst_buf != old_buf && PageIsAllVisible(dst_page))
3031+
{
3032+
all_visible_cleared_new = true;
3033+
PageClearAllVisible(dst_page);
3034+
}
3035+
30223036
MarkBufferDirty(dst_buf);
30233037
if (dst_buf != old_buf)
30243038
MarkBufferDirty(old_buf);
@@ -3027,7 +3041,9 @@ move_chain_tuple(Relation rel,
30273041
if (!rel->rd_istemp)
30283042
{
30293043
XLogRecPtr recptr = log_heap_move(rel, old_buf, old_tup->t_self,
3030-
dst_buf, &newtup);
3044+
dst_buf, &newtup,
3045+
all_visible_cleared,
3046+
all_visible_cleared_new);
30313047

30323048
if (old_buf != dst_buf)
30333049
{
@@ -3040,17 +3056,14 @@ move_chain_tuple(Relation rel,
30403056

30413057
END_CRIT_SECTION();
30423058

3043-
PageClearAllVisible(BufferGetPage(old_buf));
3044-
if (dst_buf != old_buf)
3045-
PageClearAllVisible(BufferGetPage(dst_buf));
3046-
30473059
LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK);
30483060
if (dst_buf != old_buf)
30493061
LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);
30503062

3051-
/* Clear the bits in the visibility map. */
3052-
visibilitymap_clear(rel, BufferGetBlockNumber(old_buf));
3053-
if (dst_buf != old_buf)
3063+
/* Clear bits in visibility map */
3064+
if (all_visible_cleared)
3065+
visibilitymap_clear(rel, BufferGetBlockNumber(old_buf));
3066+
if (all_visible_cleared_new)
30543067
visibilitymap_clear(rel, BufferGetBlockNumber(dst_buf));
30553068

30563069
/* Create index entries for the moved tuple */
@@ -3084,6 +3097,8 @@ move_plain_tuple(Relation rel,
30843097
OffsetNumber newoff;
30853098
ItemId newitemid;
30863099
Size tuple_len = old_tup->t_len;
3100+
bool all_visible_cleared = false;
3101+
bool all_visible_cleared_new = false;
30873102

30883103
/* copy tuple */
30893104
heap_copytuple_with_tuple(old_tup, &newtup);
@@ -3134,14 +3149,28 @@ move_plain_tuple(Relation rel,
31343149
old_tup->t_data->t_infomask |= HEAP_MOVED_OFF;
31353150
HeapTupleHeaderSetXvac(old_tup->t_data, myXID);
31363151

3152+
/* clear PD_ALL_VISIBLE flags */
3153+
if (PageIsAllVisible(old_page))
3154+
{
3155+
all_visible_cleared = true;
3156+
PageClearAllVisible(old_page);
3157+
}
3158+
if (PageIsAllVisible(dst_page))
3159+
{
3160+
all_visible_cleared_new = true;
3161+
PageClearAllVisible(dst_page);
3162+
}
3163+
31373164
MarkBufferDirty(dst_buf);
31383165
MarkBufferDirty(old_buf);
31393166

31403167
/* XLOG stuff */
31413168
if (!rel->rd_istemp)
31423169
{
31433170
XLogRecPtr recptr = log_heap_move(rel, old_buf, old_tup->t_self,
3144-
dst_buf, &newtup);
3171+
dst_buf, &newtup,
3172+
all_visible_cleared,
3173+
all_visible_cleared_new);
31453174

31463175
PageSetLSN(old_page, recptr);
31473176
PageSetTLI(old_page, ThisTimeLineID);
@@ -3151,29 +3180,18 @@ move_plain_tuple(Relation rel,
31513180

31523181
END_CRIT_SECTION();
31533182

3154-
/*
3155-
* Clear the visible-to-all hint bits on the page, and bits in the
3156-
* visibility map. Normally we'd release the locks on the heap pages
3157-
* before updating the visibility map, but doesn't really matter here
3158-
* because we're holding an AccessExclusiveLock on the relation anyway.
3159-
*/
3160-
if (PageIsAllVisible(dst_page))
3161-
{
3162-
PageClearAllVisible(dst_page);
3163-
visibilitymap_clear(rel, BufferGetBlockNumber(dst_buf));
3164-
}
3165-
if (PageIsAllVisible(old_page))
3166-
{
3167-
PageClearAllVisible(old_page);
3168-
visibilitymap_clear(rel, BufferGetBlockNumber(old_buf));
3169-
}
3170-
31713183
dst_vacpage->free = PageGetFreeSpaceWithFillFactor(rel, dst_page);
31723184
LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK);
31733185
LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);
31743186

31753187
dst_vacpage->offsets_used++;
31763188

3189+
/* Clear bits in visibility map */
3190+
if (all_visible_cleared)
3191+
visibilitymap_clear(rel, BufferGetBlockNumber(old_buf));
3192+
if (all_visible_cleared_new)
3193+
visibilitymap_clear(rel, BufferGetBlockNumber(dst_buf));
3194+
31773195
/* insert index' tuples if needed */
31783196
if (ec->resultRelInfo->ri_NumIndices > 0)
31793197
{

0 commit comments

Comments
 (0)