@@ -148,6 +148,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel,
148
148
bool rootdescend );
149
149
static BtreeLevel bt_check_level_from_leftmost (BtreeCheckState * state ,
150
150
BtreeLevel level );
151
+ static bool bt_leftmost_ignoring_half_dead (BtreeCheckState * state ,
152
+ BlockNumber start ,
153
+ BTPageOpaque start_opaque );
151
154
static void bt_recheck_sibling_links (BtreeCheckState * state ,
152
155
BlockNumber btpo_prev_from_target ,
153
156
BlockNumber leftcurrent );
@@ -776,7 +779,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
776
779
*/
777
780
if (state -> readonly )
778
781
{
779
- if (!P_LEFTMOST ( opaque ))
782
+ if (!bt_leftmost_ignoring_half_dead ( state , current , opaque ))
780
783
ereport (ERROR ,
781
784
(errcode (ERRCODE_INDEX_CORRUPTED ),
782
785
errmsg ("block %u is not leftmost in index \"%s\"" ,
@@ -830,8 +833,16 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
830
833
*/
831
834
}
832
835
833
- /* Sibling links should be in mutual agreement */
834
- if (opaque -> btpo_prev != leftcurrent )
836
+ /*
837
+ * Sibling links should be in mutual agreement. There arises
838
+ * leftcurrent == P_NONE && btpo_prev != P_NONE when the left sibling
839
+ * of the parent's low-key downlink is half-dead. (A half-dead page
840
+ * has no downlink from its parent.) Under heavyweight locking, the
841
+ * last bt_leftmost_ignoring_half_dead() validated this btpo_prev.
842
+ * Without heavyweight locking, validation of the P_NONE case remains
843
+ * unimplemented.
844
+ */
845
+ if (opaque -> btpo_prev != leftcurrent && leftcurrent != P_NONE )
835
846
bt_recheck_sibling_links (state , opaque -> btpo_prev , leftcurrent );
836
847
837
848
/* Check level */
@@ -912,6 +923,66 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
912
923
return nextleveldown ;
913
924
}
914
925
926
+ /*
927
+ * Like P_LEFTMOST(start_opaque), but accept an arbitrarily-long chain of
928
+ * half-dead, sibling-linked pages to the left. If a half-dead page appears
929
+ * under state->readonly, the database exited recovery between the first-stage
930
+ * and second-stage WAL records of a deletion.
+ *
+ * state: check state. This routine reads state->readonly and state->rel and
+ * hands state to palloc_btree_page() for every page it visits.
+ * start: block number used as the origin of the walk. NOTE(review): the
+ * callers visible in this diff pass it alongside start_opaque, so it is
+ * presumably the block that start_opaque was read from -- confirm.
+ * start_opaque: opaque area of the starting page; only its btpo_prev is
+ * read here.
+ *
+ * Returns true when following btpo_prev from start reaches P_NONE, either
+ * immediately (start itself has no left sibling) or after passing only
+ * pages that satisfy the tests in the loop below. Returns false as soon as
+ * one page reached this way fails those tests.
+ *
+ * Each visited page is obtained with palloc_btree_page() and released with
+ * pfree() before the next one is read, so at most one extra page copy is
+ * held at a time.
931
+ */
932
+ static bool
933
+ bt_leftmost_ignoring_half_dead (BtreeCheckState * state ,
934
+ BlockNumber start ,
935
+ BTPageOpaque start_opaque )
936
+ {
937
+ BlockNumber reached = start_opaque -> btpo_prev ,
938
+ reached_from = start ;
939
+ bool all_half_dead = true;
940
+
941
+ /*
942
+ * To handle the !readonly case, we'd need to accept BTP_DELETED pages and
943
+ * potentially observe nbtree/README "Page deletion and backwards scans".
944
+ */
945
+ Assert (state -> readonly );
946
+
947
+ while (reached != P_NONE && all_half_dead )
948
+ {
949
+ Page page = palloc_btree_page (state , reached );
950
+ BTPageOpaque reached_opaque = BTPageGetOpaque (page );
951
+
952
+ CHECK_FOR_INTERRUPTS ();
953
+
954
+ /*
955
+ * Try to detect btpo_prev circular links. _bt_unlink_halfdead_page()
956
+ * writes that side-links will continue to point to the siblings.
957
+ * Check btpo_next for that property.
+ *
+ * The walk continues only while the page just read is half-dead, is
+ * neither start nor the page we arrived from, and names reached_from
+ * as its right sibling. Any failure clears all_half_dead, which both
+ * ends the loop and becomes the function's false result.
+ *
+ * NOTE(review): reached_opaque is derived from page, so presumably it
+ * must not be used after pfree(page); the code below reads btpo_prev
+ * before that point -- keep that ordering.
958
+ */
959
+ all_half_dead = P_ISHALFDEAD (reached_opaque ) &&
960
+ reached != start &&
961
+ reached != reached_from &&
962
+ reached_opaque -> btpo_next == reached_from ;
963
+ if (all_half_dead )
964
+ {
965
+ XLogRecPtr pagelsn = PageGetLSN (page );
966
+
967
+ /* pagelsn should point to an XLOG_BTREE_MARK_PAGE_HALFDEAD; it is reported at DEBUG1 only, then the walk steps one page further left */
968
+ ereport (DEBUG1 ,
969
+ (errcode (ERRCODE_NO_DATA ),
970
+ errmsg_internal ("harmless interrupted page deletion detected in index \"%s\"" ,
971
+ RelationGetRelationName (state -> rel )),
972
+ errdetail_internal ("Block=%u right block=%u page lsn=%X/%X." ,
973
+ reached , reached_from ,
974
+ LSN_FORMAT_ARGS (pagelsn ))));
975
+
976
+ reached_from = reached ;
977
+ reached = reached_opaque -> btpo_prev ;
978
+ }
979
+
980
+ pfree (page );
981
+ }
982
+
983
+ return all_half_dead ;
984
+ }
985
+
915
986
/*
916
987
* Raise an error when target page's left link does not point back to the
917
988
* previous target page, called leftcurrent here. The leftcurrent page's
@@ -952,6 +1023,9 @@ bt_recheck_sibling_links(BtreeCheckState *state,
952
1023
BlockNumber btpo_prev_from_target ,
953
1024
BlockNumber leftcurrent )
954
1025
{
1026
+ /* passing metapage to BTPageGetOpaque() would give irrelevant findings */
1027
+ Assert (leftcurrent != P_NONE );
1028
+
955
1029
if (!state -> readonly )
956
1030
{
957
1031
Buffer lbuf ;
@@ -1935,7 +2009,8 @@ bt_child_highkey_check(BtreeCheckState *state,
1935
2009
opaque = BTPageGetOpaque (page );
1936
2010
1937
2011
/* The first page we visit at the level should be leftmost */
1938
- if (first && !BlockNumberIsValid (state -> prevrightlink ) && !P_LEFTMOST (opaque ))
2012
+ if (first && !BlockNumberIsValid (state -> prevrightlink ) &&
2013
+ !bt_leftmost_ignoring_half_dead (state , blkno , opaque ))
1939
2014
ereport (ERROR ,
1940
2015
(errcode (ERRCODE_INDEX_CORRUPTED ),
1941
2016
errmsg ("the first child of leftmost target page is not leftmost of its level in index \"%s\"" ,
0 commit comments