postgres
diff --git a/‎src/backend/access/nbtree/Makefile
Lines changed: 1 addition & 1 deletion b/‎src/backend/access/nbtree/Makefile
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/backend/access/nbtree/README
Lines changed: 44 additions & 3 deletions b/‎src/backend/access/nbtree/README
Lines changed: 44 additions & 3 deletions
diff --git a/‎src/backend/access/nbtree/nbtinsert.c
Lines changed: 1 addition & 286 deletions b/‎src/backend/access/nbtree/nbtinsert.c
Lines changed: 1 addition & 286 deletions
@@ -13,6 +13,6 @@ top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
 OBJS = nbtcompare.o nbtinsert.o nbtpage.o nbtree.o nbtsearch.o \
-       nbtutils.o nbtsort.o nbtvalidate.o nbtxlog.o
+       nbtsplitloc.o nbtutils.o nbtsort.o nbtvalidate.o nbtxlog.o
 
 include $(top_srcdir)/src/backend/common.mk
@@ -143,9 +143,9 @@ Lehman and Yao assume fixed-size keys, but we must deal with
 variable-size keys.  Therefore there is not a fixed maximum number of
 keys per page; we just stuff in as many as will fit.  When we split a
 page, we try to equalize the number of bytes, not items, assigned to
-each of the resulting pages.  Note we must include the incoming item in
-this calculation, otherwise it is possible to find that the incoming
-item doesn't fit on the split page where it needs to go!
+pages (though suffix truncation is also considered).  Note we must include
+the incoming item in this calculation, otherwise it is possible to find
+that the incoming item doesn't fit on the split page where it needs to go!
 
 The Deletion Algorithm
 ----------------------
@@ -649,6 +649,47 @@ variable-length types, such as text.  An opclass support function could
 manufacture the shortest possible key value that still correctly separates
 each half of a leaf page split.
 
+There is sophisticated criteria for choosing a leaf page split point.  The
+general idea is to make suffix truncation effective without unduly
+influencing the balance of space for each half of the page split.  The
+choice of leaf split point can be thought of as a choice among points
+*between* items on the page to be split, at least if you pretend that the
+incoming tuple was placed on the page already (you have to pretend because
+there won't actually be enough space for it on the page).  Choosing the
+split point between two index tuples where the first non-equal attribute
+appears as early as possible results in truncating away as many suffix
+attributes as possible.  Evenly balancing space among each half of the
+split is usually the first concern, but even small adjustments in the
+precise split point can allow truncation to be far more effective.
+
+Suffix truncation is primarily valuable because it makes pivot tuples
+smaller, which delays splits of internal pages, but that isn't the only
+reason why it's effective.  Even truncation that doesn't make pivot tuples
+smaller due to alignment still prevents pivot tuples from being more
+restrictive than truly necessary in how they describe which values belong
+on which pages.
+
+While it's not possible to correctly perform suffix truncation during
+internal page splits, it's still useful to be discriminating when splitting
+an internal page.  The split point that implies a downlink be inserted in
+the parent that's the smallest one available within an acceptable range of
+the fillfactor-wise optimal split point is chosen.  This idea also comes
+from the Prefix B-Tree paper.  This process has much in common with what
+happens at the leaf level to make suffix truncation effective.  The overall
+effect is that suffix truncation tends to produce smaller, more
+discriminating pivot tuples, especially early in the lifetime of the index,
+while biasing internal page splits makes the earlier, smaller pivot tuples
+end up in the root page, delaying root page splits.
+
+Logical duplicates are given special consideration.  The logic for
+selecting a split point goes to great lengths to avoid having duplicates
+span more than one page, and almost always manages to pick a split point
+between two user-key-distinct tuples, accepting a completely lopsided split
+if it must.  When a page that's already full of duplicates must be split,
+the fallback strategy assumes that duplicates are mostly inserted in
+ascending heap TID order.  The page is split in a way that leaves the left
+half of the page mostly full, and the right half of the page mostly empty.
+
 Notes About Data Representation
 -------------------------------
 
 
@@ -28,26 +28,6 @@
 /* Minimum tree height for application of fastpath optimization */
 #define BTREE_FASTPATH_MIN_LEVEL	2
 
-typedef struct
-{
-	/* context data for _bt_checksplitloc */
-	Size		newitemsz;		/* size of new item to be inserted */
-	int			fillfactor;		/* needed when splitting rightmost page */
-	bool		is_leaf;		/* T if splitting a leaf page */
-	bool		is_rightmost;	/* T if splitting a rightmost page */
-	OffsetNumber newitemoff;	/* where the new item is to be inserted */
-	int			leftspace;		/* space available for items on left page */
-	int			rightspace;		/* space available for items on right page */
-	int			olddataitemstotal;	/* space taken by old items */
-
-	bool		have_split;		/* found a valid split? */
-
-	/* these fields valid only if have_split is true */
-	bool		newitemonleft;	/* new item on left or right of best split */
-	OffsetNumber firstright;	/* best split point */
-	int			best_delta;		/* best size delta so far */
-} FindSplitData;
-
 
 static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
 
@@ -73,13 +53,6 @@ static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
 		  Size newitemsz, IndexTuple newitem, bool newitemonleft);
 static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
 				  BTStack stack, bool is_root, bool is_only);
-static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
-				 OffsetNumber newitemoff,
-				 Size newitemsz,
-				 bool *newitemonleft);
-static void _bt_checksplitloc(FindSplitData *state,
-				  OffsetNumber firstoldonright, bool newitemonleft,
-				  int dataitemstoleft, Size firstoldonrightsz);
 static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
 			 OffsetNumber itup_off);
 static bool _bt_isequal(TupleDesc itupdesc, BTScanInsert itup_key,
@@ -1003,7 +976,7 @@ _bt_insertonpg(Relation rel,
 
 		/* Choose the split point */
 		firstright = _bt_findsplitloc(rel, page,
-									  newitemoff, itemsz,
+									  newitemoff, itemsz, itup,
 									  &newitemonleft);
 
 		/* split the buffer into left and right halves */
@@ -1687,264 +1660,6 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
 	return rbuf;
 }
 
-/*
- *	_bt_findsplitloc() -- find an appropriate place to split a page.
- *
- * The idea here is to equalize the free space that will be on each split
- * page, *after accounting for the inserted tuple*.  (If we fail to account
- * for it, we might find ourselves with too little room on the page that
- * it needs to go into!)
- *
- * If the page is the rightmost page on its level, we instead try to arrange
- * to leave the left split page fillfactor% full.  In this way, when we are
- * inserting successively increasing keys (consider sequences, timestamps,
- * etc) we will end up with a tree whose pages are about fillfactor% full,
- * instead of the 50% full result that we'd get without this special case.
- * This is the same as nbtsort.c produces for a newly-created tree.  Note
- * that leaf and nonleaf pages use different fillfactors.
- *
- * We are passed the intended insert position of the new tuple, expressed as
- * the offsetnumber of the tuple it must go in front of.  (This could be
- * maxoff+1 if the tuple is to go at the end.)
- *
- * We return the index of the first existing tuple that should go on the
- * righthand page, plus a boolean indicating whether the new tuple goes on
- * the left or right page.  The bool is necessary to disambiguate the case
- * where firstright == newitemoff.
- */
-static OffsetNumber
-_bt_findsplitloc(Relation rel,
-				 Page page,
-				 OffsetNumber newitemoff,
-				 Size newitemsz,
-				 bool *newitemonleft)
-{
-	BTPageOpaque opaque;
-	OffsetNumber offnum;
-	OffsetNumber maxoff;
-	ItemId		itemid;
-	FindSplitData state;
-	int			leftspace,
-				rightspace,
-				goodenough,
-				olddataitemstotal,
-				olddataitemstoleft;
-	bool		goodenoughfound;
-
-	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
-	/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
-	newitemsz += sizeof(ItemIdData);
-
-	/* Total free space available on a btree page, after fixed overhead */
-	leftspace = rightspace =
-		PageGetPageSize(page) - SizeOfPageHeaderData -
-		MAXALIGN(sizeof(BTPageOpaqueData));
-
-	/* The right page will have the same high key as the old page */
-	if (!P_RIGHTMOST(opaque))
-	{
-		itemid = PageGetItemId(page, P_HIKEY);
-		rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
-							 sizeof(ItemIdData));
-	}
-
-	/* Count up total space in data items without actually scanning 'em */
-	olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
-
-	state.newitemsz = newitemsz;
-	state.is_leaf = P_ISLEAF(opaque);
-	state.is_rightmost = P_RIGHTMOST(opaque);
-	state.have_split = false;
-	if (state.is_leaf)
-		state.fillfactor = RelationGetFillFactor(rel,
-												 BTREE_DEFAULT_FILLFACTOR);
-	else
-		state.fillfactor = BTREE_NONLEAF_FILLFACTOR;
-	state.newitemonleft = false;	/* these just to keep compiler quiet */
-	state.firstright = 0;
-	state.best_delta = 0;
-	state.leftspace = leftspace;
-	state.rightspace = rightspace;
-	state.olddataitemstotal = olddataitemstotal;
-	state.newitemoff = newitemoff;
-
-	/*
-	 * Finding the best possible split would require checking all the possible
-	 * split points, because of the high-key and left-key special cases.
-	 * That's probably more work than it's worth; instead, stop as soon as we
-	 * find a "good-enough" split, where good-enough is defined as an
-	 * imbalance in free space of no more than pagesize/16 (arbitrary...) This
-	 * should let us stop near the middle on most pages, instead of plowing to
-	 * the end.
-	 */
-	goodenough = leftspace / 16;
-
-	/*
-	 * Scan through the data items and calculate space usage for a split at
-	 * each possible position.
-	 */
-	olddataitemstoleft = 0;
-	goodenoughfound = false;
-	maxoff = PageGetMaxOffsetNumber(page);
-
-	for (offnum = P_FIRSTDATAKEY(opaque);
-		 offnum <= maxoff;
-		 offnum = OffsetNumberNext(offnum))
-	{
-		Size		itemsz;
-
-		itemid = PageGetItemId(page, offnum);
-		itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
-
-		/*
-		 * Will the new item go to left or right of split?
-		 */
-		if (offnum > newitemoff)
-			_bt_checksplitloc(&state, offnum, true,
-							  olddataitemstoleft, itemsz);
-
-		else if (offnum < newitemoff)
-			_bt_checksplitloc(&state, offnum, false,
-							  olddataitemstoleft, itemsz);
-		else
-		{
-			/* need to try it both ways! */
-			_bt_checksplitloc(&state, offnum, true,
-							  olddataitemstoleft, itemsz);
-
-			_bt_checksplitloc(&state, offnum, false,
-							  olddataitemstoleft, itemsz);
-		}
-
-		/* Abort scan once we find a good-enough choice */
-		if (state.have_split && state.best_delta <= goodenough)
-		{
-			goodenoughfound = true;
-			break;
-		}
-
-		olddataitemstoleft += itemsz;
-	}
-
-	/*
-	 * If the new item goes as the last item, check for splitting so that all
-	 * the old items go to the left page and the new item goes to the right
-	 * page.
-	 */
-	if (newitemoff > maxoff && !goodenoughfound)
-		_bt_checksplitloc(&state, newitemoff, false, olddataitemstotal, 0);
-
-	/*
-	 * I believe it is not possible to fail to find a feasible split, but just
-	 * in case ...
-	 */
-	if (!state.have_split)
-		elog(ERROR, "could not find a feasible split point for index \"%s\"",
-			 RelationGetRelationName(rel));
-
-	*newitemonleft = state.newitemonleft;
-	return state.firstright;
-}
-
-/*
- * Subroutine to analyze a particular possible split choice (ie, firstright
- * and newitemonleft settings), and record the best split so far in *state.
- *
- * firstoldonright is the offset of the first item on the original page
- * that goes to the right page, and firstoldonrightsz is the size of that
- * tuple. firstoldonright can be > max offset, which means that all the old
- * items go to the left page and only the new item goes to the right page.
- * In that case, firstoldonrightsz is not used.
- *
- * olddataitemstoleft is the total size of all old items to the left of
- * firstoldonright.
- */
-static void
-_bt_checksplitloc(FindSplitData *state,
-				  OffsetNumber firstoldonright,
-				  bool newitemonleft,
-				  int olddataitemstoleft,
-				  Size firstoldonrightsz)
-{
-	int			leftfree,
-				rightfree;
-	Size		firstrightitemsz;
-	bool		newitemisfirstonright;
-
-	/* Is the new item going to be the first item on the right page? */
-	newitemisfirstonright = (firstoldonright == state->newitemoff
-							 && !newitemonleft);
-
-	if (newitemisfirstonright)
-		firstrightitemsz = state->newitemsz;
-	else
-		firstrightitemsz = firstoldonrightsz;
-
-	/* Account for all the old tuples */
-	leftfree = state->leftspace - olddataitemstoleft;
-	rightfree = state->rightspace -
-		(state->olddataitemstotal - olddataitemstoleft);
-
-	/*
-	 * The first item on the right page becomes the high key of the left page;
-	 * therefore it counts against left space as well as right space. When
-	 * index has included attributes, then those attributes of left page high
-	 * key will be truncated leaving that page with slightly more free space.
-	 * However, that shouldn't affect our ability to find valid split
-	 * location, because anyway split location should exists even without high
-	 * key truncation.
-	 */
-	leftfree -= firstrightitemsz;
-
-	/* account for the new item */
-	if (newitemonleft)
-		leftfree -= (int) state->newitemsz;
-	else
-		rightfree -= (int) state->newitemsz;
-
-	/*
-	 * If we are not on the leaf level, we will be able to discard the key
-	 * data from the first item that winds up on the right page.
-	 */
-	if (!state->is_leaf)
-		rightfree += (int) firstrightitemsz -
-			(int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
-
-	/*
-	 * If feasible split point, remember best delta.
-	 */
-	if (leftfree >= 0 && rightfree >= 0)
-	{
-		int			delta;
-
-		if (state->is_rightmost)
-		{
-			/*
-			 * If splitting a rightmost page, try to put (100-fillfactor)% of
-			 * free space on left page. See comments for _bt_findsplitloc.
-			 */
-			delta = (state->fillfactor * leftfree)
-				- ((100 - state->fillfactor) * rightfree);
-		}
-		else
-		{
-			/* Otherwise, aim for equal free space on both sides */
-			delta = leftfree - rightfree;
-		}
-
-		if (delta < 0)
-			delta = -delta;
-		if (!state->have_split || delta < state->best_delta)
-		{
-			state->have_split = true;
-			state->newitemonleft = newitemonleft;
-			state->firstright = firstoldonright;
-			state->best_delta = delta;
-		}
-	}
-}
-
 /*
  * _bt_insert_parent() -- Insert downlink into parent after a page split.
  *