Further cleanup of gistsplit.c.

tglsfdc · tglsfdc · commit c352ea2d74c4 · 2013-02-10T16:21:26.000-05:00
After further reflection I was unconvinced that the existing coding is
guaranteed to return valid union datums in every code path for multi-column
indexes.  Fix that by forcing a gistunionsubkey() call at the end of the
recursion.  Having done that, we can remove some clearly-redundant calls
elsewhere.  This should be a little faster for multi-column indexes (since
the previous coding would uselessly do such a call for each column while
unwinding the recursion), as well as much harder to break.

Also, simplify the handling of cases where one side or the other of a
primary split contains only don't-care tuples.  The previous coding used a
very ugly hack in removeDontCares() that essentially forced one random
tuple to be treated as non-don't-care, providing a random initial choice of
seed datum for the secondary split.  It seems unlikely that that method
will give better-than-random splits.  Instead, treat such a split as
degenerate and just let the next column determine the split, the same way
that we handle fully degenerate cases where the two sides produce identical
union datums.
diff --git a/src/backend/access/gist/gistsplit.c b/src/backend/access/gist/gistsplit.c
@@ -162,23 +162,16 @@ findDontCares(Relation r, GISTSTATE *giststate, GISTENTRY *valvec,
  * Remove tuples that are marked don't-cares from the tuple index array a[]
  * of length *len.	This is applied separately to the spl_left and spl_right
  * arrays.
- *
- * Corner case: we do not wish to reduce the index array to zero length.
- * (If we did, then the union key for this side would be null, and having just
- * one of spl_ldatum_exists and spl_rdatum_exists be TRUE might confuse
- * user-defined PickSplit methods.)  To avoid that, we'll forcibly redefine
- * one tuple as non-don't-care if necessary.  Hence, we must be able to adjust
- * caller's NumDontCare count.
  */
 static void
-removeDontCares(OffsetNumber *a, int *len, bool *dontcare, int *NumDontCare)
+removeDontCares(OffsetNumber *a, int *len, const bool *dontcare)
 {
 	int			origlen,
-				curlen,
+				newlen,
 				i;
 	OffsetNumber *curwpos;
 
-	origlen = curlen = *len;
+	origlen = newlen = *len;
 	curwpos = a;
 	for (i = 0; i < origlen; i++)
 	{
@@ -190,18 +183,11 @@ removeDontCares(OffsetNumber *a, int *len, bool *dontcare, int *NumDontCare)
 			*curwpos = ai;
 			curwpos++;
 		}
-		else if (curlen == 1)
-		{
-			/* corner case: don't let array become empty */
-			dontcare[ai] = FALSE;		/* mark item as non-dont-care */
-			*NumDontCare -= 1;
-			i--;				/* reprocess item on next iteration */
-		}
 		else
-			curlen--;
+			newlen--;
 	}
 
-	*len = curlen;
+	*len = newlen;
 }
 
 /*
@@ -304,8 +290,14 @@ supportSecondarySplit(Relation r, GISTSTATE *giststate, int attno,
 					penalty2;
 
 		/*
-		 * there is only one previously defined union, so we just choose swap
-		 * or not by lowest penalty
+		 * There is only one previously defined union, so we just choose swap
+		 * or not by lowest penalty for that side.	We can only get here if a
+		 * secondary split happened to have all NULLs in its column in the
+		 * tuples that the outer recursion level had assigned to one side.
+		 * (Note that the null checks in gistSplitByKey don't prevent the
+		 * case, because they'll only be checking tuples that were considered
+		 * don't-cares at the outer recursion level, not the tuples that went
+		 * into determining the passed-down left and right union keys.)
 		 */
 		penalty1 = gistpenalty(giststate, attno, entry1, false, &entrySL, false);
 		penalty2 = gistpenalty(giststate, attno, entry1, false, &entrySR, false);
@@ -405,13 +397,19 @@ genericPickSplit(GISTSTATE *giststate, GistEntryVector *entryvec, GIST_SPLITVEC
  * two vectors.
  *
  * Returns FALSE if split is complete (there are no more index columns, or
- * there is no need to consider them).	Note that in this case the union
- * keys for all columns must be computed here.
- * Returns TRUE and v->spl_dontcare = NULL if left and right unions of attno
- * column are the same, so we should split on next column instead.
+ * there is no need to consider them because split is optimal already).
+ *
+ * Returns TRUE and v->spl_dontcare = NULL if the picksplit result is
+ * degenerate (all tuples seem to be don't-cares), so we should just
+ * disregard this column and split on the next column(s) instead.
+ *
  * Returns TRUE and v->spl_dontcare != NULL if there are don't-care tuples
  * that could be relocated based on the next column(s).  The don't-care
  * tuples have been removed from the split and must be reinserted by caller.
+ * There is at least one non-don't-care tuple on each side of the split,
+ * and union keys for all columns are updated to include just those tuples.
+ *
+ * A TRUE result implies there is at least one more index column.
  */
 static bool
 gistUserPicksplit(Relation r, GistEntryVector *entryvec, int attno, GistSplitVector *v,
@@ -483,49 +481,57 @@ gistUserPicksplit(Relation r, GistEntryVector *entryvec, int attno, GistSplitVec
 
 	/*
 	 * If index columns remain, then consider whether we can improve the split
-	 * by using them.  Even if we can't, we must compute union keys for those
-	 * columns before we can return FALSE.
+	 * by using them.
 	 */
 	v->spl_dontcare = NULL;
 
 	if (attno + 1 < giststate->tupdesc->natts)
 	{
 		int			NumDontCare;
 
+		/*
+		 * Make a quick check to see if left and right union keys are equal;
+		 * if so, the split is certainly degenerate, so tell caller to
+		 * re-split with the next column.
+		 */
 		if (gistKeyIsEQ(giststate, attno, sv->spl_ldatum, sv->spl_rdatum))
-		{
-			/*
-			 * Left and right union keys are equal, so we can get better split
-			 * by considering next column.
-			 */
 			return true;
-		}
 
 		/*
-		 * Locate don't-care tuples, if any
+		 * Locate don't-care tuples, if any.  If there are none, the split is
+		 * optimal, so just fall out and return false.
 		 */
 		v->spl_dontcare = (bool *) palloc0(sizeof(bool) * (entryvec->n + 1));
 
 		NumDontCare = findDontCares(r, giststate, entryvec->vector, v, attno);
 
-		if (NumDontCare == 0)
+		if (NumDontCare > 0)
 		{
 			/*
-			 * There are no don't-cares, so just compute the union keys for
-			 * remaining columns and we're done.
+			 * Remove don't-cares from spl_left[] and spl_right[].
 			 */
-			gistunionsubkey(giststate, itup, v);
-		}
-		else
-		{
+			removeDontCares(sv->spl_left, &sv->spl_nleft, v->spl_dontcare);
+			removeDontCares(sv->spl_right, &sv->spl_nright, v->spl_dontcare);
+
 			/*
-			 * Remove don't-cares from spl_left[] and spl_right[].  NOTE: this
-			 * could reduce NumDontCare to zero.
+			 * If all tuples on either side were don't-cares, the split is
+			 * degenerate, and we're best off to ignore it and split on the
+			 * next column.  (We used to try to press on with a secondary
+			 * split by forcing a random tuple on each side to be treated as
+			 * non-don't-care, but it seems unlikely that that technique
+			 * really gives a better result.  Note that we don't want to try a
+			 * secondary split with empty left or right primary split sides,
+			 * because then there is no union key on that side for the
+			 * PickSplit function to try to expand, so it can have no good
+			 * figure of merit for what it's doing.  Also note that this check
+			 * ensures we can't produce a bogus one-side-only split in the
+			 * NumDontCare == 1 special case below.)
 			 */
-			removeDontCares(sv->spl_left, &sv->spl_nleft,
-							v->spl_dontcare, &NumDontCare);
-			removeDontCares(sv->spl_right, &sv->spl_nright,
-							v->spl_dontcare, &NumDontCare);
+			if (sv->spl_nleft == 0 || sv->spl_nright == 0)
+			{
+				v->spl_dontcare = NULL;
+				return true;
+			}
 
 			/*
 			 * Recompute union keys, considering only non-don't-care tuples.
@@ -541,7 +547,9 @@ gistUserPicksplit(Relation r, GistEntryVector *entryvec, int attno, GistSplitVec
 				/*
 				 * If there's only one don't-care tuple then we can't do a
 				 * PickSplit on it, so just choose whether to send it left or
-				 * right by comparing penalties.
+				 * right by comparing penalties.  We needed the
+				 * gistunionsubkey step anyway so that we have appropriate
+				 * union keys for figuring the penalties.
 				 */
 				OffsetNumber toMove;
 
@@ -556,13 +564,14 @@ gistUserPicksplit(Relation r, GistEntryVector *entryvec, int attno, GistSplitVec
 				/* ... and assign it to cheaper side */
 				placeOne(r, giststate, v, itup[toMove - 1], toMove, attno + 1);
 
-				/* recompute the union keys including this tuple */
-				v->spl_dontcare = NULL;
-				gistunionsubkey(giststate, itup, v);
+				/*
+				 * At this point the union keys are wrong, but we don't care
+				 * because we're done splitting.  The outermost recursion
+				 * level of gistSplitByKey will fix things before returning.
+				 */
 			}
-			else if (NumDontCare > 1)
+			else
 				return true;
-			/* else NumDontCare is now zero; handle same as above */
 		}
 	}
 
@@ -648,7 +657,7 @@ gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len,
 		 */
 		v->spl_risnull[attno] = v->spl_lisnull[attno] = TRUE;
 
-		if (attno + 1 < r->rd_att->natts)
+		if (attno + 1 < giststate->tupdesc->natts)
 			gistSplitByKey(r, page, itup, len, giststate, v, attno + 1);
 		else
 			gistSplitHalf(&v->splitVector, len);
@@ -673,28 +682,31 @@ gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len,
 			else
 				v->splitVector.spl_left[v->splitVector.spl_nleft++] = i;
 
-		/* Must compute union keys for this and any following columns */
-		v->spl_dontcare = NULL;
-		gistunionsubkey(giststate, itup, v);
+		/* Compute union keys, unless outer recursion level will handle it */
+		if (attno == 0 && giststate->tupdesc->natts == 1)
+		{
+			v->spl_dontcare = NULL;
+			gistunionsubkey(giststate, itup, v);
+		}
 	}
 	else
 	{
 		/*
-		 * all keys are not-null, so apply user-defined PickSplit method
+		 * All keys are not-null, so apply user-defined PickSplit method
 		 */
 		if (gistUserPicksplit(r, entryvec, attno, v, itup, len, giststate))
 		{
 			/*
 			 * Splitting on attno column is not optimal, so consider
 			 * redistributing don't-care tuples according to the next column
 			 */
-			Assert(attno + 1 < r->rd_att->natts);
+			Assert(attno + 1 < giststate->tupdesc->natts);
 
 			if (v->spl_dontcare == NULL)
 			{
 				/*
-				 * Simple case: left and right keys for attno column are
-				 * equal, so just split according to the next column.
+				 * This split was actually degenerate, so ignore it altogether
+				 * and just split according to the next column.
 				 */
 				gistSplitByKey(r, page, itup, len, giststate, v, attno + 1);
 			}
@@ -741,10 +753,27 @@ gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len,
 					backupSplit.spl_right[backupSplit.spl_nright++] = map[v->splitVector.spl_right[i] - 1];
 
 				v->splitVector = backupSplit;
-
-				/* recompute left and right union datums */
-				gistunionsubkey(giststate, itup, v);
 			}
 		}
 	}
+
+	/*
+	 * If we're handling a multicolumn index, at the end of the recursion
+	 * recompute the left and right union datums for all index columns.  This
+	 * makes sure we hand back correct union datums in all corner cases,
+	 * including when we haven't processed all columns to start with, or when
+	 * a secondary split moved "don't care" tuples from one side to the other
+	 * (we really shouldn't assume that that didn't change the union datums).
+	 *
+	 * Note: when we're in an internal recursion (attno > 0), we do not worry
+	 * about whether the union datums we return with are sensible, since
+	 * calling levels won't care.  Also, in a single-column index, we expect
+	 * that PickSplit (or the special cases above) produced correct union
+	 * datums.
+	 */
+	if (attno == 0 && giststate->tupdesc->natts > 1)
+	{
+		v->spl_dontcare = NULL;
+		gistunionsubkey(giststate, itup, v);
+	}
 }