Phase 2 of hashed-aggregation project. nodeAgg.c now knows how to do
hashed aggregation, but there's not yet planner support for it.

tglsfdc · tglsfdc · commit 2103b7baa26f · 2002-11-06T22:31:24.000Z
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
diff --git a/src/backend/executor/nodeGroup.c b/src/backend/executor/nodeGroup.c
@@ -15,7 +15,7 @@
  *	  locate group boundaries.
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeGroup.c,v 1.48 2002/11/06 00:00:43 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeGroup.c,v 1.49 2002/11/06 22:31:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -151,9 +151,8 @@ ExecInitGroup(Group *node, EState *estate, Plan *parent)
 	 */
 	grpstate = makeNode(GroupState);
 	node->grpstate = grpstate;
-	grpstate->grp_useFirstTuple = FALSE;
-	grpstate->grp_done = FALSE;
 	grpstate->grp_firstTuple = NULL;
+	grpstate->grp_done = FALSE;
 
 	/*
 	 * create expression context
@@ -236,7 +235,6 @@ ExecReScanGroup(Group *node, ExprContext *exprCtxt, Plan *parent)
 {
 	GroupState *grpstate = node->grpstate;
 
-	grpstate->grp_useFirstTuple = FALSE;
 	grpstate->grp_done = FALSE;
 	if (grpstate->grp_firstTuple != NULL)
 	{
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
@@ -7,7 +7,8 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
- *	$Id: nodeHash.c,v 1.66 2002/09/04 20:31:18 momjian Exp $
+ * IDENTIFICATION
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeHash.c,v 1.67 2002/11/06 22:31:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -31,8 +32,6 @@
 #include "utils/lsyscache.h"
 
 
-static uint32 hashFunc(Datum key, int typLen, bool byVal);
-
 /* ----------------------------------------------------------------
  *		ExecHash
  *
@@ -532,7 +531,7 @@ ExecHashGetBucket(HashJoinTable hashtable,
 
 	/*
 	 * We reset the eval context each time to reclaim any memory leaked in
-	 * the hashkey expression or hashFunc itself.
+	 * the hashkey expression or ComputeHashFunc itself.
 	 */
 	ResetExprContext(econtext);
 
@@ -550,9 +549,9 @@ ExecHashGetBucket(HashJoinTable hashtable,
 		bucketno = 0;
 	else
 	{
-		bucketno = hashFunc(keyval,
-							(int) hashtable->typLen,
-							hashtable->typByVal)
+		bucketno = ComputeHashFunc(keyval,
+								   (int) hashtable->typLen,
+								   hashtable->typByVal)
 			% (uint32) hashtable->totalbuckets;
 	}
 
@@ -622,16 +621,16 @@ ExecScanHashBucket(HashJoinState *hjstate,
 }
 
 /* ----------------------------------------------------------------
- *		hashFunc
+ *		ComputeHashFunc
  *
- *		the hash function for hash joins
+ *		the hash function for hash joins (also used for hash aggregation)
  *
  *		XXX this probably ought to be replaced with datatype-specific
  *		hash functions, such as those already implemented for hash indexes.
  * ----------------------------------------------------------------
  */
-static uint32
-hashFunc(Datum key, int typLen, bool byVal)
+uint32
+ComputeHashFunc(Datum key, int typLen, bool byVal)
 {
 	unsigned char *k;
 
@@ -681,7 +680,7 @@ hashFunc(Datum key, int typLen, bool byVal)
 		}
 		else
 		{
-			elog(ERROR, "hashFunc: Invalid typLen %d", typLen);
+			elog(ERROR, "ComputeHashFunc: Invalid typLen %d", typLen);
 			k = NULL;			/* keep compiler quiet */
 		}
 	}
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
@@ -15,7 +15,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/nodes/copyfuncs.c,v 1.215 2002/11/06 00:00:43 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/nodes/copyfuncs.c,v 1.216 2002/11/06 22:31:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -524,6 +524,7 @@ _copyAgg(Agg *from)
 		memcpy(newnode->grpColIdx, from->grpColIdx,
 			   from->numCols * sizeof(AttrNumber));
 	}
+	newnode->numGroups = from->numGroups;
 
 	return newnode;
 }
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
@@ -5,7 +5,7 @@
  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- *	$Header: /cvsroot/pgsql/src/backend/nodes/outfuncs.c,v 1.177 2002/11/06 00:00:44 tgl Exp $
+ *	$Header: /cvsroot/pgsql/src/backend/nodes/outfuncs.c,v 1.178 2002/11/06 22:31:24 tgl Exp $
  *
  * NOTES
  *	  Every (plan) node in POSTGRES has an associated "out" routine which
@@ -597,8 +597,8 @@ _outAgg(StringInfo str, Agg *node)
 {
 	appendStringInfo(str, " AGG ");
 	_outPlanInfo(str, (Plan *) node);
-	appendStringInfo(str, " :aggstrategy %d :numCols %d ",
-					 (int) node->aggstrategy, node->numCols);
+	appendStringInfo(str, " :aggstrategy %d :numCols %d :numGroups %ld ",
+					 (int) node->aggstrategy, node->numCols, node->numGroups);
 }
 
 static void
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.120 2002/11/06 00:00:44 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.121 2002/11/06 22:31:24 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1675,6 +1675,7 @@ make_agg(List *tlist, List *qual, AggStrategy aggstrategy,
 		plan->plan_rows *= 0.1;
 		if (plan->plan_rows < 1)
 			plan->plan_rows = 1;
+		node->numGroups = (long) plan->plan_rows;
 	}
 
 	plan->state = (EState *) NULL;
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/plan/planner.c,v 1.126 2002/11/06 00:00:44 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/plan/planner.c,v 1.127 2002/11/06 22:31:24 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -931,6 +931,7 @@ grouping_planner(Query *parse, double tuple_fraction)
 		AttrNumber *groupColIdx = NULL;
 		Path	   *cheapest_path;
 		Path	   *sorted_path;
+		bool		use_hashed_grouping = false;
 
 		/* Preprocess targetlist in case we are inside an INSERT/UPDATE. */
 		tlist = preprocess_targetlist(tlist,
@@ -1209,6 +1210,29 @@ grouping_planner(Query *parse, double tuple_fraction)
 		group_pathkeys = canonicalize_pathkeys(parse, group_pathkeys);
 		sort_pathkeys = canonicalize_pathkeys(parse, sort_pathkeys);
 
+		/*
+		 * Consider whether we might want to use hashed grouping.
+		 */
+		if (parse->groupClause)
+		{
+			/*
+			 * Executor doesn't support hashed aggregation with DISTINCT
+			 * aggregates.  (Doing so would imply storing *all* the input
+			 * values in the hash table, which seems like a certain loser.)
+			 */
+			if (parse->hasAggs &&
+				(contain_distinct_agg_clause((Node *) tlist) ||
+				 contain_distinct_agg_clause(parse->havingQual)))
+				use_hashed_grouping = false;
+			else
+			{
+#if 0							/* much more to do here */
+				/* TEMPORARY HOTWIRE FOR TESTING */
+				use_hashed_grouping = true;
+#endif
+			}
+		}
+
 		/*
 		 * Select the best path and create a plan to execute it.
 		 *
@@ -1279,22 +1303,30 @@ grouping_planner(Query *parse, double tuple_fraction)
 		}
 
 		/*
-		 * If any aggregate is present, insert the Agg node, plus an explicit
-		 * sort if necessary.
+		 * Insert AGG or GROUP node if needed, plus an explicit sort step
+		 * if necessary.
 		 *
 		 * HAVING clause, if any, becomes qual of the Agg node
 		 */
-		if (parse->hasAggs)
+		if (use_hashed_grouping)
 		{
+			/* Hashed aggregate plan --- no sort needed */
+			result_plan = (Plan *) make_agg(tlist,
+											(List *) parse->havingQual,
+											AGG_HASHED,
+											length(parse->groupClause),
+											groupColIdx,
+											result_plan);
+			/* Hashed aggregation produces randomly-ordered results */
+			current_pathkeys = NIL;
+		}
+		else if (parse->hasAggs)
+		{
+			/* Plain aggregate plan --- sort if needed */
 			AggStrategy aggstrategy;
 
 			if (parse->groupClause)
 			{
-				aggstrategy = AGG_SORTED;
-				/*
-				 * Add an explicit sort if we couldn't make the path come out
-				 * the way the AGG node needs it.
-				 */
 				if (!pathkeys_contained_in(group_pathkeys, current_pathkeys))
 				{
 					result_plan = make_groupsortplan(parse,
@@ -1303,20 +1335,25 @@ grouping_planner(Query *parse, double tuple_fraction)
 													 result_plan);
 					current_pathkeys = group_pathkeys;
 				}
+				aggstrategy = AGG_SORTED;
+				/*
+				 * The AGG node will not change the sort ordering of its
+				 * groups, so current_pathkeys describes the result too.
+				 */
 			}
 			else
+			{
 				aggstrategy = AGG_PLAIN;
+				/* Result will be only one row anyway; no sort order */
+				current_pathkeys = NIL;
+			}
 
 			result_plan = (Plan *) make_agg(tlist,
 											(List *) parse->havingQual,
 											aggstrategy,
 											length(parse->groupClause),
 											groupColIdx,
 											result_plan);
-			/*
-			 * Note: plain or grouped Agg does not affect any existing
-			 * sort order of the tuples
-			 */
 		}
 		else
 		{
diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/util/clauses.c,v 1.109 2002/09/11 14:48:54 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/util/clauses.c,v 1.110 2002/11/06 22:31:24 tgl Exp $
  *
  * HISTORY
  *	  AUTHOR			DATE			MAJOR EVENT
@@ -46,6 +46,7 @@ typedef struct
 } check_subplans_for_ungrouped_vars_context;
 
 static bool contain_agg_clause_walker(Node *node, void *context);
+static bool contain_distinct_agg_clause_walker(Node *node, void *context);
 static bool pull_agg_clause_walker(Node *node, List **listptr);
 static bool expression_returns_set_walker(Node *node, void *context);
 static bool contain_subplans_walker(Node *node, void *context);
@@ -410,6 +411,32 @@ contain_agg_clause_walker(Node *node, void *context)
 	return expression_tree_walker(node, contain_agg_clause_walker, context);
 }
 
+/*
+ * contain_distinct_agg_clause
+ *	  Recursively search for DISTINCT Aggref nodes within a clause.
+ *
+ *	  Returns true if any DISTINCT aggregate found.
+ */
+bool
+contain_distinct_agg_clause(Node *clause)
+{
+	return contain_distinct_agg_clause_walker(clause, NULL);
+}
+
+static bool
+contain_distinct_agg_clause_walker(Node *node, void *context)
+{
+	if (node == NULL)
+		return false;
+	if (IsA(node, Aggref))
+	{
+		if (((Aggref *) node)->aggdistinct)
+			return true;		/* abort the tree traversal and return
+								 * true */
+	}
+	return expression_tree_walker(node, contain_distinct_agg_clause_walker, context);
+}
+
 /*
  * pull_agg_clause
  *	  Recursively pulls all Aggref nodes from an expression tree.
diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: nodeHash.h,v 1.24 2002/06/20 20:29:49 momjian Exp $
+ * $Id: nodeHash.h,v 1.25 2002/11/06 22:31:24 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -36,5 +36,6 @@ extern void ExecChooseHashTableSize(double ntuples, int tupwidth,
 						int *virtualbuckets,
 						int *physicalbuckets,
 						int *numbatches);
+extern uint32 ComputeHashFunc(Datum key, int typLen, bool byVal);
 
 #endif   /* NODEHASH_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
diff --git a/src/include/optimizer/clauses.h b/src/include/optimizer/clauses.h

Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@`
`15`	`15`	`* locate group boundaries.`
`16`	`16`	`*`
`17`	`17`	`* IDENTIFICATION`
`18`		`- * $Header: /cvsroot/pgsql/src/backend/executor/nodeGroup.c,v 1.48 2002/11/06 00:00:43 tgl Exp $`
	`18`	`+ * $Header: /cvsroot/pgsql/src/backend/executor/nodeGroup.c,v 1.49 2002/11/06 22:31:23 tgl Exp $`
`19`	`19`	`*`
`20`	`20`	`*-------------------------------------------------------------------------`
`21`	`21`	`*/`
`@@ -151,9 +151,8 @@ ExecInitGroup(Group *node, EState *estate, Plan *parent)`
`151`	`151`	`*/`
`152`	`152`	`grpstate = makeNode(GroupState);`
`153`	`153`	`node->grpstate = grpstate;`
`154`		`- grpstate->grp_useFirstTuple = FALSE;`
`155`		`- grpstate->grp_done = FALSE;`
`156`	`154`	`grpstate->grp_firstTuple = NULL;`
	`155`	`+ grpstate->grp_done = FALSE;`
`157`	`156`
`158`	`157`	`/*`
`159`	`158`	`* create expression context`
`@@ -236,7 +235,6 @@ ExecReScanGroup(Group *node, ExprContext *exprCtxt, Plan *parent)`
`236`	`235`	`{`
`237`	`236`	`GroupState *grpstate = node->grpstate;`
`238`	`237`
`239`		`- grpstate->grp_useFirstTuple = FALSE;`
`240`	`238`	`grpstate->grp_done = FALSE;`
`241`	`239`	`if (grpstate->grp_firstTuple != NULL)`
`242`	`240`	`{`
Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,8 @@`
`7`	`7`	`* Portions Copyright (c) 1994, Regents of the University of California`
`8`	`8`	`*`
`9`	`9`	`*`
`10`		`- * $Id: nodeHash.c,v 1.66 2002/09/04 20:31:18 momjian Exp $`
	`10`	`+ * IDENTIFICATION`
	`11`	`+ * $Header: /cvsroot/pgsql/src/backend/executor/nodeHash.c,v 1.67 2002/11/06 22:31:23 tgl Exp $`
`11`	`12`	`*`
`12`	`13`	`*-------------------------------------------------------------------------`
`13`	`14`	`*/`
`@@ -31,8 +32,6 @@`
`31`	`32`	`#include "utils/lsyscache.h"`
`32`	`33`
`33`	`34`
`34`		`-static uint32 hashFunc(Datum key, int typLen, bool byVal);`
`35`		`-`
`36`	`35`	`/* ----------------------------------------------------------------`
`37`	`36`	`* ExecHash`
`38`	`37`	`*`
`@@ -532,7 +531,7 @@ ExecHashGetBucket(HashJoinTable hashtable,`
`532`	`531`
`533`	`532`	`/*`
`534`	`533`	`* We reset the eval context each time to reclaim any memory leaked in`
`535`		`- * the hashkey expression or hashFunc itself.`
	`534`	`+ * the hashkey expression or ComputeHashFunc itself.`
`536`	`535`	`*/`
`537`	`536`	`ResetExprContext(econtext);`
`538`	`537`
`@@ -550,9 +549,9 @@ ExecHashGetBucket(HashJoinTable hashtable,`
`550`	`549`	`bucketno = 0;`
`551`	`550`	`else`
`552`	`551`	`{`
`553`		`- bucketno = hashFunc(keyval,`
`554`		`- (int) hashtable->typLen,`
`555`		`- hashtable->typByVal)`
	`552`	`+ bucketno = ComputeHashFunc(keyval,`
	`553`	`+ (int) hashtable->typLen,`
	`554`	`+ hashtable->typByVal)`
`556`	`555`	`% (uint32) hashtable->totalbuckets;`
`557`	`556`	`}`
`558`	`557`
`@@ -622,16 +621,16 @@ ExecScanHashBucket(HashJoinState *hjstate,`
`622`	`621`	`}`
`623`	`622`
`624`	`623`	`/* ----------------------------------------------------------------`
`625`		`- * hashFunc`
	`624`	`+ * ComputeHashFunc`
`626`	`625`	`*`
`627`		`- * the hash function for hash joins`
	`626`	`+ * the hash function for hash joins (also used for hash aggregation)`
`628`	`627`	`*`
`629`	`628`	`* XXX this probably ought to be replaced with datatype-specific`
`630`	`629`	`* hash functions, such as those already implemented for hash indexes.`
`631`	`630`	`* ----------------------------------------------------------------`
`632`	`631`	`*/`
`633`		`-static uint32`
`634`		`-hashFunc(Datum key, int typLen, bool byVal)`
	`632`	`+uint32`
	`633`	`+ComputeHashFunc(Datum key, int typLen, bool byVal)`
`635`	`634`	`{`
`636`	`635`	`unsigned char *k;`
`637`	`636`
`@@ -681,7 +680,7 @@ hashFunc(Datum key, int typLen, bool byVal)`
`681`	`680`	`}`
`682`	`681`	`else`
`683`	`682`	`{`
`684`		`- elog(ERROR, "hashFunc: Invalid typLen %d", typLen);`
	`683`	`+ elog(ERROR, "ComputeHashFunc: Invalid typLen %d", typLen);`
`685`	`684`	`k = NULL; /* keep compiler quiet */`
`686`	`685`	`}`
`687`	`686`	`}`
Original file line number	Diff line number	Diff line change
`@@ -5,7 +5,7 @@`
`5`	`5`	`* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group`
`6`	`6`	`* Portions Copyright (c) 1994, Regents of the University of California`
`7`	`7`	`*`
`8`		`- * $Header: /cvsroot/pgsql/src/backend/nodes/outfuncs.c,v 1.177 2002/11/06 00:00:44 tgl Exp $`
	`8`	`+ * $Header: /cvsroot/pgsql/src/backend/nodes/outfuncs.c,v 1.178 2002/11/06 22:31:24 tgl Exp $`
`9`	`9`	`*`
`10`	`10`	`* NOTES`
`11`	`11`	`* Every (plan) node in POSTGRES has an associated "out" routine which`
`@@ -597,8 +597,8 @@ _outAgg(StringInfo str, Agg *node)`
`597`	`597`	`{`
`598`	`598`	`appendStringInfo(str, " AGG ");`
`599`	`599`	`_outPlanInfo(str, (Plan *) node);`
`600`		`- appendStringInfo(str, " :aggstrategy %d :numCols %d ",`
`601`		`- (int) node->aggstrategy, node->numCols);`
	`600`	`+ appendStringInfo(str, " :aggstrategy %d :numCols %d :numGroups %ld ",`
	`601`	`+ (int) node->aggstrategy, node->numCols, node->numGroups);`
`602`	`602`	`}`
`603`	`603`
`604`	`604`	`static void`
Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@`
`10`	`10`	`*`
`11`	`11`	`*`
`12`	`12`	`* IDENTIFICATION`
`13`		`- * $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.120 2002/11/06 00:00:44 tgl Exp $`
	`13`	`+ * $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.121 2002/11/06 22:31:24 tgl Exp $`
`14`	`14`	`*`
`15`	`15`	`*-------------------------------------------------------------------------`
`16`	`16`	`*/`
`@@ -1675,6 +1675,7 @@ make_agg(List *tlist, List *qual, AggStrategy aggstrategy,`
`1675`	`1675`	`plan->plan_rows *= 0.1;`
`1676`	`1676`	`if (plan->plan_rows < 1)`
`1677`	`1677`	`plan->plan_rows = 1;`
	`1678`	`+ node->numGroups = (long) plan->plan_rows;`
`1678`	`1679`	`}`
`1679`	`1680`
`1680`	`1681`	`plan->state = (EState *) NULL;`
Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@`
`7`	`7`	`* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group`
`8`	`8`	`* Portions Copyright (c) 1994, Regents of the University of California`
`9`	`9`	`*`
`10`		`- * $Id: nodeHash.h,v 1.24 2002/06/20 20:29:49 momjian Exp $`
	`10`	`+ * $Id: nodeHash.h,v 1.25 2002/11/06 22:31:24 tgl Exp $`
`11`	`11`	`*`
`12`	`12`	`*-------------------------------------------------------------------------`
`13`	`13`	`*/`
`@@ -36,5 +36,6 @@ extern void ExecChooseHashTableSize(double ntuples, int tupwidth,`
`36`	`36`	`int *virtualbuckets,`
`37`	`37`	`int *physicalbuckets,`
`38`	`38`	`int *numbatches);`
	`39`	`+extern uint32 ComputeHashFunc(Datum key, int typLen, bool byVal);`
`39`	`40`
`40`	`41`	`#endif /* NODEHASH_H */`