From edec8c8e00e3f2a9305ab92e2c81293457cf959a Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Wed, 15 Feb 2012 15:52:44 -0500 Subject: [PATCH 001/129] Fix VPATH builds, broken by my recent commit to speed up tuplesorting. The relevant commit is 337b6f5ecf05b21b5e997986884d097d60e4e3d0. --- src/backend/utils/sort/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/backend/utils/sort/Makefile b/src/backend/utils/sort/Makefile index f46ce41610529..370b12cee6cc9 100644 --- a/src/backend/utils/sort/Makefile +++ b/src/backend/utils/sort/Makefile @@ -12,6 +12,8 @@ subdir = src/backend/utils/sort top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global +override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS) + OBJS = logtape.o sortsupport.o tuplesort.o tuplestore.o tuplesort.o: qsort_tuple.c From 4bfe68dfab009ce8fcaea79dc0832eadf3380051 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 15 Feb 2012 16:18:34 -0500 Subject: [PATCH 002/129] Run a portal's cleanup hook immediately when pushing it to FAILED state. This extends the changes of commit 6252c4f9e201f619e5eebda12fa867acd4e4200e so that we run the cleanup hook earlier for failure cases as well as success cases. As before, the point is to avoid an assertion failure from an Assert I added in commit a874fe7b4c890d1fe3455215a83ca777867beadd, which was meant to check that no user-written code can be called during portal cleanup. This fixes a case reported by Pavan Deolasee in which the Assert could be triggered during backend exit (see the new regression test case), and also prevents the possibility that the cleanup hook is run after portions of the portal's state have already been recycled. That doesn't really matter in current usage, but it foreseeably could matter in the future. Back-patch to 9.1 where the Assert in question was added. --- src/backend/commands/portalcmds.c | 8 +++- src/backend/tcop/pquery.c | 6 +-- src/backend/utils/mmgr/portalmem.c | 53 +++++++++++++++++++--- src/include/utils/portal.h | 1 + src/test/regress/expected/transactions.out | 33 ++++---------- src/test/regress/sql/transactions.sql | 13 ++++-- 6 files changed, 72 insertions(+), 42 deletions(-) diff --git a/src/backend/commands/portalcmds.c b/src/backend/commands/portalcmds.c index ab8b55c1d2e96..1c7a1c3a33f44 100644 --- a/src/backend/commands/portalcmds.c +++ b/src/backend/commands/portalcmds.c @@ -224,7 +224,7 @@ PerformPortalClose(const char *name) } /* - * Note: PortalCleanup is called as a side-effect + * Note: PortalCleanup is called as a side-effect, if not already done. */ PortalDrop(portal, false); } @@ -234,6 +234,10 @@ PerformPortalClose(const char *name) * * Clean up a portal when it's dropped. This is the standard cleanup hook * for portals. + * + * Note: if portal->status is PORTAL_FAILED, we are probably being called + * during error abort, and must be careful to avoid doing anything that + * is likely to fail again. 
*/ void PortalCleanup(Portal portal) @@ -420,7 +424,7 @@ PersistHoldablePortal(Portal portal) PG_CATCH(); { /* Uncaught error while executing portal: mark it dead */ - portal->status = PORTAL_FAILED; + MarkPortalFailed(portal); /* Restore global vars and propagate error */ ActivePortal = saveActivePortal; diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index f1504fac0d56f..42a0fb0f1f26e 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -607,7 +607,7 @@ PortalStart(Portal portal, ParamListInfo params, bool use_active_snapshot) PG_CATCH(); { /* Uncaught error while executing portal: mark it dead */ - portal->status = PORTAL_FAILED; + MarkPortalFailed(portal); /* Restore global vars and propagate error */ ActivePortal = saveActivePortal; @@ -829,7 +829,7 @@ PortalRun(Portal portal, long count, bool isTopLevel, PG_CATCH(); { /* Uncaught error while executing portal: mark it dead */ - portal->status = PORTAL_FAILED; + MarkPortalFailed(portal); /* Restore global vars and propagate error */ if (saveMemoryContext == saveTopTransactionContext) @@ -1446,7 +1446,7 @@ PortalRunFetch(Portal portal, PG_CATCH(); { /* Uncaught error while executing portal: mark it dead */ - portal->status = PORTAL_FAILED; + MarkPortalFailed(portal); /* Restore global vars and propagate error */ ActivePortal = saveActivePortal; diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c index 89891ba9131a1..cfb73c1b09057 100644 --- a/src/backend/utils/mmgr/portalmem.c +++ b/src/backend/utils/mmgr/portalmem.c @@ -405,6 +405,8 @@ UnpinPortal(Portal portal) /* * MarkPortalDone * Transition a portal from ACTIVE to DONE state. + * + * NOTE: never set portal->status = PORTAL_DONE directly; call this instead. */ void MarkPortalDone(Portal portal) @@ -418,8 +420,36 @@ MarkPortalDone(Portal portal) * well do that now, since the portal can't be executed any more. * * In some cases involving execution of a ROLLBACK command in an already - * aborted transaction, this prevents an assertion failure from reaching - * AtCleanup_Portals with the cleanup hook still unexecuted. + * aborted transaction, this prevents an assertion failure caused by + * reaching AtCleanup_Portals with the cleanup hook still unexecuted. + */ + if (PointerIsValid(portal->cleanup)) + { + (*portal->cleanup) (portal); + portal->cleanup = NULL; + } +} + +/* + * MarkPortalFailed + * Transition a portal into FAILED state. + * + * NOTE: never set portal->status = PORTAL_FAILED directly; call this instead. + */ +void +MarkPortalFailed(Portal portal) +{ + /* Perform the state transition */ + Assert(portal->status != PORTAL_DONE); + portal->status = PORTAL_FAILED; + + /* + * Allow portalcmds.c to clean up the state it knows about. We might as + * well do that now, since the portal can't be executed any more. + * + * In some cases involving cleanup of an already aborted transaction, this + * prevents an assertion failure caused by reaching AtCleanup_Portals with + * the cleanup hook still unexecuted. */ if (PointerIsValid(portal->cleanup)) { @@ -455,6 +485,9 @@ PortalDrop(Portal portal, bool isTopCommit) * hook's responsibility to not try to do that more than once, in the case * that failure occurs and then we come back to drop the portal again * during transaction abort. + * + * Note: in most paths of control, this will have been done already in + * MarkPortalDone or MarkPortalFailed. We're just making sure. 
*/ if (PointerIsValid(portal->cleanup)) { @@ -713,7 +746,7 @@ AtAbort_Portals(void) /* Any portal that was actually running has to be considered broken */ if (portal->status == PORTAL_ACTIVE) - portal->status = PORTAL_FAILED; + MarkPortalFailed(portal); /* * Do nothing else to cursors held over from a previous transaction. @@ -728,9 +761,12 @@ AtAbort_Portals(void) * AtSubAbort_Portals. */ if (portal->status == PORTAL_READY) - portal->status = PORTAL_FAILED; + MarkPortalFailed(portal); - /* let portalcmds.c clean up the state it knows about */ + /* + * Allow portalcmds.c to clean up the state it knows about, if we + * haven't already. + */ if (PointerIsValid(portal->cleanup)) { (*portal->cleanup) (portal); @@ -862,9 +898,12 @@ AtSubAbort_Portals(SubTransactionId mySubid, */ if (portal->status == PORTAL_READY || portal->status == PORTAL_ACTIVE) - portal->status = PORTAL_FAILED; + MarkPortalFailed(portal); - /* let portalcmds.c clean up the state it knows about */ + /* + * Allow portalcmds.c to clean up the state it knows about, if we + * haven't already. + */ if (PointerIsValid(portal->cleanup)) { (*portal->cleanup) (portal); diff --git a/src/include/utils/portal.h b/src/include/utils/portal.h index b257d774c49dd..48339426549ae 100644 --- a/src/include/utils/portal.h +++ b/src/include/utils/portal.h @@ -207,6 +207,7 @@ extern Portal CreateNewPortal(void); extern void PinPortal(Portal portal); extern void UnpinPortal(Portal portal); extern void MarkPortalDone(Portal portal); +extern void MarkPortalFailed(Portal portal); extern void PortalDrop(Portal portal, bool isTopCommit); extern Portal GetPortalByName(const char *name); extern void PortalDefineQuery(Portal portal, diff --git a/src/test/regress/expected/transactions.out b/src/test/regress/expected/transactions.out index f49ec0effee49..e9d3908b1ab53 100644 --- a/src/test/regress/expected/transactions.out +++ b/src/test/regress/expected/transactions.out @@ -601,28 +601,11 @@ fetch from foo; (1 row) abort; --- tests for the "tid" type -SELECT '(3, 3)'::tid = '(3, 4)'::tid; - ?column? ----------- - f -(1 row) - -SELECT '(3, 3)'::tid = '(3, 3)'::tid; - ?column? ----------- - t -(1 row) - -SELECT '(3, 3)'::tid <> '(3, 3)'::tid; - ?column? ----------- - f -(1 row) - -SELECT '(3, 3)'::tid <> '(3, 4)'::tid; - ?column? ----------- - t -(1 row) - +-- Test for successful cleanup of an aborted transaction at session exit. +-- THIS MUST BE THE LAST TEST IN THIS FILE. +begin; +select 1/0; +ERROR: division by zero +rollback to X; +ERROR: no such savepoint +-- DO NOT ADD ANYTHING HERE. diff --git a/src/test/regress/sql/transactions.sql b/src/test/regress/sql/transactions.sql index 23271c8eabaf4..faf6a9bf93804 100644 --- a/src/test/regress/sql/transactions.sql +++ b/src/test/regress/sql/transactions.sql @@ -368,8 +368,11 @@ fetch from foo; abort; --- tests for the "tid" type -SELECT '(3, 3)'::tid = '(3, 4)'::tid; -SELECT '(3, 3)'::tid = '(3, 3)'::tid; -SELECT '(3, 3)'::tid <> '(3, 3)'::tid; -SELECT '(3, 3)'::tid <> '(3, 4)'::tid; +-- Test for successful cleanup of an aborted transaction at session exit. +-- THIS MUST BE THE LAST TEST IN THIS FILE. + +begin; +select 1/0; +rollback to X; + +-- DO NOT ADD ANYTHING HERE. From e9a22259c45e235aaa30f0d068f767d9c0f818a0 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 16 Feb 2012 11:49:20 -0500 Subject: [PATCH 003/129] Invent on_exit_nicely for pg_dump. Per recent discussions on pgsql-hackers regarding parallel pg_dump. 
--- src/bin/pg_dump/common.c | 6 +- src/bin/pg_dump/compress_io.c | 27 ++--- src/bin/pg_dump/dumputils.c | 34 +++++- src/bin/pg_dump/dumputils.h | 4 + src/bin/pg_dump/pg_backup.h | 6 +- src/bin/pg_dump/pg_backup_archiver.c | 16 +-- src/bin/pg_dump/pg_backup_db.c | 9 ++ src/bin/pg_dump/pg_backup_directory.c | 17 +-- src/bin/pg_dump/pg_backup_files.c | 16 +-- src/bin/pg_dump/pg_backup_tar.c | 24 ++-- src/bin/pg_dump/pg_dump.c | 151 +++++++++++++------------- src/bin/pg_dump/pg_dump.h | 2 - src/bin/pg_dump/pg_dumpall.c | 48 ++++---- src/bin/pg_dump/pg_restore.c | 18 +-- 14 files changed, 207 insertions(+), 171 deletions(-) diff --git a/src/bin/pg_dump/common.c b/src/bin/pg_dump/common.c index 266441df61d2c..db48ccb41915c 100644 --- a/src/bin/pg_dump/common.c +++ b/src/bin/pg_dump/common.c @@ -770,7 +770,7 @@ findParentsByOid(TableInfo *self, inhinfo[i].inhparent, self->dobj.name, oid); - exit_nicely(); + exit_nicely(1); } self->parents[j++] = parent; } @@ -809,7 +809,7 @@ parseOidArray(const char *str, Oid *array, int arraysize) if (argNum >= arraysize) { write_msg(NULL, "could not parse numeric array \"%s\": too many numbers\n", str); - exit_nicely(); + exit_nicely(1); } temp[j] = '\0'; array[argNum++] = atooid(temp); @@ -824,7 +824,7 @@ parseOidArray(const char *str, Oid *array, int arraysize) j >= sizeof(temp) - 1) { write_msg(NULL, "could not parse numeric array \"%s\": invalid character in number\n", str); - exit_nicely(); + exit_nicely(1); } temp[j++] = s; } diff --git a/src/bin/pg_dump/compress_io.c b/src/bin/pg_dump/compress_io.c index d48b276a0652f..c30b8f97380cf 100644 --- a/src/bin/pg_dump/compress_io.c +++ b/src/bin/pg_dump/compress_io.c @@ -54,6 +54,7 @@ #include "compress_io.h" #include "dumpmem.h" +#include "dumputils.h" /*---------------------- * Compressor API @@ -109,8 +110,8 @@ ParseCompressionOption(int compression, CompressionAlgorithm *alg, int *level) *alg = COMPR_ALG_NONE; else { - die_horribly(NULL, modulename, "Invalid compression code: %d\n", - compression); + exit_horribly(modulename, "Invalid compression code: %d\n", + compression); *alg = COMPR_ALG_NONE; /* keep compiler quiet */ } @@ -133,7 +134,7 @@ AllocateCompressor(int compression, WriteFunc writeF) #ifndef HAVE_LIBZ if (alg == COMPR_ALG_LIBZ) - die_horribly(NULL, modulename, "not built with zlib support\n"); + exit_horribly(modulename, "not built with zlib support\n"); #endif cs = (CompressorState *) pg_calloc(1, sizeof(CompressorState)); @@ -169,7 +170,7 @@ ReadDataFromArchive(ArchiveHandle *AH, int compression, ReadFunc readF) #ifdef HAVE_LIBZ ReadDataFromArchiveZlib(AH, readF); #else - die_horribly(NULL, modulename, "not built with zlib support\n"); + exit_horribly(modulename, "not built with zlib support\n"); #endif } } @@ -187,7 +188,7 @@ WriteDataToArchive(ArchiveHandle *AH, CompressorState *cs, #ifdef HAVE_LIBZ return WriteDataToArchiveZlib(AH, cs, data, dLen); #else - die_horribly(NULL, modulename, "not built with zlib support\n"); + exit_horribly(modulename, "not built with zlib support\n"); #endif case COMPR_ALG_NONE: return WriteDataToArchiveNone(AH, cs, data, dLen); @@ -234,9 +235,9 @@ InitCompressorZlib(CompressorState *cs, int level) cs->zlibOutSize = ZLIB_OUT_SIZE; if (deflateInit(zp, level) != Z_OK) - die_horribly(NULL, modulename, - "could not initialize compression library: %s\n", - zp->msg); + exit_horribly(modulename, + "could not initialize compression library: %s\n", + zp->msg); /* Just be paranoid - maybe End is called after Start, with no Write */ zp->next_out = (void *) cs->zlibOut; 
@@ -343,9 +344,9 @@ ReadDataFromArchiveZlib(ArchiveHandle *AH, ReadFunc readF) out = pg_malloc(ZLIB_OUT_SIZE + 1); if (inflateInit(zp) != Z_OK) - die_horribly(NULL, modulename, - "could not initialize compression library: %s\n", - zp->msg); + exit_horribly(modulename, + "could not initialize compression library: %s\n", + zp->msg); /* no minimal chunk size for zlib */ while ((cnt = readF(AH, &buf, &buflen))) @@ -514,7 +515,7 @@ cfopen_write(const char *path, const char *mode, int compression) fp = cfopen(fname, mode, 1); free(fname); #else - die_horribly(NULL, modulename, "not built with zlib support\n"); + exit_horribly(modulename, "not built with zlib support\n"); fp = NULL; /* keep compiler quiet */ #endif } @@ -541,7 +542,7 @@ cfopen(const char *path, const char *mode, int compression) fp = NULL; } #else - die_horribly(NULL, modulename, "not built with zlib support\n"); + exit_horribly(modulename, "not built with zlib support\n"); #endif } else diff --git a/src/bin/pg_dump/dumputils.c b/src/bin/pg_dump/dumputils.c index 3493e39403372..0b24220bd4618 100644 --- a/src/bin/pg_dump/dumputils.c +++ b/src/bin/pg_dump/dumputils.c @@ -26,6 +26,15 @@ int quote_all_identifiers = 0; const char *progname = NULL; +#define MAX_ON_EXIT_NICELY 20 + +static struct +{ + on_exit_nicely_callback function; + void *arg; +} on_exit_nicely_list[MAX_ON_EXIT_NICELY]; + +static int on_exit_nicely_index; #define supports_grant_options(version) ((version) >= 70400) @@ -1261,7 +1270,7 @@ exit_horribly(const char *modulename, const char *fmt,...) vwrite_msg(modulename, fmt, ap); va_end(ap); - exit(1); + exit_nicely(1); } /* @@ -1289,6 +1298,27 @@ set_section (const char *arg, int *dumpSections) progname, arg); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); - exit(1); + exit_nicely(1); } } + +/* Register a callback to be run when exit_nicely is invoked. */ +void +on_exit_nicely(on_exit_nicely_callback function, void *arg) +{ + if (on_exit_nicely_index >= MAX_ON_EXIT_NICELY) + exit_horribly(NULL, "out of on_exit_nicely slots"); + on_exit_nicely_list[on_exit_nicely_index].function = function; + on_exit_nicely_list[on_exit_nicely_index].arg = arg; + on_exit_nicely_index++; +} + +/* Run accumulated on_exit_nicely callbacks and then exit quietly. */ +void +exit_nicely(int code) +{ + while (--on_exit_nicely_index >= 0) + (*on_exit_nicely_list[on_exit_nicely_index].function)(code, + on_exit_nicely_list[on_exit_nicely_index].arg); + exit(code); +} diff --git a/src/bin/pg_dump/dumputils.h b/src/bin/pg_dump/dumputils.h index de1536baaa0f4..82cf940892cb5 100644 --- a/src/bin/pg_dump/dumputils.h +++ b/src/bin/pg_dump/dumputils.h @@ -60,4 +60,8 @@ extern void exit_horribly(const char *modulename, const char *fmt,...) __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 3), noreturn)); extern void set_section (const char *arg, int *dumpSections); +typedef void (*on_exit_nicely_callback) (int code, void *arg); +extern void on_exit_nicely(on_exit_nicely_callback function, void *arg); +extern void exit_nicely(int code) __attribute__((noreturn)); + #endif /* DUMPUTILS_H */ diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h index 0eef1dc8ba8d2..d12bd7fa792cc 100644 --- a/src/bin/pg_dump/pg_backup.h +++ b/src/bin/pg_dump/pg_backup.h @@ -159,15 +159,13 @@ typedef struct _restoreOptions * Main archiver interface. 
*/ - -/* Lets the archive know we have a DB connection to shutdown if it dies */ - -PGconn *ConnectDatabase(Archive *AH, +extern PGconn *ConnectDatabase(Archive *AH, const char *dbname, const char *pghost, const char *pgport, const char *username, enum trivalue prompt_password); +extern void DisconnectDatabase(Archive *AHX); /* Called to add a TOC entry */ extern void ArchiveEntry(Archive *AHX, diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c index 44ba913d4a11d..55c84fdd47993 100644 --- a/src/bin/pg_dump/pg_backup_archiver.c +++ b/src/bin/pg_dump/pg_backup_archiver.c @@ -459,10 +459,7 @@ RestoreArchive(Archive *AHX, RestoreOptions *ropt) RestoreOutput(AH, sav); if (ropt->useDB) - { - PQfinish(AH->connection); - AH->connection = NULL; - } + DisconnectDatabase(&AH->public); } /* @@ -1435,11 +1432,10 @@ vdie_horribly(ArchiveHandle *AH, const char *modulename, { if (AH->public.verbose) write_msg(NULL, "*** aborted because of error\n"); - if (AH->connection) - PQfinish(AH->connection); + DisconnectDatabase(&AH->public); } - exit(1); + exit_nicely(1); } /* As above, but with variable arg list */ @@ -3332,8 +3328,7 @@ restore_toc_entries_parallel(ArchiveHandle *AH) * mainly to ensure that we don't exceed the specified number of parallel * connections. */ - PQfinish(AH->connection); - AH->connection = NULL; + DisconnectDatabase(&AH->public); /* blow away any transient state from the old connection */ if (AH->currUser) @@ -3795,8 +3790,7 @@ parallel_restore(RestoreArgs *args) retval = restore_toc_entry(AH, te, ropt, true); /* And clean up */ - PQfinish(AH->connection); - AH->connection = NULL; + DisconnectDatabase((Archive *) AH); /* If we reopened the file, we are done with it, so close it now */ if (te->section == SECTION_DATA) diff --git a/src/bin/pg_dump/pg_backup_db.c b/src/bin/pg_dump/pg_backup_db.c index 09da892597197..31f6d8d94da8f 100644 --- a/src/bin/pg_dump/pg_backup_db.c +++ b/src/bin/pg_dump/pg_backup_db.c @@ -310,6 +310,15 @@ ConnectDatabase(Archive *AHX, return AH->connection; } +void +DisconnectDatabase(Archive *AHX) +{ + ArchiveHandle *AH = (ArchiveHandle *) AHX; + + PQfinish(AH->connection); /* noop if AH->connection is NULL */ + AH->connection = NULL; +} + static void notice_processor(void *arg, const char *message) diff --git a/src/bin/pg_dump/pg_backup_directory.c b/src/bin/pg_dump/pg_backup_directory.c index 9c6d7c1b6de66..df95411cfd976 100644 --- a/src/bin/pg_dump/pg_backup_directory.c +++ b/src/bin/pg_dump/pg_backup_directory.c @@ -35,6 +35,7 @@ #include "compress_io.h" #include "dumpmem.h" +#include "dumputils.h" #include #include @@ -633,13 +634,13 @@ createDirectory(const char *dir) if (stat(dir, &st) == 0) { if (S_ISDIR(st.st_mode)) - die_horribly(NULL, modulename, - "cannot create directory %s, it exists already\n", - dir); + exit_horribly(modulename, + "cannot create directory %s, it exists already\n", + dir); else - die_horribly(NULL, modulename, - "cannot create directory %s, a file with this name " - "exists already\n", dir); + exit_horribly(modulename, + "cannot create directory %s, a file with this name " + "exists already\n", dir); } /* @@ -648,8 +649,8 @@ createDirectory(const char *dir) * between our two calls. 
*/ if (mkdir(dir, 0700) < 0) - die_horribly(NULL, modulename, "could not create directory %s: %s", - dir, strerror(errno)); + exit_horribly(modulename, "could not create directory %s: %s", + dir, strerror(errno)); } diff --git a/src/bin/pg_dump/pg_backup_files.c b/src/bin/pg_dump/pg_backup_files.c index ffcbb8f6425d7..71bace0eab71f 100644 --- a/src/bin/pg_dump/pg_backup_files.c +++ b/src/bin/pg_dump/pg_backup_files.c @@ -127,15 +127,15 @@ InitArchiveFmt_Files(ArchiveHandle *AH) { AH->FH = fopen(AH->fSpec, PG_BINARY_W); if (AH->FH == NULL) - die_horribly(NULL, modulename, "could not open output file \"%s\": %s\n", - AH->fSpec, strerror(errno)); + exit_horribly(modulename, "could not open output file \"%s\": %s\n", + AH->fSpec, strerror(errno)); } else { AH->FH = stdout; if (AH->FH == NULL) - die_horribly(NULL, modulename, "could not open output file: %s\n", - strerror(errno)); + exit_horribly(modulename, "could not open output file: %s\n", + strerror(errno)); } ctx->hasSeek = checkSeek(AH->FH); @@ -152,15 +152,15 @@ InitArchiveFmt_Files(ArchiveHandle *AH) { AH->FH = fopen(AH->fSpec, PG_BINARY_R); if (AH->FH == NULL) - die_horribly(NULL, modulename, "could not open input file \"%s\": %s\n", - AH->fSpec, strerror(errno)); + exit_horribly(modulename, "could not open input file \"%s\": %s\n", + AH->fSpec, strerror(errno)); } else { AH->FH = stdin; if (AH->FH == NULL) - die_horribly(NULL, modulename, "could not open input file: %s\n", - strerror(errno)); + exit_horribly(modulename, "could not open input file: %s\n", + strerror(errno)); } ctx->hasSeek = checkSeek(AH->FH); diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c index 39ce417d86d9e..4952f5a15d335 100644 --- a/src/bin/pg_dump/pg_backup_tar.c +++ b/src/bin/pg_dump/pg_backup_tar.c @@ -29,6 +29,7 @@ #include "pg_backup_archiver.h" #include "pg_backup_tar.h" #include "dumpmem.h" +#include "dumputils.h" #include #include @@ -178,17 +179,17 @@ InitArchiveFmt_Tar(ArchiveHandle *AH) { ctx->tarFH = fopen(AH->fSpec, PG_BINARY_W); if (ctx->tarFH == NULL) - die_horribly(NULL, modulename, - "could not open TOC file \"%s\" for output: %s\n", - AH->fSpec, strerror(errno)); + exit_horribly(modulename, + "could not open TOC file \"%s\" for output: %s\n", + AH->fSpec, strerror(errno)); } else { ctx->tarFH = stdout; if (ctx->tarFH == NULL) - die_horribly(NULL, modulename, - "could not open TOC file for output: %s\n", - strerror(errno)); + exit_horribly(modulename, + "could not open TOC file for output: %s\n", + strerror(errno)); } ctx->tarFHpos = 0; @@ -214,7 +215,8 @@ InitArchiveFmt_Tar(ArchiveHandle *AH) * positioning. 
*/ if (AH->compression != 0) - die_horribly(NULL, modulename, "compression is not supported by tar archive format\n"); + exit_horribly(modulename, + "compression is not supported by tar archive format\n"); } else { /* Read Mode */ @@ -222,15 +224,15 @@ InitArchiveFmt_Tar(ArchiveHandle *AH) { ctx->tarFH = fopen(AH->fSpec, PG_BINARY_R); if (ctx->tarFH == NULL) - die_horribly(NULL, modulename, "could not open TOC file \"%s\" for input: %s\n", - AH->fSpec, strerror(errno)); + exit_horribly(modulename, "could not open TOC file \"%s\" for input: %s\n", + AH->fSpec, strerror(errno)); } else { ctx->tarFH = stdin; if (ctx->tarFH == NULL) - die_horribly(NULL, modulename, "could not open TOC file for input: %s\n", - strerror(errno)); + exit_horribly(modulename, "could not open TOC file for input: %s\n", + strerror(errno)); } /* diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index f968496aaa67a..6eddc63e28be2 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -145,6 +145,7 @@ static int serializable_deferrable = 0; static void help(const char *progname); +static void pgdump_cleanup_at_exit(int code, void *arg); static void setup_connection(Archive *AH, const char *dumpencoding, char *use_role); static ArchiveFormat parseArchiveFormat(const char *format, ArchiveMode *mode); @@ -370,12 +371,12 @@ main(int argc, char **argv) if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) { help(progname); - exit(0); + exit_nicely(0); } if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) { puts("pg_dump (PostgreSQL) " PG_VERSION); - exit(0); + exit_nicely(0); } } @@ -508,7 +509,7 @@ main(int argc, char **argv) default: fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); - exit(1); + exit_nicely(1); } } @@ -523,7 +524,7 @@ main(int argc, char **argv) progname, argv[optind]); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); - exit(1); + exit_nicely(1); } /* --column-inserts implies --inserts */ @@ -533,13 +534,13 @@ main(int argc, char **argv) if (dataOnly && schemaOnly) { write_msg(NULL, "options -s/--schema-only and -a/--data-only cannot be used together\n"); - exit(1); + exit_nicely(1); } if ((dataOnly || schemaOnly) && dumpSections != DUMP_UNSECTIONED) { write_msg(NULL, "options -s/--schema-only and -a/--data-only cannot be used with --section\n"); - exit(1); + exit_nicely(1); } if (dataOnly) @@ -555,14 +556,14 @@ main(int argc, char **argv) if (dataOnly && outputClean) { write_msg(NULL, "options -c/--clean and -a/--data-only cannot be used together\n"); - exit(1); + exit_nicely(1); } if (dump_inserts && oids) { write_msg(NULL, "options --inserts/--column-inserts and -o/--oids cannot be used together\n"); write_msg(NULL, "(The INSERT command cannot set OIDs.)\n"); - exit(1); + exit_nicely(1); } /* Identify archive format to emit */ @@ -583,11 +584,12 @@ main(int argc, char **argv) /* Open the output file */ fout = CreateArchive(filename, archiveFormat, compressLevel, archiveMode); + on_exit_nicely(pgdump_cleanup_at_exit, fout); if (fout == NULL) { write_msg(NULL, "could not open output file \"%s\" for writing\n", filename); - exit(1); + exit_nicely(1); } /* Let the archiver know how noisy to be */ @@ -597,7 +599,7 @@ main(int argc, char **argv) if (my_version < 0) { write_msg(NULL, "could not parse version string \"%s\"\n", PG_VERSION); - exit(1); + exit_nicely(1); } /* @@ -669,7 +671,7 @@ main(int argc, char **argv) if (schema_include_oids.head == NULL) { write_msg(NULL, "No matching schemas 
were found\n"); - exit_nicely(); + exit_nicely(1); } } expand_schema_name_patterns(fout, &schema_exclude_patterns, @@ -684,7 +686,7 @@ main(int argc, char **argv) if (table_include_oids.head == NULL) { write_msg(NULL, "No matching tables were found\n"); - exit_nicely(); + exit_nicely(1); } } expand_table_name_patterns(fout, &table_exclude_patterns, @@ -790,9 +792,7 @@ main(int argc, char **argv) CloseArchive(fout); - PQfinish(g_conn); - - exit(0); + exit_nicely(0); } @@ -858,13 +858,12 @@ help(const char *progname) printf(_("Report bugs to .\n")); } -void -exit_nicely(void) +static void +pgdump_cleanup_at_exit(int code, void *arg) { - PQfinish(g_conn); - if (g_verbose) - write_msg(NULL, "*** aborted because of error\n"); - exit(1); + Archive *AH = (Archive *) arg; + + DisconnectDatabase(AH); } static void @@ -879,7 +878,7 @@ setup_connection(Archive *AH, const char *dumpencoding, char *use_role) { write_msg(NULL, "invalid client encoding \"%s\" specified\n", dumpencoding); - exit(1); + exit_nicely(1); } } @@ -977,7 +976,7 @@ parseArchiveFormat(const char *format, ArchiveMode *mode) else { write_msg(NULL, "invalid output format \"%s\" specified\n", format); - exit(1); + exit_nicely(1); } return archiveFormat; } @@ -1002,7 +1001,7 @@ expand_schema_name_patterns(Archive *fout, if (fout->remoteVersion < 70300) { write_msg(NULL, "server version must be at least 7.3 to use schema selection switches\n"); - exit_nicely(); + exit_nicely(1); } query = createPQExpBuffer(); @@ -1397,7 +1396,7 @@ dumpTableData_copy(Archive *fout, void *dcontext) write_msg(NULL, "Dumping the contents of table \"%s\" failed: PQgetCopyData() failed.\n", classname); write_msg(NULL, "Error message from server: %s", PQerrorMessage(g_conn)); write_msg(NULL, "The command was: %s\n", q->data); - exit_nicely(); + exit_nicely(1); } /* Check command status and return to normal libpq state */ @@ -1407,7 +1406,7 @@ dumpTableData_copy(Archive *fout, void *dcontext) write_msg(NULL, "Dumping the contents of table \"%s\" failed: PQgetResult() failed.\n", classname); write_msg(NULL, "Error message from server: %s", PQerrorMessage(g_conn)); write_msg(NULL, "The command was: %s\n", q->data); - exit_nicely(); + exit_nicely(1); } PQclear(res); @@ -1936,14 +1935,14 @@ dumpDatabase(Archive *fout) { write_msg(NULL, "missing pg_database entry for database \"%s\"\n", datname); - exit_nicely(); + exit_nicely(1); } if (ntups != 1) { write_msg(NULL, "query returned more than one (%d) pg_database entry for database \"%s\"\n", ntups, datname); - exit_nicely(); + exit_nicely(1); } i_tableoid = PQfnumber(res, "tableoid"); @@ -2045,7 +2044,7 @@ dumpDatabase(Archive *fout) if (PQntuples(lo_res) != 1) { write_msg(NULL, "dumpDatabase(): could not find pg_largeobject.relfrozenxid\n"); - exit_nicely(); + exit_nicely(1); } i_relfrozenxid = PQfnumber(lo_res, "relfrozenxid"); @@ -2083,7 +2082,7 @@ dumpDatabase(Archive *fout) if (PQntuples(lo_res) != 1) { write_msg(NULL, "dumpDatabase(): could not find pg_largeobject_metadata.relfrozenxid\n"); - exit_nicely(); + exit_nicely(1); } i_relfrozenxid = PQfnumber(lo_res, "relfrozenxid"); @@ -2409,7 +2408,7 @@ dumpBlobs(Archive *fout, void *arg) { write_msg(NULL, "could not open large object %u: %s", blobOid, PQerrorMessage(g_conn)); - exit_nicely(); + exit_nicely(1); } StartBlob(fout, blobOid); @@ -2422,7 +2421,7 @@ dumpBlobs(Archive *fout, void *arg) { write_msg(NULL, "error reading large object %u: %s", blobOid, PQerrorMessage(g_conn)); - exit_nicely(); + exit_nicely(1); } WriteData(fout, buf, cnt); @@ -2473,7 +2472,7 
@@ binary_upgrade_set_type_oids_by_type_oid(Archive *fout, "query returned %d rows instead of one: %s\n", ntups), ntups, upgrade_query->data); - exit_nicely(); + exit_nicely(1); } pg_type_array_oid = atooid(PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "typarray"))); @@ -2521,7 +2520,7 @@ binary_upgrade_set_type_oids_by_rel_oid(Archive *fout, "query returned %d rows instead of one: %s\n", ntups), ntups, upgrade_query->data); - exit_nicely(); + exit_nicely(1); } pg_type_oid = atooid(PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "crel"))); @@ -2577,7 +2576,7 @@ binary_upgrade_set_pg_class_oids(Archive *fout, "query returned %d rows instead of one: %s\n", ntups), ntups, upgrade_query->data); - exit_nicely(); + exit_nicely(1); } pg_class_reltoastrelid = atooid(PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "reltoastrelid"))); @@ -2655,7 +2654,7 @@ binary_upgrade_extension_member(PQExpBuffer upgrade_buffer, if (extobj == NULL) { write_msg(NULL, "could not find parent extension for %s", objlabel); - exit_nicely(); + exit_nicely(1); } appendPQExpBuffer(upgrade_buffer, @@ -2796,7 +2795,7 @@ findNamespace(Archive *fout, Oid nsoid, Oid objoid) return nsinfo; } write_msg(NULL, "schema with OID %u does not exist\n", nsoid); - exit_nicely(); + exit_nicely(1); } else { @@ -5124,7 +5123,7 @@ getRules(Archive *fout, int *numRules) write_msg(NULL, "failed sanity check, parent table OID %u of pg_rewrite entry OID %u not found\n", ruletableoid, ruleinfo[i].dobj.catId.oid); - exit_nicely(); + exit_nicely(1); } ruleinfo[i].dobj.namespace = ruleinfo[i].ruletable->dobj.namespace; ruleinfo[i].dobj.dump = ruleinfo[i].ruletable->dobj.dump; @@ -5370,7 +5369,7 @@ getTriggers(Archive *fout, TableInfo tblinfo[], int numTables) write_msg(NULL, "query produced null referenced table name for foreign key trigger \"%s\" on table \"%s\" (OID of table: %u)\n", tginfo[j].dobj.name, tbinfo->dobj.name, tginfo[j].tgconstrrelid); - exit_nicely(); + exit_nicely(1); } tginfo[j].tgconstrrelname = pg_strdup(PQgetvalue(res, j, i_tgconstrrelname)); } @@ -5917,7 +5916,7 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) { write_msg(NULL, "invalid column numbering in table \"%s\"\n", tbinfo->dobj.name); - exit_nicely(); + exit_nicely(1); } tbinfo->attnames[j] = pg_strdup(PQgetvalue(res, j, i_attname)); tbinfo->atttypnames[j] = pg_strdup(PQgetvalue(res, j, i_atttypname)); @@ -6005,7 +6004,7 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) { write_msg(NULL, "invalid adnum value %d for table \"%s\"\n", adnum, tbinfo->dobj.name); - exit_nicely(); + exit_nicely(1); } /* @@ -6182,7 +6181,7 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) tbinfo->ncheck), tbinfo->ncheck, tbinfo->dobj.name, numConstrs); write_msg(NULL, "(The system catalogs might be corrupted.)\n"); - exit_nicely(); + exit_nicely(1); } constrs = (ConstraintInfo *) pg_malloc(numConstrs * sizeof(ConstraintInfo)); @@ -7706,7 +7705,7 @@ dumpRangeType(Archive *fout, TypeInfo *tyinfo) { write_msg(NULL, "query returned %d pg_range entries for range type \"%s\"\n", PQntuples(res), tyinfo->dobj.name); - exit_nicely(); + exit_nicely(1); } /* @@ -8019,7 +8018,7 @@ dumpBaseType(Archive *fout, TypeInfo *tyinfo) "query returned %d rows instead of one: %s\n", ntups), ntups, query->data); - exit_nicely(); + exit_nicely(1); } typlen = PQgetvalue(res, 0, PQfnumber(res, "typlen")); @@ -8250,7 +8249,7 @@ dumpDomain(Archive *fout, TypeInfo *tyinfo) "query returned %d rows instead of one: %s\n", ntups), ntups, query->data); - 
exit_nicely(); + exit_nicely(1); } typnotnull = PQgetvalue(res, 0, PQfnumber(res, "typnotnull")); @@ -9242,7 +9241,7 @@ dumpFunc(Archive *fout, FuncInfo *finfo) "query returned %d rows instead of one: %s\n", ntups), ntups, query->data); - exit_nicely(); + exit_nicely(1); } proretset = PQgetvalue(res, 0, PQfnumber(res, "proretset")); @@ -9420,7 +9419,7 @@ dumpFunc(Archive *fout, FuncInfo *finfo) { write_msg(NULL, "unrecognized provolatile value for function \"%s\"\n", finfo->dobj.name); - exit_nicely(); + exit_nicely(1); } } @@ -9799,7 +9798,7 @@ dumpOpr(Archive *fout, OprInfo *oprinfo) "query returned %d rows instead of one: %s\n", ntups), ntups, query->data); - exit_nicely(); + exit_nicely(1); } i_oprkind = PQfnumber(res, "oprkind"); @@ -10052,7 +10051,7 @@ convertTSFunction(Archive *fout, Oid funcOid) "query returned %d rows instead of one: %s\n", ntups), ntups, query); - exit_nicely(); + exit_nicely(1); } result = pg_strdup(PQgetvalue(res, 0, 0)); @@ -10169,7 +10168,7 @@ dumpOpclass(Archive *fout, OpclassInfo *opcinfo) "query returned %d rows instead of one: %s\n", ntups), ntups, query->data); - exit_nicely(); + exit_nicely(1); } i_opcintype = PQfnumber(res, "opcintype"); @@ -10637,7 +10636,7 @@ dumpOpfamily(Archive *fout, OpfamilyInfo *opfinfo) "query returned %d rows instead of one: %s\n", ntups), ntups, query->data); - exit_nicely(); + exit_nicely(1); } i_amname = PQfnumber(res, "amname"); @@ -10822,7 +10821,7 @@ dumpCollation(Archive *fout, CollInfo *collinfo) "query returned %d rows instead of one: %s\n", ntups), ntups, query->data); - exit_nicely(); + exit_nicely(1); } i_collcollate = PQfnumber(res, "collcollate"); @@ -10927,7 +10926,7 @@ dumpConversion(Archive *fout, ConvInfo *convinfo) "query returned %d rows instead of one: %s\n", ntups), ntups, query->data); - exit_nicely(); + exit_nicely(1); } i_conforencoding = PQfnumber(res, "conforencoding"); @@ -11127,7 +11126,7 @@ dumpAgg(Archive *fout, AggInfo *agginfo) "query returned %d rows instead of one: %s\n", ntups), ntups, query->data); - exit_nicely(); + exit_nicely(1); } i_aggtransfn = PQfnumber(res, "aggtransfn"); @@ -11368,7 +11367,7 @@ dumpTSDictionary(Archive *fout, TSDictInfo *dictinfo) "query returned %d rows instead of one: %s\n", ntups), ntups, query->data); - exit_nicely(); + exit_nicely(1); } nspname = PQgetvalue(res, 0, 0); tmplname = PQgetvalue(res, 0, 1); @@ -11534,7 +11533,7 @@ dumpTSConfig(Archive *fout, TSConfigInfo *cfginfo) "query returned %d rows instead of one: %s\n", ntups), ntups, query->data); - exit_nicely(); + exit_nicely(1); } nspname = PQgetvalue(res, 0, 0); prsname = PQgetvalue(res, 0, 1); @@ -11753,7 +11752,7 @@ dumpForeignServer(Archive *fout, ForeignServerInfo *srvinfo) "query returned %d rows instead of one: %s\n", ntups), ntups, query->data); - exit_nicely(); + exit_nicely(1); } fdwname = PQgetvalue(res, 0, 0); @@ -11952,7 +11951,7 @@ dumpDefaultACL(Archive *fout, DefaultACLInfo *daclinfo) /* shouldn't get here */ write_msg(NULL, "unknown object type (%d) in default privileges\n", (int) daclinfo->defaclobjtype); - exit_nicely(); + exit_nicely(1); type = ""; /* keep compiler quiet */ } @@ -11969,7 +11968,7 @@ dumpDefaultACL(Archive *fout, DefaultACLInfo *daclinfo) { write_msg(NULL, "could not parse default ACL list (%s)\n", daclinfo->defaclacl); - exit_nicely(); + exit_nicely(1); } ArchiveEntry(fout, daclinfo->dobj.catId, daclinfo->dobj.dumpId, @@ -12026,7 +12025,7 @@ dumpACL(Archive *fout, CatalogId objCatId, DumpId objDumpId, { write_msg(NULL, "could not parse ACL list (%s) for object \"%s\" 
(%s)\n", acls, name, type); - exit_nicely(); + exit_nicely(1); } if (sql->len > 0) @@ -12472,7 +12471,7 @@ dumpTableSchema(Archive *fout, TableInfo *tbinfo) else write_msg(NULL, "query to obtain definition of view \"%s\" returned more than one definition\n", tbinfo->dobj.name); - exit_nicely(); + exit_nicely(1); } viewdef = PQgetvalue(res, 0, 0); @@ -12481,7 +12480,7 @@ dumpTableSchema(Archive *fout, TableInfo *tbinfo) { write_msg(NULL, "definition of view \"%s\" appears to be empty (length zero)\n", tbinfo->dobj.name); - exit_nicely(); + exit_nicely(1); } /* @@ -12537,7 +12536,7 @@ dumpTableSchema(Archive *fout, TableInfo *tbinfo) "query returned %d foreign server entries for foreign table \"%s\"\n", PQntuples(res)), PQntuples(res), tbinfo->dobj.name); - exit_nicely(); + exit_nicely(1); } i_srvname = PQfnumber(res, "srvname"); i_ftoptions = PQfnumber(res, "ftoptions"); @@ -13102,7 +13101,7 @@ getAttrName(int attrnum, TableInfo *tblInfo) } write_msg(NULL, "invalid column number %d for table \"%s\"\n", attrnum, tblInfo->dobj.name); - exit_nicely(); + exit_nicely(1); return NULL; /* keep compiler quiet */ } @@ -13214,7 +13213,7 @@ dumpConstraint(Archive *fout, ConstraintInfo *coninfo) { write_msg(NULL, "missing index for constraint \"%s\"\n", coninfo->dobj.name); - exit_nicely(); + exit_nicely(1); } if (binary_upgrade) @@ -13402,7 +13401,7 @@ dumpConstraint(Archive *fout, ConstraintInfo *coninfo) else { write_msg(NULL, "unrecognized constraint type: %c\n", coninfo->contype); - exit_nicely(); + exit_nicely(1); } /* Dump Constraint Comments --- only works for table constraints */ @@ -13464,12 +13463,12 @@ findLastBuiltinOid_V71(Archive *fout, const char *dbname) if (ntups < 1) { write_msg(NULL, "missing pg_database entry for this database\n"); - exit_nicely(); + exit_nicely(1); } if (ntups > 1) { write_msg(NULL, "found more than one pg_database entry for this database\n"); - exit_nicely(); + exit_nicely(1); } last_oid = atooid(PQgetvalue(res, 0, PQfnumber(res, "datlastsysoid"))); PQclear(res); @@ -13499,12 +13498,12 @@ findLastBuiltinOid_V70(Archive *fout) if (ntups < 1) { write_msg(NULL, "could not find entry for pg_indexes in pg_class\n"); - exit_nicely(); + exit_nicely(1); } if (ntups > 1) { write_msg(NULL, "found more than one entry for pg_indexes in pg_class\n"); - exit_nicely(); + exit_nicely(1); } last_oid = atooid(PQgetvalue(res, 0, PQfnumber(res, "oid"))); PQclear(res); @@ -13578,7 +13577,7 @@ dumpSequence(Archive *fout, TableInfo *tbinfo) "query to get data of sequence \"%s\" returned %d rows (expected 1)\n", PQntuples(res)), tbinfo->dobj.name, PQntuples(res)); - exit_nicely(); + exit_nicely(1); } /* Disable this check: it fails if sequence has been renamed */ @@ -13587,7 +13586,7 @@ dumpSequence(Archive *fout, TableInfo *tbinfo) { write_msg(NULL, "query to get data of sequence \"%s\" returned name \"%s\"\n", tbinfo->dobj.name, PQgetvalue(res, 0, 0)); - exit_nicely(); + exit_nicely(1); } #endif @@ -13816,7 +13815,7 @@ dumpTrigger(Archive *fout, TriggerInfo *tginfo) else { write_msg(NULL, "unexpected tgtype value: %d\n", tginfo->tgtype); - exit_nicely(); + exit_nicely(1); } findx = 0; @@ -13901,7 +13900,7 @@ dumpTrigger(Archive *fout, TriggerInfo *tginfo) tginfo->tgargs, tginfo->dobj.name, tbinfo->dobj.name); - exit_nicely(); + exit_nicely(1); } if (findx > 0) @@ -14016,7 +14015,7 @@ dumpRule(Archive *fout, RuleInfo *rinfo) { write_msg(NULL, "query to get rule \"%s\" for table \"%s\" failed: wrong number of rows returned\n", rinfo->dobj.name, tbinfo->dobj.name); - exit_nicely(); + 
exit_nicely(1); } printfPQExpBuffer(cmd, "%s\n", PQgetvalue(res, 0, 0)); @@ -14474,7 +14473,7 @@ getFormattedTypeName(Archive *fout, Oid oid, OidOptions opts) "query returned %d rows instead of one: %s\n", ntups), ntups, query->data); - exit_nicely(); + exit_nicely(1); } if (fout->remoteVersion >= 70100) diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h index 421847e5d5aa1..28a7fe4e13097 100644 --- a/src/bin/pg_dump/pg_dump.h +++ b/src/bin/pg_dump/pg_dump.h @@ -518,8 +518,6 @@ extern void simple_string_list_append(SimpleStringList *list, const char *val); extern bool simple_oid_list_member(SimpleOidList *list, Oid val); extern bool simple_string_list_member(SimpleStringList *list, const char *val); -extern void exit_nicely(void) __attribute__((noreturn)); - extern void parseOidArray(const char *str, Oid *array, int arraysize); extern void sortDumpableObjects(DumpableObject **objs, int numObjs); diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c index 4c9366738ab82..4f8dd600686d9 100644 --- a/src/bin/pg_dump/pg_dumpall.c +++ b/src/bin/pg_dump/pg_dumpall.c @@ -151,12 +151,12 @@ main(int argc, char *argv[]) if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) { help(); - exit(0); + exit_nicely(0); } if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) { puts("pg_dumpall (PostgreSQL) " PG_VERSION); - exit(0); + exit_nicely(0); } } @@ -181,7 +181,7 @@ main(int argc, char *argv[]) "but was not the same version as %s.\n" "Check your installation.\n"), full_path, progname); - exit(1); + exit_nicely(1); } pgdumpopts = createPQExpBuffer(); @@ -296,7 +296,7 @@ main(int argc, char *argv[]) default: fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); - exit(1); + exit_nicely(1); } } @@ -307,7 +307,7 @@ main(int argc, char *argv[]) progname, argv[optind]); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); - exit(1); + exit_nicely(1); } /* Make sure the user hasn't specified a mix of globals-only options */ @@ -317,7 +317,7 @@ main(int argc, char *argv[]) progname); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); - exit(1); + exit_nicely(1); } if (globals_only && tablespaces_only) @@ -326,7 +326,7 @@ main(int argc, char *argv[]) progname); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); - exit(1); + exit_nicely(1); } if (roles_only && tablespaces_only) @@ -335,7 +335,7 @@ main(int argc, char *argv[]) progname); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); - exit(1); + exit_nicely(1); } /* Add long options to the pg_dump argument list */ @@ -375,7 +375,7 @@ main(int argc, char *argv[]) { fprintf(stderr, _("%s: could not connect to database \"%s\"\n"), progname, pgdb); - exit(1); + exit_nicely(1); } } else @@ -393,7 +393,7 @@ main(int argc, char *argv[]) progname); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); - exit(1); + exit_nicely(1); } } @@ -407,7 +407,7 @@ main(int argc, char *argv[]) { fprintf(stderr, _("%s: could not open the output file \"%s\": %s\n"), progname, filename, strerror(errno)); - exit(1); + exit_nicely(1); } } else @@ -525,7 +525,7 @@ main(int argc, char *argv[]) if (filename) fclose(OPF); - exit(0); + exit_nicely(0); } @@ -1068,7 +1068,7 @@ dumpTablespaces(PGconn *conn) fprintf(stderr, _("%s: could not parse ACL list (%s) for tablespace \"%s\"\n"), progname, spcacl, fspcname); PQfinish(conn); - exit(1); + exit_nicely(1); } if (spccomment && strlen(spccomment)) 
@@ -1372,7 +1372,7 @@ dumpCreateDB(PGconn *conn) fprintf(stderr, _("%s: could not parse ACL list (%s) for database \"%s\"\n"), progname, dbacl, fdbname); PQfinish(conn); - exit(1); + exit_nicely(1); } fprintf(OPF, "%s", buf->data); @@ -1587,7 +1587,7 @@ dumpDatabases(PGconn *conn) if (ret != 0) { fprintf(stderr, _("%s: pg_dump failed on database \"%s\", exiting\n"), progname, dbname); - exit(1); + exit_nicely(1); } if (filename) @@ -1597,7 +1597,7 @@ dumpDatabases(PGconn *conn) { fprintf(stderr, _("%s: could not re-open the output file \"%s\": %s\n"), progname, filename, strerror(errno)); - exit(1); + exit_nicely(1); } } @@ -1724,7 +1724,7 @@ connectDatabase(const char *dbname, const char *pghost, const char *pgport, { fprintf(stderr, _("%s: could not connect to database \"%s\"\n"), progname, dbname); - exit(1); + exit_nicely(1); } if (PQstatus(conn) == CONNECTION_BAD && @@ -1746,7 +1746,7 @@ connectDatabase(const char *dbname, const char *pghost, const char *pgport, fprintf(stderr, _("%s: could not connect to database \"%s\": %s\n"), progname, dbname, PQerrorMessage(conn)); - exit(1); + exit_nicely(1); } else { @@ -1759,14 +1759,14 @@ connectDatabase(const char *dbname, const char *pghost, const char *pgport, if (!remoteversion_str) { fprintf(stderr, _("%s: could not get server version\n"), progname); - exit(1); + exit_nicely(1); } server_version = parse_version(remoteversion_str); if (server_version < 0) { fprintf(stderr, _("%s: could not parse server version \"%s\"\n"), progname, remoteversion_str); - exit(1); + exit_nicely(1); } my_version = parse_version(PG_VERSION); @@ -1774,7 +1774,7 @@ connectDatabase(const char *dbname, const char *pghost, const char *pgport, { fprintf(stderr, _("%s: could not parse version \"%s\"\n"), progname, PG_VERSION); - exit(1); + exit_nicely(1); } /* @@ -1788,7 +1788,7 @@ connectDatabase(const char *dbname, const char *pghost, const char *pgport, fprintf(stderr, _("server version: %s; %s version: %s\n"), remoteversion_str, progname, PG_VERSION); fprintf(stderr, _("aborting because of server version mismatch\n")); - exit(1); + exit_nicely(1); } /* @@ -1822,7 +1822,7 @@ executeQuery(PGconn *conn, const char *query) fprintf(stderr, _("%s: query was: %s\n"), progname, query); PQfinish(conn); - exit(1); + exit_nicely(1); } return res; @@ -1848,7 +1848,7 @@ executeCommand(PGconn *conn, const char *query) fprintf(stderr, _("%s: query was: %s\n"), progname, query); PQfinish(conn); - exit(1); + exit_nicely(1); } PQclear(res); diff --git a/src/bin/pg_dump/pg_restore.c b/src/bin/pg_dump/pg_restore.c index 6ff1ab89040b3..1c026cf91a5bc 100644 --- a/src/bin/pg_dump/pg_restore.c +++ b/src/bin/pg_dump/pg_restore.c @@ -138,12 +138,12 @@ main(int argc, char **argv) if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) { usage(progname); - exit(0); + exit_nicely(0); } if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) { puts("pg_restore (PostgreSQL) " PG_VERSION); - exit(0); + exit_nicely(0); } } @@ -279,7 +279,7 @@ main(int argc, char **argv) default: fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); - exit(1); + exit_nicely(1); } } @@ -296,21 +296,21 @@ main(int argc, char **argv) progname, argv[optind]); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); - exit(1); + exit_nicely(1); } if (opts->dataOnly && opts->schemaOnly) { fprintf(stderr, _("%s: options -s/--schema-only and -a/--data-only cannot be used together\n"), progname); - exit(1); + exit_nicely(1); } if ((opts->dataOnly || 
opts->schemaOnly) && (opts->dumpSections != DUMP_UNSECTIONED)) { fprintf(stderr, _("%s: options -s/--schema-only and -a/--data-only cannot be used with --section\n"), progname); - exit(1); + exit_nicely(1); } if (opts->dataOnly) @@ -332,7 +332,7 @@ main(int argc, char **argv) progname); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); - exit(1); + exit_nicely(1); } opts->useDB = 1; } @@ -342,7 +342,7 @@ main(int argc, char **argv) { fprintf(stderr, _("%s: cannot specify both --single-transaction and multiple jobs\n"), progname); - exit(1); + exit_nicely(1); } opts->disable_triggers = disable_triggers; @@ -378,7 +378,7 @@ main(int argc, char **argv) default: write_msg(NULL, "unrecognized archive format \"%s\"; please specify \"c\", \"d\", or \"t\"\n", opts->formatName); - exit(1); + exit_nicely(1); } } From 549e93c990575b2b69c49fcacc0f280bc9762db9 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 16 Feb 2012 12:07:06 -0500 Subject: [PATCH 004/129] Refactor pg_dump.c to avoid duplicating returns-one-row check. Any patches apt to get broken have probably already been broken by the error-handling cleanups I just did, so we might as well clean this up at the same time. --- src/bin/pg_dump/pg_dump.c | 252 +++++++------------------------------- 1 file changed, 41 insertions(+), 211 deletions(-) diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 6eddc63e28be2..35530f7d7dfb3 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -257,6 +257,7 @@ static void binary_upgrade_extension_member(PQExpBuffer upgrade_buffer, const char *objlabel); static const char *getAttrName(int attrnum, TableInfo *tblInfo); static const char *fmtCopyColumnList(const TableInfo *ti); +static PGresult *ExecuteSqlQueryForSingleRow(Archive *fout, char *query); int main(int argc, char **argv) @@ -2446,7 +2447,6 @@ binary_upgrade_set_type_oids_by_type_oid(Archive *fout, Oid pg_type_oid) { PQExpBuffer upgrade_query = createPQExpBuffer(); - int ntups; PGresult *upgrade_res; Oid pg_type_array_oid; @@ -2462,18 +2462,7 @@ binary_upgrade_set_type_oids_by_type_oid(Archive *fout, "WHERE pg_type.oid = '%u'::pg_catalog.oid;", pg_type_oid); - upgrade_res = ExecuteSqlQuery(fout, upgrade_query->data, PGRES_TUPLES_OK); - - /* Expecting a single result only */ - ntups = PQntuples(upgrade_res); - if (ntups != 1) - { - write_msg(NULL, ngettext("query returned %d row instead of one: %s\n", - "query returned %d rows instead of one: %s\n", - ntups), - ntups, upgrade_query->data); - exit_nicely(1); - } + upgrade_res = ExecuteSqlQueryForSingleRow(fout, upgrade_query->data); pg_type_array_oid = atooid(PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "typarray"))); @@ -2496,7 +2485,6 @@ binary_upgrade_set_type_oids_by_rel_oid(Archive *fout, Oid pg_rel_oid) { PQExpBuffer upgrade_query = createPQExpBuffer(); - int ntups; PGresult *upgrade_res; Oid pg_type_oid; bool toast_set = false; @@ -2510,18 +2498,7 @@ binary_upgrade_set_type_oids_by_rel_oid(Archive *fout, "WHERE c.oid = '%u'::pg_catalog.oid;", pg_rel_oid); - upgrade_res = ExecuteSqlQuery(fout, upgrade_query->data, PGRES_TUPLES_OK); - - /* Expecting a single result only */ - ntups = PQntuples(upgrade_res); - if (ntups != 1) - { - write_msg(NULL, ngettext("query returned %d row instead of one: %s\n", - "query returned %d rows instead of one: %s\n", - ntups), - ntups, upgrade_query->data); - exit_nicely(1); - } + upgrade_res = ExecuteSqlQueryForSingleRow(fout, upgrade_query->data); pg_type_oid = atooid(PQgetvalue(upgrade_res, 0, 
PQfnumber(upgrade_res, "crel"))); @@ -2554,7 +2531,6 @@ binary_upgrade_set_pg_class_oids(Archive *fout, bool is_index) { PQExpBuffer upgrade_query = createPQExpBuffer(); - int ntups; PGresult *upgrade_res; Oid pg_class_reltoastrelid; Oid pg_class_reltoastidxid; @@ -2566,18 +2542,7 @@ binary_upgrade_set_pg_class_oids(Archive *fout, "WHERE c.oid = '%u'::pg_catalog.oid;", pg_class_oid); - upgrade_res = ExecuteSqlQuery(fout, upgrade_query->data, PGRES_TUPLES_OK); - - /* Expecting a single result only */ - ntups = PQntuples(upgrade_res); - if (ntups != 1) - { - write_msg(NULL, ngettext("query returned %d row instead of one: %s\n", - "query returned %d rows instead of one: %s\n", - ntups), - ntups, upgrade_query->data); - exit_nicely(1); - } + upgrade_res = ExecuteSqlQueryForSingleRow(fout, upgrade_query->data); pg_class_reltoastrelid = atooid(PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "reltoastrelid"))); pg_class_reltoastidxid = atooid(PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "reltoastidxid"))); @@ -7807,7 +7772,6 @@ dumpBaseType(Archive *fout, TypeInfo *tyinfo) PQExpBuffer labelq = createPQExpBuffer(); PQExpBuffer query = createPQExpBuffer(); PGresult *res; - int ntups; char *typlen; char *typinput; char *typoutput; @@ -8008,18 +7972,7 @@ dumpBaseType(Archive *fout, TypeInfo *tyinfo) tyinfo->dobj.catId.oid); } - res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); - - /* Expecting a single result only */ - ntups = PQntuples(res); - if (ntups != 1) - { - write_msg(NULL, ngettext("query returned %d row instead of one: %s\n", - "query returned %d rows instead of one: %s\n", - ntups), - ntups, query->data); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, query->data); typlen = PQgetvalue(res, 0, PQfnumber(res, "typlen")); typinput = PQgetvalue(res, 0, PQfnumber(res, "typinput")); @@ -8201,7 +8154,6 @@ dumpDomain(Archive *fout, TypeInfo *tyinfo) PQExpBuffer labelq = createPQExpBuffer(); PQExpBuffer query = createPQExpBuffer(); PGresult *res; - int ntups; int i; char *typnotnull; char *typdefn; @@ -8239,18 +8191,7 @@ dumpDomain(Archive *fout, TypeInfo *tyinfo) tyinfo->dobj.catId.oid); } - res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); - - /* Expecting a single result only */ - ntups = PQntuples(res); - if (ntups != 1) - { - write_msg(NULL, ngettext("query returned %d row instead of one: %s\n", - "query returned %d rows instead of one: %s\n", - ntups), - ntups, query->data); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, query->data); typnotnull = PQgetvalue(res, 0, PQfnumber(res, "typnotnull")); typdefn = PQgetvalue(res, 0, PQfnumber(res, "typdefn")); @@ -9056,7 +8997,6 @@ dumpFunc(Archive *fout, FuncInfo *finfo) char *funcsig; /* identity signature */ char *funcfullsig; /* full signature */ char *funcsig_tag; - int ntups; char *proretset; char *prosrc; char *probin; @@ -9231,18 +9171,7 @@ dumpFunc(Archive *fout, FuncInfo *finfo) finfo->dobj.catId.oid); } - res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); - - /* Expecting a single result only */ - ntups = PQntuples(res); - if (ntups != 1) - { - write_msg(NULL, ngettext("query returned %d row instead of one: %s\n", - "query returned %d rows instead of one: %s\n", - ntups), - ntups, query->data); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, query->data); proretset = PQgetvalue(res, 0, PQfnumber(res, "proretset")); prosrc = PQgetvalue(res, 0, PQfnumber(res, "prosrc")); @@ -9685,7 +9614,6 @@ dumpOpr(Archive *fout, OprInfo *oprinfo) PQExpBuffer details; const 
char *name; PGresult *res; - int ntups; int i_oprkind; int i_oprcode; int i_oprleft; @@ -9788,18 +9716,7 @@ dumpOpr(Archive *fout, OprInfo *oprinfo) oprinfo->dobj.catId.oid); } - res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); - - /* Expecting a single result only */ - ntups = PQntuples(res); - if (ntups != 1) - { - write_msg(NULL, ngettext("query returned %d row instead of one: %s\n", - "query returned %d rows instead of one: %s\n", - ntups), - ntups, query->data); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, query->data); i_oprkind = PQfnumber(res, "oprkind"); i_oprcode = PQfnumber(res, "oprcode"); @@ -10038,21 +9955,10 @@ convertTSFunction(Archive *fout, Oid funcOid) char *result; char query[128]; PGresult *res; - int ntups; snprintf(query, sizeof(query), "SELECT '%u'::pg_catalog.regproc", funcOid); - res = ExecuteSqlQuery(fout, query, PGRES_TUPLES_OK); - - ntups = PQntuples(res); - if (ntups != 1) - { - write_msg(NULL, ngettext("query returned %d row instead of one: %s\n", - "query returned %d rows instead of one: %s\n", - ntups), - ntups, query); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, query); result = pg_strdup(PQgetvalue(res, 0, 0)); @@ -10158,18 +10064,7 @@ dumpOpclass(Archive *fout, OpclassInfo *opcinfo) opcinfo->dobj.catId.oid); } - res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); - - /* Expecting a single result only */ - ntups = PQntuples(res); - if (ntups != 1) - { - write_msg(NULL, ngettext("query returned %d row instead of one: %s\n", - "query returned %d rows instead of one: %s\n", - ntups), - ntups, query->data); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, query->data); i_opcintype = PQfnumber(res, "opcintype"); i_opckeytype = PQfnumber(res, "opckeytype"); @@ -10626,18 +10521,7 @@ dumpOpfamily(Archive *fout, OpfamilyInfo *opfinfo) "WHERE oid = '%u'::pg_catalog.oid", opfinfo->dobj.catId.oid); - res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); - - /* Expecting a single result only */ - ntups = PQntuples(res); - if (ntups != 1) - { - write_msg(NULL, ngettext("query returned %d row instead of one: %s\n", - "query returned %d rows instead of one: %s\n", - ntups), - ntups, query->data); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, query->data); i_amname = PQfnumber(res, "amname"); @@ -10785,7 +10669,6 @@ dumpCollation(Archive *fout, CollInfo *collinfo) PQExpBuffer delq; PQExpBuffer labelq; PGresult *res; - int ntups; int i_collcollate; int i_collctype; const char *collcollate; @@ -10811,18 +10694,7 @@ dumpCollation(Archive *fout, CollInfo *collinfo) "WHERE c.oid = '%u'::pg_catalog.oid", collinfo->dobj.catId.oid); - res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); - - /* Expecting a single result only */ - ntups = PQntuples(res); - if (ntups != 1) - { - write_msg(NULL, ngettext("query returned %d row instead of one: %s\n", - "query returned %d rows instead of one: %s\n", - ntups), - ntups, query->data); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, query->data); i_collcollate = PQfnumber(res, "collcollate"); i_collctype = PQfnumber(res, "collctype"); @@ -10885,7 +10757,6 @@ dumpConversion(Archive *fout, ConvInfo *convinfo) PQExpBuffer delq; PQExpBuffer labelq; PGresult *res; - int ntups; int i_conforencoding; int i_contoencoding; int i_conproc; @@ -10916,18 +10787,7 @@ dumpConversion(Archive *fout, ConvInfo *convinfo) "WHERE c.oid = '%u'::pg_catalog.oid", convinfo->dobj.catId.oid); - res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); 
- - /* Expecting a single result only */ - ntups = PQntuples(res); - if (ntups != 1) - { - write_msg(NULL, ngettext("query returned %d row instead of one: %s\n", - "query returned %d rows instead of one: %s\n", - ntups), - ntups, query->data); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, query->data); i_conforencoding = PQfnumber(res, "conforencoding"); i_contoencoding = PQfnumber(res, "contoencoding"); @@ -11040,7 +10900,6 @@ dumpAgg(Archive *fout, AggInfo *agginfo) char *aggsig; char *aggsig_tag; PGresult *res; - int ntups; int i_aggtransfn; int i_aggfinalfn; int i_aggsortop; @@ -11116,18 +10975,7 @@ dumpAgg(Archive *fout, AggInfo *agginfo) agginfo->aggfn.dobj.catId.oid); } - res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); - - /* Expecting a single result only */ - ntups = PQntuples(res); - if (ntups != 1) - { - write_msg(NULL, ngettext("query returned %d row instead of one: %s\n", - "query returned %d rows instead of one: %s\n", - ntups), - ntups, query->data); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, query->data); i_aggtransfn = PQfnumber(res, "aggtransfn"); i_aggfinalfn = PQfnumber(res, "aggfinalfn"); @@ -11340,7 +11188,6 @@ dumpTSDictionary(Archive *fout, TSDictInfo *dictinfo) PQExpBuffer labelq; PQExpBuffer query; PGresult *res; - int ntups; char *nspname; char *tmplname; @@ -11359,16 +11206,7 @@ dumpTSDictionary(Archive *fout, TSDictInfo *dictinfo) "FROM pg_ts_template p, pg_namespace n " "WHERE p.oid = '%u' AND n.oid = tmplnamespace", dictinfo->dicttemplate); - res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); - ntups = PQntuples(res); - if (ntups != 1) - { - write_msg(NULL, ngettext("query returned %d row instead of one: %s\n", - "query returned %d rows instead of one: %s\n", - ntups), - ntups, query->data); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, query->data); nspname = PQgetvalue(res, 0, 0); tmplname = PQgetvalue(res, 0, 1); @@ -11525,16 +11363,7 @@ dumpTSConfig(Archive *fout, TSConfigInfo *cfginfo) "FROM pg_ts_parser p, pg_namespace n " "WHERE p.oid = '%u' AND n.oid = prsnamespace", cfginfo->cfgparser); - res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); - ntups = PQntuples(res); - if (ntups != 1) - { - write_msg(NULL, ngettext("query returned %d row instead of one: %s\n", - "query returned %d rows instead of one: %s\n", - ntups), - ntups, query->data); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, query->data); nspname = PQgetvalue(res, 0, 0); prsname = PQgetvalue(res, 0, 1); @@ -11723,7 +11552,6 @@ dumpForeignServer(Archive *fout, ForeignServerInfo *srvinfo) PQExpBuffer labelq; PQExpBuffer query; PGresult *res; - int ntups; char *qsrvname; char *fdwname; @@ -11744,16 +11572,7 @@ dumpForeignServer(Archive *fout, ForeignServerInfo *srvinfo) "FROM pg_foreign_data_wrapper w " "WHERE w.oid = '%u'", srvinfo->srvfdw); - res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); - ntups = PQntuples(res); - if (ntups != 1) - { - write_msg(NULL, ngettext("query returned %d row instead of one: %s\n", - "query returned %d rows instead of one: %s\n", - ntups), - ntups, query->data); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, query->data); fdwname = PQgetvalue(res, 0, 0); appendPQExpBuffer(q, "CREATE SERVER %s", qsrvname); @@ -14430,7 +14249,6 @@ getFormattedTypeName(Archive *fout, Oid oid, OidOptions opts) char *result; PQExpBuffer query; PGresult *res; - int ntups; if (oid == 0) { @@ -14463,18 +14281,7 @@ getFormattedTypeName(Archive *fout, Oid oid, 
OidOptions opts) oid); } - res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); - - /* Expecting a single result only */ - ntups = PQntuples(res); - if (ntups != 1) - { - write_msg(NULL, ngettext("query returned %d row instead of one: %s\n", - "query returned %d rows instead of one: %s\n", - ntups), - ntups, query->data); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, query->data); if (fout->remoteVersion >= 70100) { @@ -14630,3 +14437,26 @@ fmtCopyColumnList(const TableInfo *ti) appendPQExpBuffer(q, ")"); return q->data; } + +/* + * Execute an SQL query and verify that we got exactly one row back. + */ +static PGresult * +ExecuteSqlQueryForSingleRow(Archive *fout, char *query) +{ + PGresult *res; + int ntups; + + res = ExecuteSqlQuery(fout, query, PGRES_TUPLES_OK); + + /* Expecting a single result only */ + ntups = PQntuples(res); + if (ntups != 1) + exit_horribly(NULL, + ngettext("query returned %d row instead of one: %s\n", + "query returned %d rows instead of one: %s\n", + ntups), + ntups, query); + + return res; +} From 689d0eb7db9514f33f49fd5260462b9ba8331e80 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 16 Feb 2012 13:00:24 -0500 Subject: [PATCH 005/129] pg_dump: Remove global connection pointer. Parallel pg_dump wants to have multiple ArchiveHandle objects, and therefore multiple PGconns, in play at the same time. This should be just about the end of the refactoring that we need in order to make that workable. --- src/bin/pg_dump/pg_backup.h | 3 +- src/bin/pg_dump/pg_backup_db.c | 11 +++++-- src/bin/pg_dump/pg_dump.c | 53 +++++++++++++++++----------------- 3 files changed, 37 insertions(+), 30 deletions(-) diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h index d12bd7fa792cc..ff0db467fe1e3 100644 --- a/src/bin/pg_dump/pg_backup.h +++ b/src/bin/pg_dump/pg_backup.h @@ -159,13 +159,14 @@ typedef struct _restoreOptions * Main archiver interface. */ -extern PGconn *ConnectDatabase(Archive *AH, +extern void ConnectDatabase(Archive *AH, const char *dbname, const char *pghost, const char *pgport, const char *username, enum trivalue prompt_password); extern void DisconnectDatabase(Archive *AHX); +extern PGconn *GetConnection(Archive *AHX); /* Called to add a TOC entry */ extern void ArchiveEntry(Archive *AHX, diff --git a/src/bin/pg_dump/pg_backup_db.c b/src/bin/pg_dump/pg_backup_db.c index 31f6d8d94da8f..a843eacc0c842 100644 --- a/src/bin/pg_dump/pg_backup_db.c +++ b/src/bin/pg_dump/pg_backup_db.c @@ -225,7 +225,7 @@ _connectDB(ArchiveHandle *AH, const char *reqdb, const char *requser) * cache if the username keeps changing. In current usage, however, the * username never does change, so one savedPassword is sufficient. */ -PGconn * +void ConnectDatabase(Archive *AHX, const char *dbname, const char *pghost, @@ -306,8 +306,6 @@ ConnectDatabase(Archive *AHX, _check_database_version(AH); PQsetNoticeProcessor(AH->connection, notice_processor, NULL); - - return AH->connection; } void @@ -319,6 +317,13 @@ DisconnectDatabase(Archive *AHX) AH->connection = NULL; } +PGconn * +GetConnection(Archive *AHX) +{ + ArchiveHandle *AH = (ArchiveHandle *) AHX; + + return AH->connection; +} static void notice_processor(void *arg, const char *message) diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 35530f7d7dfb3..c1b16f5e773c3 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -86,7 +86,6 @@ typedef struct /* global decls */ bool g_verbose; /* User wants verbose narration of our * activities. 
*/ -PGconn *g_conn; /* the database connection */ /* various user-settable parameters */ bool schemaOnly; @@ -614,9 +613,7 @@ main(int argc, char **argv) * Open the database using the Archiver, so it knows about it. Errors mean * death. */ - g_conn = ConnectDatabase(fout, dbname, pghost, pgport, - username, prompt_password); - + ConnectDatabase(fout, dbname, pghost, pgport, username, prompt_password); setup_connection(fout, dumpencoding, use_role); /* @@ -657,7 +654,8 @@ main(int argc, char **argv) if (fout->remoteVersion < 70300) { if (fout->remoteVersion >= 70100) - g_last_builtin_oid = findLastBuiltinOid_V71(fout, PQdb(g_conn)); + g_last_builtin_oid = findLastBuiltinOid_V71(fout, + PQdb(GetConnection(fout))); else g_last_builtin_oid = findLastBuiltinOid_V70(fout); if (g_verbose) @@ -870,12 +868,13 @@ pgdump_cleanup_at_exit(int code, void *arg) static void setup_connection(Archive *AH, const char *dumpencoding, char *use_role) { + PGconn *conn = GetConnection(AH); const char *std_strings; /* Set the client encoding if requested */ if (dumpencoding) { - if (PQsetClientEncoding(g_conn, dumpencoding) < 0) + if (PQsetClientEncoding(conn, dumpencoding) < 0) { write_msg(NULL, "invalid client encoding \"%s\" specified\n", dumpencoding); @@ -887,9 +886,9 @@ setup_connection(Archive *AH, const char *dumpencoding, char *use_role) * Get the active encoding and the standard_conforming_strings setting, so * we know how to escape strings. */ - AH->encoding = PQclientEncoding(g_conn); + AH->encoding = PQclientEncoding(conn); - std_strings = PQparameterStatus(g_conn, "standard_conforming_strings"); + std_strings = PQparameterStatus(conn, "standard_conforming_strings"); AH->std_strings = (std_strings && strcmp(std_strings, "on") == 0); /* Set the role if requested */ @@ -1018,9 +1017,8 @@ expand_schema_name_patterns(Archive *fout, appendPQExpBuffer(query, "UNION ALL\n"); appendPQExpBuffer(query, "SELECT oid FROM pg_catalog.pg_namespace n\n"); - processSQLNamePattern(g_conn, query, cell->val, false, false, - NULL, "n.nspname", NULL, - NULL); + processSQLNamePattern(GetConnection(fout), query, cell->val, false, + false, NULL, "n.nspname", NULL, NULL); } res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); @@ -1068,8 +1066,8 @@ expand_table_name_patterns(Archive *fout, "\nWHERE c.relkind in ('%c', '%c', '%c', '%c')\n", RELKIND_RELATION, RELKIND_SEQUENCE, RELKIND_VIEW, RELKIND_FOREIGN_TABLE); - processSQLNamePattern(g_conn, query, cell->val, true, false, - "n.nspname", "c.relname", NULL, + processSQLNamePattern(GetConnection(fout), query, cell->val, true, + false, "n.nspname", "c.relname", NULL, "pg_catalog.pg_table_is_visible(c.oid)"); } @@ -1266,6 +1264,7 @@ dumpTableData_copy(Archive *fout, void *dcontext) const bool hasoids = tbinfo->hasoids; const bool oids = tdinfo->oids; PQExpBuffer q = createPQExpBuffer(); + PGconn *conn = GetConnection(fout); PGresult *res; int ret; char *copybuf; @@ -1332,7 +1331,7 @@ dumpTableData_copy(Archive *fout, void *dcontext) for (;;) { - ret = PQgetCopyData(g_conn, ©buf, 0); + ret = PQgetCopyData(conn, ©buf, 0); if (ret < 0) break; /* done or error */ @@ -1395,17 +1394,17 @@ dumpTableData_copy(Archive *fout, void *dcontext) { /* copy data transfer failed */ write_msg(NULL, "Dumping the contents of table \"%s\" failed: PQgetCopyData() failed.\n", classname); - write_msg(NULL, "Error message from server: %s", PQerrorMessage(g_conn)); + write_msg(NULL, "Error message from server: %s", PQerrorMessage(conn)); write_msg(NULL, "The command was: %s\n", q->data); 
exit_nicely(1); } /* Check command status and return to normal libpq state */ - res = PQgetResult(g_conn); + res = PQgetResult(conn); if (PQresultStatus(res) != PGRES_COMMAND_OK) { write_msg(NULL, "Dumping the contents of table \"%s\" failed: PQgetResult() failed.\n", classname); - write_msg(NULL, "Error message from server: %s", PQerrorMessage(g_conn)); + write_msg(NULL, "Error message from server: %s", PQerrorMessage(conn)); write_msg(NULL, "The command was: %s\n", q->data); exit_nicely(1); } @@ -1830,6 +1829,7 @@ dumpDatabase(Archive *fout) PQExpBuffer dbQry = createPQExpBuffer(); PQExpBuffer delQry = createPQExpBuffer(); PQExpBuffer creaQry = createPQExpBuffer(); + PGconn *conn = GetConnection(fout); PGresult *res; int ntups; int i_tableoid, @@ -1850,7 +1850,7 @@ dumpDatabase(Archive *fout) *tablespace; uint32 frozenxid; - datname = PQdb(g_conn); + datname = PQdb(conn); if (g_verbose) write_msg(NULL, "saving database definition\n"); @@ -2150,10 +2150,10 @@ dumpDatabase(Archive *fout) { PQExpBuffer seclabelQry = createPQExpBuffer(); - buildShSecLabelQuery(g_conn, "pg_database", dbCatId.oid, seclabelQry); + buildShSecLabelQuery(conn, "pg_database", dbCatId.oid, seclabelQry); res = ExecuteSqlQuery(fout, seclabelQry->data, PGRES_TUPLES_OK); resetPQExpBuffer(seclabelQry); - emitShSecLabels(g_conn, res, seclabelQry, "DATABASE", datname); + emitShSecLabels(conn, res, seclabelQry, "DATABASE", datname); if (strlen(seclabelQry->data)) ArchiveEntry(fout, dbCatId, createDumpId(), datname, NULL, NULL, dba, false, "SECURITY LABEL", SECTION_NONE, @@ -2362,6 +2362,7 @@ dumpBlobs(Archive *fout, void *arg) { const char *blobQry; const char *blobFetchQry; + PGconn *conn = GetConnection(fout); PGresult *res; char buf[LOBBUFSIZE]; int ntups; @@ -2404,11 +2405,11 @@ dumpBlobs(Archive *fout, void *arg) blobOid = atooid(PQgetvalue(res, i, 0)); /* Open the BLOB */ - loFd = lo_open(g_conn, blobOid, INV_READ); + loFd = lo_open(conn, blobOid, INV_READ); if (loFd == -1) { write_msg(NULL, "could not open large object %u: %s", - blobOid, PQerrorMessage(g_conn)); + blobOid, PQerrorMessage(conn)); exit_nicely(1); } @@ -2417,18 +2418,18 @@ dumpBlobs(Archive *fout, void *arg) /* Now read it in chunks, sending data to archive */ do { - cnt = lo_read(g_conn, loFd, buf, LOBBUFSIZE); + cnt = lo_read(conn, loFd, buf, LOBBUFSIZE); if (cnt < 0) { write_msg(NULL, "error reading large object %u: %s", - blobOid, PQerrorMessage(g_conn)); + blobOid, PQerrorMessage(conn)); exit_nicely(1); } WriteData(fout, buf, cnt); } while (cnt > 0); - lo_close(g_conn, loFd); + lo_close(conn, loFd); EndBlob(fout, blobOid); } @@ -4298,7 +4299,7 @@ getTables(Archive *fout, int *numTables) */ resetPQExpBuffer(query); appendPQExpBuffer(query, "SET statement_timeout = "); - appendStringLiteralConn(query, lockWaitTimeout, g_conn); + appendStringLiteralConn(query, lockWaitTimeout, GetConnection(fout)); ExecuteSqlStatement(fout, query->data); } From 1cc1b91d1b09a5cdd9fc51c9eee31effd2227b4f Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 16 Feb 2012 13:24:19 -0500 Subject: [PATCH 006/129] pg_dump: Miscellaneous tightening based on recent refactorings. Use exit_horribly() and ExecuteSqlQueryForSingleRow() in various places where it's equivalent, or nearly equivalent, to the prior coding. Apart from being more compact, this also makes the error messages for the wrong-number-of-tuples case more consistent. 
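
For illustration, the shape of the conversion is the same at nearly
every call site. Both fragments below appear verbatim in the patch
hunks that follow; nothing here is new code:

    /* before: a message call plus a separate exit call at each site */
    if (dataOnly && schemaOnly)
    {
        write_msg(NULL, "options -s/--schema-only and -a/--data-only cannot be used together\n");
        exit_nicely(1);
    }

    /* after: a single exit_horribly() call carrying the same message */
    if (dataOnly && schemaOnly)
        exit_horribly(NULL, "options -s/--schema-only and -a/--data-only cannot be used together\n");
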
--- src/bin/pg_dump/pg_dump.c | 249 +++++++++----------------------------- 1 file changed, 57 insertions(+), 192 deletions(-) diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index c1b16f5e773c3..c3c861d85787f 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -532,16 +532,10 @@ main(int argc, char **argv) dump_inserts = 1; if (dataOnly && schemaOnly) - { - write_msg(NULL, "options -s/--schema-only and -a/--data-only cannot be used together\n"); - exit_nicely(1); - } + exit_horribly(NULL, "options -s/--schema-only and -a/--data-only cannot be used together\n"); if ((dataOnly || schemaOnly) && dumpSections != DUMP_UNSECTIONED) - { - write_msg(NULL, "options -s/--schema-only and -a/--data-only cannot be used with --section\n"); - exit_nicely(1); - } + exit_horribly(NULL, "options -s/--schema-only and -a/--data-only cannot be used with --section\n"); if (dataOnly) dumpSections = DUMP_DATA; @@ -554,10 +548,7 @@ main(int argc, char **argv) } if (dataOnly && outputClean) - { - write_msg(NULL, "options -c/--clean and -a/--data-only cannot be used together\n"); - exit_nicely(1); - } + exit_horribly(NULL, "options -c/--clean and -a/--data-only cannot be used together\n"); if (dump_inserts && oids) { @@ -587,20 +578,14 @@ main(int argc, char **argv) on_exit_nicely(pgdump_cleanup_at_exit, fout); if (fout == NULL) - { - write_msg(NULL, "could not open output file \"%s\" for writing\n", filename); - exit_nicely(1); - } + exit_horribly(NULL, "could not open output file \"%s\" for writing\n", filename); /* Let the archiver know how noisy to be */ fout->verbose = g_verbose; my_version = parse_version(PG_VERSION); if (my_version < 0) - { - write_msg(NULL, "could not parse version string \"%s\"\n", PG_VERSION); - exit_nicely(1); - } + exit_horribly(NULL, "could not parse version string \"%s\"\n", PG_VERSION); /* * We allow the server to be back to 7.0, and up to any minor release of @@ -668,10 +653,7 @@ main(int argc, char **argv) expand_schema_name_patterns(fout, &schema_include_patterns, &schema_include_oids); if (schema_include_oids.head == NULL) - { - write_msg(NULL, "No matching schemas were found\n"); - exit_nicely(1); - } + exit_horribly(NULL, "No matching schemas were found\n"); } expand_schema_name_patterns(fout, &schema_exclude_patterns, &schema_exclude_oids); @@ -683,10 +665,7 @@ main(int argc, char **argv) expand_table_name_patterns(fout, &table_include_patterns, &table_include_oids); if (table_include_oids.head == NULL) - { - write_msg(NULL, "No matching tables were found\n"); - exit_nicely(1); - } + exit_horribly(NULL, "No matching tables were found\n"); } expand_table_name_patterns(fout, &table_exclude_patterns, &table_exclude_oids); @@ -875,11 +854,8 @@ setup_connection(Archive *AH, const char *dumpencoding, char *use_role) if (dumpencoding) { if (PQsetClientEncoding(conn, dumpencoding) < 0) - { - write_msg(NULL, "invalid client encoding \"%s\" specified\n", - dumpencoding); - exit_nicely(1); - } + exit_horribly(NULL, "invalid client encoding \"%s\" specified\n", + dumpencoding); } /* @@ -974,10 +950,7 @@ parseArchiveFormat(const char *format, ArchiveMode *mode) else if (pg_strcasecmp(format, "tar") == 0) archiveFormat = archTar; else - { - write_msg(NULL, "invalid output format \"%s\" specified\n", format); - exit_nicely(1); - } + exit_horribly(NULL, "invalid output format \"%s\" specified\n", format); return archiveFormat; } @@ -999,10 +972,7 @@ expand_schema_name_patterns(Archive *fout, return; /* nothing to do */ if (fout->remoteVersion < 70300) - 
{ - write_msg(NULL, "server version must be at least 7.3 to use schema selection switches\n"); - exit_nicely(1); - } + exit_horribly(NULL, "server version must be at least 7.3 to use schema selection switches\n"); query = createPQExpBuffer(); @@ -1831,7 +1801,6 @@ dumpDatabase(Archive *fout) PQExpBuffer creaQry = createPQExpBuffer(); PGconn *conn = GetConnection(fout); PGresult *res; - int ntups; int i_tableoid, i_oid, i_dba, @@ -1928,23 +1897,7 @@ dumpDatabase(Archive *fout) appendStringLiteralAH(dbQry, datname, fout); } - res = ExecuteSqlQuery(fout, dbQry->data, PGRES_TUPLES_OK); - - ntups = PQntuples(res); - - if (ntups <= 0) - { - write_msg(NULL, "missing pg_database entry for database \"%s\"\n", - datname); - exit_nicely(1); - } - - if (ntups != 1) - { - write_msg(NULL, "query returned more than one (%d) pg_database entry for database \"%s\"\n", - ntups, datname); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, dbQry->data); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -2040,13 +1993,7 @@ dumpDatabase(Archive *fout) "WHERE oid = %u;\n", LargeObjectRelationId); - lo_res = ExecuteSqlQuery(fout, loFrozenQry->data, PGRES_TUPLES_OK); - - if (PQntuples(lo_res) != 1) - { - write_msg(NULL, "dumpDatabase(): could not find pg_largeobject.relfrozenxid\n"); - exit_nicely(1); - } + lo_res = ExecuteSqlQueryForSingleRow(fout, loFrozenQry->data); i_relfrozenxid = PQfnumber(lo_res, "relfrozenxid"); @@ -2078,13 +2025,7 @@ dumpDatabase(Archive *fout) "WHERE oid = %u;\n", LargeObjectMetadataRelationId); - lo_res = ExecuteSqlQuery(fout, loFrozenQry->data, PGRES_TUPLES_OK); - - if (PQntuples(lo_res) != 1) - { - write_msg(NULL, "dumpDatabase(): could not find pg_largeobject_metadata.relfrozenxid\n"); - exit_nicely(1); - } + lo_res = ExecuteSqlQueryForSingleRow(fout, loFrozenQry->data); i_relfrozenxid = PQfnumber(lo_res, "relfrozenxid"); @@ -2407,11 +2348,8 @@ dumpBlobs(Archive *fout, void *arg) /* Open the BLOB */ loFd = lo_open(conn, blobOid, INV_READ); if (loFd == -1) - { - write_msg(NULL, "could not open large object %u: %s", - blobOid, PQerrorMessage(conn)); - exit_nicely(1); - } + exit_horribly(NULL, "could not open large object %u: %s", + blobOid, PQerrorMessage(conn)); StartBlob(fout, blobOid); @@ -2420,11 +2358,8 @@ dumpBlobs(Archive *fout, void *arg) { cnt = lo_read(conn, loFd, buf, LOBBUFSIZE); if (cnt < 0) - { - write_msg(NULL, "error reading large object %u: %s", - blobOid, PQerrorMessage(conn)); - exit_nicely(1); - } + exit_horribly(NULL, "error reading large object %u: %s", + blobOid, PQerrorMessage(conn)); WriteData(fout, buf, cnt); } while (cnt > 0); @@ -2618,10 +2553,7 @@ binary_upgrade_extension_member(PQExpBuffer upgrade_buffer, extobj = NULL; } if (extobj == NULL) - { - write_msg(NULL, "could not find parent extension for %s", objlabel); - exit_nicely(1); - } + exit_horribly(NULL, "could not find parent extension for %s", objlabel); appendPQExpBuffer(upgrade_buffer, "\n-- For binary upgrade, handle extension membership the hard way\n"); @@ -2760,8 +2692,7 @@ findNamespace(Archive *fout, Oid nsoid, Oid objoid) if (nsoid == nsinfo->dobj.catId.oid) return nsinfo; } - write_msg(NULL, "schema with OID %u does not exist\n", nsoid); - exit_nicely(1); + exit_horribly(NULL, "schema with OID %u does not exist\n", nsoid); } else { @@ -5085,12 +5016,8 @@ getRules(Archive *fout, int *numRules) ruletableoid = atooid(PQgetvalue(res, i, i_ruletable)); ruleinfo[i].ruletable = findTableByOid(ruletableoid); if (ruleinfo[i].ruletable == NULL) - { - 
write_msg(NULL, "failed sanity check, parent table OID %u of pg_rewrite entry OID %u not found\n", - ruletableoid, - ruleinfo[i].dobj.catId.oid); - exit_nicely(1); - } + exit_horribly(NULL, "failed sanity check, parent table OID %u of pg_rewrite entry OID %u not found\n", + ruletableoid, ruleinfo[i].dobj.catId.oid); ruleinfo[i].dobj.namespace = ruleinfo[i].ruletable->dobj.namespace; ruleinfo[i].dobj.dump = ruleinfo[i].ruletable->dobj.dump; ruleinfo[i].ev_type = *(PQgetvalue(res, i, i_ev_type)); @@ -5331,12 +5258,10 @@ getTriggers(Archive *fout, TableInfo tblinfo[], int numTables) if (OidIsValid(tginfo[j].tgconstrrelid)) { if (PQgetisnull(res, j, i_tgconstrrelname)) - { - write_msg(NULL, "query produced null referenced table name for foreign key trigger \"%s\" on table \"%s\" (OID of table: %u)\n", - tginfo[j].dobj.name, tbinfo->dobj.name, - tginfo[j].tgconstrrelid); - exit_nicely(1); - } + exit_horribly(NULL, "query produced null referenced table name for foreign key trigger \"%s\" on table \"%s\" (OID of table: %u)\n", + tginfo[j].dobj.name, + tbinfo->dobj.name, + tginfo[j].tgconstrrelid); tginfo[j].tgconstrrelname = pg_strdup(PQgetvalue(res, j, i_tgconstrrelname)); } else @@ -5879,11 +5804,9 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) for (j = 0; j < ntups; j++) { if (j + 1 != atoi(PQgetvalue(res, j, i_attnum))) - { - write_msg(NULL, "invalid column numbering in table \"%s\"\n", - tbinfo->dobj.name); - exit_nicely(1); - } + exit_horribly(NULL, + "invalid column numbering in table \"%s\"\n", + tbinfo->dobj.name); tbinfo->attnames[j] = pg_strdup(PQgetvalue(res, j, i_attname)); tbinfo->atttypnames[j] = pg_strdup(PQgetvalue(res, j, i_atttypname)); tbinfo->atttypmod[j] = atoi(PQgetvalue(res, j, i_atttypmod)); @@ -5967,11 +5890,9 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) adnum = atoi(PQgetvalue(res, j, 2)); if (adnum <= 0 || adnum > ntups) - { - write_msg(NULL, "invalid adnum value %d for table \"%s\"\n", - adnum, tbinfo->dobj.name); - exit_nicely(1); - } + exit_horribly(NULL, + "invalid adnum value %d for table \"%s\"\n", + adnum, tbinfo->dobj.name); /* * dropped columns shouldn't have defaults, but just in case, @@ -7666,13 +7587,7 @@ dumpRangeType(Archive *fout, TypeInfo *tyinfo) "rngtypid = '%u'", tyinfo->dobj.catId.oid); - res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); - if (PQntuples(res) != 1) - { - write_msg(NULL, "query returned %d pg_range entries for range type \"%s\"\n", - PQntuples(res), tyinfo->dobj.name); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, query->data); /* * DROP must be fully qualified in case same name appears in pg_catalog. 
@@ -9346,11 +9261,8 @@ dumpFunc(Archive *fout, FuncInfo *finfo) else if (provolatile[0] == PROVOLATILE_STABLE) appendPQExpBuffer(q, " STABLE"); else if (provolatile[0] != PROVOLATILE_VOLATILE) - { - write_msg(NULL, "unrecognized provolatile value for function \"%s\"\n", - finfo->dobj.name); - exit_nicely(1); - } + exit_horribly(NULL, "unrecognized provolatile value for function \"%s\"\n", + finfo->dobj.name); } if (proisstrict[0] == 't') @@ -11769,9 +11681,9 @@ dumpDefaultACL(Archive *fout, DefaultACLInfo *daclinfo) break; default: /* shouldn't get here */ - write_msg(NULL, "unknown object type (%d) in default privileges\n", - (int) daclinfo->defaclobjtype); - exit_nicely(1); + exit_horribly(NULL, + "unknown object type (%d) in default privileges\n", + (int) daclinfo->defaclobjtype); type = ""; /* keep compiler quiet */ } @@ -11785,11 +11697,8 @@ dumpDefaultACL(Archive *fout, DefaultACLInfo *daclinfo) daclinfo->defaclrole, fout->remoteVersion, q)) - { - write_msg(NULL, "could not parse default ACL list (%s)\n", - daclinfo->defaclacl); - exit_nicely(1); - } + exit_horribly(NULL, "could not parse default ACL list (%s)\n", + daclinfo->defaclacl); ArchiveEntry(fout, daclinfo->dobj.catId, daclinfo->dobj.dumpId, tag->data, @@ -11842,11 +11751,9 @@ dumpACL(Archive *fout, CatalogId objCatId, DumpId objDumpId, if (!buildACLCommands(name, subname, type, acls, owner, "", fout->remoteVersion, sql)) - { - write_msg(NULL, "could not parse ACL list (%s) for object \"%s\" (%s)\n", - acls, name, type); - exit_nicely(1); - } + exit_horribly(NULL, + "could not parse ACL list (%s) for object \"%s\" (%s)\n", + acls, name, type); if (sql->len > 0) ArchiveEntry(fout, nilCatalogId, createDumpId(), @@ -12286,22 +12193,18 @@ dumpTableSchema(Archive *fout, TableInfo *tbinfo) if (PQntuples(res) != 1) { if (PQntuples(res) < 1) - write_msg(NULL, "query to obtain definition of view \"%s\" returned no data\n", + exit_horribly(NULL, "query to obtain definition of view \"%s\" returned no data\n", tbinfo->dobj.name); else - write_msg(NULL, "query to obtain definition of view \"%s\" returned more than one definition\n", + exit_horribly(NULL, "query to obtain definition of view \"%s\" returned more than one definition\n", tbinfo->dobj.name); - exit_nicely(1); } viewdef = PQgetvalue(res, 0, 0); if (strlen(viewdef) == 0) - { - write_msg(NULL, "definition of view \"%s\" appears to be empty (length zero)\n", - tbinfo->dobj.name); - exit_nicely(1); - } + exit_horribly(NULL, "definition of view \"%s\" appears to be empty (length zero)\n", + tbinfo->dobj.name); /* * DROP must be fully qualified in case same name appears in @@ -12349,15 +12252,7 @@ dumpTableSchema(Archive *fout, TableInfo *tbinfo) "ON (fs.oid = ft.ftserver) " "WHERE ft.ftrelid = '%u'", tbinfo->dobj.catId.oid); - res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); - if (PQntuples(res) != 1) - { - write_msg(NULL, ngettext("query returned %d foreign server entry for foreign table \"%s\"\n", - "query returned %d foreign server entries for foreign table \"%s\"\n", - PQntuples(res)), - PQntuples(res), tbinfo->dobj.name); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, query->data); i_srvname = PQfnumber(res, "srvname"); i_ftoptions = PQfnumber(res, "ftoptions"); srvname = pg_strdup(PQgetvalue(res, 0, i_srvname)); @@ -12919,9 +12814,8 @@ getAttrName(int attrnum, TableInfo *tblInfo) case TableOidAttributeNumber: return "tableoid"; } - write_msg(NULL, "invalid column number %d for table \"%s\"\n", - attrnum, tblInfo->dobj.name); - exit_nicely(1); + 
exit_horribly(NULL, "invalid column number %d for table \"%s\"\n", + attrnum, tblInfo->dobj.name); return NULL; /* keep compiler quiet */ } @@ -13030,11 +12924,8 @@ dumpConstraint(Archive *fout, ConstraintInfo *coninfo) indxinfo = (IndxInfo *) findObjectByDumpId(coninfo->conindex); if (indxinfo == NULL) - { - write_msg(NULL, "missing index for constraint \"%s\"\n", - coninfo->dobj.name); - exit_nicely(1); - } + exit_horribly(NULL, "missing index for constraint \"%s\"\n", + coninfo->dobj.name); if (binary_upgrade) binary_upgrade_set_pg_class_oids(fout, q, @@ -13220,8 +13111,8 @@ dumpConstraint(Archive *fout, ConstraintInfo *coninfo) } else { - write_msg(NULL, "unrecognized constraint type: %c\n", coninfo->contype); - exit_nicely(1); + exit_horribly(NULL, "unrecognized constraint type: %c\n", + coninfo->contype); } /* Dump Constraint Comments --- only works for table constraints */ @@ -13269,7 +13160,6 @@ static Oid findLastBuiltinOid_V71(Archive *fout, const char *dbname) { PGresult *res; - int ntups; Oid last_oid; PQExpBuffer query = createPQExpBuffer(); @@ -13277,19 +13167,7 @@ findLastBuiltinOid_V71(Archive *fout, const char *dbname) appendPQExpBuffer(query, "SELECT datlastsysoid from pg_database where datname = "); appendStringLiteralAH(query, dbname, fout); - res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); - - ntups = PQntuples(res); - if (ntups < 1) - { - write_msg(NULL, "missing pg_database entry for this database\n"); - exit_nicely(1); - } - if (ntups > 1) - { - write_msg(NULL, "found more than one pg_database entry for this database\n"); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, query->data); last_oid = atooid(PQgetvalue(res, 0, PQfnumber(res, "datlastsysoid"))); PQclear(res); destroyPQExpBuffer(query); @@ -13308,23 +13186,10 @@ static Oid findLastBuiltinOid_V70(Archive *fout) { PGresult *res; - int ntups; int last_oid; - res = ExecuteSqlQuery(fout, - "SELECT oid FROM pg_class WHERE relname = 'pg_indexes'", - PGRES_TUPLES_OK); - ntups = PQntuples(res); - if (ntups < 1) - { - write_msg(NULL, "could not find entry for pg_indexes in pg_class\n"); - exit_nicely(1); - } - if (ntups > 1) - { - write_msg(NULL, "found more than one entry for pg_indexes in pg_class\n"); - exit_nicely(1); - } + res = ExecuteSqlQueryForSingleRow(fout, + "SELECT oid FROM pg_class WHERE relname = 'pg_indexes'"); last_oid = atooid(PQgetvalue(res, 0, PQfnumber(res, "oid"))); PQclear(res); return last_oid; From 4767bc8ff2edc1258cf4d8a83155d4cedd724231 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 16 Feb 2012 17:33:28 -0500 Subject: [PATCH 007/129] Improve statistics estimation to make some use of DISTINCT in sub-queries. Formerly, we just punted when trying to estimate stats for variables coming out of sub-queries using DISTINCT, on the grounds that whatever stats we might have for underlying table columns would be inapplicable. But if the sub-query has only one DISTINCT column, we can consider its output variable as being unique, which is useful information all by itself. The scope of this improvement is pretty narrow, but it costs nearly nothing, so we might as well do it. Per discussion with Andres Freund. This patch differs from the draft I submitted yesterday in updating various comments about vardata.isunique (to reflect its extended meaning) and in tweaking the interaction with security_barrier views. There does not seem to be a reason why we can't use this sort of knowledge even when the sub-query is such a view. 
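
For illustration, the heart of the change is a short test added to
examine_simple_variable(). The sketch below is lifted from the patch
itself, with the surrounding function context omitted:

    /*
     * A DISTINCT clause makes the underlying column statistics unusable,
     * but a single-entry distinctClause that covers this output column
     * proves the column's values are unique.  Testing via the sort/group
     * list means DISTINCT ON is handled as well.
     */
    if (subquery->distinctClause)
    {
        if (list_length(subquery->distinctClause) == 1 &&
            targetIsInSortList(ste, InvalidOid, subquery->distinctClause))
            vardata->isunique = true;
        /* cannot go further */
        return;
    }

Once isunique is set, var_eq_const() and var_eq_non_const() (also
updated below) assume an equality restriction on that output column
matches exactly one row.
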
--- src/backend/utils/adt/selfuncs.c | 94 ++++++++++++++++++++------------ src/include/utils/selfuncs.h | 2 +- 2 files changed, 60 insertions(+), 36 deletions(-) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 6d78068476e52..0a685aac2c06f 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -110,6 +110,7 @@ #include "optimizer/predtest.h" #include "optimizer/restrictinfo.h" #include "optimizer/var.h" +#include "parser/parse_clause.h" #include "parser/parse_coerce.h" #include "parser/parsetree.h" #include "utils/builtins.h" @@ -255,10 +256,11 @@ var_eq_const(VariableStatData *vardata, Oid operator, return 0.0; /* - * If we matched the var to a unique index, assume there is exactly one - * match regardless of anything else. (This is slightly bogus, since the - * index's equality operator might be different from ours, but it's more - * likely to be right than ignoring the information.) + * If we matched the var to a unique index or DISTINCT clause, assume + * there is exactly one match regardless of anything else. (This is + * slightly bogus, since the index or clause's equality operator might be + * different from ours, but it's much more likely to be right than + * ignoring the information.) */ if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0) return 1.0 / vardata->rel->tuples; @@ -389,10 +391,11 @@ var_eq_non_const(VariableStatData *vardata, Oid operator, bool isdefault; /* - * If we matched the var to a unique index, assume there is exactly one - * match regardless of anything else. (This is slightly bogus, since the - * index's equality operator might be different from ours, but it's more - * likely to be right than ignoring the information.) + * If we matched the var to a unique index or DISTINCT clause, assume + * there is exactly one match regardless of anything else. (This is + * slightly bogus, since the index or clause's equality operator might be + * different from ours, but it's much more likely to be right than + * ignoring the information.) */ if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0) return 1.0 / vardata->rel->tuples; @@ -4128,10 +4131,11 @@ get_join_variables(PlannerInfo *root, List *args, SpecialJoinInfo *sjinfo, * atttype, atttypmod: type data to pass to get_attstatsslot(). This is * commonly the same as the exposed type of the variable argument, * but can be different in binary-compatible-type cases. - * isunique: TRUE if we were able to match the var to a unique index, - * implying its values are unique for this query. (Caution: this - * should be trusted for statistical purposes only, since we do not - * check indimmediate.) + * isunique: TRUE if we were able to match the var to a unique index or a + * single-column DISTINCT clause, implying its values are unique for + * this query. (Caution: this should be trusted for statistical + * purposes only, since we do not check indimmediate nor verify that + * the exact same definition of equality applies.) * * Caller is responsible for doing ReleaseVariableStats() before exiting. */ @@ -4357,32 +4361,21 @@ examine_simple_variable(PlannerInfo *root, Var *var, { /* * Plain subquery (not one that was converted to an appendrel). - * - * Punt if subquery uses set operations, GROUP BY, or DISTINCT --- any - * of these will mash underlying columns' stats beyond recognition. - * (Set ops are particularly nasty; if we forged ahead, we would - * return stats relevant to only the leftmost subselect...) 
*/ Query *subquery = rte->subquery; RelOptInfo *rel; TargetEntry *ste; - if (subquery->setOperations || - subquery->groupClause || - subquery->distinctClause) - return; - /* - * If the sub-query originated from a view with the security_barrier - * attribute, we treat it as a black-box from outside of the view. - * This is probably a harsher restriction than necessary; it's - * certainly OK for the selectivity estimator (which is a C function, - * and therefore omnipotent anyway) to look at the statistics. But - * many selectivity estimators will happily *invoke the operator - * function* to try to work out a good estimate - and that's not OK. - * So for now, we do this. + * Punt if subquery uses set operations or GROUP BY, as these will + * mash underlying columns' stats beyond recognition. (Set ops are + * particularly nasty; if we forged ahead, we would return stats + * relevant to only the leftmost subselect...) DISTINCT is also + * problematic, but we check that later because there is a possibility + * of learning something even with it. */ - if (rte->security_barrier) + if (subquery->setOperations || + subquery->groupClause) return; /* @@ -4415,6 +4408,37 @@ examine_simple_variable(PlannerInfo *root, Var *var, rte->eref->aliasname, var->varattno); var = (Var *) ste->expr; + /* + * If subquery uses DISTINCT, we can't make use of any stats for the + * variable ... but, if it's the only DISTINCT column, we are entitled + * to consider it unique. We do the test this way so that it works + * for cases involving DISTINCT ON. + */ + if (subquery->distinctClause) + { + if (list_length(subquery->distinctClause) == 1 && + targetIsInSortList(ste, InvalidOid, subquery->distinctClause)) + vardata->isunique = true; + /* cannot go further */ + return; + } + + /* + * If the sub-query originated from a view with the security_barrier + * attribute, we must not look at the variable's statistics, though + * it seems all right to notice the existence of a DISTINCT clause. + * So stop here. + * + * This is probably a harsher restriction than necessary; it's + * certainly OK for the selectivity estimator (which is a C function, + * and therefore omnipotent anyway) to look at the statistics. But + * many selectivity estimators will happily *invoke the operator + * function* to try to work out a good estimate - and that's not OK. + * So for now, don't dig down for stats. + */ + if (rte->security_barrier) + return; + /* Can only handle a simple Var of subquery's query level */ if (var && IsA(var, Var) && var->varlevelsup == 0) @@ -4513,10 +4537,10 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault) } /* - * If there is a unique index for the variable, assume it is unique no - * matter what pg_statistic says; the statistics could be out of date, or - * we might have found a partial unique index that proves the var is - * unique for this query. + * If there is a unique index or DISTINCT clause for the variable, assume + * it is unique no matter what pg_statistic says; the statistics could be + * out of date, or we might have found a partial unique index that proves + * the var is unique for this query. 
*/ if (vardata->isunique) stadistinct = -1.0; diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index 78eda1b503f78..bffc2d80ef012 100644 --- a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -74,7 +74,7 @@ typedef struct VariableStatData Oid vartype; /* exposed type of expression */ Oid atttype; /* type to pass to get_attstatsslot */ int32 atttypmod; /* typmod to pass to get_attstatsslot */ - bool isunique; /* true if matched to a unique index */ + bool isunique; /* matches unique index or DISTINCT clause */ } VariableStatData; #define ReleaseVariableStats(vardata) \ From 06d9afa6f93ec08a45da4de7afd97bbf16738739 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 16 Feb 2012 20:00:11 -0500 Subject: [PATCH 008/129] Fix longstanding error in contrib/intarray's int[] & int[] operator. The array intersection code would give wrong results if the first entry of the correct output array would be "1". (I think only this value could be at risk, since the previous word would always be a lower-bound entry with that fixed value.) Problem spotted by Julien Rouhaud, initial patch by Guillaume Lelarge, cosmetic improvements by me. --- contrib/intarray/_int_tool.c | 13 +++++++------ contrib/intarray/expected/_int.out | 6 ++++++ contrib/intarray/sql/_int.sql | 1 + 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/contrib/intarray/_int_tool.c b/contrib/intarray/_int_tool.c index 79f018d333b1d..132d15316054d 100644 --- a/contrib/intarray/_int_tool.c +++ b/contrib/intarray/_int_tool.c @@ -140,7 +140,8 @@ inner_int_inter(ArrayType *a, ArrayType *b) *db, *dr; int i, - j; + j, + k; if (ARRISEMPTY(a) || ARRISEMPTY(b)) return new_intArrayType(0); @@ -152,15 +153,15 @@ inner_int_inter(ArrayType *a, ArrayType *b) r = new_intArrayType(Min(na, nb)); dr = ARRPTR(r); - i = j = 0; + i = j = k = 0; while (i < na && j < nb) { if (da[i] < db[j]) i++; else if (da[i] == db[j]) { - if (i + j == 0 || (i + j > 0 && *(dr - 1) != db[j])) - *dr++ = db[j]; + if (k == 0 || dr[k - 1] != db[j]) + dr[k++] = db[j]; i++; j++; } @@ -168,13 +169,13 @@ inner_int_inter(ArrayType *a, ArrayType *b) j++; } - if ((dr - ARRPTR(r)) == 0) + if (k == 0) { pfree(r); return new_intArrayType(0); } else - return resize_intArrayType(r, dr - ARRPTR(r)); + return resize_intArrayType(r, k); } void diff --git a/contrib/intarray/expected/_int.out b/contrib/intarray/expected/_int.out index 6ed3cc6ced096..4080b9633fe98 100644 --- a/contrib/intarray/expected/_int.out +++ b/contrib/intarray/expected/_int.out @@ -137,6 +137,12 @@ SELECT '{123,623,445}'::int[] & '{1623,623}'; {623} (1 row) +SELECT '{-1,3,1}'::int[] & '{1,2}'; + ?column? +---------- + {1} +(1 row) + --test query_int SELECT '1'::query_int; query_int diff --git a/contrib/intarray/sql/_int.sql b/contrib/intarray/sql/_int.sql index b60e936dc520d..216c5c58d615a 100644 --- a/contrib/intarray/sql/_int.sql +++ b/contrib/intarray/sql/_int.sql @@ -24,6 +24,7 @@ SELECT '{123,623,445}'::int[] | 623; SELECT '{123,623,445}'::int[] | 1623; SELECT '{123,623,445}'::int[] | '{1623,623}'; SELECT '{123,623,445}'::int[] & '{1623,623}'; +SELECT '{-1,3,1}'::int[] & '{1,2}'; --test query_int From 08fd6ff37f71485e2fc04bc6ce07d2a483c36702 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 17 Feb 2012 19:44:26 -0500 Subject: [PATCH 009/129] Sync regex code with Tcl 8.5.11. Sync our regex code with upstream changes since last time we did this, which was Tcl 8.5.0 (see commit df1e965e12cdd48c11057ee6e15346ee2b8b02f5). 
There are no functional changes here; the main point is just to lay down a commit-log marker that somebody has looked at this recently, and to do what we can to keep the two codebases comparable. --- src/backend/regex/regc_locale.c | 4 ++-- src/backend/regex/rege_dfa.c | 23 +++++++++++------------ src/backend/regex/regexec.c | 28 +++++++--------------------- 3 files changed, 20 insertions(+), 35 deletions(-) diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c index 0f70931b13edf..6cf27958b1545 100644 --- a/src/backend/regex/regc_locale.c +++ b/src/backend/regex/regc_locale.c @@ -499,7 +499,7 @@ cclass(struct vars * v, /* context */ { size_t len; struct cvec *cv = NULL; - const char **namePtr; + const char * const *namePtr; int i, index; @@ -507,7 +507,7 @@ cclass(struct vars * v, /* context */ * The following arrays define the valid character class names. */ - static const char *classNames[] = { + static const char * const classNames[] = { "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", "lower", "print", "punct", "space", "upper", "xdigit", NULL }; diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c index e521261a57192..f4fd41458bd46 100644 --- a/src/backend/regex/rege_dfa.c +++ b/src/backend/regex/rege_dfa.c @@ -272,36 +272,35 @@ static struct dfa * newdfa(struct vars * v, struct cnfa * cnfa, struct colormap * cm, - struct smalldfa * small) /* preallocated space, may be NULL */ + struct smalldfa * sml) /* preallocated space, may be NULL */ { struct dfa *d; size_t nss = cnfa->nstates * 2; int wordsper = (cnfa->nstates + UBITS - 1) / UBITS; - struct smalldfa *smallwas = small; + struct smalldfa *smallwas = sml; assert(cnfa != NULL && cnfa->nstates != 0); if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS) { assert(wordsper == 1); - if (small == NULL) + if (sml == NULL) { - small = (struct smalldfa *) MALLOC( - sizeof(struct smalldfa)); - if (small == NULL) + sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa)); + if (sml == NULL) { ERR(REG_ESPACE); return NULL; } } - d = &small->dfa; - d->ssets = small->ssets; - d->statesarea = small->statesarea; + d = &sml->dfa; + d->ssets = sml->ssets; + d->statesarea = sml->statesarea; d->work = &d->statesarea[nss]; - d->outsarea = small->outsarea; - d->incarea = small->incarea; + d->outsarea = sml->outsarea; + d->incarea = sml->incarea; d->cptsmalloced = 0; - d->mallocarea = (smallwas == NULL) ? (char *) small : NULL; + d->mallocarea = (smallwas == NULL) ? 
(char *) sml : NULL; } else { diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index 7dc0ddba29ef2..f8e31f8f4ade8 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -141,7 +141,6 @@ static int dissect(struct vars *, struct subre *, chr *, chr *); static int condissect(struct vars *, struct subre *, chr *, chr *); static int altdissect(struct vars *, struct subre *, chr *, chr *); static int cdissect(struct vars *, struct subre *, chr *, chr *); -static int ccaptdissect(struct vars *, struct subre *, chr *, chr *); static int ccondissect(struct vars *, struct subre *, chr *, chr *); static int crevdissect(struct vars *, struct subre *, chr *, chr *); static int cbrdissect(struct vars *, struct subre *, chr *, chr *); @@ -708,6 +707,8 @@ cdissect(struct vars * v, chr *begin, /* beginning of relevant substring */ chr *end) /* end of same */ { + int er; + assert(t != NULL); MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op)); @@ -727,31 +728,16 @@ cdissect(struct vars * v, return ccondissect(v, t, begin, end); case '(': /* capturing */ assert(t->left != NULL && t->right == NULL); - return ccaptdissect(v, t, begin, end); + assert(t->subno > 0); + er = cdissect(v, t->left, begin, end); + if (er == REG_OKAY) + subset(v, t, begin, end); + return er; default: return REG_ASSERT; } } -/* - * ccaptdissect - capture subexpression matches (with complications) - */ -static int /* regexec return code */ -ccaptdissect(struct vars * v, - struct subre * t, - chr *begin, /* beginning of relevant substring */ - chr *end) /* end of same */ -{ - int er; - - assert(t->subno > 0); - - er = cdissect(v, t->left, begin, end); - if (er == REG_OKAY) - subset(v, t, begin, end); - return er; -} - /* * ccondissect - concatenation subexpression matches (with complications) * The retry memory stores the offset of the trial midpoint from begin, From 5e7710e725b1d1fe408ac20548d872cc52f7b8ab Mon Sep 17 00:00:00 2001 From: Michael Meskes Date: Fri, 17 Feb 2012 14:53:22 +0100 Subject: [PATCH 010/129] Make sure all connection paramters are used in call to PQconnectdbParams. --- src/interfaces/ecpg/ecpglib/connect.c | 83 ++++++++++++++++--- src/interfaces/ecpg/test/connect/test5.pgc | 3 + .../ecpg/test/expected/connect-test1.stderr | 2 +- .../ecpg/test/expected/connect-test5.c | 19 +++-- .../ecpg/test/expected/connect-test5.stderr | 12 ++- 5 files changed, 98 insertions(+), 21 deletions(-) diff --git a/src/interfaces/ecpg/ecpglib/connect.c b/src/interfaces/ecpg/ecpglib/connect.c index 15384ec3523c2..b874d860e2191 100644 --- a/src/interfaces/ecpg/ecpglib/connect.c +++ b/src/interfaces/ecpg/ecpglib/connect.c @@ -267,15 +267,15 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p struct sqlca_t *sqlca = ECPGget_sqlca(); enum COMPAT_MODE compat = c; struct connection *this; - int i; + int i, connect_params = 0; char *dbname = name ? 
ecpg_strdup(name, lineno) : NULL, *host = NULL, *tmp, *port = NULL, *realname = NULL, *options = NULL; - const char *conn_keywords[7]; - const char *conn_values[6]; + const char **conn_keywords; + const char **conn_values; ecpg_init_sqlca(sqlca); @@ -359,7 +359,10 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p if (tmp != NULL) /* database name given */ { if (tmp[1] != '\0') /* non-empty database name */ + { realname = ecpg_strdup(tmp + 1, lineno); + connect_params++; + } *tmp = '\0'; } @@ -373,6 +376,7 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p { *tmp2 = '\0'; host = ecpg_strdup(tmp + 1, lineno); + connect_params++; if (strncmp(dbname, "unix:", 5) != 0) { ecpg_log("ECPGconnect: socketname %s given for TCP connection on line %d\n", host, lineno); @@ -394,7 +398,10 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p } } else + { port = ecpg_strdup(tmp + 1, lineno); + connect_params++; + } } if (strncmp(dbname, "unix:", 5) == 0) @@ -418,7 +425,10 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p } } else + { host = ecpg_strdup(dbname + offset, lineno); + connect_params++; + } } } @@ -429,6 +439,7 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p if (tmp != NULL) /* port number given */ { port = ecpg_strdup(tmp + 1, lineno); + connect_params++; *tmp = '\0'; } @@ -436,10 +447,17 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p if (tmp != NULL) /* host name given */ { host = ecpg_strdup(tmp + 1, lineno); + connect_params++; *tmp = '\0'; } - realname = (strlen(dbname) > 0) ? ecpg_strdup(dbname, lineno) : NULL; + if (strlen(dbname) > 0) + { + realname = ecpg_strdup(dbname, lineno); + connect_params++; + } + else + realname = NULL; } } else @@ -475,10 +493,35 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p options ? "with options " : "", options ? options : "", (user && strlen(user) > 0) ? "for user " : "", user ? user : ""); - if (options) /* replace '&' if there are any */ + if (options) for (i = 0; options[i]; i++) - if (options[i] == '&') - options[i] = ' '; + /* count options */ + if (options[i] == '=') + connect_params++; + + if (user && strlen(user) > 0) + connect_params++; + if (passwd && strlen(passwd) > 0) + connect_params++; + + /* allocate enough space for all connection parameters */ + conn_keywords = (const char **) ecpg_alloc((connect_params + 1) * sizeof (char *), lineno); + conn_values = (const char **) ecpg_alloc(connect_params * sizeof (char *), lineno); + if (conn_keywords == NULL || conn_values == NULL) + { + if (host) + ecpg_free(host); + if (port) + ecpg_free(port); + if (options) + ecpg_free(options); + if (realname) + ecpg_free(realname); + if (dbname) + ecpg_free(dbname); + free(this); + return false; + } i = 0; if (realname) @@ -513,9 +556,27 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p } if (options) { - conn_keywords[i] = "options"; - conn_values[i] = options; - i++; + char *saveptr, *token1, *token2, *str; + + /* options look like this "option1 = value1 option2 = value2 ... 
*/ + /* we have to break up the string into single options */ + for (str = options; ; str = NULL) + { + token1 = strtok_r(str, "=", &saveptr); + if (token1 == NULL) + break; + /* strip leading blanks */ + for (; *token1 && *token1 == ' '; token1++); + + token2 = strtok_r(NULL, "&", &saveptr); + if (token2 == NULL) + break; + + conn_keywords[i] = token1; + conn_values[i] = token2; + i++; + } + } conn_keywords[i] = NULL; /* terminator */ @@ -529,6 +590,8 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p ecpg_free(options); if (dbname) ecpg_free(dbname); + ecpg_free(conn_values); + ecpg_free(conn_keywords); if (PQstatus(this->connection) == CONNECTION_BAD) { diff --git a/src/interfaces/ecpg/test/connect/test5.pgc b/src/interfaces/ecpg/test/connect/test5.pgc index c747a5b43716a..d3efecbf62b16 100644 --- a/src/interfaces/ecpg/test/connect/test5.pgc +++ b/src/interfaces/ecpg/test/connect/test5.pgc @@ -53,6 +53,9 @@ exec sql end declare section; exec sql connect to 'unix:postgresql://localhost/connectdb' as main user :user; exec sql disconnect main; + exec sql connect to unix:postgresql://localhost/connectdb?connect_timeout=14&client_encoding=latin1 as main user connectuser; + exec sql disconnect main; + exec sql connect to "unix:postgresql://200.46.204.71/connectdb" as main user connectuser; exec sql disconnect main; diff --git a/src/interfaces/ecpg/test/expected/connect-test1.stderr b/src/interfaces/ecpg/test/expected/connect-test1.stderr index de1bf17121d55..e755a0a3458ef 100644 --- a/src/interfaces/ecpg/test/expected/connect-test1.stderr +++ b/src/interfaces/ecpg/test/expected/connect-test1.stderr @@ -52,7 +52,7 @@ [NO_PID]: ECPGconnect: opening database connectdb on localhost port for user connectuser [NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: ECPGconnect: could not open database: could not connect to server: Connection refused - Is the server running on host "localhost" and accepting + Is the server running on host "localhost" (127.0.0.1) and accepting TCP/IP connections on port 20? 
[NO_PID]: sqlca: code: 0, state: 00000 diff --git a/src/interfaces/ecpg/test/expected/connect-test5.c b/src/interfaces/ecpg/test/expected/connect-test5.c index 8804eaaf9ec8e..a8f79f9a950e7 100644 --- a/src/interfaces/ecpg/test/expected/connect-test5.c +++ b/src/interfaces/ecpg/test/expected/connect-test5.c @@ -115,34 +115,41 @@ main(void) #line 54 "test5.pgc" - { ECPGconnect(__LINE__, 0, "unix:postgresql://200.46.204.71/connectdb" , "connectuser" , NULL , "main", 0); } + { ECPGconnect(__LINE__, 0, "unix:postgresql://localhost/connectdb?connect_timeout=14 & client_encoding=latin1" , "connectuser" , NULL , "main", 0); } #line 56 "test5.pgc" { ECPGdisconnect(__LINE__, "main");} #line 57 "test5.pgc" - { ECPGconnect(__LINE__, 0, "unix:postgresql://localhost/" , "connectdb" , NULL , "main", 0); } + { ECPGconnect(__LINE__, 0, "unix:postgresql://200.46.204.71/connectdb" , "connectuser" , NULL , "main", 0); } #line 59 "test5.pgc" { ECPGdisconnect(__LINE__, "main");} #line 60 "test5.pgc" + { ECPGconnect(__LINE__, 0, "unix:postgresql://localhost/" , "connectdb" , NULL , "main", 0); } +#line 62 "test5.pgc" + + { ECPGdisconnect(__LINE__, "main");} +#line 63 "test5.pgc" + + /* connect twice */ { ECPGconnect(__LINE__, 0, "connectdb" , NULL, NULL , "main", 0); } -#line 63 "test5.pgc" +#line 66 "test5.pgc" { ECPGconnect(__LINE__, 0, "connectdb" , NULL, NULL , "main", 0); } -#line 64 "test5.pgc" +#line 67 "test5.pgc" { ECPGdisconnect(__LINE__, "main");} -#line 65 "test5.pgc" +#line 68 "test5.pgc" /* not connected */ { ECPGdisconnect(__LINE__, "nonexistant");} -#line 68 "test5.pgc" +#line 71 "test5.pgc" return (0); diff --git a/src/interfaces/ecpg/test/expected/connect-test5.stderr b/src/interfaces/ecpg/test/expected/connect-test5.stderr index ea6f9e2591c71..9b4055598478c 100644 --- a/src/interfaces/ecpg/test/expected/connect-test5.stderr +++ b/src/interfaces/ecpg/test/expected/connect-test5.stderr @@ -46,11 +46,15 @@ [NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: ecpg_finish: connection main closed [NO_PID]: sqlca: code: 0, state: 00000 -[NO_PID]: ECPGconnect: non-localhost access via sockets on line 56 +[NO_PID]: ECPGconnect: opening database connectdb on port with options connect_timeout=14 & client_encoding=latin1 for user connectuser [NO_PID]: sqlca: code: 0, state: 00000 -[NO_PID]: raising sqlcode -402 on line 56: could not connect to database "connectdb" on line 56 +[NO_PID]: ecpg_finish: connection main closed +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ECPGconnect: non-localhost access via sockets on line 59 +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: raising sqlcode -402 on line 59: could not connect to database "connectdb" on line 59 [NO_PID]: sqlca: code: -402, state: 08001 -[NO_PID]: raising sqlcode -220 on line 57: connection "main" does not exist on line 57 +[NO_PID]: raising sqlcode -220 on line 60: connection "main" does not exist on line 60 [NO_PID]: sqlca: code: -220, state: 08003 [NO_PID]: ECPGconnect: opening database on port for user connectdb [NO_PID]: sqlca: code: 0, state: 00000 @@ -62,5 +66,5 @@ [NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: ecpg_finish: connection main closed [NO_PID]: sqlca: code: 0, state: 00000 -[NO_PID]: raising sqlcode -220 on line 68: connection "nonexistant" does not exist on line 68 +[NO_PID]: raising sqlcode -220 on line 71: connection "nonexistant" does not exist on line 71 [NO_PID]: sqlca: code: -220, state: 08003 From e3155c97b0fa6d73aa9128cf0e56e8a776f5d355 Mon Sep 17 00:00:00 2001 From: Michael Meskes Date: Sat, 18 Feb 2012 15:56:39 
+0100 Subject: [PATCH 011/129] Windows doesn't have strtok_r, so let's use strtok_s instead. --- src/interfaces/ecpg/ecpglib/connect.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/interfaces/ecpg/ecpglib/connect.c b/src/interfaces/ecpg/ecpglib/connect.c index b874d860e2191..ce6bbce15670b 100644 --- a/src/interfaces/ecpg/ecpglib/connect.c +++ b/src/interfaces/ecpg/ecpglib/connect.c @@ -10,6 +10,10 @@ #include "extern.h" #include "sqlca.h" +#ifdef WIN32 +#define strtok_r(s,d,p) strtok_s(s,d,p) +#endif + #ifdef ENABLE_THREAD_SAFETY static pthread_mutex_t connections_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_key_t actual_connection_key; From 45b7ab6b59f4a1982181327ac6393c09ffbbbbd0 Mon Sep 17 00:00:00 2001 From: Michael Meskes Date: Sat, 18 Feb 2012 17:20:53 +0100 Subject: [PATCH 012/129] gcc on Windows does not know about strtok_s. --- src/interfaces/ecpg/ecpglib/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/interfaces/ecpg/ecpglib/connect.c b/src/interfaces/ecpg/ecpglib/connect.c index ce6bbce15670b..716988c570e53 100644 --- a/src/interfaces/ecpg/ecpglib/connect.c +++ b/src/interfaces/ecpg/ecpglib/connect.c @@ -10,7 +10,7 @@ #include "extern.h" #include "sqlca.h" -#ifdef WIN32 +#ifdef WIN32_ONLY_COMPILER #define strtok_r(s,d,p) strtok_s(s,d,p) #endif From 759c95c45b65a5220976c85e6f03323975c2b276 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 18 Feb 2012 18:08:02 -0500 Subject: [PATCH 013/129] Update expected/collate.linux.utf8.out for recent plpgsql changes. This file was missed in commit 4c6cedd1b014abf2046886a9a92e10e18f0d658e. --- src/test/regress/expected/collate.linux.utf8.out | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/regress/expected/collate.linux.utf8.out b/src/test/regress/expected/collate.linux.utf8.out index f9659f7739e25..7ba003a13cca1 100644 --- a/src/test/regress/expected/collate.linux.utf8.out +++ b/src/test/regress/expected/collate.linux.utf8.out @@ -859,7 +859,7 @@ SELECT mylt2('a', 'B') as f; SELECT mylt2('a', 'B' collate "C") as fail; -- conflicting collations ERROR: could not determine which collation to use for string comparison HINT: Use the COLLATE clause to set the collation explicitly. -CONTEXT: PL/pgSQL function "mylt2" line 6 at RETURN +CONTEXT: PL/pgSQL function mylt2(text,text) line 6 at RETURN SELECT mylt2('a', 'B' collate "POSIX") as f; f --- From 84ff5b5db5f801f5da6690d8904cc88cea4700e8 Mon Sep 17 00:00:00 2001 From: Michael Meskes Date: Sun, 19 Feb 2012 14:50:14 +0100 Subject: [PATCH 014/129] In ecpglib rewrote code that used strtok_r to not use library functions anymore. This way we don't have to worry which compiler on which OS offers which version of strtok. 
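
For illustration, here is a self-contained sketch of the same parsing
idea. It is not the committed code, and the input string is made up
(borrowed from the connect_timeout/client_encoding regression test in
the earlier patch); the point is that "=" and "&" are located by plain
pointer walking, so no strtok() variant is needed:

    #include <stdio.h>

    int
    main(void)
    {
        char    opts[] = "connect_timeout=14&client_encoding=latin1";
        char   *str = opts;

        while (*str)
        {
            char   *key = str, *val;
            int     e, a;

            while (*key == ' ')     /* strip leading blanks */
                key++;
            for (e = 0; key[e] && key[e] != '='; e++);
            if (!key[e])            /* no "=" found: stop */
                break;
            key[e] = '\0';
            for (val = key + e + 1; *val == ' '; val++);
            for (a = 0; val[a] && val[a] != '&'; a++);
            if (val[a])             /* "&" found: another option follows */
            {
                val[a] = '\0';
                str = val + a + 1;
            }
            else
                str = val + a;
            printf("keyword=%s value=%s\n", key, val);
        }
        return 0;
    }

The committed loop below follows the same structure; it differs mainly
in how it steps past a trailing malformed option.
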
--- src/interfaces/ecpg/ecpglib/connect.c | 42 ++++++++++++++++----------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/src/interfaces/ecpg/ecpglib/connect.c b/src/interfaces/ecpg/ecpglib/connect.c index 716988c570e53..cf625f96a34c6 100644 --- a/src/interfaces/ecpg/ecpglib/connect.c +++ b/src/interfaces/ecpg/ecpglib/connect.c @@ -10,10 +10,6 @@ #include "extern.h" #include "sqlca.h" -#ifdef WIN32_ONLY_COMPILER -#define strtok_r(s,d,p) strtok_s(s,d,p) -#endif - #ifdef ENABLE_THREAD_SAFETY static pthread_mutex_t connections_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_key_t actual_connection_key; @@ -560,25 +556,37 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p } if (options) { - char *saveptr, *token1, *token2, *str; + char *str; /* options look like this "option1 = value1 option2 = value2 ... */ /* we have to break up the string into single options */ - for (str = options; ; str = NULL) + for (str = options; *str;) { - token1 = strtok_r(str, "=", &saveptr); - if (token1 == NULL) - break; - /* strip leading blanks */ - for (; *token1 && *token1 == ' '; token1++); + int e, a; + char *token1, *token2; - token2 = strtok_r(NULL, "&", &saveptr); - if (token2 == NULL) - break; + for (token1 = str; *token1 && *token1 == ' '; token1++); + for (e = 0; token1[e] && token1[e] != '='; e++); + if (token1[e]) /* found "=" */ + { + token1[e] = '\0'; + for (token2 = token1 + e + 1; *token2 && *token2 == ' '; token2++); + for (a = 0; token2[a] && token2[a] != '&'; a++); + if (token2[a]) /* found "&" => another option follows */ + { + token2[a] = '\0'; + str = token2 + a + 1; + } + else + str = token2 + a; - conn_keywords[i] = token1; - conn_values[i] = token2; - i++; + conn_keywords[i] = token1; + conn_values[i] = token2; + i++; + } + else + /* the parser should not be able to create this invalid option */ + str = token1 + e; } } From 2f582f76b1945929ff07116cd4639747ce9bb8a1 Mon Sep 17 00:00:00 2001 From: Andrew Dunstan Date: Sun, 19 Feb 2012 11:43:46 -0500 Subject: [PATCH 015/129] Improve pretty printing of viewdefs. Some line feeds are added to target lists and from lists to make them more readable. By default they wrap at 80 columns if possible, but the wrap column is also selectable - if 0 it wraps after every item. Andrew Dunstan, reviewed by Hitoshi Harada. 
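The wrap decision at the heart of this patch, reduced to a standalone sketch. The needs_newline() name is invented here; the real code works with StringInfo buffers and appendContextKeyword() indentation, and also treats whitespace followed by a newline at the start of the new field as a leading newline.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/*
 * Should the next target-list field start on a new line?  "buf" is the
 * output so far, "field" the deparsed field text, "wrap" the wrap column
 * (negative disables wrapping).
 */
static bool
needs_newline(const char *buf, const char *field, int colno,
			  bool last_was_multiline, int wrap)
{
	const char *line;

	if (colno <= 1 || wrap < 0)
		return false;
	if (field[0] == '\n')		/* field wraps itself (simplified test) */
		return false;
	line = strrchr(buf, '\n');	/* measure the current output line only */
	line = (line != NULL) ? line + 1 : buf;
	return strlen(line) + strlen(field) > (size_t) wrap ||
		last_was_multiline;
}

int
main(void)
{
	const char *sofar = " SELECT sh.shoename, sh.sh_avail, sh.slcolor, sh.slminlen,";

	printf("%d\n", (int) needs_newline(sofar,
									   " sh.slminlen * un.un_fact AS slminlen_cm",
									   2, false, 79));	/* 1: wrap here */
	return 0;
}

The 79 used above matches PRETTY_WRAP_DEFAULT in the patch, so pretty output fits in 80 columns when possible.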
--- doc/src/sgml/func.sgml | 12 +- src/backend/utils/adt/ruleutils.c | 137 ++++++++++++++++++++- src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_proc.h | 2 + src/include/utils/builtins.h | 1 + src/test/regress/expected/polymorphism.out | 4 +- src/test/regress/expected/rules.out | 33 +++++ src/test/regress/expected/with.out | 25 ++-- src/test/regress/sql/rules.sql | 6 + 9 files changed, 204 insertions(+), 18 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 236a60a688e7b..e8e637bf31bbd 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -13828,7 +13828,8 @@ SELECT pg_type_is_visible('myschema.widget'::regtype); pg_get_viewdef(view_name, pretty_bool) text - get underlying SELECT command for view (deprecated) + get underlying SELECT command for view, + lines with fields are wrapped to 80 columns if pretty_bool is true (deprecated) pg_get_viewdef(view_oid) @@ -13838,7 +13839,14 @@ SELECT pg_type_is_visible('myschema.widget'::regtype); pg_get_viewdef(view_oid, pretty_bool) text - get underlying SELECT command for view + get underlying SELECT command for view, + lines with fields are wrapped to 80 columns if pretty_bool is true + + + pg_get_viewdef(view_oid, wrap_int) + text + get underlying SELECT command for view, + wrapping lines with fields as specified, pretty printing is implied pg_options_to_table(reloptions) diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 9ad54c5decbcb..64ba8ec8917d3 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -73,6 +73,8 @@ #define PRETTYFLAG_PAREN 1 #define PRETTYFLAG_INDENT 2 +#define PRETTY_WRAP_DEFAULT 79 + /* macro to test if pretty action needed */ #define PRETTY_PAREN(context) ((context)->prettyFlags & PRETTYFLAG_PAREN) #define PRETTY_INDENT(context) ((context)->prettyFlags & PRETTYFLAG_INDENT) @@ -136,6 +138,7 @@ static SPIPlanPtr plan_getrulebyoid = NULL; static const char *query_getrulebyoid = "SELECT * FROM pg_catalog.pg_rewrite WHERE oid = $1"; static SPIPlanPtr plan_getviewrule = NULL; static const char *query_getviewrule = "SELECT * FROM pg_catalog.pg_rewrite WHERE ev_class = $1 AND rulename = $2"; +static int pretty_wrap = PRETTY_WRAP_DEFAULT; /* GUC parameters */ bool quote_all_identifiers = false; @@ -380,6 +383,23 @@ pg_get_viewdef_ext(PG_FUNCTION_ARGS) PG_RETURN_TEXT_P(string_to_text(pg_get_viewdef_worker(viewoid, prettyFlags))); } +Datum +pg_get_viewdef_wrap(PG_FUNCTION_ARGS) +{ + /* By OID */ + Oid viewoid = PG_GETARG_OID(0); + int wrap = PG_GETARG_INT32(1); + int prettyFlags; + char *result; + + /* calling this implies we want pretty printing */ + prettyFlags = PRETTYFLAG_PAREN | PRETTYFLAG_INDENT; + pretty_wrap = wrap; + result = pg_get_viewdef_worker(viewoid, prettyFlags); + pretty_wrap = PRETTY_WRAP_DEFAULT; + PG_RETURN_TEXT_P(string_to_text(result)); +} + Datum pg_get_viewdef_name(PG_FUNCTION_ARGS) { @@ -3013,6 +3033,7 @@ get_target_list(List *targetList, deparse_context *context, char *sep; int colno; ListCell *l; + bool last_was_multiline = false; sep = " "; colno = 0; @@ -3021,6 +3042,10 @@ get_target_list(List *targetList, deparse_context *context, TargetEntry *tle = (TargetEntry *) lfirst(l); char *colname; char *attname; + StringInfoData targetbuf; + int leading_nl_pos = -1; + char *trailing_nl; + int pos; if (tle->resjunk) continue; /* ignore junk entries */ @@ -3029,6 +3054,15 @@ get_target_list(List *targetList, deparse_context *context, sep = ", "; colno++; + /* + * Put the new field spec 
into targetbuf so we can + * decide after we've got it whether or not it needs + * to go on a new line. + */ + + initStringInfo(&targetbuf); + context->buf = &targetbuf; + /* * We special-case Var nodes rather than using get_rule_expr. This is * needed because get_rule_expr will display a whole-row Var as @@ -3063,8 +3097,66 @@ get_target_list(List *targetList, deparse_context *context, if (colname) /* resname could be NULL */ { if (attname == NULL || strcmp(attname, colname) != 0) - appendStringInfo(buf, " AS %s", quote_identifier(colname)); + appendStringInfo(&targetbuf, " AS %s", quote_identifier(colname)); + } + + /* Restore context buffer */ + + context->buf = buf; + + /* Does the new field start with whitespace plus a new line? */ + + for (pos=0; pos < targetbuf.len; pos++) + { + if (targetbuf.data[pos] == '\n') + { + leading_nl_pos = pos; + break; + } + if (targetbuf.data[pos] > ' ') + break; + } + + /* Locate the start of the current line in the buffer */ + + trailing_nl = (strrchr(buf->data,'\n')); + if (trailing_nl == NULL) + trailing_nl = buf->data; + else + trailing_nl++; + + /* + * If the field we're adding is the first in the list, or it already + * has a leading newline, or wrap mode is disabled (pretty_wrap < 0), + * don't add anything. + * Otherwise, add a newline, plus some indentation, if either the + * new field would cause an overflow or the last field used more than + * one line. + */ + + if (colno > 1 && + leading_nl_pos == -1 && + pretty_wrap >= 0 && + ((strlen(trailing_nl) + strlen(targetbuf.data) > pretty_wrap) || + last_was_multiline)) + { + appendContextKeyword(context, "", -PRETTYINDENT_STD, + PRETTYINDENT_STD, PRETTYINDENT_VAR); } + + /* Add the new field */ + + appendStringInfoString(buf, targetbuf.data); + + + /* Keep track of this field's status for next iteration */ + + last_was_multiline = + (strchr(targetbuf.data + leading_nl_pos + 1,'\n') != NULL); + + /* cleanup */ + + pfree (targetbuf.data); } } @@ -6445,11 +6537,52 @@ get_from_clause(Query *query, const char *prefix, deparse_context *context) appendContextKeyword(context, prefix, -PRETTYINDENT_STD, PRETTYINDENT_STD, 2); first = false; + + get_from_clause_item(jtnode, query, context); } else + { + StringInfoData targetbuf; + char *trailing_nl; + appendStringInfoString(buf, ", "); + + initStringInfo(&targetbuf); + context->buf = &targetbuf; + + get_from_clause_item(jtnode, query, context); + + context->buf = buf; + + /* Locate the start of the current line in the buffer */ + + trailing_nl = (strrchr(buf->data,'\n')); + if (trailing_nl == NULL) + trailing_nl = buf->data; + else + trailing_nl++; + + /* + * Add a newline, plus some indentation, if pretty_wrap is on and the + * new from-clause item would cause an overflow. 
+ */ + + if (pretty_wrap >= 0 && + (strlen(trailing_nl) + strlen(targetbuf.data) > pretty_wrap)) + { + appendContextKeyword(context, "", -PRETTYINDENT_STD, + PRETTYINDENT_STD, PRETTYINDENT_VAR); + } + + /* Add the new item */ + + appendStringInfoString(buf, targetbuf.data); + + /* cleanup */ + + pfree (targetbuf.data); + } - get_from_clause_item(jtnode, query, context); } } diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 7a54a74757e75..6100472d94a91 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201202141 +#define CATALOG_VERSION_NO 201202191 #endif diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index fb2923f94db06..8700d0d958a26 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -3743,6 +3743,8 @@ DATA(insert OID = 2505 ( pg_get_viewdef PGNSP PGUID 12 1 0 0 0 f f f f t f s DESCR("select statement of a view with pretty-print option"); DATA(insert OID = 2506 ( pg_get_viewdef PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 25 "26 16" _null_ _null_ _null_ _null_ pg_get_viewdef_ext _null_ _null_ _null_ )); DESCR("select statement of a view with pretty-print option"); +DATA(insert OID = 3159 ( pg_get_viewdef PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 25 "26 23" _null_ _null_ _null_ _null_ pg_get_viewdef_wrap _null_ _null_ _null_ )); +DESCR("select statement of a view with pretty-printing and specified line wrapping"); DATA(insert OID = 2507 ( pg_get_indexdef PGNSP PGUID 12 1 0 0 0 f f f f t f s 3 0 25 "26 23 16" _null_ _null_ _null_ _null_ pg_get_indexdef_ext _null_ _null_ _null_ )); DESCR("index description (full create statement or single expression) with pretty-print option"); DATA(insert OID = 2508 ( pg_get_constraintdef PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 25 "26 16" _null_ _null_ _null_ _null_ pg_get_constraintdef_ext _null_ _null_ _null_ )); diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 2c331ce5eb9b2..fe253bcc7cdf8 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -624,6 +624,7 @@ extern Datum pg_get_ruledef(PG_FUNCTION_ARGS); extern Datum pg_get_ruledef_ext(PG_FUNCTION_ARGS); extern Datum pg_get_viewdef(PG_FUNCTION_ARGS); extern Datum pg_get_viewdef_ext(PG_FUNCTION_ARGS); +extern Datum pg_get_viewdef_wrap(PG_FUNCTION_ARGS); extern Datum pg_get_viewdef_name(PG_FUNCTION_ARGS); extern Datum pg_get_viewdef_name_ext(PG_FUNCTION_ARGS); extern Datum pg_get_indexdef(PG_FUNCTION_ARGS); diff --git a/src/test/regress/expected/polymorphism.out b/src/test/regress/expected/polymorphism.out index cb3d756290cf1..1e879532f870c 100644 --- a/src/test/regress/expected/polymorphism.out +++ b/src/test/regress/expected/polymorphism.out @@ -1381,7 +1381,9 @@ select * from dfview; c3 | bigint | | plain | c4 | bigint | | plain | View definition: - SELECT int8_tbl.q1, int8_tbl.q2, dfunc(int8_tbl.q1, int8_tbl.q2, flag := int8_tbl.q1 > int8_tbl.q2) AS c3, dfunc(int8_tbl.q1, flag := int8_tbl.q1 < int8_tbl.q2, b := int8_tbl.q2) AS c4 + SELECT int8_tbl.q1, int8_tbl.q2, + dfunc(int8_tbl.q1, int8_tbl.q2, flag := int8_tbl.q1 > int8_tbl.q2) AS c3, + dfunc(int8_tbl.q1, flag := int8_tbl.q1 < int8_tbl.q2, b := int8_tbl.q2) AS c4 FROM int8_tbl; drop view dfview; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index f67b8dc3f43e9..0275a0e120e40 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out 
@@ -1568,3 +1568,36 @@ select * from only t1_2; 19 (10 rows) +-- test various flavors of pg_get_viewdef() +select pg_get_viewdef('shoe'::regclass) as unpretty; + unpretty +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + SELECT sh.shoename, sh.sh_avail, sh.slcolor, sh.slminlen, (sh.slminlen * un.un_fact) AS slminlen_cm, sh.slmaxlen, (sh.slmaxlen * un.un_fact) AS slmaxlen_cm, sh.slunit FROM shoe_data sh, unit un WHERE (sh.slunit = un.un_name); +(1 row) + +select pg_get_viewdef('shoe'::regclass,true) as pretty; + pretty +------------------------------------------------------------- + SELECT sh.shoename, sh.sh_avail, sh.slcolor, sh.slminlen, + + sh.slminlen * un.un_fact AS slminlen_cm, sh.slmaxlen, + + sh.slmaxlen * un.un_fact AS slmaxlen_cm, sh.slunit + + FROM shoe_data sh, unit un + + WHERE sh.slunit = un.un_name; +(1 row) + +select pg_get_viewdef('shoe'::regclass,0) as prettier; + prettier +----------------------------------------------- + SELECT sh.shoename, + + sh.sh_avail, + + sh.slcolor, + + sh.slminlen, + + sh.slminlen * un.un_fact AS slminlen_cm, + + sh.slmaxlen, + + sh.slmaxlen * un.un_fact AS slmaxlen_cm, + + sh.slunit + + FROM shoe_data sh, + + unit un + + WHERE sh.slunit = un.un_name; +(1 row) + diff --git a/src/test/regress/expected/with.out b/src/test/regress/expected/with.out index a58739b68ef59..fae92cd37bf35 100644 --- a/src/test/regress/expected/with.out +++ b/src/test/regress/expected/with.out @@ -277,18 +277,19 @@ SELECT pg_get_viewdef('vsubdepartment'::regclass); (1 row) SELECT pg_get_viewdef('vsubdepartment'::regclass, true); - pg_get_viewdef --------------------------------------------------------------------------------------- - WITH RECURSIVE subdepartment AS ( + - SELECT department.id, department.parent_department, department.name+ - FROM department + - WHERE department.name = 'A'::text + - UNION ALL + - SELECT d.id, d.parent_department, d.name + - FROM department d, subdepartment sd + - WHERE d.parent_department = sd.id + - ) + - SELECT subdepartment.id, subdepartment.parent_department, subdepartment.name + + pg_get_viewdef +------------------------------------------------------------------------------- + WITH RECURSIVE subdepartment AS ( + + SELECT department.id, department.parent_department, + + department.name + + FROM department + + WHERE department.name = 'A'::text + + UNION ALL + + SELECT d.id, d.parent_department, d.name + + FROM department d, subdepartment sd + + WHERE d.parent_department = sd.id + + ) + + SELECT subdepartment.id, subdepartment.parent_department, subdepartment.name+ FROM subdepartment; (1 row) diff --git a/src/test/regress/sql/rules.sql b/src/test/regress/sql/rules.sql index 16dc106ab0767..ea665a3c8bb80 100644 --- a/src/test/regress/sql/rules.sql +++ b/src/test/regress/sql/rules.sql @@ -927,3 +927,9 @@ update t1 set a = 4 where a = 5; select * from only t1; select * from only t1_1; select * from only t1_2; + +-- test various flavors of pg_get_viewdef() + +select pg_get_viewdef('shoe'::regclass) as unpretty; +select pg_get_viewdef('shoe'::regclass,true) as pretty; +select pg_get_viewdef('shoe'::regclass,0) as prettier; From 27af91438b68f46f4015853b6f75c6f5c3a8650c Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 19 Feb 2012 18:57:38 -0500 Subject: [PATCH 016/129] Create the beginnings of internals documentation for the regex code. 
Create src/backend/regex/README to hold an implementation overview of the regex package, and fill it in with some preliminary notes about the code's DFA/NFA processing and colormap management. Much more to do there of course. Also, improve some code comments around the colormap and cvec code. No functional changes except to add one missing assert. --- src/backend/regex/README | 291 ++++++++++++++++++++++++++++++++++ src/backend/regex/regc_cvec.c | 13 +- src/backend/regex/regcomp.c | 1 + src/include/regex/regguts.h | 54 +++++-- 4 files changed, 343 insertions(+), 16 deletions(-) create mode 100644 src/backend/regex/README diff --git a/src/backend/regex/README b/src/backend/regex/README new file mode 100644 index 0000000000000..3fd58c000119a --- /dev/null +++ b/src/backend/regex/README @@ -0,0 +1,291 @@ +Implementation notes about Henry Spencer's regex library +======================================================== + +If Henry ever had any internals documentation, he didn't publish it. +So this file is an attempt to reverse-engineer some docs. + +General source-file layout +-------------------------- + +There are four separately-compilable source files, each exposing exactly +one exported function: + regcomp.c: pg_regcomp + regexec.c: pg_regexec + regerror.c: pg_regerror + regfree.c: pg_regfree +(The pg_ prefixes were added by the Postgres project to distinguish this +library version from any similar one that might be present on a particular +system. They'd need to be removed or replaced in any standalone version +of the library.) + +There are additional source files regc_*.c that are #include'd in regcomp, +and similarly additional source files rege_*.c that are #include'd in +regexec. This was done to avoid exposing internal symbols globally; +all functions not meant to be part of the library API are static. + +(Actually the above is a lie in one respect: there is one more global +symbol, pg_set_regex_collation in regcomp. It is not meant to be part of +the API, but it has to be global because both regcomp and regexec call it. +It'd be better to get rid of that, as well as the static variables it +sets, in favor of keeping the needed locale state in the regex structs. +We have not done this yet for lack of a design for how to add +application-specific state to the structs.) + +What's where in src/backend/regex/: + +regcomp.c Top-level regex compilation code +regc_color.c Color map management +regc_cvec.c Character vector (cvec) management +regc_lex.c Lexer +regc_nfa.c NFA handling +regc_locale.c Application-specific locale code from Tcl project +regc_pg_locale.c Postgres-added application-specific locale code +regexec.c Top-level regex execution code +rege_dfa.c DFA creation and execution +regerror.c pg_regerror: generate text for a regex error code +regfree.c pg_regfree: API to free a no-longer-needed regex_t + +The locale-specific code is concerned primarily with case-folding and with +expanding locale-specific character classes, such as [[:alnum:]]. It +really needs refactoring if this is ever to become a standalone library. + +The header files for the library are in src/include/regex/: + +regcustom.h Customizes library for particular application +regerrs.h Error message list +regex.h Exported API +regguts.h Internals declarations + + +DFAs, NFAs, and all that +------------------------ + +This library is a hybrid DFA/NFA regex implementation. (If you've never +heard either of those terms, get thee to a first-year comp sci textbook.) 
+It might not be clear at first glance what that really means and how it +relates to what you'll see in the code. Here's what really happens: + +* Initial parsing of a regex generates an NFA representation, with number +of states approximately proportional to the length of the regexp. + +* The NFA is then optimized into a "compact NFA" representation, which is +basically the same data but without fields that are not going to be needed +at runtime. We do a little bit of cleanup too, such as removing +unreachable states that might be created as a result of the rather naive +transformation done by initial parsing. The cNFA representation is what +is passed from regcomp to regexec. + +* Unlike traditional NFA-based regex engines, we do not execute directly +from the NFA representation, as that would require backtracking and so be +very slow in some cases. Rather, we execute a DFA, which ideally can +process an input string in linear time (O(M) for M characters of input) +without backtracking. Each state of the DFA corresponds to a set of +states of the NFA, that is all the states that the NFA might have been in +upon reaching the current point in the input string. Therefore, an NFA +with N states might require as many as 2^N states in the corresponding +DFA, which could easily require unreasonable amounts of memory. We deal +with this by materializing states of the DFA lazily (only when needed) and +keeping them in a limited-size cache. The possible need to build the same +state of the DFA repeatedly makes this approach not truly O(M) time, but +in the worst case as much as O(M*N). That's still far better than the +worst case for a backtracking NFA engine. + +If that were the end of it, we'd just say this is a DFA engine, with the +use of NFAs being merely an implementation detail. However, a DFA engine +cannot handle some important regex features such as capturing parens and +back-references. If the parser finds that a regex uses these features +(collectively called "messy cases" in the code), then we have to use +NFA-style backtracking search after all. + +When using the NFA mode, the representation constructed by the parser +consists of a tree of sub-expressions ("subre"s). Leaf tree nodes are +either plain regular expressions (which are executed as DFAs in the manner +described above) or back-references (which try to match the input to some +previous substring). Non-leaf nodes are capture nodes (which save the +location of the substring currently matching their child node) or +concatenation or alternation nodes. At execution time, the executor +recursively scans the tree. At concatenation or alternation nodes, +it considers each possible alternative way of matching the input string, +ie each place where the string could be split for a concatenation, or each +child node for an alternation. It tries the next alternative if the match +fails according to the child nodes. This is exactly the sort of +backtracking search done by a traditional NFA regex engine. If there are +many tree levels it can get very slow. + +But all is not lost: we can still be smarter than the average pure NFA +engine. To do this, each subre node has an associated DFA, which +represents what the node could possibly match insofar as a mathematically +pure regex can describe that, which basically means "no backrefs". +Before we perform any search of possible alternative sub-matches, we run +the DFA to see if it thinks the proposed substring could possibly match. 
+If not, we can reject the match immediately without iterating through many +possibilities. + +As an example, consider the regex "(a[bc]+)\1". The compiled +representation will have a top-level concatenation subre node. Its left +child is a capture node, and the child of that is a plain DFA node for +"a[bc]+". The concatenation's right child is a backref node for \1. +The DFA associated with the concatenation node will be "a[bc]+a[bc]+", +where the backref has been replaced by a copy of the DFA for its referent +expression. When executed, the concatenation node will have to search for +a possible division of the input string that allows its two child nodes to +each match their part of the string (and although this specific case can +only succeed when the division is at the middle, the code does not know +that, nor would it be true in general). However, we can first run the DFA +and quickly reject any input that doesn't contain two a's and some number +of b's and c's. If the DFA doesn't match, there is no need to recurse to +the two child nodes for each possible string division point. In many +cases, this prefiltering makes the search run much faster than a pure NFA +engine could do. It is this behavior that justifies using the phrase +"hybrid DFA/NFA engine" to describe Spencer's library. + + +Colors and colormapping +----------------------- + +In many common regex patterns, there are large numbers of characters that +can be treated alike by the execution engine. A simple example is the +pattern "[[:alpha:]][[:alnum:]]*" for an identifier. Basically the engine +only needs to care whether an input symbol is a letter, a digit, or other. +We could build the NFA or DFA with a separate arc for each possible letter +and digit, but that's very wasteful of space and not so cheap to execute +either, especially when dealing with Unicode which can have thousands of +letters. Instead, the parser builds a "color map" that maps each possible +input symbol to a "color", or equivalence class. The NFA or DFA +representation then has arcs labeled with colors, not specific input +symbols. At execution, the first thing the executor does with each input +symbol is to look up its color in the color map, and then everything else +works from the color only. + +To build the colormap, we start by assigning every possible input symbol +the color WHITE, which means "other" (that is, at the end of parsing, the +symbols that are still WHITE are those not explicitly referenced anywhere +in the regex). When we see a simple literal character or a bracket +expression in the regex, we want to assign that character, or all the +characters represented by the bracket expression, a unique new color that +can be used to label the NFA arc corresponding to the state transition for +matching this character or bracket expression. The basic idea is: +first, change the color assigned to a character to some new value; +second, run through all the existing arcs in the partially-built NFA, +and for each one referencing the character's old color, add a parallel +arc referencing its new color (this keeps the reassignment from changing +the semantics of what we already built); and third, add a new arc with +the character's new color to the current pair of NFA states, denoting +that seeing this character allows the state transition to be made. + +This is complicated a bit by not wanting to create more colors +(equivalence classes) than absolutely necessary. 
In particular, if a +bracket expression mentions two characters that had the same color before, +they should still share the same color after we process the bracket, since +there is still not a need to distinguish them. But we do need to +distinguish them from other characters that previously had the same color +yet are not listed in the bracket expression. To mechanize this, the code +has a concept of "parent colors" and "subcolors", where a color's subcolor +is the new color that we are giving to any characters of that color while +parsing the current atom. (The word "parent" is a bit unfortunate here, +because it suggests a long-lived relationship, but a subcolor link really +only lasts for the duration of parsing a single atom.) In other words, +a subcolor link means that we are in process of splitting the parent color +into two colors (equivalence classes), depending on whether or not each +member character should be included by the current regex atom. + +As an example, suppose we have the regex "a\d\wx". Initially all possible +character codes are labeled WHITE (color 0). To parse the atom "a", we +create a new color (1), update "a"'s color map entry to 1, and create an +arc labeled 1 between the first two states of the NFA. Now we see \d, +which is really a bracket expression containing the digits "0"-"9". +First we process "0", which is currently WHITE, so we create a new color +(2), update "0"'s color map entry to 2, and create an arc labeled 2 +between the second and third states of the NFA. We also mark color WHITE +as having the subcolor 2, which means that future relabelings of WHITE +characters should also select 2 as the new color. Thus, when we process +"1", we won't create a new color but re-use 2. We update "1"'s color map +entry to 2, and then find that we don't need a new arc because there is +already one labeled 2 between the second and third states of the NFA. +Similarly for the other 8 digits, so there will be only one arc labeled 2 +between NFA states 2 and 3 for all members of this bracket expression. +At completion of processing of the bracket expression, we call okcolors() +which breaks all the existing parent/subcolor links; there is no longer a +marker saying that WHITE characters should be relabeled 2. (Note: +actually, we did the same creation and clearing of a subcolor link for the +primitive atom "a", but it didn't do anything very interesting.) Now we +come to the "\w" bracket expression, which for simplicity assume expands +to just "[a-z0-9]". We process "a", but observe that it is already the +sole member of its color 1. This means there is no need to subdivide that +equivalence class more finely, so we do not create any new color. We just +make an arc labeled 1 between the third and fourth NFA states. Next we +process "b", which is WHITE and far from the only WHITE character, so we +create a new color (3), link that as WHITE's subcolor, relabel "b" as +color 3, and make an arc labeled 3. As we process "c" through "z", each +is relabeled from WHITE to 3, but no new arc is needed. Now we come to +"0", which is not the only member of its color 2, so we suppose that a new +color is needed and create color 4. We link 4 as subcolor of 2, relabel +"0" as color 4 in the map, and add an arc for color 4. Next "1" through +"9" are similarly relabeled as color 4, with no additional arcs needed. +Having finished the bracket expression, we call okcolors(), which breaks +the subcolor links. 
okcolors() further observes that we have removed +every member of color 2 (the previous color of the digit characters). +Therefore, it runs through the partial NFA built so far and relabels arcs +labeled 2 to color 4; in particular the arc from NFA state 2 to state 3 is +relabeled color 4. Then it frees up color 2, since we have no more use +for that color. We now have an NFA in which transitions for digits are +consistently labeled with color 4. Last, we come to the atom "x". +"x" is currently labeled with color 3, and it's not the only member of +that color, so we realize that we now need to distinguish "x" from other +letters when we did not before. We create a new color, which might have +been 5 but instead we recycle the unused color 2. "x" is relabeled 2 in +the color map and 2 is linked as the subcolor of 3, and we add an arc for +2 between states 4 and 5 of the NFA. Now we call okcolors(), which breaks +the subcolor link between colors 3 and 2 and notices that both colors are +nonempty. Therefore, it also runs through the existing NFA arcs and adds +an additional arc labeled 2 wherever there is an arc labeled 3; this +action ensures that characters of color 2 (i.e., "x") will still be +considered as allowing any transitions they did before. We are now done +parsing the regex, and we have these final color assignments: + color 1: "a" + color 2: "x" + color 3: other letters + color 4: digits +and the NFA has these arcs: + states 1 -> 2 on color 1 (hence, "a" only) + states 2 -> 3 on color 4 (digits) + states 3 -> 4 on colors 1, 3, 4, and 2 (covering all \w characters) + states 4 -> 5 on color 2 ("x" only) +which can be seen to be a correct representation of the regex. + +Given this summary, we can see we need the following operations for +colors: + +* A fast way to look up the current color assignment for any character + code. (This is needed during both parsing and execution, while the + remaining operations are needed only during parsing.) +* A way to alter the color assignment for any given character code. +* We must track the number of characters currently assigned to each + color, so that we can detect empty and singleton colors. +* We must track all existing NFA arcs of a given color, so that we + can relabel them at need, or add parallel arcs of a new color when + an existing color has to be subdivided. + +The last two of these are handled with the "struct colordesc" array and +the "colorchain" links in NFA arc structs. The color map proper (that +is, the per-character lookup array) is handled as a multi-level tree, +with each tree level indexed by one byte of a character's value. The +code arranges to not have more than one copy of bottom-level tree pages +that are all-the-same-color. + +Unfortunately, this design does not seem terribly efficient for common +cases such as a tree in which all Unicode letters are colored the same, +because there aren't that many places where we get a whole page all the +same color, except at the end of the map. (It also strikes me that given +PG's current restrictions on the range of Unicode values, we could use a +3-level rather than 4-level tree; but there's not provision for that in +regguts.h at the moment.) + +A bigger problem is that it just doesn't seem very reasonable to have to +consider each Unicode letter separately at regex parse time for a regex +such as "\w"; more than likely, a huge percentage of those codes will +never be seen at runtime. 
We need to fix things so that locale-based +character classes are somehow processed "symbolically" without making a +full expansion of their contents at parse time. This would mean that we'd +have to be ready to call iswalpha() at runtime, but if that only happens +for high-code-value characters, it shouldn't be a big performance hit. diff --git a/src/backend/regex/regc_cvec.c b/src/backend/regex/regc_cvec.c index fb6f06b5243f5..580a693161e89 100644 --- a/src/backend/regex/regc_cvec.c +++ b/src/backend/regex/regc_cvec.c @@ -77,6 +77,7 @@ static void addchr(struct cvec * cv, /* character vector */ chr c) /* character to add */ { + assert(cv->nchrs < cv->chrspace); cv->chrs[cv->nchrs++] = (chr) c; } @@ -95,17 +96,27 @@ addrange(struct cvec * cv, /* character vector */ } /* - * getcvec - get a cvec, remembering it as v->cv + * getcvec - get a transient cvec, initialized to empty + * + * The returned cvec is valid only until the next call of getcvec, which + * typically will recycle the space. Callers should *not* free the cvec + * explicitly; it will be cleaned up when the struct vars is destroyed. + * + * This is typically used while interpreting bracket expressions. In that + * usage the cvec is only needed momentarily until we build arcs from it, + * so transientness is a convenient behavior. */ static struct cvec * getcvec(struct vars * v, /* context */ int nchrs, /* to hold this many chrs... */ int nranges) /* ... and this many ranges */ { + /* recycle existing transient cvec if large enough */ if (v->cv != NULL && nchrs <= v->cv->chrspace && nranges <= v->cv->rangespace) return clearcvec(v->cv); + /* nope, make a new one */ if (v->cv != NULL) freecvec(v->cv); v->cv = newcvec(nchrs, nranges); diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index bd4d4c3761928..4f9da5b0468d5 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -356,6 +356,7 @@ pg_regcomp(regex_t *re, ZAPCNFA(g->search); v->nfa = newnfa(v, v->cm, (struct nfa *) NULL); CNOERR(); + /* set up a reasonably-sized transient cvec for getcvec usage */ v->cv = newcvec(100, 20); if (v->cv == NULL) return freev(v, REG_ESPACE); diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 0cced701dbdc8..fb6789b560f38 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -181,34 +181,52 @@ union tree #define tcolor colors.ccolor #define tptr ptrs.pptr -/* internal per-color descriptor structure for the color machinery */ +/* + * Per-color data structure for the compile-time color machinery + * + * If "sub" is not NOSUB then it is the number of the color's current + * subcolor, i.e. we are in process of dividing this color (character + * equivalence class) into two colors. See src/backend/regex/README for + * discussion of subcolors. + * + * Currently-unused colors have the FREECOL bit set and are linked into a + * freelist using their "sub" fields, but only if their color numbers are + * less than colormap.max. Any array entries beyond "max" are just garbage. 
+ */ struct colordesc { uchr nchrs; /* number of chars of this color */ - color sub; /* open subcolor (if any); free chain ptr */ -#define NOSUB COLORLESS - struct arc *arcs; /* color chain */ - int flags; + color sub; /* open subcolor, if any; or free-chain ptr */ +#define NOSUB COLORLESS /* value of "sub" when no open subcolor */ + struct arc *arcs; /* chain of all arcs of this color */ + int flags; /* bit values defined next */ #define FREECOL 01 /* currently free */ #define PSEUDO 02 /* pseudocolor, no real chars */ #define UNUSEDCOLOR(cd) ((cd)->flags&FREECOL) union tree *block; /* block of solid color, if any */ }; -/* the color map itself */ +/* + * The color map itself + * + * Only the "tree" part is used at execution time, and that only via the + * GETCOLOR() macro. Possibly that should be separated from the compile-time + * data. + */ struct colormap { int magic; #define CMMAGIC 0x876 struct vars *v; /* for compile error reporting */ - size_t ncds; /* number of colordescs */ - size_t max; /* highest in use */ + size_t ncds; /* allocated length of colordescs array */ + size_t max; /* highest color number currently in use */ color free; /* beginning of free chain (if non-0) */ - struct colordesc *cd; + struct colordesc *cd; /* pointer to array of colordescs */ #define CDEND(cm) (&(cm)->cd[(cm)->max + 1]) + /* If we need up to NINLINECDS, we store them here to save a malloc */ #define NINLINECDS ((size_t)10) struct colordesc cdspace[NINLINECDS]; - union tree tree[NBYTS]; /* tree top, plus fill blocks */ + union tree tree[NBYTS]; /* tree top, plus lower-level fill blocks */ }; /* optimization magic to do fast chr->color mapping */ @@ -229,19 +247,25 @@ struct colormap /* - * Interface definitions for locale-interface functions in locale.c. + * Interface definitions for locale-interface functions in regc_locale.c. */ -/* Representation of a set of characters. */ +/* + * Representation of a set of characters. chrs[] represents individual + * code points, ranges[] represents ranges in the form min..max inclusive. + * + * Note that in cvecs gotten from newcvec() and intended to be freed by + * freecvec(), both arrays of chrs are after the end of the struct, not + * separately malloc'd; so chrspace and rangespace are effectively immutable. + */ struct cvec { int nchrs; /* number of chrs */ - int chrspace; /* number of chrs possible */ + int chrspace; /* number of chrs allocated in chrs[] */ chr *chrs; /* pointer to vector of chrs */ int nranges; /* number of ranges (chr pairs) */ - int rangespace; /* number of chrs possible */ + int rangespace; /* number of ranges allocated in ranges[] */ chr *ranges; /* pointer to vector of chr pairs */ - /* both batches of chrs are on the end */ }; From e00f68e49c148851187136d3278b7e9afa370537 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 19 Feb 2012 21:01:13 -0500 Subject: [PATCH 017/129] Add caching of ctype.h/wctype.h results in regc_locale.c. While this doesn't save a huge amount of runtime, it still seems worth doing, especially since I realized that the data copying I did in my first draft was quite unnecessary. In this version, once we have the results cached, getting them back for re-use is really very cheap. Also, remove the hard-wired limitation to not consider wctype.h results for character codes above 255. 
It turns out that we can't push the limit as far up as I'd originally hoped, because the regex colormap code is not efficient enough to cope very well with character classes containing many thousand letters, which a Unicode locale is entirely capable of producing. Still, we can push it up to U+7FF (which I chose as the limit of 2-byte UTF8 characters), which will at least make Eastern Europeans happy pending a better solution. Thus, this commit resolves the specific complaint in bug #6457, but not the more general issue that letters of non-western alphabets are mostly not recognized as matching [[:alpha:]]. --- src/backend/regex/regc_locale.c | 119 +++++----------- src/backend/regex/regc_pg_locale.c | 222 ++++++++++++++++++++++++++++- 2 files changed, 260 insertions(+), 81 deletions(-) diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c index 6cf27958b1545..c0414a24912f2 100644 --- a/src/backend/regex/regc_locale.c +++ b/src/backend/regex/regc_locale.c @@ -350,6 +350,16 @@ static const struct cname }; +/* + * We do not use the hard-wired Unicode classification tables that Tcl does. + * This is because (a) we need to deal with other encodings besides Unicode, + * and (b) we want to track the behavior of the libc locale routines as + * closely as possible. For example, it wouldn't be unreasonable for a + * locale to not consider every Unicode letter as a letter. So we build + * character classification cvecs by asking libc, even for Unicode. + */ + + /* * element - map collating-element name to celt */ @@ -489,7 +499,11 @@ eclass(struct vars * v, /* context */ /* * cclass - supply cvec for a character class * - * Must include case counterparts on request. + * Must include case counterparts if "cases" is true. + * + * The returned cvec might be either a transient cvec gotten from getcvec(), + * or a permanently cached one from pg_ctype_get_cache(). This is okay + * because callers are not supposed to explicitly free the result either way. */ static struct cvec * cclass(struct vars * v, /* context */ @@ -548,79 +562,54 @@ cclass(struct vars * v, /* context */ index = (int) CC_ALPHA; /* - * Now compute the character class contents. - * - * For the moment, assume that only char codes < 256 can be in these - * classes. + * Now compute the character class contents. For classes that are + * based on the behavior of a <ctype.h> or <wctype.h> function, we use + * pg_ctype_get_cache so that we can cache the results. Other classes + * have definitions that are hard-wired here, and for those we just + * construct a transient cvec on the fly.
*/ switch ((enum classes) index) { case CC_PRINT: - cv = getcvec(v, UCHAR_MAX, 0); - if (cv) - { - for (i = 0; i <= UCHAR_MAX; i++) - { - if (pg_wc_isprint((chr) i)) - addchr(cv, (chr) i); - } - } + cv = pg_ctype_get_cache(pg_wc_isprint); break; case CC_ALNUM: - cv = getcvec(v, UCHAR_MAX, 0); - if (cv) - { - for (i = 0; i <= UCHAR_MAX; i++) - { - if (pg_wc_isalnum((chr) i)) - addchr(cv, (chr) i); - } - } + cv = pg_ctype_get_cache(pg_wc_isalnum); break; case CC_ALPHA: - cv = getcvec(v, UCHAR_MAX, 0); - if (cv) - { - for (i = 0; i <= UCHAR_MAX; i++) - { - if (pg_wc_isalpha((chr) i)) - addchr(cv, (chr) i); - } - } + cv = pg_ctype_get_cache(pg_wc_isalpha); break; case CC_ASCII: + /* hard-wired meaning */ cv = getcvec(v, 0, 1); if (cv) addrange(cv, 0, 0x7f); break; case CC_BLANK: + /* hard-wired meaning */ cv = getcvec(v, 2, 0); addchr(cv, '\t'); addchr(cv, ' '); break; case CC_CNTRL: + /* hard-wired meaning */ cv = getcvec(v, 0, 2); addrange(cv, 0x0, 0x1f); addrange(cv, 0x7f, 0x9f); break; case CC_DIGIT: - cv = getcvec(v, 0, 1); - if (cv) - addrange(cv, (chr) '0', (chr) '9'); + cv = pg_ctype_get_cache(pg_wc_isdigit); break; case CC_PUNCT: - cv = getcvec(v, UCHAR_MAX, 0); - if (cv) - { - for (i = 0; i <= UCHAR_MAX; i++) - { - if (pg_wc_ispunct((chr) i)) - addchr(cv, (chr) i); - } - } + cv = pg_ctype_get_cache(pg_wc_ispunct); break; case CC_XDIGIT: + /* + * It's not clear how to define this in non-western locales, and + * even less clear that there's any particular use in trying. + * So just hard-wire the meaning. + */ cv = getcvec(v, 0, 3); if (cv) { @@ -630,50 +619,20 @@ cclass(struct vars * v, /* context */ } break; case CC_SPACE: - cv = getcvec(v, UCHAR_MAX, 0); - if (cv) - { - for (i = 0; i <= UCHAR_MAX; i++) - { - if (pg_wc_isspace((chr) i)) - addchr(cv, (chr) i); - } - } + cv = pg_ctype_get_cache(pg_wc_isspace); break; case CC_LOWER: - cv = getcvec(v, UCHAR_MAX, 0); - if (cv) - { - for (i = 0; i <= UCHAR_MAX; i++) - { - if (pg_wc_islower((chr) i)) - addchr(cv, (chr) i); - } - } + cv = pg_ctype_get_cache(pg_wc_islower); break; case CC_UPPER: - cv = getcvec(v, UCHAR_MAX, 0); - if (cv) - { - for (i = 0; i <= UCHAR_MAX; i++) - { - if (pg_wc_isupper((chr) i)) - addchr(cv, (chr) i); - } - } + cv = pg_ctype_get_cache(pg_wc_isupper); break; case CC_GRAPH: - cv = getcvec(v, UCHAR_MAX, 0); - if (cv) - { - for (i = 0; i <= UCHAR_MAX; i++) - { - if (pg_wc_isgraph((chr) i)) - addchr(cv, (chr) i); - } - } + cv = pg_ctype_get_cache(pg_wc_isgraph); break; } + + /* If cv is NULL now, the reason must be "out of memory" */ if (cv == NULL) ERR(REG_ESPACE); return cv; diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 7c010e3728580..eac951f200065 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -1,7 +1,8 @@ /*------------------------------------------------------------------------- * * regc_pg_locale.c - * ctype functions adapted to work on pg_wchar (a/k/a chr) + * ctype functions adapted to work on pg_wchar (a/k/a chr), + * and functions to cache the results of wholesale ctype probing. * * This file is #included by regcomp.c; it's not meant to compile standalone. 
* @@ -72,6 +73,7 @@ typedef enum static PG_Locale_Strategy pg_regex_strategy; static pg_locale_t pg_regex_locale; +static Oid pg_regex_collation; /* * Hard-wired character properties for C locale @@ -233,6 +235,7 @@ pg_set_regex_collation(Oid collation) /* C/POSIX collations use this path regardless of database encoding */ pg_regex_strategy = PG_REGEX_LOCALE_C; pg_regex_locale = 0; + pg_regex_collation = C_COLLATION_OID; } else { @@ -275,6 +278,8 @@ pg_set_regex_collation(Oid collation) else pg_regex_strategy = PG_REGEX_LOCALE_1BYTE; } + + pg_regex_collation = collation; } } @@ -656,3 +661,218 @@ pg_wc_tolower(pg_wchar c) } return 0; /* can't get here, but keep compiler quiet */ } + + +/* + * These functions cache the results of probing libc's ctype behavior for + * all character codes of interest in a given encoding/collation. The + * result is provided as a "struct cvec", but notice that the representation + * is a touch different from a cvec created by regc_cvec.c: we allocate the + * chrs[] and ranges[] arrays separately from the struct so that we can + * realloc them larger at need. This is okay since the cvecs made here + * should never be freed by freecvec(). + * + * We use malloc not palloc since we mustn't lose control on out-of-memory; + * the main regex code expects us to return a failure indication instead. + */ + +typedef int (*pg_wc_probefunc) (pg_wchar c); + +typedef struct pg_ctype_cache +{ + pg_wc_probefunc probefunc; /* pg_wc_isalpha or a sibling */ + Oid collation; /* collation this entry is for */ + struct cvec cv; /* cache entry contents */ + struct pg_ctype_cache *next; /* chain link */ +} pg_ctype_cache; + +static pg_ctype_cache *pg_ctype_cache_list = NULL; + +/* + * Add a chr or range to pcc->cv; return false if run out of memory + */ +static bool +store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs) +{ + chr *newchrs; + + if (nchrs > 1) + { + if (pcc->cv.nranges >= pcc->cv.rangespace) + { + pcc->cv.rangespace *= 2; + newchrs = (chr *) realloc(pcc->cv.ranges, + pcc->cv.rangespace * sizeof(chr) * 2); + if (newchrs == NULL) + return false; + pcc->cv.ranges = newchrs; + } + pcc->cv.ranges[pcc->cv.nranges * 2] = chr1; + pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1; + pcc->cv.nranges++; + } + else + { + assert(nchrs == 1); + if (pcc->cv.nchrs >= pcc->cv.chrspace) + { + pcc->cv.chrspace *= 2; + newchrs = (chr *) realloc(pcc->cv.chrs, + pcc->cv.chrspace * sizeof(chr)); + if (newchrs == NULL) + return false; + pcc->cv.chrs = newchrs; + } + pcc->cv.chrs[pcc->cv.nchrs++] = chr1; + } + return true; +} + +/* + * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all + * chrs satisfying the probe function. The active collation is the one + * previously set by pg_set_regex_collation. Return NULL if out of memory. + * + * Note that the result must not be freed or modified by caller. + */ +static struct cvec * +pg_ctype_get_cache(pg_wc_probefunc probefunc) +{ + pg_ctype_cache *pcc; + pg_wchar max_chr; + pg_wchar cur_chr; + int nmatches; + chr *newchrs; + + /* + * Do we already have the answer cached? + */ + for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next) + { + if (pcc->probefunc == probefunc && + pcc->collation == pg_regex_collation) + return &pcc->cv; + } + + /* + * Nope, so initialize some workspace ... 
+ */ + pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache)); + if (pcc == NULL) + return NULL; + pcc->probefunc = probefunc; + pcc->collation = pg_regex_collation; + pcc->cv.nchrs = 0; + pcc->cv.chrspace = 128; + pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr)); + pcc->cv.nranges = 0; + pcc->cv.rangespace = 64; + pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2); + if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL) + goto out_of_memory; + + /* + * Decide how many character codes we ought to look through. For C locale + * there's no need to go further than 127. Otherwise, if the encoding is + * UTF8 go up to 0x7FF, which is a pretty arbitrary cutoff but we cannot + * extend it as far as we'd like (say, 0xFFFF, the end of the Basic + * Multilingual Plane) without creating significant performance issues due + * to too many characters being fed through the colormap code. This will + * need redesign to fix reasonably, but at least for the moment we have + * all common European languages covered. Otherwise (not C, not UTF8) go + * up to 255. These limits are interrelated with restrictions discussed + * at the head of this file. + */ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + max_chr = (pg_wchar) 127; + break; + case PG_REGEX_LOCALE_WIDE: + case PG_REGEX_LOCALE_WIDE_L: + max_chr = (pg_wchar) 0x7FF; + break; + case PG_REGEX_LOCALE_1BYTE: + case PG_REGEX_LOCALE_1BYTE_L: + max_chr = (pg_wchar) UCHAR_MAX; + break; + default: + max_chr = 0; /* can't get here, but keep compiler quiet */ + break; + } + + /* + * And scan 'em ... + */ + nmatches = 0; /* number of consecutive matches */ + + for (cur_chr = 0; cur_chr <= max_chr; cur_chr++) + { + if ((*probefunc) (cur_chr)) + nmatches++; + else if (nmatches > 0) + { + if (!store_match(pcc, cur_chr - nmatches, nmatches)) + goto out_of_memory; + nmatches = 0; + } + } + + if (nmatches > 0) + if (!store_match(pcc, cur_chr - nmatches, nmatches)) + goto out_of_memory; + + /* + * We might have allocated more memory than needed, if so free it + */ + if (pcc->cv.nchrs == 0) + { + free(pcc->cv.chrs); + pcc->cv.chrs = NULL; + pcc->cv.chrspace = 0; + } + else if (pcc->cv.nchrs < pcc->cv.chrspace) + { + newchrs = (chr *) realloc(pcc->cv.chrs, + pcc->cv.nchrs * sizeof(chr)); + if (newchrs == NULL) + goto out_of_memory; + pcc->cv.chrs = newchrs; + pcc->cv.chrspace = pcc->cv.nchrs; + } + if (pcc->cv.nranges == 0) + { + free(pcc->cv.ranges); + pcc->cv.ranges = NULL; + pcc->cv.rangespace = 0; + } + else if (pcc->cv.nranges < pcc->cv.rangespace) + { + newchrs = (chr *) realloc(pcc->cv.ranges, + pcc->cv.nranges * sizeof(chr) * 2); + if (newchrs == NULL) + goto out_of_memory; + pcc->cv.ranges = newchrs; + pcc->cv.rangespace = pcc->cv.nranges; + } + + /* + * Success, link it into cache chain + */ + pcc->next = pg_ctype_cache_list; + pg_ctype_cache_list = pcc; + + return &pcc->cv; + + /* + * Failure, clean up + */ +out_of_memory: + if (pcc->cv.chrs) + free(pcc->cv.chrs); + if (pcc->cv.ranges) + free(pcc->cv.ranges); + free(pcc); + + return NULL; +} From 5223f96d92fd6fb6fcf260da9f9cb111831f0b37 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 20 Feb 2012 00:52:33 -0500 Subject: [PATCH 018/129] Fix regex back-references that are directly quantified with *. The syntax "\n*", that is a backref with a * quantifier directly applied to it, has never worked correctly in Spencer's library. 
This has been an open bug in the Tcl bug tracker since 2005: https://sourceforge.net/tracker/index.php?func=detail&aid=1115587&group_id=10894&atid=110894 The core of the problem is in parseqatom(), which first changes "\n*" to "\n+|" and then applies repeat() to the NFA representing the backref atom. repeat() thinks that any arc leading into its "rp" argument is part of the sub-NFA to be repeated. Unfortunately, since parseqatom() already created the arc that was intended to represent the empty bypass around "\n+", this arc gets moved too, so that it now leads into the state loop created by repeat(). Thus, what was supposed to be an "empty" bypass gets turned into something that represents zero or more repetitions of the NFA representing the backref atom. In the original example, in place of ^([bc])\1*$ we now have something that acts like ^([bc])(\1+|[bc]*)$ At runtime, the branch involving the actual backref fails, as it's supposed to, but then the other branch succeeds anyway. We could no doubt fix this by some rearrangement of the operations in parseqatom(), but that code is plenty ugly already, and what's more the whole business of converting "x*" to "x+|" probably needs to go away to fix another problem I'll mention in a moment. Instead, this patch suppresses the *-conversion when the target is a simple backref atom, leaving the case of m == 0 to be handled at runtime. This makes the patch in regcomp.c a one-liner, at the cost of having to tweak cbrdissect() a little. In the event I went a bit further than that and rewrote cbrdissect() to check all the string-length-related conditions before it starts comparing characters. It seems a bit stupid to possibly iterate through many copies of an n-character backreference, only to fail at the end because the target string's length isn't a multiple of n --- we could have found that out before starting. The existing coding could only be a win if integer division is hugely expensive compared to character comparison, but I don't know of any modern machine where that might be true. This does not fix all the problems with quantified back-references. In particular, the code is still broken for back-references that appear within a larger expression that is quantified (so that direct insertion of the quantification limits into the BACKREF node doesn't apply). I think fixing that will take some major surgery on the NFA code, specifically introducing an explicit iteration node type instead of trying to transform iteration into concatenation of modified regexps. Back-patch to all supported branches. In HEAD, also add a regression test case for this. (It may seem a bit silly to create a regression test file for just one test case; but I'm expecting that we will soon import a whole bunch of regex regression tests from Tcl, so might as well create the infrastructure now.) 
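The length arithmetic that lets the rewritten cbrdissect() reject impossible matches before comparing any characters can be sketched on plain C strings as follows. backref_len_ok() and the -1 sentinel for an unlimited max are inventions of this sketch; the real code works on chr pointers and the library's INFINITY constant.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define UNLIMITED (-1)			/* stand-in for the library's INFINITY */

/*
 * Could "target" be a legal number of repetitions of "br", the string
 * the backreference matched?  This is the cheap pre-check: the target
 * length must be an exact multiple of the backref length, and the
 * implied repeat count must fall within {min,max}.
 */
static bool
backref_len_ok(const char *br, const char *target, int min, int max)
{
	size_t		brlen = strlen(br);
	size_t		tlen = strlen(target);
	size_t		numreps;

	if (brlen == 0)				/* zero-length backref: target must be empty */
		return tlen == 0 && (max == UNLIMITED || min <= max);
	if (tlen % brlen != 0)
		return false;			/* can't be a whole number of copies */
	numreps = tlen / brlen;
	return numreps >= (size_t) min &&
		(max == UNLIMITED || numreps <= (size_t) max);
}

int
main(void)
{
	/* ^([bc])\1*$ on "bbbbb": "\1" = "b", four more copies to account for */
	printf("%d\n", (int) backref_len_ok("b", "bbbb", 0, UNLIMITED));	/* 1 */
	printf("%d\n", (int) backref_len_ok("ab", "aba", 0, UNLIMITED));	/* 0 */
	return 0;
}

Only when this pre-check passes is it worth comparing the actual characters copy by copy, which is exactly the order of operations the new coding uses.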
--- src/backend/regex/regcomp.c | 8 +++- src/backend/regex/regexec.c | 73 ++++++++++++++++++----------- src/test/regress/expected/regex.out | 36 ++++++++++++++ src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/regex.sql | 13 +++++ 6 files changed, 103 insertions(+), 30 deletions(-) create mode 100644 src/test/regress/expected/regex.out create mode 100644 src/test/regress/sql/regex.sql diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 4f9da5b0468d5..6b80140e90940 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -1088,8 +1088,12 @@ parseqatom(struct vars * v, NOERR(); } - /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */ - if (m == 0) + /* + * It's quantifier time. If the atom is just a BACKREF, we'll let it deal + * with quantifiers internally. Otherwise, the first step is to turn + * x{0,...} into x{1,...}|empty + */ + if (m == 0 && atomtype != BACKREF) { EMPTYARC(s2, atom->end); /* the bypass */ assert(PREF(qprefer) != 0); diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index f8e31f8f4ade8..224da5064b69b 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -720,7 +720,7 @@ cdissect(struct vars * v, case '|': /* alternation */ assert(t->left != NULL); return caltdissect(v, t, begin, end); - case 'b': /* back ref -- shouldn't be calling us! */ + case 'b': /* back reference */ assert(t->left == NULL && t->right == NULL); return cbrdissect(v, t, begin, end); case '.': /* concatenation */ @@ -962,12 +962,12 @@ cbrdissect(struct vars * v, chr *begin, /* beginning of relevant substring */ chr *end) /* end of same */ { - int i; int n = t->subno; - size_t len; - chr *paren; + size_t numreps; + size_t tlen; + size_t brlen; + chr *brstring; chr *p; - chr *stop; int min = t->min; int max = t->max; @@ -978,46 +978,65 @@ cbrdissect(struct vars * v, MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max)); + /* get the backreferenced string */ if (v->pmatch[n].rm_so == -1) return REG_NOMATCH; - paren = v->start + v->pmatch[n].rm_so; - len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so; + brstring = v->start + v->pmatch[n].rm_so; + brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so; /* no room to maneuver -- retries are pointless */ if (v->mem[t->retry]) return REG_NOMATCH; v->mem[t->retry] = 1; - /* special-case zero-length string */ - if (len == 0) + /* special cases for zero-length strings */ + if (brlen == 0) + { + /* + * matches only if target is zero length, but any number of + * repetitions can be considered to be present + */ + if (begin == end && min <= max) + { + MDEBUG(("cbackref matched trivially\n")); + return REG_OKAY; + } + return REG_NOMATCH; + } + if (begin == end) { - if (begin == end) + /* matches only if zero repetitions are okay */ + if (min == 0) + { + MDEBUG(("cbackref matched trivially\n")); return REG_OKAY; + } return REG_NOMATCH; } - /* and too-short string */ - assert(end >= begin); - if ((size_t) (end - begin) < len) + /* + * check target length to see if it could possibly be an allowed number of + * repetitions of brstring + */ + assert(end > begin); + tlen = end - begin; + if (tlen % brlen != 0) + return REG_NOMATCH; + numreps = tlen / brlen; + if (numreps < min || (numreps > max && max != INFINITY)) return REG_NOMATCH; - stop = end - len; - /* count occurrences */ - i = 0; - for (p = begin; p <= stop && (i < max || max == INFINITY); p += len) + /* okay, compare the actual string contents */ + p = begin; + 
while (numreps-- > 0) { - if ((*v->g->compare) (paren, p, len) != 0) - break; - i++; + if ((*v->g->compare) (brstring, p, brlen) != 0) + return REG_NOMATCH; + p += brlen; } - MDEBUG(("cbackref found %d\n", i)); - /* and sort it out */ - if (p != end) /* didn't consume all of it */ - return REG_NOMATCH; - if (min <= i && (i <= max || max == INFINITY)) - return REG_OKAY; - return REG_NOMATCH; /* out of range */ + MDEBUG(("cbackref matched\n")); + return REG_OKAY; } /* diff --git a/src/test/regress/expected/regex.out b/src/test/regress/expected/regex.out new file mode 100644 index 0000000000000..5694908163af8 --- /dev/null +++ b/src/test/regress/expected/regex.out @@ -0,0 +1,36 @@ +-- +-- Regular expression tests +-- +-- Don't want to have to double backslashes in regexes +set standard_conforming_strings = on; +-- Test simple quantified backrefs +select 'bbbbb' ~ '^([bc])\1*$' as t; + t +--- + t +(1 row) + +select 'ccc' ~ '^([bc])\1*$' as t; + t +--- + t +(1 row) + +select 'xxx' ~ '^([bc])\1*$' as f; + f +--- + f +(1 row) + +select 'bbc' ~ '^([bc])\1*$' as f; + f +--- + f +(1 row) + +select 'b' ~ '^([bc])\1*$' as t; + t +--- + t +(1 row) + diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 862f5b20077a6..8852e0a40fc5c 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -30,7 +30,7 @@ test: point lseg box path polygon circle date time timetz timestamp timestamptz # geometry depends on point, lseg, box, path, polygon and circle # horology depends on interval, timetz, timestamp, timestamptz, reltime and abstime # ---------- -test: geometry horology oidjoins type_sanity opr_sanity +test: geometry horology regex oidjoins type_sanity opr_sanity # ---------- # These four each depend on the previous one diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 142fc9cf0d1a1..0bc5df7fe73f5 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -42,6 +42,7 @@ test: tstypes test: comments test: geometry test: horology +test: regex test: oidjoins test: type_sanity test: opr_sanity diff --git a/src/test/regress/sql/regex.sql b/src/test/regress/sql/regex.sql new file mode 100644 index 0000000000000..242a81ef3298a --- /dev/null +++ b/src/test/regress/sql/regex.sql @@ -0,0 +1,13 @@ +-- +-- Regular expression tests +-- + +-- Don't want to have to double backslashes in regexes +set standard_conforming_strings = on; + +-- Test simple quantified backrefs +select 'bbbbb' ~ '^([bc])\1*$' as t; +select 'ccc' ~ '^([bc])\1*$' as t; +select 'xxx' ~ '^([bc])\1*$' as f; +select 'bbc' ~ '^([bc])\1*$' as f; +select 'b' ~ '^([bc])\1*$' as t; From 83fcaffea2b55152e45fdcaf3fdaf4c0c89f65ce Mon Sep 17 00:00:00 2001 From: Andrew Dunstan Date: Mon, 20 Feb 2012 15:01:03 -0500 Subject: [PATCH 019/129] Fix a couple of cases of JSON output. First, as noted by Itagaki Takahiro, a datum of type JSON doesn't need to be escaped. Second, ensure that numeric output not in the form of a legal JSON number is quoted and escaped. 
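The quoting rule the patch applies can be stated in a few lines. A minimal sketch, outside the server: numeric output is emitted verbatim only when it contains none of the letters that can occur in non-JSON numbers such as NaN or Infinity; otherwise it is emitted as a quoted JSON string. The emit_json_number() helper and the use of printf are illustrative assumptions; the real datum_to_json() appends to a StringInfo and calls escape_json(). (The letter list below already incorporates the typo fix that appears later in this series.)

    #include <stdio.h>
    #include <string.h>

    /* letters appearing in numeric output that aren't valid in a JSON number */
    #define NON_NUMERIC_LETTER "NnAaIiFfTtYy"

    static void
    emit_json_number(const char *outputstr)
    {
        if (strpbrk(outputstr, NON_NUMERIC_LETTER) == NULL)
            printf("%s", outputstr);        /* legal JSON number: unquoted */
        else
            printf("\"%s\"", outputstr);    /* NaN, Infinity, etc.: quoted;
                                             * these contain nothing that
                                             * needs escaping */
    }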
--- src/backend/utils/adt/json.c | 25 +++++++++++++++++++++---- src/test/regress/expected/json.out | 30 ++++++++++++++++++++++++++++++ src/test/regress/sql/json.sql | 15 +++++++++++++++ 3 files changed, 66 insertions(+), 4 deletions(-) diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c index 60addf2871f34..feda0e0035740 100644 --- a/src/backend/utils/adt/json.c +++ b/src/backend/utils/adt/json.c @@ -84,6 +84,10 @@ static void array_dim_to_json(StringInfo result, int dim, int ndims,int * dims, Oid typoutputfunc, bool use_line_feeds); static void array_to_json_internal(Datum array, StringInfo result, bool use_line_feeds); +/* fake type category for JSON so we can distinguish it in datum_to_json */ +#define TYPCATEGORY_JSON 'j' +/* letters appearing in numeric output that aren't valid in a JSON number */ +#define NON_NUMERIC_LETTER "NnAnIiFfTtYy" /* * Input. */ @@ -707,10 +711,20 @@ datum_to_json(Datum val, StringInfo result, TYPCATEGORY tcategory, case TYPCATEGORY_NUMERIC: outputstr = OidOutputFunctionCall(typoutputfunc, val); /* - * Don't call escape_json here. Numeric output should - * be a valid JSON number and JSON numbers shouldn't - * be quoted. + * Don't call escape_json here if it's a valid JSON + * number. Numeric output should usually be a valid + * JSON number and JSON numbers shouldn't be quoted. + * Quote cases like "Nan" and "Infinity", however. */ + if (strpbrk(outputstr,NON_NUMERIC_LETTER) == NULL) + appendStringInfoString(result, outputstr); + else + escape_json(result, outputstr); + pfree(outputstr); + break; + case TYPCATEGORY_JSON: + /* JSON will already be escaped */ + outputstr = OidOutputFunctionCall(typoutputfunc, val); appendStringInfoString(result, outputstr); pfree(outputstr); break; @@ -806,9 +820,10 @@ array_to_json_internal(Datum array, StringInfo result, bool use_line_feeds) typalign, &elements, &nulls, &nitems); - /* can't have an array of arrays, so this is the only special case here */ if (element_type == RECORDOID) tcategory = TYPCATEGORY_COMPOSITE; + else if (element_type == JSONOID) + tcategory = TYPCATEGORY_JSON; else tcategory = TypeCategory(element_type); @@ -876,6 +891,8 @@ composite_to_json(Datum composite, StringInfo result, bool use_line_feeds) tcategory = TYPCATEGORY_ARRAY; else if (tupdesc->attrs[i]->atttypid == RECORDOID) tcategory = TYPCATEGORY_COMPOSITE; + else if (tupdesc->attrs[i]->atttypid == JSONOID) + tcategory = TYPCATEGORY_JSON; else tcategory = TypeCategory(tupdesc->attrs[i]->atttypid); diff --git a/src/test/regress/expected/json.out b/src/test/regress/expected/json.out index 2b573511139c4..fa8415cdb77c5 100644 --- a/src/test/regress/expected/json.out +++ b/src/test/regress/expected/json.out @@ -367,3 +367,33 @@ SELECT row_to_json(row((select array_agg(x) as d from generate_series(5,10) x)), {"f1":[5,6,7,8,9,10]} (1 row) +-- non-numeric output +SELECT row_to_json(q) +FROM (SELECT 'NaN'::float8 AS "float8field") q; + row_to_json +----------------------- + {"float8field":"NaN"} +(1 row) + +SELECT row_to_json(q) +FROM (SELECT 'Infinity'::float8 AS "float8field") q; + row_to_json +---------------------------- + {"float8field":"Infinity"} +(1 row) + +SELECT row_to_json(q) +FROM (SELECT '-Infinity'::float8 AS "float8field") q; + row_to_json +----------------------------- + {"float8field":"-Infinity"} +(1 row) + +-- json input +SELECT row_to_json(q) +FROM (SELECT '{"a":1,"b": [2,3,4,"d","e","f"],"c":{"p":1,"q":2}}'::json AS "jsonfield") q; + row_to_json +------------------------------------------------------------------ 
+ {"jsonfield":{"a":1,"b": [2,3,4,"d","e","f"],"c":{"p":1,"q":2}}} +(1 row) + diff --git a/src/test/regress/sql/json.sql b/src/test/regress/sql/json.sql index 61273555aae1f..ab1c41c1c4c98 100644 --- a/src/test/regress/sql/json.sql +++ b/src/test/regress/sql/json.sql @@ -97,3 +97,18 @@ SELECT row_to_json(q,true) FROM rows q; SELECT row_to_json(row((select array_agg(x) as d from generate_series(5,10) x)),false); + +-- non-numeric output +SELECT row_to_json(q) +FROM (SELECT 'NaN'::float8 AS "float8field") q; + +SELECT row_to_json(q) +FROM (SELECT 'Infinity'::float8 AS "float8field") q; + +SELECT row_to_json(q) +FROM (SELECT '-Infinity'::float8 AS "float8field") q; + +-- json input +SELECT row_to_json(q) +FROM (SELECT '{"a":1,"b": [2,3,4,"d","e","f"],"c":{"p":1,"q":2}}'::json AS "jsonfield") q; + From c0efc2c2ab416b805ba5ccea621d7198a3f3330f Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 20 Feb 2012 16:21:28 -0500 Subject: [PATCH 020/129] Don't reject threaded Python on FreeBSD. According to Chris Rees, this has worked for awhile, and the current FreeBSD port is removing the test anyway. --- config/python.m4 | 4 ++-- configure | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config/python.m4 b/config/python.m4 index fa484b46a00a6..baa7136f3674e 100644 --- a/config/python.m4 +++ b/config/python.m4 @@ -85,13 +85,13 @@ AC_SUBST(python_libdir)[]dnl AC_SUBST(python_libspec)[]dnl AC_SUBST(python_additional_libs)[]dnl -# threaded python is not supported on bsd's +# threaded python is not supported on OpenBSD AC_MSG_CHECKING(whether Python is compiled with thread support) pythreads=`${PYTHON} -c "import sys; print(int('thread' in sys.builtin_module_names))"` if test "$pythreads" = "1"; then AC_MSG_RESULT(yes) case $host_os in - openbsd*|freebsd*) + openbsd*) AC_MSG_ERROR([threaded Python not supported on this platform]) ;; esac diff --git a/configure b/configure index 26f945ef65250..4a943641580b7 100755 --- a/configure +++ b/configure @@ -7401,7 +7401,7 @@ python_additional_libs=`${PYTHON} -c "import distutils.sysconfig; print(' '.join $as_echo "${python_libspec} ${python_additional_libs}" >&6; } -# threaded python is not supported on bsd's +# threaded python is not supported on OpenBSD { $as_echo "$as_me:$LINENO: checking whether Python is compiled with thread support" >&5 $as_echo_n "checking whether Python is compiled with thread support... " >&6; } pythreads=`${PYTHON} -c "import sys; print(int('thread' in sys.builtin_module_names))"` @@ -7409,7 +7409,7 @@ if test "$pythreads" = "1"; then { $as_echo "$as_me:$LINENO: result: yes" >&5 $as_echo "yes" >&6; } case $host_os in - openbsd*|freebsd*) + openbsd*) { { $as_echo "$as_me:$LINENO: error: threaded Python not supported on this platform" >&5 $as_echo "$as_me: error: threaded Python not supported on this platform" >&2;} { (exit 1); exit 1; }; } From 95ca2859f45171c345d427991c1f319b5e77cc6c Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 21 Feb 2012 16:45:19 +0200 Subject: [PATCH 021/129] pg_regress: Add application name setting Set the PGAPPNAME environment variable in pg_regress so that it identifies itself as such instead of "psql". 
--- src/test/regress/pg_regress.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index 2f6b37bf3b5b6..13842231ce14f 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -691,6 +691,8 @@ initialize_environment(void) { char *tmp; + putenv("PGAPPNAME=pg_regress"); + if (nolocale) { /* From 6b044cb810460993ad9e458a0ee8fcc9fde5a350 Mon Sep 17 00:00:00 2001 From: Andrew Dunstan Date: Tue, 21 Feb 2012 11:03:51 -0500 Subject: [PATCH 022/129] Fix typo, noticed by Will Crawford. --- src/backend/utils/adt/json.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c index feda0e0035740..d7db4cf0cf935 100644 --- a/src/backend/utils/adt/json.c +++ b/src/backend/utils/adt/json.c @@ -87,7 +87,7 @@ static void array_to_json_internal(Datum array, StringInfo result, bool use_line /* fake type category for JSON so we can distinguish it in datum_to_json */ #define TYPCATEGORY_JSON 'j' /* letters appearing in numeric output that aren't valid in a JSON number */ -#define NON_NUMERIC_LETTER "NnAnIiFfTtYy" +#define NON_NUMERIC_LETTER "NnAaIiFfTtYy" /* * Input. */ From c2a2f7516bd27d4b2bcd387b2aa25a5b98d9c7b0 Mon Sep 17 00:00:00 2001 From: Magnus Hagander Date: Tue, 21 Feb 2012 17:12:25 +0100 Subject: [PATCH 023/129] Avoid double close of file handle in syslogger on win32 This causes an exception when running under a debugger or in particular when running on a debug version of Windows. Patch from MauMau --- src/backend/postmaster/syslogger.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/backend/postmaster/syslogger.c b/src/backend/postmaster/syslogger.c index cea896a8f979e..c331dd9de8414 100644 --- a/src/backend/postmaster/syslogger.c +++ b/src/backend/postmaster/syslogger.c @@ -588,8 +588,11 @@ SysLogger_Start(void) errmsg("could not redirect stderr: %m"))); close(fd); _setmode(_fileno(stderr), _O_BINARY); - /* Now we are done with the write end of the pipe. */ - CloseHandle(syslogPipe[1]); + /* + * Now we are done with the write end of the pipe. + * CloseHandle() must not be called because the preceding + * close() closes the underlying handle. + */ syslogPipe[1] = 0; #endif redirection_done = true; From 9789c99d01e7e4460b77c29b77d177f86c45a273 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 21 Feb 2012 14:14:16 -0500 Subject: [PATCH 024/129] Cosmetic cleanup for commit a760893dbda9934e287789d54bbd3c4ca3914ce0. Mostly, fixing overlooked comments. --- src/backend/access/nbtree/nbtpage.c | 28 +++++++++++++++++++++------- src/backend/access/nbtree/nbtree.c | 11 ++++++----- src/backend/access/nbtree/nbtxlog.c | 4 ++-- src/include/access/nbtree.h | 3 ++- 4 files changed, 31 insertions(+), 15 deletions(-) diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 42b2cd40b89e6..29a9df027b85e 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -715,7 +715,7 @@ _bt_page_recyclable(Page page) } /* - * Delete item(s) from a btree page. + * Delete item(s) from a btree page during VACUUM. * * This must only be used for deleting leaf items. 
Deleting an item on a * non-leaf page has to be done as part of an atomic action that includes @@ -736,7 +736,8 @@ _bt_page_recyclable(Page page) */ void _bt_delitems_vacuum(Relation rel, Buffer buf, - OffsetNumber *itemnos, int nitems, BlockNumber lastBlockVacuumed) + OffsetNumber *itemnos, int nitems, + BlockNumber lastBlockVacuumed) { Page page = BufferGetPage(buf); BTPageOpaque opaque; @@ -771,7 +772,6 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, { XLogRecPtr recptr; XLogRecData rdata[2]; - xl_btree_vacuum xlrec_vacuum; xlrec_vacuum.node = rel->rd_node; @@ -811,13 +811,27 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, END_CRIT_SECTION(); } +/* + * Delete item(s) from a btree page during single-page cleanup. + * + * As above, must only be used on leaf pages. + * + * This routine assumes that the caller has pinned and locked the buffer. + * Also, the given itemnos *must* appear in increasing order in the array. + * + * This is nearly the same as _bt_delitems_vacuum as far as what it does to + * the page, but the WAL logging considerations are quite different. See + * comments for _bt_delitems_vacuum. + */ void _bt_delitems_delete(Relation rel, Buffer buf, - OffsetNumber *itemnos, int nitems, Relation heapRel) + OffsetNumber *itemnos, int nitems, + Relation heapRel) { Page page = BufferGetPage(buf); BTPageOpaque opaque; + /* Shouldn't be called unless there's something to do */ Assert(nitems > 0); /* No ereport(ERROR) until changes are logged */ @@ -849,7 +863,6 @@ _bt_delitems_delete(Relation rel, Buffer buf, { XLogRecPtr recptr; XLogRecData rdata[3]; - xl_btree_delete xlrec_delete; xlrec_delete.node = rel->rd_node; @@ -863,8 +876,9 @@ _bt_delitems_delete(Relation rel, Buffer buf, rdata[0].next = &(rdata[1]); /* - * We need the target-offsets array whether or not we store the to - * allow us to find the latestRemovedXid on a standby server. + * We need the target-offsets array whether or not we store the whole + * buffer, to allow us to find the latestRemovedXid on a standby + * server. */ rdata[1].data = (char *) itemnos; rdata[1].len = nitems * sizeof(OffsetNumber); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 7627738437367..184fc3bb79b23 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -1004,14 +1004,15 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno) } /* - * Apply any needed deletes. We issue just one _bt_delitems() call - * per page, so as to minimize WAL traffic. + * Apply any needed deletes. We issue just one _bt_delitems_vacuum() + * call per page, so as to minimize WAL traffic. */ if (ndeletable > 0) { BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf); - _bt_delitems_vacuum(rel, buf, deletable, ndeletable, vstate->lastBlockVacuumed); + _bt_delitems_vacuum(rel, buf, deletable, ndeletable, + vstate->lastBlockVacuumed); /* * Keep track of the block number of the lastBlockVacuumed, so we @@ -1031,8 +1032,8 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno) /* * If the page has been split during this vacuum cycle, it seems * worth expending a write to clear btpo_cycleid even if we don't - * have any deletions to do. (If we do, _bt_delitems takes care - * of this.) This ensures we won't process the page again. + * have any deletions to do. (If we do, _bt_delitems_vacuum takes + * care of this.) This ensures we won't process the page again. * * We treat this like a hint-bit update because there's no need to * WAL-log it. 
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 0f5c113492dd1..3b351a8b9641d 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -539,7 +539,7 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) /* * Mark the page as not containing any LP_DEAD items --- see comments in - * _bt_delitems(). + * _bt_delitems_vacuum(). */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_flags &= ~BTP_HAS_GARBAGE; @@ -720,7 +720,7 @@ btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record) /* * Mark the page as not containing any LP_DEAD items --- see comments in - * _bt_delitems(). + * _bt_delitems_delete(). */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_flags &= ~BTP_HAS_GARBAGE; diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 041733ce30952..cae51a384d476 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -635,7 +635,8 @@ extern bool _bt_page_recyclable(Page page); extern void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, Relation heapRel); extern void _bt_delitems_vacuum(Relation rel, Buffer buf, - OffsetNumber *itemnos, int nitems, BlockNumber lastBlockVacuumed); + OffsetNumber *itemnos, int nitems, + BlockNumber lastBlockVacuumed); extern int _bt_pagedel(Relation rel, Buffer buf, BTStack stack); /* From 593a9631a7947ab95903e87e24786d7e469cc988 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 21 Feb 2012 15:03:36 -0500 Subject: [PATCH 025/129] Don't clear btpo_cycleid during _bt_vacuum_one_page. When "vacuuming" a single btree page by removing LP_DEAD tuples, we are not actually within a vacuum operation, but rather in an ordinary insertion process that could well be running concurrently with a vacuum. So clearing the cycleid is incorrect, and could cause the concurrent vacuum to miss removing tuples that it needs to remove. This is a longstanding bug introduced by commit e6284649b9e30372b3990107a082bc7520325676 of 2006-07-25. I believe it explains Maxim Boguk's recent report of index corruption, and probably some other previously unexplained reports. In 9.0 and up this is a one-line fix; before that we need to introduce a flag to tell _bt_delitems what to do. --- src/backend/access/nbtree/nbtpage.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 29a9df027b85e..c5e147ff43522 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -841,11 +841,9 @@ _bt_delitems_delete(Relation rel, Buffer buf, PageIndexMultiDelete(page, itemnos, nitems); /* - * We can clear the vacuum cycle ID since this page has certainly been - * processed by the current vacuum scan. + * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID, + * because this is not called by VACUUM. */ - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - opaque->btpo_cycleid = 0; /* * Mark the page as not containing any LP_DEAD items. This is not @@ -854,6 +852,7 @@ _bt_delitems_delete(Relation rel, Buffer buf, * true and it doesn't seem worth an additional page scan to check it. * Remember that BTP_HAS_GARBAGE is only a hint anyway. 
*/ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_flags &= ~BTP_HAS_GARBAGE; MarkBufferDirty(buf); From a417f85e1da1ef241af4bf40507ca213464d7069 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Tue, 21 Feb 2012 17:58:02 -0300 Subject: [PATCH 026/129] REASSIGN OWNED: Support foreign data wrappers and servers This was overlooked when implementing those kinds of objects, in commit cae565e503c42a0942ca1771665243b4453c5770. Per report from Pawel Casperek. --- src/backend/catalog/pg_shdepend.c | 10 +++ src/backend/commands/foreigncmds.c | 134 ++++++++++++++++++++++------- src/include/commands/defrem.h | 2 + 3 files changed, 115 insertions(+), 31 deletions(-) diff --git a/src/backend/catalog/pg_shdepend.c b/src/backend/catalog/pg_shdepend.c index 11cb9883a762a..bcc663515c1f9 100644 --- a/src/backend/catalog/pg_shdepend.c +++ b/src/backend/catalog/pg_shdepend.c @@ -25,6 +25,8 @@ #include "catalog/pg_conversion.h" #include "catalog/pg_database.h" #include "catalog/pg_default_acl.h" +#include "catalog/pg_foreign_data_wrapper.h" +#include "catalog/pg_foreign_server.h" #include "catalog/pg_language.h" #include "catalog/pg_largeobject.h" #include "catalog/pg_namespace.h" @@ -1382,6 +1384,14 @@ shdepReassignOwned(List *roleids, Oid newrole) AlterOpFamilyOwner_oid(sdepForm->objid, newrole); break; + case ForeignServerRelationId: + AlterForeignServerOwner_oid(sdepForm->objid, newrole); + break; + + case ForeignDataWrapperRelationId: + AlterForeignDataWrapperOwner_oid(sdepForm->objid, newrole); + break; + default: elog(ERROR, "unexpected classid %u", sdepForm->classid); break; diff --git a/src/backend/commands/foreigncmds.c b/src/backend/commands/foreigncmds.c index 4135e268575f4..990875de76bb4 100644 --- a/src/backend/commands/foreigncmds.c +++ b/src/backend/commands/foreigncmds.c @@ -277,27 +277,24 @@ RenameForeignServer(const char *oldname, const char *newname) /* - * Change foreign-data wrapper owner. + * Internal workhorse for changing a data wrapper's owner. * * Allow this only for superusers; also the new owner must be a * superuser. 
*/ -void -AlterForeignDataWrapperOwner(const char *name, Oid newOwnerId) +static void +AlterForeignDataWrapperOwner_internal(Relation rel, HeapTuple tup, Oid newOwnerId) { - HeapTuple tup; - Relation rel; - Oid fdwId; Form_pg_foreign_data_wrapper form; - rel = heap_open(ForeignDataWrapperRelationId, RowExclusiveLock); + form = (Form_pg_foreign_data_wrapper) GETSTRUCT(tup); /* Must be a superuser to change a FDW owner */ if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("permission denied to change owner of foreign-data wrapper \"%s\"", - name), + NameStr(form->fdwname)), errhint("Must be superuser to change owner of a foreign-data wrapper."))); /* New owner must also be a superuser */ @@ -305,19 +302,9 @@ AlterForeignDataWrapperOwner(const char *name, Oid newOwnerId) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("permission denied to change owner of foreign-data wrapper \"%s\"", - name), + NameStr(form->fdwname)), errhint("The owner of a foreign-data wrapper must be a superuser."))); - tup = SearchSysCacheCopy1(FOREIGNDATAWRAPPERNAME, CStringGetDatum(name)); - - if (!HeapTupleIsValid(tup)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("foreign-data wrapper \"%s\" does not exist", name))); - - fdwId = HeapTupleGetOid(tup); - form = (Form_pg_foreign_data_wrapper) GETSTRUCT(tup); - if (form->fdwowner != newOwnerId) { form->fdwowner = newOwnerId; @@ -327,38 +314,73 @@ AlterForeignDataWrapperOwner(const char *name, Oid newOwnerId) /* Update owner dependency reference */ changeDependencyOnOwner(ForeignDataWrapperRelationId, - fdwId, + HeapTupleGetOid(tup), newOwnerId); } +} + +/* + * Change foreign-data wrapper owner -- by name + * + * Note restrictions in the "_internal" function, above. + */ +void +AlterForeignDataWrapperOwner(const char *name, Oid newOwnerId) +{ + HeapTuple tup; + Relation rel; + + rel = heap_open(ForeignDataWrapperRelationId, RowExclusiveLock); + + tup = SearchSysCacheCopy1(FOREIGNDATAWRAPPERNAME, CStringGetDatum(name)); + + if (!HeapTupleIsValid(tup)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("foreign-data wrapper \"%s\" does not exist", name))); + + AlterForeignDataWrapperOwner_internal(rel, tup, newOwnerId); heap_freetuple(tup); heap_close(rel, RowExclusiveLock); } - /* - * Change foreign server owner + * Change foreign-data wrapper owner -- by OID + * + * Note restrictions in the "_internal" function, above. 
*/ void -AlterForeignServerOwner(const char *name, Oid newOwnerId) +AlterForeignDataWrapperOwner_oid(Oid fwdId, Oid newOwnerId) { HeapTuple tup; Relation rel; - Oid srvId; - AclResult aclresult; - Form_pg_foreign_server form; - rel = heap_open(ForeignServerRelationId, RowExclusiveLock); + rel = heap_open(ForeignDataWrapperRelationId, RowExclusiveLock); - tup = SearchSysCacheCopy1(FOREIGNSERVERNAME, CStringGetDatum(name)); + tup = SearchSysCacheCopy1(FOREIGNDATAWRAPPEROID, ObjectIdGetDatum(fwdId)); if (!HeapTupleIsValid(tup)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("server \"%s\" does not exist", name))); + errmsg("foreign-data wrapper with OID \"%u\" does not exist", fwdId))); + + AlterForeignDataWrapperOwner_internal(rel, tup, newOwnerId); + + heap_freetuple(tup); + + heap_close(rel, RowExclusiveLock); +} + +/* + * Internal workhorse for changing a foreign server's owner + */ +static void +AlterForeignServerOwner_internal(Relation rel, HeapTuple tup, Oid newOwnerId) +{ + Form_pg_foreign_server form; - srvId = HeapTupleGetOid(tup); form = (Form_pg_foreign_server) GETSTRUCT(tup); if (form->srvowner != newOwnerId) @@ -366,10 +388,15 @@ AlterForeignServerOwner(const char *name, Oid newOwnerId) /* Superusers can always do it */ if (!superuser()) { + Oid srvId; + AclResult aclresult; + + srvId = HeapTupleGetOid(tup); + /* Must be owner */ if (!pg_foreign_server_ownercheck(srvId, GetUserId())) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_FOREIGN_SERVER, - name); + NameStr(form->srvname)); /* Must be able to become new owner */ check_is_member_of_role(GetUserId(), newOwnerId); @@ -393,12 +420,57 @@ AlterForeignServerOwner(const char *name, Oid newOwnerId) changeDependencyOnOwner(ForeignServerRelationId, HeapTupleGetOid(tup), newOwnerId); } +} + +/* + * Change foreign server owner -- by name + */ +void +AlterForeignServerOwner(const char *name, Oid newOwnerId) +{ + HeapTuple tup; + Relation rel; + + rel = heap_open(ForeignServerRelationId, RowExclusiveLock); + + tup = SearchSysCacheCopy1(FOREIGNSERVERNAME, CStringGetDatum(name)); + + if (!HeapTupleIsValid(tup)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("server \"%s\" does not exist", name))); + + AlterForeignServerOwner_internal(rel, tup, newOwnerId); heap_freetuple(tup); heap_close(rel, RowExclusiveLock); } +/* + * Change foreign server owner -- by OID + */ +void +AlterForeignServerOwner_oid(Oid srvId, Oid newOwnerId) +{ + HeapTuple tup; + Relation rel; + + rel = heap_open(ForeignServerRelationId, RowExclusiveLock); + + tup = SearchSysCacheCopy1(FOREIGNSERVEROID, ObjectIdGetDatum(srvId)); + + if (!HeapTupleIsValid(tup)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("server with OID \"%u\" does not exist", srvId))); + + AlterForeignServerOwner_internal(rel, tup, newOwnerId); + + heap_freetuple(tup); + + heap_close(rel, RowExclusiveLock); +} /* * Convert a handler function name passed from the parser to an Oid. 
diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h index 6c7c04f898775..163b2ea002b7f 100644 --- a/src/include/commands/defrem.h +++ b/src/include/commands/defrem.h @@ -149,7 +149,9 @@ extern List *deserialize_deflist(Datum txt); extern void RenameForeignServer(const char *oldname, const char *newname); extern void RenameForeignDataWrapper(const char *oldname, const char *newname); extern void AlterForeignServerOwner(const char *name, Oid newOwnerId); +extern void AlterForeignServerOwner_oid(Oid , Oid newOwnerId); extern void AlterForeignDataWrapperOwner(const char *name, Oid newOwnerId); +extern void AlterForeignDataWrapperOwner_oid(Oid fwdId, Oid newOwnerId); extern void CreateForeignDataWrapper(CreateFdwStmt *stmt); extern void AlterForeignDataWrapper(AlterFdwStmt *stmt); extern void RemoveForeignDataWrapperById(Oid fdwId); From a445cb92ef5b3a31313ebce30e18cc1d6e0bdecb Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Wed, 22 Feb 2012 23:40:46 +0200 Subject: [PATCH 027/129] Add parameters for controlling locations of server-side SSL files This allows changing the location of the files that were previously hard-coded to server.crt, server.key, root.crt, root.crl. server.crt and server.key continue to be the default settings and are thus required to be present by default if SSL is enabled. But the settings for the server-side CA and CRL are now empty by default, and if they are set, the files are required to be present. This replaces the previous behavior of ignoring the functionality if the files were not found. --- doc/src/sgml/config.sgml | 64 +++++++++++ doc/src/sgml/runtime.sgml | 36 +++--- src/backend/libpq/be-secure.c | 107 +++++++----------- src/backend/libpq/hba.c | 2 +- src/backend/utils/misc/guc.c | 41 +++++++ src/backend/utils/misc/postgresql.conf.sample | 4 + src/include/libpq/libpq.h | 5 + 7 files changed, 179 insertions(+), 80 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 0ea9aebdb02c5..6e1378a9d6dc8 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -668,6 +668,70 @@ SET ENABLE_SEQSCAN TO OFF; + + ssl_ca_file (string) + + ssl_ca_file configuration parameter + + + + Specifies the name of the file containing the SSL server certificate + authority (CA). The default is empty, meaning no CA file is loaded, + and client certificate verification is not performed. (In previous + releases of PostgreSQL, the name of this file was hard-coded + as root.crt.) Relative paths are relative to the + data directory. This parameter can only be set at server start. + + + + + + ssl_cert_file (string) + + ssl_cert_file configuration parameter + + + + Specifies the name of the file containing the SSL server certificate. + The default is server.crt. Relative paths are + relative to the data directory. This parameter can only be set at + server start. + + + + + + ssl_crl_file (string) + + ssl_crl_file configuration parameter + + + + Specifies the name of the file containing the SSL server certificate + revocation list (CRL). The default is empty, meaning no CRL file is + loaded. (In previous releases of PostgreSQL, the name of this file was + hard-coded as root.crl.) Relative paths are + relative to the data directory. This parameter can only be set at + server start. + + + + + + ssl_key_file (string) + + ssl_key_file configuration parameter + + + + Specifies the name of the file containing the SSL server private key. + The default is server.key. Relative paths are + relative to the data directory. 
This parameter can only be set at + server start. + + + + ssl_renegotiation_limit (integer) diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml index 1c3a9c87d8a37..5785450e571ee 100644 --- a/doc/src/sgml/runtime.sgml +++ b/doc/src/sgml/runtime.sgml @@ -1831,10 +1831,8 @@ pg_dumpall -p 5432 | psql -d postgres -p 5433 SSL certificates and make sure that clients check the server's certificate. To do that, the server must be configured to accept only hostssl connections () and have SSL - server.key (key) and - server.crt (certificate) files (). The TCP client must connect using + linkend="auth-pg-hba-conf">) and have SSL key and certificate files + (). The TCP client must connect using sslmode=verify-ca or verify-full and have the appropriate root certificate file installed (). @@ -2053,10 +2051,12 @@ pg_dumpall -p 5432 | psql -d postgres -p 5433 - To start in SSL mode, the files server.crt - and server.key must exist in the server's data directory. - These files should contain the server certificate and private key, - respectively. + To start in SSL mode, files containing the server certificate + and private key must exist. By default, these files are expected to be + named server.crt and server.key, respectively, in + the server's data directory, but other names and locations can be specified + using the configuration parameters + and . On Unix systems, the permissions on server.key must disallow any access to world or group; achieve this by the command chmod 0600 server.key. @@ -2083,7 +2083,9 @@ pg_dumpall -p 5432 | psql -d postgres -p 5433 To require the client to supply a trusted certificate, place certificates of the certificate authorities (CAs) you trust in the file root.crt in the data - directory, and set the clientcert parameter + directory, set the parameter in + postgresql.conf to root.crt, + and set the clientcert parameter to 1 on the appropriate hostssl line(s) in pg_hba.conf. A certificate will then be requested from the client during @@ -2091,7 +2093,7 @@ pg_dumpall -p 5432 | psql -d postgres -p 5433 description of how to set up certificates on the client.) The server will verify that the client's certificate is signed by one of the trusted certificate authorities. Certificate Revocation List (CRL) entries - are also checked if the file root.crl exists. + are also checked if the parameter is set. (See @@ -2103,7 +2105,7 @@ pg_dumpall -p 5432 | psql -d postgres -p 5433 available for all authentication methods, but only for rows specified as hostssl. When clientcert is not specified or is set to 0, the server will still verify presented client - certificates against root.crt if that file exists + certificates against its CA list, if one is configured, — but it will not insist that a client certificate be presented. @@ -2127,7 +2129,8 @@ pg_dumpall -p 5432 | psql -d postgres -p 5433 summarizes the files that are - relevant to the SSL setup on the server. + relevant to the SSL setup on the server. (The shown file names are default + or typical names. The locally configured names could be different.) 
@@ -2144,27 +2147,27 @@ pg_dumpall -p 5432 | psql -d postgres -p 5433 - $PGDATA/server.crt + ($PGDATA/server.crt) server certificate sent to client to indicate server's identity - $PGDATA/server.key + ($PGDATA/server.key) server private key proves server certificate was sent by the owner; does not indicate certificate owner is trustworthy - $PGDATA/root.crt + ($PGDATA/root.crt) trusted certificate authorities checks that client certificate is signed by a trusted certificate authority - $PGDATA/root.crl + ($PGDATA/root.crl) certificates revoked by certificate authorities client certificate must not be on this list @@ -2176,6 +2179,7 @@ pg_dumpall -p 5432 | psql -d postgres -p 5433 The files server.key, server.crt, root.crt, and root.crl + (or their configured alternative names) are only examined during server start; so you must restart the server for changes in them to take effect. diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c index e35df7309532b..f0a38c238a419 100644 --- a/src/backend/libpq/be-secure.c +++ b/src/backend/libpq/be-secure.c @@ -77,10 +77,10 @@ #ifdef USE_SSL -#define ROOT_CERT_FILE "root.crt" -#define ROOT_CRL_FILE "root.crl" -#define SERVER_CERT_FILE "server.crt" -#define SERVER_PRIVATE_KEY_FILE "server.key" +char *ssl_cert_file; +char *ssl_key_file; +char *ssl_ca_file; +char *ssl_crl_file; static DH *load_dh_file(int keylength); static DH *load_dh_buffer(const char *, size_t); @@ -746,17 +746,17 @@ initialize_SSL(void) * Load and verify server's certificate and private key */ if (SSL_CTX_use_certificate_chain_file(SSL_context, - SERVER_CERT_FILE) != 1) + ssl_cert_file) != 1) ereport(FATAL, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("could not load server certificate file \"%s\": %s", - SERVER_CERT_FILE, SSLerrmessage()))); + ssl_cert_file, SSLerrmessage()))); - if (stat(SERVER_PRIVATE_KEY_FILE, &buf) != 0) + if (stat(ssl_key_file, &buf) != 0) ereport(FATAL, (errcode_for_file_access(), errmsg("could not access private key file \"%s\": %m", - SERVER_PRIVATE_KEY_FILE))); + ssl_key_file))); /* * Require no public access to key file. @@ -771,16 +771,16 @@ initialize_SSL(void) ereport(FATAL, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("private key file \"%s\" has group or world access", - SERVER_PRIVATE_KEY_FILE), + ssl_key_file), errdetail("Permissions should be u=rw (0600) or less."))); #endif if (SSL_CTX_use_PrivateKey_file(SSL_context, - SERVER_PRIVATE_KEY_FILE, + ssl_key_file, SSL_FILETYPE_PEM) != 1) ereport(FATAL, (errmsg("could not load private key file \"%s\": %s", - SERVER_PRIVATE_KEY_FILE, SSLerrmessage()))); + ssl_key_file, SSLerrmessage()))); if (SSL_CTX_check_private_key(SSL_context) != 1) ereport(FATAL, @@ -797,48 +797,30 @@ initialize_SSL(void) elog(FATAL, "could not set the cipher list (no valid ciphers available)"); /* - * Attempt to load CA store, so we can verify client certificates if - * needed. + * Load CA store, so we can verify client certificates if needed. */ - ssl_loaded_verify_locations = false; - - if (access(ROOT_CERT_FILE, R_OK) != 0) + if (ssl_ca_file[0]) { - /* - * If root certificate file simply not found, don't log an error here, - * because it's quite likely the user isn't planning on using client - * certificates. If we can't access it for other reasons, it is an - * error. 
- */ - if (errno != ENOENT) + if (SSL_CTX_load_verify_locations(SSL_context, ssl_ca_file, NULL) != 1 || + (root_cert_list = SSL_load_client_CA_file(ssl_ca_file)) == NULL) ereport(FATAL, - (errmsg("could not access root certificate file \"%s\": %m", - ROOT_CERT_FILE))); + (errmsg("could not load root certificate file \"%s\": %s", + ssl_ca_file, SSLerrmessage()))); } - else if (SSL_CTX_load_verify_locations(SSL_context, ROOT_CERT_FILE, NULL) != 1 || - (root_cert_list = SSL_load_client_CA_file(ROOT_CERT_FILE)) == NULL) - { - /* - * File was there, but we could not load it. This means the file is - * somehow broken, and we cannot do verification at all - so fail. - */ - ereport(FATAL, - (errmsg("could not load root certificate file \"%s\": %s", - ROOT_CERT_FILE, SSLerrmessage()))); - } - else + + /*---------- + * Load the Certificate Revocation List (CRL). + * http://searchsecurity.techtarget.com/sDefinition/0,,sid14_gci803160,00.html + *---------- + */ + if (ssl_crl_file[0]) { - /*---------- - * Load the Certificate Revocation List (CRL) if file exists. - * http://searchsecurity.techtarget.com/sDefinition/0,,sid14_gci803160,00.html - *---------- - */ X509_STORE *cvstore = SSL_CTX_get_cert_store(SSL_context); if (cvstore) { /* Set the flags to check against the complete CRL chain */ - if (X509_STORE_load_locations(cvstore, ROOT_CRL_FILE, NULL) == 1) + if (X509_STORE_load_locations(cvstore, ssl_crl_file, NULL) == 1) { /* OpenSSL 0.96 does not support X509_V_FLAG_CRL_CHECK */ #ifdef X509_V_FLAG_CRL_CHECK @@ -847,32 +829,31 @@ initialize_SSL(void) #else ereport(LOG, (errmsg("SSL certificate revocation list file \"%s\" ignored", - ROOT_CRL_FILE), + ssl_crl_file), errdetail("SSL library does not support certificate revocation lists."))); #endif } else - { - /* Not fatal - we do not require CRL */ - ereport(LOG, - (errmsg("SSL certificate revocation list file \"%s\" not found, skipping: %s", - ROOT_CRL_FILE, SSLerrmessage()), - errdetail("Certificates will not be checked against revocation list."))); - } + ereport(FATAL, + (errmsg("could not load SSL certificate revocation list file \"%s\": %s", + ssl_crl_file, SSLerrmessage()))); + } + } - /* - * Always ask for SSL client cert, but don't fail if it's not - * presented. We might fail such connections later, depending on - * what we find in pg_hba.conf. - */ - SSL_CTX_set_verify(SSL_context, - (SSL_VERIFY_PEER | - SSL_VERIFY_CLIENT_ONCE), - verify_cb); + if (ssl_ca_file[0]) + { + /* + * Always ask for SSL client cert, but don't fail if it's not + * presented. We might fail such connections later, depending on + * what we find in pg_hba.conf. 
+ */ + SSL_CTX_set_verify(SSL_context, + (SSL_VERIFY_PEER | + SSL_VERIFY_CLIENT_ONCE), + verify_cb); - /* Set flag to remember CA store is successfully loaded */ - ssl_loaded_verify_locations = true; - } + /* Set flag to remember CA store is successfully loaded */ + ssl_loaded_verify_locations = true; /* * Tell OpenSSL to send the list of root certs we trust to clients in diff --git a/src/backend/libpq/hba.c b/src/backend/libpq/hba.c index 1dadafc7048c1..a83b52ea29065 100644 --- a/src/backend/libpq/hba.c +++ b/src/backend/libpq/hba.c @@ -1417,7 +1417,7 @@ parse_hba_auth_opt(char *name, char *val, HbaLine *hbaline, int line_num) ereport(LOG, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("client certificates can only be checked if a root certificate store is available"), - errhint("Make sure the root.crt file is present and readable."), + errhint("Make sure the configuration parameter \"ssl_ca_file\" is set."), errcontext("line %d of configuration file \"%s\"", line_num, HbaFileName))); return false; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 7df5292f95140..84b330c6d3924 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -39,6 +39,7 @@ #include "funcapi.h" #include "libpq/auth.h" #include "libpq/be-fsstubs.h" +#include "libpq/libpq.h" #include "libpq/pqformat.h" #include "miscadmin.h" #include "optimizer/cost.h" @@ -2960,6 +2961,46 @@ static struct config_string ConfigureNamesString[] = check_canonical_path, NULL, NULL }, + { + {"ssl_cert_file", PGC_POSTMASTER, CONN_AUTH_SECURITY, + gettext_noop("Location of the SSL server certificate file."), + NULL + }, + &ssl_cert_file, + "server.crt", + NULL, NULL, NULL + }, + + { + {"ssl_key_file", PGC_POSTMASTER, CONN_AUTH_SECURITY, + gettext_noop("Location of the SSL server private key file."), + NULL + }, + &ssl_key_file, + "server.key", + NULL, NULL, NULL + }, + + { + {"ssl_ca_file", PGC_POSTMASTER, CONN_AUTH_SECURITY, + gettext_noop("Location of the SSL certificate authority file."), + NULL + }, + &ssl_ca_file, + "", + NULL, NULL, NULL + }, + + { + {"ssl_crl_file", PGC_POSTMASTER, CONN_AUTH_SECURITY, + gettext_noop("Location of the SSL certificate revocation list file."), + NULL + }, + &ssl_crl_file, + "", + NULL, NULL, NULL + }, + { {"stats_temp_directory", PGC_SIGHUP, STATS_COLLECTOR, gettext_noop("Writes temporary statistics files to the specified directory."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 400c52bf9d7bb..96da086b0f4c8 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -81,6 +81,10 @@ #ssl_ciphers = 'ALL:!ADH:!LOW:!EXP:!MD5:@STRENGTH' # allowed SSL ciphers # (change requires restart) #ssl_renegotiation_limit = 512MB # amount of data between renegotiations +#ssl_cert_file = 'server.crt' # (change requires restart) +#ssl_key_file = 'server.key' # (change requires restart) +#ssl_ca_file = '' # (change requires restart) +#ssl_crl_file = '' # (change requires restart) #password_encryption = on #db_user_namespace = off diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h index a4ef7b3e09411..7083cd866b68e 100644 --- a/src/include/libpq/libpq.h +++ b/src/include/libpq/libpq.h @@ -70,6 +70,11 @@ extern void pq_endcopyout(bool errorAbort); /* * prototypes for functions in be-secure.c */ +extern char *ssl_cert_file; +extern char *ssl_key_file; +extern char *ssl_ca_file; +extern char *ssl_crl_file; + extern int secure_initialize(void); 
extern bool secure_loaded_verify_locations(void); extern void secure_destroy(void); From dd2954963b4da9f6e8ea759f19ca5eb0cf79010f Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 22 Feb 2012 18:11:46 -0500 Subject: [PATCH 028/129] Draft release notes for 9.1.3, 9.0.7, 8.4.11, 8.3.18. --- doc/src/sgml/release-8.3.sgml | 258 +++++++++++++++ doc/src/sgml/release-8.4.sgml | 312 ++++++++++++++++++ doc/src/sgml/release-9.0.sgml | 466 +++++++++++++++++++++++++++ doc/src/sgml/release-9.1.sgml | 580 ++++++++++++++++++++++++++++++++++ 4 files changed, 1616 insertions(+) diff --git a/doc/src/sgml/release-8.3.sgml b/doc/src/sgml/release-8.3.sgml index ec1880d20fa3b..e80743f463b19 100644 --- a/doc/src/sgml/release-8.3.sgml +++ b/doc/src/sgml/release-8.3.sgml @@ -1,6 +1,264 @@ + + Release 8.3.18 + + + Release Date + 2012-02-27 + + + + This release contains a variety of fixes from 8.3.17. + For information about new features in the 8.3 major release, see + . + + + + Migration to Version 8.3.18 + + + A dump/restore is not required for those running 8.3.X. + + + + However, if you are upgrading from a version earlier than 8.3.17, + see the release notes for 8.3.17. + + + + + + Changes + + + + + + Fix btree index corruption from insertions concurrent with vacuuming + (Tom Lane) + + + + An index page split caused by an insertion could sometimes cause a + concurrently-running VACUUM to miss removing index entries + that it should remove. After the corresponding table rows are removed, + the dangling index entries would cause errors (such as could not + read block N in file ...) or worse, silently wrong query results + after unrelated rows are re-inserted at the now-free table locations. + This bug has been present since release 8.2, but occurs so infrequently + that it was not diagnosed until now. If you have reason to suspect + that it has happened in your database, reindexing the affected index + will fix things. + + + + + + Allow non-existent values for some settings in ALTER + USER/DATABASE SET (Heikki Linnakangas) + + + + Allow default_text_search_config, + default_tablespace, and temp_tablespaces to be + set to names that are not known. This is because they might be known + in another database where the setting is intended to be used, or for the + tablespace cases because the tablespace might not be created yet. The + same issue was previously recognized for search_path, and + these settings now act like that one. + + + + + + Track the OID counter correctly during WAL replay, even when it wraps + around (Tom Lane) + + + + Previously the OID counter would remain stuck at a high value until the + system exited replay mode. The practical consequences of that are + usually nil, but there are scenarios wherein a standby server that's + been promoted to master might take a long time to advance the OID + counter to a reasonable value once values are needed. + + + + + + Fix regular expression back-references with * attached + (Tom Lane) + + + + Rather than enforcing an exact string match, the code would effectively + accept any string that satisfies the pattern sub-expression referenced + by the back-reference symbol. + + + + A similar problem still afflicts back-references that are embedded in a + larger quantified expression, rather than being the immediate subject + of the quantifier. This will be addressed in a future + PostgreSQL release. 
+ + + + + + Fix recently-introduced memory leak in processing of + inet/cidr values (Heikki Linnakangas) + + + + A patch in the December 2011 releases of PostgreSQL + caused memory leakage in these operations, which could be significant + in scenarios such as building a btree index on such a column. + + + + + + Avoid double close of file handle in syslogger on Windows (MauMau) + + + + Ordinarily this error was invisible, but it would cause an exception + when running on a debug version of Windows. + + + + + + Fix I/O-conversion-related memory leaks in plpgsql + (Andres Freund, Jan Urbanski, Tom Lane) + + + + Certain operations would leak memory until the end of the current + function. + + + + + + Improve pg_dump's handling of inherited table columns + (Tom Lane) + + + + pg_dump mishandled situations where a child column has + a different default expression than its parent column. If the default + is textually identical to the parent's default, but not actually the + same (for instance, because of schema search path differences) it would + not be recognized as different, so that after dump and restore the + child would be allowed to inherit the parent's default. Child columns + that are NOT NULL where their parent is not could also be + restored subtly incorrectly. + + + + + + Fix pg_restore's direct-to-database mode for + INSERT-style table data (Tom Lane) + + + + Direct-to-database restores from archive files made with + + + + + + Fix error in contrib/intarray's int[] & + int[] operator (Guillaume Lelarge) + + + + If the smallest integer the two input arrays have in common is 1, + and there are smaller values in either array, then 1 would be + incorrectly omitted from the result. + + + + + + Fix error detection in contrib/pgcrypto's + encrypt_iv() and decrypt_iv() + (Marko Kreen) + + + + These functions failed to report certain types of invalid-input errors, + and would instead return random garbage values for incorrect input. + + + + + + Fix one-byte buffer overrun in contrib/test_parser + (Paul Guyot) + + + + The code would try to read one more byte than it should, which would + crash in corner cases. + Since contrib/test_parser is only example code, this is + not a security issue in itself, but bad example code is still bad. + + + + + + Use __sync_lock_test_and_set() for spinlocks on ARM, if + available (Martin Pitt) + + + + This function replaces our previous use of the SWPB + instruction, which is deprecated and not available on ARMv6 and later. + Reports suggest that the old code doesn't fail in an obvious way on + recent ARM boards, but simply doesn't interlock concurrent accesses, + leading to bizarre failures in multiprocess operation. + + + + + + Use + + + This prevents assorted scenarios wherein recent versions of gcc will + produce creative results. + + + + + + Allow use of threaded Python on FreeBSD (Chris Rees) + + + + Our configure script previously believed that this combination wouldn't + work; but FreeBSD fixed the problem, so remove that error check. + + + + + + + + Release 8.3.17 diff --git a/doc/src/sgml/release-8.4.sgml b/doc/src/sgml/release-8.4.sgml index 27e1e6c7b1e87..2cddc5ec0c488 100644 --- a/doc/src/sgml/release-8.4.sgml +++ b/doc/src/sgml/release-8.4.sgml @@ -1,6 +1,318 @@ + + Release 8.4.11 + + + Release Date + 2012-02-27 + + + + This release contains a variety of fixes from 8.4.10. + For information about new features in the 8.4 major release, see + . + + + + Migration to Version 8.4.11 + + + A dump/restore is not required for those running 8.4.X. 
+ + + + However, if you are upgrading from a version earlier than 8.4.10, + see the release notes for 8.4.10. + + + + + + Changes + + + + + + Fix btree index corruption from insertions concurrent with vacuuming + (Tom Lane) + + + + An index page split caused by an insertion could sometimes cause a + concurrently-running VACUUM to miss removing index entries + that it should remove. After the corresponding table rows are removed, + the dangling index entries would cause errors (such as could not + read block N in file ...) or worse, silently wrong query results + after unrelated rows are re-inserted at the now-free table locations. + This bug has been present since release 8.2, but occurs so infrequently + that it was not diagnosed until now. If you have reason to suspect + that it has happened in your database, reindexing the affected index + will fix things. + + + + + + Update per-column permissions, not only per-table permissions, when + changing table owner (Tom Lane) + + + + Failure to do this meant that any previously granted column permissions + were still shown as having been granted by the old owner. This meant + that neither the new owner nor a superuser could revoke the + now-untraceable-to-table-owner permissions. + + + + + + Allow non-existent values for some settings in ALTER + USER/DATABASE SET (Heikki Linnakangas) + + + + Allow default_text_search_config, + default_tablespace, and temp_tablespaces to be + set to names that are not known. This is because they might be known + in another database where the setting is intended to be used, or for the + tablespace cases because the tablespace might not be created yet. The + same issue was previously recognized for search_path, and + these settings now act like that one. + + + + + + Avoid crashing when we have problems deleting table files post-commit + (Tom Lane) + + + + Dropping a table should lead to deleting the underlying disk files only + after the transaction commits. In event of failure then (for instance, + because of wrong file permissions) the code is supposed to just emit a + warning message and go on, since it's too late to abort the + transaction. This logic got broken as of release 8.4, causing such + situations to result in a PANIC and an unrestartable database. + + + + + + Track the OID counter correctly during WAL replay, even when it wraps + around (Tom Lane) + + + + Previously the OID counter would remain stuck at a high value until the + system exited replay mode. The practical consequences of that are + usually nil, but there are scenarios wherein a standby server that's + been promoted to master might take a long time to advance the OID + counter to a reasonable value once values are needed. + + + + + + Fix regular expression back-references with * attached + (Tom Lane) + + + + Rather than enforcing an exact string match, the code would effectively + accept any string that satisfies the pattern sub-expression referenced + by the back-reference symbol. + + + + A similar problem still afflicts back-references that are embedded in a + larger quantified expression, rather than being the immediate subject + of the quantifier. This will be addressed in a future + PostgreSQL release. + + + + + + Fix recently-introduced memory leak in processing of + inet/cidr values (Heikki Linnakangas) + + + + A patch in the December 2011 releases of PostgreSQL + caused memory leakage in these operations, which could be significant + in scenarios such as building a btree index on such a column. 
+ + + + + + Fix dangling pointer after CREATE TABLE AS/SELECT + INTO in a SQL-language function (Tom Lane) + + + + In most cases this only led to an assertion failure in assert-enabled + builds, but worse consequences seem possible. + + + + + + Avoid double close of file handle in syslogger on Windows (MauMau) + + + + Ordinarily this error was invisible, but it would cause an exception + when running on a debug version of Windows. + + + + + + Fix I/O-conversion-related memory leaks in plpgsql + (Andres Freund, Jan Urbanski, Tom Lane) + + + + Certain operations would leak memory until the end of the current + function. + + + + + + Improve pg_dump's handling of inherited table columns + (Tom Lane) + + + + pg_dump mishandled situations where a child column has + a different default expression than its parent column. If the default + is textually identical to the parent's default, but not actually the + same (for instance, because of schema search path differences) it would + not be recognized as different, so that after dump and restore the + child would be allowed to inherit the parent's default. Child columns + that are NOT NULL where their parent is not could also be + restored subtly incorrectly. + + + + + + Fix pg_restore's direct-to-database mode for + INSERT-style table data (Tom Lane) + + + + Direct-to-database restores from archive files made with + + + + + + Allow AT option in ecpg + DEALLOCATE statements (Michael Meskes) + + + + The infrastructure to support this has been there for awhile, but + through an oversight there was still an error check rejecting the case. + + + + + + Fix error in contrib/intarray's int[] & + int[] operator (Guillaume Lelarge) + + + + If the smallest integer the two input arrays have in common is 1, + and there are smaller values in either array, then 1 would be + incorrectly omitted from the result. + + + + + + Fix error detection in contrib/pgcrypto's + encrypt_iv() and decrypt_iv() + (Marko Kreen) + + + + These functions failed to report certain types of invalid-input errors, + and would instead return random garbage values for incorrect input. + + + + + + Fix one-byte buffer overrun in contrib/test_parser + (Paul Guyot) + + + + The code would try to read one more byte than it should, which would + crash in corner cases. + Since contrib/test_parser is only example code, this is + not a security issue in itself, but bad example code is still bad. + + + + + + Use __sync_lock_test_and_set() for spinlocks on ARM, if + available (Martin Pitt) + + + + This function replaces our previous use of the SWPB + instruction, which is deprecated and not available on ARMv6 and later. + Reports suggest that the old code doesn't fail in an obvious way on + recent ARM boards, but simply doesn't interlock concurrent accesses, + leading to bizarre failures in multiprocess operation. + + + + + + Use + + + This prevents assorted scenarios wherein recent versions of gcc will + produce creative results. + + + + + + Allow use of threaded Python on FreeBSD (Chris Rees) + + + + Our configure script previously believed that this combination wouldn't + work; but FreeBSD fixed the problem, so remove that error check. + + + + + + + + Release 8.4.10 diff --git a/doc/src/sgml/release-9.0.sgml b/doc/src/sgml/release-9.0.sgml index 4f8ec5116a438..7b29590bb113e 100644 --- a/doc/src/sgml/release-9.0.sgml +++ b/doc/src/sgml/release-9.0.sgml @@ -1,6 +1,472 @@ + + Release 9.0.7 + + + Release Date + 2012-02-27 + + + + This release contains a variety of fixes from 9.0.6. 
+ For information about new features in the 9.0 major release, see + . + + + + Migration to Version 9.0.7 + + + A dump/restore is not required for those running 9.0.X. + + + + However, if you are upgrading from a version earlier than 9.0.6, + see the release notes for 9.0.6. + + + + + + Changes + + + + + + Fix btree index corruption from insertions concurrent with vacuuming + (Tom Lane) + + + + An index page split caused by an insertion could sometimes cause a + concurrently-running VACUUM to miss removing index entries + that it should remove. After the corresponding table rows are removed, + the dangling index entries would cause errors (such as could not + read block N in file ...) or worse, silently wrong query results + after unrelated rows are re-inserted at the now-free table locations. + This bug has been present since release 8.2, but occurs so infrequently + that it was not diagnosed until now. If you have reason to suspect + that it has happened in your database, reindexing the affected index + will fix things. + + + + + + Fix transient zeroing of shared buffers during WAL replay (Tom Lane) + + + + The replay logic would sometimes zero and refill a shared buffer, so + that the contents were transiently invalid. In hot standby mode this + can result in a query that's executing in parallel seeing garbage data. + Various symptoms could result from that, but the most common one seems + to be invalid memory alloc request size. + + + + + + Fix postmaster to attempt restart after a hot-standby crash (Tom Lane) + + + + A logic error caused the postmaster to terminate, rather than attempt + to restart the cluster, if any backend process crashed while operating + in hot standby mode. + + + + + + Fix CLUSTER/VACUUM FULL handling of toast + values owned by recently-updated rows (Tom Lane) + + + + This oversight could lead to duplicate key value violates unique + constraint errors being reported against the toast table's index + during one of these commands. + + + + + + Update per-column permissions, not only per-table permissions, when + changing table owner (Tom Lane) + + + + Failure to do this meant that any previously granted column permissions + were still shown as having been granted by the old owner. This meant + that neither the new owner nor a superuser could revoke the + now-untraceable-to-table-owner permissions. + + + + + + Support foreign data wrappers and foreign servers in + REASSIGN OWNED (Alvaro Herrera) + + + + This command failed with unexpected classid errors if + it needed to change the ownership of any such objects. + + + + + + Allow non-existent values for some settings in ALTER + USER/DATABASE SET (Heikki Linnakangas) + + + + Allow default_text_search_config, + default_tablespace, and temp_tablespaces to be + set to names that are not known. This is because they might be known + in another database where the setting is intended to be used, or for the + tablespace cases because the tablespace might not be created yet. The + same issue was previously recognized for search_path, and + these settings now act like that one. + + + + + + Avoid crashing when we have problems deleting table files post-commit + (Tom Lane) + + + + Dropping a table should lead to deleting the underlying disk files only + after the transaction commits. In event of failure then (for instance, + because of wrong file permissions) the code is supposed to just emit a + warning message and go on, since it's too late to abort the + transaction. 
This logic got broken as of release 8.4, causing such + situations to result in a PANIC and an unrestartable database. + + + + + + Recover from errors occurring during WAL replay of DROP + TABLESPACE (Tom Lane) + + + + Replay will attempt to remove the tablespace's directories, but there + are various reasons why this might fail (for example, incorrect + ownership or permissions on those directories). Formerly the replay + code would panic, rendering the database unrestartable without manual + intervention. It seems better to log the problem and continue, since + the only consequence of failure to remove the directories is some + wasted disk space. + + + + + + Fix race condition in logging AccessExclusiveLocks for hot standby + (Simon Riggs) + + + + Sometimes a lock would be logged as being held by transaction + zero. This is at least known to produce assertion failures on + slave servers, and might be the cause of more serious problems. + + + + + + Track the OID counter correctly during WAL replay, even when it wraps + around (Tom Lane) + + + + Previously the OID counter would remain stuck at a high value until the + system exited replay mode. The practical consequences of that are + usually nil, but there are scenarios wherein a standby server that's + been promoted to master might take a long time to advance the OID + counter to a reasonable value once values are needed. + + + + + + Prevent emitting misleading consistent recovery state reached + log message at the beginning of crash recovery (Heikki Linnakangas) + + + + + + Fix initial value of + pg_stat_replication.replay_location + (Fujii Masao) + + + + Previously, the value shown would be wrong until at least one WAL + record had been replayed. + + + + + + Fix regular expression back-references with * attached + (Tom Lane) + + + + Rather than enforcing an exact string match, the code would effectively + accept any string that satisfies the pattern sub-expression referenced + by the back-reference symbol. + + + + A similar problem still afflicts back-references that are embedded in a + larger quantified expression, rather than being the immediate subject + of the quantifier. This will be addressed in a future + PostgreSQL release. + + + + + + Fix recently-introduced memory leak in processing of + inet/cidr values (Heikki Linnakangas) + + + + A patch in the December 2011 releases of PostgreSQL + caused memory leakage in these operations, which could be significant + in scenarios such as building a btree index on such a column. + + + + + + Fix dangling pointer after CREATE TABLE AS/SELECT + INTO in a SQL-language function (Tom Lane) + + + + In most cases this only led to an assertion failure in assert-enabled + builds, but worse consequences seem possible. + + + + + + Avoid double close of file handle in syslogger on Windows (MauMau) + + + + Ordinarily this error was invisible, but it would cause an exception + when running on a debug version of Windows. + + + + + + Fix I/O-conversion-related memory leaks in plpgsql + (Andres Freund, Jan Urbanski, Tom Lane) + + + + Certain operations would leak memory until the end of the current + function. + + + + + + Improve pg_dump's handling of inherited table columns + (Tom Lane) + + + + pg_dump mishandled situations where a child column has + a different default expression than its parent column. 
If the default + is textually identical to the parent's default, but not actually the + same (for instance, because of schema search path differences) it would + not be recognized as different, so that after dump and restore the + child would be allowed to inherit the parent's default. Child columns + that are NOT NULL where their parent is not could also be + restored subtly incorrectly. + + + + + + Fix pg_restore's direct-to-database mode for + INSERT-style table data (Tom Lane) + + + + Direct-to-database restores from archive files made with + + + + + + Allow pg_upgrade to process tables containing + regclass columns (Bruce Momjian) + + + + Since pg_upgrade now takes care to preserve + pg_class OIDs, there was no longer any reason for this + restriction. + + + + + + Make libpq ignore ENOTDIR errors + when looking for an SSL client certificate file + (Magnus Hagander) + + + + This allows SSL connections to be established, though without a + certificate, even when the user's home directory is set to something + like /dev/null. + + + + + + Fix some more field alignment issues in ecpg's SQLDA area + (Zoltan Boszormenyi) + + + + + + Allow AT option in ecpg + DEALLOCATE statements (Michael Meskes) + + + + The infrastructure to support this has been there for awhile, but + through an oversight there was still an error check rejecting the case. + + + + + + Do not use the variable name when defining a varchar structure in ecpg + (Michael Meskes) + + + + + + Fix contrib/auto_explain's JSON output mode to produce + valid JSON (Andrew Dunstan) + + + + The output used brackets at the top level, when it should have used + braces. + + + + + + Fix error in contrib/intarray's int[] & + int[] operator (Guillaume Lelarge) + + + + If the smallest integer the two input arrays have in common is 1, + and there are smaller values in either array, then 1 would be + incorrectly omitted from the result. + + + + + + Fix error detection in contrib/pgcrypto's + encrypt_iv() and decrypt_iv() + (Marko Kreen) + + + + These functions failed to report certain types of invalid-input errors, + and would instead return random garbage values for incorrect input. + + + + + + Fix one-byte buffer overrun in contrib/test_parser + (Paul Guyot) + + + + The code would try to read one more byte than it should, which would + crash in corner cases. + Since contrib/test_parser is only example code, this is + not a security issue in itself, but bad example code is still bad. + + + + + + Use __sync_lock_test_and_set() for spinlocks on ARM, if + available (Martin Pitt) + + + + This function replaces our previous use of the SWPB + instruction, which is deprecated and not available on ARMv6 and later. + Reports suggest that the old code doesn't fail in an obvious way on + recent ARM boards, but simply doesn't interlock concurrent accesses, + leading to bizarre failures in multiprocess operation. + + + + + + Use + + + This prevents assorted scenarios wherein recent versions of gcc will + produce creative results. + + + + + + Allow use of threaded Python on FreeBSD (Chris Rees) + + + + Our configure script previously believed that this combination wouldn't + work; but FreeBSD fixed the problem, so remove that error check. 
+ + + + + + + + Release 9.0.6 diff --git a/doc/src/sgml/release-9.1.sgml b/doc/src/sgml/release-9.1.sgml index 8832a4a5d81e4..46abbec10a654 100644 --- a/doc/src/sgml/release-9.1.sgml +++ b/doc/src/sgml/release-9.1.sgml @@ -1,6 +1,586 @@ + + Release 9.1.3 + + + Release Date + 2012-02-27 + + + + This release contains a variety of fixes from 9.1.2. + For information about new features in the 9.1 major release, see + . + + + + Migration to Version 9.1.3 + + + A dump/restore is not required for those running 9.1.X. + + + + However, if you are upgrading from a version earlier than 9.1.2, + see the release notes for 9.1.2. + + + + + + Changes + + + + + + Fix btree index corruption from insertions concurrent with vacuuming + (Tom Lane) + + + + An index page split caused by an insertion could sometimes cause a + concurrently-running VACUUM to miss removing index entries + that it should remove. After the corresponding table rows are removed, + the dangling index entries would cause errors (such as could not + read block N in file ...) or worse, silently wrong query results + after unrelated rows are re-inserted at the now-free table locations. + This bug has been present since release 8.2, but occurs so infrequently + that it was not diagnosed until now. If you have reason to suspect + that it has happened in your database, reindexing the affected index + will fix things. + + + + + + Fix transient zeroing of shared buffers during WAL replay (Tom Lane) + + + + The replay logic would sometimes zero and refill a shared buffer, so + that the contents were transiently invalid. In hot standby mode this + can result in a query that's executing in parallel seeing garbage data. + Various symptoms could result from that, but the most common one seems + to be invalid memory alloc request size. + + + + + + Fix handling of data-modifying WITH subplans in + READ COMMITTED rechecking (Tom Lane) + + + + A WITH clause containing + INSERT/UPDATE/DELETE would crash + if the parent UPDATE or DELETE command needed + to be re-evaluated at one or more rows due to concurrent updates + in READ COMMITTED mode. + + + + + + Fix corner case in SSI transaction cleanup + (Dan Ports) + + + + When finishing up a read-write serializable transaction, + a crash could occur if all remaining active serializable transactions + are read-only. + + + + + + Fix postmaster to attempt restart after a hot-standby crash (Tom Lane) + + + + A logic error caused the postmaster to terminate, rather than attempt + to restart the cluster, if any backend process crashed while operating + in hot standby mode. + + + + + + Fix CLUSTER/VACUUM FULL handling of toast + values owned by recently-updated rows (Tom Lane) + + + + This oversight could lead to duplicate key value violates unique + constraint errors being reported against the toast table's index + during one of these commands. + + + + + + Update per-column permissions, not only per-table permissions, when + changing table owner (Tom Lane) + + + + Failure to do this meant that any previously granted column permissions + were still shown as having been granted by the old owner. This meant + that neither the new owner nor a superuser could revoke the + now-untraceable-to-table-owner permissions. + + + + + + Support foreign data wrappers and foreign servers in + REASSIGN OWNED (Alvaro Herrera) + + + + This command failed with unexpected classid errors if + it needed to change the ownership of any such objects. 
+ + + + + + Allow non-existent values for some settings in ALTER + USER/DATABASE SET (Heikki Linnakangas) + + + + Allow default_text_search_config, + default_tablespace, and temp_tablespaces to be + set to names that are not known. This is because they might be known + in another database where the setting is intended to be used, or for the + tablespace cases because the tablespace might not be created yet. The + same issue was previously recognized for search_path, and + these settings now act like that one. + + + + + + Fix unsupported node type error caused by COLLATE + in an INSERT expression (Tom Lane) + + + + + + Avoid crashing when we have problems deleting table files post-commit + (Tom Lane) + + + + Dropping a table should lead to deleting the underlying disk files only + after the transaction commits. In event of failure then (for instance, + because of wrong file permissions) the code is supposed to just emit a + warning message and go on, since it's too late to abort the + transaction. This logic got broken as of release 8.4, causing such + situations to result in a PANIC and an unrestartable database. + + + + + + Recover from errors occurring during WAL replay of DROP + TABLESPACE (Tom Lane) + + + + Replay will attempt to remove the tablespace's directories, but there + are various reasons why this might fail (for example, incorrect + ownership or permissions on those directories). Formerly the replay + code would panic, rendering the database unrestartable without manual + intervention. It seems better to log the problem and continue, since + the only consequence of failure to remove the directories is some + wasted disk space. + + + + + + Fix race condition in logging AccessExclusiveLocks for hot standby + (Simon Riggs) + + + + Sometimes a lock would be logged as being held by transaction + zero. This is at least known to produce assertion failures on + slave servers, and might be the cause of more serious problems. + + + + + + Track the OID counter correctly during WAL replay, even when it wraps + around (Tom Lane) + + + + Previously the OID counter would remain stuck at a high value until the + system exited replay mode. The practical consequences of that are + usually nil, but there are scenarios wherein a standby server that's + been promoted to master might take a long time to advance the OID + counter to a reasonable value once values are needed. + + + + + + Prevent emitting misleading consistent recovery state reached + log message at the beginning of crash recovery (Heikki Linnakangas) + + + + + + Fix initial value of + pg_stat_replication.replay_location + (Fujii Masao) + + + + Previously, the value shown would be wrong until at least one WAL + record had been replayed. + + + + + + Fix regular expression back-references with * attached + (Tom Lane) + + + + Rather than enforcing an exact string match, the code would effectively + accept any string that satisfies the pattern sub-expression referenced + by the back-reference symbol. + + + + A similar problem still afflicts back-references that are embedded in a + larger quantified expression, rather than being the immediate subject + of the quantifier. This will be addressed in a future + PostgreSQL release. + + + + + + Fix recently-introduced memory leak in processing of + inet/cidr values (Heikki Linnakangas) + + + + A patch in the December 2011 releases of PostgreSQL + caused memory leakage in these operations, which could be significant + in scenarios such as building a btree index on such a column. 
+ + + + + + Fix planner's ability to push down index-expression restrictions + through UNION ALL (Tom Lane) + + + + This type of optimization was inadvertently disabled by a fix for + another problem in 9.1.2. + + + + + + Fix planning of WITH clauses referenced in + UPDATE/DELETE on an inherited table + (Tom Lane) + + + + This bug led to could not find plan for CTE failures. + + + + + + Fix GIN cost estimation to handle column IN (...) + index conditions (Marti Raudsepp) + + + + This oversight would usually lead to crashes if such a condition could + be used with a GIN index. + + + + + + Prevent assertion failure when exiting a session with an open, failed + transaction (Tom Lane) + + + + This bug has no impact on normal builds with asserts not enabled. + + + + + + Fix dangling pointer after CREATE TABLE AS/SELECT + INTO in a SQL-language function (Tom Lane) + + + + In most cases this only led to an assertion failure in assert-enabled + builds, but worse consequences seem possible. + + + + + + Avoid double close of file handle in syslogger on Windows (MauMau) + + + + Ordinarily this error was invisible, but it would cause an exception + when running on a debug version of Windows. + + + + + + Fix I/O-conversion-related memory leaks in plpgsql + (Andres Freund, Jan Urbanski, Tom Lane) + + + + Certain operations would leak memory until the end of the current + function. + + + + + + Work around bug in perl's SvPVutf8() function (Andrew Dunstan) + + + + This function crashes when handed a typeglob or certain read-only + objects such as $^V. Make plperl avoid passing those to + it. + + + + + + In pg_dump, don't dump contents of an extension's + configuration tables if the extension itself is not being dumped + (Tom Lane) + + + + + + Improve pg_dump's handling of inherited table columns + (Tom Lane) + + + + pg_dump mishandled situations where a child column has + a different default expression than its parent column. If the default + is textually identical to the parent's default, but not actually the + same (for instance, because of schema search path differences) it would + not be recognized as different, so that after dump and restore the + child would be allowed to inherit the parent's default. Child columns + that are NOT NULL where their parent is not could also be + restored subtly incorrectly. + + + + + + Fix pg_restore's direct-to-database mode for + INSERT-style table data (Tom Lane) + + + + Direct-to-database restores from archive files made with + + + + + + Teach pg_upgrade to handle renaming of + plpython's shared library (Bruce Momjian) + + + + Upgrading a pre-9.1 database that included plpython would fail because + of this oversight. + + + + + + Allow pg_upgrade to process tables containing + regclass columns (Bruce Momjian) + + + + Since pg_upgrade now takes care to preserve + pg_class OIDs, there was no longer any reason for this + restriction. + + + + + + Make libpq ignore ENOTDIR errors + when looking for an SSL client certificate file + (Magnus Hagander) + + + + This allows SSL connections to be established, though without a + certificate, even when the user's home directory is set to something + like /dev/null. + + + + + + Fix some more field alignment issues in ecpg's SQLDA area + (Zoltan Boszormenyi) + + + + + + Allow AT option in ecpg + DEALLOCATE statements (Michael Meskes) + + + + The infrastructure to support this has been there for awhile, but + through an oversight there was still an error check rejecting the case. 
+ + + + + + Do not use the variable name when defining a varchar structure in ecpg + (Michael Meskes) + + + + + + Fix contrib/auto_explain's JSON output mode to produce + valid JSON (Andrew Dunstan) + + + + The output used brackets at the top level, when it should have used + braces. + + + + + + Fix error in contrib/intarray's int[] & + int[] operator (Guillaume Lelarge) + + + + If the smallest integer the two input arrays have in common is 1, + and there are smaller values in either array, then 1 would be + incorrectly omitted from the result. + + + + + + Fix error detection in contrib/pgcrypto's + encrypt_iv() and decrypt_iv() + (Marko Kreen) + + + + These functions failed to report certain types of invalid-input errors, + and would instead return random garbage values for incorrect input. + + + + + + Fix one-byte buffer overrun in contrib/test_parser + (Paul Guyot) + + + + The code would try to read one more byte than it should, which would + crash in corner cases. + Since contrib/test_parser is only example code, this is + not a security issue in itself, but bad example code is still bad. + + + + + + Use __sync_lock_test_and_set() for spinlocks on ARM, if + available (Martin Pitt) + + + + This function replaces our previous use of the SWPB + instruction, which is deprecated and not available on ARMv6 and later. + Reports suggest that the old code doesn't fail in an obvious way on + recent ARM boards, but simply doesn't interlock concurrent accesses, + leading to bizarre failures in multiprocess operation. + + + + + + Use + + + This prevents assorted scenarios wherein recent versions of gcc will + produce creative results. + + + + + + Allow use of threaded Python on FreeBSD (Chris Rees) + + + + Our configure script previously believed that this combination wouldn't + work; but FreeBSD fixed the problem, so remove that error check. + + + + + + + + Release 9.1.2 From 4e6092a2277e50f15d24292ac98ebca8133c881b Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Wed, 22 Feb 2012 18:40:19 -0500 Subject: [PATCH 029/129] Improve wording of pg_upgrade logfile option: -l, --logfile=FILENAME log internal activity to file\n\ --- contrib/pg_upgrade/option.c | 2 +- doc/src/sgml/pgupgrade.sgml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/pg_upgrade/option.c b/contrib/pg_upgrade/option.c index 85d33d2736fb0..0a105efec713f 100644 --- a/contrib/pg_upgrade/option.c +++ b/contrib/pg_upgrade/option.c @@ -259,7 +259,7 @@ Options:\n\ -g, --debug enable debugging\n\ -G, --debugfile=FILENAME output debugging activity to file\n\ -k, --link link instead of copying files to new cluster\n\ - -l, --logfile=FILENAME log session activity to file\n\ + -l, --logfile=FILENAME log internal activity to file\n\ -o, --old-options=OPTIONS old cluster options to pass to the server\n\ -O, --new-options=OPTIONS new cluster options to pass to the server\n\ -p, --old-port=OLDPORT old cluster port number (default %d)\n\ diff --git a/doc/src/sgml/pgupgrade.sgml b/doc/src/sgml/pgupgrade.sgml index ac3f99bd7da79..1373069243c93 100644 --- a/doc/src/sgml/pgupgrade.sgml +++ b/doc/src/sgml/pgupgrade.sgml @@ -111,7 +111,7 @@ log_filename log_filename - log session activity to file + log internal activity to file From f74f9a277c37b42c570ce01019f815abbec58ba0 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Wed, 22 Feb 2012 19:44:52 -0500 Subject: [PATCH 030/129] Fix typo in comment. 
Sandro Santilli --- src/backend/commands/extension.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c index 6ecbbc7fe32e0..a9963ac93b939 100644 --- a/src/backend/commands/extension.c +++ b/src/backend/commands/extension.c @@ -876,7 +876,7 @@ execute_extension_script(Oid extensionOid, ExtensionControlFile *control, /* * If it's not relocatable, substitute the target schema name for - * occcurrences of @extschema@. + * occurrences of @extschema@. * * For a relocatable extension, we needn't do this. There cannot be * any need for @extschema@, else it wouldn't be relocatable. From 2254367435fcc4a31cc3b6d8324e33c5c30f265a Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Wed, 22 Feb 2012 20:33:05 -0500 Subject: [PATCH 031/129] Make EXPLAIN (BUFFERS) track blocks dirtied, as well as those written. Also expose the new counters through pg_stat_statements. Patch by me. Review by Fujii Masao and Greg Smith. --- contrib/pg_stat_statements/Makefile | 3 +- .../pg_stat_statements--1.0--1.1.sql | 40 ++++++++++++++++++ .../pg_stat_statements--1.1.sql | 41 +++++++++++++++++++ .../pg_stat_statements/pg_stat_statements.c | 21 +++++++++- .../pg_stat_statements.control | 2 +- doc/src/sgml/pgstatstatements.sgml | 14 +++++++ doc/src/sgml/ref/explain.sgml | 12 ++++-- src/backend/commands/explain.c | 16 ++++++-- src/backend/executor/instrument.c | 2 + src/backend/storage/buffer/bufmgr.c | 1 + src/backend/storage/buffer/localbuf.c | 4 ++ src/include/executor/instrument.h | 2 + 12 files changed, 148 insertions(+), 10 deletions(-) create mode 100644 contrib/pg_stat_statements/pg_stat_statements--1.0--1.1.sql create mode 100644 contrib/pg_stat_statements/pg_stat_statements--1.1.sql diff --git a/contrib/pg_stat_statements/Makefile b/contrib/pg_stat_statements/Makefile index e086fd8a827dd..e8aed61216402 100644 --- a/contrib/pg_stat_statements/Makefile +++ b/contrib/pg_stat_statements/Makefile @@ -4,7 +4,8 @@ MODULE_big = pg_stat_statements OBJS = pg_stat_statements.o EXTENSION = pg_stat_statements -DATA = pg_stat_statements--1.0.sql pg_stat_statements--unpackaged--1.0.sql +DATA = pg_stat_statements--1.1.sql pg_stat_statements--1.0--1.1.sql \ + pg_stat_statements--unpackaged--1.0.sql ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.0--1.1.sql b/contrib/pg_stat_statements/pg_stat_statements--1.0--1.1.sql new file mode 100644 index 0000000000000..223271d0ce591 --- /dev/null +++ b/contrib/pg_stat_statements/pg_stat_statements--1.0--1.1.sql @@ -0,0 +1,40 @@ +/* contrib/pg_stat_statements/pg_stat_statements--1.0--1.1.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pg_stat_statements UPDATE" to load this file. 
\quit + +/* First we have to remove them from the extension */ +ALTER EXTENSION pg_stat_statements DROP VIEW pg_stat_statements; +ALTER EXTENSION pg_stat_statements DROP FUNCTION pg_stat_statements(); + +/* Then we can drop them */ +DROP VIEW pg_stat_statements; +DROP FUNCTION pg_stat_statements(); + +/* Now redefine */ +CREATE FUNCTION pg_stat_statements( + OUT userid oid, + OUT dbid oid, + OUT query text, + OUT calls int8, + OUT total_time float8, + OUT rows int8, + OUT shared_blks_hit int8, + OUT shared_blks_read int8, + OUT shared_blks_dirtied int8, + OUT shared_blks_written int8, + OUT local_blks_hit int8, + OUT local_blks_read int8, + OUT local_blks_dirtied int8, + OUT local_blks_written int8, + OUT temp_blks_read int8, + OUT temp_blks_written int8 +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE VIEW pg_stat_statements AS + SELECT * FROM pg_stat_statements(); + +GRANT SELECT ON pg_stat_statements TO PUBLIC; diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.1.sql b/contrib/pg_stat_statements/pg_stat_statements--1.1.sql new file mode 100644 index 0000000000000..1233736c9c439 --- /dev/null +++ b/contrib/pg_stat_statements/pg_stat_statements--1.1.sql @@ -0,0 +1,41 @@ +/* contrib/pg_stat_statements/pg_stat_statements--1.1.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pg_stat_statements" to load this file. \quit + +-- Register functions. +CREATE FUNCTION pg_stat_statements_reset() +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pg_stat_statements( + OUT userid oid, + OUT dbid oid, + OUT query text, + OUT calls int8, + OUT total_time float8, + OUT rows int8, + OUT shared_blks_hit int8, + OUT shared_blks_read int8, + OUT shared_blks_dirtied int8, + OUT shared_blks_written int8, + OUT local_blks_hit int8, + OUT local_blks_read int8, + OUT local_blks_dirtied int8, + OUT local_blks_written int8, + OUT temp_blks_read int8, + OUT temp_blks_written int8 +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C; + +-- Register a view on the function for ease of use. +CREATE VIEW pg_stat_statements AS + SELECT * FROM pg_stat_statements(); + +GRANT SELECT ON pg_stat_statements TO PUBLIC; + +-- Don't want this to be available to non-superusers. 
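+-- (With EXECUTE revoked from PUBLIC, only superusers, and roles that are
+-- later granted EXECUTE explicitly, can call the reset function.)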
+REVOKE ALL ON FUNCTION pg_stat_statements_reset() FROM PUBLIC; diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 434aa71a33cd6..914fbf270d7db 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -77,9 +77,11 @@ typedef struct Counters int64 rows; /* total # of retrieved or affected rows */ int64 shared_blks_hit; /* # of shared buffer hits */ int64 shared_blks_read; /* # of shared disk blocks read */ + int64 shared_blks_dirtied; /* # of shared disk blocks dirtied */ int64 shared_blks_written; /* # of shared disk blocks written */ int64 local_blks_hit; /* # of local buffer hits */ int64 local_blks_read; /* # of local disk blocks read */ + int64 local_blks_dirtied; /* # of local disk blocks dirtied */ int64 local_blks_written; /* # of local disk blocks written */ int64 temp_blks_read; /* # of temp blocks read */ int64 temp_blks_written; /* # of temp blocks written */ @@ -652,12 +654,16 @@ pgss_ProcessUtility(Node *parsetree, const char *queryString, pgBufferUsage.shared_blks_hit - bufusage.shared_blks_hit; bufusage.shared_blks_read = pgBufferUsage.shared_blks_read - bufusage.shared_blks_read; + bufusage.shared_blks_dirtied = + pgBufferUsage.shared_blks_dirtied - bufusage.shared_blks_dirtied; bufusage.shared_blks_written = pgBufferUsage.shared_blks_written - bufusage.shared_blks_written; bufusage.local_blks_hit = pgBufferUsage.local_blks_hit - bufusage.local_blks_hit; bufusage.local_blks_read = pgBufferUsage.local_blks_read - bufusage.local_blks_read; + bufusage.local_blks_dirtied = + pgBufferUsage.local_blks_dirtied - bufusage.local_blks_dirtied; bufusage.local_blks_written = pgBufferUsage.local_blks_written - bufusage.local_blks_written; bufusage.temp_blks_read = @@ -766,9 +772,11 @@ pgss_store(const char *query, double total_time, uint64 rows, e->counters.rows += rows; e->counters.shared_blks_hit += bufusage->shared_blks_hit; e->counters.shared_blks_read += bufusage->shared_blks_read; + e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied; e->counters.shared_blks_written += bufusage->shared_blks_written; e->counters.local_blks_hit += bufusage->local_blks_hit; e->counters.local_blks_read += bufusage->local_blks_read; + e->counters.local_blks_dirtied += bufusage->local_blks_dirtied; e->counters.local_blks_written += bufusage->local_blks_written; e->counters.temp_blks_read += bufusage->temp_blks_read; e->counters.temp_blks_written += bufusage->temp_blks_written; @@ -793,7 +801,8 @@ pg_stat_statements_reset(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } -#define PG_STAT_STATEMENTS_COLS 14 +#define PG_STAT_STATEMENTS_COLS_V1_0 14 +#define PG_STAT_STATEMENTS_COLS 16 /* * Retrieve statement statistics. 
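 * Both result formats are supported: the 14-column layout of the 1.0
 * extension script and the 16-column layout of 1.1; the caller's tuple
 * descriptor tells us which one to emit.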
@@ -810,6 +819,7 @@ pg_stat_statements(PG_FUNCTION_ARGS) bool is_superuser = superuser(); HASH_SEQ_STATUS hash_seq; pgssEntry *entry; + bool sql_supports_dirty_counters = true; if (!pgss || !pgss_hash) ereport(ERROR, @@ -830,6 +840,8 @@ pg_stat_statements(PG_FUNCTION_ARGS) /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); + if (tupdesc->natts == PG_STAT_STATEMENTS_COLS_V1_0) + sql_supports_dirty_counters = false; per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; oldcontext = MemoryContextSwitchTo(per_query_ctx); @@ -887,14 +899,19 @@ pg_stat_statements(PG_FUNCTION_ARGS) values[i++] = Int64GetDatumFast(tmp.rows); values[i++] = Int64GetDatumFast(tmp.shared_blks_hit); values[i++] = Int64GetDatumFast(tmp.shared_blks_read); + if (sql_supports_dirty_counters) + values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied); values[i++] = Int64GetDatumFast(tmp.shared_blks_written); values[i++] = Int64GetDatumFast(tmp.local_blks_hit); values[i++] = Int64GetDatumFast(tmp.local_blks_read); + if (sql_supports_dirty_counters) + values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied); values[i++] = Int64GetDatumFast(tmp.local_blks_written); values[i++] = Int64GetDatumFast(tmp.temp_blks_read); values[i++] = Int64GetDatumFast(tmp.temp_blks_written); - Assert(i == PG_STAT_STATEMENTS_COLS); + Assert(i == sql_supports_dirty_counters ? \ + PG_STAT_STATEMENTS_COLS : PG_STAT_STATEMENTS_COLS_V1_0); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } diff --git a/contrib/pg_stat_statements/pg_stat_statements.control b/contrib/pg_stat_statements/pg_stat_statements.control index 6f9a9471228df..428fbb23749ad 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.control +++ b/contrib/pg_stat_statements/pg_stat_statements.control @@ -1,5 +1,5 @@ # pg_stat_statements extension comment = 'track execution statistics of all SQL statements executed' -default_version = '1.0' +default_version = '1.1' module_pathname = '$libdir/pg_stat_statements' relocatable = true diff --git a/doc/src/sgml/pgstatstatements.sgml b/doc/src/sgml/pgstatstatements.sgml index 5a0230c428618..ab34ca193a4ed 100644 --- a/doc/src/sgml/pgstatstatements.sgml +++ b/doc/src/sgml/pgstatstatements.sgml @@ -99,6 +99,13 @@ Total number of shared blocks reads by the statement + + shared_blks_dirtied + bigint + + Total number of shared blocks dirtied by the statement + + shared_blks_written bigint @@ -120,6 +127,13 @@ Total number of local blocks reads by the statement + + local_blks_dirtied + bigint + + Total number of local blocks dirtied by the statement + + local_blks_written bigint diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml index 419b72cad348f..1f35a1d155279 100644 --- a/doc/src/sgml/ref/explain.sgml +++ b/doc/src/sgml/ref/explain.sgml @@ -155,14 +155,20 @@ ROLLBACK; Include information on buffer usage. Specifically, include the number of - shared blocks hits, reads, and writes, the number of local blocks hits, - reads, and writes, and the number of temp blocks reads and writes. - A hit means that a read was avoided because the block was + shared blocks hit, read, dirtied, and written, the number of local blocks + hit, read, dirtied, and written, and the number of temp blocks read and + written. + A hit means that a read was avoided because the block was found already in cache when needed. 
Shared blocks contain data from regular tables and indexes; local blocks contain data from temporary tables and indexes; while temp blocks contain short-term working data used in sorts, hashes, Materialize plan nodes, and similar cases. + The number of blocks dirtied indicates the number of + previously unmodified blocks that were changed by this query; while the + number of blocks written indicates the number of + previously-dirtied blocks evicted from cache by this backend during + query processing. The number of blocks shown for an upper-level node includes those used by all its child nodes. In text format, only non-zero values are printed. This parameter may only be diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index a1692f82ae85a..93b1f34ca0c62 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1183,12 +1183,14 @@ ExplainNode(PlanState *planstate, List *ancestors, { bool has_shared = (usage->shared_blks_hit > 0 || usage->shared_blks_read > 0 || - usage->shared_blks_written); + usage->shared_blks_dirtied > 0 || + usage->shared_blks_written > 0); bool has_local = (usage->local_blks_hit > 0 || usage->local_blks_read > 0 || - usage->local_blks_written); + usage->local_blks_dirtied > 0 || + usage->local_blks_written > 0); bool has_temp = (usage->temp_blks_read > 0 || - usage->temp_blks_written); + usage->temp_blks_written > 0); /* Show only positive counter values. */ if (has_shared || has_local || has_temp) @@ -1205,6 +1207,9 @@ ExplainNode(PlanState *planstate, List *ancestors, if (usage->shared_blks_read > 0) appendStringInfo(es->str, " read=%ld", usage->shared_blks_read); + if (usage->shared_blks_dirtied > 0) + appendStringInfo(es->str, " dirtied=%ld", + usage->shared_blks_dirtied); if (usage->shared_blks_written > 0) appendStringInfo(es->str, " written=%ld", usage->shared_blks_written); @@ -1220,6 +1225,9 @@ ExplainNode(PlanState *planstate, List *ancestors, if (usage->local_blks_read > 0) appendStringInfo(es->str, " read=%ld", usage->local_blks_read); + if (usage->local_blks_dirtied > 0) + appendStringInfo(es->str, " dirtied=%ld", + usage->local_blks_dirtied); if (usage->local_blks_written > 0) appendStringInfo(es->str, " written=%ld", usage->local_blks_written); @@ -1243,9 +1251,11 @@ ExplainNode(PlanState *planstate, List *ancestors, { ExplainPropertyLong("Shared Hit Blocks", usage->shared_blks_hit, es); ExplainPropertyLong("Shared Read Blocks", usage->shared_blks_read, es); + ExplainPropertyLong("Shared Dirtied Blocks", usage->shared_blks_dirtied, es); ExplainPropertyLong("Shared Written Blocks", usage->shared_blks_written, es); ExplainPropertyLong("Local Hit Blocks", usage->local_blks_hit, es); ExplainPropertyLong("Local Read Blocks", usage->local_blks_read, es); + ExplainPropertyLong("Local Dirtied Blocks", usage->local_blks_dirtied, es); ExplainPropertyLong("Local Written Blocks", usage->local_blks_written, es); ExplainPropertyLong("Temp Read Blocks", usage->temp_blks_read, es); ExplainPropertyLong("Temp Written Blocks", usage->temp_blks_written, es); diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 2c749b13cd8b4..6e9f450d68865 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -137,9 +137,11 @@ BufferUsageAccumDiff(BufferUsage *dst, { dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit; dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read; + dst->shared_blks_dirtied += add->shared_blks_dirtied - 
sub->shared_blks_dirtied; dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written; dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit; dst->local_blks_read += add->local_blks_read - sub->local_blks_read; + dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied; dst->local_blks_written += add->local_blks_written - sub->local_blks_written; dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 1adb6d360dd5e..3924a51c0c6d3 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -988,6 +988,7 @@ MarkBufferDirty(Buffer buffer) if (dirtied) { VacuumPageDirty++; + pgBufferUsage.shared_blks_dirtied++; if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; if (ProcGlobal->bgwriterLatch) diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 096d36a233bd4..63c14f7300cc4 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -276,6 +276,10 @@ MarkLocalBufferDirty(Buffer buffer) Assert(LocalRefCount[bufid] > 0); bufHdr = &LocalBufferDescriptors[bufid]; + + if (!(bufHdr->flags & BM_DIRTY)) + pgBufferUsage.local_blks_dirtied++; + bufHdr->flags |= BM_DIRTY; } diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 084302e4e7efe..066f684f330ca 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -20,9 +20,11 @@ typedef struct BufferUsage { long shared_blks_hit; /* # of shared buffer hits */ long shared_blks_read; /* # of shared disk blocks read */ + long shared_blks_dirtied; /* # of shared blocks dirtied */ long shared_blks_written; /* # of shared disk blocks written */ long local_blks_hit; /* # of local buffer hits */ long local_blks_read; /* # of local disk blocks read */ + long local_blks_dirtied; /* # of shared blocks dirtied */ long local_blks_written; /* # of local disk blocks written */ long temp_blks_read; /* # of temp blocks read */ long temp_blks_written; /* # of temp blocks written */ From d4fb2f99ec86edc5c7ad11a9c7adc0d977cbb4d7 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Wed, 22 Feb 2012 20:37:13 -0500 Subject: [PATCH 032/129] Don't install hstore--1.0.sql any more. Since the current version is 1.1, the 1.0 file isn't really needed. We do need the 1.0--1.1 upgrade file, so people on 1.0 can upgrade. Per recent discussion on pgsql-hackers. --- contrib/hstore/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/contrib/hstore/Makefile b/contrib/hstore/Makefile index e9e5e53142d82..1236e7958f401 100644 --- a/contrib/hstore/Makefile +++ b/contrib/hstore/Makefile @@ -5,8 +5,7 @@ OBJS = hstore_io.o hstore_op.o hstore_gist.o hstore_gin.o hstore_compat.o \ crc32.o EXTENSION = hstore -DATA = hstore--1.0.sql hstore--1.1.sql hstore--1.0--1.1.sql \ - hstore--unpackaged--1.0.sql +DATA = hstore--1.1.sql hstore--1.0--1.1.sql hstore--unpackaged--1.0.sql REGRESS = hstore From 8251670cb30c4e9c76a9cb8382a88109502ba583 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 23 Feb 2012 10:20:25 +0200 Subject: [PATCH 033/129] Fix build without OpenSSL This is a fixup for commit a445cb92ef5b3a31313ebce30e18cc1d6e0bdecb. 
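In outline, the breakage was that the ssl_* configuration variables were
declared inside the #ifdef USE_SSL block, so a build compiled without
OpenSSL had no definitions for symbols that the rest of the backend still
references. A minimal sketch of the corrected layout, using only names
visible in the diff below (all other detail elided):

#ifdef USE_SSL
/* OpenSSL-only helpers remain under the guard */
static DH *load_dh_file(int keylength);
#endif

/* Settings consulted in every build are defined unconditionally,
 * after the #endif */
char	   *ssl_cert_file;
char	   *ssl_key_file;
char	   *ssl_ca_file;
char	   *ssl_crl_file;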
--- src/backend/libpq/be-secure.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c index f0a38c238a419..6b1365c6d3efa 100644 --- a/src/backend/libpq/be-secure.c +++ b/src/backend/libpq/be-secure.c @@ -77,11 +77,6 @@ #ifdef USE_SSL -char *ssl_cert_file; -char *ssl_key_file; -char *ssl_ca_file; -char *ssl_crl_file; - static DH *load_dh_file(int keylength); static DH *load_dh_buffer(const char *, size_t); static DH *tmp_dh_cb(SSL *s, int is_export, int keylength); @@ -93,6 +88,11 @@ static void close_SSL(Port *); static const char *SSLerrmessage(void); #endif +char *ssl_cert_file; +char *ssl_key_file; +char *ssl_ca_file; +char *ssl_crl_file; + /* * How much data can be sent across a secure connection * (total in both directions) before we require renegotiation. From c9d700444007046d799cdfea2038194e56bec1f7 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 23 Feb 2012 12:51:33 +0200 Subject: [PATCH 034/129] Remove inappropriate quotes And adjust wording for consistency. --- src/backend/commands/foreigncmds.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/foreigncmds.c b/src/backend/commands/foreigncmds.c index 990875de76bb4..5d18bdcf0a972 100644 --- a/src/backend/commands/foreigncmds.c +++ b/src/backend/commands/foreigncmds.c @@ -364,7 +364,7 @@ AlterForeignDataWrapperOwner_oid(Oid fwdId, Oid newOwnerId) if (!HeapTupleIsValid(tup)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("foreign-data wrapper with OID \"%u\" does not exist", fwdId))); + errmsg("foreign-data wrapper with OID %u does not exist", fwdId))); AlterForeignDataWrapperOwner_internal(rel, tup, newOwnerId); @@ -463,7 +463,7 @@ AlterForeignServerOwner_oid(Oid srvId, Oid newOwnerId) if (!HeapTupleIsValid(tup)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("server with OID \"%u\" does not exist", srvId))); + errmsg("foreign server with OID %u does not exist", srvId))); AlterForeignServerOwner_internal(rel, tup, newOwnerId); From 74e29162a4f0ec0ad1c7224b8be936d1f9a51f7e Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 23 Feb 2012 15:05:08 -0500 Subject: [PATCH 035/129] Allow MinGW builds to use standardly-named OpenSSL libraries. In the Fedora variant of MinGW, the openssl libraries have their normal names, not libeay32 and libssleay32. Adjust configure probes to allow that, per bug #6486. Tomasz Ostrowski --- configure | 108 +++++++++++++++++++++------------- configure.in | 4 +- src/include/pg_config.h.in | 6 -- src/include/pg_config.h.win32 | 6 -- 4 files changed, 68 insertions(+), 56 deletions(-) diff --git a/configure b/configure index 4a943641580b7..84d18a263d8fa 100755 --- a/configure +++ b/configure @@ -9192,14 +9192,12 @@ $as_echo "$as_me: error: library 'ssl' is required for OpenSSL" >&2;} fi else - -{ $as_echo "$as_me:$LINENO: checking for CRYPTO_new_ex_data in -leay32" >&5 -$as_echo_n "checking for CRYPTO_new_ex_data in -leay32... " >&6; } -if test "${ac_cv_lib_eay32_CRYPTO_new_ex_data+set}" = set; then + { $as_echo "$as_me:$LINENO: checking for library containing CRYPTO_new_ex_data" >&5 +$as_echo_n "checking for library containing CRYPTO_new_ex_data... " >&6; } +if test "${ac_cv_search_CRYPTO_new_ex_data+set}" = set; then $as_echo_n "(cached) " >&6 else - ac_check_lib_save_LIBS=$LIBS -LIBS="-leay32 $LIBS" + ac_func_search_save_LIBS=$LIBS cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. 
*/ _ACEOF @@ -9222,7 +9220,14 @@ return CRYPTO_new_ex_data (); return 0; } _ACEOF -rm -f conftest.$ac_objext conftest$ac_exeext +for ac_lib in '' eay32 crypto; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib $ac_func_search_save_LIBS" + fi + rm -f conftest.$ac_objext conftest$ac_exeext if { (ac_try="$ac_link" case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; @@ -9243,42 +9248,47 @@ $as_echo "$ac_try_echo") >&5 test "$cross_compiling" = yes || $as_test_x conftest$ac_exeext }; then - ac_cv_lib_eay32_CRYPTO_new_ex_data=yes + ac_cv_search_CRYPTO_new_ex_data=$ac_res else $as_echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 - ac_cv_lib_eay32_CRYPTO_new_ex_data=no + fi rm -rf conftest.dSYM rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ - conftest$ac_exeext conftest.$ac_ext -LIBS=$ac_check_lib_save_LIBS + conftest$ac_exeext + if test "${ac_cv_search_CRYPTO_new_ex_data+set}" = set; then + break fi -{ $as_echo "$as_me:$LINENO: result: $ac_cv_lib_eay32_CRYPTO_new_ex_data" >&5 -$as_echo "$ac_cv_lib_eay32_CRYPTO_new_ex_data" >&6; } -if test "x$ac_cv_lib_eay32_CRYPTO_new_ex_data" = x""yes; then - cat >>confdefs.h <<_ACEOF -#define HAVE_LIBEAY32 1 -_ACEOF - - LIBS="-leay32 $LIBS" +done +if test "${ac_cv_search_CRYPTO_new_ex_data+set}" = set; then + : +else + ac_cv_search_CRYPTO_new_ex_data=no +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +{ $as_echo "$as_me:$LINENO: result: $ac_cv_search_CRYPTO_new_ex_data" >&5 +$as_echo "$ac_cv_search_CRYPTO_new_ex_data" >&6; } +ac_res=$ac_cv_search_CRYPTO_new_ex_data +if test "$ac_res" != no; then + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" else - { { $as_echo "$as_me:$LINENO: error: library 'eay32' is required for OpenSSL" >&5 -$as_echo "$as_me: error: library 'eay32' is required for OpenSSL" >&2;} + { { $as_echo "$as_me:$LINENO: error: library 'eay32' or 'crypto' is required for OpenSSL" >&5 +$as_echo "$as_me: error: library 'eay32' or 'crypto' is required for OpenSSL" >&2;} { (exit 1); exit 1; }; } fi - -{ $as_echo "$as_me:$LINENO: checking for SSL_library_init in -lssleay32" >&5 -$as_echo_n "checking for SSL_library_init in -lssleay32... " >&6; } -if test "${ac_cv_lib_ssleay32_SSL_library_init+set}" = set; then + { $as_echo "$as_me:$LINENO: checking for library containing SSL_library_init" >&5 +$as_echo_n "checking for library containing SSL_library_init... " >&6; } +if test "${ac_cv_search_SSL_library_init+set}" = set; then $as_echo_n "(cached) " >&6 else - ac_check_lib_save_LIBS=$LIBS -LIBS="-lssleay32 $LIBS" + ac_func_search_save_LIBS=$LIBS cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. 
*/ _ACEOF @@ -9301,7 +9311,14 @@ return SSL_library_init (); return 0; } _ACEOF -rm -f conftest.$ac_objext conftest$ac_exeext +for ac_lib in '' ssleay32 ssl; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib $ac_func_search_save_LIBS" + fi + rm -f conftest.$ac_objext conftest$ac_exeext if { (ac_try="$ac_link" case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; @@ -9322,31 +9339,38 @@ $as_echo "$ac_try_echo") >&5 test "$cross_compiling" = yes || $as_test_x conftest$ac_exeext }; then - ac_cv_lib_ssleay32_SSL_library_init=yes + ac_cv_search_SSL_library_init=$ac_res else $as_echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 - ac_cv_lib_ssleay32_SSL_library_init=no + fi rm -rf conftest.dSYM rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ - conftest$ac_exeext conftest.$ac_ext -LIBS=$ac_check_lib_save_LIBS + conftest$ac_exeext + if test "${ac_cv_search_SSL_library_init+set}" = set; then + break fi -{ $as_echo "$as_me:$LINENO: result: $ac_cv_lib_ssleay32_SSL_library_init" >&5 -$as_echo "$ac_cv_lib_ssleay32_SSL_library_init" >&6; } -if test "x$ac_cv_lib_ssleay32_SSL_library_init" = x""yes; then - cat >>confdefs.h <<_ACEOF -#define HAVE_LIBSSLEAY32 1 -_ACEOF - - LIBS="-lssleay32 $LIBS" +done +if test "${ac_cv_search_SSL_library_init+set}" = set; then + : +else + ac_cv_search_SSL_library_init=no +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +{ $as_echo "$as_me:$LINENO: result: $ac_cv_search_SSL_library_init" >&5 +$as_echo "$ac_cv_search_SSL_library_init" >&6; } +ac_res=$ac_cv_search_SSL_library_init +if test "$ac_res" != no; then + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" else - { { $as_echo "$as_me:$LINENO: error: library 'ssleay32' is required for OpenSSL" >&5 -$as_echo "$as_me: error: library 'ssleay32' is required for OpenSSL" >&2;} + { { $as_echo "$as_me:$LINENO: error: library 'ssleay32' or 'ssl' is required for OpenSSL" >&5 +$as_echo "$as_me: error: library 'ssleay32' or 'ssl' is required for OpenSSL" >&2;} { (exit 1); exit 1; }; } fi diff --git a/configure.in b/configure.in index da5b11a96f85c..0ed6d01e5941f 100644 --- a/configure.in +++ b/configure.in @@ -948,8 +948,8 @@ if test "$with_openssl" = yes ; then AC_CHECK_LIB(crypto, CRYPTO_new_ex_data, [], [AC_MSG_ERROR([library 'crypto' is required for OpenSSL])]) AC_CHECK_LIB(ssl, SSL_library_init, [], [AC_MSG_ERROR([library 'ssl' is required for OpenSSL])]) else - AC_CHECK_LIB(eay32, CRYPTO_new_ex_data, [], [AC_MSG_ERROR([library 'eay32' is required for OpenSSL])]) - AC_CHECK_LIB(ssleay32, SSL_library_init, [], [AC_MSG_ERROR([library 'ssleay32' is required for OpenSSL])]) + AC_SEARCH_LIBS(CRYPTO_new_ex_data, eay32 crypto, [], [AC_MSG_ERROR([library 'eay32' or 'crypto' is required for OpenSSL])]) + AC_SEARCH_LIBS(SSL_library_init, ssleay32 ssl, [], [AC_MSG_ERROR([library 'ssleay32' or 'ssl' is required for OpenSSL])]) fi fi diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 46c72245032c5..813ee678eaee6 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -299,9 +299,6 @@ /* Define to 1 if you have the `crypto' library (-lcrypto). */ #undef HAVE_LIBCRYPTO -/* Define to 1 if you have the `eay32' library (-leay32). */ -#undef HAVE_LIBEAY32 - /* Define to 1 if you have the `ldap' library (-lldap). */ #undef HAVE_LIBLDAP @@ -323,9 +320,6 @@ /* Define to 1 if you have the `ssl' library (-lssl). 
*/ #undef HAVE_LIBSSL -/* Define to 1 if you have the `ssleay32' library (-lssleay32). */ -#undef HAVE_LIBSSLEAY32 - /* Define to 1 if you have the `wldap32' library (-lwldap32). */ #undef HAVE_LIBWLDAP32 diff --git a/src/include/pg_config.h.win32 b/src/include/pg_config.h.win32 index 35294e9f12490..dd109b5762bf5 100644 --- a/src/include/pg_config.h.win32 +++ b/src/include/pg_config.h.win32 @@ -220,9 +220,6 @@ /* Define to 1 if you have the `crypto' library (-lcrypto). */ /* #undef HAVE_LIBCRYPTO */ -/* Define to 1 if you have the `eay32' library (-leay32). */ -/* #undef HAVE_LIBEAY32 */ - /* Define to 1 if you have the `ldap' library (-lldap). */ /* #undef HAVE_LIBLDAP */ @@ -235,9 +232,6 @@ /* Define to 1 if you have the `ssl' library (-lssl). */ /* #undef HAVE_LIBSSL */ -/* Define to 1 if you have the `ssleay32' library (-lssleay32). */ -/* #undef HAVE_LIBSSLEAY32 */ - /* Define to 1 if you have the `wldap32' library (-lwldap32). */ /* #undef HAVE_LIBWLDAP32 */ From 891e6e7bfd9bb72687522af08c18689f795cb60a Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 23 Feb 2012 15:38:56 -0500 Subject: [PATCH 036/129] Require execute permission on the trigger function for CREATE TRIGGER. This check was overlooked when we added function execute permissions to the system years ago. For an ordinary trigger function it's not a big deal, since trigger functions execute with the permissions of the table owner, so they couldn't do anything the user issuing the CREATE TRIGGER couldn't have done anyway. However, if a trigger function is SECURITY DEFINER, that is not the case. The lack of checking would allow another user to install it on his own table and then invoke it with, essentially, forged input data; which the trigger function is unlikely to realize, so it might do something undesirable, for instance insert false entries in an audit log table. Reported by Dinesh Kumar, patch by Robert Haas Security: CVE-2012-0866 --- doc/src/sgml/ref/create_trigger.sgml | 3 ++- src/backend/commands/trigger.c | 11 +++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/doc/src/sgml/ref/create_trigger.sgml b/doc/src/sgml/ref/create_trigger.sgml index a7915cf09f4bf..aed07a9dfb359 100644 --- a/doc/src/sgml/ref/create_trigger.sgml +++ b/doc/src/sgml/ref/create_trigger.sgml @@ -362,7 +362,8 @@ UPDATE OF column_name1 [, column_name2 To create a trigger on a table, the user must have the - TRIGGER privilege on the table. + TRIGGER privilege on the table. The user must + also have EXECUTE privilege on the trigger function. diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 2838b66e402c5..caae2dafab159 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -108,8 +108,8 @@ static void AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, * if TRUE causes us to modify the given trigger name to ensure uniqueness. * * When isInternal is not true we require ACL_TRIGGER permissions on the - * relation. For internal triggers the caller must apply any required - * permission checks. + * relation, as well as ACL_EXECUTE on the trigger function. For internal + * triggers the caller must apply any required permission checks. * * Note: can return InvalidOid if we decided to not create a trigger at all, * but a foreign-key constraint. This is a kluge for backwards compatibility. @@ -377,6 +377,13 @@ CreateTrigger(CreateTrigStmt *stmt, const char *queryString, * Find and validate the trigger function. 
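 * If the trigger isn't being created internally, we also insist on
 * EXECUTE permission for the function; that check is added just below
 * (see CVE-2012-0866).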
*/ funcoid = LookupFuncName(stmt->funcname, 0, fargtypes, false); + if (!isInternal) + { + aclresult = pg_proc_aclcheck(funcoid, GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, ACL_KIND_PROC, + NameListToString(stmt->funcname)); + } funcrettype = get_func_rettype(funcoid); if (funcrettype != TRIGGEROID) { From 077711c2e3e86384d19d833233bd35e05b921cfc Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 23 Feb 2012 15:48:04 -0500 Subject: [PATCH 037/129] Remove arbitrary limitation on length of common name in SSL certificates. Both libpq and the backend would truncate a common name extracted from a certificate at 32 bytes. Replace that fixed-size buffer with dynamically allocated string so that there is no hard limit. While at it, remove the code for extracting peer_dn, which we weren't using for anything; and don't bother to store peer_cn longer than we need it in libpq. This limit was not so terribly unreasonable when the code was written, because we weren't using the result for anything critical, just logging it. But now that there are options for checking the common name against the server host name (in libpq) or using it as the user's name (in the server), this could result in undesirable failures. In the worst case it even seems possible to spoof a server name or user name, if the correct name is exactly 32 bytes and the attacker can persuade a trusted CA to issue a certificate in which that string is a prefix of the certificate's common name. (To exploit this for a server name, he'd also have to send the connection astray via phony DNS data or some such.) The case that this is a realistic security threat is a bit thin, but nonetheless we'll treat it as one. Back-patch to 8.4. Older releases contain the faulty code, but it's not a security problem because the common name wasn't used for anything interesting. Reported and patched by Heikki Linnakangas Security: CVE-2012-0867 --- src/backend/libpq/be-secure.c | 59 +++++++++++------ src/include/libpq/libpq-be.h | 3 +- src/interfaces/libpq/fe-secure.c | 106 +++++++++++++++++++------------ src/interfaces/libpq/libpq-int.h | 2 - 4 files changed, 105 insertions(+), 65 deletions(-) diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c index 6b1365c6d3efa..dce0eaa20e208 100644 --- a/src/backend/libpq/be-secure.c +++ b/src/backend/libpq/be-secure.c @@ -73,6 +73,7 @@ #include "libpq/libpq.h" #include "tcop/tcopprot.h" +#include "utils/memutils.h" #ifdef USE_SSL @@ -945,44 +946,54 @@ open_server_SSL(Port *port) port->count = 0; - /* get client certificate, if available. */ + /* Get client certificate, if available. */ port->peer = SSL_get_peer_certificate(port->ssl); - if (port->peer == NULL) - { - strlcpy(port->peer_dn, "(anonymous)", sizeof(port->peer_dn)); - strlcpy(port->peer_cn, "(anonymous)", sizeof(port->peer_cn)); - } - else + + /* and extract the Common Name from it. 
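+	 * The copy is allocated in TopMemoryContext so that it survives for
+	 * the life of the connection; close_SSL() frees it.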
*/ + port->peer_cn = NULL; + if (port->peer != NULL) { - X509_NAME_oneline(X509_get_subject_name(port->peer), - port->peer_dn, sizeof(port->peer_dn)); - port->peer_dn[sizeof(port->peer_dn) - 1] = '\0'; - r = X509_NAME_get_text_by_NID(X509_get_subject_name(port->peer), - NID_commonName, port->peer_cn, sizeof(port->peer_cn)); - port->peer_cn[sizeof(port->peer_cn) - 1] = '\0'; - if (r == -1) - { - /* Unable to get the CN, set it to blank so it can't be used */ - port->peer_cn[0] = '\0'; - } - else + int len; + + len = X509_NAME_get_text_by_NID(X509_get_subject_name(port->peer), + NID_commonName, NULL, 0); + if (len != -1) { + char *peer_cn; + + peer_cn = MemoryContextAlloc(TopMemoryContext, len + 1); + r = X509_NAME_get_text_by_NID(X509_get_subject_name(port->peer), + NID_commonName, peer_cn, len + 1); + peer_cn[len] = '\0'; + if (r != len) + { + /* shouldn't happen */ + pfree(peer_cn); + close_SSL(port); + return -1; + } + /* * Reject embedded NULLs in certificate common name to prevent * attacks like CVE-2009-4034. */ - if (r != strlen(port->peer_cn)) + if (len != strlen(peer_cn)) { ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("SSL certificate's common name contains embedded null"))); + pfree(peer_cn); close_SSL(port); return -1; } + + port->peer_cn = peer_cn; } } + ereport(DEBUG2, - (errmsg("SSL connection from \"%s\"", port->peer_cn))); + (errmsg("SSL connection from \"%s\"", + port->peer_cn ? port->peer_cn : "(anonymous)"))); /* set up debugging/info callback */ SSL_CTX_set_info_callback(SSL_context, info_cb); @@ -1008,6 +1019,12 @@ close_SSL(Port *port) X509_free(port->peer); port->peer = NULL; } + + if (port->peer_cn) + { + pfree(port->peer_cn); + port->peer_cn = NULL; + } } /* diff --git a/src/include/libpq/libpq-be.h b/src/include/libpq/libpq-be.h index ad670c3216f66..4d92c18974bc5 100644 --- a/src/include/libpq/libpq-be.h +++ b/src/include/libpq/libpq-be.h @@ -175,8 +175,7 @@ typedef struct Port #ifdef USE_SSL SSL *ssl; X509 *peer; - char peer_dn[128 + 1]; - char peer_cn[SM_USER + 1]; + char *peer_cn; unsigned long count; #endif } Port; diff --git a/src/interfaces/libpq/fe-secure.c b/src/interfaces/libpq/fe-secure.c index 3c1ca8c97fae2..5c4d73c3acf4a 100644 --- a/src/interfaces/libpq/fe-secure.c +++ b/src/interfaces/libpq/fe-secure.c @@ -733,6 +733,11 @@ wildcard_certificate_match(const char *pattern, const char *string) static bool verify_peer_name_matches_certificate(PGconn *conn) { + char *peer_cn; + int r; + int len; + bool result; + /* * If told not to verify the peer name, don't do it. Return true * indicating that the verification was successful. @@ -740,33 +745,81 @@ verify_peer_name_matches_certificate(PGconn *conn) if (strcmp(conn->sslmode, "verify-full") != 0) return true; + /* + * Extract the common name from the certificate. + * + * XXX: Should support alternate names here + */ + /* First find out the name's length and allocate a buffer for it. */ + len = X509_NAME_get_text_by_NID(X509_get_subject_name(conn->peer), + NID_commonName, NULL, 0); + if (len == -1) + { + printfPQExpBuffer(&conn->errorMessage, + libpq_gettext("could not get server common name from server certificate\n")); + return false; + } + peer_cn = malloc(len + 1); + if (peer_cn == NULL) + { + printfPQExpBuffer(&conn->errorMessage, + libpq_gettext("out of memory\n")); + return false; + } + + r = X509_NAME_get_text_by_NID(X509_get_subject_name(conn->peer), + NID_commonName, peer_cn, len + 1); + if (r != len) + { + /* Got different length than on the first call. Shouldn't happen. 
*/ + printfPQExpBuffer(&conn->errorMessage, + libpq_gettext("could not get server common name from server certificate\n")); + free(peer_cn); + return false; + } + peer_cn[len] = '\0'; + + /* + * Reject embedded NULLs in certificate common name to prevent attacks + * like CVE-2009-4034. + */ + if (len != strlen(peer_cn)) + { + printfPQExpBuffer(&conn->errorMessage, + libpq_gettext("SSL certificate's common name contains embedded null\n")); + free(peer_cn); + return false; + } + + /* + * We got the peer's common name. Now compare it against the originally + * given hostname. + */ if (!(conn->pghost && conn->pghost[0] != '\0')) { printfPQExpBuffer(&conn->errorMessage, libpq_gettext("host name must be specified for a verified SSL connection\n")); - return false; + result = false; } else { - /* - * Compare CN to originally given hostname. - * - * XXX: Should support alternate names here - */ - if (pg_strcasecmp(conn->peer_cn, conn->pghost) == 0) + if (pg_strcasecmp(peer_cn, conn->pghost) == 0) /* Exact name match */ - return true; - else if (wildcard_certificate_match(conn->peer_cn, conn->pghost)) + result = true; + else if (wildcard_certificate_match(peer_cn, conn->pghost)) /* Matched wildcard certificate */ - return true; + result = true; else { printfPQExpBuffer(&conn->errorMessage, libpq_gettext("server common name \"%s\" does not match host name \"%s\"\n"), - conn->peer_cn, conn->pghost); - return false; + peer_cn, conn->pghost); + result = false; } } + + free(peer_cn); + return result; } #ifdef ENABLE_THREAD_SAFETY @@ -1372,7 +1425,7 @@ open_client_SSL(PGconn *conn) * SSL_CTX_set_verify(), if root.crt exists. */ - /* pull out server distinguished and common names */ + /* get server certificate */ conn->peer = SSL_get_peer_certificate(conn->ssl); if (conn->peer == NULL) { @@ -1386,33 +1439,6 @@ open_client_SSL(PGconn *conn) return PGRES_POLLING_FAILED; } - X509_NAME_oneline(X509_get_subject_name(conn->peer), - conn->peer_dn, sizeof(conn->peer_dn)); - conn->peer_dn[sizeof(conn->peer_dn) - 1] = '\0'; - - r = X509_NAME_get_text_by_NID(X509_get_subject_name(conn->peer), - NID_commonName, conn->peer_cn, SM_USER); - conn->peer_cn[SM_USER] = '\0'; /* buffer is SM_USER+1 chars! */ - if (r == -1) - { - /* Unable to get the CN, set it to blank so it can't be used */ - conn->peer_cn[0] = '\0'; - } - else - { - /* - * Reject embedded NULLs in certificate common name to prevent attacks - * like CVE-2009-4034. - */ - if (r != strlen(conn->peer_cn)) - { - printfPQExpBuffer(&conn->errorMessage, - libpq_gettext("SSL certificate's common name contains embedded null\n")); - close_SSL(conn); - return PGRES_POLLING_FAILED; - } - } - if (!verify_peer_name_matches_certificate(conn)) { close_SSL(conn); diff --git a/src/interfaces/libpq/libpq-int.h b/src/interfaces/libpq/libpq-int.h index 987311efc8dd6..2103af8832989 100644 --- a/src/interfaces/libpq/libpq-int.h +++ b/src/interfaces/libpq/libpq-int.h @@ -406,8 +406,6 @@ struct pg_conn * attempting normal connection */ SSL *ssl; /* SSL status, if have SSL connection */ X509 *peer; /* X509 cert of server */ - char peer_dn[256 + 1]; /* peer distinguished name */ - char peer_cn[SM_USER + 1]; /* peer common name */ #ifdef USE_SSL_ENGINE ENGINE *engine; /* SSL engine, if any */ #else From 89e0bac86dbca40dfc321926205f2a90d3da5437 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 23 Feb 2012 15:53:09 -0500 Subject: [PATCH 038/129] Convert newlines to spaces in names written in pg_dump comments. 
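
As a concrete illustration of the problem described below (the object and
owner names are invented; this block is not part of the patch), consider an
identifier with an embedded newline, written here with a Unicode escape:

-- U&"..." identifiers accept Unicode escapes; \000a is a newline:
CREATE TABLE U&"evil\000a; DROP TABLE victim; --" (x int);

Before the fix, the raw name landed inside an SQL comment in the dump's TOC,
roughly:

    -- Name: evil
    ; DROP TABLE victim; --; Type: TABLE; Schema: public; Owner: alice

so the second line is live SQL when the script is reloaded. With the fix the
newline is written as a space and the comment stays on one line.
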
pg_dump was incautious about sanitizing object names that are emitted within SQL comments in its output script. A name containing a newline would at least render the script syntactically incorrect. Maliciously crafted object names could present a SQL injection risk when the script is reloaded. Reported by Heikki Linnakangas, patch by Robert Haas Security: CVE-2012-0868 --- src/bin/pg_dump/pg_backup_archiver.c | 60 ++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c index 55c84fdd47993..79f7dda211a96 100644 --- a/src/bin/pg_dump/pg_backup_archiver.c +++ b/src/bin/pg_dump/pg_backup_archiver.c @@ -99,6 +99,7 @@ static ArchiveHandle *_allocAH(const char *FileSpec, const ArchiveFormat fmt, static void _getObjectDescription(PQExpBuffer buf, TocEntry *te, ArchiveHandle *AH); static void _printTocEntry(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt, bool isData, bool acl_pass); +static char *replace_line_endings(const char *str); static void _doSetFixedOutputState(ArchiveHandle *AH); @@ -2932,6 +2933,9 @@ _printTocEntry(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt, bool isDat if (!AH->noTocComments) { const char *pfx; + char *sanitized_name; + char *sanitized_schema; + char *sanitized_owner; if (isData) pfx = "Data for "; @@ -2953,12 +2957,39 @@ _printTocEntry(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt, bool isDat ahprintf(AH, "\n"); } } + + /* + * Zap any line endings embedded in user-supplied fields, to prevent + * corruption of the dump (which could, in the worst case, present an + * SQL injection vulnerability if someone were to incautiously load a + * dump containing objects with maliciously crafted names). + */ + sanitized_name = replace_line_endings(te->tag); + if (te->namespace) + sanitized_schema = replace_line_endings(te->namespace); + else + sanitized_schema = pg_strdup("-"); + if (!ropt->noOwner) + sanitized_owner = replace_line_endings(te->owner); + else + sanitized_owner = pg_strdup("-"); + ahprintf(AH, "-- %sName: %s; Type: %s; Schema: %s; Owner: %s", - pfx, te->tag, te->desc, - te->namespace ? te->namespace : "-", - ropt->noOwner ? "-" : te->owner); + pfx, sanitized_name, te->desc, sanitized_schema, + sanitized_owner); + + free(sanitized_name); + free(sanitized_schema); + free(sanitized_owner); + if (te->tablespace && !ropt->noTablespace) - ahprintf(AH, "; Tablespace: %s", te->tablespace); + { + char *sanitized_tablespace; + + sanitized_tablespace = replace_line_endings(te->tablespace); + ahprintf(AH, "; Tablespace: %s", sanitized_tablespace); + free(sanitized_tablespace); + } ahprintf(AH, "\n"); if (AH->PrintExtraTocPtr !=NULL) @@ -3053,6 +3084,27 @@ _printTocEntry(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt, bool isDat } } +/* + * Sanitize a string to be included in an SQL comment, by replacing any + * newlines with spaces. + */ +static char * +replace_line_endings(const char *str) +{ + char *result; + char *s; + + result = pg_strdup(str); + + for (s = result; *s != '\0'; s++) + { + if (*s == '\n' || *s == '\r') + *s = ' '; + } + + return result; +} + void WriteHead(ArchiveHandle *AH) { From b2ce60703ab431a1d6c10f50587ea5f5e984af2e Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 23 Feb 2012 17:47:52 -0500 Subject: [PATCH 039/129] Last-minute release note updates. 
Security: CVE-2012-0866, CVE-2012-0867, CVE-2012-0868 --- doc/src/sgml/release-8.3.sgml | 30 ++++++++++++++++++ doc/src/sgml/release-8.4.sgml | 50 ++++++++++++++++++++++++++++++ doc/src/sgml/release-9.0.sgml | 50 ++++++++++++++++++++++++++++++ doc/src/sgml/release-9.1.sgml | 57 +++++++++++++++++++++++++++++++++++ 4 files changed, 187 insertions(+) diff --git a/doc/src/sgml/release-8.3.sgml b/doc/src/sgml/release-8.3.sgml index e80743f463b19..09f867b527dd7 100644 --- a/doc/src/sgml/release-8.3.sgml +++ b/doc/src/sgml/release-8.3.sgml @@ -34,6 +34,36 @@ + + + Require execute permission on the trigger function for + CREATE TRIGGER (Robert Haas) + + + + This missing check could allow another user to execute a trigger + function with forged input data, by installing it on a table he owns. + This is only of significance for trigger functions marked + SECURITY DEFINER, since otherwise trigger functions run + as the table owner anyway. (CVE-2012-0866) + + + + + + Convert newlines to spaces in names written in pg_dump + comments (Robert Haas) + + + + pg_dump was incautious about sanitizing object names + that are emitted within SQL comments in its output script. A name + containing a newline would at least render the script syntactically + incorrect. Maliciously crafted object names could present a SQL + injection risk when the script is reloaded. (CVE-2012-0868) + + + Fix btree index corruption from insertions concurrent with vacuuming diff --git a/doc/src/sgml/release-8.4.sgml b/doc/src/sgml/release-8.4.sgml index 2cddc5ec0c488..7dbc78e500c00 100644 --- a/doc/src/sgml/release-8.4.sgml +++ b/doc/src/sgml/release-8.4.sgml @@ -34,6 +34,56 @@ + + + Require execute permission on the trigger function for + CREATE TRIGGER (Robert Haas) + + + + This missing check could allow another user to execute a trigger + function with forged input data, by installing it on a table he owns. + This is only of significance for trigger functions marked + SECURITY DEFINER, since otherwise trigger functions run + as the table owner anyway. (CVE-2012-0866) + + + + + + Remove arbitrary limitation on length of common name in SSL + certificates (Heikki Linnakangas) + + + + Both libpq and the server truncated the common name + extracted from an SSL certificate at 32 bytes. Normally this would + cause nothing worse than an unexpected verification failure, but there + are some rather-implausible scenarios in which it might allow one + certificate holder to impersonate another. The victim would have to + have a common name exactly 32 bytes long, and the attacker would have + to persuade a trusted CA to issue a certificate in which the common + name has that string as a prefix. Impersonating a server would also + require some additional exploit to redirect client connections. + (CVE-2012-0867) + + + + + + Convert newlines to spaces in names written in pg_dump + comments (Robert Haas) + + + + pg_dump was incautious about sanitizing object names + that are emitted within SQL comments in its output script. A name + containing a newline would at least render the script syntactically + incorrect. Maliciously crafted object names could present a SQL + injection risk when the script is reloaded. 
(CVE-2012-0868) + + + Fix btree index corruption from insertions concurrent with vacuuming diff --git a/doc/src/sgml/release-9.0.sgml b/doc/src/sgml/release-9.0.sgml index 7b29590bb113e..16de221dc117d 100644 --- a/doc/src/sgml/release-9.0.sgml +++ b/doc/src/sgml/release-9.0.sgml @@ -34,6 +34,56 @@ + + + Require execute permission on the trigger function for + CREATE TRIGGER (Robert Haas) + + + + This missing check could allow another user to execute a trigger + function with forged input data, by installing it on a table he owns. + This is only of significance for trigger functions marked + SECURITY DEFINER, since otherwise trigger functions run + as the table owner anyway. (CVE-2012-0866) + + + + + + Remove arbitrary limitation on length of common name in SSL + certificates (Heikki Linnakangas) + + + + Both libpq and the server truncated the common name + extracted from an SSL certificate at 32 bytes. Normally this would + cause nothing worse than an unexpected verification failure, but there + are some rather-implausible scenarios in which it might allow one + certificate holder to impersonate another. The victim would have to + have a common name exactly 32 bytes long, and the attacker would have + to persuade a trusted CA to issue a certificate in which the common + name has that string as a prefix. Impersonating a server would also + require some additional exploit to redirect client connections. + (CVE-2012-0867) + + + + + + Convert newlines to spaces in names written in pg_dump + comments (Robert Haas) + + + + pg_dump was incautious about sanitizing object names + that are emitted within SQL comments in its output script. A name + containing a newline would at least render the script syntactically + incorrect. Maliciously crafted object names could present a SQL + injection risk when the script is reloaded. (CVE-2012-0868) + + + Fix btree index corruption from insertions concurrent with vacuuming diff --git a/doc/src/sgml/release-9.1.sgml b/doc/src/sgml/release-9.1.sgml index 46abbec10a654..ca53f5fc7d214 100644 --- a/doc/src/sgml/release-9.1.sgml +++ b/doc/src/sgml/release-9.1.sgml @@ -34,6 +34,56 @@ + + + Require execute permission on the trigger function for + CREATE TRIGGER (Robert Haas) + + + + This missing check could allow another user to execute a trigger + function with forged input data, by installing it on a table he owns. + This is only of significance for trigger functions marked + SECURITY DEFINER, since otherwise trigger functions run + as the table owner anyway. (CVE-2012-0866) + + + + + + Remove arbitrary limitation on length of common name in SSL + certificates (Heikki Linnakangas) + + + + Both libpq and the server truncated the common name + extracted from an SSL certificate at 32 bytes. Normally this would + cause nothing worse than an unexpected verification failure, but there + are some rather-implausible scenarios in which it might allow one + certificate holder to impersonate another. The victim would have to + have a common name exactly 32 bytes long, and the attacker would have + to persuade a trusted CA to issue a certificate in which the common + name has that string as a prefix. Impersonating a server would also + require some additional exploit to redirect client connections. + (CVE-2012-0867) + + + + + + Convert newlines to spaces in names written in pg_dump + comments (Robert Haas) + + + + pg_dump was incautious about sanitizing object names + that are emitted within SQL comments in its output script. 
A name + containing a newline would at least render the script syntactically + incorrect. Maliciously crafted object names could present a SQL + injection risk when the script is reloaded. (CVE-2012-0868) + + + Fix btree index corruption from insertions concurrent with vacuuming @@ -576,6 +626,13 @@ + + + Allow MinGW builds to use standardly-named OpenSSL libraries + (Tomasz Ostrowski) + + + From 0c9e5d5e0d407013bf66af01942a7b2dd3342546 Mon Sep 17 00:00:00 2001 From: Andrew Dunstan Date: Thu, 23 Feb 2012 23:44:16 -0500 Subject: [PATCH 040/129] Correctly handle NULLs in JSON output. Error reported by David Wheeler. --- src/backend/utils/adt/json.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c index d7db4cf0cf935..2968c57e3f88e 100644 --- a/src/backend/utils/adt/json.c +++ b/src/backend/utils/adt/json.c @@ -79,9 +79,10 @@ static void report_parse_error(JsonParseStack *stack, JsonLexContext *lex); static void report_invalid_token(JsonLexContext *lex); static char *extract_mb_char(char *s); static void composite_to_json(Datum composite, StringInfo result, bool use_line_feeds); -static void array_dim_to_json(StringInfo result, int dim, int ndims,int * dims, - Datum *vals, int * valcount, TYPCATEGORY tcategory, - Oid typoutputfunc, bool use_line_feeds); +static void array_dim_to_json(StringInfo result, int dim, int ndims, int *dims, + Datum *vals, bool *nulls, int *valcount, + TYPCATEGORY tcategory, Oid typoutputfunc, + bool use_line_feeds); static void array_to_json_internal(Datum array, StringInfo result, bool use_line_feeds); /* fake type category for JSON so we can distinguish it in datum_to_json */ @@ -682,13 +683,13 @@ extract_mb_char(char *s) * composite_to_json or array_to_json_internal as appropriate. */ static inline void -datum_to_json(Datum val, StringInfo result, TYPCATEGORY tcategory, +datum_to_json(Datum val, bool is_null, StringInfo result, TYPCATEGORY tcategory, Oid typoutputfunc) { char *outputstr; - if (val == (Datum) NULL) + if (is_null) { appendStringInfoString(result,"null"); return; @@ -742,8 +743,8 @@ datum_to_json(Datum val, StringInfo result, TYPCATEGORY tcategory, */ static void array_dim_to_json(StringInfo result, int dim, int ndims,int * dims, Datum *vals, - int * valcount, TYPCATEGORY tcategory, Oid typoutputfunc, - bool use_line_feeds) + bool *nulls, int * valcount, TYPCATEGORY tcategory, + Oid typoutputfunc, bool use_line_feeds) { int i; @@ -762,7 +763,8 @@ array_dim_to_json(StringInfo result, int dim, int ndims,int * dims, Datum *vals, if (dim + 1 == ndims) { - datum_to_json(vals[*valcount],result,tcategory,typoutputfunc); + datum_to_json(vals[*valcount], nulls[*valcount], result, tcategory, + typoutputfunc); (*valcount)++; } else @@ -771,8 +773,8 @@ array_dim_to_json(StringInfo result, int dim, int ndims,int * dims, Datum *vals, * Do we want line feeds on inner dimensions of arrays? * For now we'll say no. 
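
A quick behavioral check of this fix, runnable on 9.2devel where
array_to_json and row_to_json live (the pre-fix behavior described is
inferred from the bug: a zero Datum was indistinguishable from SQL NULL):

SELECT array_to_json('{0,1,NULL}'::int[]);
--  [0,1,null]
--  (before this commit the integer 0, a zero Datum, could be emitted as
--  null, since array elements carried no null flags at all)
SELECT row_to_json(ROW(0, NULL::text));
--  {"f1":0,"f2":null}
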
*/ - array_dim_to_json(result, dim+1, ndims, dims, vals, valcount, - tcategory,typoutputfunc,false); + array_dim_to_json(result, dim+1, ndims, dims, vals, nulls, + valcount, tcategory, typoutputfunc, false); } } @@ -827,7 +829,7 @@ array_to_json_internal(Datum array, StringInfo result, bool use_line_feeds) else tcategory = TypeCategory(element_type); - array_dim_to_json(result, 0, ndim, dim, elements, &count, tcategory, + array_dim_to_json(result, 0, ndim, dim, elements, nulls, &count, tcategory, typoutputfunc, use_line_feeds); pfree(elements); @@ -908,7 +910,7 @@ composite_to_json(Datum composite, StringInfo result, bool use_line_feeds) else val = origval; - datum_to_json(val, result, tcategory, typoutput); + datum_to_json(val, isnull, result, tcategory, typoutput); /* Clean up detoasted copy, if any */ if (val != origval) From 173e29aa5deefd9e71c183583ba37805c8102a72 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 24 Feb 2012 01:40:18 -0500 Subject: [PATCH 041/129] Fix the general case of quantified regex back-references. Cases where a back-reference is part of a larger subexpression that is quantified have never worked in Spencer's regex engine, because he used a compile-time transformation that neglected the need to check the back-reference match in iterations before the last one. (That was okay for capturing parens, and we still do it if the regex has *only* capturing parens ... but it's not okay for backrefs.) To make this work properly, we have to add an "iteration" node type to the regex engine's vocabulary of sub-regex nodes. Since this is a moderately large change with a fair risk of introducing new bugs of its own, apply to HEAD only, even though it's a fix for a longstanding bug. --- src/backend/regex/README | 18 +- src/backend/regex/regcomp.c | 88 ++-- src/backend/regex/regexec.c | 764 +++++++++++++++++++++++++++- src/include/regex/regguts.h | 24 +- src/test/regress/expected/regex.out | 37 ++ src/test/regress/sql/regex.sql | 8 + 6 files changed, 884 insertions(+), 55 deletions(-) diff --git a/src/backend/regex/README b/src/backend/regex/README index 3fd58c000119a..89ba6a62ea2f7 100644 --- a/src/backend/regex/README +++ b/src/backend/regex/README @@ -102,15 +102,15 @@ consists of a tree of sub-expressions ("subre"s). Leaf tree nodes are either plain regular expressions (which are executed as DFAs in the manner described above) or back-references (which try to match the input to some previous substring). Non-leaf nodes are capture nodes (which save the -location of the substring currently matching their child node) or -concatenation or alternation nodes. At execution time, the executor -recursively scans the tree. At concatenation or alternation nodes, -it considers each possible alternative way of matching the input string, -ie each place where the string could be split for a concatenation, or each -child node for an alternation. It tries the next alternative if the match -fails according to the child nodes. This is exactly the sort of -backtracking search done by a traditional NFA regex engine. If there are -many tree levels it can get very slow. +location of the substring currently matching their child node), +concatenation, alternation, or iteration nodes. At execution time, the +executor recursively scans the tree. At concatenation, alternation, or +iteration nodes, it considers each possible alternative way of matching the +input string, that is each place where the string could be split for a +concatenation or iteration, or each child node for an alternation. 
It +tries the next alternative if the match fails according to the child nodes. +This is exactly the sort of backtracking search done by a traditional NFA +regex engine. If there are many tree levels it can get very slow. But all is not lost: we can still be smarter than the average pure NFA engine. To do this, each subre node has an associated DFA, which diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 6b80140e90940..b84d0c3af55f0 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -1036,11 +1036,17 @@ parseqatom(struct vars * v, /*---------- * Prepare a general-purpose state skeleton. * - * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] - * / / - * [lp] ----> [s2] ----bypass--------------------- + * In the no-backrefs case, we want this: * - * where bypass is an empty, and prefix is some repetitions of atom + * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp] + * + * where prefix is some repetitions of atom. In the general case we need + * + * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp] + * + * where the iterator wraps around [begin] ---atom---> [end] + * + * We make the s state here for both cases; s2 is made below if needed *---------- */ s = newstate(v->nfa); /* first, new endpoints for the atom */ @@ -1051,11 +1057,9 @@ parseqatom(struct vars * v, NOERR(); atom->begin = s; atom->end = s2; - s = newstate(v->nfa); /* and spots for prefix and bypass */ - s2 = newstate(v->nfa); + s = newstate(v->nfa); /* set up starting state */ NOERR(); EMPTYARC(lp, s); - EMPTYARC(lp, s2); NOERR(); /* break remaining subRE into x{...} and what follows */ @@ -1089,28 +1093,9 @@ parseqatom(struct vars * v, } /* - * It's quantifier time. If the atom is just a BACKREF, we'll let it deal - * with quantifiers internally. Otherwise, the first step is to turn - * x{0,...} into x{1,...}|empty + * It's quantifier time. If the atom is just a backref, we'll let it deal + * with quantifiers internally. */ - if (m == 0 && atomtype != BACKREF) - { - EMPTYARC(s2, atom->end); /* the bypass */ - assert(PREF(qprefer) != 0); - f = COMBINE(qprefer, atom->flags); - t = subre(v, '|', f, lp, atom->end); - NOERR(); - t->left = atom; - t->right = subre(v, '|', PREF(f), s2, atom->end); - NOERR(); - t->right->left = subre(v, '=', 0, s2, atom->end); - NOERR(); - *atomp = t; - atomp = &t->left; - m = 1; - } - - /* deal with the rest of the quantifier */ if (atomtype == BACKREF) { /* special case: backrefs have internal quantifiers */ @@ -1120,17 +1105,25 @@ parseqatom(struct vars * v, atom->min = (short) m; atom->max = (short) n; atom->flags |= COMBINE(qprefer, atom->flags); + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; } else if (m == 1 && n == 1) { /* no/vacuous quantifier: done */ EMPTYARC(s, atom->begin); /* empty prefix */ + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; } - else + else if (m > 0 && !(atom->flags & BACKR)) { /* - * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only the - * second x + * If there's no backrefs involved, we can turn x{m,n} into + * x{m-1,n-1}x, with capturing parens in only the second x. This + * is valid because we only care about capturing matches from the + * final iteration of the quantifier. It's a win because we can + * implement the backref-free left side as a plain DFA node, since + * we don't really care where its submatches are. 
*/ dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin); assert(m >= 1 && m != INFINITY && n >= 1); @@ -1142,16 +1135,36 @@ parseqatom(struct vars * v, NOERR(); t->right = atom; *atomp = t; + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; + } + else + { + /* general case: need an iteration node */ + s2 = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, atom->end, s2); + NOERR(); + dupnfa(v->nfa, atom->begin, atom->end, s, s2); + repeat(v, s, s2, m, n); + f = COMBINE(qprefer, atom->flags); + t = subre(v, '*', f, s, s2); + NOERR(); + t->min = (short) m; + t->max = (short) n; + t->left = atom; + *atomp = t; + /* rest of branch is to be strung from iteration's end state */ } /* and finally, look after that postponed recursion */ t = top->right; if (!(SEE('|') || SEE(stopper) || SEE(EOS))) - t->right = parsebranch(v, stopper, type, atom->end, rp, 1); + t->right = parsebranch(v, stopper, type, s2, rp, 1); else { - EMPTYARC(atom->end, rp); - t->right = subre(v, '=', 0, atom->end, rp); + EMPTYARC(s2, rp); + t->right = subre(v, '=', 0, s2, rp); } assert(SEE('|') || SEE(stopper) || SEE(EOS)); t->flags |= COMBINE(t->flags, t->right->flags); @@ -1214,6 +1227,9 @@ scannum(struct vars * v) /* * repeat - replicate subNFA for quantifiers * + * The sub-NFA strung from lp to rp is modified to represent m to n + * repetitions of its initial contents. + * * The duplication sequences used here are chosen carefully so that any * pointers starting out pointing into the subexpression end up pointing into * the last occurrence. (Note that it may not be strung between the same @@ -1229,7 +1245,7 @@ repeat(struct vars * v, int n) { #define SOME 2 -#define INF 3 +#define INF 3 #define PAIR(x, y) ((x)*4 + (y)) #define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) ) const int rm = REDUCE(m); @@ -1603,7 +1619,7 @@ subre(struct vars * v, v->treechain = ret; } - assert(strchr("|.b(=", op) != NULL); + assert(strchr("=b|.*(", op) != NULL); ret->op = op; ret->flags = flags; diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index 224da5064b69b..ea16e39a6eddf 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -140,11 +140,15 @@ static void subset(struct vars *, struct subre *, chr *, chr *); static int dissect(struct vars *, struct subre *, chr *, chr *); static int condissect(struct vars *, struct subre *, chr *, chr *); static int altdissect(struct vars *, struct subre *, chr *, chr *); +static int iterdissect(struct vars *, struct subre *, chr *, chr *); +static int reviterdissect(struct vars *, struct subre *, chr *, chr *); static int cdissect(struct vars *, struct subre *, chr *, chr *); static int ccondissect(struct vars *, struct subre *, chr *, chr *); static int crevdissect(struct vars *, struct subre *, chr *, chr *); static int cbrdissect(struct vars *, struct subre *, chr *, chr *); static int caltdissect(struct vars *, struct subre *, chr *, chr *); +static int citerdissect(struct vars *, struct subre *, chr *, chr *); +static int creviterdissect(struct vars *, struct subre *, chr *, chr *); /* === rege_dfa.c === */ static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *); @@ -563,14 +567,17 @@ dissect(struct vars * v, case '=': /* terminal node */ assert(t->left == NULL && t->right == NULL); return REG_OKAY; /* no action, parent did the work */ - case '|': /* alternation */ - assert(t->left != NULL); - return altdissect(v, t, begin, end); case 'b': /* back ref -- shouldn't be calling us! 
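
The user-visible effect of the new iteration support, mirroring the
regression tests this patch adds to regex.sql further below:

SELECT 'abc abc abc' ~ '^(\w+)( \1)+$';   -- t : backref inside a quantified group
SELECT 'abc abd abc' ~ '^(\w+)( \1)+$';   -- f
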
*/ return REG_ASSERT; case '.': /* concatenation */ assert(t->left != NULL && t->right != NULL); return condissect(v, t, begin, end); + case '|': /* alternation */ + assert(t->left != NULL); + return altdissect(v, t, begin, end); + case '*': /* iteration */ + assert(t->left != NULL); + return iterdissect(v, t, begin, end); case '(': /* capturing */ assert(t->left != NULL && t->right == NULL); assert(t->subno > 0); @@ -696,6 +703,375 @@ altdissect(struct vars * v, return REG_ASSERT; /* none of them matched?!? */ } +/* + * iterdissect - iteration subexpression matches (uncomplicated) + */ +static int /* regexec return code */ +iterdissect(struct vars * v, + struct subre * t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + chr **endpts; + chr *limit; + int min_matches; + size_t max_matches; + int nverified; + int k; + int i; + int er; + + assert(t->op == '*'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(begin <= end); + + if (t->left->flags & SHORTER) /* reverse scan */ + return reviterdissect(v, t, begin, end); + + /* + * If zero matches are allowed, and target string is empty, just declare + * victory. OTOH, if target string isn't empty, zero matches can't work + * so we pretend the min is 1. + */ + min_matches = t->min; + if (min_matches <= 0) + { + if (begin == end) + return REG_OKAY; + min_matches = 1; + } + + /* + * We need workspace to track the endpoints of each sub-match. Normally + * we consider only nonzero-length sub-matches, so there can be at most + * end-begin of them. However, if min is larger than that, we will also + * consider zero-length sub-matches in order to find enough matches. + * + * For convenience, endpts[0] contains the "begin" pointer and we store + * sub-match endpoints in endpts[1..max_matches]. + */ + max_matches = end - begin; + if (max_matches > t->max && t->max != INFINITY) + max_matches = t->max; + if (max_matches < min_matches) + max_matches = min_matches; + endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); + if (endpts == NULL) + return REG_ESPACE; + endpts[0] = begin; + + d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + MDEBUG(("iter %d\n", t->retry)); + + /* + * Our strategy is to first find a set of sub-match endpoints that are + * valid according to the child node's DFA, and then recursively dissect + * each sub-match to confirm validity. If any validity check fails, + * backtrack the last sub-match and try again. And, when we next try for + * a validity check, we need not recheck any successfully verified + * sub-matches that we didn't move the endpoints of. nverified remembers + * how many sub-matches are currently known okay. 
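
A hedged illustration of the search described above (the child DFA proposes
division points, each sub-match is then verified recursively, and failures
shorten the trial sub-matches; the queries are illustrative, not from the
patch):

SELECT 'aa aa aa' ~ '^(a+)( \1)+$';   -- t : a division exists where every \1 verifies
SELECT 'aa aa a'  ~ '^(a+)( \1)+$';   -- f : every candidate division fails verification
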
+ */ + + /* initialize to consider first sub-match */ + nverified = 0; + k = 1; + limit = end; + + /* iterate until satisfaction or failure */ + while (k > 0) + { + /* try to find an endpoint for the k'th sub-match */ + endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL); + if (endpts[k] == NULL) + { + /* no match possible, so see if we can shorten previous one */ + k--; + goto backtrack; + } + MDEBUG(("%d: working endpoint %d: %ld\n", + t->retry, k, LOFF(endpts[k]))); + + /* k'th sub-match can no longer be considered verified */ + if (nverified >= k) + nverified = k - 1; + + if (endpts[k] != end) + { + /* haven't reached end yet, try another iteration if allowed */ + if (k >= max_matches) + { + /* must try to shorten some previous match */ + k--; + goto backtrack; + } + + /* reject zero-length match unless necessary to achieve min */ + if (endpts[k] == endpts[k - 1] && + (k >= min_matches || min_matches - k < end - endpts[k])) + goto backtrack; + + k++; + limit = end; + continue; + } + + /* + * We've identified a way to divide the string into k sub-matches + * that works so far as the child DFA can tell. If k is an allowed + * number of matches, start the slow part: recurse to verify each + * sub-match. We always have k <= max_matches, needn't check that. + */ + if (k < min_matches) + goto backtrack; + + MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k)); + + for (i = nverified + 1; i <= k; i++) + { + er = dissect(v, t->left, endpts[i - 1], endpts[i]); + if (er == REG_OKAY) + { + nverified = i; + continue; + } + if (er == REG_NOMATCH) + break; + /* oops, something failed */ + freedfa(d); + FREE(endpts); + return er; + } + + if (i > k) + { + /* satisfaction */ + MDEBUG(("%d successful\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_OKAY; + } + + /* match failed to verify, so backtrack */ + +backtrack: + /* + * Must consider shorter versions of the current sub-match. However, + * we'll only ask for a zero-length match if necessary. + */ + while (k > 0) + { + chr *prev_end = endpts[k - 1]; + + if (endpts[k] > prev_end) + { + limit = endpts[k] - 1; + if (limit > prev_end || + (k < min_matches && min_matches - k >= end - prev_end)) + { + /* break out of backtrack loop, continue the outer one */ + break; + } + } + /* can't shorten k'th sub-match any more, consider previous one */ + k--; + } + } + + /* all possibilities exhausted - shouldn't happen in uncomplicated mode */ + MDEBUG(("%d failed\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_ASSERT; +} + +/* + * reviterdissect - shortest-first iteration subexpression matches + */ +static int /* regexec return code */ +reviterdissect(struct vars * v, + struct subre * t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + chr **endpts; + chr *limit; + int min_matches; + size_t max_matches; + int nverified; + int k; + int i; + int er; + + assert(t->op == '*'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(t->left->flags & SHORTER); + assert(begin <= end); + + /* + * If zero matches are allowed, and target string is empty, just declare + * victory. OTOH, if target string isn't empty, zero matches can't work + * so we pretend the min is 1. + */ + min_matches = t->min; + if (min_matches <= 0) + { + if (begin == end) + return REG_OKAY; + min_matches = 1; + } + + /* + * We need workspace to track the endpoints of each sub-match. Normally + * we consider only nonzero-length sub-matches, so there can be at most + * end-begin of them. 
However, if min is larger than that, we will also + * consider zero-length sub-matches in order to find enough matches. + * + * For convenience, endpts[0] contains the "begin" pointer and we store + * sub-match endpoints in endpts[1..max_matches]. + */ + max_matches = end - begin; + if (max_matches > t->max && t->max != INFINITY) + max_matches = t->max; + if (max_matches < min_matches) + max_matches = min_matches; + endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); + if (endpts == NULL) + return REG_ESPACE; + endpts[0] = begin; + + d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + MDEBUG(("reviter %d\n", t->retry)); + + /* + * Our strategy is to first find a set of sub-match endpoints that are + * valid according to the child node's DFA, and then recursively dissect + * each sub-match to confirm validity. If any validity check fails, + * backtrack the last sub-match and try again. And, when we next try for + * a validity check, we need not recheck any successfully verified + * sub-matches that we didn't move the endpoints of. nverified remembers + * how many sub-matches are currently known okay. + */ + + /* initialize to consider first sub-match */ + nverified = 0; + k = 1; + limit = begin; + + /* iterate until satisfaction or failure */ + while (k > 0) + { + /* disallow zero-length match unless necessary to achieve min */ + if (limit == endpts[k - 1] && + limit != end && + (k >= min_matches || min_matches - k < end - limit)) + limit++; + + /* try to find an endpoint for the k'th sub-match */ + endpts[k] = shortest(v, d, endpts[k - 1], limit, end, + (chr **) NULL, (int *) NULL); + if (endpts[k] == NULL) + { + /* no match possible, so see if we can lengthen previous one */ + k--; + goto backtrack; + } + MDEBUG(("%d: working endpoint %d: %ld\n", + t->retry, k, LOFF(endpts[k]))); + + /* k'th sub-match can no longer be considered verified */ + if (nverified >= k) + nverified = k - 1; + + if (endpts[k] != end) + { + /* haven't reached end yet, try another iteration if allowed */ + if (k >= max_matches) + { + /* must try to lengthen some previous match */ + k--; + goto backtrack; + } + + k++; + limit = endpts[k - 1]; + continue; + } + + /* + * We've identified a way to divide the string into k sub-matches + * that works so far as the child DFA can tell. If k is an allowed + * number of matches, start the slow part: recurse to verify each + * sub-match. We always have k <= max_matches, needn't check that. + */ + if (k < min_matches) + goto backtrack; + + MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k)); + + for (i = nverified + 1; i <= k; i++) + { + er = dissect(v, t->left, endpts[i - 1], endpts[i]); + if (er == REG_OKAY) + { + nverified = i; + continue; + } + if (er == REG_NOMATCH) + break; + /* oops, something failed */ + freedfa(d); + FREE(endpts); + return er; + } + + if (i > k) + { + /* satisfaction */ + MDEBUG(("%d successful\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_OKAY; + } + + /* match failed to verify, so backtrack */ + +backtrack: + /* + * Must consider longer versions of the current sub-match. 
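
In SQL terms the shortest-first scan corresponds to non-greedy quantifiers
(illustrative, assuming the usual submatch reporting rules):

SELECT regexp_matches('aaa', '^(a+?)(a*)$');
--  {a,aa}    -- the non-greedy child takes the shortest workable sub-match
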
+ */ + while (k > 0) + { + if (endpts[k] < end) + { + limit = endpts[k] + 1; + /* break out of backtrack loop, continue the outer one */ + break; + } + /* can't lengthen k'th sub-match any more, consider previous one */ + k--; + } + } + + /* all possibilities exhausted - shouldn't happen in uncomplicated mode */ + MDEBUG(("%d failed\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_ASSERT; +} + /* * cdissect - determine subexpression matches (with complications) * The retry memory stores the offset of the trial midpoint from begin, @@ -717,15 +1093,18 @@ cdissect(struct vars * v, case '=': /* terminal node */ assert(t->left == NULL && t->right == NULL); return REG_OKAY; /* no action, parent did the work */ - case '|': /* alternation */ - assert(t->left != NULL); - return caltdissect(v, t, begin, end); case 'b': /* back reference */ assert(t->left == NULL && t->right == NULL); return cbrdissect(v, t, begin, end); case '.': /* concatenation */ assert(t->left != NULL && t->right != NULL); return ccondissect(v, t, begin, end); + case '|': /* alternation */ + assert(t->left != NULL); + return caltdissect(v, t, begin, end); + case '*': /* iteration */ + assert(t->left != NULL); + return citerdissect(v, t, begin, end); case '(': /* capturing */ assert(t->left != NULL && t->right == NULL); assert(t->subno > 0); @@ -847,7 +1226,7 @@ ccondissect(struct vars * v, } /* - * crevdissect - determine backref shortest-first subexpression matches + * crevdissect - shortest-first concatenation subexpression matches * The retry memory stores the offset of the trial midpoint from begin, * plus 1 so that 0 uniquely means "clean slate". */ @@ -1088,6 +1467,377 @@ caltdissect(struct vars * v, return caltdissect(v, t->right, begin, end); } +/* + * citerdissect - iteration subexpression matches (with complications) + */ +static int /* regexec return code */ +citerdissect(struct vars * v, + struct subre * t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + chr **endpts; + chr *limit; + int min_matches; + size_t max_matches; + int nverified; + int k; + int i; + int er; + + assert(t->op == '*'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(begin <= end); + + if (t->left->flags & SHORTER) /* reverse scan */ + return creviterdissect(v, t, begin, end); + + /* + * If zero matches are allowed, and target string is empty, just declare + * victory. OTOH, if target string isn't empty, zero matches can't work + * so we pretend the min is 1. + */ + min_matches = t->min; + if (min_matches <= 0) + { + if (begin == end) + return REG_OKAY; + min_matches = 1; + } + + /* + * We need workspace to track the endpoints of each sub-match. Normally + * we consider only nonzero-length sub-matches, so there can be at most + * end-begin of them. However, if min is larger than that, we will also + * consider zero-length sub-matches in order to find enough matches. + * + * For convenience, endpts[0] contains the "begin" pointer and we store + * sub-match endpoints in endpts[1..max_matches]. 
+ */ + max_matches = end - begin; + if (max_matches > t->max && t->max != INFINITY) + max_matches = t->max; + if (max_matches < min_matches) + max_matches = min_matches; + endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); + if (endpts == NULL) + return REG_ESPACE; + endpts[0] = begin; + + d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + MDEBUG(("citer %d\n", t->retry)); + + /* + * Our strategy is to first find a set of sub-match endpoints that are + * valid according to the child node's DFA, and then recursively dissect + * each sub-match to confirm validity. If any validity check fails, + * backtrack the last sub-match and try again. And, when we next try for + * a validity check, we need not recheck any successfully verified + * sub-matches that we didn't move the endpoints of. nverified remembers + * how many sub-matches are currently known okay. + */ + + /* initialize to consider first sub-match */ + nverified = 0; + k = 1; + limit = end; + + /* iterate until satisfaction or failure */ + while (k > 0) + { + /* try to find an endpoint for the k'th sub-match */ + endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL); + if (endpts[k] == NULL) + { + /* no match possible, so see if we can shorten previous one */ + k--; + goto backtrack; + } + MDEBUG(("%d: working endpoint %d: %ld\n", + t->retry, k, LOFF(endpts[k]))); + + /* k'th sub-match can no longer be considered verified */ + if (nverified >= k) + nverified = k - 1; + + if (endpts[k] != end) + { + /* haven't reached end yet, try another iteration if allowed */ + if (k >= max_matches) + { + /* must try to shorten some previous match */ + k--; + goto backtrack; + } + + /* reject zero-length match unless necessary to achieve min */ + if (endpts[k] == endpts[k - 1] && + (k >= min_matches || min_matches - k < end - endpts[k])) + goto backtrack; + + k++; + limit = end; + continue; + } + + /* + * We've identified a way to divide the string into k sub-matches + * that works so far as the child DFA can tell. If k is an allowed + * number of matches, start the slow part: recurse to verify each + * sub-match. We always have k <= max_matches, needn't check that. + */ + if (k < min_matches) + goto backtrack; + + MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k)); + + for (i = nverified + 1; i <= k; i++) + { + zapmem(v, t->left); + er = cdissect(v, t->left, endpts[i - 1], endpts[i]); + if (er == REG_OKAY) + { + nverified = i; + continue; + } + if (er == REG_NOMATCH) + break; + /* oops, something failed */ + freedfa(d); + FREE(endpts); + return er; + } + + if (i > k) + { + /* satisfaction */ + MDEBUG(("%d successful\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_OKAY; + } + + /* match failed to verify, so backtrack */ + +backtrack: + /* + * Must consider shorter versions of the current sub-match. However, + * we'll only ask for a zero-length match if necessary. 
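
The zero-length escape hatch is what lets a bounded quantifier over a
possibly-empty subexpression still reach its minimum (illustrative queries,
not from the patch):

SELECT 'xx' ~ '^(x*){3}$';   -- t : the third iteration matches empty
SELECT ''   ~ '^(x*){3}$';   -- t : all three iterations match empty
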
+ */ + while (k > 0) + { + chr *prev_end = endpts[k - 1]; + + if (endpts[k] > prev_end) + { + limit = endpts[k] - 1; + if (limit > prev_end || + (k < min_matches && min_matches - k >= end - prev_end)) + { + /* break out of backtrack loop, continue the outer one */ + break; + } + } + /* can't shorten k'th sub-match any more, consider previous one */ + k--; + } + } + + /* all possibilities exhausted */ + MDEBUG(("%d failed\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_NOMATCH; +} + +/* + * creviterdissect - shortest-first iteration subexpression matches + */ +static int /* regexec return code */ +creviterdissect(struct vars * v, + struct subre * t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + chr **endpts; + chr *limit; + int min_matches; + size_t max_matches; + int nverified; + int k; + int i; + int er; + + assert(t->op == '*'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(t->left->flags & SHORTER); + assert(begin <= end); + + /* + * If zero matches are allowed, and target string is empty, just declare + * victory. OTOH, if target string isn't empty, zero matches can't work + * so we pretend the min is 1. + */ + min_matches = t->min; + if (min_matches <= 0) + { + if (begin == end) + return REG_OKAY; + min_matches = 1; + } + + /* + * We need workspace to track the endpoints of each sub-match. Normally + * we consider only nonzero-length sub-matches, so there can be at most + * end-begin of them. However, if min is larger than that, we will also + * consider zero-length sub-matches in order to find enough matches. + * + * For convenience, endpts[0] contains the "begin" pointer and we store + * sub-match endpoints in endpts[1..max_matches]. + */ + max_matches = end - begin; + if (max_matches > t->max && t->max != INFINITY) + max_matches = t->max; + if (max_matches < min_matches) + max_matches = min_matches; + endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); + if (endpts == NULL) + return REG_ESPACE; + endpts[0] = begin; + + d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + MDEBUG(("creviter %d\n", t->retry)); + + /* + * Our strategy is to first find a set of sub-match endpoints that are + * valid according to the child node's DFA, and then recursively dissect + * each sub-match to confirm validity. If any validity check fails, + * backtrack the last sub-match and try again. And, when we next try for + * a validity check, we need not recheck any successfully verified + * sub-matches that we didn't move the endpoints of. nverified remembers + * how many sub-matches are currently known okay. 
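
One case that deliberately bypasses this machinery: a backref that is itself
quantified keeps its min/max inside the backref node (see the regguts.h
comment further below), so no division-point search is needed. Illustrative
query:

SELECT 'bbb' ~ '^(b)\1{2}$';   -- t : handled by cbrdissect, no iteration node
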
+ */ + + /* initialize to consider first sub-match */ + nverified = 0; + k = 1; + limit = begin; + + /* iterate until satisfaction or failure */ + while (k > 0) + { + /* disallow zero-length match unless necessary to achieve min */ + if (limit == endpts[k - 1] && + limit != end && + (k >= min_matches || min_matches - k < end - limit)) + limit++; + + /* try to find an endpoint for the k'th sub-match */ + endpts[k] = shortest(v, d, endpts[k - 1], limit, end, + (chr **) NULL, (int *) NULL); + if (endpts[k] == NULL) + { + /* no match possible, so see if we can lengthen previous one */ + k--; + goto backtrack; + } + MDEBUG(("%d: working endpoint %d: %ld\n", + t->retry, k, LOFF(endpts[k]))); + + /* k'th sub-match can no longer be considered verified */ + if (nverified >= k) + nverified = k - 1; + + if (endpts[k] != end) + { + /* haven't reached end yet, try another iteration if allowed */ + if (k >= max_matches) + { + /* must try to lengthen some previous match */ + k--; + goto backtrack; + } + + k++; + limit = endpts[k - 1]; + continue; + } + + /* + * We've identified a way to divide the string into k sub-matches + * that works so far as the child DFA can tell. If k is an allowed + * number of matches, start the slow part: recurse to verify each + * sub-match. We always have k <= max_matches, needn't check that. + */ + if (k < min_matches) + goto backtrack; + + MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k)); + + for (i = nverified + 1; i <= k; i++) + { + zapmem(v, t->left); + er = cdissect(v, t->left, endpts[i - 1], endpts[i]); + if (er == REG_OKAY) + { + nverified = i; + continue; + } + if (er == REG_NOMATCH) + break; + /* oops, something failed */ + freedfa(d); + FREE(endpts); + return er; + } + + if (i > k) + { + /* satisfaction */ + MDEBUG(("%d successful\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_OKAY; + } + + /* match failed to verify, so backtrack */ + +backtrack: + /* + * Must consider longer versions of the current sub-match. + */ + while (k > 0) + { + if (endpts[k] < end) + { + limit = endpts[k] + 1; + /* break out of backtrack loop, continue the outer one */ + break; + } + /* can't lengthen k'th sub-match any more, consider previous one */ + k--; + } + } + + /* all possibilities exhausted */ + MDEBUG(("%d failed\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_NOMATCH; +} + #include "rege_dfa.c" diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index fb6789b560f38..d420ea8316e18 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -372,10 +372,28 @@ struct cnfa /* * subexpression tree + * + * "op" is one of: + * '=' plain regex without interesting substructure (implemented as DFA) + * 'b' back-reference (has no substructure either) + * '(' capture node: captures the match of its single child + * '.' concatenation: matches a match for left, then a match for right + * '|' alternation: matches a match for left or a match for right + * '*' iteration: matches some number of matches of its single child + * + * Note: the right child of an alternation must be another alternation or + * NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you + * might expect. This could stand to be changed. Actually I'd rather see + * a single alternation node with N children, but that will take revising + * the representation of struct subre. + * + * Note: when a backref is directly quantified, we stick the min/max counts + * into the backref rather than plastering an iteration node on top. 
This is + * for efficiency: there is no need to search for possible division points. */ struct subre { - char op; /* '|', '.' (concat), 'b' (backref), '(', '=' */ + char op; /* see type codes above */ char flags; #define LONGER 01 /* prefers longer match */ #define SHORTER 02 /* prefers shorter match */ @@ -393,8 +411,8 @@ struct subre #define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) short retry; /* index into retry memory */ int subno; /* subexpression number (for 'b' and '(') */ - short min; /* min repetitions, for backref only */ - short max; /* max repetitions, for backref only */ + short min; /* min repetitions for iteration or backref */ + short max; /* max repetitions for iteration or backref */ struct subre *left; /* left child, if any (also freelist chain) */ struct subre *right; /* right child, if any */ struct state *begin; /* outarcs from here... */ diff --git a/src/test/regress/expected/regex.out b/src/test/regress/expected/regex.out index 5694908163af8..4acc4a47a03bc 100644 --- a/src/test/regress/expected/regex.out +++ b/src/test/regress/expected/regex.out @@ -34,3 +34,40 @@ select 'b' ~ '^([bc])\1*$' as t; t (1 row) +-- Test quantified backref within a larger expression +select 'abc abc abc' ~ '^(\w+)( \1)+$' as t; + t +--- + t +(1 row) + +select 'abc abd abc' ~ '^(\w+)( \1)+$' as f; + f +--- + f +(1 row) + +select 'abc abc abd' ~ '^(\w+)( \1)+$' as f; + f +--- + f +(1 row) + +select 'abc abc abc' ~ '^(.+)( \1)+$' as t; + t +--- + t +(1 row) + +select 'abc abd abc' ~ '^(.+)( \1)+$' as f; + f +--- + f +(1 row) + +select 'abc abc abd' ~ '^(.+)( \1)+$' as f; + f +--- + f +(1 row) + diff --git a/src/test/regress/sql/regex.sql b/src/test/regress/sql/regex.sql index 242a81ef3298a..b5315a3df6ddd 100644 --- a/src/test/regress/sql/regex.sql +++ b/src/test/regress/sql/regex.sql @@ -11,3 +11,11 @@ select 'ccc' ~ '^([bc])\1*$' as t; select 'xxx' ~ '^([bc])\1*$' as f; select 'bbc' ~ '^([bc])\1*$' as f; select 'b' ~ '^([bc])\1*$' as t; + +-- Test quantified backref within a larger expression +select 'abc abc abc' ~ '^(\w+)( \1)+$' as t; +select 'abc abd abc' ~ '^(\w+)( \1)+$' as f; +select 'abc abc abd' ~ '^(\w+)( \1)+$' as f; +select 'abc abc abc' ~ '^(.+)( \1)+$' as t; +select 'abc abd abc' ~ '^(.+)( \1)+$' as f; +select 'abc abc abd' ~ '^(.+)( \1)+$' as f; From 9cfd800aab9ee3c3b0b2b11ab41e129cc92dc15b Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Fri, 24 Feb 2012 11:04:45 +0200 Subject: [PATCH 042/129] Add some enumeration commas, for consistency --- src/backend/access/gist/gistbuild.c | 2 +- src/backend/commands/async.c | 2 +- src/backend/commands/tablecmds.c | 2 +- src/backend/libpq/hba.c | 6 +++--- src/backend/utils/adt/float.c | 2 +- src/backend/utils/adt/numeric.c | 2 +- src/backend/utils/misc/guc.c | 4 ++-- src/bin/pg_basebackup/pg_basebackup.c | 2 +- src/bin/pg_dump/pg_dump.c | 2 +- src/bin/pg_dump/pg_restore.c | 2 +- src/test/regress/expected/numeric.out | 4 ++-- 11 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 3dc3e96df4d9c..988896d289f5a 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -248,7 +248,7 @@ gistValidateBufferingOption(char *value) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid value for \"buffering\" option"), - errdetail("Valid values are \"on\", \"off\" and \"auto\"."))); + errdetail("Valid values are \"on\", \"off\", and \"auto\"."))); } } diff --git a/src/backend/commands/async.c 
b/src/backend/commands/async.c index a81f7c76c3eb0..fcb087ed15dfb 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -739,7 +739,7 @@ AtPrepare_Notify(void) if (pendingActions || pendingNotifies) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot PREPARE a transaction that has executed LISTEN, UNLISTEN or NOTIFY"))); + errmsg("cannot PREPARE a transaction that has executed LISTEN, UNLISTEN, or NOTIFY"))); } /* diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 07dc326cea7c4..28889c1f44040 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -2115,7 +2115,7 @@ renameatt_check(Oid myrelid, Form_pg_class classform, bool recursing) relkind != RELKIND_FOREIGN_TABLE) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("\"%s\" is not a table, view, composite type, index or foreign table", + errmsg("\"%s\" is not a table, view, composite type, index, or foreign table", NameStr(classform->relname)))); /* diff --git a/src/backend/libpq/hba.c b/src/backend/libpq/hba.c index a83b52ea29065..56229cb4df8b2 100644 --- a/src/backend/libpq/hba.c +++ b/src/backend/libpq/hba.c @@ -1391,7 +1391,7 @@ parse_hba_auth_opt(char *name, char *val, HbaLine *hbaline, int line_num) hbaline->auth_method != uaGSS && hbaline->auth_method != uaSSPI && hbaline->auth_method != uaCert) - INVALID_AUTH_OPTION("map", gettext_noop("ident, peer, krb5, gssapi, sspi and cert")); + INVALID_AUTH_OPTION("map", gettext_noop("ident, peer, krb5, gssapi, sspi, and cert")); hbaline->usermap = pstrdup(val); } else if (strcmp(name, "clientcert") == 0) @@ -1510,7 +1510,7 @@ parse_hba_auth_opt(char *name, char *val, HbaLine *hbaline, int line_num) if (hbaline->auth_method != uaKrb5 && hbaline->auth_method != uaGSS && hbaline->auth_method != uaSSPI) - INVALID_AUTH_OPTION("krb_realm", gettext_noop("krb5, gssapi and sspi")); + INVALID_AUTH_OPTION("krb_realm", gettext_noop("krb5, gssapi, and sspi")); hbaline->krb_realm = pstrdup(val); } else if (strcmp(name, "include_realm") == 0) @@ -1518,7 +1518,7 @@ parse_hba_auth_opt(char *name, char *val, HbaLine *hbaline, int line_num) if (hbaline->auth_method != uaKrb5 && hbaline->auth_method != uaGSS && hbaline->auth_method != uaSSPI) - INVALID_AUTH_OPTION("include_realm", gettext_noop("krb5, gssapi and sspi")); + INVALID_AUTH_OPTION("include_realm", gettext_noop("krb5, gssapi, and sspi")); if (strcmp(val, "1") == 0) hbaline->include_realm = true; else diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c index 245030f531c53..ca0042a1762dc 100644 --- a/src/backend/utils/adt/float.c +++ b/src/backend/utils/adt/float.c @@ -2750,7 +2750,7 @@ width_bucket_float8(PG_FUNCTION_ARGS) if (isnan(operand) || isnan(bound1) || isnan(bound2)) ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), - errmsg("operand, lower bound and upper bound cannot be NaN"))); + errmsg("operand, lower bound, and upper bound cannot be NaN"))); /* Note that we allow "operand" to be infinite */ if (isinf(bound1) || isinf(bound2)) diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index cee6cab954c7c..627870a4464e3 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -1223,7 +1223,7 @@ width_bucket_numeric(PG_FUNCTION_ARGS) NUMERIC_IS_NAN(bound2)) ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), - errmsg("operand, lower bound and upper bound cannot be NaN"))); + errmsg("operand, 
lower bound, and upper bound cannot be NaN"))); init_var(&result_var); init_var(&count_var); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 84b330c6d3924..a5becbe8ff775 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2240,7 +2240,7 @@ static struct config_int ConfigureNamesInt[] = }, { {"autovacuum_analyze_threshold", PGC_SIGHUP, AUTOVACUUM, - gettext_noop("Minimum number of tuple inserts, updates or deletes prior to analyze."), + gettext_noop("Minimum number of tuple inserts, updates, or deletes prior to analyze."), NULL }, &autovacuum_anl_thresh, @@ -2494,7 +2494,7 @@ static struct config_real ConfigureNamesReal[] = }, { {"autovacuum_analyze_scale_factor", PGC_SIGHUP, AUTOVACUUM, - gettext_noop("Number of tuple inserts, updates or deletes prior to analyze as a fraction of reltuples."), + gettext_noop("Number of tuple inserts, updates, or deletes prior to analyze as a fraction of reltuples."), NULL }, &autovacuum_anl_scale, diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index aabbdac8d007f..b39d2e7bf3807 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -1245,7 +1245,7 @@ main(int argc, char **argv) streamwal = true; else { - fprintf(stderr, _("%s: invalid xlog option \"%s\", must be empty, \"fetch\" or \"stream\"\n"), + fprintf(stderr, _("%s: invalid xlog option \"%s\", must be empty, \"fetch\", or \"stream\"\n"), progname, optarg); exit(1); } diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index c3c861d85787f..d845c90ac8283 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -817,7 +817,7 @@ help(const char *progname) printf(_(" --no-tablespaces do not dump tablespace assignments\n")); printf(_(" --no-unlogged-table-data do not dump unlogged table data\n")); printf(_(" --quote-all-identifiers quote all identifiers, even if not key words\n")); - printf(_(" --section=SECTION dump named section (pre-data, data or post-data)\n")); + printf(_(" --section=SECTION dump named section (pre-data, data, or post-data)\n")); printf(_(" --serializable-deferrable wait until the dump can run without anomalies\n")); printf(_(" --use-set-session-authorization\n" " use SET SESSION AUTHORIZATION commands instead of\n" diff --git a/src/bin/pg_dump/pg_restore.c b/src/bin/pg_dump/pg_restore.c index 1c026cf91a5bc..b5f4c627c4063 100644 --- a/src/bin/pg_dump/pg_restore.c +++ b/src/bin/pg_dump/pg_restore.c @@ -463,7 +463,7 @@ usage(const char *progname) " created\n")); printf(_(" --no-security-labels do not restore security labels\n")); printf(_(" --no-tablespaces do not restore tablespace assignments\n")); - printf(_(" --section=SECTION restore named section (pre-data, data or post-data)\n")); + printf(_(" --section=SECTION restore named section (pre-data, data, or post-data)\n")); printf(_(" --use-set-session-authorization\n" " use SET SESSION AUTHORIZATION commands instead of\n" " ALTER OWNER commands to set ownership\n")); diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out index 71c520e5e7a03..94cb0ed551dbe 100644 --- a/src/test/regress/expected/numeric.out +++ b/src/test/regress/expected/numeric.out @@ -746,9 +746,9 @@ ERROR: count must be greater than zero SELECT width_bucket(3.5::float8, 3.0::float8, 3.0::float8, 888); ERROR: lower bound cannot equal upper bound SELECT width_bucket('NaN', 3.0, 4.0, 888); -ERROR: operand, lower bound and upper bound cannot be NaN +ERROR: 
operand, lower bound, and upper bound cannot be NaN SELECT width_bucket(0::float8, 'NaN', 4.0::float8, 888); -ERROR: operand, lower bound and upper bound cannot be NaN +ERROR: operand, lower bound, and upper bound cannot be NaN -- normal operation CREATE TABLE width_bucket_test (operand_num numeric, operand_f8 float8); COPY width_bucket_test (operand_num) FROM stdin; From 3aa42c25c3d1cab8dcbdff913aa60e404f67de1f Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Fri, 24 Feb 2012 20:40:29 +0200 Subject: [PATCH 043/129] Put Debian package list back in alphabetical order --- doc/src/sgml/docguide.sgml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/src/sgml/docguide.sgml b/doc/src/sgml/docguide.sgml index 95d00586aac4a..0b40a57fd99fa 100644 --- a/doc/src/sgml/docguide.sgml +++ b/doc/src/sgml/docguide.sgml @@ -272,7 +272,7 @@ CATALOG "docbook/4.2/catalog" available for Debian GNU/Linux. To install, simply use: -apt-get install docbook docbook-dsssl docbook-xsl openjade1.3 xsltproc opensp +apt-get install docbook docbook-dsssl docbook-xsl openjade1.3 opensp xsltproc From 7c19f9d139e76529bbb94b1bde7bc4c19f1bc37a Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Fri, 24 Feb 2012 16:17:07 -0500 Subject: [PATCH 044/129] Update src/tools/make_ctags to avoid Exuberant tags option that has been renamed and undocumented since 2003; instead, use the documented option. Add comments. --- src/tools/make_ctags | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/src/tools/make_ctags b/src/tools/make_ctags index 4fc01f543d025..37ac134aa3dff 100755 --- a/src/tools/make_ctags +++ b/src/tools/make_ctags @@ -5,20 +5,41 @@ trap "rm -f /tmp/$$" 0 1 2 3 15 rm -f ./tags -cv=`ctags --version 2>&1 | grep Exuberant` +IS_EXUBERANT="" +ctags --version 2>&1 | grep Exuberant && IS_EXUBERANT="Y" -if [ -z "$cv" ] -then FLAGS="-dt" -else FLAGS="--c-types=+dfmstuv" +# List of kinds supported by Exuberant Ctags 5.8 +# generated by ctags --list-kinds +# c classes +# d macro definitions +# e enumerators (values inside an enumeration) +# f function definitions +# g enumeration names +# l local variables [off] +# m class, struct, and union members +# n namespaces +# p function prototypes [off] +# s structure names +# t typedefs +# u union names +# v variable definitions +# x external and forward variable declarations [off] + +if [ "$IS_EXUBERANT" ] +then FLAGS="--c-kinds=+dfmstuv" +else FLAGS="-dt" fi -find `pwd`/ \( -name _deadcode -prune \) -o \ - -type f -name '*.[chyl]' -print | - xargs ctags "$FLAGS" -a -f tags +# this is outputting the tags into the file 'tags', and appending +find `pwd`/ -type f -name '*.[chyl]' -print | + xargs ctags -a -f tags "$FLAGS" -if [ -z "$cv" ] -then - LC_ALL=C +# Exuberant tags has a header that we cannot sort in with the other entries +# so we skip the sort step +# Why are we sorting this? I guess some tag implementation need this, +# particularly for append mode. bjm 2012-02-24 +if [ ! "$IS_EXUBERANT" ] +then LC_ALL=C export LC_ALL sort tags >/tmp/$$ && mv /tmp/$$ tags fi From 1fbacbf998678ae8d5868aeea3e83c915e940d0a Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Fri, 24 Feb 2012 16:19:18 -0500 Subject: [PATCH 045/129] Mention original ctags option name. 
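For reference, a minimal standalone sketch of the flavor probe make_ctags relies on (an illustration, not code from the tree; it assumes only a POSIX shell and some ctags binary in PATH):

#!/bin/sh
# Probe the installed ctags the same way make_ctags does: Exuberant
# Ctags identifies itself in its --version output, while a traditional
# ctags does not (it usually rejects the flag, hence the 2>&1).
IS_EXUBERANT=""
ctags --version 2>&1 | grep Exuberant >/dev/null && IS_EXUBERANT="Y"

if [ "$IS_EXUBERANT" ]
then	echo "Exuberant Ctags: use --c-kinds=+dfmstuv (spelled --c-types before 2003)"
else	echo "traditional ctags: use -dt"
fi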
--- src/tools/make_ctags | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tools/make_ctags b/src/tools/make_ctags index 37ac134aa3dff..1609c07675437 100755 --- a/src/tools/make_ctags +++ b/src/tools/make_ctags @@ -10,6 +10,7 @@ ctags --version 2>&1 | grep Exuberant && IS_EXUBERANT="Y" # List of kinds supported by Exuberant Ctags 5.8 # generated by ctags --list-kinds +# --c-kinds was called --c-types before 2003 # c classes # d macro definitions # e enumerators (values inside an enumeration) From 3cbfe485e44d055b9e6a27e47069729375059f8c Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 24 Feb 2012 03:36:49 -0500 Subject: [PATCH 046/129] Remove useless "retry memory" logic within regex engine. Apparently some primordial version of Spencer's engine needed cdissect() and child functions to be able to continue matching from a previous position when re-called. That is dead code, though, since trivial inspection shows that cdissect can never be entered without having previously done zapmem which resets the relevant retry counter. I have also verified experimentally that no case in the Tcl regression tests reaches cdissect with a nonzero retry value. Accordingly, remove that logic. This doesn't really save any noticeable number of cycles in itself, but it is one step towards making dissect() and cdissect() equivalent, which will allow removing hundreds of lines of near-duplicated code. Since struct subre's "retry" field is no longer particularly related to any kind of retry, rename it to "id". As of this commit it's only used for identifying a subre node in debug printouts, so you might think we should get rid of the field entirely; but I have a plan for another use. --- src/backend/regex/regcomp.c | 14 +-- src/backend/regex/regexec.c | 212 ++++++++++++------------------------ src/include/regex/regguts.h | 2 +- 3 files changed, 78 insertions(+), 150 deletions(-) diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index b84d0c3af55f0..7fd0b07e2c5d4 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -1623,7 +1623,7 @@ subre(struct vars * v, ret->op = op; ret->flags = flags; - ret->retry = 0; + ret->id = 0; /* will be assigned later */ ret->subno = 0; ret->min = ret->max = 1; ret->left = NULL; @@ -1693,7 +1693,7 @@ optst(struct vars * v, } /* - * numst - number tree nodes (assigning retry indexes) + * numst - number tree nodes (assigning "id" indexes) */ static int /* next number */ numst(struct subre * t, @@ -1704,7 +1704,7 @@ numst(struct subre * t, assert(t != NULL); i = start; - t->retry = (short) i++; + t->id = (short) i++; if (t->left != NULL) i = numst(t->left, i); if (t->right != NULL) @@ -1999,11 +1999,11 @@ stid(struct subre * t, char *buf, size_t bufsize) { - /* big enough for hex int or decimal t->retry? */ - if (bufsize < sizeof(void *) * 2 + 3 || bufsize < sizeof(t->retry) * 3 + 1) + /* big enough for hex int or decimal t->id? 
*/ + if (bufsize < sizeof(void *) * 2 + 3 || bufsize < sizeof(t->id) * 3 + 1) return "unable"; - if (t->retry != 0) - sprintf(buf, "%d", t->retry); + if (t->id != 0) + sprintf(buf, "%d", t->id); else sprintf(buf, "%p", t); return buf; diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index ea16e39a6eddf..55f0c18d14f6a 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -112,7 +112,6 @@ struct vars chr *search_start; /* search start of string */ chr *stop; /* just past end of string */ int err; /* error code if any (0 none) */ - regoff_t *mem; /* memory vector for backtracking */ struct smalldfa dfa1; struct smalldfa dfa2; }; @@ -134,8 +133,8 @@ struct vars static int find(struct vars *, struct cnfa *, struct colormap *); static int cfind(struct vars *, struct cnfa *, struct colormap *); static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **); -static void zapsubs(regmatch_t *, size_t); -static void zapmem(struct vars *, struct subre *); +static void zapallsubs(regmatch_t *, size_t); +static void zaptreesubs(struct vars *, struct subre *); static void subset(struct vars *, struct subre *, chr *, chr *); static int dissect(struct vars *, struct subre *, chr *, chr *); static int condissect(struct vars *, struct subre *, chr *, chr *); @@ -186,9 +185,6 @@ pg_regexec(regex_t *re, #define LOCALMAT 20 regmatch_t mat[LOCALMAT]; -#define LOCALMEM 40 - regoff_t mem[LOCALMEM]; - /* sanity checks */ if (re == NULL || string == NULL || re->re_magic != REMAGIC) return REG_INVARG; @@ -229,24 +225,6 @@ pg_regexec(regex_t *re, v->search_start = (chr *) string + search_start; v->stop = (chr *) string + len; v->err = 0; - if (backref) - { - /* need retry memory */ - assert(v->g->ntree >= 0); - n = (size_t) v->g->ntree; - if (n <= LOCALMEM) - v->mem = mem; - else - v->mem = (regoff_t *) MALLOC(n * sizeof(regoff_t)); - if (v->mem == NULL) - { - if (v->pmatch != pmatch && v->pmatch != mat) - FREE(v->pmatch); - return REG_ESPACE; - } - } - else - v->mem = NULL; /* do it */ assert(v->g->tree != NULL); @@ -258,7 +236,7 @@ pg_regexec(regex_t *re, /* copy (portion of) match vector over if necessary */ if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) { - zapsubs(pmatch, nmatch); + zapallsubs(pmatch, nmatch); n = (nmatch < v->nmatch) ? 
nmatch : v->nmatch; memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t)); } @@ -266,8 +244,6 @@ pg_regexec(regex_t *re, /* clean up */ if (v->pmatch != pmatch && v->pmatch != mat) FREE(v->pmatch); - if (v->mem != NULL && v->mem != mem) - FREE(v->mem); return st; } @@ -354,7 +330,7 @@ find(struct vars * v, return REG_OKAY; /* submatches */ - zapsubs(v->pmatch, v->nmatch); + zapallsubs(v->pmatch, v->nmatch); return dissect(v, v->g->tree, begin, end); } @@ -451,8 +427,7 @@ cfindloop(struct vars * v, if (end == NULL) break; /* NOTE BREAK OUT */ MDEBUG(("tentative end %ld\n", LOFF(end))); - zapsubs(v->pmatch, v->nmatch); - zapmem(v, v->g->tree); + zapallsubs(v->pmatch, v->nmatch); er = cdissect(v, v->g->tree, begin, end); if (er == REG_OKAY) { @@ -490,11 +465,11 @@ cfindloop(struct vars * v, } /* - * zapsubs - initialize the subexpression matches to "no match" + * zapallsubs - initialize all subexpression matches to "no match" */ static void -zapsubs(regmatch_t *p, - size_t n) +zapallsubs(regmatch_t *p, + size_t n) { size_t i; @@ -506,17 +481,12 @@ zapsubs(regmatch_t *p, } /* - * zapmem - initialize the retry memory of a subtree to zeros + * zaptreesubs - initialize subexpressions within subtree to "no match" */ static void -zapmem(struct vars * v, - struct subre * t) +zaptreesubs(struct vars * v, + struct subre * t) { - if (t == NULL) - return; - - assert(v->mem != NULL); - v->mem[t->retry] = 0; if (t->op == '(') { assert(t->subno > 0); @@ -525,9 +495,9 @@ zapmem(struct vars * v, } if (t->left != NULL) - zapmem(v, t->left); + zaptreesubs(v, t->left); if (t->right != NULL) - zapmem(v, t->right); + zaptreesubs(v, t->right); } /* @@ -767,7 +737,7 @@ iterdissect(struct vars * v, FREE(endpts); return v->err; } - MDEBUG(("iter %d\n", t->retry)); + MDEBUG(("iter %d\n", t->id)); /* * Our strategy is to first find a set of sub-match endpoints that are @@ -796,7 +766,7 @@ iterdissect(struct vars * v, goto backtrack; } MDEBUG(("%d: working endpoint %d: %ld\n", - t->retry, k, LOFF(endpts[k]))); + t->id, k, LOFF(endpts[k]))); /* k'th sub-match can no longer be considered verified */ if (nverified >= k) @@ -831,7 +801,7 @@ iterdissect(struct vars * v, if (k < min_matches) goto backtrack; - MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k)); + MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k)); for (i = nverified + 1; i <= k; i++) { @@ -852,7 +822,7 @@ iterdissect(struct vars * v, if (i > k) { /* satisfaction */ - MDEBUG(("%d successful\n", t->retry)); + MDEBUG(("%d successful\n", t->id)); freedfa(d); FREE(endpts); return REG_OKAY; @@ -885,7 +855,7 @@ iterdissect(struct vars * v, } /* all possibilities exhausted - shouldn't happen in uncomplicated mode */ - MDEBUG(("%d failed\n", t->retry)); + MDEBUG(("%d failed\n", t->id)); freedfa(d); FREE(endpts); return REG_ASSERT; @@ -953,7 +923,7 @@ reviterdissect(struct vars * v, FREE(endpts); return v->err; } - MDEBUG(("reviter %d\n", t->retry)); + MDEBUG(("reviter %d\n", t->id)); /* * Our strategy is to first find a set of sub-match endpoints that are @@ -989,7 +959,7 @@ reviterdissect(struct vars * v, goto backtrack; } MDEBUG(("%d: working endpoint %d: %ld\n", - t->retry, k, LOFF(endpts[k]))); + t->id, k, LOFF(endpts[k]))); /* k'th sub-match can no longer be considered verified */ if (nverified >= k) @@ -1019,7 +989,7 @@ reviterdissect(struct vars * v, if (k < min_matches) goto backtrack; - MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k)); + MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k)); for (i = nverified + 1; 
i <= k; i++) { @@ -1040,7 +1010,7 @@ reviterdissect(struct vars * v, if (i > k) { /* satisfaction */ - MDEBUG(("%d successful\n", t->retry)); + MDEBUG(("%d successful\n", t->id)); freedfa(d); FREE(endpts); return REG_OKAY; @@ -1066,7 +1036,7 @@ reviterdissect(struct vars * v, } /* all possibilities exhausted - shouldn't happen in uncomplicated mode */ - MDEBUG(("%d failed\n", t->retry)); + MDEBUG(("%d failed\n", t->id)); freedfa(d); FREE(endpts); return REG_ASSERT; @@ -1074,8 +1044,6 @@ reviterdissect(struct vars * v, /* * cdissect - determine subexpression matches (with complications) - * The retry memory stores the offset of the trial midpoint from begin, - * plus 1 so that 0 uniquely means "clean slate". */ static int /* regexec return code */ cdissect(struct vars * v, @@ -1119,8 +1087,6 @@ cdissect(struct vars * v, /* * ccondissect - concatenation subexpression matches (with complications) - * The retry memory stores the offset of the trial midpoint from begin, - * plus 1 so that 0 uniquely means "clean slate". */ static int /* regexec return code */ ccondissect(struct vars * v, @@ -1149,26 +1115,17 @@ ccondissect(struct vars * v, freedfa(d); return v->err; } - MDEBUG(("cconcat %d\n", t->retry)); + MDEBUG(("cconcat %d\n", t->id)); /* pick a tentative midpoint */ - if (v->mem[t->retry] == 0) - { - mid = longest(v, d, begin, end, (int *) NULL); - if (mid == NULL) - { - freedfa(d); - freedfa(d2); - return REG_NOMATCH; - } - MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); - v->mem[t->retry] = (mid - begin) + 1; - } - else + mid = longest(v, d, begin, end, (int *) NULL); + if (mid == NULL) { - mid = begin + (v->mem[t->retry] - 1); - MDEBUG(("working midpoint %ld\n", LOFF(mid))); + freedfa(d); + freedfa(d2); + return REG_NOMATCH; } + MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); /* iterate until satisfaction or failure */ for (;;) @@ -1201,7 +1158,7 @@ ccondissect(struct vars * v, if (mid == begin) { /* all possibilities exhausted */ - MDEBUG(("%d no midpoint\n", t->retry)); + MDEBUG(("%d no midpoint\n", t->id)); freedfa(d); freedfa(d2); return REG_NOMATCH; @@ -1210,15 +1167,14 @@ ccondissect(struct vars * v, if (mid == NULL) { /* failed to find a new one */ - MDEBUG(("%d failed midpoint\n", t->retry)); + MDEBUG(("%d failed midpoint\n", t->id)); freedfa(d); freedfa(d2); return REG_NOMATCH; } - MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); - v->mem[t->retry] = (mid - begin) + 1; - zapmem(v, t->left); - zapmem(v, t->right); + MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); + zaptreesubs(v, t->left); + zaptreesubs(v, t->right); } /* can't get here */ @@ -1227,8 +1183,6 @@ ccondissect(struct vars * v, /* * crevdissect - shortest-first concatenation subexpression matches - * The retry memory stores the offset of the trial midpoint from begin, - * plus 1 so that 0 uniquely means "clean slate". 
*/ static int /* regexec return code */ crevdissect(struct vars * v, @@ -1256,26 +1210,17 @@ crevdissect(struct vars * v, freedfa(d); return v->err; } - MDEBUG(("crev %d\n", t->retry)); + MDEBUG(("crev %d\n", t->id)); /* pick a tentative midpoint */ - if (v->mem[t->retry] == 0) - { - mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL); - if (mid == NULL) - { - freedfa(d); - freedfa(d2); - return REG_NOMATCH; - } - MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); - v->mem[t->retry] = (mid - begin) + 1; - } - else + mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL); + if (mid == NULL) { - mid = begin + (v->mem[t->retry] - 1); - MDEBUG(("working midpoint %ld\n", LOFF(mid))); + freedfa(d); + freedfa(d2); + return REG_NOMATCH; } + MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); /* iterate until satisfaction or failure */ for (;;) @@ -1308,7 +1253,7 @@ crevdissect(struct vars * v, if (mid == end) { /* all possibilities exhausted */ - MDEBUG(("%d no midpoint\n", t->retry)); + MDEBUG(("%d no midpoint\n", t->id)); freedfa(d); freedfa(d2); return REG_NOMATCH; @@ -1317,15 +1262,14 @@ crevdissect(struct vars * v, if (mid == NULL) { /* failed to find a new one */ - MDEBUG(("%d failed midpoint\n", t->retry)); + MDEBUG(("%d failed midpoint\n", t->id)); freedfa(d); freedfa(d2); return REG_NOMATCH; } - MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); - v->mem[t->retry] = (mid - begin) + 1; - zapmem(v, t->left); - zapmem(v, t->right); + MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); + zaptreesubs(v, t->left); + zaptreesubs(v, t->right); } /* can't get here */ @@ -1355,7 +1299,7 @@ cbrdissect(struct vars * v, assert(n >= 0); assert((size_t) n < v->nmatch); - MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max)); + MDEBUG(("cbackref n%d %d{%d-%d}\n", t->id, n, min, max)); /* get the backreferenced string */ if (v->pmatch[n].rm_so == -1) @@ -1363,11 +1307,6 @@ cbrdissect(struct vars * v, brstring = v->start + v->pmatch[n].rm_so; brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so; - /* no room to maneuver -- retries are pointless */ - if (v->mem[t->retry]) - return REG_NOMATCH; - v->mem[t->retry] = 1; - /* special cases for zero-length strings */ if (brlen == 0) { @@ -1430,40 +1369,29 @@ caltdissect(struct vars * v, struct dfa *d; int er; -#define UNTRIED 0 /* not yet tried at all */ -#define TRYING 1 /* top matched, trying submatches */ -#define TRIED 2 /* top didn't match or submatches exhausted */ - if (t == NULL) return REG_NOMATCH; - assert(t->op == '|'); - if (v->mem[t->retry] == TRIED) - return caltdissect(v, t->right, begin, end); - MDEBUG(("calt n%d\n", t->retry)); + assert(t->op == '|'); assert(t->left != NULL); - if (v->mem[t->retry] == UNTRIED) + MDEBUG(("calt n%d\n", t->id)); + + d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + return v->err; + if (longest(v, d, begin, end, (int *) NULL) != end) { - d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); - if (ISERR()) - return v->err; - if (longest(v, d, begin, end, (int *) NULL) != end) - { - freedfa(d); - v->mem[t->retry] = TRIED; - return caltdissect(v, t->right, begin, end); - } freedfa(d); - MDEBUG(("calt matched\n")); - v->mem[t->retry] = TRYING; + return caltdissect(v, t->right, begin, end); } + freedfa(d); + MDEBUG(("calt matched\n")); er = cdissect(v, t->left, begin, end); if (er != REG_NOMATCH) return er; - v->mem[t->retry] = TRIED; return caltdissect(v, t->right, begin, end); } @@ -1531,7 +1459,7 @@ citerdissect(struct vars * v, FREE(endpts); return v->err; } - 
MDEBUG(("citer %d\n", t->retry)); + MDEBUG(("citer %d\n", t->id)); /* * Our strategy is to first find a set of sub-match endpoints that are @@ -1560,7 +1488,7 @@ citerdissect(struct vars * v, goto backtrack; } MDEBUG(("%d: working endpoint %d: %ld\n", - t->retry, k, LOFF(endpts[k]))); + t->id, k, LOFF(endpts[k]))); /* k'th sub-match can no longer be considered verified */ if (nverified >= k) @@ -1595,11 +1523,11 @@ citerdissect(struct vars * v, if (k < min_matches) goto backtrack; - MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k)); + MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k)); for (i = nverified + 1; i <= k; i++) { - zapmem(v, t->left); + zaptreesubs(v, t->left); er = cdissect(v, t->left, endpts[i - 1], endpts[i]); if (er == REG_OKAY) { @@ -1617,7 +1545,7 @@ citerdissect(struct vars * v, if (i > k) { /* satisfaction */ - MDEBUG(("%d successful\n", t->retry)); + MDEBUG(("%d successful\n", t->id)); freedfa(d); FREE(endpts); return REG_OKAY; @@ -1650,7 +1578,7 @@ citerdissect(struct vars * v, } /* all possibilities exhausted */ - MDEBUG(("%d failed\n", t->retry)); + MDEBUG(("%d failed\n", t->id)); freedfa(d); FREE(endpts); return REG_NOMATCH; @@ -1718,7 +1646,7 @@ creviterdissect(struct vars * v, FREE(endpts); return v->err; } - MDEBUG(("creviter %d\n", t->retry)); + MDEBUG(("creviter %d\n", t->id)); /* * Our strategy is to first find a set of sub-match endpoints that are @@ -1754,7 +1682,7 @@ creviterdissect(struct vars * v, goto backtrack; } MDEBUG(("%d: working endpoint %d: %ld\n", - t->retry, k, LOFF(endpts[k]))); + t->id, k, LOFF(endpts[k]))); /* k'th sub-match can no longer be considered verified */ if (nverified >= k) @@ -1784,11 +1712,11 @@ creviterdissect(struct vars * v, if (k < min_matches) goto backtrack; - MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k)); + MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k)); for (i = nverified + 1; i <= k; i++) { - zapmem(v, t->left); + zaptreesubs(v, t->left); er = cdissect(v, t->left, endpts[i - 1], endpts[i]); if (er == REG_OKAY) { @@ -1806,7 +1734,7 @@ creviterdissect(struct vars * v, if (i > k) { /* satisfaction */ - MDEBUG(("%d successful\n", t->retry)); + MDEBUG(("%d successful\n", t->id)); freedfa(d); FREE(endpts); return REG_OKAY; @@ -1832,7 +1760,7 @@ creviterdissect(struct vars * v, } /* all possibilities exhausted */ - MDEBUG(("%d failed\n", t->retry)); + MDEBUG(("%d failed\n", t->id)); freedfa(d); FREE(endpts); return REG_NOMATCH; diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index d420ea8316e18..bc5419d98e78b 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -409,7 +409,7 @@ struct subre #define PREF(f) ((f)&LOCAL) #define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2)) #define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) - short retry; /* index into retry memory */ + short id; /* ID of subre (1..ntree) */ int subno; /* subexpression number (for 'b' and '(') */ short min; /* min repetitions for iteration or backref */ short max; /* max repetitions for iteration or backref */ From 587359479acbbdc95c8e37da40707e37097423f5 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 24 Feb 2012 14:56:35 -0500 Subject: [PATCH 047/129] Avoid repeated creation/freeing of per-subre DFAs during regex search. In nested sub-regex trees, lower-level nodes created DFAs and then destroyed them again before exiting, which is a bit dumb considering that the recursive search is likely to call those nodes again later. 
Instead cache each created DFA until the end of pg_regexec(). This is basically a space for time tradeoff, in that it might increase the maximum memory usage. However, in most regex patterns there are not all that many subre nodes, so not that many DFAs --- and in any case, the peak usage occurs when reaching the bottom recursion level, and except for alternation cases that's going to be the same anyway. --- src/backend/regex/regexec.c | 158 +++++++++++++++--------------------- src/include/regex/regguts.h | 4 +- 2 files changed, 68 insertions(+), 94 deletions(-) diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index 55f0c18d14f6a..d3e850a86994c 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -112,6 +112,7 @@ struct vars chr *search_start; /* search start of string */ chr *stop; /* just past end of string */ int err; /* error code if any (0 none) */ + struct dfa **subdfas; /* per-subre DFAs */ struct smalldfa dfa1; struct smalldfa dfa2; }; @@ -130,6 +131,7 @@ struct vars * forward declarations */ /* === regexec.c === */ +static struct dfa *getsubdfa(struct vars *, struct subre *); static int find(struct vars *, struct cnfa *, struct colormap *); static int cfind(struct vars *, struct cnfa *, struct colormap *); static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **); @@ -180,11 +182,15 @@ pg_regexec(regex_t *re, register struct vars *v = &var; int st; size_t n; + size_t i; int backref; #define LOCALMAT 20 regmatch_t mat[LOCALMAT]; +#define LOCALDFAS 40 + struct dfa *subdfas[LOCALDFAS]; + /* sanity checks */ if (re == NULL || string == NULL || re->re_magic != REMAGIC) return REG_INVARG; @@ -225,6 +231,20 @@ pg_regexec(regex_t *re, v->search_start = (chr *) string + search_start; v->stop = (chr *) string + len; v->err = 0; + assert(v->g->ntree >= 0); + n = (size_t) v->g->ntree; + if (n <= LOCALDFAS) + v->subdfas = subdfas; + else + v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *)); + if (v->subdfas == NULL) + { + if (v->pmatch != pmatch && v->pmatch != mat) + FREE(v->pmatch); + return REG_ESPACE; + } + for (i = 0; i < n; i++) + v->subdfas[i] = NULL; /* do it */ assert(v->g->tree != NULL); @@ -244,9 +264,36 @@ pg_regexec(regex_t *re, /* clean up */ if (v->pmatch != pmatch && v->pmatch != mat) FREE(v->pmatch); + for (i = 0; i < n; i++) + { + if (v->subdfas[i] != NULL) + freedfa(v->subdfas[i]); + } + if (v->subdfas != subdfas) + FREE(v->subdfas); + return st; } +/* + * getsubdfa - create or re-fetch the DFA for a subre node + * + * We only need to create the DFA once per overall regex execution. + * The DFA will be freed by the cleanup step in pg_regexec(). 
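+ * On failure, the error code is left in v->err and NULL is returned; + * callers detect that via ISERR()/NOERR() rather than by testing the + * returned pointer.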
+ */ +static struct dfa * +getsubdfa(struct vars * v, + struct subre * t) +{ + if (v->subdfas[t->id] == NULL) + { + v->subdfas[t->id] = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + return NULL; + } + return v->subdfas[t->id]; +} + /* * find - find a match for the main NFA (no-complications case) */ @@ -578,15 +625,10 @@ condissect(struct vars * v, assert(t->left != NULL && t->left->cnfa.nstates > 0); assert(t->right != NULL && t->right->cnfa.nstates > 0); - d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); + d = getsubdfa(v, t->left); + NOERR(); + d2 = getsubdfa(v, t->right); NOERR(); - d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2); - if (ISERR()) - { - assert(d2 == NULL); - freedfa(d); - return v->err; - } /* pick a tentative midpoint */ if (shorter) @@ -595,11 +637,7 @@ condissect(struct vars * v, else mid = longest(v, d, begin, end, (int *) NULL); if (mid == NULL) - { - freedfa(d); - freedfa(d2); return REG_ASSERT; - } MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); /* iterate until satisfaction or failure */ @@ -610,8 +648,6 @@ condissect(struct vars * v, { /* all possibilities exhausted! */ MDEBUG(("no midpoint!\n")); - freedfa(d); - freedfa(d2); return REG_ASSERT; } if (shorter) @@ -623,8 +659,6 @@ condissect(struct vars * v, { /* failed to find a new one! */ MDEBUG(("failed midpoint!\n")); - freedfa(d); - freedfa(d2); return REG_ASSERT; } MDEBUG(("new midpoint %ld\n", LOFF(mid))); @@ -632,8 +666,6 @@ condissect(struct vars * v, /* satisfaction */ MDEBUG(("successful\n")); - freedfa(d); - freedfa(d2); i = dissect(v, t->left, begin, mid); if (i != REG_OKAY) return i; @@ -659,16 +691,13 @@ altdissect(struct vars * v, { MDEBUG(("trying %dth\n", i)); assert(t->left != NULL && t->left->cnfa.nstates > 0); - d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); - if (ISERR()) - return v->err; + d = getsubdfa(v, t->left); + NOERR(); if (longest(v, d, begin, end, (int *) NULL) == end) { MDEBUG(("success\n")); - freedfa(d); return dissect(v, t->left, begin, end); } - freedfa(d); } return REG_ASSERT; /* none of them matched?!? 
*/ } @@ -731,7 +760,7 @@ iterdissect(struct vars * v, return REG_ESPACE; endpts[0] = begin; - d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + d = getsubdfa(v, t->left); if (ISERR()) { FREE(endpts); @@ -814,7 +843,6 @@ iterdissect(struct vars * v, if (er == REG_NOMATCH) break; /* oops, something failed */ - freedfa(d); FREE(endpts); return er; } @@ -823,7 +851,6 @@ iterdissect(struct vars * v, { /* satisfaction */ MDEBUG(("%d successful\n", t->id)); - freedfa(d); FREE(endpts); return REG_OKAY; } @@ -856,7 +883,6 @@ iterdissect(struct vars * v, /* all possibilities exhausted - shouldn't happen in uncomplicated mode */ MDEBUG(("%d failed\n", t->id)); - freedfa(d); FREE(endpts); return REG_ASSERT; } @@ -917,7 +943,7 @@ reviterdissect(struct vars * v, return REG_ESPACE; endpts[0] = begin; - d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + d = getsubdfa(v, t->left); if (ISERR()) { FREE(endpts); @@ -1002,7 +1028,6 @@ reviterdissect(struct vars * v, if (er == REG_NOMATCH) break; /* oops, something failed */ - freedfa(d); FREE(endpts); return er; } @@ -1011,7 +1036,6 @@ reviterdissect(struct vars * v, { /* satisfaction */ MDEBUG(("%d successful\n", t->id)); - freedfa(d); FREE(endpts); return REG_OKAY; } @@ -1037,7 +1061,6 @@ reviterdissect(struct vars * v, /* all possibilities exhausted - shouldn't happen in uncomplicated mode */ MDEBUG(("%d failed\n", t->id)); - freedfa(d); FREE(endpts); return REG_ASSERT; } @@ -1106,25 +1129,16 @@ ccondissect(struct vars * v, if (t->left->flags & SHORTER) /* reverse scan */ return crevdissect(v, t, begin, end); - d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); - if (ISERR()) - return v->err; - d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); - if (ISERR()) - { - freedfa(d); - return v->err; - } + d = getsubdfa(v, t->left); + NOERR(); + d2 = getsubdfa(v, t->right); + NOERR(); MDEBUG(("cconcat %d\n", t->id)); /* pick a tentative midpoint */ mid = longest(v, d, begin, end, (int *) NULL); if (mid == NULL) - { - freedfa(d); - freedfa(d2); return REG_NOMATCH; - } MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); /* iterate until satisfaction or failure */ @@ -1141,17 +1155,11 @@ ccondissect(struct vars * v, { /* satisfaction */ MDEBUG(("successful\n")); - freedfa(d); - freedfa(d2); return REG_OKAY; } } if (er != REG_OKAY && er != REG_NOMATCH) - { - freedfa(d); - freedfa(d2); return er; - } } /* that midpoint didn't work, find a new one */ @@ -1159,8 +1167,6 @@ ccondissect(struct vars * v, { /* all possibilities exhausted */ MDEBUG(("%d no midpoint\n", t->id)); - freedfa(d); - freedfa(d2); return REG_NOMATCH; } mid = longest(v, d, begin, mid - 1, (int *) NULL); @@ -1168,8 +1174,6 @@ ccondissect(struct vars * v, { /* failed to find a new one */ MDEBUG(("%d failed midpoint\n", t->id)); - freedfa(d); - freedfa(d2); return REG_NOMATCH; } MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); @@ -1201,25 +1205,16 @@ crevdissect(struct vars * v, assert(t->left->flags & SHORTER); /* concatenation -- need to split the substring between parts */ - d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); - if (ISERR()) - return v->err; - d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); - if (ISERR()) - { - freedfa(d); - return v->err; - } + d = getsubdfa(v, t->left); + NOERR(); + d2 = getsubdfa(v, t->right); + NOERR(); MDEBUG(("crev %d\n", t->id)); /* pick a tentative midpoint */ mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL); if (mid == NULL) - { - freedfa(d); - freedfa(d2); return REG_NOMATCH; - } MDEBUG(("tentative midpoint 
%ld\n", LOFF(mid))); /* iterate until satisfaction or failure */ @@ -1236,17 +1231,11 @@ crevdissect(struct vars * v, { /* satisfaction */ MDEBUG(("successful\n")); - freedfa(d); - freedfa(d2); return REG_OKAY; } } if (er != REG_OKAY && er != REG_NOMATCH) - { - freedfa(d); - freedfa(d2); return er; - } } /* that midpoint didn't work, find a new one */ @@ -1254,8 +1243,6 @@ crevdissect(struct vars * v, { /* all possibilities exhausted */ MDEBUG(("%d no midpoint\n", t->id)); - freedfa(d); - freedfa(d2); return REG_NOMATCH; } mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL); @@ -1263,8 +1250,6 @@ crevdissect(struct vars * v, { /* failed to find a new one */ MDEBUG(("%d failed midpoint\n", t->id)); - freedfa(d); - freedfa(d2); return REG_NOMATCH; } MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); @@ -1377,15 +1362,10 @@ caltdissect(struct vars * v, MDEBUG(("calt n%d\n", t->id)); - d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); - if (ISERR()) - return v->err; + d = getsubdfa(v, t->left); + NOERR(); if (longest(v, d, begin, end, (int *) NULL) != end) - { - freedfa(d); return caltdissect(v, t->right, begin, end); - } - freedfa(d); MDEBUG(("calt matched\n")); er = cdissect(v, t->left, begin, end); @@ -1453,7 +1433,7 @@ citerdissect(struct vars * v, return REG_ESPACE; endpts[0] = begin; - d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + d = getsubdfa(v, t->left); if (ISERR()) { FREE(endpts); @@ -1537,7 +1517,6 @@ citerdissect(struct vars * v, if (er == REG_NOMATCH) break; /* oops, something failed */ - freedfa(d); FREE(endpts); return er; } @@ -1546,7 +1525,6 @@ citerdissect(struct vars * v, { /* satisfaction */ MDEBUG(("%d successful\n", t->id)); - freedfa(d); FREE(endpts); return REG_OKAY; } @@ -1579,7 +1557,6 @@ citerdissect(struct vars * v, /* all possibilities exhausted */ MDEBUG(("%d failed\n", t->id)); - freedfa(d); FREE(endpts); return REG_NOMATCH; } @@ -1640,7 +1617,7 @@ creviterdissect(struct vars * v, return REG_ESPACE; endpts[0] = begin; - d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + d = getsubdfa(v, t->left); if (ISERR()) { FREE(endpts); @@ -1726,7 +1703,6 @@ creviterdissect(struct vars * v, if (er == REG_NOMATCH) break; /* oops, something failed */ - freedfa(d); FREE(endpts); return er; } @@ -1735,7 +1711,6 @@ creviterdissect(struct vars * v, { /* satisfaction */ MDEBUG(("%d successful\n", t->id)); - freedfa(d); FREE(endpts); return REG_OKAY; } @@ -1761,7 +1736,6 @@ creviterdissect(struct vars * v, /* all possibilities exhausted */ MDEBUG(("%d failed\n", t->id)); - freedfa(d); FREE(endpts); return REG_NOMATCH; } diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index bc5419d98e78b..65b8d178da86d 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -409,7 +409,7 @@ struct subre #define PREF(f) ((f)&LOCAL) #define PREF2(f1, f2) ((PREF(f1) != 0) ? 
PREF(f1) : PREF(f2)) #define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) - short id; /* ID of subre (1..ntree) */ + short id; /* ID of subre (1..ntree-1) */ int subno; /* subexpression number (for 'b' and '(') */ short min; /* min repetitions for iteration or backref */ short max; /* max repetitions for iteration or backref */ @@ -446,7 +446,7 @@ struct guts size_t nsub; /* copy of re_nsub */ struct subre *tree; struct cnfa search; /* for fast preliminary search */ - int ntree; + int ntree; /* number of subre's, less one */ struct colormap cmap; int FUNCPTR(compare, (const chr *, const chr *, size_t)); struct subre *lacons; /* lookahead-constraint vector */ From 4dd78bf37aa29d04b3f358b08c4a2fa43cf828e7 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 24 Feb 2012 16:26:10 -0500 Subject: [PATCH 048/129] Merge dissect() into cdissect() to remove a pile of near-duplicate code. The "uncomplicated" case isn't materially less complicated than the full case, certainly not enough so to justify duplicating nearly 500 lines of code. The only extra work being done in the full path is zaptreesubs, which is very cheap compared to everything else being done here, and besides that I'm less than convinced that it's not needed in some cases even without backrefs. --- src/backend/regex/regexec.c | 626 +++++------------------------------- 1 file changed, 77 insertions(+), 549 deletions(-) diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index d3e850a86994c..c9c73e978b65e 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -138,14 +138,9 @@ static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa static void zapallsubs(regmatch_t *, size_t); static void zaptreesubs(struct vars *, struct subre *); static void subset(struct vars *, struct subre *, chr *, chr *); -static int dissect(struct vars *, struct subre *, chr *, chr *); -static int condissect(struct vars *, struct subre *, chr *, chr *); -static int altdissect(struct vars *, struct subre *, chr *, chr *); -static int iterdissect(struct vars *, struct subre *, chr *, chr *); -static int reviterdissect(struct vars *, struct subre *, chr *, chr *); static int cdissect(struct vars *, struct subre *, chr *, chr *); static int ccondissect(struct vars *, struct subre *, chr *, chr *); -static int crevdissect(struct vars *, struct subre *, chr *, chr *); +static int crevcondissect(struct vars *, struct subre *, chr *, chr *); static int cbrdissect(struct vars *, struct subre *, chr *, chr *); static int caltdissect(struct vars *, struct subre *, chr *, chr *); static int citerdissect(struct vars *, struct subre *, chr *, chr *); @@ -376,9 +371,9 @@ find(struct vars * v, if (v->nmatch == 1) /* no need for submatches */ return REG_OKAY; - /* submatches */ + /* find submatches */ zapallsubs(v->pmatch, v->nmatch); - return dissect(v, v->g->tree, begin, end); + return cdissect(v, v->g->tree, begin, end); } /* @@ -568,505 +563,19 @@ subset(struct vars * v, } /* - * dissect - determine subexpression matches (uncomplicated case) - */ -static int /* regexec return code */ -dissect(struct vars * v, - struct subre * t, - chr *begin, /* beginning of relevant substring */ - chr *end) /* end of same */ -{ - assert(t != NULL); - MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end))); - - switch (t->op) - { - case '=': /* terminal node */ - assert(t->left == NULL && t->right == NULL); - return REG_OKAY; /* no action, parent did the work */ - case 'b': /* back ref -- shouldn't be calling us! 
*/ - return REG_ASSERT; - case '.': /* concatenation */ - assert(t->left != NULL && t->right != NULL); - return condissect(v, t, begin, end); - case '|': /* alternation */ - assert(t->left != NULL); - return altdissect(v, t, begin, end); - case '*': /* iteration */ - assert(t->left != NULL); - return iterdissect(v, t, begin, end); - case '(': /* capturing */ - assert(t->left != NULL && t->right == NULL); - assert(t->subno > 0); - subset(v, t, begin, end); - return dissect(v, t->left, begin, end); - default: - return REG_ASSERT; - } -} - -/* - * condissect - determine concatenation subexpression matches (uncomplicated) - */ -static int /* regexec return code */ -condissect(struct vars * v, - struct subre * t, - chr *begin, /* beginning of relevant substring */ - chr *end) /* end of same */ -{ - struct dfa *d; - struct dfa *d2; - chr *mid; - int i; - int shorter = (t->left->flags & SHORTER) ? 1 : 0; - chr *stop = (shorter) ? end : begin; - - assert(t->op == '.'); - assert(t->left != NULL && t->left->cnfa.nstates > 0); - assert(t->right != NULL && t->right->cnfa.nstates > 0); - - d = getsubdfa(v, t->left); - NOERR(); - d2 = getsubdfa(v, t->right); - NOERR(); - - /* pick a tentative midpoint */ - if (shorter) - mid = shortest(v, d, begin, begin, end, (chr **) NULL, - (int *) NULL); - else - mid = longest(v, d, begin, end, (int *) NULL); - if (mid == NULL) - return REG_ASSERT; - MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); - - /* iterate until satisfaction or failure */ - while (longest(v, d2, mid, end, (int *) NULL) != end) - { - /* that midpoint didn't work, find a new one */ - if (mid == stop) - { - /* all possibilities exhausted! */ - MDEBUG(("no midpoint!\n")); - return REG_ASSERT; - } - if (shorter) - mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, - (int *) NULL); - else - mid = longest(v, d, begin, mid - 1, (int *) NULL); - if (mid == NULL) - { - /* failed to find a new one! */ - MDEBUG(("failed midpoint!\n")); - return REG_ASSERT; - } - MDEBUG(("new midpoint %ld\n", LOFF(mid))); - } - - /* satisfaction */ - MDEBUG(("successful\n")); - i = dissect(v, t->left, begin, mid); - if (i != REG_OKAY) - return i; - return dissect(v, t->right, mid, end); -} - -/* - * altdissect - determine alternative subexpression matches (uncomplicated) - */ -static int /* regexec return code */ -altdissect(struct vars * v, - struct subre * t, - chr *begin, /* beginning of relevant substring */ - chr *end) /* end of same */ -{ - struct dfa *d; - int i; - - assert(t != NULL); - assert(t->op == '|'); - - for (i = 0; t != NULL; t = t->right, i++) - { - MDEBUG(("trying %dth\n", i)); - assert(t->left != NULL && t->left->cnfa.nstates > 0); - d = getsubdfa(v, t->left); - NOERR(); - if (longest(v, d, begin, end, (int *) NULL) == end) - { - MDEBUG(("success\n")); - return dissect(v, t->left, begin, end); - } - } - return REG_ASSERT; /* none of them matched?!? 
*/ -} - -/* - * iterdissect - iteration subexpression matches (uncomplicated) - */ -static int /* regexec return code */ -iterdissect(struct vars * v, - struct subre * t, - chr *begin, /* beginning of relevant substring */ - chr *end) /* end of same */ -{ - struct dfa *d; - chr **endpts; - chr *limit; - int min_matches; - size_t max_matches; - int nverified; - int k; - int i; - int er; - - assert(t->op == '*'); - assert(t->left != NULL && t->left->cnfa.nstates > 0); - assert(begin <= end); - - if (t->left->flags & SHORTER) /* reverse scan */ - return reviterdissect(v, t, begin, end); - - /* - * If zero matches are allowed, and target string is empty, just declare - * victory. OTOH, if target string isn't empty, zero matches can't work - * so we pretend the min is 1. - */ - min_matches = t->min; - if (min_matches <= 0) - { - if (begin == end) - return REG_OKAY; - min_matches = 1; - } - - /* - * We need workspace to track the endpoints of each sub-match. Normally - * we consider only nonzero-length sub-matches, so there can be at most - * end-begin of them. However, if min is larger than that, we will also - * consider zero-length sub-matches in order to find enough matches. - * - * For convenience, endpts[0] contains the "begin" pointer and we store - * sub-match endpoints in endpts[1..max_matches]. - */ - max_matches = end - begin; - if (max_matches > t->max && t->max != INFINITY) - max_matches = t->max; - if (max_matches < min_matches) - max_matches = min_matches; - endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); - if (endpts == NULL) - return REG_ESPACE; - endpts[0] = begin; - - d = getsubdfa(v, t->left); - if (ISERR()) - { - FREE(endpts); - return v->err; - } - MDEBUG(("iter %d\n", t->id)); - - /* - * Our strategy is to first find a set of sub-match endpoints that are - * valid according to the child node's DFA, and then recursively dissect - * each sub-match to confirm validity. If any validity check fails, - * backtrack the last sub-match and try again. And, when we next try for - * a validity check, we need not recheck any successfully verified - * sub-matches that we didn't move the endpoints of. nverified remembers - * how many sub-matches are currently known okay. - */ - - /* initialize to consider first sub-match */ - nverified = 0; - k = 1; - limit = end; - - /* iterate until satisfaction or failure */ - while (k > 0) - { - /* try to find an endpoint for the k'th sub-match */ - endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL); - if (endpts[k] == NULL) - { - /* no match possible, so see if we can shorten previous one */ - k--; - goto backtrack; - } - MDEBUG(("%d: working endpoint %d: %ld\n", - t->id, k, LOFF(endpts[k]))); - - /* k'th sub-match can no longer be considered verified */ - if (nverified >= k) - nverified = k - 1; - - if (endpts[k] != end) - { - /* haven't reached end yet, try another iteration if allowed */ - if (k >= max_matches) - { - /* must try to shorten some previous match */ - k--; - goto backtrack; - } - - /* reject zero-length match unless necessary to achieve min */ - if (endpts[k] == endpts[k - 1] && - (k >= min_matches || min_matches - k < end - endpts[k])) - goto backtrack; - - k++; - limit = end; - continue; - } - - /* - * We've identified a way to divide the string into k sub-matches - * that works so far as the child DFA can tell. If k is an allowed - * number of matches, start the slow part: recurse to verify each - * sub-match. We always have k <= max_matches, needn't check that. 
- */ - if (k < min_matches) - goto backtrack; - - MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k)); - - for (i = nverified + 1; i <= k; i++) - { - er = dissect(v, t->left, endpts[i - 1], endpts[i]); - if (er == REG_OKAY) - { - nverified = i; - continue; - } - if (er == REG_NOMATCH) - break; - /* oops, something failed */ - FREE(endpts); - return er; - } - - if (i > k) - { - /* satisfaction */ - MDEBUG(("%d successful\n", t->id)); - FREE(endpts); - return REG_OKAY; - } - - /* match failed to verify, so backtrack */ - -backtrack: - /* - * Must consider shorter versions of the current sub-match. However, - * we'll only ask for a zero-length match if necessary. - */ - while (k > 0) - { - chr *prev_end = endpts[k - 1]; - - if (endpts[k] > prev_end) - { - limit = endpts[k] - 1; - if (limit > prev_end || - (k < min_matches && min_matches - k >= end - prev_end)) - { - /* break out of backtrack loop, continue the outer one */ - break; - } - } - /* can't shorten k'th sub-match any more, consider previous one */ - k--; - } - } - - /* all possibilities exhausted - shouldn't happen in uncomplicated mode */ - MDEBUG(("%d failed\n", t->id)); - FREE(endpts); - return REG_ASSERT; -} - -/* - * reviterdissect - shortest-first iteration subexpression matches - */ -static int /* regexec return code */ -reviterdissect(struct vars * v, - struct subre * t, - chr *begin, /* beginning of relevant substring */ - chr *end) /* end of same */ -{ - struct dfa *d; - chr **endpts; - chr *limit; - int min_matches; - size_t max_matches; - int nverified; - int k; - int i; - int er; - - assert(t->op == '*'); - assert(t->left != NULL && t->left->cnfa.nstates > 0); - assert(t->left->flags & SHORTER); - assert(begin <= end); - - /* - * If zero matches are allowed, and target string is empty, just declare - * victory. OTOH, if target string isn't empty, zero matches can't work - * so we pretend the min is 1. - */ - min_matches = t->min; - if (min_matches <= 0) - { - if (begin == end) - return REG_OKAY; - min_matches = 1; - } - - /* - * We need workspace to track the endpoints of each sub-match. Normally - * we consider only nonzero-length sub-matches, so there can be at most - * end-begin of them. However, if min is larger than that, we will also - * consider zero-length sub-matches in order to find enough matches. - * - * For convenience, endpts[0] contains the "begin" pointer and we store - * sub-match endpoints in endpts[1..max_matches]. - */ - max_matches = end - begin; - if (max_matches > t->max && t->max != INFINITY) - max_matches = t->max; - if (max_matches < min_matches) - max_matches = min_matches; - endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); - if (endpts == NULL) - return REG_ESPACE; - endpts[0] = begin; - - d = getsubdfa(v, t->left); - if (ISERR()) - { - FREE(endpts); - return v->err; - } - MDEBUG(("reviter %d\n", t->id)); - - /* - * Our strategy is to first find a set of sub-match endpoints that are - * valid according to the child node's DFA, and then recursively dissect - * each sub-match to confirm validity. If any validity check fails, - * backtrack the last sub-match and try again. And, when we next try for - * a validity check, we need not recheck any successfully verified - * sub-matches that we didn't move the endpoints of. nverified remembers - * how many sub-matches are currently known okay. 
- */ - - /* initialize to consider first sub-match */ - nverified = 0; - k = 1; - limit = begin; - - /* iterate until satisfaction or failure */ - while (k > 0) - { - /* disallow zero-length match unless necessary to achieve min */ - if (limit == endpts[k - 1] && - limit != end && - (k >= min_matches || min_matches - k < end - limit)) - limit++; - - /* try to find an endpoint for the k'th sub-match */ - endpts[k] = shortest(v, d, endpts[k - 1], limit, end, - (chr **) NULL, (int *) NULL); - if (endpts[k] == NULL) - { - /* no match possible, so see if we can lengthen previous one */ - k--; - goto backtrack; - } - MDEBUG(("%d: working endpoint %d: %ld\n", - t->id, k, LOFF(endpts[k]))); - - /* k'th sub-match can no longer be considered verified */ - if (nverified >= k) - nverified = k - 1; - - if (endpts[k] != end) - { - /* haven't reached end yet, try another iteration if allowed */ - if (k >= max_matches) - { - /* must try to lengthen some previous match */ - k--; - goto backtrack; - } - - k++; - limit = endpts[k - 1]; - continue; - } - - /* - * We've identified a way to divide the string into k sub-matches - * that works so far as the child DFA can tell. If k is an allowed - * number of matches, start the slow part: recurse to verify each - * sub-match. We always have k <= max_matches, needn't check that. - */ - if (k < min_matches) - goto backtrack; - - MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k)); - - for (i = nverified + 1; i <= k; i++) - { - er = dissect(v, t->left, endpts[i - 1], endpts[i]); - if (er == REG_OKAY) - { - nverified = i; - continue; - } - if (er == REG_NOMATCH) - break; - /* oops, something failed */ - FREE(endpts); - return er; - } - - if (i > k) - { - /* satisfaction */ - MDEBUG(("%d successful\n", t->id)); - FREE(endpts); - return REG_OKAY; - } - - /* match failed to verify, so backtrack */ - -backtrack: - /* - * Must consider longer versions of the current sub-match. - */ - while (k > 0) - { - if (endpts[k] < end) - { - limit = endpts[k] + 1; - /* break out of backtrack loop, continue the outer one */ - break; - } - /* can't lengthen k'th sub-match any more, consider previous one */ - k--; - } - } - - /* all possibilities exhausted - shouldn't happen in uncomplicated mode */ - MDEBUG(("%d failed\n", t->id)); - FREE(endpts); - return REG_ASSERT; -} - -/* - * cdissect - determine subexpression matches (with complications) + * cdissect - check backrefs and determine subexpression matches + * + * cdissect recursively processes a subre tree to check matching of backrefs + * and/or identify submatch boundaries for capture nodes. The proposed match + * runs from "begin" to "end" (not including "end"), and we are basically + * "dissecting" it to see where the submatches are. + * + * Before calling any level of cdissect, the caller must have run the node's + * DFA and found that the proposed substring satisfies the DFA. (We make + * the caller do that because in concatenation and iteration nodes, it's + * much faster to check all the substrings against the child DFAs before we + * recurse.) Also, caller must have cleared subexpression match data via + * zaptreesubs (or zapallsubs at the top level). 
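+ * + * The result is REG_OKAY, REG_NOMATCH, or an error code such as + * REG_ESPACE.  Per the assertion at the bottom of the function, a + * REG_NOMATCH result should only be possible when backrefs lurk + * somewhere in the subtree.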
*/ static int /* regexec return code */ cdissect(struct vars * v, @@ -1083,33 +592,54 @@ cdissect(struct vars * v, { case '=': /* terminal node */ assert(t->left == NULL && t->right == NULL); - return REG_OKAY; /* no action, parent did the work */ + er = REG_OKAY; /* no action, parent did the work */ + break; case 'b': /* back reference */ assert(t->left == NULL && t->right == NULL); - return cbrdissect(v, t, begin, end); + er = cbrdissect(v, t, begin, end); + break; case '.': /* concatenation */ assert(t->left != NULL && t->right != NULL); - return ccondissect(v, t, begin, end); + if (t->left->flags & SHORTER) /* reverse scan */ + er = crevcondissect(v, t, begin, end); + else + er = ccondissect(v, t, begin, end); + break; case '|': /* alternation */ assert(t->left != NULL); - return caltdissect(v, t, begin, end); + er = caltdissect(v, t, begin, end); + break; case '*': /* iteration */ assert(t->left != NULL); - return citerdissect(v, t, begin, end); + if (t->left->flags & SHORTER) /* reverse scan */ + er = creviterdissect(v, t, begin, end); + else + er = citerdissect(v, t, begin, end); + break; case '(': /* capturing */ assert(t->left != NULL && t->right == NULL); assert(t->subno > 0); er = cdissect(v, t->left, begin, end); if (er == REG_OKAY) subset(v, t, begin, end); - return er; + break; default: - return REG_ASSERT; + er = REG_ASSERT; + break; } + + /* + * We should never have a match failure unless backrefs lurk below; + * otherwise, either caller failed to check the DFA, or there's some + * inconsistency between the DFA and the node's innards. + */ + assert(er != REG_NOMATCH || (t->flags & BACKR)); + + return er; } /* - * ccondissect - concatenation subexpression matches (with complications) + * ccondissect - dissect match for concatenation node */ static int /* regexec return code */ ccondissect(struct vars * v, @@ -1125,9 +655,7 @@ ccondissect(struct vars * v, assert(t->op == '.'); assert(t->left != NULL && t->left->cnfa.nstates > 0); assert(t->right != NULL && t->right->cnfa.nstates > 0); - - if (t->left->flags & SHORTER) /* reverse scan */ - return crevdissect(v, t, begin, end); + assert(!(t->left->flags & SHORTER)); d = getsubdfa(v, t->left); NOERR(); @@ -1158,7 +686,7 @@ ccondissect(struct vars * v, return REG_OKAY; } } - if (er != REG_OKAY && er != REG_NOMATCH) + if (er != REG_NOMATCH) return er; } @@ -1186,13 +714,13 @@ ccondissect(struct vars * v, } /* - * crevdissect - shortest-first concatenation subexpression matches + * crevcondissect - dissect match for concatenation node, shortest-first */ static int /* regexec return code */ -crevdissect(struct vars * v, - struct subre * t, - chr *begin, /* beginning of relevant substring */ - chr *end) /* end of same */ +crevcondissect(struct vars * v, + struct subre * t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ { struct dfa *d; struct dfa *d2; @@ -1204,12 +732,11 @@ crevdissect(struct vars * v, assert(t->right != NULL && t->right->cnfa.nstates > 0); assert(t->left->flags & SHORTER); - /* concatenation -- need to split the substring between parts */ d = getsubdfa(v, t->left); NOERR(); d2 = getsubdfa(v, t->right); NOERR(); - MDEBUG(("crev %d\n", t->id)); + MDEBUG(("crevcon %d\n", t->id)); /* pick a tentative midpoint */ mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL); @@ -1234,7 +761,7 @@ crevdissect(struct vars * v, return REG_OKAY; } } - if (er != REG_OKAY && er != REG_NOMATCH) + if (er != REG_NOMATCH) return er; } @@ -1262,7 +789,7 @@ crevdissect(struct vars * v, } /* - 
* cbrdissect - determine backref subexpression matches + * cbrdissect - dissect match for backref node */ static int /* regexec return code */ cbrdissect(struct vars * v, @@ -1343,7 +870,7 @@ cbrdissect(struct vars * v, } /* - * caltdissect - determine alternative subexpression matches (w. complications) + * caltdissect - dissect match for alternation node */ static int /* regexec return code */ caltdissect(struct vars * v, @@ -1354,29 +881,32 @@ caltdissect(struct vars * v, struct dfa *d; int er; - if (t == NULL) - return REG_NOMATCH; - - assert(t->op == '|'); - assert(t->left != NULL); + /* We loop, rather than tail-recurse, to handle a chain of alternatives */ + while (t != NULL) + { + assert(t->op == '|'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); - MDEBUG(("calt n%d\n", t->id)); + MDEBUG(("calt n%d\n", t->id)); - d = getsubdfa(v, t->left); - NOERR(); - if (longest(v, d, begin, end, (int *) NULL) != end) - return caltdissect(v, t->right, begin, end); - MDEBUG(("calt matched\n")); + d = getsubdfa(v, t->left); + NOERR(); + if (longest(v, d, begin, end, (int *) NULL) == end) + { + MDEBUG(("calt matched\n")); + er = cdissect(v, t->left, begin, end); + if (er != REG_NOMATCH) + return er; + } - er = cdissect(v, t->left, begin, end); - if (er != REG_NOMATCH) - return er; + t = t->right; + } - return caltdissect(v, t->right, begin, end); + return REG_NOMATCH; } /* - * citerdissect - iteration subexpression matches (with complications) + * citerdissect - dissect match for iteration node */ static int /* regexec return code */ citerdissect(struct vars * v, @@ -1396,11 +926,9 @@ citerdissect(struct vars * v, assert(t->op == '*'); assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(!(t->left->flags & SHORTER)); assert(begin <= end); - if (t->left->flags & SHORTER) /* reverse scan */ - return creviterdissect(v, t, begin, end); - /* * If zero matches are allowed, and target string is empty, just declare * victory. OTOH, if target string isn't empty, zero matches can't work @@ -1562,7 +1090,7 @@ citerdissect(struct vars * v, } /* - * creviterdissect - shortest-first iteration subexpression matches + * creviterdissect - dissect match for iteration node, shortest-first */ static int /* regexec return code */ creviterdissect(struct vars * v, From da9ed7dafdc49e6571d056d92f2ef67858d68946 Mon Sep 17 00:00:00 2001 From: Magnus Hagander Date: Sat, 25 Feb 2012 15:20:49 +0100 Subject: [PATCH 049/129] Make each pg_stat_ view into it's own table in the documentation This makes it easier to match a column name with the description of it, and makes it possible to add more detailed documentation in the future. This patch does not add that extra documentation at this point, only the structure required for it. Modeled on the changes already done to pg_stat_activity. --- doc/src/sgml/monitoring.sgml | 941 ++++++++++++++++++++++++++++++++--- 1 file changed, 859 insertions(+), 82 deletions(-) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index cb13c8e8353bb..ffad77f04f987 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -261,83 +261,44 @@ postgres: user database host pg_stat_bgwriterpg_stat_bgwriter - One row only, showing cluster-wide statistics from the - background writer: number of scheduled checkpoints, requested - checkpoints, buffers written by checkpoints and cleaning scans, - and the number of times the background writer stopped a cleaning scan - because it had written too many buffers. 
Also includes - statistics about the shared buffer pool, including buffers written - by backends (that is, not by the background writer), how many times - those backends had to execute their own fsync calls (normally the - background writer handles those even when the backend does its own - write), total buffers allocated, and time of last statistics reset. + One row only, showing cluster-wide statistics. See + for more details. pg_stat_databasepg_stat_database - One row per database, showing database OID, database name, - number of active server processes connected to that database, - number of transactions committed and rolled back in that database, - total disk blocks read, total buffer hits (i.e., block - read requests avoided by finding the block already in buffer cache), - number of rows returned, fetched, inserted, updated and deleted, the - total number of queries canceled due to conflict with recovery (on - standby servers), number and size of temporary files used, total - number of deadlocks detected, and time of last statistics reset. - + One row per database, showing database wide statistics. See + for more details. + pg_stat_database_conflictspg_stat_database_conflicts - One row per database, showing database OID, database name and - the number of queries that have been canceled in this database due to - dropped tablespaces, lock timeouts, old snapshots, pinned buffers and - deadlocks. Will only contain information on standby servers, since - conflicts do not occur on master servers. + + One row per database showing database wide statistics about + query cancels due to conflict with recovery on standby servers. + Will only contain information on standby servers, since + conflicts do not occur on master servers. + See for more details. pg_stat_replicationpg_stat_replication - One row per WAL sender process, showing process ID, - user OID, user name, application name, client's address, host name - (if available) and port number, time at which the server process began - execution, and the current WAL sender state and transaction log - location. In addition, the standby reports the last transaction log - position it received and wrote, the last position it flushed to disk, - and the last position it replayed, and this information is also - displayed here. If the standby's application names matches one of the - settings in synchronous_standby_names then the sync_priority - is shown here also, that is the order in which standbys will become - the synchronous standby. The columns detailing what exactly the connection - is doing are only visible if the user examining the view is a superuser. - The client's host name will be available only if - is set or if the user's host name - needed to be looked up during pg_hba.conf - processing. Only directly connected standbys are listed; no information - about downstream standby servers is recorded. + One row per WAL sender process, showing statistics about the + replication to this slave. See + for more details. Only directly connected standbys are listed; no + information about downstream standby servers is recorded. 
pg_stat_all_tablespg_stat_all_tables - For each table in the current database (including TOAST tables), - the table OID, schema and table name, number of sequential - scans initiated, number of live rows fetched by sequential - scans, number of index scans initiated (over all indexes - belonging to the table), number of live rows fetched by index - scans, numbers of row insertions, updates, and deletions, - number of row updates that were HOT (i.e., no separate index update), - numbers of live and dead rows, - the last time the table was non- @@ -376,11 +337,10 @@ postgres: user database host pg_stat_all_indexespg_stat_all_indexes - For each index in the current database, - the table and index OID, schema, table and index name, - number of index scans initiated on that index, number of - index entries returned by index scans, and number of live table rows - fetched by simple index scans using that index. + + One row for each index in the current database with information + about accesses to this specific index. + See for more details. @@ -398,13 +358,10 @@ postgres: user database host pg_statio_all_tablespg_statio_all_tables - For each table in the current database (including TOAST tables), - the table OID, schema and table name, number of disk - blocks read from that table, number of buffer hits, numbers of - disk blocks read and buffer hits in all indexes of that table, - numbers of disk blocks read and buffer hits from that table's - auxiliary TOAST table (if any), and numbers of disk blocks read - and buffer hits for the TOAST table's index. + + One row for each table in the current database (including TOAST + tables) with information about I/O on this specific table. + See for more details. @@ -422,9 +379,10 @@ postgres: user database host pg_statio_all_indexespg_statio_all_indexes - For each index in the current database, - the table and index OID, schema, table and index name, - numbers of disk blocks read and buffer hits in that index. + + One row for each index in the current database + with information about I/O on this specific index. + See for more details. @@ -442,10 +400,11 @@ postgres: user database host pg_statio_all_sequencespg_statio_all_sequences - For each sequence object in the current database, - the sequence OID, schema and sequence name, - numbers of disk blocks read and buffer hits in that sequence. - + + One row for each sequence in the current database + with information about I/O on this specific sequence. + See for more details. + @@ -463,11 +422,11 @@ postgres: user database host pg_stat_user_functionspg_stat_user_functions - For all tracked functions, function OID, schema, name, number - of calls, total time, and self time. Self time is the - amount of time spent in the function itself, total time includes the - time spent in functions it called. Time values are in milliseconds. - + + One row for each tracked function (as specified by the + parameter). See + for more details. + @@ -723,6 +682,824 @@ postgres: user database host
+ + The pg_stat_activity view will have one row + per server process, showing information related to each connection to + the server. + + + + pg_stat_bgwriter view + + + + + Column + Type + Description + + + + + + checkpoints_timed + bigint + Number of scheduled checkpoints + + + checkpoints_requested + bigint + Number of requested checkpoints + + + buffers_checkpoint + bigint + Number of buffers written during checkpoints + + + buffers_clean + bigint + Number of buffers written by the background writer + + + maxwritten_clean + bigint + Number of times the background writer stopped a cleaning + scan because it had written too many buffers + + + buffers_backend + bigint + Number of buffers written directly by a backend + + + buffers_backend_fsync + bigint + Number of times a backend had to execute its own fsync + call (normally the background writer handles those even when the + backend does its own write) + + + buffers_alloc + bigint + Number of buffers allocated + + + stats_reset + bigint + The last time these statistics were reset + + + +
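As a quick sketch (using the column names exactly as listed in the table above, which may differ from the names in a shipped release), the share of buffer writes performed by checkpoints versus the background writer and backends can be computed directly from the view:

-- pg_stat_bgwriter always has exactly one row
SELECT checkpoints_timed,
       checkpoints_requested,
       round(100.0 * buffers_checkpoint
             / nullif(buffers_checkpoint + buffers_clean + buffers_backend, 0),
             1) AS pct_written_by_checkpoints
  FROM pg_stat_bgwriter;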
+ + + The pg_stat_bgwriter view will always have a + single row with global data for the cluster. + + + + pg_stat_database view + + + + Column + Type + Description + + + + + + datid + oid + The oid of the database + + + datname + name + The name of the database + + + numbackends + integer + The number of backends currently connected to this database. + This is the only column in this view that returns a value for the + current state, all other columns return the accumulated values since + the last reset. + + + xact_commit + bigint + The number of transactions in this database that has committed + + + xact_rollback + bigint + The number of transactions in this database that has rolled back + + + blks_read + bigint + The number of disk blocks read in this database + + + blks_hits + bigint + The number of disk blocks read from the buffer cache + (this only includes hits in the PostgreSQL buffer cache, and not + the operating system filesystem cache) + + + tup_returned + bigint + The number of rows returned by queries in this database + + + tup_fetched + bigint + The number of rows fetched by queries in this database + + + tup_inserted + bigint + The number of rows inserted by queries in this database + + + tup_updated + bigint + The number of rows updated by queries in this database + + + tup_deleted + bigint + The number of rows deleted by queries in this database + + + conflicts + bigint + + The number of queries canceled due to conflict with recovery + (on standby servers) in this database. (See + for more details) + + + + temp_files + bigint + + The number of temporary files written by queries in the database. + All temporary files are counted, regardless of why the temporary file + was created (sorting or hash) or file size, and regardless of the + setting. + + + + temp_bytes + bigint + + The amount of data written to temporary files by queries in + the database. All temporary files are counted, regardless of why + the temporary file was created (sorting or hash) or file size, and + regardless of the setting. + + + + deadlocks + bigint + Number of deadlocks detected in the database + + + stats_reset + timestamptz + The last time the statistics were reset + + + +
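For example, a rough health check of commit-versus-rollback activity per database — a minimal sketch using only columns documented above:

SELECT datname, numbackends, xact_commit, xact_rollback,
       round(100.0 * xact_rollback
             / nullif(xact_commit + xact_rollback, 0), 1) AS rollback_pct
  FROM pg_stat_database
 ORDER BY datname;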
+ + + The pg_stat_database view will contain one row + for each database in the cluster showing database wide statistics. + + + + pg_stat_database_conflicts view + + + + Column + Type + Description + + + + + + datid + oid + The oid of the database + + + datname + name + The name of the database + + + confl_tablespace + bigint + The number of queries that have been canceled due to + dropped tablespaces + + + confl_lock + bigint + The number of queries that have been canceled due to + lock timeouts + + + confl_snapshot + bigint + The number of queries that have been canceled due to + old snapshots + + + confl_bufferpin + bigint + The number of queries that have been canceled due to + pinned buffers + + + confl_deadlock + bigint + The number of queries that have been canceled due to + deadlocks + + + +
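On a standby server, the per-cause counters can be summed to rank databases by total recovery-conflict cancellations; a minimal sketch:

SELECT datname,
       confl_tablespace + confl_lock + confl_snapshot
       + confl_bufferpin + confl_deadlock AS total_cancels
  FROM pg_stat_database_conflicts
 ORDER BY total_cancels DESC;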
+ + + The pg_stat_database_conflicts view will contain + one row per database showing database wide statistics about + query cancels due to conflict with recovery on standby servers. + Will only contain information on standby servers, since + conflicts do not occur on master servers. + + + + pg_stat_replication view + + + + Column + Type + Description + + + + + + pid + integer + The process id of the WAL sender process + + + usesysid + oid + The oid of the user logged into this WAL sender process + + + usename + name + The name of the user logged into this WAL sender process + + + application_name + text + The name of the application that has initiated the connection + to the WAL sender. + + + client_addr + inet + The remote IP of the client connected to the WAL sender. + If this field is not set, it indicates that the client is + connected via a Unix socket on the server machine. + + + + client_hostname + text + + If available, the hostname of the client as reported by a + reverse lookup of client_addr. This field will + only be set when is enabled. + + + + client_port + integer + + The remote TCP port that the client is using for communication + to the , or NULL if a unix socket is used. + + + + backend_start + timestamp with time zone + + The time when this process was started, i.e. when the + client connected to the WAL sender. + + + + state + text + Current WAL sender state + + + sent_location + text + Last transaction log position sent on this connection + + + write_location + text + Last transaction log position written to disk by the slave + + + flush_location + text + Last transaction log position flushed to disk by the slave + + + replay_location + text + Last transaction log position replayed into the database on the slave + + + sync_priority + int + + The priority in the order which this slave will be picked as + the synchronous standby. + + + + sync_state + text + + The synchronous state of this slave. + + + + +
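Run on the primary, a simple sketch of standby progress (per the notes above, the location columns are only visible to superusers):

SELECT application_name, client_addr, state, sync_state,
       sent_location, write_location, flush_location, replay_location
  FROM pg_stat_replication;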
+ + + The pg_stat_replication view will contain one row + per WAL sender process, showing statistics about the replication to this + slave. Only directly connected standbys are listed; no information about + downstream standby servers is recorded. + + + + pg_stat_all_tables view + + + + Column + Type + Description + + + + + + relid + oid + The OID of the table this row + + + schemaname + name + The name of the schema that the table is in + + + relname + name + The name of the table + + + seq_scan + bigint + The number of sequential scans initiated on this table + + + seq_tup_read + bigint + The number of live rows fetch by sequential scans + + + idx_scan + bigint + The number of index scans initiated on this table + + + idx_tup_fetch + bigint + The number of live rows fetch by index scans + + + n_tup_ins + bigint + The number of rows inserted + + + n_tup_upd + bigint + The number of rows updated + + + n_tup_del + bigint + The number of rows deleted + + + n_tup_hot_upd + bigint + The number of rows HOT (i.e., no separate index update) updated + + + n_live_tup + bigint + The number of live rows + + + n_dead_tup + bigint + The number of dead rows + + + last_vacuum + timestamp with time zone + The last time the table was manually non- + + + last_autovacuum + timestamp with time zone + The last time the table was vacuumed by the autovacuum daemon + + + last_analyze + timestamp with time zone + The last time the table was manually analyzed + + + last_autoanalyze + timestamp with time zone + The last time the table was analyzed by the autovacuum daemon + + + vacuum_count + bigint + The number of times this table has been manually non- + + + autovacuum_count + bigint + The number of times this table has been vacuumed by the autovacuum daemon + + + analyze_count + bigint + The number of times this table has been manually analyzed + + + autoanalyze_count + bigint + The number of times this table has been analyzed by the autovacuum daemon + + + +
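For example, to spot tables that are read mostly by sequential scans — a sketch using the counters documented above:

SELECT schemaname, relname, seq_scan, seq_tup_read,
       idx_scan, n_live_tup, n_dead_tup
  FROM pg_stat_all_tables
 WHERE seq_scan > 0
 ORDER BY seq_tup_read DESC
 LIMIT 10;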
+ + + The pg_stat_all_tables view will contain + one row for each table in the current database (including TOAST + tables) with information about accesses to this specific table. The + pg_stat_user_tables and + pg_stat_sys_tables contain the same information, + but filtered to only have rows for user and system tables. + + + + pg_stat_all_indexes view + + + + Column + Type + Description + + + + + + relid + oid + The OID of the table for this index + + + indexrelid + oid + The OID of the index + + + schemaname + name + The name of the schema the index is in + + + relname + name + The name of the table for this index + + + indexrelname + name + The name of the index + + + idx_scan + bigint + Number of index scans initiated on this index + + + idx_tup_read + bigint + Number of index entries returned by scans on this index + + + idx_tup_fetch + bigint + Number of live table rows fetched by simple index scans using this index + + + +
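A common use is finding indexes that have never been scanned and may be candidates for removal; a minimal sketch:

SELECT schemaname, relname, indexrelname, idx_scan
  FROM pg_stat_all_indexes
 WHERE idx_scan = 0
 ORDER BY schemaname, relname;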
+ + + The pg_stat_all_indexes view will contain + one row for each index in the current database + with information about accesses to this specific index. The + pg_stat_user_indexes and + pg_stat_sys_indexes contain the same information, + but filtered to only have rows for user and system indexes. + + + + pg_statio_all_tables view + + + + Column + Type + Description + + + + + + relid + oid + The OID of the table + + + schemaname + name + The name of the schema that the table is in + + + relname + name + The name of the table + + + heap_blks_read + name + Number of disk blocks read from this table + + + heap_blks_hit + name + Number of buffer hits in this table + + + idx_blks_read + name + Number of disk blocks read from all indexes on this table + + + idx_blks_hit + name + Number of buffer hits in all indexes of this table + + + toast_blks_read + name + Number of disk blocks read from this table's TOAST table (if any) + + + toast_blks_hit + name + Number of buffer hits in this table's TOAST table (if any) + + + tidx_blks_read + name + Number of disk blocks read from this table's TOAST table index (if any) + + + tidx_blks_hit + name + Number of buffer hits in this table's TOAST table index (if any) + + + +
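For example, a per-table buffer-cache hit ratio can be derived from the heap counters; a sketch that treats the counters as numbers (the underlying pg_stat_get_blocks_fetched and pg_stat_get_blocks_hit functions return bigint):

SELECT schemaname, relname, heap_blks_read, heap_blks_hit,
       round(100.0 * heap_blks_hit
             / nullif(heap_blks_hit + heap_blks_read, 0), 1) AS heap_hit_pct
  FROM pg_statio_all_tables
 ORDER BY heap_blks_read DESC
 LIMIT 10;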
+ + + The pg_statio_all_tables view will contain + one row for each table in the current database (including TOAST + tables) with information about I/O on this specific table. The + pg_statio_user_tables and + pg_statio_sys_tables contain the same information, + but filtered to only have rows for user and system tables. + + + + pg_statio_all_indexes view + + + + Column + Type + Description + + + + + + relid + oid + The OID of the table for this index + + + indexrelid + oid + The OID of the index + + + schemaname + name + The name of the schema the index is in + + + relname + name + The name of the table for this index + + + indexrelname + name + The name of the index + + + idx_blks_read + name + Number of disk blocks read from the index + + + idx_blks_hit + name + Number of buffer hits in the index + + + +
+ + + The pg_statio_all_indexes view will contain + one row for each index in the current database + with information about I/O on this specific index. The + pg_statio_user_indexes and + pg_statio_sys_indexes contain the same information, + but filtered to only have rows for user and system indexes. + + + + pg_statio_all_sequences view + + + + Column + Type + Description + + + + + + relid + oid + The OID of the sequence + + + schemaname + name + The name of the schema the sequence is in + + + relname + name + The name of the sequence + + + blks_read + name + Number of disk blocks read from the sequence + + + blks_hit + name + Number of buffer hits in the sequence + + + +
+ + + The pg_statio_all_indexes view will contain + one row for each sequence in the current database + with information about I/O on this specific sequence. + + + + pg_stat_user_functions view + + + + Column + Type + Description + + + + + + funcid + oid + The OID of the function + + + schemaname + name + The name of the schema the function is in + + + funcname + name + The name of the function + + + calls + bigint + Number of times the function has been called + + + total_time + bigint + Total time spent in this functions and all other functions + called by it, in milliseconds. + + + self_time + bigint + Total time spent in this functions itself but not including + other functions called by it, in milliseconds. + + + +
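With function tracking enabled, per-call averages fall out directly; a sketch:

SELECT schemaname, funcname, calls, total_time, self_time,
       round(total_time::numeric / nullif(calls, 0), 2) AS avg_total_ms
  FROM pg_stat_user_functions
 ORDER BY total_time DESC
 LIMIT 10;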
+ + + The pg_stat_user_functions view will contain + one row for each tracked function (as specified by the + parameter). + + Statistics Access Functions From ddfc2d9a3705d246c7f262d1f3745d2cf64da1bd Mon Sep 17 00:00:00 2001 From: Magnus Hagander Date: Sat, 25 Feb 2012 15:21:04 +0100 Subject: [PATCH 050/129] Merge the list of statistics functions into the view documentation Most people won't read them individually anyway, it's an easy way to find them, and it's a lot of duplicated information if they are kept in two different places. --- doc/src/sgml/monitoring.sgml | 940 ++++++++--------------------------- 1 file changed, 217 insertions(+), 723 deletions(-) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index ffad77f04f987..3cc5d4df0038b 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -225,8 +225,9 @@ postgres: user database host pg_stat_xact_sys_tables, pg_stat_xact_user_tables, and pg_stat_xact_user_functions, or via these views' underlying - functions. These numbers do not act as stated above; instead they update - continuously throughout the transaction. + functions (named the same as the standard statistics functions but with the + prefix pg_stat_get_xact_). These numbers do not act + as stated above; instead they update continuously throughout the transaction. @@ -489,6 +490,22 @@ postgres: user database host + + Other ways of looking at the statistics can be set up by writing + queries that use the same underlying statistics access functions as + these standard views do. These functions are listed in . The per-database access + functions take a database OID as argument to identify which + database to report on. The per-table and per-index functions take + a table or index OID. The functions for function-call statistics + take a function OID. (Note that only tables, indexes, and functions + in the current database can be seen with these functions.) The + per-server-process access functions take a server process + number, which ranges from one to the number of currently active + server processes. + + +
pg_stat_activity view @@ -505,7 +522,9 @@ postgres: user database host datid oid - The oid of the database the backend is connected to. + The oid of the database the backend is connected to. + This value can also be returned by directly calling + the pg_stat_get_backend_dbid. datname @@ -515,12 +534,16 @@ postgres: user database host pid integer - The process ID of the backend. + The process ID of the backend. + This value can also be returned by directly calling + the pg_stat_get_backend_pid. usesysid oid - The id of the user logged into the backend. + The id of the user logged into the backend. + This value can also be returned by directly calling + the pg_stat_get_backend_userid. usename @@ -540,6 +563,8 @@ postgres: user database host pg_stat_get_backend_client_addr. @@ -557,6 +582,8 @@ postgres: user database host The remote TCP port that the client is using for communication to the backend, or NULL if a unix socket is used. + This value can also be returned by directly calling + the pg_stat_get_backend_client_port. @@ -565,6 +592,8 @@ postgres: user database host The time when this process was started, i.e. when the client connected to the server. + This value can also be returned by directly calling + the pg_stat_get_backend_start. @@ -574,6 +603,8 @@ postgres: user database host pg_stat_get_backend_xact_start. @@ -583,6 +614,8 @@ postgres: user database host state is idle, when the last query was started. + This value can also be returned by directly calling + the pg_stat_get_backend_activity_start. @@ -595,6 +628,8 @@ postgres: user database host boolean Boolean indicating if a backend is currently waiting on a lock. + This value can also be returned by directly calling + the pg_stat_get_backend_waiting. @@ -687,6 +722,18 @@ postgres: user database host + + All functions used in the view are indexed by backend id number. The + function pg_stat_get_backend_idset provides a + convenient way to generate one row for each active server process. For + example, to show the PIDs and current queries of all server processes: + + +SELECT pg_stat_get_backend_pid(s.backendid) AS pid, + pg_stat_get_backend_activity(s.backendid) AS query + FROM (SELECT pg_stat_get_backend_idset() AS backendid) AS s; + +
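Building on that, the view itself exposes the same information without calling the functions directly; for example, a sketch listing non-idle backends and how long their current query has been running (assuming the usual query_start column described above):

SELECT pid, usename, datname, waiting, state,
       now() - query_start AS running_for
  FROM pg_stat_activity
 WHERE state <> 'idle'
 ORDER BY query_start;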
pg_stat_bgwriter view @@ -704,33 +751,45 @@ postgres: user database host checkpoints_timed bigint - Number of scheduled checkpoints + Number of scheduled checkpoints. + This value can also be returned by directly calling + the pg_stat_get_bgwriter_timed_checkpoints function. checkpoints_requested bigint - Number of requested checkpoints + Number of requested checkpoints. + This value can also be returned by directly calling + the pg_stat_get_bgwriter_requested_checkpoints function. buffers_checkpoint bigint - Number of buffers written during checkpoints + Number of buffers written during checkpoints. + This value can also be returned by directly calling + the pg_stat_get_bgwriter_buf_written_checkpoints function. buffers_clean bigint - Number of buffers written by the background writer + Number of buffers written by the background writer. + This value can also be returned by directly calling + the pg_stat_get_bgwriter_buf_written_clean function. maxwritten_clean bigint Number of times the background writer stopped a cleaning - scan because it had written too many buffers + scan because it had written too many buffers. + This value can also be returned by directly calling + the pg_stat_get_bgwriter_maxwritten_clean function. buffers_backend bigint - Number of buffers written directly by a backend + Number of buffers written directly by a backend. + This value can also be returned by directly calling + the pg_stat_get_buf_written_backend function. buffers_backend_fsync @@ -742,12 +801,16 @@ postgres: user database host buffers_alloc bigint - Number of buffers allocated + Number of buffers allocated. + This value can also be returned by directly calling + the pg_stat_get_buf_alloc function. stats_reset bigint - The last time these statistics were reset + The last time these statistics were reset. + This value can also be returned by directly calling + the pg_stat_get_bgwriter_stat_reset_time function. @@ -786,54 +849,75 @@ postgres: user database host The number of backends currently connected to this database. This is the only column in this view that returns a value for the current state, all other columns return the accumulated values since - the last reset. + the last reset. This value can also be returned by directly calling + the pg_stat_get_db_numbackends function. xact_commit bigint - The number of transactions in this database that has committed + The number of transactions in this database that has committed. + This value can also be returned by directly calling + the pg_stat_get_db_xact_commit function. xact_rollback bigint - The number of transactions in this database that has rolled back + The number of transactions in this database that has rolled back. + This value can also be returned by directly calling + the pg_stat_get_db_xact_rollback function. blks_read bigint - The number of disk blocks read in this database + The number of disk blocks read in this database. + This value can also be returned by directly calling + the pg_stat_get_db_blocks_fetched and + pg_stat_get_db_blocks_hit functions and + subtracting the results. blks_hits bigint The number of disk blocks read from the buffer cache (this only includes hits in the PostgreSQL buffer cache, and not - the operating system filesystem cache) + the operating system filesystem cache). + This value can also be returned by directly calling + the pg_stat_get_db_blocks_hit function. tup_returned bigint - The number of rows returned by queries in this database + The number of rows returned by queries in this database. 
+ This value can also be returned by directly calling + the pg_stat_get_db_tuples_returned function. tup_fetched bigint - The number of rows fetched by queries in this database + The number of rows fetched by queries in this database. + This value can also be returned by directly calling + the pg_stat_get_db_tuples_fetched function. tup_inserted bigint - The number of rows inserted by queries in this database + The number of rows inserted by queries in this database. + This value can also be returned by directly calling + the pg_stat_get_db_tuples_inserted function. tup_updated bigint - The number of rows updated by queries in this database + The number of rows updated by queries in this database. + This value can also be returned by directly calling + the pg_stat_get_db_tuples_updated function. tup_deleted bigint - The number of rows deleted by queries in this database + The number of rows deleted by queries in this database. + This value can also be returned by directly calling + the pg_stat_get_db_tuples_deleted function. conflicts @@ -841,7 +925,9 @@ postgres: user database host The number of queries canceled due to conflict with recovery (on standby servers) in this database. (See - for more details) + for more details). + This value can also be returned by directly calling + the pg_stat_get_db_conflict_all function. @@ -852,6 +938,8 @@ postgres: user database host setting. + This value can also be returned by directly calling + the pg_stat_get_db_temp_files function. @@ -862,17 +950,23 @@ postgres: user database host setting. + This value can also be returned by directly calling + the pg_stat_get_db_temp_bytes function. deadlocks bigint - Number of deadlocks detected in the database + Number of deadlocks detected in the database. + This value can also be returned by directly calling + the pg_stat_get_db_deadlocks function. stats_reset timestamptz - The last time the statistics were reset + The last time the statistics were reset. + This value can also be returned by directly calling + the pg_stat_get_reset_time function. @@ -909,31 +1003,37 @@ postgres: user database host confl_tablespace bigint The number of queries that have been canceled due to - dropped tablespaces + dropped tablespaces. This value can also be returned by directly calling + the pg_stat_get_db_conflict_tablespace function. confl_lock bigint The number of queries that have been canceled due to - lock timeouts + lock timeouts. This value can also be returned by directly calling + the pg_stat_get_db_conflict_lock function. confl_snapshot bigint The number of queries that have been canceled due to - old snapshots + old snapshots. This value can also be returned by directly calling + the pg_stat_get_db_conflict_snapshot function. confl_bufferpin bigint The number of queries that have been canceled due to - pinned buffers + pinned buffers. This value can also be returned by directly calling + the pg_stat_get_db_conflict_bufferpin function. confl_deadlock bigint The number of queries that have been canceled due to - deadlocks + deadlocks. This value can also be returned by directly calling + the pg_stat_get_db_conflict_startup_deadlock + function. @@ -1094,12 +1194,16 @@ postgres: user database host seq_scan bigint - The number of sequential scans initiated on this table + The number of sequential scans initiated on this table. + This value can also be returned by directly calling + the pg_stat_get_numscans function. 
seq_tup_read bigint - The number of live rows fetch by sequential scans + The number of live rows fetch by sequential scans. + This value can also be returned by directly calling + the pg_stat_get_tuples_returned function. idx_scan @@ -1109,77 +1213,105 @@ postgres: user database host idx_tup_fetch bigint - The number of live rows fetch by index scans + The number of live rows fetch by index scans. n_tup_ins bigint - The number of rows inserted + The number of rows inserted. + This value can also be returned by directly calling + the pg_stat_get_tuples_inserted function. n_tup_upd bigint - The number of rows updated + The number of rows updated. + This value can also be returned by directly calling + the pg_stat_get_tuples_updated function. n_tup_del bigint - The number of rows deleted + The number of rows deleted. + This value can also be returned by directly calling + the pg_stat_get_tuples_deleted function. n_tup_hot_upd bigint - The number of rows HOT (i.e., no separate index update) updated + The number of rows HOT (i.e., no separate index update) updated. + This value can also be returned by directly calling + the pg_stat_get_tuples_hot_updated function. n_live_tup bigint - The number of live rows + The number of live rows. + This value can also be returned by directly calling + the pg_stat_get_live_tuples function. n_dead_tup bigint - The number of dead rows + The number of dead rows. + This value can also be returned by directly calling + the pg_stat_get_dead_tuples function. last_vacuum timestamp with time zone - The last time the table was manually non- + The last time the table was manually non- last_autovacuum timestamp with time zone - The last time the table was vacuumed by the autovacuum daemon + The last time the table was vacuumed by the autovacuum daemon. + This value can also be returned by directly calling + the pg_stat_get_last_autovacuum_time function. last_analyze timestamp with time zone - The last time the table was manually analyzed + The last time the table was manually analyzed. + This value can also be returned by directly calling + the pg_stat_get_last_analyze_time function. last_autoanalyze timestamp with time zone - The last time the table was analyzed by the autovacuum daemon + The last time the table was analyzed by the autovacuum daemon. + This value can also be returned by directly calling + the pg_stat_get_last_autoanalyze_time function. vacuum_count bigint - The number of times this table has been manually non- + The number of times this table has been manually non- autovacuum_count bigint - The number of times this table has been vacuumed by the autovacuum daemon + The number of times this table has been vacuumed by the autovacuum daemon. + This value can also be returned by directly calling + the pg_stat_get_autovacuum_count function. analyze_count bigint - The number of times this table has been manually analyzed + The number of times this table has been manually analyzed. + This value can also be returned by directly calling + the pg_stat_get_analyze_count function. autoanalyze_count bigint - The number of times this table has been analyzed by the autovacuum daemon + The number of times this table has been analyzed by the autovacuum daemon. + This value can also be returned by directly calling + the pg_stat_get_autoanalyze_count function. @@ -1234,17 +1366,23 @@ postgres: user database host idx_scan bigint - Number of index scans initiated on this index + Number of index scans initiated on this index. 
+ This value can also be returned by directly calling + the pg_stat_get_numscans function. idx_tup_read bigint - Number of index entries returned by scans on this index + Number of index entries returned by scans on this index. + This value can also be returned by directly calling + the pg_stat_get_tuples_returned function. idx_tup_fetch bigint - Number of live table rows fetched by simple index scans using this index + Number of live table rows fetched by simple index scans using this index. + This value can also be returned by directly calling + the pg_stat_get_tuples_fetched function. @@ -1289,12 +1427,18 @@ postgres: user database host heap_blks_read name - Number of disk blocks read from this table + Number of disk blocks read from this table. + This value can also be returned by directly calling + the pg_stat_get_blocks_fetched and + pg_stat_get_blocks_hit functions and + subtracting the results. heap_blks_hit name - Number of buffer hits in this table + Number of buffer hits in this table. + This value can also be returned by directly calling + the pg_stat_get_blocks_hit function. idx_blks_read @@ -1304,7 +1448,7 @@ postgres: user database host idx_blks_hit name - Number of buffer hits in all indexes of this table + Number of buffer hits in all indexes of this table. toast_blks_read @@ -1379,12 +1523,18 @@ postgres: user database host idx_blks_read name - Number of disk blocks read from the index + Number of disk blocks read from the index. + This value can also be returned by directly calling + the pg_stat_get_blocks_fetched and + pg_stat_get_blocks_hit functions and + subtracting the results. idx_blks_hit name - Number of buffer hits in the index + Number of buffer hits in the index. + This value can also be returned by directly calling + the pg_stat_get_blocks_hit function. @@ -1476,19 +1626,25 @@ postgres: user database host calls bigint - Number of times the function has been called + Number of times the function has been called. + This value can also be returned by directly calling + the pg_stat_get_function_calls function. total_time bigint Total time spent in this functions and all other functions - called by it, in milliseconds. + called by it, in milliseconds. + This value can also be returned by directly calling + the pg_stat_get_function_time function. self_time bigint Total time spent in this functions itself but not including - other functions called by it, in milliseconds. + other functions called by it, in milliseconds. + This value can also be returned by directly calling + the pg_stat_get_function_self_time function. @@ -1501,25 +1657,11 @@ postgres: user database host - Statistics Access Functions + Other Statistics Functions - - Other ways of looking at the statistics can be set up by writing - queries that use the same underlying statistics access functions as - these standard views do. These functions are listed in . The per-database access - functions take a database OID as argument to identify which - database to report on. The per-table and per-index functions take - a table or index OID. The functions for function-call statistics - take a function OID. (Note that only tables, indexes, and functions - in the current database can be seen with these functions.) The - per-server-process access functions take a server process - number, which ranges from one to the number of currently active - server processes. -
- Statistics Access Functions + Other Statistics Functions @@ -1531,401 +1673,6 @@ postgres: user database host - - pg_stat_get_db_numbackends(oid) - integer - - Number of active server processes for database - - - - - pg_stat_get_db_xact_commit(oid) - bigint - - Number of transactions committed in database - - - - - pg_stat_get_db_xact_rollback(oid) - bigint - - Number of transactions rolled back in database - - - - - pg_stat_get_db_blocks_fetched(oid) - bigint - - Number of disk block fetch requests for database - - - - - pg_stat_get_db_blocks_hit(oid) - bigint - - Number of disk block fetch requests found in cache for database - - - - - pg_stat_get_db_tuples_returned(oid) - bigint - - Number of tuples returned for database - - - - - pg_stat_get_db_tuples_fetched(oid) - bigint - - Number of tuples fetched for database - - - - - pg_stat_get_db_tuples_inserted(oid) - bigint - - Number of tuples inserted in database - - - - - pg_stat_get_db_tuples_updated(oid) - bigint - - Number of tuples updated in database - - - - - pg_stat_get_db_tuples_deleted(oid) - bigint - - Number of tuples deleted in database - - - - - pg_stat_get_db_conflict_tablespace(oid) - bigint - - Number of queries canceled because of recovery conflict with dropped tablespaces in database - - - - - pg_stat_get_db_conflict_lock(oid) - bigint - - Number of queries canceled because of recovery conflict with locks in database - - - - - pg_stat_get_db_conflict_snapshot(oid) - bigint - - Number of queries canceled because of recovery conflict with old snapshots in database - - - - - pg_stat_get_db_conflict_bufferpin(oid) - bigint - - Number of queries canceled because of recovery conflict with pinned buffers in database - - - - - pg_stat_get_db_conflict_startup_deadlock(oid) - bigint - - Number of queries canceled because of recovery conflict with deadlocks in database - - - - - pg_stat_get_db_stat_reset_time(oid) - timestamptz - - Time of the last statistics reset for the database. Initialized to the - system time during the first connection to each database. The reset time - is updated when you call pg_stat_reset on the - database, as well as upon execution of - pg_stat_reset_single_table_counters against any - table or index in it. - - - - - pg_stat_get_db_temp_bytes(oid) - bigint - - Amount of data written to temporary files by queries in the database. - All temporary files are counted, regardless of why the temporary file - was created (sorting or hash), and regardless of the - setting. - - - - - pg_stat_get_db_temp_files(oid) - bigint - - Number of temporary files written by queries in the database. All temporary - files are counted, regardless of why the temporary file was created - (sorting or hash) or file size, and regardless of the - setting. 
- - - - - pg_stat_get_db_deadlocks(oid) - bigint - - Number of deadlocks detected in the database - - - - - pg_stat_get_numscans(oid) - bigint - - Number of sequential scans done when argument is a table, - or number of index scans done when argument is an index - - - - - pg_stat_get_tuples_returned(oid) - bigint - - Number of rows read by sequential scans when argument is a table, - or number of index entries returned when argument is an index - - - - - pg_stat_get_tuples_fetched(oid) - bigint - - Number of table rows fetched by bitmap scans when argument is a table, - or table rows fetched by simple index scans using the index - when argument is an index - - - - - pg_stat_get_tuples_inserted(oid) - bigint - - Number of rows inserted into table - - - - - pg_stat_get_tuples_updated(oid) - bigint - - Number of rows updated in table (includes HOT updates) - - - - - pg_stat_get_tuples_deleted(oid) - bigint - - Number of rows deleted from table - - - - - pg_stat_get_tuples_hot_updated(oid) - bigint - - Number of rows HOT-updated in table - - - - - pg_stat_get_live_tuples(oid) - bigint - - Number of live rows in table - - - - - pg_stat_get_dead_tuples(oid) - bigint - - Number of dead rows in table - - - - - pg_stat_get_blocks_fetched(oid) - bigint - - Number of disk block fetch requests for table or index - - - - - pg_stat_get_blocks_hit(oid) - bigint - - Number of disk block requests found in cache for table or index - - - - - pg_stat_get_last_vacuum_time(oid) - timestamptz - - Time of the last non- vacuum initiated by the user on this table - - - - - pg_stat_get_last_autovacuum_time(oid) - timestamptz - - Time of the last vacuum initiated by the autovacuum daemon on this table - - - - - pg_stat_get_last_analyze_time(oid) - timestamptz - - Time of the last analyze initiated by the user on this table - - - - - pg_stat_get_last_autoanalyze_time(oid) - timestamptz - - Time of the last analyze initiated by the autovacuum daemon on this - table - - - - - pg_stat_get_vacuum_count(oid) - bigint - - The number of times this table has been non- - - - - pg_stat_get_autovacuum_count(oid) - bigint - - The number of times this table has been vacuumed by the autovacuum daemon - - - - - pg_stat_get_analyze_count(oid) - bigint - - The number of times this table has been analyzed manually - - - - - pg_stat_get_autoanalyze_count(oid) - bigint - - The number of times this table has been analyzed by the autovacuum daemon - - - - - pg_stat_get_xact_numscans(oid) - bigint - - Number of sequential scans done when argument is a table, - or number of index scans done when argument is an index, in the current transaction - - - - - pg_stat_get_xact_tuples_returned(oid) - bigint - - Number of rows read by sequential scans when argument is a table, - or number of index entries returned when argument is an index, in the current transaction - - - - - pg_stat_get_xact_tuples_fetched(oid) - bigint - - Number of table rows fetched by bitmap scans when argument is a table, - or table rows fetched by simple index scans using the index - when argument is an index, in the current transaction - - - - - pg_stat_get_xact_tuples_inserted(oid) - bigint - - Number of rows inserted into table, in the current transaction - - - - - pg_stat_get_xact_tuples_updated(oid) - bigint - - Number of rows updated in table (includes HOT updates), in the current transaction - - - - - pg_stat_get_xact_tuples_deleted(oid) - bigint - - Number of rows deleted from table, in the current transaction - - - - - pg_stat_get_xact_tuples_hot_updated(oid) - 
bigint - - Number of rows HOT-updated in table, in the current transaction - - - - - pg_stat_get_xact_blocks_fetched(oid) - bigint - - Number of disk block fetch requests for table or index, in the current transaction - - - - - pg_stat_get_xact_blocks_hit(oid) - bigint - - Number of disk block requests found in cache for table or index, in the current transaction - - @@ -1947,59 +1694,6 @@ postgres: user database host - - pg_stat_get_function_calls(oid) - bigint - - Number of times the function has been called - - - - - pg_stat_get_function_time(oid) - bigint - - Total wall clock time spent in the function, in microseconds. Includes - the time spent in functions called by this one. - - - - - pg_stat_get_function_self_time(oid) - bigint - - Time spent in only this function. Time spent in called functions - is excluded. - - - - - pg_stat_get_xact_function_calls(oid) - bigint - - Number of times the function has been called, in the current transaction. - - - - - pg_stat_get_xact_function_time(oid) - bigint - - Total wall clock time spent in the function, in microseconds, in the - current transaction. Includes the time spent in functions called by - this one. - - - - - pg_stat_get_xact_function_self_time(oid) - bigint - - Time spent in only this function, in the current transaction. Time - spent in called functions is excluded. - - - pg_stat_get_backend_idset() setof integer @@ -2009,181 +1703,6 @@ postgres: user database host - - pg_stat_get_backend_pid(integer) - integer - - Process ID of the given server process - - - - - pg_stat_get_backend_dbid(integer) - oid - - Database ID of the given server process - - - - - pg_stat_get_backend_userid(integer) - oid - - User ID of the given server process - - - - - pg_stat_get_backend_activity(integer) - text - - Active command of the given server process, but only if the - current user is a superuser or the same user as that of - the session being queried (and - track_activities is on) - - - - - pg_stat_get_backend_waiting(integer) - boolean - - True if the given server process is waiting for a lock, - but only if the current user is a superuser or the same user as that of - the session being queried (and - track_activities is on) - - - - - pg_stat_get_backend_activity_start(integer) - timestamp with time zone - - The time at which the given server process' currently - executing query was started, but only if the - current user is a superuser or the same user as that of - the session being queried (and - track_activities is on) - - - - - pg_stat_get_backend_xact_start(integer) - timestamp with time zone - - The time at which the given server process' currently - executing transaction was started, but only if the - current user is a superuser or the same user as that of - the session being queried (and - track_activities is on) - - - - - pg_stat_get_backend_start(integer) - timestamp with time zone - - The time at which the given server process was started, or - null if the current user is not a superuser nor the same user - as that of the session being queried - - - - - pg_stat_get_backend_client_addr(integer) - inet - - The IP address of the client connected to the given - server process; null if the connection is over a Unix domain - socket, also null if the current user is not a superuser nor - the same user as that of the session being queried - - - - - pg_stat_get_backend_client_port(integer) - integer - - The TCP port number of the client connected to the given - server process; -1 if the connection is over a Unix domain - socket, null if the 
current user is not a superuser nor the - same user as that of the session being queried - - - - - pg_stat_get_bgwriter_timed_checkpoints() - bigint - - Number of times the background writer has started timed checkpoints - (because the checkpoint_timeout time has expired) - - - - - pg_stat_get_bgwriter_requested_checkpoints() - bigint - - Number of times the background writer has started checkpoints based - on requests from backends because the checkpoint_segments - has been exceeded or because the CHECKPOINT - command has been issued - - - - - pg_stat_get_bgwriter_buf_written_checkpoints() - bigint - - Number of buffers written by the background writer during checkpoints - - - - - pg_stat_get_bgwriter_buf_written_clean() - bigint - - Number of buffers written by the background writer for routine cleaning of - dirty pages - - - - - pg_stat_get_bgwriter_maxwritten_clean() - bigint - - Number of times the background writer has stopped its cleaning scan because - it has written more buffers than specified in the - bgwriter_lru_maxpages parameter - - - - - pg_stat_get_bgwriter_stat_reset_time() - timestamptz - - Time of the last statistics reset for the background writer, updated - when executing pg_stat_reset_shared('bgwriter') - on the database cluster. - - - - - pg_stat_get_buf_written_backend() - bigint - - Number of buffers written by backends because they needed - to allocate a new buffer - - - - - pg_stat_get_buf_alloc() - bigint - - Total number of buffer allocations - - pg_stat_get_wal_senders() @@ -2243,31 +1762,6 @@ postgres: user database host
- - - pg_stat_get_blocks_fetched minus - pg_stat_get_blocks_hit gives the number of kernel - read() calls issued for the table, index, or - database; the number of actual physical reads is usually - lower due to kernel-level buffering. The *_blks_read - statistics columns use this subtraction, i.e., fetched minus hit. - - - - - All functions to access information about backends are indexed by backend id - number, except pg_stat_get_activity which is indexed by PID. - The function pg_stat_get_backend_idset provides - a convenient way to generate one row for each active server process. For - example, to show the PIDs and current queries of all server processes: - - -SELECT pg_stat_get_backend_pid(s.backendid) AS pid, - pg_stat_get_backend_activity(s.backendid) AS query - FROM (SELECT pg_stat_get_backend_idset() AS backendid) AS s; - - -
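The correspondence between view columns and the underlying access functions can be verified directly; a sketch, where 'my_table' is a hypothetical placeholder for any table in the current database:

-- These two queries should report the same value:
SELECT seq_scan FROM pg_stat_all_tables
 WHERE relid = 'my_table'::regclass;

SELECT pg_stat_get_numscans('my_table'::regclass);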
From ac086d466d124337ba87c8eca9aca2417c5bb98f Mon Sep 17 00:00:00 2001 From: Magnus Hagander Date: Sat, 25 Feb 2012 16:33:57 +0100 Subject: [PATCH 051/129] Fix some typos and correct wording in the monitoring docs patch Thom Brown --- doc/src/sgml/monitoring.sgml | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 3cc5d4df0038b..840e54aef8a55 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -269,7 +269,7 @@ postgres: user database host pg_stat_databasepg_stat_database - One row per database, showing database wide statistics. See + One row per database, showing database-wide statistics. See for more details.
@@ -277,7 +277,7 @@ postgres: user database host pg_stat_database_conflictspg_stat_database_conflicts - One row per database showing database wide statistics about + One row per database showing database-wide statistics about query cancels due to conflict with recovery on standby servers. Will only contain information on standby servers, since conflicts do not occur on master servers. @@ -495,7 +495,7 @@ postgres: user database host . The per-database access - functions take a database OID as argument to identify which + functions take a database OID as an argument to identify which database to report on. The per-table and per-index functions take a table or index OID. The functions for function-call statistics take a function OID. (Note that only tables, indexes, and functions @@ -522,9 +522,9 @@ postgres: user database host datid oid - The oid of the database the backend is connected to. + The OID of the database the backend is connected to. This value can also be returned by directly calling - the pg_stat_get_backend_dbid. + the pg_stat_get_backend_dbid function.
datname @@ -836,7 +836,7 @@ SELECT pg_stat_get_backend_pid(s.backendid) AS pid, datid oid - The oid of the database + The OID of the database datname @@ -855,15 +855,15 @@ SELECT pg_stat_get_backend_pid(s.backendid) AS pid, xact_commit bigint - The number of transactions in this database that has committed. - This value can also be returned by directly calling + The number of transactions in this database that have been + committed. This value can also be returned by directly calling the pg_stat_get_db_xact_commit function. xact_rollback bigint - The number of transactions in this database that has rolled back. - This value can also be returned by directly calling + The number of transactions in this database that have been + rolled back. This value can also be returned by directly calling the pg_stat_get_db_xact_rollback function. @@ -974,7 +974,7 @@ SELECT pg_stat_get_backend_pid(s.backendid) AS pid, The pg_stat_database view will contain one row - for each database in the cluster showing database wide statistics. + for each database in the cluster showing database-wide statistics. @@ -992,7 +992,7 @@ SELECT pg_stat_get_backend_pid(s.backendid) AS pid, datid oid - The oid of the database + The OID of the database datname @@ -1041,7 +1041,7 @@ SELECT pg_stat_get_backend_pid(s.backendid) AS pid, The pg_stat_database_conflicts view will contain - one row per database showing database wide statistics about + one row per database showing database-wide statistics about query cancels due to conflict with recovery on standby servers. Will only contain information on standby servers, since conflicts do not occur on master servers. @@ -1067,7 +1067,7 @@ SELECT pg_stat_get_backend_pid(s.backendid) AS pid, usesysid oid - The oid of the user logged into this WAL sender process + The OID of the user logged into this WAL sender process usename @@ -1102,7 +1102,7 @@ SELECT pg_stat_get_backend_pid(s.backendid) AS pid, integer The remote TCP port that the client is using for communication - to the , or NULL if a unix socket is used. + to the, or NULL if a unix socket is used. @@ -1179,7 +1179,7 @@ SELECT pg_stat_get_backend_pid(s.backendid) AS pid, relid oid - The OID of the table this row + The OID of the table schemaname From 66f0cf7da8eeaeca4b9894bfafd61789b514af4a Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Sun, 26 Feb 2012 15:22:27 +0200 Subject: [PATCH 052/129] Remove useless const qualifier Claiming that the typevar argument to DefineCompositeType() is const was a plain lie. A similar case in DefineVirtualRelation() was already changed in passing in commit 1575fbcb. Also clean up the now unnecessary casts that used to cast away the const. --- src/backend/commands/typecmds.c | 4 ++-- src/backend/commands/view.c | 2 +- src/include/commands/typecmds.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c index 3523554cfbdef..22c1132e9b9ca 100644 --- a/src/backend/commands/typecmds.c +++ b/src/backend/commands/typecmds.c @@ -1980,7 +1980,7 @@ AssignTypeArrayOid(void) *------------------------------------------------------------------- */ Oid -DefineCompositeType(const RangeVar *typevar, List *coldeflist) +DefineCompositeType(RangeVar *typevar, List *coldeflist) { CreateStmt *createStmt = makeNode(CreateStmt); Oid old_type_oid; @@ -1991,7 +1991,7 @@ DefineCompositeType(const RangeVar *typevar, List *coldeflist) * now set the parameters for keys/inheritance etc. 
All of these are * uninteresting for composite types... */ - createStmt->relation = (RangeVar *) typevar; + createStmt->relation = typevar; createStmt->tableElts = coldeflist; createStmt->inhRelations = NIL; createStmt->constraints = NIL; diff --git a/src/backend/commands/view.c b/src/backend/commands/view.c index f895488a5a571..99fb7dbb8f4d0 100644 --- a/src/backend/commands/view.c +++ b/src/backend/commands/view.c @@ -254,7 +254,7 @@ DefineVirtualRelation(RangeVar *relation, List *tlist, bool replace, * now set the parameters for keys/inheritance etc. All of these are * uninteresting for views... */ - createStmt->relation = (RangeVar *) relation; + createStmt->relation = relation; createStmt->tableElts = attrList; createStmt->inhRelations = NIL; createStmt->constraints = NIL; diff --git a/src/include/commands/typecmds.h b/src/include/commands/typecmds.h index 9de5330924e6a..0c7e10d392d37 100644 --- a/src/include/commands/typecmds.h +++ b/src/include/commands/typecmds.h @@ -25,7 +25,7 @@ extern void DefineDomain(CreateDomainStmt *stmt); extern void DefineEnum(CreateEnumStmt *stmt); extern void DefineRange(CreateRangeStmt *stmt); extern void AlterEnum(AlterEnumStmt *stmt); -extern Oid DefineCompositeType(const RangeVar *typevar, List *coldeflist); +extern Oid DefineCompositeType(RangeVar *typevar, List *coldeflist); extern Oid AssignTypeArrayOid(void); extern void AlterDomainDefault(List *names, Node *defaultRaw); From b5c077c3685c1b122c10c7ef03ff4c07abe786a3 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Sun, 26 Feb 2012 15:31:16 +0200 Subject: [PATCH 053/129] Remove useless cast --- src/backend/libpq/crypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/libpq/crypt.c b/src/backend/libpq/crypt.c index 2cb5f027eae0d..73eee77ac260a 100644 --- a/src/backend/libpq/crypt.c +++ b/src/backend/libpq/crypt.c @@ -89,7 +89,7 @@ md5_crypt_verify(const Port *port, const char *role, char *client_pass) { /* stored password already encrypted, only do salt */ if (!pg_md5_encrypt(shadow_pass + strlen("md5"), - (char *) port->md5Salt, + port->md5Salt, sizeof(port->md5Salt), crypt_pwd)) { pfree(crypt_pwd); From 1b630751d0ffef4c856bfe382889d0d187eca404 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 26 Feb 2012 15:12:17 -0500 Subject: [PATCH 054/129] Fix some more bugs in GIN's WAL replay logic. In commit 4016bdef8aded77b4903c457050622a5a1815c16 I fixed a bunch of ginxlog.c bugs having to do with not handling XLogReadBuffer failures correctly. However, in ginRedoUpdateMetapage and ginRedoDeleteListPages, I unaccountably thought that failure to read the metapage would be impossible and just put in an elog(PANIC) call. This is of course wrong: failure is exactly what will happen if the index got dropped (or rebuilt) between creation of the WAL record and the crash we're trying to recover from. I believe this explains Nicholas Wilson's recent report of these errors getting reached. Also, fix memory leak in forgetIncompleteSplit. This wasn't of much concern when the code was written, but in a long-running standby server page split records could be expected to accumulate indefinitely. Back-patch to 8.4 --- before that, GIN didn't have a metapage. 
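To make the failure scenario concrete, here is a minimal SQL sketch (the table and index names are hypothetical, invented for illustration; the crash itself obviously cannot be scripted):

    CREATE TABLE docs (body tsvector);
    CREATE INDEX docs_body_gin ON docs USING gin (body);
    INSERT INTO docs VALUES (to_tsvector('english', 'some text'));  -- emits GIN WAL records
    DROP INDEX docs_body_gin;  -- index gone, but older WAL still references it
    -- A crash before the next checkpoint makes recovery replay the INSERT's
    -- GIN records; with this fix, redo skips them when the metapage cannot
    -- be read, instead of hitting the old elog(PANIC).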
--- src/backend/access/gin/ginxlog.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 388589beac33b..063d793565ff9 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -58,9 +58,12 @@ forgetIncompleteSplit(RelFileNode node, BlockNumber leftBlkno, BlockNumber updat { ginIncompleteSplit *split = (ginIncompleteSplit *) lfirst(l); - if (RelFileNodeEquals(node, split->node) && leftBlkno == split->leftBlkno && updateBlkno == split->rightBlkno) + if (RelFileNodeEquals(node, split->node) && + leftBlkno == split->leftBlkno && + updateBlkno == split->rightBlkno) { incomplete_splits = list_delete_ptr(incomplete_splits, split); + pfree(split); break; } } @@ -486,7 +489,7 @@ ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record) metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false); if (!BufferIsValid(metabuffer)) - elog(PANIC, "GIN metapage disappeared"); + return; /* assume index was deleted, nothing to do */ metapage = BufferGetPage(metabuffer); if (!XLByteLE(lsn, PageGetLSN(metapage))) @@ -631,7 +634,7 @@ ginRedoDeleteListPages(XLogRecPtr lsn, XLogRecord *record) metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false); if (!BufferIsValid(metabuffer)) - elog(PANIC, "GIN metapage disappeared"); + return; /* assume index was deleted, nothing to do */ metapage = BufferGetPage(metabuffer); if (!XLByteLE(lsn, PageGetLSN(metapage))) From 9bf8603c7a9153cada7e32eb0cf7ac1feb1d3b56 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Mon, 27 Feb 2012 13:53:12 +0200 Subject: [PATCH 055/129] Call check_keywords.pl in maintainer-check For that purpose, have check_keywords.pl print errors to stderr and return a useful exit status. --- src/backend/common.mk | 2 +- src/backend/parser/Makefile | 4 ++++ src/tools/check_keywords.pl | 25 +++++++++++++++++-------- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/backend/common.mk b/src/backend/common.mk index 5d599dbd0ca25..2e56151e2b48f 100644 --- a/src/backend/common.mk +++ b/src/backend/common.mk @@ -45,4 +45,4 @@ clean: clean-local clean-local: rm -f $(subsysfilename) $(OBJS) -$(call recurse,coverage) +$(call recurse,coverage maintainer-check) diff --git a/src/backend/parser/Makefile b/src/backend/parser/Makefile index 0bdb3249a2e96..00e8e88e9c2e6 100644 --- a/src/backend/parser/Makefile +++ b/src/backend/parser/Makefile @@ -65,3 +65,7 @@ gram.o keywords.o parser.o: gram.h # are not cleaned here. 
clean distclean maintainer-clean: rm -f lex.backup + + +maintainer-check: + $(PERL) $(top_srcdir)/src/tools/check_keywords.pl $(top_srcdir) diff --git a/src/tools/check_keywords.pl b/src/tools/check_keywords.pl index 178775484001d..33816c51332d6 100755 --- a/src/tools/check_keywords.pl +++ b/src/tools/check_keywords.pl @@ -7,8 +7,14 @@ # # src/tools/check_keywords.pl +my $errors = 0; my $path; +sub error(@) { + print STDERR @_; + $errors = 1; +} + if (@ARGV) { $path = $ARGV[0]; shift @ARGV; @@ -102,7 +108,8 @@ $bare_kword = $kword; $bare_kword =~ s/_P$//; if ($bare_kword le $prevkword) { - print "'$bare_kword' after '$prevkword' in $kcat list is misplaced"; + error "'$bare_kword' after '$prevkword' in $kcat list is misplaced"; + $errors = 1; } $prevkword = $bare_kword; } @@ -141,35 +148,35 @@ # Check that the list is in alphabetical order if ($kwstring le $prevkwstring) { - print "'$kwstring' after '$prevkwstring' in kwlist.h is misplaced"; + error "'$kwstring' after '$prevkwstring' in kwlist.h is misplaced"; } $prevkwstring = $kwstring; # Check that the keyword string is valid: all lower-case ASCII chars if ($kwstring !~ /^[a-z_]*$/) { - print "'$kwstring' is not a valid keyword string, must be all lower-case ASCII chars"; + error "'$kwstring' is not a valid keyword string, must be all lower-case ASCII chars"; } # Check that the keyword name is valid: all upper-case ASCII chars if ($kwname !~ /^[A-Z_]*$/) { - print "'$kwname' is not a valid keyword name, must be all upper-case ASCII chars"; + error "'$kwname' is not a valid keyword name, must be all upper-case ASCII chars"; } # Check that the keyword string matches keyword name $bare_kwname = $kwname; $bare_kwname =~ s/_P$//; if ($bare_kwname ne uc($kwstring)) { - print "keyword name '$kwname' doesn't match keyword string '$kwstring'"; + error "keyword name '$kwname' doesn't match keyword string '$kwstring'"; } # Check that the keyword is present in the grammar %kwhash = %{$kwhashes{$kwcat_id}}; if (!(%kwhash)) { - #print "Unknown kwcat_id: $kwcat_id"; + #error "Unknown kwcat_id: $kwcat_id"; } else { if (!($kwhash{$kwname})) { - print "'$kwname' not present in $kwcat_id section of gram.y"; + error "'$kwname' not present in $kwcat_id section of gram.y"; } else { # Remove it from the hash, so that we can complain at the end # if there's keywords left that were not found in kwlist.h @@ -185,6 +192,8 @@ %kwhash = %{$kwhashes{$kwcat_id}}; for my $kw ( keys %kwhash ) { - print "'$kw' found in gram.y $kwcat category, but not in kwlist.h" + error "'$kw' found in gram.y $kwcat category, but not in kwlist.h" } } + +exit $errors; From cb3a7c2b95a28e57c56562d48d2a3aa5eeb7fa29 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Mon, 27 Feb 2012 18:28:00 -0300 Subject: [PATCH 056/129] ALTER TABLE: skip FK validation when it's safe to do so We already skip rewriting the table in these cases, but we still force a whole table scan to validate the data. This can be skipped, and thus we can make the whole ALTER TABLE operation just do some catalog touches instead of scanning the table, when these two conditions hold: (a) Old and new pg_constraint.conpfeqop match exactly. This is actually stronger than needed; we could loosen things by way of operator families, but it'd require a lot more effort. (b) The functions, if any, implementing a cast from the foreign type to the primary opcintype are the same. For this purpose, we can consider a binary coercion equivalent to an exact type match. 
When the opcintype is polymorphic, require that the old and new foreign types match exactly. (Since ri_triggers.c does use the executor, the stronger check for polymorphic types is no mere future-proofing. However, no core type exercises its necessity.) Author: Noah Misch Committer's note: catalog version bumped due to change of the Constraint node. I can't actually find any way to have such a node in a stored rule, but given that we have "out" support for them, better be safe. --- src/backend/commands/tablecmds.c | 186 +++++++++++++++++++++++++++- src/backend/nodes/copyfuncs.c | 1 + src/backend/nodes/equalfuncs.c | 1 + src/backend/nodes/outfuncs.c | 1 + src/backend/utils/adt/ri_triggers.c | 1 + src/include/catalog/catversion.h | 2 +- src/include/nodes/parsenodes.h | 1 + 7 files changed, 187 insertions(+), 6 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 28889c1f44040..cd4490a1c24e5 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -276,6 +276,8 @@ static Oid transformFkeyCheckAttrs(Relation pkrel, int numattrs, int16 *attnums, Oid *opclasses); static void checkFkeyPermissions(Relation rel, int16 *attnums, int natts); +static CoercionPathType findFkeyCast(Oid targetTypeId, Oid sourceTypeId, + Oid *funcid); static void validateCheckConstraint(Relation rel, HeapTuple constrtup); static void validateForeignKeyConstraint(char *conname, Relation rel, Relation pkrel, @@ -358,6 +360,7 @@ static void ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMOD static void ATPostAlterTypeParse(Oid oldId, char *cmd, List **wqueue, LOCKMODE lockmode, bool rewrite); static void TryReuseIndex(Oid oldId, IndexStmt *stmt); +static void TryReuseForeignKey(Oid oldId, Constraint *con); static void change_owner_fix_column_acls(Oid relationOid, Oid oldOwnerId, Oid newOwnerId); static void change_owner_recurse_to_sequences(Oid relationOid, @@ -5620,6 +5623,8 @@ ATAddForeignKeyConstraint(AlteredTableInfo *tab, Relation rel, numpks; Oid indexOid; Oid constrOid; + bool old_check_ok; + ListCell *old_pfeqop_item = list_head(fkconstraint->old_conpfeqop); /* * Grab an exclusive lock on the pk table, so that someone doesn't delete @@ -5736,6 +5741,13 @@ ATAddForeignKeyConstraint(AlteredTableInfo *tab, Relation rel, (errcode(ERRCODE_INVALID_FOREIGN_KEY), errmsg("number of referencing and referenced columns for foreign key disagree"))); + /* + * On the strength of a previous constraint, we might avoid scanning + * tables to validate this one. See below. 
+ */ + old_check_ok = (fkconstraint->old_conpfeqop != NIL); + Assert(!old_check_ok || numfks == list_length(fkconstraint->old_conpfeqop)); + for (i = 0; i < numpks; i++) { Oid pktype = pktypoid[i]; @@ -5750,6 +5762,7 @@ ATAddForeignKeyConstraint(AlteredTableInfo *tab, Relation rel, Oid ppeqop; Oid ffeqop; int16 eqstrategy; + Oid pfeqop_right; /* We need several fields out of the pg_opclass entry */ cla_ht = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclasses[i])); @@ -5792,10 +5805,17 @@ ATAddForeignKeyConstraint(AlteredTableInfo *tab, Relation rel, pfeqop = get_opfamily_member(opfamily, opcintype, fktyped, eqstrategy); if (OidIsValid(pfeqop)) + { + pfeqop_right = fktyped; ffeqop = get_opfamily_member(opfamily, fktyped, fktyped, eqstrategy); + } else - ffeqop = InvalidOid; /* keep compiler quiet */ + { + /* keep compiler quiet */ + pfeqop_right = InvalidOid; + ffeqop = InvalidOid; + } if (!(OidIsValid(pfeqop) && OidIsValid(ffeqop))) { @@ -5817,7 +5837,10 @@ ATAddForeignKeyConstraint(AlteredTableInfo *tab, Relation rel, target_typeids[1] = opcintype; if (can_coerce_type(2, input_typeids, target_typeids, COERCION_IMPLICIT)) + { pfeqop = ffeqop = ppeqop; + pfeqop_right = opcintype; + } } if (!(OidIsValid(pfeqop) && OidIsValid(ffeqop))) @@ -5833,6 +5856,77 @@ ATAddForeignKeyConstraint(AlteredTableInfo *tab, Relation rel, format_type_be(fktype), format_type_be(pktype)))); + if (old_check_ok) + { + /* + * When a pfeqop changes, revalidate the constraint. We could + * permit intra-opfamily changes, but that adds subtle complexity + * without any concrete benefit for core types. We need not + * assess ppeqop or ffeqop, which RI_Initial_Check() does not use. + */ + old_check_ok = (pfeqop == lfirst_oid(old_pfeqop_item)); + old_pfeqop_item = lnext(old_pfeqop_item); + } + if (old_check_ok) + { + Oid old_fktype; + Oid new_fktype; + CoercionPathType old_pathtype; + CoercionPathType new_pathtype; + Oid old_castfunc; + Oid new_castfunc; + + /* + * Identify coercion pathways from each of the old and new FK-side + * column types to the right (foreign) operand type of the pfeqop. + * We may assume that pg_constraint.conkey is not changing. + */ + old_fktype = tab->oldDesc->attrs[fkattnum[i] - 1]->atttypid; + new_fktype = fktype; + old_pathtype = findFkeyCast(pfeqop_right, old_fktype, + &old_castfunc); + new_pathtype = findFkeyCast(pfeqop_right, new_fktype, + &new_castfunc); + + /* + * Upon a change to the cast from the FK column to its pfeqop + * operand, revalidate the constraint. For this evaluation, a + * binary coercion cast is equivalent to no cast at all. While + * type implementors should design implicit casts with an eye + * toward consistency of operations like equality, we cannot assume + * here that they have done so. + * + * A function with a polymorphic argument could change behavior + * arbitrarily in response to get_fn_expr_argtype(). Therefore, + * when the cast destination is polymorphic, we only avoid + * revalidation if the input type has not changed at all. Given + * just the core data types and operator classes, this requirement + * prevents no would-be optimizations. + * + * If the cast converts from a base type to a domain thereon, then + * that domain type must be the opcintype of the unique index. + * Necessarily, the primary key column must then be of the domain + * type. Since the constraint was previously valid, all values on + * the foreign side necessarily exist on the primary side and in + * turn conform to the domain. Consequently, we need not treat + * domains specially here. 
+ * + * Since we require that all collations share the same notion of + * equality (which they do, because texteq reduces to bitwise + * equality), we don't compare collation here. + * + * We need not directly consider the PK type. It's necessarily + * binary coercible to the opcintype of the unique index column, + * and ri_triggers.c will only deal with PK datums in terms of that + * opcintype. Changing the opcintype also changes pfeqop. + */ + old_check_ok = (new_pathtype == old_pathtype && + new_castfunc == old_castfunc && + (!IsPolymorphicType(pfeqop_right) || + new_fktype == old_fktype)); + + } + pfeqoperators[i] = pfeqop; ppeqoperators[i] = ppeqop; ffeqoperators[i] = ffeqop; @@ -5877,10 +5971,12 @@ ATAddForeignKeyConstraint(AlteredTableInfo *tab, Relation rel, /* * Tell Phase 3 to check that the constraint is satisfied by existing rows. - * We can skip this during table creation, or if requested explicitly by - * specifying NOT VALID in an ADD FOREIGN KEY command. + * We can skip this during table creation, when requested explicitly by + * specifying NOT VALID in an ADD FOREIGN KEY command, and when we're + * recreating a constraint following a SET DATA TYPE operation that did not + * impugn its validity. */ - if (!fkconstraint->skip_validation) + if (!old_check_ok && !fkconstraint->skip_validation) { NewConstraint *newcon; @@ -6330,6 +6426,35 @@ transformFkeyCheckAttrs(Relation pkrel, return indexoid; } +/* + * findFkeyCast - + * + * Wrapper around find_coercion_pathway() for ATAddForeignKeyConstraint(). + * Caller has equal regard for binary coercibility and for an exact match. +*/ +static CoercionPathType +findFkeyCast(Oid targetTypeId, Oid sourceTypeId, Oid *funcid) +{ + CoercionPathType ret; + + if (targetTypeId == sourceTypeId) + { + ret = COERCION_PATH_RELABELTYPE; + *funcid = InvalidOid; + } + else + { + ret = find_coercion_pathway(targetTypeId, sourceTypeId, + COERCION_IMPLICIT, funcid); + if (ret == COERCION_PATH_NONE) + /* A previously-relied-upon cast is now gone. */ + elog(ERROR, "could not find cast from %u to %u", + sourceTypeId, targetTypeId); + } + + return ret; +} + /* Permissions checks for ADD FOREIGN KEY */ static void checkFkeyPermissions(Relation rel, int16 *attnums, int natts) @@ -7717,6 +7842,7 @@ ATPostAlterTypeParse(Oid oldId, char *cmd, foreach(lcmd, stmt->cmds) { AlterTableCmd *cmd = (AlterTableCmd *) lfirst(lcmd); + Constraint *con; switch (cmd->subtype) { @@ -7730,6 +7856,12 @@ ATPostAlterTypeParse(Oid oldId, char *cmd, lappend(tab->subcmds[AT_PASS_OLD_INDEX], cmd); break; case AT_AddConstraint: + Assert(IsA(cmd->def, Constraint)); + con = (Constraint *) cmd->def; + /* rewriting neither side of a FK */ + if (con->contype == CONSTR_FOREIGN && + !rewrite && !tab->rewrite) + TryReuseForeignKey(oldId, con); tab->subcmds[AT_PASS_OLD_CONSTR] = lappend(tab->subcmds[AT_PASS_OLD_CONSTR], cmd); break; @@ -7751,7 +7883,7 @@ ATPostAlterTypeParse(Oid oldId, char *cmd, /* * Subroutine for ATPostAlterTypeParse(). Calls out to CheckIndexCompatible() * for the real analysis, then mutates the IndexStmt based on that verdict. -*/ + */ static void TryReuseIndex(Oid oldId, IndexStmt *stmt) { @@ -7768,6 +7900,50 @@ TryReuseIndex(Oid oldId, IndexStmt *stmt) } } +/* + * Subroutine for ATPostAlterTypeParse(). + * + * Stash the old P-F equality operator into the Constraint node, for possible + * use by ATAddForeignKeyConstraint() in determining whether revalidation of + * this constraint can be skipped. 
+ */ +static void +TryReuseForeignKey(Oid oldId, Constraint *con) +{ + HeapTuple tup; + Datum adatum; + bool isNull; + ArrayType *arr; + Oid *rawarr; + int numkeys; + int i; + + Assert(con->contype == CONSTR_FOREIGN); + Assert(con->old_conpfeqop == NIL); /* already prepared this node */ + + tup = SearchSysCache1(CONSTROID, ObjectIdGetDatum(oldId)); + if (!HeapTupleIsValid(tup)) /* should not happen */ + elog(ERROR, "cache lookup failed for constraint %u", oldId); + + adatum = SysCacheGetAttr(CONSTROID, tup, + Anum_pg_constraint_conpfeqop, &isNull); + if (isNull) + elog(ERROR, "null conpfeqop for constraint %u", oldId); + arr = DatumGetArrayTypeP(adatum); /* ensure not toasted */ + numkeys = ARR_DIMS(arr)[0]; + /* test follows the one in ri_FetchConstraintInfo() */ + if (ARR_NDIM(arr) != 1 || + ARR_HASNULL(arr) || + ARR_ELEMTYPE(arr) != OIDOID) + elog(ERROR, "conpfeqop is not a 1-D Oid array"); + rawarr = (Oid *) ARR_DATA_PTR(arr); + + /* stash a List of the operator Oids in our Constraint node */ + for (i = 0; i < numkeys; i++) + con->old_conpfeqop = lcons_oid(rawarr[i], con->old_conpfeqop); + + ReleaseSysCache(tup); +} /* * ALTER TABLE OWNER diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index cc3168d906670..7fec4dbf7b56c 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -2364,6 +2364,7 @@ _copyConstraint(const Constraint *from) COPY_SCALAR_FIELD(fk_matchtype); COPY_SCALAR_FIELD(fk_upd_action); COPY_SCALAR_FIELD(fk_del_action); + COPY_NODE_FIELD(old_conpfeqop); COPY_SCALAR_FIELD(skip_validation); COPY_SCALAR_FIELD(initially_valid); diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 2295195fabc36..d2a79eb851c8c 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2199,6 +2199,7 @@ _equalConstraint(const Constraint *a, const Constraint *b) COMPARE_SCALAR_FIELD(fk_matchtype); COMPARE_SCALAR_FIELD(fk_upd_action); COMPARE_SCALAR_FIELD(fk_del_action); + COMPARE_NODE_FIELD(old_conpfeqop); COMPARE_SCALAR_FIELD(skip_validation); COMPARE_SCALAR_FIELD(initially_valid); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 829f6d4f7b59c..25a215e9d71f7 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2626,6 +2626,7 @@ _outConstraint(StringInfo str, const Constraint *node) WRITE_CHAR_FIELD(fk_matchtype); WRITE_CHAR_FIELD(fk_upd_action); WRITE_CHAR_FIELD(fk_del_action); + WRITE_NODE_FIELD(old_conpfeqop); WRITE_BOOL_FIELD(skip_validation); WRITE_BOOL_FIELD(initially_valid); break; diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c index 03a974a7121b1..dd58f4efc8a37 100644 --- a/src/backend/utils/adt/ri_triggers.c +++ b/src/backend/utils/adt/ri_triggers.c @@ -3224,6 +3224,7 @@ ri_FetchConstraintInfo(RI_ConstraintInfo *riinfo, elog(ERROR, "null conpfeqop for constraint %u", constraintOid); arr = DatumGetArrayTypeP(adatum); /* ensure not toasted */ numkeys = ARR_DIMS(arr)[0]; + /* see TryReuseForeignKey if you change the test below */ if (ARR_NDIM(arr) != 1 || numkeys != riinfo->nkeys || numkeys > RI_MAX_NUMKEYS || diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 6100472d94a91..8451dfde040a3 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201202191 +#define CATALOG_VERSION_NO 201202271 #endif diff --git a/src/include/nodes/parsenodes.h 
b/src/include/nodes/parsenodes.h index 1d33cebc9b8ce..ab5563997d409 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -1552,6 +1552,7 @@ typedef struct Constraint char fk_matchtype; /* FULL, PARTIAL, UNSPECIFIED */ char fk_upd_action; /* ON UPDATE action */ char fk_del_action; /* ON DELETE action */ + List *old_conpfeqop; /* pg_constraint.conpfeqop of my former self */ /* Fields used for constraints that allow a NOT VALID specification */ bool skip_validation; /* skip validation of existing rows? */ From 41e3c94cac0e68257126b2d264dc5e877e892490 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Tue, 28 Feb 2012 01:06:29 -0300 Subject: [PATCH 057/129] psql: when tab-completing, use quotes on file names that need them psql backslash commands that deal with file or directory names require quotes around those that have spaces, single quotes, or backslashes. However, tab-completing such names does not provide said quotes, and is thus almost useless with them. This patch fixes the problem by having a wrapper function around rl_filename_completion_function that dequotes on input and quotes on output. This eases dealing with such names. Author: Noah Misch --- src/bin/psql/stringutils.c | 69 +++++++++++++++++++++++++++++++++++++ src/bin/psql/stringutils.h | 3 ++ src/bin/psql/tab-complete.c | 58 +++++++++++++++++++++++++++++-- 3 files changed, 128 insertions(+), 2 deletions(-) diff --git a/src/bin/psql/stringutils.c b/src/bin/psql/stringutils.c index 3b5ce1ba4bf3c..77387dcf3deaa 100644 --- a/src/bin/psql/stringutils.c +++ b/src/bin/psql/stringutils.c @@ -272,3 +272,72 @@ strip_quotes(char *source, char quote, char escape, int encoding) *dst = '\0'; } + + +/* + * quote_if_needed + * + * Opposite of strip_quotes(). If "source" denotes itself literally without + * quoting or escaping, returns NULL. Otherwise, returns a malloc'd copy with + * quoting and escaping applied: + * + * source - string to parse + * entails_quote - any of these present? need outer quotes + * quote - doubled within string, affixed to both ends + * escape - doubled within string + * encoding - the active character-set encoding + * + * Do not use this as a substitute for PQescapeStringConn(). Use it for + * strings to be parsed by strtokx() or psql_scan_slash_option(). 
+ */ +char * +quote_if_needed(const char *source, const char *entails_quote, + char quote, char escape, int encoding) +{ + const char *src; + char *ret; + char *dst; + bool need_quotes = false; + + psql_assert(source); + psql_assert(quote); + + src = source; + dst = ret = pg_malloc(2 * strlen(src) + 3); /* excess */ + + *dst++ = quote; + + while (*src) + { + char c = *src; + int i; + + if (c == quote) + { + need_quotes = true; + *dst++ = quote; + } + else if (c == escape) + { + need_quotes = true; + *dst++ = escape; + } + else if (strchr(entails_quote, c)) + need_quotes = true; + + i = PQmblen(src, encoding); + while (i--) + *dst++ = *src++; + } + + *dst++ = quote; + *dst = '\0'; + + if (!need_quotes) + { + free(ret); + ret = NULL; + } + + return ret; +} diff --git a/src/bin/psql/stringutils.h b/src/bin/psql/stringutils.h index c7c5f3877d92f..c64fc584585f0 100644 --- a/src/bin/psql/stringutils.h +++ b/src/bin/psql/stringutils.h @@ -19,4 +19,7 @@ extern char *strtokx(const char *s, bool del_quotes, int encoding); +extern char *quote_if_needed(const char *source, const char *entails_quote, + char quote, char escape, int encoding); + #endif /* STRINGUTILS_H */ diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index 3854f7f421fc8..6f481bb24dd40 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -680,6 +680,7 @@ static char *complete_from_list(const char *text, int state); static char *complete_from_const(const char *text, int state); static char **complete_from_variables(char *text, const char *prefix, const char *suffix); +static char *complete_from_files(const char *text, int state); static char *pg_strdup_same_case(const char *s, const char *ref); static PGresult *exec_query(const char *query); @@ -1630,7 +1631,10 @@ psql_completion(char *text, int start, int end) pg_strcasecmp(prev3_wd, "BINARY") == 0) && (pg_strcasecmp(prev_wd, "FROM") == 0 || pg_strcasecmp(prev_wd, "TO") == 0)) - matches = completion_matches(text, filename_completion_function); + { + completion_charp = ""; + matches = completion_matches(text, complete_from_files); + } /* Handle COPY|BINARY FROM|TO filename */ else if ((pg_strcasecmp(prev4_wd, "COPY") == 0 || @@ -2953,7 +2957,10 @@ psql_completion(char *text, int start, int end) strcmp(prev_wd, "\\s") == 0 || strcmp(prev_wd, "\\w") == 0 || strcmp(prev_wd, "\\write") == 0 ) - matches = completion_matches(text, filename_completion_function); + { + completion_charp = "\\"; + matches = completion_matches(text, complete_from_files); + } /* * Finally, we look through the list of "things", such as TABLE, INDEX and @@ -3426,6 +3433,53 @@ complete_from_variables(char *text, const char *prefix, const char *suffix) } +/* + * This function wraps rl_filename_completion_function() to strip quotes from + * the input before searching for matches and to quote any matches for which + * the consuming command will require it. + */ +static char * +complete_from_files(const char *text, int state) +{ + static const char *unquoted_text; + char *unquoted_match; + char *ret = NULL; + + if (state == 0) + { + /* Initialization: stash the unquoted input. 
*/ + unquoted_text = strtokx(text, "", NULL, "'", *completion_charp, + false, true, pset.encoding); + /* expect a NULL return for the empty string only */ + if (!unquoted_text) + { + psql_assert(!*text); + unquoted_text = text; + } + } + + unquoted_match = filename_completion_function(unquoted_text, state); + if (unquoted_match) + { + /* + * Caller sets completion_charp to a zero- or one-character string + * containing the escape character. This is necessary since \copy has + * no escape character, but every other backslash command recognizes + * "\" as an escape character. Since we have only two callers, don't + * bother providing a macro to simplify this. + */ + ret = quote_if_needed(unquoted_match, " \t\r\n\"`", + '\'', *completion_charp, pset.encoding); + if (ret) + free(unquoted_match); + else + ret = unquoted_match; + } + + return ret; +} + + /* HELPER FUNCTIONS */ From 973e9fb294dc05a384ecae7623923ae53cb81806 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 28 Feb 2012 12:42:08 +0200 Subject: [PATCH 058/129] Add const qualifiers where they are accidentally cast away This only produces warnings under -Wcast-qual, but it's more correct and consistent in any case. --- contrib/intarray/_intbig_gist.c | 2 +- contrib/ltree/ltree_gist.c | 4 ++-- src/backend/access/gist/gistproc.c | 12 ++++++------ src/backend/utils/adt/tsquery_util.c | 2 +- src/backend/utils/adt/xml.c | 2 +- src/backend/utils/misc/guc.c | 4 ++-- src/bin/psql/mbprint.c | 8 ++++---- src/include/c.h | 2 +- src/include/utils/pg_crc.h | 2 +- 9 files changed, 19 insertions(+), 19 deletions(-) diff --git a/contrib/intarray/_intbig_gist.c b/contrib/intarray/_intbig_gist.c index eb8f2826349a7..c6b00eaeff0da 100644 --- a/contrib/intarray/_intbig_gist.c +++ b/contrib/intarray/_intbig_gist.c @@ -332,7 +332,7 @@ typedef struct static int comparecost(const void *a, const void *b) { - return ((SPLITCOST *) a)->cost - ((SPLITCOST *) b)->cost; + return ((const SPLITCOST *) a)->cost - ((const SPLITCOST *) b)->cost; } diff --git a/contrib/ltree/ltree_gist.c b/contrib/ltree/ltree_gist.c index 1c0fa88c3af44..13d96656d2e14 100644 --- a/contrib/ltree/ltree_gist.c +++ b/contrib/ltree/ltree_gist.c @@ -275,8 +275,8 @@ static int treekey_cmp(const void *a, const void *b) { return ltree_compare( - ((RIX *) a)->r, - ((RIX *) b)->r + ((const RIX *) a)->r, + ((const RIX *) b)->r ); } diff --git a/src/backend/access/gist/gistproc.c b/src/backend/access/gist/gistproc.c index 41b84112961f5..7070890f20f29 100644 --- a/src/backend/access/gist/gistproc.c +++ b/src/backend/access/gist/gistproc.c @@ -284,8 +284,8 @@ typedef struct static int interval_cmp_lower(const void *i1, const void *i2) { - double lower1 = ((SplitInterval *) i1)->lower, - lower2 = ((SplitInterval *) i2)->lower; + double lower1 = ((const SplitInterval *) i1)->lower, + lower2 = ((const SplitInterval *) i2)->lower; if (lower1 < lower2) return -1; @@ -301,8 +301,8 @@ interval_cmp_lower(const void *i1, const void *i2) static int interval_cmp_upper(const void *i1, const void *i2) { - double upper1 = ((SplitInterval *) i1)->upper, - upper2 = ((SplitInterval *) i2)->upper; + double upper1 = ((const SplitInterval *) i1)->upper, + upper2 = ((const SplitInterval *) i2)->upper; if (upper1 < upper2) return -1; @@ -455,8 +455,8 @@ box_penalty(BOX *original, BOX *new) static int common_entry_cmp(const void *i1, const void *i2) { - double delta1 = ((CommonEntry *) i1)->delta, - delta2 = ((CommonEntry *) i2)->delta; + double delta1 = ((const CommonEntry *) i1)->delta, + delta2 = ((const 
CommonEntry *) i2)->delta; if (delta1 < delta2) return -1; diff --git a/src/backend/utils/adt/tsquery_util.c b/src/backend/utils/adt/tsquery_util.c index 946caa50db473..ae00f180b5dfc 100644 --- a/src/backend/utils/adt/tsquery_util.c +++ b/src/backend/utils/adt/tsquery_util.c @@ -134,7 +134,7 @@ QTNodeCompare(QTNode *an, QTNode *bn) static int cmpQTN(const void *a, const void *b) { - return QTNodeCompare(*(QTNode **) a, *(QTNode **) b); + return QTNodeCompare(*(QTNode * const *) a, *(QTNode * const *) b); } void diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index c0ca3f8cec4ae..d7b637c1c437c 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -311,7 +311,7 @@ xml_recv(PG_FUNCTION_ARGS) str = VARDATA(result); str[nbytes] = '\0'; - parse_xml_decl((xmlChar *) str, NULL, NULL, &encodingStr, NULL); + parse_xml_decl((const xmlChar *) str, NULL, NULL, &encodingStr, NULL); /* * If encoding wasn't explicitly specified in the XML header, treat it as diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index a5becbe8ff775..486bdcddef899 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -3780,8 +3780,8 @@ find_option(const char *name, bool create_placeholders, int elevel) static int guc_var_compare(const void *a, const void *b) { - struct config_generic *confa = *(struct config_generic **) a; - struct config_generic *confb = *(struct config_generic **) b; + const struct config_generic *confa = *(struct config_generic * const *) a; + const struct config_generic *confb = *(struct config_generic * const *) b; return guc_name_compare(confa->name, confb->name); } diff --git a/src/bin/psql/mbprint.c b/src/bin/psql/mbprint.c index 2904b39b965f7..32fc756efe5d1 100644 --- a/src/bin/psql/mbprint.c +++ b/src/bin/psql/mbprint.c @@ -217,10 +217,10 @@ pg_wcssize(const unsigned char *pwcs, size_t len, int encoding, for (; *pwcs && len > 0; pwcs += chlen) { - chlen = PQmblen((char *) pwcs, encoding); + chlen = PQmblen((const char *) pwcs, encoding); if (len < (size_t) chlen) break; - w = PQdsplen((char *) pwcs, encoding); + w = PQdsplen((const char *) pwcs, encoding); if (chlen == 1) /* single-byte char */ { @@ -298,10 +298,10 @@ pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding, for (; *pwcs && len > 0; pwcs += chlen) { - chlen = PQmblen((char *) pwcs, encoding); + chlen = PQmblen((const char *) pwcs, encoding); if (len < (size_t) chlen) break; - w = PQdsplen((char *) pwcs, encoding); + w = PQdsplen((const char *) pwcs, encoding); if (chlen == 1) /* single-byte char */ { diff --git a/src/include/c.h b/src/include/c.h index 7396adbaa7780..82acd14a9b470 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -478,7 +478,7 @@ typedef NameData *Name; * PointerIsValid * True iff pointer is valid. 
*/ -#define PointerIsValid(pointer) ((void*)(pointer) != NULL) +#define PointerIsValid(pointer) ((const void*)(pointer) != NULL) /* * PointerIsAligned diff --git a/src/include/utils/pg_crc.h b/src/include/utils/pg_crc.h index d5273d0f96c3d..0652c0ad3b8f9 100644 --- a/src/include/utils/pg_crc.h +++ b/src/include/utils/pg_crc.h @@ -40,7 +40,7 @@ typedef uint32 pg_crc32; /* Accumulate some (more) bytes into a CRC */ #define COMP_CRC32(crc, data, len) \ do { \ - unsigned char *__data = (unsigned char *) (data); \ + const unsigned char *__data = (const unsigned char *) (data); \ uint32 __len = (len); \ \ while (__len-- > 0) \ From 0140a11b9ba5b22e1e4807e178bca770d46c3e28 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 28 Feb 2012 18:10:40 -0500 Subject: [PATCH 059/129] Fix thinko in new match_join_clauses_to_index() logic. We don't need to constrain the other side of an indexable join clause to not be below an outer join; an example here is SELECT ... FROM t1 LEFT JOIN t2 ON t1.a = t2.b LEFT JOIN t3 ON t2.c = t3.d; We can consider an inner indexscan on t3.d using c = d as indexqual, even though t2.c is potentially nulled by a previous outer join. The comparable logic in orindxpath.c has always worked that way, but I was being overly cautious here. --- src/backend/optimizer/path/indxpath.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 82af494296532..2f088b797879f 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -1702,9 +1702,9 @@ match_join_clauses_to_index(PlannerInfo *root, * outer join rules. * * Instead of considering required_relids, we ignore clauses for which - * any referenced rel is in nullable_relids; that means there's an - * outer join below the clause and so it can't be checked at the - * relation scan level. + * the indexed rel is in nullable_relids; that means there's an outer + * join below the clause and so it can't be checked at the relation + * scan level. * * Note: unlike create_or_index_quals(), we can accept clauses that * are marked !is_pushed_down (ie they are themselves outer-join * could only be used in the inside of a nestloop join, which will be * the nullable side. */ - if (bms_overlap(rinfo->clause_relids, rinfo->nullable_relids)) + if (bms_overlap(rel->relids, rinfo->nullable_relids)) continue; /* Potentially usable, so see if it matches the index or is an OR */ From 5c02a00d440b90ead12658ce6ec9f4eee95dd0a3 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 28 Feb 2012 19:53:39 -0500 Subject: [PATCH 060/129] Move CRC tables to libpgport, and provide them in a separate include file. This makes it much more convenient to build tools for Postgres that are separately compiled and require a matching CRC implementation. To prevent multiple copies of the CRC polynomial tables being introduced into the postgres binaries, they are now included in the static library libpgport that is mainly meant for replacement system functions. That seems like a bit of a kludge, but there's no better place. This cleans up building of the tools pg_controldata and pg_resetxlog, which previously had to build their own copies of pg_crc.o. In the future, external programs that need access to the CRC tables can include the tables directly from the new header file pg_crc_tables.h.
Daniel Farina, reviewed by Abhijit Menon-Sen and Tom Lane --- src/backend/utils/hash/Makefile | 2 +- src/bin/pg_controldata/.gitignore | 2 -- src/bin/pg_controldata/Makefile | 7 ++----- src/bin/pg_resetxlog/.gitignore | 2 -- src/bin/pg_resetxlog/Makefile | 7 ++----- .../utils/pg_crc_tables.h} | 20 ++++++++++-------- src/port/Makefile | 4 ++-- src/port/pg_crc.c | 21 +++++++++++++++++++ src/tools/msvc/Project.pm | 4 ++-- 9 files changed, 41 insertions(+), 28 deletions(-) rename src/{backend/utils/hash/pg_crc.c => include/utils/pg_crc_tables.h} (98%) create mode 100644 src/port/pg_crc.c diff --git a/src/backend/utils/hash/Makefile b/src/backend/utils/hash/Makefile index 64eebd1d996c4..05d347c856301 100644 --- a/src/backend/utils/hash/Makefile +++ b/src/backend/utils/hash/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/utils/hash top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = dynahash.o hashfn.o pg_crc.o +OBJS = dynahash.o hashfn.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/bin/pg_controldata/.gitignore b/src/bin/pg_controldata/.gitignore index 32ea40181f5b4..eab0c28a8bb3f 100644 --- a/src/bin/pg_controldata/.gitignore +++ b/src/bin/pg_controldata/.gitignore @@ -1,3 +1 @@ -/pg_crc.c - /pg_controldata diff --git a/src/bin/pg_controldata/Makefile b/src/bin/pg_controldata/Makefile index 0eff84666da0b..b8a39dc1cdba8 100644 --- a/src/bin/pg_controldata/Makefile +++ b/src/bin/pg_controldata/Makefile @@ -15,16 +15,13 @@ subdir = src/bin/pg_controldata top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -OBJS= pg_controldata.o pg_crc.o $(WIN32RES) +OBJS= pg_controldata.o $(WIN32RES) all: pg_controldata pg_controldata: $(OBJS) | submake-libpgport $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) -pg_crc.c: $(top_srcdir)/src/backend/utils/hash/pg_crc.c - rm -f $@ && $(LN_S) $< . - install: all installdirs $(INSTALL_PROGRAM) pg_controldata$(X) '$(DESTDIR)$(bindir)/pg_controldata$(X)' @@ -35,4 +32,4 @@ uninstall: rm -f '$(DESTDIR)$(bindir)/pg_controldata$(X)' clean distclean maintainer-clean: - rm -f pg_controldata$(X) $(OBJS) pg_crc.c + rm -f pg_controldata$(X) $(OBJS) diff --git a/src/bin/pg_resetxlog/.gitignore b/src/bin/pg_resetxlog/.gitignore index 584590951fc6f..6b84208ee0c22 100644 --- a/src/bin/pg_resetxlog/.gitignore +++ b/src/bin/pg_resetxlog/.gitignore @@ -1,3 +1 @@ -/pg_crc.c - /pg_resetxlog diff --git a/src/bin/pg_resetxlog/Makefile b/src/bin/pg_resetxlog/Makefile index eb03b8a0b9198..0e2603558699e 100644 --- a/src/bin/pg_resetxlog/Makefile +++ b/src/bin/pg_resetxlog/Makefile @@ -15,16 +15,13 @@ subdir = src/bin/pg_resetxlog top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -OBJS= pg_resetxlog.o pg_crc.o $(WIN32RES) +OBJS= pg_resetxlog.o $(WIN32RES) all: pg_resetxlog pg_resetxlog: $(OBJS) | submake-libpgport $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) -pg_crc.c: $(top_srcdir)/src/backend/utils/hash/pg_crc.c - rm -f $@ && $(LN_S) $< . 
- install: all installdirs $(INSTALL_PROGRAM) pg_resetxlog$(X) '$(DESTDIR)$(bindir)/pg_resetxlog$(X)' @@ -35,4 +32,4 @@ uninstall: rm -f '$(DESTDIR)$(bindir)/pg_resetxlog$(X)' clean distclean maintainer-clean: - rm -f pg_resetxlog$(X) $(OBJS) pg_crc.c + rm -f pg_resetxlog$(X) $(OBJS) diff --git a/src/backend/utils/hash/pg_crc.c b/src/include/utils/pg_crc_tables.h similarity index 98% rename from src/backend/utils/hash/pg_crc.c rename to src/include/utils/pg_crc_tables.h index 596184b59af85..524410fffdfb6 100644 --- a/src/backend/utils/hash/pg_crc.c +++ b/src/include/utils/pg_crc_tables.h @@ -1,7 +1,11 @@ /*------------------------------------------------------------------------- * - * pg_crc.c - * PostgreSQL CRC support + * pg_crc_tables.h + * Polynomial lookup tables for CRC macros + * + * We make these tables available as a .h file so that programs not linked + * with libpgport can still use the macros in pg_crc.h. They just need + * to #include this header as well. * * See Ross Williams' excellent introduction * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from @@ -17,16 +21,12 @@ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * - * IDENTIFICATION - * src/backend/utils/hash/pg_crc.c + * src/include/utils/pg_crc_tables.h * *------------------------------------------------------------------------- */ - -/* Use c.h so that this file can be built in either frontend or backend */ -#include "c.h" - +#ifndef PG_CRC_TABLES_H +#define PG_CRC_TABLES_H /* * This table is based on the polynomial @@ -513,3 +513,5 @@ const uint64 pg_crc64_table[256] = { #endif /* SIZEOF_VOID_P < 8 */ #endif /* PROVIDE_64BIT_CRC */ + +#endif /* PG_CRC_TABLES_H */ diff --git a/src/port/Makefile b/src/port/Makefile index 1bf0963ba7888..4e3a8edd3a151 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -31,8 +31,8 @@ override CPPFLAGS := -I$(top_builddir)/src/port -DFRONTEND $(CPPFLAGS) LIBS += $(PTHREAD_LIBS) OBJS = $(LIBOBJS) chklocale.o dirmod.o erand48.o exec.o fls.o inet_net_ntop.o \ - noblock.o path.o pgcheckdir.o pgmkdirp.o pgsleep.o pgstrcasecmp.o \ - qsort.o qsort_arg.o sprompt.o thread.o + noblock.o path.o pgcheckdir.o pg_crc.o pgmkdirp.o pgsleep.o \ + pgstrcasecmp.o qsort.o qsort_arg.o sprompt.o thread.o # foo_srv.o and foo.o are both built from foo.c, but only foo.o has -DFRONTEND OBJS_SRV = $(OBJS:%.o=%_srv.o) diff --git a/src/port/pg_crc.c b/src/port/pg_crc.c new file mode 100644 index 0000000000000..ebf4f3a61a758 --- /dev/null +++ b/src/port/pg_crc.c @@ -0,0 +1,21 @@ +/*------------------------------------------------------------------------- + * + * pg_crc.c + * PostgreSQL CRC support + * + * This file simply #includes the CRC table definitions so that they are + * available to programs linked with libpgport. 
+ * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/port/pg_crc.c + * + *------------------------------------------------------------------------- + */ + +#include "c.h" + +#include "utils/pg_crc_tables.h" diff --git a/src/tools/msvc/Project.pm b/src/tools/msvc/Project.pm index 9db664a190e33..98db076e58c78 100644 --- a/src/tools/msvc/Project.pm +++ b/src/tools/msvc/Project.pm @@ -261,8 +261,8 @@ sub AddDir $mf =~ s{OBJS[^=]*=\s*(.*)$}{}m; } - # Match rules that pull in source files from different directories - # example: pg_crc.c: $(top_srcdir)/src/backend/utils/hash/pg_crc.c + # Match rules that pull in source files from different directories, eg + # pgstrcasecmp.c rint.c snprintf.c: % : $(top_srcdir)/src/port/% my $replace_re = qr{^([^:\n\$]+\.c)\s*:\s*(?:%\s*: )?\$(\([^\)]+\))\/(.*)\/[^\/]+$}m; while ($mf =~ m{$replace_re}m) { From 58e9f974dcfae7c4c856bfe382889d0d187eca404 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Tue, 28 Feb 2012 23:52:52 -0300 Subject: [PATCH 061/129] Fix typo in comment Haifeng Liu --- src/include/utils/json.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/utils/json.h b/src/include/utils/json.h index 415787b458d5d..fa0d4fcad74d4 100644 --- a/src/include/utils/json.h +++ b/src/include/utils/json.h @@ -26,4 +26,4 @@ extern Datum row_to_json(PG_FUNCTION_ARGS); extern Datum row_to_json_pretty(PG_FUNCTION_ARGS); extern void escape_json(StringInfo buf, const char *str); -#endif /* XML_H */ +#endif /* JSON_H */ From 8cae5810ebaaabb54171d9953bdd9cc802f0d135 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 29 Feb 2012 00:24:01 -0500 Subject: [PATCH 062/129] Fix MSVC builds for previous patch's addition of a src/port file. (And why in the world is this OBJS list not being scraped from the corresponding Makefile?) --- src/tools/msvc/Mkvcbuild.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm index 4c59a0b2f837c..64ec4080d08da 100644 --- a/src/tools/msvc/Mkvcbuild.pm +++ b/src/tools/msvc/Mkvcbuild.pm @@ -56,7 +56,7 @@ sub mkvcbuild chklocale.c crypt.c fls.c fseeko.c getrusage.c inet_aton.c random.c srandom.c getaddrinfo.c gettimeofday.c inet_net_ntop.c kill.c open.c erand48.c snprintf.c strlcat.c strlcpy.c dirmod.c exec.c noblock.c path.c - pgcheckdir.c pgmkdirp.c pgsleep.c pgstrcasecmp.c qsort.c qsort_arg.c + pgcheckdir.c pg_crc.c pgmkdirp.c pgsleep.c pgstrcasecmp.c qsort.c qsort_arg.c sprompt.c thread.c getopt.c getopt_long.c dirent.c rint.c win32env.c win32error.c win32setlocale.c); From d6a7271958e61fe8029087a34483437292f41f6f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 29 Feb 2012 15:22:49 +0200 Subject: [PATCH 063/129] Correctly detect SSI conflicts of prepared transactions after crash. A prepared transaction can get new conflicts in and out after preparing, so we cannot rely on the in- and out-flags stored in the statefile at prepare time. As a quick fix, make the conservative assumption that after a restart, all prepared transactions are considered to have both in- and out-conflicts. That can lead to unnecessary rollbacks after a crash, but that shouldn't be a big problem in practice; you don't want prepared transactions to hang around for a long time anyway.
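As an illustration of the case being handled, here is a SQL sketch (it assumes max_prepared_transactions > 0 and a hypothetical table "accounts"; both are assumptions for illustration only):

    BEGIN ISOLATION LEVEL SERIALIZABLE;
    SELECT balance FROM accounts WHERE id = 1;   -- takes a predicate lock
    UPDATE accounts SET balance = balance + 1 WHERE id = 2;
    PREPARE TRANSACTION 'ssi_demo';
    -- crash and restart here: the statefile's conflict flags may be stale,
    -- so recovery now assumes both in- and out-conflicts for 'ssi_demo',
    -- possibly rolling back concurrent serializable transactions that
    -- would otherwise have been allowed to commit.
    COMMIT PREPARED 'ssi_demo';   -- issued from a new session after restart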
Dan Ports --- src/backend/storage/lmgr/predicate.c | 31 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index 9e927f8564451..08fb69c32e5cc 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -4730,14 +4730,11 @@ AtPrepare_PredicateLocks(void) xactRecord->flags = MySerializableXact->flags; /* - * Tweak the flags. Since we're not going to output the inConflicts and - * outConflicts lists, if they're non-empty we'll represent that by - * setting the appropriate summary conflict flags. + * Note that we don't include the list of conflicts in our out in + * the statefile, because new conflicts can be added even after the + * transaction prepares. We'll just make a conservative assumption + * during recovery instead. */ - if (!SHMQueueEmpty(&MySerializableXact->inConflicts)) - xactRecord->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN; - if (!SHMQueueEmpty(&MySerializableXact->outConflicts)) - xactRecord->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT; RegisterTwoPhaseRecord(TWOPHASE_RM_PREDICATELOCK_ID, 0, &record, sizeof(record)); @@ -4872,15 +4869,6 @@ predicatelock_twophase_recover(TransactionId xid, uint16 info, sxact->SeqNo.lastCommitBeforeSnapshot = RecoverySerCommitSeqNo; - - /* - * We don't need the details of a prepared transaction's conflicts, - * just whether it had conflicts in or out (which we get from the - * flags) - */ - SHMQueueInit(&(sxact->outConflicts)); - SHMQueueInit(&(sxact->inConflicts)); - /* * Don't need to track this; no transactions running at the time the * recovered xact started are still active, except possibly other @@ -4902,6 +4890,17 @@ predicatelock_twophase_recover(TransactionId xid, uint16 info, (MaxBackends + max_prepared_xacts)); } + /* + * We don't know whether the transaction had any conflicts or + * not, so we'll conservatively assume that it had both a + * conflict in and a conflict out, and represent that with the + * summary conflict flags. + */ + SHMQueueInit(&(sxact->outConflicts)); + SHMQueueInit(&(sxact->inConflicts)); + sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN; + sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT; + /* Register the transaction's xid */ sxidtag.xid = xid; sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash, From a5c1a1969dd838189e5cc936c15cb40e13fb6d68 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 29 Feb 2012 12:11:10 -0500 Subject: [PATCH 064/129] Simplify references to backslash-doubling in func.sgml. Several places were still written as though standard_conforming_strings didn't exist, much less be the default. Now that it is on by default, we can simplify the text and just insert occasional notes suggesting that you might have to think harder if it's turned off. Per discussion of a suggestion from Hannes Frederic Sowa. Back-patch to 9.1 where standard_conforming_strings was made the default. --- doc/src/sgml/func.sgml | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index e8e637bf31bbd..8f6e2d04bdbf5 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -3368,8 +3368,8 @@ cast(-44 as bit(12)) 111111010100 LIKE pattern matching always covers the entire - string. Therefore, to match a sequence anywhere within a string, the - pattern must start and end with a percent sign. + string. 
Therefore, if it's desired to match a sequence anywhere within + a string, the pattern must start and end with a percent sign. @@ -3382,17 +3382,13 @@ cast(-44 as bit(12)) 111111010100 character itself, write two escape characters. - - Note that the backslash already has a special meaning in string literals, - so to write a pattern constant that contains a backslash you must write two - backslashes in an SQL statement (assuming escape string syntax is used, see - ). Thus, writing a pattern that - actually matches a literal backslash means writing four backslashes in the - statement. You can avoid this by selecting a different escape character - with ESCAPE; then a backslash is not special to - LIKE anymore. (But backslash is still special to the - string literal parser, so you still need two of them to match a backslash.) - + + + If you have standard_conforming_strings turned off, + any backslashes you write in literal string constants will need to be + doubled. See the section on escape string syntax for more information. + + It's also possible to select no escape character by writing @@ -3720,8 +3716,7 @@ substring('foobar' from 'o(.)b') o inserted, and it can contain \& to indicate that the substring matching the entire pattern should be inserted. Write \\ if you need to put a literal backslash in the replacement - text. (As always, remember to double backslashes written in literal - constant strings, assuming escape string syntax is used.) + text. The flags parameter is an optional text string containing zero or more single-letter flags that change the function's behavior. Flag i specifies case-insensitive @@ -4031,16 +4026,14 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', E'\\s*') AS foo;
- An RE cannot end with \. + An RE cannot end with a backslash (\). - Remember that the backslash (\) already has a special - meaning in PostgreSQL string literals. - To write a pattern constant that contains a backslash, - you must write two backslashes in the statement, assuming escape - string syntax is used (see ). + If you have standard_conforming_strings turned off, + any backslashes you write in literal string constants will need to be + doubled. See the section on escape string syntax for more information. @@ -5541,10 +5534,8 @@ SELECT SUBSTRING('XY1234Z', 'Y*?([0-9]{1,3})'); If you want to have a double quote in the output you must - precede it with a backslash, for example E'\\"YYYY - Month\\"'. - (Two backslashes are necessary because the backslash - has special meaning when using the escape string syntax.) + precede it with a backslash, for example '\"YYYY + Month\"'. From 3433c6ba002f711a60352c3518f30cda73d06087 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Tue, 28 Feb 2012 23:43:36 -0300 Subject: [PATCH 065/129] Remove TOAST table from pg_database The only toastable column now is datacl, but we don't really support long ACLs anyway. The TOAST table should have been removed when the pg_db_role_setting catalog was introduced in commit 2eda8dfb52ed9962920282d8384da8bb4c22514d, but I forgot to do that. Per -hackers discussion in March 2011. --- src/backend/catalog/catalog.c | 4 +--- src/include/catalog/catversion.h | 2 +- src/include/catalog/toasting.h | 3 --- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c index c1f1db9a3291f..2547f33552ea6 100644 --- a/src/backend/catalog/catalog.c +++ b/src/backend/catalog/catalog.c @@ -402,9 +402,7 @@ IsSharedRelation(Oid relationId) relationId == DbRoleSettingDatidRolidIndexId) return true; /* These are their toast tables and toast indexes (see toasting.h) */ - if (relationId == PgDatabaseToastTable || - relationId == PgDatabaseToastIndex || - relationId == PgShdescriptionToastTable || + if (relationId == PgShdescriptionToastTable || relationId == PgShdescriptionToastIndex || relationId == PgDbRoleSettingToastTable || relationId == PgDbRoleSettingToastIndex) return true; diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 8451dfde040a3..e5fbbfc8fda6a 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201202271 +#define CATALOG_VERSION_NO 201203011 #endif diff --git a/src/include/catalog/toasting.h b/src/include/catalog/toasting.h index a4f1718a2d807..22211cbe3f375 100644 --- a/src/include/catalog/toasting.h +++ b/src/include/catalog/toasting.h @@ -50,9 +50,6 @@ DECLARE_TOAST(pg_statistic, 2840, 2841); DECLARE_TOAST(pg_trigger, 2336, 2337); /* shared catalogs */ -DECLARE_TOAST(pg_database, 2844, 2845); -#define PgDatabaseToastTable 2844 -#define PgDatabaseToastIndex 2845 DECLARE_TOAST(pg_shdescription, 2846, 2847); #define PgShdescriptionToastTable 2846 #define PgShdescriptionToastIndex 2847 From 89c2f573a392e3995fffc619d4faed23f8649269 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 1 Mar 2012 19:58:10 +0200 Subject: [PATCH 066/129] psql: Improve error display for psql -f - Running "psql -f -" used to print psql:<stdin>:1: ERROR: blah but that got broken between 8.4 and 9.0 (commit b291c0fba83a1e93868e2f69c03be195d620f30c), and now it printed psql:-:1: ERROR: blah This reverts to the old behavior and cleans up some code that was left dead or useless by the mentioned commit.
--- src/bin/psql/command.c | 13 ++++++++----- src/bin/psql/startup.c | 2 -- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/bin/psql/command.c b/src/bin/psql/command.c index 8421ad008602d..aa000735dc5c8 100644 --- a/src/bin/psql/command.c +++ b/src/bin/psql/command.c @@ -2062,14 +2062,17 @@ process_file(char *filename, bool single_txn, bool use_relative_path) } fd = fopen(filename, PG_BINARY_R); + + if (!fd) + { + psql_error("%s: %s\n", filename, strerror(errno)); + return EXIT_FAILURE; + } } else - fd = stdin; - - if (!fd) { - psql_error("%s: %s\n", filename, strerror(errno)); - return EXIT_FAILURE; + fd = stdin; + filename = "<stdin>"; /* for future error messages */ } oldfilename = pset.inputfile; diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c index aff57728a2a67..166c227d6b6a8 100644 --- a/src/bin/psql/startup.c +++ b/src/bin/psql/startup.c @@ -313,8 +313,6 @@ main(int argc, char *argv[]) printf(_("Type \"help\" for help.\n\n")); if (!pset.notty) initializeInput(options.no_readline ? 0 : 1); - if (options.action_string) /* -f - was used */ - pset.inputfile = "<stdin>"; successResult = MainLoop(stdin); } From 36a1a8c33d0d400b246dec8395990725b98801b7 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 1 Mar 2012 20:50:36 +0200 Subject: [PATCH 067/129] Don't link pg_isolation_regress with libpq It's not necessary and can only create confusion about which libpq installation should be used. Also remove some dead code from the makefile that was apparently copied from elsewhere. --- src/test/isolation/Makefile | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/test/isolation/Makefile b/src/test/isolation/Makefile index 80a8a25bb3f13..0278ecb21fd32 100644 --- a/src/test/isolation/Makefile +++ b/src/test/isolation/Makefile @@ -9,12 +9,7 @@ include $(top_builddir)/src/Makefile.global # where to find psql for testing an existing installation PSQLDIR = $(bindir) -ifeq ($(PORTNAME), win32) -LDLIBS += -lws2_32 -endif - override CPPFLAGS := -I$(srcdir) -I$(libpq_srcdir) -I$(srcdir)/../regress $(CPPFLAGS) -override LDLIBS := $(libpq_pgport) $(LDLIBS) OBJS = specparse.o isolationtester.o @@ -25,12 +20,12 @@ pg_regress.o: | submake-regress rm -f $@ && $(LN_S) $(top_builddir)/src/test/regress/pg_regress.o . pg_isolation_regress: isolation_main.o pg_regress.o - $(CC) $(CFLAGS) $^ $(libpq_pgport) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) + $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) all: isolationtester pg_isolation_regress isolationtester: $(OBJS) | submake-libpq submake-libpgport - $(CC) $(CFLAGS) $(OBJS) $(libpq_pgport) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) + $(CC) $(CFLAGS) $^ $(libpq_pgport) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) distprep: specparse.c specscanner.c From bc8765e91c743d87f5658387b41e3a61cde54116 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 1 Mar 2012 21:16:24 +0200 Subject: [PATCH 068/129] Small possible clarification in pg_basebackup reference page The markup is not visible as distinct on man pages, which creates a bit of confusion when looking at the documentation of the pg_basebackup -l option. Rather than reinventing the entire font system for man pages to remedy this, just put some quotes around this particular case, which should also help in other output formats.
---
 doc/src/sgml/ref/pg_basebackup.sgml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml
index c654a364124c7..7e343e671b658 100644
--- a/doc/src/sgml/ref/pg_basebackup.sgml
+++ b/doc/src/sgml/ref/pg_basebackup.sgml
@@ -284,7 +284,7 @@ PostgreSQL documentation
         Sets the label for the backup. If none is specified, a default value of
-        <literal>pg_basebackup base backup</literal> will be used.
+        <quote><literal>pg_basebackup base backup</literal></quote> will be used.

From 2502f45979fca76a6b19a07c98d7a41737a3dc7b Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Fri, 2 Mar 2012 13:16:09 +0200
Subject: [PATCH 069/129] When a GiST page is split during index build, it
 might not have a buffer.

Previously it was thought that it's impossible as the code stands, because
insertions create buffers as tuples are cascaded downwards, and index split
also creates buffers eagerly for all halves.  But the example from Jay
Levitt demonstrates that it can happen, when the root page is split.  It's
in fact OK if the buffer doesn't exist, so we just need to remove the
sanity check.  In fact, we've been discussing the possibility of destroying
empty buffers to conserve memory, which would render the sanity check
completely useless anyway.

Fix by Alexander Korotkov
---
 src/backend/access/gist/gistbuildbuffers.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/backend/access/gist/gistbuildbuffers.c b/src/backend/access/gist/gistbuildbuffers.c
index 2a5f7b3422dc4..34a12bc79c09b 100644
--- a/src/backend/access/gist/gistbuildbuffers.c
+++ b/src/backend/access/gist/gistbuildbuffers.c
@@ -606,12 +606,8 @@ gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
 								HASH_FIND, &found);
 	if (!found)
 	{
-		/*
-		 * Node buffer should exist at this point. If it didn't exist before,
-		 * the insertion that caused the page to split should've created it.
-		 */
-		elog(ERROR, "node buffer of page being split (%u) does not exist",
-			 blocknum);
+		/* The page has no buffer, so we have nothing to do. */
+		return;
 	}
 
 	/*

From 8efb0bc57eb350bd991fd32c96e38a13bfe7f120 Mon Sep 17 00:00:00 2001
From: Magnus Hagander
Date: Sat, 25 Feb 2012 15:13:12 +0100
Subject: [PATCH 070/129] Add a rule to optionally build docs with the
 stylesheet from the website

For those of us who prefer the formatting of the docs using the
website stylesheets. Use "make STYLE=website draft" (for example) to use.

The stylesheet itself is referenced directly to the website, so there
is currently no copy of it stored in the source repository. Thus, docs
built with it will only look correct if the browser can access the
website when viewing them.
---
 doc/src/sgml/Makefile       | 3 +++
 doc/src/sgml/stylesheet.dsl | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/doc/src/sgml/Makefile b/doc/src/sgml/Makefile
index e6c8a49df8782..19e640b5d252b 100644
--- a/doc/src/sgml/Makefile
+++ b/doc/src/sgml/Makefile
@@ -89,6 +89,9 @@ man-stamp: stylesheet-man.xsl postgres.xml
 .PHONY: draft
 
 JADE.html.call = $(JADE) $(JADEFLAGS) $(SPFLAGS) $(SGMLINCLUDE) $(CATALOG) -d stylesheet.dsl -t sgml -i output-html
+ifeq ($(STYLE),website)
+JADE.html.call += -V website-stylesheet
+endif
 
 # The draft target creates HTML output in draft mode, without index (for faster build).
draft: postgres.sgml $(ALMOSTALLSGML) stylesheet.dsl diff --git a/doc/src/sgml/stylesheet.dsl b/doc/src/sgml/stylesheet.dsl index 232fa58e516a5..41796430850ec 100644 --- a/doc/src/sgml/stylesheet.dsl +++ b/doc/src/sgml/stylesheet.dsl @@ -29,6 +29,7 @@ (define draft-mode #f) +(define website-stylesheet #f) (define pgsql-docs-list "pgsql-docs@postgresql.org") @@ -190,7 +191,7 @@ (define %root-filename% "index") (define %link-mailto-url% (string-append "mailto:" pgsql-docs-list)) (define %use-id-as-filename% #t) -(define %stylesheet% "stylesheet.css") +(define %stylesheet% (if website-stylesheet "http://www.postgresql.org/media/css/docs.css" "stylesheet.css")) (define %graphic-default-extension% "gif") (define %gentext-nav-use-ff% #t) (define %body-attr% '()) From d41f510c807ce8b12c572196e2ae8f3817ac253a Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Fri, 2 Mar 2012 20:51:29 +0200 Subject: [PATCH 071/129] ecpg: Clean up some const usage --- src/interfaces/ecpg/ecpglib/execute.c | 12 ++++++------ src/interfaces/ecpg/ecpglib/prepare.c | 4 ++-- src/interfaces/ecpg/preproc/descriptor.c | 10 +++++----- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/interfaces/ecpg/ecpglib/execute.c b/src/interfaces/ecpg/ecpglib/execute.c index f468147b29552..311bc5cbc50a7 100644 --- a/src/interfaces/ecpg/ecpglib/execute.c +++ b/src/interfaces/ecpg/ecpglib/execute.c @@ -1083,7 +1083,7 @@ ecpg_store_input(const int lineno, const bool force_indicator, const struct vari } static void -free_params(const char **paramValues, int nParams, bool print, int lineno) +free_params(char **paramValues, int nParams, bool print, int lineno) { int n; @@ -1091,7 +1091,7 @@ free_params(const char **paramValues, int nParams, bool print, int lineno) { if (print) ecpg_log("free_params on line %d: parameter %d = %s\n", lineno, n + 1, paramValues[n] ? 
paramValues[n] : "null"); - ecpg_free((void *) (paramValues[n])); + ecpg_free(paramValues[n]); } ecpg_free(paramValues); } @@ -1138,7 +1138,7 @@ ecpg_execute(struct statement * stmt) PGnotify *notify; struct variable *var; int desc_counter = 0; - const char **paramValues = NULL; + char **paramValues = NULL; int nParams = 0; int position = 0; struct sqlca_t *sqlca = ECPGget_sqlca(); @@ -1380,7 +1380,7 @@ ecpg_execute(struct statement * stmt) else { nParams++; - if (!(paramValues = (const char **) ecpg_realloc(paramValues, sizeof(const char *) * nParams, stmt->lineno))) + if (!(paramValues = (char **) ecpg_realloc(paramValues, sizeof(char *) * nParams, stmt->lineno))) { ecpg_free(paramValues); return false; @@ -1441,7 +1441,7 @@ ecpg_execute(struct statement * stmt) ecpg_log("ecpg_execute on line %d: query: %s; with %d parameter(s) on connection %s\n", stmt->lineno, stmt->command, nParams, stmt->connection->name); if (stmt->statement_type == ECPGst_execute) { - results = PQexecPrepared(stmt->connection->connection, stmt->name, nParams, paramValues, NULL, NULL, 0); + results = PQexecPrepared(stmt->connection->connection, stmt->name, nParams, (const char *const*) paramValues, NULL, NULL, 0); ecpg_log("ecpg_execute on line %d: using PQexecPrepared for \"%s\"\n", stmt->lineno, stmt->command); } else @@ -1453,7 +1453,7 @@ ecpg_execute(struct statement * stmt) } else { - results = PQexecParams(stmt->connection->connection, stmt->command, nParams, NULL, paramValues, NULL, NULL, 0); + results = PQexecParams(stmt->connection->connection, stmt->command, nParams, NULL, (const char *const*) paramValues, NULL, NULL, 0); ecpg_log("ecpg_execute on line %d: using PQexecParams\n", stmt->lineno); } } diff --git a/src/interfaces/ecpg/ecpglib/prepare.c b/src/interfaces/ecpg/ecpglib/prepare.c index c5a554e8e282e..1bddf215afe93 100644 --- a/src/interfaces/ecpg/ecpglib/prepare.c +++ b/src/interfaces/ecpg/ecpglib/prepare.c @@ -19,7 +19,7 @@ typedef struct char stmtID[STMTID_SIZE]; char *ecpgQuery; long execs; /* # of executions */ - char *connection; /* connection for the statement */ + const char *connection; /* connection for the statement */ } stmtCacheEntry; static int nextStmtID = 1; @@ -456,7 +456,7 @@ AddStmtToCache(int lineno, /* line # of statement */ entry = &stmtCacheEntries[entNo]; entry->lineno = lineno; entry->ecpgQuery = ecpg_strdup(ecpgQuery, lineno); - entry->connection = (char *) connection; + entry->connection = connection; entry->execs = 0; memcpy(entry->stmtID, stmtID, sizeof(entry->stmtID)); diff --git a/src/interfaces/ecpg/preproc/descriptor.c b/src/interfaces/ecpg/preproc/descriptor.c index 52865293f24a5..115cb17ddc1c2 100644 --- a/src/interfaces/ecpg/preproc/descriptor.c +++ b/src/interfaces/ecpg/preproc/descriptor.c @@ -317,14 +317,14 @@ struct variable * descriptor_variable(const char *name, int input) { static char descriptor_names[2][MAX_DESCRIPTOR_NAMELEN]; - static const struct ECPGtype descriptor_type = {ECPGt_descriptor, NULL, NULL, NULL, {NULL}, 0}; - static const struct variable varspace[2] = { - {descriptor_names[0], (struct ECPGtype *) & descriptor_type, 0, NULL}, - {descriptor_names[1], (struct ECPGtype *) & descriptor_type, 0, NULL} + static struct ECPGtype descriptor_type = {ECPGt_descriptor, NULL, NULL, NULL, {NULL}, 0}; + static struct variable varspace[2] = { + {descriptor_names[0], &descriptor_type, 0, NULL}, + {descriptor_names[1], &descriptor_type, 0, NULL} }; strlcpy(descriptor_names[input], name, sizeof(descriptor_names[input])); - return (struct variable *) & 
varspace[input]; + return &varspace[input]; } struct variable * From 6688d2878e516314418274ee95c5c30412351933 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Fri, 2 Mar 2012 21:12:16 +0200 Subject: [PATCH 072/129] Add COLLATION FOR expression reviewed by Jaime Casanova --- doc/src/sgml/func.sgml | 30 +++++++++++++++++++++++++++ src/backend/parser/gram.y | 15 +++++++++++++- src/backend/utils/adt/misc.c | 27 ++++++++++++++++++++++++ src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_proc.h | 2 ++ src/include/parser/kwlist.h | 2 +- src/include/utils/builtins.h | 1 + src/test/regress/expected/collate.out | 20 ++++++++++++++++++ src/test/regress/sql/collate.sql | 7 +++++++ 9 files changed, 103 insertions(+), 3 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 8f6e2d04bdbf5..5c1cff3618d52 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -13698,6 +13698,10 @@ SELECT pg_type_is_visible('myschema.widget'::regtype); pg_typeof + + collation for + + lists functions that extract information from the system catalogs. @@ -13859,6 +13863,11 @@ SELECT pg_type_is_visible('myschema.widget'::regtype); regtype get the data type of any value
+ + collation for (any) + text + get the collation of the argument + @@ -13983,6 +13992,27 @@ SELECT typlen FROM pg_type WHERE oid = pg_typeof(33); 4 (1 row) + + + + The expression collation for returns the collation of the + value that is passed to it. Example: + +SELECT collation for (description) FROM pg_description LIMIT 1; + pg_collation_for +------------------ + "default" +(1 row) + +SELECT collation for ('foo' COLLATE "de_DE"); + pg_collation_for +------------------ + "de_DE" +(1 row) + + The value might be quoted and schema-qualified. If no collation is derived + for the argument expression, then a null value is returned. If the argument + is not of a collatable data type, then an error is raised. diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index d1ce2ab0422cc..9aea2cd80b5b0 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -10701,6 +10701,19 @@ func_expr: func_name '(' ')' over_clause n->location = @1; $$ = (Node *)n; } + | COLLATION FOR '(' a_expr ')' + { + FuncCall *n = makeNode(FuncCall); + n->funcname = SystemFuncName("pg_collation_for"); + n->args = list_make1($4); + n->agg_order = NIL; + n->agg_star = FALSE; + n->agg_distinct = FALSE; + n->func_variadic = FALSE; + n->over = NULL; + n->location = @1; + $$ = (Node *)n; + } | CURRENT_DATE { /* @@ -12152,7 +12165,6 @@ unreserved_keyword: | CLASS | CLOSE | CLUSTER - | COLLATION | COMMENT | COMMENTS | COMMIT @@ -12491,6 +12503,7 @@ reserved_keyword: | CAST | CHECK | COLLATE + | COLLATION | COLUMN | CONSTRAINT | CREATE diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c index 3de6a5c992345..6a1b47714722c 100644 --- a/src/backend/utils/adt/misc.c +++ b/src/backend/utils/adt/misc.c @@ -32,6 +32,7 @@ #include "storage/pmsignal.h" #include "storage/proc.h" #include "storage/procarray.h" +#include "utils/lsyscache.h" #include "tcop/tcopprot.h" #include "utils/builtins.h" #include "utils/timestamp.h" @@ -492,3 +493,29 @@ pg_typeof(PG_FUNCTION_ARGS) { PG_RETURN_OID(get_fn_expr_argtype(fcinfo->flinfo, 0)); } + + +/* + * Implementation of the COLLATE FOR expression; returns the collation + * of the argument. 
+ */ +Datum +pg_collation_for(PG_FUNCTION_ARGS) +{ + Oid typeid; + Oid collid; + + typeid = get_fn_expr_argtype(fcinfo->flinfo, 0); + if (!typeid) + PG_RETURN_NULL(); + if (!type_is_collatable(typeid) && typeid != UNKNOWNOID) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("collations are not supported by type %s", + format_type_be(typeid)))); + + collid = PG_GET_COLLATION(); + if (!collid) + PG_RETURN_NULL(); + PG_RETURN_TEXT_P(cstring_to_text(generate_collation_name(collid))); +} diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index e5fbbfc8fda6a..03353471559e6 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201203011 +#define CATALOG_VERSION_NO 201203021 #endif diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 8700d0d958a26..b476d47579063 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -1953,6 +1953,8 @@ DESCR("convert generic options array to name/value table"); DATA(insert OID = 1619 ( pg_typeof PGNSP PGUID 12 1 0 0 0 f f f f f f s 1 0 2206 "2276" _null_ _null_ _null_ _null_ pg_typeof _null_ _null_ _null_ )); DESCR("type of the argument"); +DATA(insert OID = 3162 ( pg_collation_for PGNSP PGUID 12 1 0 0 0 f f f f f f s 1 0 25 "2276" _null_ _null_ _null_ _null_ pg_collation_for _null_ _null_ _null_ )); +DESCR("collation of the argument; implementation of the COLLATION FOR expression"); /* Deferrable unique constraint trigger */ DATA(insert OID = 1250 ( unique_key_recheck PGNSP PGUID 12 1 0 0 0 f f f f t f v 0 0 2279 "" _null_ _null_ _null_ _null_ unique_key_recheck _null_ _null_ _null_ )); diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index bf71ee541485a..9f6f6d354f3c0 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -79,7 +79,7 @@ PG_KEYWORD("close", CLOSE, UNRESERVED_KEYWORD) PG_KEYWORD("cluster", CLUSTER, UNRESERVED_KEYWORD) PG_KEYWORD("coalesce", COALESCE, COL_NAME_KEYWORD) PG_KEYWORD("collate", COLLATE, RESERVED_KEYWORD) -PG_KEYWORD("collation", COLLATION, UNRESERVED_KEYWORD) +PG_KEYWORD("collation", COLLATION, RESERVED_KEYWORD) PG_KEYWORD("column", COLUMN, RESERVED_KEYWORD) PG_KEYWORD("comment", COMMENT, UNRESERVED_KEYWORD) PG_KEYWORD("comments", COMMENTS, UNRESERVED_KEYWORD) diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index fe253bcc7cdf8..9fda7ad28ead2 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -480,6 +480,7 @@ extern Datum pg_rotate_logfile(PG_FUNCTION_ARGS); extern Datum pg_sleep(PG_FUNCTION_ARGS); extern Datum pg_get_keywords(PG_FUNCTION_ARGS); extern Datum pg_typeof(PG_FUNCTION_ARGS); +extern Datum pg_collation_for(PG_FUNCTION_ARGS); /* oid.c */ extern Datum oidin(PG_FUNCTION_ARGS); diff --git a/src/test/regress/expected/collate.out b/src/test/regress/expected/collate.out index a15e6911b0af9..81ac6de7b3154 100644 --- a/src/test/regress/expected/collate.out +++ b/src/test/regress/expected/collate.out @@ -577,6 +577,26 @@ RESET enable_nestloop; -- 9.1 bug with useless COLLATE in an expression subject to length coercion CREATE TEMP TABLE vctable (f1 varchar(25)); INSERT INTO vctable VALUES ('foo' COLLATE "C"); +SELECT collation for ('foo'); -- unknown type - null + pg_collation_for +------------------ + +(1 row) + +SELECT collation for ('foo'::text); + pg_collation_for +------------------ + "default" +(1 row) + +SELECT collation for ((SELECT a FROM 
collate_test1 LIMIT 1)); -- non-collatable type - error +ERROR: collations are not supported by type integer +SELECT collation for ((SELECT b FROM collate_test1 LIMIT 1)); + pg_collation_for +------------------ + "C" +(1 row) + -- -- Clean up. Many of these table names will be re-used if the user is -- trying to run any platform-specific collation tests later, so we diff --git a/src/test/regress/sql/collate.sql b/src/test/regress/sql/collate.sql index f72f3edb9d231..3c960e7ed9320 100644 --- a/src/test/regress/sql/collate.sql +++ b/src/test/regress/sql/collate.sql @@ -219,6 +219,13 @@ RESET enable_nestloop; CREATE TEMP TABLE vctable (f1 varchar(25)); INSERT INTO vctable VALUES ('foo' COLLATE "C"); + +SELECT collation for ('foo'); -- unknown type - null +SELECT collation for ('foo'::text); +SELECT collation for ((SELECT a FROM collate_test1 LIMIT 1)); -- non-collatable type - error +SELECT collation for ((SELECT b FROM collate_test1 LIMIT 1)); + + -- -- Clean up. Many of these table names will be re-used if the user is -- trying to run any platform-specific collation tests later, so we From 44634e474fcb9dcd92b16fe3a0fb1d8a91e69353 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 2 Mar 2012 14:28:46 -0500 Subject: [PATCH 073/129] Allow child-relation entries to be made in ec_has_const EquivalenceClasses. This fixes an oversight in commit 11cad29c91524aac1d0b61e0ea0357398ab79bf8, which introduced MergeAppend plans. Before that happened, we never particularly cared about the sort ordering of scans of inheritance child relations, since appending their outputs together would destroy any ordering anyway. But now it's important to be able to match child relation sort orderings to those of the surrounding query. The original coding of add_child_rel_equivalences skipped ec_has_const EquivalenceClasses, on the originally-correct grounds that adding child expressions to them was useless. The effect of this is that when a parent variable is equated to a constant, we can't recognize that index columns on the equivalent child variables are not sort-significant; that is, we can't recognize that a child index on, say, (x, y) is able to generate output in "ORDER BY y" order when there is a clause "WHERE x = constant". Adding child expressions to the (x, constant) EquivalenceClass fixes this, without any downside that I can see other than a few more planner cycles expended on such queries. Per recent gripe from Robert McGehee. Back-patch to 9.1 where MergeAppend was introduced. --- src/backend/optimizer/path/equivclass.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c index 9228f82920165..b653d6cb35ca3 100644 --- a/src/backend/optimizer/path/equivclass.c +++ b/src/backend/optimizer/path/equivclass.c @@ -1785,14 +1785,11 @@ add_child_rel_equivalences(PlannerInfo *root, ListCell *lc2; /* - * If this EC contains a constant, then it's not useful for sorting or - * driving an inner index-scan, so we skip generating child EMs. - * * If this EC contains a volatile expression, then generating child - * EMs would be downright dangerous. We rely on a volatile EC having - * only one EM. + * EMs would be downright dangerous, so skip it. We rely on a + * volatile EC having only one EM. 
*/ - if (cur_ec->ec_has_const || cur_ec->ec_has_volatile) + if (cur_ec->ec_has_volatile) continue; /* No point in searching if parent rel not mentioned in eclass */ From 8e5f4300fdcadac1bfd72a7d1a0225030226c800 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Fri, 2 Mar 2012 22:09:10 +0200 Subject: [PATCH 074/129] Re-add "make check" target in src/test/isolation/Makefile This effectively reverts 7886cc73ad12fb9b5a729b6c8152f11a309f5d65, which was done under the impression that isolationtester needs libpq, which it no longer does (and never really did). --- src/test/isolation/Makefile | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/test/isolation/Makefile b/src/test/isolation/Makefile index 0278ecb21fd32..74baed52f0a35 100644 --- a/src/test/isolation/Makefile +++ b/src/test/isolation/Makefile @@ -70,10 +70,5 @@ maintainer-clean: distclean installcheck: all ./pg_isolation_regress --psqldir='$(PSQLDIR)' --inputdir=$(srcdir) --schedule=$(srcdir)/isolation_schedule -# We can't support "make check" because isolationtester requires libpq, and -# in fact (on typical platforms using shared libraries) requires libpq to -# already be installed. You could run "make install" and then run a check -# using a temp installation, but there seems little point in that. -check: - @echo "'make check' is not supported." - @echo "Install PostgreSQL, then 'make installcheck' instead." +check: all + ./pg_isolation_regress --temp-install=./tmp_check --inputdir=$(srcdir) --top-builddir=$(top_builddir) --schedule=$(srcdir)/isolation_schedule From d923125b77c5d698bb8107a533a21627582baa43 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Fri, 2 Mar 2012 22:30:01 +0200 Subject: [PATCH 075/129] Fix incorrect uses of gzFile gzFile is already a pointer, so code like gzFile *handle = gzopen(...) is wrong. This used to pass silently because gzFile used to be defined as void*, and you can assign a void* to a void**. But somewhere between zlib versions 1.2.3.4 and 1.2.6, the definition of gzFile was changed to struct gzFile_s *, and with that new definition this usage causes compiler warnings. So remove all those extra pointer decorations. There is a related issue in pg_backup_archiver.h, where FILE *FH; /* General purpose file handle */ is used throughout pg_dump as sometimes a real FILE* and sometimes a gzFile handle, which also causes warnings now. This is not yet fixed here, because it might need more code restructuring. 
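For illustration only (a minimal sketch, not part of the patch; the file
name is hypothetical), the wrong and corrected declarations look like this
under zlib 1.2.6's struct-based typedef:

    #include <zlib.h>

    int
    main(void)
    {
        /*
         * Wrong: gzFile *fh = gzopen("demo.gz", "rb");
         * The extra level of indirection compiled silently only while
         * gzFile was typedef'd as void *, since a void * result can be
         * assigned to a void ** variable without a warning.
         */
        gzFile      fh = gzopen("demo.gz", "rb");   /* gzFile is already a pointer */

        if (fh == NULL)
            return 1;
        gzclose(fh);
        return 0;
    }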
--- src/bin/pg_basebackup/pg_basebackup.c | 4 ++-- src/bin/pg_dump/pg_backup_files.c | 2 +- src/bin/pg_dump/pg_backup_tar.c | 11 ++++------- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index b39d2e7bf3807..bf88726f326a0 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -82,7 +82,7 @@ static bool segment_callback(XLogRecPtr segendpos, uint32 timeline); #ifdef HAVE_LIBZ static const char * -get_gz_error(gzFile *gzf) +get_gz_error(gzFile gzf) { int errnum; const char *errmsg; @@ -450,7 +450,7 @@ ReceiveTarFile(PGconn *conn, PGresult *res, int rownum) FILE *tarfile = NULL; #ifdef HAVE_LIBZ - gzFile *ztarfile = NULL; + gzFile ztarfile = NULL; #endif if (PQgetisnull(res, rownum, 0)) diff --git a/src/bin/pg_dump/pg_backup_files.c b/src/bin/pg_dump/pg_backup_files.c index 71bace0eab71f..a7fd91d1c53a7 100644 --- a/src/bin/pg_dump/pg_backup_files.c +++ b/src/bin/pg_dump/pg_backup_files.c @@ -60,7 +60,7 @@ typedef struct typedef struct { #ifdef HAVE_LIBZ - gzFile *FH; + gzFile FH; #else FILE *FH; #endif diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c index 4952f5a15d335..4823edec7fea0 100644 --- a/src/bin/pg_dump/pg_backup_tar.c +++ b/src/bin/pg_dump/pg_backup_tar.c @@ -58,16 +58,13 @@ static void _EndBlobs(ArchiveHandle *AH, TocEntry *te); #define K_STD_BUF_SIZE 1024 +typedef struct +{ #ifdef HAVE_LIBZ - /* typedef gzFile ThingFile; */ -typedef FILE ThingFile; + gzFile zFH; #else -typedef FILE ThingFile; + FILE *zFH; #endif - -typedef struct -{ - ThingFile *zFH; FILE *nFH; FILE *tarFH; FILE *tmpFH; From b59ca98209d45f5689fe9de22a7429d4cf09d40c Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Sat, 3 Mar 2012 16:03:05 +0200 Subject: [PATCH 076/129] Allow CREATE TABLE (LIKE ...) from composite type The only reason this didn't work before was that parserOpenTable() rejects composite types. So use relation_openrv() directly and manually do the errposition() setup that parserOpenTable() does. --- doc/src/sgml/ref/create_table.sgml | 2 +- src/backend/parser/parse_utilcmd.c | 34 ++++++++++++++----- .../regress/expected/create_table_like.out | 14 ++++---- src/test/regress/sql/create_table_like.sql | 4 ++- 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index f55a0010de50f..bb93214210238 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -370,7 +370,7 @@ CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXI The LIKE clause can also be used to copy columns from - views or foreign tables. Inapplicable options (e.g., INCLUDING + views, foreign tables, or composite types. Inapplicable options (e.g., INCLUDING INDEXES from a view) are ignored. 
diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index f1a108a9828ab..43f5634d16c69 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -636,26 +636,42 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla TupleConstr *constr; AclResult aclresult; char *comment; + ParseCallbackState pcbstate; - relation = parserOpenTable(cxt->pstate, table_like_clause->relation, - AccessShareLock); + setup_parser_errposition_callback(&pcbstate, cxt->pstate, table_like_clause->relation->location); + + relation = relation_openrv(table_like_clause->relation, AccessShareLock); if (relation->rd_rel->relkind != RELKIND_RELATION && relation->rd_rel->relkind != RELKIND_VIEW - && relation->rd_rel->relkind != RELKIND_FOREIGN_TABLE) + && relation->rd_rel->relkind != RELKIND_FOREIGN_TABLE + && relation->rd_rel->relkind != RELKIND_COMPOSITE_TYPE) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("LIKE source relation \"%s\" is not a table, view, or foreign table", + errmsg("\"%s\" is not a table, view, composite type, or foreign table", table_like_clause->relation->relname))); + cancel_parser_errposition_callback(&pcbstate); + /* - * Check for SELECT privileges + * Check for privileges */ - aclresult = pg_class_aclcheck(RelationGetRelid(relation), GetUserId(), + if (relation->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + { + aclresult = pg_type_aclcheck(relation->rd_rel->reltype, GetUserId(), + ACL_USAGE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, ACL_KIND_TYPE, + RelationGetRelationName(relation)); + } + else + { + aclresult = pg_class_aclcheck(RelationGetRelid(relation), GetUserId(), ACL_SELECT); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, ACL_KIND_CLASS, - RelationGetRelationName(relation)); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, ACL_KIND_CLASS, + RelationGetRelationName(relation)); + } tupleDesc = RelationGetDescr(relation); constr = tupleDesc->constr; diff --git a/src/test/regress/expected/create_table_like.out b/src/test/regress/expected/create_table_like.out index 40b6766892a14..8bec55c3ca3f9 100644 --- a/src/test/regress/expected/create_table_like.out +++ b/src/test/regress/expected/create_table_like.out @@ -8,6 +8,10 @@ CREATE TABLE inhx (xx text DEFAULT 'text'); */ CREATE TABLE ctla (aa TEXT); CREATE TABLE ctlb (bb TEXT) INHERITS (ctla); +CREATE TABLE foo (LIKE nonexistent); +ERROR: relation "nonexistent" does not exist +LINE 1: CREATE TABLE foo (LIKE nonexistent); + ^ CREATE TABLE inhe (ee text, LIKE inhx) inherits (ctlb); INSERT INTO inhe VALUES ('ee-col1', 'ee-col2', DEFAULT, 'ee-col4'); SELECT * FROM inhe; /* Columns aa, bb, xx value NULL, ee */ @@ -224,18 +228,16 @@ NOTICE: drop cascades to table inhe CREATE TABLE ctlt4 (a int, b text); CREATE SEQUENCE ctlseq1; CREATE TABLE ctlt10 (LIKE ctlseq1); -- fail -ERROR: LIKE source relation "ctlseq1" is not a table, view, or foreign table +ERROR: "ctlseq1" is not a table, view, composite type, or foreign table +LINE 1: CREATE TABLE ctlt10 (LIKE ctlseq1); + ^ CREATE VIEW ctlv1 AS SELECT * FROM ctlt4; CREATE TABLE ctlt11 (LIKE ctlv1); CREATE TABLE ctlt11a (LIKE ctlv1 INCLUDING ALL); CREATE TYPE ctlty1 AS (a int, b text); -CREATE TABLE ctlt12 (LIKE ctlty1); -- currently fails -ERROR: "ctlty1" is a composite type -LINE 1: CREATE TABLE ctlt12 (LIKE ctlty1); - ^ +CREATE TABLE ctlt12 (LIKE ctlty1); DROP SEQUENCE ctlseq1; DROP TYPE ctlty1; DROP VIEW ctlv1; DROP TABLE IF EXISTS ctlt4, ctlt10, ctlt11, 
ctlt11a, ctlt12; NOTICE: table "ctlt10" does not exist, skipping -NOTICE: table "ctlt12" does not exist, skipping diff --git a/src/test/regress/sql/create_table_like.sql b/src/test/regress/sql/create_table_like.sql index db66e48d45713..2d017bc02b5b1 100644 --- a/src/test/regress/sql/create_table_like.sql +++ b/src/test/regress/sql/create_table_like.sql @@ -10,6 +10,8 @@ CREATE TABLE inhx (xx text DEFAULT 'text'); CREATE TABLE ctla (aa TEXT); CREATE TABLE ctlb (bb TEXT) INHERITS (ctla); +CREATE TABLE foo (LIKE nonexistent); + CREATE TABLE inhe (ee text, LIKE inhx) inherits (ctlb); INSERT INTO inhe VALUES ('ee-col1', 'ee-col2', DEFAULT, 'ee-col4'); SELECT * FROM inhe; /* Columns aa, bb, xx value NULL, ee */ @@ -111,7 +113,7 @@ CREATE TABLE ctlt11 (LIKE ctlv1); CREATE TABLE ctlt11a (LIKE ctlv1 INCLUDING ALL); CREATE TYPE ctlty1 AS (a int, b text); -CREATE TABLE ctlt12 (LIKE ctlty1); -- currently fails +CREATE TABLE ctlt12 (LIKE ctlty1); DROP SEQUENCE ctlseq1; DROP TYPE ctlty1; From 34c978442c55dd13a3a8c6b90fd4380dad02f3da Mon Sep 17 00:00:00 2001 From: Andrew Dunstan Date: Sat, 3 Mar 2012 16:39:26 -0500 Subject: [PATCH 077/129] Provide environment overrides for psql file locations. PSQL_HISTORY provides an alternative for the command history file, and PSQLRC provides an alternative location for the .psqlrc file. --- doc/src/sgml/ref/psql-ref.sgml | 30 ++++++++++++++++++++++++++++++ src/bin/psql/input.c | 9 +++++++++ src/bin/psql/startup.c | 10 +++++++++- 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/doc/src/sgml/ref/psql-ref.sgml b/doc/src/sgml/ref/psql-ref.sgml index 55aa5f2ac1dc7..fdeaea604047c 100644 --- a/doc/src/sgml/ref/psql-ref.sgml +++ b/doc/src/sgml/ref/psql-ref.sgml @@ -3340,6 +3340,26 @@ PSQL_EDITOR_LINENUMBER_ARG='--line ' + + PSQL_HISTORY + + + + Alternative location for the command history file. Tilde ("~") expansion is performed. + + + + + + PSQLRC + + + + Alternative location of the user's .psqlrc file. Tilde ("~") expansion is performed. + + + + SHELL @@ -3390,6 +3410,11 @@ PSQL_EDITOR_LINENUMBER_ARG='--line ' to set up the client or the server to taste (using the \set and SET commands). + + The location of the user's ~/.psqlrc file can + also be set explicitly via the PSQLRC environment + setting. + @@ -3411,6 +3436,11 @@ PSQL_EDITOR_LINENUMBER_ARG='--line ' ~/.psql_history, or %APPDATA%\postgresql\psql_history on Windows. + + The location of the history file can + also be set explicitly via the PSQL_HISTORY environment + setting. 
+ diff --git a/src/bin/psql/input.c b/src/bin/psql/input.c index d77a731c2ec46..880e7e6511d7d 100644 --- a/src/bin/psql/input.c +++ b/src/bin/psql/input.c @@ -285,6 +285,15 @@ initializeInput(int flags) history_lines_added = 0; histfile = GetVariable(pset.vars, "HISTFILE"); + + if (histfile == NULL) + { + char * envhist; + envhist = getenv("PSQL_HISTORY"); + if (envhist != NULL && strlen(envhist) > 0) + histfile = envhist; + } + if (histfile == NULL) { if (get_home_path(home)) diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c index 166c227d6b6a8..b5664dfd1d375 100644 --- a/src/bin/psql/startup.c +++ b/src/bin/psql/startup.c @@ -591,6 +591,7 @@ process_psqlrc(char *argv0) char rc_file[MAXPGPATH]; char my_exec_path[MAXPGPATH]; char etc_path[MAXPGPATH]; + char *envrc; find_my_exec(argv0, my_exec_path); get_etc_path(my_exec_path, etc_path); @@ -598,7 +599,14 @@ process_psqlrc(char *argv0) snprintf(rc_file, MAXPGPATH, "%s/%s", etc_path, SYSPSQLRC); process_psqlrc_file(rc_file); - if (get_home_path(home)) + envrc = getenv("PSQLRC"); + + if (envrc != NULL && strlen(envrc) > 0) + { + expand_tilde(&envrc); + process_psqlrc_file(envrc); + } + else if (get_home_path(home)) { snprintf(rc_file, MAXPGPATH, "%s/%s", home, PSQLRC); process_psqlrc_file(rc_file); From 0e5e167aaea4ceb355a6e20eec96c4f7d05527ab Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 3 Mar 2012 20:20:19 -0500 Subject: [PATCH 078/129] Collect and use element-frequency statistics for arrays. This patch improves selectivity estimation for the array <@, &&, and @> (containment and overlaps) operators. It enables collection of statistics about individual array element values by ANALYZE, and introduces operator-specific estimators that use these stats. In addition, ScalarArrayOpExpr constructs of the forms "const = ANY/ALL (array_column)" and "const <> ANY/ALL (array_column)" are estimated by treating them as variants of the containment operators. Since we still collect scalar-style stats about the array values as a whole, the pg_stats view is expanded to show both these stats and the array-style stats in separate columns. This creates an incompatible change in how stats for tsvector columns are displayed in pg_stats: the stats about lexemes are now displayed in the array-related columns instead of the original scalar-related columns. There are a few loose ends here, notably that it'd be nice to be able to suppress either the scalar-style stats or the array-element stats for columns for which they're not useful. But the patch is in good enough shape to commit for wider testing. 
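As a sketch of the independence assumption the new estimators rely on
(illustrative only; these helper names are invented, the real logic is in
array_selfuncs.c below): "column @> const" multiplies the per-element match
frequencies, while "column && const" combines them by inclusion-exclusion.

    /*
     * Sketch: combine per-element frequencies f[0..n-1], where f[i] is the
     * fraction of rows whose array contains element i of the constant.
     */
    static double
    sketch_contains_selec(const double *f, int n)
    {
        double      selec = 1.0;    /* every element must appear */
        int         i;

        for (i = 0; i < n; i++)
            selec *= f[i];
        return selec;
    }

    static double
    sketch_overlap_selec(const double *f, int n)
    {
        double      selec = 0.0;    /* at least one element must appear */
        int         i;

        for (i = 0; i < n; i++)
            selec = selec + f[i] - selec * f[i];
        return selec;
    }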
Alexander Korotkov, reviewed by Noah Misch and Nathan Boley --- doc/src/sgml/catalogs.sgml | 51 +- src/backend/catalog/heap.c | 2 +- src/backend/catalog/system_views.sql | 43 +- src/backend/commands/analyze.c | 12 +- src/backend/commands/typecmds.c | 6 +- src/backend/tsearch/ts_selfuncs.c | 4 + src/backend/tsearch/ts_typanalyze.c | 5 + src/backend/utils/adt/Makefile | 3 +- src/backend/utils/adt/array_selfuncs.c | 1225 +++++++++++++++++++++ src/backend/utils/adt/array_typanalyze.c | 762 +++++++++++++ src/backend/utils/adt/selfuncs.c | 58 +- src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_operator.h | 9 +- src/include/catalog/pg_proc.h | 6 + src/include/catalog/pg_statistic.h | 96 +- src/include/catalog/pg_type.h | 132 +-- src/include/commands/vacuum.h | 11 +- src/include/utils/array.h | 5 + src/include/utils/selfuncs.h | 14 +- src/test/regress/expected/arrays.out | 1 + src/test/regress/expected/rules.out | 2 +- src/test/regress/expected/type_sanity.out | 33 + src/test/regress/sql/arrays.sql | 2 + src/test/regress/sql/type_sanity.sql | 25 + 24 files changed, 2341 insertions(+), 168 deletions(-) create mode 100644 src/backend/utils/adt/array_selfuncs.c create mode 100644 src/backend/utils/adt/array_typanalyze.c diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 180554b8e3941..9564e012e66a1 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -5354,9 +5354,9 @@ Column data values of the appropriate kind for the Nth slot, or null if the slot kind does not store any data values. Each array's element - values are actually of the specific column's data type, so there - is no way to define these columns' type more specifically than - anyarray. + values are actually of the specific column's data type, or a related + type such as an array's element type, so there is no way to define + these columns' type more specifically than anyarray.
@@ -8291,8 +8291,6 @@ A list of the most common values in the column. (Null if no values seem to be more common than any others.) - For some data types such as tsvector, this is a list of - the most common element values rather than values of the type itself.
@@ -8301,12 +8299,9 @@ real[] - A list of the frequencies of the most common values or elements, + A list of the frequencies of the most common values, i.e., number of occurrences of each divided by total number of rows. (Null when most_common_vals is.) - For some data types such as tsvector, it can also store some - additional information, making it longer than the - most_common_vals array. @@ -8338,13 +8333,47 @@ type does not have a < operator.) + + + most_common_elems + anyarray + + + A list of non-null element values most often appearing within values of + the column. (Null for scalar types.) + + + + + most_common_elem_freqs + real[] + + + A list of the frequencies of the most common element values, i.e., the + fraction of rows containing at least one instance of the given value. + Two or three additional values follow the per-element frequencies; + these are the minimum and maximum of the preceding per-element + frequencies, and optionally the frequency of null elements. + (Null when most_common_elems is.) + + + + + elem_count_histogram + real[] + + + A histogram of the counts of distinct non-null element values within the + values of the column, followed by the average number of distinct + non-null elements. (Null for scalar types.) + + - The maximum number of entries in the most_common_vals - and histogram_bounds arrays can be set on a + The maximum number of entries in the array fields can be controlled on a column-by-column basis using the ALTER TABLE SET STATISTICS command, or globally by setting the run-time parameter. diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index aef410ae9b215..a8653cd49562d 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -1182,7 +1182,7 @@ heap_create_with_catalog(const char *relname, F_ARRAY_SEND, /* array send (bin) proc */ InvalidOid, /* typmodin procedure - none */ InvalidOid, /* typmodout procedure - none */ - InvalidOid, /* analyze procedure - default */ + F_ARRAY_TYPANALYZE, /* array analyze procedure */ new_type_oid, /* array element type - the rowtype */ true, /* yes, this is an array type */ InvalidOid, /* this has no array type */ diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 30b0bd06df06a..ab594eba9bcc1 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -117,29 +117,54 @@ CREATE VIEW pg_stats AS stawidth AS avg_width, stadistinct AS n_distinct, CASE - WHEN stakind1 IN (1, 4) THEN stavalues1 - WHEN stakind2 IN (1, 4) THEN stavalues2 - WHEN stakind3 IN (1, 4) THEN stavalues3 - WHEN stakind4 IN (1, 4) THEN stavalues4 + WHEN stakind1 = 1 THEN stavalues1 + WHEN stakind2 = 1 THEN stavalues2 + WHEN stakind3 = 1 THEN stavalues3 + WHEN stakind4 = 1 THEN stavalues4 + WHEN stakind5 = 1 THEN stavalues5 END AS most_common_vals, CASE - WHEN stakind1 IN (1, 4) THEN stanumbers1 - WHEN stakind2 IN (1, 4) THEN stanumbers2 - WHEN stakind3 IN (1, 4) THEN stanumbers3 - WHEN stakind4 IN (1, 4) THEN stanumbers4 + WHEN stakind1 = 1 THEN stanumbers1 + WHEN stakind2 = 1 THEN stanumbers2 + WHEN stakind3 = 1 THEN stanumbers3 + WHEN stakind4 = 1 THEN stanumbers4 + WHEN stakind5 = 1 THEN stanumbers5 END AS most_common_freqs, CASE WHEN stakind1 = 2 THEN stavalues1 WHEN stakind2 = 2 THEN stavalues2 WHEN stakind3 = 2 THEN stavalues3 WHEN stakind4 = 2 THEN stavalues4 + WHEN stakind5 = 2 THEN stavalues5 END AS histogram_bounds, CASE WHEN stakind1 = 3 THEN stanumbers1[1] WHEN stakind2 = 3 THEN stanumbers2[1] WHEN stakind3 = 3 THEN 
stanumbers3[1] WHEN stakind4 = 3 THEN stanumbers4[1] - END AS correlation + WHEN stakind5 = 3 THEN stanumbers5[1] + END AS correlation, + CASE + WHEN stakind1 = 4 THEN stavalues1 + WHEN stakind2 = 4 THEN stavalues2 + WHEN stakind3 = 4 THEN stavalues3 + WHEN stakind4 = 4 THEN stavalues4 + WHEN stakind5 = 4 THEN stavalues5 + END AS most_common_elems, + CASE + WHEN stakind1 = 4 THEN stanumbers1 + WHEN stakind2 = 4 THEN stanumbers2 + WHEN stakind3 = 4 THEN stanumbers3 + WHEN stakind4 = 4 THEN stanumbers4 + WHEN stakind5 = 4 THEN stanumbers5 + END AS most_common_elem_freqs, + CASE + WHEN stakind1 = 5 THEN stanumbers1 + WHEN stakind2 = 5 THEN stanumbers2 + WHEN stakind3 = 5 THEN stanumbers3 + WHEN stakind4 = 5 THEN stanumbers4 + WHEN stakind5 = 5 THEN stanumbers5 + END AS elem_count_histogram FROM pg_statistic s JOIN pg_class c ON (c.oid = s.starelid) JOIN pg_attribute a ON (c.oid = attrelid AND attnum = s.staattnum) LEFT JOIN pg_namespace n ON (n.oid = c.relnamespace) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index b40e57b14fcbc..9cd6e672ced1c 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -110,8 +110,6 @@ static void update_attstats(Oid relid, bool inh, static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); -static bool std_typanalyze(VacAttrStats *stats); - /* * analyze_rel() -- analyze one relation @@ -476,8 +474,7 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, bool inh) for (i = 0; i < attr_cnt; i++) { VacAttrStats *stats = vacattrstats[i]; - AttributeOpts *aopt = - get_attribute_options(onerel->rd_id, stats->attr->attnum); + AttributeOpts *aopt; stats->rows = rows; stats->tupDesc = onerel->rd_att; @@ -490,11 +487,12 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, bool inh) * If the appropriate flavor of the n_distinct option is * specified, override with the corresponding value. */ + aopt = get_attribute_options(onerel->rd_id, stats->attr->attnum); if (aopt != NULL) { - float8 n_distinct = - inh ? aopt->n_distinct_inherited : aopt->n_distinct; + float8 n_distinct; + n_distinct = inh ? 
aopt->n_distinct_inherited : aopt->n_distinct; if (n_distinct != 0.0) stats->stadistinct = n_distinct; } @@ -1794,7 +1792,7 @@ static int compare_mcvs(const void *a, const void *b); /* * std_typanalyze -- the default type-specific typanalyze function */ -static bool +bool std_typanalyze(VacAttrStats *stats) { Form_pg_attribute attr = stats->attr; diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c index 22c1132e9b9ca..37fe5e8dae8fa 100644 --- a/src/backend/commands/typecmds.c +++ b/src/backend/commands/typecmds.c @@ -609,7 +609,7 @@ DefineType(List *names, List *parameters) F_ARRAY_SEND, /* send procedure */ typmodinOid, /* typmodin procedure */ typmodoutOid, /* typmodout procedure */ - InvalidOid, /* analyze procedure - default */ + F_ARRAY_TYPANALYZE, /* analyze procedure */ typoid, /* element type ID */ true, /* yes this is an array type */ InvalidOid, /* no further array type */ @@ -1140,7 +1140,7 @@ DefineEnum(CreateEnumStmt *stmt) F_ARRAY_SEND, /* send procedure */ InvalidOid, /* typmodin procedure - none */ InvalidOid, /* typmodout procedure - none */ - InvalidOid, /* analyze procedure - default */ + F_ARRAY_TYPANALYZE, /* analyze procedure */ enumTypeOid, /* element type ID */ true, /* yes this is an array type */ InvalidOid, /* no further array type */ @@ -1450,7 +1450,7 @@ DefineRange(CreateRangeStmt *stmt) F_ARRAY_SEND, /* send procedure */ InvalidOid, /* typmodin procedure - none */ InvalidOid, /* typmodout procedure - none */ - InvalidOid, /* analyze procedure - default */ + F_ARRAY_TYPANALYZE, /* analyze procedure */ typoid, /* element type ID */ true, /* yes this is an array type */ InvalidOid, /* no further array type */ diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c index 0922777505026..a07d410005460 100644 --- a/src/backend/tsearch/ts_selfuncs.c +++ b/src/backend/tsearch/ts_selfuncs.c @@ -220,6 +220,10 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, /* * There should be two more Numbers than Values, because the last two * cells are taken for minimal and maximal frequency. Punt if not. + * + * (Note: the MCELEM statistics slot definition allows for a third extra + * number containing the frequency of nulls, but we're not expecting that + * to appear for a tsvector column.) */ if (nnumbers != nmcelem + 2) return tsquery_opr_selec_no_stats(query); diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c index 15fae1c95f0c1..9771415b2e462 100644 --- a/src/backend/tsearch/ts_typanalyze.c +++ b/src/backend/tsearch/ts_typanalyze.c @@ -377,6 +377,11 @@ compute_tsvector_stats(VacAttrStats *stats, * able to find out the minimal and maximal frequency without * going through all the values. We keep those two extra * frequencies in two extra cells in mcelem_freqs. + * + * (Note: the MCELEM statistics slot definition allows for a third + * extra number containing the frequency of nulls, but we don't + * create that for a tsvector column, since null elements aren't + * possible.) 
	 */
	mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
	mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4));

diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile
index c635c38f5b826..c5b0a75e931a1 100644
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -15,7 +15,8 @@ override CFLAGS+= -mieee
 endif
 endif
 
-OBJS = acl.o arrayfuncs.o array_userfuncs.o arrayutils.o bool.o \
+OBJS = acl.o arrayfuncs.o array_selfuncs.o array_typanalyze.o \
+	array_userfuncs.o arrayutils.o bool.o \
 	cash.o char.o date.o datetime.o datum.o domains.o \
 	enum.o float.o format_type.o \
 	geo_ops.o geo_selfuncs.o int.o int8.o json.o like.o lockfuncs.o \

diff --git a/src/backend/utils/adt/array_selfuncs.c b/src/backend/utils/adt/array_selfuncs.c
new file mode 100644
index 0000000000000..3916de4bfb61d
--- /dev/null
+++ b/src/backend/utils/adt/array_selfuncs.c
@@ -0,0 +1,1225 @@
+/*-------------------------------------------------------------------------
+ *
+ * array_selfuncs.c
+ *	  Functions for selectivity estimation of array operators
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/adt/array_selfuncs.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "catalog/pg_collation.h"
+#include "catalog/pg_operator.h"
+#include "catalog/pg_statistic.h"
+#include "optimizer/clauses.h"
+#include "utils/array.h"
+#include "utils/lsyscache.h"
+#include "utils/selfuncs.h"
+#include "utils/typcache.h"
+
+
+/* Default selectivity constant for "@>" and "<@" operators */
+#define DEFAULT_CONTAIN_SEL 0.005
+
+/* Default selectivity constant for "&&" operator */
+#define DEFAULT_OVERLAP_SEL 0.01
+
+/* Default selectivity for given operator */
+#define DEFAULT_SEL(operator) \
+	((operator) == OID_ARRAY_OVERLAP_OP ? \
+		DEFAULT_OVERLAP_SEL : DEFAULT_CONTAIN_SEL)
+
+static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval,
+				  Oid elemtype, Oid operator);
+static Selectivity mcelem_array_selec(ArrayType *array,
+				   TypeCacheEntry *typentry,
+				   Datum *mcelem, int nmcelem,
+				   float4 *numbers, int nnumbers,
+				   float4 *hist, int nhist,
+				   Oid operator, FmgrInfo *cmpfunc);
+static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
+								   float4 *numbers, int nnumbers,
+								   Datum *array_data, int nitems,
+								   Oid operator, FmgrInfo *cmpfunc);
+static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
+							 float4 *numbers, int nnumbers,
+							 Datum *array_data, int nitems,
+							 float4 *hist, int nhist,
+							 Oid operator, FmgrInfo *cmpfunc);
+static float *calc_hist(const float4 *hist, int nhist, int n);
+static float *calc_distr(const float *p, int n, int m, float rest);
+static int	floor_log2(uint32 n);
+static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value,
+				 int *index, FmgrInfo *cmpfunc);
+static int	element_compare(const void *key1, const void *key2, void *arg);
+static int	float_compare_desc(const void *key1, const void *key2);
+
+
+/*
+ * scalararraysel_containment
+ *		Estimate selectivity of ScalarArrayOpExpr via array containment.
+ *
+ * scalararraysel() has already verified that the operator of a
+ * ScalarArrayOpExpr is the array element type's default equality or
+ * inequality operator.
If we have const =/<> ANY/ALL (array_var) + * then we can estimate the selectivity as though this were an array + * containment operator, array_var op ARRAY[const]. + * + * Returns selectivity (0..1), or -1 if we fail to estimate selectivity. + */ +Selectivity +scalararraysel_containment(PlannerInfo *root, + Node *leftop, Node *rightop, + Oid elemtype, bool isEquality, bool useOr, + int varRelid) +{ + Selectivity selec; + VariableStatData vardata; + Datum constval; + TypeCacheEntry *typentry; + FmgrInfo *cmpfunc; + + /* + * rightop must be a variable, else punt. + */ + examine_variable(root, rightop, varRelid, &vardata); + if (!vardata.rel) + { + ReleaseVariableStats(vardata); + return -1.0; + } + + /* + * Aggressively reduce leftop to a constant, if possible. + */ + leftop = estimate_expression_value(root, leftop); + if (!IsA(leftop, Const)) + { + ReleaseVariableStats(vardata); + return -1.0; + } + if (((Const *) leftop)->constisnull) + { + /* qual can't succeed if null on left */ + ReleaseVariableStats(vardata); + return (Selectivity) 0.0; + } + constval = ((Const *) leftop)->constvalue; + + /* Get element type's default comparison function */ + typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO); + if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid)) + { + ReleaseVariableStats(vardata); + return -1.0; + } + cmpfunc = &typentry->cmp_proc_finfo; + + /* + * If the operator is <>, swap ANY/ALL, then invert the result later. + */ + if (!isEquality) + useOr = !useOr; + + /* Get array element stats for var, if available */ + if (HeapTupleIsValid(vardata.statsTuple)) + { + Form_pg_statistic stats; + Datum *values; + int nvalues; + float4 *numbers; + int nnumbers; + float4 *hist; + int nhist; + + stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); + + /* MCELEM will be an array of same type as element */ + if (get_attstatsslot(vardata.statsTuple, + elemtype, vardata.atttypmod, + STATISTIC_KIND_MCELEM, InvalidOid, + NULL, + &values, &nvalues, + &numbers, &nnumbers)) + { + /* For ALL case, also get histogram of distinct-element counts */ + if (useOr || + !get_attstatsslot(vardata.statsTuple, + elemtype, vardata.atttypmod, + STATISTIC_KIND_DECHIST, InvalidOid, + NULL, + NULL, NULL, + &hist, &nhist)) + { + hist = NULL; + nhist = 0; + } + + /* + * For = ANY, estimate as var @> ARRAY[const]. + * + * For = ALL, estimate as var <@ ARRAY[const]. + */ + if (useOr) + selec = mcelem_array_contain_overlap_selec(values, nvalues, + numbers, nnumbers, + &constval, 1, + OID_ARRAY_CONTAINS_OP, + cmpfunc); + else + selec = mcelem_array_contained_selec(values, nvalues, + numbers, nnumbers, + &constval, 1, + hist, nhist, + OID_ARRAY_CONTAINED_OP, + cmpfunc); + + if (hist) + free_attstatsslot(elemtype, NULL, 0, hist, nhist); + free_attstatsslot(elemtype, values, nvalues, numbers, nnumbers); + } + else + { + /* No most-common-elements info, so do without */ + if (useOr) + selec = mcelem_array_contain_overlap_selec(NULL, 0, + NULL, 0, + &constval, 1, + OID_ARRAY_CONTAINS_OP, + cmpfunc); + else + selec = mcelem_array_contained_selec(NULL, 0, + NULL, 0, + &constval, 1, + NULL, 0, + OID_ARRAY_CONTAINED_OP, + cmpfunc); + } + + /* + * MCE stats count only non-null rows, so adjust for null rows. 
+ */ + selec *= (1.0 - stats->stanullfrac); + } + else + { + /* No stats at all, so do without */ + if (useOr) + selec = mcelem_array_contain_overlap_selec(NULL, 0, + NULL, 0, + &constval, 1, + OID_ARRAY_CONTAINS_OP, + cmpfunc); + else + selec = mcelem_array_contained_selec(NULL, 0, + NULL, 0, + &constval, 1, + NULL, 0, + OID_ARRAY_CONTAINED_OP, + cmpfunc); + /* we assume no nulls here, so no stanullfrac correction */ + } + + ReleaseVariableStats(vardata); + + /* + * If the operator is <>, invert the results. + */ + if (!isEquality) + selec = 1.0 - selec; + + CLAMP_PROBABILITY(selec); + + return selec; +} + +/* + * arraycontsel -- restriction selectivity for "arraycolumn @> const", + * "arraycolumn && const" or "arraycolumn <@ const" + */ +Datum +arraycontsel(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Oid operator = PG_GETARG_OID(1); + List *args = (List *) PG_GETARG_POINTER(2); + int varRelid = PG_GETARG_INT32(3); + VariableStatData vardata; + Node *other; + bool varonleft; + Selectivity selec; + Oid element_typeid; + + /* + * If expression is not (variable op something) or (something op + * variable), then punt and return a default estimate. + */ + if (!get_restriction_variable(root, args, varRelid, + &vardata, &other, &varonleft)) + PG_RETURN_FLOAT8(DEFAULT_SEL(operator)); + + /* + * Can't do anything useful if the something is not a constant, either. + */ + if (!IsA(other, Const)) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(DEFAULT_SEL(operator)); + } + + /* + * The "&&", "@>" and "<@" operators are strict, so we can cope with a + * NULL constant right away. + */ + if (((Const *) other)->constisnull) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(0.0); + } + + /* + * If var is on the right, commute the operator, so that we can assume + * the var is on the left in what follows. + */ + if (!varonleft) + { + if (operator == OID_ARRAY_CONTAINS_OP) + operator = OID_ARRAY_CONTAINED_OP; + else if (operator == OID_ARRAY_CONTAINED_OP) + operator = OID_ARRAY_CONTAINS_OP; + } + + /* + * OK, there's a Var and a Const we're dealing with here. We need the + * Const to be a array with same element type as column, else we can't do + * anything useful. (Such cases will likely fail at runtime, but here + * we'd rather just return a default estimate.) + */ + element_typeid = get_base_element_type(((Const *) other)->consttype); + if (element_typeid != InvalidOid && + element_typeid == get_base_element_type(vardata.vartype)) + { + selec = calc_arraycontsel(&vardata, ((Const *) other)->constvalue, + element_typeid, operator); + } + else + { + selec = DEFAULT_SEL(operator); + } + + ReleaseVariableStats(vardata); + + CLAMP_PROBABILITY(selec); + + PG_RETURN_FLOAT8((float8) selec); +} + +/* + * arraycontjoinsel -- join selectivity for "arraycolumn @> const", + * "arraycolumn && const" or "arraycolumn <@ const" + */ +Datum +arraycontjoinsel(PG_FUNCTION_ARGS) +{ + /* For the moment this is just a stub */ + Oid operator = PG_GETARG_OID(1); + + PG_RETURN_FLOAT8(DEFAULT_SEL(operator)); +} + +/* + * Calculate selectivity for "arraycolumn @> const", "arraycolumn && const" + * or "arraycolumn <@ const" based on the statistics + * + * This function is mainly responsible for extracting the pg_statistic data + * to be used; we then pass the problem on to mcelem_array_selec(). 
+ */ +static Selectivity +calc_arraycontsel(VariableStatData *vardata, Datum constval, + Oid elemtype, Oid operator) +{ + Selectivity selec; + TypeCacheEntry *typentry; + FmgrInfo *cmpfunc; + ArrayType *array; + + /* Get element type's default comparison function */ + typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO); + if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid)) + return DEFAULT_SEL(operator); + cmpfunc = &typentry->cmp_proc_finfo; + + /* + * The caller made sure the const is a array with same element type, so + * get it now + */ + array = DatumGetArrayTypeP(constval); + + if (HeapTupleIsValid(vardata->statsTuple)) + { + Form_pg_statistic stats; + Datum *values; + int nvalues; + float4 *numbers; + int nnumbers; + float4 *hist; + int nhist; + + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); + + /* MCELEM will be an array of same type as column */ + if (get_attstatsslot(vardata->statsTuple, + elemtype, vardata->atttypmod, + STATISTIC_KIND_MCELEM, InvalidOid, + NULL, + &values, &nvalues, + &numbers, &nnumbers)) + { + /* + * For "array <@ const" case we also need histogram of distinct + * element counts. + */ + if (operator != OID_ARRAY_CONTAINED_OP || + !get_attstatsslot(vardata->statsTuple, + elemtype, vardata->atttypmod, + STATISTIC_KIND_DECHIST, InvalidOid, + NULL, + NULL, NULL, + &hist, &nhist)) + { + hist = NULL; + nhist = 0; + } + + /* Use the most-common-elements slot for the array Var. */ + selec = mcelem_array_selec(array, typentry, + values, nvalues, + numbers, nnumbers, + hist, nhist, + operator, cmpfunc); + + if (hist) + free_attstatsslot(elemtype, NULL, 0, hist, nhist); + free_attstatsslot(elemtype, values, nvalues, numbers, nnumbers); + } + else + { + /* No most-common-elements info, so do without */ + selec = mcelem_array_selec(array, typentry, + NULL, 0, NULL, 0, NULL, 0, + operator, cmpfunc); + } + + /* + * MCE stats count only non-null rows, so adjust for null rows. + */ + selec *= (1.0 - stats->stanullfrac); + } + else + { + /* No stats at all, so do without */ + selec = mcelem_array_selec(array, typentry, + NULL, 0, NULL, 0, NULL, 0, + operator, cmpfunc); + /* we assume no nulls here, so no stanullfrac correction */ + } + + /* If constant was toasted, release the copy we made */ + if (PointerGetDatum(array) != constval) + pfree(array); + + return selec; +} + +/* + * Array selectivity estimation based on most common elements statistics + * + * This function just deconstructs and sorts the array constant's contents, + * and then passes the problem on to mcelem_array_contain_overlap_selec or + * mcelem_array_contained_selec depending on the operator. + */ +static Selectivity +mcelem_array_selec(ArrayType *array, TypeCacheEntry *typentry, + Datum *mcelem, int nmcelem, + float4 *numbers, int nnumbers, + float4 *hist, int nhist, + Oid operator, FmgrInfo *cmpfunc) +{ + Selectivity selec; + int num_elems; + Datum *elem_values; + bool *elem_nulls; + bool null_present; + int nonnull_nitems; + int i; + + /* + * Prepare constant array data for sorting. Sorting lets us find unique + * elements and efficiently merge with the MCELEM array. 
+ */ + deconstruct_array(array, + typentry->type_id, + typentry->typlen, + typentry->typbyval, + typentry->typalign, + &elem_values, &elem_nulls, &num_elems); + + /* Collapse out any null elements */ + nonnull_nitems = 0; + null_present = false; + for (i = 0; i < num_elems; i++) + { + if (elem_nulls[i]) + null_present = true; + else + elem_values[nonnull_nitems++] = elem_values[i]; + } + + /* + * Query "column @> '{anything, null}'" matches nothing. For the other + * two operators, presence of a null in the constant can be ignored. + */ + if (null_present && operator == OID_ARRAY_CONTAINS_OP) + { + pfree(elem_values); + pfree(elem_nulls); + return (Selectivity) 0.0; + } + + /* Sort extracted elements using their default comparison function. */ + qsort_arg(elem_values, nonnull_nitems, sizeof(Datum), + element_compare, cmpfunc); + + /* Separate cases according to operator */ + if (operator == OID_ARRAY_CONTAINS_OP || operator == OID_ARRAY_OVERLAP_OP) + selec = mcelem_array_contain_overlap_selec(mcelem, nmcelem, + numbers, nnumbers, + elem_values, nonnull_nitems, + operator, cmpfunc); + else if (operator == OID_ARRAY_CONTAINED_OP) + selec = mcelem_array_contained_selec(mcelem, nmcelem, + numbers, nnumbers, + elem_values, nonnull_nitems, + hist, nhist, + operator, cmpfunc); + else + { + elog(ERROR, "arraycontsel called for unrecognized operator %u", + operator); + selec = 0.0; /* keep compiler quiet */ + } + + pfree(elem_values); + pfree(elem_nulls); + return selec; +} + +/* + * Estimate selectivity of "column @> const" and "column && const" based on + * most common element statistics. This estimation assumes element + * occurrences are independent. + * + * mcelem (of length nmcelem) and numbers (of length nnumbers) are from + * the array column's MCELEM statistics slot, or are NULL/0 if stats are + * not available. array_data (of length nitems) is the constant's elements. + * + * Both the mcelem and array_data arrays are assumed presorted according + * to the element type's cmpfunc. Null elements are not present. + * + * TODO: this estimate probably could be improved by using the distinct + * elements count histogram. For example, excepting the special case of + * "column @> '{}'", we can multiply the calculated selectivity by the + * fraction of nonempty arrays in the column. + */ +static Selectivity +mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem, + float4 *numbers, int nnumbers, + Datum *array_data, int nitems, + Oid operator, FmgrInfo *cmpfunc) +{ + Selectivity selec, + elem_selec; + int mcelem_index, + i; + bool use_bsearch; + float4 minfreq; + + /* + * There should be three more Numbers than Values, because the last three + * cells should hold minimal and maximal frequency among the non-null + * elements, and then the frequency of null elements. Ignore the Numbers + * if not right. + */ + if (nnumbers != nmcelem + 3) + { + numbers = NULL; + nnumbers = 0; + } + + if (numbers) + { + /* Grab the lowest observed frequency */ + minfreq = numbers[nmcelem]; + } + else + { + /* Without statistics make some default assumptions */ + minfreq = 2 * DEFAULT_CONTAIN_SEL; + } + + /* Decide whether it is faster to use binary search or not. */ + if (nitems * floor_log2((uint32) nmcelem) < nmcelem + nitems) + use_bsearch = true; + else + use_bsearch = false; + + if (operator == OID_ARRAY_CONTAINS_OP) + { + /* + * Initial selectivity for "column @> const" query is 1.0, and it will + * be decreased with each element of constant array. 
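+		 *
+		 * To illustrate with made-up frequencies 0.1 and 0.2 for the
+		 * constant's two elements, the update rules applied in the loop
+		 * below give:
+		 *	"@>":  1.0 * 0.1 * 0.2          = 0.02
+		 *	"&&":  0.1 + 0.2 - (0.1 * 0.2)  = 0.28
+		 * i.e. P(all present) versus P(any present) under independence.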
+ */ + selec = 1.0; + } + else + { + /* + * Initial selectivity for "column && const" query is 0.0, and it will + * be increased with each element of constant array. + */ + selec = 0.0; + } + + /* Scan mcelem and array in parallel. */ + mcelem_index = 0; + for (i = 0; i < nitems; i++) + { + bool match = false; + + /* Ignore any duplicates in the array data. */ + if (i > 0 && + element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0) + continue; + + /* Find the smallest MCELEM >= this array item. */ + if (use_bsearch) + { + match = find_next_mcelem(mcelem, nmcelem, array_data[i], + &mcelem_index, cmpfunc); + } + else + { + while (mcelem_index < nmcelem) + { + int cmp = element_compare(&mcelem[mcelem_index], + &array_data[i], + cmpfunc); + + if (cmp < 0) + mcelem_index++; + else + { + if (cmp == 0) + match = true; /* mcelem is found */ + break; + } + } + } + + if (match && numbers) + { + /* MCELEM matches the array item; use its frequency. */ + elem_selec = numbers[mcelem_index]; + mcelem_index++; + } + else + { + /* + * The element is not in MCELEM. Punt, but assume that the + * selectivity cannot be more than minfreq / 2. + */ + elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2); + } + + /* + * Update overall selectivity using the current element's selectivity + * and an assumption of element occurrence independence. + */ + if (operator == OID_ARRAY_CONTAINS_OP) + selec *= elem_selec; + else + selec = selec + elem_selec - selec * elem_selec; + + /* Clamp intermediate results to stay sane despite roundoff error */ + CLAMP_PROBABILITY(selec); + } + + return selec; +} + +/* + * Estimate selectivity of "column <@ const" based on most common element + * statistics. + * + * mcelem (of length nmcelem) and numbers (of length nnumbers) are from + * the array column's MCELEM statistics slot, or are NULL/0 if stats are + * not available. array_data (of length nitems) is the constant's elements. + * hist (of length nhist) is from the array column's DECHIST statistics slot, + * or is NULL/0 if those stats are not available. + * + * Both the mcelem and array_data arrays are assumed presorted according + * to the element type's cmpfunc. Null elements are not present. + * + * Independent element occurrence would imply a particular distribution of + * distinct element counts among matching rows. Real data usually falsifies + * that assumption. For example, in a set of 11-element integer arrays having + * elements in the range [0..10], element occurrences are typically not + * independent. If they were, a sufficiently-large set would include all + * distinct element counts 0 through 11. We correct for this using the + * histogram of distinct element counts. + * + * In the "column @> const" and "column && const" cases, we usually have a + * "const" with low number of elements (otherwise we have selectivity close + * to 0 or 1 respectively). That's why the effect of dependence related + * to distinct element count distribution is negligible there. In the + * "column <@ const" case, number of elements is usually high (otherwise we + * have selectivity close to 0). That's why we should do a correction with + * the array distinct element count distribution here. + * + * Using the histogram of distinct element counts produces a different + * distribution law than independent occurrences of elements. This + * distribution law can be described as follows: + * + * P(o1, o2, ..., on) = f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * + * (1 - f2)^(1 - o2) * ... 
* fn^on * (1 - fn)^(1 - on) * hist[m] / ind[m] + * + * where: + * o1, o2, ..., on - occurrences of elements 1, 2, ..., n + * (1 - occurrence, 0 - no occurrence) in row + * f1, f2, ..., fn - frequencies of elements 1, 2, ..., n + * (scalar values in [0..1]) according to collected statistics + * m = o1 + o2 + ... + on = total number of distinct elements in row + * hist[m] - histogram data for occurrence of m elements. + * ind[m] - probability of m occurrences from n events assuming their + * probabilities to be equal to frequencies of array elements. + * + * ind[m] = sum(f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * (1 - f2)^(1 - o2) * + * ... * fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) | o1 + o2 + .. on = m + */ +static Selectivity +mcelem_array_contained_selec(Datum *mcelem, int nmcelem, + float4 *numbers, int nnumbers, + Datum *array_data, int nitems, + float4 *hist, int nhist, + Oid operator, FmgrInfo *cmpfunc) +{ + int mcelem_index, + i, + unique_nitems = 0; + float selec, + minfreq, + nullelem_freq; + float *dist, + *mcelem_dist, + *hist_part; + float avg_count, + mult, + rest; + float *elem_selec; + + /* + * There should be three more Numbers than Values in the MCELEM slot, + * because the last three cells should hold minimal and maximal frequency + * among the non-null elements, and then the frequency of null elements. + * Punt if not right, because we can't do much without the element freqs. + */ + if (numbers == NULL || nnumbers != nmcelem + 3) + return DEFAULT_CONTAIN_SEL; + + /* + * Grab some of the summary statistics that compute_array_stats() stores: + * lowest frequency, frequency of null elements, and average distinct + * element count. + */ + minfreq = numbers[nmcelem]; + nullelem_freq = numbers[nmcelem + 2]; + + if (hist && nhist > 0) + avg_count = hist[nhist - 1]; + else + avg_count = 10.0f; /* default assumption */ + + /* + * "rest" will be the sum of the frequencies of all elements not + * represented in MCELEM. The average distinct element count is the sum + * of the frequencies of *all* elements. Begin with that; we will proceed + * to subtract the MCELEM frequencies. + */ + rest = avg_count; + + /* + * mult is a multiplier representing estimate of probability that each + * mcelem that is not present in constant doesn't occur. + */ + mult = 1.0f; + + /* + * elem_selec is array of estimated frequencies for elements in the + * constant. + */ + elem_selec = (float *) palloc(sizeof(float) * nitems); + + /* Scan mcelem and array in parallel. */ + mcelem_index = 0; + for (i = 0; i < nitems; i++) + { + bool match = false; + + /* Ignore any duplicates in the array data. */ + if (i > 0 && + element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0) + continue; + + /* + * Iterate over MCELEM until we find an entry greater than or equal to + * this element of the constant. Update "rest" and "mult" for mcelem + * entries skipped over. + */ + while (mcelem_index < nmcelem) + { + int cmp = element_compare(&mcelem[mcelem_index], + &array_data[i], + cmpfunc); + + if (cmp < 0) + { + mult *= (1.0f - numbers[mcelem_index]); + rest -= numbers[mcelem_index]; + mcelem_index++; + } + else + { + if (cmp == 0) + match = true; /* mcelem is found */ + break; + } + } + + if (match) + { + /* MCELEM matches the array item. */ + elem_selec[unique_nitems] = numbers[mcelem_index]; + /* "rest" is decremented for all mcelems, matched or not */ + rest -= numbers[mcelem_index]; + mcelem_index++; + } + else + { + /* + * The element is not in MCELEM. 
Punt, but assume that the + * selectivity cannot be more than minfreq / 2. + */ + elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL, + minfreq / 2); + } + + unique_nitems++; + } + + /* + * If we handled all constant elements without exhausting the MCELEM + * array, finish walking it to complete calculation of "rest" and "mult". + */ + while (mcelem_index < nmcelem) + { + mult *= (1.0f - numbers[mcelem_index]); + rest -= numbers[mcelem_index]; + mcelem_index++; + } + + /* + * The presence of many distinct rare elements materially decreases + * selectivity. Use the Poisson distribution to estimate the probability + * of a column value having zero occurrences of such elements. See above + * for the definition of "rest". + */ + mult *= exp(-rest); + + /* Check we have nonempty distinct element count histogram */ + if (hist && nhist >= 3) + { + /*---------- + * Using the distinct element count histogram requires + * O(unique_nitems * (nmcelem + unique_nitems)) + * operations. Beyond a certain computational cost threshold, it's + * reasonable to sacrifice accuracy for decreased planning time. + * We limit the number of operations to EFFORT * nmcelem; since + * nmcelem is limited by the column's statistics target, the work + * done is user-controllable. + * + * If the number of operations would be too large, we can reduce it + * without losing all accuracy by reducing unique_nitems and + * considering only the most-common elements of the constant array. + * To make the results exactly match what we would have gotten with + * only those elements to start with, we'd have to remove any + * discarded elements' frequencies from "mult", but since this is only + * an approximation anyway, we don't bother with that. Therefore it's + * sufficient to qsort elem_selec[] and take the largest elements. + * (They will no longer match up with the elements of array_data[], + * but we don't care.) + *---------- + */ +#define EFFORT 100 + + if ((nmcelem + unique_nitems) > 0 && + unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems)) + { + /* + * Use the quadratic formula to solve for largest allowable N; + * we have A = 1, B = nmcelem, C = - EFFORT * nmcelem. + */ + double b = (double) nmcelem; + int n; + + n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2); + + /* Sort, then take just the first n elements */ + qsort(elem_selec, unique_nitems, sizeof(float), + float_compare_desc); + unique_nitems = n; + } + + /* + * Calculate probabilities of each distinct element count for both + * mcelems and constant elements. At this point, assume independent + * element occurrence. + */ + dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f); + mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest); + + /* ignore hist[nhist-1], which is the avg not a histogram member */ + hist_part = calc_hist(hist, nhist - 1, unique_nitems); + + selec = 0.0f; + for (i = 0; i <= unique_nitems; i++) + { + /* + * mult * dist[i] / mcelem_dist[i] gives us probability of qual + * matching from assumption of independent element occurrence with + * the condition that distinct element count = i. + */ + if (mcelem_dist[i] > 0) + selec += hist_part[i] * mult * dist[i] / mcelem_dist[i]; + } + + pfree(dist); + pfree(mcelem_dist); + pfree(hist_part); + } + else + { + /* We don't have histogram. Use a rough estimate. */ + selec = mult; + } + + pfree(elem_selec); + + /* Take into account occurrence of NULL element. 
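+	 * Here nullelem_freq is numbers[nmcelem + 2]: the fraction of
+	 * non-null rows containing at least one null element.  Such rows
+	 * cannot be contained in the null-free constant, so e.g. (made-up
+	 * figure) nullelem_freq = 0.1 scales the estimate by 0.9.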
*/ + selec *= (1.0f - nullelem_freq); + + CLAMP_PROBABILITY(selec); + + return selec; +} + +/* + * Calculate the first n distinct element count probabilities from a + * histogram of distinct element counts. + * + * Returns a palloc'd array of n+1 entries, with array[k] being the + * probability of element count k, k in [0..n]. + * + * We assume that a histogram box with bounds a and b gives 1 / ((b - a + 1) * + * (nhist - 1)) probability to each value in (a,b) and an additional half of + * that to a and b themselves. + */ +static float * +calc_hist(const float4 *hist, int nhist, int n) +{ + float *hist_part; + int k, + i = 0; + float prev_interval = 0, + next_interval; + float frac; + + hist_part = (float *) palloc((n + 1) * sizeof(float)); + + /* + * frac is a probability contribution for each interval between histogram + * values. We have nhist - 1 intervals, so contribution of each one will + * be 1 / (nhist - 1). + */ + frac = 1.0f / ((float) (nhist - 1)); + + for (k = 0; k <= n; k++) + { + int count = 0; + + /* + * Count the histogram boundaries equal to k. (Although the histogram + * should theoretically contain only exact integers, entries are + * floats so there could be roundoff error in large values. Treat any + * fractional value as equal to the next larger k.) + */ + while (i < nhist && hist[i] <= k) + { + count++; + i++; + } + + if (count > 0) + { + /* k is an exact bound for at least one histogram box. */ + float val; + + /* Find length between current histogram value and the next one */ + if (i < nhist) + next_interval = hist[i] - hist[i - 1]; + else + next_interval = 0; + + /* + * count - 1 histogram boxes contain k exclusively. They + * contribute a total of (count - 1) * frac probability. Also + * factor in the partial histogram boxes on either side. + */ + val = (float) (count - 1); + if (next_interval > 0) + val += 0.5f / next_interval; + if (prev_interval > 0) + val += 0.5f / prev_interval; + hist_part[k] = frac * val; + + prev_interval = next_interval; + } + else + { + /* k does not appear as an exact histogram bound. */ + if (prev_interval > 0) + hist_part[k] = frac / prev_interval; + else + hist_part[k] = 0.0f; + } + } + + return hist_part; +} + +/* + * Consider n independent events with probabilities p[]. This function + * calculates probabilities of exact k of events occurrence for k in [0..m]. + * Returns a palloc'd array of size m+1. + * + * "rest" is the sum of the probabilities of all low-probability events not + * included in p. + * + * Imagine matrix M of size (n + 1) x (m + 1). Element M[i,j] denotes the + * probability that exactly j of first i events occur. Obviously M[0,0] = 1. + * For any constant j, each increment of i increases the probability iff the + * event occurs. So, by the law of total probability: + * M[i,j] = M[i - 1, j] * (1 - p[i]) + M[i - 1, j - 1] * p[i] + * for i > 0, j > 0. + * M[i,0] = M[i - 1, 0] * (1 - p[i]) for i > 0. + */ +static float * +calc_distr(const float *p, int n, int m, float rest) +{ + float *row, + *prev_row, + *tmp; + int i, + j; + + /* + * Since we return only the last row of the matrix and need only the + * current and previous row for calculations, allocate two rows. 
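+	 *
+	 * Equivalently (an illustrative sketch, not the code below), the same
+	 * Poisson-binomial recurrence can be run in place on a single row if
+	 * j is walked downward:
+	 *
+	 *	row[0] = 1.0;  row[1..m] = 0.0;
+	 *	for (i = 1; i <= n; i++)
+	 *	{
+	 *		for (j = Min(i, m); j > 0; j--)
+	 *			row[j] = row[j] * (1 - p[i-1]) + row[j-1] * p[i-1];
+	 *		row[0] *= (1 - p[i-1]);
+	 *	}
+	 *
+	 * The two-row form used below computes exactly the same values.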
+ */ + row = (float *) palloc((m + 1) * sizeof(float)); + prev_row = (float *) palloc((m + 1) * sizeof(float)); + + /* M[0,0] = 1 */ + row[0] = 1.0f; + for (i = 1; i <= n; i++) + { + float t = p[i - 1]; + + /* Swap rows */ + tmp = row; + row = prev_row; + prev_row = tmp; + + /* Calculate next row */ + for (j = 0; j <= i && j <= m; j++) + { + float val = 0.0f; + + if (j < i) + val += prev_row[j] * (1.0f - t); + if (j > 0) + val += prev_row[j - 1] * t; + row[j] = val; + } + } + + /* + * The presence of many distinct rare (not in "p") elements materially + * decreases selectivity. Model their collective occurrence with the + * Poisson distribution. + */ + if (rest > DEFAULT_CONTAIN_SEL) + { + float t; + + /* Swap rows */ + tmp = row; + row = prev_row; + prev_row = tmp; + + for (i = 0; i <= m; i++) + row[i] = 0.0f; + + /* Value of Poisson distribution for 0 occurrences */ + t = exp(-rest); + + /* + * Calculate convolution of previously computed distribution and the + * Poisson distribution. + */ + for (i = 0; i <= m; i++) + { + for (j = 0; j <= m - i; j++) + row[j + i] += prev_row[j] * t; + + /* Get Poisson distribution value for (i + 1) occurrences */ + t *= rest / (float) (i + 1); + } + } + + pfree(prev_row); + return row; +} + +/* Fast function for floor value of 2 based logarithm calculation. */ +static int +floor_log2(uint32 n) +{ + int logval = 0; + + if (n == 0) + return -1; + if (n >= (1 << 16)) + { + n >>= 16; + logval += 16; + } + if (n >= (1 << 8)) + { + n >>= 8; + logval += 8; + } + if (n >= (1 << 4)) + { + n >>= 4; + logval += 4; + } + if (n >= (1 << 2)) + { + n >>= 2; + logval += 2; + } + if (n >= (1 << 1)) + { + logval += 1; + } + return logval; +} + +/* + * find_next_mcelem binary-searches a most common elements array, starting + * from *index, for the first member >= value. It saves the position of the + * match into *index and returns true if it's an exact match. (Note: we + * assume the mcelem elements are distinct so there can't be more than one + * exact match.) + */ +static bool +find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index, + FmgrInfo *cmpfunc) +{ + int l = *index, + r = nmcelem - 1, + i, + res; + + while (l <= r) + { + i = (l + r) / 2; + res = element_compare(&mcelem[i], &value, cmpfunc); + if (res == 0) + { + *index = i; + return true; + } + else if (res < 0) + l = i + 1; + else + r = i - 1; + } + *index = l; + return false; +} + +/* + * Comparison function for elements. + * + * We use the element type's default btree opclass, and the default collation + * if the type is collation-sensitive. + * + * XXX consider using SortSupport infrastructure + */ +static int +element_compare(const void *key1, const void *key2, void *arg) +{ + Datum d1 = *((const Datum *) key1); + Datum d2 = *((const Datum *) key2); + FmgrInfo *cmpfunc = (FmgrInfo *) arg; + Datum c; + + c = FunctionCall2Coll(cmpfunc, DEFAULT_COLLATION_OID, d1, d2); + return DatumGetInt32(c); +} + +/* + * Comparison function for sorting floats into descending order. 
+ */ +static int +float_compare_desc(const void *key1, const void *key2) +{ + float d1 = *((const float *) key1); + float d2 = *((const float *) key2); + + if (d1 > d2) + return -1; + else if (d1 < d2) + return 1; + else + return 0; +} diff --git a/src/backend/utils/adt/array_typanalyze.c b/src/backend/utils/adt/array_typanalyze.c new file mode 100644 index 0000000000000..941e2adb03847 --- /dev/null +++ b/src/backend/utils/adt/array_typanalyze.c @@ -0,0 +1,762 @@ +/*------------------------------------------------------------------------- + * + * array_typanalyze.c + * Functions for gathering statistics from array columns + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/utils/adt/array_typanalyze.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/tuptoaster.h" +#include "catalog/pg_collation.h" +#include "commands/vacuum.h" +#include "utils/array.h" +#include "utils/datum.h" +#include "utils/typcache.h" + + +/* + * To avoid consuming too much memory, IO and CPU load during analysis, and/or + * too much space in the resulting pg_statistic rows, we ignore arrays that + * are wider than ARRAY_WIDTH_THRESHOLD (after detoasting!). Note that this + * number is considerably more than the similar WIDTH_THRESHOLD limit used + * in analyze.c's standard typanalyze code. + */ +#define ARRAY_WIDTH_THRESHOLD 0x10000 + +/* Extra data for compute_array_stats function */ +typedef struct +{ + /* Information about array element type */ + Oid type_id; /* element type's OID */ + Oid eq_opr; /* default equality operator's OID */ + bool typbyval; /* physical properties of element type */ + int16 typlen; + char typalign; + + /* + * Lookup data for element type's comparison and hash functions (these + * are in the type's typcache entry, which we expect to remain valid + * over the lifespan of the ANALYZE run) + */ + FmgrInfo *cmp; + FmgrInfo *hash; + + /* Saved state from std_typanalyze() */ + AnalyzeAttrComputeStatsFunc std_compute_stats; + void *std_extra_data; +} ArrayAnalyzeExtraData; + +/* + * While compute_array_stats is running, we keep a pointer to the extra data + * here for use by assorted subroutines. compute_array_stats doesn't + * currently need to be re-entrant, so avoiding this is not worth the extra + * notational cruft that would be needed. + */ +static ArrayAnalyzeExtraData *array_extra_data; + +/* A hash table entry for the Lossy Counting algorithm */ +typedef struct +{ + Datum key; /* This is 'e' from the LC algorithm. */ + int frequency; /* This is 'f'. */ + int delta; /* And this is 'delta'. */ + int last_container; /* For de-duplication of array elements. 
 */
+} TrackItem;
+
+/* A hash table entry for distinct-elements counts */
+typedef struct
+{
+	int			count;		/* Count of distinct elements in an array */
+	int			frequency;	/* Number of arrays seen with this count */
+} DECountItem;
+
+static void compute_array_stats(VacAttrStats *stats,
+				AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
+static void prune_element_hashtable(HTAB *elements_tab, int b_current);
+static uint32 element_hash(const void *key, Size keysize);
+static int	element_match(const void *key1, const void *key2, Size keysize);
+static int	element_compare(const void *key1, const void *key2);
+static int	trackitem_compare_frequencies_desc(const void *e1, const void *e2);
+static int	trackitem_compare_element(const void *e1, const void *e2);
+static int	countitem_compare_count(const void *e1, const void *e2);
+
+
+/*
+ * array_typanalyze -- typanalyze function for array columns
+ */
+Datum
+array_typanalyze(PG_FUNCTION_ARGS)
+{
+	VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
+	Oid			element_typeid;
+	TypeCacheEntry *typentry;
+	ArrayAnalyzeExtraData *extra_data;
+
+	/*
+	 * Call the standard typanalyze function.  It may fail to find needed
+	 * operators, in which case we also can't do anything, so just fail.
+	 */
+	if (!std_typanalyze(stats))
+		PG_RETURN_BOOL(false);
+
+	/*
+	 * Check that the attribute's data type is a varlena array.
+	 */
+	element_typeid = stats->attrtype->typelem;
+
+	if (!OidIsValid(element_typeid) || stats->attrtype->typlen != -1)
+		elog(ERROR, "array_typanalyze was invoked for non-array type %u",
+			 stats->attrtypid);
+
+	/*
+	 * Gather information about the element type.  If we fail to find
+	 * something, return, leaving the state from std_typanalyze() in place.
+	 */
+	typentry = lookup_type_cache(element_typeid,
+								 TYPECACHE_EQ_OPR |
+								 TYPECACHE_CMP_PROC_FINFO |
+								 TYPECACHE_HASH_PROC_FINFO);
+
+	if (!OidIsValid(typentry->eq_opr) ||
+		!OidIsValid(typentry->cmp_proc_finfo.fn_oid) ||
+		!OidIsValid(typentry->hash_proc_finfo.fn_oid))
+		PG_RETURN_BOOL(true);
+
+	/* Store our findings for use by compute_array_stats() */
+	extra_data = (ArrayAnalyzeExtraData *) palloc(sizeof(ArrayAnalyzeExtraData));
+	extra_data->type_id = typentry->type_id;
+	extra_data->eq_opr = typentry->eq_opr;
+	extra_data->typbyval = typentry->typbyval;
+	extra_data->typlen = typentry->typlen;
+	extra_data->typalign = typentry->typalign;
+	extra_data->cmp = &typentry->cmp_proc_finfo;
+	extra_data->hash = &typentry->hash_proc_finfo;
+
+	/* Save old compute_stats and extra_data for scalar statistics ... */
+	extra_data->std_compute_stats = stats->compute_stats;
+	extra_data->std_extra_data = stats->extra_data;
+
+	/* ... and replace with our info */
+	stats->compute_stats = compute_array_stats;
+	stats->extra_data = extra_data;
+
+	/*
+	 * Note we leave stats->minrows set as std_typanalyze set it.  Should
+	 * it be increased for array analysis purposes?
+	 */
+
+	PG_RETURN_BOOL(true);
+}
+
+/*
+ * compute_array_stats() -- compute statistics for an array column
+ *
+ * This function computes statistics useful for determining selectivity of
+ * the array operators <@, &&, and @>.  It is invoked by ANALYZE via the
+ * compute_stats hook after sample rows have been collected.
+ *
+ * We also invoke the standard compute_stats function, which will compute
+ * "scalar" statistics relevant to the btree-style array comparison operators.
+ * However, exact duplicates of an entire array may be rare despite many
+ * arrays sharing individual elements.
This especially afflicts long arrays, + * which are also liable to lack all scalar statistics due to the low + * WIDTH_THRESHOLD used in analyze.c. So, in addition to the standard stats, + * we find the most common array elements and compute a histogram of distinct + * element counts. + * + * The algorithm used is Lossy Counting, as proposed in the paper "Approximate + * frequency counts over data streams" by G. S. Manku and R. Motwani, in + * Proceedings of the 28th International Conference on Very Large Data Bases, + * Hong Kong, China, August 2002, section 4.2. The paper is available at + * http://www.vldb.org/conf/2002/S10P03.pdf + * + * The Lossy Counting (aka LC) algorithm goes like this: + * Let s be the threshold frequency for an item (the minimum frequency we + * are interested in) and epsilon the error margin for the frequency. Let D + * be a set of triples (e, f, delta), where e is an element value, f is that + * element's frequency (actually, its current occurrence count) and delta is + * the maximum error in f. We start with D empty and process the elements in + * batches of size w. (The batch size is also known as "bucket size" and is + * equal to 1/epsilon.) Let the current batch number be b_current, starting + * with 1. For each element e we either increment its f count, if it's + * already in D, or insert a new triple into D with values (e, 1, b_current + * - 1). After processing each batch we prune D, by removing from it all + * elements with f + delta <= b_current. After the algorithm finishes we + * suppress all elements from D that do not satisfy f >= (s - epsilon) * N, + * where N is the total number of elements in the input. We emit the + * remaining elements with estimated frequency f/N. The LC paper proves + * that this algorithm finds all elements with true frequency at least s, + * and that no frequency is overestimated or is underestimated by more than + * epsilon. Furthermore, given reasonable assumptions about the input + * distribution, the required table size is no more than about 7 times w. + * + * In the absence of a principled basis for other particular values, we + * follow ts_typanalyze() and use parameters s = 0.07/K, epsilon = s/10. + * But we leave out the correction for stopwords, which do not apply to + * arrays. These parameters give bucket width w = K/0.007 and maximum + * expected hashtable size of about 1000 * K. + * + * Elements may repeat within an array. Since duplicates do not change the + * behavior of <@, && or @>, we want to count each element only once per + * array. Therefore, we store in the finished pg_statistic entry each + * element's frequency as the fraction of all non-null rows that contain it. + * We divide the raw counts by nonnull_cnt to get those figures. + */ +static void +compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, + int samplerows, double totalrows) +{ + ArrayAnalyzeExtraData *extra_data; + int num_mcelem; + int null_cnt = 0; + int null_elem_cnt = 0; + int analyzed_rows = 0; + + /* This is D from the LC algorithm. 
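+	 * In sketch form, the per-element step implemented further down is:
+	 *
+	 *	if e is already in D:  D[e].f++
+	 *	else:                  insert (e, f = 1, delta = b_current - 1)
+	 *	if (++element_no % bucket_width == 0)
+	 *		prune D, then b_current++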
*/ + HTAB *elements_tab; + HASHCTL elem_hash_ctl; + HASH_SEQ_STATUS scan_status; + + /* This is the current bucket number from the LC algorithm */ + int b_current; + + /* This is 'w' from the LC algorithm */ + int bucket_width; + int array_no; + int64 element_no; + TrackItem *item; + int slot_idx; + HTAB *count_tab; + HASHCTL count_hash_ctl; + DECountItem *count_item; + + extra_data = (ArrayAnalyzeExtraData *) stats->extra_data; + + /* + * Invoke analyze.c's standard analysis function to create scalar-style + * stats for the column. It will expect its own extra_data pointer, + * so temporarily install that. + */ + stats->extra_data = extra_data->std_extra_data; + (*extra_data->std_compute_stats) (stats, fetchfunc, samplerows, totalrows); + stats->extra_data = extra_data; + + /* + * Set up static pointer for use by subroutines. We wait till here in + * case std_compute_stats somehow recursively invokes us (probably not + * possible, but ...) + */ + array_extra_data = extra_data; + + /* + * We want statistics_target * 10 elements in the MCELEM array. This + * multiplier is pretty arbitrary, but is meant to reflect the fact that + * the number of individual elements tracked in pg_statistic ought to be + * more than the number of values for a simple scalar column. + */ + num_mcelem = stats->attr->attstattarget * 10; + + /* + * We set bucket width equal to num_mcelem / 0.007 as per the comment + * above. + */ + bucket_width = num_mcelem * 1000 / 7; + + /* + * Create the hashtable. It will be in local memory, so we don't need to + * worry about overflowing the initial size. Also we don't need to pay any + * attention to locking and memory management. + */ + MemSet(&elem_hash_ctl, 0, sizeof(elem_hash_ctl)); + elem_hash_ctl.keysize = sizeof(Datum); + elem_hash_ctl.entrysize = sizeof(TrackItem); + elem_hash_ctl.hash = element_hash; + elem_hash_ctl.match = element_match; + elem_hash_ctl.hcxt = CurrentMemoryContext; + elements_tab = hash_create("Analyzed elements table", + bucket_width * 7, + &elem_hash_ctl, + HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT); + + /* hashtable for array distinct elements counts */ + MemSet(&count_hash_ctl, 0, sizeof(count_hash_ctl)); + count_hash_ctl.keysize = sizeof(int); + count_hash_ctl.entrysize = sizeof(DECountItem); + count_hash_ctl.hash = tag_hash; + count_hash_ctl.hcxt = CurrentMemoryContext; + count_tab = hash_create("Array distinct element count table", + 64, + &count_hash_ctl, + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); + + /* Initialize counters. */ + b_current = 1; + element_no = 0; + + /* Loop over the arrays. */ + for (array_no = 0; array_no < samplerows; array_no++) + { + Datum value; + bool isnull; + ArrayType *array; + int num_elems; + Datum *elem_values; + bool *elem_nulls; + bool null_present; + int j; + int64 prev_element_no = element_no; + int distinct_count; + bool count_item_found; + + vacuum_delay_point(); + + value = fetchfunc(stats, array_no, &isnull); + if (isnull) + { + /* array is null, just count that */ + null_cnt++; + continue; + } + + /* Skip too-large values. */ + if (toast_raw_datum_size(value) > ARRAY_WIDTH_THRESHOLD) + continue; + else + analyzed_rows++; + + /* + * Now detoast the array if needed, and deconstruct into datums. 
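+		 * (DatumGetArrayTypeP makes a detoasted copy only when needed; the
+		 * PointerGetDatum(array) != value test at the bottom of the loop
+		 * frees exactly those copies.  As an illustration, for an int4
+		 * array '{10, NULL, 20}' deconstruct_array() sets num_elems = 3,
+		 * elem_nulls = {f, t, f}, and elem_values = {10, 0, 20}, the
+		 * middle slot being unused.)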
+ */ + array = DatumGetArrayTypeP(value); + + Assert(ARR_ELEMTYPE(array) == extra_data->type_id); + deconstruct_array(array, + extra_data->type_id, + extra_data->typlen, + extra_data->typbyval, + extra_data->typalign, + &elem_values, &elem_nulls, &num_elems); + + /* + * We loop through the elements in the array and add them to our + * tracking hashtable. + */ + null_present = false; + for (j = 0; j < num_elems; j++) + { + Datum elem_value; + bool found; + + /* No null element processing other than flag setting here */ + if (elem_nulls[j]) + { + null_present = true; + continue; + } + + /* Lookup current element in hashtable, adding it if new */ + elem_value = elem_values[j]; + item = (TrackItem *) hash_search(elements_tab, + (const void *) &elem_value, + HASH_ENTER, &found); + + if (found) + { + /* The element value is already on the tracking list */ + + /* + * The operators we assist ignore duplicate array elements, + * so count a given distinct element only once per array. + */ + if (item->last_container == array_no) + continue; + + item->frequency++; + item->last_container = array_no; + } + else + { + /* Initialize new tracking list element */ + + /* + * If element type is pass-by-reference, we must copy it + * into palloc'd space, so that we can release the array + * below. (We do this so that the space needed for element + * values is limited by the size of the hashtable; if we + * kept all the array values around, it could be much more.) + */ + item->key = datumCopy(elem_value, + extra_data->typbyval, + extra_data->typlen); + + item->frequency = 1; + item->delta = b_current - 1; + item->last_container = array_no; + } + + /* element_no is the number of elements processed (ie N) */ + element_no++; + + /* We prune the D structure after processing each bucket */ + if (element_no % bucket_width == 0) + { + prune_element_hashtable(elements_tab, b_current); + b_current++; + } + } + + /* Count null element presence once per array. */ + if (null_present) + null_elem_cnt++; + + /* Update frequency of the particular array distinct element count. */ + distinct_count = (int) (element_no - prev_element_no); + count_item = (DECountItem *) hash_search(count_tab, &distinct_count, + HASH_ENTER, + &count_item_found); + + if (count_item_found) + count_item->frequency++; + else + count_item->frequency = 1; + + /* Free memory allocated while detoasting. */ + if (PointerGetDatum(array) != value) + pfree(array); + pfree(elem_values); + pfree(elem_nulls); + } + + /* Skip pg_statistic slots occupied by standard statistics */ + slot_idx = 0; + while (slot_idx < STATISTIC_NUM_SLOTS && stats->stakind[slot_idx] != 0) + slot_idx++; + if (slot_idx > STATISTIC_NUM_SLOTS - 2) + elog(ERROR, "insufficient pg_statistic slots for array stats"); + + /* We can only compute real stats if we found some non-null values. */ + if (analyzed_rows > 0) + { + int nonnull_cnt = analyzed_rows; + int count_items_count; + int i; + TrackItem **sort_table; + int track_len; + int64 cutoff_freq; + int64 minfreq, + maxfreq; + + /* + * We assume the standard stats code already took care of setting + * stats_valid, stanullfrac, stawidth, stadistinct. We'd have to + * re-compute those values if we wanted to not store the standard + * stats. + */ + + /* + * Construct an array of the interesting hashtable items, that is, + * those meeting the cutoff frequency (s - epsilon)*N. Also identify + * the minimum and maximum frequencies among these items. 
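+	 *
+	 * As a worked example with made-up sizes, using the cutoff formula
+	 * derived just below: attstattarget = 100 gives num_mcelem = 1000 and
+	 * bucket_width = 1000 * 1000 / 7 = 142857, so after N = 1000000
+	 * observed elements the cutoff frequency is 9 * 1000000 / 142857 = 63.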
+ * + * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff + * frequency is 9*N / bucket_width. + */ + cutoff_freq = 9 * element_no / bucket_width; + + i = hash_get_num_entries(elements_tab); /* surely enough space */ + sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i); + + hash_seq_init(&scan_status, elements_tab); + track_len = 0; + minfreq = element_no; + maxfreq = 0; + while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL) + { + if (item->frequency > cutoff_freq) + { + sort_table[track_len++] = item; + minfreq = Min(minfreq, item->frequency); + maxfreq = Max(maxfreq, item->frequency); + } + } + Assert(track_len <= i); + + /* emit some statistics for debug purposes */ + elog(DEBUG3, "compute_array_stats: target # mces = %d, " + "bucket width = %d, " + "# elements = " INT64_FORMAT ", hashtable size = %d, " + "usable entries = %d", + num_mcelem, bucket_width, element_no, i, track_len); + + /* + * If we obtained more elements than we really want, get rid of those + * with least frequencies. The easiest way is to qsort the array into + * descending frequency order and truncate the array. + */ + if (num_mcelem < track_len) + { + qsort(sort_table, track_len, sizeof(TrackItem *), + trackitem_compare_frequencies_desc); + /* reset minfreq to the smallest frequency we're keeping */ + minfreq = sort_table[num_mcelem - 1]->frequency; + } + else + num_mcelem = track_len; + + /* Generate MCELEM slot entry */ + if (num_mcelem > 0) + { + MemoryContext old_context; + Datum *mcelem_values; + float4 *mcelem_freqs; + + /* + * We want to store statistics sorted on the element value using + * the element type's default comparison function. This permits + * fast binary searches in selectivity estimation functions. + */ + qsort(sort_table, num_mcelem, sizeof(TrackItem *), + trackitem_compare_element); + + /* Must copy the target values into anl_context */ + old_context = MemoryContextSwitchTo(stats->anl_context); + + /* + * We sorted statistics on the element value, but we want to be + * able to find the minimal and maximal frequencies without going + * through all the values. We also want the frequency of null + * elements. Store these three values at the end of mcelem_freqs. + */ + mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum)); + mcelem_freqs = (float4 *) palloc((num_mcelem + 3) * sizeof(float4)); + + /* + * See comments above about use of nonnull_cnt as the divisor for + * the final frequency estimates. 
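+			 *
+			 * The slot layout being built is, schematically:
+			 *	stavalues:  [ v1, v2, ..., vK ]     sorted element values
+			 *	stanumbers: [ f1, f2, ..., fK, minfreq, maxfreq, nullfreq ]
+			 * so that array_selfuncs.c can read numbers[nmcelem],
+			 * numbers[nmcelem + 1] and numbers[nmcelem + 2] directly.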
+			 */
+			for (i = 0; i < num_mcelem; i++)
+			{
+				TrackItem  *item = sort_table[i];
+
+				mcelem_values[i] = datumCopy(item->key,
+											 extra_data->typbyval,
+											 extra_data->typlen);
+				mcelem_freqs[i] = (double) item->frequency /
+					(double) nonnull_cnt;
+			}
+			mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt;
+			mcelem_freqs[i++] = (double) maxfreq / (double) nonnull_cnt;
+			mcelem_freqs[i++] = (double) null_elem_cnt / (double) nonnull_cnt;
+
+			MemoryContextSwitchTo(old_context);
+
+			stats->stakind[slot_idx] = STATISTIC_KIND_MCELEM;
+			stats->staop[slot_idx] = extra_data->eq_opr;
+			stats->stanumbers[slot_idx] = mcelem_freqs;
+			/* See above comment about extra stanumber entries */
+			stats->numnumbers[slot_idx] = num_mcelem + 3;
+			stats->stavalues[slot_idx] = mcelem_values;
+			stats->numvalues[slot_idx] = num_mcelem;
+			/* We are storing values of element type */
+			stats->statypid[slot_idx] = extra_data->type_id;
+			stats->statyplen[slot_idx] = extra_data->typlen;
+			stats->statypbyval[slot_idx] = extra_data->typbyval;
+			stats->statypalign[slot_idx] = extra_data->typalign;
+			slot_idx++;
+		}
+
+		/* Generate DECHIST slot entry */
+		count_items_count = hash_get_num_entries(count_tab);
+		if (count_items_count > 0)
+		{
+			int			num_hist = stats->attr->attstattarget;
+			DECountItem **sorted_count_items;
+			int			count_item_index;
+			int			delta;
+			int			frac;
+			float4	   *hist;
+
+			/* num_hist must be at least 2 for the loop below to work */
+			num_hist = Max(num_hist, 2);
+
+			/*
+			 * Create an array of DECountItem pointers, and sort them into
+			 * increasing count order.
+			 */
+			sorted_count_items = (DECountItem **)
+				palloc(sizeof(DECountItem *) * count_items_count);
+			hash_seq_init(&scan_status, count_tab);
+			count_item_index = 0;
+			while ((count_item = (DECountItem *) hash_seq_search(&scan_status)) != NULL)
+			{
+				sorted_count_items[count_item_index++] = count_item;
+			}
+			qsort(sorted_count_items, count_items_count,
+				  sizeof(DECountItem *), countitem_compare_count);
+
+			/*
+			 * Fill stanumbers with the histogram, followed by the average
+			 * count.  This array must be stored in anl_context.
+			 */
+			hist = (float4 *)
+				MemoryContextAlloc(stats->anl_context,
+								   sizeof(float4) * (num_hist + 1));
+			hist[num_hist] = (double) element_no / (double) nonnull_cnt;
+
+			/*
+			 * Construct the histogram.
+			 *
+			 * XXX this needs work: frac could overflow, and it's not clear
+			 * how or why the code works.  Even if it does work, it needs
+			 * to be documented.
+			 */
+			delta = analyzed_rows - 1;
+			count_item_index = 0;
+			frac = sorted_count_items[0]->frequency * (num_hist - 1);
+			for (i = 0; i < num_hist; i++)
+			{
+				while (frac <= 0)
+				{
+					count_item_index++;
+					Assert(count_item_index < count_items_count);
+					frac += sorted_count_items[count_item_index]->frequency * (num_hist - 1);
+				}
+				hist[i] = sorted_count_items[count_item_index]->count;
+				frac -= delta;
+			}
+			Assert(count_item_index == count_items_count - 1);
+
+			stats->stakind[slot_idx] = STATISTIC_KIND_DECHIST;
+			stats->staop[slot_idx] = extra_data->eq_opr;
+			stats->stanumbers[slot_idx] = hist;
+			stats->numnumbers[slot_idx] = num_hist + 1;
+			slot_idx++;
+		}
+	}
+
+	/*
+	 * We don't need to bother cleaning up any of our temporary palloc's.  The
+	 * hashtable should also go away, as it used a child memory context.
+	 */
+}
+
+/*
+ * A function to prune the D structure from the Lossy Counting algorithm.
+ * Consult compute_tsvector_stats() for wider explanation.
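+ *
+ * As a worked case: an element first seen once in bucket b is stored with
+ * f = 1 and delta = b - 1, so f + delta = b <= b_current when its bucket
+ * ends; it is discarded right then unless it has been seen again.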
+ */ +static void +prune_element_hashtable(HTAB *elements_tab, int b_current) +{ + HASH_SEQ_STATUS scan_status; + TrackItem *item; + + hash_seq_init(&scan_status, elements_tab); + while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL) + { + if (item->frequency + item->delta <= b_current) + { + Datum value = item->key; + + if (hash_search(elements_tab, (const void *) &item->key, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "hash table corrupted"); + /* We should free memory if element is not passed by value */ + if (!array_extra_data->typbyval) + pfree(DatumGetPointer(value)); + } + } +} + +/* + * Hash function for elements. + * + * We use the element type's default hash opclass, and the default collation + * if the type is collation-sensitive. + */ +static uint32 +element_hash(const void *key, Size keysize) +{ + Datum d = *((const Datum *) key); + Datum h; + + h = FunctionCall1Coll(array_extra_data->hash, DEFAULT_COLLATION_OID, d); + return DatumGetUInt32(h); +} + +/* + * Matching function for elements, to be used in hashtable lookups. + */ +static int +element_match(const void *key1, const void *key2, Size keysize) +{ + /* The keysize parameter is superfluous here */ + return element_compare(key1, key2); +} + +/* + * Comparison function for elements. + * + * We use the element type's default btree opclass, and the default collation + * if the type is collation-sensitive. + * + * XXX consider using SortSupport infrastructure + */ +static int +element_compare(const void *key1, const void *key2) +{ + Datum d1 = *((const Datum *) key1); + Datum d2 = *((const Datum *) key2); + Datum c; + + c = FunctionCall2Coll(array_extra_data->cmp, DEFAULT_COLLATION_OID, d1, d2); + return DatumGetInt32(c); +} + +/* + * qsort() comparator for sorting TrackItems by frequencies (descending sort) + */ +static int +trackitem_compare_frequencies_desc(const void *e1, const void *e2) +{ + const TrackItem *const * t1 = (const TrackItem *const *) e1; + const TrackItem *const * t2 = (const TrackItem *const *) e2; + + return (*t2)->frequency - (*t1)->frequency; +} + +/* + * qsort() comparator for sorting TrackItems by element values + */ +static int +trackitem_compare_element(const void *e1, const void *e2) +{ + const TrackItem *const * t1 = (const TrackItem *const *) e1; + const TrackItem *const * t2 = (const TrackItem *const *) e2; + + return element_compare(&(*t1)->key, &(*t2)->key); +} + +/* + * qsort() comparator for sorting DECountItems by count + */ +static int +countitem_compare_count(const void *e1, const void *e2) +{ + const DECountItem * const *t1 = (const DECountItem * const *) e1; + const DECountItem * const *t2 = (const DECountItem * const *) e2; + + if ((*t1)->count < (*t2)->count) + return -1; + else if ((*t1)->count == (*t2)->count) + return 0; + else + return 1; +} diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 0a685aac2c06f..382cd7372ba05 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -127,6 +127,7 @@ #include "utils/syscache.h" #include "utils/timestamp.h" #include "utils/tqual.h" +#include "utils/typcache.h" /* Hooks for plugins to get control when we ask for stats */ @@ -1701,27 +1702,18 @@ scalararraysel(PlannerInfo *root, { Oid operator = clause->opno; bool useOr = clause->useOr; + bool isEquality = false; + bool isInequality = false; Node *leftop; Node *rightop; Oid nominal_element_type; Oid nominal_element_collation; + TypeCacheEntry *typentry; RegProcedure oprsel; FmgrInfo oprselproc; Selectivity s1; 
- /* - * First, look up the underlying operator's selectivity estimator. Punt if - * it hasn't got one. - */ - if (is_join_clause) - oprsel = get_oprjoin(operator); - else - oprsel = get_oprrest(operator); - if (!oprsel) - return (Selectivity) 0.5; - fmgr_info(oprsel, &oprselproc); - - /* deconstruct the expression */ + /* First, deconstruct the expression */ Assert(list_length(clause->args) == 2); leftop = (Node *) linitial(clause->args); rightop = (Node *) lsecond(clause->args); @@ -1736,6 +1728,46 @@ scalararraysel(PlannerInfo *root, /* look through any binary-compatible relabeling of rightop */ rightop = strip_array_coercion(rightop); + /* + * Detect whether the operator is the default equality or inequality + * operator of the array element type. + */ + typentry = lookup_type_cache(nominal_element_type, TYPECACHE_EQ_OPR); + if (OidIsValid(typentry->eq_opr)) + { + if (operator == typentry->eq_opr) + isEquality = true; + else if (get_negator(operator) == typentry->eq_opr) + isInequality = true; + } + + /* + * If it is equality or inequality, we might be able to estimate this as + * a form of array containment; for instance "const = ANY(column)" can be + * treated as "ARRAY[const] <@ column". scalararraysel_containment tries + * that, and returns the selectivity estimate if successful, or -1 if not. + */ + if ((isEquality || isInequality) && !is_join_clause) + { + s1 = scalararraysel_containment(root, leftop, rightop, + nominal_element_type, + isEquality, useOr, varRelid); + if (s1 >= 0.0) + return s1; + } + + /* + * Look up the underlying operator's selectivity estimator. Punt if it + * hasn't got one. + */ + if (is_join_clause) + oprsel = get_oprjoin(operator); + else + oprsel = get_oprrest(operator); + if (!oprsel) + return (Selectivity) 0.5; + fmgr_info(oprsel, &oprselproc); + /* * We consider three cases: * diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 03353471559e6..223f157310b6d 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201203021 +#define CATALOG_VERSION_NO 201203031 #endif diff --git a/src/include/catalog/pg_operator.h b/src/include/catalog/pg_operator.h index ead5af6d80cde..48ddd16a94d8c 100644 --- a/src/include/catalog/pg_operator.h +++ b/src/include/catalog/pg_operator.h @@ -1520,12 +1520,15 @@ DATA(insert OID = 2590 ( "|&>" PGNSP PGUID b f f 718 718 16 0 0 circle_ove DESCR("overlaps or is above"); /* overlap/contains/contained for arrays */ -DATA(insert OID = 2750 ( "&&" PGNSP PGUID b f f 2277 2277 16 2750 0 arrayoverlap areasel areajoinsel )); +DATA(insert OID = 2750 ( "&&" PGNSP PGUID b f f 2277 2277 16 2750 0 arrayoverlap arraycontsel arraycontjoinsel )); DESCR("overlaps"); -DATA(insert OID = 2751 ( "@>" PGNSP PGUID b f f 2277 2277 16 2752 0 arraycontains contsel contjoinsel )); +#define OID_ARRAY_OVERLAP_OP 2750 +DATA(insert OID = 2751 ( "@>" PGNSP PGUID b f f 2277 2277 16 2752 0 arraycontains arraycontsel arraycontjoinsel )); DESCR("contains"); -DATA(insert OID = 2752 ( "<@" PGNSP PGUID b f f 2277 2277 16 2751 0 arraycontained contsel contjoinsel )); +#define OID_ARRAY_CONTAINS_OP 2751 +DATA(insert OID = 2752 ( "<@" PGNSP PGUID b f f 2277 2277 16 2751 0 arraycontained arraycontsel arraycontjoinsel )); DESCR("is contained by"); +#define OID_ARRAY_CONTAINED_OP 2752 /* capturing operators to preserve pre-8.3 behavior of text concatenation */ DATA(insert OID = 2779 ( "||" PGNSP PGUID b f f 25 2776 25 0 0 textanycat - - )); diff 
--git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index b476d47579063..074051bdcc6a9 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -869,6 +869,12 @@ DATA(insert OID = 2334 ( array_agg_finalfn PGNSP PGUID 12 1 0 0 0 f f f f f f DESCR("aggregate final function"); DATA(insert OID = 2335 ( array_agg PGNSP PGUID 12 1 0 0 0 t f f f f f i 1 0 2277 "2283" _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ )); DESCR("concatenate aggregate input into an array"); +DATA(insert OID = 3816 ( array_typanalyze PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 16 "2281" _null_ _null_ _null_ _null_ array_typanalyze _null_ _null_ _null_ )); +DESCR("array typanalyze"); +DATA(insert OID = 3817 ( arraycontsel PGNSP PGUID 12 1 0 0 0 f f f f t f s 4 0 701 "2281 26 2281 23" _null_ _null_ _null_ _null_ arraycontsel _null_ _null_ _null_ )); +DESCR("restriction selectivity for array-containment operators"); +DATA(insert OID = 3818 ( arraycontjoinsel PGNSP PGUID 12 1 0 0 0 f f f f t f s 5 0 701 "2281 26 2281 21 2281" _null_ _null_ _null_ _null_ arraycontjoinsel _null_ _null_ _null_ )); +DESCR("join selectivity for array-containment operators"); DATA(insert OID = 760 ( smgrin PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 210 "2275" _null_ _null_ _null_ _null_ smgrin _null_ _null_ _null_ )); DESCR("I/O"); diff --git a/src/include/catalog/pg_statistic.h b/src/include/catalog/pg_statistic.h index 0b15b001b43d2..383cc014159e3 100644 --- a/src/include/catalog/pg_statistic.h +++ b/src/include/catalog/pg_statistic.h @@ -21,16 +21,6 @@ #include "catalog/genbki.h" -/* - * The CATALOG definition has to refer to the type of stavaluesN as - * "anyarray" so that bootstrap mode recognizes it. There is no real - * typedef for that, however. Since the fields are potentially-null and - * therefore can't be accessed directly from C code, there is no particular - * need for the C struct definition to show a valid field type --- instead - * we just make it int. - */ -#define anyarray int - /* ---------------- * pg_statistic definition. cpp turns this into * typedef struct FormData_pg_statistic @@ -83,7 +73,7 @@ CATALOG(pg_statistic,2619) BKI_WITHOUT_OIDS * we do not hard-wire any particular meaning for the remaining * statistical fields. Instead, we provide several "slots" in which * statistical data can be placed. Each slot includes: - * kind integer code identifying kind of data + * kind integer code identifying kind of data (see below) * op OID of associated operator, if needed * numbers float4 array (for statistical values) * values anyarray (for representations of data values) @@ -98,40 +88,36 @@ CATALOG(pg_statistic,2619) BKI_WITHOUT_OIDS int2 stakind2; int2 stakind3; int2 stakind4; + int2 stakind5; Oid staop1; Oid staop2; Oid staop3; Oid staop4; + Oid staop5; - /* - * THE REST OF THESE ARE VARIABLE LENGTH FIELDS, and may even be absent - * (NULL). They cannot be accessed as C struct entries; you have to use - * the full field access machinery (heap_getattr) for them. We declare - * them here for the catalog machinery. - */ - +#ifdef CATALOG_VARLEN /* variable-length fields start here */ float4 stanumbers1[1]; float4 stanumbers2[1]; float4 stanumbers3[1]; float4 stanumbers4[1]; + float4 stanumbers5[1]; -#ifdef CATALOG_VARLEN /* variable-length fields start here */ /* - * Values in these arrays are values of the column's data type. We - * presently have to cheat quite a bit to allow polymorphic arrays of this - * kind, but perhaps someday it'll be a less bogus facility. 
+ * Values in these arrays are values of the column's data type, or of some + * related type such as an array element type. We presently have to cheat + * quite a bit to allow polymorphic arrays of this kind, but perhaps + * someday it'll be a less bogus facility. */ anyarray stavalues1; anyarray stavalues2; anyarray stavalues3; anyarray stavalues4; + anyarray stavalues5; #endif } FormData_pg_statistic; -#define STATISTIC_NUM_SLOTS 4 - -#undef anyarray +#define STATISTIC_NUM_SLOTS 5 /* ---------------- @@ -145,7 +131,7 @@ typedef FormData_pg_statistic *Form_pg_statistic; * compiler constants for pg_statistic * ---------------- */ -#define Natts_pg_statistic 22 +#define Natts_pg_statistic 26 #define Anum_pg_statistic_starelid 1 #define Anum_pg_statistic_staattnum 2 #define Anum_pg_statistic_stainherit 3 @@ -156,22 +142,26 @@ typedef FormData_pg_statistic *Form_pg_statistic; #define Anum_pg_statistic_stakind2 8 #define Anum_pg_statistic_stakind3 9 #define Anum_pg_statistic_stakind4 10 -#define Anum_pg_statistic_staop1 11 -#define Anum_pg_statistic_staop2 12 -#define Anum_pg_statistic_staop3 13 -#define Anum_pg_statistic_staop4 14 -#define Anum_pg_statistic_stanumbers1 15 -#define Anum_pg_statistic_stanumbers2 16 -#define Anum_pg_statistic_stanumbers3 17 -#define Anum_pg_statistic_stanumbers4 18 -#define Anum_pg_statistic_stavalues1 19 -#define Anum_pg_statistic_stavalues2 20 -#define Anum_pg_statistic_stavalues3 21 -#define Anum_pg_statistic_stavalues4 22 +#define Anum_pg_statistic_stakind5 11 +#define Anum_pg_statistic_staop1 12 +#define Anum_pg_statistic_staop2 13 +#define Anum_pg_statistic_staop3 14 +#define Anum_pg_statistic_staop4 15 +#define Anum_pg_statistic_staop5 16 +#define Anum_pg_statistic_stanumbers1 17 +#define Anum_pg_statistic_stanumbers2 18 +#define Anum_pg_statistic_stanumbers3 19 +#define Anum_pg_statistic_stanumbers4 20 +#define Anum_pg_statistic_stanumbers5 21 +#define Anum_pg_statistic_stavalues1 22 +#define Anum_pg_statistic_stavalues2 23 +#define Anum_pg_statistic_stavalues3 24 +#define Anum_pg_statistic_stavalues4 25 +#define Anum_pg_statistic_stavalues5 26 /* - * Currently, three statistical slot "kinds" are defined: most common values, - * histogram, and correlation. Additional "kinds" will probably appear in + * Currently, five statistical slot "kinds" are defined by core PostgreSQL, + * as documented below. Additional "kinds" will probably appear in * future to help cope with non-scalar datatypes. Also, custom data types * can define their own "kind" codes by mutual agreement between a custom * typanalyze routine and the selectivity estimation functions of the type's @@ -250,11 +240,14 @@ typedef FormData_pg_statistic *Form_pg_statistic; * the most common element values, and stanumbers their frequencies. Unlike * MCV slots, frequencies are measured as the fraction of non-null rows the * element value appears in, not the frequency of all rows. Also unlike - * MCV slots, the values are sorted into order (to support binary search - * for a particular value). Since this puts the minimum and maximum - * frequencies at unpredictable spots in stanumbers, there are two extra - * members of stanumbers, holding copies of the minimum and maximum - * frequencies. + * MCV slots, the values are sorted into the element type's default order + * (to support binary search for a particular value). 
Since this puts the + * minimum and maximum frequencies at unpredictable spots in stanumbers, + * there are two extra members of stanumbers, holding copies of the minimum + * and maximum frequencies. Optionally, there can be a third extra member, + * which holds the frequency of null elements (expressed in the same terms: + * the fraction of non-null rows that contain at least one null element). If + * this member is omitted, the column is presumed to contain no null elements. * * Note: in current usage for tsvector columns, the stavalues elements are of * type text, even though their representation within tsvector is not @@ -262,4 +255,17 @@ typedef FormData_pg_statistic *Form_pg_statistic; */ #define STATISTIC_KIND_MCELEM 4 +/* + * A "distinct elements count histogram" slot describes the distribution of + * the number of distinct element values present in each row of an array-type + * column. Only non-null rows are considered, and only non-null elements. + * staop contains the equality operator appropriate to the element type. + * stavalues is not used and should be NULL. The last member of stanumbers is + * the average count of distinct element values over all non-null rows. The + * preceding M (>=2) members form a histogram that divides the population of + * distinct-elements counts into M-1 bins of approximately equal population. + * The first of these is the minimum observed count, and the last the maximum. + */ +#define STATISTIC_KIND_DECHIST 5 + #endif /* PG_STATISTIC_H */ diff --git a/src/include/catalog/pg_type.h b/src/include/catalog/pg_type.h index e4bca552973a0..4e3bf69e6da7e 100644 --- a/src/include/catalog/pg_type.h +++ b/src/include/catalog/pg_type.h @@ -357,8 +357,8 @@ DATA(insert OID = 114 ( json PGNSP PGUID -1 f b U f t \054 0 0 199 json_in j DATA(insert OID = 142 ( xml PGNSP PGUID -1 f b U f t \054 0 0 143 xml_in xml_out xml_recv xml_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("XML content"); #define XMLOID 142 -DATA(insert OID = 143 ( _xml PGNSP PGUID -1 f b A f t \054 0 142 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 199 ( _json PGNSP PGUID -1 f b A f t \054 0 114 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 143 ( _xml PGNSP PGUID -1 f b A f t \054 0 142 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 199 ( _json PGNSP PGUID -1 f b A f t \054 0 114 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 194 ( pg_node_tree PGNSP PGUID -1 f b S f t \054 0 0 0 pg_node_tree_in pg_node_tree_out pg_node_tree_recv pg_node_tree_send - - - i x f 0 -1 0 100 _null_ _null_ _null_ )); DESCR("string representing an internal node tree"); @@ -395,7 +395,7 @@ DESCR("geometric polygon '(pt1,...)'"); DATA(insert OID = 628 ( line PGNSP PGUID 32 f b G f t \054 0 701 629 line_in line_out line_recv line_send - - - d p f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("geometric line (not implemented)"); #define LINEOID 628 -DATA(insert OID = 629 ( _line PGNSP PGUID -1 f b A f t \054 0 628 0 array_in array_out array_recv array_send - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 629 ( _line PGNSP PGUID -1 f b A f t \054 0 628 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); DESCR(""); /* OIDS 700 - 799 */ @@ -422,11 +422,11 @@ DESCR(""); DATA(insert OID 
= 718 ( circle PGNSP PGUID 24 f b G f t \054 0 0 719 circle_in circle_out circle_recv circle_send - - - d p f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("geometric circle '(center,radius)'"); #define CIRCLEOID 718 -DATA(insert OID = 719 ( _circle PGNSP PGUID -1 f b A f t \054 0 718 0 array_in array_out array_recv array_send - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 719 ( _circle PGNSP PGUID -1 f b A f t \054 0 718 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 790 ( money PGNSP PGUID 8 FLOAT8PASSBYVAL b N f t \054 0 0 791 cash_in cash_out cash_recv cash_send - - - d p f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("monetary amounts, $d,ddd.cc"); #define CASHOID 790 -DATA(insert OID = 791 ( _money PGNSP PGUID -1 f b A f t \054 0 790 0 array_in array_out array_recv array_send - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 791 ( _money PGNSP PGUID -1 f b A f t \054 0 790 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); /* OIDS 800 - 899 */ DATA(insert OID = 829 ( macaddr PGNSP PGUID 6 f b U f t \054 0 0 1040 macaddr_in macaddr_out macaddr_recv macaddr_send - - - i p f 0 -1 0 0 _null_ _null_ _null_ )); @@ -442,44 +442,44 @@ DESCR("network IP address/netmask, network address"); /* OIDS 900 - 999 */ /* OIDS 1000 - 1099 */ -DATA(insert OID = 1000 ( _bool PGNSP PGUID -1 f b A f t \054 0 16 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1001 ( _bytea PGNSP PGUID -1 f b A f t \054 0 17 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1002 ( _char PGNSP PGUID -1 f b A f t \054 0 18 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1003 ( _name PGNSP PGUID -1 f b A f t \054 0 19 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1005 ( _int2 PGNSP PGUID -1 f b A f t \054 0 21 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1006 ( _int2vector PGNSP PGUID -1 f b A f t \054 0 22 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1007 ( _int4 PGNSP PGUID -1 f b A f t \054 0 23 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1000 ( _bool PGNSP PGUID -1 f b A f t \054 0 16 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1001 ( _bytea PGNSP PGUID -1 f b A f t \054 0 17 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1002 ( _char PGNSP PGUID -1 f b A f t \054 0 18 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1003 ( _name PGNSP PGUID -1 f b A f t \054 0 19 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1005 ( _int2 PGNSP PGUID -1 f b A f t \054 0 21 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1006 ( _int2vector PGNSP PGUID -1 f b A f t \054 0 22 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1007 
( _int4 PGNSP PGUID -1 f b A f t \054 0 23 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); #define INT4ARRAYOID 1007 -DATA(insert OID = 1008 ( _regproc PGNSP PGUID -1 f b A f t \054 0 24 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1009 ( _text PGNSP PGUID -1 f b A f t \054 0 25 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 100 _null_ _null_ _null_ )); +DATA(insert OID = 1008 ( _regproc PGNSP PGUID -1 f b A f t \054 0 24 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1009 ( _text PGNSP PGUID -1 f b A f t \054 0 25 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 100 _null_ _null_ _null_ )); #define TEXTARRAYOID 1009 -DATA(insert OID = 1028 ( _oid PGNSP PGUID -1 f b A f t \054 0 26 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1010 ( _tid PGNSP PGUID -1 f b A f t \054 0 27 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1011 ( _xid PGNSP PGUID -1 f b A f t \054 0 28 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1012 ( _cid PGNSP PGUID -1 f b A f t \054 0 29 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1013 ( _oidvector PGNSP PGUID -1 f b A f t \054 0 30 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1014 ( _bpchar PGNSP PGUID -1 f b A f t \054 0 1042 0 array_in array_out array_recv array_send bpchartypmodin bpchartypmodout - i x f 0 -1 0 100 _null_ _null_ _null_ )); -DATA(insert OID = 1015 ( _varchar PGNSP PGUID -1 f b A f t \054 0 1043 0 array_in array_out array_recv array_send varchartypmodin varchartypmodout - i x f 0 -1 0 100 _null_ _null_ _null_ )); -DATA(insert OID = 1016 ( _int8 PGNSP PGUID -1 f b A f t \054 0 20 0 array_in array_out array_recv array_send - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1017 ( _point PGNSP PGUID -1 f b A f t \054 0 600 0 array_in array_out array_recv array_send - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1018 ( _lseg PGNSP PGUID -1 f b A f t \054 0 601 0 array_in array_out array_recv array_send - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1019 ( _path PGNSP PGUID -1 f b A f t \054 0 602 0 array_in array_out array_recv array_send - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1020 ( _box PGNSP PGUID -1 f b A f t \073 0 603 0 array_in array_out array_recv array_send - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1021 ( _float4 PGNSP PGUID -1 f b A f t \054 0 700 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1028 ( _oid PGNSP PGUID -1 f b A f t \054 0 26 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1010 ( _tid PGNSP PGUID -1 f b A f t \054 0 27 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1011 ( _xid PGNSP PGUID -1 f b A f t \054 0 28 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1012 ( _cid PGNSP PGUID -1 f b A f t \054 0 29 0 array_in 
array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1013 ( _oidvector PGNSP PGUID -1 f b A f t \054 0 30 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1014 ( _bpchar PGNSP PGUID -1 f b A f t \054 0 1042 0 array_in array_out array_recv array_send bpchartypmodin bpchartypmodout array_typanalyze i x f 0 -1 0 100 _null_ _null_ _null_ )); +DATA(insert OID = 1015 ( _varchar PGNSP PGUID -1 f b A f t \054 0 1043 0 array_in array_out array_recv array_send varchartypmodin varchartypmodout array_typanalyze i x f 0 -1 0 100 _null_ _null_ _null_ )); +DATA(insert OID = 1016 ( _int8 PGNSP PGUID -1 f b A f t \054 0 20 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1017 ( _point PGNSP PGUID -1 f b A f t \054 0 600 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1018 ( _lseg PGNSP PGUID -1 f b A f t \054 0 601 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1019 ( _path PGNSP PGUID -1 f b A f t \054 0 602 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1020 ( _box PGNSP PGUID -1 f b A f t \073 0 603 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1021 ( _float4 PGNSP PGUID -1 f b A f t \054 0 700 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); #define FLOAT4ARRAYOID 1021 -DATA(insert OID = 1022 ( _float8 PGNSP PGUID -1 f b A f t \054 0 701 0 array_in array_out array_recv array_send - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1023 ( _abstime PGNSP PGUID -1 f b A f t \054 0 702 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1024 ( _reltime PGNSP PGUID -1 f b A f t \054 0 703 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1025 ( _tinterval PGNSP PGUID -1 f b A f t \054 0 704 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1027 ( _polygon PGNSP PGUID -1 f b A f t \054 0 604 0 array_in array_out array_recv array_send - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1022 ( _float8 PGNSP PGUID -1 f b A f t \054 0 701 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1023 ( _abstime PGNSP PGUID -1 f b A f t \054 0 702 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1024 ( _reltime PGNSP PGUID -1 f b A f t \054 0 703 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1025 ( _tinterval PGNSP PGUID -1 f b A f t \054 0 704 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1027 ( _polygon PGNSP PGUID -1 f b A f t \054 0 604 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 1033 ( aclitem PGNSP PGUID 12 f b U f t \054 0 0 1034 aclitemin aclitemout - - - - - i p f 0 -1 0 0 _null_ _null_ 
_null_ )); DESCR("access control list"); #define ACLITEMOID 1033 -DATA(insert OID = 1034 ( _aclitem PGNSP PGUID -1 f b A f t \054 0 1033 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1040 ( _macaddr PGNSP PGUID -1 f b A f t \054 0 829 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1041 ( _inet PGNSP PGUID -1 f b A f t \054 0 869 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 651 ( _cidr PGNSP PGUID -1 f b A f t \054 0 650 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1263 ( _cstring PGNSP PGUID -1 f b A f t \054 0 2275 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1034 ( _aclitem PGNSP PGUID -1 f b A f t \054 0 1033 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1040 ( _macaddr PGNSP PGUID -1 f b A f t \054 0 829 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1041 ( _inet PGNSP PGUID -1 f b A f t \054 0 869 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 651 ( _cidr PGNSP PGUID -1 f b A f t \054 0 650 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1263 ( _cstring PGNSP PGUID -1 f b A f t \054 0 2275 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); #define CSTRINGARRAYOID 1263 DATA(insert OID = 1042 ( bpchar PGNSP PGUID -1 f b S f t \054 0 0 1014 bpcharin bpcharout bpcharrecv bpcharsend bpchartypmodin bpchartypmodout - i x f 0 -1 0 100 _null_ _null_ _null_ )); @@ -500,34 +500,34 @@ DESCR("time of day"); DATA(insert OID = 1114 ( timestamp PGNSP PGUID 8 FLOAT8PASSBYVAL b D f t \054 0 0 1115 timestamp_in timestamp_out timestamp_recv timestamp_send timestamptypmodin timestamptypmodout - d p f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("date and time"); #define TIMESTAMPOID 1114 -DATA(insert OID = 1115 ( _timestamp PGNSP PGUID -1 f b A f t \054 0 1114 0 array_in array_out array_recv array_send timestamptypmodin timestamptypmodout - d x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1182 ( _date PGNSP PGUID -1 f b A f t \054 0 1082 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 1183 ( _time PGNSP PGUID -1 f b A f t \054 0 1083 0 array_in array_out array_recv array_send timetypmodin timetypmodout - d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1115 ( _timestamp PGNSP PGUID -1 f b A f t \054 0 1114 0 array_in array_out array_recv array_send timestamptypmodin timestamptypmodout array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1182 ( _date PGNSP PGUID -1 f b A f t \054 0 1082 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1183 ( _time PGNSP PGUID -1 f b A f t \054 0 1083 0 array_in array_out array_recv array_send timetypmodin timetypmodout array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 1184 ( timestamptz PGNSP PGUID 8 FLOAT8PASSBYVAL b D t t \054 0 0 1185 timestamptz_in timestamptz_out timestamptz_recv timestamptz_send timestamptztypmodin 
timestamptztypmodout - d p f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("date and time with time zone"); #define TIMESTAMPTZOID 1184 -DATA(insert OID = 1185 ( _timestamptz PGNSP PGUID -1 f b A f t \054 0 1184 0 array_in array_out array_recv array_send timestamptztypmodin timestamptztypmodout - d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1185 ( _timestamptz PGNSP PGUID -1 f b A f t \054 0 1184 0 array_in array_out array_recv array_send timestamptztypmodin timestamptztypmodout array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 1186 ( interval PGNSP PGUID 16 f b T t t \054 0 0 1187 interval_in interval_out interval_recv interval_send intervaltypmodin intervaltypmodout - d p f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("@ , time interval"); #define INTERVALOID 1186 -DATA(insert OID = 1187 ( _interval PGNSP PGUID -1 f b A f t \054 0 1186 0 array_in array_out array_recv array_send intervaltypmodin intervaltypmodout - d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1187 ( _interval PGNSP PGUID -1 f b A f t \054 0 1186 0 array_in array_out array_recv array_send intervaltypmodin intervaltypmodout array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); /* OIDS 1200 - 1299 */ -DATA(insert OID = 1231 ( _numeric PGNSP PGUID -1 f b A f t \054 0 1700 0 array_in array_out array_recv array_send numerictypmodin numerictypmodout - i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1231 ( _numeric PGNSP PGUID -1 f b A f t \054 0 1700 0 array_in array_out array_recv array_send numerictypmodin numerictypmodout array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 1266 ( timetz PGNSP PGUID 12 f b D f t \054 0 0 1270 timetz_in timetz_out timetz_recv timetz_send timetztypmodin timetztypmodout - d p f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("time of day with time zone"); #define TIMETZOID 1266 -DATA(insert OID = 1270 ( _timetz PGNSP PGUID -1 f b A f t \054 0 1266 0 array_in array_out array_recv array_send timetztypmodin timetztypmodout - d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1270 ( _timetz PGNSP PGUID -1 f b A f t \054 0 1266 0 array_in array_out array_recv array_send timetztypmodin timetztypmodout array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); /* OIDS 1500 - 1599 */ DATA(insert OID = 1560 ( bit PGNSP PGUID -1 f b V f t \054 0 0 1561 bit_in bit_out bit_recv bit_send bittypmodin bittypmodout - i x f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("fixed-length bit string"); #define BITOID 1560 -DATA(insert OID = 1561 ( _bit PGNSP PGUID -1 f b A f t \054 0 1560 0 array_in array_out array_recv array_send bittypmodin bittypmodout - i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1561 ( _bit PGNSP PGUID -1 f b A f t \054 0 1560 0 array_in array_out array_recv array_send bittypmodin bittypmodout array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 1562 ( varbit PGNSP PGUID -1 f b V t t \054 0 0 1563 varbit_in varbit_out varbit_recv varbit_send varbittypmodin varbittypmodout - i x f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("variable-length bit string"); #define VARBITOID 1562 -DATA(insert OID = 1563 ( _varbit PGNSP PGUID -1 f b A f t \054 0 1562 0 array_in array_out array_recv array_send varbittypmodin varbittypmodout - i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 1563 ( _varbit PGNSP PGUID -1 f b A f t \054 0 1562 0 array_in array_out array_recv array_send varbittypmodin varbittypmodout array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); /* OIDS 1600 - 1699 */ @@ -541,7 
+541,7 @@ DESCR("reference to cursor (portal name)"); #define REFCURSOROID 1790 /* OIDS 2200 - 2299 */ -DATA(insert OID = 2201 ( _refcursor PGNSP PGUID -1 f b A f t \054 0 1790 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 2201 ( _refcursor PGNSP PGUID -1 f b A f t \054 0 1790 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 2202 ( regprocedure PGNSP PGUID 4 t b N f t \054 0 0 2207 regprocedurein regprocedureout regprocedurerecv regproceduresend - - - i p f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("registered procedure (with args)"); @@ -563,17 +563,17 @@ DATA(insert OID = 2206 ( regtype PGNSP PGUID 4 t b N f t \054 0 0 2211 regty DESCR("registered type"); #define REGTYPEOID 2206 -DATA(insert OID = 2207 ( _regprocedure PGNSP PGUID -1 f b A f t \054 0 2202 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 2208 ( _regoper PGNSP PGUID -1 f b A f t \054 0 2203 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 2209 ( _regoperator PGNSP PGUID -1 f b A f t \054 0 2204 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 2210 ( _regclass PGNSP PGUID -1 f b A f t \054 0 2205 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 2211 ( _regtype PGNSP PGUID -1 f b A f t \054 0 2206 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 2207 ( _regprocedure PGNSP PGUID -1 f b A f t \054 0 2202 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 2208 ( _regoper PGNSP PGUID -1 f b A f t \054 0 2203 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 2209 ( _regoperator PGNSP PGUID -1 f b A f t \054 0 2204 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 2210 ( _regclass PGNSP PGUID -1 f b A f t \054 0 2205 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 2211 ( _regtype PGNSP PGUID -1 f b A f t \054 0 2206 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); #define REGTYPEARRAYOID 2211 /* uuid */ DATA(insert OID = 2950 ( uuid PGNSP PGUID 16 f b U f t \054 0 0 2951 uuid_in uuid_out uuid_recv uuid_send - - - c p f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("UUID datatype"); -DATA(insert OID = 2951 ( _uuid PGNSP PGUID -1 f b A f t \054 0 2950 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 2951 ( _uuid PGNSP PGUID -1 f b A f t \054 0 2950 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); /* text search */ DATA(insert OID = 3614 ( tsvector PGNSP PGUID -1 f b U f t \054 0 0 3643 tsvectorin tsvectorout tsvectorrecv tsvectorsend - - ts_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); @@ -592,36 +592,36 @@ DATA(insert OID = 3769 ( regdictionary PGNSP PGUID 4 t b N f t \054 0 0 3770 reg DESCR("registered text search dictionary"); #define REGDICTIONARYOID 3769 -DATA(insert OID = 3643 ( _tsvector PGNSP PGUID -1 f b A f t \054 0 3614 0 array_in array_out array_recv 
array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 3644 ( _gtsvector PGNSP PGUID -1 f b A f t \054 0 3642 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 3645 ( _tsquery PGNSP PGUID -1 f b A f t \054 0 3615 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 3735 ( _regconfig PGNSP PGUID -1 f b A f t \054 0 3734 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); -DATA(insert OID = 3770 ( _regdictionary PGNSP PGUID -1 f b A f t \054 0 3769 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 3643 ( _tsvector PGNSP PGUID -1 f b A f t \054 0 3614 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 3644 ( _gtsvector PGNSP PGUID -1 f b A f t \054 0 3642 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 3645 ( _tsquery PGNSP PGUID -1 f b A f t \054 0 3615 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 3735 ( _regconfig PGNSP PGUID -1 f b A f t \054 0 3734 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 3770 ( _regdictionary PGNSP PGUID -1 f b A f t \054 0 3769 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 2970 ( txid_snapshot PGNSP PGUID -1 f b U f t \054 0 0 2949 txid_snapshot_in txid_snapshot_out txid_snapshot_recv txid_snapshot_send - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("txid snapshot"); -DATA(insert OID = 2949 ( _txid_snapshot PGNSP PGUID -1 f b A f t \054 0 2970 0 array_in array_out array_recv array_send - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 2949 ( _txid_snapshot PGNSP PGUID -1 f b A f t \054 0 2970 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); /* range types */ DATA(insert OID = 3904 ( int4range PGNSP PGUID -1 f r R f t \054 0 0 3905 range_in range_out range_recv range_send - - range_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("range of integers"); #define INT4RANGEOID 3904 -DATA(insert OID = 3905 ( _int4range PGNSP PGUID -1 f b A f t \054 0 3904 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 3905 ( _int4range PGNSP PGUID -1 f b A f t \054 0 3904 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 3906 ( numrange PGNSP PGUID -1 f r R f t \054 0 0 3907 range_in range_out range_recv range_send - - range_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("range of numerics"); -DATA(insert OID = 3907 ( _numrange PGNSP PGUID -1 f b A f t \054 0 3906 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 3907 ( _numrange PGNSP PGUID -1 f b A f t \054 0 3906 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 3908 ( tsrange PGNSP PGUID -1 f r R f t \054 0 0 3909 range_in range_out range_recv range_send - - range_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("range of timestamps without time zone"); -DATA(insert OID = 3909 ( 
_tsrange PGNSP PGUID -1 f b A f t \054 0 3908 0 array_in array_out array_recv array_send - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 3909 ( _tsrange PGNSP PGUID -1 f b A f t \054 0 3908 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 3910 ( tstzrange PGNSP PGUID -1 f r R f t \054 0 0 3911 range_in range_out range_recv range_send - - range_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("range of timestamps with time zone"); -DATA(insert OID = 3911 ( _tstzrange PGNSP PGUID -1 f b A f t \054 0 3910 0 array_in array_out array_recv array_send - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 3911 ( _tstzrange PGNSP PGUID -1 f b A f t \054 0 3910 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 3912 ( daterange PGNSP PGUID -1 f r R f t \054 0 0 3913 range_in range_out range_recv range_send - - range_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("range of dates"); -DATA(insert OID = 3913 ( _daterange PGNSP PGUID -1 f b A f t \054 0 3912 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 3913 ( _daterange PGNSP PGUID -1 f b A f t \054 0 3912 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 3926 ( int8range PGNSP PGUID -1 f r R f t \054 0 0 3927 range_in range_out range_recv range_send - - range_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("range of bigints"); -DATA(insert OID = 3927 ( _int8range PGNSP PGUID -1 f b A f t \054 0 3926 0 array_in array_out array_recv array_send - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 3927 ( _int8range PGNSP PGUID -1 f b A f t \054 0 3926 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); /* * pseudo-types @@ -638,7 +638,7 @@ DATA(insert OID = 3927 ( _int8range PGNSP PGUID -1 f b A f t \054 0 3926 0 arr */ DATA(insert OID = 2249 ( record PGNSP PGUID -1 f p P f t \054 0 0 2287 record_in record_out record_recv record_send - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); #define RECORDOID 2249 -DATA(insert OID = 2287 ( _record PGNSP PGUID -1 f p P f t \054 0 2249 0 array_in array_out array_recv array_send - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); +DATA(insert OID = 2287 ( _record PGNSP PGUID -1 f p P f t \054 0 2249 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); #define RECORDARRAYOID 2287 DATA(insert OID = 2275 ( cstring PGNSP PGUID -2 f p P f t \054 0 0 1263 cstring_in cstring_out cstring_recv cstring_send - - - c p f 0 -1 0 0 _null_ _null_ _null_ )); #define CSTRINGOID 2275 diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 4526648a4fd15..3deee66b41369 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -61,6 +61,11 @@ typedef struct VacAttrStats *VacAttrStatsP; typedef Datum (*AnalyzeAttrFetchFunc) (VacAttrStatsP stats, int rownum, bool *isNull); +typedef void (*AnalyzeAttrComputeStatsFunc) (VacAttrStatsP stats, + AnalyzeAttrFetchFunc fetchfunc, + int samplerows, + double totalrows); + typedef struct VacAttrStats { /* @@ -83,10 +88,7 @@ typedef struct VacAttrStats * These fields must be filled in by the typanalyze routine, unless it * returns FALSE. 
*/ - void (*compute_stats) (VacAttrStatsP stats, - AnalyzeAttrFetchFunc fetchfunc, - int samplerows, - double totalrows); + AnalyzeAttrComputeStatsFunc compute_stats; /* function pointer */ int minrows; /* Minimum # of rows wanted for stats */ void *extra_data; /* for extra type-specific data */ @@ -167,5 +169,6 @@ extern void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, /* in commands/analyze.c */ extern void analyze_rel(Oid relid, VacuumStmt *vacstmt, BufferAccessStrategy bstrategy); +extern bool std_typanalyze(VacAttrStats *stats); #endif /* VACUUM_H */ diff --git a/src/include/utils/array.h b/src/include/utils/array.h index c6d0ad67f8802..1da20fefdabe1 100644 --- a/src/include/utils/array.h +++ b/src/include/utils/array.h @@ -289,4 +289,9 @@ extern ArrayType *create_singleton_array(FunctionCallInfo fcinfo, extern Datum array_agg_transfn(PG_FUNCTION_ARGS); extern Datum array_agg_finalfn(PG_FUNCTION_ARGS); +/* + * prototypes for functions defined in array_typanalyze.c + */ +extern Datum array_typanalyze(PG_FUNCTION_ARGS); + #endif /* ARRAY_H */ diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index bffc2d80ef012..4529f276839a0 100644 --- a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -95,9 +95,6 @@ typedef enum Pattern_Prefix_None, Pattern_Prefix_Partial, Pattern_Prefix_Exact } Pattern_Prefix_Status; - -/* selfuncs.c */ - /* Hooks for plugins to get control when we ask for stats */ typedef bool (*get_relation_stats_hook_type) (PlannerInfo *root, RangeTblEntry *rte, @@ -110,6 +107,8 @@ typedef bool (*get_index_stats_hook_type) (PlannerInfo *root, VariableStatData *vardata); extern PGDLLIMPORT get_index_stats_hook_type get_index_stats_hook; +/* Functions in selfuncs.c */ + extern void examine_variable(PlannerInfo *root, Node *node, int varRelid, VariableStatData *vardata); extern bool get_restriction_variable(PlannerInfo *root, List *args, @@ -197,4 +196,13 @@ extern Datum gistcostestimate(PG_FUNCTION_ARGS); extern Datum spgcostestimate(PG_FUNCTION_ARGS); extern Datum gincostestimate(PG_FUNCTION_ARGS); +/* Functions in array_selfuncs.c */ + +extern Selectivity scalararraysel_containment(PlannerInfo *root, + Node *leftop, Node *rightop, + Oid elemtype, bool isEquality, bool useOr, + int varRelid); +extern Datum arraycontsel(PG_FUNCTION_ARGS); +extern Datum arraycontjoinsel(PG_FUNCTION_ARGS); + #endif /* SELFUNCS_H */ diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out index 6e5534995d909..9865b69b8b836 100644 --- a/src/test/regress/expected/arrays.out +++ b/src/test/regress/expected/arrays.out @@ -421,6 +421,7 @@ SELECT 0 || ARRAY[1,2] || 3 AS "{0,1,2,3}"; {0,1,2,3} (1 row) +ANALYZE array_op_test; SELECT * FROM array_op_test WHERE i @> '{32}' ORDER BY seqno; seqno | i | t -------+---------------------------------+------------------------------------------------------------------------------------------------------------------------------------ diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 0275a0e120e40..aaf0cca026091 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1317,7 +1317,7 @@ SELECT viewname, definition FROM pg_views WHERE schemaname <> 'information_schem pg_statio_user_indexes | SELECT pg_statio_all_indexes.relid, pg_statio_all_indexes.indexrelid, pg_statio_all_indexes.schemaname, pg_statio_all_indexes.relname, pg_statio_all_indexes.indexrelname, pg_statio_all_indexes.idx_blks_read, 
pg_statio_all_indexes.idx_blks_hit FROM pg_statio_all_indexes WHERE ((pg_statio_all_indexes.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_statio_all_indexes.schemaname !~ '^pg_toast'::text)); pg_statio_user_sequences | SELECT pg_statio_all_sequences.relid, pg_statio_all_sequences.schemaname, pg_statio_all_sequences.relname, pg_statio_all_sequences.blks_read, pg_statio_all_sequences.blks_hit FROM pg_statio_all_sequences WHERE ((pg_statio_all_sequences.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_statio_all_sequences.schemaname !~ '^pg_toast'::text)); pg_statio_user_tables | SELECT pg_statio_all_tables.relid, pg_statio_all_tables.schemaname, pg_statio_all_tables.relname, pg_statio_all_tables.heap_blks_read, pg_statio_all_tables.heap_blks_hit, pg_statio_all_tables.idx_blks_read, pg_statio_all_tables.idx_blks_hit, pg_statio_all_tables.toast_blks_read, pg_statio_all_tables.toast_blks_hit, pg_statio_all_tables.tidx_blks_read, pg_statio_all_tables.tidx_blks_hit FROM pg_statio_all_tables WHERE ((pg_statio_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_statio_all_tables.schemaname !~ '^pg_toast'::text)); - pg_stats | SELECT n.nspname AS schemaname, c.relname AS tablename, a.attname, s.stainherit AS inherited, s.stanullfrac AS null_frac, s.stawidth AS avg_width, s.stadistinct AS n_distinct, CASE WHEN (s.stakind1 = ANY (ARRAY[1, 4])) THEN s.stavalues1 WHEN (s.stakind2 = ANY (ARRAY[1, 4])) THEN s.stavalues2 WHEN (s.stakind3 = ANY (ARRAY[1, 4])) THEN s.stavalues3 WHEN (s.stakind4 = ANY (ARRAY[1, 4])) THEN s.stavalues4 ELSE NULL::anyarray END AS most_common_vals, CASE WHEN (s.stakind1 = ANY (ARRAY[1, 4])) THEN s.stanumbers1 WHEN (s.stakind2 = ANY (ARRAY[1, 4])) THEN s.stanumbers2 WHEN (s.stakind3 = ANY (ARRAY[1, 4])) THEN s.stanumbers3 WHEN (s.stakind4 = ANY (ARRAY[1, 4])) THEN s.stanumbers4 ELSE NULL::real[] END AS most_common_freqs, CASE WHEN (s.stakind1 = 2) THEN s.stavalues1 WHEN (s.stakind2 = 2) THEN s.stavalues2 WHEN (s.stakind3 = 2) THEN s.stavalues3 WHEN (s.stakind4 = 2) THEN s.stavalues4 ELSE NULL::anyarray END AS histogram_bounds, CASE WHEN (s.stakind1 = 3) THEN s.stanumbers1[1] WHEN (s.stakind2 = 3) THEN s.stanumbers2[1] WHEN (s.stakind3 = 3) THEN s.stanumbers3[1] WHEN (s.stakind4 = 3) THEN s.stanumbers4[1] ELSE NULL::real END AS correlation FROM (((pg_statistic s JOIN pg_class c ON ((c.oid = s.starelid))) JOIN pg_attribute a ON (((c.oid = a.attrelid) AND (a.attnum = s.staattnum)))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE ((NOT a.attisdropped) AND has_column_privilege(c.oid, a.attnum, 'select'::text)); + pg_stats | SELECT n.nspname AS schemaname, c.relname AS tablename, a.attname, s.stainherit AS inherited, s.stanullfrac AS null_frac, s.stawidth AS avg_width, s.stadistinct AS n_distinct, CASE WHEN (s.stakind1 = 1) THEN s.stavalues1 WHEN (s.stakind2 = 1) THEN s.stavalues2 WHEN (s.stakind3 = 1) THEN s.stavalues3 WHEN (s.stakind4 = 1) THEN s.stavalues4 WHEN (s.stakind5 = 1) THEN s.stavalues5 ELSE NULL::anyarray END AS most_common_vals, CASE WHEN (s.stakind1 = 1) THEN s.stanumbers1 WHEN (s.stakind2 = 1) THEN s.stanumbers2 WHEN (s.stakind3 = 1) THEN s.stanumbers3 WHEN (s.stakind4 = 1) THEN s.stanumbers4 WHEN (s.stakind5 = 1) THEN s.stanumbers5 ELSE NULL::real[] END AS most_common_freqs, CASE WHEN (s.stakind1 = 2) THEN s.stavalues1 WHEN (s.stakind2 = 2) THEN s.stavalues2 WHEN (s.stakind3 = 2) THEN s.stavalues3 WHEN (s.stakind4 = 2) THEN s.stavalues4 WHEN 
(s.stakind5 = 2) THEN s.stavalues5 ELSE NULL::anyarray END AS histogram_bounds, CASE WHEN (s.stakind1 = 3) THEN s.stanumbers1[1] WHEN (s.stakind2 = 3) THEN s.stanumbers2[1] WHEN (s.stakind3 = 3) THEN s.stanumbers3[1] WHEN (s.stakind4 = 3) THEN s.stanumbers4[1] WHEN (s.stakind5 = 3) THEN s.stanumbers5[1] ELSE NULL::real END AS correlation, CASE WHEN (s.stakind1 = 4) THEN s.stavalues1 WHEN (s.stakind2 = 4) THEN s.stavalues2 WHEN (s.stakind3 = 4) THEN s.stavalues3 WHEN (s.stakind4 = 4) THEN s.stavalues4 WHEN (s.stakind5 = 4) THEN s.stavalues5 ELSE NULL::anyarray END AS most_common_elems, CASE WHEN (s.stakind1 = 4) THEN s.stanumbers1 WHEN (s.stakind2 = 4) THEN s.stanumbers2 WHEN (s.stakind3 = 4) THEN s.stanumbers3 WHEN (s.stakind4 = 4) THEN s.stanumbers4 WHEN (s.stakind5 = 4) THEN s.stanumbers5 ELSE NULL::real[] END AS most_common_elem_freqs, CASE WHEN (s.stakind1 = 5) THEN s.stanumbers1 WHEN (s.stakind2 = 5) THEN s.stanumbers2 WHEN (s.stakind3 = 5) THEN s.stanumbers3 WHEN (s.stakind4 = 5) THEN s.stanumbers4 WHEN (s.stakind5 = 5) THEN s.stanumbers5 ELSE NULL::real[] END AS elem_count_histogram FROM (((pg_statistic s JOIN pg_class c ON ((c.oid = s.starelid))) JOIN pg_attribute a ON (((c.oid = a.attrelid) AND (a.attnum = s.staattnum)))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE ((NOT a.attisdropped) AND has_column_privilege(c.oid, a.attnum, 'select'::text)); pg_tables | SELECT n.nspname AS schemaname, c.relname AS tablename, pg_get_userbyid(c.relowner) AS tableowner, t.spcname AS tablespace, c.relhasindex AS hasindexes, c.relhasrules AS hasrules, c.relhastriggers AS hastriggers FROM ((pg_class c LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) LEFT JOIN pg_tablespace t ON ((t.oid = c.reltablespace))) WHERE (c.relkind = 'r'::"char"); pg_timezone_abbrevs | SELECT pg_timezone_abbrevs.abbrev, pg_timezone_abbrevs.utc_offset, pg_timezone_abbrevs.is_dst FROM pg_timezone_abbrevs() pg_timezone_abbrevs(abbrev, utc_offset, is_dst); pg_timezone_names | SELECT pg_timezone_names.name, pg_timezone_names.abbrev, pg_timezone_names.utc_offset, pg_timezone_names.is_dst FROM pg_timezone_names() pg_timezone_names(name, abbrev, utc_offset, is_dst); diff --git a/src/test/regress/expected/type_sanity.out b/src/test/regress/expected/type_sanity.out index 0e1dfd84861e1..70eab92370267 100644 --- a/src/test/regress/expected/type_sanity.out +++ b/src/test/regress/expected/type_sanity.out @@ -375,6 +375,39 @@ WHERE p1.typanalyze = p2.oid AND NOT -----+---------+-----+--------- (0 rows) +-- domains inherit their base type's typanalyze +SELECT d.oid, d.typname, d.typanalyze, t.oid, t.typname, t.typanalyze +FROM pg_type d JOIN pg_type t ON d.typbasetype = t.oid +WHERE d.typanalyze != t.typanalyze; + oid | typname | typanalyze | oid | typname | typanalyze +-----+---------+------------+-----+---------+------------ +(0 rows) + +-- range_typanalyze should be used for all and only range types +-- (but exclude domains, which we checked above) +SELECT t.oid, t.typname, t.typanalyze +FROM pg_type t LEFT JOIN pg_range r on t.oid = r.rngtypid +WHERE t.typbasetype = 0 AND + (t.typanalyze = 'range_typanalyze'::regproc) != (r.rngtypid IS NOT NULL); + oid | typname | typanalyze +-----+---------+------------ +(0 rows) + +-- array_typanalyze should be used for all and only array types +-- (but exclude domains, which we checked above) +-- As of 9.2 this finds int2vector and oidvector, which are weird anyway +SELECT t.oid, t.typname, t.typanalyze +FROM pg_type t +WHERE t.typbasetype = 0 AND + (t.typanalyze = 
'array_typanalyze'::regproc) != + (typelem != 0 AND typlen < 0) +ORDER BY 1; + oid | typname | typanalyze +-----+------------+------------ + 22 | int2vector | - + 30 | oidvector | - +(2 rows) + -- **************** pg_class **************** -- Look for illegal values in pg_class fields SELECT p1.oid, p1.relname diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql index 9ea53b1544bbc..294b44ee086f4 100644 --- a/src/test/regress/sql/arrays.sql +++ b/src/test/regress/sql/arrays.sql @@ -196,6 +196,8 @@ SELECT ARRAY[[1,2],[3,4]] || ARRAY[5,6] AS "{{1,2},{3,4},{5,6}}"; SELECT ARRAY[0,0] || ARRAY[1,1] || ARRAY[2,2] AS "{0,0,1,1,2,2}"; SELECT 0 || ARRAY[1,2] || 3 AS "{0,1,2,3}"; +ANALYZE array_op_test; + SELECT * FROM array_op_test WHERE i @> '{32}' ORDER BY seqno; SELECT * FROM array_op_test WHERE i && '{32}' ORDER BY seqno; SELECT * FROM array_op_test WHERE i @> '{17}' ORDER BY seqno; diff --git a/src/test/regress/sql/type_sanity.sql b/src/test/regress/sql/type_sanity.sql index c6a70ad14c51f..413b220d592d7 100644 --- a/src/test/regress/sql/type_sanity.sql +++ b/src/test/regress/sql/type_sanity.sql @@ -272,6 +272,31 @@ WHERE p1.typanalyze = p2.oid AND NOT p2.proargtypes[0] = 'internal'::regtype AND p2.prorettype = 'bool'::regtype AND NOT p2.proretset); +-- domains inherit their base type's typanalyze + +SELECT d.oid, d.typname, d.typanalyze, t.oid, t.typname, t.typanalyze +FROM pg_type d JOIN pg_type t ON d.typbasetype = t.oid +WHERE d.typanalyze != t.typanalyze; + +-- range_typanalyze should be used for all and only range types +-- (but exclude domains, which we checked above) + +SELECT t.oid, t.typname, t.typanalyze +FROM pg_type t LEFT JOIN pg_range r on t.oid = r.rngtypid +WHERE t.typbasetype = 0 AND + (t.typanalyze = 'range_typanalyze'::regproc) != (r.rngtypid IS NOT NULL); + +-- array_typanalyze should be used for all and only array types +-- (but exclude domains, which we checked above) +-- As of 9.2 this finds int2vector and oidvector, which are weird anyway + +SELECT t.oid, t.typname, t.typanalyze +FROM pg_type t +WHERE t.typbasetype = 0 AND + (t.typanalyze = 'array_typanalyze'::regproc) != + (typelem != 0 AND typlen < 0) +ORDER BY 1; + -- **************** pg_class **************** -- Look for illegal values in pg_class fields From bc5ac3686580079bd4ea26bf027178786d77a9ee Mon Sep 17 00:00:00 2001 From: Magnus Hagander Date: Sun, 4 Mar 2012 12:15:24 +0100 Subject: [PATCH 079/129] Add function pg_xlog_location_diff to help comparisons Comparing two xlog locations is useful, for example when calculating replication lag. Euler Taveira de Oliveira, reviewed by Fujii Masao, and some cleanups from me --- doc/src/sgml/func.sgml | 20 +++++- src/backend/access/transam/xlogfuncs.c | 90 ++++++++++++++++++++++++++ src/include/access/xlog_internal.h | 1 + src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_proc.h | 3 + 5 files changed, 114 insertions(+), 2 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 5c1cff3618d52..34fea16eeeeb5 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -14475,11 +14475,15 @@ SELECT set_config('log_statement_stats', 'off', false); pg_xlogfile_name_offset + + pg_xlog_location_diff + The functions shown in assist in making on-line backups. - These functions cannot be executed during recovery. + These functions cannot be executed during recovery (except + pg_xlog_location_diff).
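Illustrative usage of the new function (a sketch, not part of the patch; it assumes the 9.2-era pg_stat_replication columns client_addr and replay_location), run on the primary to report per-standby replication lag in bytes:

postgres=# SELECT client_addr,
                  pg_xlog_location_diff(pg_current_xlog_location(),
                                        replay_location) AS lag_bytes
           FROM pg_stat_replication;

Because a location string is an "xlogid/xrecoff" pair rather than a plain integer, the patch computes the result as XLogFileSize * (xlogid1 - xlogid2) + xrecoff1 - xrecoff2; for example, pg_xlog_location_diff('0/3000000', '0/2000000') yields 16777216, since the xlogid parts are equal and 0x3000000 - 0x2000000 = 0x1000000 bytes.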
@@ -14547,6 +14551,13 @@ SELECT set_config('log_statement_stats', 'off', false); text, integerConvert transaction log location string to file name and decimal byte offset within file + + + pg_xlog_location_diff(location text, location text) + + numeric + Calculate the difference between two transaction log locations +
@@ -14639,6 +14650,13 @@ postgres=# SELECT * FROM pg_xlogfile_name_offset(pg_stop_backup()); needs to be archived.
+ + pg_xlog_location_diff calculates the difference in bytes + between two transaction log locations. It can be used with + pg_stat_replication or some functions shown in + to get the replication lag. + + For details about proper usage of these functions, see . diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c index 2e10d4d15f742..08b5724b97e7b 100644 --- a/src/backend/access/transam/xlogfuncs.c +++ b/src/backend/access/transam/xlogfuncs.c @@ -26,6 +26,7 @@ #include "replication/walreceiver.h" #include "storage/smgr.h" #include "utils/builtins.h" +#include "utils/numeric.h" #include "utils/guc.h" #include "utils/timestamp.h" @@ -465,3 +466,92 @@ pg_is_in_recovery(PG_FUNCTION_ARGS) { PG_RETURN_BOOL(RecoveryInProgress()); } + +/* + * Validate the text form of a transaction log location. + * (Just using sscanf() input allows incorrect values such as + * negatives, so we have to be a bit more careful about that). + */ +static void +validate_xlog_location(char *str) +{ +#define MAXLSNCOMPONENT 8 + + int len1, + len2; + + len1 = strspn(str, "0123456789abcdefABCDEF"); + if (len1 < 1 || len1 > MAXLSNCOMPONENT || str[len1] != '/') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for transaction log location: \"%s\"", str))); + + len2 = strspn(str + len1 + 1, "0123456789abcdefABCDEF"); + if (len2 < 1 || len2 > MAXLSNCOMPONENT || str[len1 + 1 + len2] != '\0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for transaction log location: \"%s\"", str))); +} + +/* + * Compute the difference in bytes between two WAL locations. + */ +Datum +pg_xlog_location_diff(PG_FUNCTION_ARGS) +{ + text *location1 = PG_GETARG_TEXT_P(0); + text *location2 = PG_GETARG_TEXT_P(1); + char *str1, + *str2; + XLogRecPtr loc1, + loc2; + Numeric result; + + /* + * Read and parse input + */ + str1 = text_to_cstring(location1); + str2 = text_to_cstring(location2); + + validate_xlog_location(str1); + validate_xlog_location(str2); + + if (sscanf(str1, "%X/%X", &loc1.xlogid, &loc1.xrecoff) != 2) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not parse transaction log location \"%s\"", str1))); + if (sscanf(str2, "%X/%X", &loc2.xlogid, &loc2.xrecoff) != 2) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not parse transaction log location \"%s\"", str2))); + + /* + * Sanity check + */ + if (loc1.xrecoff > XLogFileSize) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("xrecoff \"%X\" is out of valid range, 0..%X", loc1.xrecoff, XLogFileSize))); + if (loc2.xrecoff > XLogFileSize) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("xrecoff \"%X\" is out of valid range, 0..%X", loc2.xrecoff, XLogFileSize))); + + /* + * result = XLogFileSize * (xlogid1 - xlogid2) + xrecoff1 - xrecoff2 + */ + result = DatumGetNumeric(DirectFunctionCall2(numeric_sub, + DirectFunctionCall1(int8_numeric, Int64GetDatum((int64) loc1.xlogid)), + DirectFunctionCall1(int8_numeric, Int64GetDatum((int64) loc2.xlogid)))); + result = DatumGetNumeric(DirectFunctionCall2(numeric_mul, + DirectFunctionCall1(int8_numeric, Int64GetDatum((int64) XLogFileSize)), + NumericGetDatum(result))); + result = DatumGetNumeric(DirectFunctionCall2(numeric_add, + NumericGetDatum(result), + DirectFunctionCall1(int8_numeric, Int64GetDatum((int64) loc1.xrecoff)))); + result = DatumGetNumeric(DirectFunctionCall2(numeric_sub, + NumericGetDatum(result), + 
DirectFunctionCall1(int8_numeric, Int64GetDatum((int64) loc2.xrecoff)))); + + PG_RETURN_NUMERIC(result); +} diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index b81c15688182b..c079a9aa8f555 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -281,5 +281,6 @@ extern Datum pg_is_in_recovery(PG_FUNCTION_ARGS); extern Datum pg_xlog_replay_pause(PG_FUNCTION_ARGS); extern Datum pg_xlog_replay_resume(PG_FUNCTION_ARGS); extern Datum pg_is_xlog_replay_paused(PG_FUNCTION_ARGS); +extern Datum pg_xlog_location_diff(PG_FUNCTION_ARGS); #endif /* XLOG_INTERNAL_H */ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 223f157310b6d..993e3872c7b76 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201203031 +#define CATALOG_VERSION_NO 201203041 #endif diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 074051bdcc6a9..2db848903c4a2 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -2936,6 +2936,9 @@ DESCR("xlog filename and byte offset, given an xlog location"); DATA(insert OID = 2851 ( pg_xlogfile_name PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 25 "25" _null_ _null_ _null_ _null_ pg_xlogfile_name _null_ _null_ _null_ )); DESCR("xlog filename, given an xlog location"); +DATA(insert OID = 3165 ( pg_xlog_location_diff PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 1700 "25 25" _null_ _null_ _null_ _null_ pg_xlog_location_diff _null_ _null_ _null_ )); +DESCR("difference in bytes, given two xlog locations"); + DATA(insert OID = 3809 ( pg_export_snapshot PGNSP PGUID 12 1 0 0 0 f f f f t f v 0 0 25 "" _null_ _null_ _null_ _null_ pg_export_snapshot _null_ _null_ _null_ )); DESCR("export a snapshot"); From 141b89826ddb82b3afa7cf5e048d28a3d8e1c45c Mon Sep 17 00:00:00 2001 From: Magnus Hagander Date: Sun, 4 Mar 2012 12:24:09 +0100 Subject: [PATCH 080/129] More carefully validate xlog location string inputs Now that we have validate_xlog_location, call it from the previously existing functions taking xlog locations as a string input. Suggested by Fujii Masao --- src/backend/access/transam/xlogfuncs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c index 08b5724b97e7b..f3c8a09c2aa48 100644 --- a/src/backend/access/transam/xlogfuncs.c +++ b/src/backend/access/transam/xlogfuncs.c @@ -30,6 +30,10 @@ #include "utils/guc.h" #include "utils/timestamp.h" + +static void validate_xlog_location(char *str); + + /* * pg_start_backup: set up for taking an on-line backup dump * @@ -289,6 +293,8 @@ pg_xlogfile_name_offset(PG_FUNCTION_ARGS) */ locationstr = text_to_cstring(location); + validate_xlog_location(locationstr); + if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), @@ -361,6 +367,8 @@ pg_xlogfile_name(PG_FUNCTION_ARGS) locationstr = text_to_cstring(location); + validate_xlog_location(locationstr); + if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), From 4fb694aebc524f2085152d8c98a85e01ef6136f4 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 4 Mar 2012 15:40:16 -0500 Subject: [PATCH 081/129] Improve histogram-filling loop in new compute_array_stats() code.
Do "frac" arithmetic in int64 to prevent overflow with large statistics targets, and improve the comments so people have some chance of understanding how it works. Alexander Korotkov and Tom Lane --- src/backend/utils/adt/array_typanalyze.c | 63 +++++++++++++++++------- 1 file changed, 44 insertions(+), 19 deletions(-) diff --git a/src/backend/utils/adt/array_typanalyze.c b/src/backend/utils/adt/array_typanalyze.c index 941e2adb03847..ba9873905e2ec 100644 --- a/src/backend/utils/adt/array_typanalyze.c +++ b/src/backend/utils/adt/array_typanalyze.c @@ -579,9 +579,9 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, { int num_hist = stats->attr->attstattarget; DECountItem **sorted_count_items; - int count_item_index; + int j; int delta; - int frac; + int64 frac; float4 *hist; /* num_hist must be at least 2 for the loop below to work */ @@ -594,45 +594,70 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, sorted_count_items = (DECountItem **) palloc(sizeof(DECountItem *) * count_items_count); hash_seq_init(&scan_status, count_tab); - count_item_index = 0; + j = 0; while ((count_item = (DECountItem *) hash_seq_search(&scan_status)) != NULL) { - sorted_count_items[count_item_index++] = count_item; + sorted_count_items[j++] = count_item; } qsort(sorted_count_items, count_items_count, sizeof(DECountItem *), countitem_compare_count); /* - * Fill stanumbers with the histogram, followed by the average - * count. This array must be stored in anl_context. + * Prepare to fill stanumbers with the histogram, followed by the + * average count. This array must be stored in anl_context. */ hist = (float4 *) MemoryContextAlloc(stats->anl_context, sizeof(float4) * (num_hist + 1)); hist[num_hist] = (double) element_no / (double) nonnull_cnt; - /* - * Construct the histogram. + /*---------- + * Construct the histogram of distinct-element counts (DECs). + * + * The object of this loop is to copy the min and max DECs to + * hist[0] and hist[num_hist - 1], along with evenly-spaced DECs + * in between (where "evenly-spaced" is with reference to the + * whole input population of arrays). If we had a complete sorted + * array of DECs, one per analyzed row, the i'th hist value would + * come from DECs[i * (analyzed_rows - 1) / (num_hist - 1)] + * (compare the histogram-making loop in compute_scalar_stats()). + * But instead of that we have the sorted_count_items[] array, + * which holds unique DEC values with their frequencies (that is, + * a run-length-compressed version of the full array). So we + * control advancing through sorted_count_items[] with the + * variable "frac", which is defined as (x - y) * (num_hist - 1), + * where x is the index in the notional DECs array corresponding + * to the start of the next sorted_count_items[] element's run, + * and y is the index in DECs from which we should take the next + * histogram value. We have to advance whenever x <= y, that is + * frac <= 0. The x component is the sum of the frequencies seen + * so far (up through the current sorted_count_items[] element), + * and of course y * (num_hist - 1) = i * (analyzed_rows - 1), + * per the subscript calculation above. (The subscript calculation + * implies dropping any fractional part of y; in this formulation + * that's handled by not advancing until frac reaches 1.) * - * XXX this needs work: frac could overflow, and it's not clear - * how or why the code works. Even if it does work, it needs - * documented. 
+ * Even though frac has a bounded range, it could overflow int32 + * when working with very large statistics targets, so we do that + * math in int64. + *---------- */ delta = analyzed_rows - 1; - count_item_index = 0; - frac = sorted_count_items[0]->frequency * (num_hist - 1); + j = 0; /* current index in sorted_count_items */ + /* Initialize frac for sorted_count_items[0]; y is initially 0 */ + frac = (int64) sorted_count_items[0]->frequency * (num_hist - 1); for (i = 0; i < num_hist; i++) { while (frac <= 0) { - count_item_index++; - Assert(count_item_index < count_items_count); - frac += sorted_count_items[count_item_index]->frequency * (num_hist - 1); + /* Advance, and update x component of frac */ + j++; + frac += (int64) sorted_count_items[j]->frequency * (num_hist - 1); } - hist[i] = sorted_count_items[count_item_index]->count; - frac -= delta; + hist[i] = sorted_count_items[j]->count; + frac -= delta; /* update y for upcoming i increment */ } - Assert(count_item_index == count_items_count - 1); + Assert(j == count_items_count - 1); stats->stakind[slot_idx] = STATISTIC_KIND_DECHIST; stats->staop[slot_idx] = extra_data->eq_opr; From e2eed7891008cbf2b7d3868b3d77751b33ed09ad Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 4 Mar 2012 16:03:38 -0500 Subject: [PATCH 082/129] Remove useless "rough estimate" path from mcelem_array_contained_selec. The code in this function that tried to cope with a missing count histogram was quite ineffective for anything except a perfectly flat distribution. Furthermore, since we were already punting for missing MCELEM slot, it's rather useless to sweat over missing DECHIST: there are no cases where ANALYZE will create the first but not the second. So just simplify the code by punting rather than pretending we can do something useful. 
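To make the simplified control flow concrete, a minimal sketch of the resulting early exits, using the names from the diff below (an illustrative fragment, not the complete function):

	/* Without the extra MCELEM stanumbers members, punt to the default */
	if (numbers == NULL || nnumbers != nmcelem + 3)
		return DEFAULT_CONTAIN_SEL;

	/* Likewise punt when the distinct-element count histogram is absent */
	if (hist == NULL || nhist < 3)
		return DEFAULT_CONTAIN_SEL;

Since ANALYZE never produces an MCELEM slot without a DECHIST slot, the second test only fires in the same situations where the first already would, so no useful estimates are lost by punting.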
--- src/backend/utils/adt/array_selfuncs.c | 138 +++++++++++-------------- 1 file changed, 62 insertions(+), 76 deletions(-) diff --git a/src/backend/utils/adt/array_selfuncs.c b/src/backend/utils/adt/array_selfuncs.c index 3916de4bfb61d..bc4ebd2074999 100644 --- a/src/backend/utils/adt/array_selfuncs.c +++ b/src/backend/utils/adt/array_selfuncs.c @@ -242,8 +242,7 @@ scalararraysel_containment(PlannerInfo *root, } /* - * arraycontsel -- restriction selectivity for "arraycolumn @> const", - * "arraycolumn && const" or "arraycolumn <@ const" + * arraycontsel -- restriction selectivity for array @>, &&, <@ operators */ Datum arraycontsel(PG_FUNCTION_ARGS) @@ -323,8 +322,7 @@ arraycontsel(PG_FUNCTION_ARGS) } /* - * arraycontjoinsel -- join selectivity for "arraycolumn @> const", - * "arraycolumn && const" or "arraycolumn <@ const" + * arraycontjoinsel -- join selectivity for array @>, &&, <@ operators */ Datum arraycontjoinsel(PG_FUNCTION_ARGS) @@ -744,6 +742,10 @@ mcelem_array_contained_selec(Datum *mcelem, int nmcelem, if (numbers == NULL || nnumbers != nmcelem + 3) return DEFAULT_CONTAIN_SEL; + /* Can't do much without a count histogram, either */ + if (hist == NULL || nhist < 3) + return DEFAULT_CONTAIN_SEL; + /* * Grab some of the summary statistics that compute_array_stats() stores: * lowest frequency, frequency of null elements, and average distinct @@ -751,11 +753,7 @@ mcelem_array_contained_selec(Datum *mcelem, int nmcelem, */ minfreq = numbers[nmcelem]; nullelem_freq = numbers[nmcelem + 2]; - - if (hist && nhist > 0) - avg_count = hist[nhist - 1]; - else - avg_count = 10.0f; /* default assumption */ + avg_count = hist[nhist - 1]; /* * "rest" will be the sum of the frequencies of all elements not @@ -853,83 +851,71 @@ mcelem_array_contained_selec(Datum *mcelem, int nmcelem, */ mult *= exp(-rest); - /* Check we have nonempty distinct element count histogram */ - if (hist && nhist >= 3) - { - /*---------- - * Using the distinct element count histogram requires - * O(unique_nitems * (nmcelem + unique_nitems)) - * operations. Beyond a certain computational cost threshold, it's - * reasonable to sacrifice accuracy for decreased planning time. - * We limit the number of operations to EFFORT * nmcelem; since - * nmcelem is limited by the column's statistics target, the work - * done is user-controllable. - * - * If the number of operations would be too large, we can reduce it - * without losing all accuracy by reducing unique_nitems and - * considering only the most-common elements of the constant array. - * To make the results exactly match what we would have gotten with - * only those elements to start with, we'd have to remove any - * discarded elements' frequencies from "mult", but since this is only - * an approximation anyway, we don't bother with that. Therefore it's - * sufficient to qsort elem_selec[] and take the largest elements. - * (They will no longer match up with the elements of array_data[], - * but we don't care.) - *---------- - */ + /*---------- + * Using the distinct element count histogram requires + * O(unique_nitems * (nmcelem + unique_nitems)) + * operations. Beyond a certain computational cost threshold, it's + * reasonable to sacrifice accuracy for decreased planning time. We limit + * the number of operations to EFFORT * nmcelem; since nmcelem is limited + * by the column's statistics target, the work done is user-controllable. 
+ * + * If the number of operations would be too large, we can reduce it + * without losing all accuracy by reducing unique_nitems and considering + * only the most-common elements of the constant array. To make the + * results exactly match what we would have gotten with only those + * elements to start with, we'd have to remove any discarded elements' + * frequencies from "mult", but since this is only an approximation + * anyway, we don't bother with that. Therefore it's sufficient to qsort + * elem_selec[] and take the largest elements. (They will no longer match + * up with the elements of array_data[], but we don't care.) + *---------- + */ #define EFFORT 100 - if ((nmcelem + unique_nitems) > 0 && - unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems)) - { - /* - * Use the quadratic formula to solve for largest allowable N; - * we have A = 1, B = nmcelem, C = - EFFORT * nmcelem. - */ - double b = (double) nmcelem; - int n; - - n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2); - - /* Sort, then take just the first n elements */ - qsort(elem_selec, unique_nitems, sizeof(float), - float_compare_desc); - unique_nitems = n; - } - + if ((nmcelem + unique_nitems) > 0 && + unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems)) + { /* - * Calculate probabilities of each distinct element count for both - * mcelems and constant elements. At this point, assume independent - * element occurrence. + * Use the quadratic formula to solve for largest allowable N. We + * have A = 1, B = nmcelem, C = - EFFORT * nmcelem. */ - dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f); - mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest); + double b = (double) nmcelem; + int n; - /* ignore hist[nhist-1], which is the avg not a histogram member */ - hist_part = calc_hist(hist, nhist - 1, unique_nitems); + n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2); - selec = 0.0f; - for (i = 0; i <= unique_nitems; i++) - { - /* - * mult * dist[i] / mcelem_dist[i] gives us probability of qual - * matching from assumption of independent element occurrence with - * the condition that distinct element count = i. - */ - if (mcelem_dist[i] > 0) - selec += hist_part[i] * mult * dist[i] / mcelem_dist[i]; - } - - pfree(dist); - pfree(mcelem_dist); - pfree(hist_part); + /* Sort, then take just the first n elements */ + qsort(elem_selec, unique_nitems, sizeof(float), + float_compare_desc); + unique_nitems = n; } - else + + /* + * Calculate probabilities of each distinct element count for both + * mcelems and constant elements. At this point, assume independent + * element occurrence. + */ + dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f); + mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest); + + /* ignore hist[nhist-1], which is the average not a histogram member */ + hist_part = calc_hist(hist, nhist - 1, unique_nitems); + + selec = 0.0f; + for (i = 0; i <= unique_nitems; i++) { - /* We don't have histogram. Use a rough estimate. */ - selec = mult; + /* + * mult * dist[i] / mcelem_dist[i] gives us probability of qual + * matching from assumption of independent element occurrence with + * the condition that distinct element count = i. + */ + if (mcelem_dist[i] > 0) + selec += hist_part[i] * mult * dist[i] / mcelem_dist[i]; } + pfree(dist); + pfree(mcelem_dist); + pfree(hist_part); pfree(elem_selec); /* Take into account occurrence of NULL element. 
*/ From 80da9e68fdd70b796b3a7de3821589513596c0f7 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 4 Mar 2012 22:50:06 -0500 Subject: [PATCH 083/129] Rewrite GiST support code for rangetypes. This patch installs significantly smarter penalty and picksplit functions for ranges, making GiST indexes for them smaller and faster to search. There is no on-disk format change, so no catversion bump, but you'd need to REINDEX to get the benefits for any existing index. Alexander Korotkov, reviewed by Jeff Davis --- src/backend/utils/adt/rangetypes_gist.c | 1293 +++++++++++++++++++---- 1 file changed, 1102 insertions(+), 191 deletions(-) diff --git a/src/backend/utils/adt/rangetypes_gist.c b/src/backend/utils/adt/rangetypes_gist.c index 4267dc8cb613b..87f71e6812c35 100644 --- a/src/backend/utils/adt/rangetypes_gist.c +++ b/src/backend/utils/adt/rangetypes_gist.c @@ -34,20 +34,117 @@ #define RANGESTRAT_CONTAINS_ELEM 16 #define RANGESTRAT_EQ 18 -/* Copy a RangeType datum (hardwires typbyval and typlen for ranges...) */ -#define rangeCopy(r) \ - ((RangeType *) DatumGetPointer(datumCopy(PointerGetDatum(r), \ - false, -1))) +/* + * Range class properties used to segregate different classes of ranges in + * GiST. Each unique combination of properties is a class. CLS_EMPTY cannot + * be combined with anything else. + */ +#define CLS_NORMAL 0 /* Ordinary finite range (no bits set) */ +#define CLS_LOWER_INF 1 /* Lower bound is infinity */ +#define CLS_UPPER_INF 2 /* Upper bound is infinity */ +#define CLS_CONTAIN_EMPTY 4 /* Contains underlying empty ranges */ +#define CLS_EMPTY 8 /* Special class for empty ranges */ + +#define CLS_COUNT 9 /* # of classes; includes all combinations of + * properties. CLS_EMPTY doesn't combine with + * anything else, so it's only 2^3 + 1. */ + +/* + * Minimum accepted ratio of split for items of the same class. If the items + * are of different classes, we will separate along those lines regardless of + * the ratio. + */ +#define LIMIT_RATIO 0.3 + +/* Constants for fixed penalty values */ +#define INFINITE_BOUND_PENALTY 2.0 +#define CONTAIN_EMPTY_PENALTY 1.0 +#define DEFAULT_SUBTYPE_DIFF_PENALTY 1.0 /* - * Auxiliary structure for picksplit method. + * Per-item data for range_gist_single_sorting_split. */ typedef struct { - int index; /* original index in entryvec->vector[] */ - RangeType *data; /* range value to sort */ - TypeCacheEntry *typcache; /* range type's info */ -} PickSplitSortItem; + int index; + RangeBound bound; +} SingleBoundSortItem; + +/* place on left or right side of split? */ +typedef enum +{ + SPLIT_LEFT = 0, /* makes initialization to SPLIT_LEFT easier */ + SPLIT_RIGHT +} SplitLR; + +/* + * Context for range_gist_consider_split. + */ +typedef struct +{ + TypeCacheEntry *typcache; /* typcache for range type */ + bool has_subtype_diff; /* does it have subtype_diff? */ + int entries_count; /* total number of entries being split */ + + /* Information about currently selected split follows */ + + bool first; /* true if no split was selected yet */ + + RangeBound *left_upper; /* upper bound of left interval */ + RangeBound *right_lower; /* lower bound of right interval */ + + float4 ratio; /* split ratio */ + float4 overlap; /* overlap between left and right predicate */ + int common_left; /* # common entries destined for each side */ + int common_right; +} ConsiderSplitContext; + +/* + * Bounds extracted from a non-empty range, for use in + * range_gist_double_sorting_split. 
+ */ +typedef struct +{ + RangeBound lower; + RangeBound upper; +} NonEmptyRange; + +/* + * Represents information about an entry that can be placed in either group + * without affecting overlap over selected axis ("common entry"). + */ +typedef struct +{ + /* Index of entry in the initial array */ + int index; + /* Delta between closeness of range to each of the two groups */ + double delta; +} CommonEntry; + +/* Helper macros to place an entry in the left or right group during split */ +/* Note direct access to variables v, typcache, left_range, right_range */ +#define PLACE_LEFT(range, off) \ + do { \ + if (v->spl_nleft > 0) \ + left_range = range_super_union(typcache, left_range, range); \ + else \ + left_range = (range); \ + v->spl_left[v->spl_nleft++] = (off); \ + } while(0) + +#define PLACE_RIGHT(range, off) \ + do { \ + if (v->spl_nright > 0) \ + right_range = range_super_union(typcache, right_range, range); \ + else \ + right_range = (range); \ + v->spl_right[v->spl_nright++] = (off); \ + } while(0) + +/* Copy a RangeType datum (hardwires typbyval and typlen for ranges...) */ +#define rangeCopy(r) \ + ((RangeType *) DatumGetPointer(datumCopy(PointerGetDatum(r), \ + false, -1))) static RangeType *range_super_union(TypeCacheEntry *typcache, RangeType * r1, RangeType * r2); @@ -57,7 +154,30 @@ static bool range_gist_consistent_int(FmgrInfo *flinfo, static bool range_gist_consistent_leaf(FmgrInfo *flinfo, StrategyNumber strategy, RangeType *key, Datum query); -static int sort_item_cmp(const void *a, const void *b); +static void range_gist_fallback_split(TypeCacheEntry *typcache, + GistEntryVector *entryvec, + GIST_SPLITVEC *v); +static void range_gist_class_split(TypeCacheEntry *typcache, + GistEntryVector *entryvec, + GIST_SPLITVEC *v, + SplitLR *classes_groups); +static void range_gist_single_sorting_split(TypeCacheEntry *typcache, + GistEntryVector *entryvec, + GIST_SPLITVEC *v, + bool use_upper_bound); +static void range_gist_double_sorting_split(TypeCacheEntry *typcache, + GistEntryVector *entryvec, + GIST_SPLITVEC *v); +static void range_gist_consider_split(ConsiderSplitContext *context, + RangeBound *right_lower, int min_left_count, + RangeBound *left_upper, int max_left_count); +static int get_gist_range_class(RangeType *range); +static int single_bound_cmp(const void *a, const void *b, void *arg); +static int interval_cmp_lower(const void *a, const void *b, void *arg); +static int interval_cmp_upper(const void *a, const void *b, void *arg); +static int common_entry_cmp(const void *i1, const void *i2); +static float8 call_subtype_diff(TypeCacheEntry *typcache, + Datum val1, Datum val2); /* GiST query consistency check */ @@ -122,7 +242,16 @@ range_gist_decompress(PG_FUNCTION_ARGS) PG_RETURN_POINTER(entry); } -/* page split penalty function */ +/* + * GiST page split penalty function. 
+ *
+ * The penalty function has the following goals (in order from most to least
+ * important):
+ * - Keep normal ranges separate
+ * - Avoid broadening the class of the original predicate
+ * - Avoid broadening (as determined by subtype_diff) the original predicate
+ * - Favor adding ranges to narrower original predicates
+ */
 Datum
 range_gist_penalty(PG_FUNCTION_ARGS)
 {
@@ -132,118 +261,253 @@ range_gist_penalty(PG_FUNCTION_ARGS)
 	RangeType  *orig = DatumGetRangeType(origentry->key);
 	RangeType  *new = DatumGetRangeType(newentry->key);
 	TypeCacheEntry *typcache;
-	RangeType  *s_union;
-	FmgrInfo   *subtype_diff;
-	RangeBound	lower1,
-				lower2;
-	RangeBound	upper1,
-				upper2;
-	bool		empty1,
-				empty2;
-	float8		lower_diff,
-				upper_diff;
+	bool		has_subtype_diff;
+	RangeBound	orig_lower,
+				new_lower,
+				orig_upper,
+				new_upper;
+	bool		orig_empty,
+				new_empty;
 
 	if (RangeTypeGetOid(orig) != RangeTypeGetOid(new))
 		elog(ERROR, "range types do not match");
 
 	typcache = range_get_typcache(fcinfo, RangeTypeGetOid(orig));
-	subtype_diff = &typcache->rng_subdiff_finfo;
+	has_subtype_diff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
 
-	/*
-	 * If new is or contains empty, and orig doesn't, apply infinite penalty.
-	 * We really don't want to pollute an empty-free subtree with empties.
-	 */
-	if (RangeIsOrContainsEmpty(new) && !RangeIsOrContainsEmpty(orig))
-	{
-		*penalty = get_float4_infinity();
-		PG_RETURN_POINTER(penalty);
-	}
+	range_deserialize(typcache, orig, &orig_lower, &orig_upper, &orig_empty);
+	range_deserialize(typcache, new, &new_lower, &new_upper, &new_empty);
 
 	/*
-	 * We want to compare the size of "orig" to size of "orig union new".
-	 * The penalty will be the sum of the reduction in the lower bound plus
-	 * the increase in the upper bound.
+	 * Distinct branches for handling distinct classes of ranges.  Note that
+	 * penalty values only need to be commensurate within the same class of
+	 * new range.
 	 */
-	s_union = range_super_union(typcache, orig, new);
-
-	range_deserialize(typcache, orig, &lower1, &upper1, &empty1);
-	range_deserialize(typcache, s_union, &lower2, &upper2, &empty2);
-
-	/* handle cases where orig is empty */
-	if (empty1 && empty2)
+	if (new_empty)
 	{
-		*penalty = 0;
-		PG_RETURN_POINTER(penalty);
+		/* Handle insertion of empty range */
+		if (orig_empty)
+		{
+			/*
+			 * The best case is inserting an empty range into an already
+			 * empty original range: no broadening is needed, and the
+			 * original range is as narrow as possible.
+			 */
+			*penalty = 0.0;
+		}
+		else if (RangeIsOrContainsEmpty(orig))
+		{
+			/*
+			 * The next-best case is inserting an empty range into a range
+			 * that already contains at least one underlying empty range.
+			 * There is still no broadening, but the original range is not
+			 * as narrow as possible.
+			 */
+			*penalty = CONTAIN_EMPTY_PENALTY;
+		}
+		else if (orig_lower.infinite && orig_upper.infinite)
+		{
+			/*
+			 * The original range requires broadening.  (-inf, +inf) is
+			 * already the farthest from a normal range, so it is the least
+			 * damaging to pollute.
+			 */
+			*penalty = 2 * CONTAIN_EMPTY_PENALTY;
+		}
+		else if (orig_lower.infinite || orig_upper.infinite)
+		{
+			/*
+			 * (-inf, x) or (x, +inf) original ranges are closer to normal
+			 * ranges, so it's worse to mix them with empty ranges.
+			 */
+			*penalty = 3 * CONTAIN_EMPTY_PENALTY;
+		}
+		else
+		{
+			/*
+			 * The least preferred case is broadening of a normal range.
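+			 * (Editorial illustration, not in the original commit: the
+			 * empty-insertion penalties therefore rank 0, 1, 2, 3 and 4
+			 * times CONTAIN_EMPTY_PENALTY, steering empty ranges toward
+			 * subtrees that already hold empties.)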
+ */ + *penalty = 4 * CONTAIN_EMPTY_PENALTY; + } } - else if (empty1) + else if (new_lower.infinite && new_upper.infinite) { - /* infinite penalty for pushing non-empty into all-empty subtree */ - *penalty = get_float4_infinity(); - PG_RETURN_POINTER(penalty); - } - - /* if orig isn't empty, s_union can't be either */ - Assert(!empty2); - - /* similarly, if orig's lower bound is infinite, s_union's must be too */ - Assert(lower2.infinite || !lower1.infinite); + /* Handle insertion of (-inf, +inf) range */ + if (orig_lower.infinite && orig_upper.infinite) + { + /* + * Best case is inserting to (-inf, +inf) original range. + */ + *penalty = 0.0; + } + else if (orig_lower.infinite || orig_upper.infinite) + { + /* + * When original range is (-inf, x) or (x, +inf) it requires + * broadening of original range (extension of one bound to + * infinity). + */ + *penalty = INFINITE_BOUND_PENALTY; + } + else + { + /* + * Insertion to normal original range is least preferred. + */ + *penalty = 2 * INFINITE_BOUND_PENALTY; + } - if (lower2.infinite && lower1.infinite) - lower_diff = 0; - else if (lower2.infinite) - lower_diff = get_float8_infinity(); - else if (OidIsValid(subtype_diff->fn_oid)) - { - lower_diff = DatumGetFloat8(FunctionCall2Coll(subtype_diff, - typcache->rng_collation, - lower1.val, - lower2.val)); - /* orig's lower bound must be >= s_union's */ - if (lower_diff < 0) - lower_diff = 0; /* subtype_diff is broken */ + if (RangeIsOrContainsEmpty(orig)) + { + /* + * Original range is narrower when it doesn't contain empty ranges. + * Add additional penalty otherwise. + */ + *penalty += CONTAIN_EMPTY_PENALTY; + } } - else + else if (new_lower.infinite) { - /* only know whether there is a difference or not */ - lower_diff = range_cmp_bounds(typcache, &lower1, &lower2) > 0 ? 1 : 0; + /* Handle insertion of (-inf, x) range */ + if (!orig_empty && orig_lower.infinite) + { + if (orig_upper.infinite) + { + /* + * (-inf, +inf) range won't be extended by insertion of + * (-inf, x) range. It's a less desirable case than insertion + * to (-inf, y) original range without extension, because in + * that case original range is narrower. But we can't express + * that in single float value. + */ + *penalty = 0.0; + } + else + { + if (range_cmp_bounds(typcache, &new_upper, &orig_upper) > 0) + { + /* + * Get extension of original range using subtype_diff. + * Use constant if subtype_diff unavailable. + */ + if (has_subtype_diff) + *penalty = call_subtype_diff(typcache, + new_upper.val, + orig_upper.val); + else + *penalty = DEFAULT_SUBTYPE_DIFF_PENALTY; + } + else + { + /* No extension of original range */ + *penalty = 0.0; + } + } + } + else + { + /* + * If lower bound of original range is not -inf, then extension + * of it is infinity. 
+ */ + *penalty = get_float4_infinity(); + } } - - /* similarly, if orig's upper bound is infinite, s_union's must be too */ - Assert(upper2.infinite || !upper1.infinite); - - if (upper2.infinite && upper1.infinite) - upper_diff = 0; - else if (upper2.infinite) - upper_diff = get_float8_infinity(); - else if (OidIsValid(subtype_diff->fn_oid)) + else if (new_upper.infinite) { - upper_diff = DatumGetFloat8(FunctionCall2Coll(subtype_diff, - typcache->rng_collation, - upper2.val, - upper1.val)); - /* orig's upper bound must be <= s_union's */ - if (upper_diff < 0) - upper_diff = 0; /* subtype_diff is broken */ + /* Handle insertion of (x, +inf) range */ + if (!orig_empty && orig_upper.infinite) + { + if (orig_lower.infinite) + { + /* + * (-inf, +inf) range won't be extended by insertion of + * (x, +inf) range. It's a less desirable case than insertion + * to (y, +inf) original range without extension, because in + * that case original range is narrower. But we can't express + * that in single float value. + */ + *penalty = 0.0; + } + else + { + if (range_cmp_bounds(typcache, &new_lower, &orig_lower) < 0) + { + /* + * Get extension of original range using subtype_diff. + * Use constant if subtype_diff unavailable. + */ + if (has_subtype_diff) + *penalty = call_subtype_diff(typcache, + orig_lower.val, + new_lower.val); + else + *penalty = DEFAULT_SUBTYPE_DIFF_PENALTY; + } + else + { + /* No extension of original range */ + *penalty = 0.0; + } + } + } + else + { + /* + * If upper bound of original range is not +inf, then extension + * of it is infinity. + */ + *penalty = get_float4_infinity(); + } } else { - /* only know whether there is a difference or not */ - upper_diff = range_cmp_bounds(typcache, &upper2, &upper1) > 0 ? 1 : 0; + /* Handle insertion of normal (non-empty, non-infinite) range */ + if (orig_empty || orig_lower.infinite || orig_upper.infinite) + { + /* + * Avoid mixing normal ranges with infinite and empty ranges. + */ + *penalty = get_float4_infinity(); + } + else + { + /* + * Calculate extension of original range by calling subtype_diff. + * Use constant if subtype_diff unavailable. + */ + float8 diff = 0.0; + + if (range_cmp_bounds(typcache, &new_lower, &orig_lower) < 0) + { + if (has_subtype_diff) + diff += call_subtype_diff(typcache, + orig_lower.val, + new_lower.val); + else + diff += DEFAULT_SUBTYPE_DIFF_PENALTY; + } + if (range_cmp_bounds(typcache, &new_upper, &orig_upper) > 0) + { + if (has_subtype_diff) + diff += call_subtype_diff(typcache, + new_upper.val, + orig_upper.val); + else + diff += DEFAULT_SUBTYPE_DIFF_PENALTY; + } + *penalty = diff; + } } - Assert(lower_diff >= 0 && upper_diff >= 0); - - *penalty = (float) (lower_diff + upper_diff); PG_RETURN_POINTER(penalty); } /* * The GiST PickSplit method for ranges * - * Algorithm based on sorting. Incoming array of ranges is sorted using - * sort_item_cmp function. After that first half of ranges goes to the left - * output, and the second half of ranges goes to the right output. + * Primarily, we try to segregate ranges of different classes. If splitting + * ranges of the same class, use the appropriate split method for that class. 
*/ Datum range_gist_picksplit(PG_FUNCTION_ARGS) @@ -253,73 +517,149 @@ range_gist_picksplit(PG_FUNCTION_ARGS) TypeCacheEntry *typcache; OffsetNumber i; RangeType *pred_left; - RangeType *pred_right; - PickSplitSortItem *sortItems; int nbytes; - OffsetNumber split_idx; - OffsetNumber *left; - OffsetNumber *right; OffsetNumber maxoff; + int count_in_classes[CLS_COUNT]; + int j; + int non_empty_classes_count = 0; + int biggest_class = -1; + int biggest_class_count = 0; + int total_count; /* use first item to look up range type's info */ pred_left = DatumGetRangeType(entryvec->vector[FirstOffsetNumber].key); typcache = range_get_typcache(fcinfo, RangeTypeGetOid(pred_left)); - /* allocate result and work arrays */ maxoff = entryvec->n - 1; nbytes = (maxoff + 1) * sizeof(OffsetNumber); v->spl_left = (OffsetNumber *) palloc(nbytes); v->spl_right = (OffsetNumber *) palloc(nbytes); - sortItems = (PickSplitSortItem *) palloc(maxoff * sizeof(PickSplitSortItem)); /* - * Prepare auxiliary array and sort the values. + * Get count distribution of range classes. */ + memset(count_in_classes, 0, sizeof(count_in_classes)); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { - sortItems[i - 1].index = i; - sortItems[i - 1].data = DatumGetRangeType(entryvec->vector[i].key); - sortItems[i - 1].typcache = typcache; - } - qsort(sortItems, maxoff, sizeof(PickSplitSortItem), sort_item_cmp); - - split_idx = maxoff / 2; + RangeType *range = DatumGetRangeType(entryvec->vector[i].key); - left = v->spl_left; - v->spl_nleft = 0; - right = v->spl_right; - v->spl_nright = 0; + count_in_classes[get_gist_range_class(range)]++; + } /* - * First half of items goes to the left output. + * Count non-empty classes and find biggest class. */ - pred_left = sortItems[0].data; - *left++ = sortItems[0].index; - v->spl_nleft++; - for (i = 1; i < split_idx; i++) + total_count = maxoff; + for (j = 0; j < CLS_COUNT; j++) { - pred_left = range_super_union(typcache, pred_left, sortItems[i].data); - *left++ = sortItems[i].index; - v->spl_nleft++; + if (count_in_classes[j] > 0) + { + if (count_in_classes[j] > biggest_class_count) + { + biggest_class_count = count_in_classes[j]; + biggest_class = j; + } + non_empty_classes_count++; + } } - /* - * Second half of items goes to the right output. - */ - pred_right = sortItems[split_idx].data; - *right++ = sortItems[split_idx].index; - v->spl_nright++; - for (i = split_idx + 1; i < maxoff; i++) + Assert(non_empty_classes_count > 0); + + if (non_empty_classes_count == 1) { - pred_right = range_super_union(typcache, pred_right, sortItems[i].data); - *right++ = sortItems[i].index; - v->spl_nright++; + /* One non-empty class, so split inside class */ + if ((biggest_class & ~CLS_CONTAIN_EMPTY) == CLS_NORMAL) + { + /* double sorting split for normal ranges */ + range_gist_double_sorting_split(typcache, entryvec, v); + } + else if ((biggest_class & ~CLS_CONTAIN_EMPTY) == CLS_LOWER_INF) + { + /* upper bound sorting split for (-inf, x) ranges */ + range_gist_single_sorting_split(typcache, entryvec, v, true); + } + else if ((biggest_class & ~CLS_CONTAIN_EMPTY) == CLS_UPPER_INF) + { + /* lower bound sorting split for (x, +inf) ranges */ + range_gist_single_sorting_split(typcache, entryvec, v, false); + } + else + { + /* trivial split for all (-inf, +inf) or all empty ranges */ + range_gist_fallback_split(typcache, entryvec, v); + } } + else + { + /* + * Class based split. + * + * To which side of the split should each class go? Initialize them + * all to go to the left side. 
+ */ + SplitLR classes_groups[CLS_COUNT]; - *left = *right = FirstOffsetNumber; /* sentinel value, see dosplit() */ + memset(classes_groups, 0, sizeof(classes_groups)); - v->spl_ldatum = RangeTypeGetDatum(pred_left); - v->spl_rdatum = RangeTypeGetDatum(pred_right); + if (count_in_classes[CLS_NORMAL] > 0) + { + /* separate normal ranges if any */ + classes_groups[CLS_NORMAL] = SPLIT_RIGHT; + } + else + { + /*---------- + * Try to split classes in one of two ways: + * 1) containing infinities - not containing infinities + * 2) containing empty - not containing empty + * + * Select the way which balances the ranges between left and right + * the best. If split in these ways is not possible, there are at + * most 3 classes, so just separate biggest class. + *---------- + */ + int infCount, nonInfCount; + int emptyCount, nonEmptyCount; + + nonInfCount = + count_in_classes[CLS_NORMAL] + + count_in_classes[CLS_CONTAIN_EMPTY] + + count_in_classes[CLS_EMPTY]; + infCount = total_count - nonInfCount; + + nonEmptyCount = + count_in_classes[CLS_NORMAL] + + count_in_classes[CLS_LOWER_INF] + + count_in_classes[CLS_UPPER_INF] + + count_in_classes[CLS_LOWER_INF | CLS_UPPER_INF]; + emptyCount = total_count - nonEmptyCount; + + if (infCount > 0 && nonInfCount > 0 && + (Abs(infCount - nonInfCount) <= + Abs(emptyCount - nonEmptyCount))) + { + classes_groups[CLS_NORMAL] = SPLIT_RIGHT; + classes_groups[CLS_CONTAIN_EMPTY] = SPLIT_RIGHT; + classes_groups[CLS_EMPTY] = SPLIT_RIGHT; + } + else if (emptyCount > 0 && nonEmptyCount > 0) + { + classes_groups[CLS_NORMAL] = SPLIT_RIGHT; + classes_groups[CLS_LOWER_INF] = SPLIT_RIGHT; + classes_groups[CLS_UPPER_INF] = SPLIT_RIGHT; + classes_groups[CLS_LOWER_INF | CLS_UPPER_INF] = SPLIT_RIGHT; + } + else + { + /* + * Either total_count == emptyCount or total_count == infCount. + */ + classes_groups[biggest_class] = SPLIT_RIGHT; + } + } + + range_gist_class_split(typcache, entryvec, v, classes_groups); + } PG_RETURN_POINTER(v); } @@ -611,78 +951,649 @@ range_gist_consistent_leaf(FmgrInfo *flinfo, StrategyNumber strategy, } /* - * Compare function for PickSplitSortItem. This is actually the - * interesting part of the picksplit algorithm. + * Trivial split: half of entries will be placed on one page + * and the other half on the other page. + */ +static void +range_gist_fallback_split(TypeCacheEntry *typcache, + GistEntryVector *entryvec, + GIST_SPLITVEC *v) +{ + RangeType *left_range = NULL; + RangeType *right_range = NULL; + OffsetNumber i, maxoff, split_idx; + + maxoff = entryvec->n - 1; + /* Split entries before this to left page, after to right: */ + split_idx = (maxoff - FirstOffsetNumber) / 2 + FirstOffsetNumber; + + v->spl_nleft = 0; + v->spl_nright = 0; + for (i = FirstOffsetNumber; i <= maxoff; i++) + { + RangeType *range = DatumGetRangeType(entryvec->vector[i].key); + + if (i < split_idx) + PLACE_LEFT(range, i); + else + PLACE_RIGHT(range, i); + } + + v->spl_ldatum = RangeTypeGetDatum(left_range); + v->spl_rdatum = RangeTypeGetDatum(right_range); +} + +/* + * Split based on classes of ranges. * - * We want to separate out empty ranges, bounded ranges, and unbounded - * ranges. We assume that "contains" and "overlaps" are the most - * important queries, so empty ranges will rarely match and unbounded - * ranges frequently will. Bounded ranges should be in the middle. + * See get_gist_range_class for class definitions. + * classes_groups is an array of length CLS_COUNT indicating the side of the + * split to which each class should go. 
+ */ +static void +range_gist_class_split(TypeCacheEntry *typcache, + GistEntryVector *entryvec, + GIST_SPLITVEC *v, + SplitLR *classes_groups) +{ + RangeType *left_range = NULL; + RangeType *right_range = NULL; + OffsetNumber i, maxoff; + + maxoff = entryvec->n - 1; + + v->spl_nleft = 0; + v->spl_nright = 0; + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + RangeType *range = DatumGetRangeType(entryvec->vector[i].key); + int class; + + /* Get class of range */ + class = get_gist_range_class(range); + + /* Place range to appropriate page */ + if (classes_groups[class] == SPLIT_LEFT) + PLACE_LEFT(range, i); + else + { + Assert(classes_groups[class] == SPLIT_RIGHT); + PLACE_RIGHT(range, i); + } + } + + v->spl_ldatum = RangeTypeGetDatum(left_range); + v->spl_rdatum = RangeTypeGetDatum(right_range); +} + +/* + * Sorting based split. First half of entries according to the sort will be + * placed to one page, and second half of entries will be placed to other + * page. use_upper_bound parameter indicates whether to use upper or lower + * bound for sorting. + */ +static void +range_gist_single_sorting_split(TypeCacheEntry *typcache, + GistEntryVector *entryvec, + GIST_SPLITVEC *v, + bool use_upper_bound) +{ + SingleBoundSortItem *sortItems; + RangeType *left_range = NULL; + RangeType *right_range = NULL; + OffsetNumber i, maxoff, split_idx; + + maxoff = entryvec->n - 1; + + sortItems = (SingleBoundSortItem *) + palloc(maxoff * sizeof(SingleBoundSortItem)); + + /* + * Prepare auxiliary array and sort the values. + */ + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + RangeType *range = DatumGetRangeType(entryvec->vector[i].key); + RangeBound bound2; + bool empty; + + sortItems[i - 1].index = i; + /* Put appropriate bound into array */ + if (use_upper_bound) + range_deserialize(typcache, range, &bound2, + &sortItems[i - 1].bound, &empty); + else + range_deserialize(typcache, range, &sortItems[i - 1].bound, + &bound2, &empty); + Assert(!empty); + } + + qsort_arg(sortItems, maxoff, sizeof(SingleBoundSortItem), + single_bound_cmp, typcache); + + split_idx = maxoff / 2; + + v->spl_nleft = 0; + v->spl_nright = 0; + + for (i = 0; i < maxoff; i++) + { + int idx = sortItems[i].index; + RangeType *range = DatumGetRangeType(entryvec->vector[idx].key); + + if (i < split_idx) + PLACE_LEFT(range, idx); + else + PLACE_RIGHT(range, idx); + } + + v->spl_ldatum = RangeTypeGetDatum(left_range); + v->spl_rdatum = RangeTypeGetDatum(right_range); +} + +/* + * Double sorting split algorithm. + * + * The algorithm considers dividing ranges into two groups. The first (left) + * group contains general left bound. The second (right) group contains + * general right bound. The challenge is to find upper bound of left group + * and lower bound of right group so that overlap of groups is minimal and + * ratio of distribution is acceptable. Algorithm finds for each lower bound of + * right group minimal upper bound of left group, and for each upper bound of + * left group maximal lower bound of right group. For each found pair + * range_gist_consider_split considers replacement of currently selected + * split with the new one. + * + * After that, all the entries are divided into three groups: + * 1) Entries which should be placed to the left group + * 2) Entries which should be placed to the right group + * 3) "Common entries" which can be placed to either group without affecting + * amount of overlap. 
* - * Empty ranges we push all the way to the left, then bounded ranges - * (sorted on lower bound, then upper), then ranges with no lower - * bound, then ranges with no upper bound; and finally, ranges with no - * upper or lower bound all the way to the right. + * The common ranges are distributed by difference of distance from lower + * bound of common range to lower bound of right group and distance from upper + * bound of common range to upper bound of left group. + * + * For details see: + * "A new double sorting-based node splitting algorithm for R-tree", + * A. Korotkov + * http://syrcose.ispras.ru/2011/files/SYRCoSE2011_Proceedings.pdf#page=36 */ -static int -sort_item_cmp(const void *a, const void *b) +static void +range_gist_double_sorting_split(TypeCacheEntry *typcache, + GistEntryVector *entryvec, + GIST_SPLITVEC *v) { - PickSplitSortItem *i1 = (PickSplitSortItem *) a; - PickSplitSortItem *i2 = (PickSplitSortItem *) b; - RangeType *r1 = i1->data; - RangeType *r2 = i2->data; - TypeCacheEntry *typcache = i1->typcache; - RangeBound lower1, - lower2; - RangeBound upper1, - upper2; - bool empty1, - empty2; - int cmp; + ConsiderSplitContext context; + OffsetNumber i, maxoff; + RangeType *range, + *left_range = NULL, + *right_range = NULL; + int common_entries_count; + NonEmptyRange *by_lower, + *by_upper; + CommonEntry *common_entries; + int nentries, i1, i2; + RangeBound *right_lower, *left_upper; + + memset(&context, 0, sizeof(ConsiderSplitContext)); + context.typcache = typcache; + context.has_subtype_diff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid); - range_deserialize(typcache, r1, &lower1, &upper1, &empty1); - range_deserialize(typcache, r2, &lower2, &upper2, &empty2); + maxoff = entryvec->n - 1; + nentries = context.entries_count = maxoff - FirstOffsetNumber + 1; + context.first = true; + + /* Allocate arrays for sorted range bounds */ + by_lower = (NonEmptyRange *) palloc(nentries * sizeof(NonEmptyRange)); + by_upper = (NonEmptyRange *) palloc(nentries * sizeof(NonEmptyRange)); + + /* Fill arrays of bounds */ + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + RangeType *range = DatumGetRangeType(entryvec->vector[i].key); + bool empty; + + range_deserialize(typcache, range, + &by_lower[i - FirstOffsetNumber].lower, + &by_lower[i - FirstOffsetNumber].upper, + &empty); + Assert(!empty); + } + + /* + * Make two arrays of range bounds: one sorted by lower bound and another + * sorted by upper bound. + */ + memcpy(by_upper, by_lower, nentries * sizeof(NonEmptyRange)); + qsort_arg(by_lower, nentries, sizeof(NonEmptyRange), + interval_cmp_lower, typcache); + qsort_arg(by_upper, nentries, sizeof(NonEmptyRange), + interval_cmp_upper, typcache); + + /*---------- + * The goal is to form a left and right range, so that every entry + * range is contained by either left or right interval (or both). + * + * For example, with the ranges (0,1), (1,3), (2,3), (2,4): + * + * 0 1 2 3 4 + * +-+ + * +---+ + * +-+ + * +---+ + * + * The left and right ranges are of the form (0,a) and (b,4). + * We first consider splits where b is the lower bound of an entry. + * We iterate through all entries, and for each b, calculate the + * smallest possible a. Then we consider splits where a is the + * upper bound of an entry, and for each a, calculate the greatest + * possible b. 
+ * + * In the above example, the first loop would consider splits: + * b=0: (0,1)-(0,4) + * b=1: (0,1)-(1,4) + * b=2: (0,3)-(2,4) + * + * And the second loop: + * a=1: (0,1)-(1,4) + * a=3: (0,3)-(2,4) + * a=4: (0,4)-(2,4) + *---------- + */ + + /* + * Iterate over lower bound of right group, finding smallest possible + * upper bound of left group. + */ + i1 = 0; + i2 = 0; + right_lower = &by_lower[i1].lower; + left_upper = &by_upper[i2].lower; + while (true) + { + /* + * Find next lower bound of right group. + */ + while (i1 < nentries && + range_cmp_bounds(typcache, right_lower, + &by_lower[i1].lower) == 0) + { + if (range_cmp_bounds(typcache, &by_lower[i1].upper, + left_upper) > 0) + left_upper = &by_lower[i1].upper; + i1++; + } + if (i1 >= nentries) + break; + right_lower = &by_lower[i1].lower; - if (empty1 || empty2) + /* + * Find count of ranges which anyway should be placed to the + * left group. + */ + while (i2 < nentries && + range_cmp_bounds(typcache, &by_upper[i2].upper, + left_upper) <= 0) + i2++; + + /* + * Consider found split to see if it's better than what we had. + */ + range_gist_consider_split(&context, right_lower, i1, left_upper, i2); + } + + /* + * Iterate over upper bound of left group finding greatest possible + * lower bound of right group. + */ + i1 = nentries - 1; + i2 = nentries - 1; + right_lower = &by_lower[i1].upper; + left_upper = &by_upper[i2].upper; + while (true) + { + /* + * Find next upper bound of left group. + */ + while (i2 >= 0 && + range_cmp_bounds(typcache, left_upper, + &by_upper[i2].upper) == 0) + { + if (range_cmp_bounds(typcache, &by_upper[i2].lower, + right_lower) < 0) + right_lower = &by_upper[i2].lower; + i2--; + } + if (i2 < 0) + break; + left_upper = &by_upper[i2].upper; + + /* + * Find count of intervals which anyway should be placed to the + * right group. + */ + while (i1 >= 0 && + range_cmp_bounds(typcache, &by_lower[i1].lower, + right_lower) >= 0) + i1--; + + /* + * Consider found split to see if it's better than what we had. + */ + range_gist_consider_split(&context, right_lower, i1 + 1, + left_upper, i2 + 1); + } + + /* + * If we failed to find any acceptable splits, use trivial split. + */ + if (context.first) + { + range_gist_fallback_split(typcache, entryvec, v); + return; + } + + /* + * Ok, we have now selected bounds of the groups. Now we have to distribute + * entries themselves. At first we distribute entries which can be placed + * unambiguously and collect "common entries" to array. + */ + + /* Allocate vectors for results */ + v->spl_left = (OffsetNumber *) palloc(nentries * sizeof(OffsetNumber)); + v->spl_right = (OffsetNumber *) palloc(nentries * sizeof(OffsetNumber)); + v->spl_nleft = 0; + v->spl_nright = 0; + + /* + * Allocate an array for "common entries" - entries which can be placed to + * either group without affecting overlap along selected axis. + */ + common_entries_count = 0; + common_entries = (CommonEntry *) palloc(nentries * sizeof(CommonEntry)); + + /* + * Distribute entries which can be distributed unambiguously, and collect + * common entries. + */ + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { - if (empty1 && empty2) - return 0; - else if (empty1) - return -1; - else if (empty2) - return 1; + RangeBound lower, + upper; + bool empty; + + /* + * Get upper and lower bounds along selected axis. 
+ */ + range = DatumGetRangeType(entryvec->vector[i].key); + + range_deserialize(typcache, range, &lower, &upper, &empty); + + if (range_cmp_bounds(typcache, &upper, context.left_upper) <= 0) + { + /* Fits in the left group */ + if (range_cmp_bounds(typcache, &lower, context.right_lower) >= 0) + { + /* Fits also in the right group, so "common entry" */ + common_entries[common_entries_count].index = i; + if (context.has_subtype_diff) + { + /* + * delta = (lower - context.right_lower) - + * (context.left_upper - upper) + */ + common_entries[common_entries_count].delta = + call_subtype_diff(typcache, + lower.val, + context.right_lower->val) - + call_subtype_diff(typcache, + context.left_upper->val, + upper.val); + } + else + { + /* Without subtype_diff, take all deltas as zero */ + common_entries[common_entries_count].delta = 0; + } + common_entries_count++; + } + else + { + /* Doesn't fit to the right group, so join to the left group */ + PLACE_LEFT(range, i); + } + } else - Assert(false); + { + /* + * Each entry should fit on either left or right group. Since this + * entry didn't fit in the left group, it better fit in the right + * group. + */ + Assert(range_cmp_bounds(typcache, &lower, + context.right_lower) >= 0); + PLACE_RIGHT(range, i); + } + } + + /* + * Distribute "common entries", if any. + */ + if (common_entries_count > 0) + { + /* + * Sort "common entries" by calculated deltas in order to distribute + * the most ambiguous entries first. + */ + qsort(common_entries, common_entries_count, sizeof(CommonEntry), + common_entry_cmp); + + /* + * Distribute "common entries" between groups according to sorting. + */ + for (i = 0; i < common_entries_count; i++) + { + int idx = common_entries[i].index; + + range = DatumGetRangeType(entryvec->vector[idx].key); + + /* + * Check if we have to place this entry in either group to achieve + * LIMIT_RATIO. + */ + if (i < context.common_left) + PLACE_LEFT(range, idx); + else + PLACE_RIGHT(range, idx); + } } + v->spl_ldatum = PointerGetDatum(left_range); + v->spl_rdatum = PointerGetDatum(right_range); +} + +/* + * Consider replacement of currently selected split with a better one + * during range_gist_double_sorting_split. + */ +static void +range_gist_consider_split(ConsiderSplitContext *context, + RangeBound *right_lower, int min_left_count, + RangeBound *left_upper, int max_left_count) +{ + int left_count, + right_count; + float4 ratio, + overlap; + + /* + * Calculate entries distribution ratio assuming most uniform distribution + * of common entries. + */ + if (min_left_count >= (context->entries_count + 1) / 2) + left_count = min_left_count; + else if (max_left_count <= context->entries_count / 2) + left_count = max_left_count; + else + left_count = context->entries_count / 2; + right_count = context->entries_count - left_count; + /* - * If both lower or both upper bounds are infinite, we sort by ascending - * range size. That means that if both upper bounds are infinite, we sort - * by the lower bound _descending_. That creates a slightly odd total - * order, but keeps the pages with very unselective predicates grouped - * more closely together on the right. + * Ratio of split: quotient between size of smaller group and total + * entries count. This is necessarily 0.5 or less; if it's less than + * LIMIT_RATIO then we will never accept the new split. 
+	 */
+	ratio = ((float4) Min(left_count, right_count)) /
+		((float4) context->entries_count);
+
+	if (ratio > LIMIT_RATIO)
+	{
+		bool		selectthis = false;
+
+		/*
+		 * The ratio is acceptable, so compare the current split with the
+		 * previously selected one.  We look for minimal overlap (allowing
+		 * negative values), then for maximal ratio as the secondary
+		 * criterion.  If subtype_diff is available, it's used as the
+		 * overlap measure; without subtype_diff we use the number of
+		 * "common entries" instead.
+		 */
+		if (context->has_subtype_diff)
+			overlap = call_subtype_diff(context->typcache,
+										left_upper->val,
+										right_lower->val);
+		else
+			overlap = max_left_count - min_left_count;
+
+		/* If there is no previous selection, select this split */
+		if (context->first)
+			selectthis = true;
+		else
+		{
+			/*
+			 * Choose the new split if it has a smaller overlap, or the same
+			 * overlap but a better ratio.
+			 */
+			if (overlap < context->overlap ||
+				(overlap == context->overlap && ratio > context->ratio))
+				selectthis = true;
+		}
+
+		if (selectthis)
+		{
+			/* save information about selected split */
+			context->first = false;
+			context->ratio = ratio;
+			context->overlap = overlap;
+			context->right_lower = right_lower;
+			context->left_upper = left_upper;
+			context->common_left = max_left_count - left_count;
+			context->common_right = left_count - min_left_count;
+		}
+	}
+}
+
+/*
+ * Find the class number for a range.
+ *
+ * The class number is a valid combination of the properties of the range.
+ * Note: the highest possible number is 8, because CLS_EMPTY can't be
+ * combined with anything else.
+ */
+static int
+get_gist_range_class(RangeType *range)
+{
+	int			classNumber;
+	char		flags;
+
+	flags = range_get_flags(range);
+	if (flags & RANGE_EMPTY)
+	{
+		classNumber = CLS_EMPTY;
+	}
+	else
+	{
+		classNumber = 0;
+		if (flags & RANGE_LB_INF)
+			classNumber |= CLS_LOWER_INF;
+		if (flags & RANGE_UB_INF)
+			classNumber |= CLS_UPPER_INF;
+		if (flags & RANGE_CONTAIN_EMPTY)
+			classNumber |= CLS_CONTAIN_EMPTY;
+	}
+	return classNumber;
+}
+
+/*
+ * Comparison function for range_gist_single_sorting_split.
+ */
+static int
+single_bound_cmp(const void *a, const void *b, void *arg)
+{
+	SingleBoundSortItem *i1 = (SingleBoundSortItem *) a;
+	SingleBoundSortItem *i2 = (SingleBoundSortItem *) b;
+	TypeCacheEntry *typcache = (TypeCacheEntry *) arg;
+
+	return range_cmp_bounds(typcache, &i1->bound, &i2->bound);
+}
+
+/*
+ * Compare NonEmptyRanges by lower bound.
+ */
+static int
+interval_cmp_lower(const void *a, const void *b, void *arg)
+{
+	NonEmptyRange *i1 = (NonEmptyRange *) a;
+	NonEmptyRange *i2 = (NonEmptyRange *) b;
+	TypeCacheEntry *typcache = (TypeCacheEntry *) arg;
+
+	return range_cmp_bounds(typcache, &i1->lower, &i2->lower);
+}
+
+/*
+ * Compare NonEmptyRanges by upper bound.
+ */ +static int +interval_cmp_upper(const void *a, const void *b, void *arg) +{ + NonEmptyRange *i1 = (NonEmptyRange *) a; + NonEmptyRange *i2 = (NonEmptyRange *) b; + TypeCacheEntry *typcache = (TypeCacheEntry *) arg; + + return range_cmp_bounds(typcache, &i1->upper, &i2->upper); +} - if ((cmp = range_cmp_bounds(typcache, &lower1, &lower2)) != 0) - return cmp; +/* + * Compare CommonEntrys by their deltas. + */ +static int +common_entry_cmp(const void *i1, const void *i2) +{ + double delta1 = ((CommonEntry *) i1)->delta; + double delta2 = ((CommonEntry *) i2)->delta; + + if (delta1 < delta2) + return -1; + else if (delta1 > delta2) + return 1; + else + return 0; +} - return range_cmp_bounds(typcache, &upper1, &upper2); +/* + * Convenience function to invoke type-specific subtype_diff function. + * Caller must have already checked that there is one for the range type. + */ +static float8 +call_subtype_diff(TypeCacheEntry *typcache, Datum val1, Datum val2) +{ + float8 value; + + value = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo, + typcache->rng_collation, + val1, val2)); + /* Cope with buggy subtype_diff function by returning zero */ + if (value >= 0.0) + return value; + return 0.0; } From cecdf6d4596976bb378ca194bba55a242b883d2d Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Mon, 5 Mar 2012 20:19:20 +0200 Subject: [PATCH 084/129] Add isolation test to check-world and installcheck-world --- GNUmakefile.in | 2 +- src/test/Makefile | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/GNUmakefile.in b/GNUmakefile.in index 50fae4128aa32..5ebdb8b202c71 100644 --- a/GNUmakefile.in +++ b/GNUmakefile.in @@ -64,7 +64,7 @@ distclean maintainer-clean: check: all check installcheck installcheck-parallel: - $(MAKE) -C src/test $@ + $(MAKE) -C src/test/regress $@ $(call recurse,check-world,src/test src/pl src/interfaces/ecpg contrib,check) diff --git a/src/test/Makefile b/src/test/Makefile index e8cf7041b4a6c..0fd7eabf08f19 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -1,15 +1,17 @@ #------------------------------------------------------------------------- # -# Makefile.inc-- -# Makefile for test suites +# Makefile for src/test # # Copyright (c) 1994, Regents of the University of California # -# -# IDENTIFICATION -# src/test/Makefile +# src/test/Makefile # #------------------------------------------------------------------------- -.DEFAULT: - $(MAKE) -C regress $@ +subdir = src/test +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global + +SUBDIRS = regress isolation + +$(recurse) From 3f47e145f1869f147a807e5a2cb80d21a13e10ae Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 5 Mar 2012 14:08:52 -0500 Subject: [PATCH 085/129] Improve documentation around logging_collector and use of stderr. In backup.sgml, point out that you need to be using the logging collector if you want to log messages from a failing archive_command script. (This is an oversimplification, in that it will work without the collector as long as you're not sending postmaster stderr to /dev/null; but it seems like a good idea to encourage use of the collector to avoid problems with multiple processes concurrently scribbling on one file.) In config.sgml, do some wordsmithing of logging_collector discussion. 
Per bug #6518 from Janning Vygen --- doc/src/sgml/backup.sgml | 13 ++++++++++--- doc/src/sgml/config.sgml | 33 ++++++++++++++++++++++++--------- 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/doc/src/sgml/backup.sgml b/doc/src/sgml/backup.sgml index 843dc3de9fcfb..4227b666f4274 100644 --- a/doc/src/sgml/backup.sgml +++ b/doc/src/sgml/backup.sgml @@ -1279,9 +1279,6 @@ archive_command = 'local_backup_script.sh "%p" "%f"' This allows all complexity to be managed within the script, which can be written in a popular scripting language such as bash or perl. - Any messages written to stderr from the script will appear - in the database server log, allowing complex configurations to be - diagnosed easily if they fail. @@ -1310,6 +1307,16 @@ archive_command = 'local_backup_script.sh "%p" "%f"' + + + + When using an archive_command script, it's desirable + to enable . + Any messages written to stderr from the script will then + appear in the database server log, allowing complex configurations to + be diagnosed easily if they fail. + + diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 6e1378a9d6dc8..3e178759a29e1 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -3123,7 +3123,7 @@ SELECT * FROM parent WHERE key = 2400; value (CSV) format, which is convenient for loading logs into programs. See for details. - logging_collector must be enabled to generate + must be enabled to generate CSV-format log output. @@ -3163,24 +3163,39 @@ local0.* /var/log/postgresql - This parameter captures plain and CSV-format log messages - sent to stderr and redirects them into log files. + This parameter enables the logging collector, which + is a background process that captures log messages + sent to stderr and redirects them into log files. This approach is often more useful than logging to syslog, since some types of messages - might not appear in syslog output (a common example - is dynamic-linker failure messages). + might not appear in syslog output. (One common + example is dynamic-linker failure messages; another is error messages + produced by scripts such as archive_command.) This parameter can only be set at server start. + + + It is possible to log to stderr without using the + logging collector; the log messages will just go to wherever the + server's stderr is directed. However, that method is + only suitable for low log volumes, since it provides no convenient + way to rotate log files. Also, on some platforms not using the + logging collector can result in lost or garbled log output, because + multiple processes writing concurrently to the same log file can + overwrite each other's output. + + + The logging collector is designed to never lose messages. This means that in case of extremely high load, server processes could be - blocked due to trying to send additional log messages when the + blocked while trying to send additional log messages when the collector has fallen behind. In contrast, syslog - prefers to drop messages if it cannot write them, which means it's - less reliable in those cases but it will not block the rest of the - system. + prefers to drop messages if it cannot write them, which means it + may fail to log some messages in such cases but it will not block + the rest of the system. From 6b289942bfdbbfa2955cedc591c522822a7ffbfe Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 5 Mar 2012 16:15:59 -0500 Subject: [PATCH 086/129] Redesign PlanForeignScan API to allow multiple paths for a foreign table. 
The original API specification only allowed an FDW to create a single access path, which doesn't seem like a terribly good idea in hindsight. Instead, move the responsibility for building the Path node and calling add_path() into the FDW's PlanForeignScan function. Now, it can do that more than once if appropriate. There is no longer any need for the transient FdwPlan struct, so get rid of that. Etsuro Fujita, Shigeru Hanada, Tom Lane --- contrib/file_fdw/file_fdw.c | 39 ++++++++++++++++++------ doc/src/sgml/fdwhandler.sgml | 33 ++++++++++++-------- src/backend/nodes/copyfuncs.c | 21 ++----------- src/backend/nodes/outfuncs.c | 19 +++--------- src/backend/optimizer/path/allpaths.c | 12 +++++--- src/backend/optimizer/plan/createplan.c | 9 +++--- src/backend/optimizer/util/pathnode.c | 40 +++++++++++-------------- src/include/foreign/fdwapi.h | 35 ++-------------------- src/include/nodes/nodes.h | 1 - src/include/nodes/plannodes.h | 3 +- src/include/nodes/relation.h | 10 +++++-- src/include/optimizer/pathnode.h | 6 +++- 12 files changed, 103 insertions(+), 125 deletions(-) diff --git a/contrib/file_fdw/file_fdw.c b/contrib/file_fdw/file_fdw.c index 46394a80e0545..c2faa6235e766 100644 --- a/contrib/file_fdw/file_fdw.c +++ b/contrib/file_fdw/file_fdw.c @@ -25,6 +25,7 @@ #include "miscadmin.h" #include "nodes/makefuncs.h" #include "optimizer/cost.h" +#include "optimizer/pathnode.h" #include "utils/rel.h" #include "utils/syscache.h" @@ -93,7 +94,7 @@ PG_FUNCTION_INFO_V1(file_fdw_validator); /* * FDW callback routines */ -static FdwPlan *filePlanForeignScan(Oid foreigntableid, +static void filePlanForeignScan(Oid foreigntableid, PlannerInfo *root, RelOptInfo *baserel); static void fileExplainForeignScan(ForeignScanState *node, ExplainState *es); @@ -406,27 +407,44 @@ get_file_fdw_attribute_options(Oid relid) /* * filePlanForeignScan - * Create a FdwPlan for a scan on the foreign table + * Create possible access paths for a scan on the foreign table + * + * Currently we don't support any push-down feature, so there is only one + * possible access path, which simply returns all records in the order in + * the data file. */ -static FdwPlan * +static void filePlanForeignScan(Oid foreigntableid, PlannerInfo *root, RelOptInfo *baserel) { - FdwPlan *fdwplan; char *filename; List *options; + Cost startup_cost; + Cost total_cost; /* Fetch options --- we only need filename at this point */ fileGetOptions(foreigntableid, &filename, &options); - /* Construct FdwPlan with cost estimates */ - fdwplan = makeNode(FdwPlan); + /* Estimate costs and update baserel->rows */ estimate_costs(root, baserel, filename, - &fdwplan->startup_cost, &fdwplan->total_cost); - fdwplan->fdw_private = NIL; /* not used */ + &startup_cost, &total_cost); + + /* Create a ForeignPath node and add it as only possible path */ + add_path(baserel, (Path *) + create_foreignscan_path(root, baserel, + baserel->rows, + startup_cost, + total_cost, + NIL, /* no pathkeys */ + NULL, /* no outer rel either */ + NIL, + NIL)); /* no fdw_private data */ - return fdwplan; + /* + * If data file was sorted, and we knew it somehow, we could insert + * appropriate pathkeys into the ForeignPath node to tell the planner that. + */ } /* @@ -576,6 +594,9 @@ fileReScanForeignScan(ForeignScanState *node) /* * Estimate costs of scanning a foreign table. + * + * In addition to setting *startup_cost and *total_cost, this should + * update baserel->rows. 
*/ static void estimate_costs(PlannerInfo *root, RelOptInfo *baserel, diff --git a/doc/src/sgml/fdwhandler.sgml b/doc/src/sgml/fdwhandler.sgml index 76ff243f5d34d..12c5f75bfab4b 100644 --- a/doc/src/sgml/fdwhandler.sgml +++ b/doc/src/sgml/fdwhandler.sgml @@ -88,21 +88,31 @@ -FdwPlan * +void PlanForeignScan (Oid foreigntableid, PlannerInfo *root, RelOptInfo *baserel); - Plan a scan on a foreign table. This is called when a query is planned. + Create possible access paths for a scan on a foreign table. This is + called when a query is planned. foreigntableid is the pg_class OID of the foreign table. root is the planner's global information about the query, and baserel is the planner's information about this table. - The function must return a palloc'd struct that contains cost estimates - plus any FDW-private information that is needed to execute the foreign - scan at a later time. (Note that the private information must be - represented in a form that copyObject knows how to copy.) + + + + The function must generate at least one access path (ForeignPath node) + for a scan on the foreign table and must call add_path to + add the path to baserel->pathlist. It's recommended to + use create_foreignscan_path to build the ForeignPath node. + The function may generate multiple access paths, e.g., a path which has + valid pathkeys to represent a pre-sorted result. Each access + path must contain cost estimates, and can contain any FDW-private + information that is needed to execute the foreign scan at a later time. + (Note that the private information must be represented in a form that + copyObject knows how to copy.) @@ -159,9 +169,8 @@ BeginForeignScan (ForeignScanState *node, its fdw_state field is still NULL. Information about the table to scan is accessible through the ForeignScanState node (in particular, from the underlying - ForeignScan plan node, which contains a pointer to the - FdwPlan structure returned by - PlanForeignScan). + ForeignScan plan node, which contains any FDW-private + information provided by PlanForeignScan). @@ -228,9 +237,9 @@ EndForeignScan (ForeignScanState *node); - The FdwRoutine and FdwPlan struct types - are declared in src/include/foreign/fdwapi.h, which see - for additional details. + The FdwRoutine struct type is declared in + src/include/foreign/fdwapi.h, which see for additional + details. 
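(Editorial sketch, not part of the commit: under the redesigned API, a trivial FDW's PlanForeignScan builds and registers its own path, as the file_fdw hunk above does. A hedged, minimal example; the function name myPlanForeignScan and the cost figures are invented for illustration, while the create_foreignscan_path signature matches this patch.)

static void
myPlanForeignScan(Oid foreigntableid,
				  PlannerInfo *root,
				  RelOptInfo *baserel)
{
	Cost		startup_cost = 10.0;	/* placeholder estimate */
	Cost		total_cost = startup_cost + baserel->rows * 0.01;

	/*
	 * The FDW now constructs the ForeignPath itself, and may call add_path
	 * more than once if it can offer several useful paths (for example, a
	 * pre-sorted path carrying pathkeys).
	 */
	add_path(baserel, (Path *)
			 create_foreignscan_path(root, baserel,
									 baserel->rows,
									 startup_cost,
									 total_cost,
									 NIL,	/* no pathkeys */
									 NULL,	/* no required outer rels */
									 NIL,	/* no parameterized clauses */
									 NIL));	/* no fdw_private data */
}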
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 7fec4dbf7b56c..868fb7130a8b2 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -23,7 +23,8 @@ #include "postgres.h" #include "miscadmin.h" -#include "foreign/fdwapi.h" +#include "nodes/plannodes.h" +#include "nodes/relation.h" #include "utils/datum.h" @@ -591,21 +592,6 @@ _copyForeignScan(const ForeignScan *from) * copy remainder of node */ COPY_SCALAR_FIELD(fsSystemCol); - COPY_NODE_FIELD(fdwplan); - - return newnode; -} - -/* - * _copyFdwPlan - */ -static FdwPlan * -_copyFdwPlan(const FdwPlan *from) -{ - FdwPlan *newnode = makeNode(FdwPlan); - - COPY_SCALAR_FIELD(startup_cost); - COPY_SCALAR_FIELD(total_cost); COPY_NODE_FIELD(fdw_private); return newnode; @@ -3842,9 +3828,6 @@ copyObject(const void *from) case T_ForeignScan: retval = _copyForeignScan(from); break; - case T_FdwPlan: - retval = _copyFdwPlan(from); - break; case T_Join: retval = _copyJoin(from); break; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 25a215e9d71f7..9daeb3e7b43e9 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -23,7 +23,9 @@ #include -#include "foreign/fdwapi.h" +#include "lib/stringinfo.h" +#include "nodes/plannodes.h" +#include "nodes/relation.h" #include "utils/datum.h" @@ -558,16 +560,6 @@ _outForeignScan(StringInfo str, const ForeignScan *node) _outScanInfo(str, (const Scan *) node); WRITE_BOOL_FIELD(fsSystemCol); - WRITE_NODE_FIELD(fdwplan); -} - -static void -_outFdwPlan(StringInfo str, const FdwPlan *node) -{ - WRITE_NODE_TYPE("FDWPLAN"); - - WRITE_FLOAT_FIELD(startup_cost, "%.2f"); - WRITE_FLOAT_FIELD(total_cost, "%.2f"); WRITE_NODE_FIELD(fdw_private); } @@ -1572,7 +1564,7 @@ _outForeignPath(StringInfo str, const ForeignPath *node) _outPathInfo(str, (const Path *) node); - WRITE_NODE_FIELD(fdwplan); + WRITE_NODE_FIELD(fdw_private); } static void @@ -2745,9 +2737,6 @@ _outNode(StringInfo str, const void *obj) case T_ForeignScan: _outForeignScan(str, obj); break; - case T_FdwPlan: - _outFdwPlan(str, obj); - break; case T_Join: _outJoin(str, obj); break; diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 8f034176e7cc4..6e81ce0fc2649 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -18,6 +18,7 @@ #include #include "catalog/pg_class.h" +#include "foreign/fdwapi.h" #include "nodes/nodeFuncs.h" #ifdef OPTIMIZER_DEBUG #include "nodes/print.h" @@ -399,15 +400,18 @@ set_foreign_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) /* * set_foreign_pathlist - * Build the (single) access path for a foreign table RTE + * Build access paths for a foreign table RTE */ static void set_foreign_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) { - /* Generate appropriate path */ - add_path(rel, (Path *) create_foreignscan_path(root, rel)); + FdwRoutine *fdwroutine; - /* Select cheapest path (pretty easy in this case...) 
*/ + /* Call the FDW's PlanForeignScan function to generate path(s) */ + fdwroutine = GetFdwRoutineByRelId(rte->relid); + fdwroutine->PlanForeignScan(rte->relid, root, rel); + + /* Select cheapest path */ set_cheapest(rel); } diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 9ac0c9919027a..b1df56cafd25a 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -20,7 +20,6 @@ #include #include "access/skey.h" -#include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" @@ -121,7 +120,7 @@ static CteScan *make_ctescan(List *qptlist, List *qpqual, static WorkTableScan *make_worktablescan(List *qptlist, List *qpqual, Index scanrelid, int wtParam); static ForeignScan *make_foreignscan(List *qptlist, List *qpqual, - Index scanrelid, bool fsSystemCol, FdwPlan *fdwplan); + Index scanrelid, bool fsSystemCol, List *fdw_private); static BitmapAnd *make_bitmap_and(List *bitmapplans); static BitmapOr *make_bitmap_or(List *bitmapplans); static NestLoop *make_nestloop(List *tlist, @@ -1847,7 +1846,7 @@ create_foreignscan_plan(PlannerInfo *root, ForeignPath *best_path, scan_clauses, scan_relid, fsSystemCol, - best_path->fdwplan); + best_path->fdw_private); copy_path_costsize(&scan_plan->scan.plan, &best_path->path); @@ -3189,7 +3188,7 @@ make_foreignscan(List *qptlist, List *qpqual, Index scanrelid, bool fsSystemCol, - FdwPlan *fdwplan) + List *fdw_private) { ForeignScan *node = makeNode(ForeignScan); Plan *plan = &node->scan.plan; @@ -3201,7 +3200,7 @@ make_foreignscan(List *qptlist, plan->righttree = NULL; node->scan.scanrelid = scanrelid; node->fsSystemCol = fsSystemCol; - node->fdwplan = fdwplan; + node->fdw_private = fdw_private; return node; } diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index d29b454f7249e..6d1545476df7b 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -16,7 +16,6 @@ #include -#include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" @@ -1766,36 +1765,31 @@ create_worktablescan_path(PlannerInfo *root, RelOptInfo *rel) * create_foreignscan_path * Creates a path corresponding to a scan of a foreign table, * returning the pathnode. + * + * This function is never called from core Postgres; rather, it's expected + * to be called by the PlanForeignScan function of a foreign data wrapper. + * We make the FDW supply all fields of the path, since we do not have any + * way to calculate them in core. 
*/ ForeignPath * -create_foreignscan_path(PlannerInfo *root, RelOptInfo *rel) +create_foreignscan_path(PlannerInfo *root, RelOptInfo *rel, + double rows, Cost startup_cost, Cost total_cost, + List *pathkeys, + Relids required_outer, List *param_clauses, + List *fdw_private) { ForeignPath *pathnode = makeNode(ForeignPath); - RangeTblEntry *rte; - FdwRoutine *fdwroutine; - FdwPlan *fdwplan; pathnode->path.pathtype = T_ForeignScan; pathnode->path.parent = rel; - pathnode->path.pathkeys = NIL; /* result is always unordered */ - pathnode->path.required_outer = NULL; - pathnode->path.param_clauses = NIL; + pathnode->path.rows = rows; + pathnode->path.startup_cost = startup_cost; + pathnode->path.total_cost = total_cost; + pathnode->path.pathkeys = pathkeys; + pathnode->path.required_outer = required_outer; + pathnode->path.param_clauses = param_clauses; - /* Get FDW's callback info */ - rte = planner_rt_fetch(rel->relid, root); - fdwroutine = GetFdwRoutineByRelId(rte->relid); - - /* Let the FDW do its planning */ - fdwplan = fdwroutine->PlanForeignScan(rte->relid, root, rel); - if (fdwplan == NULL || !IsA(fdwplan, FdwPlan)) - elog(ERROR, "foreign-data wrapper PlanForeignScan function for relation %u did not return an FdwPlan struct", - rte->relid); - pathnode->fdwplan = fdwplan; - - /* use costs estimated by FDW */ - pathnode->path.rows = rel->rows; - pathnode->path.startup_cost = fdwplan->startup_cost; - pathnode->path.total_cost = fdwplan->total_cost; + pathnode->fdw_private = fdw_private; return pathnode; } diff --git a/src/include/foreign/fdwapi.h b/src/include/foreign/fdwapi.h index 3696623742e1d..9e135c62069fd 100644 --- a/src/include/foreign/fdwapi.h +++ b/src/include/foreign/fdwapi.h @@ -19,42 +19,13 @@ struct ExplainState; -/* - * FdwPlan is the information returned to the planner by PlanForeignScan. - */ -typedef struct FdwPlan -{ - NodeTag type; - - /* - * Cost estimation info. The startup_cost is time before retrieving the - * first row, so it should include costs of connecting to the remote host, - * sending over the query, etc. Note that PlanForeignScan also ought to - * set baserel->rows and baserel->width if it can produce any usable - * estimates of those values. - */ - Cost startup_cost; /* cost expended before fetching any tuples */ - Cost total_cost; /* total cost (assuming all tuples fetched) */ - - /* - * FDW private data, which will be available at execution time. - * - * Note that everything in this list must be copiable by copyObject(). One - * way to store an arbitrary blob of bytes is to represent it as a bytea - * Const. Usually, though, you'll be better off choosing a representation - * that can be dumped usefully by nodeToString(). - */ - List *fdw_private; -} FdwPlan; - - /* * Callback function signatures --- see fdwhandler.sgml for more info. 
*/ -typedef FdwPlan *(*PlanForeignScan_function) (Oid foreigntableid, - PlannerInfo *root, - RelOptInfo *baserel); +typedef void (*PlanForeignScan_function) (Oid foreigntableid, + PlannerInfo *root, + RelOptInfo *baserel); typedef void (*ExplainForeignScan_function) (ForeignScanState *node, struct ExplainState *es); diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 0e7d184a0d8b8..905458fd50bfb 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -62,7 +62,6 @@ typedef enum NodeTag T_CteScan, T_WorkTableScan, T_ForeignScan, - T_FdwPlan, T_Join, T_NestLoop, T_MergeJoin, diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 7d90b91ad527b..3962792d3d89a 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -468,8 +468,7 @@ typedef struct ForeignScan { Scan scan; bool fsSystemCol; /* true if any "system column" is needed */ - /* use struct pointer to avoid including fdwapi.h here */ - struct FdwPlan *fdwplan; + List *fdw_private; /* private data for FDW */ } ForeignScan; diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 6ba920a479ef6..2a686080059f3 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -794,12 +794,18 @@ typedef struct TidPath /* * ForeignPath represents a scan of a foreign table + * + * fdw_private contains FDW private data about the scan, which will be copied + * to the final ForeignScan plan node so that it is available at execution + * time. Note that everything in this list must be copiable by copyObject(). + * One way to store an arbitrary blob of bytes is to represent it as a bytea + * Const. Usually, though, you'll be better off choosing a representation + * that can be dumped usefully by nodeToString(). */ typedef struct ForeignPath { Path path; - /* use struct pointer to avoid including fdwapi.h here */ - struct FdwPlan *fdwplan; + List *fdw_private; } ForeignPath; /* diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 1cf34171f4fce..3f80ca3fe9f01 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -68,7 +68,11 @@ extern Path *create_functionscan_path(PlannerInfo *root, RelOptInfo *rel); extern Path *create_valuesscan_path(PlannerInfo *root, RelOptInfo *rel); extern Path *create_ctescan_path(PlannerInfo *root, RelOptInfo *rel); extern Path *create_worktablescan_path(PlannerInfo *root, RelOptInfo *rel); -extern ForeignPath *create_foreignscan_path(PlannerInfo *root, RelOptInfo *rel); +extern ForeignPath *create_foreignscan_path(PlannerInfo *root, RelOptInfo *rel, + double rows, Cost startup_cost, Cost total_cost, + List *pathkeys, + Relids required_outer, List *param_clauses, + List *fdw_private); extern Relids calc_nestloop_required_outer(Path *outer_path, Path *inner_path); extern Relids calc_non_nestloop_required_outer(Path *outer_path, Path *inner_path); From 2127aac6ef56df14f2fc9e91871c6e9194c52eed Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Mon, 5 Mar 2012 21:19:54 -0500 Subject: [PATCH 087/129] In pg_upgrade, only lock the old cluster if link mode is used, and do it right after we restore the schema (a common failure point), and right before we do the link operation. 
Per suggestions from Robert and Álvaro --- contrib/pg_upgrade/check.c | 9 ++++----- contrib/pg_upgrade/controldata.c | 9 ++++++++- contrib/pg_upgrade/pg_upgrade.c | 20 ++++++++++---------- contrib/pg_upgrade/pg_upgrade.h | 5 ++--- doc/src/sgml/pgupgrade.sgml | 29 +++++++++++++---------------- 5 files changed, 37 insertions(+), 35 deletions(-) diff --git a/contrib/pg_upgrade/check.c b/contrib/pg_upgrade/check.c index 891eb9a9f0dd4..a5f63eb6c800f 100644 --- a/contrib/pg_upgrade/check.c +++ b/contrib/pg_upgrade/check.c @@ -148,9 +148,8 @@ report_clusters_compatible(void) } pg_log(PG_REPORT, "\n" - "If pg_upgrade fails after this point, you must re-initdb the new cluster\n" - "before continuing. You will also need to remove the \".old\" suffix from\n" - "%s/global/pg_control.old.\n", old_cluster.pgdata); + "If pg_upgrade fails after this point, you must re-initdb the\n" + "new cluster before continuing.\n"); } @@ -198,8 +197,8 @@ output_completion_banner(char *deletion_script_file_name) /* Did we copy the free space files? */ if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804) pg_log(PG_REPORT, - "Optimizer statistics are not transferred by pg_upgrade so consider\n" - "running:\n" + "Optimizer statistics are not transferred by pg_upgrade so\n" + "consider running:\n" " vacuumdb --all --analyze-only\n" "on the newly-upgraded cluster.\n\n"); else diff --git a/contrib/pg_upgrade/controldata.c b/contrib/pg_upgrade/controldata.c index 8560d881193eb..5239601dc6b65 100644 --- a/contrib/pg_upgrade/controldata.c +++ b/contrib/pg_upgrade/controldata.c @@ -516,11 +516,12 @@ check_control_data(ControlData *oldctrl, void -rename_old_pg_control(void) +disable_old_cluster(void) { char old_path[MAXPGPATH], new_path[MAXPGPATH]; + /* rename pg_control so old server cannot be accidentally started */ prep_status("Adding \".old\" suffix to old global/pg_control"); snprintf(old_path, sizeof(old_path), "%s/global/pg_control", old_cluster.pgdata); @@ -528,4 +529,10 @@ rename_old_pg_control(void) if (pg_mv_file(old_path, new_path) != 0) pg_log(PG_FATAL, "Unable to rename %s to %s.\n", old_path, new_path); check_ok(); + + pg_log(PG_REPORT, "\n" + "If you want to start the old cluster, you will need to remove\n" + "the \".old\" suffix from %s/global/pg_control.old.\n" + "Because \"link\" mode was used, the old cluster cannot be safely\n" + "started once the new cluster has been started.\n\n", old_cluster.pgdata); } diff --git a/contrib/pg_upgrade/pg_upgrade.c b/contrib/pg_upgrade/pg_upgrade.c index 15b30fc0de0d7..3078bcd4cd03b 100644 --- a/contrib/pg_upgrade/pg_upgrade.c +++ b/contrib/pg_upgrade/pg_upgrade.c @@ -43,7 +43,6 @@ #include #endif -static void disable_old_cluster(void); static void prepare_new_cluster(void); static void prepare_new_databases(void); static void create_new_objects(void); @@ -87,7 +86,6 @@ main(int argc, char **argv) pg_log(PG_REPORT, "\nPerforming Upgrade\n"); pg_log(PG_REPORT, "------------------\n"); - disable_old_cluster(); prepare_new_cluster(); stop_postmaster(false); @@ -109,6 +107,16 @@ main(int argc, char **argv) stop_postmaster(false); + /* + * Most failures happen in create_new_objects(), which has + * completed at this point. We do this here because it is just + * before linking, which will link the old and new cluster data + * files, preventing the old cluster from being safely started + * once the new cluster is started.
+ */ + if (user_opts.transfer_mode == TRANSFER_MODE_LINK) + disable_old_cluster(); + transfer_all_new_dbs(&old_cluster.dbarr, &new_cluster.dbarr, old_cluster.pgdata, new_cluster.pgdata); @@ -176,14 +184,6 @@ setup(char *argv0, bool live_check) } -static void -disable_old_cluster(void) -{ - /* rename pg_control so old server cannot be accidentally started */ - rename_old_pg_control(); -} - - static void prepare_new_cluster(void) { diff --git a/contrib/pg_upgrade/pg_upgrade.h b/contrib/pg_upgrade/pg_upgrade.h index 58d5201bfca8f..a95481509db74 100644 --- a/contrib/pg_upgrade/pg_upgrade.h +++ b/contrib/pg_upgrade/pg_upgrade.h @@ -282,8 +282,8 @@ void create_script_for_old_cluster_deletion(char **deletion_script_file_name); /* controldata.c */ void get_control_data(ClusterInfo *cluster, bool live_check); -void check_control_data(ControlData *oldctrl, - ControlData *newctrl); +void check_control_data(ControlData *oldctrl, ControlData *newctrl); +void disable_old_cluster(void); /* dump.c */ @@ -298,7 +298,6 @@ int exec_prog(bool throw_error, const char *cmd, ...) __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 3))); void verify_directories(void); bool is_server_running(const char *datadir); -void rename_old_pg_control(void); /* file.c */ diff --git a/doc/src/sgml/pgupgrade.sgml b/doc/src/sgml/pgupgrade.sgml index 1373069243c93..4f263fe6720a1 100644 --- a/doc/src/sgml/pgupgrade.sgml +++ b/doc/src/sgml/pgupgrade.sgml @@ -182,7 +182,7 @@ If you are using a version-specific installation directory, e.g. - /opt/PostgreSQL/8.4, you do not need to move the old cluster. The + /opt/PostgreSQL/9.1, you do not need to move the old cluster. The one-click installers all use version-specific installation directories. @@ -254,7 +254,8 @@ gmake prefix=/usr/local/pgsql.new install Install any custom shared object files (or DLLs) used by the old cluster - into the new cluster, e.g. pgcrypto.so, whether they are from contrib + into the new cluster, e.g. pgcrypto.so, + whether they are from contrib or some other source. Do not install the schema definitions, e.g. pgcrypto.sql, because these will be upgraded from the old cluster. @@ -454,18 +455,14 @@ psql --username postgres --file script.sql postgres - If you - ran pg_upgrade without @@ -582,9 +579,9 @@ psql --username postgres --file script.sql postgres - If you want to use link mode and you don't want your old cluster + If you want to use link mode and you do not want your old cluster to be modified when the new cluster is started, make a copy of the - old cluster and upgrade that with link mode. To make a valid copy + old cluster and upgrade that in link mode. To make a valid copy of the old cluster, use rsync to create a dirty copy of the old cluster while the server is running, then shut down the old server and run rsync again to update the copy with any From 3b682df3260aa8e020201e4b6c5cbc31fe8ecb8e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 6 Mar 2012 09:13:00 +0200 Subject: [PATCH 088/129] Simplify the way changes to full_page_writes are logged. It's harmless to do full page writes even when not strictly necessary, so when turning full_page_writes on, we can set the global flag first, and then call XLogInsert. Likewise, when turning it off, we can write the WAL record first, and then clear the flag. This way XLogInsert doesn't need any special handling of the XLOG_FPW_CHANGE record type. XLogInsert is complicated enough already, so anything we can keep away from there is a good thing. 
Actually I don't think the atomicity of the shared memory flag matters, anyway, because we only write the XLOG_FPW_CHANGE at the end of recovery, when there are no concurrent WAL insertions going on. But might as well make it safe, in case we allow changing full_page_writes on the fly in the future. --- src/backend/access/transam/xlog.c | 60 +++++++++++++------------------ 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 266c0decacac8..eb7932e90aeed 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -731,8 +731,7 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata) unsigned i; bool updrqst; bool doPageWrites; - bool isLogSwitch = false; - bool fpwChange = false; + bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH); uint8 info_orig = info; /* cross-check on whether we should be here or not */ @@ -746,30 +745,11 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata) TRACE_POSTGRESQL_XLOG_INSERT(rmid, info); /* - * Handle special cases/records. + * In bootstrap mode, we don't actually log anything but XLOG resources; + * return a phony record pointer. */ - if (rmid == RM_XLOG_ID) + if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID) { - switch (info) - { - case XLOG_SWITCH: - isLogSwitch = true; - break; - - case XLOG_FPW_CHANGE: - fpwChange = true; - break; - - default: - break; - } - } - else if (IsBootstrapProcessingMode()) - { - /* - * In bootstrap mode, we don't actually log anything but XLOG resources; - * return a phony record pointer. - */ RecPtr.xlogid = 0; RecPtr.xrecoff = SizeOfXLogLongPHD; /* start of 1st chkpt record */ return RecPtr; @@ -1232,15 +1212,6 @@ begin:; WriteRqst = XLogCtl->xlblocks[curridx]; } - /* - * If the record is an XLOG_FPW_CHANGE, we update full_page_writes - * in shared memory before releasing WALInsertLock. This ensures that - * an XLOG_FPW_CHANGE record precedes any WAL record affected - * by this change of full_page_writes. - */ - if (fpwChange) - Insert->fullPageWrites = fullPageWrites; - LWLockRelease(WALInsertLock); if (updrqst) @@ -8517,6 +8488,22 @@ UpdateFullPageWrites(void) if (fullPageWrites == Insert->fullPageWrites) return; + START_CRIT_SECTION(); + + /* + * It's always safe to take full page images, even when not strictly + * required, but not the other way round. So if we're setting full_page_writes + * to true, first set it true and then write the WAL record. If we're + * setting it to false, first write the WAL record and then set the + * global flag. + */ + if (fullPageWrites) + { + LWLockAcquire(WALInsertLock, LW_EXCLUSIVE); + Insert->fullPageWrites = true; + LWLockRelease(WALInsertLock); + } + /* * Write an XLOG_FPW_CHANGE record. This allows us to keep * track of full_page_writes during archive recovery, if required. @@ -8532,12 +8519,15 @@ UpdateFullPageWrites(void) XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, &rdata); } - + + + if (!fullPageWrites) { LWLockAcquire(WALInsertLock, LW_EXCLUSIVE); - Insert->fullPageWrites = fullPageWrites; + Insert->fullPageWrites = false; LWLockRelease(WALInsertLock); } + END_CRIT_SECTION(); } /* From 7714c6382941383514c0f1954ca831686ac4fcd2 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 6 Mar 2012 09:34:10 +0200 Subject: [PATCH 089/129] Remove extra copies of LogwrtResult. This simplifies the code a little bit.
The new rule is that to update XLogCtl->LogwrtResult, you must hold both WALWriteLock and info_lck, whereas before we had two copies, one that was protected by WALWriteLock and another protected by info_lck. The code that updates them was already holding both locks, so merging the two is trivial. The third copy, XLogCtl->Insert.LogwrtResult, was not totally redundant, it was used in AdvanceXLInsertBuffer to update the backend-local copy, before acquiring the info_lck to read the up-to-date value. But the value of that seems dubious; at best it's saving one spinlock acquisition per completed WAL page, which is not significant compared to all the other work involved. And in practice, it's probably not saving even that much. --- src/backend/access/transam/xlog.c | 74 +++++++++---------------------- 1 file changed, 22 insertions(+), 52 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index eb7932e90aeed..49d4b36652653 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -289,35 +289,16 @@ static XLogRecPtr RedoStartLSN = {0, 0}; * These structs are identical but are declared separately to indicate their * slightly different functions. * - * We do a lot of pushups to minimize the amount of access to lockable - * shared memory values. There are actually three shared-memory copies of - * LogwrtResult, plus one unshared copy in each backend. Here's how it works: - * XLogCtl->LogwrtResult is protected by info_lck - * XLogCtl->Write.LogwrtResult is protected by WALWriteLock - * XLogCtl->Insert.LogwrtResult is protected by WALInsertLock - * One must hold the associated lock to read or write any of these, but - * of course no lock is needed to read/write the unshared LogwrtResult. - * - * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always - * right", since both are updated by a write or flush operation before - * it releases WALWriteLock. The point of keeping XLogCtl->Write.LogwrtResult - * is that it can be examined/modified by code that already holds WALWriteLock - * without needing to grab info_lck as well. - * - * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two, - * but is updated when convenient. Again, it exists for the convenience of - * code that is already holding WALInsertLock but not the other locks. - * - * The unshared LogwrtResult may lag behind any or all of these, and again - * is updated when convenient. + * To read XLogCtl->LogwrtResult, you must hold either info_lck or + * WALWriteLock. To update it, you need to hold both locks. The point of + * this arrangement is that the value can be examined by code that already + * holds WALWriteLock without needing to grab info_lck as well. In addition + * to the shared variable, each backend has a private copy of LogwrtResult, + * which is updated when convenient. * * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst * (protected by info_lck), but we don't need to cache any copies of it. * - * Note that this all works because the request and result positions can only - * advance forward, never back up, and so we can easily determine which of two - * values is "more up to date". - * * info_lck is only held long enough to read/update the protected variables, * so it's a plain spinlock. The other locks are held longer (potentially * over I/O operations), so we use LWLocks for them. 
These locks are: @@ -354,7 +335,6 @@ typedef struct XLogwrtResult */ typedef struct XLogCtlInsert { - XLogwrtResult LogwrtResult; /* a recent value of LogwrtResult */ XLogRecPtr PrevRecord; /* start of previously-inserted record */ int curridx; /* current block index in cache */ XLogPageHeader currpage; /* points to header of block in cache */ @@ -388,7 +368,6 @@ typedef struct XLogCtlInsert */ typedef struct XLogCtlWrite { - XLogwrtResult LogwrtResult; /* current value of LogwrtResult */ int curridx; /* cache index of next block to write */ pg_time_t lastSegSwitchTime; /* time of last xlog segment switch */ } XLogCtlWrite; @@ -403,7 +382,6 @@ typedef struct XLogCtlData /* Protected by info_lck: */ XLogwrtRqst LogwrtRqst; - XLogwrtResult LogwrtResult; uint32 ckptXidEpoch; /* nextXID & epoch of latest checkpoint */ TransactionId ckptXid; XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */ @@ -413,6 +391,12 @@ typedef struct XLogCtlData /* Protected by WALWriteLock: */ XLogCtlWrite Write; + /* + * Protected by info_lck and WALWriteLock (you must hold either lock to + * read it, but both to update) + */ + XLogwrtResult LogwrtResult; + /* * These values do not change after startup, although the pointed-to pages * and xlblocks values certainly do. Permission to read/write the pages @@ -1015,7 +999,7 @@ begin:; } LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); - LogwrtResult = XLogCtl->Write.LogwrtResult; + LogwrtResult = XLogCtl->LogwrtResult; if (!XLByteLE(RecPtr, LogwrtResult.Flush)) { XLogwrtRqst FlushRqst; @@ -1188,8 +1172,6 @@ begin:; SpinLockRelease(&xlogctl->info_lck); } - Write->LogwrtResult = LogwrtResult; - LWLockRelease(WALWriteLock); updrqst = false; /* done already */ @@ -1477,7 +1459,6 @@ static bool AdvanceXLInsertBuffer(bool new_segment) { XLogCtlInsert *Insert = &XLogCtl->Insert; - XLogCtlWrite *Write = &XLogCtl->Write; int nextidx = NextBufIdx(Insert->curridx); bool update_needed = true; XLogRecPtr OldPageRqstPtr; @@ -1485,10 +1466,6 @@ AdvanceXLInsertBuffer(bool new_segment) XLogRecPtr NewPageEndPtr; XLogPageHeader NewPage; - /* Use Insert->LogwrtResult copy if it's more fresh */ - if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write)) - LogwrtResult = Insert->LogwrtResult; - /* * Get ending-offset of the buffer page we need to replace (this may be * zero if the buffer hasn't been used yet). Fall through if it's already @@ -1516,21 +1493,19 @@ AdvanceXLInsertBuffer(bool new_segment) update_needed = false; /* Did the shared-request update */ - if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write)) - { - /* OK, someone wrote it already */ - Insert->LogwrtResult = LogwrtResult; - } - else + /* + * Now that we have an up-to-date LogwrtResult value, see if we still + * need to write it or if someone else already did. 
+ */ + if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write)) { /* Must acquire write lock */ LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); - LogwrtResult = Write->LogwrtResult; + LogwrtResult = XLogCtl->LogwrtResult; if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write)) { /* OK, someone wrote it already */ LWLockRelease(WALWriteLock); - Insert->LogwrtResult = LogwrtResult; } else { @@ -1544,7 +1519,6 @@ AdvanceXLInsertBuffer(bool new_segment) WriteRqst.Flush.xrecoff = 0; XLogWrite(WriteRqst, false, false); LWLockRelease(WALWriteLock); - Insert->LogwrtResult = LogwrtResult; TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); } } @@ -1697,7 +1671,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch) /* * Update local LogwrtResult (caller probably did this already, but...) */ - LogwrtResult = Write->LogwrtResult; + LogwrtResult = XLogCtl->LogwrtResult; /* * Since successive pages in the xlog cache are consecutively allocated, @@ -1931,8 +1905,6 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch) xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush; SpinLockRelease(&xlogctl->info_lck); } - - Write->LogwrtResult = LogwrtResult; } /* @@ -2126,7 +2098,7 @@ XLogFlush(XLogRecPtr record) continue; } /* Got the lock */ - LogwrtResult = XLogCtl->Write.LogwrtResult; + LogwrtResult = XLogCtl->LogwrtResult; if (!XLByteLE(record, LogwrtResult.Flush)) { /* try to write/flush later additions to XLOG as well */ @@ -2268,7 +2240,7 @@ XLogBackgroundFlush(void) /* now wait for the write lock */ LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); - LogwrtResult = XLogCtl->Write.LogwrtResult; + LogwrtResult = XLogCtl->LogwrtResult; if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush)) { XLogwrtRqst WriteRqst; @@ -6831,8 +6803,6 @@ StartupXLOG(void) LogwrtResult.Write = LogwrtResult.Flush = EndOfLog; - XLogCtl->Write.LogwrtResult = LogwrtResult; - Insert->LogwrtResult = LogwrtResult; XLogCtl->LogwrtResult = LogwrtResult; XLogCtl->LogwrtRqst.Write = EndOfLog; From e587e2e3e39ec54772905b8e0ac7155a03253934 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 6 Mar 2012 10:44:51 +0200 Subject: [PATCH 090/129] Make the comments more clear on the fact that UpdateFullPageWrites() is not safe to call concurrently from multiple processes. --- src/backend/access/transam/xlog.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 49d4b36652653..c23cf635395ee 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -8442,6 +8442,9 @@ XLogReportParameters(void) /* * Update full_page_writes in shared memory, and write an * XLOG_FPW_CHANGE record if necessary. + * + * Note: this function assumes there is no other process running + * concurrently that could update it. */ void UpdateFullPageWrites(void) @@ -8452,8 +8455,8 @@ UpdateFullPageWrites(void) * Do nothing if full_page_writes has not been changed. * * It's safe to check the shared full_page_writes without the lock, - * because we can guarantee that there is no concurrently running - * process which can update it. + * because we assume that there is no concurrently running process + * which can update it. 
*/ if (fullPageWrites == Insert->fullPageWrites) return; @@ -8490,7 +8493,6 @@ UpdateFullPageWrites(void) XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, &rdata); } - if (!fullPageWrites) { LWLockAcquire(WALInsertLock, LW_EXCLUSIVE); From bc97c38115ed48c1dd27e9d534faae204427f9a5 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 6 Mar 2012 08:23:51 -0500 Subject: [PATCH 091/129] Typo fix. Fujii Masao --- src/backend/access/transam/xlog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index c23cf635395ee..8bbca613f2674 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -6818,7 +6818,7 @@ StartupXLOG(void) else { /* - * Whenever Write.LogwrtResult points to exactly the end of a page, + * Whenever LogwrtResult points to exactly the end of a page, * Write.curridx must point to the *next* page (see XLogWrite()). * * Note: it might seem we should do AdvanceXLInsertBuffer() here, but From 19dbc3463161a142537ba5c569c8e6a073a318de Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 6 Mar 2012 15:35:41 -0500 Subject: [PATCH 092/129] Add a hook for processing messages due to be sent to the server log. Use-cases for this include custom log filtering rules and custom log message transmission mechanisms (for instance, lossy log message collection, which has been discussed several times recently). As is our common practice for hooks, there's no regression test nor user-facing documentation for this, though the author did exhibit a sample module using the hook. Martin Pihlak, reviewed by Marti Raudsepp --- src/backend/utils/error/elog.c | 26 ++++++++++++++++++++++++++ src/include/utils/elog.h | 4 ++++ 2 files changed, 30 insertions(+) diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index 470081a18009e..239ac19882d6b 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -95,6 +95,15 @@ sigjmp_buf *PG_exception_stack = NULL; extern bool redirection_done; +/* + * Hook for intercepting messages before they are sent to the server log. + * Note that the hook will not get called for messages that are suppressed + * by log_min_messages. Also note that logging hooks implemented in preload + * libraries will miss any log messages that are generated before the + * library is loaded. + */ +emit_log_hook_type emit_log_hook = NULL; + /* GUC parameters */ int Log_error_verbosity = PGERROR_VERBOSE; char *Log_line_prefix = NULL; /* format for extra log line info */ @@ -1276,6 +1285,23 @@ EmitErrorReport(void) CHECK_STACK_DEPTH(); oldcontext = MemoryContextSwitchTo(ErrorContext); + /* + * Call hook before sending message to log. The hook function is allowed + * to turn off edata->output_to_server, so we must recheck that afterward. + * Making any other change in the content of edata is not considered + * supported. + * + * Note: the reason why the hook can only turn off output_to_server, and + * not turn it on, is that it'd be unreliable: we will never get here at + * all if errstart() deems the message uninteresting. A hook that could + * make decisions in that direction would have to hook into errstart(), + * where it would have much less information available. emit_log_hook is + * intended for custom log filtering and custom log message transmission + * mechanisms. 
+ */ + if (edata->output_to_server && emit_log_hook) + (*emit_log_hook) (edata); + /* Send to server log, if enabled */ if (edata->output_to_server) send_message_to_server_log(edata); diff --git a/src/include/utils/elog.h b/src/include/utils/elog.h index fbc08df7dec42..7b5bcfae6fdb9 100644 --- a/src/include/utils/elog.h +++ b/src/include/utils/elog.h @@ -334,6 +334,10 @@ extern void FlushErrorState(void); extern void ReThrowError(ErrorData *edata); extern void pg_re_throw(void) __attribute__((noreturn)); +/* Hook for intercepting messages before they are sent to the server log */ +typedef void (*emit_log_hook_type) (ErrorData *edata); +extern PGDLLIMPORT emit_log_hook_type emit_log_hook; + /* GUC-configurable parameters */ From e685a8e6651b0e55996e6eba6917302607866793 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 6 Mar 2012 23:20:43 +0200 Subject: [PATCH 093/129] libpq: Small code clarification, and avoid casting away const --- src/interfaces/libpq/fe-auth.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/interfaces/libpq/fe-auth.c b/src/interfaces/libpq/fe-auth.c index b7a3a814f0403..7c7383cd9d5d7 100644 --- a/src/interfaces/libpq/fe-auth.c +++ b/src/interfaces/libpq/fe-auth.c @@ -739,7 +739,8 @@ static int pg_password_sendauth(PGconn *conn, const char *password, AuthRequest areq) { int ret; - char *crypt_pwd; + char *crypt_pwd = NULL; + const char *pwd_to_send; /* Encrypt the password if needed. */ @@ -771,21 +772,22 @@ pg_password_sendauth(PGconn *conn, const char *password, AuthRequest areq) free(crypt_pwd); return STATUS_ERROR; } + + pwd_to_send = crypt_pwd; break; } case AUTH_REQ_PASSWORD: - /* discard const so we can assign it */ - crypt_pwd = (char *) password; + pwd_to_send = password; break; default: return STATUS_ERROR; } /* Packet has a message type as of protocol 3.0 */ if (PG_PROTOCOL_MAJOR(conn->pversion) >= 3) - ret = pqPacketSend(conn, 'p', crypt_pwd, strlen(crypt_pwd) + 1); + ret = pqPacketSend(conn, 'p', pwd_to_send, strlen(pwd_to_send) + 1); else - ret = pqPacketSend(conn, 0, crypt_pwd, strlen(crypt_pwd) + 1); - if (areq == AUTH_REQ_MD5) + ret = pqPacketSend(conn, 0, pwd_to_send, strlen(pwd_to_send) + 1); + if (crypt_pwd) free(crypt_pwd); return ret; } From d4bf3c9c94305e692349fb6fe0c67e483b72ae87 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 7 Mar 2012 14:51:13 -0500 Subject: [PATCH 094/129] Expose an API for calculating catcache hash values. Now that cache invalidation callbacks get only a hash value, and not a tuple TID (per commits 632ae6829f7abda34e15082c91d9dfb3fc0f298b and b5282aa893e565b7844f8237462cb843438cdd5e), the only way they can restrict what they invalidate is to know what the hash values mean. setrefs.c was doing this via a hard-wired assumption but that seems pretty grotty, and it'll only get worse as more cases come up. So let's expose a calculation function that takes the same parameters as SearchSysCache. Per complaint from Marko Kreen. 
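To illustrate the intended usage with a hypothetical module (not part of this patch): code that maintains a private cache keyed by function OID can record the syscache hash of each key at store time and compare it in its invalidation callback. GetSysCacheHashValue1 and PROCOID come from utils/syscache.h and CacheRegisterSyscacheCallback from utils/inval.h; the entry struct and function names here are invented:

typedef struct MyEntry
{
    Oid         funcid;
    uint32      proc_hash;  /* hash of the PROCOID lookup key */
    bool        valid;
} MyEntry;

static MyEntry my_entry;    /* a one-slot cache, for brevity */

static void
my_proc_inval(Datum arg, int cacheid, uint32 hashvalue)
{
    /* a zero hash value conventionally signals a wholesale reset */
    if (hashvalue == 0 || hashvalue == my_entry.proc_hash)
        my_entry.valid = false;
}

static void
my_entry_store(Oid funcid)
{
    my_entry.funcid = funcid;
    my_entry.proc_hash = GetSysCacheHashValue1(PROCOID,
                                               ObjectIdGetDatum(funcid));
    my_entry.valid = true;
}

void
my_module_init(void)
{
    CacheRegisterSyscacheCallback(PROCOID, my_proc_inval, (Datum) 0);
}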
--- src/backend/optimizer/plan/setrefs.c | 10 +++---- src/backend/utils/cache/catcache.c | 40 ++++++++++++++++++++++++++++ src/backend/utils/cache/syscache.c | 24 +++++++++++++++++ src/include/utils/catcache.h | 4 +++ src/include/utils/syscache.h | 12 +++++++++ 5 files changed, 83 insertions(+), 7 deletions(-) diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 66d21b2b2c3de..e1b48fb4f5306 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -15,7 +15,6 @@ */ #include "postgres.h" -#include "access/hash.h" #include "access/transam.h" #include "catalog/pg_type.h" #include "nodes/makefuncs.h" @@ -1830,14 +1829,11 @@ record_plan_function_dependency(PlannerInfo *root, Oid funcid) /* * It would work to use any syscache on pg_proc, but the easiest is * PROCOID since we already have the function's OID at hand. Note - * that plancache.c knows we use PROCOID. Also, we're perhaps - * assuming more than we should about how CatalogCacheComputeHashValue - * computes hash values... + * that plancache.c knows we use PROCOID. */ inval_item->cacheId = PROCOID; - inval_item->hashValue = - DatumGetUInt32(DirectFunctionCall1(hashoid, - ObjectIdGetDatum(funcid))); + inval_item->hashValue = GetSysCacheHashValue1(PROCOID, + ObjectIdGetDatum(funcid)); root->glob->invalItems = lappend(root->glob->invalItems, inval_item); } diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index acd0518748d68..ea3daa599ca00 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -1281,6 +1281,46 @@ ReleaseCatCache(HeapTuple tuple) } +/* + * GetCatCacheHashValue + * + * Compute the hash value for a given set of search keys. + * + * The reason for exposing this as part of the API is that the hash value is + * exposed in cache invalidation operations, so there are places outside the + * catcache code that need to be able to compute the hash values. + */ +uint32 +GetCatCacheHashValue(CatCache *cache, + Datum v1, + Datum v2, + Datum v3, + Datum v4) +{ + ScanKeyData cur_skey[CATCACHE_MAXKEYS]; + + /* + * one-time startup overhead for each cache + */ + if (cache->cc_tupdesc == NULL) + CatalogCacheInitializeCache(cache); + + /* + * initialize the search key information + */ + memcpy(cur_skey, cache->cc_skey, sizeof(cur_skey)); + cur_skey[0].sk_argument = v1; + cur_skey[1].sk_argument = v2; + cur_skey[2].sk_argument = v3; + cur_skey[3].sk_argument = v4; + + /* + * calculate the hash value + */ + return CatalogCacheComputeHashValue(cache, cache->cc_nkeys, cur_skey); +} + + /* * SearchCatCacheList * diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index 78ce0b881a59c..c365ec7597a8f 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -1050,6 +1050,30 @@ SysCacheGetAttr(int cacheId, HeapTuple tup, isNull); } +/* + * GetSysCacheHashValue + * + * Get the hash value that would be used for a tuple in the specified cache + * with the given search keys. + * + * The reason for exposing this as part of the API is that the hash value is + * exposed in cache invalidation operations, so there are places outside the + * catcache code that need to be able to compute the hash values. 
+ */ +uint32 +GetSysCacheHashValue(int cacheId, + Datum key1, + Datum key2, + Datum key3, + Datum key4) +{ + if (cacheId < 0 || cacheId >= SysCacheSize || + !PointerIsValid(SysCache[cacheId])) + elog(ERROR, "invalid cache ID: %d", cacheId); + + return GetCatCacheHashValue(SysCache[cacheId], key1, key2, key3, key4); +} + /* * List-search interface */ diff --git a/src/include/utils/catcache.h b/src/include/utils/catcache.h index bc19ef0255f74..d91700a07e373 100644 --- a/src/include/utils/catcache.h +++ b/src/include/utils/catcache.h @@ -174,6 +174,10 @@ extern HeapTuple SearchCatCache(CatCache *cache, Datum v3, Datum v4); extern void ReleaseCatCache(HeapTuple tuple); +extern uint32 GetCatCacheHashValue(CatCache *cache, + Datum v1, Datum v2, + Datum v3, Datum v4); + extern CatCList *SearchCatCacheList(CatCache *cache, int nkeys, Datum v1, Datum v2, Datum v3, Datum v4); diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h index 0b539dba75017..d59dd4e0c7038 100644 --- a/src/include/utils/syscache.h +++ b/src/include/utils/syscache.h @@ -113,6 +113,9 @@ extern bool SearchSysCacheExistsAttName(Oid relid, const char *attname); extern Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull); +extern uint32 GetSysCacheHashValue(int cacheId, + Datum key1, Datum key2, Datum key3, Datum key4); + /* list-search interface. Users of this must import catcache.h too */ extern struct catclist *SearchSysCacheList(int cacheId, int nkeys, Datum key1, Datum key2, Datum key3, Datum key4); @@ -158,6 +161,15 @@ extern struct catclist *SearchSysCacheList(int cacheId, int nkeys, #define GetSysCacheOid4(cacheId, key1, key2, key3, key4) \ GetSysCacheOid(cacheId, key1, key2, key3, key4) +#define GetSysCacheHashValue1(cacheId, key1) \ + GetSysCacheHashValue(cacheId, key1, 0, 0, 0) +#define GetSysCacheHashValue2(cacheId, key1, key2) \ + GetSysCacheHashValue(cacheId, key1, key2, 0, 0) +#define GetSysCacheHashValue3(cacheId, key1, key2, key3) \ + GetSysCacheHashValue(cacheId, key1, key2, key3, 0) +#define GetSysCacheHashValue4(cacheId, key1, key2, key3, key4) \ + GetSysCacheHashValue(cacheId, key1, key2, key3, key4) + #define SearchSysCacheList1(cacheId, key1) \ SearchSysCacheList(cacheId, 1, key1, 0, 0, 0) #define SearchSysCacheList2(cacheId, key1, key2) \ From f9325df0fc6dcf3c1a2540b944c00b7316754146 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Wed, 7 Mar 2012 23:35:03 +0200 Subject: [PATCH 095/129] libpq: Fix memory leak If a client encoding is specified as a connection parameter (or environment variable), internal storage allocated for it would never be freed. --- src/interfaces/libpq/fe-connect.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index 27a9805625bf9..eece99ab1d6e5 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -2749,6 +2749,8 @@ freePGconn(PGconn *conn) free(conn->events[i].name); } + if (conn->client_encoding_initial) + free(conn->client_encoding_initial); if (conn->events) free(conn->events); if (conn->pghost) From 561ec761332b5608a894210e00d4fee1b5c6522a Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Wed, 7 Mar 2012 23:46:41 +0200 Subject: [PATCH 096/129] psql: Fix invalid memory access Due to an apparent thinko, when printing a table in expanded mode (\x), space would be allocated for 1 slot plus 1 byte per line, instead of 1 slot per line plus 1 slot for the NULL terminator. 
When the line count is small, reading or writing the terminator would therefore access memory beyond what was allocated. --- src/bin/psql/print.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bin/psql/print.c b/src/bin/psql/print.c index dec440c264e54..594a63acfd240 100644 --- a/src/bin/psql/print.c +++ b/src/bin/psql/print.c @@ -1210,8 +1210,8 @@ print_aligned_vertical(const printTableContent *cont, FILE *fout) * We now have all the information we need to setup the formatting * structures */ - dlineptr = pg_local_malloc((sizeof(*dlineptr) + 1) * dheight); - hlineptr = pg_local_malloc((sizeof(*hlineptr) + 1) * hheight); + dlineptr = pg_local_malloc((sizeof(*dlineptr)) * (dheight + 1)); + hlineptr = pg_local_malloc((sizeof(*hlineptr)) * (hheight + 1)); dlineptr->ptr = pg_local_malloc(dformatsize); hlineptr->ptr = pg_local_malloc(hformatsize); From 1673122127f9f17f4ca6b214027c0806b0d502f4 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Wed, 7 Mar 2012 23:52:15 +0200 Subject: [PATCH 097/129] psql: Fix memory leak In expanded auto mode, a lot of allocated memory was not cleaned up. found by Coverity --- src/bin/psql/print.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/bin/psql/print.c b/src/bin/psql/print.c index 594a63acfd240..f7b5e808894c1 100644 --- a/src/bin/psql/print.c +++ b/src/bin/psql/print.c @@ -735,7 +735,7 @@ print_aligned_text(const printTableContent *cont, FILE *fout) (output_columns < total_header_width || output_columns < width_total)) { print_aligned_vertical(cont, fout); - return; + goto cleanup; } /* If we wrapped beyond the display width, use the pager */ @@ -1050,6 +1050,7 @@ print_aligned_text(const printTableContent *cont, FILE *fout) fputc('\n', fout); } +cleanup: /* clean up */ for (i = 0; i < col_count; i++) { From cf7026b64b3e56889f8a81194a57221500e23a0f Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Wed, 7 Mar 2012 16:56:42 -0500 Subject: [PATCH 098/129] psql: Avoid some spurious output if the server croaks. Fixes a regression in commit 08146775acd8bfe0fcc509c71857abb928697171. Noah Misch --- src/bin/psql/common.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/bin/psql/common.c b/src/bin/psql/common.c index 5c9bd96002244..715e23167de82 100644 --- a/src/bin/psql/common.c +++ b/src/bin/psql/common.c @@ -740,7 +740,7 @@ ProcessResult(PGresult **results) } while (next_result); /* may need this to recover from conn loss during COPY */ - if (!CheckConnection()) + if (!first_cycle && !CheckConnection()) return false; return success; @@ -1015,8 +1015,10 @@ SendQuery(const char *query) case PQTRANS_UNKNOWN: default: OK = false; - psql_error("unexpected transaction status (%d)\n", - transaction_status); + /* PQTRANS_UNKNOWN is expected given a broken connection. */ + if (transaction_status != PQTRANS_UNKNOWN || ConnectionUp()) + psql_error("unexpected transaction status (%d)\n", + transaction_status); break; } From 9088d1b96504717fd589ff7eeacc96b6d1c08ead Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 7 Mar 2012 18:20:58 -0500 Subject: [PATCH 099/129] Add GetForeignColumnOptions() to foreign.c, and add some documentation. GetForeignColumnOptions provides some abstraction for accessing column-specific FDW options, on a par with the access functions that were already provided here for other FDW-related information. Adjust file_fdw.c to use GetForeignColumnOptions instead of equivalent hand-rolled code. 
In addition, add some SGML documentation for the functions exported by foreign.c that are meant for use by FDW authors. (This is the fdw_helper portion of the proposed pgsql_fdw patch.) Hanada Shigeru, reviewed by KaiGai Kohei --- contrib/file_fdw/file_fdw.c | 49 ++++------------ doc/src/sgml/fdwhandler.sgml | 105 ++++++++++++++++++++++++++++++++++ src/backend/foreign/foreign.c | 37 +++++++++++- src/include/foreign/foreign.h | 2 + 4 files changed, 154 insertions(+), 39 deletions(-) diff --git a/contrib/file_fdw/file_fdw.c b/contrib/file_fdw/file_fdw.c index c2faa6235e766..29f203c6f10ee 100644 --- a/contrib/file_fdw/file_fdw.c +++ b/contrib/file_fdw/file_fdw.c @@ -27,7 +27,6 @@ #include "optimizer/cost.h" #include "optimizer/pathnode.h" #include "utils/rel.h" -#include "utils/syscache.h" PG_MODULE_MAGIC; @@ -346,54 +345,30 @@ get_file_fdw_attribute_options(Oid relid) /* Retrieve FDW options for all user-defined attributes. */ for (attnum = 1; attnum <= natts; attnum++) { - HeapTuple tuple; - Form_pg_attribute attr; - Datum datum; - bool isnull; + Form_pg_attribute attr = tupleDesc->attrs[attnum - 1]; + List *options; + ListCell *lc; /* Skip dropped attributes. */ - if (tupleDesc->attrs[attnum - 1]->attisdropped) + if (attr->attisdropped) continue; - /* - * We need the whole pg_attribute tuple not just what is in the - * tupleDesc, so must do a catalog lookup. - */ - tuple = SearchSysCache2(ATTNUM, - RelationGetRelid(rel), - Int16GetDatum(attnum)); - if (!HeapTupleIsValid(tuple)) - elog(ERROR, "cache lookup failed for attribute %d of relation %u", - attnum, RelationGetRelid(rel)); - attr = (Form_pg_attribute) GETSTRUCT(tuple); - - datum = SysCacheGetAttr(ATTNUM, - tuple, - Anum_pg_attribute_attfdwoptions, - &isnull); - if (!isnull) + options = GetForeignColumnOptions(relid, attnum); + foreach(lc, options) { - List *options = untransformRelOptions(datum); - ListCell *lc; + DefElem *def = (DefElem *) lfirst(lc); - foreach(lc, options) + if (strcmp(def->defname, "force_not_null") == 0) { - DefElem *def = (DefElem *) lfirst(lc); - - if (strcmp(def->defname, "force_not_null") == 0) + if (defGetBoolean(def)) { - if (defGetBoolean(def)) - { - char *attname = pstrdup(NameStr(attr->attname)); + char *attname = pstrdup(NameStr(attr->attname)); - fnncolumns = lappend(fnncolumns, makeString(attname)); - } + fnncolumns = lappend(fnncolumns, makeString(attname)); } - /* maybe in future handle other options here */ } + /* maybe in future handle other options here */ } - - ReleaseSysCache(tuple); } heap_close(rel, AccessShareLock); diff --git a/doc/src/sgml/fdwhandler.sgml b/doc/src/sgml/fdwhandler.sgml index 12c5f75bfab4b..dbfcbbc2b36dd 100644 --- a/doc/src/sgml/fdwhandler.sgml +++ b/doc/src/sgml/fdwhandler.sgml @@ -244,4 +244,109 @@ EndForeignScan (ForeignScanState *node); + + Foreign Data Wrapper Helper Functions + + + Several helper functions are exported from the core server so that + authors of foreign data wrappers can get easy access to attributes of + FDW-related objects, such as FDW options. + To use any of these functions, you need to include the header file + foreign/foreign.h in your source file. + That header also defines the struct types that are returned by + these functions. + + + + +ForeignDataWrapper * +GetForeignDataWrapper(Oid fdwid); + + + This function returns a ForeignDataWrapper + object for the foreign-data wrapper with the given OID. A + ForeignDataWrapper object contains properties + of the FDW (see foreign/foreign.h for details). 
+ + + +ForeignServer * +GetForeignServer(Oid serverid); + + This function returns a ForeignServer object + for the foreign server with the given OID. A + ForeignServer object contains properties + of the server (see foreign/foreign.h for details). + + + +UserMapping * +GetUserMapping(Oid userid, Oid serverid); + + This function returns a UserMapping object for + the user mapping of the given role on the given server. (If there is no + mapping for the specific user, it will return the mapping for + PUBLIC, or throw error if there is none.) A + UserMapping object contains properties of the + user mapping (see foreign/foreign.h for details). + + + +ForeignTable * +GetForeignTable(Oid relid); + + This function returns a ForeignTable object for + the foreign table with the given OID. A + ForeignTable object contains properties of the + foreign table (see foreign/foreign.h for details). + + + +List * +GetForeignColumnOptions(Oid relid, AttrNumber attnum); + + This function returns the per-column FDW options for the column with the + given foreign table OID and attribute number, in the form of a list of + DefElem. NIL is returned if the column has no + options. + + + Some object types have name-based lookup functions in addition to the + OID-based ones: + + + + +ForeignDataWrapper * +GetForeignDataWrapperByName(const char *name, bool missing_ok); + + + This function returns a ForeignDataWrapper + object for the foreign-data wrapper with the given name. If the wrapper + is not found, return NULL if missing_ok is true, otherwise raise an + error. + + + + +ForeignServer * +GetForeignServerByName(const char *name, bool missing_ok); + + + This function returns a ForeignServer object + for the foreign server with the given name. If the server is not found, + return NULL if missing_ok is true, otherwise raise an error. + + + + diff --git a/src/backend/foreign/foreign.c b/src/backend/foreign/foreign.c index c4c2a61d5dcb9..f27b55a66e4b7 100644 --- a/src/backend/foreign/foreign.c +++ b/src/backend/foreign/foreign.c @@ -28,7 +28,6 @@ extern Datum pg_options_to_table(PG_FUNCTION_ARGS); extern Datum postgresql_fdw_validator(PG_FUNCTION_ARGS); - /* * GetForeignDataWrapper - look up the foreign-data wrapper by OID. */ @@ -71,7 +70,6 @@ GetForeignDataWrapper(Oid fdwid) } - /* * GetForeignDataWrapperByName - look up the foreign-data wrapper * definition by name. @@ -247,6 +245,39 @@ GetForeignTable(Oid relid) } + +/* + * GetForeignColumnOptions - Get attfdwoptions of given relation/attnum + * as list of DefElem. + */ +List * +GetForeignColumnOptions(Oid relid, AttrNumber attnum) +{ + List *options; + HeapTuple tp; + Datum datum; + bool isnull; + + tp = SearchSysCache2(ATTNUM, + ObjectIdGetDatum(relid), + Int16GetDatum(attnum)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for attribute %d of relation %u", + attnum, relid); + datum = SysCacheGetAttr(ATTNUM, + tp, + Anum_pg_attribute_attfdwoptions, + &isnull); + if (isnull) + options = NIL; + else + options = untransformRelOptions(datum); + + ReleaseSysCache(tp); + + return options; +} + + /* * GetFdwRoutine - call the specified foreign-data wrapper handler routine * to get its FdwRoutine struct.
@@ -498,6 +529,7 @@ postgresql_fdw_validator(PG_FUNCTION_ARGS) PG_RETURN_BOOL(true); } + /* * get_foreign_data_wrapper_oid - given a FDW name, look up the OID * @@ -518,6 +550,7 @@ get_foreign_data_wrapper_oid(const char *fdwname, bool missing_ok) return oid; } + /* * get_foreign_server_oid - given a FDW name, look up the OID * diff --git a/src/include/foreign/foreign.h b/src/include/foreign/foreign.h index 191122d081569..f8aa99e2a45e4 100644 --- a/src/include/foreign/foreign.h +++ b/src/include/foreign/foreign.h @@ -76,6 +76,8 @@ extern ForeignDataWrapper *GetForeignDataWrapperByName(const char *name, bool missing_ok); extern ForeignTable *GetForeignTable(Oid relid); +extern List *GetForeignColumnOptions(Oid relid, AttrNumber attnum); + extern Oid get_foreign_data_wrapper_oid(const char *fdwname, bool missing_ok); extern Oid get_foreign_server_oid(const char *servername, bool missing_ok); From 1ed7f0e6b90a9b693895105a90d8b5b0eefbcd56 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 7 Mar 2012 19:25:59 -0500 Subject: [PATCH 100/129] Fix indentation of \d footers for non-ASCII cases. Multi-line "Inherits:" and "Child tables:" footers were misindented when those strings' translations involved multibyte characters, because we were using strlen() instead of an appropriate display width measurement. In passing, avoid doing gettext() more than once per loop in these places. While at it, fix pg_wcswidth(), which has been entirely broken since about 8.2, but fortunately has been unused for the same length of time. Report and patch by Sergey Burladyan (bug #6480) --- src/bin/psql/describe.c | 32 +++++++++++++++++++------------- src/bin/psql/mbprint.c | 16 +++++++++------- src/bin/psql/mbprint.h | 2 +- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 4eee4be96e0ee..75709afedece9 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -2156,22 +2156,28 @@ describeOneTableDetails(const char *schemaname, if (!result) goto error_return; else - tuples = PQntuples(result); - - for (i = 0; i < tuples; i++) { const char *s = _("Inherits"); + int sw = pg_wcswidth(s, strlen(s), pset.encoding); - if (i == 0) - printfPQExpBuffer(&buf, "%s: %s", s, PQgetvalue(result, i, 0)); - else - printfPQExpBuffer(&buf, "%*s %s", (int) strlen(s), "", PQgetvalue(result, i, 0)); - if (i < tuples - 1) - appendPQExpBuffer(&buf, ","); + tuples = PQntuples(result); - printTableAddFooter(&cont, buf.data); + for (i = 0; i < tuples; i++) + { + if (i == 0) + printfPQExpBuffer(&buf, "%s: %s", + s, PQgetvalue(result, i, 0)); + else + printfPQExpBuffer(&buf, "%*s %s", + sw, "", PQgetvalue(result, i, 0)); + if (i < tuples - 1) + appendPQExpBuffer(&buf, ","); + + printTableAddFooter(&cont, buf.data); + } + + PQclear(result); } - PQclear(result); /* print child tables */ if (pset.sversion >= 80300) @@ -2198,6 +2204,7 @@ describeOneTableDetails(const char *schemaname, { /* display the list of child tables */ const char *ct = _("Child tables"); + int ctw = pg_wcswidth(ct, strlen(ct), pset.encoding); for (i = 0; i < tuples; i++) { @@ -2206,8 +2213,7 @@ describeOneTableDetails(const char *schemaname, ct, PQgetvalue(result, i, 0)); else printfPQExpBuffer(&buf, "%*s %s", - (int) strlen(ct), "", - PQgetvalue(result, i, 0)); + ctw, "", PQgetvalue(result, i, 0)); if (i < tuples - 1) appendPQExpBuffer(&buf, ","); diff --git a/src/bin/psql/mbprint.c b/src/bin/psql/mbprint.c index 32fc756efe5d1..d43973e96a005 100644 --- a/src/bin/psql/mbprint.c +++ 
b/src/bin/psql/mbprint.c @@ -168,11 +168,12 @@ mb_utf_validate(unsigned char *pwcs) */ /* - * pg_wcswidth is the dumb width function. It assumes that everything will - * only appear on one line. OTOH it is easier to use if this applies to you. + * pg_wcswidth is the dumb display-width function. + * It assumes that everything will appear on one line. + * OTOH it is easier to use than pg_wcssize if this applies to you. */ int -pg_wcswidth(const unsigned char *pwcs, size_t len, int encoding) +pg_wcswidth(const char *pwcs, size_t len, int encoding) { int width = 0; @@ -181,15 +182,16 @@ pg_wcswidth(const unsigned char *pwcs, size_t len, int encoding) int chlen, chwidth; - chlen = PQmblen((const char *) pwcs, encoding); - if (chlen > len) + chlen = PQmblen(pwcs, encoding); + if (len < (size_t) chlen) break; /* Invalid string */ - chwidth = PQdsplen((const char *) pwcs, encoding); - + chwidth = PQdsplen(pwcs, encoding); if (chwidth > 0) width += chwidth; + pwcs += chlen; + len -= chlen; } return width; } diff --git a/src/bin/psql/mbprint.h b/src/bin/psql/mbprint.h index 83050ffcd7ee7..01064d3100748 100644 --- a/src/bin/psql/mbprint.h +++ b/src/bin/psql/mbprint.h @@ -10,7 +10,7 @@ struct lineptr }; extern unsigned char *mbvalidate(unsigned char *pwcs, int encoding); -extern int pg_wcswidth(const unsigned char *pwcs, size_t len, int encoding); +extern int pg_wcswidth(const char *pwcs, size_t len, int encoding); extern void pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding, struct lineptr * lines, int count); extern void pg_wcssize(const unsigned char *pwcs, size_t len, int encoding, int *width, int *height, int *format_size); From 66a7e6bae98592d1d98d9ef589753f0e953c5828 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 7 Mar 2012 22:59:49 -0500 Subject: [PATCH 101/129] Improve estimation of IN/NOT IN by assuming array elements are distinct. In constructs such as "x IN (1,2,3,4)" and "x <> ALL(ARRAY[1,2,3,4])", we formerly always used a general-purpose assumption that the probability of success is independent for each comparison of "x" to an array element. But in real-world usage of these constructs, that's a pretty poor assumption; it's much saner to assume that the array elements are distinct and so the match probabilities are disjoint. Apply that assumption if the operator appears to behave as equality (for ANY) or inequality (for ALL). But fall back to the normal independent-probabilities calculation if this yields an impossible result, ie probability > 1 or < 0. We could protect ourselves against bad estimates even more by explicitly checking for equal array elements, but that is expensive and doesn't seem worthwhile: doing it would amount to optimizing for poorly-written queries at the expense of well-written ones. 
Daniele Varrazzo and Tom Lane, after a suggestion by Ants Aasma --- src/backend/utils/adt/selfuncs.c | 74 ++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 382cd7372ba05..7662b31729bb9 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -1712,6 +1712,7 @@ scalararraysel(PlannerInfo *root, RegProcedure oprsel; FmgrInfo oprselproc; Selectivity s1; + Selectivity s1disjoint; /* First, deconstruct the expression */ Assert(list_length(clause->args) == 2); @@ -1768,6 +1769,19 @@ scalararraysel(PlannerInfo *root, return (Selectivity) 0.5; fmgr_info(oprsel, &oprselproc); + /* + * In the array-containment check above, we must only believe that an + * operator is equality or inequality if it is the default btree equality + * operator (or its negator) for the element type, since those are the + * operators that array containment will use. But in what follows, we can + * be a little laxer, and also believe that any operators using eqsel() or + * neqsel() as selectivity estimator act like equality or inequality. + */ + if (oprsel == F_EQSEL || oprsel == F_EQJOINSEL) + isEquality = true; + else if (oprsel == F_NEQSEL || oprsel == F_NEQJOINSEL) + isInequality = true; + /* * We consider three cases: * @@ -1802,7 +1816,23 @@ scalararraysel(PlannerInfo *root, ARR_ELEMTYPE(arrayval), elmlen, elmbyval, elmalign, &elem_values, &elem_nulls, &num_elems); - s1 = useOr ? 0.0 : 1.0; + + /* + * For generic operators, we assume the probability of success is + * independent for each array element. But for "= ANY" or "<> ALL", + * if the array elements are distinct (which'd typically be the case) + * then the probabilities are disjoint, and we should just sum them. + * + * If we were being really tense we would try to confirm that the + * elements are all distinct, but that would be expensive and it + * doesn't seem to be worth the cycles; it would amount to penalizing + * well-written queries in favor of poorly-written ones. However, we + * do protect ourselves a little bit by checking whether the + * disjointness assumption leads to an impossible (out of range) + * probability; if so, we fall back to the normal calculation. + */ + s1 = s1disjoint = (useOr ? 0.0 : 1.0); + for (i = 0; i < num_elems; i++) { List *args; @@ -1829,11 +1859,25 @@ scalararraysel(PlannerInfo *root, ObjectIdGetDatum(operator), PointerGetDatum(args), Int32GetDatum(varRelid))); + if (useOr) + { s1 = s1 + s2 - s1 * s2; + if (isEquality) + s1disjoint += s2; + } else + { s1 = s1 * s2; + if (isInequality) + s1disjoint += s2 - 1.0; + } } + + /* accept disjoint-probability estimate if in range */ + if ((useOr ? isEquality : isInequality) && + s1disjoint >= 0.0 && s1disjoint <= 1.0) + s1 = s1disjoint; } else if (rightop && IsA(rightop, ArrayExpr) && !((ArrayExpr *) rightop)->multidims) @@ -1845,7 +1889,16 @@ scalararraysel(PlannerInfo *root, get_typlenbyval(arrayexpr->element_typeid, &elmlen, &elmbyval); - s1 = useOr ? 0.0 : 1.0; + + /* + * We use the assumption of disjoint probabilities here too, although + * the odds of equal array elements are rather higher if the elements + * are not all constants (which they won't be, else constant folding + * would have reduced the ArrayExpr to a Const). In this path it's + * critical to have the sanity check on the s1disjoint estimate. + */ + s1 = s1disjoint = (useOr ? 
0.0 : 1.0); + foreach(l, arrayexpr->elements) { Node *elem = (Node *) lfirst(l); @@ -1871,11 +1924,25 @@ scalararraysel(PlannerInfo *root, ObjectIdGetDatum(operator), PointerGetDatum(args), Int32GetDatum(varRelid))); + if (useOr) + { s1 = s1 + s2 - s1 * s2; + if (isEquality) + s1disjoint += s2; + } else + { s1 = s1 * s2; + if (isInequality) + s1disjoint += s2 - 1.0; + } } + + /* accept disjoint-probability estimate if in range */ + if ((useOr ? isEquality : isInequality) && + s1disjoint >= 0.0 && s1disjoint <= 1.0) + s1 = s1disjoint; } else { @@ -1911,7 +1978,8 @@ scalararraysel(PlannerInfo *root, /* * Arbitrarily assume 10 elements in the eventual array value (see - * also estimate_array_length) + * also estimate_array_length). We don't risk an assumption of + * disjoint probabilities here. */ for (i = 0; i < 10; i++) { From d93f209f483f006534ae543667a1254b6fdec183 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 8 Mar 2012 11:10:02 +0200 Subject: [PATCH 102/129] Silence warning about unused variable, when building without assertions. --- src/backend/access/transam/xlog.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8bbca613f2674..18fc23286aa30 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -1117,7 +1117,6 @@ begin:; */ if (isLogSwitch) { - XLogCtlWrite *Write = &XLogCtl->Write; XLogwrtRqst FlushRqst; XLogRecPtr OldSegEnd; @@ -1140,7 +1139,7 @@ begin:; /* There should be no unwritten data */ curridx = Insert->curridx; - Assert(curridx == Write->curridx); + Assert(curridx == XLogCtl->Write.curridx); /* Compute end address of old segment */ OldSegEnd = XLogCtl->xlblocks[curridx]; From 8dd4d10d481ec1da568ab424e62a3e50c069baaf Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 8 Mar 2012 22:21:12 +0200 Subject: [PATCH 103/129] ecpg: Fix rare memory leaks found by Coverity --- src/interfaces/ecpg/ecpglib/connect.c | 4 ++++ src/interfaces/ecpg/ecpglib/execute.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/src/interfaces/ecpg/ecpglib/connect.c b/src/interfaces/ecpg/ecpglib/connect.c index cf625f96a34c6..b54b1f5503626 100644 --- a/src/interfaces/ecpg/ecpglib/connect.c +++ b/src/interfaces/ecpg/ecpglib/connect.c @@ -519,6 +519,10 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p ecpg_free(realname); if (dbname) ecpg_free(dbname); + if (conn_keywords) + ecpg_free(conn_keywords); + if (conn_values) + ecpg_free(conn_values); free(this); return false; } diff --git a/src/interfaces/ecpg/ecpglib/execute.c b/src/interfaces/ecpg/ecpglib/execute.c index 311bc5cbc50a7..50a2d95347edc 100644 --- a/src/interfaces/ecpg/ecpglib/execute.c +++ b/src/interfaces/ecpg/ecpglib/execute.c @@ -1776,6 +1776,7 @@ ECPGdo(const int lineno, const int compat, const int force_indicator, const char { setlocale(LC_NUMERIC, oldlocale); ecpg_free(oldlocale); + free_statement(stmt); va_end(args); return (false); } @@ -1807,6 +1808,7 @@ ECPGdo(const int lineno, const int compat, const int force_indicator, const char ecpg_raise(lineno, ECPG_INVALID_STMT, ECPG_SQLSTATE_INVALID_SQL_STATEMENT_NAME, stmt->command); setlocale(LC_NUMERIC, oldlocale); ecpg_free(oldlocale); + free_statement(stmt); va_end(args); return (false); } From c5e073ca2deec1595e4fdd748f14fcb4122e8faf Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 8 Mar 2012 22:29:01 +0200 Subject: [PATCH 104/129] ecpg: Fix off-by-one error in memory copying In a rare case, 
one byte past the end of memory belonging to the sqlca_t structure would be written to. found by Coverity --- src/interfaces/ecpg/ecpglib/misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/interfaces/ecpg/ecpglib/misc.c b/src/interfaces/ecpg/ecpglib/misc.c index f24478271cd61..a4c283a274d39 100644 --- a/src/interfaces/ecpg/ecpglib/misc.c +++ b/src/interfaces/ecpg/ecpglib/misc.c @@ -525,7 +525,7 @@ ECPGset_var(int number, void *pointer, int lineno) struct sqlca_t *sqlca = ECPGget_sqlca(); sqlca->sqlcode = ECPG_OUT_OF_MEMORY; - strncpy(sqlca->sqlstate, "YE001", sizeof("YE001")); + strncpy(sqlca->sqlstate, "YE001", sizeof(sqlca->sqlstate)); snprintf(sqlca->sqlerrm.sqlerrmc, sizeof(sqlca->sqlerrm.sqlerrmc), "out of memory on line %d", lineno); sqlca->sqlerrm.sqlerrml = strlen(sqlca->sqlerrm.sqlerrmc); /* free all memory we have allocated for the user */ From 08dd23cec7d6b5493c83848d7568495815eda5c6 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 8 Mar 2012 15:52:26 -0500 Subject: [PATCH 105/129] Fix some issues with temp/transient tables in extension scripts. Phil Sorber reported that a rewriting ALTER TABLE within an extension update script failed, because it creates and then drops a placeholder table; the drop was being disallowed because the table was marked as an extension member. We could hack that specific case but it seems likely that there might be related cases now or in the future, so the most practical solution seems to be to create an exception to the general rule that extension member objects can only be dropped by dropping the owning extension. To wit: if the DROP is issued within the extension's own creation or update scripts, we'll allow it, implicitly performing an "ALTER EXTENSION DROP object" first. This will simplify cases such as extension downgrade scripts anyway. No docs change since we don't seem to have documented the idea that you would need ALTER EXTENSION DROP for such an action to begin with. Also, arrange for explicitly temporary tables to not get linked as extension members in the first place, and the same for the magic pg_temp_nnn schemas that are created to hold them. This prevents assorted unpleasant results if an extension script creates a temp table: the forced drop at session end would either fail or remove the entire extension, and neither of those outcomes is desirable. Note that this doesn't fix the ALTER TABLE scenario, since the placeholder table is not temp (unless the table being rewritten is). Back-patch to 9.1. --- src/backend/catalog/dependency.c | 31 ++++++++++++++++++++++++------ src/backend/catalog/heap.c | 9 ++++++++- src/backend/catalog/namespace.c | 6 ++++-- src/backend/catalog/pg_namespace.c | 15 ++++++++++++--- src/backend/commands/schemacmds.c | 2 +- src/include/catalog/pg_namespace.h | 2 +- 6 files changed, 51 insertions(+), 14 deletions(-) diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index db86262b4f06e..eca064f0cdef7 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -560,17 +560,21 @@ findDependentObjects(const ObjectAddress *object, * another object, or is part of the extension that is the * other object. We have three cases: * - * 1. At the outermost recursion level, disallow the DROP. (We - * just ereport here, rather than proceeding, since no other - * dependencies are likely to be interesting.) 
However, if - * the owning object is listed in pendingObjects, just release - * the caller's lock and return; we'll eventually complete the - * DROP when we reach that entry in the pending list. + * 1. At the outermost recursion level, we normally disallow + * the DROP. (We just ereport here, rather than proceeding, + * since no other dependencies are likely to be interesting.) + * However, there are exceptions. */ if (stack == NULL) { char *otherObjDesc; + /* + * Exception 1a: if the owning object is listed in + * pendingObjects, just release the caller's lock and + * return. We'll eventually complete the DROP when we + * reach that entry in the pending list. + */ if (pendingObjects && object_address_present(&otherObject, pendingObjects)) { @@ -579,6 +583,21 @@ findDependentObjects(const ObjectAddress *object, ReleaseDeletionLock(object); return; } + + /* + * Exception 1b: if the owning object is the extension + * currently being created/altered, it's okay to continue + * with the deletion. This allows dropping of an + * extension's objects within the extension's scripts, + * as well as corner cases such as dropping a transient + * object created within such a script. + */ + if (creating_extension && + otherObject.classId == ExtensionRelationId && + otherObject.objectId == CurrentExtensionObject) + break; + + /* No exception applies, so throw the error */ otherObjDesc = getObjectDescription(&otherObject); ereport(ERROR, (errcode(ERRCODE_DEPENDENT_OBJECTS_STILL_EXIST), diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index a8653cd49562d..d1d458d7fa485 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -957,10 +957,12 @@ AddNewRelationType(const char *typeName, * reltablespace: OID of tablespace it goes in * relid: OID to assign to new rel, or InvalidOid to select a new OID * reltypeid: OID to assign to rel's rowtype, or InvalidOid to select one + * reloftypeid: if a typed table, OID of underlying type; else InvalidOid * ownerid: OID of new rel's owner * tupdesc: tuple descriptor (source of column definitions) * cooked_constraints: list of precooked check constraints and defaults * relkind: relkind for new rel + * relpersistence: rel's persistence status (permanent, temp, or unlogged) * shared_relation: TRUE if it's to be a shared relation * mapped_relation: TRUE if the relation will use the relfilenode map * oidislocal: TRUE if oid column (if any) should be marked attislocal @@ -1235,6 +1237,10 @@ heap_create_with_catalog(const char *relname, * should they have any ACL entries. The same applies for extension * dependencies. * + * If it's a temp table, we do not make it an extension member; this + * prevents the unintuitive result that deletion of the temp table at + * session end would make the whole extension go away. + * * Also, skip this in bootstrap mode, since we don't make dependencies * while bootstrapping. */ @@ -1255,7 +1261,8 @@ heap_create_with_catalog(const char *relname, recordDependencyOnOwner(RelationRelationId, relid, ownerid); - recordDependencyOnCurrentExtension(&myself, false); + if (relpersistence != RELPERSISTENCE_TEMP) + recordDependencyOnCurrentExtension(&myself, false); if (reloftypeid) { diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c index 80d6fc7d0c2c7..dc8f8eaf3f3f6 100644 --- a/src/backend/catalog/namespace.c +++ b/src/backend/catalog/namespace.c @@ -3558,7 +3558,8 @@ InitTempTableNamespace(void) * temp tables. 
This works because the places that access the temp * namespace for my own backend skip permissions checks on it. */ - namespaceId = NamespaceCreate(namespaceName, BOOTSTRAP_SUPERUSERID); + namespaceId = NamespaceCreate(namespaceName, BOOTSTRAP_SUPERUSERID, + true); /* Advance command counter to make namespace visible */ CommandCounterIncrement(); } @@ -3582,7 +3583,8 @@ InitTempTableNamespace(void) toastspaceId = get_namespace_oid(namespaceName, true); if (!OidIsValid(toastspaceId)) { - toastspaceId = NamespaceCreate(namespaceName, BOOTSTRAP_SUPERUSERID); + toastspaceId = NamespaceCreate(namespaceName, BOOTSTRAP_SUPERUSERID, + true); /* Advance command counter to make namespace visible */ CommandCounterIncrement(); } diff --git a/src/backend/catalog/pg_namespace.c b/src/backend/catalog/pg_namespace.c index d5ab48a59dc76..be812a246c09d 100644 --- a/src/backend/catalog/pg_namespace.c +++ b/src/backend/catalog/pg_namespace.c @@ -26,10 +26,18 @@ /* ---------------- * NamespaceCreate + * + * Create a namespace (schema) with the given name and owner OID. + * + * If isTemp is true, this schema is a per-backend schema for holding + * temporary tables. Currently, the only effect of that is to prevent it + * from being linked as a member of any active extension. (If someone + * does CREATE TEMP TABLE in an extension script, we don't want the temp + * schema to become part of the extension.) * --------------- */ Oid -NamespaceCreate(const char *nspName, Oid ownerId) +NamespaceCreate(const char *nspName, Oid ownerId, bool isTemp) { Relation nspdesc; HeapTuple tup; @@ -82,8 +90,9 @@ NamespaceCreate(const char *nspName, Oid ownerId) /* dependency on owner */ recordDependencyOnOwner(NamespaceRelationId, nspoid, ownerId); - /* dependency on extension */ - recordDependencyOnCurrentExtension(&myself, false); + /* dependency on extension ... but not for magic temp schemas */ + if (!isTemp) + recordDependencyOnCurrentExtension(&myself, false); /* Post creation hook for new schema */ InvokeObjectAccessHook(OAT_POST_CREATE, NamespaceRelationId, nspoid, 0); diff --git a/src/backend/commands/schemacmds.c b/src/backend/commands/schemacmds.c index 916328529a071..6745af501d468 100644 --- a/src/backend/commands/schemacmds.c +++ b/src/backend/commands/schemacmds.c @@ -94,7 +94,7 @@ CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString) save_sec_context | SECURITY_LOCAL_USERID_CHANGE); /* Create the schema's namespace */ - namespaceId = NamespaceCreate(schemaName, owner_uid); + namespaceId = NamespaceCreate(schemaName, owner_uid, false); /* Advance cmd counter to make the namespace visible */ CommandCounterIncrement(); diff --git a/src/include/catalog/pg_namespace.h b/src/include/catalog/pg_namespace.h index aad76a1452aae..1daba477b409a 100644 --- a/src/include/catalog/pg_namespace.h +++ b/src/include/catalog/pg_namespace.h @@ -79,6 +79,6 @@ DESCR("standard public schema"); /* * prototypes for functions in pg_namespace.c */ -extern Oid NamespaceCreate(const char *nspName, Oid ownerId); +extern Oid NamespaceCreate(const char *nspName, Oid ownerId, bool isTemp); #endif /* PG_NAMESPACE_H */ From 410ee35ed095d85bf49c60ab7fd096ddbb5cb0a1 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 8 Mar 2012 23:13:51 +0200 Subject: [PATCH 106/129] psql: Remove useless code Apparently a copy-and-paste mistake introduced in 8ddd22f2456af0155f9c183894f481203e86b76e. 
found by Coverity --- src/bin/psql/print.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bin/psql/print.c b/src/bin/psql/print.c index f7b5e808894c1..72bac51ecb0ff 100644 --- a/src/bin/psql/print.c +++ b/src/bin/psql/print.c @@ -207,7 +207,7 @@ format_numeric_locale(const char *my_str) leading_digits; int groupdigits = atoi(grouping); int new_str_start = 0; - char *new_str = new_str = pg_local_malloc( + char *new_str = pg_local_malloc( strlen_with_numeric_locale(my_str) + 1); leading_digits = (int_len % groupdigits != 0) ? From 342baf4ce61f06ad3898490dc5125579d9e6bd18 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 9 Mar 2012 08:07:56 +0200 Subject: [PATCH 107/129] Update outdated comment. HeapTupleHeader.t_natts field doesn't exist anymore. Kevin Grittner --- src/backend/access/common/heaptuple.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index 08d2b21c2318d..034dfe574f1b9 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -871,7 +871,8 @@ heap_modifytuple(HeapTuple tuple, * the inverse of heap_form_tuple. * * Storage for the values/isnull arrays is provided by the caller; - * it should be sized according to tupleDesc->natts not tuple->t_natts. + * it should be sized according to tupleDesc->natts not + * HeapTupleHeaderGetNatts(tuple->t_data). * * Note that for pass-by-reference datatypes, the pointer placed * in the Datum will point into the given tuple. @@ -978,7 +979,8 @@ heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc, * the inverse of heap_formtuple. * * Storage for the values/nulls arrays is provided by the caller; - * it should be sized according to tupleDesc->natts not tuple->t_natts. + * it should be sized according to tupleDesc->natts not + * HeapTupleHeaderGetNatts(tuple->t_data). * * Note that for pass-by-reference datatypes, the pointer placed * in the Datum will point into the given tuple. From b14953932dfdda7d915b9e276a09df8458efeec8 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 9 Mar 2012 12:48:48 -0500 Subject: [PATCH 108/129] Revise FDW planning API, again. Further reflection shows that a single callback isn't very workable if we desire to let FDWs generate multiple Paths, because that forces the FDW to do all work necessary to generate a valid Plan node for each Path. Instead split the former PlanForeignScan API into three steps: GetForeignRelSize, GetForeignPaths, GetForeignPlan. We had already bit the bullet of breaking the 9.1 FDW API for 9.2, so this shouldn't cause very much additional pain, and it's substantially more flexible for complex FDWs. Add an fdw_private field to RelOptInfo so that the new functions can save state there rather than possibly having to recalculate information two or three times. In addition, we'd not thought through what would be needed to allow an FDW to set up subexpressions of its choice for runtime execution. We could treat ForeignScan.fdw_private as an executable expression but that seems likely to break existing FDWs unnecessarily (in particular, it would restrict the set of node types allowable in fdw_private to those supported by expression_tree_walker). Instead, invent a separate field fdw_exprs which will receive the postprocessing appropriate for expression trees. 
(One field is enough since it can be a list of expressions; also, we assume the corresponding expression state tree(s) will be held within fdw_state, so we don't need to add anything to ForeignScanState.) Per review of Hanada Shigeru's pgsql_fdw patch. We may need to tweak this further as we continue to work on that patch, but to me it feels a lot closer to being right now. --- contrib/file_fdw/file_fdw.c | 161 +++++++++++++---- doc/src/sgml/fdwhandler.sgml | 230 ++++++++++++++++++++---- src/backend/nodes/copyfuncs.c | 3 +- src/backend/nodes/outfuncs.c | 4 +- src/backend/optimizer/path/allpaths.c | 13 +- src/backend/optimizer/path/costsize.c | 2 +- src/backend/optimizer/plan/createplan.c | 67 ++++--- src/backend/optimizer/plan/setrefs.c | 2 + src/backend/optimizer/plan/subselect.c | 2 + src/backend/optimizer/util/pathnode.c | 2 +- src/backend/optimizer/util/relnode.c | 4 + src/include/foreign/fdwapi.h | 21 ++- src/include/nodes/plannodes.h | 11 +- src/include/nodes/relation.h | 24 ++- src/include/optimizer/planmain.h | 2 + 15 files changed, 436 insertions(+), 112 deletions(-) diff --git a/contrib/file_fdw/file_fdw.c b/contrib/file_fdw/file_fdw.c index 29f203c6f10ee..e8907709bd90a 100644 --- a/contrib/file_fdw/file_fdw.c +++ b/contrib/file_fdw/file_fdw.c @@ -26,6 +26,8 @@ #include "nodes/makefuncs.h" #include "optimizer/cost.h" #include "optimizer/pathnode.h" +#include "optimizer/planmain.h" +#include "optimizer/restrictinfo.h" #include "utils/rel.h" PG_MODULE_MAGIC; @@ -48,7 +50,7 @@ struct FileFdwOption * Note: If you are adding new option for user mapping, you need to modify * fileGetOptions(), which currently doesn't bother to look at user mappings. */ -static struct FileFdwOption valid_options[] = { +static const struct FileFdwOption valid_options[] = { /* File options */ {"filename", ForeignTableRelationId}, @@ -71,6 +73,17 @@ static struct FileFdwOption valid_options[] = { {NULL, InvalidOid} }; +/* + * FDW-specific information for RelOptInfo.fdw_private. + */ +typedef struct FileFdwPlanState +{ + char *filename; /* file to read */ + List *options; /* merged COPY options, excluding filename */ + BlockNumber pages; /* estimate of file's physical size */ + double ntuples; /* estimate of number of rows in file */ +} FileFdwPlanState; + /* * FDW-specific information for ForeignScanState.fdw_state. 
*/ @@ -93,9 +106,18 @@ PG_FUNCTION_INFO_V1(file_fdw_validator); /* * FDW callback routines */ -static void filePlanForeignScan(Oid foreigntableid, - PlannerInfo *root, - RelOptInfo *baserel); +static void fileGetForeignRelSize(PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid); +static void fileGetForeignPaths(PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid); +static ForeignScan *fileGetForeignPlan(PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid, + ForeignPath *best_path, + List *tlist, + List *scan_clauses); static void fileExplainForeignScan(ForeignScanState *node, ExplainState *es); static void fileBeginForeignScan(ForeignScanState *node, int eflags); static TupleTableSlot *fileIterateForeignScan(ForeignScanState *node); @@ -109,8 +131,10 @@ static bool is_valid_option(const char *option, Oid context); static void fileGetOptions(Oid foreigntableid, char **filename, List **other_options); static List *get_file_fdw_attribute_options(Oid relid); +static void estimate_size(PlannerInfo *root, RelOptInfo *baserel, + FileFdwPlanState *fdw_private); static void estimate_costs(PlannerInfo *root, RelOptInfo *baserel, - const char *filename, + FileFdwPlanState *fdw_private, Cost *startup_cost, Cost *total_cost); @@ -123,7 +147,9 @@ file_fdw_handler(PG_FUNCTION_ARGS) { FdwRoutine *fdwroutine = makeNode(FdwRoutine); - fdwroutine->PlanForeignScan = filePlanForeignScan; + fdwroutine->GetForeignRelSize = fileGetForeignRelSize; + fdwroutine->GetForeignPaths = fileGetForeignPaths; + fdwroutine->GetForeignPlan = fileGetForeignPlan; fdwroutine->ExplainForeignScan = fileExplainForeignScan; fdwroutine->BeginForeignScan = fileBeginForeignScan; fdwroutine->IterateForeignScan = fileIterateForeignScan; @@ -177,7 +203,7 @@ file_fdw_validator(PG_FUNCTION_ARGS) if (!is_valid_option(def->defname, catalog)) { - struct FileFdwOption *opt; + const struct FileFdwOption *opt; StringInfoData buf; /* @@ -249,7 +275,7 @@ file_fdw_validator(PG_FUNCTION_ARGS) static bool is_valid_option(const char *option, Oid context) { - struct FileFdwOption *opt; + const struct FileFdwOption *opt; for (opt = valid_options; opt->optname; opt++) { @@ -381,7 +407,31 @@ get_file_fdw_attribute_options(Oid relid) } /* - * filePlanForeignScan + * fileGetForeignRelSize + * Obtain relation size estimates for a foreign table + */ +static void +fileGetForeignRelSize(PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid) +{ + FileFdwPlanState *fdw_private; + + /* + * Fetch options. We only need filename at this point, but we might + * as well get everything and not need to re-fetch it later in planning. + */ + fdw_private = (FileFdwPlanState *) palloc(sizeof(FileFdwPlanState)); + fileGetOptions(foreigntableid, + &fdw_private->filename, &fdw_private->options); + baserel->fdw_private = (void *) fdw_private; + + /* Estimate relation size */ + estimate_size(root, baserel, fdw_private); +} + +/* + * fileGetForeignPaths * Create possible access paths for a scan on the foreign table * * Currently we don't support any push-down feature, so there is only one @@ -389,20 +439,16 @@ get_file_fdw_attribute_options(Oid relid) * the data file. 
*/ static void -filePlanForeignScan(Oid foreigntableid, - PlannerInfo *root, - RelOptInfo *baserel) +fileGetForeignPaths(PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid) { - char *filename; - List *options; + FileFdwPlanState *fdw_private = (FileFdwPlanState *) baserel->fdw_private; Cost startup_cost; Cost total_cost; - /* Fetch options --- we only need filename at this point */ - fileGetOptions(foreigntableid, &filename, &options); - - /* Estimate costs and update baserel->rows */ - estimate_costs(root, baserel, filename, + /* Estimate costs */ + estimate_costs(root, baserel, fdw_private, &startup_cost, &total_cost); /* Create a ForeignPath node and add it as only possible path */ @@ -422,6 +468,37 @@ filePlanForeignScan(Oid foreigntableid, */ } +/* + * fileGetForeignPlan + * Create a ForeignScan plan node for scanning the foreign table + */ +static ForeignScan * +fileGetForeignPlan(PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid, + ForeignPath *best_path, + List *tlist, + List *scan_clauses) +{ + Index scan_relid = baserel->relid; + + /* + * We have no native ability to evaluate restriction clauses, so we just + * put all the scan_clauses into the plan node's qual list for the + * executor to check. So all we have to do here is strip RestrictInfo + * nodes from the clauses and ignore pseudoconstants (which will be + * handled elsewhere). + */ + scan_clauses = extract_actual_clauses(scan_clauses, false); + + /* Create the ForeignScan node */ + return make_foreignscan(tlist, + scan_clauses, + scan_relid, + NIL, /* no expressions to evaluate */ + NIL); /* no private state either */ +} + /* * fileExplainForeignScan * Produce extra output for EXPLAIN @@ -568,38 +645,38 @@ fileReScanForeignScan(ForeignScanState *node) } /* - * Estimate costs of scanning a foreign table. + * Estimate size of a foreign table. * - * In addition to setting *startup_cost and *total_cost, this should - * update baserel->rows. + * The main result is returned in baserel->rows. We also set + * fdw_private->pages and fdw_private->ntuples for later use in the cost + * calculation. */ static void -estimate_costs(PlannerInfo *root, RelOptInfo *baserel, - const char *filename, - Cost *startup_cost, Cost *total_cost) +estimate_size(PlannerInfo *root, RelOptInfo *baserel, + FileFdwPlanState *fdw_private) { struct stat stat_buf; BlockNumber pages; int tuple_width; double ntuples; double nrows; - Cost run_cost = 0; - Cost cpu_per_tuple; /* * Get size of the file. It might not be there at plan time, though, in * which case we have to use a default estimate. */ - if (stat(filename, &stat_buf) < 0) + if (stat(fdw_private->filename, &stat_buf) < 0) stat_buf.st_size = 10 * BLCKSZ; /* - * Convert size to pages for use in I/O cost estimate below. + * Convert size to pages for use in I/O cost estimate later. */ pages = (stat_buf.st_size + (BLCKSZ - 1)) / BLCKSZ; if (pages < 1) pages = 1; + fdw_private->pages = pages; + /* * Estimate the number of tuples in the file. We back into this estimate * using the planner's idea of the relation width; which is bogus if not @@ -611,6 +688,8 @@ estimate_costs(PlannerInfo *root, RelOptInfo *baserel, ntuples = clamp_row_est((double) stat_buf.st_size / (double) tuple_width); + fdw_private->ntuples = ntuples; + /* * Now estimate the number of rows returned by the scan after applying the * baserestrictinfo quals. 
This is pretty bogus too, since the planner @@ -627,12 +706,28 @@ estimate_costs(PlannerInfo *root, RelOptInfo *baserel, /* Save the output-rows estimate for the planner */ baserel->rows = nrows; +} + +/* + * Estimate costs of scanning a foreign table. + * + * Results are returned in *startup_cost and *total_cost. + */ +static void +estimate_costs(PlannerInfo *root, RelOptInfo *baserel, + FileFdwPlanState *fdw_private, + Cost *startup_cost, Cost *total_cost) +{ + BlockNumber pages = fdw_private->pages; + double ntuples = fdw_private->ntuples; + Cost run_cost = 0; + Cost cpu_per_tuple; /* - * Now estimate costs. We estimate costs almost the same way as - * cost_seqscan(), thus assuming that I/O costs are equivalent to a - * regular table file of the same size. However, we take per-tuple CPU - * costs as 10x of a seqscan, to account for the cost of parsing records. + * We estimate costs almost the same way as cost_seqscan(), thus assuming + * that I/O costs are equivalent to a regular table file of the same size. + * However, we take per-tuple CPU costs as 10x of a seqscan, to account + * for the cost of parsing records. */ run_cost += seq_page_cost * pages; diff --git a/doc/src/sgml/fdwhandler.sgml b/doc/src/sgml/fdwhandler.sgml index dbfcbbc2b36dd..f7bf3d8a39571 100644 --- a/doc/src/sgml/fdwhandler.sgml +++ b/doc/src/sgml/fdwhandler.sgml @@ -89,52 +89,92 @@ void -PlanForeignScan (Oid foreigntableid, - PlannerInfo *root, - RelOptInfo *baserel); +GetForeignRelSize (PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid); - Create possible access paths for a scan on a foreign table. This is - called when a query is planned. + Obtain relation size estimates for a foreign table. This is called + at the beginning of planning for a query involving a foreign table. + root is the planner's global information about the query; + baserel is the planner's information about this table; and foreigntableid is the pg_class OID of the - foreign table. root is the planner's global information - about the query, and baserel is the planner's information - about this table. + foreign table. (foreigntableid could be obtained from the + planner data structures, but it's passed explicitly to save effort.) - The function must generate at least one access path (ForeignPath node) - for a scan on the foreign table and must call add_path to - add the path to baserel->pathlist. It's recommended to - use create_foreignscan_path to build the ForeignPath node. - The function may generate multiple access paths, e.g., a path which has - valid pathkeys to represent a pre-sorted result. Each access - path must contain cost estimates, and can contain any FDW-private - information that is needed to execute the foreign scan at a later time. - (Note that the private information must be represented in a form that - copyObject knows how to copy.) + This function should update baserel->rows to be the + expected number of rows returned by the table scan, after accounting for + the filtering done by the restriction quals. The initial value of + baserel->rows is just a constant default estimate, which + should be replaced if at all possible. The function may also choose to + update baserel->width if it can compute a better estimate + of the average result row width. - The information in root and baserel can be used - to reduce the amount of information that has to be fetched from the - foreign table (and therefore reduce the cost estimate). 
- baserel->baserestrictinfo is particularly interesting, as - it contains restriction quals (WHERE clauses) that can be - used to filter the rows to be fetched. (The FDW is not required to - enforce these quals, as the finished plan will recheck them anyway.) - baserel->reltargetlist can be used to determine which - columns need to be fetched. + See for additional information. + + + + +void +GetForeignPaths (PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid); + + + Create possible access paths for a scan on a foreign table. + This is called during query planning. + The parameters are the same as for GetForeignRelSize, + which has already been called. + + + + This function must generate at least one access path + (ForeignPath node) for a scan on the foreign table and + must call add_path to add each such path to + baserel->pathlist. It's recommended to use + create_foreignscan_path to build the + ForeignPath nodes. The function can generate multiple + access paths, e.g., a path which has valid pathkeys to + represent a pre-sorted result. Each access path must contain cost + estimates, and can contain any FDW-private information that is needed to + identify the specific scan method intended. + + + + See for additional information. + + + + +ForeignScan * +GetForeignPlan (PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid, + ForeignPath *best_path, + List *tlist, + List *scan_clauses); + + + Create a ForeignScan plan node from the selected foreign + access path. This is called at the end of query planning. + The parameters are as for GetForeignRelSize, plus + the selected ForeignPath (previously produced by + GetForeignPaths), the target list to be emitted by the + plan node, and the restriction clauses to be enforced by the plan node. - In addition to returning cost estimates, the function should update - baserel->rows to be the expected number of rows returned - by the scan, after accounting for the filtering done by the restriction - quals. The initial value of baserel->rows is just a - constant default estimate, which should be replaced if at all possible. - The function may also choose to update baserel->width if - it can compute a better estimate of the average result row width. + This function must create and return a ForeignScan plan + node; it's recommended to use make_foreignscan to build the + ForeignScan node. + + + + See for additional information. @@ -170,7 +210,7 @@ BeginForeignScan (ForeignScanState *node, the table to scan is accessible through the ForeignScanState node (in particular, from the underlying ForeignScan plan node, which contains any FDW-private - information provided by PlanForeignScan). + information provided by GetForeignPlan). @@ -347,6 +387,126 @@ GetForeignServerByName(const char *name, bool missing_ok); return NULL if missing_ok is true, otherwise raise an error. + + + + Foreign Data Wrapper Query Planning + + + The FDW callback functions GetForeignRelSize, + GetForeignPaths, and GetForeignPlan must fit + into the workings of the PostgreSQL planner. Here are + some notes about what they must do. + + + + The information in root and baserel can be used + to reduce the amount of information that has to be fetched from the + foreign table (and therefore reduce the cost). + baserel->baserestrictinfo is particularly interesting, as + it contains restriction quals (WHERE clauses) that should be + used to filter the rows to be fetched. (The FDW itself is not required + to enforce these quals, as the core executor can check them instead.) 
+ baserel->reltargetlist can be used to determine which + columns need to be fetched; but note that it only lists columns that + have to be emitted by the ForeignScan plan node, not + columns that are used in qual evaluation but not output by the query. + + + + Various private fields are available for the FDW planning functions to + keep information in. Generally, whatever you store in FDW private fields + should be palloc'd, so that it will be reclaimed at the end of planning. + + + + baserel->fdw_private is a void pointer that is + available for FDW planning functions to store information relevant to + the particular foreign table. The core planner does not touch it except + to initialize it to NULL when the baserel node is created. + It is useful for passing information forward from + GetForeignRelSize to GetForeignPaths and/or + GetForeignPaths to GetForeignPlan, thereby + avoiding recalculation. + + + + GetForeignPaths can identify the meaning of different + access paths by storing private information in the + fdw_private field of ForeignPath nodes. + fdw_private is declared as a List pointer, but + could actually contain anything since the core planner does not touch + it. However, best practice is to use a representation that's dumpable + by nodeToString, for use with debugging support available + in the backend. + + + + GetForeignPlan can examine the fdw_private + field of the selected ForeignPath node, and can generate + fdw_exprs and fdw_private lists to be + placed in the ForeignScan plan node, where they will be + available at execution time. Both of these lists must be + represented in a form that copyObject knows how to copy. + The fdw_private list has no other restrictions and is + not interpreted by the core backend in any way. The + fdw_exprs list, if not NIL, is expected to contain + expression trees that are intended to be executed at runtime. These + trees will undergo post-processing by the planner to make them fully + executable. + + + + In GetForeignPlan, generally the passed-in targetlist can + be copied into the plan node as-is. The passed scan_clauses list + contains the same clauses as baserel->baserestrictinfo, + but may be re-ordered for better execution efficiency. In simple cases + the FDW can just strip RestrictInfo nodes from the + scan_clauses list (using extract_actual_clauses) and put + all the clauses into the plan node's qual list, which means that all the + clauses will be checked by the executor at runtime. More complex FDWs + may be able to check some of the clauses internally, in which case those + clauses can be removed from the plan node's qual list so that the + executor doesn't waste time rechecking them. + + + + As an example, the FDW might identify some restriction clauses of the + form foreign_variable = + sub_expression, which it determines can be executed on + the remote server given the locally-evaluated value of the + sub_expression. The actual identification of such a + clause should happen during GetForeignPaths, since it would + affect the cost estimate for the path. The path's + fdw_private field would probably include a pointer to + the identified clause's RestrictInfo node. Then + GetForeignPlan would remove that clause from scan_clauses, + but add the sub_expression to fdw_exprs + to ensure that it gets massaged into executable form. It would probably + also put control information into the plan node's + fdw_private field to tell the execution functions what + to do at runtime. 
The query transmitted to the remote server would + involve something like WHERE foreign_variable = + $1, with the parameter value obtained at runtime from + evaluation of the fdw_exprs expression tree. + + + + The FDW should always construct at least one path that depends only on + the table's restriction clauses. In join queries, it might also choose + to construct path(s) that depend on join clauses, for example + foreign_variable = + local_variable. Such clauses will not be found in + baserel->baserestrictinfo but must be sought in the + relation's join lists. A path using such a clause is called a + parameterized path. It must show the other relation(s) as + required_outer and list the specific join clause(s) in + param_clauses. In GetForeignPlan, the + local_variable portion of the join clause would be added + to fdw_exprs, and then at runtime the case works the + same as for an ordinary restriction clause. + + diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 868fb7130a8b2..5cde22543f5b7 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -591,8 +591,9 @@ _copyForeignScan(const ForeignScan *from) /* * copy remainder of node */ - COPY_SCALAR_FIELD(fsSystemCol); + COPY_NODE_FIELD(fdw_exprs); COPY_NODE_FIELD(fdw_private); + COPY_SCALAR_FIELD(fsSystemCol); return newnode; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 9daeb3e7b43e9..51181a9a7438e 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -559,8 +559,9 @@ _outForeignScan(StringInfo str, const ForeignScan *node) _outScanInfo(str, (const Scan *) node); - WRITE_BOOL_FIELD(fsSystemCol); + WRITE_NODE_FIELD(fdw_exprs); WRITE_NODE_FIELD(fdw_private); + WRITE_BOOL_FIELD(fsSystemCol); } static void @@ -1741,6 +1742,7 @@ _outRelOptInfo(StringInfo str, const RelOptInfo *node) WRITE_FLOAT_FIELD(allvisfrac, "%.6f"); WRITE_NODE_FIELD(subplan); WRITE_NODE_FIELD(subroot); + /* we don't try to print fdwroutine or fdw_private */ WRITE_NODE_FIELD(baserestrictinfo); WRITE_NODE_FIELD(joininfo); WRITE_BOOL_FIELD(has_eclass_joins); diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 6e81ce0fc2649..03c604a03d6f3 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -396,6 +396,12 @@ set_foreign_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) { /* Mark rel with estimated output rows, width, etc */ set_foreign_size_estimates(root, rel); + + /* Get FDW routine pointers for the rel */ + rel->fdwroutine = GetFdwRoutineByRelId(rte->relid); + + /* Let FDW adjust the size estimates, if it can */ + rel->fdwroutine->GetForeignRelSize(root, rel, rte->relid); } /* @@ -405,11 +411,8 @@ set_foreign_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) static void set_foreign_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) { - FdwRoutine *fdwroutine; - - /* Call the FDW's PlanForeignScan function to generate path(s) */ - fdwroutine = GetFdwRoutineByRelId(rte->relid); - fdwroutine->PlanForeignScan(rte->relid, root, rel); + /* Call the FDW's GetForeignPaths function to generate path(s) */ + rel->fdwroutine->GetForeignPaths(root, rel, rte->relid); /* Select cheapest path */ set_cheapest(rel); diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 885d8558c319f..24c853d47ef1a 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -3745,7 
+3745,7 @@ set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, Plan *cteplan) * using what will be purely datatype-driven estimates from the targetlist. * There is no way to do anything sane with the rows value, so we just put * a default estimate and hope that the wrapper can improve on it. The - * wrapper's PlanForeignScan function will be called momentarily. + * wrapper's GetForeignRelSize function will be called momentarily. * * The rel's targetlist and restrictinfo list must have been constructed * already. diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index b1df56cafd25a..94140d304f754 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -20,6 +20,7 @@ #include #include "access/skey.h" +#include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" @@ -119,8 +120,6 @@ static CteScan *make_ctescan(List *qptlist, List *qpqual, Index scanrelid, int ctePlanId, int cteParam); static WorkTableScan *make_worktablescan(List *qptlist, List *qpqual, Index scanrelid, int wtParam); -static ForeignScan *make_foreignscan(List *qptlist, List *qpqual, - Index scanrelid, bool fsSystemCol, List *fdw_private); static BitmapAnd *make_bitmap_and(List *bitmapplans); static BitmapOr *make_bitmap_or(List *bitmapplans); static NestLoop *make_nestloop(List *tlist, @@ -1816,7 +1815,6 @@ create_foreignscan_plan(PlannerInfo *root, ForeignPath *best_path, RelOptInfo *rel = best_path->path.parent; Index scan_relid = rel->relid; RangeTblEntry *rte; - bool fsSystemCol; int i; /* it should be a base rel... */ @@ -1825,31 +1823,56 @@ create_foreignscan_plan(PlannerInfo *root, ForeignPath *best_path, rte = planner_rt_fetch(scan_relid, root); Assert(rte->rtekind == RTE_RELATION); - /* Sort clauses into best execution order */ + /* + * Sort clauses into best execution order. We do this first since the + * FDW might have more info than we do and wish to adjust the ordering. + */ scan_clauses = order_qual_clauses(root, scan_clauses); - /* Reduce RestrictInfo list to bare expressions; ignore pseudoconstants */ - scan_clauses = extract_actual_clauses(scan_clauses, false); + /* + * Let the FDW perform its processing on the restriction clauses and + * generate the plan node. Note that the FDW might remove restriction + * clauses that it intends to execute remotely, or even add more (if it + * has selected some join clauses for remote use but also wants them + * rechecked locally). + */ + scan_plan = rel->fdwroutine->GetForeignPlan(root, rel, rte->relid, + best_path, + tlist, scan_clauses); + + /* Copy cost data from Path to Plan; no need to make FDW do this */ + copy_path_costsize(&scan_plan->scan.plan, &best_path->path); - /* Detect whether any system columns are requested from rel */ - fsSystemCol = false; + /* + * Replace any outer-relation variables with nestloop params in the qual + * and fdw_exprs expressions. We do this last so that the FDW doesn't + * have to be involved. (Note that parts of fdw_exprs could have come + * from join clauses, so doing this beforehand on the scan_clauses + * wouldn't work.) + */ + if (best_path->path.required_outer) + { + scan_plan->scan.plan.qual = (List *) + replace_nestloop_params(root, (Node *) scan_plan->scan.plan.qual); + scan_plan->fdw_exprs = (List *) + replace_nestloop_params(root, (Node *) scan_plan->fdw_exprs); + } + + /* + * Detect whether any system columns are requested from rel. 
This is a + * bit of a kluge and might go away someday, so we intentionally leave it + * out of the API presented to FDWs. + */ + scan_plan->fsSystemCol = false; for (i = rel->min_attr; i < 0; i++) { if (!bms_is_empty(rel->attr_needed[i - rel->min_attr])) { - fsSystemCol = true; + scan_plan->fsSystemCol = true; break; } } - scan_plan = make_foreignscan(tlist, - scan_clauses, - scan_relid, - fsSystemCol, - best_path->fdw_private); - - copy_path_costsize(&scan_plan->scan.plan, &best_path->path); - return scan_plan; } @@ -3183,24 +3206,26 @@ make_worktablescan(List *qptlist, return node; } -static ForeignScan * +ForeignScan * make_foreignscan(List *qptlist, List *qpqual, Index scanrelid, - bool fsSystemCol, + List *fdw_exprs, List *fdw_private) { ForeignScan *node = makeNode(ForeignScan); Plan *plan = &node->scan.plan; - /* cost should be inserted by caller */ + /* cost will be filled in by create_foreignscan_plan */ plan->targetlist = qptlist; plan->qual = qpqual; plan->lefttree = NULL; plan->righttree = NULL; node->scan.scanrelid = scanrelid; - node->fsSystemCol = fsSystemCol; + node->fdw_exprs = fdw_exprs; node->fdw_private = fdw_private; + /* fsSystemCol will be filled in by create_foreignscan_plan */ + node->fsSystemCol = false; return node; } diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index e1b48fb4f5306..69396694aaa9d 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -428,6 +428,8 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) fix_scan_list(root, splan->scan.plan.targetlist, rtoffset); splan->scan.plan.qual = fix_scan_list(root, splan->scan.plan.qual, rtoffset); + splan->fdw_exprs = + fix_scan_list(root, splan->fdw_exprs, rtoffset); } break; diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 40a420a3546f1..b64db1e1c0659 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -2137,6 +2137,8 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, break; case T_ForeignScan: + finalize_primnode((Node *) ((ForeignScan *) plan)->fdw_exprs, + &context); context.paramids = bms_add_members(context.paramids, scan_params); break; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 6d1545476df7b..a2fc75a659e50 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1767,7 +1767,7 @@ create_worktablescan_path(PlannerInfo *root, RelOptInfo *rel) * returning the pathnode. * * This function is never called from core Postgres; rather, it's expected - * to be called by the PlanForeignScan function of a foreign data wrapper. + * to be called by the GetForeignPaths function of a foreign data wrapper. * We make the FDW supply all fields of the path, since we do not have any * way to calculate them in core. 
*/ diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 0cdf638c1ddb1..cee092a881010 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -113,6 +113,8 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptKind reloptkind) rel->allvisfrac = 0; rel->subplan = NULL; rel->subroot = NULL; + rel->fdwroutine = NULL; + rel->fdw_private = NULL; rel->baserestrictinfo = NIL; rel->baserestrictcost.startup = 0; rel->baserestrictcost.per_tuple = 0; @@ -366,6 +368,8 @@ build_join_rel(PlannerInfo *root, joinrel->allvisfrac = 0; joinrel->subplan = NULL; joinrel->subroot = NULL; + joinrel->fdwroutine = NULL; + joinrel->fdw_private = NULL; joinrel->baserestrictinfo = NIL; joinrel->baserestrictcost.startup = 0; joinrel->baserestrictcost.per_tuple = 0; diff --git a/src/include/foreign/fdwapi.h b/src/include/foreign/fdwapi.h index 9e135c62069fd..854f17755c454 100644 --- a/src/include/foreign/fdwapi.h +++ b/src/include/foreign/fdwapi.h @@ -23,9 +23,20 @@ struct ExplainState; * Callback function signatures --- see fdwhandler.sgml for more info. */ -typedef void (*PlanForeignScan_function) (Oid foreigntableid, - PlannerInfo *root, - RelOptInfo *baserel); +typedef void (*GetForeignRelSize_function) (PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid); + +typedef void (*GetForeignPaths_function) (PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid); + +typedef ForeignScan *(*GetForeignPlan_function) (PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid, + ForeignPath *best_path, + List *tlist, + List *scan_clauses); typedef void (*ExplainForeignScan_function) (ForeignScanState *node, struct ExplainState *es); @@ -53,7 +64,9 @@ typedef struct FdwRoutine { NodeTag type; - PlanForeignScan_function PlanForeignScan; + GetForeignRelSize_function GetForeignRelSize; + GetForeignPaths_function GetForeignPaths; + GetForeignPlan_function GetForeignPlan; ExplainForeignScan_function ExplainForeignScan; BeginForeignScan_function BeginForeignScan; IterateForeignScan_function IterateForeignScan; diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 3962792d3d89a..e6bb3239f4214 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -462,13 +462,22 @@ typedef struct WorkTableScan /* ---------------- * ForeignScan node + * + * fdw_exprs and fdw_private are both under the control of the foreign-data + * wrapper, but fdw_exprs is presumed to contain expression trees and will + * be post-processed accordingly by the planner; fdw_private won't be. + * Note that everything in both lists must be copiable by copyObject(). + * One way to store an arbitrary blob of bytes is to represent it as a bytea + * Const. Usually, though, you'll be better off choosing a representation + * that can be dumped usefully by nodeToString(). 
* ---------------- */ typedef struct ForeignScan { Scan scan; - bool fsSystemCol; /* true if any "system column" is needed */ + List *fdw_exprs; /* expressions that FDW may evaluate */ List *fdw_private; /* private data for FDW */ + bool fsSystemCol; /* true if any "system column" is needed */ } ForeignScan; diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 2a686080059f3..8616223f24a8c 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -334,10 +334,13 @@ typedef struct PlannerInfo * allvisfrac - fraction of disk pages that are marked all-visible * subplan - plan for subquery (NULL if it's not a subquery) * subroot - PlannerInfo for subquery (NULL if it's not a subquery) + * fdwroutine - function hooks for FDW, if foreign table (else NULL) + * fdw_private - private state for FDW, if foreign table (else NULL) * * Note: for a subquery, tuples, subplan, subroot are not set immediately * upon creation of the RelOptInfo object; they are filled in when - * set_base_rel_pathlist processes the object. + * set_subquery_pathlist processes the object. Likewise, fdwroutine + * and fdw_private are filled during initial path creation. * * For otherrels that are appendrel members, these fields are filled * in just as for a baserel. @@ -414,8 +417,12 @@ typedef struct RelOptInfo BlockNumber pages; /* size estimates derived from pg_class */ double tuples; double allvisfrac; + /* use "struct Plan" to avoid including plannodes.h here */ struct Plan *subplan; /* if subquery */ PlannerInfo *subroot; /* if subquery */ + /* use "struct FdwRoutine" to avoid including fdwapi.h here */ + struct FdwRoutine *fdwroutine; /* if foreign table */ + void *fdw_private; /* if foreign table */ /* used by various scans and joins: */ List *baserestrictinfo; /* RestrictInfo structures (if base @@ -793,14 +800,13 @@ typedef struct TidPath } TidPath; /* - * ForeignPath represents a scan of a foreign table - * - * fdw_private contains FDW private data about the scan, which will be copied - * to the final ForeignScan plan node so that it is available at execution - * time. Note that everything in this list must be copiable by copyObject(). - * One way to store an arbitrary blob of bytes is to represent it as a bytea - * Const. Usually, though, you'll be better off choosing a representation - * that can be dumped usefully by nodeToString(). + * ForeignPath represents a potential scan of a foreign table + * + * fdw_private stores FDW private data about the scan. While fdw_private is + * not actually touched by the core code during normal operations, it's + * generally a good idea to use a representation that can be dumped by + * nodeToString(), so that you can examine the structure during debugging + * with tools like pprint(). 
*/ typedef struct ForeignPath { diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index 8bd603124b3ac..47cc39cf1d9c3 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -42,6 +42,8 @@ extern Plan *optimize_minmax_aggregates(PlannerInfo *root, List *tlist, extern Plan *create_plan(PlannerInfo *root, Path *best_path); extern SubqueryScan *make_subqueryscan(List *qptlist, List *qpqual, Index scanrelid, Plan *subplan); +extern ForeignScan *make_foreignscan(List *qptlist, List *qpqual, + Index scanrelid, List *fdw_exprs, List *fdw_private); extern Append *make_append(List *appendplans, List *tlist); extern RecursiveUnion *make_recursive_union(List *tlist, Plan *lefttree, Plan *righttree, int wtParam, From 07d1edb954bc8f5d0e2c010dec8482328af38cb8 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 9 Mar 2012 14:34:56 -0500 Subject: [PATCH 109/129] Extend object access hook framework to support arguments, and DROP. This allows loadable modules to get control at drop time, perhaps for the purpose of performing additional security checks or to log the event. The initial purpose of this code is to support sepgsql, but other applications should be possible as well. KaiGai Kohei, reviewed by me. --- src/backend/catalog/dependency.c | 10 ++++++++ src/backend/catalog/heap.c | 3 ++- src/backend/catalog/pg_collation.c | 2 +- src/backend/catalog/pg_constraint.c | 3 ++- src/backend/catalog/pg_conversion.c | 4 +-- src/backend/catalog/pg_namespace.c | 3 ++- src/backend/catalog/pg_operator.c | 4 +-- src/backend/catalog/pg_proc.c | 3 ++- src/backend/catalog/pg_type.c | 6 +++-- src/backend/commands/dbcommands.c | 12 ++++++++- src/backend/commands/extension.c | 2 +- src/backend/commands/foreigncmds.c | 8 +++--- src/backend/commands/functioncmds.c | 3 ++- src/backend/commands/opclasscmds.c | 4 +-- src/backend/commands/proclang.c | 2 +- src/backend/commands/tablecmds.c | 2 +- src/backend/commands/tablespace.c | 11 +++++++- src/backend/commands/trigger.c | 2 +- src/backend/commands/tsearchcmds.c | 11 +++++--- src/backend/commands/user.c | 12 ++++++++- src/backend/rewrite/rewriteDefine.c | 2 +- src/backend/storage/large_object/inv_api.c | 2 +- src/include/catalog/objectaccess.h | 29 +++++++++++++++++----- 23 files changed, 104 insertions(+), 36 deletions(-) diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index eca064f0cdef7..1b92f5c38a1ef 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -20,6 +20,7 @@ #include "catalog/heap.h" #include "catalog/index.h" #include "catalog/namespace.h" +#include "catalog/objectaccess.h" #include "catalog/pg_amop.h" #include "catalog/pg_amproc.h" #include "catalog/pg_attrdef.h" @@ -991,6 +992,15 @@ deleteOneObject(const ObjectAddress *object, Relation depRel, int flags) SysScanDesc scan; HeapTuple tup; + /* DROP hook of the objects being removed */ + if (object_access_hook) + { + ObjectAccessDrop drop_arg; + drop_arg.dropflags = flags; + InvokeObjectAccessHook(OAT_DROP, object->classId, object->objectId, + object->objectSubId, &drop_arg); + } + /* * First remove any pg_depend records that link from this object to * others. (Any records linking to this object should be gone already.) 
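To illustrate the consumer side of this change, here is a minimal sketch of a loadable module that logs every DROP it observes (an editorial illustration, not part of the patch: the module and function names are invented, while the five-argument hook signature and the ObjectAccessDrop.dropflags field follow the changes shown in this commit):

    #include "postgres.h"
    #include "fmgr.h"
    #include "catalog/objectaccess.h"

    PG_MODULE_MAGIC;

    void        _PG_init(void);

    static object_access_hook_type next_object_access_hook = NULL;

    static void
    audit_object_access(ObjectAccessType access, Oid classId,
                        Oid objectId, int subId, void *arg)
    {
        /* chain to any previously installed hook */
        if (next_object_access_hook)
            (*next_object_access_hook) (access, classId, objectId,
                                        subId, arg);

        if (access == OAT_DROP)
        {
            ObjectAccessDrop *drop_arg = (ObjectAccessDrop *) arg;

            elog(LOG, "dropping object %u/%u/%d (dropflags = %d)",
                 classId, objectId, subId,
                 drop_arg ? drop_arg->dropflags : 0);
        }
    }

    void
    _PG_init(void)
    {
        next_object_access_hook = object_access_hook;
        object_access_hook = audit_object_access;
    }

Chaining to any previously installed hook keeps such a module stackable with other users of object_access_hook, such as sepgsql.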
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index d1d458d7fa485..8bd5a9296e1be 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -1286,7 +1286,8 @@ heap_create_with_catalog(const char *relname, } /* Post creation hook for new relation */ - InvokeObjectAccessHook(OAT_POST_CREATE, RelationRelationId, relid, 0); + InvokeObjectAccessHook(OAT_POST_CREATE, + RelationRelationId, relid, 0, NULL); /* * Store any supplied constraints and defaults. diff --git a/src/backend/catalog/pg_collation.c b/src/backend/catalog/pg_collation.c index 511d70c044d6b..18c7acf0e81ff 100644 --- a/src/backend/catalog/pg_collation.c +++ b/src/backend/catalog/pg_collation.c @@ -136,7 +136,7 @@ CollationCreate(const char *collname, Oid collnamespace, /* Post creation hook for new collation */ InvokeObjectAccessHook(OAT_POST_CREATE, - CollationRelationId, oid, 0); + CollationRelationId, oid, 0, NULL); heap_freetuple(tup); heap_close(rel, RowExclusiveLock); diff --git a/src/backend/catalog/pg_constraint.c b/src/backend/catalog/pg_constraint.c index 0bad4d99cb42f..342cf75270a3f 100644 --- a/src/backend/catalog/pg_constraint.c +++ b/src/backend/catalog/pg_constraint.c @@ -366,7 +366,8 @@ CreateConstraintEntry(const char *constraintName, } /* Post creation hook for new constraint */ - InvokeObjectAccessHook(OAT_POST_CREATE, ConstraintRelationId, conOid, 0); + InvokeObjectAccessHook(OAT_POST_CREATE, + ConstraintRelationId, conOid, 0, NULL); return conOid; } diff --git a/src/backend/catalog/pg_conversion.c b/src/backend/catalog/pg_conversion.c index 8194cd6c2671f..f86c84fc4bb23 100644 --- a/src/backend/catalog/pg_conversion.c +++ b/src/backend/catalog/pg_conversion.c @@ -134,8 +134,8 @@ ConversionCreate(const char *conname, Oid connamespace, recordDependencyOnCurrentExtension(&myself, false); /* Post creation hook for new conversion */ - InvokeObjectAccessHook(OAT_POST_CREATE, - ConversionRelationId, HeapTupleGetOid(tup), 0); + InvokeObjectAccessHook(OAT_POST_CREATE, ConversionRelationId, + HeapTupleGetOid(tup), 0, NULL); heap_freetuple(tup); heap_close(rel, RowExclusiveLock); diff --git a/src/backend/catalog/pg_namespace.c b/src/backend/catalog/pg_namespace.c index be812a246c09d..de856760f08b3 100644 --- a/src/backend/catalog/pg_namespace.c +++ b/src/backend/catalog/pg_namespace.c @@ -95,7 +95,8 @@ NamespaceCreate(const char *nspName, Oid ownerId, bool isTemp) recordDependencyOnCurrentExtension(&myself, false); /* Post creation hook for new schema */ - InvokeObjectAccessHook(OAT_POST_CREATE, NamespaceRelationId, nspoid, 0); + InvokeObjectAccessHook(OAT_POST_CREATE, + NamespaceRelationId, nspoid, 0, NULL); return nspoid; } diff --git a/src/backend/catalog/pg_operator.c b/src/backend/catalog/pg_operator.c index 3b727222413ac..4fd55ae570607 100644 --- a/src/backend/catalog/pg_operator.c +++ b/src/backend/catalog/pg_operator.c @@ -275,7 +275,7 @@ OperatorShellMake(const char *operatorName, /* Post creation hook for new shell operator */ InvokeObjectAccessHook(OAT_POST_CREATE, - OperatorRelationId, operatorObjectId, 0); + OperatorRelationId, operatorObjectId, 0, NULL); /* * Make sure the tuple is visible for subsequent lookups/updates. 
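These call-site updates are mechanical, but they follow two distinct styles. OAT_POST_CREATE sites simply gain a trailing NULL, because the InvokeObjectAccessHook macro itself tests whether object_access_hook is set; OAT_DROP sites instead test the hook pointer explicitly, so the ObjectAccessDrop argument is only built when some module is actually listening. A schematic sketch of the two styles (not new code from the patch; classId, objectId, subId and flags stand for whatever the real caller has in scope, and postgres.h plus catalog/objectaccess.h are assumed included):

static void
drop_hook_call_site_sketch(Oid classId, Oid objectId, int subId, int flags)
{
	/* DROP call sites guard explicitly before building the argument. */
	if (object_access_hook)
	{
		ObjectAccessDrop drop_arg;

		memset(&drop_arg, 0, sizeof(ObjectAccessDrop));
		drop_arg.dropflags = flags;
		InvokeObjectAccessHook(OAT_DROP, classId, objectId, subId, &drop_arg);
	}

	/*
	 * POST_CREATE call sites rely on the guard inside the macro and
	 * simply pass NULL for the new fifth argument:
	 *
	 *		InvokeObjectAccessHook(OAT_POST_CREATE,
	 *							   classId, objectId, subId, NULL);
	 */
}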
@@ -544,7 +544,7 @@ OperatorCreate(const char *operatorName, /* Post creation hook for new operator */ InvokeObjectAccessHook(OAT_POST_CREATE, - OperatorRelationId, operatorObjectId, 0); + OperatorRelationId, operatorObjectId, 0, NULL); heap_close(pg_operator_desc, RowExclusiveLock); diff --git a/src/backend/catalog/pg_proc.c b/src/backend/catalog/pg_proc.c index 91ead4cb9d266..1fffe1c6ac313 100644 --- a/src/backend/catalog/pg_proc.c +++ b/src/backend/catalog/pg_proc.c @@ -655,7 +655,8 @@ ProcedureCreate(const char *procedureName, heap_freetuple(tup); /* Post creation hook for new function */ - InvokeObjectAccessHook(OAT_POST_CREATE, ProcedureRelationId, retval, 0); + InvokeObjectAccessHook(OAT_POST_CREATE, + ProcedureRelationId, retval, 0, NULL); heap_close(rel, RowExclusiveLock); diff --git a/src/backend/catalog/pg_type.c b/src/backend/catalog/pg_type.c index 2c2e3b3e7cf95..5b2ad6bfe0d41 100644 --- a/src/backend/catalog/pg_type.c +++ b/src/backend/catalog/pg_type.c @@ -162,7 +162,8 @@ TypeShellMake(const char *typeName, Oid typeNamespace, Oid ownerId) false); /* Post creation hook for new shell type */ - InvokeObjectAccessHook(OAT_POST_CREATE, TypeRelationId, typoid, 0); + InvokeObjectAccessHook(OAT_POST_CREATE, + TypeRelationId, typoid, 0, NULL); /* * clean up and return the type-oid @@ -474,7 +475,8 @@ TypeCreate(Oid newTypeOid, rebuildDeps); /* Post creation hook for new type */ - InvokeObjectAccessHook(OAT_POST_CREATE, TypeRelationId, typeObjectId, 0); + InvokeObjectAccessHook(OAT_POST_CREATE, + TypeRelationId, typeObjectId, 0, NULL); /* * finish up diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 42a8b31b2a826..91d74815287c1 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -515,7 +515,8 @@ createdb(const CreatedbStmt *stmt) copyTemplateDependencies(src_dboid, dboid); /* Post creation hook for new database */ - InvokeObjectAccessHook(OAT_POST_CREATE, DatabaseRelationId, dboid, 0); + InvokeObjectAccessHook(OAT_POST_CREATE, + DatabaseRelationId, dboid, 0, NULL); /* * Force a checkpoint before starting the copy. This will force dirty @@ -777,6 +778,15 @@ dropdb(const char *dbname, bool missing_ok) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_DATABASE, dbname); + /* DROP hook for the database being removed */ + if (object_access_hook) + { + ObjectAccessDrop drop_arg; + memset(&drop_arg, 0, sizeof(ObjectAccessDrop)); + InvokeObjectAccessHook(OAT_DROP, + DatabaseRelationId, db_id, 0, &drop_arg); + } + /* * Disallow dropping a DB that is marked istemplate. 
This is just to * prevent people from accidentally dropping template0 or template1; they diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c index a9963ac93b939..732791cc41366 100644 --- a/src/backend/commands/extension.c +++ b/src/backend/commands/extension.c @@ -1558,7 +1558,7 @@ InsertExtensionTuple(const char *extName, Oid extOwner, } /* Post creation hook for new extension */ InvokeObjectAccessHook(OAT_POST_CREATE, - ExtensionRelationId, extensionOid, 0); + ExtensionRelationId, extensionOid, 0, NULL); return extensionOid; } diff --git a/src/backend/commands/foreigncmds.c b/src/backend/commands/foreigncmds.c index 5d18bdcf0a972..30135e6de8b02 100644 --- a/src/backend/commands/foreigncmds.c +++ b/src/backend/commands/foreigncmds.c @@ -666,7 +666,7 @@ CreateForeignDataWrapper(CreateFdwStmt *stmt) /* Post creation hook for new foreign data wrapper */ InvokeObjectAccessHook(OAT_POST_CREATE, - ForeignDataWrapperRelationId, fdwId, 0); + ForeignDataWrapperRelationId, fdwId, 0, NULL); heap_close(rel, RowExclusiveLock); } @@ -962,7 +962,8 @@ CreateForeignServer(CreateForeignServerStmt *stmt) recordDependencyOnCurrentExtension(&myself, false); /* Post creation hook for new foreign server */ - InvokeObjectAccessHook(OAT_POST_CREATE, ForeignServerRelationId, srvId, 0); + InvokeObjectAccessHook(OAT_POST_CREATE, + ForeignServerRelationId, srvId, 0, NULL); heap_close(rel, RowExclusiveLock); } @@ -1202,7 +1203,8 @@ CreateUserMapping(CreateUserMappingStmt *stmt) recordDependencyOnCurrentExtension(&myself, false); /* Post creation hook for new user mapping */ - InvokeObjectAccessHook(OAT_POST_CREATE, UserMappingRelationId, umId, 0); + InvokeObjectAccessHook(OAT_POST_CREATE, + UserMappingRelationId, umId, 0, NULL); heap_close(rel, RowExclusiveLock); } diff --git a/src/backend/commands/functioncmds.c b/src/backend/commands/functioncmds.c index ce866a20a990c..4125b97e89eb2 100644 --- a/src/backend/commands/functioncmds.c +++ b/src/backend/commands/functioncmds.c @@ -1759,7 +1759,8 @@ CreateCast(CreateCastStmt *stmt) recordDependencyOnCurrentExtension(&myself, false); /* Post creation hook for new cast */ - InvokeObjectAccessHook(OAT_POST_CREATE, CastRelationId, castid, 0); + InvokeObjectAccessHook(OAT_POST_CREATE, + CastRelationId, castid, 0, NULL); heap_freetuple(tuple); diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c index 5dc131a50e222..87c889604e2ad 100644 --- a/src/backend/commands/opclasscmds.c +++ b/src/backend/commands/opclasscmds.c @@ -314,7 +314,7 @@ CreateOpFamily(char *amname, char *opfname, Oid namespaceoid, Oid amoid) /* Post creation hook for new operator family */ InvokeObjectAccessHook(OAT_POST_CREATE, - OperatorFamilyRelationId, opfamilyoid, 0); + OperatorFamilyRelationId, opfamilyoid, 0, NULL); heap_close(rel, RowExclusiveLock); @@ -717,7 +717,7 @@ DefineOpClass(CreateOpClassStmt *stmt) /* Post creation hook for new operator class */ InvokeObjectAccessHook(OAT_POST_CREATE, - OperatorClassRelationId, opclassoid, 0); + OperatorClassRelationId, opclassoid, 0, NULL); heap_close(rel, RowExclusiveLock); } diff --git a/src/backend/commands/proclang.c b/src/backend/commands/proclang.c index 8d6a0416d5ff9..41775fd86745a 100644 --- a/src/backend/commands/proclang.c +++ b/src/backend/commands/proclang.c @@ -428,7 +428,7 @@ create_proc_lang(const char *languageName, bool replace, /* Post creation hook for new procedural language */ InvokeObjectAccessHook(OAT_POST_CREATE, - LanguageRelationId, myself.objectId, 0); + LanguageRelationId, 
myself.objectId, 0, NULL); heap_close(rel, RowExclusiveLock); } diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index cd4490a1c24e5..25ca356b867ec 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -4382,7 +4382,7 @@ ATExecAddColumn(List **wqueue, AlteredTableInfo *tab, Relation rel, /* Post creation hook for new attribute */ InvokeObjectAccessHook(OAT_POST_CREATE, - RelationRelationId, myrelid, newattnum); + RelationRelationId, myrelid, newattnum, NULL); heap_close(pgclass, RowExclusiveLock); diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index 5e10f8c9a33b7..d66ea3b6c12a9 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -330,7 +330,7 @@ CreateTableSpace(CreateTableSpaceStmt *stmt) /* Post creation hook for new tablespace */ InvokeObjectAccessHook(OAT_POST_CREATE, - TableSpaceRelationId, tablespaceoid, 0); + TableSpaceRelationId, tablespaceoid, 0, NULL); create_tablespace_directories(location, tablespaceoid); @@ -434,6 +434,15 @@ DropTableSpace(DropTableSpaceStmt *stmt) aclcheck_error(ACLCHECK_NO_PRIV, ACL_KIND_TABLESPACE, tablespacename); + /* DROP hook for the tablespace being removed */ + if (object_access_hook) + { + ObjectAccessDrop drop_arg; + memset(&drop_arg, 0, sizeof(ObjectAccessDrop)); + InvokeObjectAccessHook(OAT_DROP, TableSpaceRelationId, + tablespaceoid, 0, &drop_arg); + } + /* * Remove the pg_tablespace tuple (this will roll back if we fail below) */ diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index caae2dafab159..a98d1b884ee52 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -756,7 +756,7 @@ CreateTrigger(CreateTrigStmt *stmt, const char *queryString, /* Post creation hook for new trigger */ InvokeObjectAccessHook(OAT_POST_CREATE, - TriggerRelationId, trigoid, 0); + TriggerRelationId, trigoid, 0, NULL); /* Keep lock on target rel until end of xact */ heap_close(rel, NoLock); diff --git a/src/backend/commands/tsearchcmds.c b/src/backend/commands/tsearchcmds.c index fe500a6d7f97f..86cb8704da811 100644 --- a/src/backend/commands/tsearchcmds.c +++ b/src/backend/commands/tsearchcmds.c @@ -271,7 +271,8 @@ DefineTSParser(List *names, List *parameters) makeParserDependencies(tup); /* Post creation hook for new text search parser */ - InvokeObjectAccessHook(OAT_POST_CREATE, TSParserRelationId, prsOid, 0); + InvokeObjectAccessHook(OAT_POST_CREATE, + TSParserRelationId, prsOid, 0, NULL); heap_freetuple(tup); @@ -565,7 +566,7 @@ DefineTSDictionary(List *names, List *parameters) /* Post creation hook for new text search dictionary */ InvokeObjectAccessHook(OAT_POST_CREATE, - TSDictionaryRelationId, dictOid, 0); + TSDictionaryRelationId, dictOid, 0, NULL); heap_freetuple(tup); @@ -1036,7 +1037,8 @@ DefineTSTemplate(List *names, List *parameters) makeTSTemplateDependencies(tup); /* Post creation hook for new text search template */ - InvokeObjectAccessHook(OAT_POST_CREATE, TSTemplateRelationId, dictOid, 0); + InvokeObjectAccessHook(OAT_POST_CREATE, + TSTemplateRelationId, dictOid, 0, NULL); heap_freetuple(tup); @@ -1419,7 +1421,8 @@ DefineTSConfiguration(List *names, List *parameters) makeConfigurationDependencies(tup, false, mapRel); /* Post creation hook for new text search configuration */ - InvokeObjectAccessHook(OAT_POST_CREATE, TSConfigRelationId, cfgOid, 0); + InvokeObjectAccessHook(OAT_POST_CREATE, + TSConfigRelationId, cfgOid, 0, NULL); heap_freetuple(tup); diff 
--git a/src/backend/commands/user.c b/src/backend/commands/user.c index 9a88c907894b6..2edbabe7549ba 100644 --- a/src/backend/commands/user.c +++ b/src/backend/commands/user.c @@ -425,7 +425,8 @@ CreateRole(CreateRoleStmt *stmt) GetUserId(), false); /* Post creation hook for new role */ - InvokeObjectAccessHook(OAT_POST_CREATE, AuthIdRelationId, roleid, 0); + InvokeObjectAccessHook(OAT_POST_CREATE, + AuthIdRelationId, roleid, 0, NULL); /* * Close pg_authid, but keep lock till commit. @@ -932,6 +933,15 @@ DropRole(DropRoleStmt *stmt) (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to drop superusers"))); + /* DROP hook for the role being removed */ + if (object_access_hook) + { + ObjectAccessDrop drop_arg; + memset(&drop_arg, 0, sizeof(ObjectAccessDrop)); + InvokeObjectAccessHook(OAT_DROP, + AuthIdRelationId, roleid, 0, &drop_arg); + } + /* * Lock the role, so nobody can add dependencies to her while we drop * her. We keep the lock until the end of transaction. diff --git a/src/backend/rewrite/rewriteDefine.c b/src/backend/rewrite/rewriteDefine.c index 8c87ac599f958..645182dbfa466 100644 --- a/src/backend/rewrite/rewriteDefine.c +++ b/src/backend/rewrite/rewriteDefine.c @@ -178,7 +178,7 @@ InsertRule(char *rulname, /* Post creation hook for new rule */ InvokeObjectAccessHook(OAT_POST_CREATE, - RewriteRelationId, rewriteObjectId, 0); + RewriteRelationId, rewriteObjectId, 0, NULL); heap_close(pg_rewrite_desc, RowExclusiveLock); diff --git a/src/backend/storage/large_object/inv_api.c b/src/backend/storage/large_object/inv_api.c index a14ce442c1c35..3adfb159b8baa 100644 --- a/src/backend/storage/large_object/inv_api.c +++ b/src/backend/storage/large_object/inv_api.c @@ -217,7 +217,7 @@ inv_create(Oid lobjId) /* Post creation hook for new large object */ InvokeObjectAccessHook(OAT_POST_CREATE, - LargeObjectRelationId, lobjId_new, 0); + LargeObjectRelationId, lobjId_new, 0, NULL); /* * Advance command counter to make new tuple visible to later operations. diff --git a/src/include/catalog/objectaccess.h b/src/include/catalog/objectaccess.h index 5c7a40a31cb77..9763280177b81 100644 --- a/src/include/catalog/objectaccess.h +++ b/src/include/catalog/objectaccess.h @@ -19,28 +19,45 @@ * Typically, this is done after inserting the primary catalog records and * associated dependencies. * + * OAT_DROP should be invoked just before deletion of objects; typically + * deleteOneObject(). Its arguments are packed within ObjectAccessDrop. + * * Other types may be added in the future. */ typedef enum ObjectAccessType { OAT_POST_CREATE, + OAT_DROP, } ObjectAccessType; /* - * Hook, and a macro to invoke it. + * Arguments of OAT_DROP event */ +typedef struct +{ + /* + * Flags to inform extensions the context of this deletion. + * Also see PERFORM_DELETION_* in dependency.h + */ + int dropflags; +} ObjectAccessDrop; +/* + * Hook, and a macro to invoke it. 
+ */ typedef void (*object_access_hook_type) (ObjectAccessType access, Oid classId, Oid objectId, - int subId); + int subId, + void *arg); extern PGDLLIMPORT object_access_hook_type object_access_hook; -#define InvokeObjectAccessHook(access,classId,objectId,subId) \ - do { \ - if (object_access_hook) \ - (*object_access_hook)((access),(classId),(objectId),(subId)); \ +#define InvokeObjectAccessHook(access,classId,objectId,subId,arg) \ + do { \ + if (object_access_hook) \ + (*object_access_hook)((access),(classId), \ + (objectId),(subId),(arg)); \ } while(0) #endif /* OBJECTACCESS_H */ From e914a144d3aaa0a09e0aab031d7e6f58389401ce Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 9 Mar 2012 15:18:45 -0500 Subject: [PATCH 110/129] sepgsql DROP support. KaiGai Kohei --- contrib/sepgsql/database.c | 27 ++++ contrib/sepgsql/expected/create.out | 80 ---------- contrib/sepgsql/expected/ddl.out | 164 ++++++++++++++++++++ contrib/sepgsql/hooks.c | 46 +++++- contrib/sepgsql/proc.c | 42 +++++ contrib/sepgsql/relation.c | 119 ++++++++++++++ contrib/sepgsql/schema.c | 27 ++++ contrib/sepgsql/sepgsql.h | 5 + contrib/sepgsql/sql/{create.sql => ddl.sql} | 41 ++++- contrib/sepgsql/test_sepgsql | 2 +- doc/src/sgml/sepgsql.sgml | 14 ++ 11 files changed, 481 insertions(+), 86 deletions(-) delete mode 100644 contrib/sepgsql/expected/create.out create mode 100644 contrib/sepgsql/expected/ddl.out rename contrib/sepgsql/sql/{create.sql => ddl.sql} (53%) diff --git a/contrib/sepgsql/database.c b/contrib/sepgsql/database.c index be3a7be96557b..0c395c42a3cb0 100644 --- a/contrib/sepgsql/database.c +++ b/contrib/sepgsql/database.c @@ -118,6 +118,33 @@ sepgsql_database_post_create(Oid databaseId, const char *dtemplate) pfree(tcontext); } +/* + * sepgsql_database_drop + * + * It checks privileges to drop the supplied database + */ +void +sepgsql_database_drop(Oid databaseId) +{ + ObjectAddress object; + char *audit_name; + + /* + * check db_database:{drop} permission + */ + object.classId = DatabaseRelationId; + object.objectId = databaseId; + object.objectSubId = 0; + audit_name = getObjectDescription(&object); + + sepgsql_avc_check_perms(&object, + SEPG_CLASS_DB_DATABASE, + SEPG_DB_DATABASE__DROP, + audit_name, + true); + pfree(audit_name); +} + /* * sepgsql_database_relabel * diff --git a/contrib/sepgsql/expected/create.out b/contrib/sepgsql/expected/create.out deleted file mode 100644 index 0f04a3e739cf1..0000000000000 --- a/contrib/sepgsql/expected/create.out +++ /dev/null @@ -1,80 +0,0 @@ --- --- Regression Test for Creation of Object Permission Checks --- --- confirm required permissions using audit messages -SELECT sepgsql_getcon(); -- confirm client privilege - sepgsql_getcon -------------------------------------------- - unconfined_u:unconfined_r:unconfined_t:s0 -(1 row) - -SET sepgsql.debug_audit = true; -SET client_min_messages = LOG; -CREATE DATABASE regtest_sepgsql_test_database; -LOG: SELinux: allowed { getattr } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=system_u:object_r:sepgsql_db_t:s0 tclass=db_database name="database template1" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_db_t:s0 tclass=db_database name="database regtest_sepgsql_test_database" -CREATE SCHEMA regtest_schema; -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" -SET search_path = regtest_schema, public; 
-CREATE TABLE regtest_table (x serial primary key, y text); -NOTICE: CREATE TABLE will create implicit sequence "regtest_table_x_seq" for serial column "regtest_table.x" -LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_seq_t:s0 tclass=db_sequence name="sequence regtest_table_x_seq" -LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column tableoid" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column cmax" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column xmax" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column cmin" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column xmin" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column ctid" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column x" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column y" -NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "regtest_table_pkey" for table "regtest_table" -ALTER TABLE regtest_table ADD COLUMN z int; -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column z" -CREATE TABLE regtest_table_2 (a int) WITH OIDS; -LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table_2" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column tableoid" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table 
regtest_table_2 column cmax" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column xmax" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column cmin" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column xmin" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column oid" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column ctid" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column a" --- corresponding toast table should not have label and permission checks -ALTER TABLE regtest_table_2 ADD COLUMN b text; -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column b" --- VACUUM FULL internally create a new table and swap them later. -VACUUM FULL regtest_table; -CREATE VIEW regtest_view AS SELECT * FROM regtest_table WHERE x < 100; -LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_view_t:s0 tclass=db_view name="view regtest_view" -CREATE SEQUENCE regtest_seq; -LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_seq_t:s0 tclass=db_sequence name="sequence regtest_seq" -CREATE TYPE regtest_comptype AS (a int, b text); -CREATE FUNCTION regtest_func(text,int[]) RETURNS bool LANGUAGE plpgsql - AS 'BEGIN RAISE NOTICE ''regtest_func => %'', $1; RETURN true; END'; -LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_proc_exec_t:s0 tclass=db_procedure name="function regtest_func(text,integer[])" -CREATE AGGREGATE regtest_agg ( - sfunc1 = int4pl, basetype = int4, stype1 = int4, initcond1 = '0' -); -LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" -LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_proc_exec_t:s0 tclass=db_procedure name="function regtest_agg(integer)" --- --- clean-up --- -DROP DATABASE IF EXISTS regtest_sepgsql_test_database; 
-DROP SCHEMA IF EXISTS regtest_schema CASCADE; -NOTICE: drop cascades to 7 other objects -DETAIL: drop cascades to table regtest_table -drop cascades to table regtest_table_2 -drop cascades to view regtest_view -drop cascades to sequence regtest_seq -drop cascades to type regtest_comptype -drop cascades to function regtest_func(text,integer[]) -drop cascades to function regtest_agg(integer) diff --git a/contrib/sepgsql/expected/ddl.out b/contrib/sepgsql/expected/ddl.out new file mode 100644 index 0000000000000..1c7bcc5ca9124 --- /dev/null +++ b/contrib/sepgsql/expected/ddl.out @@ -0,0 +1,164 @@ +-- +-- Regression Test for DDL of Object Permission Checks +-- +-- confirm required permissions using audit messages +SELECT sepgsql_getcon(); -- confirm client privilege + sepgsql_getcon +------------------------------------------- + unconfined_u:unconfined_r:unconfined_t:s0 +(1 row) + +SET sepgsql.debug_audit = true; +SET client_min_messages = LOG; +-- +-- CREATE Permission checks +-- +CREATE DATABASE regtest_sepgsql_test_database; +LOG: SELinux: allowed { getattr } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=system_u:object_r:sepgsql_db_t:s0 tclass=db_database name="database template1" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_db_t:s0 tclass=db_database name="database regtest_sepgsql_test_database" +CREATE USER regtest_sepgsql_test_user; +CREATE SCHEMA regtest_schema; +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +GRANT ALL ON SCHEMA regtest_schema TO regtest_sepgsql_test_user; +SET search_path = regtest_schema, public; +CREATE TABLE regtest_table (x serial primary key, y text); +NOTICE: CREATE TABLE will create implicit sequence "regtest_table_x_seq" for serial column "regtest_table.x" +LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_seq_t:s0 tclass=db_sequence name="sequence regtest_table_x_seq" +LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column tableoid" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column cmax" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column xmax" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column cmin" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 
tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column xmin" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column ctid" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column x" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column y" +NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "regtest_table_pkey" for table "regtest_table" +ALTER TABLE regtest_table ADD COLUMN z int; +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column z" +CREATE TABLE regtest_table_2 (a int) WITH OIDS; +LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table_2" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column tableoid" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column cmax" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column xmax" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column cmin" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column xmin" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column oid" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column ctid" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column a" +-- corresponding toast table should not have label and permission checks +ALTER TABLE regtest_table_2 ADD COLUMN b text; +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column b" +-- VACUUM FULL internally create a new table and swap them later. 
+VACUUM FULL regtest_table; +CREATE VIEW regtest_view AS SELECT * FROM regtest_table WHERE x < 100; +LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_view_t:s0 tclass=db_view name="view regtest_view" +CREATE SEQUENCE regtest_seq; +LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_seq_t:s0 tclass=db_sequence name="sequence regtest_seq" +CREATE TYPE regtest_comptype AS (a int, b text); +CREATE FUNCTION regtest_func(text,int[]) RETURNS bool LANGUAGE plpgsql + AS 'BEGIN RAISE NOTICE ''regtest_func => %'', $1; RETURN true; END'; +LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_proc_exec_t:s0 tclass=db_procedure name="function regtest_func(text,integer[])" +CREATE AGGREGATE regtest_agg ( + sfunc1 = int4pl, basetype = int4, stype1 = int4, initcond1 = '0' +); +LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_proc_exec_t:s0 tclass=db_procedure name="function regtest_agg(integer)" +-- CREATE objects owned by others +SET SESSION AUTHORIZATION regtest_sepgsql_test_user; +SET search_path = regtest_schema, public; +CREATE TABLE regtest_table_3 (x int, y serial); +NOTICE: CREATE TABLE will create implicit sequence "regtest_table_3_y_seq" for serial column "regtest_table_3.y" +LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_seq_t:s0 tclass=db_sequence name="sequence regtest_table_3_y_seq" +LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table_3" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_3 column tableoid" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_3 column cmax" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_3 column xmax" +LOG: SELinux: allowed 
{ create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_3 column cmin" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_3 column xmin" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_3 column ctid" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_3 column x" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_3 column y" +CREATE VIEW regtest_view_2 AS SELECT * FROM regtest_table_3 WHERE x < y; +LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_view_t:s0 tclass=db_view name="view regtest_view_2" +CREATE FUNCTION regtest_func_2(int) RETURNS bool LANGUAGE plpgsql + AS 'BEGIN RETURN $1 * $1 < 100; END'; +LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_proc_exec_t:s0 tclass=db_procedure name="function regtest_func_2(integer)" +RESET SESSION AUTHORIZATION; +-- +-- DROP Permission checks (with clean-up) +-- +DROP FUNCTION regtest_func(text,int[]); +LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_proc_exec_t:s0 tclass=db_procedure name="function regtest_func(text,integer[])" +DROP AGGREGATE regtest_agg(int); +LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_proc_exec_t:s0 tclass=db_procedure name="function regtest_agg(integer)" +DROP SEQUENCE regtest_seq; +LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_seq_t:s0 tclass=db_sequence name="sequence regtest_seq" +DROP VIEW regtest_view; +LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_view_t:s0 tclass=db_view name="view 
regtest_view" +ALTER TABLE regtest_table DROP COLUMN y; +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column y" +ALTER TABLE regtest_table_2 SET WITHOUT OIDS; +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column oid" +DROP TABLE regtest_table; +LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_seq_t:s0 tclass=db_sequence name="sequence regtest_table_x_seq" +LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column tableoid" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column cmax" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column xmax" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column cmin" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column xmin" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column ctid" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column x" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column z" +DROP OWNED BY regtest_sepgsql_test_user; +LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_proc_exec_t:s0 tclass=db_procedure name="function regtest_func_2(integer)" +LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_view_t:s0 tclass=db_view name="view regtest_view_2" +LOG: SELinux: 
allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_seq_t:s0 tclass=db_sequence name="sequence regtest_table_3_y_seq" +LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table_3" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_3 column tableoid" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_3 column cmax" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_3 column xmax" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_3 column cmin" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_3 column xmin" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_3 column ctid" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_3 column x" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_3 column y" +DROP DATABASE regtest_sepgsql_test_database; +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_db_t:s0 tclass=db_database name="database regtest_sepgsql_test_database" +DROP USER regtest_sepgsql_test_user; +DROP SCHEMA IF EXISTS regtest_schema CASCADE; +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to table regtest_table_2 +drop cascades to type regtest_comptype +LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table_2" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column tableoid" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column cmax" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 
tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column xmax" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column cmin" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column xmin" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column ctid" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column a" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_2 column b" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" diff --git a/contrib/sepgsql/hooks.c b/contrib/sepgsql/hooks.c index 70934950e51d3..ffa078677c849 100644 --- a/contrib/sepgsql/hooks.c +++ b/contrib/sepgsql/hooks.c @@ -10,6 +10,7 @@ */ #include "postgres.h" +#include "catalog/dependency.h" #include "catalog/objectaccess.h" #include "catalog/pg_class.h" #include "catalog/pg_database.h" @@ -87,10 +88,11 @@ static void sepgsql_object_access(ObjectAccessType access, Oid classId, Oid objectId, - int subId) + int subId, + void *arg) { if (next_object_access_hook) - (*next_object_access_hook) (access, classId, objectId, subId); + (*next_object_access_hook) (access, classId, objectId, subId, arg); switch (access) { @@ -146,6 +148,46 @@ sepgsql_object_access(ObjectAccessType access, } break; + case OAT_DROP: + { + ObjectAccessDrop *drop_arg = (ObjectAccessDrop *)arg; + + /* + * No need to apply permission checks on object deletion + * due to internal cleanups; such as removal of temporary + * database object on session closed. + */ + if ((drop_arg->dropflags & PERFORM_DELETION_INTERNAL) != 0) + break; + + switch (classId) + { + case DatabaseRelationId: + sepgsql_database_drop(objectId); + break; + + case NamespaceRelationId: + sepgsql_schema_drop(objectId); + break; + + case RelationRelationId: + if (subId == 0) + sepgsql_relation_drop(objectId); + else + sepgsql_attribute_drop(objectId, subId); + break; + + case ProcedureRelationId: + sepgsql_proc_drop(objectId); + break; + + default: + /* Ignore unsupported object classes */ + break; + } + } + break; + default: elog(ERROR, "unexpected object access type: %d", (int) access); break; diff --git a/contrib/sepgsql/proc.c b/contrib/sepgsql/proc.c index b902797d8fb3b..1efbc906c6b63 100644 --- a/contrib/sepgsql/proc.c +++ b/contrib/sepgsql/proc.c @@ -130,6 +130,48 @@ sepgsql_proc_post_create(Oid functionId) pfree(ncontext); } +/* + * sepgsql_proc_drop + * + * It checks privileges to drop the supplied function. 
+ */ +void +sepgsql_proc_drop(Oid functionId) +{ + ObjectAddress object; + char *audit_name; + + /* + * check db_schema:{remove_name} permission + */ + object.classId = NamespaceRelationId; + object.objectId = get_func_namespace(functionId); + object.objectSubId = 0; + audit_name = getObjectDescription(&object); + + sepgsql_avc_check_perms(&object, + SEPG_CLASS_DB_SCHEMA, + SEPG_DB_SCHEMA__REMOVE_NAME, + audit_name, + true); + pfree(audit_name); + + /* + * check db_procedure:{drop} permission + */ + object.classId = ProcedureRelationId; + object.objectId = functionId; + object.objectSubId = 0; + audit_name = getObjectDescription(&object); + + sepgsql_avc_check_perms(&object, + SEPG_CLASS_DB_PROCEDURE, + SEPG_DB_PROCEDURE__DROP, + audit_name, + true); + pfree(audit_name); +} + /* * sepgsql_proc_relabel * diff --git a/contrib/sepgsql/relation.c b/contrib/sepgsql/relation.c index efce9148a5b10..259be492684ff 100644 --- a/contrib/sepgsql/relation.c +++ b/contrib/sepgsql/relation.c @@ -21,6 +21,7 @@ #include "commands/seclabel.h" #include "utils/fmgroids.h" #include "utils/lsyscache.h" +#include "utils/syscache.h" #include "utils/tqual.h" #include "sepgsql.h" @@ -109,6 +110,36 @@ sepgsql_attribute_post_create(Oid relOid, AttrNumber attnum) pfree(ncontext); } +/* + * sepgsql_attribute_drop + * + * It checks privileges to drop the supplied column. + */ +void +sepgsql_attribute_drop(Oid relOid, AttrNumber attnum) +{ + ObjectAddress object; + char *audit_name; + + if (get_rel_relkind(relOid) != RELKIND_RELATION) + return; + + /* + * check db_column:{drop} permission + */ + object.classId = RelationRelationId; + object.objectId = relOid; + object.objectSubId = attnum; + audit_name = getObjectDescription(&object); + + sepgsql_avc_check_perms(&object, + SEPG_CLASS_DB_COLUMN, + SEPG_DB_COLUMN__DROP, + audit_name, + true); + pfree(audit_name); +} + /* * sepgsql_attribute_relabel * @@ -309,6 +340,94 @@ sepgsql_relation_post_create(Oid relOid) heap_close(rel, AccessShareLock); } +/* + * sepgsql_relation_drop + * + * It checks privileges to drop the supplied relation. 
+ */ +void +sepgsql_relation_drop(Oid relOid) +{ + ObjectAddress object; + char *audit_name; + uint16_t tclass = 0; + char relkind; + + relkind = get_rel_relkind(relOid); + if (relkind == RELKIND_RELATION) + tclass = SEPG_CLASS_DB_TABLE; + else if (relkind == RELKIND_SEQUENCE) + tclass = SEPG_CLASS_DB_SEQUENCE; + else if (relkind == RELKIND_VIEW) + tclass = SEPG_CLASS_DB_VIEW; + else + return; + + /* + * check db_schema:{remove_name} permission + */ + object.classId = NamespaceRelationId; + object.objectId = get_rel_namespace(relOid); + object.objectSubId = 0; + audit_name = getObjectDescription(&object); + + sepgsql_avc_check_perms(&object, + SEPG_CLASS_DB_SCHEMA, + SEPG_DB_SCHEMA__REMOVE_NAME, + audit_name, + true); + pfree(audit_name); + + /* + * check db_table/sequence/view:{drop} permission + */ + object.classId = RelationRelationId; + object.objectId = relOid; + object.objectSubId = 0; + audit_name = getObjectDescription(&object); + + sepgsql_avc_check_perms(&object, + tclass, + SEPG_DB_TABLE__DROP, + audit_name, + true); + pfree(audit_name); + + /* + * check db_column:{drop} permission + */ + if (relkind == RELKIND_RELATION) + { + Form_pg_attribute attForm; + CatCList *attrList; + HeapTuple atttup; + int i; + + attrList = SearchSysCacheList1(ATTNUM, ObjectIdGetDatum(relOid)); + for (i=0; i < attrList->n_members; i++) + { + atttup = &attrList->members[i]->tuple; + attForm = (Form_pg_attribute) GETSTRUCT(atttup); + + if (attForm->attisdropped) + continue; + + object.classId = RelationRelationId; + object.objectId = relOid; + object.objectSubId = attForm->attnum; + audit_name = getObjectDescription(&object); + + sepgsql_avc_check_perms(&object, + SEPG_CLASS_DB_COLUMN, + SEPG_DB_COLUMN__DROP, + audit_name, + true); + pfree(audit_name); + } + ReleaseCatCacheList(attrList); + } +} + /* * sepgsql_relation_relabel * diff --git a/contrib/sepgsql/schema.c b/contrib/sepgsql/schema.c index 90dca1d6a75e7..31d60efe18514 100644 --- a/contrib/sepgsql/schema.c +++ b/contrib/sepgsql/schema.c @@ -96,6 +96,33 @@ sepgsql_schema_post_create(Oid namespaceId) pfree(tcontext); } +/* + * sepgsql_schema_drop + * + * It checks privileges to drop the supplied schema object. 
+ */ +void +sepgsql_schema_drop(Oid namespaceId) +{ + ObjectAddress object; + char *audit_name; + + /* + * check db_schema:{drop} permission + */ + object.classId = NamespaceRelationId; + object.objectId = namespaceId; + object.objectSubId = 0; + audit_name = getObjectDescription(&object); + + sepgsql_avc_check_perms(&object, + SEPG_CLASS_DB_SCHEMA, + SEPG_DB_SCHEMA__DROP, + audit_name, + true); + pfree(audit_name); +} + /* * sepgsql_schema_relabel * diff --git a/contrib/sepgsql/sepgsql.h b/contrib/sepgsql/sepgsql.h index 9ce8d2d9c482a..0100a09d49b1b 100644 --- a/contrib/sepgsql/sepgsql.h +++ b/contrib/sepgsql/sepgsql.h @@ -288,27 +288,32 @@ extern bool sepgsql_dml_privileges(List *rangeTabls, bool abort); */ extern void sepgsql_database_post_create(Oid databaseId, const char *dtemplate); +extern void sepgsql_database_drop(Oid databaseId); extern void sepgsql_database_relabel(Oid databaseId, const char *seclabel); /* * schema.c */ extern void sepgsql_schema_post_create(Oid namespaceId); +extern void sepgsql_schema_drop(Oid namespaceId); extern void sepgsql_schema_relabel(Oid namespaceId, const char *seclabel); /* * relation.c */ extern void sepgsql_attribute_post_create(Oid relOid, AttrNumber attnum); +extern void sepgsql_attribute_drop(Oid relOid, AttrNumber attnum); extern void sepgsql_attribute_relabel(Oid relOid, AttrNumber attnum, const char *seclabel); extern void sepgsql_relation_post_create(Oid relOid); +extern void sepgsql_relation_drop(Oid relOid); extern void sepgsql_relation_relabel(Oid relOid, const char *seclabel); /* * proc.c */ extern void sepgsql_proc_post_create(Oid functionId); +extern void sepgsql_proc_drop(Oid functionId); extern void sepgsql_proc_relabel(Oid functionId, const char *seclabel); #endif /* SEPGSQL_H */ diff --git a/contrib/sepgsql/sql/create.sql b/contrib/sepgsql/sql/ddl.sql similarity index 53% rename from contrib/sepgsql/sql/create.sql rename to contrib/sepgsql/sql/ddl.sql index b0695b41a94c5..8dd57e0eaf414 100644 --- a/contrib/sepgsql/sql/create.sql +++ b/contrib/sepgsql/sql/ddl.sql @@ -1,5 +1,5 @@ -- --- Regression Test for Creation of Object Permission Checks +-- Regression Test for DDL of Object Permission Checks -- -- confirm required permissions using audit messages @@ -7,10 +7,17 @@ SET sepgsql.debug_audit = true; SET client_min_messages = LOG; +-- +-- CREATE Permission checks +-- CREATE DATABASE regtest_sepgsql_test_database; +CREATE USER regtest_sepgsql_test_user; + CREATE SCHEMA regtest_schema; +GRANT ALL ON SCHEMA regtest_schema TO regtest_sepgsql_test_user; + SET search_path = regtest_schema, public; CREATE TABLE regtest_table (x serial primary key, y text); @@ -38,9 +45,37 @@ CREATE AGGREGATE regtest_agg ( sfunc1 = int4pl, basetype = int4, stype1 = int4, initcond1 = '0' ); +-- CREATE objects owned by others +SET SESSION AUTHORIZATION regtest_sepgsql_test_user; + +SET search_path = regtest_schema, public; + +CREATE TABLE regtest_table_3 (x int, y serial); + +CREATE VIEW regtest_view_2 AS SELECT * FROM regtest_table_3 WHERE x < y; + +CREATE FUNCTION regtest_func_2(int) RETURNS bool LANGUAGE plpgsql + AS 'BEGIN RETURN $1 * $1 < 100; END'; + +RESET SESSION AUTHORIZATION; + -- --- clean-up +-- DROP Permission checks (with clean-up) -- -DROP DATABASE IF EXISTS regtest_sepgsql_test_database; +DROP FUNCTION regtest_func(text,int[]); +DROP AGGREGATE regtest_agg(int); + +DROP SEQUENCE regtest_seq; +DROP VIEW regtest_view; + +ALTER TABLE regtest_table DROP COLUMN y; +ALTER TABLE regtest_table_2 SET WITHOUT OIDS; + +DROP TABLE regtest_table; + +DROP 
OWNED BY regtest_sepgsql_test_user;
+
+DROP DATABASE regtest_sepgsql_test_database;
+DROP USER regtest_sepgsql_test_user;
 DROP SCHEMA IF EXISTS regtest_schema CASCADE;
diff --git a/contrib/sepgsql/test_sepgsql b/contrib/sepgsql/test_sepgsql
index 52237e6691a6d..473004f6d22e3 100755
--- a/contrib/sepgsql/test_sepgsql
+++ b/contrib/sepgsql/test_sepgsql
@@ -259,6 +259,6 @@ echo "found ${NUM}"
 echo
 echo "============== running sepgsql regression tests =============="
-make REGRESS="label dml create misc" REGRESS_OPTS="--launcher ./launcher" installcheck
+make REGRESS="label dml ddl misc" REGRESS_OPTS="--launcher ./launcher" installcheck
 
 # exit with the exit code provided by "make"
diff --git a/doc/src/sgml/sepgsql.sgml b/doc/src/sgml/sepgsql.sgml
index 68cc6078af2ef..dbddf86bb1c9a 100644
--- a/doc/src/sgml/sepgsql.sgml
+++ b/doc/src/sgml/sepgsql.sgml
@@ -440,6 +440,20 @@ UPDATE t1 SET x = 2, y = md5sum(y) WHERE z = 100;
    on the schema, not only create on the new object itself.
 
+
+   When a DROP command is executed, drop will be
+   checked on the object being removed, for each object type.
+   Note that drop is not checked on objects that are removed
+   by cascaded deletion, per standard SQL behavior.
+
+
+
+   A few additional checks are applied depending on the object type.
+   On deletion of objects contained in a particular schema (tables, views,
+   sequences and procedures), remove_name will also be checked
+   on the schema, in addition to drop on the object being
+   removed itself.
+
 
    When  is executed, setattr and
    relabelfrom will be checked on the object being relabeled

From 39d74e346c083aa371ba64c4edb1332c40b56530 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut
Date: Sat, 10 Mar 2012 20:19:13 +0200
Subject: [PATCH 111/129] Add support for renaming constraints

reviewed by Josh Berkus and Dimitri Fontaine
---
 doc/src/sgml/ref/alter_table.sgml | 16 ++-
 src/backend/commands/alter.c | 4 +
 src/backend/commands/tablecmds.c | 102 ++++++++++++++++
 src/backend/parser/gram.y | 10 ++
 src/include/commands/tablecmds.h | 2 +
 src/test/regress/expected/alter_table.out | 135 ++++++++++++++++++++++
 src/test/regress/sql/alter_table.sql | 40 +++++++
 7 files changed, 306 insertions(+), 3 deletions(-)

diff --git a/doc/src/sgml/ref/alter_table.sgml b/doc/src/sgml/ref/alter_table.sgml
index 951b63b5b4fa4..c3039c8167270 100644
--- a/doc/src/sgml/ref/alter_table.sgml
+++ b/doc/src/sgml/ref/alter_table.sgml
@@ -25,6 +25,8 @@ ALTER TABLE [ IF EXISTS ] [ ONLY ] name
     action [, ... ]
 ALTER TABLE [ IF EXISTS ] [ ONLY ] name [ * ]
     RENAME [ COLUMN ] column TO new_column
+ALTER TABLE [ IF EXISTS ] [ ONLY ] name [ * ]
+    RENAME CONSTRAINT constraint_name TO new_constraint_name
 ALTER TABLE [ IF EXISTS ] name
     RENAME TO new_name
 ALTER TABLE [ IF EXISTS ] name
@@ -569,8 +571,8 @@ ALTER TABLE [ IF EXISTS ] name
     The RENAME forms change the name of a table
-    (or an index, sequence, or view) or the name of an individual column in
-    a table. There is no effect on the stored data.
+    (or an index, sequence, or view), the name of an individual column in
+    a table, or the name of a constraint of the table. There is no effect on the stored data.
@@ -883,7 +885,8 @@ ALTER TABLE [ IF EXISTS ] name
    If a table has any descendant tables, it is not permitted to add,
-   rename, or change the type of a column in the parent table without doing
+   rename, or change the type of a column, or rename an inherited constraint
+   in the parent table without doing
    the same to the descendants. That is, ALTER TABLE ONLY
   will be rejected.
This ensures that the descendants always have columns matching the parent. @@ -982,6 +985,13 @@ ALTER TABLE distributors RENAME TO suppliers; + + To rename an existing constraint: + +ALTER TABLE distributors RENAME CONSTRAINT zipchk TO zip_check; + + + To add a not-null constraint to a column: diff --git a/src/backend/commands/alter.c b/src/backend/commands/alter.c index 9175405af2af5..4dd9927afbafa 100644 --- a/src/backend/commands/alter.c +++ b/src/backend/commands/alter.c @@ -57,6 +57,10 @@ ExecRenameStmt(RenameStmt *stmt) RenameCollation(stmt->object, stmt->newname); break; + case OBJECT_CONSTRAINT: + RenameConstraint(stmt); + break; + case OBJECT_CONVERSION: RenameConversion(stmt->object, stmt->newname); break; diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 25ca356b867ec..9615380f05b36 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -2327,6 +2327,108 @@ renameatt(RenameStmt *stmt) stmt->behavior); } + +/* + * same logic as renameatt_internal + */ +static void +rename_constraint_internal(Oid myrelid, + const char *oldconname, + const char *newconname, + bool recurse, + bool recursing, + int expected_parents) +{ + Relation targetrelation; + Oid constraintOid; + HeapTuple tuple; + Form_pg_constraint con; + + targetrelation = relation_open(myrelid, AccessExclusiveLock); + /* don't tell it whether we're recursing; we allow changing typed tables here */ + renameatt_check(myrelid, RelationGetForm(targetrelation), false); + + constraintOid = get_constraint_oid(myrelid, oldconname, false); + + tuple = SearchSysCache1(CONSTROID, ObjectIdGetDatum(constraintOid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for constraint %u", + constraintOid); + con = (Form_pg_constraint) GETSTRUCT(tuple); + + if (con->contype == CONSTRAINT_CHECK && !con->conisonly) + { + if (recurse) + { + List *child_oids, + *child_numparents; + ListCell *lo, + *li; + + child_oids = find_all_inheritors(myrelid, AccessExclusiveLock, + &child_numparents); + + forboth(lo, child_oids, li, child_numparents) + { + Oid childrelid = lfirst_oid(lo); + int numparents = lfirst_int(li); + + if (childrelid == myrelid) + continue; + + rename_constraint_internal(childrelid, oldconname, newconname, false, true, numparents); + } + } + else + { + if (expected_parents == 0 && + find_inheritance_children(myrelid, NoLock) != NIL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLE_DEFINITION), + errmsg("inherited constraint \"%s\" must be renamed in child tables too", + oldconname))); + } + + if (con->coninhcount > expected_parents) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLE_DEFINITION), + errmsg("cannot rename inherited constraint \"%s\"", + oldconname))); + } + + if (con->conindid + && (con->contype == CONSTRAINT_PRIMARY + || con->contype == CONSTRAINT_UNIQUE + || con->contype == CONSTRAINT_EXCLUSION)) + /* rename the index; this renames the constraint as well */ + RenameRelationInternal(con->conindid, newconname); + else + RenameConstraintById(constraintOid, newconname); + + ReleaseSysCache(tuple); + + relation_close(targetrelation, NoLock); /* close rel but keep lock */ +} + +void +RenameConstraint(RenameStmt *stmt) +{ + Oid relid; + + /* lock level taken here should match rename_constraint_internal */ + relid = RangeVarGetRelidExtended(stmt->relation, AccessExclusiveLock, + false, false, + RangeVarCallbackForRenameAttribute, + NULL); + + rename_constraint_internal(relid, + stmt->subname, + stmt->newname, + 
interpretInhOption(stmt->relation->inhOpt), /* recursive? */ + false, /* recursing? */ + 0 /* expected inhcount */); +} + /* * Execute ALTER TABLE/INDEX/SEQUENCE/VIEW/FOREIGN TABLE RENAME */ diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 9aea2cd80b5b0..feb28a41720f2 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -6731,6 +6731,16 @@ RenameStmt: ALTER AGGREGATE func_name aggr_args RENAME TO name n->missing_ok = true; $$ = (Node *)n; } + | ALTER TABLE relation_expr RENAME CONSTRAINT name TO name + { + RenameStmt *n = makeNode(RenameStmt); + n->renameType = OBJECT_CONSTRAINT; + n->relationType = OBJECT_TABLE; + n->relation = $3; + n->subname = $6; + n->newname = $8; + $$ = (Node *)n; + } | ALTER FOREIGN TABLE relation_expr RENAME opt_column name TO name { RenameStmt *n = makeNode(RenameStmt); diff --git a/src/include/commands/tablecmds.h b/src/include/commands/tablecmds.h index 03f397de6390d..47b0cddc9bb86 100644 --- a/src/include/commands/tablecmds.h +++ b/src/include/commands/tablecmds.h @@ -48,6 +48,8 @@ extern void SetRelationHasSubclass(Oid relationId, bool relhassubclass); extern void renameatt(RenameStmt *stmt); +extern void RenameConstraint(RenameStmt *stmt); + extern void RenameRelation(RenameStmt *stmt); extern void RenameRelationInternal(Oid myrelid, diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 4aba58c450361..eba0493089f93 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -160,6 +160,141 @@ DROP VIEW tmp_view_new; -- toast-like relation name alter table stud_emp rename to pg_toast_stud_emp; alter table pg_toast_stud_emp rename to stud_emp; +-- renaming index should rename constraint as well +ALTER TABLE onek ADD CONSTRAINT onek_unique1_constraint UNIQUE (unique1); +NOTICE: ALTER TABLE / ADD UNIQUE will create implicit index "onek_unique1_constraint" for table "onek" +ALTER INDEX onek_unique1_constraint RENAME TO onek_unique1_constraint_foo; +ALTER TABLE onek DROP CONSTRAINT onek_unique1_constraint_foo; +-- renaming constraint +ALTER TABLE onek ADD CONSTRAINT onek_check_constraint CHECK (unique1 >= 0); +ALTER TABLE onek RENAME CONSTRAINT onek_check_constraint TO onek_check_constraint_foo; +ALTER TABLE onek DROP CONSTRAINT onek_check_constraint_foo; +-- renaming constraint should rename index as well +ALTER TABLE onek ADD CONSTRAINT onek_unique1_constraint UNIQUE (unique1); +NOTICE: ALTER TABLE / ADD UNIQUE will create implicit index "onek_unique1_constraint" for table "onek" +DROP INDEX onek_unique1_constraint; -- to see whether it's there +ERROR: cannot drop index onek_unique1_constraint because constraint onek_unique1_constraint on table onek requires it +HINT: You can drop constraint onek_unique1_constraint on table onek instead. +ALTER TABLE onek RENAME CONSTRAINT onek_unique1_constraint TO onek_unique1_constraint_foo; +DROP INDEX onek_unique1_constraint_foo; -- to see whether it's there +ERROR: cannot drop index onek_unique1_constraint_foo because constraint onek_unique1_constraint_foo on table onek requires it +HINT: You can drop constraint onek_unique1_constraint_foo on table onek instead. +ALTER TABLE onek DROP CONSTRAINT onek_unique1_constraint_foo; +-- renaming constraints vs. 
inheritance +CREATE TABLE constraint_rename_test (a int CONSTRAINT con1 CHECK (a > 0), b int, c int); +\d constraint_rename_test +Table "public.constraint_rename_test" + Column | Type | Modifiers +--------+---------+----------- + a | integer | + b | integer | + c | integer | +Check constraints: + "con1" CHECK (a > 0) + +CREATE TABLE constraint_rename_test2 (a int CONSTRAINT con1 CHECK (a > 0), d int) INHERITS (constraint_rename_test); +NOTICE: merging column "a" with inherited definition +NOTICE: merging constraint "con1" with inherited definition +\d constraint_rename_test2 +Table "public.constraint_rename_test2" + Column | Type | Modifiers +--------+---------+----------- + a | integer | + b | integer | + c | integer | + d | integer | +Check constraints: + "con1" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test2 RENAME CONSTRAINT con1 TO con1foo; -- fail +ERROR: cannot rename inherited constraint "con1" +ALTER TABLE ONLY constraint_rename_test RENAME CONSTRAINT con1 TO con1foo; -- fail +ERROR: inherited constraint "con1" must be renamed in child tables too +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con1 TO con1foo; -- ok +\d constraint_rename_test +Table "public.constraint_rename_test" + Column | Type | Modifiers +--------+---------+----------- + a | integer | + b | integer | + c | integer | +Check constraints: + "con1foo" CHECK (a > 0) +Number of child tables: 1 (Use \d+ to list them.) + +\d constraint_rename_test2 +Table "public.constraint_rename_test2" + Column | Type | Modifiers +--------+---------+----------- + a | integer | + b | integer | + c | integer | + d | integer | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE ONLY constraint_rename_test ADD CONSTRAINT con2 CHECK (b > 0); +ALTER TABLE ONLY constraint_rename_test RENAME CONSTRAINT con2 TO con2foo; -- ok +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con2foo TO con2bar; -- ok +\d constraint_rename_test +Table "public.constraint_rename_test" + Column | Type | Modifiers +--------+---------+----------- + a | integer | + b | integer | + c | integer | +Check constraints: + "con2bar" (ONLY) CHECK (b > 0) + "con1foo" CHECK (a > 0) +Number of child tables: 1 (Use \d+ to list them.) + +\d constraint_rename_test2 +Table "public.constraint_rename_test2" + Column | Type | Modifiers +--------+---------+----------- + a | integer | + b | integer | + c | integer | + d | integer | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test ADD CONSTRAINT con3 PRIMARY KEY (a); +NOTICE: ALTER TABLE / ADD PRIMARY KEY will create implicit index "con3" for table "constraint_rename_test" +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con3 TO con3foo; -- ok +\d constraint_rename_test +Table "public.constraint_rename_test" + Column | Type | Modifiers +--------+---------+----------- + a | integer | not null + b | integer | + c | integer | +Indexes: + "con3foo" PRIMARY KEY, btree (a) +Check constraints: + "con2bar" (ONLY) CHECK (b > 0) + "con1foo" CHECK (a > 0) +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d constraint_rename_test2 +Table "public.constraint_rename_test2" + Column | Type | Modifiers +--------+---------+----------- + a | integer | + b | integer | + c | integer | + d | integer | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +DROP TABLE constraint_rename_test2; +DROP TABLE constraint_rename_test; +ALTER TABLE IF EXISTS constraint_rename_test ADD CONSTRAINT con4 UNIQUE (a); +NOTICE: relation "constraint_rename_test" does not exist, skipping -- FOREIGN KEY CONSTRAINT adding TEST CREATE TABLE tmp2 (a int primary key); NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "tmp2_pkey" for table "tmp2" diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index d4e4c4958d186..50c58d23e1879 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -191,6 +191,46 @@ DROP VIEW tmp_view_new; alter table stud_emp rename to pg_toast_stud_emp; alter table pg_toast_stud_emp rename to stud_emp; +-- renaming index should rename constraint as well +ALTER TABLE onek ADD CONSTRAINT onek_unique1_constraint UNIQUE (unique1); +ALTER INDEX onek_unique1_constraint RENAME TO onek_unique1_constraint_foo; +ALTER TABLE onek DROP CONSTRAINT onek_unique1_constraint_foo; + +-- renaming constraint +ALTER TABLE onek ADD CONSTRAINT onek_check_constraint CHECK (unique1 >= 0); +ALTER TABLE onek RENAME CONSTRAINT onek_check_constraint TO onek_check_constraint_foo; +ALTER TABLE onek DROP CONSTRAINT onek_check_constraint_foo; + +-- renaming constraint should rename index as well +ALTER TABLE onek ADD CONSTRAINT onek_unique1_constraint UNIQUE (unique1); +DROP INDEX onek_unique1_constraint; -- to see whether it's there +ALTER TABLE onek RENAME CONSTRAINT onek_unique1_constraint TO onek_unique1_constraint_foo; +DROP INDEX onek_unique1_constraint_foo; -- to see whether it's there +ALTER TABLE onek DROP CONSTRAINT onek_unique1_constraint_foo; + +-- renaming constraints vs. inheritance +CREATE TABLE constraint_rename_test (a int CONSTRAINT con1 CHECK (a > 0), b int, c int); +\d constraint_rename_test +CREATE TABLE constraint_rename_test2 (a int CONSTRAINT con1 CHECK (a > 0), d int) INHERITS (constraint_rename_test); +\d constraint_rename_test2 +ALTER TABLE constraint_rename_test2 RENAME CONSTRAINT con1 TO con1foo; -- fail +ALTER TABLE ONLY constraint_rename_test RENAME CONSTRAINT con1 TO con1foo; -- fail +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con1 TO con1foo; -- ok +\d constraint_rename_test +\d constraint_rename_test2 +ALTER TABLE ONLY constraint_rename_test ADD CONSTRAINT con2 CHECK (b > 0); +ALTER TABLE ONLY constraint_rename_test RENAME CONSTRAINT con2 TO con2foo; -- ok +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con2foo TO con2bar; -- ok +\d constraint_rename_test +\d constraint_rename_test2 +ALTER TABLE constraint_rename_test ADD CONSTRAINT con3 PRIMARY KEY (a); +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con3 TO con3foo; -- ok +\d constraint_rename_test +\d constraint_rename_test2 +DROP TABLE constraint_rename_test2; +DROP TABLE constraint_rename_test; +ALTER TABLE IF EXISTS constraint_rename_test ADD CONSTRAINT con4 UNIQUE (a); + -- FOREIGN KEY CONSTRAINT adding TEST CREATE TABLE tmp2 (a int primary key); From 03e56f798e365763486b03a2630fbc3190ccd29a Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 10 Mar 2012 18:36:49 -0500 Subject: [PATCH 112/129] Restructure SPGiST opclass interface API to support whole-index scans. 
The original API definition was incapable of supporting whole-index scans because there was no way to invoke leaf-value reconstruction without checking any qual conditions. Also, it was inefficient for multiple-qual-condition scans because value reconstruction got done over again for each qual condition, and because other internal work in the consistent functions likewise had to be done for each qual. To fix these issues, pass the whole scankey array to the opclass consistent functions, instead of only letting them see one item at a time. (Essentially, the loop over scankey entries is now inside the consistent functions not outside them. This makes the consistent functions a bit more complicated, but not unreasonably so.) In itself this commit does nothing except save a few cycles in multiple-qual-condition index scans, since we can't support whole-index scans on SPGiST indexes until nulls are included in the index. However, I consider this a must-fix for 9.2 because once we release it will get very much harder to change the opclass API definition. --- doc/src/sgml/spgist.sgml | 38 ++- src/backend/access/spgist/spgkdtreeproc.c | 188 +++++------ src/backend/access/spgist/spgquadtreeproc.c | 240 +++++++------- src/backend/access/spgist/spgscan.c | 345 ++++++++++---------- src/backend/access/spgist/spgtextproc.c | 218 +++++++------ src/include/access/spgist.h | 8 +- src/include/access/spgist_private.h | 6 +- 7 files changed, 517 insertions(+), 526 deletions(-) diff --git a/doc/src/sgml/spgist.sgml b/doc/src/sgml/spgist.sgml index dcc3cc2d73312..0202dbcdd5a26 100644 --- a/doc/src/sgml/spgist.sgml +++ b/doc/src/sgml/spgist.sgml @@ -439,8 +439,8 @@ CREATE FUNCTION my_inner_consistent(internal, internal) RETURNS void ... typedef struct spgInnerConsistentIn { - StrategyNumber strategy; /* operator strategy number */ - Datum query; /* operator's RHS value */ + ScanKey scankeys; /* array of operators and comparison values */ + int nkeys; /* length of array */ Datum reconstructedValue; /* value reconstructed at parent */ int level; /* current level (counting from zero) */ @@ -463,8 +463,17 @@ typedef struct spgInnerConsistentOut } spgInnerConsistentOut; - strategy and - query describe the index search condition. + The array scankeys, of length nkeys, + describes the index search condition(s). These conditions are + combined with AND — only index entries that satisfy all of + them are interesting. (Note that nkeys = 0 implies + that all index entries satisfy the query.) Usually the consistent + function only cares about the sk_strategy and + sk_argument fields of each array entry, which + respectively give the indexable operator and comparison value. + In particular it is not necessary to check sk_flags to + see if the comparison value is NULL, because the SP-GiST core code + will filter out such conditions. reconstructedValue is the value reconstructed for the parent tuple; it is (Datum) 0 at the root level or if the inner_consistent function did not provide a value at the @@ -527,8 +536,8 @@ CREATE FUNCTION my_leaf_consistent(internal, internal) RETURNS bool ... 
typedef struct spgLeafConsistentIn { - StrategyNumber strategy; /* operator strategy number */ - Datum query; /* operator's RHS value */ + ScanKey scankeys; /* array of operators and comparison values */ + int nkeys; /* length of array */ Datum reconstructedValue; /* value reconstructed at parent */ int level; /* current level (counting from zero) */ @@ -544,8 +553,17 @@ typedef struct spgLeafConsistentOut } spgLeafConsistentOut; - strategy and - query define the index search condition. + The array scankeys, of length nkeys, + describes the index search condition(s). These conditions are + combined with AND — only index entries that satisfy all of + them satisfy the query. (Note that nkeys = 0 implies + that all index entries satisfy the query.) Usually the consistent + function only cares about the sk_strategy and + sk_argument fields of each array entry, which + respectively give the indexable operator and comparison value. + In particular it is not necessary to check sk_flags to + see if the comparison value is NULL, because the SP-GiST core code + will filter out such conditions. reconstructedValue is the value reconstructed for the parent tuple; it is (Datum) 0 at the root level or if the inner_consistent function did not provide a value at the @@ -566,8 +584,8 @@ typedef struct spgLeafConsistentOut leafValue must be set to the value originally supplied to be indexed for this leaf tuple. Also, recheck may be set to true if the match - is uncertain and so the operator must be re-applied to the actual heap - tuple to verify the match. + is uncertain and so the operator(s) must be re-applied to the actual + heap tuple to verify the match. diff --git a/src/backend/access/spgist/spgkdtreeproc.c b/src/backend/access/spgist/spgkdtreeproc.c index eca972a6f0b8e..adfe287581b8c 100644 --- a/src/backend/access/spgist/spgkdtreeproc.c +++ b/src/backend/access/spgist/spgkdtreeproc.c @@ -159,11 +159,10 @@ spg_kd_inner_consistent(PG_FUNCTION_ARGS) { spgInnerConsistentIn *in = (spgInnerConsistentIn *) PG_GETARG_POINTER(0); spgInnerConsistentOut *out = (spgInnerConsistentOut *) PG_GETARG_POINTER(1); - Point *query; - BOX *boxQuery; double coord; + int which; + int i; - query = DatumGetPointP(in->query); Assert(in->hasPrefix); coord = DatumGetFloat8(in->prefixDatum); @@ -171,124 +170,97 @@ spg_kd_inner_consistent(PG_FUNCTION_ARGS) elog(ERROR, "allTheSame should not occur for k-d trees"); Assert(in->nNodes == 2); - out->nodeNumbers = (int *) palloc(sizeof(int) * 2); - out->levelAdds = (int *) palloc(sizeof(int) * 2); - out->levelAdds[0] = 1; - out->levelAdds[1] = 1; - out->nNodes = 0; - switch (in->strategy) + /* "which" is a bitmask of children that satisfy all constraints */ + which = (1 << 1) | (1 << 2); + + for (i = 0; i < in->nkeys; i++) { - case RTLeftStrategyNumber: - out->nNodes = 1; - out->nodeNumbers[0] = 0; - - if ((in->level % 2) == 0 || FPge(query->x, coord)) - { - out->nodeNumbers[1] = 1; - out->nNodes++; - } - break; - case RTRightStrategyNumber: - out->nNodes = 1; - out->nodeNumbers[0] = 1; - - if ((in->level % 2) == 0 || FPle(query->x, coord)) - { - out->nodeNumbers[1] = 0; - out->nNodes++; - } - break; - case RTSameStrategyNumber: - if (in->level % 2) - { - if (FPle(query->x, coord)) - { - out->nodeNumbers[out->nNodes] = 0; - out->nNodes++; - } - if (FPge(query->x, coord)) - { - out->nodeNumbers[out->nNodes] = 1; - out->nNodes++; - } - } - else - { - if (FPle(query->y, coord)) + Point *query = DatumGetPointP(in->scankeys[i].sk_argument); + BOX *boxQuery; + + switch 
(in->scankeys[i].sk_strategy) + { + case RTLeftStrategyNumber: + if ((in->level % 2) != 0 && FPlt(query->x, coord)) + which &= (1 << 1); + break; + case RTRightStrategyNumber: + if ((in->level % 2) != 0 && FPgt(query->x, coord)) + which &= (1 << 2); + break; + case RTSameStrategyNumber: + if ((in->level % 2) != 0) { - out->nodeNumbers[out->nNodes] = 0; - out->nNodes++; + if (FPlt(query->x, coord)) + which &= (1 << 1); + else if (FPgt(query->x, coord)) + which &= (1 << 2); } - if (FPge(query->y, coord)) + else { - out->nodeNumbers[out->nNodes] = 1; - out->nNodes++; + if (FPlt(query->y, coord)) + which &= (1 << 1); + else if (FPgt(query->y, coord)) + which &= (1 << 2); } - } - break; - case RTBelowStrategyNumber: - out->nNodes = 1; - out->nodeNumbers[0] = 0; - - if ((in->level % 2) == 1 || FPge(query->y, coord)) - { - out->nodeNumbers[1] = 1; - out->nNodes++; - } - break; - case RTAboveStrategyNumber: - out->nNodes = 1; - out->nodeNumbers[0] = 1; - - if ((in->level % 2) == 1 || FPle(query->y, coord)) - { - out->nodeNumbers[1] = 0; - out->nNodes++; - } - break; - case RTContainedByStrategyNumber: - - /* - * For this operator, the query is a box not a point. We cheat to - * the extent of assuming that DatumGetPointP won't do anything - * that would be bad for a pointer-to-box. - */ - boxQuery = DatumGetBoxP(in->query); - - out->nNodes = 1; - if (in->level % 2) - { - if (FPlt(boxQuery->high.x, coord)) - out->nodeNumbers[0] = 0; - else if (FPgt(boxQuery->low.x, coord)) - out->nodeNumbers[0] = 1; - else + break; + case RTBelowStrategyNumber: + if ((in->level % 2) == 0 && FPlt(query->y, coord)) + which &= (1 << 1); + break; + case RTAboveStrategyNumber: + if ((in->level % 2) == 0 && FPgt(query->y, coord)) + which &= (1 << 2); + break; + case RTContainedByStrategyNumber: + + /* + * For this operator, the query is a box not a point. We + * cheat to the extent of assuming that DatumGetPointP won't + * do anything that would be bad for a pointer-to-box. 
+ */ + boxQuery = DatumGetBoxP(in->scankeys[i].sk_argument); + + if ((in->level % 2) != 0) { - out->nodeNumbers[0] = 0; - out->nodeNumbers[1] = 1; - out->nNodes = 2; + if (FPlt(boxQuery->high.x, coord)) + which &= (1 << 1); + else if (FPgt(boxQuery->low.x, coord)) + which &= (1 << 2); } - } - else - { - if (FPlt(boxQuery->high.y, coord)) - out->nodeNumbers[0] = 0; - else if (FPgt(boxQuery->low.y, coord)) - out->nodeNumbers[0] = 1; else { - out->nodeNumbers[0] = 0; - out->nodeNumbers[1] = 1; - out->nNodes = 2; + if (FPlt(boxQuery->high.y, coord)) + which &= (1 << 1); + else if (FPgt(boxQuery->low.y, coord)) + which &= (1 << 2); } - } - break; - default: - elog(ERROR, "unrecognized strategy number: %d", in->strategy); - break; + break; + default: + elog(ERROR, "unrecognized strategy number: %d", + in->scankeys[i].sk_strategy); + break; + } + + if (which == 0) + break; /* no need to consider remaining conditions */ } + /* We must descend into the children identified by which */ + out->nodeNumbers = (int *) palloc(sizeof(int) * 2); + out->nNodes = 0; + for (i = 1; i <= 2; i++) + { + if (which & (1 << i)) + out->nodeNumbers[out->nNodes++] = i - 1; + } + + /* Set up level increments, too */ + out->levelAdds = (int *) palloc(sizeof(int) * 2); + out->levelAdds[0] = 1; + out->levelAdds[1] = 1; + PG_RETURN_VOID(); } diff --git a/src/backend/access/spgist/spgquadtreeproc.c b/src/backend/access/spgist/spgquadtreeproc.c index 231749e992dba..10fafe58643b2 100644 --- a/src/backend/access/spgist/spgquadtreeproc.c +++ b/src/backend/access/spgist/spgquadtreeproc.c @@ -190,45 +190,21 @@ spg_quad_picksplit(PG_FUNCTION_ARGS) } -/* Subroutine to fill out->nodeNumbers[] for spg_quad_inner_consistent */ -static void -setNodes(spgInnerConsistentOut *out, bool isAll, int first, int second) -{ - if (isAll) - { - out->nNodes = 4; - out->nodeNumbers[0] = 0; - out->nodeNumbers[1] = 1; - out->nodeNumbers[2] = 2; - out->nodeNumbers[3] = 3; - } - else - { - out->nNodes = 2; - out->nodeNumbers[0] = first - 1; - out->nodeNumbers[1] = second - 1; - } -} - - Datum spg_quad_inner_consistent(PG_FUNCTION_ARGS) { spgInnerConsistentIn *in = (spgInnerConsistentIn *) PG_GETARG_POINTER(0); spgInnerConsistentOut *out = (spgInnerConsistentOut *) PG_GETARG_POINTER(1); - Point *query, - *centroid; - BOX *boxQuery; + Point *centroid; + int which; + int i; - query = DatumGetPointP(in->query); Assert(in->hasPrefix); centroid = DatumGetPointP(in->prefixDatum); if (in->allTheSame) { /* Report that all nodes should be visited */ - int i; - out->nNodes = in->nNodes; out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes); for (i = 0; i < in->nNodes; i++) @@ -237,76 +213,86 @@ spg_quad_inner_consistent(PG_FUNCTION_ARGS) } Assert(in->nNodes == 4); - out->nodeNumbers = (int *) palloc(sizeof(int) * 4); - switch (in->strategy) + /* "which" is a bitmask of quadrants that satisfy all constraints */ + which = (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4); + + for (i = 0; i < in->nkeys; i++) { - case RTLeftStrategyNumber: - setNodes(out, SPTEST(point_left, centroid, query), 3, 4); - break; - case RTRightStrategyNumber: - setNodes(out, SPTEST(point_right, centroid, query), 1, 2); - break; - case RTSameStrategyNumber: - out->nNodes = 1; - out->nodeNumbers[0] = getQuadrant(centroid, query) - 1; - break; - case RTBelowStrategyNumber: - setNodes(out, SPTEST(point_below, centroid, query), 2, 3); - break; - case RTAboveStrategyNumber: - setNodes(out, SPTEST(point_above, centroid, query), 1, 4); - break; - case RTContainedByStrategyNumber: - - /* - * For this 
operator, the query is a box not a point. We cheat to - * the extent of assuming that DatumGetPointP won't do anything - * that would be bad for a pointer-to-box. - */ - boxQuery = DatumGetBoxP(in->query); - - if (DatumGetBool(DirectFunctionCall2(box_contain_pt, - PointerGetDatum(boxQuery), - PointerGetDatum(centroid)))) - { - /* centroid is in box, so descend to all quadrants */ - setNodes(out, true, 0, 0); - } - else - { - /* identify quadrant(s) containing all corners of box */ - Point p; - int i, - r = 0; - - p = boxQuery->low; - r |= 1 << (getQuadrant(centroid, &p) - 1); - - p.y = boxQuery->high.y; - r |= 1 << (getQuadrant(centroid, &p) - 1); - - p = boxQuery->high; - r |= 1 << (getQuadrant(centroid, &p) - 1); - - p.x = boxQuery->low.x; - r |= 1 << (getQuadrant(centroid, &p) - 1); - - /* we must descend into those quadrant(s) */ - out->nNodes = 0; - for (i = 0; i < 4; i++) + Point *query = DatumGetPointP(in->scankeys[i].sk_argument); + BOX *boxQuery; + + switch (in->scankeys[i].sk_strategy) + { + case RTLeftStrategyNumber: + if (SPTEST(point_right, centroid, query)) + which &= (1 << 3) | (1 << 4); + break; + case RTRightStrategyNumber: + if (SPTEST(point_left, centroid, query)) + which &= (1 << 1) | (1 << 2); + break; + case RTSameStrategyNumber: + which &= (1 << getQuadrant(centroid, query)); + break; + case RTBelowStrategyNumber: + if (SPTEST(point_above, centroid, query)) + which &= (1 << 2) | (1 << 3); + break; + case RTAboveStrategyNumber: + if (SPTEST(point_below, centroid, query)) + which &= (1 << 1) | (1 << 4); + break; + case RTContainedByStrategyNumber: + + /* + * For this operator, the query is a box not a point. We + * cheat to the extent of assuming that DatumGetPointP won't + * do anything that would be bad for a pointer-to-box. + */ + boxQuery = DatumGetBoxP(in->scankeys[i].sk_argument); + + if (DatumGetBool(DirectFunctionCall2(box_contain_pt, + PointerGetDatum(boxQuery), + PointerGetDatum(centroid)))) { - if (r & (1 << i)) - { - out->nodeNumbers[out->nNodes] = i; - out->nNodes++; - } + /* centroid is in box, so all quadrants are OK */ } - } - break; - default: - elog(ERROR, "unrecognized strategy number: %d", in->strategy); - break; + else + { + /* identify quadrant(s) containing all corners of box */ + Point p; + int r = 0; + + p = boxQuery->low; + r |= 1 << getQuadrant(centroid, &p); + p.y = boxQuery->high.y; + r |= 1 << getQuadrant(centroid, &p); + p = boxQuery->high; + r |= 1 << getQuadrant(centroid, &p); + p.x = boxQuery->low.x; + r |= 1 << getQuadrant(centroid, &p); + + which &= r; + } + break; + default: + elog(ERROR, "unrecognized strategy number: %d", + in->scankeys[i].sk_strategy); + break; + } + + if (which == 0) + break; /* no need to consider remaining conditions */ + } + + /* We must descend into the quadrant(s) identified by which */ + out->nodeNumbers = (int *) palloc(sizeof(int) * 4); + out->nNodes = 0; + for (i = 1; i <= 4; i++) + { + if (which & (1 << i)) + out->nodeNumbers[out->nNodes++] = i - 1; } PG_RETURN_VOID(); @@ -318,9 +304,9 @@ spg_quad_leaf_consistent(PG_FUNCTION_ARGS) { spgLeafConsistentIn *in = (spgLeafConsistentIn *) PG_GETARG_POINTER(0); spgLeafConsistentOut *out = (spgLeafConsistentOut *) PG_GETARG_POINTER(1); - Point *query = DatumGetPointP(in->query); Point *datum = DatumGetPointP(in->leafDatum); bool res; + int i; /* all tests are exact */ out->recheck = false; @@ -328,35 +314,45 @@ spg_quad_leaf_consistent(PG_FUNCTION_ARGS) /* leafDatum is what it is... 
*/ out->leafValue = in->leafDatum; - switch (in->strategy) + /* Perform the required comparison(s) */ + res = true; + for (i = 0; i < in->nkeys; i++) { - case RTLeftStrategyNumber: - res = SPTEST(point_left, datum, query); - break; - case RTRightStrategyNumber: - res = SPTEST(point_right, datum, query); - break; - case RTSameStrategyNumber: - res = SPTEST(point_eq, datum, query); - break; - case RTBelowStrategyNumber: - res = SPTEST(point_below, datum, query); - break; - case RTAboveStrategyNumber: - res = SPTEST(point_above, datum, query); - break; - case RTContainedByStrategyNumber: - - /* - * For this operator, the query is a box not a point. We cheat to - * the extent of assuming that DatumGetPointP won't do anything - * that would be bad for a pointer-to-box. - */ - res = SPTEST(box_contain_pt, query, datum); - break; - default: - elog(ERROR, "unrecognized strategy number: %d", in->strategy); - res = false; + Point *query = DatumGetPointP(in->scankeys[i].sk_argument); + + switch (in->scankeys[i].sk_strategy) + { + case RTLeftStrategyNumber: + res = SPTEST(point_left, datum, query); + break; + case RTRightStrategyNumber: + res = SPTEST(point_right, datum, query); + break; + case RTSameStrategyNumber: + res = SPTEST(point_eq, datum, query); + break; + case RTBelowStrategyNumber: + res = SPTEST(point_below, datum, query); + break; + case RTAboveStrategyNumber: + res = SPTEST(point_above, datum, query); + break; + case RTContainedByStrategyNumber: + + /* + * For this operator, the query is a box not a point. We + * cheat to the extent of assuming that DatumGetPointP won't + * do anything that would be bad for a pointer-to-box. + */ + res = SPTEST(box_contain_pt, query, datum); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", + in->scankeys[i].sk_strategy); + break; + } + + if (!res) break; } diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c index 22cfcc8792933..99b0852611fbc 100644 --- a/src/backend/access/spgist/spgscan.c +++ b/src/backend/access/spgist/spgscan.c @@ -56,18 +56,25 @@ freeScanStack(SpGistScanOpaque so) } /* - * Initialize scanStack with a single entry for the root page, resetting + * Initialize scanStack to search the root page, resetting * any previously active scan */ static void resetSpGistScanOpaque(SpGistScanOpaque so) { - ScanStackEntry *startEntry = palloc0(sizeof(ScanStackEntry)); - - ItemPointerSet(&startEntry->ptr, SPGIST_HEAD_BLKNO, FirstOffsetNumber); + ScanStackEntry *startEntry; freeScanStack(so); - so->scanStack = list_make1(startEntry); + + Assert(!so->searchNulls); /* XXX fixme */ + + if (so->searchNonNulls) + { + /* Stack a work item to scan the non-null index entries */ + startEntry = (ScanStackEntry *) palloc0(sizeof(ScanStackEntry)); + ItemPointerSet(&startEntry->ptr, SPGIST_HEAD_BLKNO, FirstOffsetNumber); + so->scanStack = list_make1(startEntry); + } if (so->want_itup) { @@ -80,6 +87,82 @@ resetSpGistScanOpaque(SpGistScanOpaque so) so->iPtr = so->nPtrs = 0; } +/* + * Prepare scan keys in SpGistScanOpaque from caller-given scan keys + * + * Sets searchNulls, searchNonNulls, numberOfKeys, keyData fields of *so. + * + * The point here is to eliminate null-related considerations from what the + * opclass consistent functions need to deal with. We assume all SPGiST- + * indexable operators are strict, so any null RHS value makes the scan + * condition unsatisfiable. We also pull out any IS NULL/IS NOT NULL + * conditions; their effect is reflected into searchNulls/searchNonNulls. 
+ */ +static void +spgPrepareScanKeys(IndexScanDesc scan) +{ + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + bool qual_ok; + bool haveIsNull; + bool haveNotNull; + int nkeys; + int i; + + if (scan->numberOfKeys <= 0) + { + /* If no quals, whole-index scan is required */ + so->searchNulls = true; + so->searchNonNulls = true; + so->numberOfKeys = 0; + return; + } + + /* Examine the given quals */ + qual_ok = true; + haveIsNull = haveNotNull = false; + nkeys = 0; + for (i = 0; i < scan->numberOfKeys; i++) + { + ScanKey skey = &scan->keyData[i]; + + if (skey->sk_flags & SK_SEARCHNULL) + haveIsNull = true; + else if (skey->sk_flags & SK_SEARCHNOTNULL) + haveNotNull = true; + else if (skey->sk_flags & SK_ISNULL) + { + /* ordinary qual with null argument - unsatisfiable */ + qual_ok = false; + break; + } + else + { + /* ordinary qual, propagate into so->keyData */ + so->keyData[nkeys++] = *skey; + /* this effectively creates a not-null requirement */ + haveNotNull = true; + } + } + + /* IS NULL in combination with something else is unsatisfiable */ + if (haveIsNull && haveNotNull) + qual_ok = false; + + /* Emit results */ + if (qual_ok) + { + so->searchNulls = haveIsNull; + so->searchNonNulls = haveNotNull; + so->numberOfKeys = nkeys; + } + else + { + so->searchNulls = false; + so->searchNonNulls = false; + so->numberOfKeys = 0; + } +} + Datum spgbeginscan(PG_FUNCTION_ARGS) { @@ -92,13 +175,16 @@ spgbeginscan(PG_FUNCTION_ARGS) scan = RelationGetIndexScan(rel, keysz, 0); so = (SpGistScanOpaque) palloc0(sizeof(SpGistScanOpaqueData)); + if (keysz > 0) + so->keyData = (ScanKey) palloc(sizeof(ScanKeyData) * keysz); + else + so->keyData = NULL; initSpGistState(&so->state, scan->indexRelation); so->tempCxt = AllocSetContextCreate(CurrentMemoryContext, "SP-GiST search temporary context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); - resetSpGistScanOpaque(so); /* Set up indexTupDesc and xs_itupdesc in case it's an index-only scan */ so->indexTupDesc = scan->xs_itupdesc = RelationGetDescr(rel); @@ -115,12 +201,17 @@ spgrescan(PG_FUNCTION_ARGS) SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1); + /* copy scankeys into local storage */ if (scankey && scan->numberOfKeys > 0) { memmove(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData)); } + /* preprocess scankeys, set up the representation in *so */ + spgPrepareScanKeys(scan); + + /* set up starting stack entries */ resetSpGistScanOpaque(so); PG_RETURN_VOID(); @@ -162,53 +253,34 @@ spgLeafTest(Relation index, SpGistScanOpaque so, Datum leafDatum, int level, Datum reconstructedValue, Datum *leafValue, bool *recheck) { - bool result = true; + bool result; spgLeafConsistentIn in; spgLeafConsistentOut out; FmgrInfo *procinfo; MemoryContext oldCtx; - int i; - *leafValue = (Datum) 0; - *recheck = false; + /* use temp context for calling leaf_consistent */ + oldCtx = MemoryContextSwitchTo(so->tempCxt); - /* set up values that are the same for all quals */ + in.scankeys = so->keyData; + in.nkeys = so->numberOfKeys; in.reconstructedValue = reconstructedValue; in.level = level; in.returnData = so->want_itup; in.leafDatum = leafDatum; - /* Apply each leaf consistency check, working in the temp context */ - oldCtx = MemoryContextSwitchTo(so->tempCxt); + out.leafValue = (Datum) 0; + out.recheck = false; procinfo = index_getprocinfo(index, 1, SPGIST_LEAF_CONSISTENT_PROC); + result = DatumGetBool(FunctionCall2Coll(procinfo, + index->rd_indcollation[0], + 
PointerGetDatum(&in), + PointerGetDatum(&out))); - for (i = 0; i < so->numberOfKeys; i++) - { - ScanKey skey = &so->keyData[i]; - - /* Assume SPGiST-indexable operators are strict */ - if (skey->sk_flags & SK_ISNULL) - { - result = false; - break; - } + *leafValue = out.leafValue; + *recheck = out.recheck; - in.strategy = skey->sk_strategy; - in.query = skey->sk_argument; - - out.leafValue = (Datum) 0; - out.recheck = false; - - result = DatumGetBool(FunctionCall2Coll(procinfo, - skey->sk_collation, - PointerGetDatum(&in), - PointerGetDatum(&out))); - *leafValue = out.leafValue; - *recheck |= out.recheck; - if (!result) - break; - } MemoryContextSwitchTo(oldCtx); return result; @@ -349,8 +421,13 @@ spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, else /* page is inner */ { SpGistInnerTuple innerTuple; + spgInnerConsistentIn in; + spgInnerConsistentOut out; + FmgrInfo *procinfo; + SpGistNodeTuple *nodes; SpGistNodeTuple node; int i; + MemoryContext oldCtx; innerTuple = (SpGistInnerTuple) PageGetItem(page, PageGetItemId(page, offset)); @@ -368,144 +445,68 @@ spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, innerTuple->tupstate); } - if (so->numberOfKeys == 0) + /* use temp context for calling inner_consistent */ + oldCtx = MemoryContextSwitchTo(so->tempCxt); + + in.scankeys = so->keyData; + in.nkeys = so->numberOfKeys; + in.reconstructedValue = stackEntry->reconstructedValue; + in.level = stackEntry->level; + in.returnData = so->want_itup; + in.allTheSame = innerTuple->allTheSame; + in.hasPrefix = (innerTuple->prefixSize > 0); + in.prefixDatum = SGITDATUM(innerTuple, &so->state); + in.nNodes = innerTuple->nNodes; + in.nodeLabels = spgExtractNodeLabels(&so->state, innerTuple); + + /* collect node pointers */ + nodes = (SpGistNodeTuple *) palloc(sizeof(SpGistNodeTuple) * in.nNodes); + SGITITERATE(innerTuple, i, node) { - /* - * This case cannot happen at the moment, because we don't - * set pg_am.amoptionalkey for SP-GiST. In order for full - * index scans to produce correct answers, we'd need to - * index nulls, which we don't. - */ - Assert(false); - -#ifdef NOT_USED - /* - * A full index scan could be done approximately like this, - * but note that reconstruction of indexed values would be - * impossible unless the API for inner_consistent is changed. 
- */ - SGITITERATE(innerTuple, i, node) - { - if (ItemPointerIsValid(&node->t_tid)) - { - ScanStackEntry *newEntry = palloc(sizeof(ScanStackEntry)); - - newEntry->ptr = node->t_tid; - newEntry->level = -1; - newEntry->reconstructedValue = (Datum) 0; - so->scanStack = lcons(newEntry, so->scanStack); - } - } -#endif + nodes[i] = node; } - else - { - spgInnerConsistentIn in; - spgInnerConsistentOut out; - FmgrInfo *procinfo; - SpGistNodeTuple *nodes; - int *andMap; - int *levelAdds; - Datum *reconstructedValues; - int j, - nMatches = 0; - MemoryContext oldCtx; - - /* use temp context for calling inner_consistent */ - oldCtx = MemoryContextSwitchTo(so->tempCxt); - - /* set up values that are the same for all scankeys */ - in.reconstructedValue = stackEntry->reconstructedValue; - in.level = stackEntry->level; - in.returnData = so->want_itup; - in.allTheSame = innerTuple->allTheSame; - in.hasPrefix = (innerTuple->prefixSize > 0); - in.prefixDatum = SGITDATUM(innerTuple, &so->state); - in.nNodes = innerTuple->nNodes; - in.nodeLabels = spgExtractNodeLabels(&so->state, innerTuple); - - /* collect node pointers */ - nodes = (SpGistNodeTuple *) palloc(sizeof(SpGistNodeTuple) * in.nNodes); - SGITITERATE(innerTuple, i, node) - { - nodes[i] = node; - } - andMap = (int *) palloc0(sizeof(int) * in.nNodes); - levelAdds = (int *) palloc0(sizeof(int) * in.nNodes); - reconstructedValues = (Datum *) palloc0(sizeof(Datum) * in.nNodes); + memset(&out, 0, sizeof(out)); - procinfo = index_getprocinfo(index, 1, SPGIST_INNER_CONSISTENT_PROC); + procinfo = index_getprocinfo(index, 1, SPGIST_INNER_CONSISTENT_PROC); + FunctionCall2Coll(procinfo, + index->rd_indcollation[0], + PointerGetDatum(&in), + PointerGetDatum(&out)); - for (j = 0; j < so->numberOfKeys; j++) - { - ScanKey skey = &so->keyData[j]; - - /* Assume SPGiST-indexable operators are strict */ - if (skey->sk_flags & SK_ISNULL) - { - nMatches = 0; - break; - } - - in.strategy = skey->sk_strategy; - in.query = skey->sk_argument; + MemoryContextSwitchTo(oldCtx); - memset(&out, 0, sizeof(out)); - - FunctionCall2Coll(procinfo, - skey->sk_collation, - PointerGetDatum(&in), - PointerGetDatum(&out)); - - /* If allTheSame, they should all or none of 'em match */ - if (innerTuple->allTheSame) - if (out.nNodes != 0 && out.nNodes != in.nNodes) - elog(ERROR, "inconsistent inner_consistent results for allTheSame inner tuple"); - - nMatches = 0; - for (i = 0; i < out.nNodes; i++) - { - int nodeN = out.nodeNumbers[i]; - - andMap[nodeN]++; - if (andMap[nodeN] == j + 1) - nMatches++; - if (out.levelAdds) - levelAdds[nodeN] = out.levelAdds[i]; - if (out.reconstructedValues) - reconstructedValues[nodeN] = out.reconstructedValues[i]; - } + /* If allTheSame, they should all or none of 'em match */ + if (innerTuple->allTheSame) + if (out.nNodes != 0 && out.nNodes != in.nNodes) + elog(ERROR, "inconsistent inner_consistent results for allTheSame inner tuple"); - /* quit as soon as all nodes have failed some qual */ - if (nMatches == 0) - break; - } - - MemoryContextSwitchTo(oldCtx); + for (i = 0; i < out.nNodes; i++) + { + int nodeN = out.nodeNumbers[i]; - if (nMatches > 0) + Assert(nodeN >= 0 && nodeN < in.nNodes); + if (ItemPointerIsValid(&nodes[nodeN]->t_tid)) { - for (i = 0; i < in.nNodes; i++) - { - if (andMap[i] == so->numberOfKeys && - ItemPointerIsValid(&nodes[i]->t_tid)) - { - ScanStackEntry *newEntry; - - /* Create new work item for this node */ - newEntry = palloc(sizeof(ScanStackEntry)); - newEntry->ptr = nodes[i]->t_tid; - newEntry->level = stackEntry->level + 
levelAdds[i]; - /* Must copy value out of temp context */ - newEntry->reconstructedValue = - datumCopy(reconstructedValues[i], - so->state.attType.attbyval, - so->state.attType.attlen); - - so->scanStack = lcons(newEntry, so->scanStack); - } - } + ScanStackEntry *newEntry; + + /* Create new work item for this node */ + newEntry = palloc(sizeof(ScanStackEntry)); + newEntry->ptr = nodes[nodeN]->t_tid; + if (out.levelAdds) + newEntry->level = stackEntry->level + out.levelAdds[i]; + else + newEntry->level = stackEntry->level; + /* Must copy value out of temp context */ + if (out.reconstructedValues) + newEntry->reconstructedValue = + datumCopy(out.reconstructedValues[i], + so->state.attType.attbyval, + so->state.attType.attlen); + else + newEntry->reconstructedValue = (Datum) 0; + + so->scanStack = lcons(newEntry, so->scanStack); } } } @@ -536,10 +537,7 @@ spggetbitmap(PG_FUNCTION_ARGS) TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1); SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; - /* Copy scankey to *so so we don't need to pass it around separately */ - so->numberOfKeys = scan->numberOfKeys; - so->keyData = scan->keyData; - /* Ditto for the want_itup flag */ + /* Copy want_itup to *so so we don't need to pass it around separately */ so->want_itup = false; so->tbm = tbm; @@ -583,10 +581,7 @@ spggettuple(PG_FUNCTION_ARGS) if (dir != ForwardScanDirection) elog(ERROR, "SP-GiST only supports forward scan direction"); - /* Copy scankey to *so so we don't need to pass it around separately */ - so->numberOfKeys = scan->numberOfKeys; - so->keyData = scan->keyData; - /* Ditto for the want_itup flag */ + /* Copy want_itup to *so so we don't need to pass it around separately */ so->want_itup = scan->xs_want_itup; for (;;) diff --git a/src/backend/access/spgist/spgtextproc.c b/src/backend/access/spgist/spgtextproc.c index b194fc1b13e52..656015ea7e659 100644 --- a/src/backend/access/spgist/spgtextproc.c +++ b/src/backend/access/spgist/spgtextproc.c @@ -362,25 +362,12 @@ spg_text_inner_consistent(PG_FUNCTION_ARGS) { spgInnerConsistentIn *in = (spgInnerConsistentIn *) PG_GETARG_POINTER(0); spgInnerConsistentOut *out = (spgInnerConsistentOut *) PG_GETARG_POINTER(1); - StrategyNumber strategy = in->strategy; - text *inText; - int inSize; - int i; + bool collate_is_c = lc_collate_is_c(PG_GET_COLLATION()); text *reconstrText = NULL; int maxReconstrLen = 0; text *prefixText = NULL; int prefixSize = 0; - - /* - * If it's a collation-aware operator, but the collation is C, we can - * treat it as non-collation-aware. 
- */ - if (strategy > 10 && - lc_collate_is_c(PG_GET_COLLATION())) - strategy -= 10; - - inText = DatumGetTextPP(in->query); - inSize = VARSIZE_ANY_EXHDR(inText); + int i; /* * Reconstruct values represented at this tuple, including parent data, @@ -431,8 +418,8 @@ spg_text_inner_consistent(PG_FUNCTION_ARGS) { uint8 nodeChar = DatumGetUInt8(in->nodeLabels[i]); int thisLen; - int r; - bool res = false; + bool res = true; + int j; /* If nodeChar is zero, don't include it in data */ if (nodeChar == '\0') @@ -443,38 +430,57 @@ spg_text_inner_consistent(PG_FUNCTION_ARGS) thisLen = maxReconstrLen; } - r = memcmp(VARDATA(reconstrText), VARDATA_ANY(inText), - Min(inSize, thisLen)); - - switch (strategy) + for (j = 0; j < in->nkeys; j++) { - case BTLessStrategyNumber: - case BTLessEqualStrategyNumber: - if (r <= 0) - res = true; - break; - case BTEqualStrategyNumber: - if (r == 0 && inSize >= thisLen) - res = true; - break; - case BTGreaterEqualStrategyNumber: - case BTGreaterStrategyNumber: - if (r >= 0) - res = true; - break; - case BTLessStrategyNumber + 10: - case BTLessEqualStrategyNumber + 10: - case BTGreaterEqualStrategyNumber + 10: - case BTGreaterStrategyNumber + 10: - /* - * with non-C collation we need to traverse whole tree :-( - */ - res = true; - break; - default: - elog(ERROR, "unrecognized strategy number: %d", - in->strategy); - break; + StrategyNumber strategy = in->scankeys[j].sk_strategy; + text *inText; + int inSize; + int r; + + /* + * If it's a collation-aware operator, but the collation is C, we + * can treat it as non-collation-aware. With non-C collation we + * need to traverse whole tree :-( so there's no point in making + * any check here. + */ + if (strategy > 10) + { + if (collate_is_c) + strategy -= 10; + else + continue; + } + + inText = DatumGetTextPP(in->scankeys[j].sk_argument); + inSize = VARSIZE_ANY_EXHDR(inText); + + r = memcmp(VARDATA(reconstrText), VARDATA_ANY(inText), + Min(inSize, thisLen)); + + switch (strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + if (r > 0) + res = false; + break; + case BTEqualStrategyNumber: + if (r != 0 || inSize < thisLen) + res = false; + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + if (r < 0) + res = false; + break; + default: + elog(ERROR, "unrecognized strategy number: %d", + in->scankeys[j].sk_strategy); + break; + } + + if (!res) + break; /* no need to consider remaining conditions */ } if (res) @@ -496,16 +502,13 @@ spg_text_leaf_consistent(PG_FUNCTION_ARGS) { spgLeafConsistentIn *in = (spgLeafConsistentIn *) PG_GETARG_POINTER(0); spgLeafConsistentOut *out = (spgLeafConsistentOut *) PG_GETARG_POINTER(1); - StrategyNumber strategy = in->strategy; - text *query = DatumGetTextPP(in->query); int level = in->level; text *leafValue, *reconstrValue = NULL; char *fullValue; int fullLen; - int queryLen; - int r; bool res; + int j; /* all tests are exact */ out->recheck = false; @@ -518,18 +521,8 @@ spg_text_leaf_consistent(PG_FUNCTION_ARGS) Assert(level == 0 ? 
reconstrValue == NULL : VARSIZE_ANY_EXHDR(reconstrValue) == level); + /* Reconstruct the full string represented by this leaf tuple */ fullLen = level + VARSIZE_ANY_EXHDR(leafValue); - - queryLen = VARSIZE_ANY_EXHDR(query); - - /* - * For an equality check, we needn't reconstruct fullValue if not same - * length; it can't match - */ - if (strategy == BTEqualStrategyNumber && queryLen != fullLen) - PG_RETURN_BOOL(false); - - /* Else, reconstruct the full string represented by this leaf tuple */ if (VARSIZE_ANY_EXHDR(leafValue) == 0 && level > 0) { fullValue = VARDATA(reconstrValue); @@ -549,54 +542,67 @@ spg_text_leaf_consistent(PG_FUNCTION_ARGS) out->leafValue = PointerGetDatum(fullText); } - /* Run the appropriate type of comparison */ - if (strategy > 10) + /* Perform the required comparison(s) */ + res = true; + for (j = 0; j < in->nkeys; j++) { - /* Collation-aware comparison */ - strategy -= 10; + StrategyNumber strategy = in->scankeys[j].sk_strategy; + text *query = DatumGetTextPP(in->scankeys[j].sk_argument); + int queryLen = VARSIZE_ANY_EXHDR(query); + int r; - /* If asserts are enabled, verify encoding of reconstructed string */ - Assert(pg_verifymbstr(fullValue, fullLen, false)); + if (strategy > 10) + { + /* Collation-aware comparison */ + strategy -= 10; - r = varstr_cmp(fullValue, Min(queryLen, fullLen), - VARDATA_ANY(query), Min(queryLen, fullLen), - PG_GET_COLLATION()); - } - else - { - /* Non-collation-aware comparison */ - r = memcmp(fullValue, VARDATA_ANY(query), Min(queryLen, fullLen)); - } + /* If asserts enabled, verify encoding of reconstructed string */ + Assert(pg_verifymbstr(fullValue, fullLen, false)); - if (r == 0) - { - if (queryLen > fullLen) - r = -1; - else if (queryLen < fullLen) - r = 1; - } + r = varstr_cmp(fullValue, Min(queryLen, fullLen), + VARDATA_ANY(query), Min(queryLen, fullLen), + PG_GET_COLLATION()); + } + else + { + /* Non-collation-aware comparison */ + r = memcmp(fullValue, VARDATA_ANY(query), Min(queryLen, fullLen)); + } - switch (strategy) - { - case BTLessStrategyNumber: - res = (r < 0); - break; - case BTLessEqualStrategyNumber: - res = (r <= 0); - break; - case BTEqualStrategyNumber: - res = (r == 0); - break; - case BTGreaterEqualStrategyNumber: - res = (r >= 0); - break; - case BTGreaterStrategyNumber: - res = (r > 0); - break; - default: - elog(ERROR, "unrecognized strategy number: %d", in->strategy); - res = false; - break; + if (r == 0) + { + if (queryLen > fullLen) + r = -1; + else if (queryLen < fullLen) + r = 1; + } + + switch (strategy) + { + case BTLessStrategyNumber: + res = (r < 0); + break; + case BTLessEqualStrategyNumber: + res = (r <= 0); + break; + case BTEqualStrategyNumber: + res = (r == 0); + break; + case BTGreaterEqualStrategyNumber: + res = (r >= 0); + break; + case BTGreaterStrategyNumber: + res = (r > 0); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", + in->scankeys[j].sk_strategy); + res = false; + break; + } + + if (!res) + break; /* no need to consider remaining conditions */ } PG_RETURN_BOOL(res); diff --git a/src/include/access/spgist.h b/src/include/access/spgist.h index cd6de2c98da86..8d0205e691f2d 100644 --- a/src/include/access/spgist.h +++ b/src/include/access/spgist.h @@ -128,8 +128,8 @@ typedef struct spgPickSplitOut */ typedef struct spgInnerConsistentIn { - StrategyNumber strategy; /* operator strategy number */ - Datum query; /* operator's RHS value */ + ScanKey scankeys; /* array of operators and comparison values */ + int nkeys; /* length of array */ Datum reconstructedValue; 
/* value reconstructed at parent */ int level; /* current level (counting from zero) */ @@ -156,8 +156,8 @@ typedef struct spgInnerConsistentOut */ typedef struct spgLeafConsistentIn { - StrategyNumber strategy; /* operator strategy number */ - Datum query; /* operator's RHS value */ + ScanKey scankeys; /* array of operators and comparison values */ + int nkeys; /* length of array */ Datum reconstructedValue; /* value reconstructed at parent */ int level; /* current level (counting from zero) */ diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h index fa23acf6cdeaf..76ea5a1578fc8 100644 --- a/src/include/access/spgist_private.h +++ b/src/include/access/spgist_private.h @@ -126,7 +126,11 @@ typedef struct SpGistScanOpaqueData SpGistState state; /* see above */ MemoryContext tempCxt; /* short-lived memory context */ - /* Index quals for scan (copied from IndexScanDesc for convenience) */ + /* Control flags showing whether to search nulls and/or non-nulls */ + bool searchNulls; /* scan matches (all) null entries */ + bool searchNonNulls; /* scan matches (some) non-null entries */ + + /* Index quals to be passed to opclass (null-related quals removed) */ int numberOfKeys; /* number of index qualifier conditions */ ScanKey keyData; /* array of index qualifier descriptors */ From 86947e666d39229558311d7b0be45608fd071ed8 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Sun, 11 Mar 2012 01:52:05 +0200 Subject: [PATCH 113/129] Add more detail to error message for invalid arguments for server process It now prints the argument that was at fault. Also fix a small misbehavior where the error message issued by getopt() would complain about a program named "--single", because that's what argv[0] is in the server process. --- src/backend/tcop/postgres.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 49a396918d9bd..397c0734c2900 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3190,6 +3190,13 @@ process_postgres_switches(int argc, char *argv[], GucContext ctx) gucsource = PGC_S_CLIENT; /* switches came from client */ } +#ifdef HAVE_INT_OPTERR + /* Turn this off because it's either printed to stderr and not the log + * where we'd want it, or argv[0] is now "--single", which would make for a + * weird error message. We print our own error message below. */ + opterr = 0; +#endif + /* * Parse command-line options. CAUTION: keep this in sync with * postmaster/postmaster.c (the option sets should not conflict) and with @@ -3363,33 +3370,39 @@ process_postgres_switches(int argc, char *argv[], GucContext ctx) errs++; break; } + + if (errs) + break; } /* * Should be no more arguments except an optional database name, and * that's only in the secure case. 
*/ - if (errs || argc - optind > 1 || (argc != optind && !secure)) + if (!errs && secure && argc - optind >= 1) + dbname = strdup(argv[optind++]); + else + dbname = NULL; + + if (errs || argc != optind) { + if (errs) + optind--; /* complain about the previous argument */ + /* spell the error message a bit differently depending on context */ if (IsUnderPostmaster) ereport(FATAL, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("invalid command-line arguments for server process"), + errmsg("invalid command-line argument for server process: %s", argv[optind]), errhint("Try \"%s --help\" for more information.", progname))); else ereport(FATAL, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("%s: invalid command-line arguments", - progname), + errmsg("%s: invalid command-line argument: %s", + progname, argv[optind]), errhint("Try \"%s --help\" for more information.", progname))); } - if (argc - optind == 1) - dbname = strdup(argv[optind]); - else - dbname = NULL; - /* * Reset getopt(3) library so that it will work correctly in subprocesses * or when this function is called a second time with another array. From da9e73a137871ab8ebe432ea443193c336bfc81a Mon Sep 17 00:00:00 2001 From: Tatsuo Ishii Date: Sun, 11 Mar 2012 08:23:20 +0900 Subject: [PATCH 114/129] Add description for --no-locale and --text-search-config. --- doc/src/sgml/ref/initdb.sgml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index 81f4d58a859f8..5570562a5cddf 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -208,6 +208,15 @@ PostgreSQL documentation + + + + + Equivalent to . + + + + @@ -217,6 +226,17 @@ PostgreSQL documentation + + + + + + Sets the default text search configuration. + See for further information. + + + + From fc227a4e3b84f7bc243c4606780dde28aea257ee Mon Sep 17 00:00:00 2001 From: Michael Meskes Date: Sun, 11 Mar 2012 12:25:52 +0100 Subject: [PATCH 115/129] Removed redundant "the" from ecpg's docs. Typo spotted by Erik Rijkers. --- doc/src/sgml/ecpg.sgml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/src/sgml/ecpg.sgml b/doc/src/sgml/ecpg.sgml index dedd886f65e80..2c5c330023bc7 100644 --- a/doc/src/sgml/ecpg.sgml +++ b/doc/src/sgml/ecpg.sgml @@ -4038,7 +4038,7 @@ typedef struct sqlvar_struct sqlvar_t; sqlname - The the name of the field. + The name of the field. From c6a11b89e48dfb47b305cea405924333dabc20b6 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 11 Mar 2012 16:29:04 -0400 Subject: [PATCH 116/129] Teach SPGiST to store nulls and do whole-index scans. This patch fixes the other major compatibility-breaking limitation of SPGiST, that it didn't store anything for null values of the indexed column, and so could not support whole-index scans or "x IS NULL" tests. The approach is to create a wholly separate search tree for the null entries, and use fixed "allTheSame" insertion and search rules when processing this tree, instead of calling the index opclass methods. This way the opclass methods do not need to worry about dealing with nulls. Catversion bump is for pg_am updates as well as the change in on-disk format of SPGiST indexes; there are some tweaks in SPGiST WAL records as well. Heavily rewritten version of a patch by Oleg Bartunov and Teodor Sigaev. 
(The original also stored nulls separately, but it reused GIN code to do so; which required undesirable compromises in the on-disk format, and would likely lead to bugs due to the GIN code being required to work in two very different contexts.) --- doc/src/sgml/spgist.sgml | 12 ++ src/backend/access/spgist/README | 32 +++- src/backend/access/spgist/spgdoinsert.c | 191 ++++++++++++++------- src/backend/access/spgist/spginsert.c | 48 ++++-- src/backend/access/spgist/spgscan.c | 77 ++++++--- src/backend/access/spgist/spgutils.c | 76 ++++---- src/backend/access/spgist/spgvacuum.c | 17 +- src/backend/access/spgist/spgxlog.c | 37 ++-- src/include/access/spgist_private.h | 62 +++++-- src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_am.h | 2 +- src/test/regress/expected/create_index.out | 112 ++++++++++++ src/test/regress/sql/create_index.sql | 32 ++++ 13 files changed, 534 insertions(+), 166 deletions(-) diff --git a/doc/src/sgml/spgist.sgml b/doc/src/sgml/spgist.sgml index 0202dbcdd5a26..fd312cf4368ea 100644 --- a/doc/src/sgml/spgist.sgml +++ b/doc/src/sgml/spgist.sgml @@ -100,6 +100,18 @@ value when that is needed. + + + The SP-GiST core code takes care of NULL entries. + Although SP-GiST indexes do store entries for nulls + in indexed columns, this is hidden from the index operator class code: + no null index entries or search conditions will ever be passed to the + operator class methods. (It is assumed that SP-GiST + operators are strict and so cannot succeed for NULL values.) NULLs + are therefore not discussed further here. + + + There are five user-defined methods that an index operator class for SP-GiST must provide. All five follow the convention diff --git a/src/backend/access/spgist/README b/src/backend/access/spgist/README index 4ff0e357cb423..d20ad17a4b669 100644 --- a/src/backend/access/spgist/README +++ b/src/backend/access/spgist/README @@ -11,6 +11,7 @@ should have a high fanout to minimize I/O. The challenge is to map tree nodes to disk pages in such a way that the search algorithm accesses only a few disk pages, even if it traverses many nodes. + COMMON STRUCTURE DESCRIPTION Logically, an SP-GiST tree is a set of tuples, each of which can be either @@ -71,6 +72,21 @@ Leaf tuple consists of: ItemPointer to the heap + +NULLS HANDLING + +We assume that SPGiST-indexable operators are strict (can never succeed for +null inputs). It is still desirable to index nulls, so that whole-table +indexscans are possible and so that "x IS NULL" can be implemented by an +SPGiST indexscan. However, we prefer that SPGiST index opclasses not have +to cope with nulls. Therefore, the main tree of an SPGiST index does not +include any null entries. We store null entries in a separate SPGiST tree +occupying a disjoint set of pages (in particular, its own root page). +Insertions and searches in the nulls tree do not use any of the +opclass-supplied functions, but just use hardwired logic comparable to +AllTheSame cases in the normal tree. + + INSERTION ALGORITHM Insertion algorithm is designed to keep the tree in a consistent state at @@ -181,6 +197,7 @@ described in (5). and a new tuple to another page, if the list is short enough. This improves space utilization, but doesn't change the basis of the algorithm. 
+ CONCURRENCY While descending the tree, the insertion algorithm holds exclusive lock on @@ -218,6 +235,7 @@ scan that had already visited the parent level could possibly reach such a redirect tuple, so we can remove redirects once all active transactions have been flushed out of the system. + DEAD TUPLES Tuples on leaf pages can be in one of four states: @@ -269,6 +287,7 @@ to PLACEHOLDER status by VACUUM, and are then candidates for replacement. DEAD state is not currently possible, since VACUUM does not attempt to remove unused inner tuples. + VACUUM VACUUM (or more precisely, spgbulkdelete) performs a single sequential scan @@ -302,13 +321,16 @@ performed; otherwise, it does an spgbulkdelete scan with an empty target list, so as to clean up redirections and placeholders, update the free space map, and gather statistics. + LAST USED PAGE MANAGEMENT -List of last used pages contains four pages - a leaf page and three inner -pages, one from each "triple parity" group. This list is stored between -calls on the index meta page, but updates are never WAL-logged to decrease -WAL traffic. Incorrect data on meta page isn't critical, because we could -allocate a new page at any moment. +The list of last used pages contains four pages - a leaf page and three +inner pages, one from each "triple parity" group. (Actually, there's one +such list for the main tree and a separate one for the nulls tree.) This +list is stored between calls on the index meta page, but updates are never +WAL-logged to decrease WAL traffic. Incorrect data on meta page isn't +critical, because we could allocate a new page at any moment. + AUTHORS diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c index 85704762a6f24..5ddb6672c5c8c 100644 --- a/src/backend/access/spgist/spgdoinsert.c +++ b/src/backend/access/spgist/spgdoinsert.c @@ -200,7 +200,7 @@ saveNodeLink(Relation index, SPPageDesc *parent, */ static void addLeafTuple(Relation index, SpGistState *state, SpGistLeafTuple leafTuple, - SPPageDesc *current, SPPageDesc *parent, bool isNew) + SPPageDesc *current, SPPageDesc *parent, bool isNulls, bool isNew) { XLogRecData rdata[4]; spgxlogAddLeaf xlrec; @@ -208,6 +208,7 @@ addLeafTuple(Relation index, SpGistState *state, SpGistLeafTuple leafTuple, xlrec.node = index->rd_node; xlrec.blknoLeaf = current->blkno; xlrec.newPage = isNew; + xlrec.storesNulls = isNulls; /* these will be filled below as needed */ xlrec.offnumLeaf = InvalidOffsetNumber; @@ -224,7 +225,7 @@ addLeafTuple(Relation index, SpGistState *state, SpGistLeafTuple leafTuple, START_CRIT_SECTION(); if (current->offnum == InvalidOffsetNumber || - current->blkno == SPGIST_HEAD_BLKNO) + SpGistBlockIsRoot(current->blkno)) { /* Tuple is not part of a chain */ leafTuple->nextOffset = InvalidOffsetNumber; @@ -337,7 +338,7 @@ checkSplitConditions(Relation index, SpGistState *state, n = 0, totalSize = 0; - if (current->blkno == SPGIST_HEAD_BLKNO) + if (SpGistBlockIsRoot(current->blkno)) { /* return impossible values to force split */ *nToSplit = BLCKSZ; @@ -386,7 +387,7 @@ checkSplitConditions(Relation index, SpGistState *state, static void moveLeafs(Relation index, SpGistState *state, SPPageDesc *current, SPPageDesc *parent, - SpGistLeafTuple newLeafTuple) + SpGistLeafTuple newLeafTuple, bool isNulls) { int i, nDelete, @@ -451,7 +452,8 @@ moveLeafs(Relation index, SpGistState *state, } /* Find a leaf page that will hold them */ - nbuf = SpGistGetBuffer(index, GBUF_LEAF, size, &xlrec.newPage); + nbuf = SpGistGetBuffer(index, 
GBUF_LEAF | (isNulls ? GBUF_NULLS : 0), + size, &xlrec.newPage); npage = BufferGetPage(nbuf); nblkno = BufferGetBlockNumber(nbuf); Assert(nblkno != current->blkno); @@ -464,6 +466,7 @@ moveLeafs(Relation index, SpGistState *state, xlrec.blknoDst = nblkno; xlrec.nMoves = nDelete; xlrec.replaceDead = replaceDead; + xlrec.storesNulls = isNulls; xlrec.blknoParent = parent->blkno; xlrec.offnumParent = parent->offnum; @@ -584,6 +587,8 @@ setRedirectionTuple(SPPageDesc *current, OffsetNumber position, * If so, randomly divide the tuples into several nodes (all with the same * label) and return TRUE to select allTheSame mode for this inner tuple. * + * (This code is also used to forcibly select allTheSame mode for nulls.) + * * If we know that the leaf tuples wouldn't all fit on one page, then we * exclude the last tuple (which is the incoming new tuple that forced a split) * from the check to see if more than one node is used. The reason for this @@ -674,7 +679,8 @@ checkAllTheSame(spgPickSplitIn *in, spgPickSplitOut *out, bool tooBig, static bool doPickSplit(Relation index, SpGistState *state, SPPageDesc *current, SPPageDesc *parent, - SpGistLeafTuple newLeafTuple, int level, bool isNew) + SpGistLeafTuple newLeafTuple, + int level, bool isNulls, bool isNew) { bool insertedNew = false; spgPickSplitIn in; @@ -733,11 +739,18 @@ doPickSplit(Relation index, SpGistState *state, * also, count up the amount of space that will be freed from current. * (Note that in the non-root case, we won't actually delete the old * tuples, only replace them with redirects or placeholders.) + * + * Note: the SGLTDATUM calls here are safe even when dealing with a nulls + * page. For a pass-by-value data type we will fetch a word that must + * exist even though it may contain garbage (because of the fact that leaf + * tuples must have size at least SGDTSIZE). For a pass-by-reference type + * we are just computing a pointer that isn't going to get dereferenced. + * So it's not worth guarding the calls with isNulls checks. */ nToInsert = 0; nToDelete = 0; spaceToDelete = 0; - if (current->blkno == SPGIST_HEAD_BLKNO) + if (SpGistBlockIsRoot(current->blkno)) { /* * We are splitting the root (which up to now is also a leaf page). @@ -813,26 +826,53 @@ doPickSplit(Relation index, SpGistState *state, heapPtrs[in.nTuples] = newLeafTuple->heapPtr; in.nTuples++; - /* - * Perform split using user-defined method. - */ memset(&out, 0, sizeof(out)); - procinfo = index_getprocinfo(index, 1, SPGIST_PICKSPLIT_PROC); - FunctionCall2Coll(procinfo, - index->rd_indcollation[0], - PointerGetDatum(&in), - PointerGetDatum(&out)); + if (!isNulls) + { + /* + * Perform split using user-defined method. + */ + procinfo = index_getprocinfo(index, 1, SPGIST_PICKSPLIT_PROC); + FunctionCall2Coll(procinfo, + index->rd_indcollation[0], + PointerGetDatum(&in), + PointerGetDatum(&out)); - /* - * Form new leaf tuples and count up the total space needed. - */ - totalLeafSizes = 0; - for (i = 0; i < in.nTuples; i++) + /* + * Form new leaf tuples and count up the total space needed. + */ + totalLeafSizes = 0; + for (i = 0; i < in.nTuples; i++) + { + newLeafs[i] = spgFormLeafTuple(state, heapPtrs + i, + out.leafTupleDatums[i], + false); + totalLeafSizes += newLeafs[i]->size + sizeof(ItemIdData); + } + } + else { - newLeafs[i] = spgFormLeafTuple(state, heapPtrs + i, - out.leafTupleDatums[i]); - totalLeafSizes += newLeafs[i]->size + sizeof(ItemIdData); + /* + * Perform dummy split that puts all tuples into one node. 
+ * checkAllTheSame will override this and force allTheSame mode. + */ + out.hasPrefix = false; + out.nNodes = 1; + out.nodeLabels = NULL; + out.mapTuplesToNodes = palloc0(sizeof(int) * in.nTuples); + + /* + * Form new leaf tuples and count up the total space needed. + */ + totalLeafSizes = 0; + for (i = 0; i < in.nTuples; i++) + { + newLeafs[i] = spgFormLeafTuple(state, heapPtrs + i, + (Datum) 0, + true); + totalLeafSizes += newLeafs[i]->size + sizeof(ItemIdData); + } } /* @@ -872,11 +912,11 @@ doPickSplit(Relation index, SpGistState *state, for (i = 0; i < out.nNodes; i++) { Datum label = (Datum) 0; - bool isnull = (out.nodeLabels == NULL); + bool labelisnull = (out.nodeLabels == NULL); - if (!isnull) + if (!labelisnull) label = out.nodeLabels[i]; - nodes[i] = spgFormNodeTuple(state, label, isnull); + nodes[i] = spgFormNodeTuple(state, label, labelisnull); } innerTuple = spgFormInnerTuple(state, out.hasPrefix, out.prefixDatum, @@ -914,7 +954,7 @@ doPickSplit(Relation index, SpGistState *state, */ xlrec.initInner = false; if (parent->buffer != InvalidBuffer && - parent->blkno != SPGIST_HEAD_BLKNO && + !SpGistBlockIsRoot(parent->blkno) && (SpGistPageGetFreeSpace(parent->page, 1) >= innerTuple->size + sizeof(ItemIdData))) { @@ -925,7 +965,8 @@ doPickSplit(Relation index, SpGistState *state, { /* Send tuple to page with next triple parity (see README) */ newInnerBuffer = SpGistGetBuffer(index, - GBUF_INNER_PARITY(parent->blkno + 1), + GBUF_INNER_PARITY(parent->blkno + 1) | + (isNulls ? GBUF_NULLS : 0), innerTuple->size + sizeof(ItemIdData), &xlrec.initInner); } @@ -935,7 +976,7 @@ doPickSplit(Relation index, SpGistState *state, newInnerBuffer = InvalidBuffer; } - /*---------- + /* * Because a WAL record can't involve more than four buffers, we can * only afford to deal with two leaf pages in each picksplit action, * ie the current page and at most one other. @@ -956,9 +997,8 @@ doPickSplit(Relation index, SpGistState *state, * If we are splitting the root page (turning it from a leaf page into an * inner page), then no leaf tuples can go back to the current page; they * must all go somewhere else. - *---------- */ - if (current->blkno != SPGIST_HEAD_BLKNO) + if (!SpGistBlockIsRoot(current->blkno)) currentFreeSpace = PageGetExactFreeSpace(current->page) + spaceToDelete; else currentFreeSpace = 0; /* prevent assigning any tuples to current */ @@ -996,7 +1036,8 @@ doPickSplit(Relation index, SpGistState *state, int curspace; int newspace; - newLeafBuffer = SpGistGetBuffer(index, GBUF_LEAF, + newLeafBuffer = SpGistGetBuffer(index, + GBUF_LEAF | (isNulls ? GBUF_NULLS : 0), Min(totalLeafSizes, SPGIST_PAGE_CAPACITY), &xlrec.initDest); @@ -1076,6 +1117,7 @@ doPickSplit(Relation index, SpGistState *state, xlrec.blknoDest = InvalidBlockNumber; xlrec.nDelete = 0; xlrec.initSrc = isNew; + xlrec.storesNulls = isNulls; leafdata = leafptr = (char *) palloc(totalLeafSizes); @@ -1091,7 +1133,7 @@ doPickSplit(Relation index, SpGistState *state, * the root; in that case there's no need because we'll re-init the page * below. We do this first to make room for reinserting new leaf tuples. 
*/ - if (current->blkno != SPGIST_HEAD_BLKNO) + if (!SpGistBlockIsRoot(current->blkno)) { /* * Init buffer instead of deleting individual tuples, but only if @@ -1102,7 +1144,8 @@ doPickSplit(Relation index, SpGistState *state, nToDelete + SpGistPageGetOpaque(current->page)->nPlaceholder == PageGetMaxOffsetNumber(current->page)) { - SpGistInitBuffer(current->buffer, SPGIST_LEAF); + SpGistInitBuffer(current->buffer, + SPGIST_LEAF | (isNulls ? SPGIST_NULLS : 0)); xlrec.initSrc = true; } else if (isNew) @@ -1317,10 +1360,10 @@ doPickSplit(Relation index, SpGistState *state, * Splitting root page, which was a leaf but now becomes inner page * (and so "current" continues to point at it) */ - Assert(current->blkno == SPGIST_HEAD_BLKNO); + Assert(SpGistBlockIsRoot(current->blkno)); Assert(redirectTuplePos == InvalidOffsetNumber); - SpGistInitBuffer(current->buffer, 0); + SpGistInitBuffer(current->buffer, (isNulls ? SPGIST_NULLS : 0)); xlrec.initInner = true; xlrec.blknoInner = current->blkno; @@ -1461,6 +1504,9 @@ spgAddNodeAction(Relation index, SpGistState *state, XLogRecData rdata[5]; spgxlogAddNode xlrec; + /* Should not be applied to nulls */ + Assert(!SpGistPageStoresNulls(current->page)); + /* Construct new inner tuple with additional node */ newInnerTuple = addNode(state, innerTuple, nodeLabel, nodeN); @@ -1527,7 +1573,7 @@ spgAddNodeAction(Relation index, SpGistState *state, * allow only one inner tuple on the root page, and spgFormInnerTuple * always checks that inner tuples don't exceed the size of a page. */ - if (current->blkno == SPGIST_HEAD_BLKNO) + if (SpGistBlockIsRoot(current->blkno)) elog(ERROR, "cannot enlarge root tuple any more"); Assert(parent->buffer != InvalidBuffer); @@ -1657,6 +1703,9 @@ spgSplitNodeAction(Relation index, SpGistState *state, spgxlogSplitTuple xlrec; Buffer newBuffer = InvalidBuffer; + /* Should not be applied to nulls */ + Assert(!SpGistPageStoresNulls(current->page)); + /* * Construct new prefix tuple, containing a single node with the * specified label. (We'll update the node's downlink to point to the @@ -1709,7 +1758,7 @@ spgSplitNodeAction(Relation index, SpGistState *state, * For the space calculation, note that prefixTuple replaces innerTuple * but postfixTuple will be a new entry. */ - if (current->blkno == SPGIST_HEAD_BLKNO || + if (SpGistBlockIsRoot(current->blkno) || SpGistPageGetFreeSpace(current->page, 1) + innerTuple->size < prefixTuple->size + postfixTuple->size + sizeof(ItemIdData)) { @@ -1804,7 +1853,7 @@ spgSplitNodeAction(Relation index, SpGistState *state, */ void spgdoinsert(Relation index, SpGistState *state, - ItemPointer heapPtr, Datum datum) + ItemPointer heapPtr, Datum datum, bool isnull) { int level = 0; Datum leafDatum; @@ -1817,7 +1866,7 @@ spgdoinsert(Relation index, SpGistState *state, * value to be inserted is not toasted; FormIndexDatum doesn't guarantee * that. */ - if (state->attType.attlen == -1) + if (!isnull && state->attType.attlen == -1) datum = PointerGetDatum(PG_DETOAST_DATUM(datum)); leafDatum = datum; @@ -1828,8 +1877,11 @@ spgdoinsert(Relation index, SpGistState *state, * If it isn't gonna fit, and the opclass can't reduce the datum size by * suffixing, bail out now rather than getting into an endless loop. 
 */
-	leafSize = SGLTHDRSZ + sizeof(ItemIdData) +
-		SpGistGetTypeSize(&state->attType, leafDatum);
+	if (!isnull)
+		leafSize = SGLTHDRSZ + sizeof(ItemIdData) +
+			SpGistGetTypeSize(&state->attType, leafDatum);
+	else
+		leafSize = SGDTSIZE + sizeof(ItemIdData);
 	if (leafSize > SPGIST_PAGE_CAPACITY && !state->config.longValuesOK)
 		ereport(ERROR,
@@ -1840,8 +1892,8 @@ spgdoinsert(Relation index, SpGistState *state,
 						RelationGetRelationName(index)),
 				 errhint("Values larger than a buffer page cannot be indexed.")));
-	/* Initialize "current" to the root page */
-	current.blkno = SPGIST_HEAD_BLKNO;
+	/* Initialize "current" to the appropriate root page */
+	current.blkno = isnull ? SPGIST_NULL_BLKNO : SPGIST_ROOT_BLKNO;
 	current.buffer = InvalidBuffer;
 	current.page = NULL;
 	current.offnum = FirstOffsetNumber;
@@ -1873,10 +1925,11 @@ spgdoinsert(Relation index, SpGistState *state,
 			 * for doPickSplit to always have a leaf page at hand; so just
 			 * quietly limit our request to a page size.
 			 */
-			current.buffer = SpGistGetBuffer(index, GBUF_LEAF,
-											 Min(leafSize,
-												 SPGIST_PAGE_CAPACITY),
-											 &isNew);
+			current.buffer =
+				SpGistGetBuffer(index,
+								GBUF_LEAF | (isnull ? GBUF_NULLS : 0),
+								Min(leafSize, SPGIST_PAGE_CAPACITY),
+								&isNew);
 			current.blkno = BufferGetBlockNumber(current.buffer);
 		}
 		else if (parent.buffer == InvalidBuffer ||
@@ -1892,19 +1945,25 @@ spgdoinsert(Relation index, SpGistState *state,
 		}
 		current.page = BufferGetPage(current.buffer);
+		/* should not arrive at a page of the wrong type */
+		if (isnull ? !SpGistPageStoresNulls(current.page) :
+			SpGistPageStoresNulls(current.page))
+			elog(ERROR, "SPGiST index page %u has wrong nulls flag",
+				 current.blkno);
+
 		if (SpGistPageIsLeaf(current.page))
 		{
 			SpGistLeafTuple leafTuple;
 			int			nToSplit,
 						sizeToSplit;
-			leafTuple = spgFormLeafTuple(state, heapPtr, leafDatum);
+			leafTuple = spgFormLeafTuple(state, heapPtr, leafDatum, isnull);
 			if (leafTuple->size + sizeof(ItemIdData) <=
 				SpGistPageGetFreeSpace(current.page, 1))
 			{
 				/* it fits on page, so insert it and we're done */
 				addLeafTuple(index, state, leafTuple,
-							 &current, &parent, isNew);
+							 &current, &parent, isnull, isNew);
 				break;
 			}
 			else if ((sizeToSplit =
@@ -1918,14 +1977,14 @@ spgdoinsert(Relation index, SpGistState *state,
 				 * chain to another leaf page rather than splitting it.
 */
 			Assert(!isNew);
-			moveLeafs(index, state, &current, &parent, leafTuple);
+			moveLeafs(index, state, &current, &parent, leafTuple, isnull);
 			break;			/* we're done */
 		}
 		else
 		{
 			/* picksplit */
 			if (doPickSplit(index, state, &current, &parent,
-							leafTuple, level, isNew))
+							leafTuple, level, isnull, isNew))
 				break;	/* doPickSplit installed new tuples */
 			/* leaf tuple will not be inserted yet */
@@ -1972,11 +2031,20 @@ spgdoinsert(Relation index, SpGistState *state,
 		memset(&out, 0, sizeof(out));
-		procinfo = index_getprocinfo(index, 1, SPGIST_CHOOSE_PROC);
-		FunctionCall2Coll(procinfo,
-						  index->rd_indcollation[0],
-						  PointerGetDatum(&in),
-						  PointerGetDatum(&out));
+		if (!isnull)
+		{
+			/* use user-defined choose method */
+			procinfo = index_getprocinfo(index, 1, SPGIST_CHOOSE_PROC);
+			FunctionCall2Coll(procinfo,
+							  index->rd_indcollation[0],
+							  PointerGetDatum(&in),
+							  PointerGetDatum(&out));
+		}
+		else
+		{
+			/* force "match" action (to insert to random subnode) */
+			out.resultType = spgMatchNode;
+		}
 		if (innerTuple->allTheSame)
 		{
@@ -2001,9 +2069,12 @@ spgdoinsert(Relation index, SpGistState *state,
 				/* Adjust level as per opclass request */
 				level += out.result.matchNode.levelAdd;
 				/* Replace leafDatum and recompute leafSize */
-				leafDatum = out.result.matchNode.restDatum;
-				leafSize = SGLTHDRSZ + sizeof(ItemIdData) +
-					SpGistGetTypeSize(&state->attType, leafDatum);
+				if (!isnull)
+				{
+					leafDatum = out.result.matchNode.restDatum;
+					leafSize = SGLTHDRSZ + sizeof(ItemIdData) +
+						SpGistGetTypeSize(&state->attType, leafDatum);
+				}
 				/*
 				 * Loop around and attempt to insert the new leafDatum
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index cbcf655674ac5..8ff9245e179ac 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -38,18 +38,15 @@ spgistBuildCallback(Relation index, HeapTuple htup,
 				   Datum *values, bool *isnull, bool tupleIsAlive,
 				   void *state)
 {
 	SpGistBuildState *buildstate = (SpGistBuildState *) state;
+	MemoryContext oldCtx;
-	/* SPGiST doesn't index nulls */
-	if (*isnull == false)
-	{
-		/* Work in temp context, and reset it after each tuple */
-		MemoryContext oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
+	/* Work in temp context, and reset it after each tuple */
+	oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
-		spgdoinsert(index, &buildstate->spgstate, &htup->t_self, *values);
+	spgdoinsert(index, &buildstate->spgstate, &htup->t_self, *values, *isnull);
-		MemoryContextSwitchTo(oldCtx);
-		MemoryContextReset(buildstate->tmpCtx);
-	}
+	MemoryContextSwitchTo(oldCtx);
+	MemoryContextReset(buildstate->tmpCtx);
 }
 /*
@@ -65,20 +62,23 @@ spgbuild(PG_FUNCTION_ARGS)
 	double		reltuples;
 	SpGistBuildState buildstate;
 	Buffer		metabuffer,
-				rootbuffer;
+				rootbuffer,
+				nullbuffer;
 	if (RelationGetNumberOfBlocks(index) != 0)
 		elog(ERROR, "index \"%s\" already contains data",
 			 RelationGetRelationName(index));
 	/*
-	 * Initialize the meta page and root page
+	 * Initialize the meta page and root pages
 	 */
 	metabuffer = SpGistNewBuffer(index);
 	rootbuffer = SpGistNewBuffer(index);
+	nullbuffer = SpGistNewBuffer(index);
 	Assert(BufferGetBlockNumber(metabuffer) == SPGIST_METAPAGE_BLKNO);
-	Assert(BufferGetBlockNumber(rootbuffer) == SPGIST_HEAD_BLKNO);
+	Assert(BufferGetBlockNumber(rootbuffer) == SPGIST_ROOT_BLKNO);
+	Assert(BufferGetBlockNumber(nullbuffer) == SPGIST_NULL_BLKNO);
 	START_CRIT_SECTION();
@@ -86,6 +86,8 @@ spgbuild(PG_FUNCTION_ARGS)
 	MarkBufferDirty(metabuffer);
 	SpGistInitBuffer(rootbuffer, SPGIST_LEAF);
 	MarkBufferDirty(rootbuffer);
+
SpGistInitBuffer(nullbuffer, SPGIST_LEAF | SPGIST_NULLS); + MarkBufferDirty(nullbuffer); if (RelationNeedsWAL(index)) { @@ -104,12 +106,15 @@ spgbuild(PG_FUNCTION_ARGS) PageSetTLI(BufferGetPage(metabuffer), ThisTimeLineID); PageSetLSN(BufferGetPage(rootbuffer), recptr); PageSetTLI(BufferGetPage(rootbuffer), ThisTimeLineID); + PageSetLSN(BufferGetPage(nullbuffer), recptr); + PageSetTLI(BufferGetPage(nullbuffer), ThisTimeLineID); } END_CRIT_SECTION(); UnlockReleaseBuffer(metabuffer); UnlockReleaseBuffer(rootbuffer); + UnlockReleaseBuffer(nullbuffer); /* * Now insert all the heap data into the index @@ -159,11 +164,20 @@ spgbuildempty(PG_FUNCTION_ARGS) /* Likewise for the root page. */ SpGistInitPage(page, SPGIST_LEAF); - smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_HEAD_BLKNO, + smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_ROOT_BLKNO, + (char *) page, true); + if (XLogIsNeeded()) + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + SPGIST_ROOT_BLKNO, page); + + /* Likewise for the null-tuples root page. */ + SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS); + + smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_NULL_BLKNO, (char *) page, true); if (XLogIsNeeded()) log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, - SPGIST_HEAD_BLKNO, page); + SPGIST_NULL_BLKNO, page); /* * An immediate sync is required even if we xlog'd the pages, because the @@ -194,10 +208,6 @@ spginsert(PG_FUNCTION_ARGS) MemoryContext oldCtx; MemoryContext insertCtx; - /* SPGiST doesn't index nulls */ - if (*isnull) - PG_RETURN_BOOL(false); - insertCtx = AllocSetContextCreate(CurrentMemoryContext, "SP-GiST insert temporary context", ALLOCSET_DEFAULT_MINSIZE, @@ -207,7 +217,7 @@ spginsert(PG_FUNCTION_ARGS) initSpGistState(&spgstate, index); - spgdoinsert(index, &spgstate, ht_ctid, *values); + spgdoinsert(index, &spgstate, ht_ctid, *values, *isnull); SpGistUpdateMetaPage(index); diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c index 99b0852611fbc..7a3a96230d176 100644 --- a/src/backend/access/spgist/spgscan.c +++ b/src/backend/access/spgist/spgscan.c @@ -23,6 +23,9 @@ #include "utils/memutils.h" +typedef void (*storeRes_func) (SpGistScanOpaque so, ItemPointer heapPtr, + Datum leafValue, bool isnull, bool recheck); + typedef struct ScanStackEntry { Datum reconstructedValue; /* value reconstructed from parent */ @@ -66,14 +69,20 @@ resetSpGistScanOpaque(SpGistScanOpaque so) freeScanStack(so); - Assert(!so->searchNulls); /* XXX fixme */ + if (so->searchNulls) + { + /* Stack a work item to scan the null index entries */ + startEntry = (ScanStackEntry *) palloc0(sizeof(ScanStackEntry)); + ItemPointerSet(&startEntry->ptr, SPGIST_NULL_BLKNO, FirstOffsetNumber); + so->scanStack = lappend(so->scanStack, startEntry); + } if (so->searchNonNulls) { /* Stack a work item to scan the non-null index entries */ startEntry = (ScanStackEntry *) palloc0(sizeof(ScanStackEntry)); - ItemPointerSet(&startEntry->ptr, SPGIST_HEAD_BLKNO, FirstOffsetNumber); - so->scanStack = list_make1(startEntry); + ItemPointerSet(&startEntry->ptr, SPGIST_ROOT_BLKNO, FirstOffsetNumber); + so->scanStack = lappend(so->scanStack, startEntry); } if (so->want_itup) @@ -243,22 +252,35 @@ spgrestrpos(PG_FUNCTION_ARGS) } /* - * Test whether a leaf datum satisfies all the scan keys + * Test whether a leaf tuple satisfies all the scan keys * * *leafValue is set to the reconstructed datum, if provided * *recheck is set true if any of the operators are lossy */ static bool -spgLeafTest(Relation index, SpGistScanOpaque 
so, Datum leafDatum, +spgLeafTest(Relation index, SpGistScanOpaque so, + SpGistLeafTuple leafTuple, bool isnull, int level, Datum reconstructedValue, Datum *leafValue, bool *recheck) { bool result; + Datum leafDatum; spgLeafConsistentIn in; spgLeafConsistentOut out; FmgrInfo *procinfo; MemoryContext oldCtx; + if (isnull) + { + /* Should not have arrived on a nulls page unless nulls are wanted */ + Assert(so->searchNulls); + *leafValue = (Datum) 0; + *recheck = false; + return true; + } + + leafDatum = SGLTDATUM(leafTuple, &so->state); + /* use temp context for calling leaf_consistent */ oldCtx = MemoryContextSwitchTo(so->tempCxt); @@ -295,7 +317,7 @@ spgLeafTest(Relation index, SpGistScanOpaque so, Datum leafDatum, */ static void spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, - void (*storeRes) (SpGistScanOpaque, ItemPointer, Datum, bool)) + storeRes_func storeRes) { Buffer buffer = InvalidBuffer; bool reportedSome = false; @@ -306,6 +328,7 @@ spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, BlockNumber blkno; OffsetNumber offset; Page page; + bool isnull; /* Pull next to-do item from the list */ if (so->scanStack == NIL) @@ -336,6 +359,8 @@ spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, page = BufferGetPage(buffer); + isnull = SpGistPageStoresNulls(page) ? true : false; + if (SpGistPageIsLeaf(page)) { SpGistLeafTuple leafTuple; @@ -343,7 +368,7 @@ spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, Datum leafValue = (Datum) 0; bool recheck = false; - if (blkno == SPGIST_HEAD_BLKNO) + if (SpGistBlockIsRoot(blkno)) { /* When root is a leaf, examine all its tuples */ for (offset = FirstOffsetNumber; offset <= max; offset++) @@ -359,13 +384,14 @@ spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, Assert(ItemPointerIsValid(&leafTuple->heapPtr)); if (spgLeafTest(index, so, - SGLTDATUM(leafTuple, &so->state), + leafTuple, isnull, stackEntry->level, stackEntry->reconstructedValue, &leafValue, &recheck)) { - storeRes(so, &leafTuple->heapPtr, leafValue, recheck); + storeRes(so, &leafTuple->heapPtr, + leafValue, isnull, recheck); reportedSome = true; } } @@ -404,13 +430,14 @@ spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, Assert(ItemPointerIsValid(&leafTuple->heapPtr)); if (spgLeafTest(index, so, - SGLTDATUM(leafTuple, &so->state), + leafTuple, isnull, stackEntry->level, stackEntry->reconstructedValue, &leafValue, &recheck)) { - storeRes(so, &leafTuple->heapPtr, leafValue, recheck); + storeRes(so, &leafTuple->heapPtr, + leafValue, isnull, recheck); reportedSome = true; } @@ -468,11 +495,23 @@ spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, memset(&out, 0, sizeof(out)); - procinfo = index_getprocinfo(index, 1, SPGIST_INNER_CONSISTENT_PROC); - FunctionCall2Coll(procinfo, - index->rd_indcollation[0], - PointerGetDatum(&in), - PointerGetDatum(&out)); + if (!isnull) + { + /* use user-defined inner consistent method */ + procinfo = index_getprocinfo(index, 1, SPGIST_INNER_CONSISTENT_PROC); + FunctionCall2Coll(procinfo, + index->rd_indcollation[0], + PointerGetDatum(&in), + PointerGetDatum(&out)); + } + else + { + /* force all children to be visited */ + out.nNodes = in.nNodes; + out.nodeNumbers = (int *) palloc(sizeof(int) * in.nNodes); + for (i = 0; i < in.nNodes; i++) + out.nodeNumbers[i] = i; + } MemoryContextSwitchTo(oldCtx); @@ -524,7 +563,7 @@ spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, /* storeRes subroutine for getbitmap case */ static void 
storeBitmap(SpGistScanOpaque so, ItemPointer heapPtr, - Datum leafValue, bool recheck) + Datum leafValue, bool isnull, bool recheck) { tbm_add_tuples(so->tbm, heapPtr, 1, recheck); so->ntids++; @@ -551,7 +590,7 @@ spggetbitmap(PG_FUNCTION_ARGS) /* storeRes subroutine for gettuple case */ static void storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, - Datum leafValue, bool recheck) + Datum leafValue, bool isnull, bool recheck) { Assert(so->nPtrs < MaxIndexTuplesPerPage); so->heapPtrs[so->nPtrs] = *heapPtr; @@ -562,8 +601,6 @@ storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, * Reconstruct desired IndexTuple. We have to copy the datum out of * the temp context anyway, so we may as well create the tuple here. */ - bool isnull = false; - so->indexTups[so->nPtrs] = index_form_tuple(so->indexTupDesc, &leafValue, &isnull); diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 1f88562be78e7..46a10f6a20617 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -148,10 +148,10 @@ SpGistNewBuffer(Relation index) break; /* nothing known to FSM */ /* - * The root page shouldn't ever be listed in FSM, but just in case it - * is, ignore it. + * The fixed pages shouldn't ever be listed in FSM, but just in case + * one is, ignore it. */ - if (blkno == SPGIST_HEAD_BLKNO) + if (SpGistBlockIsFixed(blkno)) continue; buffer = ReadBuffer(index, blkno); @@ -226,9 +226,8 @@ SpGistUpdateMetaPage(Relation index) } /* Macro to select proper element of lastUsedPages cache depending on flags */ -#define GET_LUP(c, f) (((f) & GBUF_LEAF) ? \ - &(c)->lastUsedPages.leafPage : \ - &(c)->lastUsedPages.innerPage[(f) & GBUF_PARITY_MASK]) +/* Masking flags with SPGIST_CACHED_PAGES is just for paranoia's sake */ +#define GET_LUP(c, f) (&(c)->lastUsedPages.cachedPage[((unsigned int) (f)) % SPGIST_CACHED_PAGES]) /* * Allocate and initialize a new buffer of the type and parity specified by @@ -254,15 +253,21 @@ static Buffer allocNewBuffer(Relation index, int flags) { SpGistCache *cache = spgGetCache(index); + uint16 pageflags = 0; + + if (GBUF_REQ_LEAF(flags)) + pageflags |= SPGIST_LEAF; + if (GBUF_REQ_NULLS(flags)) + pageflags |= SPGIST_NULLS; for (;;) { Buffer buffer; buffer = SpGistNewBuffer(index); - SpGistInitBuffer(buffer, (flags & GBUF_LEAF) ? 
SPGIST_LEAF : 0); + SpGistInitBuffer(buffer, pageflags); - if (flags & GBUF_LEAF) + if (pageflags & SPGIST_LEAF) { /* Leaf pages have no parity concerns, so just use it */ return buffer; @@ -270,9 +275,9 @@ allocNewBuffer(Relation index, int flags) else { BlockNumber blkno = BufferGetBlockNumber(buffer); - int blkParity = blkno % 3; + int blkFlags = GBUF_INNER_PARITY(blkno); - if ((flags & GBUF_PARITY_MASK) == blkParity) + if ((flags & GBUF_PARITY_MASK) == blkFlags) { /* Page has right parity, use it */ return buffer; @@ -280,8 +285,10 @@ allocNewBuffer(Relation index, int flags) else { /* Page has wrong parity, record it in cache and try again */ - cache->lastUsedPages.innerPage[blkParity].blkno = blkno; - cache->lastUsedPages.innerPage[blkParity].freeSpace = + if (pageflags & SPGIST_NULLS) + blkFlags |= GBUF_NULLS; + cache->lastUsedPages.cachedPage[blkFlags].blkno = blkno; + cache->lastUsedPages.cachedPage[blkFlags].freeSpace = PageGetExactFreeSpace(BufferGetPage(buffer)); UnlockReleaseBuffer(buffer); } @@ -329,8 +336,8 @@ SpGistGetBuffer(Relation index, int flags, int needSpace, bool *isNew) return allocNewBuffer(index, flags); } - /* root page should never be in cache */ - Assert(lup->blkno != SPGIST_HEAD_BLKNO); + /* fixed pages should never be in cache */ + Assert(!SpGistBlockIsFixed(lup->blkno)); /* If cached freeSpace isn't enough, don't bother looking at the page */ if (lup->freeSpace >= needSpace) @@ -355,7 +362,13 @@ SpGistGetBuffer(Relation index, int flags, int needSpace, bool *isNew) if (PageIsNew(page) || SpGistPageIsDeleted(page) || PageIsEmpty(page)) { /* OK to initialize the page */ - SpGistInitBuffer(buffer, (flags & GBUF_LEAF) ? SPGIST_LEAF : 0); + uint16 pageflags = 0; + + if (GBUF_REQ_LEAF(flags)) + pageflags |= SPGIST_LEAF; + if (GBUF_REQ_NULLS(flags)) + pageflags |= SPGIST_NULLS; + SpGistInitBuffer(buffer, pageflags); lup->freeSpace = PageGetExactFreeSpace(page) - needSpace; *isNew = true; return buffer; @@ -365,8 +378,8 @@ SpGistGetBuffer(Relation index, int flags, int needSpace, bool *isNew) * Check that page is of right type and has enough space. We must * recheck this since our cache isn't necessarily up to date. */ - if ((flags & GBUF_LEAF) ? SpGistPageIsLeaf(page) : - !SpGistPageIsLeaf(page)) + if ((GBUF_REQ_LEAF(flags) ? SpGistPageIsLeaf(page) : !SpGistPageIsLeaf(page)) && + (GBUF_REQ_NULLS(flags) ? 
SpGistPageStoresNulls(page) : !SpGistPageStoresNulls(page))) { int freeSpace = PageGetExactFreeSpace(page); @@ -407,14 +420,16 @@ SpGistSetLastUsedPage(Relation index, Buffer buffer) BlockNumber blkno = BufferGetBlockNumber(buffer); int flags; - /* Never enter the root page in cache, though */ - if (blkno == SPGIST_HEAD_BLKNO) + /* Never enter fixed pages (root pages) in cache, though */ + if (SpGistBlockIsFixed(blkno)) return; if (SpGistPageIsLeaf(page)) flags = GBUF_LEAF; else flags = GBUF_INNER_PARITY(blkno); + if (SpGistPageStoresNulls(page)) + flags |= GBUF_NULLS; lup = GET_LUP(cache, flags); @@ -459,6 +474,7 @@ void SpGistInitMetapage(Page page) { SpGistMetaPageData *metadata; + int i; SpGistInitPage(page, SPGIST_META); metadata = SpGistPageGetMeta(page); @@ -466,10 +482,8 @@ SpGistInitMetapage(Page page) metadata->magicNumber = SPGIST_MAGIC_NUMBER; /* initialize last-used-page cache to empty */ - metadata->lastUsedPages.innerPage[0].blkno = InvalidBlockNumber; - metadata->lastUsedPages.innerPage[1].blkno = InvalidBlockNumber; - metadata->lastUsedPages.innerPage[2].blkno = InvalidBlockNumber; - metadata->lastUsedPages.leafPage.blkno = InvalidBlockNumber; + for (i = 0; i < SPGIST_CACHED_PAGES; i++) + metadata->lastUsedPages.cachedPage[i].blkno = InvalidBlockNumber; } /* @@ -490,7 +504,7 @@ spgoptions(PG_FUNCTION_ARGS) } /* - * Get the space needed to store a datum of the indicated type. + * Get the space needed to store a non-null datum of the indicated type. * Note the result is already rounded up to a MAXALIGN boundary. * Also, we follow the SPGiST convention that pass-by-val types are * just stored in their Datum representation (compare memcpyDatum). @@ -511,7 +525,7 @@ SpGistGetTypeSize(SpGistTypeDesc *att, Datum datum) } /* - * Copy the given datum to *target + * Copy the given non-null datum to *target */ static void memcpyDatum(void *target, SpGistTypeDesc *att, Datum datum) @@ -533,17 +547,20 @@ memcpyDatum(void *target, SpGistTypeDesc *att, Datum datum) * Construct a leaf tuple containing the given heap TID and datum value */ SpGistLeafTuple -spgFormLeafTuple(SpGistState *state, ItemPointer heapPtr, Datum datum) +spgFormLeafTuple(SpGistState *state, ItemPointer heapPtr, + Datum datum, bool isnull) { SpGistLeafTuple tup; unsigned int size; /* compute space needed (note result is already maxaligned) */ - size = SGLTHDRSZ + SpGistGetTypeSize(&state->attType, datum); + size = SGLTHDRSZ; + if (!isnull) + size += SpGistGetTypeSize(&state->attType, datum); /* * Ensure that we can replace the tuple with a dead tuple later. This - * test is unnecessary given current tuple layouts, but let's be safe. + * test is unnecessary when !isnull, but let's be safe. 
*/ if (size < SGDTSIZE) size = SGDTSIZE; @@ -554,7 +571,8 @@ spgFormLeafTuple(SpGistState *state, ItemPointer heapPtr, Datum datum) tup->size = size; tup->nextOffset = InvalidOffsetNumber; tup->heapPtr = *heapPtr; - memcpyDatum(SGLTDATAPTR(tup), &state->attType, datum); + if (!isnull) + memcpyDatum(SGLTDATAPTR(tup), &state->attType, datum); return tup; } diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 4598ea8d67fd4..a09da84a2aac1 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -307,7 +307,7 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer) } /* - * Vacuum the root page when it is a leaf + * Vacuum a root page when it is also a leaf * * On the root, we just delete any dead leaf tuples; no fancy business */ @@ -321,6 +321,7 @@ vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer) OffsetNumber i, max = PageGetMaxOffsetNumber(page); + xlrec.blkno = BufferGetBlockNumber(buffer); xlrec.nDelete = 0; /* Scan page, identify tuples to delete, accumulate stats */ @@ -537,7 +538,7 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno) } else if (SpGistPageIsLeaf(page)) { - if (blkno == SPGIST_HEAD_BLKNO) + if (SpGistBlockIsRoot(blkno)) { vacuumLeafRoot(bds, index, buffer); /* no need for vacuumRedirectAndPlaceholder */ @@ -560,7 +561,7 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno) * put a new tuple. Otherwise, check for empty/deletable page, and * make sure FSM knows about it. */ - if (blkno != SPGIST_HEAD_BLKNO) + if (!SpGistBlockIsRoot(blkno)) { /* If page is now empty, mark it deleted */ if (PageIsEmpty(page) && !SpGistPageIsDeleted(page)) @@ -598,7 +599,7 @@ spgvacuumscan(spgBulkDeleteState *bds) /* Finish setting up spgBulkDeleteState */ initSpGistState(&bds->spgstate, index); bds->OldestXmin = GetOldestXmin(true, false); - bds->lastFilledBlock = SPGIST_HEAD_BLKNO; + bds->lastFilledBlock = SPGIST_LAST_FIXED_BLKNO; /* * Reset counts that will be incremented during the scan; needed in case @@ -619,7 +620,7 @@ spgvacuumscan(spgBulkDeleteState *bds) * delete some deletable tuples. See more extensive comments about * this in btvacuumscan(). */ - blkno = SPGIST_HEAD_BLKNO; + blkno = SPGIST_METAPAGE_BLKNO + 1; for (;;) { /* Get the current relation length */ @@ -648,6 +649,12 @@ spgvacuumscan(spgBulkDeleteState *bds) * XXX disabled because it's unsafe due to possible concurrent inserts. * We'd have to rescan the pages to make sure they're still empty, and it * doesn't seem worth it. Note that btree doesn't do this either. + * + * Another reason not to truncate is that it could invalidate the cached + * pages-with-freespace pointers in the metapage and other backends' + * relation caches, that is leave them pointing to nonexistent pages. + * Adding RelationGetNumberOfBlocks calls to protect the places that use + * those pointers would be unduly expensive. 
*/ #ifdef NOT_USED if (num_pages > bds->lastFilledBlock + 1) diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c index daa8ae300bae9..8e87e2adc9060 100644 --- a/src/backend/access/spgist/spgxlog.c +++ b/src/backend/access/spgist/spgxlog.c @@ -84,7 +84,7 @@ spgRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); - buffer = XLogReadBuffer(*node, SPGIST_HEAD_BLKNO, true); + buffer = XLogReadBuffer(*node, SPGIST_ROOT_BLKNO, true); Assert(BufferIsValid(buffer)); SpGistInitBuffer(buffer, SPGIST_LEAF); page = (Page) BufferGetPage(buffer); @@ -92,6 +92,15 @@ spgRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); + + buffer = XLogReadBuffer(*node, SPGIST_NULL_BLKNO, true); + Assert(BufferIsValid(buffer)); + SpGistInitBuffer(buffer, SPGIST_LEAF | SPGIST_NULLS); + page = (Page) BufferGetPage(buffer); + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); } static void @@ -116,7 +125,8 @@ spgRedoAddLeaf(XLogRecPtr lsn, XLogRecord *record) page = BufferGetPage(buffer); if (xldata->newPage) - SpGistInitBuffer(buffer, SPGIST_LEAF); + SpGistInitBuffer(buffer, + SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0)); if (!XLByteLE(lsn, PageGetLSN(page))) { @@ -218,7 +228,8 @@ spgRedoMoveLeafs(XLogRecPtr lsn, XLogRecord *record) page = BufferGetPage(buffer); if (xldata->newPage) - SpGistInitBuffer(buffer, SPGIST_LEAF); + SpGistInitBuffer(buffer, + SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0)); if (!XLByteLE(lsn, PageGetLSN(page))) { @@ -344,6 +355,7 @@ spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record) { page = BufferGetPage(buffer); + /* AddNode is not used for nulls pages */ if (xldata->newPage) SpGistInitBuffer(buffer, 0); @@ -464,6 +476,7 @@ spgRedoSplitTuple(XLogRecPtr lsn, XLogRecord *record) { page = BufferGetPage(buffer); + /* SplitTuple is not used for nulls pages */ if (xldata->newPage) SpGistInitBuffer(buffer, 0); @@ -545,7 +558,7 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) */ bbi = 0; - if (xldata->blknoSrc == SPGIST_HEAD_BLKNO) + if (SpGistBlockIsRoot(xldata->blknoSrc)) { /* when splitting root, we touch it only in the guise of new inner */ srcBuffer = InvalidBuffer; @@ -557,7 +570,8 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) Assert(BufferIsValid(srcBuffer)); page = (Page) BufferGetPage(srcBuffer); - SpGistInitBuffer(srcBuffer, SPGIST_LEAF); + SpGistInitBuffer(srcBuffer, + SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0)); /* don't update LSN etc till we're done with it */ } else @@ -612,7 +626,8 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) Assert(BufferIsValid(destBuffer)); page = (Page) BufferGetPage(destBuffer); - SpGistInitBuffer(destBuffer, SPGIST_LEAF); + SpGistInitBuffer(destBuffer, + SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0)); /* don't update LSN etc till we're done with it */ } else @@ -678,7 +693,8 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) page = BufferGetPage(buffer); if (xldata->initInner) - SpGistInitBuffer(buffer, 0); + SpGistInitBuffer(buffer, + (xldata->storesNulls ? 
SPGIST_NULLS : 0)); if (!XLByteLE(lsn, PageGetLSN(page))) { @@ -709,7 +725,7 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) if (xldata->blknoParent == InvalidBlockNumber) { /* no parent cause we split the root */ - Assert(xldata->blknoInner == SPGIST_HEAD_BLKNO); + Assert(SpGistBlockIsRoot(xldata->blknoInner)); } else if (xldata->blknoInner != xldata->blknoParent) { @@ -842,7 +858,7 @@ spgRedoVacuumRoot(XLogRecPtr lsn, XLogRecord *record) if (!(record->xl_info & XLR_BKP_BLOCK_1)) { - buffer = XLogReadBuffer(xldata->node, SPGIST_HEAD_BLKNO, false); + buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); if (BufferIsValid(buffer)) { page = BufferGetPage(buffer); @@ -1039,7 +1055,8 @@ spg_desc(StringInfo buf, uint8 xl_info, char *rec) break; case XLOG_SPGIST_VACUUM_ROOT: out_target(buf, ((spgxlogVacuumRoot *) rec)->node); - appendStringInfo(buf, "vacuum leaf tuples on root page"); + appendStringInfo(buf, "vacuum leaf tuples on root page %u", + ((spgxlogVacuumRoot *) rec)->blkno); break; case XLOG_SPGIST_VACUUM_REDIRECT: out_target(buf, ((spgxlogVacuumRedirect *) rec)->node); diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h index 76ea5a1578fc8..aa5a602418941 100644 --- a/src/include/access/spgist_private.h +++ b/src/include/access/spgist_private.h @@ -21,8 +21,15 @@ /* Page numbers of fixed-location pages */ -#define SPGIST_METAPAGE_BLKNO (0) -#define SPGIST_HEAD_BLKNO (1) +#define SPGIST_METAPAGE_BLKNO (0) /* metapage */ +#define SPGIST_ROOT_BLKNO (1) /* root for normal entries */ +#define SPGIST_NULL_BLKNO (2) /* root for null-value entries */ +#define SPGIST_LAST_FIXED_BLKNO SPGIST_NULL_BLKNO + +#define SpGistBlockIsRoot(blkno) \ + ((blkno) == SPGIST_ROOT_BLKNO || (blkno) == SPGIST_NULL_BLKNO) +#define SpGistBlockIsFixed(blkno) \ + ((BlockNumber) (blkno) <= (BlockNumber) SPGIST_LAST_FIXED_BLKNO) /* * Contents of page special space on SPGiST index pages @@ -42,15 +49,14 @@ typedef SpGistPageOpaqueData *SpGistPageOpaque; #define SPGIST_META (1<<0) #define SPGIST_DELETED (1<<1) #define SPGIST_LEAF (1<<2) +#define SPGIST_NULLS (1<<3) #define SpGistPageGetOpaque(page) ((SpGistPageOpaque) PageGetSpecialPointer(page)) #define SpGistPageIsMeta(page) (SpGistPageGetOpaque(page)->flags & SPGIST_META) #define SpGistPageIsDeleted(page) (SpGistPageGetOpaque(page)->flags & SPGIST_DELETED) #define SpGistPageSetDeleted(page) (SpGistPageGetOpaque(page)->flags |= SPGIST_DELETED) -#define SpGistPageSetNonDeleted(page) (SpGistPageGetOpaque(page)->flags &= ~SPGIST_DELETED) #define SpGistPageIsLeaf(page) (SpGistPageGetOpaque(page)->flags & SPGIST_LEAF) -#define SpGistPageSetLeaf(page) (SpGistPageGetOpaque(page)->flags |= SPGIST_LEAF) -#define SpGistPageSetInner(page) (SpGistPageGetOpaque(page)->flags &= ~SPGIST_LEAF) +#define SpGistPageStoresNulls(page) (SpGistPageGetOpaque(page)->flags & SPGIST_NULLS) /* * The page ID is for the convenience of pg_filedump and similar utilities, @@ -67,14 +73,16 @@ typedef SpGistPageOpaqueData *SpGistPageOpaque; */ typedef struct SpGistLastUsedPage { - BlockNumber blkno; /* block number of described page */ - int freeSpace; /* its free space (could be obsolete!) */ + BlockNumber blkno; /* block number, or InvalidBlockNumber */ + int freeSpace; /* page's free space (could be obsolete!) 
 */
 } SpGistLastUsedPage;
+/* Note: indexes in cachedPage[] match flag assignments for SpGistGetBuffer */
+#define SPGIST_CACHED_PAGES 8
+
 typedef struct SpGistLUPCache
 {
-	SpGistLastUsedPage innerPage[3];	/* one per triple-parity group */
-	SpGistLastUsedPage leafPage;
+	SpGistLastUsedPage cachedPage[SPGIST_CACHED_PAGES];
 } SpGistLUPCache;
 /*
@@ -86,7 +94,7 @@ typedef struct SpGistMetaPageData
 	SpGistLUPCache lastUsedPages;	/* shared storage of last-used info */
 } SpGistMetaPageData;
-#define SPGIST_MAGIC_NUMBER (0xBA0BABED)
+#define SPGIST_MAGIC_NUMBER (0xBA0BABEE)
 #define SpGistPageGetMeta(p) \
 	((SpGistMetaPageData *) PageGetContents(p))
@@ -266,7 +274,15 @@ typedef SpGistNodeTupleData *SpGistNodeTuple;
 * node (which must be on the same page).  But when the root page is a leaf
 * page, we don't chain its tuples, so nextOffset is always 0 on the root.
 *
- * size must be a multiple of MAXALIGN
+ * size must be a multiple of MAXALIGN; also, it must be at least SGDTSIZE
+ * so that the tuple can be converted to REDIRECT status later.  (This
+ * restriction only adds bytes for the null-datum case, otherwise alignment
+ * restrictions force it anyway.)
+ *
+ * In a leaf tuple for a NULL indexed value, there's no useful datum value;
+ * however, the SGDTSIZE limit ensures that there's a Datum word there
+ * anyway, so SGLTDATUM can be applied safely as long as you don't do
+ * anything with the result.
 */
 typedef struct SpGistLeafTupleData
 {
@@ -397,6 +413,7 @@ typedef struct spgxlogAddLeaf
 	BlockNumber blknoLeaf;		/* destination page for leaf tuple */
 	bool		newPage;		/* init dest page? */
+	bool		storesNulls;	/* page is in the nulls tree? */
 	OffsetNumber offnumLeaf;	/* offset where leaf tuple gets placed */
 	OffsetNumber offnumHeadLeaf;	/* offset of head tuple in chain, if any */
@@ -419,6 +436,7 @@ typedef struct spgxlogMoveLeafs
 	uint16		nMoves;			/* number of tuples moved from source page */
 	bool		newPage;		/* init dest page? */
 	bool		replaceDead;	/* are we replacing a DEAD source tuple? */
+	bool		storesNulls;	/* pages are in the nulls tree? */
 	BlockNumber blknoParent;	/* where the parent downlink is */
 	OffsetNumber offnumParent;
@@ -502,6 +520,8 @@ typedef struct spgxlogPickSplit
 	OffsetNumber offnumInner;
 	bool		initInner;		/* re-init the Inner page? */
+	bool		storesNulls;	/* pages are in the nulls tree? */
+
 	BlockNumber blknoParent;	/* where the parent downlink is, if any */
 	OffsetNumber offnumParent;
 	uint16		nodeI;
@@ -553,9 +573,10 @@ typedef struct spgxlogVacuumLeaf
 typedef struct spgxlogVacuumRoot
 {
-	/* vacuum root page when it is a leaf */
+	/* vacuum a root page when it is also a leaf */
 	RelFileNode node;
+	BlockNumber blkno;			/* block number to clean */
 	uint16		nDelete;		/* number of tuples to delete */
 	spgxlogState stateSrc;
@@ -580,10 +601,18 @@ typedef struct spgxlogVacuumRedirect
 * page in the same triple-parity group as the specified block number.
 * (Typically, this should be GBUF_INNER_PARITY(parentBlockNumber + 1)
 * to follow the rule described in spgist/README.)
+ * In addition, GBUF_NULLS can be OR'd in to get a page for storage of
+ * null-valued tuples.
+ *
+ * Note: these flag values are used as indexes into lastUsedPages.
*/ -#define GBUF_PARITY_MASK 0x03 -#define GBUF_LEAF 0x04 +#define GBUF_LEAF 0x03 #define GBUF_INNER_PARITY(x) ((x) % 3) +#define GBUF_NULLS 0x04 + +#define GBUF_PARITY_MASK 0x03 +#define GBUF_REQ_LEAF(flags) (((flags) & GBUF_PARITY_MASK) == GBUF_LEAF) +#define GBUF_REQ_NULLS(flags) ((flags) & GBUF_NULLS) /* spgutils.c */ extern SpGistCache *spgGetCache(Relation index); @@ -598,7 +627,8 @@ extern void SpGistInitBuffer(Buffer b, uint16 f); extern void SpGistInitMetapage(Page page); extern unsigned int SpGistGetTypeSize(SpGistTypeDesc *att, Datum datum); extern SpGistLeafTuple spgFormLeafTuple(SpGistState *state, - ItemPointer heapPtr, Datum datum); + ItemPointer heapPtr, + Datum datum, bool isnull); extern SpGistNodeTuple spgFormNodeTuple(SpGistState *state, Datum label, bool isnull); extern SpGistInnerTuple spgFormInnerTuple(SpGistState *state, @@ -621,6 +651,6 @@ extern void spgPageIndexMultiDelete(SpGistState *state, Page page, int firststate, int reststate, BlockNumber blkno, OffsetNumber offnum); extern void spgdoinsert(Relation index, SpGistState *state, - ItemPointer heapPtr, Datum datum); + ItemPointer heapPtr, Datum datum, bool isnull); #endif /* SPGIST_PRIVATE_H */ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 993e3872c7b76..59fd53d2c5fd7 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201203041 +#define CATALOG_VERSION_NO 201203111 #endif diff --git a/src/include/catalog/pg_am.h b/src/include/catalog/pg_am.h index 9aac9e953b369..0d7ed6857e832 100644 --- a/src/include/catalog/pg_am.h +++ b/src/include/catalog/pg_am.h @@ -129,7 +129,7 @@ DESCR("GiST index access method"); DATA(insert OID = 2742 ( gin 0 5 f f f f t t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup - gincostestimate ginoptions )); DESCR("GIN index access method"); #define GIN_AM_OID 2742 -DATA(insert OID = 4000 ( spgist 0 5 f f f f f f f f f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions )); +DATA(insert OID = 4000 ( spgist 0 5 f f f f f t f t f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions )); DESCR("SP-GiST index access method"); #define SPGIST_AM_OID 4000 diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index b1fcada1be4d0..b7497b047f74b 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -68,6 +68,7 @@ CREATE TABLE quad_point_tbl AS SELECT point(unique1,unique2) AS p FROM tenk1; INSERT INTO quad_point_tbl SELECT '(333.0,400.0)'::point FROM generate_series(1,1000); +INSERT INTO quad_point_tbl VALUES (NULL), (NULL), (NULL); CREATE INDEX sp_quad_ind ON quad_point_tbl USING spgist (p); CREATE TABLE kd_point_tbl AS SELECT * FROM quad_point_tbl; CREATE INDEX sp_kd_ind ON kd_point_tbl USING spgist (p kd_point_ops); @@ -227,6 +228,24 @@ SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0 (10,10) (4 rows) +SELECT count(*) FROM quad_point_tbl WHERE p IS NULL; + count +------- + 3 +(1 row) + +SELECT count(*) FROM quad_point_tbl WHERE p IS NOT NULL; + count +------- 
+ 11000 +(1 row) + +SELECT count(*) FROM quad_point_tbl; + count +------- + 11003 +(1 row) + SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; count ------- @@ -678,6 +697,50 @@ SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0 (10,10) (4 rows) +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p IS NULL; + QUERY PLAN +----------------------------------------------------------- + Aggregate + -> Index Only Scan using sp_quad_ind on quad_point_tbl + Index Cond: (p IS NULL) +(3 rows) + +SELECT count(*) FROM quad_point_tbl WHERE p IS NULL; + count +------- + 3 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p IS NOT NULL; + QUERY PLAN +----------------------------------------------------------- + Aggregate + -> Index Only Scan using sp_quad_ind on quad_point_tbl + Index Cond: (p IS NOT NULL) +(3 rows) + +SELECT count(*) FROM quad_point_tbl WHERE p IS NOT NULL; + count +------- + 11000 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl; + QUERY PLAN +----------------------------------------------------------- + Aggregate + -> Index Only Scan using sp_quad_ind on quad_point_tbl +(2 rows) + +SELECT count(*) FROM quad_point_tbl; + count +------- + 11003 +(1 row) + EXPLAIN (COSTS OFF) SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; QUERY PLAN @@ -1108,6 +1171,55 @@ SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0 (10,10) (4 rows) +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p IS NULL; + QUERY PLAN +---------------------------------------------- + Aggregate + -> Bitmap Heap Scan on quad_point_tbl + Recheck Cond: (p IS NULL) + -> Bitmap Index Scan on sp_quad_ind + Index Cond: (p IS NULL) +(5 rows) + +SELECT count(*) FROM quad_point_tbl WHERE p IS NULL; + count +------- + 3 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p IS NOT NULL; + QUERY PLAN +---------------------------------------------- + Aggregate + -> Bitmap Heap Scan on quad_point_tbl + Recheck Cond: (p IS NOT NULL) + -> Bitmap Index Scan on sp_quad_ind + Index Cond: (p IS NOT NULL) +(5 rows) + +SELECT count(*) FROM quad_point_tbl WHERE p IS NOT NULL; + count +------- + 11000 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl; + QUERY PLAN +---------------------------------------------- + Aggregate + -> Bitmap Heap Scan on quad_point_tbl + -> Bitmap Index Scan on sp_quad_ind +(3 rows) + +SELECT count(*) FROM quad_point_tbl; + count +------- + 11003 +(1 row) + EXPLAIN (COSTS OFF) SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; QUERY PLAN diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index 5e5fc22579bb0..57f52612dfa6d 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -102,6 +102,8 @@ CREATE TABLE quad_point_tbl AS INSERT INTO quad_point_tbl SELECT '(333.0,400.0)'::point FROM generate_series(1,1000); +INSERT INTO quad_point_tbl VALUES (NULL), (NULL), (NULL); + CREATE INDEX sp_quad_ind ON quad_point_tbl USING spgist (p); CREATE TABLE kd_point_tbl AS SELECT * FROM quad_point_tbl; @@ -172,6 +174,12 @@ SELECT * FROM point_tbl WHERE f1 IS NOT NULL ORDER BY f1 <-> '0,1'; SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1'; +SELECT count(*) FROM quad_point_tbl WHERE p IS NULL; + +SELECT count(*) FROM quad_point_tbl WHERE p IS NOT NULL; + +SELECT count(*) FROM 
quad_point_tbl; + SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; @@ -305,6 +313,18 @@ EXPLAIN (COSTS OFF) SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1'; SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1'; +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p IS NULL; +SELECT count(*) FROM quad_point_tbl WHERE p IS NULL; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p IS NOT NULL; +SELECT count(*) FROM quad_point_tbl WHERE p IS NOT NULL; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl; +SELECT count(*) FROM quad_point_tbl; + EXPLAIN (COSTS OFF) SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; @@ -422,6 +442,18 @@ EXPLAIN (COSTS OFF) SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1'; SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1'; +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p IS NULL; +SELECT count(*) FROM quad_point_tbl WHERE p IS NULL; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p IS NOT NULL; +SELECT count(*) FROM quad_point_tbl WHERE p IS NOT NULL; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl; +SELECT count(*) FROM quad_point_tbl; + EXPLAIN (COSTS OFF) SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; From 1e4964478add0278a3f7ff685bc033f0f52625ad Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 11 Mar 2012 16:53:04 -0400 Subject: [PATCH 117/129] Fix documented type of t_infomask2. Per Koizumi Satoru --- doc/src/sgml/storage.sgml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml index cb2f60e1eeef4..45223f563d77c 100644 --- a/doc/src/sgml/storage.sgml +++ b/doc/src/sgml/storage.sgml @@ -823,7 +823,7 @@ data. Empty in ordinary tables. t_infomask2 - int16 + uint16 2 bytes number of attributes, plus various flag bits From 8142166162efb193b1fece5cf32afb07fe24e17e Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 11 Mar 2012 16:56:26 -0400 Subject: [PATCH 118/129] Make parameter name consistent with syntax summary. Thomas Hunger --- doc/src/sgml/ref/create_database.sgml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml index 2c34c11cae43f..4e0ed5616c489 100644 --- a/doc/src/sgml/ref/create_database.sgml +++ b/doc/src/sgml/ref/create_database.sgml @@ -81,7 +81,7 @@ CREATE DATABASE name - use_name + user_name The name of the database user who will own the new database, From c6be1f43ab0551a95ec8ac77364e2f8558ae6345 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 11 Mar 2012 18:14:23 -0400 Subject: [PATCH 119/129] Make INSERT/UPDATE queries depend on their specific target columns. We have always created a whole-table dependency for the target relation, but that's not really good enough, as it doesn't prevent scenarios such as dropping an individual target column or altering its type. So we have to create an individual dependency for each target column, as well. Per report from Bill MacArthur of a rule containing UPDATE breaking after such an alteration. 
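As a hedged illustration of the failure mode (a hypothetical sketch, not the
reported case; the table, column, and rule names here are invented):

    CREATE TABLE base (id int, val text);
    CREATE TABLE audit (id int, val text);
    CREATE RULE base_audit AS ON UPDATE TO base
        DO ALSO INSERT INTO audit VALUES (NEW.id, NEW.val);
    -- With only a whole-table dependency recorded, nothing stopped this,
    -- leaving the rule's stored INSERT referencing a dropped column:
    ALTER TABLE audit DROP COLUMN val;
    -- With per-column dependencies, the same ALTER fails with a
    -- dependency error instead.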
Note that this patch doesn't try to make such cases work, only to ensure that the attempted ALTER TABLE throws an error telling you it can't cope with adjusting the rule. This is a long-standing bug, but given the lack of prior reports I'm not going to risk back-patching it. A back-patch wouldn't do anything to fix existing rules' dependency lists, anyway. --- src/backend/catalog/dependency.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 1b92f5c38a1ef..fed724c51cb8c 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -1743,6 +1743,37 @@ find_expr_references_walker(Node *node, } } + /* + * If the query is an INSERT or UPDATE, we should create a dependency + * on each target column, to prevent the specific target column from + * being dropped. Although we will visit the TargetEntry nodes again + * during query_tree_walker, we won't have enough context to do this + * conveniently, so do it here. + */ + if (query->commandType == CMD_INSERT || + query->commandType == CMD_UPDATE) + { + RangeTblEntry *rte; + + if (query->resultRelation <= 0 || + query->resultRelation > list_length(query->rtable)) + elog(ERROR, "invalid resultRelation %d", + query->resultRelation); + rte = rt_fetch(query->resultRelation, query->rtable); + if (rte->rtekind == RTE_RELATION) + { + foreach(lc, query->targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + + if (tle->resjunk) + continue; /* ignore junk tlist items */ + add_object_address(OCLASS_CLASS, rte->relid, tle->resno, + context->addrs); + } + } + } + /* * Add dependencies on constraints listed in query's constraintDeps */ From 9a39583264e46e6d2f869f202847cd8ab6c43f81 Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Mon, 12 Mar 2012 10:13:42 -0400 Subject: [PATCH 120/129] Remove tabs in SGML files --- doc/src/sgml/ref/initdb.sgml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index 5570562a5cddf..0090f49d9f522 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -212,7 +212,7 @@ PostgreSQL documentation - Equivalent to . + Equivalent to . @@ -231,8 +231,8 @@ PostgreSQL documentation - Sets the default text search configuration. - See for further information. + Sets the default text search configuration. + See for further information. From c9f310d377d1d8f8acd2d05bf7920704d33affe5 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Mon, 12 Mar 2012 20:55:09 +0200 Subject: [PATCH 121/129] Add comment for missing break in switch For clarity, following other sites, and to silence Coverity. --- src/backend/utils/adt/formatting.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index d6721f067b1ce..c5e25d942ec4f 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1104,6 +1104,7 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n) case NUM_D: num->flag |= NUM_F_LDECIMAL; num->need_locale = TRUE; + /* FALLTHROUGH */ case NUM_DEC: if (IS_DECIMAL(num)) ereport(ERROR, From bad250f4f31704f05247fa4696ac2077f884ed8e Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Mon, 12 Mar 2012 20:56:13 +0200 Subject: [PATCH 122/129] Use correct sizeof operand in qsort call Probably no practical impact, since all pointers ought to have the same size, but it was wrong nonetheless. Found by Coverity. 
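For context, a standalone sketch of the general rule the fix follows
(ordinary C, not PostgreSQL code): qsort's size argument must be the size
of one array element -- here a pointer -- not of a pointer to an element:

    #include <stdlib.h>
    #include <string.h>

    /* compare two char * elements; qsort hands us pointers to them */
    static int
    cmp_str(const void *a, const void *b)
    {
        return strcmp(*(char *const *) a, *(char *const *) b);
    }

    void
    sort_names(char **names, size_t n)
    {
        /* element type is char *, so sizeof(char *) is the right operand */
        qsort(names, n, sizeof(char *), cmp_str);
    }

On common platforms sizeof(char *) and sizeof(char **) happen to be equal,
which is why the original mistake had no practical impact.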
--- src/backend/utils/adt/tsrank.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c index fb7dcc955f7a7..38e384c6505cf 100644 --- a/src/backend/utils/adt/tsrank.c +++ b/src/backend/utils/adt/tsrank.c @@ -175,7 +175,7 @@ SortAndUniqItems(TSQuery q, int *size) if (*size < 2) return res; - qsort_arg(res, *size, sizeof(QueryOperand **), compareQueryOperand, (void *) operand); + qsort_arg(res, *size, sizeof(QueryOperand *), compareQueryOperand, (void *) operand); ptr = res + 1; prevptr = res; From b4af1c25bbc636379efc5d2ffb9d420765705b8a Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 12 Mar 2012 16:10:05 -0400 Subject: [PATCH 123/129] Fix SPGiST vacuum algorithm to handle concurrent tuple motion properly. A leaf tuple that we need to delete could get moved as a consequence of an insertion happening concurrently with the VACUUM scan. If it moves from a page past the current scan point to a page before, we'll miss it, which is not acceptable. Hence, when we see a leaf-page REDIRECT that could have been made since our scan started, chase down the redirection pointer much as if we were doing a normal index search, and be sure to vacuum every page it leads to. This fixes the issue because, if the tuple was on page N at the instant we start our scan, we will surely find it as a consequence of chasing the redirect from page N, no matter how much it moves around in between. Problem noted by Takashi Yamamoto. --- src/backend/access/spgist/README | 21 +++ src/backend/access/spgist/spgvacuum.c | 231 ++++++++++++++++++++++++-- 2 files changed, 242 insertions(+), 10 deletions(-) diff --git a/src/backend/access/spgist/README b/src/backend/access/spgist/README index d20ad17a4b669..1b86e275914d4 100644 --- a/src/backend/access/spgist/README +++ b/src/backend/access/spgist/README @@ -314,6 +314,27 @@ the reverse map of the nextOffset links (ie, when we see tuple x links to tuple y, we set predecessor[y] = x). Then head tuples are the ones with no predecessor. +Because insertions can occur while VACUUM runs, a pure sequential scan +could miss deleting some target leaf tuples, because they could get moved +from a not-yet-visited leaf page to an already-visited leaf page as a +consequence of a PickSplit or MoveLeafs operation. Failing to delete any +target TID is not acceptable, so we have to extend the algorithm to cope +with such cases. We recognize that such a move might have occurred when +we see a leaf-page REDIRECT tuple whose XID indicates it might have been +created after the VACUUM scan started. We add the redirection target TID +to a "pending list" of places we need to recheck. Between pages of the +main sequential scan, we empty the pending list by visiting each listed +TID. If it points to an inner tuple (from a PickSplit), add each downlink +TID to the pending list. If it points to a leaf page, vacuum that page. +(We could just vacuum the single pointed-to chain, but vacuuming the +whole page simplifies the code and reduces the odds of VACUUM having to +modify the same page multiple times.) To ensure that pending-list +processing can never get into an endless loop, even in the face of +concurrent index changes, we don't remove list entries immediately but +only after we've completed all pending-list processing; instead we just +mark items as done after processing them. Adding a TID that's already in +the list is a no-op, whether or not that item is marked done yet. 
+ spgbulkdelete also updates the index's free space map. Currently, spgvacuumcleanup has nothing to do if spgbulkdelete was diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index a09da84a2aac1..856790ee2aa41 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -25,9 +25,18 @@ #include "storage/indexfsm.h" #include "storage/lmgr.h" #include "storage/procarray.h" +#include "utils/snapmgr.h" -/* local state for vacuum operations */ +/* Entry in pending-list of TIDs we need to revisit */ +typedef struct spgVacPendingItem +{ + ItemPointerData tid; /* redirection target to visit */ + bool done; /* have we dealt with this? */ + struct spgVacPendingItem *next; /* list link */ +} spgVacPendingItem; + +/* Local state for vacuum operations */ typedef struct spgBulkDeleteState { /* Parameters passed in to spgvacuumscan */ @@ -35,22 +44,87 @@ typedef struct spgBulkDeleteState IndexBulkDeleteResult *stats; IndexBulkDeleteCallback callback; void *callback_state; + /* Additional working state */ - SpGistState spgstate; - TransactionId OldestXmin; - BlockNumber lastFilledBlock; + SpGistState spgstate; /* for SPGiST operations that need one */ + spgVacPendingItem *pendingList; /* TIDs we need to (re)visit */ + TransactionId myXmin; /* for detecting newly-added redirects */ + TransactionId OldestXmin; /* for deciding a redirect is obsolete */ + BlockNumber lastFilledBlock; /* last non-deletable block */ } spgBulkDeleteState; +/* + * Add TID to pendingList, but only if not already present. + * + * Note that new items are always appended at the end of the list; this + * ensures that scans of the list don't miss items added during the scan. + */ +static void +spgAddPendingTID(spgBulkDeleteState *bds, ItemPointer tid) +{ + spgVacPendingItem *pitem; + spgVacPendingItem **listLink; + + /* search the list for pre-existing entry */ + listLink = &bds->pendingList; + while (*listLink != NULL) + { + pitem = *listLink; + if (ItemPointerEquals(tid, &pitem->tid)) + return; /* already in list, do nothing */ + listLink = &pitem->next; + } + /* not there, so append new entry */ + pitem = (spgVacPendingItem *) palloc(sizeof(spgVacPendingItem)); + pitem->tid = *tid; + pitem->done = false; + pitem->next = NULL; + *listLink = pitem; +} + +/* + * Clear pendingList + */ +static void +spgClearPendingList(spgBulkDeleteState *bds) +{ + spgVacPendingItem *pitem; + spgVacPendingItem *nitem; + + for (pitem = bds->pendingList; pitem != NULL; pitem = nitem) + { + nitem = pitem->next; + /* All items in list should have been dealt with */ + Assert(pitem->done); + pfree(pitem); + } + bds->pendingList = NULL; +} + /* * Vacuum a regular (non-root) leaf page * * We must delete tuples that are targeted for deletion by the VACUUM, * but not move any tuples that are referenced by outside links; we assume * those are the ones that are heads of chains. + * + * If we find a REDIRECT that was made by a concurrently-running transaction, + * we must add its target TID to pendingList. (We don't try to visit the + * target immediately, first because we don't want VACUUM locking more than + * one buffer at a time, and second because the duplicate-filtering logic + * in spgAddPendingTID is useful to ensure we can't get caught in an infinite + * loop in the face of continuous concurrent insertions.) + * + * If forPending is true, we are examining the page as a consequence of + * chasing a redirect link, not as part of the normal sequential scan. 
+ * We still vacuum the page normally, but we don't increment the stats + * about live tuples; else we'd double-count those tuples, since the page + * has been or will be visited in the sequential scan as well. */ static void -vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer) +vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer, + bool forPending) { Page page = BufferGetPage(buffer); spgxlogVacuumLeaf xlrec; @@ -90,7 +164,8 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer) } else { - bds->stats->num_index_tuples += 1; + if (!forPending) + bds->stats->num_index_tuples += 1; } /* Form predecessor map, too */ @@ -106,6 +181,25 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer) predecessor[lt->nextOffset] = i; } } + else if (lt->tupstate == SPGIST_REDIRECT) + { + SpGistDeadTuple dt = (SpGistDeadTuple) lt; + + Assert(dt->nextOffset == InvalidOffsetNumber); + Assert(ItemPointerIsValid(&dt->pointer)); + + /* + * Add target TID to pending list if the redirection could have + * happened since VACUUM started. + * + * Note: we could make a tighter test by seeing if the xid is + * "running" according to the active snapshot; but tqual.c doesn't + * currently export a suitable API, and it's not entirely clear + * that a tighter test is worth the cycles anyway. + */ + if (TransactionIdFollowsOrEquals(dt->xid, bds->myXmin)) + spgAddPendingTID(bds, &dt->pointer); + } else { Assert(lt->nextOffset == InvalidOffsetNumber); @@ -545,7 +639,7 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno) } else { - vacuumLeafPage(bds, index, buffer); + vacuumLeafPage(bds, index, buffer, false); vacuumRedirectAndPlaceholder(index, buffer, bds->OldestXmin); } } @@ -556,8 +650,8 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno) } /* - * The root page must never be deleted, nor marked as available in FSM, - * because we don't want it ever returned by a search for a place to + * The root pages must never be deleted, nor marked as available in FSM, + * because we don't want them ever returned by a search for a place to * put a new tuple. Otherwise, check for empty/deletable page, and * make sure FSM knows about it. 
*/ @@ -585,6 +679,118 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno) UnlockReleaseBuffer(buffer); } +/* + * Process the pending-TID list between pages of the main scan + */ +static void +spgprocesspending(spgBulkDeleteState *bds) +{ + Relation index = bds->info->index; + spgVacPendingItem *pitem; + spgVacPendingItem *nitem; + BlockNumber blkno; + Buffer buffer; + Page page; + + for (pitem = bds->pendingList; pitem != NULL; pitem = pitem->next) + { + if (pitem->done) + continue; /* ignore already-done items */ + + /* call vacuum_delay_point while not holding any buffer lock */ + vacuum_delay_point(); + + /* examine the referenced page */ + blkno = ItemPointerGetBlockNumber(&pitem->tid); + buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, + RBM_NORMAL, bds->info->strategy); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = (Page) BufferGetPage(buffer); + + if (PageIsNew(page) || SpGistPageIsDeleted(page)) + { + /* Probably shouldn't happen, but ignore it */ + } + else if (SpGistPageIsLeaf(page)) + { + if (SpGistBlockIsRoot(blkno)) + { + /* this should definitely not happen */ + elog(ERROR, "redirection leads to root page of index \"%s\"", + RelationGetRelationName(index)); + } + + /* deal with any deletable tuples */ + vacuumLeafPage(bds, index, buffer, true); + /* might as well do this while we are here */ + vacuumRedirectAndPlaceholder(index, buffer, bds->OldestXmin); + + SpGistSetLastUsedPage(index, buffer); + + /* + * We can mark as done not only this item, but any later ones + * pointing at the same page, since we vacuumed the whole page. + */ + pitem->done = true; + for (nitem = pitem->next; nitem != NULL; nitem = nitem->next) + { + if (ItemPointerGetBlockNumber(&nitem->tid) == blkno) + nitem->done = true; + } + } + else + { + /* + * On an inner page, visit the referenced inner tuple and add + * all its downlinks to the pending list. We might have pending + * items for more than one inner tuple on the same page (in fact + * this is pretty likely given the way space allocation works), + * so get them all while we are here. 
+ */ + for (nitem = pitem; nitem != NULL; nitem = nitem->next) + { + if (nitem->done) + continue; + if (ItemPointerGetBlockNumber(&nitem->tid) == blkno) + { + OffsetNumber offset; + SpGistInnerTuple innerTuple; + + offset = ItemPointerGetOffsetNumber(&nitem->tid); + innerTuple = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, offset)); + if (innerTuple->tupstate == SPGIST_LIVE) + { + SpGistNodeTuple node; + int i; + + SGITITERATE(innerTuple, i, node) + { + if (ItemPointerIsValid(&node->t_tid)) + spgAddPendingTID(bds, &node->t_tid); + } + } + else if (innerTuple->tupstate == SPGIST_REDIRECT) + { + /* transfer attention to redirect point */ + spgAddPendingTID(bds, + &((SpGistDeadTuple) innerTuple)->pointer); + } + else + elog(ERROR, "unexpected SPGiST tuple state: %d", + innerTuple->tupstate); + + nitem->done = true; + } + } + } + + UnlockReleaseBuffer(buffer); + } + + spgClearPendingList(bds); +} + /* * Perform a bulkdelete scan */ @@ -598,6 +804,8 @@ spgvacuumscan(spgBulkDeleteState *bds) /* Finish setting up spgBulkDeleteState */ initSpGistState(&bds->spgstate, index); + bds->pendingList = NULL; + bds->myXmin = GetActiveSnapshot()->xmin; bds->OldestXmin = GetOldestXmin(true, false); bds->lastFilledBlock = SPGIST_LAST_FIXED_BLKNO; @@ -637,6 +845,9 @@ spgvacuumscan(spgBulkDeleteState *bds) for (; blkno < num_pages; blkno++) { spgvacuumpage(bds, blkno); + /* empty the pending-list after each page */ + if (bds->pendingList != NULL) + spgprocesspending(bds); } } @@ -747,7 +958,7 @@ spgvacuumcleanup(PG_FUNCTION_ARGS) IndexFreeSpaceMapVacuum(index); /* - * It's quite possible for us to be fooled by concurrent page splits into + * It's quite possible for us to be fooled by concurrent tuple moves into * double-counting some index tuples, so disbelieve any total that exceeds * the underlying heap's count ... if we know that accurately. Otherwise * this might just make matters worse. 
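
The recency test above, TransactionIdFollowsOrEquals(dt->xid, bds->myXmin),
relies on PostgreSQL's circular XID arithmetic. As a hedged aside, here is a
standalone sketch of that comparison rule (simplified; it ignores the special
handling of permanent XIDs that the real TransactionIdFollowsOrEquals
performs, and is not the transam.c implementation):

    #include <stdint.h>
    #include <stdbool.h>

    typedef uint32_t TransactionId;

    /*
     * True if xid "a" is logically at or after xid "b" on the wrapping
     * 32-bit XID circle, i.e. the redirect could postdate the scan start.
     */
    static bool
    xid_follows_or_equals(TransactionId a, TransactionId b)
    {
        return (int32_t) (a - b) >= 0;
    }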
From 717f6d60859cc2d1d7dfd57520531a50bd78df9c Mon Sep 17 00:00:00 2001
From: Bruce Momjian
Date: Mon, 12 Mar 2012 19:47:54 -0400
Subject: [PATCH 124/129] In pg_upgrade, add various logging improvements:

    add ability to control permissions of created files
    have psql echo its queries for easier debugging
    output four separate log files, and delete them on success
    add -r/--retain option to keep log files after success
    make log files append-only
    remove -g/-G/-l logging options
    suggest tailing appropriate log file on failure
    enhance -v/--verbose behavior
---
 contrib/pg_upgrade/check.c | 26 +++---
 contrib/pg_upgrade/controldata.c | 4 +-
 contrib/pg_upgrade/dump.c | 20 +++--
 contrib/pg_upgrade/exec.c | 23 ++++--
 contrib/pg_upgrade/file.c | 16 ++++
 contrib/pg_upgrade/function.c | 5 +-
 contrib/pg_upgrade/info.c | 20 +++--
 contrib/pg_upgrade/option.c | 100 ++++++++--------------
 contrib/pg_upgrade/pg_upgrade.c | 116 +++++++++++++++++----------
 contrib/pg_upgrade/pg_upgrade.h | 60 ++++++++------
 contrib/pg_upgrade/relfilenode.c | 4 +-
 contrib/pg_upgrade/server.c | 19 +++--
 contrib/pg_upgrade/util.c | 16 ++--
 contrib/pg_upgrade/version.c | 5 +-
 contrib/pg_upgrade/version_old_8_3.c | 34 ++++----
 doc/src/sgml/pgupgrade.sgml | 27 ++-----
 16 files changed, 259 insertions(+), 236 deletions(-)

diff --git a/contrib/pg_upgrade/check.c b/contrib/pg_upgrade/check.c
index a5f63eb6c800f..cf4338400dd00 100644
--- a/contrib/pg_upgrade/check.c
+++ b/contrib/pg_upgrade/check.c
@@ -165,12 +165,13 @@ issue_warnings(char *sequence_script_file_name)
 	if (sequence_script_file_name)
 	{
 		prep_status("Adjusting sequences");
-		exec_prog(true,
-				  SYSTEMQUOTE "\"%s/psql\" --set ON_ERROR_STOP=on "
+		exec_prog(true, true, UTILITY_LOG_FILE,
+				  SYSTEMQUOTE "\"%s/psql\" --echo-queries "
+				  "--set ON_ERROR_STOP=on "
 				  "--no-psqlrc --port %d --username \"%s\" "
-				  "-f \"%s\" --dbname template1 >> \"%s\"" SYSTEMQUOTE,
+				  "-f \"%s\" --dbname template1 >> \"%s\" 2>&1" SYSTEMQUOTE,
 				  new_cluster.bindir, new_cluster.port, os_info.user,
-				  sequence_script_file_name, log_opts.filename2);
+				  sequence_script_file_name, UTILITY_LOG_FILE);
 		unlink(sequence_script_file_name);
 		check_ok();
 	}
@@ -393,10 +394,10 @@ create_script_for_old_cluster_deletion(char **deletion_script_file_name)
 	prep_status("Creating script to delete old cluster");
-	snprintf(*deletion_script_file_name, MAXPGPATH, "%s/delete_old_cluster.%s",
-			 os_info.cwd, SCRIPT_EXT);
+	snprintf(*deletion_script_file_name, MAXPGPATH, "delete_old_cluster.%s",
+			 SCRIPT_EXT);
-	if ((script = fopen(*deletion_script_file_name, "w")) == NULL)
+	if ((script = fopen_priv(*deletion_script_file_name, "w")) == NULL)
 		pg_log(PG_FATAL, "Could not open file \"%s\": %s\n",
 			   *deletion_script_file_name, getErrorText(errno));
@@ -541,8 +542,8 @@ check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster)
 		return;
 	}
-	snprintf(output_path, sizeof(output_path), "%s/contrib_isn_and_int8_pass_by_value.txt",
-			 os_info.cwd);
+	snprintf(output_path, sizeof(output_path),
+			 "contrib_isn_and_int8_pass_by_value.txt");
 	for (dbnum = 0; dbnum < cluster->dbarr.ndbs; dbnum++)
 	{
@@ -569,7 +570,7 @@ check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster)
 		for (rowno = 0; rowno < ntups; rowno++)
 		{
 			found = true;
-			if (script == NULL && (script = fopen(output_path, "w")) == NULL)
+			if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL)
 				pg_log(PG_FATAL, "Could not open file \"%s\": %s\n",
 					   output_path, getErrorText(errno));
 			if (!db_used)
@@ -628,8 +629,7 @@ check_for_reg_data_type_usage(ClusterInfo *cluster)
prep_status("Checking for reg* system OID user data types"); - snprintf(output_path, sizeof(output_path), "%s/tables_using_reg.txt", - os_info.cwd); + snprintf(output_path, sizeof(output_path), "tables_using_reg.txt"); for (dbnum = 0; dbnum < cluster->dbarr.ndbs; dbnum++) { @@ -675,7 +675,7 @@ check_for_reg_data_type_usage(ClusterInfo *cluster) for (rowno = 0; rowno < ntups; rowno++) { found = true; - if (script == NULL && (script = fopen(output_path, "w")) == NULL) + if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL) pg_log(PG_FATAL, "Could not open file \"%s\": %s\n", output_path, getErrorText(errno)); if (!db_used) diff --git a/contrib/pg_upgrade/controldata.c b/contrib/pg_upgrade/controldata.c index 5239601dc6b65..e01280db9e284 100644 --- a/contrib/pg_upgrade/controldata.c +++ b/contrib/pg_upgrade/controldata.c @@ -126,11 +126,9 @@ get_control_data(ClusterInfo *cluster, bool live_check) /* we have the result of cmd in "output". so parse it line by line now */ while (fgets(bufin, sizeof(bufin), output)) { - if (log_opts.debug) - fputs(bufin, log_opts.debug_fd); + pg_log(PG_VERBOSE, "%s", bufin); #ifdef WIN32 - /* * Due to an installer bug, LANG=C doesn't work for PG 8.3.3, but does * work 8.2.6 and 8.3.7, so check for non-ASCII output and suggest a diff --git a/contrib/pg_upgrade/dump.c b/contrib/pg_upgrade/dump.c index 772ca37e8d083..571792b1d4014 100644 --- a/contrib/pg_upgrade/dump.c +++ b/contrib/pg_upgrade/dump.c @@ -11,6 +11,7 @@ #include "pg_upgrade.h" +#include void generate_old_dump(void) @@ -22,10 +23,12 @@ generate_old_dump(void) * --binary-upgrade records the width of dropped columns in pg_class, and * restores the frozenid's for databases and relations. */ - exec_prog(true, + exec_prog(true, true, UTILITY_LOG_FILE, SYSTEMQUOTE "\"%s/pg_dumpall\" --port %d --username \"%s\" " - "--schema-only --binary-upgrade > \"%s/" ALL_DUMP_FILE "\"" - SYSTEMQUOTE, new_cluster.bindir, old_cluster.port, os_info.user, os_info.cwd); + "--schema-only --binary-upgrade %s > \"%s\" 2>> \"%s\"" + SYSTEMQUOTE, new_cluster.bindir, old_cluster.port, os_info.user, + log_opts.verbose ? 
"--verbose" : "", + ALL_DUMP_FILE, UTILITY_LOG_FILE); check_ok(); } @@ -56,15 +59,16 @@ split_old_dump(void) char filename[MAXPGPATH]; bool suppressed_username = false; - snprintf(filename, sizeof(filename), "%s/%s", os_info.cwd, ALL_DUMP_FILE); + snprintf(filename, sizeof(filename), "%s", ALL_DUMP_FILE); if ((all_dump = fopen(filename, "r")) == NULL) pg_log(PG_FATAL, "Could not open dump file \"%s\": %s\n", filename, getErrorText(errno)); - snprintf(filename, sizeof(filename), "%s/%s", os_info.cwd, GLOBALS_DUMP_FILE); - if ((globals_dump = fopen(filename, "w")) == NULL) + snprintf(filename, sizeof(filename), "%s", GLOBALS_DUMP_FILE); + if ((globals_dump = fopen_priv(filename, "w")) == NULL) pg_log(PG_FATAL, "Could not write to dump file \"%s\": %s\n", filename, getErrorText(errno)); - snprintf(filename, sizeof(filename), "%s/%s", os_info.cwd, DB_DUMP_FILE); - if ((db_dump = fopen(filename, "w")) == NULL) + snprintf(filename, sizeof(filename), "%s", DB_DUMP_FILE); + if ((db_dump = fopen_priv(filename, "w")) == NULL) pg_log(PG_FATAL, "Could not write to dump file \"%s\": %s\n", filename, getErrorText(errno)); + current_output = globals_dump; /* patterns used to prevent our own username from being recreated */ diff --git a/contrib/pg_upgrade/exec.c b/contrib/pg_upgrade/exec.c index b870deda79a80..42c5c0fff4026 100644 --- a/contrib/pg_upgrade/exec.c +++ b/contrib/pg_upgrade/exec.c @@ -13,7 +13,7 @@ #include #include - +#include static void check_data_dir(const char *pg_data); static void check_bin_dir(ClusterInfo *cluster); @@ -34,24 +34,37 @@ static int win32_check_directory_write_permissions(void); * instead of returning should an error occur. */ int -exec_prog(bool throw_error, const char *fmt,...) +exec_prog(bool throw_error, bool is_priv, + const char *log_file, const char *fmt,...) { va_list args; int result; char cmd[MAXPGPATH]; + mode_t old_umask; + + if (is_priv) + old_umask = umask(S_IRWXG | S_IRWXO); va_start(args, fmt); vsnprintf(cmd, MAXPGPATH, fmt, args); va_end(args); - pg_log(PG_INFO, "%s\n", cmd); + pg_log(PG_VERBOSE, "%s\n", cmd); result = system(cmd); + if (is_priv) + umask(old_umask); + if (result != 0) { - pg_log(throw_error ? PG_FATAL : PG_INFO, - "There were problems executing \"%s\"\n", cmd); + report_status(PG_REPORT, "*failure*"); + fflush(stdout); + pg_log(PG_VERBOSE, "There were problems executing \"%s\"\n", cmd); + pg_log(throw_error ? 
PG_FATAL : PG_REPORT, + "Consult the last few lines of \"%s\" for\n" + "the probable cause of the failure.\n", + log_file); return 1; } diff --git a/contrib/pg_upgrade/file.c b/contrib/pg_upgrade/file.c index fcf1c440a705c..0276636e03fa4 100644 --- a/contrib/pg_upgrade/file.c +++ b/contrib/pg_upgrade/file.c @@ -316,3 +316,19 @@ win32_pghardlink(const char *src, const char *dst) } #endif + + +/* fopen() file with no group/other permissions */ +FILE * +fopen_priv(const char *path, const char *mode) +{ + mode_t old_umask = umask(S_IRWXG | S_IRWXO); + FILE *fp; + + fp = fopen(path, mode); + umask(old_umask); + + return fp; +} + + diff --git a/contrib/pg_upgrade/function.c b/contrib/pg_upgrade/function.c index 267f29143db34..322503946ea76 100644 --- a/contrib/pg_upgrade/function.c +++ b/contrib/pg_upgrade/function.c @@ -218,8 +218,7 @@ check_loadable_libraries(void) prep_status("Checking for presence of required libraries"); - snprintf(output_path, sizeof(output_path), "%s/loadable_libraries.txt", - os_info.cwd); + snprintf(output_path, sizeof(output_path), "loadable_libraries.txt"); for (libnum = 0; libnum < os_info.num_libraries; libnum++) { @@ -257,7 +256,7 @@ check_loadable_libraries(void) if (PQresultStatus(res) != PGRES_COMMAND_OK) { found = true; - if (script == NULL && (script = fopen(output_path, "w")) == NULL) + if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL) pg_log(PG_FATAL, "Could not open file \"%s\": %s\n", output_path, getErrorText(errno)); fprintf(script, "Could not load library \"%s\"\n%s\n", diff --git a/contrib/pg_upgrade/info.c b/contrib/pg_upgrade/info.c index 692cdc2e6248f..36683fa15eaa9 100644 --- a/contrib/pg_upgrade/info.c +++ b/contrib/pg_upgrade/info.c @@ -132,19 +132,19 @@ create_rel_filename_map(const char *old_data, const char *new_data, void print_maps(FileNameMap *maps, int n_maps, const char *db_name) { - if (log_opts.debug) + if (log_opts.verbose) { int mapnum; - pg_log(PG_DEBUG, "mappings for database \"%s\":\n", db_name); + pg_log(PG_VERBOSE, "mappings for database \"%s\":\n", db_name); for (mapnum = 0; mapnum < n_maps; mapnum++) - pg_log(PG_DEBUG, "%s.%s: %u to %u\n", + pg_log(PG_VERBOSE, "%s.%s: %u to %u\n", maps[mapnum].nspname, maps[mapnum].relname, maps[mapnum].old_relfilenode, maps[mapnum].new_relfilenode); - pg_log(PG_DEBUG, "\n\n"); + pg_log(PG_VERBOSE, "\n\n"); } } @@ -168,11 +168,9 @@ get_db_and_rel_infos(ClusterInfo *cluster) for (dbnum = 0; dbnum < cluster->dbarr.ndbs; dbnum++) get_rel_infos(cluster, &cluster->dbarr.dbs[dbnum]); - if (log_opts.debug) - { - pg_log(PG_DEBUG, "\n%s databases:\n", CLUSTER_NAME(cluster)); + pg_log(PG_VERBOSE, "\n%s databases:\n", CLUSTER_NAME(cluster)); + if (log_opts.verbose) print_db_infos(&cluster->dbarr); - } } @@ -368,9 +366,9 @@ print_db_infos(DbInfoArr *db_arr) for (dbnum = 0; dbnum < db_arr->ndbs; dbnum++) { - pg_log(PG_DEBUG, "Database: %s\n", db_arr->dbs[dbnum].db_name); + pg_log(PG_VERBOSE, "Database: %s\n", db_arr->dbs[dbnum].db_name); print_rel_infos(&db_arr->dbs[dbnum].rel_arr); - pg_log(PG_DEBUG, "\n\n"); + pg_log(PG_VERBOSE, "\n\n"); } } @@ -381,7 +379,7 @@ print_rel_infos(RelInfoArr *arr) int relnum; for (relnum = 0; relnum < arr->nrels; relnum++) - pg_log(PG_DEBUG, "relname: %s.%s: reloid: %u reltblspace: %s\n", + pg_log(PG_VERBOSE, "relname: %s.%s: reloid: %u reltblspace: %s\n", arr->rels[relnum].nspname, arr->rels[relnum].relname, arr->rels[relnum].reloid, arr->rels[relnum].tablespace); } diff --git a/contrib/pg_upgrade/option.c b/contrib/pg_upgrade/option.c index 
0a105efec713f..a97be28d92c52 100644 --- a/contrib/pg_upgrade/option.c +++ b/contrib/pg_upgrade/option.c @@ -11,8 +11,10 @@ #include "pg_upgrade.h" -#include "getopt_long.h" - +#include +#include +#include +#include #ifdef WIN32 #include #endif @@ -46,18 +48,18 @@ parseCommandLine(int argc, char *argv[]) {"user", required_argument, NULL, 'u'}, {"check", no_argument, NULL, 'c'}, - {"debug", no_argument, NULL, 'g'}, - {"debugfile", required_argument, NULL, 'G'}, {"link", no_argument, NULL, 'k'}, - {"logfile", required_argument, NULL, 'l'}, + {"retain", no_argument, NULL, 'r'}, {"verbose", no_argument, NULL, 'v'}, {NULL, 0, NULL, 0} }; int option; /* Command line option */ int optindex = 0; /* used by getopt_long */ int os_user_effective_id; - char *return_buf; - + FILE *fp; + int i; + time_t run_time = time(NULL); + user_opts.transfer_mode = TRANSFER_MODE_COPY; os_info.progname = get_progname(argv[0]); @@ -94,11 +96,10 @@ parseCommandLine(int argc, char *argv[]) if (os_user_effective_id == 0) pg_log(PG_FATAL, "%s: cannot be run as root\n", os_info.progname); - return_buf = getcwd(os_info.cwd, MAXPGPATH); - if (return_buf == NULL) - pg_log(PG_FATAL, "Could not access current working directory: %s\n", getErrorText(errno)); + if ((log_opts.internal = fopen_priv(INTERNAL_LOG_FILE, "a")) == NULL) + pg_log(PG_FATAL, "cannot write to log file %s\n", INTERNAL_LOG_FILE); - while ((option = getopt_long(argc, argv, "d:D:b:B:cgG:kl:o:O:p:P:u:v", + while ((option = getopt_long(argc, argv, "d:D:b:B:cko:O:p:P:ru:v", long_options, &optindex)) != -1) { switch (option) @@ -125,27 +126,10 @@ parseCommandLine(int argc, char *argv[]) new_cluster.pgconfig = pg_strdup(optarg); break; - case 'g': - pg_log(PG_REPORT, "Running in debug mode\n"); - log_opts.debug = true; - break; - - case 'G': - if ((log_opts.debug_fd = fopen(optarg, "w")) == NULL) - { - pg_log(PG_FATAL, "cannot open debug file\n"); - exit(1); - } - break; - case 'k': user_opts.transfer_mode = TRANSFER_MODE_LINK; break; - case 'l': - log_opts.filename = pg_strdup(optarg); - break; - case 'o': old_cluster.pgopts = pg_strdup(optarg); break; @@ -175,6 +159,10 @@ parseCommandLine(int argc, char *argv[]) } break; + case 'r': + log_opts.retain = true; + break; + case 'u': pg_free(os_info.user); os_info.user = pg_strdup(optarg); @@ -199,36 +187,18 @@ parseCommandLine(int argc, char *argv[]) } } - if (log_opts.filename != NULL) - { - /* - * We must use append mode so output generated by child processes via - * ">>" will not be overwritten, and we want the file truncated on - * start. 
- */ - /* truncate */ - if ((log_opts.fd = fopen(log_opts.filename, "w")) == NULL) - pg_log(PG_FATAL, "cannot write to log file %s\n", log_opts.filename); - fclose(log_opts.fd); - if ((log_opts.fd = fopen(log_opts.filename, "a")) == NULL) - pg_log(PG_FATAL, "cannot write to log file %s\n", log_opts.filename); - } - else - log_opts.filename = pg_strdup(DEVNULL); - - /* WIN32 files do not accept writes from multiple processes */ -#ifndef WIN32 - log_opts.filename2 = pg_strdup(log_opts.filename); -#else - log_opts.filename2 = pg_strdup(DEVNULL); -#endif - - /* if no debug file name, output to the terminal */ - if (log_opts.debug && !log_opts.debug_fd) + /* label start of upgrade in logfiles */ + for (i = 0; i < NUM_LOG_FILES; i++) { - log_opts.debug_fd = fopen(DEVTTY, "w"); - if (!log_opts.debug_fd) - pg_log(PG_FATAL, "cannot write to terminal\n"); + if ((fp = fopen_priv(output_files[i], "a")) == NULL) + pg_log(PG_FATAL, "cannot write to log file %s\n", + output_files[i]); + fprintf(fp, "\n" + "-----------------------------------------------------------------\n" + " pg_upgrade run on %s" + "-----------------------------------------------------------------\n\n", + ctime(&run_time)); + fclose(fp); } /* Get values from env if not already set */ @@ -256,16 +226,14 @@ Options:\n\ -c, --check check clusters only, don't change any data\n\ -d, --old-datadir=OLDDATADIR old cluster data directory\n\ -D, --new-datadir=NEWDATADIR new cluster data directory\n\ - -g, --debug enable debugging\n\ - -G, --debugfile=FILENAME output debugging activity to file\n\ -k, --link link instead of copying files to new cluster\n\ - -l, --logfile=FILENAME log internal activity to file\n\ -o, --old-options=OPTIONS old cluster options to pass to the server\n\ -O, --new-options=OPTIONS new cluster options to pass to the server\n\ -p, --old-port=OLDPORT old cluster port number (default %d)\n\ -P, --new-port=NEWPORT new cluster port number (default %d)\n\ + -r, --retain retain SQL and log files after success\n\ -u, --user=NAME cluster superuser (default \"%s\")\n\ - -v, --verbose enable verbose output\n\ + -v, --verbose enable verbose internal logging\n\ -V, --version display version information, then exit\n\ -h, --help show this help, then exit\n\ \n\ @@ -354,19 +322,19 @@ adjust_data_dir(ClusterInfo *cluster) { char filename[MAXPGPATH]; char cmd[MAXPGPATH], cmd_output[MAX_STRING]; - FILE *fd, *output; + FILE *fp, *output; /* If there is no postgresql.conf, it can't be a config-only dir */ snprintf(filename, sizeof(filename), "%s/postgresql.conf", cluster->pgconfig); - if ((fd = fopen(filename, "r")) == NULL) + if ((fp = fopen(filename, "r")) == NULL) return; - fclose(fd); + fclose(fp); /* If PG_VERSION exists, it can't be a config-only dir */ snprintf(filename, sizeof(filename), "%s/PG_VERSION", cluster->pgconfig); - if ((fd = fopen(filename, "r")) != NULL) + if ((fp = fopen(filename, "r")) != NULL) { - fclose(fd); + fclose(fp); return; } diff --git a/contrib/pg_upgrade/pg_upgrade.c b/contrib/pg_upgrade/pg_upgrade.c index 3078bcd4cd03b..269f8adeb1533 100644 --- a/contrib/pg_upgrade/pg_upgrade.c +++ b/contrib/pg_upgrade/pg_upgrade.c @@ -55,6 +55,14 @@ ClusterInfo old_cluster, new_cluster; OSInfo os_info; +char *output_files[NUM_LOG_FILES] = { + SERVER_LOG_FILE, + RESTORE_LOG_FILE, + UTILITY_LOG_FILE, + INTERNAL_LOG_FILE +}; + + int main(int argc, char **argv) { @@ -127,9 +135,11 @@ main(int argc, char **argv) * because there is no need to have the schema load use new oids. 
*/ prep_status("Setting next OID for new cluster"); - exec_prog(true, SYSTEMQUOTE "\"%s/pg_resetxlog\" -o %u \"%s\" > " - DEVNULL SYSTEMQUOTE, - new_cluster.bindir, old_cluster.controldata.chkpnt_nxtoid, new_cluster.pgdata); + exec_prog(true, true, UTILITY_LOG_FILE, + SYSTEMQUOTE "\"%s/pg_resetxlog\" -o %u \"%s\" >> \"%s\" 2>&1" + SYSTEMQUOTE, + new_cluster.bindir, old_cluster.controldata.chkpnt_nxtoid, + new_cluster.pgdata, UTILITY_LOG_FILE); check_ok(); create_script_for_old_cluster_deletion(&deletion_script_file_name); @@ -193,10 +203,11 @@ prepare_new_cluster(void) * --analyze so autovacuum doesn't update statistics later */ prep_status("Analyzing all rows in the new cluster"); - exec_prog(true, + exec_prog(true, true, UTILITY_LOG_FILE, SYSTEMQUOTE "\"%s/vacuumdb\" --port %d --username \"%s\" " - "--all --analyze >> \"%s\" 2>&1" SYSTEMQUOTE, - new_cluster.bindir, new_cluster.port, os_info.user, log_opts.filename2); + "--all --analyze %s >> \"%s\" 2>&1" SYSTEMQUOTE, + new_cluster.bindir, new_cluster.port, os_info.user, + log_opts.verbose ? "--verbose" : "", UTILITY_LOG_FILE); check_ok(); /* @@ -206,10 +217,11 @@ prepare_new_cluster(void) * later. */ prep_status("Freezing all rows on the new cluster"); - exec_prog(true, + exec_prog(true, true, UTILITY_LOG_FILE, SYSTEMQUOTE "\"%s/vacuumdb\" --port %d --username \"%s\" " - "--all --freeze >> \"%s\" 2>&1" SYSTEMQUOTE, - new_cluster.bindir, new_cluster.port, os_info.user, log_opts.filename2); + "--all --freeze %s >> \"%s\" 2>&1" SYSTEMQUOTE, + new_cluster.bindir, new_cluster.port, os_info.user, + log_opts.verbose ? "--verbose" : "", UTILITY_LOG_FILE); check_ok(); get_pg_database_relfilenode(&new_cluster); @@ -243,13 +255,14 @@ prepare_new_databases(void) * support functions in template1 but pg_dumpall creates database using * the template0 template. 
*/ - exec_prog(true, - SYSTEMQUOTE "\"%s/psql\" --set ON_ERROR_STOP=on " - /* --no-psqlrc prevents AUTOCOMMIT=off */ + exec_prog(true, true, RESTORE_LOG_FILE, + SYSTEMQUOTE "\"%s/psql\" --echo-queries " + "--set ON_ERROR_STOP=on " + /* --no-psqlrc prevents AUTOCOMMIT=off */ "--no-psqlrc --port %d --username \"%s\" " - "-f \"%s/%s\" --dbname template1 >> \"%s\"" SYSTEMQUOTE, - new_cluster.bindir, new_cluster.port, os_info.user, os_info.cwd, - GLOBALS_DUMP_FILE, log_opts.filename2); + "-f \"%s\" --dbname template1 >> \"%s\" 2>&1" SYSTEMQUOTE, + new_cluster.bindir, new_cluster.port, os_info.user, + GLOBALS_DUMP_FILE, RESTORE_LOG_FILE); check_ok(); /* we load this to get a current list of databases */ @@ -275,12 +288,13 @@ create_new_objects(void) check_ok(); prep_status("Restoring database schema to new cluster"); - exec_prog(true, - SYSTEMQUOTE "\"%s/psql\" --set ON_ERROR_STOP=on " + exec_prog(true, true, RESTORE_LOG_FILE, + SYSTEMQUOTE "\"%s/psql\" --echo-queries " + "--set ON_ERROR_STOP=on " "--no-psqlrc --port %d --username \"%s\" " - "-f \"%s/%s\" --dbname template1 >> \"%s\"" SYSTEMQUOTE, - new_cluster.bindir, new_cluster.port, os_info.user, os_info.cwd, - DB_DUMP_FILE, log_opts.filename2); + "-f \"%s\" --dbname template1 >> \"%s\" 2>&1" SYSTEMQUOTE, + new_cluster.bindir, new_cluster.port, os_info.user, + DB_DUMP_FILE, RESTORE_LOG_FILE); check_ok(); /* regenerate now that we have objects in the databases */ @@ -306,29 +320,38 @@ copy_clog_xlog_xid(void) check_ok(); prep_status("Copying old commit clogs to new server"); + exec_prog(true, false, UTILITY_LOG_FILE, #ifndef WIN32 - exec_prog(true, SYSTEMQUOTE "%s \"%s\" \"%s\"" SYSTEMQUOTE, + SYSTEMQUOTE "%s \"%s\" \"%s\" >> \"%s\" 2>&1" SYSTEMQUOTE, "cp -Rf", #else /* flags: everything, no confirm, quiet, overwrite read-only */ - exec_prog(true, SYSTEMQUOTE "%s \"%s\" \"%s\\\"" SYSTEMQUOTE, + SYSTEMQUOTE "%s \"%s\" \"%s\\\" >> \"%s\" 2>&1" SYSTEMQUOTE, "xcopy /e /y /q /r", #endif - old_clog_path, new_clog_path); + old_clog_path, new_clog_path, UTILITY_LOG_FILE); check_ok(); /* set the next transaction id of the new cluster */ prep_status("Setting next transaction ID for new cluster"); - exec_prog(true, SYSTEMQUOTE "\"%s/pg_resetxlog\" -f -x %u \"%s\" > " DEVNULL SYSTEMQUOTE, - new_cluster.bindir, old_cluster.controldata.chkpnt_nxtxid, new_cluster.pgdata); + exec_prog(true, true, UTILITY_LOG_FILE, + SYSTEMQUOTE + "\"%s/pg_resetxlog\" -f -x %u \"%s\" >> \"%s\" 2>&1" + SYSTEMQUOTE, new_cluster.bindir, + old_cluster.controldata.chkpnt_nxtxid, + new_cluster.pgdata, UTILITY_LOG_FILE); check_ok(); /* now reset the wal archives in the new cluster */ prep_status("Resetting WAL archives"); - exec_prog(true, SYSTEMQUOTE "\"%s/pg_resetxlog\" -l %u,%u,%u \"%s\" >> \"%s\" 2>&1" SYSTEMQUOTE, - new_cluster.bindir, old_cluster.controldata.chkpnt_tli, - old_cluster.controldata.logid, old_cluster.controldata.nxtlogseg, - new_cluster.pgdata, log_opts.filename2); + exec_prog(true, true, UTILITY_LOG_FILE, + SYSTEMQUOTE + "\"%s/pg_resetxlog\" -l %u,%u,%u \"%s\" >> \"%s\" 2>&1" + SYSTEMQUOTE, new_cluster.bindir, + old_cluster.controldata.chkpnt_tli, + old_cluster.controldata.logid, + old_cluster.controldata.nxtlogseg, + new_cluster.pgdata, UTILITY_LOG_FILE); check_ok(); } @@ -421,18 +444,27 @@ set_frozenxids(void) static void cleanup(void) { - char filename[MAXPGPATH]; - - if (log_opts.fd) - fclose(log_opts.fd); + + fclose(log_opts.internal); - if (log_opts.debug_fd) - fclose(log_opts.debug_fd); - - snprintf(filename, sizeof(filename), "%s/%s", os_info.cwd, 
ALL_DUMP_FILE);
-	unlink(filename);
-	snprintf(filename, sizeof(filename), "%s/%s", os_info.cwd, GLOBALS_DUMP_FILE);
-	unlink(filename);
-	snprintf(filename, sizeof(filename), "%s/%s", os_info.cwd, DB_DUMP_FILE);
-	unlink(filename);
+	/* Remove dump and log files? */
+	if (!log_opts.retain)
+	{
+		char		filename[MAXPGPATH];
+		int			i;
+
+		for (i = 0; i < NUM_LOG_FILES; i++)
+		{
+			snprintf(filename, sizeof(filename), "%s", output_files[i]);
+			unlink(filename);
+		}
+
+		/* remove SQL files */
+		snprintf(filename, sizeof(filename), "%s", ALL_DUMP_FILE);
+		unlink(filename);
+		snprintf(filename, sizeof(filename), "%s", GLOBALS_DUMP_FILE);
+		unlink(filename);
+		snprintf(filename, sizeof(filename), "%s", DB_DUMP_FILE);
+		unlink(filename);
+	}
 }
diff --git a/contrib/pg_upgrade/pg_upgrade.h b/contrib/pg_upgrade/pg_upgrade.h
index a95481509db74..46f9169d0c409 100644
--- a/contrib/pg_upgrade/pg_upgrade.h
+++ b/contrib/pg_upgrade/pg_upgrade.h
@@ -35,6 +35,34 @@
 #define GLOBALS_DUMP_FILE "pg_upgrade_dump_globals.sql"
 #define DB_DUMP_FILE "pg_upgrade_dump_db.sql"
+#define SERVER_LOG_FILE "pg_upgrade_server.log"
+#define RESTORE_LOG_FILE "pg_upgrade_restore.log"
+#define UTILITY_LOG_FILE "pg_upgrade_utility.log"
+#define INTERNAL_LOG_FILE "pg_upgrade_internal.log"
+
+#define NUM_LOG_FILES 4
+extern char *output_files[];
+
+/*
+ * WIN32 files do not accept writes from multiple processes
+ *
+ * On Win32, we can't send both pg_upgrade output and command output to the
+ * same file because we get the error: "The process cannot access the file
+ * because it is being used by another process." so send the pg_ctl
+ * command-line output to the utility log file on Windows, rather than
+ * into the server log file.
+ *
+ * We could use the Windows pgwin32_open() flags to allow shared file
+ * writes, but it is unclear how all other tools would use those flags, so
+ * we just avoid it and log a little differently on Windows; we adjust
+ * the error message appropriately.
+ */
+#ifndef WIN32
+#define SERVER_LOG_FILE2 SERVER_LOG_FILE
+#else
+#define SERVER_LOG_FILE2 UTILITY_LOG_FILE
+#endif
+
 #ifndef WIN32
 #define pg_copy_file copy_file
 #define pg_mv_file rename
@@ -166,11 +194,10 @@ typedef enum
  */
 typedef enum
 {
-	PG_INFO,
+	PG_VERBOSE,
 	PG_REPORT,
 	PG_WARNING,
-	PG_FATAL,
-	PG_DEBUG
+	PG_FATAL
 } eLogType;
@@ -204,25 +231,9 @@ typedef struct
 */
 typedef struct
 {
-	char	   *filename;		/* name of log file (may be /dev/null) */
-	/*
-	 * WIN32 files do not accept writes from multiple processes
-	 *
-	 * On Win32, we can't send both pg_upgrade output and command output to the
-	 * same file because we get the error: "The process cannot access the file
-	 * because it is being used by another process." so we have to send all
-	 * other output to 'nul'.  Therefore, we set this to DEVNULL on Win32, and
-	 * it equals 'filename' on all other platforms.
-	 *
-	 * We could use the Windows pgwin32_open() flags to allow shared file
-	 * writes but is unclear how all other tools would use those flags, so
-	 * we just avoid it and log a little less on Windows.
- */ - char *filename2; - FILE *fd; /* log FILE */ - bool debug; /* TRUE -> log more information */ - FILE *debug_fd; /* debug-level log FILE */ + FILE *internal; /* internal log FILE */ bool verbose; /* TRUE -> be verbose in messages */ + bool retain; /* retain log files on success */ } LogOpts; @@ -245,7 +256,6 @@ typedef struct const char *progname; /* complete pathname for this program */ char *exec_path; /* full path to my executable */ char *user; /* username for clusters */ - char cwd[MAXPGPATH]; /* current working directory, used for output */ char **tablespaces; /* tablespaces */ int num_tablespaces; char **libraries; /* loadable libraries */ @@ -294,8 +304,9 @@ void split_old_dump(void); /* exec.c */ -int exec_prog(bool throw_error, const char *cmd, ...) - __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 3))); +int exec_prog(bool throw_error, bool is_priv, + const char *log_file, const char *cmd, ...) + __attribute__((format(PG_PRINTF_ATTRIBUTE, 4, 5))); void verify_directories(void); bool is_server_running(const char *datadir); @@ -339,6 +350,7 @@ const char *linkAndUpdateFile(pageCnvCtx *pageConverter, const char *src, const char *dst); void check_hard_link(void); +FILE *fopen_priv(const char *path, const char *mode); /* function.c */ diff --git a/contrib/pg_upgrade/relfilenode.c b/contrib/pg_upgrade/relfilenode.c index a1e30b1f0ce91..45d6c5415bf15 100644 --- a/contrib/pg_upgrade/relfilenode.c +++ b/contrib/pg_upgrade/relfilenode.c @@ -267,7 +267,7 @@ transfer_relfile(pageCnvCtx *pageConverter, const char *old_file, if (user_opts.transfer_mode == TRANSFER_MODE_COPY) { - pg_log(PG_INFO, "copying \"%s\" to \"%s\"\n", old_file, new_file); + pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", old_file, new_file); if ((msg = copyAndUpdateFile(pageConverter, old_file, new_file, true)) != NULL) pg_log(PG_FATAL, "error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", @@ -275,7 +275,7 @@ transfer_relfile(pageCnvCtx *pageConverter, const char *old_file, } else { - pg_log(PG_INFO, "linking \"%s\" to \"%s\"\n", old_file, new_file); + pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"\n", old_file, new_file); if ((msg = linkAndUpdateFile(pageConverter, old_file, new_file)) != NULL) pg_log(PG_FATAL, diff --git a/contrib/pg_upgrade/server.c b/contrib/pg_upgrade/server.c index 989af63a783a4..b515e0504c34c 100644 --- a/contrib/pg_upgrade/server.c +++ b/contrib/pg_upgrade/server.c @@ -80,7 +80,7 @@ executeQueryOrDie(PGconn *conn, const char *fmt,...) vsnprintf(command, sizeof(command), fmt, args); va_end(args); - pg_log(PG_DEBUG, "executing: %s\n", command); + pg_log(PG_VERBOSE, "executing: %s\n", command); result = PQexec(conn, command); status = PQresultStatus(result); @@ -161,17 +161,22 @@ start_postmaster(ClusterInfo *cluster) snprintf(cmd, sizeof(cmd), SYSTEMQUOTE "\"%s/pg_ctl\" -w -l \"%s\" -D \"%s\" " "-o \"-p %d %s %s\" start >> \"%s\" 2>&1" SYSTEMQUOTE, - cluster->bindir, log_opts.filename2, cluster->pgconfig, cluster->port, + cluster->bindir, SERVER_LOG_FILE, cluster->pgconfig, cluster->port, (cluster->controldata.cat_ver >= BINARY_UPGRADE_SERVER_FLAG_CAT_VER) ? "-b" : "-c autovacuum=off -c autovacuum_freeze_max_age=2000000000", - cluster->pgopts ? cluster->pgopts : "", log_opts.filename2); + cluster->pgopts ? cluster->pgopts : "", SERVER_LOG_FILE2); /* * Don't throw an error right away, let connecting throw the error because * it might supply a reason for the failure. 
*/ - pg_ctl_return = exec_prog(false, "%s", cmd); + pg_ctl_return = exec_prog(false, true, + /* pass both file names if the differ */ + (strcmp(SERVER_LOG_FILE, SERVER_LOG_FILE2) == 0) ? + SERVER_LOG_FILE : + SERVER_LOG_FILE " or " SERVER_LOG_FILE2, + "%s", cmd); /* Check to see if we can connect to the server; if not, report it. */ if ((conn = get_db_conn(cluster, "template1")) == NULL || @@ -211,11 +216,11 @@ stop_postmaster(bool fast) snprintf(cmd, sizeof(cmd), SYSTEMQUOTE "\"%s/pg_ctl\" -w -l \"%s\" -D \"%s\" -o \"%s\" " "%s stop >> \"%s\" 2>&1" SYSTEMQUOTE, - cluster->bindir, log_opts.filename2, cluster->pgconfig, + cluster->bindir, SERVER_LOG_FILE2, cluster->pgconfig, cluster->pgopts ? cluster->pgopts : "", - fast ? "-m fast" : "", log_opts.filename2); + fast ? "-m fast" : "", SERVER_LOG_FILE2); - exec_prog(fast ? false : true, "%s", cmd); + exec_prog(fast ? false : true, true, SERVER_LOG_FILE2, "%s", cmd); os_info.running_cluster = NULL; } diff --git a/contrib/pg_upgrade/util.c b/contrib/pg_upgrade/util.c index 94eaa189b0e31..6977663b63aab 100644 --- a/contrib/pg_upgrade/util.c +++ b/contrib/pg_upgrade/util.c @@ -77,18 +77,19 @@ pg_log(eLogType type, char *fmt,...) vsnprintf(message, sizeof(message), fmt, args); va_end(args); - if (log_opts.fd != NULL) + /* PG_VERBOSE is only output in verbose mode */ + if (type != PG_VERBOSE || log_opts.verbose) { - fwrite(message, strlen(message), 1, log_opts.fd); + fwrite(message, strlen(message), 1, log_opts.internal); /* if we are using OVERWRITE_MESSAGE, add newline */ if (strchr(message, '\r') != NULL) - fwrite("\n", 1, 1, log_opts.fd); - fflush(log_opts.fd); + fwrite("\n", 1, 1, log_opts.internal); + fflush(log_opts.internal); } switch (type) { - case PG_INFO: + case PG_VERBOSE: if (log_opts.verbose) printf("%s", _(message)); break; @@ -104,11 +105,6 @@ pg_log(eLogType type, char *fmt,...) 
exit(1); break; - case PG_DEBUG: - if (log_opts.debug) - fprintf(log_opts.debug_fd, "%s\n", _(message)); - break; - default: break; } diff --git a/contrib/pg_upgrade/version.c b/contrib/pg_upgrade/version.c index e8799a4085638..5d790a0803e2a 100644 --- a/contrib/pg_upgrade/version.c +++ b/contrib/pg_upgrade/version.c @@ -28,8 +28,7 @@ new_9_0_populate_pg_largeobject_metadata(ClusterInfo *cluster, bool check_mode) prep_status("Checking for large objects"); - snprintf(output_path, sizeof(output_path), "%s/pg_largeobject.sql", - os_info.cwd); + snprintf(output_path, sizeof(output_path), "pg_largeobject.sql"); for (dbnum = 0; dbnum < cluster->dbarr.ndbs; dbnum++) { @@ -49,7 +48,7 @@ new_9_0_populate_pg_largeobject_metadata(ClusterInfo *cluster, bool check_mode) found = true; if (!check_mode) { - if (script == NULL && (script = fopen(output_path, "w")) == NULL) + if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL) pg_log(PG_FATAL, "could not open file \"%s\": %s\n", output_path, getErrorText(errno)); fprintf(script, "\\connect %s\n", quote_identifier(active_db->db_name)); diff --git a/contrib/pg_upgrade/version_old_8_3.c b/contrib/pg_upgrade/version_old_8_3.c index a8641076aa075..c60374ee9b2a9 100644 --- a/contrib/pg_upgrade/version_old_8_3.c +++ b/contrib/pg_upgrade/version_old_8_3.c @@ -30,8 +30,7 @@ old_8_3_check_for_name_data_type_usage(ClusterInfo *cluster) prep_status("Checking for invalid \"name\" user columns"); - snprintf(output_path, sizeof(output_path), "%s/tables_using_name.txt", - os_info.cwd); + snprintf(output_path, sizeof(output_path), "tables_using_name.txt"); for (dbnum = 0; dbnum < cluster->dbarr.ndbs; dbnum++) { @@ -73,7 +72,7 @@ old_8_3_check_for_name_data_type_usage(ClusterInfo *cluster) for (rowno = 0; rowno < ntups; rowno++) { found = true; - if (script == NULL && (script = fopen(output_path, "w")) == NULL) + if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL) pg_log(PG_FATAL, "could not open file \"%s\": %s\n", output_path, getErrorText(errno)); if (!db_used) { @@ -126,8 +125,7 @@ old_8_3_check_for_tsquery_usage(ClusterInfo *cluster) prep_status("Checking for tsquery user columns"); - snprintf(output_path, sizeof(output_path), "%s/tables_using_tsquery.txt", - os_info.cwd); + snprintf(output_path, sizeof(output_path), "tables_using_tsquery.txt"); for (dbnum = 0; dbnum < cluster->dbarr.ndbs; dbnum++) { @@ -164,7 +162,7 @@ old_8_3_check_for_tsquery_usage(ClusterInfo *cluster) for (rowno = 0; rowno < ntups; rowno++) { found = true; - if (script == NULL && (script = fopen(output_path, "w")) == NULL) + if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL) pg_log(PG_FATAL, "could not open file \"%s\": %s\n", output_path, getErrorText(errno)); if (!db_used) { @@ -216,8 +214,7 @@ old_8_3_check_ltree_usage(ClusterInfo *cluster) prep_status("Checking for contrib/ltree"); - snprintf(output_path, sizeof(output_path), "%s/contrib_ltree.txt", - os_info.cwd); + snprintf(output_path, sizeof(output_path), "contrib_ltree.txt"); for (dbnum = 0; dbnum < cluster->dbarr.ndbs; dbnum++) { @@ -244,7 +241,7 @@ old_8_3_check_ltree_usage(ClusterInfo *cluster) for (rowno = 0; rowno < ntups; rowno++) { found = true; - if (script == NULL && (script = fopen(output_path, "w")) == NULL) + if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL) pg_log(PG_FATAL, "Could not open file \"%s\": %s\n", output_path, getErrorText(errno)); if (!db_used) @@ -304,8 +301,7 @@ old_8_3_rebuild_tsvector_tables(ClusterInfo *cluster, bool check_mode) 
prep_status("Checking for tsvector user columns"); - snprintf(output_path, sizeof(output_path), "%s/rebuild_tsvector_tables.sql", - os_info.cwd); + snprintf(output_path, sizeof(output_path), "rebuild_tsvector_tables.sql"); for (dbnum = 0; dbnum < cluster->dbarr.ndbs; dbnum++) { @@ -364,7 +360,7 @@ old_8_3_rebuild_tsvector_tables(ClusterInfo *cluster, bool check_mode) found = true; if (!check_mode) { - if (script == NULL && (script = fopen(output_path, "w")) == NULL) + if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL) pg_log(PG_FATAL, "could not open file \"%s\": %s\n", output_path, getErrorText(errno)); if (!db_used) { @@ -446,8 +442,7 @@ old_8_3_invalidate_hash_gin_indexes(ClusterInfo *cluster, bool check_mode) prep_status("Checking for hash and GIN indexes"); - snprintf(output_path, sizeof(output_path), "%s/reindex_hash_and_gin.sql", - os_info.cwd); + snprintf(output_path, sizeof(output_path), "reindex_hash_and_gin.sql"); for (dbnum = 0; dbnum < cluster->dbarr.ndbs; dbnum++) { @@ -481,7 +476,7 @@ old_8_3_invalidate_hash_gin_indexes(ClusterInfo *cluster, bool check_mode) found = true; if (!check_mode) { - if (script == NULL && (script = fopen(output_path, "w")) == NULL) + if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL) pg_log(PG_FATAL, "could not open file \"%s\": %s\n", output_path, getErrorText(errno)); if (!db_used) { @@ -556,8 +551,7 @@ old_8_3_invalidate_bpchar_pattern_ops_indexes(ClusterInfo *cluster, prep_status("Checking for bpchar_pattern_ops indexes"); - snprintf(output_path, sizeof(output_path), "%s/reindex_bpchar_ops.sql", - os_info.cwd); + snprintf(output_path, sizeof(output_path), "reindex_bpchar_ops.sql"); for (dbnum = 0; dbnum < cluster->dbarr.ndbs; dbnum++) { @@ -601,7 +595,7 @@ old_8_3_invalidate_bpchar_pattern_ops_indexes(ClusterInfo *cluster, found = true; if (!check_mode) { - if (script == NULL && (script = fopen(output_path, "w")) == NULL) + if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL) pg_log(PG_FATAL, "could not open file \"%s\": %s\n", output_path, getErrorText(errno)); if (!db_used) { @@ -683,7 +677,7 @@ old_8_3_create_sequence_script(ClusterInfo *cluster) bool found = false; char *output_path = pg_malloc(MAXPGPATH); - snprintf(output_path, MAXPGPATH, "%s/adjust_sequences.sql", os_info.cwd); + snprintf(output_path, MAXPGPATH, "adjust_sequences.sql"); prep_status("Creating script to adjust sequences"); @@ -723,7 +717,7 @@ old_8_3_create_sequence_script(ClusterInfo *cluster) found = true; - if (script == NULL && (script = fopen(output_path, "w")) == NULL) + if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL) pg_log(PG_FATAL, "could not open file \"%s\": %s\n", output_path, getErrorText(errno)); if (!db_used) { diff --git a/doc/src/sgml/pgupgrade.sgml b/doc/src/sgml/pgupgrade.sgml index 4f263fe6720a1..6ecdfda67198c 100644 --- a/doc/src/sgml/pgupgrade.sgml +++ b/doc/src/sgml/pgupgrade.sgml @@ -90,30 +90,12 @@ variable PGDATANEW - - - - enable debugging - - - - debug_filename - debug_filename - output debugging activity to file - - use hard links instead of copying files to the new cluster - - log_filename - log_filename - log internal activity to file - - options options @@ -142,6 +124,13 @@ variable PGPORTNEW + + + + retain SQL and log files even after successful completion + + + user_name user_name @@ -152,7 +141,7 @@ - enable verbose output + enable verbose internal logging From 97c85098de1e21825adf447df60b95a56cef7bd8 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 13 
Mar 2012 09:35:55 -0400 Subject: [PATCH 125/129] pgstattuple: Add new error case for spgist indexes. Extracted from a larger patch by Jaime Casanova, reviewed by Noah Misch. I think this error message could use some more extensive revision, but this at least makes the handling of spgist consistent with what we do for other types of indexes that this code doesn't know how to handle. --- contrib/pgstattuple/pgstattuple.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c index e5ddd87091036..7af724f24bb1d 100644 --- a/contrib/pgstattuple/pgstattuple.c +++ b/contrib/pgstattuple/pgstattuple.c @@ -231,6 +231,9 @@ pgstat_relation(Relation rel, FunctionCallInfo fcinfo) case GIN_AM_OID: err = "gin index"; break; + case SPGIST_AM_OID: + err = "spgist index"; + break; default: err = "unknown index"; break; From 2e46bf67114586835f4a9908f1a1f08ee8ba83a8 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 13 Mar 2012 09:51:03 -0400 Subject: [PATCH 126/129] pgstattuple: Use a BufferAccessStrategy object to avoid cache-thrashing. Jaime Casanova, reviewed by Noah Misch, slightly modified by me. --- contrib/pgstattuple/pgstatindex.c | 5 ++-- contrib/pgstattuple/pgstattuple.c | 42 +++++++++++++++++++++---------- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/contrib/pgstattuple/pgstatindex.c b/contrib/pgstattuple/pgstatindex.c index beff1b9855bae..9f2ec1f210840 100644 --- a/contrib/pgstattuple/pgstatindex.c +++ b/contrib/pgstattuple/pgstatindex.c @@ -95,6 +95,7 @@ pgstatindex(PG_FUNCTION_ARGS) BlockNumber nblocks; BlockNumber blkno; BTIndexStat indexStat; + BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD); if (!superuser()) ereport(ERROR, @@ -122,7 +123,7 @@ pgstatindex(PG_FUNCTION_ARGS) * Read metapage */ { - Buffer buffer = ReadBuffer(rel, 0); + Buffer buffer = ReadBufferExtended(rel, MAIN_FORKNUM, 0, RBM_NORMAL, bstrategy); Page page = BufferGetPage(buffer); BTMetaPageData *metad = BTPageGetMeta(page); @@ -159,7 +160,7 @@ pgstatindex(PG_FUNCTION_ARGS) CHECK_FOR_INTERRUPTS(); /* Read and lock buffer */ - buffer = ReadBuffer(rel, blkno); + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c index 7af724f24bb1d..c9be8c92e4bf2 100644 --- a/contrib/pgstattuple/pgstattuple.c +++ b/contrib/pgstattuple/pgstattuple.c @@ -61,18 +61,22 @@ typedef struct pgstattuple_type uint64 free_space; /* free/reusable space in bytes */ } pgstattuple_type; -typedef void (*pgstat_page) (pgstattuple_type *, Relation, BlockNumber); +typedef void (*pgstat_page) (pgstattuple_type *, Relation, BlockNumber, + BufferAccessStrategy); static Datum build_pgstattuple_type(pgstattuple_type *stat, FunctionCallInfo fcinfo); static Datum pgstat_relation(Relation rel, FunctionCallInfo fcinfo); static Datum pgstat_heap(Relation rel, FunctionCallInfo fcinfo); static void pgstat_btree_page(pgstattuple_type *stat, - Relation rel, BlockNumber blkno); + Relation rel, BlockNumber blkno, + BufferAccessStrategy bstrategy); static void pgstat_hash_page(pgstattuple_type *stat, - Relation rel, BlockNumber blkno); + Relation rel, BlockNumber blkno, + BufferAccessStrategy bstrategy); static void pgstat_gist_page(pgstattuple_type *stat, - Relation rel, BlockNumber blkno); + Relation rel, BlockNumber blkno, + BufferAccessStrategy bstrategy); static Datum pgstat_index(Relation rel,
BlockNumber start, pgstat_page pagefn, FunctionCallInfo fcinfo); static void pgstat_index_page(pgstattuple_type *stat, Page page, @@ -273,12 +277,17 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo) BlockNumber tupblock; Buffer buffer; pgstattuple_type stat = {0}; + BufferAccessStrategy bstrategy; /* Disable syncscan because we assume we scan from block zero upwards */ scan = heap_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false); nblocks = scan->rs_nblocks; /* # blocks to be scanned */ + /* prepare access strategy for this table */ + bstrategy = GetAccessStrategy(BAS_BULKREAD); + scan->rs_strategy = bstrategy; + /* scan the relation */ while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) { @@ -312,7 +321,7 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo) { CHECK_FOR_INTERRUPTS(); - buffer = ReadBuffer(rel, block); + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, block, RBM_NORMAL, bstrategy); LockBuffer(buffer, BUFFER_LOCK_SHARE); stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer)); UnlockReleaseBuffer(buffer); @@ -325,7 +334,7 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo) { CHECK_FOR_INTERRUPTS(); - buffer = ReadBuffer(rel, block); + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, block, RBM_NORMAL, bstrategy); LockBuffer(buffer, BUFFER_LOCK_SHARE); stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer)); UnlockReleaseBuffer(buffer); @@ -343,12 +352,13 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo) * pgstat_btree_page -- check tuples in a btree page */ static void -pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno) +pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, + BufferAccessStrategy bstrategy) { Buffer buf; Page page; - buf = ReadBuffer(rel, blkno); + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); LockBuffer(buf, BT_READ); page = BufferGetPage(buf); @@ -386,13 +396,14 @@ pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno) * pgstat_hash_page -- check tuples in a hash page */ static void -pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno) +pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, + BufferAccessStrategy bstrategy) { Buffer buf; Page page; _hash_getlock(rel, blkno, HASH_SHARE); - buf = _hash_getbuf(rel, blkno, HASH_READ, 0); + buf = _hash_getbuf_with_strategy(rel, blkno, HASH_READ, 0, bstrategy); page = BufferGetPage(buf); if (PageGetSpecialSize(page) == MAXALIGN(sizeof(HashPageOpaqueData))) @@ -429,12 +440,13 @@ pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno) * pgstat_gist_page -- check tuples in a gist page */ static void -pgstat_gist_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno) +pgstat_gist_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, + BufferAccessStrategy bstrategy) { Buffer buf; Page page; - buf = ReadBuffer(rel, blkno); + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); LockBuffer(buf, GIST_SHARE); gistcheckpage(rel, buf); page = BufferGetPage(buf); @@ -461,8 +473,12 @@ pgstat_index(Relation rel, BlockNumber start, pgstat_page pagefn, { BlockNumber nblocks; BlockNumber blkno; + BufferAccessStrategy bstrategy; pgstattuple_type stat = {0}; + /* prepare access strategy for this index */ + bstrategy = GetAccessStrategy(BAS_BULKREAD); + blkno = start; for (;;) { @@ -483,7 +499,7 @@ pgstat_index(Relation rel, BlockNumber start, pgstat_page pagefn, { CHECK_FOR_INTERRUPTS(); - 
pagefn(&stat, rel, blkno); + pagefn(&stat, rel, blkno, bstrategy); } } From ed75380bdae30dc1313aef44beafad860cf246c0 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 13 Mar 2012 13:19:06 -0400 Subject: [PATCH 127/129] Create a stack of pl/python "execution contexts". MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This replaces the former global variable PLy_curr_procedure, and provides a place to stash per-call-level information. In particular we create a per-call-level scratch memory context. For the moment, the scratch context is just used to avoid leaking memory from datatype output function calls in PLyDict_FromTuple. There will probably be more use cases in the future. Although this is a fix for a pre-existing memory leakage bug, it seems sufficiently invasive that we don't want to back-patch it; it feels better as part of the major rearrangement of plpython code that we've already done as part of 9.2. Jan Urbański --- src/pl/plpython/plpy_cursorobject.c | 7 +- src/pl/plpython/plpy_elog.c | 8 +- src/pl/plpython/plpy_exec.c | 8 +- src/pl/plpython/plpy_main.c | 114 ++++++++++++++++++++++------ src/pl/plpython/plpy_main.h | 15 ++++ src/pl/plpython/plpy_procedure.c | 3 - src/pl/plpython/plpy_procedure.h | 4 - src/pl/plpython/plpy_spi.c | 8 +- src/pl/plpython/plpy_typeio.c | 14 ++++ 9 files changed, 141 insertions(+), 40 deletions(-)
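In outline, the mechanism the diffs below introduce is a singly linked stack: each call handler pushes a fresh context before entering its PG_TRY block and pops it again both in the error path and on normal return, so that nested invocations (a PL/Python function running SQL that calls another PL/Python function) each see their own per-call state. A minimal self-contained sketch of that shape, with invented names rather than the patch's own:

#include <stdio.h>
#include <stdlib.h>

/* Toy analogue of PLyExecutionContext: per-call state plus a stack link. */
typedef struct CallContext
{
	const char *proc_name;		/* the currently executing procedure */
	struct CallContext *next;	/* previous stack level */
} CallContext;

static CallContext *call_stack = NULL;

/* Push a new context; the caller's context stays reachable via next. */
static CallContext *
push_call_context(const char *proc_name)
{
	CallContext *ctx = malloc(sizeof(CallContext));

	if (ctx == NULL)
	{
		fprintf(stderr, "out of memory\n");
		exit(EXIT_FAILURE);
	}
	ctx->proc_name = proc_name;
	ctx->next = call_stack;
	call_stack = ctx;
	return ctx;
}

/* Pop the current context, re-exposing the caller's. */
static void
pop_call_context(void)
{
	CallContext *ctx = call_stack;

	if (ctx == NULL)
	{
		fprintf(stderr, "no call in progress\n");
		exit(EXIT_FAILURE);
	}
	call_stack = ctx->next;
	free(ctx);
}

The real PLy_push_execution_context additionally creates a scratch MemoryContext for each stack entry, and PLy_pop_execution_context deletes it; that scratch context is what the PLyDict_FromTuple leak fix below relies on.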
diff --git a/src/pl/plpython/plpy_cursorobject.c b/src/pl/plpython/plpy_cursorobject.c index 4226dc7d193d3..e8240b63c901e 100644 --- a/src/pl/plpython/plpy_cursorobject.c +++ b/src/pl/plpython/plpy_cursorobject.c @@ -14,6 +14,7 @@ #include "plpy_cursorobject.h" #include "plpy_elog.h" +#include "plpy_main.h" #include "plpy_planobject.h" #include "plpy_procedure.h" #include "plpy_resultobject.h" @@ -119,6 +120,7 @@ PLy_cursor_query(const char *query) PG_TRY(); { + PLyExecutionContext *exec_ctx = PLy_current_execution_context(); SPIPlanPtr plan; Portal portal; @@ -130,7 +132,7 @@ PLy_cursor_query(const char *query) SPI_result_code_string(SPI_result)); portal = SPI_cursor_open(NULL, plan, NULL, NULL, - PLy_curr_procedure->fn_readonly); + exec_ctx->curr_proc->fn_readonly); SPI_freeplan(plan); if (portal == NULL) @@ -207,6 +209,7 @@ PLy_cursor_plan(PyObject *ob, PyObject *args) PG_TRY(); { + PLyExecutionContext *exec_ctx = PLy_current_execution_context(); Portal portal; char *volatile nulls; volatile int j; @@ -253,7 +256,7 @@ PLy_cursor_plan(PyObject *ob, PyObject *args) } portal = SPI_cursor_open(NULL, plan->plan, plan->values, nulls, - PLy_curr_procedure->fn_readonly); + exec_ctx->curr_proc->fn_readonly); if (portal == NULL) elog(ERROR, "SPI_cursor_open() failed: %s", SPI_result_code_string(SPI_result)); diff --git a/src/pl/plpython/plpy_elog.c b/src/pl/plpython/plpy_elog.c index 741980c7c5674..2f04a8c0dba17 100644 --- a/src/pl/plpython/plpy_elog.c +++ b/src/pl/plpython/plpy_elog.c @@ -12,6 +12,7 @@ #include "plpy_elog.h" +#include "plpy_main.h" #include "plpy_procedure.h" @@ -255,6 +256,7 @@ PLy_traceback(char **xmsg, char **tbmsg, int *tb_depth) /* The first frame always points at <module>, skip it. */ if (*tb_depth > 0) { + PLyExecutionContext *exec_ctx = PLy_current_execution_context(); char *proname; char *fname; char *line; @@ -270,7 +272,7 @@ PLy_traceback(char **xmsg, char **tbmsg, int *tb_depth) else fname = PyString_AsString(name); - proname = PLy_procedure_name(PLy_curr_procedure); + proname = PLy_procedure_name(exec_ctx->curr_proc); plain_filename = PyString_AsString(filename); plain_lineno = PyInt_AsLong(lineno); @@ -287,7 +289,7 @@ PLy_traceback(char **xmsg, char **tbmsg, int *tb_depth) * function code object was compiled with "<string>" as the * filename */ - if (PLy_curr_procedure && plain_filename != NULL && + if (exec_ctx->curr_proc && plain_filename != NULL && strcmp(plain_filename, "<string>") == 0) { /* @@ -299,7 +301,7 @@ PLy_traceback(char **xmsg, char **tbmsg, int *tb_depth) * for. But we do not go as far as traceback.py in reading * the source of imported modules. */ - line = get_source_line(PLy_curr_procedure->src, plain_lineno); + line = get_source_line(exec_ctx->curr_proc->src, plain_lineno); if (line) { appendStringInfo(&tbstr, "\n %s", line); diff --git a/src/pl/plpython/plpy_exec.c b/src/pl/plpython/plpy_exec.c index ecf4996e8cf31..280d3ed1aca08 100644 --- a/src/pl/plpython/plpy_exec.c +++ b/src/pl/plpython/plpy_exec.c @@ -455,7 +455,9 @@ PLy_function_delete_args(PLyProcedure *proc) static void plpython_return_error_callback(void *arg) { - if (PLy_curr_procedure) + PLyExecutionContext *exec_ctx = PLy_current_execution_context(); + + if (exec_ctx->curr_proc) errcontext("while creating return value"); } @@ -781,7 +783,9 @@ PLy_modify_tuple(PLyProcedure *proc, PyObject *pltd, TriggerData *tdata, static void plpython_trigger_error_callback(void *arg) { - if (PLy_curr_procedure) + PLyExecutionContext *exec_ctx = PLy_current_execution_context(); + + if (exec_ctx->curr_proc) errcontext("while modifying trigger row"); } diff --git a/src/pl/plpython/plpy_main.c b/src/pl/plpython/plpy_main.c index ae9d87e9a6329..277dedc22d2da 100644 --- a/src/pl/plpython/plpy_main.c +++ b/src/pl/plpython/plpy_main.c @@ -12,6 +12,7 @@ #include "executor/spi.h" #include "miscadmin.h" #include "utils/guc.h" +#include "utils/memutils.h" #include "utils/syscache.h" #include "plpython.h" @@ -66,11 +67,17 @@ static void plpython_error_callback(void *arg); static void plpython_inline_error_callback(void *arg); static void PLy_init_interp(void); +static PLyExecutionContext *PLy_push_execution_context(void); +static void PLy_pop_execution_context(void); + static const int plpython_python_version = PY_MAJOR_VERSION; /* initialize global variables */ PyObject *PLy_interp_globals = NULL; +/* this doesn't need to be global; use PLy_current_execution_context() */ +static PLyExecutionContext *PLy_execution_contexts = NULL; + void _PG_init(void) @@ -114,6 +121,8 @@ _PG_init(void) explicit_subtransactions = NIL; + PLy_execution_contexts = NULL; + inited = true; } @@ -179,13 +188,20 @@ Datum plpython_call_handler(PG_FUNCTION_ARGS) { Datum retval; - PLyProcedure *save_curr_proc; + PLyExecutionContext *exec_ctx; ErrorContextCallback plerrcontext; + /* Note: SPI_finish() happens in plpy_exec.c, which is dubious design */ if (SPI_connect() != SPI_OK_CONNECT) elog(ERROR, "SPI_connect failed"); - save_curr_proc = PLy_curr_procedure; + /* + * Push execution context onto stack. It is important that this get + * popped again, so avoid putting anything that could throw error between + * here and the PG_TRY. (plpython_error_callback expects the stack entry + * to be there, so we have to make the context first.)
+ */ + exec_ctx = PLy_push_execution_context(); /* * Setup error traceback support for ereport() @@ -203,20 +219,20 @@ plpython_call_handler(PG_FUNCTION_ARGS) HeapTuple trv; proc = PLy_procedure_get(fcinfo->flinfo->fn_oid, true); - PLy_curr_procedure = proc; + exec_ctx->curr_proc = proc; trv = PLy_exec_trigger(fcinfo, proc); retval = PointerGetDatum(trv); } else { proc = PLy_procedure_get(fcinfo->flinfo->fn_oid, false); - PLy_curr_procedure = proc; + exec_ctx->curr_proc = proc; retval = PLy_exec_function(fcinfo, proc); } } PG_CATCH(); { - PLy_curr_procedure = save_curr_proc; + PLy_pop_execution_context(); PyErr_Clear(); PG_RE_THROW(); } @@ -224,8 +240,8 @@ plpython_call_handler(PG_FUNCTION_ARGS) /* Pop the error context stack */ error_context_stack = plerrcontext.previous; - - PLy_curr_procedure = save_curr_proc; + /* ... and then the execution context */ + PLy_pop_execution_context(); return retval; } @@ -244,22 +260,14 @@ plpython_inline_handler(PG_FUNCTION_ARGS) InlineCodeBlock *codeblock = (InlineCodeBlock *) DatumGetPointer(PG_GETARG_DATUM(0)); FunctionCallInfoData fake_fcinfo; FmgrInfo flinfo; - PLyProcedure *save_curr_proc; PLyProcedure proc; + PLyExecutionContext *exec_ctx; ErrorContextCallback plerrcontext; + /* Note: SPI_finish() happens in plpy_exec.c, which is dubious design */ if (SPI_connect() != SPI_OK_CONNECT) elog(ERROR, "SPI_connect failed"); - save_curr_proc = PLy_curr_procedure; - - /* - * Setup error traceback support for ereport() - */ - plerrcontext.callback = plpython_inline_error_callback; - plerrcontext.previous = error_context_stack; - error_context_stack = &plerrcontext; - MemSet(&fake_fcinfo, 0, sizeof(fake_fcinfo)); MemSet(&flinfo, 0, sizeof(flinfo)); fake_fcinfo.flinfo = &flinfo; @@ -270,27 +278,44 @@ plpython_inline_handler(PG_FUNCTION_ARGS) proc.pyname = PLy_strdup("__plpython_inline_block"); proc.result.out.d.typoid = VOIDOID; + /* + * Push execution context onto stack. It is important that this get + * popped again, so avoid putting anything that could throw error between + * here and the PG_TRY. (plpython_inline_error_callback doesn't currently + * need the stack entry, but for consistency with plpython_call_handler + * we do it in this order.) + */ + exec_ctx = PLy_push_execution_context(); + + /* + * Setup error traceback support for ereport() + */ + plerrcontext.callback = plpython_inline_error_callback; + plerrcontext.previous = error_context_stack; + error_context_stack = &plerrcontext; + PG_TRY(); { PLy_procedure_compile(&proc, codeblock->source_text); - PLy_curr_procedure = &proc; + exec_ctx->curr_proc = &proc; PLy_exec_function(&fake_fcinfo, &proc); } PG_CATCH(); { + PLy_pop_execution_context(); PLy_procedure_delete(&proc); - PLy_curr_procedure = save_curr_proc; PyErr_Clear(); PG_RE_THROW(); } PG_END_TRY(); - PLy_procedure_delete(&proc); - /* Pop the error context stack */ error_context_stack = plerrcontext.previous; + /* ... 
and then the execution context */ + PLy_pop_execution_context(); - PLy_curr_procedure = save_curr_proc; + /* Now clean up the transient procedure we made */ + PLy_procedure_delete(&proc); PG_RETURN_VOID(); } @@ -313,9 +338,11 @@ static bool PLy_procedure_is_trigger(Form_pg_proc procStruct) static void plpython_error_callback(void *arg) { - if (PLy_curr_procedure) + PLyExecutionContext *exec_ctx = PLy_current_execution_context(); + + if (exec_ctx->curr_proc) errcontext("PL/Python function \"%s\"", - PLy_procedure_name(PLy_curr_procedure)); + PLy_procedure_name(exec_ctx->curr_proc)); } static void @@ -323,3 +350,42 @@ plpython_inline_error_callback(void *arg) { errcontext("PL/Python anonymous code block"); } + +PLyExecutionContext * +PLy_current_execution_context(void) +{ + if (PLy_execution_contexts == NULL) + elog(ERROR, "no Python function is currently executing"); + + return PLy_execution_contexts; +} + +static PLyExecutionContext * +PLy_push_execution_context(void) +{ + PLyExecutionContext *context = PLy_malloc(sizeof(PLyExecutionContext)); + + context->curr_proc = NULL; + context->scratch_ctx = AllocSetContextCreate(TopTransactionContext, + "PL/Python scratch context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + context->next = PLy_execution_contexts; + PLy_execution_contexts = context; + return context; +} + +static void +PLy_pop_execution_context(void) +{ + PLyExecutionContext *context = PLy_execution_contexts; + + if (context == NULL) + elog(ERROR, "no Python function is currently executing"); + + PLy_execution_contexts = context->next; + + MemoryContextDelete(context->scratch_ctx); + PLy_free(context); +} diff --git a/src/pl/plpython/plpy_main.h b/src/pl/plpython/plpy_main.h index a71aead176905..cb214bf83c77c 100644 --- a/src/pl/plpython/plpy_main.h +++ b/src/pl/plpython/plpy_main.h @@ -10,4 +10,19 @@ /* the interpreter's globals dict */ extern PyObject *PLy_interp_globals; +/* + * A stack of PL/Python execution contexts. Each time user-defined Python code + * is called, an execution context is created and put on the stack. After the + * Python code returns, the context is destroyed. 
+ */ +typedef struct PLyExecutionContext +{ + PLyProcedure *curr_proc; /* the currently executing procedure */ + MemoryContext scratch_ctx; /* a context for things like type I/O */ + struct PLyExecutionContext *next; /* previous stack level */ +} PLyExecutionContext; + +/* Get the current execution context */ +extern PLyExecutionContext *PLy_current_execution_context(void); + #endif /* PLPY_MAIN_H */ diff --git a/src/pl/plpython/plpy_procedure.c b/src/pl/plpython/plpy_procedure.c index 229966ad795bf..7fb5f00e0f210 100644 --- a/src/pl/plpython/plpy_procedure.c +++ b/src/pl/plpython/plpy_procedure.c @@ -22,9 +22,6 @@ #include "plpy_main.h" -PLyProcedure *PLy_curr_procedure = NULL; - - static HTAB *PLy_procedure_cache = NULL; static HTAB *PLy_trigger_cache = NULL; diff --git a/src/pl/plpython/plpy_procedure.h b/src/pl/plpython/plpy_procedure.h index e986c7ecc56a1..c7405e064ec1b 100644 --- a/src/pl/plpython/plpy_procedure.h +++ b/src/pl/plpython/plpy_procedure.h @@ -45,8 +45,4 @@ extern PLyProcedure *PLy_procedure_get(Oid fn_oid, bool is_trigger); extern void PLy_procedure_compile(PLyProcedure *proc, const char *src); extern void PLy_procedure_delete(PLyProcedure *proc); - -/* currently active plpython function */ -extern PLyProcedure *PLy_curr_procedure; - #endif /* PLPY_PROCEDURE_H */ diff --git a/src/pl/plpython/plpy_spi.c b/src/pl/plpython/plpy_spi.c index 0d63c4f5ce850..a75839b93ea3c 100644 --- a/src/pl/plpython/plpy_spi.c +++ b/src/pl/plpython/plpy_spi.c @@ -18,6 +18,7 @@ #include "plpy_spi.h" #include "plpy_elog.h" +#include "plpy_main.h" #include "plpy_planobject.h" #include "plpy_plpymodule.h" #include "plpy_procedure.h" @@ -236,6 +237,7 @@ PLy_spi_execute_plan(PyObject *ob, PyObject *list, long limit) PG_TRY(); { + PLyExecutionContext *exec_ctx = PLy_current_execution_context(); char *volatile nulls; volatile int j; @@ -281,7 +283,7 @@ PLy_spi_execute_plan(PyObject *ob, PyObject *list, long limit) } rv = SPI_execute_plan(plan->plan, plan->values, nulls, - PLy_curr_procedure->fn_readonly, limit); + exec_ctx->curr_proc->fn_readonly, limit); ret = PLy_spi_execute_fetch_result(SPI_tuptable, SPI_processed, rv); if (nargs > 0) @@ -347,8 +349,10 @@ PLy_spi_execute_query(char *query, long limit) PG_TRY(); { + PLyExecutionContext *exec_ctx = PLy_current_execution_context(); + pg_verifymbstr(query, strlen(query), false); - rv = SPI_execute(query, PLy_curr_procedure->fn_readonly, limit); + rv = SPI_execute(query, exec_ctx->curr_proc->fn_readonly, limit); ret = PLy_spi_execute_fetch_result(SPI_tuptable, SPI_processed, rv); PLy_spi_subtransaction_commit(oldcontext, oldowner); diff --git a/src/pl/plpython/plpy_typeio.c b/src/pl/plpython/plpy_typeio.c index d5cac9f1f0dff..1cc9b1e210e09 100644 --- a/src/pl/plpython/plpy_typeio.c +++ b/src/pl/plpython/plpy_typeio.c @@ -23,6 +23,7 @@ #include "plpy_typeio.h" #include "plpy_elog.h" +#include "plpy_main.h" /* I/O function caching */ @@ -258,10 +259,15 @@ PLy_output_record_funcs(PLyTypeInfo *arg, TupleDesc desc) Assert(arg->is_rowtype == 1); } +/* + * Transform a tuple into a Python dict object. 
+ */ PyObject * PLyDict_FromTuple(PLyTypeInfo *info, HeapTuple tuple, TupleDesc desc) { PyObject *volatile dict; + PLyExecutionContext *exec_ctx = PLy_current_execution_context(); + MemoryContext oldcontext = CurrentMemoryContext; int i; if (info->is_rowtype != 1) @@ -273,6 +279,11 @@ PLyDict_FromTuple(PLyTypeInfo *info, HeapTuple tuple, TupleDesc desc) PG_TRY(); { + /* + * Do the work in the scratch context to avoid leaking memory from + * the datatype output function calls. + */ + MemoryContextSwitchTo(exec_ctx->scratch_ctx); for (i = 0; i < info->in.r.natts; i++) { char *key; @@ -295,9 +306,12 @@ PLyDict_FromTuple(PLyTypeInfo *info, HeapTuple tuple, TupleDesc desc) Py_DECREF(value); } } + MemoryContextSwitchTo(oldcontext); + MemoryContextReset(exec_ctx->scratch_ctx); } PG_CATCH(); { + MemoryContextSwitchTo(oldcontext); Py_DECREF(dict); PG_RE_THROW(); } From a14fa84693659c4c4a17204406945b29fae3d9c4 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 13 Mar 2012 13:28:11 -0400 Subject: [PATCH 128/129] Fix minor memory leak in PLy_typeinfo_dealloc(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We forgot to free the per-attribute array element descriptors. Jan Urbański --- src/pl/plpython/plpy_typeio.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/pl/plpython/plpy_typeio.c b/src/pl/plpython/plpy_typeio.c index 1cc9b1e210e09..d04fe46d4b77e 100644 --- a/src/pl/plpython/plpy_typeio.c +++ b/src/pl/plpython/plpy_typeio.c @@ -74,8 +74,20 @@ PLy_typeinfo_dealloc(PLyTypeInfo *arg) { if (arg->is_rowtype == 1) { + int i; + + for (i = 0; i < arg->in.r.natts; i++) + { + if (arg->in.r.atts[i].elm != NULL) + PLy_free(arg->in.r.atts[i].elm); + } if (arg->in.r.atts) PLy_free(arg->in.r.atts); + for (i = 0; i < arg->out.r.natts; i++) + { + if (arg->out.r.atts[i].elm != NULL) + PLy_free(arg->out.r.atts[i].elm); + } if (arg->out.r.atts) PLy_free(arg->out.r.atts); } From 5cd72c7a7c7bd76ab028e1dc59d90a47750acebe Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 13 Mar 2012 15:26:32 -0400 Subject: [PATCH 129/129] Patch some corner-case bugs in pl/python. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dave Malcolm of Red Hat is working on a static code analysis tool for Python-related C code. It reported a number of problems in plpython, most of which were failures to check for NULL results from object-creation functions, so would only be an issue in very-low-memory situations. Patch in HEAD and 9.1. We could go further back but it's not clear that these issues are important enough to justify the work. 
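The fix pattern is mechanical: in the Python C API, object-creation calls (PyDict_New, PyString_FromString, PyErr_NewException, and so on) return NULL when they fail, usually for lack of memory, so each result must be tested before use. A minimal self-contained sketch of the idiom against the Python 2 C API that plpython used at the time — the function below is illustrative, not part of the patch:

#include <Python.h>

/*
 * Build a one-entry dict, checking each object-creation result for NULL
 * the way this patch does.  On failure we release what we hold and
 * return NULL; plpython's callers would then raise an error via PLy_elog.
 */
static PyObject *
make_state_dict(const char *sqlstate)
{
	PyObject   *dict;
	PyObject   *value;

	dict = PyDict_New();
	if (dict == NULL)
		return NULL;			/* out of memory */

	value = PyString_FromString(sqlstate);
	if (value == NULL)
	{
		Py_DECREF(dict);		/* don't leak the dict on the error path */
		return NULL;
	}

	if (PyDict_SetItemString(dict, "sqlstate", value) < 0)
	{
		Py_DECREF(value);
		Py_DECREF(dict);
		return NULL;
	}
	Py_DECREF(value);			/* the dict now holds its own reference */

	return dict;
}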
Jan Urbański --- src/pl/plpython/plpy_elog.c | 4 ++++ src/pl/plpython/plpy_main.c | 2 ++ src/pl/plpython/plpy_plpymodule.c | 18 ++++++++++++++++-- src/pl/plpython/plpy_spi.c | 3 ++- src/pl/plpython/plpy_typeio.c | 2 ++ 5 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/pl/plpython/plpy_elog.c b/src/pl/plpython/plpy_elog.c index 2f04a8c0dba17..f7d321289d4ac 100644 --- a/src/pl/plpython/plpy_elog.c +++ b/src/pl/plpython/plpy_elog.c @@ -367,6 +367,10 @@ get_source_line(const char *src, int lineno) const char *next = src; int current = 0; + /* sanity check */ + if (lineno <= 0) + return NULL; + while (current < lineno) { s = next; diff --git a/src/pl/plpython/plpy_main.c b/src/pl/plpython/plpy_main.c index 277dedc22d2da..c126db995ae34 100644 --- a/src/pl/plpython/plpy_main.c +++ b/src/pl/plpython/plpy_main.c @@ -142,6 +142,8 @@ PLy_init_interp(void) Py_INCREF(mainmod); PLy_interp_globals = PyModule_GetDict(mainmod); PLy_interp_safe_globals = PyDict_New(); + if (PLy_interp_safe_globals == NULL) + PLy_elog(ERROR, "could not create globals"); PyDict_SetItemString(PLy_interp_globals, "GD", PLy_interp_safe_globals); Py_DECREF(mainmod); if (PLy_interp_globals == NULL || PyErr_Occurred()) diff --git a/src/pl/plpython/plpy_plpymodule.c b/src/pl/plpython/plpy_plpymodule.c index d2d0a2a2323c0..01caa0a3236e5 100644 --- a/src/pl/plpython/plpy_plpymodule.c +++ b/src/pl/plpython/plpy_plpymodule.c @@ -173,9 +173,11 @@ PLy_init_plpy(void) main_mod = PyImport_AddModule("__main__"); main_dict = PyModule_GetDict(main_mod); plpy_mod = PyImport_AddModule("plpy"); + if (plpy_mod == NULL) + PLy_elog(ERROR, "could not initialize plpy"); PyDict_SetItemString(main_dict, "plpy", plpy_mod); if (PyErr_Occurred()) - elog(ERROR, "could not initialize plpy"); + PLy_elog(ERROR, "could not initialize plpy"); } static void @@ -208,6 +210,11 @@ PLy_add_exceptions(PyObject *plpy) PLy_exc_fatal = PyErr_NewException("plpy.Fatal", NULL, NULL); PLy_exc_spi_error = PyErr_NewException("plpy.SPIError", NULL, NULL); + if (PLy_exc_error == NULL || + PLy_exc_fatal == NULL || + PLy_exc_spi_error == NULL) + PLy_elog(ERROR, "could not create the base SPI exceptions"); + Py_INCREF(PLy_exc_error); PyModule_AddObject(plpy, "Error", PLy_exc_error); Py_INCREF(PLy_exc_fatal); @@ -241,7 +248,13 @@ PLy_generate_spi_exceptions(PyObject *mod, PyObject *base) PyObject *sqlstate; PyObject *dict = PyDict_New(); + if (dict == NULL) + PLy_elog(ERROR, "could not generate SPI exceptions"); + sqlstate = PyString_FromString(unpack_sql_state(exception_map[i].sqlstate)); + if (sqlstate == NULL) + PLy_elog(ERROR, "could not generate SPI exceptions"); + PyDict_SetItemString(dict, "sqlstate", sqlstate); Py_DECREF(sqlstate); exc = PyErr_NewException(exception_map[i].name, base, dict); @@ -370,7 +383,8 @@ PLy_output(volatile int level, PyObject *self, PyObject *args) */ PyObject *o; - PyArg_UnpackTuple(args, "plpy.elog", 1, 1, &o); + if (!PyArg_UnpackTuple(args, "plpy.elog", 1, 1, &o)) + PLy_elog(ERROR, "could not unpack arguments in plpy.elog"); so = PyObject_Str(o); } else diff --git a/src/pl/plpython/plpy_spi.c b/src/pl/plpython/plpy_spi.c index a75839b93ea3c..cde3c08f967bd 100644 --- a/src/pl/plpython/plpy_spi.c +++ b/src/pl/plpython/plpy_spi.c @@ -340,7 +340,7 @@ PLy_spi_execute_query(char *query, long limit) int rv; volatile MemoryContext oldcontext; volatile ResourceOwner oldowner; - PyObject *ret; + PyObject *ret = NULL; oldcontext = CurrentMemoryContext; oldowner = CurrentResourceOwner; @@ -366,6 +366,7 @@ PLy_spi_execute_query(char *query, 
long limit) if (rv < 0) { + Py_XDECREF(ret); PLy_exception_set(PLy_exc_spi_error, "SPI_execute failed: %s", SPI_result_code_string(rv)); diff --git a/src/pl/plpython/plpy_typeio.c b/src/pl/plpython/plpy_typeio.c index d04fe46d4b77e..9d6af053761a1 100644 --- a/src/pl/plpython/plpy_typeio.c +++ b/src/pl/plpython/plpy_typeio.c @@ -584,6 +584,8 @@ PLyList_FromArray(PLyDatumToOb *arg, Datum d) length = ARR_DIMS(array)[0]; lbound = ARR_LBOUND(array)[0]; list = PyList_New(length); + if (list == NULL) + PLy_elog(ERROR, "could not create new Python list"); for (i = 0; i < length; i++) {