make sure PartitionFilter is enabled while performing [concurrent] partitioning, fixes for ConcurrentPartWorker: pythonish tests pass

funbringer · funbringer · commit fd4405def3fe · 2016-08-29T05:45:46.000+03:00
diff --git a/init.sql b/init.sql
@@ -247,7 +247,8 @@ BEGIN
 	RETURN;
 END
 $$
-LANGUAGE plpgsql;
+LANGUAGE plpgsql
+SET pg_pathman.enable_partitionfilter = on; /* ensures that PartitionFilter is ON */
 
 /*
  * Old school way to distribute rows to partitions.
@@ -275,7 +276,8 @@ BEGIN
 	RETURN;
 END
 $$
-LANGUAGE plpgsql;
+LANGUAGE plpgsql
+SET pg_pathman.enable_partitionfilter = on; /* ensures that PartitionFilter is ON */
 
 /*
  * Disable pathman partitioning for specified relation.
@@ -541,7 +543,7 @@ BEGIN
 	RETURN v_part_count;
 END
 $$ LANGUAGE plpgsql
-SET pg_pathman.enable_partitionfilter = off;
+SET pg_pathman.enable_partitionfilter = off; /* ensures that PartitionFilter is OFF */
 
 
 
diff --git a/src/pathman_workers.c b/src/pathman_workers.c
@@ -421,9 +421,11 @@ bgw_main_concurrent_part(Datum main_arg)
 	/* Do the job */
 	do
 	{
-		Oid		types[2]	= { OIDOID,				INT4OID };
-		Datum	vals[2]		= { part_slot->relid,	part_slot->batch_size };
-		bool	nulls[2]	= { false,				false };
+		MemoryContext	old_mcxt;
+
+		Oid				types[2]	= { OIDOID,				INT4OID };
+		Datum			vals[2]		= { part_slot->relid,	part_slot->batch_size };
+		bool			nulls[2]	= { false,				false };
 
 		/* Reset loop variables */
 		failed = false;
@@ -432,22 +434,25 @@ bgw_main_concurrent_part(Datum main_arg)
 		/* Start new transaction (syscache access etc.) */
 		StartTransactionCommand();
 
+		/* We'll need this to recover from errors */
+		old_mcxt = CurrentMemoryContext;
+
 		SPI_connect();
 		PushActiveSnapshot(GetTransactionSnapshot());
 
 		/* Prepare the query if needed */
 		if (sql == NULL)
 		{
-			MemoryContext oldcontext;
+			MemoryContext	current_mcxt;
 
 			/*
 			 * Allocate as SQL query in top memory context because current
 			 * context will be destroyed after transaction finishes
 			 */
-			oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+			current_mcxt = MemoryContextSwitchTo(TopMemoryContext);
 			sql = psprintf("SELECT %s._partition_data_concurrent($1::oid, p_limit:=$2)",
 						   get_namespace_name(get_pathman_schema()));
-			MemoryContextSwitchTo(oldcontext);
+			MemoryContextSwitchTo(current_mcxt);
 		}
 
 		/* Exec ret = _partition_data_concurrent() */
@@ -471,21 +476,33 @@ bgw_main_concurrent_part(Datum main_arg)
 		}
 		PG_CATCH();
 		{
-			ErrorData *error;
-
-			EmitErrorReport();
+			ErrorData  *error;
+			char	   *sleep_time_str;
 
+			/* Switch to the original context & copy edata */
+			MemoryContextSwitchTo(old_mcxt);
 			error = CopyErrorData();
-			elog(LOG, "%s: %s", concurrent_part_bgw, error->message);
 			FlushErrorState();
+
+			/* Print message for this BGWorker to server log */
+			sleep_time_str = datum_to_cstring(Float8GetDatum(part_slot->sleep_time),
+											  FLOAT8OID);
+			ereport(LOG,
+					(errmsg("%s: %s", concurrent_part_bgw, error->message),
+					 errdetail("Attempt: %d/%d, sleep time: %s",
+							   failures_count + 1,
+							   PART_WORKER_MAX_ATTEMPTS,
+							   sleep_time_str)));
+			pfree(sleep_time_str); /* free the time string */
+
 			FreeErrorData(error);
 
 			/*
 			 * The most common exception we can catch here is a deadlock with
 			 * concurrent user queries. Check that attempts count doesn't exceed
 			 * some reasonable value
 			 */
-			if (failures_count++ > PART_WORKER_MAX_ATTEMPTS)
+			if (failures_count++ >= PART_WORKER_MAX_ATTEMPTS)
 			{
 				/* Mark slot as FREE */
 				part_slot->worker_status = WS_FREE;
@@ -510,8 +527,11 @@ bgw_main_concurrent_part(Datum main_arg)
 		if (failed)
 		{
 #ifdef USE_ASSERT_CHECKING
-			elog(DEBUG2, "%s: could not relocate batch, total: %lu [%u]",
-				 concurrent_part_bgw, part_slot->total_rows, MyProcPid);
+			elog(DEBUG1, "%s: could not relocate batch (%d/%d), total: %lu [%u]",
+				 concurrent_part_bgw,
+				 failures_count, PART_WORKER_MAX_ATTEMPTS, /* current/max */
+				 part_slot->total_rows,
+				 MyProcPid);
 #endif
 
 			/* Abort transaction and sleep for a second */
@@ -528,7 +548,7 @@ bgw_main_concurrent_part(Datum main_arg)
 			part_slot->total_rows += rows;
 
 #ifdef USE_ASSERT_CHECKING
-			elog(DEBUG2, "%s: relocated %d rows, total: %lu [%u]",
+			elog(DEBUG1, "%s: relocated %d rows, total: %lu [%u]",
 				 concurrent_part_bgw, rows, part_slot->total_rows, MyProcPid);
 #endif
 		}
diff --git a/src/pathman_workers.h b/src/pathman_workers.h
@@ -82,7 +82,7 @@ typedef struct
 #define PART_WORKER_SLOTS			10
 
 /* Max number of attempts per batch */
-#define PART_WORKER_MAX_ATTEMPTS	100
+#define PART_WORKER_MAX_ATTEMPTS	60
 
 
 /*
diff --git a/tests/partitioning_test.py b/tests/partitioning_test.py
@@ -53,8 +53,10 @@ def test_concurrent(self):
 		while True:
 			# update some rows to check for deadlocks
 			node.safe_psql('postgres', 
-				'''update abc set t = 'test'
-				where id in (select (random() * 300000)::int from generate_series(1, 3000))''')
+				'''
+					update abc set t = 'test'
+					where id in (select (random() * 300000)::int from generate_series(1, 3000))
+				''')
 
 			count = node.execute('postgres', 'select count(*) from pathman_concurrent_part_tasks')