Merge branch 'PGPROEE9_6_MULTIMASTER' of https://gitlab.postgrespro.ru/pgpro-dev/postgrespro into PGPROEE9_6_MULTIMASTER

knizhnik · knizhnik · commit 0fcd073ab918 · 2017-02-15T18:44:40.000+03:00
diff --git a/contrib/mmts/Cluster.pm b/contrib/mmts/Cluster.pm
@@ -3,7 +3,6 @@ package Cluster;
 use strict;
 use warnings;
 
-use Proc::ProcessTable;
 use PostgresNode;
 use TestLib;
 use Test::More;
@@ -166,45 +165,6 @@ sub stopid
 	return stopnode($self->{nodes}->[$idx]);
 }
 
-sub killtree
-{
-	my $root = shift;
-	diag("killtree $root\n");
-
-	my $t = new Proc::ProcessTable;
-
-	my %parent = ();
-	#my %cmd = ();
-	foreach my $p (@{$t->table}) {
-		$parent{$p->pid} = $p->ppid;
-	#	$cmd{$p->pid} = $p->cmndline;
-	}
-
-	if (!defined $root) {
-		return;
-	}
-	my @queue = ($root);
-	my @killist = ();
-
-	while (scalar @queue) {
-		my $victim = shift @queue;
-		while (my ($pid, $ppid) = each %parent) {
-			if ($ppid == $victim) {
-				push @queue, $pid;
-			}
-		}
-		diag("SIGSTOP to $victim");
-		kill 'STOP', $victim;
-		unshift @killist, $victim;
-	}
-
-	diag("SIGKILL to " . join(' ', @killist));
-	kill 'KILL', @killist;
-	#foreach my $victim (@killist) {
-	#	print("kill $victim " . $cmd{$victim} . "\n");
-	#}
-}
-
 sub stop
 {
 	my ($self, $mode) = @_;
diff --git a/contrib/mmts/Makefile b/contrib/mmts/Makefile
@@ -6,6 +6,8 @@ DATA = multimaster--1.0.sql
 
 .PHONY: all
 
+EXTRA_INSTALL=contrib/mmts
+
 all: multimaster.so
 
 PG_CPPFLAGS = -I$(libpq_srcdir)
@@ -21,3 +23,7 @@ top_builddir = ../..
 include $(top_builddir)/src/Makefile.global
 include $(top_srcdir)/contrib/contrib-global.mk
 endif
+
+check: temp-install
+	$(prove_check)
+
diff --git a/contrib/mmts/arbiter.c b/contrib/mmts/arbiter.c
@@ -336,7 +336,7 @@ static void MtmCheckResponse(MtmArbiterMessage* resp)
 		&& Mtm->status != MTM_RECOVERED
 		&& Mtm->nodes[MtmNodeId-1].lastStatusChangeTime + MSEC_TO_USEC(MtmNodeDisableDelay) < MtmGetSystemTime()) 
 	{ 
-		MTM_ELOG(WARNING, "Node %d thinks that I am dead, while I am %s (message %s)", resp->node, MtmNodeStatusMnem[Mtm->status], MtmMessageKindMnem[resp->code]);
+		MTM_ELOG(WARNING, "Node %d thinks that I'm dead, while I'm %s (message %s)", resp->node, MtmNodeStatusMnem[Mtm->status], MtmMessageKindMnem[resp->code]);
 		BIT_SET(Mtm->disabledNodeMask, MtmNodeId-1);
 		Mtm->nConfigChanges += 1;
 		MtmSwitchClusterMode(MTM_RECOVERY);
@@ -411,8 +411,8 @@ static void MtmSendHeartbeat()
 	
 }
 
-/* This function shoudl be called from all places where sender can be blocked.
- * It checks send_heartbeat flag set by timer and if it is set hthen sends heartbeats to all alive nodes 
+/* This function should be called from all places where the sender can be blocked.
+ * It checks the send_heartbeat flag set by the timer and, if it is set, sends heartbeats to all alive nodes.
  */
 void MtmCheckHeartbeat()
 {
@@ -577,8 +577,8 @@ static bool MtmSendToNode(int node, void const* buf, int size, time_t reconnectT
 	BIT_SET(busy_mask, node);
 	while (true) {
 #if 0
-		/* Original intention was to reestablish connectect when reconnet mask is set to avoid hanged-up connection.
-		 * But reconnectMask is set not only when connection is broken, so breaking connection in all this cases cause avalunch of connection failures.
+		/* The original intention was to reestablish the connection when the reconnect mask is set, to avoid hung connections.
+		 * But reconnectMask is set not only when the connection is broken, so breaking the connection in all these cases causes an avalanche of connection failures.
 		 */
 		if (sockets[node] >= 0 && BIT_CHECK(Mtm->reconnectMask, node)) {
 			MTM_ELOG(WARNING, "Arbiter is forced to reconnect to node %d", node+1); 
@@ -978,7 +978,7 @@ static void MtmReceiver(Datum arg)
 						Assert(*msg->gid);
 						tm = (MtmTransMap*)hash_search(MtmGid2State, msg->gid, HASH_FIND, NULL);
 						if (tm == NULL || tm->state == NULL) { 
-							MTM_ELOG(WARNING, "Response for unexisted transaction %s from node %d", msg->gid, node);
+							MTM_ELOG(WARNING, "Response for non-existing transaction %s from node %d", msg->gid, node);
 						} else {
 							ts = tm->state;
 							BIT_SET(ts->votedMask, node-1);
@@ -1031,7 +1031,7 @@ static void MtmReceiver(Datum arg)
 					}
 					ts = (MtmTransState*)hash_search(MtmXid2State, &msg->dxid, HASH_FIND, NULL);
 					if (ts == NULL) { 
-						MTM_ELOG(WARNING, "Ignore response for unexisted transaction %llu from node %d", (long64)msg->dxid, node);
+						MTM_ELOG(WARNING, "Ignore response for non-existing transaction %llu from node %d", (long64)msg->dxid, node);
 						continue;
 					}
 					Assert(msg->code == MSG_ABORTED || strcmp(msg->gid, ts->gid) == 0);
@@ -1130,7 +1130,7 @@ static void MtmReceiver(Datum arg)
 							} else { 
 								Assert(ts->status == TRANSACTION_STATUS_ABORTED);
 								MTM_ELOG(WARNING, "Receive PRECOMMITTED response for aborted transaction %s (%llu) from node %d", 
-									 ts->gid, (long64)ts->xid, node); // How it can happen? SHould we use assert here?
+									 ts->gid, (long64)ts->xid, node); // How can it happen? Should we use assert here?
 								if ((ts->participantsMask & ~Mtm->disabledNodeMask & ~ts->votedMask) == 0) {
 									MtmWakeUpBackend(ts);
 								}
@@ -1169,7 +1169,7 @@ static void MtmReceiver(Datum arg)
 		}
 		if (Mtm->status == MTM_ONLINE) { 
 			now = MtmGetSystemTime();
-			/* Check for heartbeats only in case of timeout expiration: it means that we do not have unproceeded events.
+			/* Check for heartbeats only in case of timeout expiration: it means that we do not have unprocessed events.
 			 * It helps to avoid false node failure detection because of blocking receiver.
 			 */
 			if (n == 0) {
diff --git a/contrib/mmts/doc/configuration.md b/contrib/mmts/doc/configuration.md
@@ -1,22 +1,29 @@
-# `Configuration parameters`
+# `GUC Variables`
 
-```multimaster.node_id``` Multimaster node ID, unique number identifying this node. Nodes should be numbered by natural numbers starting from 1 without gaps (e.g. 1, 2, 3, ...). node_id is also used as an offset in ```multimaster.conn_strings```, thus i-th node's connection string expected to be on i-th position in ```multimaster.conn_strings```. Mandatory.
+```multimaster.node_id``` Node ID - a unique natural number identifying the node of a multi-master cluster. You must start node numbering from 1 and cannot have any gaps in numbering. For example, for a cluster of five nodes, set node IDs to 1, 2, 3, 4, and 5. 
 
-```multimaster.conn_strings``` Multimaster node connection strings separated by commas, i.e. 'dbname=mydb host=node1, dbname=mydb host=node2, dbname=mydb host=node3'. Order here is important and should be consistent with ```multimaster.node_id```. Multimaster allows to specify custom arbiter_port value for all connection strings. Also this parameter is expected to be identical on all nodes. Mandatory.
+```multimaster.conn_strings``` Connection strings for each node of a multi-master cluster, separated by commas. Each connection string must include the name of the database to replicate and the cluster node domain name. For example, 'dbname=mydb host=node1, dbname=mydb host=node2, dbname=mydb host=node3'. Connection strings must appear in the order of the node IDs specified in the ```multimaster.node_id``` variable. Connection string for the i-th node must be on the i-th position. This parameter must be identical on all nodes. You can specify a custom port for all connection strings using the `multimaster.arbiter_port` variable. 
 
-```multimaster.arbiter_port``` Port for arbiter process to listen on. Default to 5433.
+```multimaster.arbiter_port``` Port for the arbiter process to listen on. 
+Default: 5433
 
-```multimaster.heartbeat_send_timeout``` Period of broadcasting heartbeat messages by arbiter to all nodes. In milliseconds. Default to 1000.
+```multimaster.heartbeat_send_timeout``` Time interval between heartbeat messages, in milliseconds. An arbiter process broadcasts heartbeat messages to all nodes to detect connection problems. Default: 1000.
 
-```multimaster.heartbeat_recv_timeout``` If no heartbeat message is received from node within this period, it assumed to be dead. In milliseconds. Default to 10000.
+```multimaster.heartbeat_recv_timeout``` Timeout, in milliseconds. If no heartbeat message is received from the node within this timeframe, the node is excluded from the cluster. 
+Default: 10000
 
-```multimaster.min_recovery_lag``` Minimal lag of WAL-sender performing recovery after which cluster is locked until recovery is completed. When wal-sender almost catch-up WAL current position we need to stop 'Achilles tortile competition' and temporary stop commit of new transactions until node will be completely repared. In bytes. Default to 100000.
 
-```multimaster.max_recovery_lag``` Maximal lag of replication slot of failed node after which this slot is dropped to avoid transaction log overflow. Dropping slot makes it not possible to recover node using logical replication mechanism, it will be necessary to completely copy content of some alive node using pg_basebackup or similar tool. Zero value of parameter disable slot dropping. In bytes. Default to 100000000.
+```multimaster.min_recovery_lag``` Minimal WAL lag between the current cluster state and the node to be restored, in bytes. When this threshold is reached during node recovery, the cluster is locked for write transactions until the recovery is complete. 
+Default: 100000
+
+```multimaster.max_recovery_lag``` Maximal WAL lag size, in bytes. When a node is disconnected from the cluster, other nodes copy WALs for all new transactions into the replication slot of this node. Upon reaching the `multimaster.max_recovery_lag` value, the replication slot for the disconnected node is deleted to avoid overflow. At this point, automatic recovery of the node is no longer possible. In this case, you can restore the node manually by cloning the data from one of the alive nodes using `pg_basebackup` or a similar tool. If you set this variable to zero, the replication slot will not be deleted. 
+Default: 10000000
+
+```multimaster.ignore_tables_without_pk``` Boolean. This variable enables/disables replication of tables without primary keys. By default, replication of tables without primary keys is disabled because of the logical replication restrictions. To enable replication, you can set this variable to false. However, take into account that `multimaster` does not allow update operations on such tables. Default: true
+
+```multimaster.cluster_name``` Name of the cluster. If you set this variable, `multimaster` checks that the cluster name is the same for all the cluster nodes.
 
-```multimaster.ignore_tables_without_pk``` Do not replicate tables withpout primary key. Boolean.
 
-```multimaster.cluster_name``` Name of the cluster, desn't affect anything. Just in case. If set that mmts will check name correspondence.
 
 ## Questionable
 
@@ -38,8 +45,6 @@
 
 ```multimaster.gc_period``` Number of distributed transactions after which garbage collection is started. Multimaster is building xid->csn hash map which has to be cleaned to avoid hash overflow. This parameter specifies interval of invoking garbage collector for this map. default = MTM_HASH_SIZE/10
 
-```multimaster.max_node``` Maximal number of cluster nodes. This parameters allows to add new nodes to the cluster, default value 0 restricts number of nodes to one specified in multimaster.conn_strings (May be just set that to 64 and allow user to add node when trey need without restart?) default = 0
-
 ```multimaster.node_disable_delay``` Minimal amount of time (msec) between node status change. This delay is used to avoid false detection of node failure and to prevent blinking of node status node. default = 2000. (We can just increase heartbeat_recv_timeout)
 
 ```multimaster.connect_timeout``` Multimaster nodes connect timeout. Interval in milliseconds for establishing connection with cluster node. default = 10000, /* 10 seconds */
diff --git a/contrib/mmts/multimaster.c b/contrib/mmts/multimaster.c
@@ -456,13 +456,9 @@ csn_t MtmDistributedTransactionSnapshot(TransactionId xid, int nodeId, nodemask_
 
 Snapshot MtmGetSnapshot(Snapshot snapshot)
 {
-    snapshot = PgGetSnapshotData(snapshot);
-#if 0
-	if (snapshot != &CatalogSnapshotData) {
-		RecentGlobalDataXmin = RecentGlobalXmin = Mtm->oldestXid;
-	}
-#endif
-    return snapshot;
+	snapshot = PgGetSnapshotData(snapshot);
+	RecentGlobalDataXmin = RecentGlobalXmin = Mtm->oldestXid;
+	return snapshot;
 }
 
 

Original file line number	Diff line number	Diff line change
`@@ -456,13 +456,9 @@ csn_t MtmDistributedTransactionSnapshot(TransactionId xid, int nodeId, nodemask_`
`456`	`456`
`457`	`457`	`Snapshot MtmGetSnapshot(Snapshot snapshot)`
`458`	`458`	`{`
`459`		`- snapshot = PgGetSnapshotData(snapshot);`
`460`		`-#if 0`
`461`		`- if (snapshot != &CatalogSnapshotData) {`
`462`		`- RecentGlobalDataXmin = RecentGlobalXmin = Mtm->oldestXid;`
`463`		`- }`
`464`		`-#endif`
`465`		`- return snapshot;`
	`459`	`+ snapshot = PgGetSnapshotData(snapshot);`
	`460`	`+ RecentGlobalDataXmin = RecentGlobalXmin = Mtm->oldestXid;`
	`461`	`+ return snapshot;`
`466`	`462`	`}`
`467`	`463`
`468`	`464`