postgrespro
diff --git a/‎contrib/mmts/README.md
Lines changed: 3 additions & 2 deletions b/‎contrib/mmts/README.md
Lines changed: 3 additions & 2 deletions
diff --git a/‎contrib/mmts/arbiter.c
Lines changed: 20 additions & 20 deletions b/‎contrib/mmts/arbiter.c
Lines changed: 20 additions & 20 deletions
diff --git a/‎contrib/mmts/bgwpool.h
Lines changed: 3 additions & 1 deletion b/‎contrib/mmts/bgwpool.h
Lines changed: 3 additions & 1 deletion
diff --git a/‎contrib/mmts/bkb.h
Lines changed: 5 additions & 1 deletion b/‎contrib/mmts/bkb.h
Lines changed: 5 additions & 1 deletion
diff --git a/‎contrib/mmts/bkbtest.c
Lines changed: 1 addition & 1 deletion b/‎contrib/mmts/bkbtest.c
Lines changed: 1 addition & 1 deletion
diff --git a/‎contrib/mmts/doc/architecture.md
Lines changed: 46 additions & 0 deletions b/‎contrib/mmts/doc/architecture.md
Lines changed: 46 additions & 0 deletions
diff --git a/‎contrib/mmts/doc/mmts_commit.svg
Lines changed: 2 additions & 0 deletions b/‎contrib/mmts/doc/mmts_commit.svg
Lines changed: 2 additions & 0 deletions
diff --git a/‎contrib/mmts/multimaster--1.0.sql
Lines changed: 9 additions & 9 deletions b/‎contrib/mmts/multimaster--1.0.sql
Lines changed: 9 additions & 9 deletions
@@ -65,7 +65,8 @@ cd ../../contrib/mmts && make install
 
 ### Docker
 
-Directory contrib/mmts also includes docker-compose.yml that is capable of building multi-master and starting 3 node cluster.
+Directory contrib/mmts also includes docker-compose.yml that is capable of building multi-master and starting
+a 3-node cluster.
 
 ```sh
 cd contrib/mmts
@@ -136,7 +137,7 @@ Read description of all management functions at [functions](/contrib/mmts/doc/fu
 
 * Commit latency.
 Current implementation of logical replication sends data to subscriber nodes only after local commit, so in case of
-heavy-write transaction user will wait for transaction processing two times: on local node and al other nodes
+heavy-write transaction user will wait for transaction processing two times: on local node and on all other nodes
 (simultaneously). We have plans to address this issue in the future.
 
 * DDL replication.
 
@@ -360,7 +360,7 @@ static void MtmSendHeartbeat()
 	msg.node = MtmNodeId;
 	msg.csn = now;
 	if (last_sent_heartbeat != 0 && last_sent_heartbeat + MSEC_TO_USEC(MtmHeartbeatSendTimeout)*2 < now) { 
-		MTM_LOG1("More than %ld microseconds since last heartbeat", now - last_sent_heartbeat);
+		MTM_LOG1("More than %lld microseconds since last heartbeat", now - last_sent_heartbeat);
 	}
 	last_sent_heartbeat = now;
 
@@ -377,7 +377,7 @@ static void MtmSendHeartbeat()
 					elog(LOG, "Arbiter failed to send heartbeat to node %d", i+1);
 				} else {
 					if (last_heartbeat_to_node[i] + MSEC_TO_USEC(MtmHeartbeatSendTimeout)*2 < now) { 
-						MTM_LOG1("Last heartbeat to node %d was sent %ld microseconds ago", i+1, now - last_heartbeat_to_node[i]);
+						MTM_LOG1("Last heartbeat to node %d was sent %lld microseconds ago", i+1, now - last_heartbeat_to_node[i]);
 					}
 					last_heartbeat_to_node[i] = now;
 					/* Connectivity mask can be cleared by MtmWatchdog: in this case sockets[i] >= 0 */
@@ -388,7 +388,7 @@ static void MtmSendHeartbeat()
 						MtmReconnectNode(i+1); /* set reconnect mask to force node reconnent */
 						//MtmOnNodeConnect(i+1);
 					}
-					MTM_LOG4("Send heartbeat to node %d with timestamp %ld", i+1, now);    
+					MTM_LOG4("Send heartbeat to node %d with timestamp %lld", i+1, now);    
 				}
 			} else { 
 				MTM_LOG2("Do not send heartbeat to node %d, busy mask %lld, status %s", i+1, (long long) busy_mask, MtmNodeStatusMnem[Mtm->status]);
@@ -940,7 +940,7 @@ static void MtmReceiver(Datum arg)
 
 					switch (msg->code) {
 					  case MSG_HEARTBEAT:
-						MTM_LOG4("Receive HEARTBEAT from node %d with timestamp %ld delay %ld", 
+						MTM_LOG4("Receive HEARTBEAT from node %d with timestamp %lld delay %lld", 
 								 node, msg->csn, USEC_TO_MSEC(MtmGetSystemTime() - msg->csn)); 
 						continue;
 					  case MSG_POLL_REQUEST:
@@ -1017,13 +1017,13 @@ static void MtmReceiver(Datum arg)
 					}
 					ts = (MtmTransState*)hash_search(MtmXid2State, &msg->dxid, HASH_FIND, NULL);
 					if (ts == NULL) { 
-						elog(WARNING, "Ignore response for unexisted transaction %d from node %d", msg->dxid, node);
+						elog(WARNING, "Ignore response for unexisted transaction %llu from node %d", (long64)msg->dxid, node);
 						continue;
 					}
 					Assert(msg->code == MSG_ABORTED || strcmp(msg->gid, ts->gid) == 0);
 					if (BIT_CHECK(ts->votedMask, node-1)) {
-						elog(WARNING, "Receive deteriorated %s response for transaction %d (%s) from node %d",
-							 MtmMessageKindMnem[msg->code], ts->xid, ts->gid, node);
+						elog(WARNING, "Receive deteriorated %s response for transaction %s (%llu) from node %d",
+							 MtmMessageKindMnem[msg->code], ts->gid, (long64)ts->xid, node);
 						continue;
 					}
 					BIT_SET(ts->votedMask, node-1);
@@ -1033,8 +1033,8 @@ static void MtmReceiver(Datum arg)
 						  case MSG_PREPARED:
 							MTM_TXTRACE(ts, "MtmTransReceiver got MSG_PREPARED");
 							if (ts->status == TRANSACTION_STATUS_COMMITTED) { 
-								elog(WARNING, "Receive PREPARED response for already committed transaction %d from node %d",
-									 ts->xid, node);
+								elog(WARNING, "Receive PREPARED response for already committed transaction %llu from node %d",
+									 (long64)ts->xid, node);
 								continue;
 							}
 							Mtm->nodes[node-1].transDelay += MtmGetCurrentTime() - ts->csn;
@@ -1043,8 +1043,8 @@ static void MtmReceiver(Datum arg)
 							if ((~msg->disabledNodeMask & Mtm->disabledNodeMask) != 0) { 
 								/* Coordinator's disabled mask is wider than of this node: so reject such transaction to avoid 
 								   commit on smaller subset of nodes */
-								elog(WARNING, "Coordinator of distributed transaction %s (%d) see less nodes than node %d: %llx instead of %llx",
-									 ts->gid, ts->xid, node, (long long) Mtm->disabledNodeMask, (long long) msg->disabledNodeMask);
+								elog(WARNING, "Coordinator of distributed transaction %s (%llu) see less nodes than node %d: %llx instead of %llx",
+									 ts->gid, (long64)ts->xid, node, Mtm->disabledNodeMask, msg->disabledNodeMask);
 								MtmAbortTransaction(ts);
 							}
 							if ((ts->participantsMask & ~Mtm->disabledNodeMask & ~ts->votedMask) == 0) {
@@ -1053,7 +1053,7 @@ static void MtmReceiver(Datum arg)
 									MtmWakeUpBackend(ts);								
 								} else { 
 									Assert(ts->status == TRANSACTION_STATUS_IN_PROGRESS);
-									MTM_LOG2("Transaction %s is prepared (status=%s participants=%lx disabled=%lx, voted=%lx)", 
+									MTM_LOG2("Transaction %s is prepared (status=%s participants=%llx disabled=%llx, voted=%llx)", 
 											 ts->gid, MtmTxnStatusMnem[ts->status], ts->participantsMask, Mtm->disabledNodeMask, ts->votedMask);
 									ts->isPrepared = true;
 									if (ts->isTwoPhase) { 
@@ -1079,12 +1079,12 @@ static void MtmReceiver(Datum arg)
 							break;						   
 						  case MSG_ABORTED:
 							if (ts->status == TRANSACTION_STATUS_COMMITTED) { 
-								elog(WARNING, "Receive ABORTED response for already committed transaction %d (%s) from node %d",
-									 ts->xid, ts->gid, node);
+								elog(WARNING, "Receive ABORTED response for already committed transaction %s (%llu) from node %d",
+									 ts->gid, (long64)ts->xid, node);
 								continue;
 							}
 							if (ts->status != TRANSACTION_STATUS_ABORTED) { 
-								MTM_LOG1("Arbiter receive abort message for transaction %s (%d)", ts->gid, ts->xid);
+								MTM_LOG1("Arbiter receive abort message for transaction %s (%llu)", ts->gid, (long64)ts->xid);
 								Assert(ts->status == TRANSACTION_STATUS_IN_PROGRESS);
 								MtmAbortTransaction(ts);
 							}
@@ -1095,8 +1095,8 @@ static void MtmReceiver(Datum arg)
 						  case MSG_PRECOMMITTED:
 							MTM_TXTRACE(ts, "MtmTransReceiver got MSG_PRECOMMITTED");
                             if (ts->status == TRANSACTION_STATUS_COMMITTED) {
-                                elog(WARNING, "Receive PRECOMMITTED response for already committed transaction %d (%s) from node %d",
-                                     ts->xid, ts->gid, node);
+                                elog(WARNING, "Receive PRECOMMITTED response for already committed transaction %s (%llu) from node %d",
+                                     ts->gid, (long64)ts->xid, node);
                                 continue;
                             }
 							if (ts->status == TRANSACTION_STATUS_IN_PROGRESS) {
@@ -1111,8 +1111,8 @@ static void MtmReceiver(Datum arg)
 								}
 							} else { 
 								Assert(ts->status == TRANSACTION_STATUS_ABORTED);
-								elog(WARNING, "Receive PRECOMMITTED response for aborted transaction %d (%s) from node %d", 
-									 ts->xid, ts->gid, node); // How it can happen? SHould we use assert here?
+								elog(WARNING, "Receive PRECOMMITTED response for aborted transaction %s (%llu) from node %d", 
+									 ts->gid, (long64)ts->xid, node); // How it can happen? SHould we use assert here?
 								if ((ts->participantsMask & ~Mtm->disabledNodeMask & ~ts->votedMask) == 0) {
 									MtmWakeUpBackend(ts);
 								}
@@ -1160,7 +1160,7 @@ static void MtmReceiver(Datum arg)
 					if (!MtmWatchdog(now)) { 
 						for (i = 0; i < nNodes; i++) { 
 							if (Mtm->nodes[i].lastHeartbeat != 0 && sockets[i] >= 0) {
-								MTM_LOG1("Last heartbeat from node %d received %ld microseconds ago", i+1, now - Mtm->nodes[i].lastHeartbeat);
+								MTM_LOG1("Last heartbeat from node %d received %lld microseconds ago", i+1, now - Mtm->nodes[i].lastHeartbeat);
 							}
 						}
 					}
 
@@ -4,10 +4,12 @@
 #include "storage/s_lock.h"
 #include "storage/spin.h"
 #include "storage/pg_sema.h"
+#include "bkb.h"
 
 typedef void(*BgwPoolExecutor)(void* work, size_t size);
 
-typedef uint64 timestamp_t;
+typedef ulong64 timestamp_t;
+
 
 #define MAX_DBNAME_LEN 30
 #define MAX_DBUSER_LEN 30
 
@@ -6,7 +6,11 @@
 
 #define MAX_NODES 64
 
-typedef uint64_t nodemask_t;
+typedef long long long64; /* we are not using int64 here because we want to use %lld format for this type */
+typedef unsigned long long ulong64; /* we are not using uint64 here because we want to use %llu/%llx formats for this type */
+
+typedef ulong64 nodemask_t;
+
 #define BIT_CHECK(mask, bit) (((mask) & ((nodemask_t)1 << (bit))) != 0)
 #define BIT_CLEAR(mask, bit) (mask &= ~((nodemask_t)1 << (bit)))
 #define BIT_SET(mask, bit)   (mask |= ((nodemask_t)1 << (bit)))
 
@@ -11,7 +11,7 @@ int main() {
 	matrix[2] = 1;
 	matrix[4] = 3;
 	clique = MtmFindMaxClique(matrix, 64, &clique_size);
-	printf("Clique=%lx\n", clique);
+	printf("Clique=%llx\n", clique);
 	return 0;
 }
 
@@ -0,0 +1,46 @@
+# `Multi-master architecture`
+
+## Intro
+
+Multi-master consists of two major subsystems: synchronous logical replication and an arbiter process that is
+responsible for health checks and cluster recovery automation.
+
+## Replication
+
+When postgres loads the multi-master shared library, it sets up a [[logical replication|logrep doc link]] producer and consumer for each node in the cluster and hooks into the transaction commit pipeline. Since each server can accept writes, any server can abort a transaction due to a concurrent update - in the same way as it happens on a single server between different backends. The usual way of dealing with such situations is to perform the transaction in two steps: first try to ensure that commit is possible (PREPARE stage), and if all nodes acknowledge that, then finally commit. Postgres supports such a [[two-phase commit|https://www.postgresql.org/docs/9.6/static/sql-prepare-transaction.html]] procedure. So multi-master captures each commit statement and implicitly transforms it into a PREPARE, waits until the cohort (all nodes except ours) receives that transaction via the replication protocol, and only after successful responses from the cohort finally commits it.
+
+However, ordinary two-phase commit (2PC) is insufficient to withstand node crashes and network failures. When a failure happens between PREPARE and COMMIT, the surviving nodes may not have enough information to decide what to do with the prepared transaction -- the crashed node may have already committed or aborted that transaction without notifying the other nodes, and such a transaction will block resources (hold locks) until the crashed node recovers. Alternatively, if we decide to commit/abort the transaction without knowing the failed node's decision, we can end up with data inconsistencies in the database once the failed node is recovered (e.g. the failed node committed the transaction but a surviving node aborted it).
+
+To be able to deal with crashes, the E3PC commit protocol is used [1][2]. The main idea of 3PC-like protocols is to record the intention to commit a transaction before the actual commit, introducing a new message (PRECOMMIT) into the protocol between the PREPARE and COMMIT messages. That message is not used during normal work, but in case of failure all nodes have enough information to decide what to do with the transaction using a quorum-based voting procedure. For voting to complete, the protocol requires a majority of nodes to be present, hence the rule that a cluster of 2N+1 nodes can tolerate N simultaneous failures.
+
+This process is summarized in the following diagram:
+
+![](https://cdn.rawgit.com/postgrespro/postgres_cluster/fac1e9fa/contrib/mmts/doc/mmts_commit.svg)
+
+Here a user connected to a backend (BE) decides to commit a transaction. The multi-master extension hooks that commit and changes it to a PREPARE statement. During transaction execution the walsender process (WS) has already started to decode the transaction into the "reorder buffer", and by the time the PREPARE statement happens WS starts sending the transaction to all neighbouring nodes (cohort). The cohort nodes then apply that transaction in the walreceiver process (WR) and, after success, signal the arbiter process (Arb on the diagram, a custom background worker implemented in multimaster) to send a vote for the transaction (prepared) to the initiating node.
+The arbiter process on the initiating node waits until all nodes of the cohort have sent their vote for the transaction; after that it sends "precommit" messages and waits until all nodes respond with a "precommitted" message.
+When all participating sites have answered with a "precommitted" message, the arbiter signals the backend to stop waiting and commit the prepared transaction.
+After that the commit WAL record reaches the cohort nodes via walsender/walreceiver connections.
+
+[1] Idit Keidar, Danny Dolev. Increasing the Resilience of Distributed and Replicated Database Systems. http://dx.doi.org/10.1006/jcss.1998.1566
+
+[2] Tim Kempster, Colin Stirling, Peter Thanisch. A more committed quorum-based three phase commit protocol. http://dx.doi.org/10.1007/BFb0056487
+
+
+<!--
+
+## DDL replication
+
+Multi-master replicates such statements on statement-based level wrapping them as part of two-phase transaction.
+
+## Sequences
+
+-->
+
+## Failure detection and recovery
+
+While multi-master allows writes to each node, it waits for transaction acknowledgements from all other nodes, so without special handling a failure of any node would make each commit wait until the failed node recovers. To deal with such situations multi-master periodically sends heartbeats to check health and connectivity between nodes. When several heartbeats to a node are lost in a row (see configuration parameters ```multimaster.heartbeat_recv_timeout``` and ```multimaster.heartbeat_send_timeout```) that node can be kicked out of the cluster to allow writes to the remaining alive nodes.
+
+For alive nodes there is no way to distinguish between a failed node that stopped serving requests and a network-partitioned node that isn't reachable by other nodes but can still be reached by database users. So, to protect from split-brain situations (conflicting writes to nodes in different network partitions), in case of failure multi-master allows writes only to nodes that see a majority of the other nodes. For example, when a 5-node multi-master cluster experiences a failure that splits the network into two isolated subnets with 2 and 3 cluster nodes, multi-master, based on heartbeat propagation info, will continue to accept writes at each node in the bigger partition and deny all writes in the smaller one. Generally speaking, a cluster consisting of 2N+1 nodes can tolerate N node failures and will stay alive as long as any N+1 nodes are alive and connected to each other. In case of a partial network split, when different nodes have different connectivity (for example, in a 3-node cluster when node B can't access node C, but node A can access both B and C), multi-master will find a fully-connected subset of nodes and switch off the other nodes. Each node maintains a data structure that keeps the status of all nodes from this node's point of view; it is accessible through the ```mtm.get_nodes_state()``` system view.
+
+When a failed node connects back to the cluster, the recovery process is started. The recovering node selects one of the cluster nodes from which to apply the changes that were made while the node was offline. That process continues until the recovering node catches up to within ```multimaster.min_recovery_lag``` of WAL lag (default: 100kB). After that the whole cluster is locked for writes to allow the recovery process to finish. After recovery is done, the returned node is promoted to online status and included back into the replication scheme as it was before the failure. Such automatic recovery is only possible when the failed node's WAL lag behind the working nodes is not more than ```multimaster.max_recovery_lag```. When the failed node's lag is bigger than ```multimaster.max_recovery_lag```, the node should be manually recovered using pg_basebackup from one of the working nodes.
@@ -9,15 +9,15 @@ CREATE FUNCTION mtm.stop_replication() RETURNS void
 AS 'MODULE_PATHNAME','mtm_stop_replication'
 LANGUAGE C;
 
-CREATE FUNCTION mtm.drop_node(node integer, drop_slot bool default false) RETURNS void
-AS 'MODULE_PATHNAME','mtm_drop_node'
+CREATE FUNCTION mtm.stop_node(node integer, drop_slot bool default false) RETURNS void
+AS 'MODULE_PATHNAME','mtm_stop_node'
 LANGUAGE C;
 
 CREATE FUNCTION mtm.add_node(conn_str text) RETURNS void
 AS 'MODULE_PATHNAME','mtm_add_node'
 LANGUAGE C;
 
--- Create replication slot for the node which was previously dropped together with it's slot 
+-- Create replication slot for the node which was previously stopped 
 CREATE FUNCTION mtm.recover_node(node integer) RETURNS void
 AS 'MODULE_PATHNAME','mtm_recover_node'
 LANGUAGE C;
@@ -27,7 +27,7 @@ CREATE FUNCTION mtm.get_snapshot() RETURNS bigint
 AS 'MODULE_PATHNAME','mtm_get_snapshot'
 LANGUAGE C;
 
-CREATE FUNCTION mtm.get_csn(xid integer) RETURNS bigint
+CREATE FUNCTION mtm.get_csn(xid bigint) RETURNS bigint
 AS 'MODULE_PATHNAME','mtm_get_csn'
 LANGUAGE C;
 
@@ -36,22 +36,22 @@ AS 'MODULE_PATHNAME','mtm_get_last_csn'
 LANGUAGE C;
 
 
-CREATE TYPE mtm.node_state AS ("id" integer, "disabled" bool, "disconnected" bool, "catchUp" bool, "slotLag" bigint, "avgTransDelay" bigint, "lastStatusChange" timestamp, "oldestSnapshot" bigint, "SenderPid" integer, "SenderStartTime" timestamp, "ReceiverPid" integer, "ReceiverStartTime" timestamp, "connStr" text, "connectivityMask" bigint);
+CREATE TYPE mtm.node_state AS ("id" integer, "disabled" bool, "disconnected" bool, "catchUp" bool, "slotLag" bigint, "avgTransDelay" bigint, "lastStatusChange" timestamp, "oldestSnapshot" bigint, "SenderPid" integer, "SenderStartTime" timestamp, "ReceiverPid" integer, "ReceiverStartTime" timestamp, "connStr" text, "connectivityMask" bigint, "stalled" bool, "stopped" bool);
 
 CREATE FUNCTION mtm.get_nodes_state() RETURNS SETOF mtm.node_state
 AS 'MODULE_PATHNAME','mtm_get_nodes_state'
 LANGUAGE C;
 
 CREATE TYPE mtm.cluster_state AS ("status" text, "disabledNodeMask" bigint, "disconnectedNodeMask" bigint, "catchUpNodeMask" bigint, "liveNodes" integer, "allNodes" integer, "nActiveQueries" integer, "nPendingQueries" integer, "queueSize" bigint, "transCount" bigint, "timeShift" bigint, "recoverySlot" integer,
-"xidHashSize" bigint, "gidHashSize" bigint, "oldestXid" integer, "configChanges" integer);
+"xidHashSize" bigint, "gidHashSize" bigint, "oldestXid" bigint, "configChanges" integer, "stalledNodeMask" bigint, "stoppedNodeMask" bigint);
 
-CREATE TYPE mtm.trans_state AS ("status" text, "gid" text, "xid" integer, "coordinator" integer, "gxid" integer, "csn" timestamp, "snapshot" timestamp, "local" boolean, "prepared" boolean, "active" boolean, "twophase" boolean, "votingCompleted" boolean, "participants" bigint, "voted" bigint, "configChanges" integer);
+CREATE TYPE mtm.trans_state AS ("status" text, "gid" text, "xid" bigint, "coordinator" integer, "gxid" bigint, "csn" timestamp, "snapshot" timestamp, "local" boolean, "prepared" boolean, "active" boolean, "twophase" boolean, "votingCompleted" boolean, "participants" bigint, "voted" bigint, "configChanges" integer);
 
 CREATE FUNCTION mtm.get_trans_by_gid(git text) RETURNS mtm.trans_state
 AS 'MODULE_PATHNAME','mtm_get_trans_by_gid'
 LANGUAGE C;
 
-CREATE FUNCTION mtm.get_trans_by_xid(xid integer) RETURNS mtm.trans_state
+CREATE FUNCTION mtm.get_trans_by_xid(xid bigint) RETURNS mtm.trans_state
 AS 'MODULE_PATHNAME','mtm_get_trans_by_xid'
 LANGUAGE C;
 
@@ -79,7 +79,7 @@ CREATE FUNCTION mtm.inject_2pc_error(stage integer) RETURNS void
 AS 'MODULE_PATHNAME','mtm_inject_2pc_error'
 LANGUAGE C;
 
-CREATE FUNCTION mtm.check_deadlock(xid integer) RETURNS boolean
+CREATE FUNCTION mtm.check_deadlock(xid bigint) RETURNS boolean
 AS 'MODULE_PATHNAME','mtm_check_deadlock'
 LANGUAGE C;
Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,7 @@ int main() {`
`11`	`11`	`matrix[2] = 1;`
`12`	`12`	`matrix[4] = 3;`
`13`	`13`	`clique = MtmFindMaxClique(matrix, 64, &clique_size);`
`14`		`- printf("Clique=%lx\n", clique);`
	`14`	`+ printf("Clique=%llx\n", clique);`
`15`	`15`	`return 0;`
`16`	`16`	`}`
`17`	`17`