Skip to content

Commit 254d802

Browse files
committed
Detect zombies
1 parent 982cc5b commit 254d802

File tree

2 files changed

+18
-5
lines changed

2 files changed

+18
-5
lines changed

contrib/mmts/arbiter.c

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -694,7 +694,10 @@ static void MtmTransReceiver(Datum arg)
694694
MtmTransState* ts = (MtmTransState*)hash_search(MtmXid2State, &msg->dxid, HASH_FIND, NULL);
695695
Assert(ts != NULL);
696696
Assert(msg->node > 0 && msg->node <= nNodes && msg->node != MtmNodeId);
697-
697+
698+
if (BIT_CHECK(msg->disabledNodeMask, MtmNodeId-1) && Mtm->status != MTM_RECOVERY) {
699+
elog(PANIC, "Node %d thinks that I was dead: perform hara-kiri not to be a zombie", msg->node);
700+
}
698701
Mtm->nodes[msg->node-1].oldestSnapshot = msg->oldestSnapshot;
699702

700703
if (MtmIsCoordinator(ts)) {

contrib/mmts/multimaster.c

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -140,6 +140,8 @@ HTAB* MtmXid2State;
140140
static HTAB* MtmGid2State;
141141
static HTAB* MtmLocalTables;
142142

143+
static bool MtmIsRecoverySession;
144+
143145
static MtmCurrentTrans MtmTx;
144146

145147
static TransactionManager MtmTM = {
@@ -1023,7 +1025,15 @@ static int64 MtmGetSlotLag(int nodeId)
10231025
*/
10241026
bool MtmIsRecoveredNode(int nodeId)
10251027
{
1026-
return BIT_CHECK(Mtm->disabledNodeMask, nodeId-1);
1028+
if (BIT_CHECK(Mtm->disabledNodeMask, nodeId-1)) {
1029+
if (!MtmIsRecoverySession) {
1030+
elog(ERROR, "Node %d is marked as disabled but is not in recovery mode", nodeId);
1031+
}
1032+
return true;
1033+
} else {
1034+
MtmIsRecoverySession = false; /* recovery is completed */
1035+
return false;
1036+
}
10271037
}
10281038

10291039

@@ -1872,17 +1882,17 @@ static void
18721882
MtmReplicationStartupHook(struct PGLogicalStartupHookArgs* args)
18731883
{
18741884
ListCell *param;
1875-
bool isRecoverySession = false;
1885+
MtmIsRecoverySession = false;
18761886
foreach(param, args->in_params)
18771887
{
18781888
DefElem *elem = lfirst(param);
18791889
if (strcmp("mtm_replication_mode", elem->defname) == 0) {
1880-
isRecoverySession = elem->arg != NULL && strVal(elem->arg) != NULL && strcmp(strVal(elem->arg), "recovery") == 0;
1890+
MtmIsRecoverySession = elem->arg != NULL && strVal(elem->arg) != NULL && strcmp(strVal(elem->arg), "recovery") == 0;
18811891
break;
18821892
}
18831893
}
18841894
MtmLock(LW_EXCLUSIVE);
1885-
if (isRecoverySession) {
1895+
if (MtmIsRecoverySession) {
18861896
elog(WARNING, "%d: Node %d start recovery of node %d", MyProcPid, MtmNodeId, MtmReplicationNodeId);
18871897
if (!BIT_CHECK(Mtm->disabledNodeMask, MtmReplicationNodeId-1)) {
18881898
BIT_SET(Mtm->disabledNodeMask, MtmReplicationNodeId-1);

0 commit comments

Comments
 (0)