Skip to content

Commit 33c12a5

Browse files
Sunil Mushran authored and jlbec committed
ocfs2/cluster: Heartbeat mismatch message improved
If o2hb finds unexpected values in the heartbeat slot, it prints a message
"ERROR: Device "dm-6": another node is heartbeating in our slot!"
This message could be misleading. This patch adds two more messages to help
users better diagnose the problem.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
1 parent 76d9fc2 commit 33c12a5

File tree

1 file changed

+31
-17
lines changed

1 file changed

+31
-17
lines changed

fs/ocfs2/cluster/heartbeat.c

Lines changed: 31 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -539,25 +539,41 @@ static int o2hb_verify_crc(struct o2hb_region *reg,
539539

540540
/* We want to make sure that nobody is heartbeating on top of us --
541541
* this will help detect an invalid configuration. */
542-
static int o2hb_check_last_timestamp(struct o2hb_region *reg)
542+
static void o2hb_check_last_timestamp(struct o2hb_region *reg)
543543
{
544-
int node_num, ret;
545544
struct o2hb_disk_slot *slot;
546545
struct o2hb_disk_heartbeat_block *hb_block;
546+
char *errstr;
547547

548-
node_num = o2nm_this_node();
549-
550-
ret = 1;
551-
slot = &reg->hr_slots[node_num];
548+
slot = &reg->hr_slots[o2nm_this_node()];
552549
/* Don't check on our 1st timestamp */
553-
if (slot->ds_last_time) {
554-
hb_block = slot->ds_raw_block;
550+
if (!slot->ds_last_time)
551+
return;
555552

556-
if (le64_to_cpu(hb_block->hb_seq) != slot->ds_last_time)
557-
ret = 0;
558-
}
553+
hb_block = slot->ds_raw_block;
554+
if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time &&
555+
le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation &&
556+
hb_block->hb_node == slot->ds_node_num)
557+
return;
559558

560-
return ret;
559+
#define ERRSTR1 "Another node is heartbeating on device"
560+
#define ERRSTR2 "Heartbeat generation mismatch on device"
561+
#define ERRSTR3 "Heartbeat sequence mismatch on device"
562+
563+
if (hb_block->hb_node != slot->ds_node_num)
564+
errstr = ERRSTR1;
565+
else if (le64_to_cpu(hb_block->hb_generation) !=
566+
slot->ds_last_generation)
567+
errstr = ERRSTR2;
568+
else
569+
errstr = ERRSTR3;
570+
571+
mlog(ML_ERROR, "%s (%s): expected(%u:0x%llx, 0x%llx), "
572+
"ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_dev_name,
573+
slot->ds_node_num, (unsigned long long)slot->ds_last_generation,
574+
(unsigned long long)slot->ds_last_time, hb_block->hb_node,
575+
(unsigned long long)le64_to_cpu(hb_block->hb_generation),
576+
(unsigned long long)le64_to_cpu(hb_block->hb_seq));
561577
}
562578

563579
static inline void o2hb_prepare_block(struct o2hb_region *reg,
@@ -983,9 +999,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
983999
/* With an up to date view of the slots, we can check that no
9841000
* other node has been improperly configured to heartbeat in
9851001
* our slot. */
986-
if (!o2hb_check_last_timestamp(reg))
987-
mlog(ML_ERROR, "Device \"%s\": another node is heartbeating "
988-
"in our slot!\n", reg->hr_dev_name);
1002+
o2hb_check_last_timestamp(reg);
9891003

9901004
/* fill in the proper info for our next heartbeat */
9911005
o2hb_prepare_block(reg, reg->hr_generation);
@@ -999,8 +1013,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
9991013
}
10001014

10011015
i = -1;
1002-
while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
1003-
1016+
while((i = find_next_bit(configured_nodes,
1017+
O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
10041018
change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
10051019
}
10061020

0 commit comments

Comments
 (0)