Skip to content

Commit a3c06a3

Browse files
mauelsha authored and snitm committed
dm raid: enhance attempt_restore_of_faulty_devices() to support more devices
attempt_restore_of_faulty_devices() is limited to 64 devices when it should support the new maximum of 253 when identifying any failed devices. It clears any revivable devices via an MD personality hot remove and add cycle to allow for their recovery. Address this by using existing functions to retrieve and update all failed devices' bitfield members in the dm raid superblocks on all RAID devices and check for any devices to clear in them. While at it, don't call attempt_restore_of_faulty_devices() for any MD personality not providing disk hot add/remove methods (i.e. raid0 now), because such personalities don't support reviving failed disks. Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
1 parent 31e10a4 commit a3c06a3

File tree

1 file changed

+24
-8
lines changed

1 file changed

+24
-8
lines changed

drivers/md/dm-raid.c

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3391,11 +3391,19 @@ static void raid_postsuspend(struct dm_target *ti)
33913391
static void attempt_restore_of_faulty_devices(struct raid_set *rs)
33923392
{
33933393
int i;
3394-
uint64_t failed_devices, cleared_failed_devices = 0;
3394+
uint64_t cleared_failed_devices[DISKS_ARRAY_ELEMS];
33953395
unsigned long flags;
3396+
bool cleared = false;
33963397
struct dm_raid_superblock *sb;
3398+
struct mddev *mddev = &rs->md;
33973399
struct md_rdev *r;
33983400

3401+
/* RAID personalities have to provide hot add/remove methods or we need to bail out. */
3402+
if (!mddev->pers || !mddev->pers->hot_add_disk || !mddev->pers->hot_remove_disk)
3403+
return;
3404+
3405+
memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices));
3406+
33993407
for (i = 0; i < rs->md.raid_disks; i++) {
34003408
r = &rs->dev[i].rdev;
34013409
if (test_bit(Faulty, &r->flags) && r->sb_page &&
@@ -3415,7 +3423,7 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
34153423
* ourselves.
34163424
*/
34173425
if ((r->raid_disk >= 0) &&
3418-
(r->mddev->pers->hot_remove_disk(r->mddev, r) != 0))
3426+
(mddev->pers->hot_remove_disk(mddev, r) != 0))
34193427
/* Failed to revive this device, try next */
34203428
continue;
34213429

@@ -3425,22 +3433,30 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
34253433
clear_bit(Faulty, &r->flags);
34263434
clear_bit(WriteErrorSeen, &r->flags);
34273435
clear_bit(In_sync, &r->flags);
3428-
if (r->mddev->pers->hot_add_disk(r->mddev, r)) {
3436+
if (mddev->pers->hot_add_disk(mddev, r)) {
34293437
r->raid_disk = -1;
34303438
r->saved_raid_disk = -1;
34313439
r->flags = flags;
34323440
} else {
34333441
r->recovery_offset = 0;
3434-
cleared_failed_devices |= 1 << i;
3442+
set_bit(i, (void *) cleared_failed_devices);
3443+
cleared = true;
34353444
}
34363445
}
34373446
}
3438-
if (cleared_failed_devices) {
3447+
3448+
/* If any failed devices could be cleared, update all sbs failed_devices bits */
3449+
if (cleared) {
3450+
uint64_t failed_devices[DISKS_ARRAY_ELEMS];
3451+
34393452
rdev_for_each(r, &rs->md) {
34403453
sb = page_address(r->sb_page);
3441-
failed_devices = le64_to_cpu(sb->failed_devices);
3442-
failed_devices &= ~cleared_failed_devices;
3443-
sb->failed_devices = cpu_to_le64(failed_devices);
3454+
sb_retrieve_failed_devices(sb, failed_devices);
3455+
3456+
for (i = 0; i < DISKS_ARRAY_ELEMS; i++)
3457+
failed_devices[i] &= ~cleared_failed_devices[i];
3458+
3459+
sb_update_failed_devices(sb, failed_devices);
34443460
}
34453461
}
34463462
}

0 commit comments

Comments
 (0)