Skip to content

Commit c3cce6c

Browse files
author
NeilBrown
committed
md/raid5: ensure device failure recorded before write request returns.
When a write to one of the devices of a RAID5/6 fails, the failure is recorded in the metadata of the other devices so that after a restart the data on the failed drive won't be trusted even if that drive seems to be working again (maybe a cable was unplugged).

Similarly, when we record a bad-block in response to a write failure, we must not let the write complete until the bad-block update is safe.

Currently there is no interlock between the write request completing and the metadata update. So it is possible that the write will complete, the app will confirm success in some way, and then the machine will crash before the metadata update completes.

This is an extremely small hole for a race to fit in, but it is theoretically possible and so should be closed.

So:
- set MD_CHANGE_PENDING when requesting a metadata update for a failed device, so we can know with certainty when it completes
- queue requests that completed when MD_CHANGE_PENDING is set to only be processed after the metadata update completes
- call return_io() on bios in that queue when the time comes.

Signed-off-by: NeilBrown <neilb@suse.com>
1 parent 34a6f80 commit c3cce6c

File tree

2 files changed

+26
-1
lines changed

2 files changed

+26
-1
lines changed

drivers/md/raid5.c

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2513,6 +2513,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
25132513
set_bit(Blocked, &rdev->flags);
25142514
set_bit(Faulty, &rdev->flags);
25152515
set_bit(MD_CHANGE_DEVS, &mddev->flags);
2516+
set_bit(MD_CHANGE_PENDING, &mddev->flags);
25162517
printk(KERN_ALERT
25172518
"md/raid:%s: Disk failure on %s, disabling device.\n"
25182519
"md/raid:%s: Operation continuing on %d devices.\n",
@@ -4601,7 +4602,15 @@ static void handle_stripe(struct stripe_head *sh)
46014602
md_wakeup_thread(conf->mddev->thread);
46024603
}
46034604

4604-
return_io(&s.return_bi);
4605+
if (!bio_list_empty(&s.return_bi)) {
4606+
if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) {
4607+
spin_lock_irq(&conf->device_lock);
4608+
bio_list_merge(&conf->return_bi, &s.return_bi);
4609+
spin_unlock_irq(&conf->device_lock);
4610+
md_wakeup_thread(conf->mddev->thread);
4611+
} else
4612+
return_io(&s.return_bi);
4613+
}
46054614

46064615
clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
46074616
}
@@ -5817,6 +5826,18 @@ static void raid5d(struct md_thread *thread)
58175826

58185827
md_check_recovery(mddev);
58195828

5829+
if (!bio_list_empty(&conf->return_bi) &&
5830+
!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
5831+
struct bio_list tmp = BIO_EMPTY_LIST;
5832+
spin_lock_irq(&conf->device_lock);
5833+
if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
5834+
bio_list_merge(&tmp, &conf->return_bi);
5835+
bio_list_init(&conf->return_bi);
5836+
}
5837+
spin_unlock_irq(&conf->device_lock);
5838+
return_io(&tmp);
5839+
}
5840+
58205841
blk_start_plug(&plug);
58215842
handled = 0;
58225843
spin_lock_irq(&conf->device_lock);
@@ -6476,6 +6497,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
64766497
INIT_LIST_HEAD(&conf->hold_list);
64776498
INIT_LIST_HEAD(&conf->delayed_list);
64786499
INIT_LIST_HEAD(&conf->bitmap_list);
6500+
bio_list_init(&conf->return_bi);
64796501
init_llist_head(&conf->released_stripes);
64806502
atomic_set(&conf->active_stripes, 0);
64816503
atomic_set(&conf->preread_active_stripes, 0);

drivers/md/raid5.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -476,6 +476,9 @@ struct r5conf {
476476
int skip_copy; /* Don't copy data from bio to stripe cache */
477477
struct list_head *last_hold; /* detect hold_list promotions */
478478

479+
/* bios to have bi_end_io called after metadata is synced */
480+
struct bio_list return_bi;
481+
479482
atomic_t reshape_stripes; /* stripes with pending writes for reshape */
480483
/* unfortunately we need two cache names as we temporarily have
481484
* two caches.

0 commit comments

Comments
 (0)