Skip to content

Commit 2e52d44

Browse files
NeilBrown authored and
shligit committed
md/raid1: add failfast handling for reads.
If a device is marked FailFast and it is not the only device we can read from, we mark the bio with REQ_FAILFAST_* flags. If this does fail, we don't try read repair but just allow failure. If it was the last device it doesn't fail of course, so the retry happens on the same device - this time without FAILFAST. A subsequent failure will not retry but will just pass up the error. During resync we may use FAILFAST requests and on a failure we will simply use the other device(s). During recovery we will only use FAILFAST in the unusual case where there are multiple places to read from - i.e. if there are > 2 devices. If we get a failure we will fail the device and complete the resync/recovery with remaining devices. The new R1BIO_FailFast flag is set on a read request to suggest that a FAILFAST request might be acceptable. The rdev needs to have FailFast set as well for the read to actually use REQ_FAILFAST_*. We need to know there are at least two working devices before we can set R1BIO_FailFast, so we mustn't stop looking at the first device we find. So the "min_pending == 0" handling is changed to not exit early, but to always choose the best_pending_disk if min_pending == 0. The spinlocked region in raid1_error() is enlarged to ensure that if two bios, reading from two different devices, fail at the same time, then there is no risk that both devices will be marked faulty, leaving zero "In_sync" devices. Signed-off-by: NeilBrown <neilb@suse.com> Signed-off-by: Shaohua Li <shli@fb.com>
1 parent 46533ff commit 2e52d44

File tree

2 files changed

+43
-10
lines changed

2 files changed

+43
-10
lines changed

drivers/md/raid1.c

Lines changed: 42 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -329,6 +329,11 @@ static void raid1_end_read_request(struct bio *bio)
329329

330330
if (uptodate)
331331
set_bit(R1BIO_Uptodate, &r1_bio->state);
332+
else if (test_bit(FailFast, &rdev->flags) &&
333+
test_bit(R1BIO_FailFast, &r1_bio->state))
334+
/* This was a fail-fast read so we definitely
335+
* want to retry */
336+
;
332337
else {
333338
/* If all other devices have failed, we want to return
334339
* the error upwards rather than fail the last device.
@@ -535,6 +540,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
535540
best_good_sectors = 0;
536541
has_nonrot_disk = 0;
537542
choose_next_idle = 0;
543+
clear_bit(R1BIO_FailFast, &r1_bio->state);
538544

539545
if ((conf->mddev->recovery_cp < this_sector + sectors) ||
540546
(mddev_is_clustered(conf->mddev) &&
@@ -608,6 +614,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
608614
} else
609615
best_good_sectors = sectors;
610616

617+
if (best_disk >= 0)
618+
/* At least two disks to choose from so failfast is OK */
619+
set_bit(R1BIO_FailFast, &r1_bio->state);
620+
611621
nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
612622
has_nonrot_disk |= nonrot;
613623
pending = atomic_read(&rdev->nr_pending);
@@ -646,11 +656,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
646656
}
647657
break;
648658
}
649-
/* If device is idle, use it */
650-
if (pending == 0) {
651-
best_disk = disk;
652-
break;
653-
}
654659

655660
if (choose_next_idle)
656661
continue;
@@ -673,7 +678,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
673678
* mixed ratation/non-rotational disks depending on workload.
674679
*/
675680
if (best_disk == -1) {
676-
if (has_nonrot_disk)
681+
if (has_nonrot_disk || min_pending == 0)
677682
best_disk = best_pending_disk;
678683
else
679684
best_disk = best_dist_disk;
@@ -1167,6 +1172,9 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
11671172
read_bio->bi_bdev = mirror->rdev->bdev;
11681173
read_bio->bi_end_io = raid1_end_read_request;
11691174
bio_set_op_attrs(read_bio, op, do_sync);
1175+
if (test_bit(FailFast, &mirror->rdev->flags) &&
1176+
test_bit(R1BIO_FailFast, &r1_bio->state))
1177+
read_bio->bi_opf |= MD_FAILFAST;
11701178
read_bio->bi_private = r1_bio;
11711179

11721180
if (mddev->gendisk)
@@ -1464,6 +1472,7 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
14641472
* next level up know.
14651473
* else mark the drive as failed
14661474
*/
1475+
spin_lock_irqsave(&conf->device_lock, flags);
14671476
if (test_bit(In_sync, &rdev->flags)
14681477
&& (conf->raid_disks - mddev->degraded) == 1) {
14691478
/*
@@ -1473,10 +1482,10 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
14731482
* it is very likely to fail.
14741483
*/
14751484
conf->recovery_disabled = mddev->recovery_disabled;
1485+
spin_unlock_irqrestore(&conf->device_lock, flags);
14761486
return;
14771487
}
14781488
set_bit(Blocked, &rdev->flags);
1479-
spin_lock_irqsave(&conf->device_lock, flags);
14801489
if (test_and_clear_bit(In_sync, &rdev->flags)) {
14811490
mddev->degraded++;
14821491
set_bit(Faulty, &rdev->flags);
@@ -1815,12 +1824,24 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
18151824
sector_t sect = r1_bio->sector;
18161825
int sectors = r1_bio->sectors;
18171826
int idx = 0;
1827+
struct md_rdev *rdev;
1828+
1829+
rdev = conf->mirrors[r1_bio->read_disk].rdev;
1830+
if (test_bit(FailFast, &rdev->flags)) {
1831+
/* Don't try recovering from here - just fail it
1832+
* ... unless it is the last working device of course */
1833+
md_error(mddev, rdev);
1834+
if (test_bit(Faulty, &rdev->flags))
1835+
/* Don't try to read from here, but make sure
1836+
* put_buf does it's thing
1837+
*/
1838+
bio->bi_end_io = end_sync_write;
1839+
}
18181840

18191841
while(sectors) {
18201842
int s = sectors;
18211843
int d = r1_bio->read_disk;
18221844
int success = 0;
1823-
struct md_rdev *rdev;
18241845
int start;
18251846

18261847
if (s > (PAGE_SIZE>>9))
@@ -2331,7 +2352,9 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
23312352
bio_put(bio);
23322353
r1_bio->bios[r1_bio->read_disk] = NULL;
23332354

2334-
if (mddev->ro == 0) {
2355+
rdev = conf->mirrors[r1_bio->read_disk].rdev;
2356+
if (mddev->ro == 0
2357+
&& !test_bit(FailFast, &rdev->flags)) {
23352358
freeze_array(conf, 1);
23362359
fix_read_error(conf, r1_bio->read_disk,
23372360
r1_bio->sector, r1_bio->sectors);
@@ -2340,7 +2363,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
23402363
r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
23412364
}
23422365

2343-
rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);
2366+
rdev_dec_pending(rdev, conf->mddev);
23442367

23452368
read_more:
23462369
disk = read_balance(conf, r1_bio, &max_sectors);
@@ -2365,6 +2388,9 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
23652388
bio->bi_bdev = rdev->bdev;
23662389
bio->bi_end_io = raid1_end_read_request;
23672390
bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
2391+
if (test_bit(FailFast, &rdev->flags) &&
2392+
test_bit(R1BIO_FailFast, &r1_bio->state))
2393+
bio->bi_opf |= MD_FAILFAST;
23682394
bio->bi_private = r1_bio;
23692395
if (max_sectors < r1_bio->sectors) {
23702396
/* Drat - have to split this up more */
@@ -2653,6 +2679,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
26532679
bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
26542680
bio->bi_bdev = rdev->bdev;
26552681
bio->bi_private = r1_bio;
2682+
if (test_bit(FailFast, &rdev->flags))
2683+
bio->bi_opf |= MD_FAILFAST;
26562684
}
26572685
}
26582686
rcu_read_unlock();
@@ -2783,13 +2811,17 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
27832811
if (bio->bi_end_io == end_sync_read) {
27842812
read_targets--;
27852813
md_sync_acct(bio->bi_bdev, nr_sectors);
2814+
if (read_targets == 1)
2815+
bio->bi_opf &= ~MD_FAILFAST;
27862816
generic_make_request(bio);
27872817
}
27882818
}
27892819
} else {
27902820
atomic_set(&r1_bio->remaining, 1);
27912821
bio = r1_bio->bios[r1_bio->read_disk];
27922822
md_sync_acct(bio->bi_bdev, nr_sectors);
2823+
if (read_targets == 1)
2824+
bio->bi_opf &= ~MD_FAILFAST;
27932825
generic_make_request(bio);
27942826

27952827
}

drivers/md/raid1.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,5 +183,6 @@ enum r1bio_state {
183183
*/
184184
R1BIO_MadeGood,
185185
R1BIO_WriteError,
186+
R1BIO_FailFast,
186187
};
187188
#endif

0 commit comments

Comments
 (0)