Skip to content

Commit 46533ff

Browse files
NeilBrown authored and shligit committed
md: Use REQ_FAILFAST_* on metadata writes where appropriate
This can only be supported on personalities which ensure that md_error() never causes an array to enter the 'failed' state. i.e. if marking a device Faulty would cause some data to be inaccessible, the device's status is left as non-Faulty. This is true for RAID1 and RAID10. If we get a failure writing metadata but the device doesn't fail, it must be the last device so we re-write without FAILFAST to improve the chance of success. We also flag the device as LastDev so that future metadata updates don't waste time on failfast writes. Signed-off-by: NeilBrown <neilb@suse.com> Signed-off-by: Shaohua Li <shli@fb.com>
1 parent 688834e commit 46533ff

File tree

5 files changed

+68
-14
lines changed

5 files changed

+68
-14
lines changed

drivers/md/bitmap.c

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -209,11 +209,13 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
209209

210210
static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
211211
{
212-
struct md_rdev *rdev = NULL;
212+
struct md_rdev *rdev;
213213
struct block_device *bdev;
214214
struct mddev *mddev = bitmap->mddev;
215215
struct bitmap_storage *store = &bitmap->storage;
216216

217+
restart:
218+
rdev = NULL;
217219
while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
218220
int size = PAGE_SIZE;
219221
loff_t offset = mddev->bitmap_info.offset;
@@ -269,8 +271,8 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
269271
page);
270272
}
271273

272-
if (wait)
273-
md_super_wait(mddev);
274+
if (wait && md_super_wait(mddev) < 0)
275+
goto restart;
274276
return 0;
275277

276278
bad_alignment:
@@ -428,6 +430,13 @@ static void bitmap_wait_writes(struct bitmap *bitmap)
428430
wait_event(bitmap->write_wait,
429431
atomic_read(&bitmap->pending_writes)==0);
430432
else
433+
/* Note that we ignore the return value. The writes
434+
* might have failed, but that would just mean that
435+
* some bits which should be cleared haven't been,
436+
* which is safe. The relevant bitmap blocks will
437+
* probably get written again, but there is no great
438+
* loss if they aren't.
439+
*/
431440
md_super_wait(bitmap->mddev);
432441
}
433442

drivers/md/md.c

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -727,7 +727,13 @@ static void super_written(struct bio *bio)
727727
if (bio->bi_error) {
728728
pr_err("md: super_written gets error=%d\n", bio->bi_error);
729729
md_error(mddev, rdev);
730-
}
730+
if (!test_bit(Faulty, &rdev->flags)
731+
&& (bio->bi_opf & MD_FAILFAST)) {
732+
set_bit(MD_NEED_REWRITE, &mddev->flags);
733+
set_bit(LastDev, &rdev->flags);
734+
}
735+
} else
736+
clear_bit(LastDev, &rdev->flags);
731737

732738
if (atomic_dec_and_test(&mddev->pending_writes))
733739
wake_up(&mddev->sb_wait);
@@ -744,7 +750,13 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
744750
* if zero is reached.
745751
* If an error occurred, call md_error
746752
*/
747-
struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
753+
struct bio *bio;
754+
int ff = 0;
755+
756+
if (test_bit(Faulty, &rdev->flags))
757+
return;
758+
759+
bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
748760

749761
atomic_inc(&rdev->nr_pending);
750762

@@ -753,16 +765,24 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
753765
bio_add_page(bio, page, size, 0);
754766
bio->bi_private = rdev;
755767
bio->bi_end_io = super_written;
756-
bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH_FUA);
768+
769+
if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
770+
test_bit(FailFast, &rdev->flags) &&
771+
!test_bit(LastDev, &rdev->flags))
772+
ff = MD_FAILFAST;
773+
bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH_FUA | ff);
757774

758775
atomic_inc(&mddev->pending_writes);
759776
submit_bio(bio);
760777
}
761778

762-
void md_super_wait(struct mddev *mddev)
779+
int md_super_wait(struct mddev *mddev)
763780
{
764781
/* wait for all superblock writes that were scheduled to complete */
765782
wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
783+
if (test_and_clear_bit(MD_NEED_REWRITE, &mddev->flags))
784+
return -EAGAIN;
785+
return 0;
766786
}
767787

768788
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
@@ -1334,9 +1354,10 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
13341354
if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
13351355
rdev->mddev->level >= 1)
13361356
num_sectors = (sector_t)(2ULL << 32) - 2;
1337-
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1357+
do {
1358+
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
13381359
rdev->sb_page);
1339-
md_super_wait(rdev->mddev);
1360+
} while (md_super_wait(rdev->mddev) < 0);
13401361
return num_sectors;
13411362
}
13421363

@@ -1877,9 +1898,10 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
18771898
sb->data_size = cpu_to_le64(num_sectors);
18781899
sb->super_offset = rdev->sb_start;
18791900
sb->sb_csum = calc_sb_1_csum(sb);
1880-
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1881-
rdev->sb_page);
1882-
md_super_wait(rdev->mddev);
1901+
do {
1902+
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1903+
rdev->sb_page);
1904+
} while (md_super_wait(rdev->mddev) < 0);
18831905
return num_sectors;
18841906

18851907
}
@@ -2416,6 +2438,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
24162438

24172439
if (mddev->queue)
24182440
blk_add_trace_msg(mddev->queue, "md md_update_sb");
2441+
rewrite:
24192442
bitmap_update_sb(mddev->bitmap);
24202443
rdev_for_each(rdev, mddev) {
24212444
char b[BDEVNAME_SIZE];
@@ -2447,7 +2470,8 @@ void md_update_sb(struct mddev *mddev, int force_change)
24472470
/* only need to write one superblock... */
24482471
break;
24492472
}
2450-
md_super_wait(mddev);
2473+
if (md_super_wait(mddev) < 0)
2474+
goto rewrite;
24512475
/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
24522476

24532477
if (mddev_is_clustered(mddev) && ret == 0)

drivers/md/md.h

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,16 @@
2929

3030
#define MaxSector (~(sector_t)0)
3131

32+
/*
33+
* These flags should really be called "NO_RETRY" rather than
34+
* "FAILFAST" because they don't make any promise about time lapse,
35+
* only about the number of retries, which will be zero.
36+
* REQ_FAILFAST_DRIVER is not included because
37+
* Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.")
38+
* seems to suggest that the errors it avoids retrying should usually
39+
* be retried.
40+
*/
41+
#define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
3242
/*
3343
* MD's 'extended' device
3444
*/
@@ -177,6 +187,10 @@ enum flag_bits {
177187
It is expected that no bad block log
178188
* is present.
179189
*/
190+
LastDev, /* Seems to be the last working dev as
191+
* it didn't fail, so don't use FailFast
192+
* any more for metadata
193+
*/
180194
};
181195

182196
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -213,6 +227,11 @@ enum mddev_flags {
213227
MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
214228
* already took resync lock, need to
215229
* release the lock */
230+
MD_FAILFAST_SUPPORTED, /* Using MD_FAILFAST on metadata writes is
231+
* supported as calls to md_error() will
232+
* never cause the array to become failed.
233+
*/
234+
MD_NEED_REWRITE, /* metadata write needs to be repeated */
216235
};
217236
#define MD_UPDATE_SB_FLAGS (BIT(MD_CHANGE_DEVS) | \
218237
BIT(MD_CHANGE_CLEAN) | \
@@ -628,7 +647,7 @@ extern int mddev_congested(struct mddev *mddev, int bits);
628647
extern void md_flush_request(struct mddev *mddev, struct bio *bio);
629648
extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
630649
sector_t sector, int size, struct page *page);
631-
extern void md_super_wait(struct mddev *mddev);
650+
extern int md_super_wait(struct mddev *mddev);
632651
extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
633652
struct page *page, int op, int op_flags,
634653
bool metadata_op);

drivers/md/raid1.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2988,6 +2988,7 @@ static int raid1_run(struct mddev *mddev)
29882988
mddev->thread = conf->thread;
29892989
conf->thread = NULL;
29902990
mddev->private = conf;
2991+
set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
29912992

29922993
md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
29932994

drivers/md/raid10.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3729,6 +3729,7 @@ static int raid10_run(struct mddev *mddev)
37293729
size = raid10_size(mddev, 0, 0);
37303730
md_set_array_sectors(mddev, size);
37313731
mddev->resync_max_sectors = size;
3732+
set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
37323733

37333734
if (mddev->queue) {
37343735
int stripe = conf->geo.raid_disks *

0 commit comments

Comments
 (0)