Skip to content

Commit e7c0c3f

Browse files
committed
md/raid10: close race that lose writes lost when replacement completes.
When a replacement operation completes there is a small window when the original device is marked 'faulty' and the replacement still looks like a replacement. The faulty should be removed and the replacement moved in place very quickly, bit it isn't instant. So the code write out to the array must handle the possibility that the only working device for some slot in the replacement - but it doesn't. If the primary device is faulty it just gives up. This can lead to corruption. So make the code more robust: if either the primary or the replacement is present and working, write to them. Only when neither are present do we give up. This bug has been present since replacement was introduced in 3.3, so it is suitable for any -stable kernel since then. Reported-by: "George Spelvin" <linux@horizon.com> Cc: stable@vger.kernel.org Signed-off-by: NeilBrown <neilb@suse.de>
1 parent ca64cae commit e7c0c3f

File tree

1 file changed

+68
-61
lines changed

1 file changed

+68
-61
lines changed

drivers/md/raid10.c

Lines changed: 68 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1334,18 +1334,21 @@ static void make_request(struct mddev *mddev, struct bio * bio)
13341334
blocked_rdev = rrdev;
13351335
break;
13361336
}
1337+
if (rdev && (test_bit(Faulty, &rdev->flags)
1338+
|| test_bit(Unmerged, &rdev->flags)))
1339+
rdev = NULL;
13371340
if (rrdev && (test_bit(Faulty, &rrdev->flags)
13381341
|| test_bit(Unmerged, &rrdev->flags)))
13391342
rrdev = NULL;
13401343

13411344
r10_bio->devs[i].bio = NULL;
13421345
r10_bio->devs[i].repl_bio = NULL;
1343-
if (!rdev || test_bit(Faulty, &rdev->flags) ||
1344-
test_bit(Unmerged, &rdev->flags)) {
1346+
1347+
if (!rdev && !rrdev) {
13451348
set_bit(R10BIO_Degraded, &r10_bio->state);
13461349
continue;
13471350
}
1348-
if (test_bit(WriteErrorSeen, &rdev->flags)) {
1351+
if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
13491352
sector_t first_bad;
13501353
sector_t dev_sector = r10_bio->devs[i].addr;
13511354
int bad_sectors;
@@ -1387,8 +1390,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
13871390
max_sectors = good_sectors;
13881391
}
13891392
}
1390-
r10_bio->devs[i].bio = bio;
1391-
atomic_inc(&rdev->nr_pending);
1393+
if (rdev) {
1394+
r10_bio->devs[i].bio = bio;
1395+
atomic_inc(&rdev->nr_pending);
1396+
}
13921397
if (rrdev) {
13931398
r10_bio->devs[i].repl_bio = bio;
13941399
atomic_inc(&rrdev->nr_pending);
@@ -1444,69 +1449,71 @@ static void make_request(struct mddev *mddev, struct bio * bio)
14441449
for (i = 0; i < conf->copies; i++) {
14451450
struct bio *mbio;
14461451
int d = r10_bio->devs[i].devnum;
1447-
if (!r10_bio->devs[i].bio)
1448-
continue;
1452+
if (r10_bio->devs[i].bio) {
1453+
struct md_rdev *rdev = conf->mirrors[d].rdev;
1454+
mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1455+
md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1456+
max_sectors);
1457+
r10_bio->devs[i].bio = mbio;
1458+
1459+
mbio->bi_sector = (r10_bio->devs[i].addr+
1460+
choose_data_offset(r10_bio,
1461+
rdev));
1462+
mbio->bi_bdev = rdev->bdev;
1463+
mbio->bi_end_io = raid10_end_write_request;
1464+
mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
1465+
mbio->bi_private = r10_bio;
14491466

1450-
mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1451-
md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1452-
max_sectors);
1453-
r10_bio->devs[i].bio = mbio;
1467+
atomic_inc(&r10_bio->remaining);
14541468

1455-
mbio->bi_sector = (r10_bio->devs[i].addr+
1456-
choose_data_offset(r10_bio,
1457-
conf->mirrors[d].rdev));
1458-
mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1459-
mbio->bi_end_io = raid10_end_write_request;
1460-
mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
1461-
mbio->bi_private = r10_bio;
1469+
cb = blk_check_plugged(raid10_unplug, mddev,
1470+
sizeof(*plug));
1471+
if (cb)
1472+
plug = container_of(cb, struct raid10_plug_cb,
1473+
cb);
1474+
else
1475+
plug = NULL;
1476+
spin_lock_irqsave(&conf->device_lock, flags);
1477+
if (plug) {
1478+
bio_list_add(&plug->pending, mbio);
1479+
plug->pending_cnt++;
1480+
} else {
1481+
bio_list_add(&conf->pending_bio_list, mbio);
1482+
conf->pending_count++;
1483+
}
1484+
spin_unlock_irqrestore(&conf->device_lock, flags);
1485+
if (!plug)
1486+
md_wakeup_thread(mddev->thread);
1487+
}
14621488

1463-
atomic_inc(&r10_bio->remaining);
1489+
if (r10_bio->devs[i].repl_bio) {
1490+
struct md_rdev *rdev = conf->mirrors[d].replacement;
1491+
if (rdev == NULL) {
1492+
/* Replacement just got moved to main 'rdev' */
1493+
smp_mb();
1494+
rdev = conf->mirrors[d].rdev;
1495+
}
1496+
mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1497+
md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1498+
max_sectors);
1499+
r10_bio->devs[i].repl_bio = mbio;
1500+
1501+
mbio->bi_sector = (r10_bio->devs[i].addr +
1502+
choose_data_offset(
1503+
r10_bio, rdev));
1504+
mbio->bi_bdev = rdev->bdev;
1505+
mbio->bi_end_io = raid10_end_write_request;
1506+
mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
1507+
mbio->bi_private = r10_bio;
14641508

1465-
cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
1466-
if (cb)
1467-
plug = container_of(cb, struct raid10_plug_cb, cb);
1468-
else
1469-
plug = NULL;
1470-
spin_lock_irqsave(&conf->device_lock, flags);
1471-
if (plug) {
1472-
bio_list_add(&plug->pending, mbio);
1473-
plug->pending_cnt++;
1474-
} else {
1509+
atomic_inc(&r10_bio->remaining);
1510+
spin_lock_irqsave(&conf->device_lock, flags);
14751511
bio_list_add(&conf->pending_bio_list, mbio);
14761512
conf->pending_count++;
1513+
spin_unlock_irqrestore(&conf->device_lock, flags);
1514+
if (!mddev_check_plugged(mddev))
1515+
md_wakeup_thread(mddev->thread);
14771516
}
1478-
spin_unlock_irqrestore(&conf->device_lock, flags);
1479-
if (!plug)
1480-
md_wakeup_thread(mddev->thread);
1481-
1482-
if (!r10_bio->devs[i].repl_bio)
1483-
continue;
1484-
1485-
mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1486-
md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1487-
max_sectors);
1488-
r10_bio->devs[i].repl_bio = mbio;
1489-
1490-
/* We are actively writing to the original device
1491-
* so it cannot disappear, so the replacement cannot
1492-
* become NULL here
1493-
*/
1494-
mbio->bi_sector = (r10_bio->devs[i].addr +
1495-
choose_data_offset(
1496-
r10_bio,
1497-
conf->mirrors[d].replacement));
1498-
mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
1499-
mbio->bi_end_io = raid10_end_write_request;
1500-
mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
1501-
mbio->bi_private = r10_bio;
1502-
1503-
atomic_inc(&r10_bio->remaining);
1504-
spin_lock_irqsave(&conf->device_lock, flags);
1505-
bio_list_add(&conf->pending_bio_list, mbio);
1506-
conf->pending_count++;
1507-
spin_unlock_irqrestore(&conf->device_lock, flags);
1508-
if (!mddev_check_plugged(mddev))
1509-
md_wakeup_thread(mddev->thread);
15101517
}
15111518

15121519
/* Don't remove the bias on 'remaining' (one_write_done) until

0 commit comments

Comments
 (0)