Skip to content

Commit 1e6d690

Browse files
liu-song-6 authored and shligit committed
md/r5cache: caching phase of r5cache
As described in previous patch, write back cache operates in two phases: caching and writing-out. The caching phase works as: 1. write data to journal (r5c_handle_stripe_dirtying, r5c_cache_data) 2. call bio_endio (r5c_handle_data_cached, r5c_return_dev_pending_writes). Then the writing-out phase is as: 1. Mark the stripe as write-out (r5c_make_stripe_write_out) 2. Calcualte parity (reconstruct or RMW) 3. Write parity (and maybe some other data) to journal device 4. Write data and parity to RAID disks This patch implements caching phase. The cache is integrated with stripe cache of raid456. It leverages code of r5l_log to write data to journal device. Writing-out phase of the cache is implemented in the next patch. With r5cache, write operation does not wait for parity calculation and write out, so the write latency is lower (1 write to journal device vs. read and then write to raid disks). Also, r5cache will reduce RAID overhead (multipile IO due to read-modify-write of parity) and provide more opportunities of full stripe writes. This patch adds 2 flags to stripe_head.state: - STRIPE_R5C_PARTIAL_STRIPE, - STRIPE_R5C_FULL_STRIPE, Instead of inactive_list, stripes with cached data are tracked in r5conf->r5c_full_stripe_list and r5conf->r5c_partial_stripe_list. STRIPE_R5C_FULL_STRIPE and STRIPE_R5C_PARTIAL_STRIPE are flags for stripes in these lists. Note: stripes in r5c_full/partial_stripe_list are not considered as "active". For RMW, the code allocates an extra page for each data block being updated. This is stored in r5dev->orig_page and the old data is read into it. Then the prexor calculation subtracts ->orig_page from the parity block, and the reconstruct calculation adds the ->page data back into the parity block. r5cache naturally excludes SkipCopy. When the array has write back cache, async_copy_data() will not skip copy. There are some known limitations of the cache implementation: 1. Write cache only covers full page writes (R5_OVERWRITE). 
Writes of smaller granularity are write through. 2. Only one log io (sh->log_io) for each stripe at anytime. Later writes for the same stripe have to wait. This can be improved by moving log_io to r5dev. 3. With writeback cache, read path must enter state machine, which is a significant bottleneck for some workloads. 4. There is no per stripe checkpoint (with r5l_payload_flush) in the log, so recovery code has to replay more than necessary data (sometimes all the log from last_checkpoint). This reduces availability of the array. This patch includes a fix proposed by ZhengYuan Liu <liuzhengyuan@kylinos.cn> Signed-off-by: Song Liu <songliubraving@fb.com> Signed-off-by: Shaohua Li <shli@fb.com>
1 parent 2ded370 commit 1e6d690

File tree

3 files changed

+381
-32
lines changed

3 files changed

+381
-32
lines changed

drivers/md/raid5-cache.c

Lines changed: 233 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <linux/random.h>
2121
#include "md.h"
2222
#include "raid5.h"
23+
#include "bitmap.h"
2324

2425
/*
2526
* metadata/data stored in disk with 4k size unit (a block) regardless
@@ -218,6 +219,43 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
218219
io->state = state;
219220
}
220221

222+
static void
223+
r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
224+
struct bio_list *return_bi)
225+
{
226+
struct bio *wbi, *wbi2;
227+
228+
wbi = dev->written;
229+
dev->written = NULL;
230+
while (wbi && wbi->bi_iter.bi_sector <
231+
dev->sector + STRIPE_SECTORS) {
232+
wbi2 = r5_next_bio(wbi, dev->sector);
233+
if (!raid5_dec_bi_active_stripes(wbi)) {
234+
md_write_end(conf->mddev);
235+
bio_list_add(return_bi, wbi);
236+
}
237+
wbi = wbi2;
238+
}
239+
}
240+
241+
void r5c_handle_cached_data_endio(struct r5conf *conf,
242+
struct stripe_head *sh, int disks, struct bio_list *return_bi)
243+
{
244+
int i;
245+
246+
for (i = sh->disks; i--; ) {
247+
if (sh->dev[i].written) {
248+
set_bit(R5_UPTODATE, &sh->dev[i].flags);
249+
r5c_return_dev_pending_writes(conf, &sh->dev[i],
250+
return_bi);
251+
bitmap_endwrite(conf->mddev->bitmap, sh->sector,
252+
STRIPE_SECTORS,
253+
!test_bit(STRIPE_DEGRADED, &sh->state),
254+
0);
255+
}
256+
}
257+
}
258+
221259
/*
222260
* Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
223261
* This function should only be called in write-back mode.
@@ -231,6 +269,44 @@ static void r5c_make_stripe_write_out(struct stripe_head *sh)
231269

232270
WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
233271
clear_bit(STRIPE_R5C_CACHING, &sh->state);
272+
273+
if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
274+
atomic_inc(&conf->preread_active_stripes);
275+
276+
if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
277+
BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
278+
atomic_dec(&conf->r5c_cached_partial_stripes);
279+
}
280+
281+
if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
282+
BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
283+
atomic_dec(&conf->r5c_cached_full_stripes);
284+
}
285+
}
286+
287+
static void r5c_handle_data_cached(struct stripe_head *sh)
288+
{
289+
int i;
290+
291+
for (i = sh->disks; i--; )
292+
if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
293+
set_bit(R5_InJournal, &sh->dev[i].flags);
294+
clear_bit(R5_LOCKED, &sh->dev[i].flags);
295+
}
296+
clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
297+
}
298+
299+
/*
300+
* this journal write must contain full parity,
301+
* it may also contain some data pages
302+
*/
303+
static void r5c_handle_parity_cached(struct stripe_head *sh)
304+
{
305+
int i;
306+
307+
for (i = sh->disks; i--; )
308+
if (test_bit(R5_InJournal, &sh->dev[i].flags))
309+
set_bit(R5_Wantwrite, &sh->dev[i].flags);
234310
}
235311

236312
/*
@@ -250,8 +326,12 @@ static void r5c_finish_cache_stripe(struct stripe_head *sh)
250326
* two parities are written out together.
251327
*/
252328
set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
253-
} else
254-
BUG(); /* write-back logic in next patch */
329+
} else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
330+
r5c_handle_data_cached(sh);
331+
} else {
332+
r5c_handle_parity_cached(sh);
333+
set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
334+
}
255335
}
256336

257337
static void r5l_io_run_stripes(struct r5l_io_unit *io)
@@ -491,7 +571,8 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
491571
io = log->current_io;
492572

493573
for (i = 0; i < sh->disks; i++) {
494-
if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
574+
if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
575+
test_bit(R5_InJournal, &sh->dev[i].flags))
495576
continue;
496577
if (i == sh->pd_idx || i == sh->qd_idx)
497578
continue;
@@ -550,8 +631,10 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
550631
for (i = 0; i < sh->disks; i++) {
551632
void *addr;
552633

553-
if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
634+
if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
635+
test_bit(R5_InJournal, &sh->dev[i].flags))
554636
continue;
637+
555638
write_disks++;
556639
/* checksum is already calculated in last run */
557640
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
@@ -817,7 +900,6 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
817900
}
818901
}
819902

820-
821903
static void r5l_do_reclaim(struct r5l_log *log)
822904
{
823905
sector_t reclaim_target = xchg(&log->reclaim_target, 0);
@@ -1218,12 +1300,80 @@ int r5c_try_caching_write(struct r5conf *conf,
12181300
int disks)
12191301
{
12201302
struct r5l_log *log = conf->log;
1303+
int i;
1304+
struct r5dev *dev;
1305+
int to_cache = 0;
12211306

12221307
BUG_ON(!r5c_is_writeback(log));
12231308

1224-
/* more write-back logic in next patches */
1225-
r5c_make_stripe_write_out(sh);
1226-
return -EAGAIN;
1309+
if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
1310+
/*
1311+
* There are two different scenarios here:
1312+
* 1. The stripe has some data cached, and it is sent to
1313+
* write-out phase for reclaim
1314+
* 2. The stripe is clean, and this is the first write
1315+
*
1316+
* For 1, return -EAGAIN, so we continue with
1317+
* handle_stripe_dirtying().
1318+
*
1319+
* For 2, set STRIPE_R5C_CACHING and continue with caching
1320+
* write.
1321+
*/
1322+
1323+
/* case 1: anything injournal or anything in written */
1324+
if (s->injournal > 0 || s->written > 0)
1325+
return -EAGAIN;
1326+
/* case 2 */
1327+
set_bit(STRIPE_R5C_CACHING, &sh->state);
1328+
}
1329+
1330+
for (i = disks; i--; ) {
1331+
dev = &sh->dev[i];
1332+
/* if non-overwrite, use writing-out phase */
1333+
if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
1334+
!test_bit(R5_InJournal, &dev->flags)) {
1335+
r5c_make_stripe_write_out(sh);
1336+
return -EAGAIN;
1337+
}
1338+
}
1339+
1340+
for (i = disks; i--; ) {
1341+
dev = &sh->dev[i];
1342+
if (dev->towrite) {
1343+
set_bit(R5_Wantwrite, &dev->flags);
1344+
set_bit(R5_Wantdrain, &dev->flags);
1345+
set_bit(R5_LOCKED, &dev->flags);
1346+
to_cache++;
1347+
}
1348+
}
1349+
1350+
if (to_cache) {
1351+
set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1352+
/*
1353+
* set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
1354+
* in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
1355+
* r5c_handle_data_cached()
1356+
*/
1357+
set_bit(STRIPE_LOG_TRAPPED, &sh->state);
1358+
}
1359+
1360+
return 0;
1361+
}
1362+
1363+
/*
1364+
* free extra pages (orig_page) we allocated for prexor
1365+
*/
1366+
void r5c_release_extra_page(struct stripe_head *sh)
1367+
{
1368+
int i;
1369+
1370+
for (i = sh->disks; i--; )
1371+
if (sh->dev[i].page != sh->dev[i].orig_page) {
1372+
struct page *p = sh->dev[i].orig_page;
1373+
1374+
sh->dev[i].orig_page = sh->dev[i].page;
1375+
put_page(p);
1376+
}
12271377
}
12281378

12291379
/*
@@ -1234,6 +1384,9 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
12341384
struct stripe_head *sh,
12351385
struct stripe_head_state *s)
12361386
{
1387+
int i;
1388+
int do_wakeup = 0;
1389+
12371390
if (!conf->log ||
12381391
!test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
12391392
return;
@@ -1243,7 +1396,78 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
12431396

12441397
if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
12451398
return;
1246-
BUG(); /* write-back logic in following patches */
1399+
1400+
for (i = sh->disks; i--; ) {
1401+
clear_bit(R5_InJournal, &sh->dev[i].flags);
1402+
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1403+
do_wakeup = 1;
1404+
}
1405+
1406+
/*
1407+
* analyse_stripe() runs before r5c_finish_stripe_write_out(),
1408+
* We updated R5_InJournal, so we also update s->injournal.
1409+
*/
1410+
s->injournal = 0;
1411+
1412+
if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
1413+
if (atomic_dec_and_test(&conf->pending_full_writes))
1414+
md_wakeup_thread(conf->mddev->thread);
1415+
1416+
if (do_wakeup)
1417+
wake_up(&conf->wait_for_overlap);
1418+
}
1419+
1420+
int
1421+
r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
1422+
struct stripe_head_state *s)
1423+
{
1424+
int pages = 0;
1425+
int reserve;
1426+
int i;
1427+
int ret = 0;
1428+
1429+
BUG_ON(!log);
1430+
1431+
for (i = 0; i < sh->disks; i++) {
1432+
void *addr;
1433+
1434+
if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
1435+
continue;
1436+
addr = kmap_atomic(sh->dev[i].page);
1437+
sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
1438+
addr, PAGE_SIZE);
1439+
kunmap_atomic(addr);
1440+
pages++;
1441+
}
1442+
WARN_ON(pages == 0);
1443+
1444+
/*
1445+
* The stripe must enter state machine again to call endio, so
1446+
* don't delay.
1447+
*/
1448+
clear_bit(STRIPE_DELAYED, &sh->state);
1449+
atomic_inc(&sh->count);
1450+
1451+
mutex_lock(&log->io_mutex);
1452+
/* meta + data */
1453+
reserve = (1 + pages) << (PAGE_SHIFT - 9);
1454+
if (!r5l_has_free_space(log, reserve)) {
1455+
spin_lock(&log->no_space_stripes_lock);
1456+
list_add_tail(&sh->log_list, &log->no_space_stripes);
1457+
spin_unlock(&log->no_space_stripes_lock);
1458+
1459+
r5l_wake_reclaim(log, reserve);
1460+
} else {
1461+
ret = r5l_log_stripe(log, sh, pages, 0);
1462+
if (ret) {
1463+
spin_lock_irq(&log->io_list_lock);
1464+
list_add_tail(&sh->log_list, &log->no_mem_stripes);
1465+
spin_unlock_irq(&log->io_list_lock);
1466+
}
1467+
}
1468+
1469+
mutex_unlock(&log->io_mutex);
1470+
return 0;
12471471
}
12481472

12491473

0 commit comments

Comments
 (0)