Skip to content

Commit 2ded370

Browse files
liu-song-6 authored and shligit committed
md/r5cache: State machine for raid5-cache write back mode
This patch adds a state machine for raid5-cache. With a log device, the raid456 array can operate in two different modes (r5c_journal_mode): - write-back (R5C_JOURNAL_MODE_WRITE_BACK) - write-through (R5C_JOURNAL_MODE_WRITE_THROUGH) Existing code of raid5-cache only has write-through mode. For write-back cache, it is necessary to extend the state machine. With write-back cache, every stripe can operate in two different phases: - caching - writing-out In caching phase, the stripe handles writes as: - write to journal - return IO In writing-out phase, the stripe behaves as a stripe in write-through mode (R5C_JOURNAL_MODE_WRITE_THROUGH). STRIPE_R5C_CACHING is added to sh->state to differentiate caching and writing-out phase. Please note: this is a "no-op" patch for raid5-cache write-through mode. The following detailed explanation is copied from the raid5-cache.c: /* * raid5 cache state machine * * With the RAID cache, each stripe works in two phases: * - caching phase * - writing-out phase * * These two phases are controlled by bit STRIPE_R5C_CACHING: * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase * * When there is no journal, or the journal is in write-through mode, * the stripe is always in writing-out phase. * * For write-back journal, the stripe is sent to caching phase on write * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off * the write-out phase by clearing STRIPE_R5C_CACHING. * * Stripes in caching phase do not write the raid disks. Instead, all * writes are committed from the log device. Therefore, a stripe in * caching phase handles writes as: * - write to log device * - return IO * * Stripes in writing-out phase handle writes as: * - calculate parity * - write pending data and parity to journal * - write data and parity to raid disks * - return IO for pending writes */ Signed-off-by: Song Liu <songliubraving@fb.com> Signed-off-by: Shaohua Li <shli@fb.com>
1 parent 937621c commit 2ded370

File tree

3 files changed

+211
-8
lines changed

3 files changed

+211
-8
lines changed

drivers/md/raid5-cache.c

Lines changed: 140 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,47 @@
4040
*/
4141
#define R5L_POOL_SIZE 4
4242

43+
/*
 * r5c journal modes of the array: write-back or write-through.
 *
 * Write-through mode behaves identically to the existing log-only
 * implementation.
 */
enum r5c_journal_mode {
	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
	R5C_JOURNAL_MODE_WRITE_BACK = 1,
};
52+
53+
/*
54+
* raid5 cache state machine
55+
*
56+
* With the RAID cache, each stripe works in two phases:
57+
* - caching phase
58+
* - writing-out phase
59+
*
60+
* These two phases are controlled by bit STRIPE_R5C_CACHING:
61+
* if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
62+
* if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
63+
*
64+
* When there is no journal, or the journal is in write-through mode,
65+
* the stripe is always in writing-out phase.
66+
*
67+
* For write-back journal, the stripe is sent to caching phase on write
68+
* (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
69+
* the write-out phase by clearing STRIPE_R5C_CACHING.
70+
*
71+
* Stripes in caching phase do not write the raid disks. Instead, all
72+
* writes are committed from the log device. Therefore, a stripe in
73+
* caching phase handles writes as:
74+
* - write to log device
75+
* - return IO
76+
*
77+
* Stripes in writing-out phase handle writes as:
78+
* - calculate parity
79+
* - write pending data and parity to journal
80+
* - write data and parity to raid disks
81+
* - return IO for pending writes
82+
*/
83+
4384
struct r5l_log {
4485
struct md_rdev *rdev;
4586

@@ -96,6 +137,9 @@ struct r5l_log {
96137
spinlock_t no_space_stripes_lock;
97138

98139
bool need_cache_flush;
140+
141+
/* for r5c_cache */
142+
enum r5c_journal_mode r5c_journal_mode;
99143
};
100144

101145
/*
@@ -133,6 +177,12 @@ enum r5l_io_unit_state {
133177
IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */
134178
};
135179

180+
bool r5c_is_writeback(struct r5l_log *log)
181+
{
182+
return (log != NULL &&
183+
log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
184+
}
185+
136186
static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
137187
{
138188
start += inc;
@@ -168,12 +218,51 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
168218
io->state = state;
169219
}
170220

221+
/*
222+
* Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
223+
* This function should only be called in write-back mode.
224+
*/
225+
static void r5c_make_stripe_write_out(struct stripe_head *sh)
226+
{
227+
struct r5conf *conf = sh->raid_conf;
228+
struct r5l_log *log = conf->log;
229+
230+
BUG_ON(!r5c_is_writeback(log));
231+
232+
WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
233+
clear_bit(STRIPE_R5C_CACHING, &sh->state);
234+
}
235+
236+
/*
237+
* Setting proper flags after writing (or flushing) data and/or parity to the
238+
* log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
239+
*/
240+
static void r5c_finish_cache_stripe(struct stripe_head *sh)
241+
{
242+
struct r5l_log *log = sh->raid_conf->log;
243+
244+
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
245+
BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
246+
/*
247+
* Set R5_InJournal for parity dev[pd_idx]. This means
248+
* all data AND parity in the journal. For RAID 6, it is
249+
* NOT necessary to set the flag for dev[qd_idx], as the
250+
* two parities are written out together.
251+
*/
252+
set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
253+
} else
254+
BUG(); /* write-back logic in next patch */
255+
}
256+
171257
static void r5l_io_run_stripes(struct r5l_io_unit *io)
172258
{
173259
struct stripe_head *sh, *next;
174260

175261
list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
176262
list_del_init(&sh->log_list);
263+
264+
r5c_finish_cache_stripe(sh);
265+
177266
set_bit(STRIPE_HANDLE, &sh->state);
178267
raid5_release_stripe(sh);
179268
}
@@ -412,18 +501,19 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
412501
r5l_append_payload_page(log, sh->dev[i].page);
413502
}
414503

415-
if (sh->qd_idx >= 0) {
504+
if (parity_pages == 2) {
416505
r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
417506
sh->sector, sh->dev[sh->pd_idx].log_checksum,
418507
sh->dev[sh->qd_idx].log_checksum, true);
419508
r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
420509
r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
421-
} else {
510+
} else if (parity_pages == 1) {
422511
r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
423512
sh->sector, sh->dev[sh->pd_idx].log_checksum,
424513
0, false);
425514
r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
426-
}
515+
} else /* Just writing data, not parity, in caching phase */
516+
BUG_ON(parity_pages != 0);
427517

428518
list_add_tail(&sh->log_list, &io->stripe_list);
429519
atomic_inc(&io->pending_stripe);
@@ -455,6 +545,8 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
455545
return -EAGAIN;
456546
}
457547

548+
WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
549+
458550
for (i = 0; i < sh->disks; i++) {
459551
void *addr;
460552

@@ -1112,6 +1204,49 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp)
11121204
set_bit(MD_CHANGE_DEVS, &mddev->flags);
11131205
}
11141206

1207+
/*
1208+
* Try handle write operation in caching phase. This function should only
1209+
* be called in write-back mode.
1210+
*
1211+
* If all outstanding writes can be handled in caching phase, returns 0
1212+
* If writes requires write-out phase, call r5c_make_stripe_write_out()
1213+
* and returns -EAGAIN
1214+
*/
1215+
int r5c_try_caching_write(struct r5conf *conf,
1216+
struct stripe_head *sh,
1217+
struct stripe_head_state *s,
1218+
int disks)
1219+
{
1220+
struct r5l_log *log = conf->log;
1221+
1222+
BUG_ON(!r5c_is_writeback(log));
1223+
1224+
/* more write-back logic in next patches */
1225+
r5c_make_stripe_write_out(sh);
1226+
return -EAGAIN;
1227+
}
1228+
1229+
/*
1230+
* clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
1231+
* stripe is committed to RAID disks.
1232+
*/
1233+
void r5c_finish_stripe_write_out(struct r5conf *conf,
1234+
struct stripe_head *sh,
1235+
struct stripe_head_state *s)
1236+
{
1237+
if (!conf->log ||
1238+
!test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
1239+
return;
1240+
1241+
WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
1242+
clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
1243+
1244+
if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1245+
return;
1246+
BUG(); /* write-back logic in following patches */
1247+
}
1248+
1249+
11151250
static int r5l_load_log(struct r5l_log *log)
11161251
{
11171252
struct md_rdev *rdev = log->rdev;
@@ -1249,6 +1384,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
12491384
INIT_LIST_HEAD(&log->no_space_stripes);
12501385
spin_lock_init(&log->no_space_stripes_lock);
12511386

1387+
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
1388+
12521389
if (r5l_load_log(log))
12531390
goto error;
12541391

drivers/md/raid5.c

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4107,6 +4107,9 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
41074107
if (rdev && !test_bit(Faulty, &rdev->flags))
41084108
do_recovery = 1;
41094109
}
4110+
4111+
if (test_bit(R5_InJournal, &dev->flags))
4112+
s->injournal++;
41104113
}
41114114
if (test_bit(STRIPE_SYNCING, &sh->state)) {
41124115
/* If there is a failed device being replaced,
@@ -4386,14 +4389,47 @@ static void handle_stripe(struct stripe_head *sh)
43864389
|| s.expanding)
43874390
handle_stripe_fill(sh, &s, disks);
43884391

4389-
/* Now to consider new write requests and what else, if anything
4390-
* should be read. We do not handle new writes when:
4392+
/*
4393+
* When the stripe finishes full journal write cycle (write to journal
4394+
* and raid disk), this is the clean up procedure so it is ready for
4395+
* next operation.
4396+
*/
4397+
r5c_finish_stripe_write_out(conf, sh, &s);
4398+
4399+
/*
4400+
* Now to consider new write requests, cache write back and what else,
4401+
* if anything should be read. We do not handle new writes when:
43914402
* 1/ A 'write' operation (copy+xor) is already in flight.
43924403
* 2/ A 'check' operation is in flight, as it may clobber the parity
43934404
* block.
4405+
* 3/ A r5c cache log write is in flight.
43944406
*/
4395-
if (s.to_write && !sh->reconstruct_state && !sh->check_state)
4396-
handle_stripe_dirtying(conf, sh, &s, disks);
4407+
4408+
if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
4409+
if (!r5c_is_writeback(conf->log)) {
4410+
if (s.to_write)
4411+
handle_stripe_dirtying(conf, sh, &s, disks);
4412+
} else { /* write back cache */
4413+
int ret = 0;
4414+
4415+
/* First, try handle writes in caching phase */
4416+
if (s.to_write)
4417+
ret = r5c_try_caching_write(conf, sh, &s,
4418+
disks);
4419+
/*
4420+
* If caching phase failed: ret == -EAGAIN
4421+
* OR
4422+
* stripe under reclaim: !caching && injournal
4423+
*
4424+
* fall back to handle_stripe_dirtying()
4425+
*/
4426+
if (ret == -EAGAIN ||
4427+
/* stripe under reclaim: !caching && injournal */
4428+
(!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
4429+
s.injournal > 0))
4430+
handle_stripe_dirtying(conf, sh, &s, disks);
4431+
}
4432+
}
43974433

43984434
/* maybe we need to check and possibly fix the parity for this stripe
43994435
* Any reads will already have been scheduled, so we just see if enough
@@ -5110,6 +5146,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
51105146
* data on failed drives.
51115147
*/
51125148
if (rw == READ && mddev->degraded == 0 &&
5149+
!r5c_is_writeback(conf->log) &&
51135150
mddev->reshape_position == MaxSector) {
51145151
bi = chunk_aligned_read(mddev, bi);
51155152
if (!bi)

drivers/md/raid5.h

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,7 @@ struct stripe_head_state {
264264
int syncing, expanding, expanded, replacing;
265265
int locked, uptodate, to_read, to_write, failed, written;
266266
int to_fill, compute, req_compute, non_overwrite;
267+
int injournal;
267268
int failed_num[2];
268269
int p_failed, q_failed;
269270
int dec_preread_active;
@@ -313,6 +314,11 @@ enum r5dev_flags {
313314
*/
314315
R5_Discard, /* Discard the stripe */
315316
R5_SkipCopy, /* Don't copy data from bio to stripe cache */
317+
R5_InJournal, /* data being written is in the journal device.
318+
* if R5_InJournal is set for parity pd_idx, all the
319+
* data and parity being written are in the journal
320+
* device
321+
*/
316322
};
317323

318324
/*
@@ -345,7 +351,23 @@ enum {
345351
STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
346352
* to batch yet.
347353
*/
348-
STRIPE_LOG_TRAPPED, /* trapped into log */
354+
STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c)
355+
* this bit is used in two scenarios:
356+
*
357+
* 1. write-out phase
358+
* set in first entry of r5l_write_stripe
359+
* clear in second entry of r5l_write_stripe
360+
* used to bypass logic in handle_stripe
361+
*
362+
* 2. caching phase
363+
* set in r5c_try_caching_write()
364+
* clear when journal write is done
365+
* used to initiate r5c_cache_data()
366+
* also used to bypass logic in handle_stripe
367+
*/
368+
STRIPE_R5C_CACHING, /* the stripe is in caching phase
369+
* see more detail in the raid5-cache.c
370+
*/
349371
};
350372

351373
#define STRIPE_EXPAND_SYNC_FLAGS \
@@ -710,4 +732,11 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh);
710732
extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
711733
extern void r5l_quiesce(struct r5l_log *log, int state);
712734
extern bool r5l_log_disk_error(struct r5conf *conf);
735+
extern bool r5c_is_writeback(struct r5l_log *log);
736+
extern int
737+
r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
738+
struct stripe_head_state *s, int disks);
739+
extern void
740+
r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
741+
struct stripe_head_state *s);
713742
#endif

0 commit comments

Comments
 (0)