Skip to content

Commit 682aa8e

Browse files
htejun and axboe
authored and committed
writeback: implement unlocked_inode_to_wb transaction and use it for stat updates
The mechanism for detecting whether an inode should switch its wb (bdi_writeback) association is now in place. This patch build the framework for the actual switching. This patch adds a new inode flag I_WB_SWITCHING, which has two functions. First, the easy one, it ensures that there's only one switching in progress for a give inode. Second, it's used as a mechanism to synchronize wb stat updates. The two stats, WB_RECLAIMABLE and WB_WRITEBACK, aren't event counters but track the current number of dirty pages and pages under writeback respectively. As such, when an inode is moved from one wb to another, the inode's portion of those stats have to be transferred together; unfortunately, this is a bit tricky as those stat updates are percpu operations which are performed without holding any lock in some places. This patch solves the problem in a similar way as memcg. Each such lockless stat updates are wrapped in transaction surrounded by unlocked_inode_to_wb_begin/end(). During normal operation, they map to rcu_read_lock/unlock(); however, if I_WB_SWITCHING is asserted, mapping->tree_lock is grabbed across the transaction. In turn, the switching path sets I_WB_SWITCHING and waits for a RCU grace period to pass before actually starting to switch, which guarantees that all stat update paths are synchronizing against mapping->tree_lock. This patch still doesn't implement the actual switching. v3: Updated on top of the recent cancel_dirty_page() updates. unlocked_inode_to_wb_begin() now nests inside mem_cgroup_begin_page_stat() to match the locking order. v2: The i_wb access transaction will be used for !stat accesses too. Function names and comments updated accordingly. 
s/inode_wb_stat_unlocked_{begin|end}/unlocked_inode_to_wb_{begin|end}/ s/switch_wb/switch_wbs/ Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Jens Axboe <axboe@kernel.dk> Cc: Jan Kara <jack@suse.cz> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Greg Thelen <gthelen@google.com> Signed-off-by: Jens Axboe <axboe@fb.com>
1 parent 87e1d78 commit 682aa8e

File tree

6 files changed

+196
-14
lines changed

6 files changed

+196
-14
lines changed

fs/fs-writeback.c

Lines changed: 111 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,115 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
308308
return locked_inode_to_wb_and_lock_list(inode);
309309
}
310310

311+
struct inode_switch_wbs_context {
312+
struct inode *inode;
313+
struct bdi_writeback *new_wb;
314+
315+
struct rcu_head rcu_head;
316+
struct work_struct work;
317+
};
318+
319+
static void inode_switch_wbs_work_fn(struct work_struct *work)
320+
{
321+
struct inode_switch_wbs_context *isw =
322+
container_of(work, struct inode_switch_wbs_context, work);
323+
struct inode *inode = isw->inode;
324+
struct bdi_writeback *new_wb = isw->new_wb;
325+
326+
/*
327+
* By the time control reaches here, RCU grace period has passed
328+
* since I_WB_SWITCH assertion and all wb stat update transactions
329+
* between unlocked_inode_to_wb_begin/end() are guaranteed to be
330+
* synchronizing against mapping->tree_lock.
331+
*/
332+
spin_lock(&inode->i_lock);
333+
334+
inode->i_wb_frn_winner = 0;
335+
inode->i_wb_frn_avg_time = 0;
336+
inode->i_wb_frn_history = 0;
337+
338+
/*
339+
* Paired with load_acquire in unlocked_inode_to_wb_begin() and
340+
* ensures that the new wb is visible if they see !I_WB_SWITCH.
341+
*/
342+
smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
343+
344+
spin_unlock(&inode->i_lock);
345+
346+
iput(inode);
347+
wb_put(new_wb);
348+
kfree(isw);
349+
}
350+
351+
static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
352+
{
353+
struct inode_switch_wbs_context *isw = container_of(rcu_head,
354+
struct inode_switch_wbs_context, rcu_head);
355+
356+
/* needs to grab bh-unsafe locks, bounce to work item */
357+
INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
358+
schedule_work(&isw->work);
359+
}
360+
361+
/**
362+
* inode_switch_wbs - change the wb association of an inode
363+
* @inode: target inode
364+
* @new_wb_id: ID of the new wb
365+
*
366+
* Switch @inode's wb association to the wb identified by @new_wb_id. The
367+
* switching is performed asynchronously and may fail silently.
368+
*/
369+
static void inode_switch_wbs(struct inode *inode, int new_wb_id)
370+
{
371+
struct backing_dev_info *bdi = inode_to_bdi(inode);
372+
struct cgroup_subsys_state *memcg_css;
373+
struct inode_switch_wbs_context *isw;
374+
375+
/* noop if seems to be already in progress */
376+
if (inode->i_state & I_WB_SWITCH)
377+
return;
378+
379+
isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
380+
if (!isw)
381+
return;
382+
383+
/* find and pin the new wb */
384+
rcu_read_lock();
385+
memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
386+
if (memcg_css)
387+
isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
388+
rcu_read_unlock();
389+
if (!isw->new_wb)
390+
goto out_free;
391+
392+
/* while holding I_WB_SWITCH, no one else can update the association */
393+
spin_lock(&inode->i_lock);
394+
if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
395+
inode_to_wb(inode) == isw->new_wb) {
396+
spin_unlock(&inode->i_lock);
397+
goto out_free;
398+
}
399+
inode->i_state |= I_WB_SWITCH;
400+
spin_unlock(&inode->i_lock);
401+
402+
ihold(inode);
403+
isw->inode = inode;
404+
405+
/*
406+
* In addition to synchronizing among switchers, I_WB_SWITCH tells
407+
* the RCU protected stat update paths to grab the mapping's
408+
* tree_lock so that stat transfer can synchronize against them.
409+
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
410+
*/
411+
call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
412+
return;
413+
414+
out_free:
415+
if (isw->new_wb)
416+
wb_put(isw->new_wb);
417+
kfree(isw);
418+
}
419+
311420
/**
312421
* wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
313422
* @wbc: writeback_control of interest
@@ -433,12 +542,8 @@ void wbc_detach_inode(struct writeback_control *wbc)
433542
* is okay. The main goal is avoiding keeping an inode on
434543
* the wrong wb for an extended period of time.
435544
*/
436-
if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) {
437-
/* switch */
438-
max_id = 0;
439-
avg_time = 0;
440-
history = 0;
441-
}
545+
if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
546+
inode_switch_wbs(inode, max_id);
442547
}
443548

444549
/*

include/linux/backing-dev.h

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,50 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
332332
return inode->i_wb;
333333
}
334334

335+
/**
336+
* unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
337+
* @inode: target inode
338+
* @lockedp: temp bool output param, to be passed to the end function
339+
*
340+
* The caller wants to access the wb associated with @inode but isn't
341+
* holding inode->i_lock, mapping->tree_lock or wb->list_lock. This
342+
* function determines the wb associated with @inode and ensures that the
343+
* association doesn't change until the transaction is finished with
344+
* unlocked_inode_to_wb_end().
345+
*
346+
* The caller must call unlocked_inode_to_wb_end() with *@lockdep
347+
* afterwards and can't sleep during transaction. IRQ may or may not be
348+
* disabled on return.
349+
*/
350+
static inline struct bdi_writeback *
351+
unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
352+
{
353+
rcu_read_lock();
354+
355+
/*
356+
* Paired with store_release in inode_switch_wb_work_fn() and
357+
* ensures that we see the new wb if we see cleared I_WB_SWITCH.
358+
*/
359+
*lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
360+
361+
if (unlikely(*lockedp))
362+
spin_lock_irq(&inode->i_mapping->tree_lock);
363+
return inode_to_wb(inode);
364+
}
365+
366+
/**
367+
* unlocked_inode_to_wb_end - end inode wb access transaction
368+
* @inode: target inode
369+
* @locked: *@lockedp from unlocked_inode_to_wb_begin()
370+
*/
371+
static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
372+
{
373+
if (unlikely(locked))
374+
spin_unlock_irq(&inode->i_mapping->tree_lock);
375+
376+
rcu_read_unlock();
377+
}
378+
335379
struct wb_iter {
336380
int start_blkcg_id;
337381
struct radix_tree_iter tree_iter;
@@ -420,6 +464,16 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
420464
return &inode_to_bdi(inode)->wb;
421465
}
422466

467+
static inline struct bdi_writeback *
468+
unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
469+
{
470+
return inode_to_wb(inode);
471+
}
472+
473+
/* !CONFIG_CGROUP_WRITEBACK: begin takes no locks, so nothing to release */
static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
{
}
476+
423477
static inline void wb_memcg_offline(struct mem_cgroup *memcg)
424478
{
425479
}

include/linux/fs.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1815,6 +1815,11 @@ struct super_operations {
18151815
*
18161816
* I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit().
18171817
*
1818+
* I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to
1819+
* synchronize competing switching instances and to tell
1820+
* wb stat updates to grab mapping->tree_lock. See
1821+
* inode_switch_wbs_work_fn() for details.
1822+
*
18181823
* Q: What is the difference between I_WILL_FREE and I_FREEING?
18191824
*/
18201825
#define I_DIRTY_SYNC (1 << 0)
@@ -1834,6 +1839,7 @@ struct super_operations {
18341839
#define I_DIRTY_TIME (1 << 11)
18351840
#define __I_DIRTY_TIME_EXPIRED 12
18361841
#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED)
1842+
#define I_WB_SWITCH (1 << 13)
18371843

18381844
#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
18391845
#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)

include/linux/mm.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ struct anon_vma_chain;
2727
struct file_ra_state;
2828
struct user_struct;
2929
struct writeback_control;
30+
struct bdi_writeback;
3031

3132
#ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */
3233
extern unsigned long max_mapnr;
@@ -1214,7 +1215,7 @@ int redirty_page_for_writepage(struct writeback_control *wbc,
12141215
void account_page_dirtied(struct page *page, struct address_space *mapping,
12151216
struct mem_cgroup *memcg);
12161217
void account_page_cleaned(struct page *page, struct address_space *mapping,
1217-
struct mem_cgroup *memcg);
1218+
struct mem_cgroup *memcg, struct bdi_writeback *wb);
12181219
int set_page_dirty(struct page *page);
12191220
int set_page_dirty_lock(struct page *page);
12201221
void cancel_dirty_page(struct page *page);

mm/filemap.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,8 @@ void __delete_from_page_cache(struct page *page, void *shadow,
213213
* anyway will be cleared before returning page into buddy allocator.
214214
*/
215215
if (WARN_ON_ONCE(PageDirty(page)))
216-
account_page_cleaned(page, mapping, memcg);
216+
account_page_cleaned(page, mapping, memcg,
217+
inode_to_wb(mapping->host));
217218
}
218219

219220
/**

mm/page-writeback.c

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2422,12 +2422,12 @@ EXPORT_SYMBOL(account_page_dirtied);
24222422
* Caller must hold mem_cgroup_begin_page_stat().
24232423
*/
24242424
void account_page_cleaned(struct page *page, struct address_space *mapping,
2425-
struct mem_cgroup *memcg)
2425+
struct mem_cgroup *memcg, struct bdi_writeback *wb)
24262426
{
24272427
if (mapping_cap_account_dirty(mapping)) {
24282428
mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
24292429
dec_zone_page_state(page, NR_FILE_DIRTY);
2430-
dec_wb_stat(inode_to_wb(mapping->host), WB_RECLAIMABLE);
2430+
dec_wb_stat(wb, WB_RECLAIMABLE);
24312431
task_io_account_cancelled_write(PAGE_CACHE_SIZE);
24322432
}
24332433
}
@@ -2490,11 +2490,15 @@ void account_page_redirty(struct page *page)
24902490
struct address_space *mapping = page->mapping;
24912491

24922492
if (mapping && mapping_cap_account_dirty(mapping)) {
2493-
struct bdi_writeback *wb = inode_to_wb(mapping->host);
2493+
struct inode *inode = mapping->host;
2494+
struct bdi_writeback *wb;
2495+
bool locked;
24942496

2497+
wb = unlocked_inode_to_wb_begin(inode, &locked);
24952498
current->nr_dirtied--;
24962499
dec_zone_page_state(page, NR_DIRTIED);
24972500
dec_wb_stat(wb, WB_DIRTIED);
2501+
unlocked_inode_to_wb_end(inode, locked);
24982502
}
24992503
}
25002504
EXPORT_SYMBOL(account_page_redirty);
@@ -2597,13 +2601,18 @@ void cancel_dirty_page(struct page *page)
25972601
struct address_space *mapping = page_mapping(page);
25982602

25992603
if (mapping_cap_account_dirty(mapping)) {
2604+
struct inode *inode = mapping->host;
2605+
struct bdi_writeback *wb;
26002606
struct mem_cgroup *memcg;
2607+
bool locked;
26012608

26022609
memcg = mem_cgroup_begin_page_stat(page);
2610+
wb = unlocked_inode_to_wb_begin(inode, &locked);
26032611

26042612
if (TestClearPageDirty(page))
2605-
account_page_cleaned(page, mapping, memcg);
2613+
account_page_cleaned(page, mapping, memcg, wb);
26062614

2615+
unlocked_inode_to_wb_end(inode, locked);
26072616
mem_cgroup_end_page_stat(memcg);
26082617
} else {
26092618
ClearPageDirty(page);
@@ -2628,12 +2637,16 @@ EXPORT_SYMBOL(cancel_dirty_page);
26282637
int clear_page_dirty_for_io(struct page *page)
26292638
{
26302639
struct address_space *mapping = page_mapping(page);
2631-
struct mem_cgroup *memcg;
26322640
int ret = 0;
26332641

26342642
BUG_ON(!PageLocked(page));
26352643

26362644
if (mapping && mapping_cap_account_dirty(mapping)) {
2645+
struct inode *inode = mapping->host;
2646+
struct bdi_writeback *wb;
2647+
struct mem_cgroup *memcg;
2648+
bool locked;
2649+
26372650
/*
26382651
* Yes, Virginia, this is indeed insane.
26392652
*
@@ -2670,12 +2683,14 @@ int clear_page_dirty_for_io(struct page *page)
26702683
* exclusion.
26712684
*/
26722685
memcg = mem_cgroup_begin_page_stat(page);
2686+
wb = unlocked_inode_to_wb_begin(inode, &locked);
26732687
if (TestClearPageDirty(page)) {
26742688
mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
26752689
dec_zone_page_state(page, NR_FILE_DIRTY);
2676-
dec_wb_stat(inode_to_wb(mapping->host), WB_RECLAIMABLE);
2690+
dec_wb_stat(wb, WB_RECLAIMABLE);
26772691
ret = 1;
26782692
}
2693+
unlocked_inode_to_wb_end(inode, locked);
26792694
mem_cgroup_end_page_stat(memcg);
26802695
return ret;
26812696
}

0 commit comments

Comments
 (0)