
Commit 52ebea7

htejun authored and axboe committed
writeback: make backing_dev_info host cgroup-specific bdi_writebacks
For the planned cgroup writeback support, on each bdi
(backing_dev_info), each memcg will be served by a separate wb
(bdi_writeback).  This patch updates bdi so that a bdi can host
multiple wbs (bdi_writebacks).

On the default hierarchy, blkcg implicitly enables memcg.  This allows
using memcg's page ownership for attributing writeback IOs, and every
memcg - blkcg combination can be served by its own wb by assigning a
dedicated wb to each memcg.  This means that there may be multiple
wb's of a bdi mapped to the same blkcg.  As congested state is per
blkcg - bdi combination, those wb's should share the same congested
state.  This is achieved by tracking congested state via
bdi_writeback_congested structs which are keyed by blkcg.

bdi->wb remains unchanged and will keep serving the root cgroup.
cgwb's (cgroup wb's) for non-root cgroups are created on-demand or
looked up while dirtying an inode according to the memcg of the page
being dirtied or current task.  Each cgwb is indexed on bdi->cgwb_tree
by its memcg id.  Once an inode is associated with its wb, it can be
retrieved using inode_to_wb().

Currently, none of the filesystems has FS_CGROUP_WRITEBACK and all
pages will keep being associated with bdi->wb.

v3: inode_attach_wb() in account_page_dirtied() moved inside
mapping_cap_account_dirty() block where it's known to be !NULL.  Also,
an unnecessary NULL check before kfree() removed.  Both detected by
the kbuild bot.

v2: Updated so that wb association is per inode and wb is per memcg
rather than blkcg.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: kbuild test robot <fengguang.wu@intel.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
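Purely as an illustration of the interfaces this patch introduces in the diffs below (inode_attach_wb(), inode_to_wb(), wb_get_create_current() and wb_put()), here is a minimal, hypothetical sketch of the dirtying-side flow; example_dirty_path() and its body are not part of the patch:

#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/gfp.h>

/* Hypothetical helper -- not from the patch; it only illustrates the flow. */
static void example_dirty_path(struct inode *inode)
{
        struct backing_dev_info *bdi = inode_to_bdi(inode);
        struct bdi_writeback *wb;

        /* first dirtying: associate @inode with the wb of %current's memcg */
        inode_attach_wb(inode, NULL);

        /* writeback code can then retrieve that association directly ... */
        wb = inode_to_wb(inode);
        spin_lock(&wb->list_lock);
        /* ... and move the inode between that wb's dirty/io lists here */
        spin_unlock(&wb->list_lock);

        /* ... or look up / create the wb for %current explicitly, ref held */
        wb = wb_get_create_current(bdi, GFP_ATOMIC);
        if (wb)
                wb_put(wb);
}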
1 parent 89e9b9e commit 52ebea7

File tree

11 files changed (+698, -11 lines changed)

block/blk-cgroup.c

Lines changed: 6 additions & 1 deletion
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/slab.h>
 #include <linux/genhd.h>
 #include <linux/delay.h>
@@ -797,6 +798,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
 	}

 	spin_unlock_irq(&blkcg->lock);
+
+	wb_blkcg_offline(blkcg);
 }

 static void blkcg_css_free(struct cgroup_subsys_state *css)
@@ -827,7 +830,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 	spin_lock_init(&blkcg->lock);
 	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+	INIT_LIST_HEAD(&blkcg->cgwb_list);
+#endif
 	return &blkcg->css;
 }


fs/fs-writeback.c

Lines changed: 5 additions & 3 deletions
@@ -185,11 +185,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
  */
 void inode_wb_list_del(struct inode *inode)
 {
-	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct bdi_writeback *wb = inode_to_wb(inode);

-	spin_lock(&bdi->wb.list_lock);
+	spin_lock(&wb->list_lock);
 	list_del_init(&inode->i_wb_list);
-	spin_unlock(&bdi->wb.list_lock);
+	spin_unlock(&wb->list_lock);
 }

 /*
@@ -1268,6 +1268,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	if ((inode->i_state & flags) != flags) {
 		const int was_dirty = inode->i_state & I_DIRTY;

+		inode_attach_wb(inode, NULL);
+
 		if (flags & I_DIRTY_INODE)
 			inode->i_state &= ~I_DIRTY_TIME;
 		inode->i_state |= flags;

fs/inode.c

Lines changed: 1 addition & 0 deletions
@@ -223,6 +223,7 @@ EXPORT_SYMBOL(free_inode_nonrcu);
 void __destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
+	inode_detach_wb(inode);
 	security_inode_free(inode);
 	fsnotify_inode_delete(inode);
 	locks_free_lock_context(inode->i_flctx);

include/linux/backing-dev-defs.h

Lines changed: 56 additions & 3 deletions
@@ -2,8 +2,11 @@
 #define __LINUX_BACKING_DEV_DEFS_H

 #include <linux/list.h>
+#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
 #include <linux/spinlock.h>
 #include <linux/percpu_counter.h>
+#include <linux/percpu-refcount.h>
 #include <linux/flex_proportions.h>
 #include <linux/timer.h>
 #include <linux/workqueue.h>
@@ -37,10 +40,43 @@ enum wb_stat_item {

 #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))

+/*
+ * For cgroup writeback, multiple wb's may map to the same blkcg.  Those
+ * wb's can operate mostly independently but should share the congested
+ * state.  To facilitate such sharing, the congested state is tracked using
+ * the following struct which is created on demand, indexed by blkcg ID on
+ * its bdi, and refcounted.
+ */
 struct bdi_writeback_congested {
 	unsigned long state;		/* WB_[a]sync_congested flags */
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct backing_dev_info *bdi;	/* the associated bdi */
+	atomic_t refcnt;		/* nr of attached wb's and blkg */
+	int blkcg_id;			/* ID of the associated blkcg */
+	struct rb_node rb_node;		/* on bdi->cgwb_congestion_tree */
+#endif
 };

+/*
+ * Each wb (bdi_writeback) can perform writeback operations, is measured
+ * and throttled, independently.  Without cgroup writeback, each bdi
+ * (bdi_writeback) is served by its embedded bdi->wb.
+ *
+ * On the default hierarchy, blkcg implicitly enables memcg.  This allows
+ * using memcg's page ownership for attributing writeback IOs, and every
+ * memcg - blkcg combination can be served by its own wb by assigning a
+ * dedicated wb to each memcg, which enables isolation across different
+ * cgroups and propagation of IO back pressure down from the IO layer upto
+ * the tasks which are generating the dirty pages to be written back.
+ *
+ * A cgroup wb is indexed on its bdi by the ID of the associated memcg,
+ * refcounted with the number of inodes attached to it, and pins the memcg
+ * and the corresponding blkcg.  As the corresponding blkcg for a memcg may
+ * change as blkcg is disabled and enabled higher up in the hierarchy, a wb
+ * is tested for blkcg after lookup and removed from index on mismatch so
+ * that a new wb for the combination can be created.
+ */
 struct bdi_writeback {
 	struct backing_dev_info *bdi;	/* our parent bdi */

@@ -78,6 +114,19 @@ struct bdi_writeback {
 	spinlock_t work_lock;		/* protects work_list & dwork scheduling */
 	struct list_head work_list;
 	struct delayed_work dwork;	/* work item used for writeback */
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct percpu_ref refcnt;	/* used only for !root wb's */
+	struct cgroup_subsys_state *memcg_css; /* the associated memcg */
+	struct cgroup_subsys_state *blkcg_css; /* and blkcg */
+	struct list_head memcg_node;	/* anchored at memcg->cgwb_list */
+	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
+
+	union {
+		struct work_struct release_work;
+		struct rcu_head rcu;
+	};
+#endif
 };

 struct backing_dev_info {
@@ -92,9 +141,13 @@ struct backing_dev_info {
 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;

-	struct bdi_writeback wb;  /* default writeback info for this bdi */
-	struct bdi_writeback_congested wb_congested;
-
+	struct bdi_writeback wb;  /* the root writeback info for this bdi */
+	struct bdi_writeback_congested wb_congested; /* its congested state */
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
+	struct rb_root cgwb_congested_tree; /* their congested states */
+	atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */
+#endif
 	struct device *dev;

 	struct timer_list laptop_mode_wb_timer;
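The comments above describe the blkcg-keyed congested index; the lookup itself is non-inline and lives outside the headers shown in this excerpt (in mm/backing-dev.c). As a rough, hypothetical sketch of what searching bdi->cgwb_congested_tree by blkcg_id involves -- example_find_congested() is not from the patch, and the real wb_congested_get_create() additionally allocates, inserts and refcounts under the appropriate locking:

#include <linux/backing-dev-defs.h>
#include <linux/rbtree.h>

/*
 * Hypothetical, simplified lookup of the shared per-(bdi, blkcg) congested
 * state (CONFIG_CGROUP_WRITEBACK only).
 */
static struct bdi_writeback_congested *
example_find_congested(struct backing_dev_info *bdi, int blkcg_id)
{
        struct rb_node *node = bdi->cgwb_congested_tree.rb_node;

        while (node) {
                struct bdi_writeback_congested *congested =
                        rb_entry(node, struct bdi_writeback_congested, rb_node);

                if (blkcg_id < congested->blkcg_id)
                        node = node->rb_left;
                else if (blkcg_id > congested->blkcg_id)
                        node = node->rb_right;
                else
                        return congested; /* shared by every wb of this blkcg on @bdi */
        }
        return NULL; /* the real helper would allocate and insert a new node here */
}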

include/linux/backing-dev.h

Lines changed: 195 additions & 0 deletions
@@ -13,6 +13,7 @@
 #include <linux/sched.h>
 #include <linux/blkdev.h>
 #include <linux/writeback.h>
+#include <linux/blk-cgroup.h>
 #include <linux/backing-dev-defs.h>

 int __must_check bdi_init(struct backing_dev_info *bdi);
@@ -234,6 +235,16 @@ static inline int bdi_sched_wait(void *word)

 #ifdef CONFIG_CGROUP_WRITEBACK

+struct bdi_writeback_congested *
+wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp);
+void wb_congested_put(struct bdi_writeback_congested *congested);
+struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
+				    struct cgroup_subsys_state *memcg_css,
+				    gfp_t gfp);
+void __inode_attach_wb(struct inode *inode, struct page *page);
+void wb_memcg_offline(struct mem_cgroup *memcg);
+void wb_blkcg_offline(struct blkcg *blkcg);
+
 /**
  * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
  * @inode: inode of interest
@@ -250,13 +261,197 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
 		(inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK);
 }

+/**
+ * wb_tryget - try to increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		return percpu_ref_tryget(&wb->refcnt);
+	return true;
+}
+
+/**
+ * wb_get - increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline void wb_get(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		percpu_ref_get(&wb->refcnt);
+}
+
+/**
+ * wb_put - decrement a wb's refcount
+ * @wb: bdi_writeback to put
+ */
+static inline void wb_put(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		percpu_ref_put(&wb->refcnt);
+}
+
+/**
+ * wb_find_current - find wb for %current on a bdi
+ * @bdi: bdi of interest
+ *
+ * Find the wb of @bdi which matches both the memcg and blkcg of %current.
+ * Must be called under rcu_read_lock() which protects the returend wb.
+ * NULL if not found.
+ */
+static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
+{
+	struct cgroup_subsys_state *memcg_css;
+	struct bdi_writeback *wb;
+
+	memcg_css = task_css(current, memory_cgrp_id);
+	if (!memcg_css->parent)
+		return &bdi->wb;
+
+	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+
+	/*
+	 * %current's blkcg equals the effective blkcg of its memcg.  No
+	 * need to use the relatively expensive cgroup_get_e_css().
+	 */
+	if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id)))
+		return wb;
+	return NULL;
+}
+
+/**
+ * wb_get_create_current - get or create wb for %current on a bdi
+ * @bdi: bdi of interest
+ * @gfp: allocation mask
+ *
+ * Equivalent to wb_get_create() on %current's memcg.  This function is
+ * called from a relatively hot path and optimizes the common cases using
+ * wb_find_current().
+ */
+static inline struct bdi_writeback *
+wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
+{
+	struct bdi_writeback *wb;
+
+	rcu_read_lock();
+	wb = wb_find_current(bdi);
+	if (wb && unlikely(!wb_tryget(wb)))
+		wb = NULL;
+	rcu_read_unlock();
+
+	if (unlikely(!wb)) {
+		struct cgroup_subsys_state *memcg_css;
+
+		memcg_css = task_get_css(current, memory_cgrp_id);
+		wb = wb_get_create(bdi, memcg_css, gfp);
+		css_put(memcg_css);
+	}
+	return wb;
+}
+
+/**
+ * inode_attach_wb - associate an inode with its wb
+ * @inode: inode of interest
+ * @page: page being dirtied (may be NULL)
+ *
+ * If @inode doesn't have its wb, associate it with the wb matching the
+ * memcg of @page or, if @page is NULL, %current.  May be called w/ or w/o
+ * @inode->i_lock.
+ */
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+	if (!inode->i_wb)
+		__inode_attach_wb(inode, page);
+}
+
+/**
+ * inode_detach_wb - disassociate an inode from its wb
+ * @inode: inode of interest
+ *
+ * @inode is being freed.  Detach from its wb.
+ */
+static inline void inode_detach_wb(struct inode *inode)
+{
+	if (inode->i_wb) {
+		wb_put(inode->i_wb);
+		inode->i_wb = NULL;
+	}
+}
+
+/**
+ * inode_to_wb - determine the wb of an inode
+ * @inode: inode of interest
+ *
+ * Returns the wb @inode is currently associated with.
+ */
+static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
+{
+	return inode->i_wb;
+}
+
 #else	/* CONFIG_CGROUP_WRITEBACK */

 static inline bool inode_cgwb_enabled(struct inode *inode)
 {
 	return false;
 }

+static inline struct bdi_writeback_congested *
+wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
+{
+	return bdi->wb.congested;
+}
+
+static inline void wb_congested_put(struct bdi_writeback_congested *congested)
+{
+}
+
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+	return true;
+}
+
+static inline void wb_get(struct bdi_writeback *wb)
+{
+}
+
+static inline void wb_put(struct bdi_writeback *wb)
+{
+}
+
+static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
+{
+	return &bdi->wb;
+}
+
+static inline struct bdi_writeback *
+wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
+{
+	return &bdi->wb;
+}
+
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+}
+
+static inline void inode_detach_wb(struct inode *inode)
+{
+}
+
+static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
+{
+	return &inode_to_bdi(inode)->wb;
+}
+
+static inline void wb_memcg_offline(struct mem_cgroup *memcg)
+{
+}
+
+static inline void wb_blkcg_offline(struct blkcg *blkcg)
+{
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */

 #endif	/* _LINUX_BACKING_DEV_H */
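One usage note, illustrative rather than from the patch: since no filesystem sets FS_CGROUP_WRITEBACK yet, inode_cgwb_enabled() is false everywhere and the stubs above keep routing all writeback to bdi->wb. A caller that wants to work in both configurations could pick the serving wb along these lines, where example_pick_wb() is a hypothetical helper:

#include <linux/backing-dev.h>
#include <linux/fs.h>

/* Hypothetical helper: return the wb that serves @inode in either configuration. */
static struct bdi_writeback *example_pick_wb(struct inode *inode)
{
        /*
         * With CONFIG_CGROUP_WRITEBACK and a filesystem that sets
         * FS_CGROUP_WRITEBACK, the inode is attached to a per-memcg wb by
         * inode_attach_wb() when it is first dirtied.
         */
        if (inode_cgwb_enabled(inode) && inode_to_wb(inode))
                return inode_to_wb(inode);

        /* otherwise the root wb embedded in the bdi serves everything */
        return &inode_to_bdi(inode)->wb;
}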

include/linux/blk-cgroup.h

Lines changed: 4 additions & 0 deletions
@@ -53,6 +53,10 @@ struct blkcg {
 	/* TODO: per-policy storage in blkcg */
 	unsigned int cfq_weight;	/* belongs to cfq */
 	unsigned int cfq_leaf_weight;
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct list_head cgwb_list;
+#endif
 };

 struct blkg_stat {