Commit 739f79f

hnaz authored and torvalds committed
mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()
Jaegeuk and Brad report a NULL pointer crash when writeback ending tries
to update the memcg stats:

    BUG: unable to handle kernel NULL pointer dereference at 00000000000003b0
    IP: test_clear_page_writeback+0x12e/0x2c0
    [...]
    RIP: 0010:test_clear_page_writeback+0x12e/0x2c0
    Call Trace:
     <IRQ>
     end_page_writeback+0x47/0x70
     f2fs_write_end_io+0x76/0x180 [f2fs]
     bio_endio+0x9f/0x120
     blk_update_request+0xa8/0x2f0
     scsi_end_request+0x39/0x1d0
     scsi_io_completion+0x211/0x690
     scsi_finish_command+0xd9/0x120
     scsi_softirq_done+0x127/0x150
     __blk_mq_complete_request_remote+0x13/0x20
     flush_smp_call_function_queue+0x56/0x110
     generic_smp_call_function_single_interrupt+0x13/0x30
     smp_call_function_single_interrupt+0x27/0x40
     call_function_single_interrupt+0x89/0x90
    RIP: 0010:native_safe_halt+0x6/0x10

    (gdb) l *(test_clear_page_writeback+0x12e)
    0xffffffff811bae3e is in test_clear_page_writeback (./include/linux/memcontrol.h:619).
    614             mod_node_page_state(page_pgdat(page), idx, val);
    615             if (mem_cgroup_disabled() || !page->mem_cgroup)
    616                     return;
    617             mod_memcg_state(page->mem_cgroup, idx, val);
    618             pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
    619             this_cpu_add(pn->lruvec_stat->count[idx], val);
    620     }
    621
    622     unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
    623                                                     gfp_t gfp_mask,

The issue is that writeback doesn't hold a page reference and the page
might get freed after PG_writeback is cleared (and the mapping is
unlocked) in test_clear_page_writeback(). The stat functions looking up
the page's node or zone are safe, as those attributes are static across
allocation and free cycles. But page->mem_cgroup is not, and it will
get cleared if we race with truncation or migration.

It appears this race window has been around for a while, but was less
likely to trigger when the memcg stats were updated first thing after
PG_writeback was cleared. Recent changes reshuffled this code to update
the global node stats before the memcg ones, though, stretching the
race window out to an extent where people can reproduce the problem.

Update test_clear_page_writeback() to look up and pin page->mem_cgroup
before clearing PG_writeback, then not use that pointer afterward. It
is a partial revert of 62cccb8 ("mm: simplify lock_page_memcg()") but
leaves the pageref-holding callsites that aren't affected alone.

Link: http://lkml.kernel.org/r/20170809183825.GA26387@cmpxchg.org
Fixes: 62cccb8 ("mm: simplify lock_page_memcg()")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Jaegeuk Kim <jaegeuk@kernel.org>
Tested-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reported-by: Bradley Bolen <bradleybolen@gmail.com>
Tested-by: Brad Bolen <bradleybolen@gmail.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: <stable@vger.kernel.org>    [4.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
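To make the race window concrete, here is a condensed sketch of the pre-fix
ordering described above (illustrative only, not the verbatim kernel code;
the function name is shortened and the stat updates are condensed to one):

    /*
     * Condensed sketch of the pre-fix ordering. PG_writeback is the
     * only thing holding off truncation: writeback completion owns
     * no page reference of its own.
     */
    int writeback_end_sketch(struct page *page)     /* illustrative name */
    {
            int ret;

            lock_page_memcg(page);  /* stabilizes page->mem_cgroup... */
            ret = TestClearPageWriteback(page);
            /*
             * ...but once PG_writeback is clear, truncation or migration
             * can free the page and clear page->mem_cgroup right here.
             */
            if (ret)
                    dec_lruvec_page_state(page, NR_WRITEBACK); /* chases the
                                                                  stale pointer */
            unlock_page_memcg(page);        /* rereads page->mem_cgroup too */
            return ret;
    }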
1 parent 039a8e3 · commit 739f79f

3 files changed, 51 insertions(+), 17 deletions(-)

include/linux/memcontrol.h

Lines changed: 8 additions & 2 deletions
@@ -484,7 +484,8 @@ bool mem_cgroup_oom_synchronize(bool wait);
 extern int do_swap_account;
 #endif
 
-void lock_page_memcg(struct page *page);
+struct mem_cgroup *lock_page_memcg(struct page *page);
+void __unlock_page_memcg(struct mem_cgroup *memcg);
 void unlock_page_memcg(struct page *page);
 
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
@@ -809,7 +810,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
 }
 
-static inline void lock_page_memcg(struct page *page)
+static inline struct mem_cgroup *lock_page_memcg(struct page *page)
+{
+        return NULL;
+}
+
+static inline void __unlock_page_memcg(struct mem_cgroup *memcg)
 {
 }
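In caller terms, the new pairing works like this (a minimal sketch assuming
CONFIG_MEMCG; the caller name is hypothetical):

    /*
     * Hypothetical caller showing the new contract: the returned memcg
     * stays pinned (RCU, plus move_lock if needed) for the whole locked
     * section, even if the page itself is freed inside it.
     */
    static int clear_flag_and_account(struct page *page)
    {
            struct mem_cgroup *memcg;
            int ret;

            memcg = lock_page_memcg(page);          /* may return NULL */
            ret = TestClearPageWriteback(page);     /* page may be freed now */
            /* below, touch only @memcg and static page attributes */
            __unlock_page_memcg(memcg);             /* NULL is handled */
            return ret;
    }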

mm/memcontrol.c

Lines changed: 31 additions & 12 deletions
@@ -1611,9 +1611,13 @@ bool mem_cgroup_oom_synchronize(bool handle)
  * @page: the page
  *
  * This function protects unlocked LRU pages from being moved to
- * another cgroup and stabilizes their page->mem_cgroup binding.
+ * another cgroup.
+ *
+ * It ensures lifetime of the returned memcg. Caller is responsible
+ * for the lifetime of the page; __unlock_page_memcg() is available
+ * when @page might get freed inside the locked section.
  */
-void lock_page_memcg(struct page *page)
+struct mem_cgroup *lock_page_memcg(struct page *page)
 {
         struct mem_cgroup *memcg;
         unsigned long flags;
@@ -1622,18 +1626,24 @@ void lock_page_memcg(struct page *page)
          * The RCU lock is held throughout the transaction. The fast
          * path can get away without acquiring the memcg->move_lock
          * because page moving starts with an RCU grace period.
-         */
+         *
+         * The RCU lock also protects the memcg from being freed when
+         * the page state that is going to change is the only thing
+         * preventing the page itself from being freed. E.g. writeback
+         * doesn't hold a page reference and relies on PG_writeback to
+         * keep off truncation, migration and so forth.
+         */
         rcu_read_lock();
 
         if (mem_cgroup_disabled())
-                return;
+                return NULL;
 again:
         memcg = page->mem_cgroup;
         if (unlikely(!memcg))
-                return;
+                return NULL;
 
         if (atomic_read(&memcg->moving_account) <= 0)
-                return;
+                return memcg;
 
         spin_lock_irqsave(&memcg->move_lock, flags);
         if (memcg != page->mem_cgroup) {
@@ -1649,18 +1659,18 @@ void lock_page_memcg(struct page *page)
         memcg->move_lock_task = current;
         memcg->move_lock_flags = flags;
 
-        return;
+        return memcg;
 }
 EXPORT_SYMBOL(lock_page_memcg);
 
 /**
- * unlock_page_memcg - unlock a page->mem_cgroup binding
- * @page: the page
+ * __unlock_page_memcg - unlock and unpin a memcg
+ * @memcg: the memcg
+ *
+ * Unlock and unpin a memcg returned by lock_page_memcg().
  */
-void unlock_page_memcg(struct page *page)
+void __unlock_page_memcg(struct mem_cgroup *memcg)
 {
-        struct mem_cgroup *memcg = page->mem_cgroup;
-
         if (memcg && memcg->move_lock_task == current) {
                 unsigned long flags = memcg->move_lock_flags;
 
@@ -1672,6 +1682,15 @@ void unlock_page_memcg(struct page *page)
 
         rcu_read_unlock();
 }
+
+/**
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
+ */
+void unlock_page_memcg(struct page *page)
+{
+        __unlock_page_memcg(page->mem_cgroup);
+}
 EXPORT_SYMBOL(unlock_page_memcg);
 
 /*
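Call sites that hold their own page reference are deliberately left alone:
for them page->mem_cgroup cannot change mid-section, and unlock_page_memcg()
is now just a thin wrapper. A sketch of such an unaffected caller
(illustrative, not taken from this patch):

    /*
     * Illustrative sketch: a caller holding its own page reference is
     * unaffected by this change and keeps the old lock/unlock pairing.
     */
    static void pageref_holding_caller(struct page *page)
    {
            get_page(page);          /* page cannot be freed below */
            lock_page_memcg(page);   /* return value may be ignored */
            /* ... modify page state, update memcg statistics ... */
            unlock_page_memcg(page); /* wraps __unlock_page_memcg(page->mem_cgroup) */
            put_page(page);
    }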

mm/page-writeback.c

Lines changed: 12 additions & 3 deletions
@@ -2724,9 +2724,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
 int test_clear_page_writeback(struct page *page)
 {
         struct address_space *mapping = page_mapping(page);
+        struct mem_cgroup *memcg;
+        struct lruvec *lruvec;
         int ret;
 
-        lock_page_memcg(page);
+        memcg = lock_page_memcg(page);
+        lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
         if (mapping && mapping_use_writeback_tags(mapping)) {
                 struct inode *inode = mapping->host;
                 struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2754,12 +2757,18 @@ int test_clear_page_writeback(struct page *page)
         } else {
                 ret = TestClearPageWriteback(page);
         }
+        /*
+         * NOTE: Page might be free now! Writeback doesn't hold a page
+         * reference on its own, it relies on truncation to wait for
+         * the clearing of PG_writeback. The below can only access
+         * page state that is static across allocation cycles.
+         */
         if (ret) {
-                dec_lruvec_page_state(page, NR_WRITEBACK);
+                dec_lruvec_state(lruvec, NR_WRITEBACK);
                 dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
                 inc_node_page_state(page, NR_WRITTEN);
         }
-        unlock_page_memcg(page);
+        __unlock_page_memcg(memcg);
         return ret;
 }
