
Commit a983b5e

hnaz authored and torvalds committed
mm: memcontrol: fix excessive complexity in memory.stat reporting
We've seen memory.stat reads in top-level cgroups take up to fourteen seconds during a userspace bug that created tens of thousands of ghost cgroups pinned by lingering page cache.

Even with a more reasonable number of cgroups, aggregating memory.stat is unnecessarily heavy. The complexity is this:

	nr_cgroups * nr_stat_items * nr_possible_cpus

where the stat items are ~70 at this point. With 128 cgroups and 128 CPUs - decent, not enormous setups - reading the top-level memory.stat has to aggregate over a million per-cpu counters. This doesn't scale.

Instead of spreading the source of truth across all CPUs, use the per-cpu counters merely to batch updates to shared atomic counters. This is the same as the per-cpu stocks we use for charging memory to the shared atomic page_counters, and also the way the global vmstat counters are implemented.

Vmstat has elaborate spilling thresholds that depend on the number of CPUs, amount of memory, and memory pressure - carefully balancing the cost of counter updates with the amount of per-cpu error. That's because the vmstat counters are system-wide, but also used for decisions inside the kernel (e.g. NR_FREE_PAGES in the allocator). Neither is true for the memory controller. Use the same static batch size we already use for page_counter updates during charging. The per-cpu error in the stats will be 128k, which is an acceptable ratio of cores to memory accounting granularity.

[hannes@cmpxchg.org: fix warning in __this_cpu_xchg() calls]
  Link: http://lkml.kernel.org/r/20171201135750.GB8097@cmpxchg.org
Link: http://lkml.kernel.org/r/20171103153336.24044-3-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
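The scheme the message describes - accumulate per-cpu deltas and spill them into a shared atomic counter once they exceed a fixed batch - can be sketched in a few lines of plain C. This is a simplified user-space illustration, not the kernel code: mod_counter(), read_counter(), percpu_delta, NCPUS and BATCH are invented stand-ins for __mod_memcg_state(), memcg_page_state(), memcg->stat_cpu, the possible-CPU mask and MEMCG_CHARGE_BATCH in the diff below.

/*
 * Simplified, user-space sketch of the batching scheme (illustrative names,
 * not the kernel API). Each "CPU" keeps a small signed delta; once the delta
 * exceeds BATCH it is folded into the shared atomic, which readers consult
 * with a single load.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define NCPUS 4
#define BATCH 32	/* stands in for MEMCG_CHARGE_BATCH */

static atomic_long counter;		/* shared source of truth, like memcg->stat[idx] */
static long percpu_delta[NCPUS];	/* per-cpu batch, like memcg->stat_cpu->count[idx] */

/* Update path, mirroring __mod_memcg_state(): cheap per-cpu add, rare atomic spill. */
static void mod_counter(int cpu, long val)
{
	long x = val + percpu_delta[cpu];

	if (labs(x) > BATCH) {
		atomic_fetch_add(&counter, x);
		x = 0;
	}
	percpu_delta[cpu] = x;
}

/*
 * Read path: one atomic load instead of summing every possible CPU. Unflushed
 * negative deltas can make the shared value dip below zero transiently, hence
 * the clamp (the kernel clamps only under CONFIG_SMP).
 */
static long read_counter(void)
{
	long x = atomic_load(&counter);

	return x < 0 ? 0 : x;
}

int main(void)
{
	for (int i = 0; i < 1000; i++)
		mod_counter(i % NCPUS, 1);

	printf("counter = %ld (exact total 1000, per-cpu error <= %d)\n",
	       read_counter(), BATCH);
	return 0;
}

With 4 KB pages, the 32-page batch bounds the unflushed per-cpu drift at 128 KB per counter, which is the "128k" per-cpu error cited above; reads drop from nr_cgroups * nr_stat_items * nr_possible_cpus per-cpu loads to a single atomic read per counter.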
1 parent 2845426 · commit a983b5e

2 files changed (+113, -84 lines)

include/linux/memcontrol.h

Lines changed: 62 additions & 34 deletions
@@ -108,7 +108,10 @@ struct lruvec_stat {
  */
 struct mem_cgroup_per_node {
 	struct lruvec lruvec;
-	struct lruvec_stat __percpu *lruvec_stat;
+
+	struct lruvec_stat __percpu *lruvec_stat_cpu;
+	atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS];
+
 	unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
 
 	struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
@@ -227,10 +230,10 @@ struct mem_cgroup {
 	spinlock_t move_lock;
 	struct task_struct *move_lock_task;
 	unsigned long move_lock_flags;
-	/*
-	 * percpu counter.
-	 */
-	struct mem_cgroup_stat_cpu __percpu *stat;
+
+	struct mem_cgroup_stat_cpu __percpu *stat_cpu;
+	atomic_long_t stat[MEMCG_NR_STAT];
+	atomic_long_t events[MEMCG_NR_EVENTS];
 
 	unsigned long socket_pressure;
 
@@ -265,6 +268,12 @@ struct mem_cgroup {
 	/* WARNING: nodeinfo must be the last member here */
 };
 
+/*
+ * size of first charge trial. "32" comes from vmscan.c's magic value.
+ * TODO: maybe necessary to use big numbers in big irons.
+ */
+#define MEMCG_CHARGE_BATCH 32U
+
 extern struct mem_cgroup *root_mem_cgroup;
 
 static inline bool mem_cgroup_disabled(void)
@@ -485,32 +494,38 @@ void unlock_page_memcg(struct page *page);
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
 					     int idx)
 {
-	long val = 0;
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		val += per_cpu(memcg->stat->count[idx], cpu);
-
-	if (val < 0)
-		val = 0;
-
-	return val;
+	long x = atomic_long_read(&memcg->stat[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void __mod_memcg_state(struct mem_cgroup *memcg,
 				     int idx, int val)
 {
-	if (!mem_cgroup_disabled())
-		__this_cpu_add(memcg->stat->count[idx], val);
+	long x;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	x = val + __this_cpu_read(memcg->stat_cpu->count[idx]);
+	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+		atomic_long_add(x, &memcg->stat[idx]);
+		x = 0;
+	}
+	__this_cpu_write(memcg->stat_cpu->count[idx], x);
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void mod_memcg_state(struct mem_cgroup *memcg,
 				   int idx, int val)
 {
-	if (!mem_cgroup_disabled())
-		this_cpu_add(memcg->stat->count[idx], val);
+	preempt_disable();
+	__mod_memcg_state(memcg, idx, val);
+	preempt_enable();
 }
 
 /**
@@ -548,26 +563,25 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
 					      enum node_stat_item idx)
 {
 	struct mem_cgroup_per_node *pn;
-	long val = 0;
-	int cpu;
+	long x;
 
 	if (mem_cgroup_disabled())
 		return node_page_state(lruvec_pgdat(lruvec), idx);
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	for_each_possible_cpu(cpu)
-		val += per_cpu(pn->lruvec_stat->count[idx], cpu);
-
-	if (val < 0)
-		val = 0;
-
-	return val;
+	x = atomic_long_read(&pn->lruvec_stat[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
 }
 
 static inline void __mod_lruvec_state(struct lruvec *lruvec,
 				      enum node_stat_item idx, int val)
 {
 	struct mem_cgroup_per_node *pn;
+	long x;
 
 	/* Update node */
 	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
@@ -581,7 +595,12 @@ static inline void __mod_lruvec_state(struct lruvec *lruvec,
 	__mod_memcg_state(pn->memcg, idx, val);
 
 	/* Update lruvec */
-	__this_cpu_add(pn->lruvec_stat->count[idx], val);
+	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
+	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+		atomic_long_add(x, &pn->lruvec_stat[idx]);
+		x = 0;
+	}
+	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
 }
 
 static inline void mod_lruvec_state(struct lruvec *lruvec,
@@ -624,16 +643,25 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 static inline void __count_memcg_events(struct mem_cgroup *memcg,
 					int idx, unsigned long count)
 {
-	if (!mem_cgroup_disabled())
-		__this_cpu_add(memcg->stat->events[idx], count);
+	unsigned long x;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	x = count + __this_cpu_read(memcg->stat_cpu->events[idx]);
+	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
+		atomic_long_add(x, &memcg->events[idx]);
+		x = 0;
+	}
+	__this_cpu_write(memcg->stat_cpu->events[idx], x);
 }
 
-/* idx can be of type enum memcg_event_item or vm_event_item */
 static inline void count_memcg_events(struct mem_cgroup *memcg,
 				      int idx, unsigned long count)
 {
-	if (!mem_cgroup_disabled())
-		this_cpu_add(memcg->stat->events[idx], count);
+	preempt_disable();
+	__count_memcg_events(memcg, idx, count);
+	preempt_enable();
 }
 
 /* idx can be of type enum memcg_event_item or vm_event_item */

mm/memcontrol.c

Lines changed: 51 additions & 50 deletions
@@ -542,39 +542,10 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 	return mz;
 }
 
-/*
- * Return page count for single (non recursive) @memcg.
- *
- * Implementation Note: reading percpu statistics for memcg.
- *
- * Both of vmstat[] and percpu_counter has threshold and do periodic
- * synchronization to implement "quick" read. There are trade-off between
- * reading cost and precision of value. Then, we may have a chance to implement
- * a periodic synchronization of counter in memcg's counter.
- *
- * But this _read() function is used for user interface now. The user accounts
- * memory usage by memory cgroup and he _always_ requires exact value because
- * he accounts memory. Even if we provide quick-and-fuzzy read, we always
- * have to visit all online cpus and make sum. So, for now, unnecessary
- * synchronization is not implemented. (just implemented for cpu hotplug)
- *
- * If there are kernel internal actions which can make use of some not-exact
- * value, and reading all cpu value can be performance bottleneck in some
- * common workload, threshold and synchronization as vmstat[] should be
- * implemented.
- *
- * The parameter idx can be of type enum memcg_event_item or vm_event_item.
- */
-
 static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
 				      int event)
 {
-	unsigned long val = 0;
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		val += per_cpu(memcg->stat->events[event], cpu);
-	return val;
+	return atomic_long_read(&memcg->events[event]);
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -606,7 +577,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 		nr_pages = -nr_pages; /* for event */
 	}
 
-	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
+	__this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
 }
 
 unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
@@ -642,8 +613,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 {
 	unsigned long val, next;
 
-	val = __this_cpu_read(memcg->stat->nr_page_events);
-	next = __this_cpu_read(memcg->stat->targets[target]);
+	val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
+	next = __this_cpu_read(memcg->stat_cpu->targets[target]);
 	/* from time_after() in jiffies.h */
 	if ((long)(next - val) < 0) {
 		switch (target) {
@@ -659,7 +630,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 		default:
 			break;
 		}
-		__this_cpu_write(memcg->stat->targets[target], next);
+		__this_cpu_write(memcg->stat_cpu->targets[target], next);
 		return true;
 	}
 	return false;
@@ -1707,11 +1678,6 @@ void unlock_page_memcg(struct page *page)
 }
 EXPORT_SYMBOL(unlock_page_memcg);
 
-/*
- * size of first charge trial. "32" comes from vmscan.c's magic value.
- * TODO: maybe necessary to use big numbers in big irons.
- */
-#define CHARGE_BATCH 32U
 struct memcg_stock_pcp {
 	struct mem_cgroup *cached; /* this never be root cgroup */
 	unsigned int nr_pages;
@@ -1739,7 +1705,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 	unsigned long flags;
 	bool ret = false;
 
-	if (nr_pages > CHARGE_BATCH)
+	if (nr_pages > MEMCG_CHARGE_BATCH)
 		return ret;
 
 	local_irq_save(flags);
@@ -1808,7 +1774,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 	}
 	stock->nr_pages += nr_pages;
 
-	if (stock->nr_pages > CHARGE_BATCH)
+	if (stock->nr_pages > MEMCG_CHARGE_BATCH)
 		drain_stock(stock);
 
 	local_irq_restore(flags);
@@ -1858,9 +1824,44 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
 static int memcg_hotplug_cpu_dead(unsigned int cpu)
 {
 	struct memcg_stock_pcp *stock;
+	struct mem_cgroup *memcg;
 
 	stock = &per_cpu(memcg_stock, cpu);
 	drain_stock(stock);
+
+	for_each_mem_cgroup(memcg) {
+		int i;
+
+		for (i = 0; i < MEMCG_NR_STAT; i++) {
+			int nid;
+			long x;
+
+			x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
+			if (x)
+				atomic_long_add(x, &memcg->stat[i]);
+
+			if (i >= NR_VM_NODE_STAT_ITEMS)
+				continue;
+
+			for_each_node(nid) {
+				struct mem_cgroup_per_node *pn;
+
+				pn = mem_cgroup_nodeinfo(memcg, nid);
+				x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
+				if (x)
+					atomic_long_add(x, &pn->lruvec_stat[i]);
+			}
+		}
+
+		for (i = 0; i < MEMCG_NR_EVENTS; i++) {
+			long x;
+
+			x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
+			if (x)
+				atomic_long_add(x, &memcg->events[i]);
+		}
+	}
+
 	return 0;
 }
 
@@ -1881,7 +1882,7 @@ static void high_work_func(struct work_struct *work)
 	struct mem_cgroup *memcg;
 
 	memcg = container_of(work, struct mem_cgroup, high_work);
-	reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL);
+	reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
 }
 
 /*
@@ -1905,7 +1906,7 @@ void mem_cgroup_handle_over_high(void)
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		      unsigned int nr_pages)
 {
-	unsigned int batch = max(CHARGE_BATCH, nr_pages);
+	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup *mem_over_limit;
 	struct page_counter *counter;
@@ -4161,8 +4162,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 	if (!pn)
 		return 1;
 
-	pn->lruvec_stat = alloc_percpu(struct lruvec_stat);
-	if (!pn->lruvec_stat) {
+	pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
+	if (!pn->lruvec_stat_cpu) {
 		kfree(pn);
 		return 1;
 	}
@@ -4180,7 +4181,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 {
 	struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
 
-	free_percpu(pn->lruvec_stat);
+	free_percpu(pn->lruvec_stat_cpu);
 	kfree(pn);
 }
 
@@ -4190,7 +4191,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
-	free_percpu(memcg->stat);
+	free_percpu(memcg->stat_cpu);
 	kfree(memcg);
 }
 
@@ -4219,8 +4220,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	if (memcg->id.id < 0)
 		goto fail;
 
-	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
-	if (!memcg->stat)
+	memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
+	if (!memcg->stat_cpu)
 		goto fail;
 
 	for_each_node(node)
@@ -5638,7 +5639,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 	__mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
 	__mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
 	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
-	__this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+	__this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
 	memcg_check_events(ug->memcg, ug->dummy_page);
 	local_irq_restore(flags);
 