Commit 23047a9

hnaz authored and torvalds committed
mm: workingset: per-cgroup cache thrash detection
Cache thrash detection (see a528910 "mm: thrash detection-based file cache sizing" for details) currently only works on the system level, not inside cgroups. Worse, as the refaults are compared to the global number of active cache, cgroups might wrongfully get all their refaults activated when their pages are hotter than those of others.

Move the refault machinery from the zone to the lruvec, and then tag eviction entries with the memcg ID. This makes the thrash detection work correctly inside cgroups.

[sergey.senozhatsky@gmail.com: do not return from workingset_activation() with locked rcu and page]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 612e449 commit 23047a9
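
The mechanism pivots on bit-packing: each shadow radix-tree entry now carries the evicting memcg's ID alongside the node, zone, and eviction-counter bits (see pack_shadow()/unpack_shadow() in mm/workingset.c below). Here is a minimal userspace model of that round trip. It is a sketch only: the exceptional, zone, and node shift widths are illustrative stand-ins for the kernel's config-dependent constants, only the 16-bit memcg field matches the patch, the bucket_order compression step is omitted, and a 64-bit unsigned long is assumed.

#include <assert.h>
#include <stdio.h>

#define EXCEPTIONAL_SHIFT  2  /* stand-in for RADIX_TREE_EXCEPTIONAL_SHIFT */
#define ZONES_SHIFT        2  /* stand-in: room for 4 zones */
#define NODES_SHIFT        6  /* stand-in: room for 64 nodes */
#define MEMCG_ID_SHIFT    16  /* matches MEM_CGROUP_ID_SHIFT in the patch */

/* Pack fields most-significant first, mirroring pack_shadow() */
static unsigned long pack(int memcgid, int nid, int zid, unsigned long eviction)
{
        eviction = (eviction << MEMCG_ID_SHIFT) | memcgid;
        eviction = (eviction << NODES_SHIFT) | nid;
        eviction = (eviction << ZONES_SHIFT) | zid;
        return eviction << EXCEPTIONAL_SHIFT;
}

/* Peel fields back off least-significant first, mirroring unpack_shadow() */
static void unpack(unsigned long entry, int *memcgid, int *nid, int *zid,
                   unsigned long *eviction)
{
        entry >>= EXCEPTIONAL_SHIFT;
        *zid = entry & ((1UL << ZONES_SHIFT) - 1);
        entry >>= ZONES_SHIFT;
        *nid = entry & ((1UL << NODES_SHIFT) - 1);
        entry >>= NODES_SHIFT;
        *memcgid = entry & ((1UL << MEMCG_ID_SHIFT) - 1);
        *eviction = entry >> MEMCG_ID_SHIFT;
}

int main(void)
{
        int memcgid, nid, zid;
        unsigned long eviction;

        unpack(pack(1234, 3, 2, 987654), &memcgid, &nid, &zid, &eviction);
        assert(memcgid == 1234 && nid == 3 && zid == 2 && eviction == 987654);
        printf("memcg=%d node=%d zone=%d eviction=%lu\n",
               memcgid, nid, zid, eviction);
        return 0;
}

Every bit granted to the memcg ID comes out of the eviction counter's budget, which is why the patch bounds the ID space with MEM_CGROUP_ID_SHIFT/MEM_CGROUP_ID_MAX rather than storing a pointer.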

File tree: 5 files changed, +134 -57

include/linux/memcontrol.h (+49 -7)
include/linux/mmzone.h (+7 -6)
mm/memcontrol.c (+0 -25)
mm/vmscan.c (+9 -9)
mm/workingset.c (+69 -10)

include/linux/memcontrol.h

Lines changed: 49 additions & 7 deletions
@@ -89,6 +89,10 @@ enum mem_cgroup_events_target {
 };
 
 #ifdef CONFIG_MEMCG
+
+#define MEM_CGROUP_ID_SHIFT 16
+#define MEM_CGROUP_ID_MAX USHRT_MAX
+
 struct mem_cgroup_stat_cpu {
        long count[MEMCG_NR_STAT];
        unsigned long events[MEMCG_NR_EVENTS];
@@ -265,6 +269,11 @@ struct mem_cgroup {
 
 extern struct mem_cgroup *root_mem_cgroup;
 
+static inline bool mem_cgroup_disabled(void)
+{
+       return !cgroup_subsys_enabled(memory_cgrp_subsys);
+}
+
 /**
  * mem_cgroup_events - count memory events against a cgroup
  * @memcg: the memory cgroup
@@ -312,6 +321,28 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
                                   struct mem_cgroup_reclaim_cookie *);
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
 
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+{
+       if (mem_cgroup_disabled())
+               return 0;
+
+       return memcg->css.id;
+}
+
+/**
+ * mem_cgroup_from_id - look up a memcg from an id
+ * @id: the id to look up
+ *
+ * Caller must hold rcu_read_lock() and use css_tryget() as necessary.
+ */
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+       struct cgroup_subsys_state *css;
+
+       css = css_from_id(id, &memory_cgrp_subsys);
+       return mem_cgroup_from_css(css);
+}
+
 /**
  * parent_mem_cgroup - find the accounting parent of a memcg
  * @memcg: memcg whose parent to find
@@ -353,11 +384,6 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
 ino_t page_cgroup_ino(struct page *page);
 
-static inline bool mem_cgroup_disabled(void)
-{
-       return !cgroup_subsys_enabled(memory_cgrp_subsys);
-}
-
 static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
 {
        if (mem_cgroup_disabled())
@@ -502,8 +528,17 @@ void mem_cgroup_split_huge_fixup(struct page *head);
 #endif
 
 #else /* CONFIG_MEMCG */
+
+#define MEM_CGROUP_ID_SHIFT 0
+#define MEM_CGROUP_ID_MAX 0
+
 struct mem_cgroup;
 
+static inline bool mem_cgroup_disabled(void)
+{
+       return true;
+}
+
 static inline void mem_cgroup_events(struct mem_cgroup *memcg,
                                     enum mem_cgroup_events_index idx,
                                     unsigned int nr)
@@ -586,9 +621,16 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
 {
 }
 
-static inline bool mem_cgroup_disabled(void)
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 {
-       return true;
+       return 0;
+}
+
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+       WARN_ON_ONCE(id);
+       /* XXX: This should always return root_mem_cgroup */
+       return NULL;
 }
 
 static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
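
The kernel-doc added to mem_cgroup_from_id() above shifts the locking burden to callers. A hedged sketch of that calling discipline (not part of this patch; example_memcg_user() is a hypothetical caller):

/*
 * Hypothetical caller: the ID may have been freed or recycled, so
 * mem_cgroup_from_id() can return NULL, and the css may already be
 * mid-teardown, hence rcu_read_lock() plus css_tryget_online().
 */
static void example_memcg_user(unsigned short id)
{
        struct mem_cgroup *memcg;

        rcu_read_lock();
        memcg = mem_cgroup_from_id(id);
        if (memcg && !css_tryget_online(&memcg->css))
                memcg = NULL;           /* cgroup is going away */
        rcu_read_unlock();

        if (!memcg)
                return;
        /* ... memcg is pinned and safe to use here ... */
        css_put(&memcg->css);
}

workingset_refault() below follows a lighter variant: it only reads counters inside the RCU section, so it never needs to pin the css at all.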

include/linux/mmzone.h

Lines changed: 7 additions & 6 deletions
@@ -212,10 +212,12 @@ struct zone_reclaim_stat {
 };
 
 struct lruvec {
-        struct list_head lists[NR_LRU_LISTS];
-        struct zone_reclaim_stat reclaim_stat;
+        struct list_head                lists[NR_LRU_LISTS];
+        struct zone_reclaim_stat        reclaim_stat;
+        /* Evictions & activations on the inactive file list */
+        atomic_long_t                   inactive_age;
 #ifdef CONFIG_MEMCG
-        struct zone *zone;
+        struct zone                     *zone;
 #endif
 };
 
@@ -490,9 +492,6 @@ struct zone {
         spinlock_t              lru_lock;
         struct lruvec           lruvec;
 
-        /* Evictions & activations on the inactive file list */
-        atomic_long_t           inactive_age;
-
         /*
          * When free pages are below this point, additional steps are taken
          * when reading the number of free pages to avoid per-cpu counter
@@ -761,6 +760,8 @@ static inline struct zone *lruvec_zone(struct lruvec *lruvec)
 #endif
 }
 
+extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
+
 #ifdef CONFIG_HAVE_MEMORY_PRESENT
 void memory_present(int nid, unsigned long start, unsigned long end);
 #else

mm/memcontrol.c

Lines changed: 0 additions & 25 deletions
@@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
        return (memcg == root_mem_cgroup);
 }
 
-/*
- * We restrict the id in the range of [1, 65535], so it can fit into
- * an unsigned short.
- */
-#define MEM_CGROUP_ID_MAX USHRT_MAX
-
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
-{
-       return memcg->css.id;
-}
-
-/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
-{
-       struct cgroup_subsys_state *css;
-
-       css = css_from_id(id, &memory_cgrp_subsys);
-       return mem_cgroup_from_css(css);
-}
-
 #ifndef CONFIG_SLOB
 /*
  * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.

mm/vmscan.c

Lines changed: 9 additions & 9 deletions
@@ -213,7 +213,7 @@ bool zone_reclaimable(struct zone *zone)
                zone_reclaimable_pages(zone) * 6;
 }
 
-static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
        if (!mem_cgroup_disabled())
                return mem_cgroup_get_lru_size(lruvec, lru);
@@ -1923,8 +1923,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec)
        unsigned long inactive;
        unsigned long active;
 
-       inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
-       active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
+       inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+       active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
 
        return active > inactive;
 }
@@ -2063,7 +2063,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
         * system is under heavy pressure.
         */
        if (!inactive_file_is_low(lruvec) &&
-           get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+           lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
                scan_balance = SCAN_FILE;
                goto out;
        }
@@ -2089,10 +2089,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
         * anon in [0], file in [1]
         */
 
-       anon  = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
-               get_lru_size(lruvec, LRU_INACTIVE_ANON);
-       file  = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
-               get_lru_size(lruvec, LRU_INACTIVE_FILE);
+       anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
+               lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
+       file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
+               lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
 
        spin_lock_irq(&zone->lru_lock);
        if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2130,7 +2130,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
                unsigned long size;
                unsigned long scan;
 
-               size = get_lru_size(lruvec, lru);
+               size = lruvec_lru_size(lruvec, lru);
                scan = size >> sc->priority;
 
                if (!scan && pass && force_scan)

mm/workingset.c

Lines changed: 69 additions & 10 deletions
@@ -153,7 +153,8 @@
  */
 
 #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
-                        ZONES_SHIFT + NODES_SHIFT)
+                        ZONES_SHIFT + NODES_SHIFT + \
+                        MEM_CGROUP_ID_SHIFT)
 #define EVICTION_MASK  (~0UL >> EVICTION_SHIFT)
 
 /*
@@ -166,28 +167,32 @@
  */
 static unsigned int bucket_order __read_mostly;
 
-static void *pack_shadow(unsigned long eviction, struct zone *zone)
+static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction)
 {
        eviction >>= bucket_order;
+       eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
        eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
        eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
        eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
 
        return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
 }
 
-static void unpack_shadow(void *shadow, struct zone **zonep,
+static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
                          unsigned long *evictionp)
 {
        unsigned long entry = (unsigned long)shadow;
-       int zid, nid;
+       int memcgid, nid, zid;
 
        entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
        zid = entry & ((1UL << ZONES_SHIFT) - 1);
        entry >>= ZONES_SHIFT;
        nid = entry & ((1UL << NODES_SHIFT) - 1);
        entry >>= NODES_SHIFT;
+       memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
+       entry >>= MEM_CGROUP_ID_SHIFT;
 
+       *memcgidp = memcgid;
        *zonep = NODE_DATA(nid)->node_zones + zid;
        *evictionp = entry << bucket_order;
 }
@@ -202,11 +207,20 @@ static void unpack_shadow(void *shadow, struct zone **zonep,
  */
 void *workingset_eviction(struct address_space *mapping, struct page *page)
 {
+       struct mem_cgroup *memcg = page_memcg(page);
        struct zone *zone = page_zone(page);
+       int memcgid = mem_cgroup_id(memcg);
        unsigned long eviction;
+       struct lruvec *lruvec;
 
-       eviction = atomic_long_inc_return(&zone->inactive_age);
-       return pack_shadow(eviction, zone);
+       /* Page is fully exclusive and pins page->mem_cgroup */
+       VM_BUG_ON_PAGE(PageLRU(page), page);
+       VM_BUG_ON_PAGE(page_count(page), page);
+       VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+       lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+       eviction = atomic_long_inc_return(&lruvec->inactive_age);
+       return pack_shadow(memcgid, zone, eviction);
 }
 
 /**
@@ -221,13 +235,42 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
 bool workingset_refault(void *shadow)
 {
        unsigned long refault_distance;
+       unsigned long active_file;
+       struct mem_cgroup *memcg;
        unsigned long eviction;
+       struct lruvec *lruvec;
        unsigned long refault;
        struct zone *zone;
+       int memcgid;
 
-       unpack_shadow(shadow, &zone, &eviction);
+       unpack_shadow(shadow, &memcgid, &zone, &eviction);
 
-       refault = atomic_long_read(&zone->inactive_age);
+       rcu_read_lock();
+       /*
+        * Look up the memcg associated with the stored ID. It might
+        * have been deleted since the page's eviction.
+        *
+        * Note that in rare events the ID could have been recycled
+        * for a new cgroup that refaults a shared page. This is
+        * impossible to tell from the available data. However, this
+        * should be a rare and limited disturbance, and activations
+        * are always speculative anyway. Ultimately, it's the aging
+        * algorithm's job to shake out the minimum access frequency
+        * for the active cache.
+        *
+        * XXX: On !CONFIG_MEMCG, this will always return NULL; it
+        * would be better if the root_mem_cgroup existed in all
+        * configurations instead.
+        */
+       memcg = mem_cgroup_from_id(memcgid);
+       if (!mem_cgroup_disabled() && !memcg) {
+               rcu_read_unlock();
+               return false;
+       }
+       lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+       refault = atomic_long_read(&lruvec->inactive_age);
+       active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+       rcu_read_unlock();
 
        /*
         * The unsigned subtraction here gives an accurate distance
@@ -249,7 +292,7 @@ bool workingset_refault(void *shadow)
 
        inc_zone_state(zone, WORKINGSET_REFAULT);
 
-       if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
+       if (refault_distance <= active_file) {
                inc_zone_state(zone, WORKINGSET_ACTIVATE);
                return true;
        }
@@ -262,7 +305,23 @@ bool workingset_refault(void *shadow)
  */
 void workingset_activation(struct page *page)
 {
-       atomic_long_inc(&page_zone(page)->inactive_age);
+       struct mem_cgroup *memcg;
+       struct lruvec *lruvec;
+
+       memcg = lock_page_memcg(page);
+       /*
+        * Filter non-memcg pages here, e.g. unmap can call
+        * mark_page_accessed() on VDSO pages.
+        *
+        * XXX: See workingset_refault() - this should return
+        * root_mem_cgroup even for !CONFIG_MEMCG.
+        */
+       if (!mem_cgroup_disabled() && !memcg)
+               goto out;
+       lruvec = mem_cgroup_zone_lruvec(page_zone(page), memcg);
+       atomic_long_inc(&lruvec->inactive_age);
+out:
+       unlock_page_memcg(memcg);
 }
 
 /*

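One subtlety worth calling out: the refault distance that workingset_refault() compares against the cgroup's own active file list is an unsigned modulo subtraction ("the unsigned subtraction here gives an accurate distance" in the retained comment), so it stays correct even after the per-lruvec inactive_age counter wraps the shadow entry's bit budget. A small userspace check of that arithmetic (illustrative only; the mask width is a stand-in for the kernel's EVICTION_MASK):

#include <assert.h>

#define EVICTION_MASK (~0UL >> 20)      /* stand-in bit budget */

/* The same unsigned modulo subtraction workingset_refault() relies on */
static unsigned long refault_distance(unsigned long eviction,
                                      unsigned long refault)
{
        return (refault - eviction) & EVICTION_MASK;
}

int main(void)
{
        /* Masked counter sat just below the wrap point at eviction time... */
        unsigned long eviction = EVICTION_MASK - 5;
        /* ...and its low bits read 10 at refault time, 16 increments later */
        unsigned long refault = 10;

        assert(refault_distance(eviction, refault) == 16);
        return 0;
}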