Commit 27a7faa

hkamezawa authored and torvalds committed
memcg: swap cgroup for remembering usage
For accounting swap, we need at least one record per swap entry. This patch adds the following functions:

 - swap_cgroup_swapon() .... called from swapon
 - swap_cgroup_swapoff() ... called at the end of swapoff
 - swap_cgroup_record() .... record information of a swap entry
 - swap_cgroup_lookup() .... look up information of a swap entry

This patch just implements "how to record information"; it adds no actual method for limiting the usage of swap.

These routines use a flat table for record and lookup. A "wise" lookup system such as a radix tree requires memory allocation when inserting new records, but swap-out is usually called under memory shortage (or when memcg hits its limit), so static allocation is used here. (Dynamic allocation is probably not very hard, but it adds an extra memory allocation on the memory-shortage path.)

Note 1: A pointer is used to record the information, which means 8 bytes per swap entry. This can be reduced once we have a cgroup "id" in the range 0-65535 or 0-255.

Reported-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reported-by: Hugh Dickins <hugh@veritas.com>
Reported-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
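As context for the diffs below, the intended calling pattern is: when a page charged to a memcg is swapped out, record the owning mem_cgroup against its swp_entry_t; when the entry is later freed (or the page is swapped back in), look the pointer up again. Here is a minimal caller sketch using only the API added by this patch; mem_cgroup_from_swap_page() and mem_cgroup_uncharge_swap() are hypothetical helpers, shown purely for illustration and not part of this commit:

/* Swap-out path (sketch): remember which cgroup owned the page. */
static void record_owner_at_swapout(struct page *page, swp_entry_t ent)
{
	/* hypothetical helper: resolve the page's owning memcg */
	struct mem_cgroup *memcg = mem_cgroup_from_swap_page(page);

	/* returns the previous owner of this slot (may be NULL) */
	swap_cgroup_record(ent, memcg);
}

/* swap_free() path (sketch): find the owner so its swap charge can be dropped. */
static void uncharge_owner_at_free(swp_entry_t ent)
{
	struct mem_cgroup *memcg = lookup_swap_cgroup(ent);

	if (memcg)
		mem_cgroup_uncharge_swap(memcg, ent);	/* hypothetical accounting hook */
}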
Parent: c077719

3 files changed: +242 / -0 lines

include/linux/page_cgroup.h

Lines changed: 35 additions & 0 deletions
@@ -104,5 +104,40 @@ static inline void page_cgroup_init(void)
 {
 }
 
+#endif
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#include <linux/swap.h>
+extern struct mem_cgroup *
+swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem);
+extern struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent);
+extern int swap_cgroup_swapon(int type, unsigned long max_pages);
+extern void swap_cgroup_swapoff(int type);
+#else
+#include <linux/swap.h>
+
+static inline
+struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
+{
+	return NULL;
+}
+
+static inline
+struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent)
+{
+	return NULL;
+}
+
+static inline int
+swap_cgroup_swapon(int type, unsigned long max_pages)
+{
+	return 0;
+}
+
+static inline void swap_cgroup_swapoff(int type)
+{
+	return;
+}
+
 #endif
 #endif

mm/page_cgroup.c

Lines changed: 197 additions & 0 deletions
@@ -8,6 +8,7 @@
 #include <linux/memory.h>
 #include <linux/vmalloc.h>
 #include <linux/cgroup.h>
+#include <linux/swapops.h>
 
 static void __meminit
 __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
@@ -270,3 +271,199 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
 }
 
 #endif
+
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+
+static DEFINE_MUTEX(swap_cgroup_mutex);
+struct swap_cgroup_ctrl {
+	struct page **map;
+	unsigned long length;
+};
+
+struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
+
+/*
+ * This 8bytes seems big..maybe we can reduce this when we can use "id" for
+ * cgroup rather than pointer.
+ */
+struct swap_cgroup {
+	struct mem_cgroup	*val;
+};
+#define SC_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))
+#define SC_POS_MASK	(SC_PER_PAGE - 1)
+
+/*
+ * SwapCgroup implements "lookup" and "exchange" operations.
+ * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
+ * against SwapCache. At swap_free(), this is accessed directly from swap.
+ *
+ * This means,
+ *  - we have no race in "exchange" when we're accessed via SwapCache because
+ *    SwapCache(and its swp_entry) is under lock.
+ *  - When called via swap_free(), there is no user of this entry and no race.
+ * Then, we don't need lock around "exchange".
+ *
+ * TODO: we can push these buffers out to HIGHMEM.
+ */
+
+/*
+ * allocate buffer for swap_cgroup.
+ */
+static int swap_cgroup_prepare(int type)
+{
+	struct page *page;
+	struct swap_cgroup_ctrl *ctrl;
+	unsigned long idx, max;
+
+	if (!do_swap_account)
+		return 0;
+	ctrl = &swap_cgroup_ctrl[type];
+
+	for (idx = 0; idx < ctrl->length; idx++) {
+		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!page)
+			goto not_enough_page;
+		ctrl->map[idx] = page;
+	}
+	return 0;
+not_enough_page:
+	max = idx;
+	for (idx = 0; idx < max; idx++)
+		__free_page(ctrl->map[idx]);
+
+	return -ENOMEM;
+}
+
+/**
+ * swap_cgroup_record - record mem_cgroup for this swp_entry.
+ * @ent: swap entry to be recorded into
+ * @mem: mem_cgroup to be recorded
+ *
+ * Returns old value at success, NULL at failure.
+ * (Of course, old value can be NULL.)
+ */
+struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
+{
+	int type = swp_type(ent);
+	unsigned long offset = swp_offset(ent);
+	unsigned long idx = offset / SC_PER_PAGE;
+	unsigned long pos = offset & SC_POS_MASK;
+	struct swap_cgroup_ctrl *ctrl;
+	struct page *mappage;
+	struct swap_cgroup *sc;
+	struct mem_cgroup *old;
+
+	if (!do_swap_account)
+		return NULL;
+
+	ctrl = &swap_cgroup_ctrl[type];
+
+	mappage = ctrl->map[idx];
+	sc = page_address(mappage);
+	sc += pos;
+	old = sc->val;
+	sc->val = mem;
+
+	return old;
+}
+
+/**
+ * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
+ * @ent: swap entry to be looked up.
+ *
+ * Returns pointer to mem_cgroup at success. NULL at failure.
+ */
+struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent)
+{
+	int type = swp_type(ent);
+	unsigned long offset = swp_offset(ent);
+	unsigned long idx = offset / SC_PER_PAGE;
+	unsigned long pos = offset & SC_POS_MASK;
+	struct swap_cgroup_ctrl *ctrl;
+	struct page *mappage;
+	struct swap_cgroup *sc;
+	struct mem_cgroup *ret;
+
+	if (!do_swap_account)
+		return NULL;
+
+	ctrl = &swap_cgroup_ctrl[type];
+	mappage = ctrl->map[idx];
+	sc = page_address(mappage);
+	sc += pos;
+	ret = sc->val;
+	return ret;
+}
+
+int swap_cgroup_swapon(int type, unsigned long max_pages)
+{
+	void *array;
+	unsigned long array_size;
+	unsigned long length;
+	struct swap_cgroup_ctrl *ctrl;
+
+	if (!do_swap_account)
+		return 0;
+
+	length = ((max_pages/SC_PER_PAGE) + 1);
+	array_size = length * sizeof(void *);
+
+	array = vmalloc(array_size);
+	if (!array)
+		goto nomem;
+
+	memset(array, 0, array_size);
+	ctrl = &swap_cgroup_ctrl[type];
+	mutex_lock(&swap_cgroup_mutex);
+	ctrl->length = length;
+	ctrl->map = array;
+	if (swap_cgroup_prepare(type)) {
+		/* memory shortage */
+		ctrl->map = NULL;
+		ctrl->length = 0;
+		vfree(array);
+		mutex_unlock(&swap_cgroup_mutex);
+		goto nomem;
+	}
+	mutex_unlock(&swap_cgroup_mutex);
+
+	printk(KERN_INFO
+		"swap_cgroup: uses %ld bytes of vmalloc for pointer array space"
+		" and %ld bytes to hold mem_cgroup pointers on swap\n",
+		array_size, length * PAGE_SIZE);
+	printk(KERN_INFO
+		"swap_cgroup can be disabled by noswapaccount boot option.\n");
+
+	return 0;
+nomem:
+	printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
+	printk(KERN_INFO
+		"swap_cgroup can be disabled by noswapaccount boot option\n");
+	return -ENOMEM;
+}
+
+void swap_cgroup_swapoff(int type)
+{
+	int i;
+	struct swap_cgroup_ctrl *ctrl;
+
+	if (!do_swap_account)
+		return;
+
+	mutex_lock(&swap_cgroup_mutex);
+	ctrl = &swap_cgroup_ctrl[type];
+	if (ctrl->map) {
+		for (i = 0; i < ctrl->length; i++) {
+			struct page *page = ctrl->map[i];
+			if (page)
+				__free_page(page);
+		}
+		vfree(ctrl->map);
+		ctrl->map = NULL;
+		ctrl->length = 0;
+	}
+	mutex_unlock(&swap_cgroup_mutex);
+}
+
+#endif
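To make the table arithmetic above concrete, here is a small userspace sketch of the idx/pos computation, assuming 4 KiB pages and an 8-byte struct swap_cgroup (so SC_PER_PAGE = 512); the numbers are illustrative only and mirror, rather than reuse, the kernel macros in the diff:

#include <stdio.h>

#define PAGE_SIZE	4096UL			/* assumed page size */
#define SC_PER_PAGE	(PAGE_SIZE / 8)		/* 512 entries per map page */
#define SC_POS_MASK	(SC_PER_PAGE - 1)	/* 0x1ff */

int main(void)
{
	unsigned long offset = 1000;			/* an example swp_offset() value */
	unsigned long idx = offset / SC_PER_PAGE;	/* which map page: 1 */
	unsigned long pos = offset & SC_POS_MASK;	/* which slot in that page: 488 */

	printf("offset %lu -> map[%lu], slot %lu\n", offset, idx, pos);
	return 0;
}

Note that offset & SC_POS_MASK is equivalent to offset % SC_PER_PAGE only because SC_PER_PAGE is a power of two, which holds for an 8-byte entry on any power-of-two PAGE_SIZE.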

mm/swapfile.c

Lines changed: 10 additions & 0 deletions
@@ -33,6 +33,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
+#include <linux/page_cgroup.h>
 
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
@@ -1494,6 +1495,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	vfree(swap_map);
+	/* Destroy swap account information */
+	swap_cgroup_swapoff(type);
+
 	inode = mapping->host;
 	if (S_ISBLK(inode->i_mode)) {
 		struct block_device *bdev = I_BDEV(inode);
@@ -1811,6 +1815,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 			}
 			swap_map[page_nr] = SWAP_MAP_BAD;
 		}
+
+		error = swap_cgroup_swapon(type, maxpages);
+		if (error)
+			goto bad_swap;
+
 		nr_good_pages = swap_header->info.last_page -
 				swap_header->info.nr_badpages -
 				1 /* header page */;
@@ -1882,6 +1891,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		bd_release(bdev);
 	}
 	destroy_swap_extents(p);
+	swap_cgroup_swapoff(type);
 bad_swap_2:
 	spin_lock(&swap_lock);
 	p->swap_file = NULL;
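For a sense of the swapon-time cost (illustrative figures, assuming 4 KiB pages and 64-bit pointers, not stated in the commit itself): a 1 GiB swap device has max_pages = 262144, so length = 262144/512 + 1 = 513 map pages. The vmalloc'd pointer array is then 513 * 8 = 4104 bytes, and the map pages themselves take 513 * 4096 ≈ 2.1 MB, i.e. one 8-byte mem_cgroup pointer per page of swap (about 0.2% overhead), which is what the printk in swap_cgroup_swapon() reports.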

0 commit comments
