
Commit 9cbb78b

rientjes authored and torvalds committed
mm, memcg: introduce own oom handler to iterate only over its own threads
The global oom killer is serialized by the per-zonelist try_set_zonelist_oom() used in the page allocator. Concurrent oom kills are thus a rare event, occurring only on systems that use mempolicies and have a large number of nodes. Memory controller oom kills, however, can frequently be concurrent since there is no serialization once the oom killer is called for oom conditions in several different memcgs in parallel.

This creates massive contention on tasklist_lock, since the oom killer requires the read side for the tasklist iteration. If several memcgs are calling the oom killer, the lock can be held for a substantial amount of time, especially if threads continue to enter it while other threads are exiting. Since the exit path takes the write side of the lock with irqs disabled in a few different places, tasklist_lock starvation can cause soft lockups on cpus. The kernel lacks unfair writelocks, and successful calls to the oom killer usually result in at least one thread entering the exit path, so an alternative solution is needed.

This patch introduces a separate oom handler for memcgs so that they do not require tasklist_lock for as much time. Instead, it iterates only over the threads attached to the oom memcg and grabs a reference to the selected thread before calling oom_kill_process() to ensure it doesn't exit prematurely. tasklist_lock is still required for the tasklist dump, for iterating children of the selected process, and for killing all other threads on the system that share memory with the selected victim, so while this isn't a complete solution to tasklist_lock starvation, it significantly reduces the amount of time the lock is held.

Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Sha Zhengju <handai.szj@taobao.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 462607e commit 9cbb78b

4 files changed, 93 insertions(+), 41 deletions(-)

include/linux/memcontrol.h

Lines changed: 2 additions & 7 deletions
@@ -180,7 +180,8 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
                                                 gfp_t gfp_mask,
                                                 unsigned long *total_scanned);
-u64 mem_cgroup_get_limit(struct mem_cgroup *memcg);
+extern void __mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+                                       int order);
 
 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -364,12 +365,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
         return 0;
 }
 
-static inline
-u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
-{
-        return 0;
-}
-
 static inline void mem_cgroup_split_huge_fixup(struct page *head)
 {
 }

include/linux/oom.h

Lines changed: 16 additions & 0 deletions
@@ -40,17 +40,33 @@ enum oom_constraint {
         CONSTRAINT_MEMCG,
 };
 
+enum oom_scan_t {
+        OOM_SCAN_OK,            /* scan thread and find its badness */
+        OOM_SCAN_CONTINUE,      /* do not consider thread for oom kill */
+        OOM_SCAN_ABORT,         /* abort the iteration and return */
+        OOM_SCAN_SELECT,        /* always select this thread first */
+};
+
 extern void compare_swap_oom_score_adj(int old_val, int new_val);
 extern int test_set_oom_score_adj(int new_val);
 
 extern unsigned long oom_badness(struct task_struct *p,
                 struct mem_cgroup *memcg, const nodemask_t *nodemask,
                 unsigned long totalpages);
+extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+                             unsigned int points, unsigned long totalpages,
+                             struct mem_cgroup *memcg, nodemask_t *nodemask,
+                             const char *message);
+
 extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 
+extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
+                unsigned long totalpages, const nodemask_t *nodemask,
+                bool force_kill);
 extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                 int order);
+
 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
                 int order, nodemask_t *mask, bool force_kill);
 extern int register_oom_notifier(struct notifier_block *nb);
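
As a rough illustration of how these new declarations fit together, the loop below is a self-contained userspace model of the selection logic that the mm/memcontrol.c hunk builds on top of oom_scan_process_thread() and oom_badness(). The task names and scores are made up and the two kernel helpers are replaced by precomputed fields; the point is only the interaction of the scan results: OOM_SCAN_SELECT pins a task by forcing its score to ULONG_MAX, OOM_SCAN_CONTINUE skips it, OOM_SCAN_ABORT ends the scan entirely, and OOM_SCAN_OK falls through to the usual badness comparison.

/* toy_oom_scan.c: hypothetical model for illustration, not part of the patch */
#include <limits.h>
#include <stdio.h>

enum oom_scan_t {
        OOM_SCAN_OK,            /* scan thread and find its badness */
        OOM_SCAN_CONTINUE,      /* do not consider thread for oom kill */
        OOM_SCAN_ABORT,         /* abort the iteration and return */
        OOM_SCAN_SELECT,        /* always select this thread first */
};

struct toy_task {
        const char *comm;
        unsigned long badness;  /* stand-in for oom_badness() */
        enum oom_scan_t scan;   /* stand-in for oom_scan_process_thread() */
};

int main(void)
{
        struct toy_task tasks[] = {
                { "worker",   120, OOM_SCAN_OK },
                { "kthread",    0, OOM_SCAN_CONTINUE },  /* unkillable: skipped */
                { "hog",      900, OOM_SCAN_OK },
                { "exiting",  500, OOM_SCAN_SELECT },    /* e.g. current is already exiting */
        };
        struct toy_task *chosen = NULL;
        unsigned long chosen_points = 0;
        size_t i;

        for (i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++) {
                struct toy_task *task = &tasks[i];

                switch (task->scan) {
                case OOM_SCAN_SELECT:
                        chosen = task;
                        chosen_points = ULONG_MAX;      /* nothing can displace it */
                        continue;
                case OOM_SCAN_CONTINUE:
                        continue;
                case OOM_SCAN_ABORT:
                        return 0;       /* a victim is already on its way out */
                case OOM_SCAN_OK:
                        break;
                }
                if (task->badness > chosen_points) {
                        chosen = task;
                        chosen_points = task->badness;
                }
        }
        if (chosen)
                printf("would kill %s\n", chosen->comm);
        return 0;
}

In the real memcg handler the same switch runs over cgroup_iter_next() instead of do_each_thread(), which is what allows the scan itself to proceed without tasklist_lock.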

mm/memcontrol.c

Lines changed: 60 additions & 1 deletion
@@ -1453,7 +1453,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
 /*
  * Return the memory (and swap, if configured) limit for a memcg.
  */
-u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
 {
         u64 limit;
         u64 memsw;
@@ -1469,6 +1469,65 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
         return min(limit, memsw);
 }
 
+void __mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+                                int order)
+{
+        struct mem_cgroup *iter;
+        unsigned long chosen_points = 0;
+        unsigned long totalpages;
+        unsigned int points = 0;
+        struct task_struct *chosen = NULL;
+
+        totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
+        for_each_mem_cgroup_tree(iter, memcg) {
+                struct cgroup *cgroup = iter->css.cgroup;
+                struct cgroup_iter it;
+                struct task_struct *task;
+
+                cgroup_iter_start(cgroup, &it);
+                while ((task = cgroup_iter_next(cgroup, &it))) {
+                        switch (oom_scan_process_thread(task, totalpages, NULL,
+                                                        false)) {
+                        case OOM_SCAN_SELECT:
+                                if (chosen)
+                                        put_task_struct(chosen);
+                                chosen = task;
+                                chosen_points = ULONG_MAX;
+                                get_task_struct(chosen);
+                                /* fall through */
+                        case OOM_SCAN_CONTINUE:
+                                continue;
+                        case OOM_SCAN_ABORT:
+                                cgroup_iter_end(cgroup, &it);
+                                mem_cgroup_iter_break(memcg, iter);
+                                if (chosen)
+                                        put_task_struct(chosen);
+                                return;
+                        case OOM_SCAN_OK:
+                                break;
+                        };
+                        points = oom_badness(task, memcg, NULL, totalpages);
+                        if (points > chosen_points) {
+                                if (chosen)
+                                        put_task_struct(chosen);
+                                chosen = task;
+                                chosen_points = points;
+                                get_task_struct(chosen);
+                        }
+                }
+                cgroup_iter_end(cgroup, &it);
+        }
+
+        if (!chosen)
+                return;
+        points = chosen_points * 1000 / totalpages;
+        read_lock(&tasklist_lock);
+        oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
+                         NULL, "Memory cgroup out of memory");
+        read_unlock(&tasklist_lock);
+        put_task_struct(chosen);
+}
+
 static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
                                         gfp_t gfp_mask,
                                         unsigned long flags)

mm/oom_kill.c

Lines changed: 15 additions & 33 deletions
@@ -288,20 +288,13 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 }
 #endif
 
-enum oom_scan_t {
-        OOM_SCAN_OK,            /* scan thread and find its badness */
-        OOM_SCAN_CONTINUE,      /* do not consider thread for oom kill */
-        OOM_SCAN_ABORT,         /* abort the iteration and return */
-        OOM_SCAN_SELECT,        /* always select this thread first */
-};
-
-static enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
-                struct mem_cgroup *memcg, unsigned long totalpages,
-                const nodemask_t *nodemask, bool force_kill)
+enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
+                unsigned long totalpages, const nodemask_t *nodemask,
+                bool force_kill)
 {
         if (task->exit_state)
                 return OOM_SCAN_CONTINUE;
-        if (oom_unkillable_task(task, memcg, nodemask))
+        if (oom_unkillable_task(task, NULL, nodemask))
                 return OOM_SCAN_CONTINUE;
 
         /*
@@ -348,8 +341,8 @@ static enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
  * (not docbooked, we don't want this one cluttering up the manual)
  */
 static struct task_struct *select_bad_process(unsigned int *ppoints,
-                unsigned long totalpages, struct mem_cgroup *memcg,
-                const nodemask_t *nodemask, bool force_kill)
+                unsigned long totalpages, const nodemask_t *nodemask,
+                bool force_kill)
 {
         struct task_struct *g, *p;
         struct task_struct *chosen = NULL;
@@ -358,7 +351,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
         do_each_thread(g, p) {
                 unsigned int points;
 
-                switch (oom_scan_process_thread(p, memcg, totalpages, nodemask,
+                switch (oom_scan_process_thread(p, totalpages, nodemask,
                                                 force_kill)) {
                 case OOM_SCAN_SELECT:
                         chosen = p;
@@ -371,7 +364,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
                 case OOM_SCAN_OK:
                         break;
                 };
-                points = oom_badness(p, memcg, nodemask, totalpages);
+                points = oom_badness(p, NULL, nodemask, totalpages);
                 if (points > chosen_points) {
                         chosen = p;
                         chosen_points = points;
@@ -443,10 +436,10 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 }
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
-static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
-                             unsigned int points, unsigned long totalpages,
-                             struct mem_cgroup *memcg, nodemask_t *nodemask,
-                             const char *message)
+void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+                      unsigned int points, unsigned long totalpages,
+                      struct mem_cgroup *memcg, nodemask_t *nodemask,
+                      const char *message)
 {
         struct task_struct *victim = p;
         struct task_struct *child;
@@ -564,10 +557,6 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                               int order)
 {
-        unsigned long limit;
-        unsigned int points = 0;
-        struct task_struct *p;
-
         /*
          * If current has a pending SIGKILL, then automatically select it. The
          * goal is to allow it to allocate so that it may quickly exit and free
@@ -579,13 +568,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
         }
 
         check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
-        limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
-        read_lock(&tasklist_lock);
-        p = select_bad_process(&points, limit, memcg, NULL, false);
-        if (p && PTR_ERR(p) != -1UL)
-                oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
-                                 "Memory cgroup out of memory");
-        read_unlock(&tasklist_lock);
+        __mem_cgroup_out_of_memory(memcg, gfp_mask, order);
 }
 #endif
 
@@ -710,7 +693,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
         struct task_struct *p;
         unsigned long totalpages;
         unsigned long freed = 0;
-        unsigned int points;
+        unsigned int uninitialized_var(points);
         enum oom_constraint constraint = CONSTRAINT_NONE;
         int killed = 0;
 
@@ -748,8 +731,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
                 goto out;
         }
 
-        p = select_bad_process(&points, totalpages, NULL, mpol_mask,
-                               force_kill);
+        p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
         /* Found nothing?!?! Either we hang forever, or we panic. */
         if (!p) {
                 dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
