Skip to content

Commit 6b0c81b

Browse files
rientjestorvalds
authored andcommitted
mm, oom: reduce dependency on tasklist_lock
Since exiting tasks require write_lock_irq(&tasklist_lock) several times, try to reduce the amount of time the readside is held for oom kills. This makes the interface with the memcg oom handler more consistent since it now never needs to take tasklist_lock unnecessarily. The only time the oom killer now takes tasklist_lock is when iterating the children of the selected task, everything else is protected by rcu_read_lock(). This requires that a reference to the selected process, p, is grabbed before calling oom_kill_process(). It may release it and grab a reference on another one of p's threads if !p->mm, but it also guarantees that it will release the reference before returning. [hughd@google.com: fix duplicate put_task_struct()] Signed-off-by: David Rientjes <rientjes@google.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Reviewed-by: Michal Hocko <mhocko@suse.cz> Cc: Oleg Nesterov <oleg@redhat.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 9cbb78b commit 6b0c81b

File tree

2 files changed

+30
-14
lines changed

2 files changed

+30
-14
lines changed

mm/memcontrol.c

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1521,11 +1521,8 @@ void __mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
15211521
if (!chosen)
15221522
return;
15231523
points = chosen_points * 1000 / totalpages;
1524-
read_lock(&tasklist_lock);
15251524
oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
15261525
NULL, "Memory cgroup out of memory");
1527-
read_unlock(&tasklist_lock);
1528-
put_task_struct(chosen);
15291526
}
15301527

15311528
static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,

mm/oom_kill.c

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
336336

337337
/*
338338
* Simple selection loop. We chose the process with the highest
339-
* number of 'points'. We expect the caller will lock the tasklist.
339+
* number of 'points'.
340340
*
341341
* (not docbooked, we don't want this one cluttering up the manual)
342342
*/
@@ -348,6 +348,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
348348
struct task_struct *chosen = NULL;
349349
unsigned long chosen_points = 0;
350350

351+
rcu_read_lock();
351352
do_each_thread(g, p) {
352353
unsigned int points;
353354

@@ -360,6 +361,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
360361
case OOM_SCAN_CONTINUE:
361362
continue;
362363
case OOM_SCAN_ABORT:
364+
rcu_read_unlock();
363365
return ERR_PTR(-1UL);
364366
case OOM_SCAN_OK:
365367
break;
@@ -370,6 +372,9 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
370372
chosen_points = points;
371373
}
372374
} while_each_thread(g, p);
375+
if (chosen)
376+
get_task_struct(chosen);
377+
rcu_read_unlock();
373378

374379
*ppoints = chosen_points * 1000 / totalpages;
375380
return chosen;
@@ -385,15 +390,14 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
385390
* are not shown.
386391
* State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
387392
* swapents, oom_score_adj value, and name.
388-
*
389-
* Call with tasklist_lock read-locked.
390393
*/
391394
static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
392395
{
393396
struct task_struct *p;
394397
struct task_struct *task;
395398

396399
pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n");
400+
rcu_read_lock();
397401
for_each_process(p) {
398402
if (oom_unkillable_task(p, memcg, nodemask))
399403
continue;
@@ -416,6 +420,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
416420
task->signal->oom_score_adj, task->comm);
417421
task_unlock(task);
418422
}
423+
rcu_read_unlock();
419424
}
420425

421426
static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
@@ -436,6 +441,10 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
436441
}
437442

438443
#define K(x) ((x) << (PAGE_SHIFT-10))
444+
/*
445+
* Must be called while holding a reference to p, which will be released upon
446+
* returning.
447+
*/
439448
void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
440449
unsigned int points, unsigned long totalpages,
441450
struct mem_cgroup *memcg, nodemask_t *nodemask,
@@ -455,6 +464,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
455464
*/
456465
if (p->flags & PF_EXITING) {
457466
set_tsk_thread_flag(p, TIF_MEMDIE);
467+
put_task_struct(p);
458468
return;
459469
}
460470

@@ -472,6 +482,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
472482
* parent. This attempts to lose the minimal amount of work done while
473483
* still freeing memory.
474484
*/
485+
read_lock(&tasklist_lock);
475486
do {
476487
list_for_each_entry(child, &t->children, sibling) {
477488
unsigned int child_points;
@@ -484,15 +495,26 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
484495
child_points = oom_badness(child, memcg, nodemask,
485496
totalpages);
486497
if (child_points > victim_points) {
498+
put_task_struct(victim);
487499
victim = child;
488500
victim_points = child_points;
501+
get_task_struct(victim);
489502
}
490503
}
491504
} while_each_thread(p, t);
505+
read_unlock(&tasklist_lock);
492506

493-
victim = find_lock_task_mm(victim);
494-
if (!victim)
507+
rcu_read_lock();
508+
p = find_lock_task_mm(victim);
509+
if (!p) {
510+
rcu_read_unlock();
511+
put_task_struct(victim);
495512
return;
513+
} else if (victim != p) {
514+
get_task_struct(p);
515+
put_task_struct(victim);
516+
victim = p;
517+
}
496518

497519
/* mm cannot safely be dereferenced after task_unlock(victim) */
498520
mm = victim->mm;
@@ -523,9 +545,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
523545
task_unlock(p);
524546
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
525547
}
548+
rcu_read_unlock();
526549

527550
set_tsk_thread_flag(victim, TIF_MEMDIE);
528551
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
552+
put_task_struct(victim);
529553
}
530554
#undef K
531555

@@ -546,9 +570,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
546570
if (constraint != CONSTRAINT_NONE)
547571
return;
548572
}
549-
read_lock(&tasklist_lock);
550573
dump_header(NULL, gfp_mask, order, NULL, nodemask);
551-
read_unlock(&tasklist_lock);
552574
panic("Out of memory: %s panic_on_oom is enabled\n",
553575
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
554576
}
@@ -721,10 +743,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
721743
mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
722744
check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
723745

724-
read_lock(&tasklist_lock);
725746
if (sysctl_oom_kill_allocating_task && current->mm &&
726747
!oom_unkillable_task(current, NULL, nodemask) &&
727748
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
749+
get_task_struct(current);
728750
oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
729751
nodemask,
730752
"Out of memory (oom_kill_allocating_task)");
@@ -735,7 +757,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
735757
/* Found nothing?!?! Either we hang forever, or we panic. */
736758
if (!p) {
737759
dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
738-
read_unlock(&tasklist_lock);
739760
panic("Out of memory and no killable processes...\n");
740761
}
741762
if (PTR_ERR(p) != -1UL) {
@@ -744,8 +765,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
744765
killed = 1;
745766
}
746767
out:
747-
read_unlock(&tasklist_lock);
748-
749768
/*
750769
* Give the killed threads a good chance of exiting before trying to
751770
* allocate memory again.

0 commit comments

Comments
 (0)