Skip to content

Commit 36324a9

Browse files
Michal Hocko authored and torvalds committed
oom: clear TIF_MEMDIE after oom_reaper managed to unmap the address space
When oom_reaper manages to unmap all the eligible vmas there shouldn't be much freeable memory held by the oom victim left anymore, so it makes sense to clear the TIF_MEMDIE flag for the victim and allow the OOM killer to select another task. The lack of TIF_MEMDIE also means that the victim cannot access memory reserves anymore, but that shouldn't be a problem because it would get the access again if it needs to allocate and hits the OOM killer again due to the fatal_signal_pending or PF_EXITING check. We can safely hide the task from the OOM killer because it is clearly not a good candidate anymore, as everything reclaimable has been torn down already. This patch allows capping the time an OOM victim can keep TIF_MEMDIE and thus hold off further global OOM killer actions, provided the oom reaper is able to take mmap_sem for the associated mm struct. This is not guaranteed now, but further steps should make sure that taking mmap_sem for write blocks killably, which will help to reduce such lock contention. This is not done by this patch. Note that exit_oom_victim might be called on a remote task from __oom_reap_task now, so we have to check and clear the flag atomically; otherwise we might race and underflow oom_victims or wake up waiters too early. Signed-off-by: Michal Hocko <mhocko@suse.com> Suggested-by: Johannes Weiner <hannes@cmpxchg.org> Suggested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Cc: Andrea Argangeli <andrea@kernel.org> Cc: David Rientjes <rientjes@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Rik van Riel <riel@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent aac4536 commit 36324a9

File tree

3 files changed

+50
-27
lines changed

3 files changed

+50
-27
lines changed

include/linux/oom.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
9191

9292
extern bool out_of_memory(struct oom_control *oc);
9393

94-
extern void exit_oom_victim(void);
94+
extern void exit_oom_victim(struct task_struct *tsk);
9595

9696
extern int register_oom_notifier(struct notifier_block *nb);
9797
extern int unregister_oom_notifier(struct notifier_block *nb);

kernel/exit.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -435,7 +435,7 @@ static void exit_mm(struct task_struct *tsk)
435435
mm_update_next_owner(mm);
436436
mmput(mm);
437437
if (test_thread_flag(TIF_MEMDIE))
438-
exit_oom_victim();
438+
exit_oom_victim(tsk);
439439
}
440440

441441
static struct task_struct *find_alive_thread(struct task_struct *p)

mm/oom_kill.c

Lines changed: 48 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -416,20 +416,36 @@ bool oom_killer_disabled __read_mostly;
416416
* victim (if that is possible) to help the OOM killer to move on.
417417
*/
418418
static struct task_struct *oom_reaper_th;
419-
static struct mm_struct *mm_to_reap;
419+
static struct task_struct *task_to_reap;
420420
static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
421421

422-
static bool __oom_reap_vmas(struct mm_struct *mm)
422+
static bool __oom_reap_task(struct task_struct *tsk)
423423
{
424424
struct mmu_gather tlb;
425425
struct vm_area_struct *vma;
426+
struct mm_struct *mm;
427+
struct task_struct *p;
426428
struct zap_details details = {.check_swap_entries = true,
427429
.ignore_dirty = true};
428430
bool ret = true;
429431

430-
/* We might have raced with exit path */
431-
if (!atomic_inc_not_zero(&mm->mm_users))
432+
/*
433+
* Make sure we find the associated mm_struct even when the particular
434+
* thread has already terminated and cleared its mm.
435+
* We might have raced with the exit path so consider our work done if there
436+
* is no mm.
437+
*/
438+
p = find_lock_task_mm(tsk);
439+
if (!p)
440+
return true;
441+
442+
mm = p->mm;
443+
if (!atomic_inc_not_zero(&mm->mm_users)) {
444+
task_unlock(p);
432445
return true;
446+
}
447+
448+
task_unlock(p);
433449

434450
if (!down_read_trylock(&mm->mmap_sem)) {
435451
ret = false;
@@ -464,60 +480,66 @@ static bool __oom_reap_vmas(struct mm_struct *mm)
464480
}
465481
tlb_finish_mmu(&tlb, 0, -1);
466482
up_read(&mm->mmap_sem);
483+
484+
/*
485+
* Clear TIF_MEMDIE because the task shouldn't be sitting on any
486+
* reasonably reclaimable memory anymore. OOM killer can continue
487+
* by selecting another victim if unmapping hasn't led to any
488+
* improvements. This also means that selecting this task doesn't
489+
* make any sense.
490+
*/
491+
tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
492+
exit_oom_victim(tsk);
467493
out:
468494
mmput(mm);
469495
return ret;
470496
}
471497

472-
static void oom_reap_vmas(struct mm_struct *mm)
498+
static void oom_reap_task(struct task_struct *tsk)
473499
{
474500
int attempts = 0;
475501

476502
/* Retry the down_read_trylock(mmap_sem) a few times */
477-
while (attempts++ < 10 && !__oom_reap_vmas(mm))
503+
while (attempts++ < 10 && !__oom_reap_task(tsk))
478504
schedule_timeout_idle(HZ/10);
479505

480506
/* Drop a reference taken by wake_oom_reaper */
481-
mmdrop(mm);
507+
put_task_struct(tsk);
482508
}
483509

484510
static int oom_reaper(void *unused)
485511
{
486512
while (true) {
487-
struct mm_struct *mm;
513+
struct task_struct *tsk;
488514

489515
wait_event_freezable(oom_reaper_wait,
490-
(mm = READ_ONCE(mm_to_reap)));
491-
oom_reap_vmas(mm);
492-
WRITE_ONCE(mm_to_reap, NULL);
516+
(tsk = READ_ONCE(task_to_reap)));
517+
oom_reap_task(tsk);
518+
WRITE_ONCE(task_to_reap, NULL);
493519
}
494520

495521
return 0;
496522
}
497523

498-
static void wake_oom_reaper(struct mm_struct *mm)
524+
static void wake_oom_reaper(struct task_struct *tsk)
499525
{
500-
struct mm_struct *old_mm;
526+
struct task_struct *old_tsk;
501527

502528
if (!oom_reaper_th)
503529
return;
504530

505-
/*
506-
* Pin the given mm. Use mm_count instead of mm_users because
507-
* we do not want to delay the address space tear down.
508-
*/
509-
atomic_inc(&mm->mm_count);
531+
get_task_struct(tsk);
510532

511533
/*
512534
* Make sure that only a single mm is ever queued for the reaper
513535
* because multiple are not necessary and the operation might be
514536
* disruptive so better reduce it to the bare minimum.
515537
*/
516-
old_mm = cmpxchg(&mm_to_reap, NULL, mm);
517-
if (!old_mm)
538+
old_tsk = cmpxchg(&task_to_reap, NULL, tsk);
539+
if (!old_tsk)
518540
wake_up(&oom_reaper_wait);
519541
else
520-
mmdrop(mm);
542+
put_task_struct(tsk);
521543
}
522544

523545
static int __init oom_init(void)
@@ -532,7 +554,7 @@ static int __init oom_init(void)
532554
}
533555
subsys_initcall(oom_init)
534556
#else
535-
static void wake_oom_reaper(struct mm_struct *mm)
557+
static void wake_oom_reaper(struct task_struct *tsk)
536558
{
537559
}
538560
#endif
@@ -563,9 +585,10 @@ void mark_oom_victim(struct task_struct *tsk)
563585
/**
564586
* exit_oom_victim - note the exit of an OOM victim
565587
*/
566-
void exit_oom_victim(void)
588+
void exit_oom_victim(struct task_struct *tsk)
567589
{
568-
clear_thread_flag(TIF_MEMDIE);
590+
if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
591+
return;
569592

570593
if (!atomic_dec_return(&oom_victims))
571594
wake_up_all(&oom_victims_wait);
@@ -748,7 +771,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
748771
rcu_read_unlock();
749772

750773
if (can_oom_reap)
751-
wake_oom_reaper(mm);
774+
wake_oom_reaper(victim);
752775

753776
mmdrop(mm);
754777
put_task_struct(victim);

0 commit comments

Comments
 (0)