Skip to content

Commit b4e98d9

Browse files
kiryl authored and torvalds committed
mm: account pud page tables
On a machine with 5-level paging support a process can allocate significant amount of memory and stay unnoticed by oom-killer and memory cgroup. The trick is to allocate a lot of PUD page tables. We don't account PUD page tables, only PMD and PTE.

We already addressed the same issue for PMD page tables, see commit dc6c9a3 ("mm: account pmd page tables to the process"). Introduction of 5-level paging brings the same issue for PUD page tables.

The patch expands accounting to PUD level.

[kirill.shutemov@linux.intel.com: s/pmd_t/pud_t/]
Link: http://lkml.kernel.org/r/20171004074305.x35eh5u7ybbt5kar@black.fi.intel.com
[heiko.carstens@de.ibm.com: s390/mm: fix pud table accounting]
Link: http://lkml.kernel.org/r/20171103090551.18231-1-heiko.carstens@de.ibm.com
Link: http://lkml.kernel.org/r/20171002080427.3320-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 7d6c4df commit b4e98d9

File tree

11 files changed

+71
-20
lines changed

11 files changed

+71
-20
lines changed

Documentation/sysctl/vm.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -629,10 +629,10 @@ oom_dump_tasks
629629

630630
Enables a system-wide task dump (excluding kernel threads) to be produced
631631
when the kernel performs an OOM-killing and includes such information as
632-
pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj
633-
score, and name. This is helpful to determine why the OOM killer was
634-
invoked, to identify the rogue task that caused it, and to determine why
635-
the OOM killer chose the task it did to kill.
632+
pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, nr_puds, swapents,
633+
oom_score_adj score, and name. This is helpful to determine why the OOM
634+
killer was invoked, to identify the rogue task that caused it, and to
635+
determine why the OOM killer chose the task it did to kill.
636636

637637
If this is set to zero, this information is suppressed. On very
638638
large systems with thousands of tasks it may not be feasible to dump

arch/powerpc/mm/hugetlbpage.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
433433
pud = pud_offset(pgd, start);
434434
pgd_clear(pgd);
435435
pud_free_tlb(tlb, pud, start);
436+
mm_dec_nr_puds(tlb->mm);
436437
}
437438

438439
/*

arch/s390/include/asm/mmu_context.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ static inline int init_new_context(struct task_struct *tsk,
4444
mm->context.asce_limit = STACK_TOP_MAX;
4545
mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
4646
_ASCE_USER_BITS | _ASCE_TYPE_REGION3;
47+
/* pgd_alloc() did not account this pud */
48+
mm_inc_nr_puds(mm);
4749
break;
4850
case -PAGE_SIZE:
4951
/* forked 5-level task, set new asce with new_mm->pgd */
@@ -59,7 +61,7 @@ static inline int init_new_context(struct task_struct *tsk,
5961
/* forked 2-level compat task, set new asce with new mm->pgd */
6062
mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
6163
_ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
62-
/* pgd_alloc() did not increase mm->nr_pmds */
64+
/* pgd_alloc() did not account this pmd */
6365
mm_inc_nr_pmds(mm);
6466
}
6567
crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));

arch/sparc/mm/hugetlbpage.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -472,6 +472,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
472472
pud = pud_offset(pgd, start);
473473
pgd_clear(pgd);
474474
pud_free_tlb(tlb, pud, start);
475+
mm_dec_nr_puds(tlb->mm);
475476
}
476477

477478
void hugetlb_free_pgd_range(struct mmu_gather *tlb,

fs/proc/task_mmu.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
void task_mem(struct seq_file *m, struct mm_struct *mm)
2828
{
29-
unsigned long text, lib, swap, ptes, pmds, anon, file, shmem;
29+
unsigned long text, lib, swap, ptes, pmds, puds, anon, file, shmem;
3030
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
3131

3232
anon = get_mm_counter(mm, MM_ANONPAGES);
@@ -52,6 +52,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
5252
swap = get_mm_counter(mm, MM_SWAPENTS);
5353
ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
5454
pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
55+
puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm);
5556
seq_printf(m,
5657
"VmPeak:\t%8lu kB\n"
5758
"VmSize:\t%8lu kB\n"
@@ -68,6 +69,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
6869
"VmLib:\t%8lu kB\n"
6970
"VmPTE:\t%8lu kB\n"
7071
"VmPMD:\t%8lu kB\n"
72+
"VmPUD:\t%8lu kB\n"
7173
"VmSwap:\t%8lu kB\n",
7274
hiwater_vm << (PAGE_SHIFT-10),
7375
total_vm << (PAGE_SHIFT-10),
@@ -82,6 +84,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
8284
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
8385
ptes >> 10,
8486
pmds >> 10,
87+
puds >> 10,
8588
swap << (PAGE_SHIFT-10));
8689
hugetlb_report_usage(m, mm);
8790
}

include/linux/mm.h

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1599,14 +1599,44 @@ static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
15991599
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
16001600
#endif
16011601

1602-
#ifdef __PAGETABLE_PUD_FOLDED
1602+
#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU)
16031603
static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
16041604
unsigned long address)
16051605
{
16061606
return 0;
16071607
}
1608+
1609+
static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
1610+
{
1611+
return 0;
1612+
}
1613+
1614+
static inline void mm_nr_puds_init(struct mm_struct *mm) {}
1615+
static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
1616+
static inline void mm_dec_nr_puds(struct mm_struct *mm) {}
1617+
16081618
#else
16091619
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);
1620+
1621+
static inline void mm_nr_puds_init(struct mm_struct *mm)
1622+
{
1623+
atomic_long_set(&mm->nr_puds, 0);
1624+
}
1625+
1626+
static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
1627+
{
1628+
return atomic_long_read(&mm->nr_puds);
1629+
}
1630+
1631+
static inline void mm_inc_nr_puds(struct mm_struct *mm)
1632+
{
1633+
atomic_long_inc(&mm->nr_puds);
1634+
}
1635+
1636+
static inline void mm_dec_nr_puds(struct mm_struct *mm)
1637+
{
1638+
atomic_long_dec(&mm->nr_puds);
1639+
}
16101640
#endif
16111641

16121642
#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
@@ -1618,7 +1648,7 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
16181648

16191649
static inline void mm_nr_pmds_init(struct mm_struct *mm) {}
16201650

1621-
static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
1651+
static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
16221652
{
16231653
return 0;
16241654
}
@@ -1634,7 +1664,7 @@ static inline void mm_nr_pmds_init(struct mm_struct *mm)
16341664
atomic_long_set(&mm->nr_pmds, 0);
16351665
}
16361666

1637-
static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
1667+
static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
16381668
{
16391669
return atomic_long_read(&mm->nr_pmds);
16401670
}

include/linux/mm_types.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,9 @@ struct mm_struct {
404404
atomic_long_t nr_ptes; /* PTE page table pages */
405405
#if CONFIG_PGTABLE_LEVELS > 2
406406
atomic_long_t nr_pmds; /* PMD page table pages */
407+
#endif
408+
#if CONFIG_PGTABLE_LEVELS > 3
409+
atomic_long_t nr_puds; /* PUD page table pages */
407410
#endif
408411
int map_count; /* number of VMAs */
409412

kernel/fork.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -819,6 +819,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
819819
mm->core_state = NULL;
820820
atomic_long_set(&mm->nr_ptes, 0);
821821
mm_nr_pmds_init(mm);
822+
mm_nr_puds_init(mm);
822823
mm->map_count = 0;
823824
mm->locked_vm = 0;
824825
mm->pinned_vm = 0;
@@ -878,6 +879,9 @@ static void check_mm(struct mm_struct *mm)
878879
if (mm_nr_pmds(mm))
879880
pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
880881
mm_nr_pmds(mm));
882+
if (mm_nr_puds(mm))
883+
pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n",
884+
mm_nr_puds(mm));
881885

882886
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
883887
VM_BUG_ON_MM(mm->pmd_huge_pte, mm);

mm/debug.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,8 @@ void dump_mm(const struct mm_struct *mm)
105105
"get_unmapped_area %p\n"
106106
#endif
107107
"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
108-
"pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
108+
"pgd %p mm_users %d mm_count %d\n"
109+
"nr_ptes %lu nr_pmds %lu nr_puds %lu map_count %d\n"
109110
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
110111
"pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
111112
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -136,7 +137,8 @@ void dump_mm(const struct mm_struct *mm)
136137
mm->pgd, atomic_read(&mm->mm_users),
137138
atomic_read(&mm->mm_count),
138139
atomic_long_read((atomic_long_t *)&mm->nr_ptes),
139-
mm_nr_pmds((struct mm_struct *)mm),
140+
mm_nr_pmds(mm),
141+
mm_nr_puds(mm),
140142
mm->map_count,
141143
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
142144
mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,

mm/memory.c

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
506506
pud = pud_offset(p4d, start);
507507
p4d_clear(p4d);
508508
pud_free_tlb(tlb, pud, start);
509+
mm_dec_nr_puds(tlb->mm);
509510
}
510511

511512
static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -4149,15 +4150,17 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
41494150

41504151
spin_lock(&mm->page_table_lock);
41514152
#ifndef __ARCH_HAS_5LEVEL_HACK
4152-
if (p4d_present(*p4d)) /* Another has populated it */
4153-
pud_free(mm, new);
4154-
else
4153+
if (!p4d_present(*p4d)) {
4154+
mm_inc_nr_puds(mm);
41554155
p4d_populate(mm, p4d, new);
4156-
#else
4157-
if (pgd_present(*p4d)) /* Another has populated it */
4156+
} else /* Another has populated it */
41584157
pud_free(mm, new);
4159-
else
4158+
#else
4159+
if (!pgd_present(*p4d)) {
4160+
mm_inc_nr_puds(mm);
41604161
pgd_populate(mm, p4d, new);
4162+
} else /* Another has populated it */
4163+
pud_free(mm, new);
41614164
#endif /* __ARCH_HAS_5LEVEL_HACK */
41624165
spin_unlock(&mm->page_table_lock);
41634166
return 0;

mm/oom_kill.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
221221
* task's rss, pagetable and swap space use.
222222
*/
223223
points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
224-
atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
224+
atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm) +
225+
mm_nr_puds(p->mm);
225226
task_unlock(p);
226227

227228
/*
@@ -397,7 +398,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
397398
struct task_struct *p;
398399
struct task_struct *task;
399400

400-
pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n");
401+
pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds nr_puds swapents oom_score_adj name\n");
401402
rcu_read_lock();
402403
for_each_process(p) {
403404
if (oom_unkillable_task(p, memcg, nodemask))
@@ -413,11 +414,12 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
413414
continue;
414415
}
415416

416-
pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n",
417+
pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu %5hd %s\n",
417418
task->pid, from_kuid(&init_user_ns, task_uid(task)),
418419
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
419420
atomic_long_read(&task->mm->nr_ptes),
420421
mm_nr_pmds(task->mm),
422+
mm_nr_puds(task->mm),
421423
get_mm_counter(task->mm, MM_SWAPENTS),
422424
task->signal->oom_score_adj, task->comm);
423425
task_unlock(task);

0 commit comments

Comments
 (0)