Commit ad361f0

KVM: ARM: Support hugetlbfs backed huge pages
Support huge pages in KVM/ARM and KVM/ARM64. The pud_huge checking on the
unmap path may feel a bit silly as the pud_huge check is always defined to
false, but the compiler should be smart about this.

Note: This deals only with VMAs marked as huge which are allocated by users
through hugetlbfs only. Transparent huge pages can only be detected by
looking at the underlying pages (or the page tables themselves) and this
patch so far simply maps these on a page-by-page level in the Stage-2 page
tables.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
1 parent 86ed81a commit ad361f0
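For context on how this path gets exercised: the new huge-page mapping only
triggers when the memslot's userspace range lives in a hugetlbfs VMA. Below is
a minimal userspace sketch of such a setup. It is illustrative only and not
part of this patch; the slot number, guest IPA, and 2 MB size are assumptions,
and the host needs huge pages reserved (e.g. via vm.nr_hugepages) for the
mmap() to succeed.

/*
 * Illustrative sketch only -- not part of this commit. Shows one way a VMM
 * could back guest memory with hugetlbfs so that user_mem_abort() takes the
 * huge-page path. Slot number, guest IPA and size are made-up values.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);
	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
	size_t size = 2UL << 20;			/* one 2 MB huge page */

	/* Anonymous hugetlbfs-backed mapping; mmap()ing a file on a mounted
	 * hugetlbfs would work the same way. Needs reserved huge pages. */
	void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (kvm_fd < 0 || vm_fd < 0 || mem == MAP_FAILED) {
		perror("setup");
		return 1;
	}

	struct kvm_userspace_memory_region region = {
		.slot            = 0,
		.guest_phys_addr = 0x80000000,		/* assumed guest IPA */
		.memory_size     = size,
		.userspace_addr  = (unsigned long)mem,
	};
	if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0) {
		perror("KVM_SET_USER_MEMORY_REGION");
		return 1;
	}

	/* A stage-2 translation fault anywhere in this slot can now be
	 * satisfied by a single huge PMD in user_mem_abort(). */
	return 0;
}

With memory registered this way, user_mem_abort() finds a hugetlbfs VMA behind
the faulting hva, sets hugetlb = true, and maps the whole PMD-sized region with
one stage-2 entry instead of 512 individual PTEs.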

File tree (5 files changed, +158 −44 lines):

arch/arm/include/asm/kvm_mmu.h
arch/arm/include/asm/pgtable-3level.h
arch/arm/kvm/mmu.c
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/include/asm/pgtable-hwdef.h

arch/arm/include/asm/kvm_mmu.h

Lines changed: 14 additions & 3 deletions
@@ -62,6 +62,12 @@ phys_addr_t kvm_get_idmap_vector(void);
 int kvm_mmu_init(void);
 void kvm_clear_hyp_idmap(void);
 
+static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd)
+{
+	*pmd = new_pmd;
+	flush_pmd_entry(pmd);
+}
+
 static inline void kvm_set_pte(pte_t *pte, pte_t new_pte)
 {
 	*pte = new_pte;
@@ -103,9 +109,15 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
 	pte_val(*pte) |= L_PTE_S2_RDWR;
 }
 
+static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
+{
+	pmd_val(*pmd) |= L_PMD_S2_RDWR;
+}
+
 struct kvm;
 
-static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
+static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
+					      unsigned long size)
 {
 	/*
 	 * If we are going to insert an instruction page and the icache is
@@ -120,8 +132,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
 	 * need any kind of flushing (DDI 0406C.b - Page B3-1392).
 	 */
 	if (icache_is_pipt()) {
-		unsigned long hva = gfn_to_hva(kvm, gfn);
-		__cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
+		__cpuc_coherent_user_range(hva, hva + size);
 	} else if (!icache_is_vivt_asid_tagged()) {
 		/* any kind of VIPT cache */
 		__flush_icache_all();
arch/arm/include/asm/pgtable-3level.h

Lines changed: 2 additions & 0 deletions
@@ -126,6 +126,8 @@
 #define L_PTE_S2_RDONLY		(_AT(pteval_t, 1) << 6)   /* HAP[1]   */
 #define L_PTE_S2_RDWR		(_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
 
+#define L_PMD_S2_RDWR		(_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
+
 /*
  * Hyp-mode PL2 PTE definitions for LPAE.
  */
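For context on the value being added here: the stage-2 access-permission field
HAP[2:1] sits in bits [7:6] of an LPAE descriptor, so L_PTE_S2_RDONLY (1 << 6)
sets only the read-permission bit while the new L_PMD_S2_RDWR (3 << 6) sets
read and write, mirroring L_PTE_S2_RDWR one level up. A tiny standalone check
of that arithmetic follows; it is illustrative only, with the constants
restated as assumptions rather than taken from the kernel headers.

/* Illustrative only: restates the HAP[2:1] encoding used by the patch. */
#include <assert.h>
#include <stdint.h>

#define S2_HAP_SHIFT	6
#define S2_HAP_READ	(UINT64_C(1) << S2_HAP_SHIFT)	/* read permitted  */
#define S2_HAP_WRITE	(UINT64_C(2) << S2_HAP_SHIFT)	/* write permitted */

int main(void)
{
	/* Matches L_PTE_S2_RDONLY == 1 << 6 and L_PMD_S2_RDWR == 3 << 6. */
	assert(S2_HAP_READ == (UINT64_C(1) << 6));			/* 0x40 */
	assert((S2_HAP_READ | S2_HAP_WRITE) == (UINT64_C(3) << 6));	/* 0xc0 */
	return 0;
}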

arch/arm/kvm/mmu.c

Lines changed: 131 additions & 38 deletions
@@ -19,6 +19,7 @@
 #include <linux/mman.h>
 #include <linux/kvm_host.h>
 #include <linux/io.h>
+#include <linux/hugetlb.h>
 #include <trace/events/kvm.h>
 #include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
@@ -41,6 +42,8 @@ static unsigned long hyp_idmap_start;
 static unsigned long hyp_idmap_end;
 static phys_addr_t hyp_idmap_vector;
 
+#define kvm_pmd_huge(_x)	(pmd_huge(_x))
+
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
 	/*
@@ -93,19 +96,29 @@ static bool page_empty(void *ptr)
 
 static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
 {
-	pmd_t *pmd_table = pmd_offset(pud, 0);
-	pud_clear(pud);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	pmd_free(NULL, pmd_table);
+	if (pud_huge(*pud)) {
+		pud_clear(pud);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+	} else {
+		pmd_t *pmd_table = pmd_offset(pud, 0);
+		pud_clear(pud);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+		pmd_free(NULL, pmd_table);
+	}
 	put_page(virt_to_page(pud));
 }
 
 static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
 {
-	pte_t *pte_table = pte_offset_kernel(pmd, 0);
-	pmd_clear(pmd);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	pte_free_kernel(NULL, pte_table);
+	if (kvm_pmd_huge(*pmd)) {
+		pmd_clear(pmd);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+	} else {
+		pte_t *pte_table = pte_offset_kernel(pmd, 0);
+		pmd_clear(pmd);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+		pte_free_kernel(NULL, pte_table);
+	}
 	put_page(virt_to_page(pmd));
 }
 
@@ -136,18 +149,32 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
 			continue;
 		}
 
+		if (pud_huge(*pud)) {
+			/*
+			 * If we are dealing with a huge pud, just clear it and
+			 * move on.
+			 */
+			clear_pud_entry(kvm, pud, addr);
+			addr = pud_addr_end(addr, end);
+			continue;
+		}
+
 		pmd = pmd_offset(pud, addr);
 		if (pmd_none(*pmd)) {
 			addr = pmd_addr_end(addr, end);
 			continue;
 		}
 
-		pte = pte_offset_kernel(pmd, addr);
-		clear_pte_entry(kvm, pte, addr);
-		next = addr + PAGE_SIZE;
+		if (!kvm_pmd_huge(*pmd)) {
+			pte = pte_offset_kernel(pmd, addr);
+			clear_pte_entry(kvm, pte, addr);
+			next = addr + PAGE_SIZE;
+		}
 
-		/* If we emptied the pte, walk back up the ladder */
-		if (page_empty(pte)) {
+		/*
+		 * If the pmd entry is to be cleared, walk back up the ladder
+		 */
+		if (kvm_pmd_huge(*pmd) || page_empty(pte)) {
 			clear_pmd_entry(kvm, pmd, addr);
 			next = pmd_addr_end(addr, end);
 			if (page_empty(pmd) && !page_empty(pud)) {
@@ -420,29 +447,71 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
 	kvm->arch.pgd = NULL;
 }
 
-
-static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
+static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+			     phys_addr_t addr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
-	pte_t *pte, old_pte;
 
-	/* Create 2nd stage page table mapping - Level 1 */
 	pgd = kvm->arch.pgd + pgd_index(addr);
 	pud = pud_offset(pgd, addr);
 	if (pud_none(*pud)) {
 		if (!cache)
-			return 0; /* ignore calls from kvm_set_spte_hva */
+			return NULL;
 		pmd = mmu_memory_cache_alloc(cache);
 		pud_populate(NULL, pud, pmd);
 		get_page(virt_to_page(pud));
 	}
 
-	pmd = pmd_offset(pud, addr);
+	return pmd_offset(pud, addr);
+}
+
+static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
+			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
+{
+	pmd_t *pmd, old_pmd;
+
+	pmd = stage2_get_pmd(kvm, cache, addr);
+	VM_BUG_ON(!pmd);
 
-	/* Create 2nd stage page table mapping - Level 2 */
+	/*
+	 * Mapping in huge pages should only happen through a fault.  If a
+	 * page is merged into a transparent huge page, the individual
+	 * subpages of that huge page should be unmapped through MMU
+	 * notifiers before we get here.
+	 *
+	 * Merging of CompoundPages is not supported; they should become
+	 * splitting first, unmapped, merged, and mapped back in on-demand.
+	 */
+	VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
+
+	old_pmd = *pmd;
+	kvm_set_pmd(pmd, *new_pmd);
+	if (pmd_present(old_pmd))
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+	else
+		get_page(virt_to_page(pmd));
+	return 0;
+}
+
+static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
+{
+	pmd_t *pmd;
+	pte_t *pte, old_pte;
+
+	/* Create stage-2 page table mapping - Level 1 */
+	pmd = stage2_get_pmd(kvm, cache, addr);
+	if (!pmd) {
+		/*
+		 * Ignore calls from kvm_set_spte_hva for unallocated
+		 * address ranges.
+		 */
+		return 0;
+	}
+
+	/* Create stage-2 page mappings - Level 2 */
 	if (pmd_none(*pmd)) {
 		if (!cache)
 			return 0; /* ignore calls from kvm_set_spte_hva */
@@ -508,22 +577,34 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 }
 
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
-			  gfn_t gfn, struct kvm_memory_slot *memslot,
+			  struct kvm_memory_slot *memslot,
 			  unsigned long fault_status)
 {
-	pte_t new_pte;
-	pfn_t pfn;
 	int ret;
-	bool write_fault, writable;
+	bool write_fault, writable, hugetlb = false;
 	unsigned long mmu_seq;
+	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
+	unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
+	struct kvm *kvm = vcpu->kvm;
 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+	struct vm_area_struct *vma;
+	pfn_t pfn;
 
 	write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
 	if (fault_status == FSC_PERM && !write_fault) {
 		kvm_err("Unexpected L2 read permission error\n");
 		return -EFAULT;
 	}
 
+	/* Let's check if we will get back a huge page backed by hugetlbfs */
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma_intersection(current->mm, hva, hva + 1);
+	if (is_vm_hugetlb_page(vma)) {
+		hugetlb = true;
+		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
+	}
+	up_read(&current->mm->mmap_sem);
+
 	/* We need minimum second+third level pages */
 	ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
 	if (ret)
@@ -541,26 +622,38 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	 */
 	smp_rmb();
 
-	pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
+	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
 	if (is_error_pfn(pfn))
 		return -EFAULT;
 
-	new_pte = pfn_pte(pfn, PAGE_S2);
-	coherent_icache_guest_page(vcpu->kvm, gfn);
-
-	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+	spin_lock(&kvm->mmu_lock);
+	if (mmu_notifier_retry(kvm, mmu_seq))
 		goto out_unlock;
-	if (writable) {
-		kvm_set_s2pte_writable(&new_pte);
-		kvm_set_pfn_dirty(pfn);
+
+	if (hugetlb) {
+		pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2);
+		new_pmd = pmd_mkhuge(new_pmd);
+		if (writable) {
+			kvm_set_s2pmd_writable(&new_pmd);
+			kvm_set_pfn_dirty(pfn);
+		}
+		coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE);
+		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
+	} else {
+		pte_t new_pte = pfn_pte(pfn, PAGE_S2);
+		if (writable) {
+			kvm_set_s2pte_writable(&new_pte);
+			kvm_set_pfn_dirty(pfn);
+		}
+		coherent_icache_guest_page(kvm, hva, PAGE_SIZE);
+		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
 	}
-	stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false);
+
 
 out_unlock:
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	spin_unlock(&kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
-	return 0;
+	return ret;
 }
 
 /**
@@ -629,7 +722,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 	memslot = gfn_to_memslot(vcpu->kvm, gfn);
 
-	ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot, fault_status);
+	ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status);
 	if (ret == 0)
 		ret = 1;
 out_unlock:

arch/arm64/include/asm/kvm_mmu.h

Lines changed: 9 additions & 3 deletions
@@ -91,6 +91,7 @@ int kvm_mmu_init(void);
 void kvm_clear_hyp_idmap(void);
 
 #define kvm_set_pte(ptep, pte)		set_pte(ptep, pte)
+#define kvm_set_pmd(pmdp, pmd)		set_pmd(pmdp, pmd)
 
 static inline bool kvm_is_write_fault(unsigned long esr)
 {
@@ -116,13 +117,18 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
 	pte_val(*pte) |= PTE_S2_RDWR;
 }
 
+static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
+{
+	pmd_val(*pmd) |= PMD_S2_RDWR;
+}
+
 struct kvm;
 
-static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
+static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
+					      unsigned long size)
 {
 	if (!icache_is_aliasing()) {		/* PIPT */
-		unsigned long hva = gfn_to_hva(kvm, gfn);
-		flush_icache_range(hva, hva + PAGE_SIZE);
+		flush_icache_range(hva, hva + size);
 	} else if (!icache_is_aivivt()) {	/* non ASID-tagged VIVT */
 		/* any kind of VIPT cache */
 		__flush_icache_all();

arch/arm64/include/asm/pgtable-hwdef.h

Lines changed: 2 additions & 0 deletions
@@ -85,6 +85,8 @@
 #define PTE_S2_RDONLY		(_AT(pteval_t, 1) << 6)   /* HAP[2:1] */
 #define PTE_S2_RDWR		(_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
 
+#define PMD_S2_RDWR		(_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
+
 /*
  * Memory Attribute override for Stage-2 (MemAttr[3:0])
  */
