
Commit 690edec

Merge tag 'kvmarm-fixes-for-5.1' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into kvm-master
KVM/ARM fixes for 5.1

- Fix THP handling in the presence of pre-existing PTEs
- Honor request for PTE mappings even when THPs are available
- GICv4 performance improvement
- Take the srcu lock when writing to guest-controlled ITS data structures
- Reset the virtual PMU in preemptible context
- Various cleanups
2 parents e2788c4 + 8324c3d commit 690edec

9 files changed, +133 -75 lines

arch/arm/include/asm/kvm_mmu.h

Lines changed: 11 additions & 0 deletions
@@ -381,6 +381,17 @@ static inline int kvm_read_guest_lock(struct kvm *kvm,
 	return ret;
 }
 
+static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
+				       const void *data, unsigned long len)
+{
+	int srcu_idx = srcu_read_lock(&kvm->srcu);
+	int ret = kvm_write_guest(kvm, gpa, data, len);
+
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	return ret;
+}
+
 static inline void *kvm_get_hyp_vector(void)
 {
 	switch(read_cpuid_part()) {
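The helper added above mirrors kvm_read_guest_lock(): it wraps kvm_write_guest() in an SRCU read-side critical section so that the memslot lookup is safe for callers that do not already hold kvm->srcu, such as the ITS code that writes its tables back to guest memory (the "srcu lock" item in the merge description). A minimal caller sketch, hypothetical and not part of this commit:

	static int save_one_entry(struct kvm *kvm, gpa_t gpa, u64 val)
	{
		__le64 data = cpu_to_le64(val);

		/* Takes and drops kvm->srcu around the memslot lookup. */
		return kvm_write_guest_lock(kvm, gpa, &data, sizeof(data));
	}

The arm64 header below gains the identical helper.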

arch/arm/include/asm/stage2_pgtable.h

Lines changed: 2 additions & 0 deletions
@@ -75,6 +75,8 @@ static inline bool kvm_stage2_has_pud(struct kvm *kvm)
 
 #define S2_PMD_MASK				PMD_MASK
 #define S2_PMD_SIZE				PMD_SIZE
+#define S2_PUD_MASK				PUD_MASK
+#define S2_PUD_SIZE				PUD_SIZE
 
 static inline bool kvm_stage2_has_pmd(struct kvm *kvm)
 {
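The new S2_PUD_MASK/S2_PUD_SIZE definitions mirror the existing S2_PMD_* pair on 32-bit arm; they are consumed by the PUD-level retry path added to virt/kvm/arm/mmu.c further down, roughly:

	/* From the stage2_set_pud_huge() hunk below, quoted here for context: */
	if (!stage2_pud_huge(kvm, old_pud)) {
		unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
		goto retry;
	}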

arch/arm64/include/asm/kvm_mmu.h

Lines changed: 11 additions & 0 deletions
@@ -445,6 +445,17 @@ static inline int kvm_read_guest_lock(struct kvm *kvm,
 	return ret;
 }
 
+static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
+				       const void *data, unsigned long len)
+{
+	int srcu_idx = srcu_read_lock(&kvm->srcu);
+	int ret = kvm_write_guest(kvm, gpa, data, len);
+
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	return ret;
+}
+
 #ifdef CONFIG_KVM_INDIRECT_VECTORS
 /*
  * EL2 vectors can be mapped and rerouted in a number of ways,

arch/arm64/kvm/reset.c

Lines changed: 3 additions & 3 deletions
@@ -123,6 +123,9 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	int ret = -EINVAL;
 	bool loaded;
 
+	/* Reset PMU outside of the non-preemptible section */
+	kvm_pmu_vcpu_reset(vcpu);
+
 	preempt_disable();
 	loaded = (vcpu->cpu != -1);
 	if (loaded)
@@ -170,9 +173,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 		vcpu->arch.reset_state.reset = false;
 	}
 
-	/* Reset PMU */
-	kvm_pmu_vcpu_reset(vcpu);
-
 	/* Default workaround setup is enabled (if supported) */
 	if (kvm_arm_have_ssbd() == KVM_SSBD_KERNEL)
 		vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG;

virt/kvm/arm/hyp/vgic-v3-sr.c

Lines changed: 2 additions & 2 deletions
@@ -222,7 +222,7 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
 		}
 	}
 
-	if (used_lrs) {
+	if (used_lrs || cpu_if->its_vpe.its_vm) {
 		int i;
 		u32 elrsr;
 
@@ -247,7 +247,7 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
 	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
 	int i;
 
-	if (used_lrs) {
+	if (used_lrs || cpu_if->its_vpe.its_vm) {
 		write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
 
 		for (i = 0; i < used_lrs; i++)

virt/kvm/arm/mmu.c

Lines changed: 73 additions & 52 deletions
@@ -102,8 +102,7 @@ static bool kvm_is_device_pfn(unsigned long pfn)
  * @addr:	IPA
  * @pmd:	pmd pointer for IPA
  *
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
- * pages in the range dirty.
+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
  */
 static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
 {
@@ -121,8 +120,7 @@ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
  * @addr:	IPA
  * @pud:	pud pointer for IPA
  *
- * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. Marks all
- * pages in the range dirty.
+ * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
  */
 static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
 {
@@ -899,9 +897,8 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
  * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
  * @kvm:	The KVM struct pointer for the VM.
  *
- * Allocates only the stage-2 HW PGD level table(s) (can support either full
- * 40-bit input addresses or limited to 32-bit input addresses). Clears the
- * allocated pages.
+ * Allocates only the stage-2 HW PGD level table(s) of size defined by
+ * stage2_pgd_size(kvm).
  *
  * Note we don't need locking here as this is only called when the VM is
  * created, which can only be done once.
@@ -1067,25 +1064,43 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 {
 	pmd_t *pmd, old_pmd;
 
+retry:
 	pmd = stage2_get_pmd(kvm, cache, addr);
 	VM_BUG_ON(!pmd);
 
 	old_pmd = *pmd;
+	/*
+	 * Multiple vcpus faulting on the same PMD entry, can
+	 * lead to them sequentially updating the PMD with the
+	 * same value. Following the break-before-make
+	 * (pmd_clear() followed by tlb_flush()) process can
+	 * hinder forward progress due to refaults generated
+	 * on missing translations.
+	 *
+	 * Skip updating the page table if the entry is
+	 * unchanged.
+	 */
+	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
+		return 0;
+
 	if (pmd_present(old_pmd)) {
 		/*
-		 * Multiple vcpus faulting on the same PMD entry, can
-		 * lead to them sequentially updating the PMD with the
-		 * same value. Following the break-before-make
-		 * (pmd_clear() followed by tlb_flush()) process can
-		 * hinder forward progress due to refaults generated
-		 * on missing translations.
+		 * If we already have PTE level mapping for this block,
+		 * we must unmap it to avoid inconsistent TLB state and
+		 * leaking the table page. We could end up in this situation
+		 * if the memory slot was marked for dirty logging and was
+		 * reverted, leaving PTE level mappings for the pages accessed
+		 * during the period. So, unmap the PTE level mapping for this
+		 * block and retry, as we could have released the upper level
+		 * table in the process.
 		 *
-		 * Skip updating the page table if the entry is
-		 * unchanged.
+		 * Normal THP split/merge follows mmu_notifier callbacks and do
+		 * get handled accordingly.
 		 */
-		if (pmd_val(old_pmd) == pmd_val(*new_pmd))
-			return 0;
-
+		if (!pmd_thp_or_huge(old_pmd)) {
+			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
+			goto retry;
+		}
 		/*
 		 * Mapping in huge pages should only happen through a
 		 * fault. If a page is merged into a transparent huge
@@ -1097,8 +1112,7 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 		 * should become splitting first, unmapped, merged,
 		 * and mapped back in on-demand.
 		 */
-		VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
-
+		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
 		pmd_clear(pmd);
 		kvm_tlb_flush_vmid_ipa(kvm, addr);
 	} else {
@@ -1114,21 +1128,31 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac
 {
 	pud_t *pudp, old_pud;
 
+retry:
 	pudp = stage2_get_pud(kvm, cache, addr);
 	VM_BUG_ON(!pudp);
 
 	old_pud = *pudp;
 
 	/*
 	 * A large number of vcpus faulting on the same stage 2 entry,
-	 * can lead to a refault due to the
-	 * stage2_pud_clear()/tlb_flush(). Skip updating the page
-	 * tables if there is no change.
+	 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
+	 * Skip updating the page tables if there is no change.
 	 */
 	if (pud_val(old_pud) == pud_val(*new_pudp))
 		return 0;
 
 	if (stage2_pud_present(kvm, old_pud)) {
+		/*
+		 * If we already have table level mapping for this block, unmap
+		 * the range for this block and retry.
+		 */
+		if (!stage2_pud_huge(kvm, old_pud)) {
+			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
+			goto retry;
+		}
+
+		WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
 		stage2_pud_clear(kvm, pudp);
 		kvm_tlb_flush_vmid_ipa(kvm, addr);
 	} else {
@@ -1451,13 +1475,11 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
 }
 
 /**
- * stage2_wp_puds - write protect PGD range
- * @pgd:	pointer to pgd entry
- * @addr:	range start address
- * @end:	range end address
- *
- * Process PUD entries, for a huge PUD we cause a panic.
- */
+ * stage2_wp_puds - write protect PGD range
+ * @pgd:	pointer to pgd entry
+ * @addr:	range start address
+ * @end:	range end address
+ */
 static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
 			   phys_addr_t addr, phys_addr_t end)
 {
@@ -1594,8 +1616,9 @@ static void kvm_send_hwpoison_signal(unsigned long address,
 	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
 }
 
-static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
-					       unsigned long hva)
+static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
+					       unsigned long hva,
+					       unsigned long map_size)
 {
 	gpa_t gpa_start;
 	hva_t uaddr_start, uaddr_end;
@@ -1610,34 +1633,34 @@ static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
 
 	/*
 	 * Pages belonging to memslots that don't have the same alignment
-	 * within a PMD for userspace and IPA cannot be mapped with stage-2
-	 * PMD entries, because we'll end up mapping the wrong pages.
+	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
+	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
 	 *
 	 * Consider a layout like the following:
 	 *
 	 *    memslot->userspace_addr:
 	 *    +-----+--------------------+--------------------+---+
-	 *    |abcde|fgh  Stage-1 PMD    |    Stage-1 PMD   tv|xyz|
+	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
 	 *    +-----+--------------------+--------------------+---+
 	 *
 	 *    memslot->base_gfn << PAGE_SIZE:
 	 *      +---+--------------------+--------------------+-----+
-	 *      |abc|def  Stage-2 PMD    |    Stage-2 PMD     |tvxyz|
+	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
 	 *      +---+--------------------+--------------------+-----+
 	 *
-	 * If we create those stage-2 PMDs, we'll end up with this incorrect
+	 * If we create those stage-2 blocks, we'll end up with this incorrect
 	 * mapping:
 	 *   d -> f
 	 *   e -> g
 	 *   f -> h
 	 */
-	if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK))
+	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
 		return false;
 
 	/*
 	 * Next, let's make sure we're not trying to map anything not covered
-	 * by the memslot. This means we have to prohibit PMD size mappings
-	 * for the beginning and end of a non-PMD aligned and non-PMD sized
+	 * by the memslot. This means we have to prohibit block size mappings
+	 * for the beginning and end of a non-block aligned and non-block sized
 	 * memory slot (illustrated by the head and tail parts of the
 	 * userspace view above containing pages 'abcde' and 'xyz',
 	 * respectively).
@@ -1646,8 +1669,8 @@ static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
 	 * userspace_addr or the base_gfn, as both are equally aligned (per
 	 * the check above) and equally sized.
 	 */
-	return (hva & S2_PMD_MASK) >= uaddr_start &&
-	       (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end;
+	return (hva & ~(map_size - 1)) >= uaddr_start &&
+	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
 }
 
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
@@ -1676,12 +1699,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
-	if (!fault_supports_stage2_pmd_mappings(memslot, hva))
-		force_pte = true;
-
-	if (logging_active)
-		force_pte = true;
-
 	/* Let's check if we will get back a huge page backed by hugetlbfs */
 	down_read(&current->mm->mmap_sem);
 	vma = find_vma_intersection(current->mm, hva, hva + 1);
@@ -1692,18 +1709,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	}
 
 	vma_pagesize = vma_kernel_pagesize(vma);
+	if (logging_active ||
+	    !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
+		force_pte = true;
+		vma_pagesize = PAGE_SIZE;
+	}
+
 	/*
 	 * The stage2 has a minimum of 2 level table (For arm64 see
 	 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
 	 * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
 	 * As for PUD huge maps, we must make sure that we have at least
 	 * 3 levels, i.e, PMD is not folded.
	 */
-	if ((vma_pagesize == PMD_SIZE ||
-	     (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) &&
-	    !force_pte) {
+	if (vma_pagesize == PMD_SIZE ||
+	    (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
 		gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
-	}
 	up_read(&current->mm->mmap_sem);
 
 	/* We need minimum second+third level pages */
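To illustrate the generalized alignment test in fault_supports_stage2_huge_mapping() above, here is a stand-alone userspace model with made-up addresses; it is not part of the commit and the function name is invented for the example. With map_size equal to S2_PMD_SIZE it reduces to the old PMD-only check:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Stand-alone model of the generalized stage-2 huge-mapping test. */
	static bool fits_huge_mapping(uint64_t gpa_start, uint64_t uaddr_start,
				      uint64_t uaddr_end, uint64_t hva,
				      uint64_t map_size)
	{
		/* IPA and HVA must share the same offset within the block... */
		if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
			return false;

		/* ...and the block around hva must lie entirely inside the memslot. */
		return (hva & ~(map_size - 1)) >= uaddr_start &&
		       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
	}

	int main(void)
	{
		uint64_t block = 2ULL << 20;	/* 2MiB, e.g. a stage-2 PMD block */
		uint64_t slot_hva = 0x40000000, slot_end = slot_hva + 8 * block;

		/* IPA and HVA equally aligned within 2MiB: prints 1 (huge mapping OK). */
		printf("%d\n", fits_huge_mapping(0x80000000, slot_hva, slot_end,
						 0x40200000, block));
		/* IPA offset by one page from the HVA: prints 0 (fall back to PTEs). */
		printf("%d\n", fits_huge_mapping(0x80001000, slot_hva, slot_end,
						 0x40200000, block));
		return 0;
	}

In user_mem_abort() above, the same decision now also forces vma_pagesize back to PAGE_SIZE whenever dirty logging is active, which is the "honor request for PTE mappings" item in the merge description.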
