@@ -102,8 +102,7 @@ static bool kvm_is_device_pfn(unsigned long pfn)
 * @addr: IPA
 * @pmd: pmd pointer for IPA
 *
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
- * pages in the range dirty.
+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
 */
static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
{
@@ -121,8 +120,7 @@ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
 * @addr: IPA
 * @pud: pud pointer for IPA
 *
- * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. Marks all
- * pages in the range dirty.
+ * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
 */
static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
{
@@ -899,9 +897,8 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
 * @kvm: The KVM struct pointer for the VM.
 *
- * Allocates only the stage-2 HW PGD level table(s) (can support either full
- * 40-bit input addresses or limited to 32-bit input addresses). Clears the
- * allocated pages.
+ * Allocates only the stage-2 HW PGD level table(s) of size defined by
+ * stage2_pgd_size(kvm).
 *
 * Note we don't need locking here as this is only called when the VM is
 * created, which can only be done once.
@@ -1067,25 +1064,43 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
{
        pmd_t *pmd, old_pmd;

+retry:
        pmd = stage2_get_pmd(kvm, cache, addr);
        VM_BUG_ON(!pmd);

        old_pmd = *pmd;
+       /*
+        * Multiple vcpus faulting on the same PMD entry, can
+        * lead to them sequentially updating the PMD with the
+        * same value. Following the break-before-make
+        * (pmd_clear() followed by tlb_flush()) process can
+        * hinder forward progress due to refaults generated
+        * on missing translations.
+        *
+        * Skip updating the page table if the entry is
+        * unchanged.
+        */
+       if (pmd_val(old_pmd) == pmd_val(*new_pmd))
+               return 0;
+
        if (pmd_present(old_pmd)) {
                /*
-                * Multiple vcpus faulting on the same PMD entry, can
-                * lead to them sequentially updating the PMD with the
-                * same value. Following the break-before-make
-                * (pmd_clear() followed by tlb_flush()) process can
-                * hinder forward progress due to refaults generated
-                * on missing translations.
+                * If we already have PTE level mapping for this block,
+                * we must unmap it to avoid inconsistent TLB state and
+                * leaking the table page. We could end up in this situation
+                * if the memory slot was marked for dirty logging and was
+                * reverted, leaving PTE level mappings for the pages accessed
+                * during the period. So, unmap the PTE level mapping for this
+                * block and retry, as we could have released the upper level
+                * table in the process.
                 *
-                * Skip updating the page table if the entry is
-                * unchanged.
+                * Normal THP split/merge follows mmu_notifier callbacks and do
+                * get handled accordingly.
                 */
-               if (pmd_val(old_pmd) == pmd_val(*new_pmd))
-                       return 0;
-
+               if (!pmd_thp_or_huge(old_pmd)) {
+                       unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
+                       goto retry;
+               }
                /*
                 * Mapping in huge pages should only happen through a
                 * fault. If a page is merged into a transparent huge
@@ -1097,8 +1112,7 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
                 * should become splitting first, unmapped, merged,
                 * and mapped back in on-demand.
                 */
-               VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
-
+               WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
                pmd_clear(pmd);
                kvm_tlb_flush_vmid_ipa(kvm, addr);
        } else {
@@ -1114,21 +1128,31 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac
{
        pud_t *pudp, old_pud;

+retry:
        pudp = stage2_get_pud(kvm, cache, addr);
        VM_BUG_ON(!pudp);

        old_pud = *pudp;

        /*
         * A large number of vcpus faulting on the same stage 2 entry,
-        * can lead to a refault due to the
-        * stage2_pud_clear()/tlb_flush(). Skip updating the page
-        * tables if there is no change.
+        * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
+        * Skip updating the page tables if there is no change.
         */
        if (pud_val(old_pud) == pud_val(*new_pudp))
                return 0;

        if (stage2_pud_present(kvm, old_pud)) {
+               /*
+                * If we already have table level mapping for this block, unmap
+                * the range for this block and retry.
+                */
+               if (!stage2_pud_huge(kvm, old_pud)) {
+                       unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
+                       goto retry;
+               }
+
+               WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
                stage2_pud_clear(kvm, pudp);
                kvm_tlb_flush_vmid_ipa(kvm, addr);
        } else {
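
The retry pattern added to stage2_set_pmd_huge() and stage2_set_pud_huge() above is easier to see in isolation. Below is a minimal standalone C sketch of that control flow, not kernel code: the pmd_t type, the VALID/TABLE bits and the helper names are stand-ins invented for illustration, and the real functions re-walk the stage-2 tables after unmapping because the table page itself may have been freed.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the kernel types/helpers; values are illustrative only. */
typedef uint64_t pmd_t;
#define PMD_VALID   (1ULL << 0)
#define PMD_TABLE   (1ULL << 1)   /* entry points to a PTE-level table */

static bool pmd_present(pmd_t pmd)      { return pmd & PMD_VALID; }
static bool pmd_thp_or_huge(pmd_t pmd)  { return pmd_present(pmd) && !(pmd & PMD_TABLE); }

static void unmap_pte_table(pmd_t *pmd) { *pmd = 0; printf("unmapped stale PTE table\n"); }
static void tlb_flush(void)             { printf("TLB flushed\n"); }

/* Simplified model of stage2_set_pmd_huge(): install a block mapping. */
static int set_pmd_huge(pmd_t *pmd, pmd_t new_pmd)
{
retry:
        /* Entry already has the desired value: nothing to do. */
        if (*pmd == new_pmd)
                return 0;

        if (pmd_present(*pmd)) {
                /* A PTE-level table was left behind (e.g. after dirty
                 * logging was reverted): dissolve it and retry. */
                if (!pmd_thp_or_huge(*pmd)) {
                        unmap_pte_table(pmd);
                        goto retry;
                }
                /* Break-before-make for an existing block mapping. */
                *pmd = 0;
                tlb_flush();
        }
        *pmd = new_pmd;
        return 0;
}

int main(void)
{
        pmd_t pmd = PMD_VALID | PMD_TABLE;        /* leftover table entry */
        set_pmd_huge(&pmd, PMD_VALID | 0x200000); /* install a block mapping */
        printf("pmd = %#llx\n", (unsigned long long)pmd);
        return 0;
}

The early "unchanged" check is what keeps several vcpus faulting on the same block from repeatedly tearing down and reinstalling the same entry; the unmap-and-retry branch is the new part that dissolves a stale PTE-level table instead of tripping a BUG.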
@@ -1451,13 +1475,11 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
}

/**
- * stage2_wp_puds - write protect PGD range
- * @pgd: pointer to pgd entry
- * @addr: range start address
- * @end: range end address
- *
- * Process PUD entries, for a huge PUD we cause a panic.
- */
+ * stage2_wp_puds - write protect PGD range
+ * @pgd: pointer to pgd entry
+ * @addr: range start address
+ * @end: range end address
+ */
static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
                           phys_addr_t addr, phys_addr_t end)
{
@@ -1594,8 +1616,9 @@ static void kvm_send_hwpoison_signal(unsigned long address,
        send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
}

-static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
-                                               unsigned long hva)
+static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
+                                               unsigned long hva,
+                                               unsigned long map_size)
{
        gpa_t gpa_start;
        hva_t uaddr_start, uaddr_end;
@@ -1610,34 +1633,34 @@ static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,

        /*
         * Pages belonging to memslots that don't have the same alignment
-        * within a PMD for userspace and IPA cannot be mapped with stage-2
-        * PMD entries, because we'll end up mapping the wrong pages.
+        * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
+        * PMD/PUD entries, because we'll end up mapping the wrong pages.
         *
         * Consider a layout like the following:
         *
         *    memslot->userspace_addr:
         *    +-----+--------------------+--------------------+---+
-        *    |abcde|fgh  Stage-1 PMD    |    Stage-1 PMD   tv|xyz|
+        *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
         *    +-----+--------------------+--------------------+---+
         *
         *    memslot->base_gfn << PAGE_SIZE:
         *      +---+--------------------+--------------------+-----+
-        *      |abc|def  Stage-2 PMD    |    Stage-2 PMD     |tvxyz|
+        *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
         *      +---+--------------------+--------------------+-----+
         *
-        * If we create those stage-2 PMDs, we'll end up with this incorrect
+        * If we create those stage-2 blocks, we'll end up with this incorrect
         * mapping:
         *   d -> f
         *   e -> g
         *   f -> h
         */
-       if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK))
+       if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
                return false;

        /*
         * Next, let's make sure we're not trying to map anything not covered
-        * by the memslot. This means we have to prohibit PMD size mappings
-        * for the beginning and end of a non-PMD aligned and non-PMD sized
+        * by the memslot. This means we have to prohibit block size mappings
+        * for the beginning and end of a non-block aligned and non-block sized
         * memory slot (illustrated by the head and tail parts of the
         * userspace view above containing pages 'abcde' and 'xyz',
         * respectively).
@@ -1646,8 +1669,8 @@ static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
         * userspace_addr or the base_gfn, as both are equally aligned (per
         * the check above) and equally sized.
         */
-       return (hva & S2_PMD_MASK) >= uaddr_start &&
-              (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end;
+       return (hva & ~(map_size - 1)) >= uaddr_start &&
+              (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}

static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
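
For the generalised helper above, here is a small standalone sketch (userspace C, not kernel code; the sample memslot addresses and the 2 MiB/1 GiB sizes are made up for illustration) of the two conditions fault_supports_stage2_huge_mapping() now applies for an arbitrary map_size: the IPA and the userspace address must share the same offset within a block, and the whole block around hva must fall inside the memslot.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified, standalone model of fault_supports_stage2_huge_mapping(). */
static bool supports_huge_mapping(uint64_t gpa_start, uint64_t uaddr_start,
                                  uint64_t uaddr_end, uint64_t hva,
                                  uint64_t map_size)
{
        /* GPA and HVA must be equally aligned within a block of map_size. */
        if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
                return false;

        /* The block containing hva must be fully covered by the memslot. */
        return (hva & ~(map_size - 1)) >= uaddr_start &&
               (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}

int main(void)
{
        const uint64_t SZ_2M = 2ULL << 20, SZ_1G = 1ULL << 30;

        /* Example memslot: 1 GiB of guest memory at IPA 1 GiB, backed by a
         * 2 MiB-aligned (but not 1 GiB-aligned) userspace mapping. */
        uint64_t gpa_start   = 1ULL << 30;
        uint64_t uaddr_start = 0x7f0000200000ULL;
        uint64_t uaddr_end   = uaddr_start + SZ_1G;
        uint64_t hva         = uaddr_start + (64ULL << 20);

        printf("2M block ok: %d\n",
               supports_huge_mapping(gpa_start, uaddr_start, uaddr_end, hva, SZ_2M));
        printf("1G block ok: %d\n",
               supports_huge_mapping(gpa_start, uaddr_start, uaddr_end, hva, SZ_1G));
        return 0;
}

Taking map_size as a parameter lets the same test cover PMD (2 MiB) and PUD (1 GiB) mappings, which is what allows user_mem_abort() below to pass vma_pagesize straight through.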
@@ -1676,12 +1699,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                return -EFAULT;
        }

-       if (!fault_supports_stage2_pmd_mappings(memslot, hva))
-               force_pte = true;
-
-       if (logging_active)
-               force_pte = true;
-
        /* Let's check if we will get back a huge page backed by hugetlbfs */
        down_read(&current->mm->mmap_sem);
        vma = find_vma_intersection(current->mm, hva, hva + 1);
@@ -1692,18 +1709,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        }

        vma_pagesize = vma_kernel_pagesize(vma);
+       if (logging_active ||
+           !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
+               force_pte = true;
+               vma_pagesize = PAGE_SIZE;
+       }
+
        /*
         * The stage2 has a minimum of 2 level table (For arm64 see
         * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
         * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
         * As for PUD huge maps, we must make sure that we have at least
         * 3 levels, i.e, PMD is not folded.
         */
-       if ((vma_pagesize == PMD_SIZE ||
-           (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) &&
-           !force_pte) {
+       if (vma_pagesize == PMD_SIZE ||
+           (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
                gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
-       }
        up_read(&current->mm->mmap_sem);

        /* We need minimum second+third level pages */
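
Read together, the two hunks in user_mem_abort() boil down to a small granule decision. A loose sketch of that decision as a standalone C function follows; the size constants and boolean flags are illustrative stand-ins, and the real code keeps additional state (such as the gfn alignment) that this omits.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE (4ULL << 10)   /* illustrative values, not the kernel's */
#define PMD_SIZE  (2ULL << 20)
#define PUD_SIZE  (1ULL << 30)

/*
 * Pick the stage-2 mapping granule for a fault: fall back to single pages
 * when dirty logging is on or the memslot cannot back a block of
 * vma_pagesize, otherwise allow the VMA's hugepage size (PUD only when the
 * stage-2 PMD level is not folded away).
 */
static uint64_t stage2_map_size(uint64_t vma_pagesize, bool logging_active,
                                bool fits_memslot, bool stage2_has_pmd)
{
        if (logging_active || !fits_memslot)
                return PAGE_SIZE;                       /* force_pte = true */

        if (vma_pagesize == PMD_SIZE ||
            (vma_pagesize == PUD_SIZE && stage2_has_pmd))
                return vma_pagesize;                    /* huge block mapping */

        return PAGE_SIZE;
}

int main(void)
{
        /* 1 GiB-backed VMA, no logging: huge mapping allowed. */
        printf("%llu\n", (unsigned long long)stage2_map_size(PUD_SIZE, false, true, true));
        /* Dirty logging active: fall back to single pages. */
        printf("%llu\n", (unsigned long long)stage2_map_size(PMD_SIZE, true, true, true));
        return 0;
}

Forcing PAGE_SIZE while dirty logging is active keeps write tracking at page granularity; the huge path is only taken when the whole block is usable by both the memslot and the stage-2 layout.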