 #include <linux/mman.h>
 #include <linux/kvm_host.h>
 #include <linux/io.h>
+#include <linux/hugetlb.h>
 #include <trace/events/kvm.h>
 #include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
@@ -41,6 +42,8 @@ static unsigned long hyp_idmap_start;
 static unsigned long hyp_idmap_end;
 static phys_addr_t hyp_idmap_vector;
 
+#define kvm_pmd_huge(_x)        (pmd_huge(_x))
+
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
         /*
@@ -93,19 +96,29 @@ static bool page_empty(void *ptr)
 
 static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
 {
-        pmd_t *pmd_table = pmd_offset(pud, 0);
-        pud_clear(pud);
-        kvm_tlb_flush_vmid_ipa(kvm, addr);
-        pmd_free(NULL, pmd_table);
+        if (pud_huge(*pud)) {
+                pud_clear(pud);
+                kvm_tlb_flush_vmid_ipa(kvm, addr);
+        } else {
+                pmd_t *pmd_table = pmd_offset(pud, 0);
+                pud_clear(pud);
+                kvm_tlb_flush_vmid_ipa(kvm, addr);
+                pmd_free(NULL, pmd_table);
+        }
         put_page(virt_to_page(pud));
 }
 
 static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
 {
-        pte_t *pte_table = pte_offset_kernel(pmd, 0);
-        pmd_clear(pmd);
-        kvm_tlb_flush_vmid_ipa(kvm, addr);
-        pte_free_kernel(NULL, pte_table);
+        if (kvm_pmd_huge(*pmd)) {
+                pmd_clear(pmd);
+                kvm_tlb_flush_vmid_ipa(kvm, addr);
+        } else {
+                pte_t *pte_table = pte_offset_kernel(pmd, 0);
+                pmd_clear(pmd);
+                kvm_tlb_flush_vmid_ipa(kvm, addr);
+                pte_free_kernel(NULL, pte_table);
+        }
         put_page(virt_to_page(pmd));
 }
 
@@ -136,18 +149,32 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
                         continue;
                 }
 
+                if (pud_huge(*pud)) {
+                        /*
+                         * If we are dealing with a huge pud, just clear it and
+                         * move on.
+                         */
+                        clear_pud_entry(kvm, pud, addr);
+                        addr = pud_addr_end(addr, end);
+                        continue;
+                }
+
                 pmd = pmd_offset(pud, addr);
                 if (pmd_none(*pmd)) {
                         addr = pmd_addr_end(addr, end);
                         continue;
                 }
 
-                pte = pte_offset_kernel(pmd, addr);
-                clear_pte_entry(kvm, pte, addr);
-                next = addr + PAGE_SIZE;
+                if (!kvm_pmd_huge(*pmd)) {
+                        pte = pte_offset_kernel(pmd, addr);
+                        clear_pte_entry(kvm, pte, addr);
+                        next = addr + PAGE_SIZE;
+                }
 
-                /* If we emptied the pte, walk back up the ladder */
-                if (page_empty(pte)) {
+                /*
+                 * If the pmd entry is to be cleared, walk back up the ladder
+                 */
+                if (kvm_pmd_huge(*pmd) || page_empty(pte)) {
                         clear_pmd_entry(kvm, pmd, addr);
                         next = pmd_addr_end(addr, end);
                         if (page_empty(pmd) && !page_empty(pud)) {
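Note: once a huge entry is handled, the walk above steps by whole sections rather than single pages: pud_addr_end()/pmd_addr_end() round the address up to the next block boundary, capped at the end of the range. A minimal userspace sketch of that stepping, assuming 4 KiB pages and 2 MiB PMD sections as on ARM LPAE stage-2; pmd_addr_end() here is a simplified stand-in for the kernel macro, not the real implementation:

#include <stdio.h>

#define PMD_SIZE (1UL << 21)            /* 2 MiB section size, assumed */
#define PMD_MASK (~(PMD_SIZE - 1))

/* Simplified stand-in for the kernel's pmd_addr_end() macro. */
static unsigned long pmd_addr_end(unsigned long addr, unsigned long end)
{
        unsigned long boundary = (addr + PMD_SIZE) & PMD_MASK;

        return boundary < end ? boundary : end;
}

int main(void)
{
        unsigned long addr = 0x40123000UL;      /* arbitrary mid-section IPA */
        unsigned long end  = 0x40600000UL;

        while (addr < end) {
                printf("clear entry covering %#lx, step to %#lx\n",
                       addr, pmd_addr_end(addr, end));
                addr = pmd_addr_end(addr, end);
        }
        return 0;
}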
@@ -420,29 +447,71 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
         kvm->arch.pgd = NULL;
 }
 
-
-static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-                          phys_addr_t addr, const pte_t *new_pte, bool iomap)
+static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+                             phys_addr_t addr)
 {
         pgd_t *pgd;
         pud_t *pud;
         pmd_t *pmd;
-        pte_t *pte, old_pte;
 
-        /* Create 2nd stage page table mapping - Level 1 */
         pgd = kvm->arch.pgd + pgd_index(addr);
         pud = pud_offset(pgd, addr);
         if (pud_none(*pud)) {
                 if (!cache)
-                        return 0; /* ignore calls from kvm_set_spte_hva */
+                        return NULL;
                 pmd = mmu_memory_cache_alloc(cache);
                 pud_populate(NULL, pud, pmd);
                 get_page(virt_to_page(pud));
         }
 
-        pmd = pmd_offset(pud, addr);
+        return pmd_offset(pud, addr);
+}
+
+static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
+                               *cache, phys_addr_t addr, const pmd_t *new_pmd)
+{
+        pmd_t *pmd, old_pmd;
+
+        pmd = stage2_get_pmd(kvm, cache, addr);
+        VM_BUG_ON(!pmd);
 
-        /* Create 2nd stage page table mapping - Level 2 */
+        /*
+         * Mapping in huge pages should only happen through a fault.  If a
+         * page is merged into a transparent huge page, the individual
+         * subpages of that huge page should be unmapped through MMU
+         * notifiers before we get here.
+         *
+         * Merging of CompoundPages is not supported; they should instead be
+         * split first, unmapped, merged, and mapped back in on demand.
+         */
+        VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
+
+        old_pmd = *pmd;
+        kvm_set_pmd(pmd, *new_pmd);
+        if (pmd_present(old_pmd))
+                kvm_tlb_flush_vmid_ipa(kvm, addr);
+        else
+                get_page(virt_to_page(pmd));
+        return 0;
+}
+
+static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+                          phys_addr_t addr, const pte_t *new_pte, bool iomap)
+{
+        pmd_t *pmd;
+        pte_t *pte, old_pte;
+
+        /* Create stage-2 page table mapping - Level 1 */
+        pmd = stage2_get_pmd(kvm, cache, addr);
+        if (!pmd) {
+                /*
+                 * Ignore calls from kvm_set_spte_hva for unallocated
+                 * address ranges.
+                 */
+                return 0;
+        }
+
+        /* Create stage-2 page mappings - Level 2 */
         if (pmd_none(*pmd)) {
                 if (!cache)
                         return 0; /* ignore calls from kvm_set_spte_hva */
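Note: the update rule in stage2_set_pmd_huge() above is that overwriting a present block mapping needs a stage-2 TLB flush for that IPA, while filling a previously empty slot instead takes a reference on the page holding the PMD, which the put_page() in clear_pmd_entry() later drops. A minimal userspace model of that rule, using stand-in types rather than kernel APIs:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for a stage-2 PMD entry; not a kernel type. */
struct fake_pmd {
        bool present;
        unsigned long pfn;
};

static int pmd_page_refcount;            /* models get_page()/put_page() */

static void set_pmd_huge(struct fake_pmd *pmd, unsigned long new_pfn)
{
        struct fake_pmd old = *pmd;

        pmd->present = true;
        pmd->pfn = new_pfn;

        if (old.present)
                printf("flush stage-2 TLB for this IPA\n");
        else
                pmd_page_refcount++;     /* mirrors get_page(virt_to_page(pmd)) */
}

int main(void)
{
        struct fake_pmd pmd = { .present = false, .pfn = 0 };

        set_pmd_huge(&pmd, 0x12345);     /* empty slot: take a reference   */
        set_pmd_huge(&pmd, 0x12345);     /* same pfn re-faulted: flush TLB */
        printf("pmd page refcount: %d\n", pmd_page_refcount);
        return 0;
}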
@@ -508,22 +577,34 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 }
 
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
-                          gfn_t gfn, struct kvm_memory_slot *memslot,
+                          struct kvm_memory_slot *memslot,
                           unsigned long fault_status)
 {
-        pte_t new_pte;
-        pfn_t pfn;
         int ret;
-        bool write_fault, writable;
+        bool write_fault, writable, hugetlb = false;
         unsigned long mmu_seq;
+        gfn_t gfn = fault_ipa >> PAGE_SHIFT;
+        unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
+        struct kvm *kvm = vcpu->kvm;
         struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+        struct vm_area_struct *vma;
+        pfn_t pfn;
 
         write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
         if (fault_status == FSC_PERM && !write_fault) {
                 kvm_err("Unexpected L2 read permission error\n");
                 return -EFAULT;
         }
 
+        /* Let's check if we will get back a huge page backed by hugetlbfs */
+        down_read(&current->mm->mmap_sem);
+        vma = find_vma_intersection(current->mm, hva, hva + 1);
+        if (is_vm_hugetlb_page(vma)) {
+                hugetlb = true;
+                gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
+        }
+        up_read(&current->mm->mmap_sem);
+
         /* We need minimum second+third level pages */
         ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
         if (ret)
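Note: the is_vm_hugetlb_page() check above only fires when userspace backed the faulting range with a hugetlbfs VMA. A hedged userspace sketch of how a VMM might arrange that, assuming a Linux toolchain exposing MAP_HUGETLB, huge pages reserved in advance (e.g. via /proc/sys/vm/nr_hugepages), and that the region is subsequently registered with KVM_SET_USER_MEMORY_REGION:

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

#define GUEST_RAM_SIZE (64UL << 20)     /* 64 MiB, a multiple of 2 MiB */

int main(void)
{
        /* MAP_HUGETLB requests hugetlbfs backing, so the resulting VMA
         * satisfies is_vm_hugetlb_page() when KVM resolves a fault in it. */
        void *ram = mmap(NULL, GUEST_RAM_SIZE, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (ram == MAP_FAILED) {
                perror("mmap(MAP_HUGETLB)");
                return EXIT_FAILURE;
        }

        /* In a real VMM this range would be handed to the kernel via
         * KVM_SET_USER_MEMORY_REGION; guest faults inside it then take the
         * hugetlb branch in user_mem_abort(). */
        printf("guest RAM at %p backed by huge pages\n", ram);
        munmap(ram, GUEST_RAM_SIZE);
        return 0;
}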
@@ -541,26 +622,38 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
          */
         smp_rmb();
 
-        pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
+        pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
         if (is_error_pfn(pfn))
                 return -EFAULT;
 
-        new_pte = pfn_pte(pfn, PAGE_S2);
-        coherent_icache_guest_page(vcpu->kvm, gfn);
-
-        spin_lock(&vcpu->kvm->mmu_lock);
-        if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+        spin_lock(&kvm->mmu_lock);
+        if (mmu_notifier_retry(kvm, mmu_seq))
                 goto out_unlock;
-        if (writable) {
-                kvm_set_s2pte_writable(&new_pte);
-                kvm_set_pfn_dirty(pfn);
+
+        if (hugetlb) {
+                pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2);
+                new_pmd = pmd_mkhuge(new_pmd);
+                if (writable) {
+                        kvm_set_s2pmd_writable(&new_pmd);
+                        kvm_set_pfn_dirty(pfn);
+                }
+                coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE);
+                ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
+        } else {
+                pte_t new_pte = pfn_pte(pfn, PAGE_S2);
+                if (writable) {
+                        kvm_set_s2pte_writable(&new_pte);
+                        kvm_set_pfn_dirty(pfn);
+                }
+                coherent_icache_guest_page(kvm, hva, PAGE_SIZE);
+                ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
         }
-        stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false);
+
 
 out_unlock:
-        spin_unlock(&vcpu->kvm->mmu_lock);
+        spin_unlock(&kvm->mmu_lock);
         kvm_release_pfn_clean(pfn);
-        return 0;
+        return ret;
 }
 
 /**
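Note: in the fault path above, the gfn is rounded down to the start of its section (gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT) and the icache maintenance covers the whole section of host memory (hva & PMD_MASK, PMD_SIZE), so the single block entry installed by stage2_set_pmd_huge() is consistent for every page it maps. A small worked example of that arithmetic, with illustrative constants (4 KiB pages, 2 MiB sections) rather than values taken from the kernel headers:

#include <stdio.h>

#define PAGE_SHIFT 12                    /* 4 KiB pages, assumed    */
#define PMD_SHIFT  21                    /* 2 MiB sections, assumed */
#define PMD_SIZE   (1UL << PMD_SHIFT)
#define PMD_MASK   (~(PMD_SIZE - 1))

int main(void)
{
        unsigned long fault_ipa = 0x40123456UL;  /* arbitrary faulting IPA */
        unsigned long hva       = 0xb6f23456UL;  /* arbitrary host VA      */

        printf("page gfn     : %#lx\n", fault_ipa >> PAGE_SHIFT);
        printf("hugetlb gfn  : %#lx\n", (fault_ipa & PMD_MASK) >> PAGE_SHIFT);
        printf("icache flush : hva %#lx, size %#lx\n",
               hva & PMD_MASK, PMD_SIZE);
        return 0;
}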
@@ -629,7 +722,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
         memslot = gfn_to_memslot(vcpu->kvm, gfn);
 
-        ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot, fault_status);
+        ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status);
         if (ret == 0)
                 ret = 1;
 out_unlock: