Skip to content

Commit cf4ee73

Browse files
Changbin Du authored and zhenyw committed
drm/i915/gvt: Fix guest vGPU hang caused by very high dma setup overhead
The implementation of the current kvmgt implicitly sets up a DMA mapping in the MPT API gfn_to_mfn. First, this design goes against the API's original purpose. Second, there is no unmap operation in this design. The result is that the DMA mappings keep growing larger and larger. In the multi-VM case, they quickly consume the low 4GB of the IOMMU IOVA address space, and so tons of rbtree entries are created in the IOMMU IOVA allocator. Eventually, a single IOVA allocation can take as long as ~70ms. Such latency is intolerable. To address both of the above issues, this patch introduces two new MPT APIs: o dma_map_guest_page - set up a DMA mapping for a guest page o dma_unmap_guest_page - cancel the DMA mapping for a guest page The kvmgt implements these 2 APIs. And to reduce DMA setup overhead for duplicated pages (e.g. scratch pages), two caches are used: one maps a gfn to a struct gvt_dma, the other maps a dma addr to a struct gvt_dma. With these 2 new APIs, the gtt is now able to cancel a DMA mapping when a page table is invalidated. The DMA mappings no longer grow without bound. v2: follow the old logic for VFIO_IOMMU_NOTIFY_DMA_UNMAP at this point. Cc: Hang Yuan <hang.yuan@intel.com> Cc: Xiong Zhang <xiong.y.zhang@intel.com> Signed-off-by: Changbin Du <changbin.du@intel.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
1 parent b52646f commit cf4ee73

File tree

5 files changed

+246
-134
lines changed

5 files changed

+246
-134
lines changed

drivers/gpu/drm/i915/gvt/gtt.c

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -822,6 +822,23 @@ static int ppgtt_invalidate_spt_by_shadow_entry(struct intel_vgpu *vgpu,
822822
return ppgtt_invalidate_spt(s);
823823
}
824824

825+
static inline void ppgtt_invalidate_pte(struct intel_vgpu_ppgtt_spt *spt,
826+
struct intel_gvt_gtt_entry *entry)
827+
{
828+
struct intel_vgpu *vgpu = spt->vgpu;
829+
struct intel_gvt_gtt_pte_ops *ops = vgpu->gvt->gtt.pte_ops;
830+
unsigned long pfn;
831+
int type;
832+
833+
pfn = ops->get_pfn(entry);
834+
type = spt->shadow_page.type;
835+
836+
if (pfn == vgpu->gtt.scratch_pt[type].page_mfn)
837+
return;
838+
839+
intel_gvt_hypervisor_dma_unmap_guest_page(vgpu, pfn << PAGE_SHIFT);
840+
}
841+
825842
static int ppgtt_invalidate_spt(struct intel_vgpu_ppgtt_spt *spt)
826843
{
827844
struct intel_vgpu *vgpu = spt->vgpu;
@@ -838,14 +855,12 @@ static int ppgtt_invalidate_spt(struct intel_vgpu_ppgtt_spt *spt)
838855
if (atomic_dec_return(&spt->refcount) > 0)
839856
return 0;
840857

841-
if (gtt_type_is_pte_pt(spt->shadow_page.type))
842-
goto release;
843-
844858
for_each_present_shadow_entry(spt, &e, index) {
845859
switch (e.type) {
846860
case GTT_TYPE_PPGTT_PTE_4K_ENTRY:
847861
gvt_vdbg_mm("invalidate 4K entry\n");
848-
continue;
862+
ppgtt_invalidate_pte(spt, &e);
863+
break;
849864
case GTT_TYPE_PPGTT_PTE_2M_ENTRY:
850865
case GTT_TYPE_PPGTT_PTE_1G_ENTRY:
851866
WARN(1, "GVT doesn't support 2M/1GB page\n");
@@ -863,7 +878,7 @@ static int ppgtt_invalidate_spt(struct intel_vgpu_ppgtt_spt *spt)
863878
GEM_BUG_ON(1);
864879
}
865880
}
866-
release:
881+
867882
trace_spt_change(spt->vgpu->id, "release", spt,
868883
spt->guest_page.gfn, spt->shadow_page.type);
869884
ppgtt_free_spt(spt);
@@ -932,7 +947,9 @@ static int ppgtt_populate_shadow_entry(struct intel_vgpu *vgpu,
932947
{
933948
struct intel_gvt_gtt_pte_ops *pte_ops = vgpu->gvt->gtt.pte_ops;
934949
struct intel_gvt_gtt_entry se = *ge;
935-
unsigned long gfn, mfn;
950+
unsigned long gfn;
951+
dma_addr_t dma_addr;
952+
int ret;
936953

937954
if (!pte_ops->test_present(ge))
938955
return 0;
@@ -952,11 +969,11 @@ static int ppgtt_populate_shadow_entry(struct intel_vgpu *vgpu,
952969
};
953970

954971
/* direct shadow */
955-
mfn = intel_gvt_hypervisor_gfn_to_mfn(vgpu, gfn);
956-
if (mfn == INTEL_GVT_INVALID_ADDR)
972+
ret = intel_gvt_hypervisor_dma_map_guest_page(vgpu, gfn, &dma_addr);
973+
if (ret)
957974
return -ENXIO;
958975

959-
pte_ops->set_pfn(&se, mfn);
976+
pte_ops->set_pfn(&se, dma_addr >> PAGE_SHIFT);
960977
ppgtt_set_shadow_entry(spt, &se, index);
961978
return 0;
962979
}
@@ -1035,7 +1052,9 @@ static int ppgtt_handle_guest_entry_removal(struct intel_vgpu_ppgtt_spt *spt,
10351052
ret = ppgtt_invalidate_spt(s);
10361053
if (ret)
10371054
goto fail;
1038-
}
1055+
} else
1056+
ppgtt_invalidate_pte(spt, se);
1057+
10391058
return 0;
10401059
fail:
10411060
gvt_vgpu_err("fail: shadow page %p guest entry 0x%llx type %d\n",
@@ -1807,8 +1826,10 @@ static int emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off,
18071826
struct intel_vgpu_mm *ggtt_mm = vgpu->gtt.ggtt_mm;
18081827
struct intel_gvt_gtt_pte_ops *ops = gvt->gtt.pte_ops;
18091828
unsigned long g_gtt_index = off >> info->gtt_entry_size_shift;
1810-
unsigned long gma, gfn, mfn;
1829+
unsigned long gma, gfn;
18111830
struct intel_gvt_gtt_entry e, m;
1831+
dma_addr_t dma_addr;
1832+
int ret;
18121833

18131834
if (bytes != 4 && bytes != 8)
18141835
return -EINVAL;
@@ -1836,16 +1857,17 @@ static int emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off,
18361857
goto out;
18371858
}
18381859

1839-
mfn = intel_gvt_hypervisor_gfn_to_mfn(vgpu, gfn);
1840-
if (mfn == INTEL_GVT_INVALID_ADDR) {
1860+
ret = intel_gvt_hypervisor_dma_map_guest_page(vgpu, gfn,
1861+
&dma_addr);
1862+
if (ret) {
18411863
gvt_vgpu_err("fail to populate guest ggtt entry\n");
18421864
/* guest driver may read/write the entry when partial
18431865
* update the entry in this situation p2m will fail
18441866
* settting the shadow entry to point to a scratch page
18451867
*/
18461868
ops->set_pfn(&m, gvt->gtt.scratch_mfn);
18471869
} else
1848-
ops->set_pfn(&m, mfn);
1870+
ops->set_pfn(&m, dma_addr >> PAGE_SHIFT);
18491871
} else
18501872
ops->set_pfn(&m, gvt->gtt.scratch_mfn);
18511873

drivers/gpu/drm/i915/gvt/gvt.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,8 +201,15 @@ struct intel_vgpu {
201201
int num_regions;
202202
struct eventfd_ctx *intx_trigger;
203203
struct eventfd_ctx *msi_trigger;
204-
struct rb_root cache;
204+
205+
/*
206+
* Two caches are used to avoid mapping duplicated pages (eg.
207+
* scratch pages). This help to reduce dma setup overhead.
208+
*/
209+
struct rb_root gfn_cache;
210+
struct rb_root dma_addr_cache;
205211
struct mutex cache_lock;
212+
206213
struct notifier_block iommu_notifier;
207214
struct notifier_block group_notifier;
208215
struct kvm *kvm;

drivers/gpu/drm/i915/gvt/hypercall.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,11 @@ struct intel_gvt_mpt {
5151
int (*write_gpa)(unsigned long handle, unsigned long gpa, void *buf,
5252
unsigned long len);
5353
unsigned long (*gfn_to_mfn)(unsigned long handle, unsigned long gfn);
54+
55+
int (*dma_map_guest_page)(unsigned long handle, unsigned long gfn,
56+
dma_addr_t *dma_addr);
57+
void (*dma_unmap_guest_page)(unsigned long handle, dma_addr_t dma_addr);
58+
5459
int (*map_gfn_to_mfn)(unsigned long handle, unsigned long gfn,
5560
unsigned long mfn, unsigned int nr, bool map);
5661
int (*set_trap_area)(unsigned long handle, u64 start, u64 end,

0 commit comments

Comments
 (0)