Skip to content

Commit 6e301a8

Browse files
aikpaulusmack
authored and committed
KVM: PPC: Optimize clearing TCEs for sparse tables
The powernv platform maintains 2 TCE tables for VFIO - a hardware TCE table and a table with userspace addresses. These tables are radix trees, we allocate indirect levels when they are written to. Since the memory allocation is problematic in real mode, we have 2 accessors to the entries: - for virtual mode: it allocates the memory and it is always expected to return non-NULL; - for real mode: it does not allocate and can return NULL. Also, DMA windows can span up to 55 bits of the address space and since we never have this much RAM, such windows are sparse. However currently the SPAPR TCE IOMMU driver walks through all TCEs to unpin DMA memory. Since we maintain a userspace addresses table for VFIO which is a mirror of the hardware table, we can use it to know which parts of the DMA window have not been mapped and skip these; this patch does exactly that. The bare metal systems do not have this problem as they use a bypass mode of a PHB which maps RAM directly. This helps a lot with sparse DMA windows, reducing the shutdown time from about 3 minutes per 1 billion TCEs to a few seconds for a 32GB sparse guest. Just skipping the last level seems to be good enough. As the non-allocating accessor is used now in virtual mode as well, rename it from IOMMU_TABLE_USERSPACE_ENTRY_RM (real mode) to _RO (read only). Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
1 parent 8d9fcac commit 6e301a8

File tree

4 files changed

+27
-9
lines changed

4 files changed

+27
-9
lines changed

arch/powerpc/include/asm/iommu.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ struct iommu_table {
126126
int it_nid;
127127
};
128128

129-
#define IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry) \
129+
#define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \
130130
((tbl)->it_ops->useraddrptr((tbl), (entry), false))
131131
#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
132132
((tbl)->it_ops->useraddrptr((tbl), (entry), true))

arch/powerpc/kvm/book3s_64_vio.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -410,11 +410,10 @@ static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
410410
{
411411
struct mm_iommu_table_group_mem_t *mem = NULL;
412412
const unsigned long pgsize = 1ULL << tbl->it_page_shift;
413-
__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
413+
__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
414414

415415
if (!pua)
416-
/* it_userspace allocation might be delayed */
417-
return H_TOO_HARD;
416+
return H_SUCCESS;
418417

419418
mem = mm_iommu_lookup(kvm->mm, be64_to_cpu(*pua), pgsize);
420419
if (!mem)

arch/powerpc/kvm/book3s_64_vio_hv.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,
214214

215215
if (!ret && ((*direction == DMA_FROM_DEVICE) ||
216216
(*direction == DMA_BIDIRECTIONAL))) {
217-
__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
217+
__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
218218
/*
219219
* kvmppc_rm_tce_iommu_do_map() updates the UA cache after
220220
* calling this so we still get here a valid UA.
@@ -240,7 +240,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
240240
{
241241
struct mm_iommu_table_group_mem_t *mem = NULL;
242242
const unsigned long pgsize = 1ULL << tbl->it_page_shift;
243-
__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
243+
__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
244244

245245
if (!pua)
246246
/* it_userspace allocation might be delayed */
@@ -304,7 +304,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
304304
{
305305
long ret;
306306
unsigned long hpa = 0;
307-
__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
307+
__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
308308
struct mm_iommu_table_group_mem_t *mem;
309309

310310
if (!pua)

drivers/vfio/vfio_iommu_spapr_tce.c

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -444,7 +444,7 @@ static void tce_iommu_unuse_page_v2(struct tce_container *container,
444444
struct mm_iommu_table_group_mem_t *mem = NULL;
445445
int ret;
446446
unsigned long hpa = 0;
447-
__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
447+
__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
448448

449449
if (!pua)
450450
return;
@@ -467,8 +467,27 @@ static int tce_iommu_clear(struct tce_container *container,
467467
unsigned long oldhpa;
468468
long ret;
469469
enum dma_data_direction direction;
470+
unsigned long lastentry = entry + pages;
471+
472+
for ( ; entry < lastentry; ++entry) {
473+
if (tbl->it_indirect_levels && tbl->it_userspace) {
474+
/*
475+
* For multilevel tables, we can take a shortcut here
476+
* and skip some TCEs as we know that the userspace
477+
* addresses cache is a mirror of the real TCE table
478+
* and if it is missing some indirect levels, then
479+
* the hardware table does not have them allocated
480+
* either and therefore does not require updating.
481+
*/
482+
__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
483+
entry);
484+
if (!pua) {
485+
/* align to level_size which is power of two */
486+
entry |= tbl->it_level_size - 1;
487+
continue;
488+
}
489+
}
470490

471-
for ( ; pages; --pages, ++entry) {
472491
cond_resched();
473492

474493
direction = DMA_NONE;

0 commit comments

Comments
 (0)