Skip to content

Commit a68bd12

Browse files
aik and mpe
authored and committed
powerpc/powernv/ioda: Allocate indirect TCE levels on demand
At the moment we allocate the entire TCE table, twice (hardware part and userspace translation cache). This normally works as we normally have contiguous memory and the guest will map entire RAM for 64bit DMA. However if we have sparse RAM (one example is a memory device), then we will allocate TCEs which will never be used as the guest only maps actual memory for DMA. If it is a single level TCE table, there is nothing we can really do but if it is a multilevel table, we can skip allocating TCEs we know we won't need. This adds the ability to allocate only the first level, saving memory. This changes iommu_table::free() to avoid allocating an extra level; iommu_table::set() will do this when needed. This adds @alloc parameter to iommu_table::exchange() to tell the callback if it can allocate an extra level; the flag is set to "false" for the realmode KVM handlers of H_PUT_TCE hcalls and the callback returns H_TOO_HARD. This still requires the entire table to be counted in mm::locked_vm. To be conservative, this only does on-demand allocation when the userspace cache table is requested which is the case of VFIO. The example math for a system replicating a powernv setup with NVLink2 in a guest: 16GB RAM mapped at 0x0 128GB GPU RAM window (16GB of actual RAM) mapped at 0x244000000000 the table to cover that all with 64K pages takes: (((0x244000000000 + 0x2000000000) >> 16)*8)>>20 = 4656MB If we allocate only necessary TCE levels, we will only need: (((0x400000000 + 0x400000000) >> 16)*8)>>20 = 4MB (plus some for indirect levels). Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
1 parent 9bc98c8 commit a68bd12

File tree

6 files changed

+73
-27
lines changed

6 files changed

+73
-27
lines changed

arch/powerpc/include/asm/iommu.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ struct iommu_table_ops {
7070
unsigned long *hpa,
7171
enum dma_data_direction *direction);
7272

73-
__be64 *(*useraddrptr)(struct iommu_table *tbl, long index);
73+
__be64 *(*useraddrptr)(struct iommu_table *tbl, long index, bool alloc);
7474
#endif
7575
void (*clear)(struct iommu_table *tbl,
7676
long index, long npages);
@@ -122,10 +122,13 @@ struct iommu_table {
122122
__be64 *it_userspace; /* userspace view of the table */
123123
struct iommu_table_ops *it_ops;
124124
struct kref it_kref;
125+
int it_nid;
125126
};
126127

128+
#define IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry) \
129+
((tbl)->it_ops->useraddrptr((tbl), (entry), false))
127130
#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
128-
((tbl)->it_ops->useraddrptr((tbl), (entry)))
131+
((tbl)->it_ops->useraddrptr((tbl), (entry), true))
129132

130133
/* Pure 2^n version of get_order */
131134
static inline __attribute_const__

arch/powerpc/kvm/book3s_64_vio_hv.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
200200
{
201201
struct mm_iommu_table_group_mem_t *mem = NULL;
202202
const unsigned long pgsize = 1ULL << tbl->it_page_shift;
203-
__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
203+
__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
204204

205205
if (!pua)
206206
/* it_userspace allocation might be delayed */
@@ -264,7 +264,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
264264
{
265265
long ret;
266266
unsigned long hpa = 0;
267-
__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
267+
__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
268268
struct mm_iommu_table_group_mem_t *mem;
269269

270270
if (!pua)

arch/powerpc/platforms/powernv/pci-ioda-tce.c

Lines changed: 57 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
4848
return addr;
4949
}
5050

51-
static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx)
51+
static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
5252
{
5353
__be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base;
5454
int level = tbl->it_indirect_levels;
@@ -57,7 +57,23 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx)
5757

5858
while (level) {
5959
int n = (idx & mask) >> (level * shift);
60-
unsigned long tce = be64_to_cpu(tmp[n]);
60+
unsigned long tce;
61+
62+
if (tmp[n] == 0) {
63+
__be64 *tmp2;
64+
65+
if (!alloc)
66+
return NULL;
67+
68+
tmp2 = pnv_alloc_tce_level(tbl->it_nid,
69+
ilog2(tbl->it_level_size) + 3);
70+
if (!tmp2)
71+
return NULL;
72+
73+
tmp[n] = cpu_to_be64(__pa(tmp2) |
74+
TCE_PCI_READ | TCE_PCI_WRITE);
75+
}
76+
tce = be64_to_cpu(tmp[n]);
6177

6278
tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
6379
idx &= ~mask;
@@ -84,39 +100,54 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
84100
((rpn + i) << tbl->it_page_shift);
85101
unsigned long idx = index - tbl->it_offset + i;
86102

87-
*(pnv_tce(tbl, false, idx)) = cpu_to_be64(newtce);
103+
*(pnv_tce(tbl, false, idx, true)) = cpu_to_be64(newtce);
88104
}
89105

90106
return 0;
91107
}
92108

93109
#ifdef CONFIG_IOMMU_API
94110
int pnv_tce_xchg(struct iommu_table *tbl, long index,
95-
unsigned long *hpa, enum dma_data_direction *direction)
111+
unsigned long *hpa, enum dma_data_direction *direction,
112+
bool alloc)
96113
{
97114
u64 proto_tce = iommu_direction_to_tce_perm(*direction);
98115
unsigned long newtce = *hpa | proto_tce, oldtce;
99116
unsigned long idx = index - tbl->it_offset;
117+
__be64 *ptce = NULL;
100118

101119
BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl));
102120

121+
if (*direction == DMA_NONE) {
122+
ptce = pnv_tce(tbl, false, idx, false);
123+
if (!ptce) {
124+
*hpa = 0;
125+
return 0;
126+
}
127+
}
128+
129+
if (!ptce) {
130+
ptce = pnv_tce(tbl, false, idx, alloc);
131+
if (!ptce)
132+
return alloc ? H_HARDWARE : H_TOO_HARD;
133+
}
134+
103135
if (newtce & TCE_PCI_WRITE)
104136
newtce |= TCE_PCI_READ;
105137

106-
oldtce = be64_to_cpu(xchg(pnv_tce(tbl, false, idx),
107-
cpu_to_be64(newtce)));
138+
oldtce = be64_to_cpu(xchg(ptce, cpu_to_be64(newtce)));
108139
*hpa = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
109140
*direction = iommu_tce_direction(oldtce);
110141

111142
return 0;
112143
}
113144

114-
__be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index)
145+
__be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index, bool alloc)
115146
{
116147
if (WARN_ON_ONCE(!tbl->it_userspace))
117148
return NULL;
118149

119-
return pnv_tce(tbl, true, index - tbl->it_offset);
150+
return pnv_tce(tbl, true, index - tbl->it_offset, alloc);
120151
}
121152
#endif
122153

@@ -126,14 +157,19 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
126157

127158
for (i = 0; i < npages; i++) {
128159
unsigned long idx = index - tbl->it_offset + i;
160+
__be64 *ptce = pnv_tce(tbl, false, idx, false);
129161

130-
*(pnv_tce(tbl, false, idx)) = cpu_to_be64(0);
162+
if (ptce)
163+
*ptce = cpu_to_be64(0);
131164
}
132165
}
133166

134167
unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
135168
{
136-
__be64 *ptce = pnv_tce(tbl, false, index - tbl->it_offset);
169+
__be64 *ptce = pnv_tce(tbl, false, index - tbl->it_offset, false);
170+
171+
if (!ptce)
172+
return 0;
137173

138174
return be64_to_cpu(*ptce);
139175
}
@@ -224,13 +260,17 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
224260
unsigned int table_shift = max_t(unsigned int, entries_shift + 3,
225261
PAGE_SHIFT);
226262
const unsigned long tce_table_size = 1UL << table_shift;
263+
unsigned int tmplevels = levels;
227264

228265
if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
229266
return -EINVAL;
230267

231268
if (!is_power_of_2(window_size))
232269
return -EINVAL;
233270

271+
if (alloc_userspace_copy && (window_size > (1ULL << 32)))
272+
tmplevels = 1;
273+
234274
/* Adjust direct table size from window_size and levels */
235275
entries_shift = (entries_shift + levels - 1) / levels;
236276
level_shift = entries_shift + 3;
@@ -241,7 +281,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
241281

242282
/* Allocate TCE table */
243283
addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
244-
levels, tce_table_size, &offset, &total_allocated);
284+
tmplevels, tce_table_size, &offset, &total_allocated);
245285

246286
/* addr==NULL means that the first level allocation failed */
247287
if (!addr)
@@ -252,7 +292,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
252292
* we did not allocate as much as we wanted,
253293
* release partially allocated table.
254294
*/
255-
if (offset < tce_table_size)
295+
if (tmplevels == levels && offset < tce_table_size)
256296
goto free_tces_exit;
257297

258298
/* Allocate userspace view of the TCE table */
@@ -263,8 +303,8 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
263303
&total_allocated_uas);
264304
if (!uas)
265305
goto free_tces_exit;
266-
if (offset < tce_table_size ||
267-
total_allocated_uas != total_allocated)
306+
if (tmplevels == levels && (offset < tce_table_size ||
307+
total_allocated_uas != total_allocated))
268308
goto free_uas_exit;
269309
}
270310

@@ -275,10 +315,11 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
275315
tbl->it_indirect_levels = levels - 1;
276316
tbl->it_allocated_size = total_allocated;
277317
tbl->it_userspace = uas;
318+
tbl->it_nid = nid;
278319

279-
pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d\n",
320+
pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n",
280321
window_size, tce_table_size, bus_offset, tbl->it_base,
281-
tbl->it_userspace, levels);
322+
tbl->it_userspace, tmplevels, levels);
282323

283324
return 0;
284325

arch/powerpc/platforms/powernv/pci-ioda.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2003,7 +2003,7 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
20032003
static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
20042004
unsigned long *hpa, enum dma_data_direction *direction)
20052005
{
2006-
long ret = pnv_tce_xchg(tbl, index, hpa, direction);
2006+
long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
20072007

20082008
if (!ret)
20092009
pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false);
@@ -2014,7 +2014,7 @@ static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
20142014
static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
20152015
unsigned long *hpa, enum dma_data_direction *direction)
20162016
{
2017-
long ret = pnv_tce_xchg(tbl, index, hpa, direction);
2017+
long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
20182018

20192019
if (!ret)
20202020
pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
@@ -2168,7 +2168,7 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
21682168
static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
21692169
unsigned long *hpa, enum dma_data_direction *direction)
21702170
{
2171-
long ret = pnv_tce_xchg(tbl, index, hpa, direction);
2171+
long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
21722172

21732173
if (!ret)
21742174
pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
@@ -2179,7 +2179,7 @@ static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
21792179
static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
21802180
unsigned long *hpa, enum dma_data_direction *direction)
21812181
{
2182-
long ret = pnv_tce_xchg(tbl, index, hpa, direction);
2182+
long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
21832183

21842184
if (!ret)
21852185
pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);

arch/powerpc/platforms/powernv/pci.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -266,8 +266,10 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
266266
unsigned long attrs);
267267
extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
268268
extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
269-
unsigned long *hpa, enum dma_data_direction *direction);
270-
extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index);
269+
unsigned long *hpa, enum dma_data_direction *direction,
270+
bool alloc);
271+
extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index,
272+
bool alloc);
271273
extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
272274

273275
extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,

drivers/vfio/vfio_iommu_spapr_tce.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -631,7 +631,7 @@ static long tce_iommu_create_table(struct tce_container *container,
631631
page_shift, window_size, levels, ptbl);
632632

633633
WARN_ON(!ret && !(*ptbl)->it_ops->free);
634-
WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));
634+
WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));
635635

636636
return ret;
637637
}

0 commit comments

Comments
 (0)