Commit b4cfe44

haggaie authored and rolandd committed
IB/mlx5: Implement on demand paging by adding support for MMU notifiers
* Implement the relevant invalidation functions (zap MTTs as needed).
* Implement interlocking (and rollback in the page fault handlers) for
  cases of a racing notifier and fault.
* With this patch we can now enable the capability bits for supporting RC
  send/receive/RDMA read/RDMA write, and UD send.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
1 parent eab668a commit b4cfe44
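The interlocking described in the second bullet is the standard MMU-notifier sequence-count protocol. Before the diffs, here is a minimal C sketch of its shape; every identifier in it (handle_fault_sketch, map_user_pages, write_hw_mtts, notifier_retry, struct odp_mr) is an illustrative placeholder, not a driver symbol. The real logic is in the pagefault_single_data_segment() hunks of odp.c below.

/*
 * Illustrative-only sketch of the fault/invalidation interlock; all
 * identifiers are placeholders. See pagefault_single_data_segment()
 * in odp.c below for the driver's real implementation.
 */
static int handle_fault_sketch(struct odp_mr *mr, u64 idx, int npages)
{
        unsigned long seq;
        int ret;

        /* Snapshot the notifier sequence count before faulting pages in. */
        seq = ACCESS_ONCE(mr->odp->notifiers_seq);
        smp_rmb();

        ret = map_user_pages(mr, idx, npages);  /* may sleep */
        if (ret < 0)
                return ret;

        mutex_lock(&mr->odp->umem_mutex);
        if (notifier_retry(mr->odp, seq)) {
                /* An invalidation ran (or is running) since seq was
                 * sampled; roll back and let the caller retry. */
                ret = -EAGAIN;
        } else {
                ret = write_hw_mtts(mr, idx, npages);   /* publish */
        }
        mutex_unlock(&mr->odp->umem_mutex);
        return ret;
}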

File tree

4 files changed (+198, -16 lines)

drivers/infiniband/hw/mlx5/main.c

Lines changed: 4 additions & 0 deletions
@@ -574,6 +574,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 		goto out_count;
 	}
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
+#endif
+
 	INIT_LIST_HEAD(&context->db_page_list);
 	mutex_init(&context->db_page_mutex);

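For context on what registering this callback buys: the companion IB/core ODP patches keep ODP umems in a per-context interval tree, and their MMU-notifier hooks dispatch each invalidated range to whichever invalidate_range callback the hardware driver installed. A simplified sketch of that caller side (paraphrased from the companion series, not part of this commit):

/* Simplified sketch of the caller side, from the companion IB/core ODP
 * series (not part of this commit): the MMU-notifier hook walks the
 * interval tree of ODP umems overlapping the invalidated VA range and
 * invokes the driver's registered callback for each. */
static int invalidate_range_trampoline(struct ib_umem *umem,
				       u64 start, u64 end, void *cookie)
{
	umem->context->invalidate_range(umem, start, end);
	return 0;
}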
drivers/infiniband/hw/mlx5/mlx5_ib.h

Lines changed: 3 additions & 0 deletions
@@ -325,6 +325,7 @@ struct mlx5_ib_mr {
 	struct mlx5_ib_dev     *dev;
 	struct mlx5_create_mkey_mbox_out out;
 	struct mlx5_core_sig_ctx    *sig;
+	int			live;
 };
 
 struct mlx5_ib_fast_reg_page_list {
@@ -629,6 +630,8 @@ int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
 void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp);
 void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp);
+void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
+			      unsigned long end);
 
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)

drivers/infiniband/hw/mlx5/mr.c

Lines changed: 74 additions & 5 deletions
@@ -37,6 +37,7 @@
 #include <linux/export.h>
 #include <linux/delay.h>
 #include <rdma/ib_umem.h>
+#include <rdma/ib_umem_odp.h>
 #include <rdma/ib_verbs.h>
 #include "mlx5_ib.h"
 
@@ -54,6 +55,18 @@ static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
 
 static int clean_mr(struct mlx5_ib_mr *mr);
 
+static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+{
+	int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	/* Wait until all page fault handlers using the mr complete. */
+	synchronize_srcu(&dev->mr_srcu);
+#endif
+
+	return err;
+}
+
 static int order2idx(struct mlx5_ib_dev *dev, int order)
 {
 	struct mlx5_mr_cache *cache = &dev->cache;
@@ -191,7 +204,7 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
 		ent->cur--;
 		ent->size--;
 		spin_unlock_irq(&ent->lock);
-		err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+		err = destroy_mkey(dev, mr);
 		if (err)
 			mlx5_ib_warn(dev, "failed destroy mkey\n");
 		else
@@ -482,7 +495,7 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c)
 		ent->cur--;
 		ent->size--;
 		spin_unlock_irq(&ent->lock);
-		err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+		err = destroy_mkey(dev, mr);
 		if (err)
 			mlx5_ib_warn(dev, "failed destroy mkey\n");
 		else
@@ -812,6 +825,8 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
 	mr->mmr.size = len;
 	mr->mmr.pd = to_mpd(pd)->pdn;
 
+	mr->live = 1;
+
 unmap_dma:
 	up(&umrc->sem);
 	dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
@@ -997,6 +1012,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
 		goto err_2;
 	}
 	mr->umem = umem;
+	mr->live = 1;
 	kvfree(in);
 
 	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key);
@@ -1074,10 +1090,47 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	mr->ibmr.lkey = mr->mmr.key;
 	mr->ibmr.rkey = mr->mmr.key;
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (umem->odp_data) {
+		/*
+		 * This barrier prevents the compiler from moving the
+		 * setting of umem->odp_data->private to point to our
+		 * MR, before reg_umr finished, to ensure that the MR
+		 * initialization has finished before starting to
+		 * handle invalidations.
+		 */
+		smp_wmb();
+		mr->umem->odp_data->private = mr;
+		/*
+		 * Make sure we will see the new
+		 * umem->odp_data->private value in the invalidation
+		 * routines, before we can get page faults on the
+		 * MR. Page faults can happen once we put the MR in
+		 * the tree, below this line. Without the barrier,
+		 * there can be a fault handling and an invalidation
+		 * before umem->odp_data->private == mr is visible to
+		 * the invalidation handler.
+		 */
+		smp_wmb();
+	}
+#endif
+
 	return &mr->ibmr;
 
 error:
+	/*
+	 * Destroy the umem *before* destroying the MR, to ensure we
+	 * will not have any in-flight notifiers when destroying the
+	 * MR.
+	 *
+	 * As the MR is completely invalid to begin with, and this
+	 * error path is only taken if we can't push the mr entry into
+	 * the pagefault tree, this is safe.
+	 */
+
 	ib_umem_release(umem);
+	/* Kill the MR, and return an error code. */
+	clean_mr(mr);
 	return ERR_PTR(err);
 }
 
@@ -1121,7 +1174,7 @@ static int clean_mr(struct mlx5_ib_mr *mr)
 	int err;
 
 	if (!umred) {
-		err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+		err = destroy_mkey(dev, mr);
 		if (err) {
 			mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
 				     mr->mmr.key, err);
@@ -1150,9 +1203,25 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
 	struct ib_umem *umem = mr->umem;
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	if (umem)
+	if (umem && umem->odp_data) {
+		/* Prevent new page faults from succeeding */
+		mr->live = 0;
 		/* Wait for all running page-fault handlers to finish. */
 		synchronize_srcu(&dev->mr_srcu);
+		/* Destroy all page mappings */
+		mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
+					 ib_umem_end(umem));
+		/*
+		 * We kill the umem before the MR for ODP,
+		 * so that there will not be any invalidations in
+		 * flight, looking at the *mr struct.
+		 */
+		ib_umem_release(umem);
+		atomic_sub(npages, &dev->mdev->priv.reg_pages);
+
+		/* Avoid double-freeing the umem. */
+		umem = NULL;
+	}
 #endif
 
 	clean_mr(mr);
@@ -1269,7 +1338,7 @@ int mlx5_ib_destroy_mr(struct ib_mr *ibmr)
 		kfree(mr->sig);
 	}
 
-	err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+	err = destroy_mkey(dev, mr);
 	if (err) {
 		mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
 			     mr->mmr.key, err);
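
Taken together, the mr.c hunks implement a mark-dead-then-drain teardown. Condensed from the hunks above (no new code), the ordering is:

/* Fault-handler side, running under srcu_read_lock(&dev->mr_srcu): */
	if (!mmr || mmr->key != key || !mr->live)
		return NULL;    /* a dying MR no longer satisfies faults */

/* Dereg side: */
	mr->live = 0;                           /* 1. new faults fail */
	synchronize_srcu(&dev->mr_srcu);        /* 2. in-flight faults drain */
	mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
				 ib_umem_end(umem)); /* 3. zap HW mappings */
	ib_umem_release(umem);                  /* 4. umem dies before the MR,
						 *    so no notifier can still
						 *    dereference *mr */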

drivers/infiniband/hw/mlx5/odp.c

Lines changed: 117 additions & 11 deletions
@@ -37,8 +37,78 @@
 
 #define MAX_PREFETCH_LEN (4*1024*1024U)
 
+/* Timeout in ms to wait for an active mmu notifier to complete when handling
+ * a pagefault. */
+#define MMU_NOTIFIER_TIMEOUT 1000
+
 struct workqueue_struct *mlx5_ib_page_fault_wq;
 
+void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
+			      unsigned long end)
+{
+	struct mlx5_ib_mr *mr;
+	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1;
+	u64 idx = 0, blk_start_idx = 0;
+	int in_block = 0;
+	u64 addr;
+
+	if (!umem || !umem->odp_data) {
+		pr_err("invalidation called on NULL umem or non-ODP umem\n");
+		return;
+	}
+
+	mr = umem->odp_data->private;
+
+	if (!mr || !mr->ibmr.pd)
+		return;
+
+	start = max_t(u64, ib_umem_start(umem), start);
+	end = min_t(u64, ib_umem_end(umem), end);
+
+	/*
+	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
+	 * while we are doing the invalidation, no page fault will attempt to
+	 * overwrite the same MTTs. Concurrent invalidations might race us,
+	 * but they will write 0s as well, so no difference in the end result.
+	 */
+
+	for (addr = start; addr < end; addr += (u64)umem->page_size) {
+		idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
+		/*
+		 * Strive to write the MTTs in chunks, but avoid overwriting
+		 * non-existing MTTs. The heuristic here can be improved to
+		 * estimate the cost of another UMR vs. the cost of bigger
+		 * UMR.
+		 */
+		if (umem->odp_data->dma_list[idx] &
+		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+			if (!in_block) {
+				blk_start_idx = idx;
+				in_block = 1;
+			}
+		} else {
+			u64 umr_offset = idx & umr_block_mask;
+
+			if (in_block && umr_offset == 0) {
+				mlx5_ib_update_mtt(mr, blk_start_idx,
+						   idx - blk_start_idx, 1);
+				in_block = 0;
+			}
+		}
+	}
+	if (in_block)
+		mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1,
+				   1);
+
+	/*
+	 * We are now sure that the device will not access the
+	 * memory. We can safely unmap it, and mark it as dirty if
+	 * needed.
+	 */
+
+	ib_umem_odp_unmap_dma_pages(umem, start, end);
+}
+
 #define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do {	\
 	if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name)	\
 		ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name;	\
@@ -59,9 +129,18 @@ int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
 	if (err)
 		goto out;
 
-	/* At this point we would copy the capability bits that the driver
-	 * supports from the hw_caps struct to the caps struct. However, no
-	 * such capabilities are supported so far. */
+	caps->general_caps = IB_ODP_SUPPORT;
+	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps,
+			       SEND);
+	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+			       SEND);
+	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+			       RECV);
+	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+			       WRITE);
+	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+			       READ);
+
 out:
 	return err;
 }
@@ -71,8 +150,9 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
 {
 	u32 base_key = mlx5_base_mkey(key);
 	struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key);
+	struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr);
 
-	if (!mmr || mmr->key != key)
+	if (!mmr || mmr->key != key || !mr->live)
 		return NULL;
 
 	return container_of(mmr, struct mlx5_ib_mr, mmr);
@@ -143,6 +223,11 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
 	}
 
 	current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
+	/*
+	 * Ensure the sequence number is valid for some time before we call
+	 * gup.
+	 */
+	smp_rmb();
 
 	/*
 	 * Avoid branches - this code will perform correctly
@@ -165,15 +250,20 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
 
 	if (npages > 0) {
 		mutex_lock(&mr->umem->odp_data->umem_mutex);
-		/*
-		 * No need to check whether the MTTs really belong to
-		 * this MR, since ib_umem_odp_map_dma_pages already
-		 * checks this.
-		 */
-		ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
+		if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
+			/*
+			 * No need to check whether the MTTs really belong to
+			 * this MR, since ib_umem_odp_map_dma_pages already
+			 * checks this.
+			 */
+			ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
+		} else {
+			ret = -EAGAIN;
+		}
 		mutex_unlock(&mr->umem->odp_data->umem_mutex);
 		if (ret < 0) {
-			pr_err("Failed to update mkey page tables\n");
+			if (ret != -EAGAIN)
+				pr_err("Failed to update mkey page tables\n");
 			goto srcu_unlock;
 		}
 
@@ -185,6 +275,22 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
 	}
 
 srcu_unlock:
+	if (ret == -EAGAIN) {
+		if (!mr->umem->odp_data->dying) {
+			struct ib_umem_odp *odp_data = mr->umem->odp_data;
+			unsigned long timeout =
+				msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
+
+			if (!wait_for_completion_timeout(
+					&odp_data->notifier_completion,
+					timeout)) {
+				pr_warn("timeout waiting for mmu notifier completion\n");
+			}
+		} else {
+			/* The MR is being killed, kill the QP as well. */
+			ret = -EFAULT;
+		}
+	}
 	srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
 	pfault->mpfault.bytes_committed = 0;
 	return ret ? ret : npages;
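
To make the chunking heuristic in mlx5_ib_invalidate_range() concrete, here is a standalone userspace re-creation of the loop. It assumes MLX5_UMR_MTT_ALIGNMENT is 0x40 bytes (its value elsewhere in this series), so one UMR block covers 8 u64 MTT entries and umr_block_mask is 7:

#include <stdio.h>
#include <stdint.h>

/* One UMR block covers MLX5_UMR_MTT_ALIGNMENT / sizeof(u64) MTT entries;
 * assuming the alignment is 0x40 bytes, the mask below is 7. */
#define UMR_BLOCK_MASK ((uint64_t)(0x40 / sizeof(uint64_t)) - 1)

static void zap_mtts(uint64_t start, uint64_t n)
{
	printf("zap MTT entries [%llu, %llu)\n",
	       (unsigned long long)start, (unsigned long long)(start + n));
}

int main(void)
{
	/* 1 = entry currently mapped, 0 = hole (no MTT to overwrite). */
	int mapped[16] = { 1, 1, 1, 1, 0, 1, 1, 1,
			   0, 1, 1, 1, 1, 1, 1, 1 };
	uint64_t idx, blk_start_idx = 0;
	int in_block = 0;

	for (idx = 0; idx < 16; idx++) {
		if (mapped[idx]) {
			if (!in_block) {
				blk_start_idx = idx;
				in_block = 1;
			}
		} else if (in_block && (idx & UMR_BLOCK_MASK) == 0) {
			/* A hole on a UMR-block boundary ends the chunk;
			 * holes mid-block stay in the pending range, since
			 * writing zeros over a missing MTT is harmless. */
			zap_mtts(blk_start_idx, idx - blk_start_idx);
			in_block = 0;
		}
	}
	if (in_block)	/* the driver uses "+ 1" here because its idx points
			 * at the last entry, not one past it */
		zap_mtts(blk_start_idx, idx - blk_start_idx);
	return 0;
}

Compiled and run, this prints "zap MTT entries [0, 8)" and "zap MTT entries [9, 16)": the hole at entry 4 stays inside the first chunk, while the hole at the block boundary (entry 8) ends it.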
