Commit 813e90b

monis410 authored and jgunthorpe committed
IB/mlx5: Add advise_mr() support
The verb advise_mr() is used to give advice to the kernel about an address range that belongs to an MR. Implement the verb and register it on the device. The current implementation supports the only known advice to date, prefetch.

Signed-off-by: Moni Shoua <monis@mellanox.com>
Reviewed-by: Guy Levi <guyle@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
1 parent ad8a449 commit 813e90b

4 files changed, 157 insertions(+), 9 deletions(-)
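For context, the verb added here surfaces in userspace through the matching libibverbs call. Below is a minimal, hypothetical consumer sketch, assuming the rdma-core counterparts ibv_advise_mr(), IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE and IBV_ADVISE_MR_FLAG_FLUSH that pair with this kernel interface; device, PD and MR setup are elided:

/* Hypothetical userspace sketch (not part of this commit): prefetch the
 * pages backing an ODP MR before posting work requests that touch them.
 */
#include <stdint.h>
#include <infiniband/verbs.h>

static int prefetch_for_write(struct ibv_pd *pd, struct ibv_mr *mr,
			      void *buf, size_t len)
{
	struct ibv_sge sge = {
		.addr   = (uintptr_t)buf,
		.length = (uint32_t)len,
		.lkey   = mr->lkey,
	};

	/* IBV_ADVISE_MR_FLAG_FLUSH asks for the prefetch to complete
	 * before the call returns instead of being queued.
	 */
	return ibv_advise_mr(pd, IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE,
			     IBV_ADVISE_MR_FLAG_FLUSH, &sge, 1);
}

Without the flush flag the request may be serviced asynchronously, which is the deferred work-item path this commit implements in odp.c.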

drivers/infiniband/hw/mlx5/main.c
Lines changed: 8 additions & 0 deletions

@@ -5712,6 +5712,8 @@ void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
 	mlx5_ib_cleanup_multiport_master(dev);
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 	cleanup_srcu_struct(&dev->mr_srcu);
+	drain_workqueue(dev->advise_mr_wq);
+	destroy_workqueue(dev->advise_mr_wq);
 #endif
 	kfree(dev->port);
 }
@@ -5766,6 +5768,12 @@ int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
 	dev->memic.dev = mdev;
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	dev->advise_mr_wq = alloc_ordered_workqueue("mlx5_ib_advise_mr_wq", 0);
+	if (!dev->advise_mr_wq) {
+		err = -ENOMEM;
+		goto err_free_port;
+	}
+
 	err = init_srcu_struct(&dev->mr_srcu);
 	if (err)
 		goto err_free_port;
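The init/cleanup pairing above follows the standard kernel workqueue lifecycle: allocate an ordered queue once at init, then drain it before destroying it so no queued item can run against freed state. A self-contained module sketch of that pattern (illustrative only, not from this commit):

// Minimal sketch of the ordered-workqueue lifecycle (hypothetical module).
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;

static void demo_fn(struct work_struct *work)
{
	pr_info("demo work ran\n");
}
static DECLARE_WORK(demo_work, demo_fn);

static int __init demo_init(void)
{
	/* Ordered: at most one work item executes at a time, in order. */
	demo_wq = alloc_ordered_workqueue("demo_wq", 0);
	if (!demo_wq)
		return -ENOMEM;
	queue_work(demo_wq, &demo_work);
	return 0;
}

static void __exit demo_exit(void)
{
	/* Wait for pending items to finish, then free the queue. */
	drain_workqueue(demo_wq);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");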

drivers/infiniband/hw/mlx5/mlx5_ib.h
Lines changed: 18 additions & 0 deletions

@@ -923,6 +923,7 @@ struct mlx5_ib_dev {
 	 */
 	struct srcu_struct mr_srcu;
 	u32 null_mkey;
+	struct workqueue_struct *advise_mr_wq;
 #endif
 	struct mlx5_ib_flow_db *flow_db;
 	/* protect resources needed as part of reset flow */
@@ -1085,6 +1086,12 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int access_flags,
 				  struct ib_udata *udata);
+int mlx5_ib_advise_mr(struct ib_pd *pd,
+		      enum ib_uverbs_advise_mr_advice advice,
+		      u32 flags,
+		      struct ib_sge *sg_list,
+		      u32 num_sge,
+		      struct uverbs_attr_bundle *attrs);
 struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
 			       struct ib_udata *udata);
 int mlx5_ib_dealloc_mw(struct ib_mw *mw);
@@ -1182,6 +1189,10 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
 void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
 			   size_t nentries, struct mlx5_ib_mr *mr, int flags);
+
+int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+			       enum ib_uverbs_advise_mr_advice advice,
+			       u32 flags, struct ib_sge *sg_list, u32 num_sge);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
@@ -1197,6 +1208,13 @@ static inline void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
 					  size_t nentries, struct mlx5_ib_mr *mr,
 					  int flags) {}
 
+static int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+				      enum ib_uverbs_advise_mr_advice advice,
+				      u32 flags, struct ib_sge *sg_list,
+				      u32 num_sge)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
 /* Needed for rep profile */
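The #else branch supplies a stub that keeps callers compiling when CONFIG_INFINIBAND_ON_DEMAND_PAGING is off, returning -EOPNOTSUPP instead; note that header stubs of this kind are conventionally marked static inline so every translation unit including the header does not warn about an unused function. A standalone, hypothetical illustration of the config-gated stub idiom (names invented for the example):

/* Hypothetical sketch of the config-gated stub idiom; compile with or
 * without -DCONFIG_FEATURE_FOO to select real code or the stub.
 */
#include <errno.h>
#include <stdio.h>

#ifdef CONFIG_FEATURE_FOO
int foo_prefetch(unsigned long addr, unsigned long len)
{
	/* The real implementation would live in a feature-gated .c file. */
	return 0;
}
#else
/* Stub keeps callers compiling when the feature is configured out. */
static inline int foo_prefetch(unsigned long addr, unsigned long len)
{
	return -EOPNOTSUPP;
}
#endif

int main(void)
{
	printf("foo_prefetch: %d\n", foo_prefetch(0x1000, 4096));
	return 0;
}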

drivers/infiniband/hw/mlx5/mr.c
Lines changed: 15 additions & 0 deletions

@@ -1280,6 +1280,21 @@ static struct ib_mr *mlx5_ib_get_memic_mr(struct ib_pd *pd, u64 memic_addr,
 	return ERR_PTR(err);
 }
 
+int mlx5_ib_advise_mr(struct ib_pd *pd,
+		      enum ib_uverbs_advise_mr_advice advice,
+		      u32 flags,
+		      struct ib_sge *sg_list,
+		      u32 num_sge,
+		      struct uverbs_attr_bundle *attrs)
+{
+	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
+	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE)
+		return -EOPNOTSUPP;
+
+	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
+					  sg_list, num_sge);
+}
+
 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
 				struct ib_dm_mr_attr *attr,
 				struct uverbs_attr_bundle *attrs)

drivers/infiniband/hw/mlx5/odp.c
Lines changed: 116 additions & 9 deletions

@@ -549,10 +549,15 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
 }
 
+#define MLX5_PF_FLAGS_PREFETCH  BIT(0)
+#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
 static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
-			u64 io_virt, size_t bcnt, u32 *bytes_mapped)
+			u64 io_virt, size_t bcnt, u32 *bytes_mapped,
+			u32 flags)
 {
 	struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
+	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
+	bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
 	u64 access_mask = ODP_READ_ALLOWED_BIT;
 	int npages = 0, page_shift, np;
 	u64 start_idx, page_mask;
@@ -579,7 +584,15 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
 	page_mask = ~(BIT(page_shift) - 1);
 	start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
 
-	if (mr->umem->writable)
+	if (prefetch && !downgrade && !mr->umem->writable) {
+		/* prefetch with write-access must
+		 * be supported by the MR
+		 */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (mr->umem->writable && !downgrade)
 		access_mask |= ODP_WRITE_ALLOWED_BIT;
 
 	current_seq = READ_ONCE(odp->notifiers_seq);
@@ -684,12 +697,13 @@ struct pf_frame {
  * -EFAULT when there's an error mapping the requested pages. The caller will
  * abort the page fault handling.
  */
-static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
-					 u32 key, u64 io_virt, size_t bcnt,
+static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, u32 key,
+					 u64 io_virt, size_t bcnt,
 					 u32 *bytes_committed,
-					 u32 *bytes_mapped)
+					 u32 *bytes_mapped, u32 flags)
 {
 	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
+	bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
 	struct pf_frame *head = NULL, *frame;
 	struct mlx5_core_mkey *mmkey;
 	struct mlx5_ib_mw *mw;
@@ -711,6 +725,12 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, u32 key,
 		goto srcu_unlock;
 	}
 
+	if (prefetch && mmkey->type != MLX5_MKEY_MR) {
+		mlx5_ib_dbg(dev, "prefetch is allowed only for MR\n");
+		ret = -EINVAL;
+		goto srcu_unlock;
+	}
+
 	switch (mmkey->type) {
 	case MLX5_MKEY_MR:
 		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
@@ -720,6 +740,11 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, u32 key,
 			goto srcu_unlock;
 		}
 
+		if (prefetch && !mr->umem->is_odp) {
+			ret = -EINVAL;
+			goto srcu_unlock;
+		}
+
 		if (!mr->umem->is_odp) {
 			mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
 				    key);
@@ -729,7 +754,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, u32 key,
 			goto srcu_unlock;
 		}
 
-		ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped);
+		ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped, flags);
 		if (ret < 0)
 			goto srcu_unlock;
 
@@ -906,7 +931,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev,
 
 		ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
 						    &pfault->bytes_committed,
-						    bytes_mapped);
+						    bytes_mapped, 0);
 		if (ret < 0)
 			break;
 		npages += ret;
@@ -1217,7 +1242,8 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
 	}
 
 	ret = pagefault_single_data_segment(dev, rkey, address, length,
-					    &pfault->bytes_committed, NULL);
+					    &pfault->bytes_committed, NULL,
+					    0);
 	if (ret == -EAGAIN) {
 		/* We're racing with an invalidation, don't prefetch */
 		prefetch_activated = 0;
@@ -1244,7 +1270,8 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
 
 		ret = pagefault_single_data_segment(dev, rkey, address,
 						    prefetch_len,
-						    &bytes_committed, NULL);
+						    &bytes_committed, NULL,
+						    0);
 		if (ret < 0 && ret != -EAGAIN) {
 			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
 				    ret, pfault->token, address, prefetch_len);
@@ -1493,10 +1520,17 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
 	}
 }
 
+static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
+	.advise_mr = mlx5_ib_advise_mr,
+};
+
 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
 {
 	int ret = 0;
 
+	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
+		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
+
 	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
 		ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
 		if (ret) {
@@ -1528,3 +1562,76 @@ int mlx5_ib_odp_init(void)
 
 	return 0;
 }
+
+struct prefetch_mr_work {
+	struct work_struct work;
+	struct mlx5_ib_dev *dev;
+	u32 pf_flags;
+	u32 num_sge;
+	struct ib_sge sg_list[0];
+};
+
+static int mlx5_ib_prefetch_sg_list(struct mlx5_ib_dev *dev, u32 pf_flags,
+				    struct ib_sge *sg_list, u32 num_sge)
+{
+	int i;
+
+	for (i = 0; i < num_sge; ++i) {
+		struct ib_sge *sg = &sg_list[i];
+		int bytes_committed = 0;
+		int ret;
+
+		ret = pagefault_single_data_segment(dev, sg->lkey, sg->addr,
+						    sg->length,
+						    &bytes_committed, NULL,
+						    pf_flags);
+		if (ret < 0)
+			return ret;
+	}
+	return 0;
+}
+
+static void mlx5_ib_prefetch_mr_work(struct work_struct *work)
+{
+	struct prefetch_mr_work *w =
+		container_of(work, struct prefetch_mr_work, work);
+
+	if (w->dev->ib_dev.reg_state == IB_DEV_REGISTERED)
+		mlx5_ib_prefetch_sg_list(w->dev, w->pf_flags, w->sg_list,
+					 w->num_sge);
+
+	kfree(w);
+}
+
+int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+			       enum ib_uverbs_advise_mr_advice advice,
+			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	u32 pf_flags = MLX5_PF_FLAGS_PREFETCH;
+	struct prefetch_mr_work *work;
+
+	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
+		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
+
+	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
+		return mlx5_ib_prefetch_sg_list(dev, pf_flags, sg_list,
+						num_sge);
+
+	if (dev->ib_dev.reg_state != IB_DEV_REGISTERED)
+		return -ENODEV;
+
+	work = kvzalloc(struct_size(work, sg_list, num_sge), GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge));
+
+	work->dev = dev;
+	work->pf_flags = pf_flags;
+	work->num_sge = num_sge;
+
+	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
+	schedule_work(&work->work);
+	return 0;
+}
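mlx5_ib_advise_mr_prefetch() defers non-flush requests to a work item that carries a private copy of the scatter list: the item is allocated in one shot with struct_size() over a trailing flexible array, and the handler recovers the enclosing struct from the embedded work_struct via container_of() before freeing it. A small userspace analog of that allocation/recovery pattern (struct_size() and container_of() are kernel macros, re-implemented minimally here; all names are invented for the example):

/* Illustrative userspace analog of the prefetch_mr_work pattern. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define struct_size(p, member, n) \
	(sizeof(*(p)) + (n) * sizeof((p)->member[0]))
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct sge { unsigned long addr, length; };

struct prefetch_work {
	int token;		/* stands in for struct work_struct */
	unsigned int num_sge;
	struct sge sg_list[];	/* flexible array member */
};

static void handler(int *token)
{
	/* Recover the enclosing struct from the embedded member. */
	struct prefetch_work *w =
		container_of(token, struct prefetch_work, token);

	printf("work %d: %u sges, first addr 0x%lx\n",
	       w->token, w->num_sge, w->sg_list[0].addr);
	free(w);
}

int main(void)
{
	struct sge sges[2] = { { 0x1000, 4096 }, { 0x2000, 4096 } };
	struct prefetch_work *w;

	/* One allocation covers the header plus the trailing array. */
	w = calloc(1, struct_size(w, sg_list, 2));
	if (!w)
		return 1;
	memcpy(w->sg_list, sges, sizeof(sges));
	w->token = 42;
	w->num_sge = 2;
	handler(&w->token);
	return 0;
}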
