@@ -549,10 +549,15 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
 }
 
+#define MLX5_PF_FLAGS_PREFETCH  BIT(0)
+#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
 static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
-			u64 io_virt, size_t bcnt, u32 *bytes_mapped)
+			u64 io_virt, size_t bcnt, u32 *bytes_mapped,
+			u32 flags)
 {
 	struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
+	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
+	bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
 	u64 access_mask = ODP_READ_ALLOWED_BIT;
 	int npages = 0, page_shift, np;
 	u64 start_idx, page_mask;
@@ -579,7 +584,15 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
 	page_mask = ~(BIT(page_shift) - 1);
 	start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
 
-	if (mr->umem->writable)
+	if (prefetch && !downgrade && !mr->umem->writable) {
+		/* prefetch with write-access must
+		 * be supported by the MR
+		 */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (mr->umem->writable && !downgrade)
 		access_mask |= ODP_WRITE_ALLOWED_BIT;
 
 	current_seq = READ_ONCE(odp->notifiers_seq);
@@ -684,12 +697,13 @@ struct pf_frame {
  * -EFAULT when there's an error mapping the requested pages. The caller will
  * abort the page fault handling.
  */
-static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
-					 u32 key, u64 io_virt, size_t bcnt,
+static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, u32 key,
+					 u64 io_virt, size_t bcnt,
 					 u32 *bytes_committed,
-					 u32 *bytes_mapped)
+					 u32 *bytes_mapped, u32 flags)
 {
 	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
+	bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
 	struct pf_frame *head = NULL, *frame;
 	struct mlx5_core_mkey *mmkey;
 	struct mlx5_ib_mw *mw;
@@ -711,6 +725,12 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 		goto srcu_unlock;
 	}
 
+	if (prefetch && mmkey->type != MLX5_MKEY_MR) {
+		mlx5_ib_dbg(dev, "prefetch is allowed only for MR\n");
+		ret = -EINVAL;
+		goto srcu_unlock;
+	}
+
 	switch (mmkey->type) {
 	case MLX5_MKEY_MR:
 		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
@@ -720,6 +740,11 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 			goto srcu_unlock;
 		}
 
+		if (prefetch && !mr->umem->is_odp) {
+			ret = -EINVAL;
+			goto srcu_unlock;
+		}
+
 		if (!mr->umem->is_odp) {
 			mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
 				    key);
@@ -729,7 +754,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 			goto srcu_unlock;
 		}
 
-		ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped);
+		ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped, flags);
 		if (ret < 0)
 			goto srcu_unlock;
 
@@ -906,7 +931,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev,
 
 		ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
 						    &pfault->bytes_committed,
-						    bytes_mapped);
+						    bytes_mapped, 0);
 		if (ret < 0)
 			break;
 		npages += ret;
@@ -1217,7 +1242,8 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
 	}
 
 	ret = pagefault_single_data_segment(dev, rkey, address, length,
-					    &pfault->bytes_committed, NULL);
+					    &pfault->bytes_committed, NULL,
+					    0);
 	if (ret == -EAGAIN) {
 		/* We're racing with an invalidation, don't prefetch */
 		prefetch_activated = 0;
@@ -1244,7 +1270,8 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
 
 		ret = pagefault_single_data_segment(dev, rkey, address,
 						    prefetch_len,
-						    &bytes_committed, NULL);
+						    &bytes_committed, NULL,
+						    0);
 		if (ret < 0 && ret != -EAGAIN) {
 			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
 				    ret, pfault->token, address, prefetch_len);
@@ -1493,10 +1520,17 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
 	}
 }
 
+static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
+	.advise_mr = mlx5_ib_advise_mr,
+};
+
 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
 {
 	int ret = 0;
 
+	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
+		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
+
 	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
 		ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
 		if (ret) {
@@ -1528,3 +1562,76 @@ int mlx5_ib_odp_init(void)
 
 	return 0;
 }
+
+struct prefetch_mr_work {
+	struct work_struct work;
+	struct mlx5_ib_dev *dev;
+	u32 pf_flags;
+	u32 num_sge;
+	struct ib_sge sg_list[0];
+};
+
+static int mlx5_ib_prefetch_sg_list(struct mlx5_ib_dev *dev, u32 pf_flags,
+				    struct ib_sge *sg_list, u32 num_sge)
+{
+	int i;
+
+	for (i = 0; i < num_sge; ++i) {
+		struct ib_sge *sg = &sg_list[i];
+		int bytes_committed = 0;
+		int ret;
+
+		ret = pagefault_single_data_segment(dev, sg->lkey, sg->addr,
+						    sg->length,
+						    &bytes_committed, NULL,
+						    pf_flags);
+		if (ret < 0)
+			return ret;
+	}
+	return 0;
+}
+
+static void mlx5_ib_prefetch_mr_work(struct work_struct *work)
+{
+	struct prefetch_mr_work *w =
+		container_of(work, struct prefetch_mr_work, work);
+
+	if (w->dev->ib_dev.reg_state == IB_DEV_REGISTERED)
+		mlx5_ib_prefetch_sg_list(w->dev, w->pf_flags, w->sg_list,
+					 w->num_sge);
+
+	kfree(w);
+}
+
+int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+			       enum ib_uverbs_advise_mr_advice advice,
+			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	u32 pf_flags = MLX5_PF_FLAGS_PREFETCH;
+	struct prefetch_mr_work *work;
+
+	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
+		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
+
+	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
+		return mlx5_ib_prefetch_sg_list(dev, pf_flags, sg_list,
+						num_sge);
+
+	if (dev->ib_dev.reg_state != IB_DEV_REGISTERED)
+		return -ENODEV;
+
+	work = kvzalloc(struct_size(work, sg_list, num_sge), GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge));
+
+	work->dev = dev;
+	work->pf_flags = pf_flags;
+	work->num_sge = num_sge;
+
+	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
+	schedule_work(&work->work);
+	return 0;
+}
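
For reference only (not part of the patch): the new mlx5_ib_advise_mr_prefetch() path is what rdma-core's ibv_advise_mr() verb ultimately reaches. Below is a minimal user-space sketch of requesting a synchronous read prefetch over one SGE; it assumes an already-registered ODP MR and protection domain, and the helper name and buffer handling are illustrative, not taken from this commit.

/* Hypothetical user-space helper (illustrative only): asks the kernel to
 * prefetch an ODP-registered range via ibv_advise_mr().  With
 * IBV_ADVISE_MR_FLAG_FLUSH the call runs synchronously, which corresponds
 * to the non-deferred branch of mlx5_ib_advise_mr_prefetch() above.
 */
#include <infiniband/verbs.h>
#include <stdint.h>
#include <stdio.h>

static int prefetch_odp_range(struct ibv_pd *pd, struct ibv_mr *mr,
			      void *addr, uint32_t length)
{
	struct ibv_sge sge = {
		.addr   = (uintptr_t)addr,
		.length = length,
		.lkey   = mr->lkey,
	};
	int ret;

	/* Read-only prefetch; the mlx5 driver maps this advice to
	 * MLX5_PF_FLAGS_PREFETCH | MLX5_PF_FLAGS_DOWNGRADE.
	 */
	ret = ibv_advise_mr(pd, IBV_ADVISE_MR_ADVICE_PREFETCH,
			    IBV_ADVISE_MR_FLAG_FLUSH, &sge, 1);
	if (ret)
		fprintf(stderr, "ibv_advise_mr failed: %d\n", ret);
	return ret;
}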