#define MAX_PREFETCH_LEN (4*1024*1024U)

+ /* Timeout in ms to wait for an active mmu notifier to complete when handling
+  * a pagefault. */
+ #define MMU_NOTIFIER_TIMEOUT 1000
+
struct workqueue_struct *mlx5_ib_page_fault_wq;

+ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
+                               unsigned long end)
+ {
+         struct mlx5_ib_mr *mr;
+         const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1;
+         u64 idx = 0, blk_start_idx = 0;
+         int in_block = 0;
+         u64 addr;
+
+         if (!umem || !umem->odp_data) {
+                 pr_err("invalidation called on NULL umem or non-ODP umem\n");
+                 return;
+         }
+
+         mr = umem->odp_data->private;
+
+         if (!mr || !mr->ibmr.pd)
+                 return;
+
+         start = max_t(u64, ib_umem_start(umem), start);
+         end = min_t(u64, ib_umem_end(umem), end);
+
+         /*
+          * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
+          * while we are doing the invalidation, no page fault will attempt to
+          * overwrite the same MTTs. Concurrent invalidations might race us,
+          * but they will write 0s as well, so no difference in the end result.
+          */
+
+         for (addr = start; addr < end; addr += (u64)umem->page_size) {
+                 idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
+                 /*
+                  * Strive to write the MTTs in chunks, but avoid overwriting
+                  * non-existing MTTs. The heuristic here can be improved to
+                  * estimate the cost of another UMR vs. the cost of a bigger
+                  * UMR.
+                  */
+                 if (umem->odp_data->dma_list[idx] &
+                     (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+                         if (!in_block) {
+                                 blk_start_idx = idx;
+                                 in_block = 1;
+                         }
+                 } else {
+                         u64 umr_offset = idx & umr_block_mask;
+
+                         if (in_block && umr_offset == 0) {
+                                 mlx5_ib_update_mtt(mr, blk_start_idx,
+                                                    idx - blk_start_idx, 1);
+                                 in_block = 0;
+                         }
+                 }
+         }
+         if (in_block)
+                 mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1,
+                                    1);
+
+         /*
+          * We are now sure that the device will not access the
+          * memory. We can safely unmap it, and mark it as dirty if
+          * needed.
+          */
+
+         ib_umem_odp_unmap_dma_pages(umem, start, end);
+ }
+
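The zap loop in mlx5_ib_invalidate_range() batches its MTT writes: it opens a chunk at the first still-mapped index, keeps it open across unmapped entries that do not sit on a UMR-block boundary, and flushes the chunk on a boundary or at the end of the range. Below is a minimal userspace sketch of just that grouping logic, with a hypothetical flush_block() standing in for mlx5_ib_update_mtt() and an assumed 8 entries per UMR block; it is an illustration of the pattern, not driver code.

#include <stdio.h>

/* Toy model: 8 MTT entries per UMR block; mapped[i] != 0 means page i is
 * currently present in the device page table. */
#define ENTRIES_PER_BLOCK 8

/* Hypothetical stand-in for mlx5_ib_update_mtt(mr, start, count, 1). */
static void flush_block(unsigned long start, unsigned long count)
{
        printf("zap MTT entries [%lu, %lu)\n", start, start + count);
}

static void zap_range(const int *mapped, unsigned long nentries)
{
        const unsigned long block_mask = ENTRIES_PER_BLOCK - 1;
        unsigned long idx, blk_start_idx = 0;
        int in_block = 0;

        for (idx = 0; idx < nentries; idx++) {
                if (mapped[idx]) {
                        if (!in_block) {
                                blk_start_idx = idx;   /* open a new chunk */
                                in_block = 1;
                        }
                } else if (in_block && (idx & block_mask) == 0) {
                        /* Close the chunk only on a UMR block boundary, so
                         * each UMR stays aligned, at the cost of zeroing a
                         * few already-unmapped entries in between. */
                        flush_block(blk_start_idx, idx - blk_start_idx);
                        in_block = 0;
                }
        }
        if (in_block)   /* flush the tail chunk; idx == nentries here */
                flush_block(blk_start_idx, idx - blk_start_idx);
}

int main(void)
{
        int mapped[24] = { [2] = 1, [3] = 1, [4] = 1, [9] = 1, [17] = 1 };

        zap_range(mapped, 24);
        return 0;
}
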
#define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do {       \
        if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name)        \
                ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name;             \
@@ -59,9 +129,18 @@ int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
        if (err)
                goto out;

-         /* At this point we would copy the capability bits that the driver
-          * supports from the hw_caps struct to the caps struct. However, no
-          * such capabilities are supported so far. */
+         caps->general_caps = IB_ODP_SUPPORT;
+         COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps,
+                                SEND);
+         COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+                                SEND);
+         COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+                                RECV);
+         COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+                                WRITE);
+         COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+                                READ);
+
out:
        return err;
}
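Each COPY_ODP_BIT_MLX_TO_IB() line above is a plain test-and-set of one capability bit via the macro shown earlier. Assuming the definition is closed with the usual } while (0), the RC SEND invocation expands to roughly the following (an approximate expansion, not code from the patch):

/* Approximate expansion of:
 *   COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, SEND);
 */
do {
        if (be32_to_cpu(hw_caps.per_transport_caps.rc_odp_caps) &
            MLX5_ODP_SUPPORT_SEND)
                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
} while (0);
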
@@ -71,8 +150,9 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
{
        u32 base_key = mlx5_base_mkey(key);
        struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key);
+         struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr);

-         if (!mmr || mmr->key != key)
+         if (!mmr || mmr->key != key || !mr->live)
                return NULL;

        return container_of(mmr, struct mlx5_ib_mr, mmr);
@@ -143,6 +223,11 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
        }

        current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
+         /*
+          * Ensure the sequence number is valid for some time before we call
+          * gup.
+          */
+         smp_rmb();

        /*
         * Avoid branches - this code will perform correctly
@@ -165,15 +250,20 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,

        if (npages > 0) {
                mutex_lock(&mr->umem->odp_data->umem_mutex);
-                 /*
-                  * No need to check whether the MTTs really belong to
-                  * this MR, since ib_umem_odp_map_dma_pages already
-                  * checks this.
-                  */
-                 ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
+                 if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
+                         /*
+                          * No need to check whether the MTTs really belong to
+                          * this MR, since ib_umem_odp_map_dma_pages already
+                          * checks this.
+                          */
+                         ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
+                 } else {
+                         ret = -EAGAIN;
+                 }
                mutex_unlock(&mr->umem->odp_data->umem_mutex);
                if (ret < 0) {
-                         pr_err("Failed to update mkey page tables\n");
+                         if (ret != -EAGAIN)
+                                 pr_err("Failed to update mkey page tables\n");
                        goto srcu_unlock;
                }

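Taken together, the smp_rmb() hunk and the retry check above implement the usual mmu-notifier race protocol: sample notifiers_seq before pinning pages, then, under umem_mutex, program the MTTs only if no invalidation is running or has completed since the sample, and back off with -EAGAIN otherwise. Here is a self-contained userspace sketch of that protocol; the struct fields and notifier_retry() are illustrative stand-ins for the ib_umem_odp bookkeeping and ib_umem_mmu_notifier_retry(), not the actual kernel definitions.

#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the ib_umem_odp bookkeeping: a sequence number
 * bumped by every invalidation and a count of invalidations in flight. */
struct odp_state {
        atomic_ulong notifiers_seq;
        atomic_int notifiers_count;
        pthread_mutex_t umem_mutex;
};

/* Mirrors the spirit of ib_umem_mmu_notifier_retry(): retry if an
 * invalidation is running now, or one finished after we sampled the seq. */
static bool notifier_retry(struct odp_state *s, unsigned long sampled_seq)
{
        return atomic_load(&s->notifiers_count) != 0 ||
               atomic_load(&s->notifiers_seq) != sampled_seq;
}

static int handle_fault(struct odp_state *s)
{
        unsigned long current_seq = atomic_load(&s->notifiers_seq);
        int ret = 0;

        atomic_thread_fence(memory_order_acquire);      /* cf. smp_rmb() */

        /* ... get_user_pages() / DMA map the faulted range here ... */

        pthread_mutex_lock(&s->umem_mutex);
        if (!notifier_retry(s, current_seq)) {
                /* Safe to program the device MTTs for the new pages. */
        } else {
                ret = -EAGAIN;  /* an invalidation raced us; back off */
        }
        pthread_mutex_unlock(&s->umem_mutex);
        return ret;
}

int main(void)
{
        struct odp_state s = {
                .notifiers_seq = 0,
                .notifiers_count = 0,
                .umem_mutex = PTHREAD_MUTEX_INITIALIZER,
        };

        printf("page fault: %s\n", handle_fault(&s) ? "-EAGAIN" : "ok");
        return 0;
}
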
@@ -185,6 +275,22 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
        }

 srcu_unlock:
+         if (ret == -EAGAIN) {
+                 if (!mr->umem->odp_data->dying) {
+                         struct ib_umem_odp *odp_data = mr->umem->odp_data;
+                         unsigned long timeout =
+                                 msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
+
+                         if (!wait_for_completion_timeout(
+                                         &odp_data->notifier_completion,
+                                         timeout)) {
+                                 pr_warn("timeout waiting for mmu notifier completion\n");
+                         }
+                 } else {
+                         /* The MR is being killed, kill the QP as well. */
+                         ret = -EFAULT;
+                 }
+         }
        srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
        pfault->mpfault.bytes_committed = 0;
        return ret ? ret : npages;
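The -EAGAIN branch added above deliberately does not spin: while the MR is still alive it sleeps on the umem's notifier_completion for at most MMU_NOTIFIER_TIMEOUT (1 s), so the page fault is retried only after the racing invalidation has signalled completion; if the MR is being torn down it escalates to -EFAULT so the QP is killed. A sketch, in isolation, of the completion handshake the code relies on; the function names below are illustrative, not part of the patch.

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/printk.h>

/* Invalidation side: wake everyone waiting once the range has been zapped
 * and unmapped. In the patch this runs from the mmu notifier path. */
static void invalidation_done(struct completion *notifier_completion)
{
        complete_all(notifier_completion);
}

/* Fault side: block until the invalidation signals, but never for more than
 * timeout_ms, mirroring the MMU_NOTIFIER_TIMEOUT handling above. */
static int wait_for_invalidation(struct completion *notifier_completion,
                                 unsigned int timeout_ms)
{
        unsigned long left;

        left = wait_for_completion_timeout(notifier_completion,
                                           msecs_to_jiffies(timeout_ms));
        if (!left) {
                pr_warn("timeout waiting for mmu notifier completion\n");
                return -ETIMEDOUT;
        }
        return 0;
}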