
Commit 93065ac

Michal Hocko authored and torvalds committed
mm, oom: distinguish blockable mode for mmu notifiers
There are several blockable mmu notifiers which might sleep in mmu_notifier_invalidate_range_start and that is a problem for the oom_reaper because it needs to guarantee a forward progress so it cannot depend on any sleepable locks.

Currently we simply back off and mark an oom victim with blockable mmu notifiers as done after a short sleep. That can result in selecting a new oom victim prematurely because the previous one still hasn't torn its memory down yet.

We can do much better though. Even if mmu notifiers use sleepable locks there is no reason to automatically assume those locks are held. Moreover majority of notifiers only care about a portion of the address space and there is absolutely zero reason to fail when we are unmapping an unrelated range. Many notifiers do really block and wait for HW which is harder to handle and we have to bail out though.

This patch handles the low hanging fruit. __mmu_notifier_invalidate_range_start gets a blockable flag and callbacks are not allowed to sleep if the flag is set to false. This is achieved by using trylock instead of the sleepable lock for most callbacks and continue as long as we do not block down the call chain.

I think we can improve that even further because there is a common pattern to do a range lookup first and then do something about that. The first part can be done without a sleeping lock in most cases AFAICS.

The oom_reaper end then simply retries if there is at least one notifier which couldn't make any progress in !blockable mode. A retry loop is already implemented to wait for the mmap_sem and this is basically the same thing.

The simplest way for driver developers to test this code path is to wrap userspace code which uses these notifiers into a memcg and set the hard limit to hit the oom. This can be done e.g. after the test faults in all the mmu notifier managed memory and set the hard limit to something really small. Then we are looking for a proper process tear down.

[akpm@linux-foundation.org: coding style fixes]
[akpm@linux-foundation.org: minor code simplification]
Link: http://lkml.kernel.org/r/20180716115058.5559-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Christian König <christian.koenig@amd.com> # AMD notifiers
Acked-by: Leon Romanovsky <leonro@mellanox.com> # mlx and umem_odp
Reported-by: David Rientjes <rientjes@google.com>
Cc: "David (ChunMing) Zhou" <David1.Zhou@amd.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: David Airlie <airlied@linux.ie>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Mike Marciniszyn <mike.marciniszyn@intel.com>
Cc: Dennis Dalessandro <dennis.dalessandro@intel.com>
Cc: Sudeep Dutt <sudeep.dutt@intel.com>
Cc: Ashutosh Dixit <ashutosh.dixit@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
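For illustration only — the following sketch is not part of this commit. It condenses the pattern the converted callbacks share, using hypothetical names (struct example_mn, example_invalidate_range_start): sleep on the driver lock when blockable, otherwise trylock and bail out with -EAGAIN so the oom_reaper can retry rather than block.

#include <linux/kernel.h>
#include <linux/mmu_notifier.h>
#include <linux/mutex.h>

/* Hypothetical driver state; the mutex guards its invalidation bookkeeping. */
struct example_mn {
	struct mmu_notifier mn;
	struct mutex lock;
};

static int example_invalidate_range_start(struct mmu_notifier *mn,
					  struct mm_struct *mm,
					  unsigned long start,
					  unsigned long end,
					  bool blockable)
{
	struct example_mn *emn = container_of(mn, struct example_mn, mn);

	if (blockable)
		mutex_lock(&emn->lock);	/* may sleep, as before this patch */
	else if (!mutex_trylock(&emn->lock))
		return -EAGAIN;		/* cannot sleep: ask the caller to retry */

	/* ... tear down mappings in [start, end) under emn->lock ... */

	mutex_unlock(&emn->lock);
	return 0;
}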
1 parent c2343d2 commit 93065ac

File tree

19 files changed, +223 -80 lines changed

arch/x86/kvm/x86.c

Lines changed: 5 additions & 2 deletions
@@ -7305,8 +7305,9 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
 }
 
-void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-		unsigned long start, unsigned long end)
+int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+		unsigned long start, unsigned long end,
+		bool blockable)
 {
 	unsigned long apic_address;
 
@@ -7317,6 +7318,8 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 	apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
 	if (start <= apic_address && apic_address < end)
 		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+
+	return 0;
 }
 
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)

drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c

Lines changed: 35 additions & 8 deletions
@@ -178,12 +178,18 @@ void amdgpu_mn_unlock(struct amdgpu_mn *mn)
  *
  * @amn: our notifier
  */
-static void amdgpu_mn_read_lock(struct amdgpu_mn *amn)
+static int amdgpu_mn_read_lock(struct amdgpu_mn *amn, bool blockable)
 {
-	mutex_lock(&amn->read_lock);
+	if (blockable)
+		mutex_lock(&amn->read_lock);
+	else if (!mutex_trylock(&amn->read_lock))
+		return -EAGAIN;
+
 	if (atomic_inc_return(&amn->recursion) == 1)
 		down_read_non_owner(&amn->lock);
 	mutex_unlock(&amn->read_lock);
+
+	return 0;
 }
 
 /**
@@ -239,28 +245,40 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node,
  * Block for operations on BOs to finish and mark pages as accessed and
  * potentially dirty.
  */
-static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
+static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
 						 struct mm_struct *mm,
 						 unsigned long start,
-						 unsigned long end)
+						 unsigned long end,
+						 bool blockable)
 {
 	struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn);
 	struct interval_tree_node *it;
 
 	/* notification is exclusive, but interval is inclusive */
 	end -= 1;
 
-	amdgpu_mn_read_lock(amn);
+	/* TODO we should be able to split locking for interval tree and
+	 * amdgpu_mn_invalidate_node
+	 */
+	if (amdgpu_mn_read_lock(amn, blockable))
+		return -EAGAIN;
 
 	it = interval_tree_iter_first(&amn->objects, start, end);
 	while (it) {
 		struct amdgpu_mn_node *node;
 
+		if (!blockable) {
+			amdgpu_mn_read_unlock(amn);
+			return -EAGAIN;
+		}
+
 		node = container_of(it, struct amdgpu_mn_node, it);
 		it = interval_tree_iter_next(it, start, end);
 
 		amdgpu_mn_invalidate_node(node, start, end);
 	}
+
+	return 0;
 }
 
 /**
@@ -275,24 +293,31 @@ static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
  * necessitates evicting all user-mode queues of the process. The BOs
  * are restorted in amdgpu_mn_invalidate_range_end_hsa.
  */
-static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
+static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
 						 struct mm_struct *mm,
 						 unsigned long start,
-						 unsigned long end)
+						 unsigned long end,
+						 bool blockable)
 {
 	struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn);
 	struct interval_tree_node *it;
 
 	/* notification is exclusive, but interval is inclusive */
 	end -= 1;
 
-	amdgpu_mn_read_lock(amn);
+	if (amdgpu_mn_read_lock(amn, blockable))
+		return -EAGAIN;
 
 	it = interval_tree_iter_first(&amn->objects, start, end);
 	while (it) {
 		struct amdgpu_mn_node *node;
 		struct amdgpu_bo *bo;
 
+		if (!blockable) {
+			amdgpu_mn_read_unlock(amn);
+			return -EAGAIN;
+		}
+
 		node = container_of(it, struct amdgpu_mn_node, it);
 		it = interval_tree_iter_next(it, start, end);
 
@@ -304,6 +329,8 @@ static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
 			amdgpu_amdkfd_evict_userptr(mem, mm);
 		}
 	}
+
+	return 0;
 }
 
 /**

drivers/gpu/drm/i915/i915_gem_userptr.c

Lines changed: 10 additions & 3 deletions
@@ -112,10 +112,11 @@ static void del_object(struct i915_mmu_object *mo)
 	mo->attached = false;
 }
 
-static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
+static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 						       struct mm_struct *mm,
 						       unsigned long start,
-						       unsigned long end)
+						       unsigned long end,
+						       bool blockable)
 {
 	struct i915_mmu_notifier *mn =
 		container_of(_mn, struct i915_mmu_notifier, mn);
@@ -124,14 +125,18 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 	LIST_HEAD(cancelled);
 
 	if (RB_EMPTY_ROOT(&mn->objects.rb_root))
-		return;
+		return 0;
 
 	/* interval ranges are inclusive, but invalidate range is exclusive */
 	end--;
 
 	spin_lock(&mn->lock);
 	it = interval_tree_iter_first(&mn->objects, start, end);
 	while (it) {
+		if (!blockable) {
+			spin_unlock(&mn->lock);
+			return -EAGAIN;
+		}
 		/* The mmu_object is released late when destroying the
 		 * GEM object so it is entirely possible to gain a
 		 * reference on an object in the process of being freed
@@ -154,6 +159,8 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 
 	if (!list_empty(&cancelled))
 		flush_workqueue(mn->wq);
+
+	return 0;
 }
 
 static const struct mmu_notifier_ops i915_gem_userptr_notifier = {

drivers/gpu/drm/radeon/radeon_mn.c

Lines changed: 19 additions & 3 deletions
@@ -118,26 +118,39 @@ static void radeon_mn_release(struct mmu_notifier *mn,
  * We block for all BOs between start and end to be idle and
  * unmap them by move them into system domain again.
  */
-static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
+static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 					     struct mm_struct *mm,
 					     unsigned long start,
-					     unsigned long end)
+					     unsigned long end,
+					     bool blockable)
 {
 	struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
 	struct ttm_operation_ctx ctx = { false, false };
 	struct interval_tree_node *it;
+	int ret = 0;
 
 	/* notification is exclusive, but interval is inclusive */
 	end -= 1;
 
-	mutex_lock(&rmn->lock);
+	/* TODO we should be able to split locking for interval tree and
+	 * the tear down.
+	 */
+	if (blockable)
+		mutex_lock(&rmn->lock);
+	else if (!mutex_trylock(&rmn->lock))
+		return -EAGAIN;
 
 	it = interval_tree_iter_first(&rmn->objects, start, end);
 	while (it) {
 		struct radeon_mn_node *node;
 		struct radeon_bo *bo;
 		long r;
 
+		if (!blockable) {
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
+
 		node = container_of(it, struct radeon_mn_node, it);
 		it = interval_tree_iter_next(it, start, end);
 
@@ -166,7 +179,10 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 		}
 	}
 
+out_unlock:
 	mutex_unlock(&rmn->lock);
+
+	return ret;
 }
 
 static const struct mmu_notifier_ops radeon_mn_ops = {

drivers/infiniband/core/umem_odp.c

Lines changed: 26 additions & 7 deletions
@@ -186,6 +186,7 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn,
 	rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
 				      ULLONG_MAX,
 				      ib_umem_notifier_release_trampoline,
+				      true,
 				      NULL);
 	up_read(&context->umem_rwsem);
 }
@@ -207,22 +208,31 @@ static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
 	return 0;
 }
 
-static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
+static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
 				    struct mm_struct *mm,
 				    unsigned long start,
-				    unsigned long end)
+				    unsigned long end,
+				    bool blockable)
 {
 	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+	int ret;
 
 	if (!context->invalidate_range)
-		return;
+		return 0;
+
+	if (blockable)
+		down_read(&context->umem_rwsem);
+	else if (!down_read_trylock(&context->umem_rwsem))
+		return -EAGAIN;
 
 	ib_ucontext_notifier_start_account(context);
-	down_read(&context->umem_rwsem);
-	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
+	ret = rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
 				      end,
-				      invalidate_range_start_trampoline, NULL);
+				      invalidate_range_start_trampoline,
+				      blockable, NULL);
 	up_read(&context->umem_rwsem);
+
+	return ret;
 }
 
 static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
@@ -242,10 +252,15 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
 	if (!context->invalidate_range)
 		return;
 
+	/*
+	 * TODO: we currently bail out if there is any sleepable work to be done
+	 * in ib_umem_notifier_invalidate_range_start so we shouldn't really block
+	 * here. But this is ugly and fragile.
+	 */
 	down_read(&context->umem_rwsem);
 	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
 				      end,
-				      invalidate_range_end_trampoline, NULL);
+				      invalidate_range_end_trampoline, true, NULL);
 	up_read(&context->umem_rwsem);
 	ib_ucontext_notifier_end_account(context);
 }
@@ -798,6 +813,7 @@ EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
 int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
 				  u64 start, u64 last,
 				  umem_call_back cb,
+				  bool blockable,
 				  void *cookie)
 {
 	int ret_val = 0;
@@ -809,6 +825,9 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
 
 	for (node = rbt_ib_umem_iter_first(root, start, last - 1);
 			node; node = next) {
+		/* TODO move the blockable decision up to the callback */
+		if (!blockable)
+			return -EAGAIN;
 		next = rbt_ib_umem_iter_next(node, start, last - 1);
 		umem = container_of(node, struct ib_umem_odp, interval_tree);
 		ret_val = cb(umem->umem, start, last, cookie) || ret_val;

drivers/infiniband/hw/hfi1/mmu_rb.c

Lines changed: 7 additions & 4 deletions
@@ -67,9 +67,9 @@ struct mmu_rb_handler {
 
 static unsigned long mmu_node_start(struct mmu_rb_node *);
 static unsigned long mmu_node_last(struct mmu_rb_node *);
-static void mmu_notifier_range_start(struct mmu_notifier *,
+static int mmu_notifier_range_start(struct mmu_notifier *,
 				     struct mm_struct *,
-				     unsigned long, unsigned long);
+				     unsigned long, unsigned long, bool);
 static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
 					   unsigned long, unsigned long);
 static void do_remove(struct mmu_rb_handler *handler,
@@ -284,10 +284,11 @@ void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
 	handler->ops->remove(handler->ops_arg, node);
 }
 
-static void mmu_notifier_range_start(struct mmu_notifier *mn,
+static int mmu_notifier_range_start(struct mmu_notifier *mn,
 				     struct mm_struct *mm,
 				     unsigned long start,
-				     unsigned long end)
+				     unsigned long end,
+				     bool blockable)
 {
 	struct mmu_rb_handler *handler =
 		container_of(mn, struct mmu_rb_handler, mn);
@@ -313,6 +314,8 @@ static void mmu_notifier_range_start(struct mmu_notifier *mn,
 
 	if (added)
 		queue_work(handler->wq, &handler->del_work);
+
+	return 0;
 }
 
 /*

drivers/infiniband/hw/mlx5/odp.c

Lines changed: 1 addition & 1 deletion
@@ -488,7 +488,7 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 
 	down_read(&ctx->umem_rwsem);
 	rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX,
-				      mr_leaf_free, imr);
+				      mr_leaf_free, true, imr);
 	up_read(&ctx->umem_rwsem);
 
 	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));

drivers/misc/mic/scif/scif_dma.c

Lines changed: 5 additions & 2 deletions
@@ -200,15 +200,18 @@ static void scif_mmu_notifier_release(struct mmu_notifier *mn,
 	schedule_work(&scif_info.misc_work);
 }
 
-static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+static int scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 						     struct mm_struct *mm,
 						     unsigned long start,
-						     unsigned long end)
+						     unsigned long end,
+						     bool blockable)
 {
 	struct scif_mmu_notif	*mmn;
 
 	mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier);
 	scif_rma_destroy_tcw(mmn, start, end - start);
+
+	return 0;
 }
 
 static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,

drivers/misc/sgi-gru/grutlbpurge.c

Lines changed: 5 additions & 2 deletions
@@ -219,9 +219,10 @@ void gru_flush_all_tlb(struct gru_state *gru)
 /*
  * MMUOPS notifier callout functions
  */
-static void gru_invalidate_range_start(struct mmu_notifier *mn,
+static int gru_invalidate_range_start(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
-				       unsigned long start, unsigned long end)
+				       unsigned long start, unsigned long end,
+				       bool blockable)
 {
 	struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
 						 ms_notifier);
@@ -231,6 +232,8 @@ static void gru_invalidate_range_start(struct mmu_notifier *mn,
 	gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, act %d\n", gms,
 		start, end, atomic_read(&gms->ms_range_active));
 	gru_flush_tlb_range(gms, start, end - start);
+
+	return 0;
 }
 
 static void gru_invalidate_range_end(struct mmu_notifier *mn,
