
Commit 9bb5d40

Peter Zijlstra authored and Ingo Molnar committed
perf: Fix mmap() accounting hole
Vince's fuzzer once again found holes. This time it spotted a leak in
the locked page accounting.

When an event had redirected output and its close() was the last
reference to the buffer we didn't have a vm context to undo accounting.

Change the code to destroy the buffer on the last munmap() and detach
all redirected events at that time. This provides us the right context
to undo the vm accounting.

Reported-and-tested-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20130604084421.GI8923@twins.programming.kicks-ass.net
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
1 parent 26cb63a commit 9bb5d40
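
For context, a minimal user-space sketch (not part of this commit) of the redirected-output scenario the fix targets: event A's ring buffer is mmap()ed, event B is redirected into it with PERF_EVENT_IOC_SET_OUTPUT, and the locked-page accounting charged at mmap() time must eventually be undone even though the last buffer reference may be dropped by a plain close(). The event type, buffer size and the perf_event_open() wrapper below are illustrative assumptions; error handling is omitted.

/* Illustrative sketch only; not part of commit 9bb5d40. */
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	size_t len = (1 + 8) * sysconf(_SC_PAGESIZE);	/* user page + 2^3 data pages */
	void *buf;
	int fd_a, fd_b;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;		/* arbitrary choice for the example */
	attr.config = PERF_COUNT_SW_CPU_CLOCK;

	fd_a = perf_event_open(&attr, 0, -1, -1, 0);	/* event A: owns the buffer */
	fd_b = perf_event_open(&attr, 0, -1, -1, 0);	/* event B: redirected into A */

	/* mmap() on A charges locked_vm/pinned_vm against this mm and user. */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd_a, 0);

	/* Redirect B's output into A's ring buffer. */
	ioctl(fd_b, PERF_EVENT_IOC_SET_OUTPUT, fd_a);

	/*
	 * Before this patch the buffer could outlive the mapping: with B still
	 * holding a reference, the final close() freed it without a VM context,
	 * so the accounting charged above was never undone. The patch tears the
	 * buffer down on the last munmap() instead.
	 */
	munmap(buf, len);
	close(fd_a);
	close(fd_b);
	return 0;
}

With the patch applied, the last munmap() detaches all redirected events and undoes the locked_vm/pinned_vm charges while a vma is still available.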

2 files changed: 159 additions, 72 deletions

kernel/events/core.c

Lines changed: 157 additions & 71 deletions
@@ -196,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
 
-static void ring_buffer_attach(struct perf_event *event,
-			       struct ring_buffer *rb);
-
 void __weak perf_event_print_debug(void)	{ }
 
 extern __weak const char *perf_pmu_name(void)
@@ -2917,7 +2914,8 @@ static void free_event_rcu(struct rcu_head *head)
 	kfree(event);
 }
 
-static bool ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
 
 static void free_event(struct perf_event *event)
 {
@@ -2942,15 +2940,30 @@ static void free_event(struct perf_event *event)
 		if (has_branch_stack(event)) {
 			static_key_slow_dec_deferred(&perf_sched_events);
 			/* is system-wide event */
-			if (!(event->attach_state & PERF_ATTACH_TASK))
+			if (!(event->attach_state & PERF_ATTACH_TASK)) {
 				atomic_dec(&per_cpu(perf_branch_stack_events,
 						    event->cpu));
+			}
 		}
 	}
 
 	if (event->rb) {
-		ring_buffer_put(event->rb);
-		event->rb = NULL;
+		struct ring_buffer *rb;
+
+		/*
+		 * Can happen when we close an event with re-directed output.
+		 *
+		 * Since we have a 0 refcount, perf_mmap_close() will skip
+		 * over us; possibly making our ring_buffer_put() the last.
+		 */
+		mutex_lock(&event->mmap_mutex);
+		rb = event->rb;
+		if (rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* could be last */
+		}
+		mutex_unlock(&event->mmap_mutex);
 	}
 
 	if (is_cgroup_event(event))
@@ -3188,30 +3201,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 	unsigned int events = POLL_HUP;
 
 	/*
-	 * Race between perf_event_set_output() and perf_poll(): perf_poll()
-	 * grabs the rb reference but perf_event_set_output() overrides it.
-	 * Here is the timeline for two threads T1, T2:
-	 * t0: T1, rb = rcu_dereference(event->rb)
-	 * t1: T2, old_rb = event->rb
-	 * t2: T2, event->rb = new rb
-	 * t3: T2, ring_buffer_detach(old_rb)
-	 * t4: T1, ring_buffer_attach(rb1)
-	 * t5: T1, poll_wait(event->waitq)
-	 *
-	 * To avoid this problem, we grab mmap_mutex in perf_poll()
-	 * thereby ensuring that the assignment of the new ring buffer
-	 * and the detachment of the old buffer appear atomic to perf_poll()
+	 * Pin the event->rb by taking event->mmap_mutex; otherwise
+	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
 	 */
 	mutex_lock(&event->mmap_mutex);
-
-	rcu_read_lock();
-	rb = rcu_dereference(event->rb);
-	if (rb) {
-		ring_buffer_attach(event, rb);
+	rb = event->rb;
+	if (rb)
 		events = atomic_xchg(&rb->poll, 0);
-	}
-	rcu_read_unlock();
-
 	mutex_unlock(&event->mmap_mutex);
 
 	poll_wait(file, &event->waitq, wait);
@@ -3521,16 +3517,12 @@ static void ring_buffer_attach(struct perf_event *event,
 		return;
 
 	spin_lock_irqsave(&rb->event_lock, flags);
-	if (!list_empty(&event->rb_entry))
-		goto unlock;
-
-	list_add(&event->rb_entry, &rb->event_list);
-unlock:
+	if (list_empty(&event->rb_entry))
+		list_add(&event->rb_entry, &rb->event_list);
 	spin_unlock_irqrestore(&rb->event_lock, flags);
 }
 
-static void ring_buffer_detach(struct perf_event *event,
-			       struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
 {
 	unsigned long flags;
 
@@ -3549,13 +3541,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
 
 	rcu_read_lock();
 	rb = rcu_dereference(event->rb);
-	if (!rb)
-		goto unlock;
-
-	list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
-		wake_up_all(&event->waitq);
-
-unlock:
+	if (rb) {
+		list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+			wake_up_all(&event->waitq);
+	}
 	rcu_read_unlock();
 }
 
@@ -3582,52 +3571,115 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 	return rb;
 }
 
-static bool ring_buffer_put(struct ring_buffer *rb)
+static void ring_buffer_put(struct ring_buffer *rb)
 {
-	struct perf_event *event, *n;
-	unsigned long flags;
-
 	if (!atomic_dec_and_test(&rb->refcount))
-		return false;
+		return;
 
-	spin_lock_irqsave(&rb->event_lock, flags);
-	list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
-		list_del_init(&event->rb_entry);
-		wake_up_all(&event->waitq);
-	}
-	spin_unlock_irqrestore(&rb->event_lock, flags);
+	WARN_ON_ONCE(!list_empty(&rb->event_list));
 
 	call_rcu(&rb->rcu_head, rb_free_rcu);
-	return true;
 }
 
 static void perf_mmap_open(struct vm_area_struct *vma)
 {
 	struct perf_event *event = vma->vm_file->private_data;
 
 	atomic_inc(&event->mmap_count);
+	atomic_inc(&event->rb->mmap_count);
 }
 
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
 static void perf_mmap_close(struct vm_area_struct *vma)
 {
 	struct perf_event *event = vma->vm_file->private_data;
 
-	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-		struct ring_buffer *rb = event->rb;
-		struct user_struct *mmap_user = rb->mmap_user;
-		int mmap_locked = rb->mmap_locked;
-		unsigned long size = perf_data_size(rb);
+	struct ring_buffer *rb = event->rb;
+	struct user_struct *mmap_user = rb->mmap_user;
+	int mmap_locked = rb->mmap_locked;
+	unsigned long size = perf_data_size(rb);
 
-		rcu_assign_pointer(event->rb, NULL);
-		ring_buffer_detach(event, rb);
-		mutex_unlock(&event->mmap_mutex);
+	atomic_dec(&rb->mmap_count);
+
+	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+		return;
+
+	/* Detach current event from the buffer. */
+	rcu_assign_pointer(event->rb, NULL);
+	ring_buffer_detach(event, rb);
+	mutex_unlock(&event->mmap_mutex);
+
+	/* If there's still other mmap()s of this buffer, we're done. */
+	if (atomic_read(&rb->mmap_count)) {
+		ring_buffer_put(rb); /* can't be last */
+		return;
+	}
 
-		if (ring_buffer_put(rb)) {
-			atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
-			vma->vm_mm->pinned_vm -= mmap_locked;
-			free_uid(mmap_user);
+	/*
+	 * No other mmap()s, detach from all other events that might redirect
+	 * into the now unreachable buffer. Somewhat complicated by the
+	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
+	 */
+again:
+	rcu_read_lock();
+	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+		if (!atomic_long_inc_not_zero(&event->refcount)) {
+			/*
+			 * This event is en-route to free_event() which will
+			 * detach it and remove it from the list.
+			 */
+			continue;
 		}
+		rcu_read_unlock();
+
+		mutex_lock(&event->mmap_mutex);
+		/*
+		 * Check we didn't race with perf_event_set_output() which can
+		 * swizzle the rb from under us while we were waiting to
+		 * acquire mmap_mutex.
+		 *
+		 * If we find a different rb; ignore this event, a next
+		 * iteration will no longer find it on the list. We have to
+		 * still restart the iteration to make sure we're not now
+		 * iterating the wrong list.
+		 */
+		if (event->rb == rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* can't be last, we still have one */
+		}
+		mutex_unlock(&event->mmap_mutex);
+		put_event(event);
+
+		/*
+		 * Restart the iteration; either we're on the wrong list or
+		 * destroyed its integrity by doing a deletion.
+		 */
+		goto again;
 	}
+	rcu_read_unlock();
+
+	/*
+	 * It could be there's still a few 0-ref events on the list; they'll
+	 * get cleaned up by free_event() -- they'll also still have their
	 * ref on the rb and will free it whenever they are done with it.
+	 *
+	 * Aside from that, this buffer is 'fully' detached and unmapped,
+	 * undo the VM accounting.
+	 */
+
+	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+	vma->vm_mm->pinned_vm -= mmap_locked;
+	free_uid(mmap_user);
+
+	ring_buffer_put(rb); /* could be last */
 }
 
 static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3677,10 +3729,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
 	mutex_lock(&event->mmap_mutex);
 	if (event->rb) {
-		if (event->rb->nr_pages != nr_pages)
+		if (event->rb->nr_pages != nr_pages) {
 			ret = -EINVAL;
+			goto unlock;
+		}
+
+		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+			/*
+			 * Raced against perf_mmap_close() through
+			 * perf_event_set_output(). Try again, hope for better
+			 * luck.
+			 */
+			mutex_unlock(&event->mmap_mutex);
+			goto again;
+		}
+
 		goto unlock;
 	}
 
@@ -3722,12 +3788,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
+	atomic_set(&rb->mmap_count, 1);
 	rb->mmap_locked = extra;
 	rb->mmap_user = get_current_user();
 
 	atomic_long_add(user_extra, &user->locked_vm);
 	vma->vm_mm->pinned_vm += extra;
 
+	ring_buffer_attach(event, rb);
 	rcu_assign_pointer(event->rb, rb);
 
 	perf_event_update_userpage(event);
@@ -3737,6 +3805,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	atomic_inc(&event->mmap_count);
 	mutex_unlock(&event->mmap_mutex);
 
+	/*
+	 * Since pinned accounting is per vm we cannot allow fork() to copy our
+	 * vma.
+	 */
 	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
 	vma->vm_ops = &perf_mmap_vmops;
 
@@ -6415,23 +6487,37 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	if (atomic_read(&event->mmap_count))
 		goto unlock;
 
+	old_rb = event->rb;
+
 	if (output_event) {
 		/* get the rb we want to redirect to */
 		rb = ring_buffer_get(output_event);
 		if (!rb)
 			goto unlock;
 	}
 
-	old_rb = event->rb;
-	rcu_assign_pointer(event->rb, rb);
 	if (old_rb)
 		ring_buffer_detach(event, old_rb);
+
+	if (rb)
+		ring_buffer_attach(event, rb);
+
+	rcu_assign_pointer(event->rb, rb);
+
+	if (old_rb) {
+		ring_buffer_put(old_rb);
+		/*
+		 * Since we detached before setting the new rb, so that we
+		 * could attach the new rb, we could have missed a wakeup.
+		 * Provide it now.
+		 */
+		wake_up_all(&event->waitq);
+	}
+
 	ret = 0;
 unlock:
 	mutex_unlock(&event->mmap_mutex);
 
-	if (old_rb)
-		ring_buffer_put(old_rb);
 out:
 	return ret;
 }

kernel/events/internal.h

Lines changed: 2 additions & 1 deletion
@@ -31,7 +31,8 @@ struct ring_buffer {
 	spinlock_t			event_lock;
 	struct list_head		event_list;
 
-	int				mmap_locked;
+	atomic_t			mmap_count;
+	unsigned long			mmap_locked;
 	struct user_struct		*mmap_user;
 
 	struct perf_event_mmap_page	*user_page;
