@@ -196,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
 
-static void ring_buffer_attach(struct perf_event *event,
-			       struct ring_buffer *rb);
-
 void __weak perf_event_print_debug(void)	{ }
 
 extern __weak const char *perf_pmu_name(void)
@@ -2917,7 +2914,8 @@ static void free_event_rcu(struct rcu_head *head)
 	kfree(event);
 }
 
-static bool ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
 
 static void free_event(struct perf_event *event)
 {
@@ -2942,15 +2940,30 @@ static void free_event(struct perf_event *event)
 		if (has_branch_stack(event)) {
 			static_key_slow_dec_deferred(&perf_sched_events);
 			/* is system-wide event */
-			if (!(event->attach_state & PERF_ATTACH_TASK))
+			if (!(event->attach_state & PERF_ATTACH_TASK)) {
 				atomic_dec(&per_cpu(perf_branch_stack_events,
 						    event->cpu));
+			}
 		}
 	}
 
 	if (event->rb) {
-		ring_buffer_put(event->rb);
-		event->rb = NULL;
+		struct ring_buffer *rb;
+
+		/*
+		 * Can happen when we close an event with re-directed output.
+		 *
+		 * Since we have a 0 refcount, perf_mmap_close() will skip
+		 * over us; possibly making our ring_buffer_put() the last.
+		 */
+		mutex_lock(&event->mmap_mutex);
+		rb = event->rb;
+		if (rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* could be last */
+		}
+		mutex_unlock(&event->mmap_mutex);
 	}
 
 	if (is_cgroup_event(event))
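The free_event() hunk above replaces a bare ring_buffer_put() with an unpublish-detach-put sequence under mmap_mutex, so no reader can pick up event->rb after its last reference is gone. Below is a minimal userspace sketch of that ordering; the names (struct buf, buf_put, event_detach_buffer) are hypothetical, C11 atomics stand in for the kernel's refcounting, and the RCU grace period the kernel waits for before freeing is omitted.

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct buf {
	atomic_int refcount;
};

struct evt {
	pthread_mutex_t map_lock;	/* plays the role of event->mmap_mutex */
	struct buf *_Atomic rb;		/* published pointer, read locklessly elsewhere */
};

static void buf_put(struct buf *b)
{
	/* free only when the last reference is dropped */
	if (atomic_fetch_sub(&b->refcount, 1) == 1)
		free(b);
}

static void event_detach_buffer(struct evt *e)
{
	struct buf *b;

	pthread_mutex_lock(&e->map_lock);
	b = atomic_load(&e->rb);
	if (b) {
		/* unpublish first so no new reader can find the buffer ... */
		atomic_store(&e->rb, NULL);
		/* ... then drop our reference; it may well be the last one */
		buf_put(b);
	}
	pthread_mutex_unlock(&e->map_lock);
}
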
@@ -3188,30 +3201,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 	unsigned int events = POLL_HUP;
 
 	/*
-	 * Race between perf_event_set_output() and perf_poll(): perf_poll()
-	 * grabs the rb reference but perf_event_set_output() overrides it.
-	 * Here is the timeline for two threads T1, T2:
-	 * t0: T1, rb = rcu_dereference(event->rb)
-	 * t1: T2, old_rb = event->rb
-	 * t2: T2, event->rb = new rb
-	 * t3: T2, ring_buffer_detach(old_rb)
-	 * t4: T1, ring_buffer_attach(rb1)
-	 * t5: T1, poll_wait(event->waitq)
-	 *
-	 * To avoid this problem, we grab mmap_mutex in perf_poll()
-	 * thereby ensuring that the assignment of the new ring buffer
-	 * and the detachment of the old buffer appear atomic to perf_poll()
+	 * Pin the event->rb by taking event->mmap_mutex; otherwise
+	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
 	 */
 	mutex_lock(&event->mmap_mutex);
-
-	rcu_read_lock();
-	rb = rcu_dereference(event->rb);
-	if (rb) {
-		ring_buffer_attach(event, rb);
+	rb = event->rb;
+	if (rb)
 		events = atomic_xchg(&rb->poll, 0);
-	}
-	rcu_read_unlock();
-
 	mutex_unlock(&event->mmap_mutex);
 
 	poll_wait(file, &event->waitq, wait);
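The perf_poll() hunk trades the old RCU-plus-reattach dance for reading event->rb under the same mmap_mutex that writers hold while swapping buffers, so a poller can neither observe a half-swapped state nor miss flags posted to the old buffer. A small userspace sketch of that pin-under-the-writer's-lock pattern, with hypothetical names:

#include <pthread.h>
#include <stdatomic.h>

struct buf {
	atomic_uint poll_flags;		/* pending poll events */
};

struct evt {
	pthread_mutex_t map_lock;	/* the writer also holds this while replacing rb */
	struct buf *rb;			/* only changed with map_lock held */
};

static unsigned int evt_poll_flags(struct evt *e)
{
	unsigned int events = 0;
	struct buf *b;

	pthread_mutex_lock(&e->map_lock);
	b = e->rb;			/* pinned: cannot be swapped under us */
	if (b)
		events = atomic_exchange(&b->poll_flags, 0);
	pthread_mutex_unlock(&e->map_lock);

	return events;
}
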
@@ -3521,16 +3517,12 @@ static void ring_buffer_attach(struct perf_event *event,
 		return;
 
 	spin_lock_irqsave(&rb->event_lock, flags);
-	if (!list_empty(&event->rb_entry))
-		goto unlock;
-
-	list_add(&event->rb_entry, &rb->event_list);
-unlock:
+	if (list_empty(&event->rb_entry))
+		list_add(&event->rb_entry, &rb->event_list);
 	spin_unlock_irqrestore(&rb->event_lock, flags);
 }
 
-static void ring_buffer_detach(struct perf_event *event,
-			       struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
 {
 	unsigned long flags;
 
@@ -3549,13 +3541,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
 
 	rcu_read_lock();
 	rb = rcu_dereference(event->rb);
-	if (!rb)
-		goto unlock;
-
-	list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
-		wake_up_all(&event->waitq);
-
-unlock:
+	if (rb) {
+		list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+			wake_up_all(&event->waitq);
+	}
 	rcu_read_unlock();
 }
 
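The two hunks above keep each buffer's list of attached events guarded by its spinlock on the attach/detach side, while the wakeup side walks the list under RCU. A simplified userspace sketch of the same per-buffer waiter list follows; names are hypothetical, the list is singly linked, and it takes the buffer's lock on the wakeup path too, where the kernel gets away with an RCU walk.

#include <pthread.h>
#include <stdbool.h>

struct evt {
	struct evt *next;		/* link on the buffer's waiter list */
	bool linked;			/* mirrors the list_empty() check */
	pthread_mutex_t waitq_lock;
	pthread_cond_t waitq;
};

struct buf {
	pthread_mutex_t event_lock;
	struct evt *event_list;		/* events attached to this buffer */
};

static void buf_attach(struct buf *b, struct evt *e)
{
	pthread_mutex_lock(&b->event_lock);
	if (!e->linked) {		/* attach only once */
		e->next = b->event_list;
		b->event_list = e;
		e->linked = true;
	}
	pthread_mutex_unlock(&b->event_lock);
}

static void buf_wakeup_all(struct buf *b)
{
	struct evt *e;

	pthread_mutex_lock(&b->event_lock);
	for (e = b->event_list; e; e = e->next) {
		pthread_mutex_lock(&e->waitq_lock);
		pthread_cond_broadcast(&e->waitq);
		pthread_mutex_unlock(&e->waitq_lock);
	}
	pthread_mutex_unlock(&b->event_lock);
}
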
@@ -3582,52 +3571,115 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 	return rb;
 }
 
-static bool ring_buffer_put(struct ring_buffer *rb)
+static void ring_buffer_put(struct ring_buffer *rb)
 {
-	struct perf_event *event, *n;
-	unsigned long flags;
-
 	if (!atomic_dec_and_test(&rb->refcount))
-		return false;
+		return;
 
-	spin_lock_irqsave(&rb->event_lock, flags);
-	list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
-		list_del_init(&event->rb_entry);
-		wake_up_all(&event->waitq);
-	}
-	spin_unlock_irqrestore(&rb->event_lock, flags);
+	WARN_ON_ONCE(!list_empty(&rb->event_list));
 
 	call_rcu(&rb->rcu_head, rb_free_rcu);
-	return true;
 }
 
 static void perf_mmap_open(struct vm_area_struct *vma)
 {
 	struct perf_event *event = vma->vm_file->private_data;
 
 	atomic_inc(&event->mmap_count);
+	atomic_inc(&event->rb->mmap_count);
 }
 
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
 static void perf_mmap_close(struct vm_area_struct *vma)
 {
 	struct perf_event *event = vma->vm_file->private_data;
 
-	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-		struct ring_buffer *rb = event->rb;
-		struct user_struct *mmap_user = rb->mmap_user;
-		int mmap_locked = rb->mmap_locked;
-		unsigned long size = perf_data_size(rb);
+	struct ring_buffer *rb = event->rb;
+	struct user_struct *mmap_user = rb->mmap_user;
+	int mmap_locked = rb->mmap_locked;
+	unsigned long size = perf_data_size(rb);
 
-		rcu_assign_pointer(event->rb, NULL);
-		ring_buffer_detach(event, rb);
-		mutex_unlock(&event->mmap_mutex);
+	atomic_dec(&rb->mmap_count);
+
+	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+		return;
+
+	/* Detach current event from the buffer. */
+	rcu_assign_pointer(event->rb, NULL);
+	ring_buffer_detach(event, rb);
+	mutex_unlock(&event->mmap_mutex);
+
+	/* If there's still other mmap()s of this buffer, we're done. */
+	if (atomic_read(&rb->mmap_count)) {
+		ring_buffer_put(rb); /* can't be last */
+		return;
+	}
 
-		if (ring_buffer_put(rb)) {
-			atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
-			vma->vm_mm->pinned_vm -= mmap_locked;
-			free_uid(mmap_user);
+	/*
+	 * No other mmap()s, detach from all other events that might redirect
+	 * into the now unreachable buffer. Somewhat complicated by the
+	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
+	 */
+again:
+	rcu_read_lock();
+	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+		if (!atomic_long_inc_not_zero(&event->refcount)) {
+			/*
+			 * This event is en-route to free_event() which will
+			 * detach it and remove it from the list.
+			 */
+			continue;
 		}
+		rcu_read_unlock();
+
+		mutex_lock(&event->mmap_mutex);
+		/*
+		 * Check we didn't race with perf_event_set_output() which can
+		 * swizzle the rb from under us while we were waiting to
+		 * acquire mmap_mutex.
+		 *
+		 * If we find a different rb; ignore this event, a next
+		 * iteration will no longer find it on the list. We have to
+		 * still restart the iteration to make sure we're not now
+		 * iterating the wrong list.
+		 */
+		if (event->rb == rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* can't be last, we still have one */
+		}
+		mutex_unlock(&event->mmap_mutex);
+		put_event(event);
+
+		/*
+		 * Restart the iteration; either we're on the wrong list or
+		 * destroyed its integrity by doing a deletion.
+		 */
+		goto again;
 	}
+	rcu_read_unlock();
+
+	/*
+	 * It could be there's still a few 0-ref events on the list; they'll
+	 * get cleaned up by free_event() -- they'll also still have their
+	 * ref on the rb and will free it whenever they are done with it.
+	 *
+	 * Aside from that, this buffer is 'fully' detached and unmapped,
+	 * undo the VM accounting.
+	 */
+
+	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+	vma->vm_mm->pinned_vm -= mmap_locked;
+	free_uid(mmap_user);
+
+	ring_buffer_put(rb); /* could be last */
 }
 
 static const struct vm_operations_struct perf_mmap_vmops = {
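perf_mmap_close() above combines a per-buffer mmap_count with a restartable walk of rb->event_list: grab each event with an increment-unless-zero so events already headed into free_event() are skipped, drop the list protection before taking the event's own mmap_mutex (the lock ordering forbids nesting the other way), re-check that the event still points at this buffer, detach, and restart the walk because the deletion invalidated the iteration. The following userspace sketch shows that loop under stated simplifications: a plain mutex replaces RCU, the list is singly linked, and put_evt()/detach_evt() are assumed helpers (detach_evt() is taken to acquire the buffer's event_lock internally, as ring_buffer_detach() does).

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct buf;

struct evt {
	struct evt *next;		/* link on the buffer's event list */
	atomic_int refcount;
	pthread_mutex_t map_lock;	/* the event's own "mmap_mutex" */
	struct buf *rb;			/* buffer this event currently uses */
};

struct buf {
	pthread_mutex_t event_lock;
	struct evt *event_list;
};

void put_evt(struct evt *e);			/* assumed: drop refcount, maybe free */
void detach_evt(struct buf *b, struct evt *e);	/* assumed: unlink e, takes b->event_lock */

/* equivalent of the kernel's atomic_long_inc_not_zero() */
static bool get_evt_unless_zero(struct evt *e)
{
	int old = atomic_load(&e->refcount);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&e->refcount, &old, old + 1))
			return true;
	}
	return false;
}

static void buf_detach_all(struct buf *b)
{
again:
	pthread_mutex_lock(&b->event_lock);
	for (struct evt *e = b->event_list; e; e = e->next) {
		if (!get_evt_unless_zero(e))
			continue;	/* already on its way to being freed */
		pthread_mutex_unlock(&b->event_lock);

		pthread_mutex_lock(&e->map_lock);
		if (e->rb == b)		/* re-check: may have been redirected meanwhile */
			detach_evt(b, e);
		pthread_mutex_unlock(&e->map_lock);
		put_evt(e);

		goto again;		/* the list may have changed under us */
	}
	pthread_mutex_unlock(&b->event_lock);
}
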
@@ -3677,10 +3729,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
 	mutex_lock(&event->mmap_mutex);
 	if (event->rb) {
-		if (event->rb->nr_pages != nr_pages)
+		if (event->rb->nr_pages != nr_pages) {
 			ret = -EINVAL;
+			goto unlock;
+		}
+
+		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+			/*
+			 * Raced against perf_mmap_close() through
+			 * perf_event_set_output(). Try again, hope for better
+			 * luck.
+			 */
+			mutex_unlock(&event->mmap_mutex);
+			goto again;
+		}
+
 		goto unlock;
 	}
 
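The new block in perf_mmap() handles a racing teardown: if rb->mmap_count has already hit zero, perf_mmap_close() is in the middle of destroying the buffer, so the only safe move is to drop mmap_mutex and retry until the event either exposes a live buffer or no buffer at all. A userspace sketch of that increment-unless-zero-or-retry loop, with hypothetical names:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct buf {
	atomic_int map_count;	/* number of active mappings of this buffer */
};

struct evt {
	pthread_mutex_t map_lock;
	struct buf *rb;		/* only changed with map_lock held */
};

/* equivalent of the kernel's atomic_inc_not_zero() */
static bool inc_not_zero(atomic_int *v)
{
	int old = atomic_load(v);

	while (old != 0) {
		if (atomic_compare_exchange_weak(v, &old, old + 1))
			return true;
	}
	return false;
}

/* returns a buffer we now hold a mapping count on, or NULL if none exists */
static struct buf *evt_map_existing(struct evt *e)
{
	struct buf *b;

again:
	pthread_mutex_lock(&e->map_lock);
	b = e->rb;
	if (b && !inc_not_zero(&b->map_count)) {
		/* raced with the last unmap tearing the buffer down; retry */
		pthread_mutex_unlock(&e->map_lock);
		goto again;
	}
	pthread_mutex_unlock(&e->map_lock);
	return b;	/* NULL means the caller should allocate a fresh buffer */
}
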
@@ -3722,12 +3788,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
+	atomic_set(&rb->mmap_count, 1);
 	rb->mmap_locked = extra;
 	rb->mmap_user = get_current_user();
 
 	atomic_long_add(user_extra, &user->locked_vm);
 	vma->vm_mm->pinned_vm += extra;
 
+	ring_buffer_attach(event, rb);
 	rcu_assign_pointer(event->rb, rb);
 
 	perf_event_update_userpage(event);
@@ -3737,6 +3805,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		atomic_inc(&event->mmap_count);
 	mutex_unlock(&event->mmap_mutex);
 
+	/*
+	 * Since pinned accounting is per vm we cannot allow fork() to copy our
+	 * vma.
+	 */
 	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
 	vma->vm_ops = &perf_mmap_vmops;
 
@@ -6415,23 +6487,37 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	if (atomic_read(&event->mmap_count))
 		goto unlock;
 
+	old_rb = event->rb;
+
 	if (output_event) {
 		/* get the rb we want to redirect to */
 		rb = ring_buffer_get(output_event);
 		if (!rb)
 			goto unlock;
 	}
 
-	old_rb = event->rb;
-	rcu_assign_pointer(event->rb, rb);
 	if (old_rb)
 		ring_buffer_detach(event, old_rb);
+
+	if (rb)
+		ring_buffer_attach(event, rb);
+
+	rcu_assign_pointer(event->rb, rb);
+
+	if (old_rb) {
+		ring_buffer_put(old_rb);
+		/*
+		 * Since we detached before setting the new rb, so that we
+		 * could attach the new rb, we could have missed a wakeup.
+		 * Provide it now.
+		 */
+		wake_up_all(&event->waitq);
+	}
+
 	ret = 0;
 unlock:
 	mutex_unlock(&event->mmap_mutex);
 
-	if (old_rb)
-		ring_buffer_put(old_rb);
 out:
 	return ret;
 }
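The reordering in perf_event_set_output() is about wakeups: the event must leave the old buffer's list before it can join the new one, and during that window a wakeup posted to the old buffer would be lost, so after publishing the new buffer the code drops the old reference and issues one catch-up wake_up_all(). A userspace sketch of the redirect with hypothetical buf_attach()/buf_detach()/buf_put() helpers (reference counting on the new buffer is left out):

#include <pthread.h>

struct buf;
struct evt;

void buf_attach(struct buf *b, struct evt *e);	/* assumed: add e to b's wait list */
void buf_detach(struct buf *b, struct evt *e);	/* assumed: remove e from b's wait list */
void buf_put(struct buf *b);			/* assumed: drop a reference on b */

struct evt {
	pthread_mutex_t map_lock;	/* the event's "mmap_mutex" */
	pthread_mutex_t waitq_lock;
	pthread_cond_t waitq;		/* the event's own wait queue */
	struct buf *rb;			/* currently published buffer */
};

static void evt_redirect(struct evt *e, struct buf *new_rb)
{
	struct buf *old_rb;

	pthread_mutex_lock(&e->map_lock);
	old_rb = e->rb;

	if (old_rb)
		buf_detach(old_rb, e);	/* window opens: wakeups no longer reach us */
	if (new_rb)
		buf_attach(new_rb, e);
	e->rb = new_rb;			/* publish the new buffer */

	if (old_rb) {
		buf_put(old_rb);
		/* catch up on any wakeup that fired during the window */
		pthread_mutex_lock(&e->waitq_lock);
		pthread_cond_broadcast(&e->waitq);
		pthread_mutex_unlock(&e->waitq_lock);
	}
	pthread_mutex_unlock(&e->map_lock);
}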