Commit 8e5fc1a

Stephane Eranian authored and Ingo Molnar committed
perf_events: Fix transaction recovery in group_sched_in()
The group_sched_in() function uses a transactional approach to schedule a group of events. In a group, either all events can be scheduled or none are. To schedule each event in, the function calls event_sched_in(). In case of error, event_sched_out() is called on each event in the group.

The problem is that event_sched_out() does not completely cancel the effects of event_sched_in(). Furthermore, event_sched_out() changes the state of the event as if it had run, which is not true in this particular case. Those inconsistencies impact time tracking fields and may lead to events in a group not all reporting the same time_enabled and time_running values. This is demonstrated with the example below:

$ task -eunhalted_core_cycles,baclears,baclears -e unhalted_core_cycles,baclears,baclears sleep 5
1946101 unhalted_core_cycles (32.85% scaling, ena=829181, run=556827)
  11423 baclears (32.85% scaling, ena=829181, run=556827)
   7671 baclears (0.00% scaling, ena=556827, run=556827)
2250443 unhalted_core_cycles (57.83% scaling, ena=962822, run=405995)
  11705 baclears (57.83% scaling, ena=962822, run=405995)
  11705 baclears (57.83% scaling, ena=962822, run=405995)

Notice that in the first group, the last baclears event does not report the same timings as its siblings.

This issue comes from the fact that tstamp_stopped is updated by event_sched_out() as if the event had actually run.

To solve the issue, we must ensure that, in case of error, there is no change in the event state whatsoever. That means timings must remain as they were when entering group_sched_in().

To do this we defer updating tstamp_running until we know the transaction succeeded. Therefore, we have split event_sched_in() in two parts, separating the update to tstamp_running. Similarly, in case of error, we do not want to update tstamp_stopped. Therefore, we have split event_sched_out() in two parts, separating the update to tstamp_stopped.

With this patch, we now get the following output:

$ task -eunhalted_core_cycles,baclears,baclears -e unhalted_core_cycles,baclears,baclears sleep 5
2492050 unhalted_core_cycles (71.75% scaling, ena=1093330, run=308841)
  11243 baclears (71.75% scaling, ena=1093330, run=308841)
  11243 baclears (71.75% scaling, ena=1093330, run=308841)
1852746 unhalted_core_cycles (0.00% scaling, ena=784489, run=784489)
   9253 baclears (0.00% scaling, ena=784489, run=784489)
   9253 baclears (0.00% scaling, ena=784489, run=784489)

Note that the uneven timing between groups is a side effect of the process spending most of its time sleeping, i.e., not enough event rotations (but that's a separate issue).

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4cb86b4c.41e9d80a.44e9.3e19@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
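For reference, the "scaling" column in the output above follows from the ena (time_enabled) and run (time_running) fields. Below is a minimal sketch of that arithmetic, assuming scaling is the fraction of enabled time during which the event was not actually running, and that raw counts are conventionally scaled up by enabled/running to estimate a full-window value; the report() helper is purely illustrative and is not part of the kernel or of the tool shown above.

#include <stdio.h>

/*
 * Illustrative only: scaling is the share of enabled time during which
 * the event was multiplexed out; the raw count is scaled up by
 * enabled/running to estimate a value for the full enabled window.
 */
static void report(unsigned long long count,
                   unsigned long long ena, unsigned long long run)
{
        double scaling  = ena ? 100.0 * (1.0 - (double)run / (double)ena) : 0.0;
        double estimate = run ? (double)count * (double)ena / (double)run : 0.0;

        printf("%llu (%.2f%% scaling, ena=%llu, run=%llu), est. full count ~%.0f\n",
               count, scaling, ena, run, estimate);
}

int main(void)
{
        /* First line of the "before" output above */
        report(1946101, 829181, 556827);        /* prints 32.85% scaling */
        return 0;
}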
1 parent ba0cef3 commit 8e5fc1a

File tree: 1 file changed (+63 −13 lines)

kernel/perf_event.c

Lines changed: 63 additions & 13 deletions

@@ -412,8 +412,8 @@ event_filter_match(struct perf_event *event)
 	return event->cpu == -1 || event->cpu == smp_processor_id();
 }
 
-static void
-event_sched_out(struct perf_event *event,
+static int
+__event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_event_context *ctx)
 {
@@ -432,14 +432,13 @@ event_sched_out(struct perf_event *event,
 	}
 
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
-		return;
+		return 0;
 
 	event->state = PERF_EVENT_STATE_INACTIVE;
 	if (event->pending_disable) {
 		event->pending_disable = 0;
 		event->state = PERF_EVENT_STATE_OFF;
 	}
-	event->tstamp_stopped = ctx->time;
 	event->pmu->del(event, 0);
 	event->oncpu = -1;
 
@@ -448,6 +447,19 @@ event_sched_out(struct perf_event *event,
 	ctx->nr_active--;
 	if (event->attr.exclusive || !cpuctx->active_oncpu)
 		cpuctx->exclusive = 0;
+	return 1;
+}
+
+static void
+event_sched_out(struct perf_event *event,
+		struct perf_cpu_context *cpuctx,
+		struct perf_event_context *ctx)
+{
+	int ret;
+
+	ret = __event_sched_out(event, cpuctx, ctx);
+	if (ret)
+		event->tstamp_stopped = ctx->time;
 }
 
 static void
@@ -647,7 +659,7 @@ void perf_event_disable(struct perf_event *event)
 }
 
 static int
-event_sched_in(struct perf_event *event,
+__event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
 		 struct perf_event_context *ctx)
 {
@@ -667,8 +679,6 @@ event_sched_in(struct perf_event *event,
 		return -EAGAIN;
 	}
 
-	event->tstamp_running += ctx->time - event->tstamp_stopped;
-
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
 	ctx->nr_active++;
@@ -679,6 +689,35 @@ event_sched_in(struct perf_event *event,
 	return 0;
 }
 
+static inline int
+event_sched_in(struct perf_event *event,
+		struct perf_cpu_context *cpuctx,
+		struct perf_event_context *ctx)
+{
+	int ret = __event_sched_in(event, cpuctx, ctx);
+	if (ret)
+		return ret;
+	event->tstamp_running += ctx->time - event->tstamp_stopped;
+	return 0;
+}
+
+static void
+group_commit_event_sched_in(struct perf_event *group_event,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_event_context *ctx)
+{
+	struct perf_event *event;
+	u64 now = ctx->time;
+
+	group_event->tstamp_running += now - group_event->tstamp_stopped;
+	/*
+	 * Schedule in siblings as one group (if any):
+	 */
+	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+		event->tstamp_running += now - event->tstamp_stopped;
+	}
+}
+
 static int
 group_sched_in(struct perf_event *group_event,
 	       struct perf_cpu_context *cpuctx,
@@ -692,7 +731,13 @@ group_sched_in(struct perf_event *group_event,
 
 	pmu->start_txn(pmu);
 
-	if (event_sched_in(group_event, cpuctx, ctx)) {
+	/*
+	 * use __event_sched_in() to delay updating tstamp_running
+	 * until the transaction is committed. In case of failure
+	 * we will keep an unmodified tstamp_running which is a
+	 * requirement to get correct timing information
+	 */
+	if (__event_sched_in(group_event, cpuctx, ctx)) {
 		pmu->cancel_txn(pmu);
 		return -EAGAIN;
 	}
@@ -701,26 +746,31 @@ group_sched_in(struct perf_event *group_event,
 	 * Schedule in siblings as one group (if any):
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
-		if (event_sched_in(event, cpuctx, ctx)) {
+		if (__event_sched_in(event, cpuctx, ctx)) {
 			partial_group = event;
 			goto group_error;
 		}
 	}
 
-	if (!pmu->commit_txn(pmu))
+	if (!pmu->commit_txn(pmu)) {
+		/* commit tstamp_running */
+		group_commit_event_sched_in(group_event, cpuctx, ctx);
 		return 0;
-
+	}
group_error:
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
 	 * partial group before returning:
+	 *
+	 * use __event_sched_out() to avoid updating tstamp_stopped
+	 * because the event never actually ran
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 		if (event == partial_group)
 			break;
-		event_sched_out(event, cpuctx, ctx);
+		__event_sched_out(event, cpuctx, ctx);
 	}
-	event_sched_out(group_event, cpuctx, ctx);
+	__event_sched_out(group_event, cpuctx, ctx);
 
 	pmu->cancel_txn(pmu);
 
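Conceptually, the fix is a two-phase pattern: do only the reversible work up front, and apply the timestamp updates once the whole group has committed, so a failed transaction leaves the timing state untouched. The following is a minimal, self-contained sketch of that pattern in C, using hypothetical names (item_prepare, item_commit, item_abort, group_schedule) that are not part of the kernel API.

#include <stdbool.h>
#include <stddef.h>

struct item {
        long long tstamp_running;
        long long tstamp_stopped;
        bool active;
};

/* Phase 1: only the reversible part; no timestamp update yet. */
static int item_prepare(struct item *it)
{
        it->active = true;
        return 0;
}

/* Undo phase 1 without touching timestamps: the item never "ran". */
static void item_abort(struct item *it)
{
        it->active = false;
}

/* Phase 2: applied only once the whole group has committed. */
static void item_commit(struct item *it, long long now)
{
        it->tstamp_running += now - it->tstamp_stopped;
}

int group_schedule(struct item *items, size_t n, long long now)
{
        size_t i;

        for (i = 0; i < n; i++) {
                if (item_prepare(&items[i]))
                        goto error;
        }
        /* Commit timestamps for all members as one unit. */
        for (i = 0; i < n; i++)
                item_commit(&items[i], now);
        return 0;
error:
        /* Roll back only what was prepared; timing state stays untouched. */
        while (i--)
                item_abort(&items[i]);
        return -1;
}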
