Skip to content

Commit db4a835

Browse files
David Carrillo-Cisneros authored and Ingo Molnar committed
perf/core: Set cgroup in CPU contexts for new cgroup events
There's a perf stat bug easy to observe on a machine with only one cgroup: $ perf stat -e cycles -I 1000 -C 0 -G / # time counts unit events 1.000161699 <not counted> cycles / 2.000355591 <not counted> cycles / 3.000565154 <not counted> cycles / 4.000951350 <not counted> cycles / We'd expect some output there. The underlying problem is that there is an optimization in perf_cgroup_sched_{in,out}() that skips the switch of cgroup events if the old and new cgroups in a task switch are the same. This optimization interacts with the current code in two ways that cause a CPU context's cgroup (cpuctx->cgrp) to be NULL even if a cgroup event matches the current task. These are: 1. On creation of the first cgroup event in a CPU: In current code, cpuctx->cgrp is only set in perf_cgroup_sched_in, but due to the aforesaid optimization, perf_cgroup_sched_in will not run until the next cgroup switches in that CPU. This may happen late or never happen, depending on system's number of cgroups, CPU load, etc. 2. On deletion of the last cgroup event in a cpuctx: In list_del_event, cpuctx->cgrp is set NULL. Any new cgroup event will not be sched in because cpuctx->cgrp == NULL until a cgroup switch occurs and perf_cgroup_sched_in is executed (updating cpuctx->cgrp). This patch fixes both problems by setting cpuctx->cgrp in list_add_event, mirroring what list_del_event does when removing a cgroup event from CPU context, as introduced in: commit 68cacd2 ("perf_events: Fix stale ->cgrp pointer in update_cgrp_time_from_cpuctx()") With this patch, cpuctx->cgrp is always set/cleared when installing/removing the first/last cgroup event in/from the CPU context. With cpuctx->cgrp correctly set, event_filter_match works as intended when events are sched in/out. 
After the fix, the output is as expected: $ perf stat -e cycles -I 1000 -a -G / # time counts unit events 1.004699159 627342882 cycles / 2.007397156 6152726 cycles / 3.010019057 616726074 cycles / Signed-off-by: David Carrillo-Cisneros <davidcc@google.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Kan Liang <kan.liang@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Paul Turner <pjt@google.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vegard Nossum <vegard.nossum@gmail.com> Cc: Vince Weaver <vincent.weaver@maine.edu> Link: http://lkml.kernel.org/r/1470124092-113192-1-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
1 parent 0b8f1e2 commit db4a835

File tree

2 files changed

+40
-18
lines changed

2 files changed

+40
-18
lines changed

include/linux/perf_event.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -743,7 +743,9 @@ struct perf_event_context {
743743
u64 parent_gen;
744744
u64 generation;
745745
int pin_count;
746+
#ifdef CONFIG_CGROUP_PERF
746747
int nr_cgroups; /* cgroup evts */
748+
#endif
747749
void *task_ctx_data; /* pmu specific data */
748750
struct rcu_head rcu_head;
749751
};
@@ -769,7 +771,9 @@ struct perf_cpu_context {
769771
unsigned int hrtimer_active;
770772

771773
struct pmu *unique_pmu;
774+
#ifdef CONFIG_CGROUP_PERF
772775
struct perf_cgroup *cgrp;
776+
#endif
773777
};
774778

775779
struct perf_output_handle {

kernel/events/core.c

Lines changed: 36 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -843,6 +843,32 @@ perf_cgroup_mark_enabled(struct perf_event *event,
843843
}
844844
}
845845
}
846+
847+
/*
 * Update cpuctx->cgrp so that it is set when the first cgroup event is added
 * and cleared when the last cgroup event is removed.  Called from both
 * list_add_event() (add == true) and list_del_event() (add == false) with
 * ctx->lock held.
 */
static inline void
list_update_cgroup_event(struct perf_event *event,
			 struct perf_event_context *ctx, bool add)
{
	struct perf_cpu_context *cpuctx;

	if (!is_cgroup_event(event))
		return;

	/*
	 * Only the first added / last removed cgroup event falls through to
	 * update cpuctx->cgrp; otherwise the per-context count is just
	 * incremented or decremented.
	 */
	if (add && ctx->nr_cgroups++)
		return;
	else if (!add && --ctx->nr_cgroups)
		return;
	/*
	 * Because cgroup events are always per-cpu events,
	 * this will always be called from the right CPU.
	 */
	cpuctx = __get_cpu_context(ctx);
	cpuctx->cgrp = add ? event->cgrp : NULL;
}
871+
846872
#else /* !CONFIG_CGROUP_PERF */
847873

848874
static inline bool
@@ -920,6 +946,13 @@ perf_cgroup_mark_enabled(struct perf_event *event,
920946
struct perf_event_context *ctx)
921947
{
922948
}
949+
950+
/* No-op stub used when CONFIG_CGROUP_PERF is disabled. */
static inline void
list_update_cgroup_event(struct perf_event *event,
			 struct perf_event_context *ctx, bool add)
{
}
955+
923956
#endif
924957

925958
/*
@@ -1392,6 +1425,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
13921425
static void
13931426
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
13941427
{
1428+
13951429
lockdep_assert_held(&ctx->lock);
13961430

13971431
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1412,8 +1446,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
14121446
list_add_tail(&event->group_entry, list);
14131447
}
14141448

1415-
if (is_cgroup_event(event))
1416-
ctx->nr_cgroups++;
1449+
list_update_cgroup_event(event, ctx, true);
14171450

14181451
list_add_rcu(&event->event_entry, &ctx->event_list);
14191452
ctx->nr_events++;
@@ -1581,8 +1614,6 @@ static void perf_group_attach(struct perf_event *event)
15811614
static void
15821615
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
15831616
{
1584-
struct perf_cpu_context *cpuctx;
1585-
15861617
WARN_ON_ONCE(event->ctx != ctx);
15871618
lockdep_assert_held(&ctx->lock);
15881619

@@ -1594,20 +1625,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
15941625

15951626
event->attach_state &= ~PERF_ATTACH_CONTEXT;
15961627

1597-
if (is_cgroup_event(event)) {
1598-
ctx->nr_cgroups--;
1599-
/*
1600-
* Because cgroup events are always per-cpu events, this will
1601-
* always be called from the right CPU.
1602-
*/
1603-
cpuctx = __get_cpu_context(ctx);
1604-
/*
1605-
* If there are no more cgroup events then clear cgrp to avoid
1606-
* stale pointer in update_cgrp_time_from_cpuctx().
1607-
*/
1608-
if (!ctx->nr_cgroups)
1609-
cpuctx->cgrp = NULL;
1610-
}
1628+
list_update_cgroup_event(event, ctx, false);
16111629

16121630
ctx->nr_events--;
16131631
if (event->attr.inherit_stat)

0 commit comments

Comments
 (0)