Skip to content

Commit 492ecee

Browse files
Alexei Starovoitov authored and Daniel Borkmann committed
bpf: enable program stats
JITed BPF programs are indistinguishable from kernel functions, but unlike kernel code BPF code can be changed often. The typical approach of "perf record" + "perf report" profiling and tuning of kernel code works just as well for BPF programs, but kernel code doesn't need to be monitored whereas BPF programs do. Users load and run a large number of BPF programs. These BPF stats allow tools to monitor the usage of BPF on the server. The monitoring tools will turn sysctl kernel.bpf_stats_enabled on and off for a few seconds to sample the average cost of the programs. Aggregated data over hours and days will provide an insight into the cost of BPF, and alarms can trigger in case a given program suddenly gets more expensive. The cost of two sched_clock() calls per program invocation adds ~20 nsec. Fast BPF progs (like selftests/bpf/progs/test_pkt_access.c) will slow down from ~10 nsec to ~30 nsec. static_key minimizes the cost of the stats collection. There is no measurable difference before/after this patch with kernel.bpf_stats_enabled=0 Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
1 parent 143bdc2 commit 492ecee

File tree

6 files changed

+129
-6
lines changed

6 files changed

+129
-6
lines changed

include/linux/bpf.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <linux/rbtree_latch.h>
1717
#include <linux/numa.h>
1818
#include <linux/wait.h>
19+
#include <linux/u64_stats_sync.h>
1920

2021
struct bpf_verifier_env;
2122
struct perf_event;
@@ -340,6 +341,12 @@ enum bpf_cgroup_storage_type {
340341

341342
#define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
342343

344+
struct bpf_prog_stats {
345+
u64 cnt;
346+
u64 nsecs;
347+
struct u64_stats_sync syncp;
348+
};
349+
343350
struct bpf_prog_aux {
344351
atomic_t refcnt;
345352
u32 used_map_cnt;
@@ -389,6 +396,7 @@ struct bpf_prog_aux {
389396
* main prog always has linfo_idx == 0
390397
*/
391398
u32 linfo_idx;
399+
struct bpf_prog_stats __percpu *stats;
392400
union {
393401
struct work_struct work;
394402
struct rcu_head rcu;
@@ -559,6 +567,7 @@ void bpf_map_area_free(void *base);
559567
void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
560568

561569
extern int sysctl_unprivileged_bpf_disabled;
570+
extern int sysctl_bpf_stats_enabled;
562571

563572
int bpf_map_new_fd(struct bpf_map *map, int flags);
564573
int bpf_prog_new_fd(struct bpf_prog *prog);

include/linux/filter.h

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -533,7 +533,24 @@ struct sk_filter {
533533
struct bpf_prog *prog;
534534
};
535535

536-
#define BPF_PROG_RUN(filter, ctx) ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); })
536+
DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);

/* Run a BPF program.  When the bpf_stats_enabled_key static key is on
 * (kernel.bpf_stats_enabled sysctl), additionally account the invocation
 * count and run time into the program's per-CPU bpf_prog_stats.  With the
 * key off, the only overhead versus a plain call is the static branch.
 */
#define BPF_PROG_RUN(prog, ctx)	({					\
	u32 ret;							\
	cant_sleep();							\
	if (static_branch_unlikely(&bpf_stats_enabled_key)) {		\
		struct bpf_prog_stats *stats;				\
		u64 start = sched_clock();				\
		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);		\
		stats = this_cpu_ptr(prog->aux->stats);			\
		u64_stats_update_begin(&stats->syncp);			\
		stats->cnt++;						\
		stats->nsecs += sched_clock() - start;			\
		u64_stats_update_end(&stats->syncp);			\
	} else {							\
		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);		\
	}								\
	ret; })
537554

538555
#define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
539556

@@ -764,6 +781,7 @@ void bpf_prog_free_jited_linfo(struct bpf_prog *prog);
764781
void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog);
765782

766783
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
784+
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags);
767785
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
768786
gfp_t gfp_extra_flags);
769787
void __bpf_prog_free(struct bpf_prog *fp);

kernel/bpf/core.c

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
7878
return NULL;
7979
}
8080

81-
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
81+
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
8282
{
8383
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
8484
struct bpf_prog_aux *aux;
@@ -104,6 +104,26 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
104104

105105
return fp;
106106
}
107+
108+
/* Allocate a BPF program plus its per-CPU run-time stats.
 * Returns NULL on failure; on failure nothing is leaked.
 */
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
	struct bpf_prog *prog;

	prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags);
	if (!prog)
		return NULL;

	prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
	if (!prog->aux->stats) {
		/* undo bpf_prog_alloc_no_stats() by hand */
		kfree(prog->aux);
		vfree(prog);
		return NULL;
	}

	/* NOTE(review): this initializes only one CPU's syncp; the other
	 * CPUs' copies rely on alloc_percpu_gfp() zeroing — confirm this is
	 * sufficient on 32-bit (seqcount-based) configurations.
	 */
	u64_stats_init(&prog->aux->stats->syncp);
	return prog;
}
107127
EXPORT_SYMBOL_GPL(bpf_prog_alloc);
108128

109129
int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
@@ -231,7 +251,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
231251

232252
void __bpf_prog_free(struct bpf_prog *fp)
233253
{
234-
kfree(fp->aux);
254+
if (fp->aux) {
255+
free_percpu(fp->aux->stats);
256+
kfree(fp->aux);
257+
}
235258
vfree(fp);
236259
}
237260

@@ -2069,6 +2092,10 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
20692092
return -EFAULT;
20702093
}
20712094

2095+
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
2096+
EXPORT_SYMBOL(bpf_stats_enabled_key);
2097+
int sysctl_bpf_stats_enabled __read_mostly;
2098+
20722099
/* All definitions of tracepoints related to BPF. */
20732100
#define CREATE_TRACE_POINTS
20742101
#include <linux/bpf_trace.h>

kernel/bpf/syscall.c

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1283,24 +1283,54 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
12831283
return 0;
12841284
}
12851285

1286+
static void bpf_prog_get_stats(const struct bpf_prog *prog,
1287+
struct bpf_prog_stats *stats)
1288+
{
1289+
u64 nsecs = 0, cnt = 0;
1290+
int cpu;
1291+
1292+
for_each_possible_cpu(cpu) {
1293+
const struct bpf_prog_stats *st;
1294+
unsigned int start;
1295+
u64 tnsecs, tcnt;
1296+
1297+
st = per_cpu_ptr(prog->aux->stats, cpu);
1298+
do {
1299+
start = u64_stats_fetch_begin_irq(&st->syncp);
1300+
tnsecs = st->nsecs;
1301+
tcnt = st->cnt;
1302+
} while (u64_stats_fetch_retry_irq(&st->syncp, start));
1303+
nsecs += tnsecs;
1304+
cnt += tcnt;
1305+
}
1306+
stats->nsecs = nsecs;
1307+
stats->cnt = cnt;
1308+
}
1309+
12861310
#ifdef CONFIG_PROC_FS
12871311
static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
12881312
{
12891313
const struct bpf_prog *prog = filp->private_data;
12901314
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
1315+
struct bpf_prog_stats stats;
12911316

1317+
bpf_prog_get_stats(prog, &stats);
12921318
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
12931319
seq_printf(m,
12941320
"prog_type:\t%u\n"
12951321
"prog_jited:\t%u\n"
12961322
"prog_tag:\t%s\n"
12971323
"memlock:\t%llu\n"
1298-
"prog_id:\t%u\n",
1324+
"prog_id:\t%u\n"
1325+
"run_time_ns:\t%llu\n"
1326+
"run_cnt:\t%llu\n",
12991327
prog->type,
13001328
prog->jited,
13011329
prog_tag,
13021330
prog->pages * 1ULL << PAGE_SHIFT,
1303-
prog->aux->id);
1331+
prog->aux->id,
1332+
stats.nsecs,
1333+
stats.cnt);
13041334
}
13051335
#endif
13061336

kernel/bpf/verifier.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7320,7 +7320,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
73207320
subprog_end = env->subprog_info[i + 1].start;
73217321

73227322
len = subprog_end - subprog_start;
7323-
func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
7323+
/* BPF_PROG_RUN doesn't call subprogs directly,
7324+
* hence main prog stats include the runtime of subprogs.
7325+
* subprogs don't have IDs and not reachable via prog_get_next_id
7326+
* func[i]->aux->stats will never be accessed and stays NULL
7327+
*/
7328+
func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
73247329
if (!func[i])
73257330
goto out_free;
73267331
memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],

kernel/sysctl.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,9 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
224224
#endif
225225
static int proc_dopipe_max_size(struct ctl_table *table, int write,
226226
void __user *buffer, size_t *lenp, loff_t *ppos);
227+
static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
228+
void __user *buffer, size_t *lenp,
229+
loff_t *ppos);
227230

228231
#ifdef CONFIG_MAGIC_SYSRQ
229232
/* Note: sysrq code uses its own private copy */
@@ -1230,6 +1233,15 @@ static struct ctl_table kern_table[] = {
12301233
.extra2 = &one,
12311234
},
12321235
#endif
1236+
{
1237+
.procname = "bpf_stats_enabled",
1238+
.data = &sysctl_bpf_stats_enabled,
1239+
.maxlen = sizeof(sysctl_bpf_stats_enabled),
1240+
.mode = 0644,
1241+
.proc_handler = proc_dointvec_minmax_bpf_stats,
1242+
.extra1 = &zero,
1243+
.extra2 = &one,
1244+
},
12331245
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
12341246
{
12351247
.procname = "panic_on_rcu_stall",
@@ -3260,6 +3272,28 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
32603272

32613273
#endif /* CONFIG_PROC_SYSCTL */
32623274

3275+
static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
3276+
void __user *buffer, size_t *lenp,
3277+
loff_t *ppos)
3278+
{
3279+
int ret, bpf_stats = *(int *)table->data;
3280+
struct ctl_table tmp = *table;
3281+
3282+
if (write && !capable(CAP_SYS_ADMIN))
3283+
return -EPERM;
3284+
3285+
tmp.data = &bpf_stats;
3286+
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
3287+
if (write && !ret) {
3288+
*(int *)table->data = bpf_stats;
3289+
if (bpf_stats)
3290+
static_branch_enable(&bpf_stats_enabled_key);
3291+
else
3292+
static_branch_disable(&bpf_stats_enabled_key);
3293+
}
3294+
return ret;
3295+
}
3296+
32633297
/*
32643298
* No sense putting this after each symbol definition, twice,
32653299
* exception granted :-)

0 commit comments

Comments
 (0)