Skip to content

Commit 7f3249f

Browse files
committed
Merge branch 'bpf-xdp-adjust-tail'
Nikita V. Shirokov says: ==================== In this patch series I'm adding a new bpf helper which allows manipulating xdp's data_end pointer. Right now only "shrinking" (reducing the packet's size by moving the pointer) is supported (and I see no use case for "growing"). The main use case for such a helper is to be able to generate control (ICMP) messages from XDP context. Such messages usually contain the first N bytes of the original packet as a payload, and this is exactly what this helper allows us to do (see patch 3 for a sample program, where we generate an ICMP "packet too big" message). This helper could be useful for load balancing applications where, after additional encapsulation, the resulting packet could be bigger than the interface MTU. Aside from the new helper, this patch series contains minor changes in device drivers (for the ones which require them), so that they recalculate the packet's length not only when the head pointer was adjusted, but also when the tail pointer was. v2->v3: * added the "signed off by" that was missing in v2 v1->v2: * fixed kbuild warning * made offset eq 0 invalid for bpf_xdp_adjust_tail * split the bpf_prog_test_run fix and selftests into separate commits * added SPDX licence where applicable * some reshuffling of the patch order (tests are now at the end) ==================== Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2 parents 0c90f22 + c6ffd1f commit 7f3249f

File tree

18 files changed

+435
-12
lines changed

18 files changed

+435
-12
lines changed

drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,10 +113,10 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
113113
if (tx_avail != bp->tx_ring_size)
114114
*event &= ~BNXT_RX_EVENT;
115115

116+
*len = xdp.data_end - xdp.data;
116117
if (orig_data != xdp.data) {
117118
offset = xdp.data - xdp.data_hard_start;
118119
*data_ptr = xdp.data_hard_start + offset;
119-
*len = xdp.data_end - xdp.data;
120120
}
121121
switch (act) {
122122
case XDP_PASS:

drivers/net/ethernet/cavium/thunder/nicvf_main.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -538,9 +538,9 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
538538
action = bpf_prog_run_xdp(prog, &xdp);
539539
rcu_read_unlock();
540540

541+
len = xdp.data_end - xdp.data;
541542
/* Check if XDP program has changed headers */
542543
if (orig_data != xdp.data) {
543-
len = xdp.data_end - xdp.data;
544544
offset = orig_data - xdp.data;
545545
dma_addr -= offset;
546546
}

drivers/net/ethernet/mellanox/mlx4/en_rx.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -775,8 +775,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
775775

776776
act = bpf_prog_run_xdp(xdp_prog, &xdp);
777777

778+
length = xdp.data_end - xdp.data;
778779
if (xdp.data != orig_data) {
779-
length = xdp.data_end - xdp.data;
780780
frags[0].page_offset = xdp.data -
781781
xdp.data_hard_start;
782782
va = xdp.data;

drivers/net/ethernet/netronome/nfp/nfp_net_common.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1722,7 +1722,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
17221722

17231723
act = bpf_prog_run_xdp(xdp_prog, &xdp);
17241724

1725-
pkt_len -= xdp.data - orig_data;
1725+
pkt_len = xdp.data_end - xdp.data;
17261726
pkt_off += xdp.data - orig_data;
17271727

17281728
switch (act) {

drivers/net/tun.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1696,6 +1696,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
16961696
return NULL;
16971697
case XDP_PASS:
16981698
delta = orig_data - xdp.data;
1699+
len = xdp.data_end - xdp.data;
16991700
break;
17001701
default:
17011702
bpf_warn_invalid_xdp_action(act);
@@ -1716,7 +1717,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
17161717
}
17171718

17181719
skb_reserve(skb, pad - delta);
1719-
skb_put(skb, len + delta);
1720+
skb_put(skb, len);
17201721
get_page(alloc_frag->page);
17211722
alloc_frag->offset += buflen;
17221723

drivers/net/virtio_net.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
606606
case XDP_PASS:
607607
/* Recalculate length in case bpf program changed it */
608608
delta = orig_data - xdp.data;
609+
len = xdp.data_end - xdp.data;
609610
break;
610611
case XDP_TX:
611612
xdpf = convert_to_xdp_frame(&xdp);
@@ -642,7 +643,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
642643
goto err;
643644
}
644645
skb_reserve(skb, headroom - delta);
645-
skb_put(skb, len + delta);
646+
skb_put(skb, len);
646647
if (!delta) {
647648
buf += header_offset;
648649
memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
@@ -757,6 +758,10 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
757758
offset = xdp.data -
758759
page_address(xdp_page) - vi->hdr_len;
759760

761+
/* recalculate len if xdp.data or xdp.data_end were
762+
* adjusted
763+
*/
764+
len = xdp.data_end - xdp.data;
760765
/* We can only create skb based on xdp_page. */
761766
if (unlikely(xdp_page != page)) {
762767
rcu_read_unlock();

include/uapi/linux/bpf.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -755,6 +755,13 @@ union bpf_attr {
755755
* @addr: pointer to struct sockaddr to bind socket to
756756
* @addr_len: length of sockaddr structure
757757
* Return: 0 on success or negative error code
758+
*
759+
* int bpf_xdp_adjust_tail(xdp_md, delta)
760+
* Adjust the xdp_md.data_end by delta. Only shrinking of packet's
761+
* size is supported.
762+
* @xdp_md: pointer to xdp_md
763+
* @delta: A negative integer to be added to xdp_md.data_end
764+
* Return: 0 on success or negative on error
758765
*/
759766
#define __BPF_FUNC_MAPPER(FN) \
760767
FN(unspec), \
@@ -821,7 +828,8 @@ union bpf_attr {
821828
FN(msg_apply_bytes), \
822829
FN(msg_cork_bytes), \
823830
FN(msg_pull_data), \
824-
FN(bind),
831+
FN(bind), \
832+
FN(xdp_adjust_tail),
825833

826834
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
827835
* function eBPF program intends to call

net/bpf/test_run.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
170170
xdp.rxq = &rxqueue->xdp_rxq;
171171

172172
retval = bpf_test_run(prog, &xdp, repeat, &duration);
173-
if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN)
173+
if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN ||
174+
xdp.data_end != xdp.data + size)
174175
size = xdp.data_end - xdp.data;
175176
ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration);
176177
kfree(data);

net/core/dev.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3996,9 +3996,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
39963996
struct bpf_prog *xdp_prog)
39973997
{
39983998
struct netdev_rx_queue *rxqueue;
3999+
void *orig_data, *orig_data_end;
39994000
u32 metalen, act = XDP_DROP;
40004001
struct xdp_buff xdp;
4001-
void *orig_data;
40024002
int hlen, off;
40034003
u32 mac_len;
40044004

@@ -4037,6 +4037,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
40374037
xdp.data_meta = xdp.data;
40384038
xdp.data_end = xdp.data + hlen;
40394039
xdp.data_hard_start = skb->data - skb_headroom(skb);
4040+
orig_data_end = xdp.data_end;
40404041
orig_data = xdp.data;
40414042

40424043
rxqueue = netif_get_rxqueue(skb);
@@ -4051,6 +4052,13 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
40514052
__skb_push(skb, -off);
40524053
skb->mac_header += off;
40534054

4055+
/* check if bpf_xdp_adjust_tail was used. it can only "shrink"
4056+
* the packet.
4057+
*/
4058+
off = orig_data_end - xdp.data_end;
4059+
if (off != 0)
4060+
skb_set_tail_pointer(skb, xdp.data_end - xdp.data);
4061+
40544062
switch (act) {
40554063
case XDP_REDIRECT:
40564064
case XDP_TX:

net/core/filter.c

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2725,6 +2725,30 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
27252725
.arg2_type = ARG_ANYTHING,
27262726
};
27272727

2728+
/*
 * bpf_xdp_adjust_tail - move xdp->data_end by @offset bytes.
 *
 * Only a strictly negative @offset (shrinking) is accepted, and the
 * resulting packet must still hold at least ETH_HLEN bytes.
 *
 * Return: 0 on success, -EINVAL if @offset is not negative or the packet
 * would become shorter than ETH_HLEN.
 */
BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
{
	void *new_end;

	/* only shrinking is allowed for now. */
	if (unlikely(offset >= 0))
		return -EINVAL;

	new_end = xdp->data_end + offset;

	/* refuse to cut into the first ETH_HLEN bytes of the packet */
	if (unlikely(new_end < xdp->data + ETH_HLEN))
		return -EINVAL;

	xdp->data_end = new_end;
	return 0;
}
2743+
2744+
static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
2745+
.func = bpf_xdp_adjust_tail,
2746+
.gpl_only = false,
2747+
.ret_type = RET_INTEGER,
2748+
.arg1_type = ARG_PTR_TO_CTX,
2749+
.arg2_type = ARG_ANYTHING,
2750+
};
2751+
27282752
BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
27292753
{
27302754
void *meta = xdp->data_meta + offset;
@@ -3074,7 +3098,8 @@ bool bpf_helper_changes_pkt_data(void *func)
30743098
func == bpf_l4_csum_replace ||
30753099
func == bpf_xdp_adjust_head ||
30763100
func == bpf_xdp_adjust_meta ||
3077-
func == bpf_msg_pull_data)
3101+
func == bpf_msg_pull_data ||
3102+
func == bpf_xdp_adjust_tail)
30783103
return true;
30793104

30803105
return false;
@@ -3888,6 +3913,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
38883913
return &bpf_xdp_redirect_proto;
38893914
case BPF_FUNC_redirect_map:
38903915
return &bpf_xdp_redirect_map_proto;
3916+
case BPF_FUNC_xdp_adjust_tail:
3917+
return &bpf_xdp_adjust_tail_proto;
38913918
default:
38923919
return bpf_base_func_proto(func_id);
38933920
}

samples/bpf/Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ hostprogs-y += xdp_monitor
4444
hostprogs-y += xdp_rxq_info
4545
hostprogs-y += syscall_tp
4646
hostprogs-y += cpustat
47+
hostprogs-y += xdp_adjust_tail
4748

4849
# Libbpf dependencies
4950
LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -95,6 +96,7 @@ xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
9596
xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
9697
syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
9798
cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o
99+
xdp_adjust_tail-objs := bpf_load.o $(LIBBPF) xdp_adjust_tail_user.o
98100

99101
# Tell kbuild to always build the programs
100102
always := $(hostprogs-y)
@@ -148,6 +150,7 @@ always += xdp_rxq_info_kern.o
148150
always += xdp2skb_meta_kern.o
149151
always += syscall_tp_kern.o
150152
always += cpustat_kern.o
153+
always += xdp_adjust_tail_kern.o
151154

152155
HOSTCFLAGS += -I$(objtree)/usr/include
153156
HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -193,6 +196,7 @@ HOSTLOADLIBES_xdp_monitor += -lelf
193196
HOSTLOADLIBES_xdp_rxq_info += -lelf
194197
HOSTLOADLIBES_syscall_tp += -lelf
195198
HOSTLOADLIBES_cpustat += -lelf
199+
HOSTLOADLIBES_xdp_adjust_tail += -lelf
196200

197201
# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
198202
# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang

samples/bpf/xdp_adjust_tail_kern.c

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
/* SPDX-License-Identifier: GPL-2.0
2+
* Copyright (c) 2018 Facebook
3+
*
4+
* This program is free software; you can redistribute it and/or
5+
* modify it under the terms of version 2 of the GNU General Public
6+
* License as published by the Free Software Foundation.
7+
*
8+
* This program shows how to use bpf_xdp_adjust_tail() by
9+
* generating ICMPv4 "packet too big" (unreachable/ df bit set frag needed
10+
* to be more precise in case of v4)" when receiving packets bigger than
11+
* 600 bytes.
12+
*/
13+
#define KBUILD_MODNAME "foo"
14+
#include <uapi/linux/bpf.h>
15+
#include <linux/in.h>
16+
#include <linux/if_ether.h>
17+
#include <linux/if_packet.h>
18+
#include <linux/if_vlan.h>
19+
#include <linux/ip.h>
20+
#include <linux/icmp.h>
21+
#include "bpf_helpers.h"
22+
23+
#define DEFAULT_TTL 64
24+
#define MAX_PCKT_SIZE 600
25+
#define ICMP_TOOBIG_SIZE 98
26+
#define ICMP_TOOBIG_PAYLOAD_SIZE 92
27+
28+
/*
 * One-entry array map (32-bit key, 64-bit value) holding the counter
 * that count_icmp() increments.
 */
struct bpf_map_def SEC("maps") icmpcnt = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(__u32),
	.value_size	= sizeof(__u64),
	.max_entries	= 1,
};
34+
35+
static __always_inline void count_icmp(void)
36+
{
37+
u64 key = 0;
38+
u64 *icmp_count;
39+
40+
icmp_count = bpf_map_lookup_elem(&icmpcnt, &key);
41+
if (icmp_count)
42+
*icmp_count += 1;
43+
}
44+
45+
static __always_inline void swap_mac(void *data, struct ethhdr *orig_eth)
46+
{
47+
struct ethhdr *eth;
48+
49+
eth = data;
50+
memcpy(eth->h_source, orig_eth->h_dest, ETH_ALEN);
51+
memcpy(eth->h_dest, orig_eth->h_source, ETH_ALEN);
52+
eth->h_proto = orig_eth->h_proto;
53+
}
54+
55+
/*
 * Fold a 32-bit one's-complement partial sum into 16 bits and return its
 * complement.
 *
 * Adding the two 16-bit halves can itself carry into bit 16 (the sum may
 * be as large as 0x1fffe), so the fold is applied twice; after the second
 * pass the value is guaranteed to fit in 16 bits. A single fold would
 * silently drop that carry when the result is truncated to __u16.
 */
static __always_inline __u16 csum_fold_helper(__u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	/* absorb the carry the first fold may have produced */
	csum = (csum & 0xffff) + (csum >> 16);
	return ~csum;
}
59+
60+
static __always_inline void ipv4_csum(void *data_start, int data_size,
61+
__u32 *csum)
62+
{
63+
*csum = bpf_csum_diff(0, 0, data_start, data_size, *csum);
64+
*csum = csum_fold_helper(*csum);
65+
}
66+
67+
static __always_inline int send_icmp4_too_big(struct xdp_md *xdp)
68+
{
69+
int headroom = (int)sizeof(struct iphdr) + (int)sizeof(struct icmphdr);
70+
71+
if (bpf_xdp_adjust_head(xdp, 0 - headroom))
72+
return XDP_DROP;
73+
void *data = (void *)(long)xdp->data;
74+
void *data_end = (void *)(long)xdp->data_end;
75+
76+
if (data + (ICMP_TOOBIG_SIZE + headroom) > data_end)
77+
return XDP_DROP;
78+
79+
struct iphdr *iph, *orig_iph;
80+
struct icmphdr *icmp_hdr;
81+
struct ethhdr *orig_eth;
82+
__u32 csum = 0;
83+
__u64 off = 0;
84+
85+
orig_eth = data + headroom;
86+
swap_mac(data, orig_eth);
87+
off += sizeof(struct ethhdr);
88+
iph = data + off;
89+
off += sizeof(struct iphdr);
90+
icmp_hdr = data + off;
91+
off += sizeof(struct icmphdr);
92+
orig_iph = data + off;
93+
icmp_hdr->type = ICMP_DEST_UNREACH;
94+
icmp_hdr->code = ICMP_FRAG_NEEDED;
95+
icmp_hdr->un.frag.mtu = htons(MAX_PCKT_SIZE-sizeof(struct ethhdr));
96+
icmp_hdr->checksum = 0;
97+
ipv4_csum(icmp_hdr, ICMP_TOOBIG_PAYLOAD_SIZE, &csum);
98+
icmp_hdr->checksum = csum;
99+
iph->ttl = DEFAULT_TTL;
100+
iph->daddr = orig_iph->saddr;
101+
iph->saddr = orig_iph->daddr;
102+
iph->version = 4;
103+
iph->ihl = 5;
104+
iph->protocol = IPPROTO_ICMP;
105+
iph->tos = 0;
106+
iph->tot_len = htons(
107+
ICMP_TOOBIG_SIZE + headroom - sizeof(struct ethhdr));
108+
iph->check = 0;
109+
csum = 0;
110+
ipv4_csum(iph, sizeof(struct iphdr), &csum);
111+
iph->check = csum;
112+
count_icmp();
113+
return XDP_TX;
114+
}
115+
116+
117+
static __always_inline int handle_ipv4(struct xdp_md *xdp)
118+
{
119+
void *data_end = (void *)(long)xdp->data_end;
120+
void *data = (void *)(long)xdp->data;
121+
int pckt_size = data_end - data;
122+
int offset;
123+
124+
if (pckt_size > MAX_PCKT_SIZE) {
125+
offset = pckt_size - ICMP_TOOBIG_SIZE;
126+
if (bpf_xdp_adjust_tail(xdp, 0 - offset))
127+
return XDP_PASS;
128+
return send_icmp4_too_big(xdp);
129+
}
130+
return XDP_PASS;
131+
}
132+
133+
/*
 * XDP entry point: drop frames too short to hold an Ethernet header,
 * hand ETH_P_IP frames to handle_ipv4(), pass everything else.
 */
SEC("xdp_icmp")
int _xdp_icmp(struct xdp_md *xdp)
{
	void *data_end = (void *)(long)xdp->data_end;
	void *data = (void *)(long)xdp->data;
	struct ethhdr *eth = data;

	/* the Ethernet header must lie entirely inside the packet */
	if (eth + 1 > data_end)
		return XDP_DROP;

	if (eth->h_proto != htons(ETH_P_IP))
		return XDP_PASS;

	return handle_ipv4(xdp);
}
151+
152+
char _license[] SEC("license") = "GPL";

0 commit comments

Comments
 (0)