Skip to content

Commit 74b2058

Browse files
David Ahern authored and davem330 committed
net: l3mdev: Add hook in ip and ipv6
Currently the VRF driver uses the rx_handler to switch the skb device to the VRF device. Switching the dev prior to the IP / IPv6 layer means the VRF driver has to duplicate IP/IPv6 processing, which adds overhead and makes features such as retaining the ingress device index more complicated than necessary.

This patch moves the hook to the L3 layer just after the first NF_HOOK for PRE_ROUTING. This location makes exposing the original ingress device trivial (next patch) and allows adding other NF_HOOKs to the VRF driver in the future.

dev_queue_xmit_nit is exported so that the VRF driver can cycle the skb with the switched device through the packet taps to maintain current behavior (tcpdump can be used on either the vrf device or the enslaved devices).

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent ca4aa97 commit 74b2058

File tree

8 files changed

+170
-101
lines changed

8 files changed

+170
-101
lines changed

drivers/net/vrf.c

Lines changed: 91 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,6 @@
4242
#define DRV_NAME "vrf"
4343
#define DRV_VERSION "1.0"
4444

45-
#define vrf_master_get_rcu(dev) \
46-
((struct net_device *)rcu_dereference(dev->rx_handler_data))
47-
4845
struct net_vrf {
4946
struct rtable *rth;
5047
struct rt6_info *rt6;
@@ -60,90 +57,12 @@ struct pcpu_dstats {
6057
struct u64_stats_sync syncp;
6158
};
6259

63-
/* neighbor handling is done with actual device; do not want
64-
* to flip skb->dev for those ndisc packets. This really fails
65-
* for multiple next protocols (e.g., NEXTHDR_HOP). But it is
66-
* a start.
67-
*/
68-
#if IS_ENABLED(CONFIG_IPV6)
69-
static bool check_ipv6_frame(const struct sk_buff *skb)
70-
{
71-
const struct ipv6hdr *ipv6h;
72-
struct ipv6hdr _ipv6h;
73-
bool rc = true;
74-
75-
ipv6h = skb_header_pointer(skb, 0, sizeof(_ipv6h), &_ipv6h);
76-
if (!ipv6h)
77-
goto out;
78-
79-
if (ipv6h->nexthdr == NEXTHDR_ICMP) {
80-
const struct icmp6hdr *icmph;
81-
struct icmp6hdr _icmph;
82-
83-
icmph = skb_header_pointer(skb, sizeof(_ipv6h),
84-
sizeof(_icmph), &_icmph);
85-
if (!icmph)
86-
goto out;
87-
88-
switch (icmph->icmp6_type) {
89-
case NDISC_ROUTER_SOLICITATION:
90-
case NDISC_ROUTER_ADVERTISEMENT:
91-
case NDISC_NEIGHBOUR_SOLICITATION:
92-
case NDISC_NEIGHBOUR_ADVERTISEMENT:
93-
case NDISC_REDIRECT:
94-
rc = false;
95-
break;
96-
}
97-
}
98-
99-
out:
100-
return rc;
101-
}
102-
#else
103-
static bool check_ipv6_frame(const struct sk_buff *skb)
104-
{
105-
return false;
106-
}
107-
#endif
108-
109-
static bool is_ip_rx_frame(struct sk_buff *skb)
110-
{
111-
switch (skb->protocol) {
112-
case htons(ETH_P_IP):
113-
return true;
114-
case htons(ETH_P_IPV6):
115-
return check_ipv6_frame(skb);
116-
}
117-
return false;
118-
}
119-
12060
/* Record a failed transmit on the VRF device: increment its tx_errors
 * counter and release the skb with kfree_skb().
 */
static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
12161
{
12262
vrf_dev->stats.tx_errors++;
12363
kfree_skb(skb);
12464
}
12565

126-
/* note: already called with rcu_read_lock */
127-
static rx_handler_result_t vrf_handle_frame(struct sk_buff **pskb)
128-
{
129-
struct sk_buff *skb = *pskb;
130-
131-
if (is_ip_rx_frame(skb)) {
132-
struct net_device *dev = vrf_master_get_rcu(skb->dev);
133-
struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
134-
135-
u64_stats_update_begin(&dstats->syncp);
136-
dstats->rx_pkts++;
137-
dstats->rx_bytes += skb->len;
138-
u64_stats_update_end(&dstats->syncp);
139-
140-
skb->dev = dev;
141-
142-
return RX_HANDLER_ANOTHER;
143-
}
144-
return RX_HANDLER_PASS;
145-
}
146-
14766
static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev,
14867
struct rtnl_link_stats64 *stats)
14968
{
@@ -506,28 +425,14 @@ static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
506425
{
507426
int ret;
508427

509-
/* register the packet handler for slave ports */
510-
ret = netdev_rx_handler_register(port_dev, vrf_handle_frame, dev);
511-
if (ret) {
512-
netdev_err(port_dev,
513-
"Device %s failed to register rx_handler\n",
514-
port_dev->name);
515-
goto out_fail;
516-
}
517-
518428
ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL);
519429
if (ret < 0)
520-
goto out_unregister;
430+
return ret;
521431

522432
port_dev->priv_flags |= IFF_L3MDEV_SLAVE;
523433
cycle_netdev(port_dev);
524434

525435
return 0;
526-
527-
out_unregister:
528-
netdev_rx_handler_unregister(port_dev);
529-
out_fail:
530-
return ret;
531436
}
532437

533438
static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
@@ -544,8 +449,6 @@ static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
544449
netdev_upper_dev_unlink(port_dev, dev);
545450
port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
546451

547-
netdev_rx_handler_unregister(port_dev);
548-
549452
cycle_netdev(port_dev);
550453

551454
return 0;
@@ -669,6 +572,95 @@ static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4)
669572
return rc;
670573
}
671574

575+
#if IS_ENABLED(CONFIG_IPV6)
576+
/* neighbor handling is done with actual device; do not want
577+
* to flip skb->dev for those ndisc packets. This really fails
578+
* for multiple next protocols (e.g., NEXTHDR_HOP). But it is
579+
* a start.
580+
*/
581+
/* Return true when the IPv6 header's next-header field is NEXTHDR_ICMP
 * and the ICMPv6 type is one of the five NDISC message types listed in
 * the switch below. Return false in every other case, including when
 * the ICMPv6 header cannot be obtained from the skb.
 */
static bool ipv6_ndisc_frame(const struct sk_buff *skb)
582+
{
583+
const struct ipv6hdr *iph = ipv6_hdr(skb);
584+
bool rc = false;
585+
586+
/* only the first next-header is inspected; extension headers are
 * not walked
 */
if (iph->nexthdr == NEXTHDR_ICMP) {
587+
const struct icmp6hdr *icmph;
588+
struct icmp6hdr _icmph;
589+
590+
/* fetch the ICMPv6 header located sizeof(*iph) bytes in; _icmph
 * is the local buffer handed to skb_header_pointer()
 */
icmph = skb_header_pointer(skb, sizeof(*iph),
591+
sizeof(_icmph), &_icmph);
592+
/* header not available: fall through to out with rc == false */
if (!icmph)
593+
goto out;
594+
595+
switch (icmph->icmp6_type) {
596+
case NDISC_ROUTER_SOLICITATION:
597+
case NDISC_ROUTER_ADVERTISEMENT:
598+
case NDISC_NEIGHBOUR_SOLICITATION:
599+
case NDISC_NEIGHBOUR_ADVERTISEMENT:
600+
case NDISC_REDIRECT:
601+
rc = true;
602+
break;
603+
}
604+
}
605+
606+
out:
607+
return rc;
608+
}
609+
610+
/* IPv6 receive hook for the VRF device (CONFIG_IPV6 build).
 *
 * For every frame that ipv6_ndisc_frame() does not classify as NDISC,
 * retarget the skb at vrf_dev, pass it to dev_queue_xmit_nit() with the
 * VRF device, and set IP6SKB_L3SLAVE in the IPv6 control block. NDISC
 * frames are left untouched. The same skb is always returned.
 */
static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
611+
struct sk_buff *skb)
612+
{
613+
/* if packet is NDISC keep the ingress interface */
614+
if (!ipv6_ndisc_frame(skb)) {
615+
skb->dev = vrf_dev;
616+
skb->skb_iif = vrf_dev->ifindex;
617+
618+
/* expose mac_len bytes in front of the data around the
 * dev_queue_xmit_nit() call, then pull them back off so the skb
 * data pointer ends where it started
 */
skb_push(skb, skb->mac_len);
619+
dev_queue_xmit_nit(skb, vrf_dev);
620+
skb_pull(skb, skb->mac_len);
621+
622+
/* mark the skb so that skb_l3mdev_slave() reports true for it */
IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
623+
}
624+
625+
return skb;
626+
}
627+
628+
#else
629+
/* Stub for builds without CONFIG_IPV6: returns the skb unchanged. */
static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
630+
struct sk_buff *skb)
631+
{
632+
return skb;
633+
}
634+
#endif
635+
636+
/* IPv4 receive hook for the VRF device.
 *
 * Retargets the skb at vrf_dev (both skb->dev and skb->skb_iif), passes
 * it to dev_queue_xmit_nit() with the VRF device, and returns the same
 * skb. Unlike the IPv6 path, no frame type is exempted.
 */
static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev,
637+
struct sk_buff *skb)
638+
{
639+
skb->dev = vrf_dev;
640+
skb->skb_iif = vrf_dev->ifindex;
641+
642+
/* expose mac_len bytes in front of the data around the
 * dev_queue_xmit_nit() call, then pull them back off
 */
skb_push(skb, skb->mac_len);
643+
dev_queue_xmit_nit(skb, vrf_dev);
644+
skb_pull(skb, skb->mac_len);
645+
646+
return skb;
647+
}
648+
649+
/* called with rcu lock held */
650+
/* Dispatch a received skb to the per-family VRF handler.
 *
 * proto is matched against AF_INET and AF_INET6; for any other value
 * the skb is returned unmodified.
 */
static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev,
651+
struct sk_buff *skb,
652+
u16 proto)
653+
{
654+
switch (proto) {
655+
case AF_INET:
656+
return vrf_ip_rcv(vrf_dev, skb);
657+
case AF_INET6:
658+
return vrf_ip6_rcv(vrf_dev, skb);
659+
}
660+
661+
/* unhandled address family: hand the skb back as-is */
return skb;
662+
}
663+
672664
#if IS_ENABLED(CONFIG_IPV6)
673665
static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
674666
const struct flowi6 *fl6)
@@ -690,6 +682,7 @@ static const struct l3mdev_ops vrf_l3mdev_ops = {
690682
.l3mdev_fib_table = vrf_fib_table,
691683
.l3mdev_get_rtable = vrf_get_rtable,
692684
.l3mdev_get_saddr = vrf_get_saddr,
685+
.l3mdev_l3_rcv = vrf_l3_rcv,
693686
#if IS_ENABLED(CONFIG_IPV6)
694687
.l3mdev_get_rt6_dst = vrf_get_rt6_dst,
695688
#endif

include/linux/ipv6.h

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,14 +118,29 @@ struct inet6_skb_parm {
118118
#define IP6SKB_ROUTERALERT 8
119119
#define IP6SKB_FRAGMENTED 16
120120
#define IP6SKB_HOPBYHOP 32
121+
#define IP6SKB_L3SLAVE 64
121122
};
122123

124+
/* skb_l3mdev_slave - test the IP6SKB_L3SLAVE bit in an inet6_skb_parm
 * flags value.
 *
 * With CONFIG_NET_L3_MASTER_DEV defined, the result is true when the bit
 * is set. Without it, the helper always returns false.
 */
#if defined(CONFIG_NET_L3_MASTER_DEV)
125+
static inline bool skb_l3mdev_slave(__u16 flags)
126+
{
127+
return flags & IP6SKB_L3SLAVE;
128+
}
129+
#else
130+
static inline bool skb_l3mdev_slave(__u16 flags)
131+
{
132+
return false;
133+
}
134+
#endif
135+
123136
#define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb))
124137
#define IP6CBMTU(skb) ((struct ip6_mtuinfo *)((skb)->cb))
125138

126139
/* Return the interface index associated with the skb.
 *
 * Diff view: the line marked '-' is the previous body, which returned
 * IP6CB(skb)->iif unconditionally. The '+' lines return skb->skb_iif
 * when skb_l3mdev_slave() reports the IP6SKB_L3SLAVE flag in the control
 * block, and IP6CB(skb)->iif otherwise.
 */
static inline int inet6_iif(const struct sk_buff *skb)
127140
{
128-
return IP6CB(skb)->iif;
141+
bool l3_slave = skb_l3mdev_slave(IP6CB(skb)->flags);
142+
143+
return l3_slave ? skb->skb_iif : IP6CB(skb)->iif;
129144
}
130145

131146
struct tcp6_request_sock {

include/linux/netdevice.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3258,6 +3258,8 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
32583258
bool is_skb_forwardable(const struct net_device *dev,
32593259
const struct sk_buff *skb);
32603260

3261+
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
3262+
32613263
extern int netdev_budget;
32623264

32633265
/* Called by rtnetlink.c:rtnl_unlock() */

include/net/l3mdev.h

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525

2626
struct l3mdev_ops {
2727
u32 (*l3mdev_fib_table)(const struct net_device *dev);
28+
struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev,
29+
struct sk_buff *skb, u16 proto);
2830

2931
/* IPv4 ops */
3032
struct rtable * (*l3mdev_get_rtable)(const struct net_device *dev,
@@ -134,6 +136,34 @@ int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4);
134136

135137
struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6);
136138

139+
/* Hand a received skb to the L3 master device's l3mdev_l3_rcv handler.
 *
 * The master is looked up from skb->dev: the upper master device when
 * skb->dev is an L3 slave, or skb->dev itself when it is an L3 master.
 * If no master is found, or its ops do not provide l3mdev_l3_rcv, the
 * skb is returned unchanged; otherwise the handler's return value is
 * returned.
 */
static inline
140+
struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto)
141+
{
142+
struct net_device *master = NULL;
143+
144+
if (netif_is_l3_slave(skb->dev))
145+
master = netdev_master_upper_dev_get_rcu(skb->dev);
146+
else if (netif_is_l3_master(skb->dev))
147+
master = skb->dev;
148+
149+
/* master stays NULL when skb->dev is neither slave nor master, or
 * when the upper-device lookup returned NULL
 */
if (master && master->l3mdev_ops->l3mdev_l3_rcv)
150+
skb = master->l3mdev_ops->l3mdev_l3_rcv(master, skb, proto);
151+
152+
return skb;
153+
}
154+
155+
/* IPv4 entry point: forwards to l3mdev_l3_rcv() with proto AF_INET and
 * returns its result.
 */
static inline
156+
struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
157+
{
158+
return l3mdev_l3_rcv(skb, AF_INET);
159+
}
160+
161+
/* IPv6 entry point: forwards to l3mdev_l3_rcv() with proto AF_INET6 and
 * returns its result.
 */
static inline
162+
struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
163+
{
164+
return l3mdev_l3_rcv(skb, AF_INET6);
165+
}
166+
137167
#else
138168

139169
static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev)
@@ -194,6 +224,18 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6)
194224
{
195225
return NULL;
196226
}
227+
228+
/* Stub on the #else side of this header's configuration conditional
 * (the controlling #if is outside this hunk): returns the skb unchanged.
 */
static inline
229+
struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
230+
{
231+
return skb;
232+
}
233+
234+
/* Stub on the #else side of this header's configuration conditional
 * (the controlling #if is outside this hunk): returns the skb unchanged.
 */
static inline
235+
struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
236+
{
237+
return skb;
238+
}
197239
#endif
198240

199241
#endif /* _NET_L3MDEV_H_ */

include/net/tcp.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -786,7 +786,9 @@ struct tcp_skb_cb {
786786
*/
787787
static inline int tcp_v6_iif(const struct sk_buff *skb)
788788
{
789-
return TCP_SKB_CB(skb)->header.h6.iif;
789+
/* Diff view: the '-' line above is the previous body. The '+' lines
 * return skb->skb_iif when skb_l3mdev_slave() reports the
 * IP6SKB_L3SLAVE flag in header.h6.flags, and header.h6.iif
 * otherwise.
 */
bool l3_slave = skb_l3mdev_slave(TCP_SKB_CB(skb)->header.h6.flags);
790+
791+
return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
790792
}
791793
#endif
792794

net/core/dev.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1850,7 +1850,7 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
18501850
* taps currently in use.
18511851
*/
18521852

1853-
static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1853+
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
18541854
{
18551855
struct packet_type *ptype;
18561856
struct sk_buff *skb2 = NULL;
@@ -1907,6 +1907,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
19071907
pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
19081908
rcu_read_unlock();
19091909
}
1910+
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
19101911

19111912
/**
19121913
* netif_setup_tc - Handle tc mappings on real_num_tx_queues change

net/ipv4/ip_input.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,13 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
313313
const struct iphdr *iph = ip_hdr(skb);
314314
struct rtable *rt;
315315

316+
/* if ingress device is enslaved to an L3 master device pass the
317+
* skb to its handler for processing
318+
*/
319+
skb = l3mdev_ip_rcv(skb);
320+
if (!skb)
321+
return NET_RX_SUCCESS;
322+
316323
if (net->ipv4.sysctl_ip_early_demux &&
317324
!skb_dst(skb) &&
318325
!skb->sk &&

0 commit comments

Comments
 (0)