Skip to content

Commit 79dfab4

Browse files
committed
Merge branch 'VXLAN-underlay-VRF'
Alexis Bauvin says: ==================== net: Add VRF support for VXLAN underlay v6 -> v7: - proper locking for device in udp_tunnel following Sabrina Dubroca's advice v5 -> v6: - remove automatic rebinding patch following Roopa Prabhu's advice v4 -> v5: - move test script to its own patch (6/6) - add schematic for test script - apply David Ahern comments to the test script v3 -> v4: - rename vxlan_is_in_l3mdev_chain to netdev_is_upper master - move it to net/core/dev.c - make it return bool instead of int - check if remote_ifindex is zero before resolving the l3mdev - add testing script v2 -> v3: - fix build when CONFIG_NET_IPV6 is off - fix build "unused l3mdev_master_upper_ifindex_by_index" build error with some configs v1 -> v2: - move vxlan_get_l3mdev from vxlan driver to l3mdev driver as l3mdev_master_upper_ifindex_by_index - vxlan: rename variables named l3mdev_ifindex to ifindex v0 -> v1: - fix typos We are trying to isolate the VXLAN traffic from different VMs with VRF as shown in the schemas below: +-------------------------+ +----------------------------+ | +----------+ | | +------------+ | | | | | | | | | | | tap-red | | | | tap-blue | | | | | | | | | | | +----+-----+ | | +-----+------+ | | | | | | | | | | | | | | +----+---+ | | +----+----+ | | | | | | | | | | | br-red | | | | br-blue | | | | | | | | | | | +----+---+ | | +----+----+ | | | | | | | | | | | | | | | | | | | | +----+--------+ | | +--------------+ | | | | | | | | | | | vxlan-red | | | | vxlan-blue | | | | | | | | | | | +------+------+ | | +-------+------+ | | | | | | | | | VRF | | | VRF | | | red | | | blue | +-------------------------+ +----------------------------+ | | | | +---------------------------------------------------------+ | | | | | | | | | | +--------------+ | | | | | | | | | +---------+ eth0.2030 +---------+ | | | 10.0.0.1/24 | | | +-----+--------+ VRF | | | green| +---------------------------------------------------------+ | | +----+---+ | | | eth0 | | | +--------+ iproute2 
commands to reproduce the setup: ip link add green type vrf table 1 ip link set green up ip link add eth0.2030 link eth0 type vlan id 2030 ip link set eth0.2030 master green ip addr add 10.0.0.1/24 dev eth0.2030 ip link set eth0.2030 up ip link add blue type vrf table 2 ip link set blue up ip link add br-blue type bridge ip link set br-blue master blue ip link set br-blue up ip link add vxlan-blue type vxlan id 2 local 10.0.0.1 dev eth0.2030 \ port 4789 ip link set vxlan-blue master br-blue ip link set vxlan-blue up ip link set tap-blue master br-blue ip link set tap-blue up ip link add red type vrf table 3 ip link set red up ip link add br-red type bridge ip link set br-red master red ip link set br-red up ip link add vxlan-red type vxlan id 3 local 10.0.0.1 dev eth0.2030 \ port 4789 ip link set vxlan-red master br-red ip link set vxlan-red up ip link set tap-red master br-red ip link set tap-red up We faced some issues in the datapath, here are the details: * Egress traffic: The vxlan packets are sent directly to the default VRF because it's where the socket is bound, therefore the traffic has a default route via eth0. The workaround is to force this traffic to VRF green with ip rules. * Ingress traffic: When receiving the traffic on eth0.2030 the vxlan socket is unreachable from VRF green. The workaround is to enable *udp_l3mdev_accept* sysctl, but this breaks isolation between overlay and underlay: packets sent from blue or red by e.g. a guest VM will be accepted by the socket, allowing injection of VXLAN packets from the overlay. This patch series fixes the issues described above by allowing the VXLAN socket to be bound to a specific VRF device, therefore looking up in the correct table. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents e3dd762 + 03f1c26 commit 79dfab4

File tree

8 files changed

+228
-9
lines changed

8 files changed

+228
-9
lines changed

drivers/net/vxlan.c

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
188188
* and enabled unshareable flags.
189189
*/
190190
static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
191-
__be16 port, u32 flags)
191+
__be16 port, u32 flags, int ifindex)
192192
{
193193
struct vxlan_sock *vs;
194194

@@ -197,7 +197,8 @@ static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
197197
hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
198198
if (inet_sk(vs->sock->sk)->inet_sport == port &&
199199
vxlan_get_sk_family(vs) == family &&
200-
vs->flags == flags)
200+
vs->flags == flags &&
201+
vs->sock->sk->sk_bound_dev_if == ifindex)
201202
return vs;
202203
}
203204
return NULL;
@@ -237,7 +238,7 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex,
237238
{
238239
struct vxlan_sock *vs;
239240

240-
vs = vxlan_find_sock(net, family, port, flags);
241+
vs = vxlan_find_sock(net, family, port, flags, ifindex);
241242
if (!vs)
242243
return NULL;
243244

@@ -2288,6 +2289,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
22882289
struct rtable *rt;
22892290
__be16 df = 0;
22902291

2292+
if (!ifindex)
2293+
ifindex = sock4->sock->sk->sk_bound_dev_if;
2294+
22912295
rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos,
22922296
dst->sin.sin_addr.s_addr,
22932297
&local_ip.sin.sin_addr.s_addr,
@@ -2337,6 +2341,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
23372341
} else {
23382342
struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
23392343

2344+
if (!ifindex)
2345+
ifindex = sock6->sock->sk->sk_bound_dev_if;
2346+
23402347
ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos,
23412348
label, &dst->sin6.sin6_addr,
23422349
&local_ip.sin6.sin6_addr,
@@ -2951,7 +2958,7 @@ static const struct ethtool_ops vxlan_ethtool_ops = {
29512958
};
29522959

29532960
static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
2954-
__be16 port, u32 flags)
2961+
__be16 port, u32 flags, int ifindex)
29552962
{
29562963
struct socket *sock;
29572964
struct udp_port_cfg udp_conf;
@@ -2969,6 +2976,7 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
29692976
}
29702977

29712978
udp_conf.local_udp_port = port;
2979+
udp_conf.bind_ifindex = ifindex;
29722980

29732981
/* Open UDP socket */
29742982
err = udp_sock_create(net, &udp_conf, &sock);
@@ -2980,7 +2988,8 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
29802988

29812989
/* Create new listen socket if needed */
29822990
static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
2983-
__be16 port, u32 flags)
2991+
__be16 port, u32 flags,
2992+
int ifindex)
29842993
{
29852994
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
29862995
struct vxlan_sock *vs;
@@ -2995,7 +3004,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
29953004
for (h = 0; h < VNI_HASH_SIZE; ++h)
29963005
INIT_HLIST_HEAD(&vs->vni_list[h]);
29973006

2998-
sock = vxlan_create_sock(net, ipv6, port, flags);
3007+
sock = vxlan_create_sock(net, ipv6, port, flags, ifindex);
29993008
if (IS_ERR(sock)) {
30003009
kfree(vs);
30013010
return ERR_CAST(sock);
@@ -3033,11 +3042,17 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
30333042
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
30343043
struct vxlan_sock *vs = NULL;
30353044
struct vxlan_dev_node *node;
3045+
int l3mdev_index = 0;
3046+
3047+
if (vxlan->cfg.remote_ifindex)
3048+
l3mdev_index = l3mdev_master_upper_ifindex_by_index(
3049+
vxlan->net, vxlan->cfg.remote_ifindex);
30363050

30373051
if (!vxlan->cfg.no_share) {
30383052
spin_lock(&vn->sock_lock);
30393053
vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
3040-
vxlan->cfg.dst_port, vxlan->cfg.flags);
3054+
vxlan->cfg.dst_port, vxlan->cfg.flags,
3055+
l3mdev_index);
30413056
if (vs && !refcount_inc_not_zero(&vs->refcnt)) {
30423057
spin_unlock(&vn->sock_lock);
30433058
return -EBUSY;
@@ -3046,7 +3061,8 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
30463061
}
30473062
if (!vs)
30483063
vs = vxlan_socket_create(vxlan->net, ipv6,
3049-
vxlan->cfg.dst_port, vxlan->cfg.flags);
3064+
vxlan->cfg.dst_port, vxlan->cfg.flags,
3065+
l3mdev_index);
30503066
if (IS_ERR(vs))
30513067
return PTR_ERR(vs);
30523068
#if IS_ENABLED(CONFIG_IPV6)

include/net/l3mdev.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,17 @@ struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev)
101101
return master;
102102
}
103103

104+
int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex);
105+
/*
 * Locked convenience wrapper around
 * l3mdev_master_upper_ifindex_by_index_rcu(): performs the lookup inside an
 * rcu_read_lock()/rcu_read_unlock() section and returns whatever the _rcu
 * variant returned.
 */
static inline
106+
int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex)
107+
{
108+
rcu_read_lock();
109+
/* the parameter is reused to carry the result out of the RCU section */
ifindex = l3mdev_master_upper_ifindex_by_index_rcu(net, ifindex);
110+
rcu_read_unlock();
111+
112+
return ifindex;
113+
}
114+
104115
u32 l3mdev_fib_table_rcu(const struct net_device *dev);
105116
u32 l3mdev_fib_table_by_index(struct net *net, int ifindex);
106117
static inline u32 l3mdev_fib_table(const struct net_device *dev)
@@ -207,6 +218,17 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex)
207218
return 0;
208219
}
209220

221+
/*
 * Stub variant: ignores both arguments and always returns 0.
 * NOTE(review): presumably the fallback used when l3mdev support is not
 * built in -- the enclosing preprocessor guard is outside this hunk; confirm.
 */
static inline
222+
int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex)
223+
{
224+
return 0;
225+
}
226+
/*
 * Stub variant: ignores both arguments and always returns 0.
 * NOTE(review): presumably the fallback used when l3mdev support is not
 * built in -- the enclosing preprocessor guard is outside this hunk; confirm.
 */
static inline
227+
int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex)
228+
{
229+
return 0;
230+
}
231+
210232
static inline
211233
struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev)
212234
{

include/net/udp_tunnel.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ struct udp_port_cfg {
3030

3131
__be16 local_udp_port;
3232
__be16 peer_udp_port;
33+
int bind_ifindex;
3334
unsigned int use_udp_checksums:1,
3435
use_udp6_tx_checksums:1,
3536
use_udp6_rx_checksums:1,

net/ipv4/udp_tunnel.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,23 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
2020
if (err < 0)
2121
goto error;
2222

23+
if (cfg->bind_ifindex) {
24+
struct net_device *dev;
25+
26+
dev = dev_get_by_index(net, cfg->bind_ifindex);
27+
if (!dev) {
28+
err = -ENODEV;
29+
goto error;
30+
}
31+
32+
err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE,
33+
dev->name, strlen(dev->name) + 1);
34+
dev_put(dev);
35+
36+
if (err < 0)
37+
goto error;
38+
}
39+
2340
udp_addr.sin_family = AF_INET;
2441
udp_addr.sin_addr = cfg->local_ip;
2542
udp_addr.sin_port = cfg->local_udp_port;

net/ipv6/ip6_udp_tunnel.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,22 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
3131
if (err < 0)
3232
goto error;
3333
}
34+
if (cfg->bind_ifindex) {
35+
struct net_device *dev;
36+
37+
dev = dev_get_by_index(net, cfg->bind_ifindex);
38+
if (!dev) {
39+
err = -ENODEV;
40+
goto error;
41+
}
42+
43+
err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE,
44+
dev->name, strlen(dev->name) + 1);
45+
dev_put(dev);
46+
47+
if (err < 0)
48+
goto error;
49+
}
3450

3551
udp6_addr.sin6_family = AF_INET6;
3652
memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,

net/l3mdev/l3mdev.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,24 @@ int l3mdev_master_ifindex_rcu(const struct net_device *dev)
4646
}
4747
EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu);
4848

49+
/**
50+
* l3mdev_master_upper_ifindex_by_index_rcu - get index of upper l3 master
51+
* device
52+
* @net: network namespace for device index lookup
53+
* @ifindex: targeted interface
54+
*
* Looks up @ifindex in @net with dev_get_by_index_rcu() and, starting from
* that device, follows netdev_master_upper_dev_get() until a device for
* which netif_is_l3_master() is true is reached.
*
* Return: the ifindex of that L3 master device, or 0 if the lookup fails or
* the chain of master devices ends without reaching an L3 master.
*/
55+
int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex)
56+
{
57+
struct net_device *dev;
58+
59+
dev = dev_get_by_index_rcu(net, ifindex);
60+
/* climb one master at a time; stops on an L3 master or on NULL */
while (dev && !netif_is_l3_master(dev))
61+
dev = netdev_master_upper_dev_get(dev);
62+
63+
return dev ? dev->ifindex : 0;
64+
}
65+
EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu);
66+
4967
/**
5068
* l3mdev_fib_table - get FIB table id associated with an L3
5169
* master interface

tools/testing/selftests/net/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ CFLAGS += -I../../../../usr/include/
77
TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
88
TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh
99
TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh
10-
TEST_PROGS += udpgro_bench.sh udpgro.sh
10+
TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh
1111
TEST_PROGS_EXTENDED := in_netns.sh
1212
TEST_GEN_FILES = socket
1313
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
#!/bin/bash
2+
# SPDX-License-Identifier: GPL-2.0
3+
4+
# This test is for checking VXLAN underlay in a non-default VRF.
5+
#
6+
# It simulates two hypervisors running a VM each using four network namespaces:
7+
# two for the HVs, two for the VMs.
8+
# A small VXLAN tunnel is made between the two hypervisors to have the two vms
9+
# in the same virtual L2:
10+
#
11+
# +-------------------+ +-------------------+
12+
# | | | |
13+
# | vm-1 netns | | vm-2 netns |
14+
# | | | |
15+
# | +-------------+ | | +-------------+ |
16+
# | | veth-hv | | | | veth-hv | |
17+
# | | 10.0.0.1/24 | | | | 10.0.0.2/24 | |
18+
# | +-------------+ | | +-------------+ |
19+
# | . | | . |
20+
# +-------------------+ +-------------------+
21+
# . .
22+
# . .
23+
# . .
24+
# +-----------------------------------+ +------------------------------------+
25+
# | . | | . |
26+
# | +----------+ | | +----------+ |
27+
# | | veth-tap | | | | veth-tap | |
28+
# | +----+-----+ | | +----+-----+ |
29+
# | | | | | |
30+
# | +--+--+ +--------------+ | | +--------------+ +--+--+ |
31+
# | | br0 | | vrf-underlay | | | | vrf-underlay | | br0 | |
32+
# | +--+--+ +-------+------+ | | +------+-------+ +--+--+ |
33+
# | | | | | | | |
34+
# | +---+----+ +-------+-------+ | | +-------+-------+ +---+----+ |
35+
# | | vxlan0 |....| veth0 |.|...|.| veth0 |....| vxlan0 | |
36+
# | +--------+ | 172.16.0.1/24 | | | | 172.16.0.2/24 | +--------+ |
37+
# | +---------------+ | | +---------------+ |
38+
# | | | |
39+
# | hv-1 netns | | hv-2 netns |
40+
# | | | |
41+
# +-----------------------------------+ +------------------------------------+
42+
#
43+
# This tests both the connectivity between vm-1 and vm-2, and that the underlay
44+
# can be moved in and out of the vrf by unsetting and setting veth0's master.
45+
46+
set -e
47+
48+
# Remove the links and namespaces this test creates. Every command is
# followed by "|| true", so a missing object does not abort the script
# under "set -e"; the function is run once before setup and again from the
# EXIT trap.
cleanup() {
49+
ip link del veth-hv-1 2>/dev/null || true
50+
ip link del veth-tap 2>/dev/null || true
51+
52+
# delete the two hypervisor and the two VM namespaces
for ns in hv-1 hv-2 vm-1 vm-2; do
53+
ip netns del $ns || true
54+
done
55+
}
56+
57+
# Clean start
58+
cleanup &> /dev/null
59+
60+
[[ $1 == "clean" ]] && exit 0
61+
62+
trap cleanup EXIT
63+
64+
# Setup "Hypervisors" simulated with netns
65+
ip link add veth-hv-1 type veth peer name veth-hv-2
66+
# Build one simulated hypervisor.
#   $1 - hypervisor number; used as the suffix of the hv-<n> namespace and of
#        the veth-hv-<n> link, and as the last octet of 172.16.0.<n>/24.
# Expects the veth-hv-<n> link to exist already in the current namespace.
setup-hv-networking() {
67+
hv=$1
68+
69+
# create the namespace and move the underlay veth end into it as veth0
ip netns add hv-$hv
70+
ip link set veth-hv-$hv netns hv-$hv
71+
ip -netns hv-$hv link set veth-hv-$hv name veth0
72+
73+
# create the VRF (table 1) and address veth0; veth0 is not enslaved to the
# VRF in this function
ip -netns hv-$hv link add vrf-underlay type vrf table 1
74+
ip -netns hv-$hv link set vrf-underlay up
75+
ip -netns hv-$hv addr add 172.16.0.$hv/24 dev veth0
76+
ip -netns hv-$hv link set veth0 up
77+
78+
# overlay bridge
ip -netns hv-$hv link add br0 type bridge
79+
ip -netns hv-$hv link set br0 up
80+
81+
# VXLAN device (VNI 10, UDP port 4789) on top of veth0, attached to br0
ip -netns hv-$hv link add vxlan0 type vxlan id 10 local 172.16.0.$hv dev veth0 dstport 4789
82+
ip -netns hv-$hv link set vxlan0 master br0
83+
ip -netns hv-$hv link set vxlan0 up
84+
}
85+
setup-hv-networking 1
86+
setup-hv-networking 2
87+
88+
# Check connectivity between HVs by pinging hv-2 from hv-1
89+
echo -n "Checking HV connectivity "
90+
ip netns exec hv-1 ping -c 1 -W 1 172.16.0.2 &> /dev/null || (echo "[FAIL]"; false)
91+
echo "[ OK ]"
92+
93+
# Set up a "VM" simulated by a netns and a veth pair
94+
# Build one simulated VM and plug it into its hypervisor's bridge.
#   $1 - VM number; selects the vm-<n> namespace, the hv-<n> namespace to
#        attach to, and the last octet of 10.0.0.<n>/24.
setup-vm() {
95+
id=$1
96+
97+
ip netns add vm-$id
98+
# the veth pair is created in the current namespace, then each end is moved
ip link add veth-tap type veth peer name veth-hv
99+
100+
# hypervisor side: veth-tap becomes a port of br0
ip link set veth-tap netns hv-$id
101+
ip -netns hv-$id link set veth-tap master br0
102+
ip -netns hv-$id link set veth-tap up
103+
104+
# VM side: veth-hv carries the overlay address
ip link set veth-hv netns vm-$id
105+
ip -netns vm-$id addr add 10.0.0.$id/24 dev veth-hv
106+
ip -netns vm-$id link set veth-hv up
107+
}
108+
setup-vm 1
109+
setup-vm 2
110+
111+
# Setup VTEP routes to make ARP work
112+
bridge -netns hv-1 fdb add 00:00:00:00:00:00 dev vxlan0 dst 172.16.0.2 self permanent
113+
bridge -netns hv-2 fdb add 00:00:00:00:00:00 dev vxlan0 dst 172.16.0.1 self permanent
114+
115+
echo -n "Check VM connectivity through VXLAN (underlay in the default VRF) "
116+
ip netns exec vm-1 ping -c 1 -W 1 10.0.0.2 &> /dev/null || (echo "[FAIL]"; false)
117+
echo "[ OK ]"
118+
119+
# Move the underlay to a non-default VRF
120+
ip -netns hv-1 link set veth0 vrf vrf-underlay
121+
ip -netns hv-1 link set veth0 down
122+
ip -netns hv-1 link set veth0 up
123+
ip -netns hv-2 link set veth0 vrf vrf-underlay
124+
ip -netns hv-2 link set veth0 down
125+
ip -netns hv-2 link set veth0 up
126+
127+
echo -n "Check VM connectivity through VXLAN (underlay in a VRF) "
128+
ip netns exec vm-1 ping -c 1 -W 1 10.0.0.2 &> /dev/null || (echo "[FAIL]"; false)
129+
echo "[ OK ]"

0 commit comments

Comments
 (0)