Skip to content

Commit 0eeb075

Browse files
gospodavem330
authored and committed
net: ipv4 sysctl option to ignore routes when nexthop link is down
This feature is only enabled with the new per-interface or ipv4 global sysctls called 'ignore_routes_with_linkdown'. net.ipv4.conf.all.ignore_routes_with_linkdown = 0 net.ipv4.conf.default.ignore_routes_with_linkdown = 0 net.ipv4.conf.lo.ignore_routes_with_linkdown = 0 ... When the above sysctls are set, the kernel will report to userspace that a route is dead and will no longer resolve to this nexthop when performing a fib lookup. This will signal to userspace that the route will not be selected. The RTNH_F_DEAD flag is only passed to userspace if the sysctl is enabled and the link is down. This was done because, without it, netlink listeners would have no idea whether or not a nexthop would be selected. The kernel only sets RTNH_F_DEAD internally if the interface has IFF_UP cleared. With the new sysctl set, the following behavior can be observed (interface p8p1 is link-down): default via 10.0.5.2 dev p9p1 10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 dead linkdown 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1 dead linkdown 90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2 90.0.0.1 via 70.0.0.2 dev p7p1 src 70.0.0.1 cache local 80.0.0.1 dev lo src 80.0.0.1 cache <local> 80.0.0.2 via 10.0.5.2 dev p9p1 src 10.0.5.15 cache While the route does remain in the table (so it can be modified if needed rather than being wiped away as it would be if IFF_UP was cleared), the proper next-hop is chosen automatically when the link is down. 
Now interface p8p1 is linked-up: default via 10.0.5.2 dev p9p1 10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1 90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2 192.168.56.0/24 dev p2p1 proto kernel scope link src 192.168.56.2 90.0.0.1 via 80.0.0.2 dev p8p1 src 80.0.0.1 cache local 80.0.0.1 dev lo src 80.0.0.1 cache <local> 80.0.0.2 dev p8p1 src 80.0.0.1 cache and the output changes to what one would expect. If the sysctl is not set, the following output would be expected when p8p1 is down: default via 10.0.5.2 dev p9p1 10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 linkdown 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1 linkdown 90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2 Since the dead flag does not appear, there should be no expectation that the kernel would skip using this route due to link being down. v2: Split kernel changes into 2 patches, this actually makes a behavioral change if the sysctl is set. Also took suggestion from Alex to simplify code by only checking sysctl during fib lookup and suggestion from Scott to add a per-interface sysctl. v3: Code clean-ups to make it more readable and efficient as well as a reverse path check fix. v4: Drop binary sysctl v5: Whitespace fixups from Dave v6: Style changes from Dave and checkpatch suggestions v7: One more checkpatch fixup Signed-off-by: Andy Gospodarek <gospo@cumulusnetworks.com> Signed-off-by: Dinesh Dutt <ddutt@cumulusnetworks.com> Acked-by: Scott Feldman <sfeldma@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent 8a3d031 commit 0eeb075

File tree

11 files changed

+63
-24
lines changed

11 files changed

+63
-24
lines changed

include/linux/inetdevice.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,9 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
120120
|| (!IN_DEV_FORWARD(in_dev) && \
121121
IN_DEV_ORCONF((in_dev), ACCEPT_REDIRECTS)))
122122

123+
#define IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) \
124+
IN_DEV_CONF_GET((in_dev), IGNORE_ROUTES_WITH_LINKDOWN)
125+
123126
#define IN_DEV_ARPFILTER(in_dev) IN_DEV_ORCONF((in_dev), ARPFILTER)
124127
#define IN_DEV_ARP_ACCEPT(in_dev) IN_DEV_ORCONF((in_dev), ARP_ACCEPT)
125128
#define IN_DEV_ARP_ANNOUNCE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE)

include/net/fib_rules.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ struct fib_lookup_arg {
3636
void *result;
3737
struct fib_rule *rule;
3838
int flags;
39-
#define FIB_LOOKUP_NOREF 1
39+
#define FIB_LOOKUP_NOREF 1
40+
#define FIB_LOOKUP_IGNORE_LINKSTATE 2
4041
};
4142

4243
struct fib_rules_ops {

include/net/ip_fib.h

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -226,15 +226,15 @@ static inline struct fib_table *fib_new_table(struct net *net, u32 id)
226226
}
227227

228228
static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
229-
struct fib_result *res)
229+
struct fib_result *res, unsigned int flags)
230230
{
231231
struct fib_table *tb;
232232
int err = -ENETUNREACH;
233233

234234
rcu_read_lock();
235235

236236
tb = fib_get_table(net, RT_TABLE_MAIN);
237-
if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF))
237+
if (tb && !fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF))
238238
err = 0;
239239

240240
rcu_read_unlock();
@@ -249,28 +249,30 @@ void __net_exit fib4_rules_exit(struct net *net);
249249
struct fib_table *fib_new_table(struct net *net, u32 id);
250250
struct fib_table *fib_get_table(struct net *net, u32 id);
251251

252-
int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res);
252+
int __fib_lookup(struct net *net, struct flowi4 *flp,
253+
struct fib_result *res, unsigned int flags);
253254

254255
static inline int fib_lookup(struct net *net, struct flowi4 *flp,
255-
struct fib_result *res)
256+
struct fib_result *res, unsigned int flags)
256257
{
257258
struct fib_table *tb;
258259
int err;
259260

261+
flags |= FIB_LOOKUP_NOREF;
260262
if (net->ipv4.fib_has_custom_rules)
261-
return __fib_lookup(net, flp, res);
263+
return __fib_lookup(net, flp, res, flags);
262264

263265
rcu_read_lock();
264266

265267
res->tclassid = 0;
266268

267269
for (err = 0; !err; err = -ENETUNREACH) {
268270
tb = rcu_dereference_rtnl(net->ipv4.fib_main);
269-
if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF))
271+
if (tb && !fib_table_lookup(tb, flp, res, flags))
270272
break;
271273

272274
tb = rcu_dereference_rtnl(net->ipv4.fib_default);
273-
if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF))
275+
if (tb && !fib_table_lookup(tb, flp, res, flags))
274276
break;
275277
}
276278

include/uapi/linux/ip.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ enum
164164
IPV4_DEVCONF_ROUTE_LOCALNET,
165165
IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL,
166166
IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL,
167+
IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
167168
__IPV4_DEVCONF_MAX
168169
};
169170

net/ipv4/devinet.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2169,6 +2169,8 @@ static struct devinet_sysctl_table {
21692169
"igmpv2_unsolicited_report_interval"),
21702170
DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL,
21712171
"igmpv3_unsolicited_report_interval"),
2172+
DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN,
2173+
"ignore_routes_with_linkdown"),
21722174

21732175
DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
21742176
DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),

net/ipv4/fib_frontend.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
280280
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
281281
fl4.flowi4_scope = scope;
282282
fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
283-
if (!fib_lookup(net, &fl4, &res))
283+
if (!fib_lookup(net, &fl4, &res, 0))
284284
return FIB_RES_PREFSRC(net, res);
285285
} else {
286286
scope = RT_SCOPE_LINK;
@@ -319,7 +319,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
319319
fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
320320

321321
net = dev_net(dev);
322-
if (fib_lookup(net, &fl4, &res))
322+
if (fib_lookup(net, &fl4, &res, 0))
323323
goto last_resort;
324324
if (res.type != RTN_UNICAST &&
325325
(res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
@@ -354,7 +354,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
354354
fl4.flowi4_oif = dev->ifindex;
355355

356356
ret = 0;
357-
if (fib_lookup(net, &fl4, &res) == 0) {
357+
if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
358358
if (res.type == RTN_UNICAST)
359359
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
360360
}

net/ipv4/fib_rules.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,12 @@ struct fib4_rule {
4747
#endif
4848
};
4949

50-
int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
50+
int __fib_lookup(struct net *net, struct flowi4 *flp,
51+
struct fib_result *res, unsigned int flags)
5152
{
5253
struct fib_lookup_arg arg = {
5354
.result = res,
54-
.flags = FIB_LOOKUP_NOREF,
55+
.flags = flags,
5556
};
5657
int err;
5758

net/ipv4/fib_semantics.c

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -623,7 +623,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
623623
/* It is not necessary, but requires a bit of thinking */
624624
if (fl4.flowi4_scope < RT_SCOPE_LINK)
625625
fl4.flowi4_scope = RT_SCOPE_LINK;
626-
err = fib_lookup(net, &fl4, &res);
626+
err = fib_lookup(net, &fl4, &res,
627+
FIB_LOOKUP_IGNORE_LINKSTATE);
627628
if (err) {
628629
rcu_read_unlock();
629630
return err;
@@ -1035,12 +1036,20 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
10351036
nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
10361037
goto nla_put_failure;
10371038
if (fi->fib_nhs == 1) {
1039+
struct in_device *in_dev;
1040+
10381041
if (fi->fib_nh->nh_gw &&
10391042
nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
10401043
goto nla_put_failure;
10411044
if (fi->fib_nh->nh_oif &&
10421045
nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
10431046
goto nla_put_failure;
1047+
if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) {
1048+
in_dev = __in_dev_get_rcu(fi->fib_nh->nh_dev);
1049+
if (in_dev &&
1050+
IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
1051+
rtm->rtm_flags |= RTNH_F_DEAD;
1052+
}
10441053
#ifdef CONFIG_IP_ROUTE_CLASSID
10451054
if (fi->fib_nh[0].nh_tclassid &&
10461055
nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
@@ -1057,11 +1066,19 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
10571066
goto nla_put_failure;
10581067

10591068
for_nexthops(fi) {
1069+
struct in_device *in_dev;
1070+
10601071
rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
10611072
if (!rtnh)
10621073
goto nla_put_failure;
10631074

10641075
rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1076+
if (nh->nh_flags & RTNH_F_LINKDOWN) {
1077+
in_dev = __in_dev_get_rcu(nh->nh_dev);
1078+
if (in_dev &&
1079+
IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
1080+
rtnh->rtnh_flags |= RTNH_F_DEAD;
1081+
}
10651082
rtnh->rtnh_hops = nh->nh_weight - 1;
10661083
rtnh->rtnh_ifindex = nh->nh_oif;
10671084

@@ -1310,16 +1327,22 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
13101327
void fib_select_multipath(struct fib_result *res)
13111328
{
13121329
struct fib_info *fi = res->fi;
1330+
struct in_device *in_dev;
13131331
int w;
13141332

13151333
spin_lock_bh(&fib_multipath_lock);
13161334
if (fi->fib_power <= 0) {
13171335
int power = 0;
13181336
change_nexthops(fi) {
1319-
if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
1320-
power += nexthop_nh->nh_weight;
1321-
nexthop_nh->nh_power = nexthop_nh->nh_weight;
1322-
}
1337+
in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev);
1338+
if (nexthop_nh->nh_flags & RTNH_F_DEAD)
1339+
continue;
1340+
if (in_dev &&
1341+
IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
1342+
nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
1343+
continue;
1344+
power += nexthop_nh->nh_weight;
1345+
nexthop_nh->nh_power = nexthop_nh->nh_weight;
13231346
} endfor_nexthops(fi);
13241347
fi->fib_power = power;
13251348
if (power <= 0) {

net/ipv4/fib_trie.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1412,9 +1412,15 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
14121412
continue;
14131413
for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
14141414
const struct fib_nh *nh = &fi->fib_nh[nhsel];
1415+
struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev);
14151416

14161417
if (nh->nh_flags & RTNH_F_DEAD)
14171418
continue;
1419+
if (in_dev &&
1420+
IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
1421+
nh->nh_flags & RTNH_F_LINKDOWN &&
1422+
!(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
1423+
continue;
14181424
if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
14191425
continue;
14201426

net/ipv4/netfilter/ipt_rpfilter.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4,
4040
struct net *net = dev_net(dev);
4141
int ret __maybe_unused;
4242

43-
if (fib_lookup(net, fl4, &res))
43+
if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
4444
return false;
4545

4646
if (res.type != RTN_UNICAST) {

net/ipv4/route.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -747,7 +747,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
747747
if (!(n->nud_state & NUD_VALID)) {
748748
neigh_event_send(n, NULL);
749749
} else {
750-
if (fib_lookup(net, fl4, &res) == 0) {
750+
if (fib_lookup(net, fl4, &res, 0) == 0) {
751751
struct fib_nh *nh = &FIB_RES_NH(res);
752752

753753
update_or_create_fnhe(nh, fl4->daddr, new_gw,
@@ -975,7 +975,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
975975
return;
976976

977977
rcu_read_lock();
978-
if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
978+
if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
979979
struct fib_nh *nh = &FIB_RES_NH(res);
980980

981981
update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
@@ -1186,7 +1186,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
11861186
fl4.flowi4_mark = skb->mark;
11871187

11881188
rcu_read_lock();
1189-
if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1189+
if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
11901190
src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
11911191
else
11921192
src = inet_select_addr(rt->dst.dev,
@@ -1716,7 +1716,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
17161716
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
17171717
fl4.daddr = daddr;
17181718
fl4.saddr = saddr;
1719-
err = fib_lookup(net, &fl4, &res);
1719+
err = fib_lookup(net, &fl4, &res, 0);
17201720
if (err != 0) {
17211721
if (!IN_DEV_FORWARD(in_dev))
17221722
err = -EHOSTUNREACH;
@@ -2123,7 +2123,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
21232123
goto make_route;
21242124
}
21252125

2126-
if (fib_lookup(net, fl4, &res)) {
2126+
if (fib_lookup(net, fl4, &res, 0)) {
21272127
res.fi = NULL;
21282128
res.table = NULL;
21292129
if (fl4->flowi4_oif) {

0 commit comments

Comments
 (0)