Skip to content

Commit 73f156a

Browse files
edumazet, davem330
authored and committed
inetpeer: get rid of ip_id_count
Ideally, we would need to generate IP IDs using a per-destination IP generator. Linux kernels used the inet_peer cache for this purpose, but this had a huge cost on servers disabling MTU discovery. 1) Each inet_peer struct consumes 192 bytes. 2) The inetpeer cache uses a binary tree of inet_peer structs, with a nominal size of ~66000 elements under load. 3) Lookups in this tree hit a lot of cache lines, as the tree depth is about 20. 4) If the server deals with many TCP flows, we have a high probability of not finding the inet_peer, allocating a fresh one, and inserting it in the tree with the same initial ip_id_count (cf. secure_ip_id()). 5) We garbage collect inet_peer aggressively. IP ID generation does not have to be 'perfect'. The goal is to avoid duplicates in a short period of time, so that reassembly units have a chance to complete reassembly of fragments belonging to one message before receiving other fragments with a recycled ID. We simply use an array of generators, and a Jenkins hash using the dst IP as a key. ipv6_select_ident() is put back into net/ipv6/ip6_output.c where it belongs (it is only used from this file). secure_ip_id() and secure_ipv6_id() are no longer needed. Rename ip_select_ident_more() to ip_select_ident_segs() to avoid unnecessary decrement/increment of the number of segments. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent e067ee3 commit 73f156a

File tree

17 files changed

+65
-155
lines changed

17 files changed

+65
-155
lines changed

drivers/net/ppp/pptp.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
281281
nf_reset(skb);
282282

283283
skb->ip_summed = CHECKSUM_NONE;
284-
ip_select_ident(skb, &rt->dst, NULL);
284+
ip_select_ident(skb, NULL);
285285
ip_send_check(iph);
286286

287287
ip_local_out(skb);

include/net/inetpeer.h

Lines changed: 3 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -41,14 +41,13 @@ struct inet_peer {
4141
struct rcu_head gc_rcu;
4242
};
4343
/*
44-
* Once inet_peer is queued for deletion (refcnt == -1), following fields
45-
* are not available: rid, ip_id_count
44+
* Once inet_peer is queued for deletion (refcnt == -1), following field
45+
* is not available: rid
4646
* We can share memory with rcu_head to help keep inet_peer small.
4747
*/
4848
union {
4949
struct {
5050
atomic_t rid; /* Frag reception counter */
51-
atomic_t ip_id_count; /* IP ID for the next packet */
5251
};
5352
struct rcu_head rcu;
5453
struct inet_peer *gc_next;
@@ -165,28 +164,12 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout);
165164
void inetpeer_invalidate_tree(struct inet_peer_base *);
166165

167166
/*
168-
* temporary check to make sure we dont access rid, ip_id_count, tcp_ts,
167+
* temporary check to make sure we dont access rid, tcp_ts,
169168
* tcp_ts_stamp if no refcount is taken on inet_peer
170169
*/
171170
static inline void inet_peer_refcheck(const struct inet_peer *p)
172171
{
173172
WARN_ON_ONCE(atomic_read(&p->refcnt) <= 0);
174173
}
175174

176-
177-
/* can be called with or without local BH being disabled */
178-
static inline int inet_getid(struct inet_peer *p, int more)
179-
{
180-
int old, new;
181-
more++;
182-
inet_peer_refcheck(p);
183-
do {
184-
old = atomic_read(&p->ip_id_count);
185-
new = old + more;
186-
if (!new)
187-
new = 1;
188-
} while (atomic_cmpxchg(&p->ip_id_count, old, new) != old);
189-
return new;
190-
}
191-
192175
#endif /* _NET_INETPEER_H */

include/net/ip.h

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -309,9 +309,19 @@ static inline unsigned int ip_skb_dst_mtu(const struct sk_buff *skb)
309309
}
310310
}
311311

312-
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more);
312+
#define IP_IDENTS_SZ 2048u
313+
extern atomic_t *ip_idents;
313314

314-
static inline void ip_select_ident(struct sk_buff *skb, struct dst_entry *dst, struct sock *sk)
315+
static inline u32 ip_idents_reserve(u32 hash, int segs)
316+
{
317+
atomic_t *id_ptr = ip_idents + hash % IP_IDENTS_SZ;
318+
319+
return atomic_add_return(segs, id_ptr) - segs;
320+
}
321+
322+
void __ip_select_ident(struct iphdr *iph, int segs);
323+
324+
static inline void ip_select_ident_segs(struct sk_buff *skb, struct sock *sk, int segs)
315325
{
316326
struct iphdr *iph = ip_hdr(skb);
317327

@@ -321,24 +331,20 @@ static inline void ip_select_ident(struct sk_buff *skb, struct dst_entry *dst, s
321331
* does not change, they drop every other packet in
322332
* a TCP stream using header compression.
323333
*/
324-
iph->id = (sk && inet_sk(sk)->inet_daddr) ?
325-
htons(inet_sk(sk)->inet_id++) : 0;
326-
} else
327-
__ip_select_ident(iph, dst, 0);
328-
}
329-
330-
static inline void ip_select_ident_more(struct sk_buff *skb, struct dst_entry *dst, struct sock *sk, int more)
331-
{
332-
struct iphdr *iph = ip_hdr(skb);
333-
334-
if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) {
335334
if (sk && inet_sk(sk)->inet_daddr) {
336335
iph->id = htons(inet_sk(sk)->inet_id);
337-
inet_sk(sk)->inet_id += 1 + more;
338-
} else
336+
inet_sk(sk)->inet_id += segs;
337+
} else {
339338
iph->id = 0;
340-
} else
341-
__ip_select_ident(iph, dst, more);
339+
}
340+
} else {
341+
__ip_select_ident(iph, segs);
342+
}
343+
}
344+
345+
static inline void ip_select_ident(struct sk_buff *skb, struct sock *sk)
346+
{
347+
ip_select_ident_segs(skb, sk, 1);
342348
}
343349

344350
static inline __wsum inet_compute_pseudo(struct sk_buff *skb, int proto)

include/net/ipv6.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -668,8 +668,6 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add
668668
return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr));
669669
}
670670

671-
void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt);
672-
673671
int ip6_dst_hoplimit(struct dst_entry *dst);
674672

675673
static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6,

include/net/secure_seq.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33

44
#include <linux/types.h>
55

6-
__u32 secure_ip_id(__be32 daddr);
7-
__u32 secure_ipv6_id(const __be32 daddr[4]);
86
u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport);
97
u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
108
__be16 dport);

net/core/secure_seq.c

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -85,31 +85,6 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
8585
#endif
8686

8787
#ifdef CONFIG_INET
88-
__u32 secure_ip_id(__be32 daddr)
89-
{
90-
u32 hash[MD5_DIGEST_WORDS];
91-
92-
net_secret_init();
93-
hash[0] = (__force __u32) daddr;
94-
hash[1] = net_secret[13];
95-
hash[2] = net_secret[14];
96-
hash[3] = net_secret[15];
97-
98-
md5_transform(hash, net_secret);
99-
100-
return hash[0];
101-
}
102-
103-
__u32 secure_ipv6_id(const __be32 daddr[4])
104-
{
105-
__u32 hash[4];
106-
107-
net_secret_init();
108-
memcpy(hash, daddr, 16);
109-
md5_transform(hash, net_secret);
110-
111-
return hash[0];
112-
}
11388

11489
__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
11590
__be16 sport, __be16 dport)

net/ipv4/igmp.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
369369
pip->saddr = fl4.saddr;
370370
pip->protocol = IPPROTO_IGMP;
371371
pip->tot_len = 0; /* filled in later */
372-
ip_select_ident(skb, &rt->dst, NULL);
372+
ip_select_ident(skb, NULL);
373373
((u8 *)&pip[1])[0] = IPOPT_RA;
374374
((u8 *)&pip[1])[1] = 4;
375375
((u8 *)&pip[1])[2] = 0;
@@ -714,7 +714,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
714714
iph->daddr = dst;
715715
iph->saddr = fl4.saddr;
716716
iph->protocol = IPPROTO_IGMP;
717-
ip_select_ident(skb, &rt->dst, NULL);
717+
ip_select_ident(skb, NULL);
718718
((u8 *)&iph[1])[0] = IPOPT_RA;
719719
((u8 *)&iph[1])[1] = 4;
720720
((u8 *)&iph[1])[2] = 0;

net/ipv4/inetpeer.c

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -26,20 +26,7 @@
2626
* Theory of operations.
2727
* We keep one entry for each peer IP address. The nodes contains long-living
2828
* information about the peer which doesn't depend on routes.
29-
* At this moment this information consists only of ID field for the next
30-
* outgoing IP packet. This field is incremented with each packet as encoded
31-
* in inet_getid() function (include/net/inetpeer.h).
32-
* At the moment of writing this notes identifier of IP packets is generated
33-
* to be unpredictable using this code only for packets subjected
34-
* (actually or potentially) to defragmentation. I.e. DF packets less than
35-
* PMTU in size when local fragmentation is disabled use a constant ID and do
36-
* not use this code (see ip_select_ident() in include/net/ip.h).
3729
*
38-
* Route cache entries hold references to our nodes.
39-
* New cache entries get references via lookup by destination IP address in
40-
* the avl tree. The reference is grabbed only when it's needed i.e. only
41-
* when we try to output IP packet which needs an unpredictable ID (see
42-
* __ip_select_ident() in net/ipv4/route.c).
4330
* Nodes are removed only when reference counter goes to 0.
4431
* When it's happened the node may be removed when a sufficient amount of
4532
* time has been passed since its last use. The less-recently-used entry can
@@ -62,7 +49,6 @@
6249
* refcnt: atomically against modifications on other CPU;
6350
* usually under some other lock to prevent node disappearing
6451
* daddr: unchangeable
65-
* ip_id_count: atomic value (no lock needed)
6652
*/
6753

6854
static struct kmem_cache *peer_cachep __read_mostly;
@@ -497,10 +483,6 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
497483
p->daddr = *daddr;
498484
atomic_set(&p->refcnt, 1);
499485
atomic_set(&p->rid, 0);
500-
atomic_set(&p->ip_id_count,
501-
(daddr->family == AF_INET) ?
502-
secure_ip_id(daddr->addr.a4) :
503-
secure_ipv6_id(daddr->addr.a6));
504486
p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
505487
p->rate_tokens = 0;
506488
/* 60*HZ is arbitrary, but chosen enough high so that the first

net/ipv4/ip_output.c

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
148148
iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
149149
iph->saddr = saddr;
150150
iph->protocol = sk->sk_protocol;
151-
ip_select_ident(skb, &rt->dst, sk);
151+
ip_select_ident(skb, sk);
152152

153153
if (opt && opt->opt.optlen) {
154154
iph->ihl += opt->opt.optlen>>2;
@@ -430,8 +430,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
430430
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
431431
}
432432

433-
ip_select_ident_more(skb, &rt->dst, sk,
434-
(skb_shinfo(skb)->gso_segs ?: 1) - 1);
433+
ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1);
435434

436435
/* TODO : should we use skb->sk here instead of sk ? */
437436
skb->priority = sk->sk_priority;
@@ -1379,7 +1378,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
13791378
iph->ttl = ttl;
13801379
iph->protocol = sk->sk_protocol;
13811380
ip_copy_addrs(iph, fl4);
1382-
ip_select_ident(skb, &rt->dst, sk);
1381+
ip_select_ident(skb, sk);
13831382

13841383
if (opt) {
13851384
iph->ihl += opt->optlen>>2;

net/ipv4/ip_tunnel_core.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
7474
iph->daddr = dst;
7575
iph->saddr = src;
7676
iph->ttl = ttl;
77-
__ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1);
77+
__ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1);
7878

7979
err = ip_local_out_sk(sk, skb);
8080
if (unlikely(net_xmit_eval(err)))

net/ipv4/ipmr.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1663,7 +1663,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
16631663
iph->protocol = IPPROTO_IPIP;
16641664
iph->ihl = 5;
16651665
iph->tot_len = htons(skb->len);
1666-
ip_select_ident(skb, skb_dst(skb), NULL);
1666+
ip_select_ident(skb, NULL);
16671667
ip_send_check(iph);
16681668

16691669
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

net/ipv4/raw.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
389389
iph->check = 0;
390390
iph->tot_len = htons(length);
391391
if (!iph->id)
392-
ip_select_ident(skb, &rt->dst, NULL);
392+
ip_select_ident(skb, NULL);
393393

394394
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
395395
}

net/ipv4/route.c

Lines changed: 16 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@
8989
#include <linux/rcupdate.h>
9090
#include <linux/times.h>
9191
#include <linux/slab.h>
92+
#include <linux/jhash.h>
9293
#include <net/dst.h>
9394
#include <net/net_namespace.h>
9495
#include <net/protocol.h>
@@ -456,39 +457,19 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
456457
return neigh_create(&arp_tbl, pkey, dev);
457458
}
458459

459-
/*
460-
* Peer allocation may fail only in serious out-of-memory conditions. However
461-
* we still can generate some output.
462-
* Random ID selection looks a bit dangerous because we have no chances to
463-
* select ID being unique in a reasonable period of time.
464-
* But broken packet identifier may be better than no packet at all.
465-
*/
466-
static void ip_select_fb_ident(struct iphdr *iph)
467-
{
468-
static DEFINE_SPINLOCK(ip_fb_id_lock);
469-
static u32 ip_fallback_id;
470-
u32 salt;
460+
atomic_t *ip_idents __read_mostly;
461+
EXPORT_SYMBOL(ip_idents);
471462

472-
spin_lock_bh(&ip_fb_id_lock);
473-
salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
474-
iph->id = htons(salt & 0xFFFF);
475-
ip_fallback_id = salt;
476-
spin_unlock_bh(&ip_fb_id_lock);
477-
}
478-
479-
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
463+
void __ip_select_ident(struct iphdr *iph, int segs)
480464
{
481-
struct net *net = dev_net(dst->dev);
482-
struct inet_peer *peer;
465+
static u32 ip_idents_hashrnd __read_mostly;
466+
u32 hash, id;
483467

484-
peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
485-
if (peer) {
486-
iph->id = htons(inet_getid(peer, more));
487-
inet_putpeer(peer);
488-
return;
489-
}
468+
net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
490469

491-
ip_select_fb_ident(iph);
470+
hash = jhash_1word((__force u32)iph->daddr, ip_idents_hashrnd);
471+
id = ip_idents_reserve(hash, segs);
472+
iph->id = htons(id);
492473
}
493474
EXPORT_SYMBOL(__ip_select_ident);
494475

@@ -2711,6 +2692,12 @@ int __init ip_rt_init(void)
27112692
{
27122693
int rc = 0;
27132694

2695+
ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2696+
if (!ip_idents)
2697+
panic("IP: failed to allocate ip_idents\n");
2698+
2699+
prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2700+
27142701
#ifdef CONFIG_IP_ROUTE_CLASSID
27152702
ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
27162703
if (!ip_rt_acct)

net/ipv4/xfrm4_mode_tunnel.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,12 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
5858

5959
top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
6060
0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
61-
ip_select_ident(skb, dst->child, NULL);
6261

6362
top_iph->ttl = ip4_dst_hoplimit(dst->child);
6463

6564
top_iph->saddr = x->props.saddr.a4;
6665
top_iph->daddr = x->id.daddr.a4;
66+
ip_select_ident(skb, NULL);
6767

6868
return 0;
6969
}

net/ipv6/ip6_output.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,18 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
537537
skb_copy_secmark(to, from);
538538
}
539539

540+
static void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
541+
{
542+
static u32 ip6_idents_hashrnd __read_mostly;
543+
u32 hash, id;
544+
545+
net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
546+
547+
hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd);
548+
id = ip_idents_reserve(hash, 1);
549+
fhdr->identification = htonl(id);
550+
}
551+
540552
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
541553
{
542554
struct sk_buff *frag;

0 commit comments

Comments
 (0)