Skip to content

Commit 648700f

Browse files
edumazet, davem330
authored and committed
inet: frags: use rhashtables for reassembly units
Some applications still rely on IP fragmentation, and to be fair the Linux reassembly unit does not work under any serious load. It uses static hash tables of 1024 buckets, and up to 128 items per bucket (!!!). A work queue is supposed to garbage collect items when the host is under memory pressure, and to do a hash rebuild, changing the seed used in hash computations. This work queue blocks softirqs for up to 25 ms when doing a hash rebuild, occurring every 5 seconds if the host is under fire. Then there is the problem of sharing this hash table among all netns. It is time to switch to rhashtables, and allocate one of them per netns to speed up netns dismantle, since this is a critical metric these days. Lookup now uses RCU. A followup patch will even remove the refcount hold/release left from the prior implementation and save a couple of atomic operations. Before this patch, 16 cpus (16 RX queue NIC) could not handle more than 1 Mpps of fragments under DDoS. After the patch, I reach 9 Mpps without any tuning, and can use up to 2GB of storage for the fragments (the exact number depends on frags being evicted after timeout): $ grep FRAG /proc/net/sockstat FRAG: inuse 1966916 memory 2140004608 A followup patch will change the limits for 64bit arches. Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Kirill Tkhai <ktkhai@virtuozzo.com> Cc: Herbert Xu <herbert@gondor.apana.org.au> Cc: Florian Westphal <fw@strlen.de> Cc: Jesper Dangaard Brouer <brouer@redhat.com> Cc: Alexander Aring <alex.aring@gmail.com> Cc: Stefan Schmidt <stefan@osg.samsung.com> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent ae6da1f commit 648700f

File tree

9 files changed

+265
-573
lines changed

9 files changed

+265
-573
lines changed

Documentation/networking/ip-sysctl.txt

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -134,13 +134,10 @@ min_adv_mss - INTEGER
134134
IP Fragmentation:
135135

136136
ipfrag_high_thresh - INTEGER
137-
Maximum memory used to reassemble IP fragments. When
138-
ipfrag_high_thresh bytes of memory is allocated for this purpose,
139-
the fragment handler will toss packets until ipfrag_low_thresh
140-
is reached. This also serves as a maximum limit to namespaces
141-
different from the initial one.
137+
Maximum memory used to reassemble IP fragments.
142138

143139
ipfrag_low_thresh - INTEGER
140+
(Obsolete since linux-4.17)
144141
Maximum memory used to reassemble IP fragments before the kernel
145142
begins to remove incomplete fragment queues to free up resources.
146143
The kernel still accepts new fragments for defragmentation.

include/net/inet_frag.h

Lines changed: 37 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@
22
#ifndef __NET_FRAG_H__
33
#define __NET_FRAG_H__
44

5+
#include <linux/rhashtable.h>
6+
57
struct netns_frags {
8+
struct rhashtable rhashtable ____cacheline_aligned_in_smp;
9+
610
/* Keep atomic mem on separate cachelines in structs that include it */
711
atomic_t mem ____cacheline_aligned_in_smp;
812
/* sysctls */
@@ -26,12 +30,30 @@ enum {
2630
INET_FRAG_COMPLETE = BIT(2),
2731
};
2832

33+
struct frag_v4_compare_key {
34+
__be32 saddr;
35+
__be32 daddr;
36+
u32 user;
37+
u32 vif;
38+
__be16 id;
39+
u16 protocol;
40+
};
41+
42+
struct frag_v6_compare_key {
43+
struct in6_addr saddr;
44+
struct in6_addr daddr;
45+
u32 user;
46+
__be32 id;
47+
u32 iif;
48+
};
49+
2950
/**
3051
* struct inet_frag_queue - fragment queue
3152
*
32-
* @lock: spinlock protecting the queue
53+
* @node: rhash node
54+
* @key: keys identifying this frag.
3355
* @timer: queue expiration timer
34-
* @list: hash bucket list
56+
* @lock: spinlock protecting this frag
3557
* @refcnt: reference count of the queue
3658
* @fragments: received fragments head
3759
* @fragments_tail: received fragments tail
@@ -41,12 +63,16 @@ enum {
4163
* @flags: fragment queue flags
4264
* @max_size: maximum received fragment size
4365
* @net: namespace that this frag belongs to
44-
* @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
66+
* @rcu: rcu head for freeing deferral
4567
*/
4668
struct inet_frag_queue {
47-
spinlock_t lock;
69+
struct rhash_head node;
70+
union {
71+
struct frag_v4_compare_key v4;
72+
struct frag_v6_compare_key v6;
73+
} key;
4874
struct timer_list timer;
49-
struct hlist_node list;
75+
spinlock_t lock;
5076
refcount_t refcnt;
5177
struct sk_buff *fragments;
5278
struct sk_buff *fragments_tail;
@@ -55,51 +81,20 @@ struct inet_frag_queue {
5581
int meat;
5682
__u8 flags;
5783
u16 max_size;
58-
struct netns_frags *net;
59-
struct hlist_node list_evictor;
60-
};
61-
62-
#define INETFRAGS_HASHSZ 1024
63-
64-
/* averaged:
65-
* max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ /
66-
* rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
67-
* struct frag_queue))
68-
*/
69-
#define INETFRAGS_MAXDEPTH 128
70-
71-
struct inet_frag_bucket {
72-
struct hlist_head chain;
73-
spinlock_t chain_lock;
84+
struct netns_frags *net;
85+
struct rcu_head rcu;
7486
};
7587

7688
struct inet_frags {
77-
struct inet_frag_bucket hash[INETFRAGS_HASHSZ];
78-
79-
struct work_struct frags_work;
80-
unsigned int next_bucket;
81-
unsigned long last_rebuild_jiffies;
82-
bool rebuild;
83-
84-
/* The first call to hashfn is responsible to initialize
85-
* rnd. This is best done with net_get_random_once.
86-
*
87-
* rnd_seqlock is used to let hash insertion detect
88-
* when it needs to re-lookup the hash chain to use.
89-
*/
90-
u32 rnd;
91-
seqlock_t rnd_seqlock;
9289
unsigned int qsize;
9390

94-
unsigned int (*hashfn)(const struct inet_frag_queue *);
95-
bool (*match)(const struct inet_frag_queue *q,
96-
const void *arg);
9791
void (*constructor)(struct inet_frag_queue *q,
9892
const void *arg);
9993
void (*destructor)(struct inet_frag_queue *);
10094
void (*frag_expire)(struct timer_list *t);
10195
struct kmem_cache *frags_cachep;
10296
const char *frags_cache_name;
97+
struct rhashtable_params rhash_params;
10398
};
10499

105100
int inet_frags_init(struct inet_frags *);
@@ -108,15 +103,13 @@ void inet_frags_fini(struct inet_frags *);
108103
static inline int inet_frags_init_net(struct netns_frags *nf)
109104
{
110105
atomic_set(&nf->mem, 0);
111-
return 0;
106+
return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
112107
}
113108
void inet_frags_exit_net(struct netns_frags *nf);
114109

115110
void inet_frag_kill(struct inet_frag_queue *q);
116111
void inet_frag_destroy(struct inet_frag_queue *q);
117-
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
118-
struct inet_frags *f, void *key, unsigned int hash);
119-
112+
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
120113
void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
121114
const char *prefix);
122115

@@ -128,7 +121,7 @@ static inline void inet_frag_put(struct inet_frag_queue *q)
128121

129122
static inline bool inet_frag_evicting(struct inet_frag_queue *q)
130123
{
131-
return !hlist_unhashed(&q->list_evictor);
124+
return false;
132125
}
133126

134127
/* Memory Tracking Functions. */

include/net/ipv6.h

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -579,29 +579,15 @@ enum ip6_defrag_users {
579579
__IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
580580
};
581581

582-
struct ip6_create_arg {
583-
__be32 id;
584-
u32 user;
585-
const struct in6_addr *src;
586-
const struct in6_addr *dst;
587-
int iif;
588-
u8 ecn;
589-
};
590-
591582
void ip6_frag_init(struct inet_frag_queue *q, const void *a);
592-
bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
583+
extern const struct rhashtable_params ip6_rhash_params;
593584

594585
/*
595586
* Equivalent of ipv4 struct ipq
596587
*/
597588
struct frag_queue {
598589
struct inet_frag_queue q;
599590

600-
__be32 id; /* fragment id */
601-
u32 user;
602-
struct in6_addr saddr;
603-
struct in6_addr daddr;
604-
605591
int iif;
606592
__u16 nhoffset;
607593
u8 ecn;

net/ieee802154/6lowpan/6lowpan_i.h

Lines changed: 4 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_result;
1717
#define LOWPAN_DISPATCH_FRAG1 0xc0
1818
#define LOWPAN_DISPATCH_FRAGN 0xe0
1919

20-
struct lowpan_create_arg {
20+
struct frag_lowpan_compare_key {
2121
u16 tag;
2222
u16 d_size;
23-
const struct ieee802154_addr *src;
24-
const struct ieee802154_addr *dst;
23+
const struct ieee802154_addr src;
24+
const struct ieee802154_addr dst;
2525
};
2626

27-
/* Equivalent of ipv4 struct ip
27+
/* Equivalent of ipv4 struct ipq
2828
*/
2929
struct lowpan_frag_queue {
3030
struct inet_frag_queue q;
31-
32-
u16 tag;
33-
u16 d_size;
34-
struct ieee802154_addr saddr;
35-
struct ieee802154_addr daddr;
3631
};
3732

38-
static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a)
39-
{
40-
switch (a->mode) {
41-
case IEEE802154_ADDR_LONG:
42-
return (((__force u64)a->extended_addr) >> 32) ^
43-
(((__force u64)a->extended_addr) & 0xffffffff);
44-
case IEEE802154_ADDR_SHORT:
45-
return (__force u32)(a->short_addr + (a->pan_id << 16));
46-
default:
47-
return 0;
48-
}
49-
}
50-
5133
int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type);
5234
void lowpan_net_frag_exit(void);
5335
int lowpan_net_frag_init(void);

net/ieee802154/6lowpan/reassembly.c

Lines changed: 42 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags;
3737
static int lowpan_frag_reasm(struct lowpan_frag_queue *fq,
3838
struct sk_buff *prev, struct net_device *ldev);
3939

40-
static unsigned int lowpan_hash_frag(u16 tag, u16 d_size,
41-
const struct ieee802154_addr *saddr,
42-
const struct ieee802154_addr *daddr)
43-
{
44-
net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd));
45-
return jhash_3words(ieee802154_addr_hash(saddr),
46-
ieee802154_addr_hash(daddr),
47-
(__force u32)(tag + (d_size << 16)),
48-
lowpan_frags.rnd);
49-
}
50-
51-
static unsigned int lowpan_hashfn(const struct inet_frag_queue *q)
52-
{
53-
const struct lowpan_frag_queue *fq;
54-
55-
fq = container_of(q, struct lowpan_frag_queue, q);
56-
return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr);
57-
}
58-
59-
static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a)
60-
{
61-
const struct lowpan_frag_queue *fq;
62-
const struct lowpan_create_arg *arg = a;
63-
64-
fq = container_of(q, struct lowpan_frag_queue, q);
65-
return fq->tag == arg->tag && fq->d_size == arg->d_size &&
66-
ieee802154_addr_equal(&fq->saddr, arg->src) &&
67-
ieee802154_addr_equal(&fq->daddr, arg->dst);
68-
}
69-
7040
static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
7141
{
72-
const struct lowpan_create_arg *arg = a;
42+
const struct frag_lowpan_compare_key *key = a;
7343
struct lowpan_frag_queue *fq;
7444

7545
fq = container_of(q, struct lowpan_frag_queue, q);
7646

77-
fq->tag = arg->tag;
78-
fq->d_size = arg->d_size;
79-
fq->saddr = *arg->src;
80-
fq->daddr = *arg->dst;
47+
BUILD_BUG_ON(sizeof(*key) > sizeof(q->key));
48+
memcpy(&q->key, key, sizeof(*key));
8149
}
8250

8351
static void lowpan_frag_expire(struct timer_list *t)
@@ -105,21 +73,17 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
10573
const struct ieee802154_addr *src,
10674
const struct ieee802154_addr *dst)
10775
{
108-
struct inet_frag_queue *q;
109-
struct lowpan_create_arg arg;
110-
unsigned int hash;
11176
struct netns_ieee802154_lowpan *ieee802154_lowpan =
11277
net_ieee802154_lowpan(net);
78+
struct frag_lowpan_compare_key key = {
79+
.tag = cb->d_tag,
80+
.d_size = cb->d_size,
81+
.src = *src,
82+
.dst = *dst,
83+
};
84+
struct inet_frag_queue *q;
11385

114-
arg.tag = cb->d_tag;
115-
arg.d_size = cb->d_size;
116-
arg.src = src;
117-
arg.dst = dst;
118-
119-
hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst);
120-
121-
q = inet_frag_find(&ieee802154_lowpan->frags,
122-
&lowpan_frags, &arg, hash);
86+
q = inet_frag_find(&ieee802154_lowpan->frags, &key);
12387
if (IS_ERR_OR_NULL(q)) {
12488
inet_frag_maybe_warn_overflow(q, pr_fmt());
12589
return NULL;
@@ -611,17 +575,46 @@ static struct pernet_operations lowpan_frags_ops = {
611575
.exit = lowpan_frags_exit_net,
612576
};
613577

578+
static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed)
579+
{
580+
return jhash2(data,
581+
sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
582+
}
583+
584+
static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed)
585+
{
586+
const struct inet_frag_queue *fq = data;
587+
588+
return jhash2((const u32 *)&fq->key,
589+
sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
590+
}
591+
592+
static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
593+
{
594+
const struct frag_lowpan_compare_key *key = arg->key;
595+
const struct inet_frag_queue *fq = ptr;
596+
597+
return !!memcmp(&fq->key, key, sizeof(*key));
598+
}
599+
600+
static const struct rhashtable_params lowpan_rhash_params = {
601+
.head_offset = offsetof(struct inet_frag_queue, node),
602+
.hashfn = lowpan_key_hashfn,
603+
.obj_hashfn = lowpan_obj_hashfn,
604+
.obj_cmpfn = lowpan_obj_cmpfn,
605+
.automatic_shrinking = true,
606+
};
607+
614608
int __init lowpan_net_frag_init(void)
615609
{
616610
int ret;
617611

618-
lowpan_frags.hashfn = lowpan_hashfn;
619612
lowpan_frags.constructor = lowpan_frag_init;
620613
lowpan_frags.destructor = NULL;
621614
lowpan_frags.qsize = sizeof(struct frag_queue);
622-
lowpan_frags.match = lowpan_frag_match;
623615
lowpan_frags.frag_expire = lowpan_frag_expire;
624616
lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
617+
lowpan_frags.rhash_params = lowpan_rhash_params;
625618
ret = inet_frags_init(&lowpan_frags);
626619
if (ret)
627620
goto out;

0 commit comments

Comments
 (0)