Skip to content

Commit 05dbc7b

Browse files
edumazet authored and davem330 committed
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :

Our goal is to hash SYN_RECV sockets into main ehash for fast lookup, and parallel SYN processing.

Current inet_ehash_bucket contains two chains, one for ESTABLISH (and friend states) sockets, another for TIME_WAIT sockets only.

As the hash table is sized to get at most one socket per bucket, it makes little sense to have separate twchain, as it makes the lookup slightly more complicated, and doubles hash table memory usage.

If we make sure all socket types have the lookup keys at the same offsets, we can use a generic and faster lookup. It turns out TIME_WAIT and ESTABLISHED sockets already have common lookup fields for IPv4.

[ INET_TW_MATCH() is no longer needed ]

I'll provide a follow-up to factorize IPv6 lookup as well, to remove INET6_TW_MATCH()

This way, SYN_RECV pseudo sockets will be supported the same.

A new sock_gen_put() helper is added, doing either a sock_put() or inet_twsk_put() [ and will support SYN_RECV later ].

Note this helper should only be called in real slow path, when rcu lookup found a socket that was moved to another identity (freed/reused immediately), but could eventually be used in other contexts, like sock_edemux()

Before patch :

    dmesg | grep "TCP established"
    TCP established hash table entries: 524288 (order: 11, 8388608 bytes)

After patch :

    TCP established hash table entries: 524288 (order: 10, 4194304 bytes)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent 53af53a commit 05dbc7b

File tree

12 files changed

+132
-261
lines changed

12 files changed

+132
-261
lines changed

include/net/inet_hashtables.h

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,11 @@
3737
#include <asm/byteorder.h>
3838

3939
/* This is for all connections with a full identity, no wildcards.
40-
* One chain is dedicated to TIME_WAIT sockets.
41-
* I'll experiment with dynamic table growth later.
40+
* The 'e' prefix stands for Establish, but we really put all sockets
41+
* but LISTEN ones.
4242
*/
4343
struct inet_ehash_bucket {
4444
struct hlist_nulls_head chain;
45-
struct hlist_nulls_head twchain;
4645
};
4746

4847
/* There are a few simple rules, which allow for local port reuse by
@@ -123,7 +122,6 @@ struct inet_hashinfo {
123122
*
124123
* TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
125124
*
126-
* TIME_WAIT sockets use a separate chain (twchain).
127125
*/
128126
struct inet_ehash_bucket *ehash;
129127
spinlock_t *ehash_locks;
@@ -318,9 +316,6 @@ static inline struct sock *inet_lookup_listener(struct net *net,
318316
net_eq(sock_net(__sk), (__net)))
319317
#endif /* 64-bit arch */
320318

321-
#define INET_TW_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif)\
322-
INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif)
323-
324319
/*
325320
* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
326321
* not check it for lookups anymore, thanks Alexey. -DaveM

include/net/inet_timewait_sock.h

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -141,18 +141,6 @@ struct inet_timewait_sock {
141141
};
142142
#define tw_tclass tw_tos
143143

144-
static inline void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
145-
struct hlist_nulls_head *list)
146-
{
147-
hlist_nulls_add_head_rcu(&tw->tw_node, list);
148-
}
149-
150-
static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
151-
struct hlist_head *list)
152-
{
153-
hlist_add_head(&tw->tw_bind_node, list);
154-
}
155-
156144
static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw)
157145
{
158146
return !hlist_unhashed(&tw->tw_death_node);
@@ -192,6 +180,7 @@ static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
192180
return (struct inet_timewait_sock *)sk;
193181
}
194182

183+
void inet_twsk_free(struct inet_timewait_sock *tw);
195184
void inet_twsk_put(struct inet_timewait_sock *tw);
196185

197186
int inet_twsk_unhash(struct inet_timewait_sock *tw);

include/net/sock.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ typedef __u64 __bitwise __addrpair;
156156
*/
157157
struct sock_common {
158158
/* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned
159-
* address on 64bit arches : cf INET_MATCH() and INET_TW_MATCH()
159+
* address on 64bit arches : cf INET_MATCH()
160160
*/
161161
union {
162162
__addrpair skc_addrpair;
@@ -301,6 +301,8 @@ struct sock {
301301
#define sk_dontcopy_end __sk_common.skc_dontcopy_end
302302
#define sk_hash __sk_common.skc_hash
303303
#define sk_portpair __sk_common.skc_portpair
304+
#define sk_num __sk_common.skc_num
305+
#define sk_dport __sk_common.skc_dport
304306
#define sk_addrpair __sk_common.skc_addrpair
305307
#define sk_daddr __sk_common.skc_daddr
306308
#define sk_rcv_saddr __sk_common.skc_rcv_saddr
@@ -1653,6 +1655,10 @@ static inline void sock_put(struct sock *sk)
16531655
if (atomic_dec_and_test(&sk->sk_refcnt))
16541656
sk_free(sk);
16551657
}
1658+
/* Generic version of sock_put(), dealing with all sockets
1659+
* (TCP_TIMEWAIT, ESTABLISHED...)
1660+
*/
1661+
void sock_gen_put(struct sock *sk);
16561662

16571663
int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested);
16581664

include/net/tcp.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1519,7 +1519,6 @@ enum tcp_seq_states {
15191519
TCP_SEQ_STATE_LISTENING,
15201520
TCP_SEQ_STATE_OPENREQ,
15211521
TCP_SEQ_STATE_ESTABLISHED,
1522-
TCP_SEQ_STATE_TIME_WAIT,
15231522
};
15241523

15251524
int tcp_seq_open(struct inode *inode, struct file *file);

net/dccp/proto.c

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1158,10 +1158,8 @@ static int __init dccp_init(void)
11581158
goto out_free_bind_bucket_cachep;
11591159
}
11601160

1161-
for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) {
1161+
for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
11621162
INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
1163-
INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].twchain, i);
1164-
}
11651163

11661164
if (inet_ehash_locks_alloc(&dccp_hashinfo))
11671165
goto out_free_dccp_ehash;

net/ipv4/inet_diag.c

Lines changed: 13 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -635,12 +635,14 @@ static int inet_csk_diag_dump(struct sock *sk,
635635
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
636636
}
637637

638-
static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
638+
static int inet_twsk_diag_dump(struct sock *sk,
639639
struct sk_buff *skb,
640640
struct netlink_callback *cb,
641641
struct inet_diag_req_v2 *r,
642642
const struct nlattr *bc)
643643
{
644+
struct inet_timewait_sock *tw = inet_twsk(sk);
645+
644646
if (bc != NULL) {
645647
struct inet_diag_entry entry;
646648

@@ -911,16 +913,15 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
911913

912914
num = 0;
913915

914-
if (hlist_nulls_empty(&head->chain) &&
915-
hlist_nulls_empty(&head->twchain))
916+
if (hlist_nulls_empty(&head->chain))
916917
continue;
917918

918919
if (i > s_i)
919920
s_num = 0;
920921

921922
spin_lock_bh(lock);
922923
sk_nulls_for_each(sk, node, &head->chain) {
923-
struct inet_sock *inet = inet_sk(sk);
924+
int res;
924925

925926
if (!net_eq(sock_net(sk), net))
926927
continue;
@@ -929,49 +930,26 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
929930
if (!(r->idiag_states & (1 << sk->sk_state)))
930931
goto next_normal;
931932
if (r->sdiag_family != AF_UNSPEC &&
932-
sk->sk_family != r->sdiag_family)
933+
sk->sk_family != r->sdiag_family)
933934
goto next_normal;
934-
if (r->id.idiag_sport != inet->inet_sport &&
935+
if (r->id.idiag_sport != htons(sk->sk_num) &&
935936
r->id.idiag_sport)
936937
goto next_normal;
937-
if (r->id.idiag_dport != inet->inet_dport &&
938+
if (r->id.idiag_dport != sk->sk_dport &&
938939
r->id.idiag_dport)
939940
goto next_normal;
940-
if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
941+
if (sk->sk_state == TCP_TIME_WAIT)
942+
res = inet_twsk_diag_dump(sk, skb, cb, r, bc);
943+
else
944+
res = inet_csk_diag_dump(sk, skb, cb, r, bc);
945+
if (res < 0) {
941946
spin_unlock_bh(lock);
942947
goto done;
943948
}
944949
next_normal:
945950
++num;
946951
}
947952

948-
if (r->idiag_states & TCPF_TIME_WAIT) {
949-
struct inet_timewait_sock *tw;
950-
951-
inet_twsk_for_each(tw, node,
952-
&head->twchain) {
953-
if (!net_eq(twsk_net(tw), net))
954-
continue;
955-
956-
if (num < s_num)
957-
goto next_dying;
958-
if (r->sdiag_family != AF_UNSPEC &&
959-
tw->tw_family != r->sdiag_family)
960-
goto next_dying;
961-
if (r->id.idiag_sport != tw->tw_sport &&
962-
r->id.idiag_sport)
963-
goto next_dying;
964-
if (r->id.idiag_dport != tw->tw_dport &&
965-
r->id.idiag_dport)
966-
goto next_dying;
967-
if (inet_twsk_diag_dump(tw, skb, cb, r, bc) < 0) {
968-
spin_unlock_bh(lock);
969-
goto done;
970-
}
971-
next_dying:
972-
++num;
973-
}
974-
}
975953
spin_unlock_bh(lock);
976954
}
977955

net/ipv4/inet_hashtables.c

Lines changed: 29 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,19 @@ struct sock *__inet_lookup_listener(struct net *net,
230230
}
231231
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
232232

233+
/* All sockets share common refcount, but have different destructors */
234+
void sock_gen_put(struct sock *sk)
235+
{
236+
if (!atomic_dec_and_test(&sk->sk_refcnt))
237+
return;
238+
239+
if (sk->sk_state == TCP_TIME_WAIT)
240+
inet_twsk_free(inet_twsk(sk));
241+
else
242+
sk_free(sk);
243+
}
244+
EXPORT_SYMBOL_GPL(sock_gen_put);
245+
233246
struct sock *__inet_lookup_established(struct net *net,
234247
struct inet_hashinfo *hashinfo,
235248
const __be32 saddr, const __be16 sport,
@@ -255,13 +268,13 @@ struct sock *__inet_lookup_established(struct net *net,
255268
if (likely(INET_MATCH(sk, net, acookie,
256269
saddr, daddr, ports, dif))) {
257270
if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
258-
goto begintw;
271+
goto out;
259272
if (unlikely(!INET_MATCH(sk, net, acookie,
260273
saddr, daddr, ports, dif))) {
261-
sock_put(sk);
274+
sock_gen_put(sk);
262275
goto begin;
263276
}
264-
goto out;
277+
goto found;
265278
}
266279
}
267280
/*
@@ -271,37 +284,9 @@ struct sock *__inet_lookup_established(struct net *net,
271284
*/
272285
if (get_nulls_value(node) != slot)
273286
goto begin;
274-
275-
begintw:
276-
/* Must check for a TIME_WAIT'er before going to listener hash. */
277-
sk_nulls_for_each_rcu(sk, node, &head->twchain) {
278-
if (sk->sk_hash != hash)
279-
continue;
280-
if (likely(INET_TW_MATCH(sk, net, acookie,
281-
saddr, daddr, ports,
282-
dif))) {
283-
if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
284-
sk = NULL;
285-
goto out;
286-
}
287-
if (unlikely(!INET_TW_MATCH(sk, net, acookie,
288-
saddr, daddr, ports,
289-
dif))) {
290-
inet_twsk_put(inet_twsk(sk));
291-
goto begintw;
292-
}
293-
goto out;
294-
}
295-
}
296-
/*
297-
* if the nulls value we got at the end of this lookup is
298-
* not the expected one, we must restart lookup.
299-
* We probably met an item that was moved to another chain.
300-
*/
301-
if (get_nulls_value(node) != slot)
302-
goto begintw;
303-
sk = NULL;
304287
out:
288+
sk = NULL;
289+
found:
305290
rcu_read_unlock();
306291
return sk;
307292
}
@@ -326,39 +311,29 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
326311
spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
327312
struct sock *sk2;
328313
const struct hlist_nulls_node *node;
329-
struct inet_timewait_sock *tw;
314+
struct inet_timewait_sock *tw = NULL;
330315
int twrefcnt = 0;
331316

332317
spin_lock(lock);
333318

334-
/* Check TIME-WAIT sockets first. */
335-
sk_nulls_for_each(sk2, node, &head->twchain) {
336-
if (sk2->sk_hash != hash)
337-
continue;
338-
339-
if (likely(INET_TW_MATCH(sk2, net, acookie,
340-
saddr, daddr, ports, dif))) {
341-
tw = inet_twsk(sk2);
342-
if (twsk_unique(sk, sk2, twp))
343-
goto unique;
344-
else
345-
goto not_unique;
346-
}
347-
}
348-
tw = NULL;
349-
350-
/* And established part... */
351319
sk_nulls_for_each(sk2, node, &head->chain) {
352320
if (sk2->sk_hash != hash)
353321
continue;
322+
354323
if (likely(INET_MATCH(sk2, net, acookie,
355-
saddr, daddr, ports, dif)))
324+
saddr, daddr, ports, dif))) {
325+
if (sk2->sk_state == TCP_TIME_WAIT) {
326+
tw = inet_twsk(sk2);
327+
if (twsk_unique(sk, sk2, twp))
328+
break;
329+
}
356330
goto not_unique;
331+
}
357332
}
358333

359-
unique:
360334
/* Must record num and sport now. Otherwise we will see
361-
* in hash table socket with a funny identity. */
335+
* in hash table socket with a funny identity.
336+
*/
362337
inet->inet_num = lport;
363338
inet->inet_sport = htons(lport);
364339
sk->sk_hash = hash;

0 commit comments

Comments
 (0)