Skip to content

Commit fa76ce7

Browse files
edumazet authored and davem330 committed
inet: get rid of central tcp/dccp listener timer
One of the major issues for TCP is the SYNACK rtx handling, done by inet_csk_reqsk_queue_prune(), fired by the keepalive timer of a TCP_LISTEN socket. This function runs for awfully long times, with the socket lock held, meaning that other cpus needing this lock have to spin for hundreds of ms. SYNACK are sent in huge bursts, likely to cause severe drops anyway. This model was OK 15 years ago when memory was very tight. We now can afford to have a timer per request sock. Timer invocations no longer need to lock the listener, and can be run from all cpus in parallel. With the following patch increasing somaxconn width to 32 bits, I tested a listener with more than 4 million active request sockets, and a steady SYNFLOOD of ~200,000 SYN per second. The host was sending ~830,000 SYNACK per second. This is ~100 times more than what we could achieve before this patch. Later, we will get rid of the listener hash and use ehash instead. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent 52452c5 commit fa76ce7

18 files changed

+173
-198
lines changed

include/net/inet6_connection_sock.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ int inet6_csk_bind_conflict(const struct sock *sk,
2828
struct dst_entry *inet6_csk_route_req(struct sock *sk, struct flowi6 *fl6,
2929
const struct request_sock *req);
3030

31-
struct request_sock *inet6_csk_search_req(const struct sock *sk,
31+
struct request_sock *inet6_csk_search_req(struct sock *sk,
3232
const __be16 rport,
3333
const struct in6_addr *raddr,
3434
const struct in6_addr *laddr,

include/net/inet_connection_sock.h

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk,
256256

257257
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
258258

259-
struct request_sock *inet_csk_search_req(const struct sock *sk,
259+
struct request_sock *inet_csk_search_req(struct sock *sk,
260260
const __be16 rport,
261261
const __be32 raddr,
262262
const __be32 laddr);
@@ -282,15 +282,13 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
282282
static inline void inet_csk_reqsk_queue_removed(struct sock *sk,
283283
struct request_sock *req)
284284
{
285-
if (reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req) == 0)
286-
inet_csk_delete_keepalive_timer(sk);
285+
reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
287286
}
288287

289288
static inline void inet_csk_reqsk_queue_added(struct sock *sk,
290289
const unsigned long timeout)
291290
{
292-
if (reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue) == 0)
293-
inet_csk_reset_keepalive_timer(sk, timeout);
291+
reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);
294292
}
295293

296294
static inline int inet_csk_reqsk_queue_len(const struct sock *sk)
@@ -319,14 +317,9 @@ static inline void inet_csk_reqsk_queue_drop(struct sock *sk,
319317
{
320318
inet_csk_reqsk_queue_unlink(sk, req);
321319
inet_csk_reqsk_queue_removed(sk, req);
322-
reqsk_free(req);
320+
reqsk_put(req);
323321
}
324322

325-
void inet_csk_reqsk_queue_prune(struct sock *parent,
326-
const unsigned long interval,
327-
const unsigned long timeout,
328-
const unsigned long max_rto);
329-
330323
void inet_csk_destroy_sock(struct sock *sk);
331324
void inet_csk_prepare_forced_close(struct sock *sk);
332325

include/net/request_sock.h

Lines changed: 41 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ struct request_sock {
6262
u32 window_clamp; /* window clamp at creation time */
6363
u32 rcv_wnd; /* rcv_wnd offered first time */
6464
u32 ts_recent;
65-
unsigned long expires;
65+
struct timer_list rsk_timer;
6666
const struct request_sock_ops *rsk_ops;
6767
struct sock *sk;
6868
u32 secid;
@@ -110,9 +110,6 @@ static inline void reqsk_free(struct request_sock *req)
110110

111111
static inline void reqsk_put(struct request_sock *req)
112112
{
113-
/* temporary debugging, until req sock are put into ehash table */
114-
WARN_ON_ONCE(atomic_read(&req->rsk_refcnt) != 1);
115-
116113
if (atomic_dec_and_test(&req->rsk_refcnt))
117114
reqsk_free(req);
118115
}
@@ -124,12 +121,16 @@ extern int sysctl_max_syn_backlog;
124121
* @max_qlen_log - log_2 of maximal queued SYNs/REQUESTs
125122
*/
126123
struct listen_sock {
127-
u8 max_qlen_log;
124+
int qlen_inc; /* protected by listener lock */
125+
int young_inc;/* protected by listener lock */
126+
127+
/* following fields can be updated by timer */
128+
atomic_t qlen_dec; /* qlen = qlen_inc - qlen_dec */
129+
atomic_t young_dec;
130+
131+
u8 max_qlen_log ____cacheline_aligned_in_smp;
128132
u8 synflood_warned;
129133
/* 2 bytes hole, try to use */
130-
int qlen;
131-
int qlen_young;
132-
int clock_hand;
133134
u32 hash_rnd;
134135
u32 nr_table_entries;
135136
struct request_sock *syn_table[0];
@@ -182,16 +183,17 @@ struct fastopen_queue {
182183
struct request_sock_queue {
183184
struct request_sock *rskq_accept_head;
184185
struct request_sock *rskq_accept_tail;
185-
rwlock_t syn_wait_lock;
186186
u8 rskq_defer_accept;
187-
/* 3 bytes hole, try to pack */
188187
struct listen_sock *listen_opt;
189188
struct fastopen_queue *fastopenq; /* This is non-NULL iff TFO has been
190189
* enabled on this listener. Check
191190
* max_qlen != 0 in fastopen_queue
192191
* to determine if TFO is enabled
193192
* right at this moment.
194193
*/
194+
195+
/* temporary alignment, our goal is to get rid of this lock */
196+
rwlock_t syn_wait_lock ____cacheline_aligned_in_smp;
195197
};
196198

197199
int reqsk_queue_alloc(struct request_sock_queue *queue,
@@ -223,11 +225,15 @@ static inline void reqsk_queue_unlink(struct request_sock_queue *queue,
223225
struct request_sock **prev;
224226

225227
write_lock(&queue->syn_wait_lock);
228+
226229
prev = &lopt->syn_table[req->rsk_hash];
227230
while (*prev != req)
228231
prev = &(*prev)->dl_next;
229232
*prev = req->dl_next;
233+
230234
write_unlock(&queue->syn_wait_lock);
235+
if (del_timer(&req->rsk_timer))
236+
reqsk_put(req);
231237
}
232238

233239
static inline void reqsk_queue_add(struct request_sock_queue *queue,
@@ -260,64 +266,53 @@ static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue
260266
return req;
261267
}
262268

263-
static inline int reqsk_queue_removed(struct request_sock_queue *queue,
264-
struct request_sock *req)
269+
static inline void reqsk_queue_removed(struct request_sock_queue *queue,
270+
const struct request_sock *req)
265271
{
266272
struct listen_sock *lopt = queue->listen_opt;
267273

268274
if (req->num_timeout == 0)
269-
--lopt->qlen_young;
270-
271-
return --lopt->qlen;
275+
atomic_inc(&lopt->young_dec);
276+
atomic_inc(&lopt->qlen_dec);
272277
}
273278

274-
static inline int reqsk_queue_added(struct request_sock_queue *queue)
279+
static inline void reqsk_queue_added(struct request_sock_queue *queue)
275280
{
276281
struct listen_sock *lopt = queue->listen_opt;
277-
const int prev_qlen = lopt->qlen;
278282

279-
lopt->qlen_young++;
280-
lopt->qlen++;
281-
return prev_qlen;
283+
lopt->young_inc++;
284+
lopt->qlen_inc++;
282285
}
283286

284-
static inline int reqsk_queue_len(const struct request_sock_queue *queue)
287+
static inline int listen_sock_qlen(const struct listen_sock *lopt)
285288
{
286-
return queue->listen_opt != NULL ? queue->listen_opt->qlen : 0;
289+
return lopt->qlen_inc - atomic_read(&lopt->qlen_dec);
287290
}
288291

289-
static inline int reqsk_queue_len_young(const struct request_sock_queue *queue)
292+
static inline int listen_sock_young(const struct listen_sock *lopt)
290293
{
291-
return queue->listen_opt->qlen_young;
294+
return lopt->young_inc - atomic_read(&lopt->young_dec);
292295
}
293296

294-
static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
297+
static inline int reqsk_queue_len(const struct request_sock_queue *queue)
295298
{
296-
return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
299+
const struct listen_sock *lopt = queue->listen_opt;
300+
301+
return lopt ? listen_sock_qlen(lopt) : 0;
297302
}
298303

299-
static inline void reqsk_queue_hash_req(struct request_sock_queue *queue,
300-
u32 hash, struct request_sock *req,
301-
unsigned long timeout)
304+
static inline int reqsk_queue_len_young(const struct request_sock_queue *queue)
302305
{
303-
struct listen_sock *lopt = queue->listen_opt;
304-
305-
req->expires = jiffies + timeout;
306-
req->num_retrans = 0;
307-
req->num_timeout = 0;
308-
req->sk = NULL;
309-
310-
/* before letting lookups find us, make sure all req fields
311-
* are committed to memory and refcnt initialized.
312-
*/
313-
smp_wmb();
314-
atomic_set(&req->rsk_refcnt, 1);
306+
return listen_sock_young(queue->listen_opt);
307+
}
315308

316-
req->rsk_hash = hash;
317-
write_lock(&queue->syn_wait_lock);
318-
req->dl_next = lopt->syn_table[hash];
319-
lopt->syn_table[hash] = req;
320-
write_unlock(&queue->syn_wait_lock);
309+
static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
310+
{
311+
return reqsk_queue_len(queue) >> queue->listen_opt->max_qlen_log;
321312
}
322313

314+
void reqsk_queue_hash_req(struct request_sock_queue *queue,
315+
u32 hash, struct request_sock *req,
316+
unsigned long timeout);
317+
323318
#endif /* _REQUEST_SOCK_H */

net/core/request_sock.c

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,21 +94,26 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
9494
/* make all the listen_opt local to us */
9595
struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
9696

97-
if (lopt->qlen != 0) {
97+
if (listen_sock_qlen(lopt) != 0) {
9898
unsigned int i;
9999

100100
for (i = 0; i < lopt->nr_table_entries; i++) {
101101
struct request_sock *req;
102102

103+
write_lock_bh(&queue->syn_wait_lock);
103104
while ((req = lopt->syn_table[i]) != NULL) {
104105
lopt->syn_table[i] = req->dl_next;
105-
lopt->qlen--;
106+
atomic_inc(&lopt->qlen_dec);
107+
if (del_timer(&req->rsk_timer))
108+
reqsk_put(req);
106109
reqsk_put(req);
107110
}
111+
write_unlock_bh(&queue->syn_wait_lock);
108112
}
109113
}
110114

111-
WARN_ON(lopt->qlen != 0);
115+
if (WARN_ON(listen_sock_qlen(lopt) != 0))
116+
pr_err("qlen %u\n", listen_sock_qlen(lopt));
112117
kvfree(lopt);
113118
}
114119

@@ -187,7 +192,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
187192
*
188193
* For more details see CoNext'11 "TCP Fast Open" paper.
189194
*/
190-
req->expires = jiffies + 60*HZ;
195+
req->rsk_timer.expires = jiffies + 60*HZ;
191196
if (fastopenq->rskq_rst_head == NULL)
192197
fastopenq->rskq_rst_head = req;
193198
else

net/core/sock.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2739,7 +2739,7 @@ static int req_prot_init(const struct proto *prot)
27392739

27402740
rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
27412741
rsk_prot->obj_size, 0,
2742-
SLAB_HWCACHE_ALIGN, NULL);
2742+
0, NULL);
27432743

27442744
if (!rsk_prot->slab) {
27452745
pr_crit("%s: Can't create request sock SLAB cache!\n",

net/dccp/ipv4.c

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
306306
if (!between48(seq, dccp_rsk(req)->dreq_iss,
307307
dccp_rsk(req)->dreq_gss)) {
308308
NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
309+
reqsk_put(req);
309310
goto out;
310311
}
311312
/*
@@ -315,6 +316,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
315316
* errors returned from accept().
316317
*/
317318
inet_csk_reqsk_queue_drop(sk, req);
319+
reqsk_put(req);
318320
goto out;
319321

320322
case DCCP_REQUESTING:
@@ -451,9 +453,11 @@ static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
451453
/* Find possible connection requests. */
452454
struct request_sock *req = inet_csk_search_req(sk, dh->dccph_sport,
453455
iph->saddr, iph->daddr);
454-
if (req)
455-
return dccp_check_req(sk, skb, req);
456-
456+
if (req) {
457+
nsk = dccp_check_req(sk, skb, req);
458+
reqsk_put(req);
459+
return nsk;
460+
}
457461
nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo,
458462
iph->saddr, dh->dccph_sport,
459463
iph->daddr, dh->dccph_dport,

net/dccp/ipv6.c

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
157157
req = inet6_csk_search_req(sk, dh->dccph_dport,
158158
&hdr->daddr, &hdr->saddr,
159159
inet6_iif(skb));
160-
if (req == NULL)
160+
if (!req)
161161
goto out;
162162

163163
/*
@@ -169,10 +169,12 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
169169
if (!between48(seq, dccp_rsk(req)->dreq_iss,
170170
dccp_rsk(req)->dreq_gss)) {
171171
NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
172+
reqsk_put(req);
172173
goto out;
173174
}
174175

175176
inet_csk_reqsk_queue_drop(sk, req);
177+
reqsk_put(req);
176178
goto out;
177179

178180
case DCCP_REQUESTING:
@@ -322,9 +324,11 @@ static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
322324

323325
req = inet6_csk_search_req(sk, dh->dccph_sport, &iph->saddr,
324326
&iph->daddr, inet6_iif(skb));
325-
if (req != NULL)
326-
return dccp_check_req(sk, skb, req);
327-
327+
if (req) {
328+
nsk = dccp_check_req(sk, skb, req);
329+
reqsk_put(req);
330+
return nsk;
331+
}
328332
nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo,
329333
&iph->saddr, dh->dccph_sport,
330334
&iph->daddr, ntohs(dh->dccph_dport),

net/dccp/timer.c

Lines changed: 1 addition & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -161,33 +161,11 @@ static void dccp_write_timer(unsigned long data)
161161
sock_put(sk);
162162
}
163163

164-
/*
165-
* Timer for listening sockets
166-
*/
167-
static void dccp_response_timer(struct sock *sk)
168-
{
169-
inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, DCCP_TIMEOUT_INIT,
170-
DCCP_RTO_MAX);
171-
}
172-
173164
static void dccp_keepalive_timer(unsigned long data)
174165
{
175166
struct sock *sk = (struct sock *)data;
176167

177-
/* Only process if socket is not in use. */
178-
bh_lock_sock(sk);
179-
if (sock_owned_by_user(sk)) {
180-
/* Try again later. */
181-
inet_csk_reset_keepalive_timer(sk, HZ / 20);
182-
goto out;
183-
}
184-
185-
if (sk->sk_state == DCCP_LISTEN) {
186-
dccp_response_timer(sk);
187-
goto out;
188-
}
189-
out:
190-
bh_unlock_sock(sk);
168+
pr_err("dccp should not use a keepalive timer !\n");
191169
sock_put(sk);
192170
}
193171

0 commit comments

Comments
 (0)