Commit cec451c

Merge branch 'tcp-improving-RACK-cpu-performance'
Yuchung Cheng says:

====================
tcp: improving RACK cpu performance

This patch set reduces the CPU consumption of the RACK TCP loss
recovery algorithm, in particular on high-speed networks. Currently,
for every ACK in recovery, RACK can potentially iterate over all sent
packets in the write queue. On large-BDP networks with non-trivial
losses, the CPU usage of this write queue walk becomes unreasonably
high. This patch set introduces a new queue in TCP that keeps only the
skbs sent and not yet (s)acked or marked lost, in time order instead
of sequence order. With that, per ACK, RACK can examine this
time-sorted list and check only the packets sent recently, within the
reordering window, without any write queue walk. The number of skbs
examined per ACK is reduced by orders of magnitude.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents: b1fb67f + bef0622
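The core mechanism, as a hedged sketch (simplified from the
net/ipv4/tcp_recovery.c hunk below; sent_before_rack_ts(),
past_reo_window(), and mark_lost() are hypothetical helpers standing
in for the real checks):

	/* Per-ACK RACK scan. tp->tsorted_sent_queue holds only the skbs
	 * sent and not yet (s)acked or marked lost, ordered by last
	 * transmission time, oldest first.
	 */
	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
				 tcp_tsorted_anchor) {
		/* Time order lets the scan stop at the first skb sent
		 * after the most recently delivered packet: nothing
		 * later in the list can be declared lost on this ACK.
		 */
		if (!sent_before_rack_ts(tp, skb))
			break;
		if (past_reo_window(tp, skb))
			mark_lost(sk, skb);	/* also unlinks the skb */
	}

Compare this with the old tcp_for_write_queue() walk removed in
net/ipv4/tcp_recovery.c, which had to visit, or at least skip over,
every sent-but-unacked skb in sequence order.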

File tree

8 files changed: +93, -49 lines


include/linux/skbuff.h

Lines changed: 9 additions & 2 deletions

@@ -617,6 +617,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@nf_trace: netfilter packet trace flag
  *	@protocol: Packet protocol from driver
  *	@destructor: Destruct function
+ *	@tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
  *	@_nfct: Associated connection, if any (with nfctinfo bits)
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *	@skb_iif: ifindex of device we arrived on
@@ -686,8 +687,14 @@ struct sk_buff {
 	 */
 	char			cb[48] __aligned(8);
 
-	unsigned long		_skb_refdst;
-	void			(*destructor)(struct sk_buff *skb);
+	union {
+		struct {
+			unsigned long	_skb_refdst;
+			void		(*destructor)(struct sk_buff *skb);
+		};
+		struct list_head	tcp_tsorted_anchor;
+	};
+
 #ifdef CONFIG_XFRM
 	struct	sec_path	*sp;
 #endif
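Why this overlay is safe is not stated in the diff itself; a likely
reading: an skb sitting on the TCP write queue carries neither a dst
reference nor a destructor — both are set on the clone handed to the
IP layer by tcp_transmit_skb() — so these 16 bytes are free to hold
the list_head that anchors the skb on tp->tsorted_sent_queue. The few
paths that do touch _skb_refdst while the skb is anchored bracket the
access with the tcp_skb_tsorted_save()/tcp_skb_tsorted_restore()
helpers added in include/net/tcp.h below.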

include/linux/tcp.h

Lines changed: 1 addition & 0 deletions

@@ -191,6 +191,7 @@ struct tcp_sock {
 	u32	tsoffset;	/* timestamp offset */
 
 	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
+	struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
 
 	u32	snd_wl1;	/* Sequence for window update */
 	u32	snd_wnd;	/* The window we expect to receive */

include/net/tcp.h

Lines changed: 23 additions & 1 deletion

@@ -1589,14 +1589,34 @@ enum tcp_chrono {
 void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
 void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);
 
+/* This helper is needed, because skb->tcp_tsorted_anchor uses
+ * the same memory storage than skb->destructor/_skb_refdst
+ */
+static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
+{
+	skb->destructor = NULL;
+	skb->_skb_refdst = 0UL;
+}
+
+#define tcp_skb_tsorted_save(skb) {		\
+	unsigned long _save = skb->_skb_refdst;	\
+	skb->_skb_refdst = 0UL;
+
+#define tcp_skb_tsorted_restore(skb)		\
+	skb->_skb_refdst = _save;		\
+}
+
 /* write queue abstraction */
 static inline void tcp_write_queue_purge(struct sock *sk)
 {
 	struct sk_buff *skb;
 
 	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
+	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+		tcp_skb_tsorted_anchor_cleanup(skb);
 		sk_wmem_free_skb(sk, skb);
+	}
+	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
 	sk_mem_reclaim(sk);
 	tcp_clear_all_retrans_hints(tcp_sk(sk));
 }
@@ -1711,6 +1731,8 @@ static inline void tcp_insert_write_queue_before(struct sk_buff *new,
 
 static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
 {
+	list_del(&skb->tcp_tsorted_anchor);
+	tcp_skb_tsorted_anchor_cleanup(skb);
 	__skb_unlink(skb, &sk->sk_write_queue);
 }
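A usage sketch of the save/restore pair (the skb_clone() call here is
illustrative; the real call sites appear in the net/ipv4/tcp_output.c
hunks below). Note that tcp_skb_tsorted_save() deliberately opens a
brace that tcp_skb_tsorted_restore() closes, so the two must always
appear together:

	struct sk_buff *nskb;

	/* Stash the first word of the anchor (aliased as _skb_refdst)
	 * and zero it, so clone helpers that read skb->_skb_refdst see
	 * "no dst attached" rather than a list pointer; afterwards the
	 * list pointer is put back.
	 */
	tcp_skb_tsorted_save(skb) {
		nskb = skb_clone(skb, GFP_ATOMIC);
	} tcp_skb_tsorted_restore(skb);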

net/ipv4/tcp.c

Lines changed: 2 additions & 0 deletions

@@ -415,6 +415,7 @@ void tcp_init_sock(struct sock *sk)
 	tp->out_of_order_queue = RB_ROOT;
 	tcp_init_xmit_timers(sk);
 	INIT_LIST_HEAD(&tp->tsq_node);
+	INIT_LIST_HEAD(&tp->tsorted_sent_queue);
 
 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
 	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
@@ -881,6 +882,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 		 * available to the caller, no more, no less.
 		 */
 		skb->reserved_tailroom = skb->end - skb->tail - size;
+		INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
 		return skb;
 	}
 	__kfree_skb(skb);

net/ipv4/tcp_input.c

Lines changed: 7 additions & 2 deletions

@@ -1593,6 +1593,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 						tcp_skb_pcount(skb),
 						skb->skb_mstamp);
 			tcp_rate_skb_delivered(sk, skb, state->rate);
+			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+				list_del_init(&skb->tcp_tsorted_anchor);
 
 			if (!before(TCP_SKB_CB(skb)->seq,
 				    tcp_highest_sack_seq(tp)))
@@ -3054,8 +3056,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
 
 	shinfo = skb_shinfo(skb);
 	if (!before(shinfo->tskey, prior_snd_una) &&
-	    before(shinfo->tskey, tcp_sk(sk)->snd_una))
-		__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+	    before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
+		tcp_skb_tsorted_save(skb) {
+			__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+		} tcp_skb_tsorted_restore(skb);
+	}
 }
 
 /* Remove acknowledged frames from the retransmission queue. If our packet
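Two details worth noting in this hunk, as read from the diff: a SACKed
skb leaves the time-sorted list via list_del_init() rather than
list_del(), leaving the anchor self-linked so that the later
list_del() in tcp_unlink_write_queue() or a list_move_tail() on
retransmit remains safe; and tcp_ack_tstamp() wraps __skb_tstamp_tx()
in the save/restore pair because that helper clones the skb, which
would otherwise misread the anchor words as a dst reference.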

net/ipv4/tcp_minisocks.c

Lines changed: 1 addition & 0 deletions

@@ -446,6 +446,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 		newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
 
 		INIT_LIST_HEAD(&newtp->tsq_node);
+		INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
 
 		tcp_init_wl(newtp, treq->rcv_isn);

net/ipv4/tcp_output.c

Lines changed: 31 additions & 11 deletions

@@ -971,6 +971,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
 		      HRTIMER_MODE_ABS_PINNED);
 }
 
+static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	skb->skb_mstamp = tp->tcp_mstamp;
+	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
  * transmission and possible later retransmissions.
@@ -1003,10 +1009,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
 			- tp->snd_una;
 		oskb = skb;
-		if (unlikely(skb_cloned(skb)))
-			skb = pskb_copy(skb, gfp_mask);
-		else
-			skb = skb_clone(skb, gfp_mask);
+
+		tcp_skb_tsorted_save(oskb) {
+			if (unlikely(skb_cloned(oskb)))
+				skb = pskb_copy(oskb, gfp_mask);
+			else
+				skb = skb_clone(oskb, gfp_mask);
+		} tcp_skb_tsorted_restore(oskb);
+
 		if (unlikely(!skb))
 			return -ENOBUFS;
 	}
@@ -1127,7 +1137,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		err = net_xmit_eval(err);
 	}
 	if (!err && oskb) {
-		oskb->skb_mstamp = tp->tcp_mstamp;
+		tcp_update_skb_after_send(tp, oskb);
 		tcp_rate_skb_sent(sk, oskb);
 	}
 	return err;
@@ -1328,6 +1338,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	/* Link BUFF into the send queue. */
 	__skb_header_release(buff);
 	tcp_insert_write_queue_after(skb, buff, sk);
+	list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
 
 	return 0;
 }
@@ -2260,7 +2271,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 
 		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
 			/* "skb_mstamp" is used as a start point for the retransmit timer */
-			skb->skb_mstamp = tp->tcp_mstamp;
+			tcp_update_skb_after_send(tp, skb);
 			goto repair; /* Skip network transmission */
 		}
 
@@ -2838,11 +2849,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		     skb_headroom(skb) >= 0xFFFF)) {
 		struct sk_buff *nskb;
 
-		nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
-		err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-			     -ENOBUFS;
+		tcp_skb_tsorted_save(skb) {
+			nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
+			err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
+				     -ENOBUFS;
+		} tcp_skb_tsorted_restore(skb);
+
 		if (!err)
-			skb->skb_mstamp = tp->tcp_mstamp;
+			tcp_update_skb_after_send(tp, skb);
 	} else {
 		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 	}
@@ -3023,6 +3037,7 @@ void tcp_send_fin(struct sock *sk)
 			goto coalesce;
 		return;
 	}
+	INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
 	skb_reserve(skb, MAX_TCP_HEADER);
 	sk_forced_mem_schedule(sk, skb->truesize);
 	/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
@@ -3078,9 +3093,14 @@ int tcp_send_synack(struct sock *sk)
 	}
 	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
 		if (skb_cloned(skb)) {
-			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
+			struct sk_buff *nskb;
+
+			tcp_skb_tsorted_save(skb) {
+				nskb = skb_copy(skb, GFP_ATOMIC);
+			} tcp_skb_tsorted_restore(skb);
 			if (!nskb)
 				return -ENOMEM;
+			INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
 			tcp_unlink_write_queue(skb, sk);
 			__skb_header_release(nskb);
 			__tcp_add_write_queue_head(sk, nskb);
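The ordering invariant lives in tcp_update_skb_after_send(): every
transmission and retransmission stamps the skb and moves it to the
tail of tp->tsorted_sent_queue, so the list stays sorted by last-send
time. A hedged walkthrough (the three-skb scenario is hypothetical):

	/* Suppose skb1, skb2, skb3 were sent in that order:
	 *
	 *	tsorted_sent_queue: skb1 -> skb2 -> skb3   (oldest at head)
	 *
	 * Retransmitting skb2 runs tcp_update_skb_after_send(), i.e.
	 *
	 *	skb2->skb_mstamp = tp->tcp_mstamp;
	 *	list_move_tail(&skb2->tcp_tsorted_anchor,
	 *		       &tp->tsorted_sent_queue);
	 *
	 * giving
	 *
	 *	tsorted_sent_queue: skb1 -> skb3 -> skb2
	 *
	 * which is again sorted by last transmission time.
	 */

Relatedly, the list_add() in tcp_fragment() above inserts the new half
right after the original skb, since both halves share the original
send time.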

net/ipv4/tcp_recovery.c

Lines changed: 19 additions & 33 deletions

@@ -45,7 +45,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
 static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb;
+	struct sk_buff *skb, *n;
 	u32 reo_wnd;
 
 	*reo_timeout = 0;
@@ -58,45 +58,31 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 	if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
 		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
 
-	tcp_for_write_queue(skb, sk) {
+	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
+				 tcp_tsorted_anchor) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+		s32 remaining;
 
-		if (skb == tcp_send_head(sk))
-			break;
-
-		/* Skip ones already (s)acked */
-		if (!after(scb->end_seq, tp->snd_una) ||
-		    scb->sacked & TCPCB_SACKED_ACKED)
+		/* Skip ones marked lost but not yet retransmitted */
+		if ((scb->sacked & TCPCB_LOST) &&
+		    !(scb->sacked & TCPCB_SACKED_RETRANS))
 			continue;
 
-		if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
-					tp->rack.end_seq, scb->end_seq)) {
-			/* Step 3 in draft-cheng-tcpm-rack-00.txt:
-			 * A packet is lost if its elapsed time is beyond
-			 * the recent RTT plus the reordering window.
-			 */
-			u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp,
-							 skb->skb_mstamp);
-			s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
-
-			if (remaining < 0) {
-				tcp_rack_mark_skb_lost(sk, skb);
-				continue;
-			}
-
-			/* Skip ones marked lost but not yet retransmitted */
-			if ((scb->sacked & TCPCB_LOST) &&
-			    !(scb->sacked & TCPCB_SACKED_RETRANS))
-				continue;
+		if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
+					 tp->rack.end_seq, scb->end_seq))
+			break;
 
+		/* A packet is lost if it has not been s/acked beyond
+		 * the recent RTT plus the reordering window.
+		 */
+		remaining = tp->rack.rtt_us + reo_wnd -
+			    tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+		if (remaining < 0) {
+			tcp_rack_mark_skb_lost(sk, skb);
+			list_del_init(&skb->tcp_tsorted_anchor);
+		} else {
 			/* Record maximum wait time (+1 to avoid 0) */
 			*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
-
-		} else if (!(scb->sacked & TCPCB_RETRANS)) {
-			/* Original data are sent sequentially so stop early
-			 * b/c the rest are all sent after rack_sent
-			 */
-			break;
 		}
 	}
 }
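Two structural points in this rewrite, as read from the diff: the
"marked lost but not yet retransmitted" skip now runs before the
timestamp test, since such skbs are only waiting for retransmission
and need no further checks; and the early break has become
unconditional — the old write-queue walk could stop early only for
never-retransmitted data, whereas the tsorted list is ordered by last
transmission time for retransmissions as well. The _safe iterator is
required because tcp_rack_mark_skb_lost() is followed by
list_del_init() on the very skb being visited.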
