
Commit 337f1b0

Merge branch 'tcp-xmit-timer-rearming'
Neal Cardwell says:

====================
tcp: fix xmit timer rearming to avoid stalls

This patch series is a bug fix for a TCP loss recovery performance bug
reported independently in recent netdev threads:

(i)  July 26, 2017: netdev thread "TCP fast retransmit issues"

(ii) July 26, 2017: netdev thread: "[PATCH V2 net-next] TLP: Don't
     reschedule PTO when there's one outstanding TLP retransmission"

Many thanks to Klavs Klavsen and Mao Wenan for the detailed reports,
traces, and packetdrill test cases, which enabled us to root-cause
this issue and verify the fix.

- v1 -> v2:
  - In patch 2/3, changed an unclear comment in the pre-existing code
    in tcp_schedule_loss_probe() to be more clear (thanks to Eric
    Dumazet for suggesting we improve this).
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents b91d532 + df92c83
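
At a high level, the series restores one invariant: after any ACK that ACKs or SACKs data while packets remain outstanding, the sender re-arms its retransmission machinery, preferring a tail loss probe (TLP) and falling back to a conventional RTO. The following is a schematic sketch of that rule in plain C; every name in it is illustrative shorthand for the kernel logic, not kernel API:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the kernel's timer state. */
enum xmit_timer { TIMER_NONE, TIMER_TLP, TIMER_RTO };

/* Prefer a TLP; if one cannot be scheduled, fall back to the RTO.
 * This mirrors the new tcp_set_xmit_timer() helper in spirit.
 */
static enum xmit_timer set_xmit_timer(bool tlp_allowed)
{
	return tlp_allowed ? TIMER_TLP : TIMER_RTO;
}

int main(void)
{
	bool data_acked = true;   /* this ACK ACKed/SACKed something  */
	bool packets_out = true;  /* ...but data is still outstanding */
	enum xmit_timer t = TIMER_NONE;

	if (data_acked && packets_out)
		t = set_xmit_timer(true);

	/* The bug being fixed: per the commit message, paths existed
	 * where the timer was not re-armed here, so loss recovery
	 * could stall.
	 */
	printf("timer armed: %s\n",
	       t == TIMER_TLP ? "TLP" : t == TIMER_RTO ? "RTO" : "none");
	return 0;
}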

File tree

3 files changed: 32 additions & 31 deletions


include/net/tcp.h

Lines changed: 10 additions & 0 deletions
@@ -1916,6 +1916,16 @@ extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 			     u64 xmit_time);
 extern void tcp_rack_reo_timeout(struct sock *sk);
 
+/* At how many usecs into the future should the RTO fire? */
+static inline s64 tcp_rto_delta_us(const struct sock *sk)
+{
+	const struct sk_buff *skb = tcp_write_queue_head(sk);
+	u32 rto = inet_csk(sk)->icsk_rto;
+	u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
+
+	return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp;
+}
+
 /*
  * Save and compile IPv4 options, return a pointer to it
  */
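
To make the helper's arithmetic concrete, here is a standalone userspace sketch of the same delta computation, with mock timestamps standing in for skb->skb_mstamp and tp->tcp_mstamp (the mock function name, HZ value, and sample numbers are assumptions for illustration, not kernel code):

#include <stdio.h>
#include <stdint.h>

/* Mock of jiffies_to_usecs() assuming HZ=1000, i.e. 1 jiffy = 1000 us. */
static uint64_t jiffies_to_usecs_mock(uint32_t jiffies)
{
	return (uint64_t)jiffies * 1000;
}

int main(void)
{
	/* Hypothetical values: */
	uint64_t head_xmit_us = 5000000;  /* when the write-queue head was sent */
	uint64_t now_us       = 5180000;  /* current time (tp->tcp_mstamp)      */
	uint32_t rto_jiffies  = 200;      /* icsk_rto: 200 ms at HZ=1000        */

	/* Same formula as tcp_rto_delta_us(): signed distance to RTO expiry. */
	int64_t delta_us = head_xmit_us + jiffies_to_usecs_mock(rto_jiffies)
			   - now_us;

	/* 5000000 + 200000 - 5180000 = 20000: the RTO should fire in 20 ms.
	 * A negative result means the RTO deadline has already passed.
	 */
	printf("RTO fires in %lld us\n", (long long)delta_us);
	return 0;
}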

net/ipv4/tcp_input.c

Lines changed: 17 additions & 13 deletions
@@ -107,6 +107,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked */
 #define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
 #define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
+#define FLAG_SET_XMIT_TIMER	0x1000 /* Set TLP or RTO timer */
 #define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
 #define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
 #define FLAG_NO_CHALLENGE_ACK	0x8000 /* do not call tcp_send_challenge_ack() */
@@ -3004,10 +3005,7 @@ void tcp_rearm_rto(struct sock *sk)
 	/* Offset the time elapsed after installing regular RTO */
 	if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
-		struct sk_buff *skb = tcp_write_queue_head(sk);
-		u64 rto_time_stamp = skb->skb_mstamp +
-				     jiffies_to_usecs(rto);
-		s64 delta_us = rto_time_stamp - tp->tcp_mstamp;
+		s64 delta_us = tcp_rto_delta_us(sk);
 		/* delta_us may not be positive if the socket is locked
 		 * when the retrans timer fires and is rescheduled.
 		 */
@@ -3019,6 +3017,13 @@ void tcp_rearm_rto(struct sock *sk)
 	}
 }
 
+/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
+static void tcp_set_xmit_timer(struct sock *sk)
+{
+	if (!tcp_schedule_loss_probe(sk))
+		tcp_rearm_rto(sk);
+}
+
 /* If we get here, the whole TSO packet has not been acked. */
 static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
 {
@@ -3180,7 +3185,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 					ca_rtt_us, sack->rate);
 
 	if (flag & FLAG_ACKED) {
-		tcp_rearm_rto(sk);
+		flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
 		if (unlikely(icsk->icsk_mtup.probe_size &&
 			     !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
 			tcp_mtup_probe_success(sk);
@@ -3208,7 +3213,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		 * after when the head was last (re)transmitted. Otherwise the
 		 * timeout may continue to extend in loss recovery.
 		 */
-		tcp_rearm_rto(sk);
+		flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
 	}
 
 	if (icsk->icsk_ca_ops->pkts_acked) {
@@ -3580,9 +3585,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (after(ack, tp->snd_nxt))
 		goto invalid_ack;
 
-	if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
-		tcp_rearm_rto(sk);
-
 	if (after(ack, prior_snd_una)) {
 		flag |= FLAG_SND_UNA_ADVANCED;
 		icsk->icsk_retransmits = 0;
@@ -3647,18 +3649,20 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
 				    &sack_state);
 
+	if (tp->tlp_high_seq)
+		tcp_process_tlp_ack(sk, ack, flag);
+	/* If needed, reset TLP/RTO timer; RACK may later override this. */
+	if (flag & FLAG_SET_XMIT_TIMER)
+		tcp_set_xmit_timer(sk);
+
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
 		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
 	}
-	if (tp->tlp_high_seq)
-		tcp_process_tlp_ack(sk, ack, flag);
 
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
 		sk_dst_confirm(sk);
 
-	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
-		tcp_schedule_loss_probe(sk);
 	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
 	lost = tp->lost - lost;			/* freshly marked lost */
 	tcp_rate_gen(sk, delivered, lost, sack_state.rate);
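
The structural change in tcp_input.c is a deferral pattern: instead of re-arming the timer at several points while processing one ACK, the code accumulates FLAG_SET_XMIT_TIMER and arms the timer exactly once near the end of tcp_ack(). Below is a minimal standalone model of that pattern; the helper names and signatures are illustrative, not kernel API:

#include <stdbool.h>
#include <stdio.h>

#define FLAG_SET_XMIT_TIMER 0x1000  /* mirrors the new flag bit */

/* Stand-ins for the real scheduling calls; hypothetical signatures. */
static bool schedule_loss_probe(void) { return true; /* a TLP fits */ }
static void rearm_rto(void)           { puts("armed RTO"); }

/* Mirrors tcp_set_xmit_timer(): prefer a TLP, fall back to the RTO. */
static void set_xmit_timer(void)
{
	if (schedule_loss_probe())
		puts("armed TLP");
	else
		rearm_rto();
}

/* Sketch of the tcp_ack() flow: collect reasons first, arm once. */
static void handle_ack(bool acked_new_data, bool head_retransmitted)
{
	int flag = 0;

	if (acked_new_data)
		flag |= FLAG_SET_XMIT_TIMER;  /* was: rearm here directly */
	if (head_retransmitted)
		flag |= FLAG_SET_XMIT_TIMER;  /* was: rearm here too */

	/* One decision point, after all ACK processing is done. */
	if (flag & FLAG_SET_XMIT_TIMER)
		set_xmit_timer();
}

int main(void)
{
	handle_ack(true, false);  /* prints "armed TLP" exactly once */
	return 0;
}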

net/ipv4/tcp_output.c

Lines changed: 5 additions & 18 deletions
@@ -2377,24 +2377,15 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 timeout, tlp_time_stamp, rto_time_stamp;
 	u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
+	u32 timeout, rto_delta_us;
 
-	/* No consecutive loss probes. */
-	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
-		tcp_rearm_rto(sk);
-		return false;
-	}
 	/* Don't do any loss probe on a Fast Open connection before 3WHS
 	 * finishes.
 	 */
 	if (tp->fastopen_rsk)
 		return false;
 
-	/* TLP is only scheduled when next timer event is RTO. */
-	if (icsk->icsk_pending != ICSK_TIME_RETRANS)
-		return false;
-
 	/* Schedule a loss probe in 2*RTT for SACK capable connections
 	 * in Open state, that are either limited by cwnd or application.
 	 */
@@ -2417,14 +2408,10 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 		   (rtt + (rtt >> 1) + TCP_DELACK_MAX));
 	timeout = max_t(u32, timeout, msecs_to_jiffies(10));
 
-	/* If RTO is shorter, just schedule TLP in its place. */
-	tlp_time_stamp = tcp_jiffies32 + timeout;
-	rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
-	if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
-		s32 delta = rto_time_stamp - tcp_jiffies32;
-		if (delta > 0)
-			timeout = delta;
-	}
+	/* If the RTO formula yields an earlier time, then use that time. */
+	rto_delta_us = tcp_rto_delta_us(sk);  /* How far in future is RTO? */
+	if (rto_delta_us > 0)
+		timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
 
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
 				  TCP_RTO_MAX);
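
The new clamp is simple arithmetic: the loss-probe timer never fires later than the pending RTO would. Here is a worked userspace example of that clamping; the HZ value, mock conversion helper, and sample numbers are assumptions for illustration:

#include <stdio.h>
#include <stdint.h>

#define HZ_MOCK 1000  /* assume 1000 jiffies per second for the example */

static uint32_t usecs_to_jiffies_mock(uint64_t us)
{
	return (uint32_t)(us * HZ_MOCK / 1000000);
}

int main(void)
{
	/* Hypothetical inputs: TLP formula says 90 ms, RTO expires in 60 ms. */
	uint32_t tlp_timeout_jiffies = 90;     /* from the 2*RTT formula    */
	int64_t  rto_delta_us        = 60000;  /* tcp_rto_delta_us() result */

	/* Mirrors the patched logic: take the earlier of the two deadlines. */
	if (rto_delta_us > 0) {
		uint32_t rto_jiffies = usecs_to_jiffies_mock(rto_delta_us);
		if (rto_jiffies < tlp_timeout_jiffies)
			tlp_timeout_jiffies = rto_jiffies;
	}

	/* Prints 60: the probe is scheduled where the RTO would have fired. */
	printf("timer set for %u jiffies\n", tlp_timeout_jiffies);
	return 0;
}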
