Skip to content

Commit e33099f

Browse files
yuchungcheng authored and davem330 committed
tcp: implement RFC5682 F-RTO
This patch implements F-RTO (forward RTO recovery): When the first retransmission after timeout is acknowledged, F-RTO sends new data instead of old data. If the next ACK acknowledges some never-retransmitted data, then the timeout was spurious and the congestion state is reverted. Otherwise if the next ACK selectively acknowledges the new data, then the timeout was genuine and the loss recovery continues. This idea applies to recurring timeouts as well. While F-RTO sends different data during timeout recovery, it does not (and should not) change the congestion control. The implementation follows the three steps of the SACK-enhanced algorithm (section 3) in RFC5682. Step 1 is in tcp_enter_loss(). Steps 2 and 3 are in tcp_process_loss(). The basic version is not supported because the SACK-enhanced version also works for non-SACK connections. The new implementation is functionally in parity with the old F-RTO implementation except the one case where it increases undo events: In addition to the RFC algorithm, a spurious timeout may be detected without sending data in step 2, as long as the SACK confirms not all the original data are dropped. When this happens, the sender will undo the cwnd and perhaps enter fast recovery instead. This additional check increases the F-RTO undo events by 5x compared to the prior implementation on Google Web servers, since the sender often does not have new data to send for HTTP. Note F-RTO may detect spurious timeout before Eifel with timestamps does so. Signed-off-by: Yuchung Cheng <ycheng@google.com> Acked-by: Eric Dumazet <edumazet@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent ab42d9e commit e33099f

File tree

3 files changed

+68
-26
lines changed

3 files changed

+68
-26
lines changed

Documentation/networking/ip-sysctl.txt

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -225,19 +225,13 @@ tcp_fin_timeout - INTEGER
225225
Default: 60 seconds
226226

227227
tcp_frto - INTEGER
228-
Enables Forward RTO-Recovery (F-RTO) defined in RFC4138.
228+
Enables Forward RTO-Recovery (F-RTO) defined in RFC5682.
229229
F-RTO is an enhanced recovery algorithm for TCP retransmission
230-
timeouts. It is particularly beneficial in wireless environments
231-
where packet loss is typically due to random radio interference
232-
rather than intermediate router congestion. F-RTO is sender-side
233-
only modification. Therefore it does not require any support from
234-
the peer.
235-
236-
If set to 1, basic version is enabled. 2 enables SACK enhanced
237-
F-RTO if flow uses SACK. The basic version can be used also when
238-
SACK is in use though scenario(s) with it exists where F-RTO
239-
interacts badly with the packet counting of the SACK enabled TCP
240-
flow.
230+
timeouts. It is particularly beneficial in networks where the
231+
RTT fluctuates (e.g., wireless). F-RTO is a sender-side only
232+
modification. It does not require any support from the peer.
233+
234+
By default it's enabled with a non-zero value. 0 disables F-RTO.
241235

242236
tcp_keepalive_time - INTEGER
243237
How often TCP sends out keepalive messages when keepalive is enabled.

include/linux/tcp.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,8 @@ struct tcp_sock {
192192
u8 nonagle : 4,/* Disable Nagle algorithm? */
193193
thin_lto : 1,/* Use linear timeouts for thin streams */
194194
thin_dupack : 1,/* Fast retransmit on first dupack */
195-
repair : 1;
195+
repair : 1,
196+
frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */
196197
u8 repair_queue;
197198
u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */
198199
syn_data:1, /* SYN includes data */

net/ipv4/tcp_input.c

Lines changed: 60 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ int sysctl_tcp_early_retrans __read_mostly = 3;
107107
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
108108
#define FLAG_ECE 0x40 /* ECE in this ACK */
109109
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
110+
#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
110111
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
111112
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
112113
#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
@@ -1155,6 +1156,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
11551156
tcp_highest_sack_seq(tp)))
11561157
state->reord = min(fack_count,
11571158
state->reord);
1159+
if (!after(end_seq, tp->high_seq))
1160+
state->flag |= FLAG_ORIG_SACK_ACKED;
11581161
}
11591162

11601163
if (sacked & TCPCB_LOST) {
@@ -1835,10 +1838,13 @@ void tcp_enter_loss(struct sock *sk, int how)
18351838
const struct inet_connection_sock *icsk = inet_csk(sk);
18361839
struct tcp_sock *tp = tcp_sk(sk);
18371840
struct sk_buff *skb;
1841+
bool new_recovery = false;
18381842

18391843
/* Reduce ssthresh if it has not yet been made inside this window. */
1840-
if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
1844+
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
1845+
!after(tp->high_seq, tp->snd_una) ||
18411846
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
1847+
new_recovery = true;
18421848
tp->prior_ssthresh = tcp_current_ssthresh(sk);
18431849
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
18441850
tcp_ca_event(sk, CA_EVENT_LOSS);
@@ -1883,6 +1889,14 @@ void tcp_enter_loss(struct sock *sk, int how)
18831889
tcp_set_ca_state(sk, TCP_CA_Loss);
18841890
tp->high_seq = tp->snd_nxt;
18851891
TCP_ECN_queue_cwr(tp);
1892+
1893+
/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
1894+
* loss recovery is underway except recurring timeout(s) on
1895+
* the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
1896+
*/
1897+
tp->frto = sysctl_tcp_frto &&
1898+
(new_recovery || icsk->icsk_retransmits) &&
1899+
!inet_csk(sk)->icsk_mtup.probe_size;
18861900
}
18871901

18881902
/* If ACK arrived pointing to a remembered SACK, it means that our
@@ -2426,12 +2440,12 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
24262440
return failed;
24272441
}
24282442

2429-
/* Undo during loss recovery after partial ACK. */
2430-
static bool tcp_try_undo_loss(struct sock *sk)
2443+
/* Undo during loss recovery after partial ACK or using F-RTO. */
2444+
static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
24312445
{
24322446
struct tcp_sock *tp = tcp_sk(sk);
24332447

2434-
if (tcp_may_undo(tp)) {
2448+
if (frto_undo || tcp_may_undo(tp)) {
24352449
struct sk_buff *skb;
24362450
tcp_for_write_queue(skb, sk) {
24372451
if (skb == tcp_send_head(sk))
@@ -2445,9 +2459,12 @@ static bool tcp_try_undo_loss(struct sock *sk)
24452459
tp->lost_out = 0;
24462460
tcp_undo_cwr(sk, true);
24472461
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
2462+
if (frto_undo)
2463+
NET_INC_STATS_BH(sock_net(sk),
2464+
LINUX_MIB_TCPSPURIOUSRTOS);
24482465
inet_csk(sk)->icsk_retransmits = 0;
24492466
tp->undo_marker = 0;
2450-
if (tcp_is_sack(tp))
2467+
if (frto_undo || tcp_is_sack(tp))
24512468
tcp_set_ca_state(sk, TCP_CA_Open);
24522469
return true;
24532470
}
@@ -2667,24 +2684,52 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
26672684
/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
26682685
* recovered or spurious. Otherwise retransmits more on partial ACKs.
26692686
*/
2670-
static void tcp_process_loss(struct sock *sk, int flag)
2687+
static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
26712688
{
26722689
struct inet_connection_sock *icsk = inet_csk(sk);
26732690
struct tcp_sock *tp = tcp_sk(sk);
2691+
bool recovered = !before(tp->snd_una, tp->high_seq);
26742692

2675-
if (!before(tp->snd_una, tp->high_seq)) {
2693+
if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
2694+
if (flag & FLAG_ORIG_SACK_ACKED) {
2695+
/* Step 3.b. A timeout is spurious if not all data are
2696+
* lost, i.e., never-retransmitted data are (s)acked.
2697+
*/
2698+
tcp_try_undo_loss(sk, true);
2699+
return;
2700+
}
2701+
if (after(tp->snd_nxt, tp->high_seq) &&
2702+
(flag & FLAG_DATA_SACKED || is_dupack)) {
2703+
tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
2704+
} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
2705+
tp->high_seq = tp->snd_nxt;
2706+
__tcp_push_pending_frames(sk, tcp_current_mss(sk),
2707+
TCP_NAGLE_OFF);
2708+
if (after(tp->snd_nxt, tp->high_seq))
2709+
return; /* Step 2.b */
2710+
tp->frto = 0;
2711+
}
2712+
}
2713+
2714+
if (recovered) {
2715+
/* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
26762716
icsk->icsk_retransmits = 0;
26772717
tcp_try_undo_recovery(sk);
26782718
return;
26792719
}
2680-
26812720
if (flag & FLAG_DATA_ACKED)
26822721
icsk->icsk_retransmits = 0;
2683-
if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED)
2684-
tcp_reset_reno_sack(tp);
2685-
if (tcp_try_undo_loss(sk))
2722+
if (tcp_is_reno(tp)) {
2723+
/* A Reno DUPACK means new data in F-RTO step 2.b above are
2724+
* delivered. Lower inflight to clock out (re)transmissions.
2725+
*/
2726+
if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
2727+
tcp_add_reno_sack(sk);
2728+
else if (flag & FLAG_SND_UNA_ADVANCED)
2729+
tcp_reset_reno_sack(tp);
2730+
}
2731+
if (tcp_try_undo_loss(sk, false))
26862732
return;
2687-
tcp_moderate_cwnd(tp);
26882733
tcp_xmit_retransmit_queue(sk);
26892734
}
26902735

@@ -2764,7 +2809,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
27642809
newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
27652810
break;
27662811
case TCP_CA_Loss:
2767-
tcp_process_loss(sk, flag);
2812+
tcp_process_loss(sk, flag, is_dupack);
27682813
if (icsk->icsk_ca_state != TCP_CA_Open)
27692814
return;
27702815
/* Fall through to processing in Open state. */
@@ -3003,6 +3048,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
30033048
}
30043049
if (!(sacked & TCPCB_SACKED_ACKED))
30053050
reord = min(pkts_acked, reord);
3051+
if (!after(scb->end_seq, tp->high_seq))
3052+
flag |= FLAG_ORIG_SACK_ACKED;
30063053
}
30073054

30083055
if (sacked & TCPCB_SACKED_ACKED)

0 commit comments

Comments
 (0)