
Commit ab408b6

edumazet authored and davem330 committed
tcp: switch tcp and sch_fq to new earliest departure time model
TCP keeps track of tcp_wstamp_ns by itself, meaning sch_fq no longer has to do it.

Thanks to this model, TCP can get more accurate RTT samples, since pacing no longer inflates them.

This has the nice effect of removing some delays caused by the FQ quantum mechanism, which inflated max/P99 latencies.

Also, we might relax the tight TCP Small Queues limits in the future, since this new model allows TCP to build bigger batches: sch_fq (or a device with earliest departure time offload) ensures these packets will be delivered on time.

Note that other protocols are not converted (they probably never will be), so sch_fq still has support for SO_MAX_PACING_RATE.

Tested: a test showing the FQ pacing quantum artifact for low-rate flows, which adds unexpected throttles for RPC flows and inflates max and P99 latencies. The parameters chosen here show what typically happens when a TCP flow has a reduced pacing rate (this can be caused by a reduced cwnd after a few losses, and/or an RTT above a few ms).

MIBS="MIN_LATENCY,MEAN_LATENCY,MAX_LATENCY,P99_LATENCY,STDDEV_LATENCY"

Before:

$ netperf -H 10.246.7.133 -t TCP_RR -Cc -T6,6 -- -q 2000000 -r 100,100 -o $MIBS
MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.246.7.133 () port 0 AF_INET : first burst 0 : cpu bind
Minimum Latency Microseconds,Mean Latency Microseconds,Maximum Latency Microseconds,99th Percentile Latency Microseconds,Stddev Latency Microseconds
19,82.78,5279,3825,482.02

After:

$ netperf -H 10.246.7.133 -t TCP_RR -Cc -T6,6 -- -q 2000000 -r 100,100 -o $MIBS
MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.246.7.133 () port 0 AF_INET : first burst 0 : cpu bind
Minimum Latency Microseconds,Mean Latency Microseconds,Maximum Latency Microseconds,99th Percentile Latency Microseconds,Stddev Latency Microseconds
20,49.94,128,63,3.18

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
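In this model, the sender computes a per-packet earliest departure time and stamps it on the packet; the pacer (sch_fq or a NIC with EDT offload) only has to hold each packet until its stamp. A minimal userspace sketch of that bookkeeping follows; the names and the standalone struct are illustrative, not kernel code (the real state lives in tp->tcp_wstamp_ns and skb->tstamp, as the diffs below show).

/* Illustrative sketch of the EDT bookkeeping described above; not kernel code. */
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

struct edt_flow {
	uint64_t next_departure_ns;	/* plays the role of tp->tcp_wstamp_ns */
};

/* Return the departure time to stamp on a packet of skb_len bytes, given a
 * pacing rate in bytes per second, and advance the flow clock accordingly.
 */
static uint64_t edt_stamp_packet(struct edt_flow *f, uint64_t now_ns,
				 uint32_t skb_len, uint64_t rate_bps)
{
	uint64_t tstamp;

	if (f->next_departure_ns < now_ns)
		f->next_departure_ns = now_ns;	/* flow was idle, restart from now */
	tstamp = f->next_departure_ns;
	f->next_departure_ns += (uint64_t)skb_len * NSEC_PER_SEC / rate_bps;
	return tstamp;
}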

File tree

3 files changed: +33 −17 lines

net/ipv4/tcp_bbr.c
net/ipv4/tcp_output.c
net/sched/sch_fq.c

net/ipv4/tcp_bbr.c

Lines changed: 4 additions & 3 deletions

@@ -128,6 +128,9 @@ static const u32 bbr_probe_rtt_mode_ms = 200;
 /* Skip TSO below the following bandwidth (bits/sec): */
 static const int bbr_min_tso_rate = 1200000;
 
+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */
+static const int bbr_pacing_marging_percent = 1;
+
 /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
  * that will allow a smoothly increasing pacing rate that will double each RTT
  * and send the same number of packets per RTT that an un-paced, slow-starting
@@ -208,12 +211,10 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
 {
 	unsigned int mss = tcp_sk(sk)->mss_cache;
 
-	if (!tcp_needs_internal_pacing(sk))
-		mss = tcp_mss_to_mtu(sk, mss);
 	rate *= mss;
 	rate *= gain;
 	rate >>= BBR_SCALE;
-	rate *= USEC_PER_SEC;
+	rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_marging_percent);
 	return rate >> BW_SCALE;
 }
 
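The second hunk folds a roughly 1% pacing margin into the bytes-per-second conversion rather than applying it as a separate step. A small standalone sketch of that arithmetic, with illustrative names (only the 1% margin and the folding into the USEC_PER_SEC factor come from the hunk above):

/* Illustrative only: convert an intermediate rate value to bytes/sec while
 * shaving off a small margin, by folding (100 - margin) into USEC_PER_SEC
 * so that no extra division is needed.
 */
#include <stdint.h>

#define USEC_PER_SEC 1000000ULL

static const int pacing_margin_percent = 1;	/* mirrors bbr_pacing_marging_percent */

/* rate: the value right after "rate >>= BBR_SCALE" in the hunk above */
static uint64_t rate_with_margin(uint64_t rate)
{
	/* USEC_PER_SEC / 100 * (100 - 1) == 990000, i.e. 99% of USEC_PER_SEC */
	return rate * (USEC_PER_SEC / 100 * (100 - pacing_margin_percent));
}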

net/ipv4/tcp_output.c

Lines changed: 18 additions & 4 deletions

@@ -1012,9 +1012,23 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
 	sock_hold(sk);
 }
 
-static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
+static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
+	if (sk->sk_pacing_status != SK_PACING_NONE) {
+		u32 rate = sk->sk_pacing_rate;
+
+		/* Original sch_fq does not pace first 10 MSS
+		 * Note that tp->data_segs_out overflows after 2^32 packets,
+		 * this is a minor annoyance.
+		 */
+		if (rate != ~0U && rate && tp->data_segs_out >= 10) {
+			tp->tcp_wstamp_ns += div_u64((u64)skb->len * NSEC_PER_SEC, rate);
+			/* TODO: update internal pacing here */
+		}
+	}
 	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
 }
 
@@ -1178,7 +1192,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		err = net_xmit_eval(err);
 	}
 	if (!err && oskb) {
-		tcp_update_skb_after_send(tp, oskb);
+		tcp_update_skb_after_send(sk, oskb);
 		tcp_rate_skb_sent(sk, oskb);
 	}
 	return err;
@@ -2327,7 +2341,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 
 		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
 			/* "skb_mstamp" is used as a start point for the retransmit timer */
-			tcp_update_skb_after_send(tp, skb);
+			tcp_update_skb_after_send(sk, skb);
 			goto repair; /* Skip network transmission */
 		}
 
@@ -2902,7 +2916,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		} tcp_skb_tsorted_restore(skb);
 
 		if (!err) {
-			tcp_update_skb_after_send(tp, skb);
+			tcp_update_skb_after_send(sk, skb);
 			tcp_rate_skb_sent(sk, skb);
 		}
 	} else {
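To get a feel for the per-packet advance applied to tp->tcp_wstamp_ns above (skb->len * NSEC_PER_SEC / rate), here is a hypothetical userspace calculation; the packet lengths and pacing rates are made-up examples, not values from the commit:

/* Hypothetical check of the advance delta_ns = len * NSEC_PER_SEC / rate. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	static const struct { uint32_t len; uint64_t rate; } ex[] = {
		{ 1448,   1250000 },	/* full segment at 10 Mbit/s */
		{  100,   1250000 },	/* small RPC at 10 Mbit/s    */
		{ 1448, 125000000 },	/* full segment at 1 Gbit/s  */
	};

	for (unsigned int i = 0; i < sizeof(ex) / sizeof(ex[0]); i++) {
		uint64_t delta = (uint64_t)ex[i].len * NSEC_PER_SEC / ex[i].rate;

		printf("len=%u rate=%llu B/s -> advance %llu ns\n",
		       (unsigned int)ex[i].len,
		       (unsigned long long)ex[i].rate,
		       (unsigned long long)delta);
	}
	return 0;
}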

net/sched/sch_fq.c

Lines changed: 11 additions & 10 deletions

@@ -491,11 +491,16 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 	}
 
 	skb = f->head;
-	if (unlikely(skb && now < f->time_next_packet &&
-		     !skb_is_tcp_pure_ack(skb))) {
-		head->first = f->next;
-		fq_flow_set_throttled(q, f);
-		goto begin;
+	if (skb && !skb_is_tcp_pure_ack(skb)) {
+		u64 time_next_packet = max_t(u64, ktime_to_ns(skb->tstamp),
+					     f->time_next_packet);
+
+		if (now < time_next_packet) {
+			head->first = f->next;
+			f->time_next_packet = time_next_packet;
+			fq_flow_set_throttled(q, f);
+			goto begin;
+		}
 	}
 
 	skb = fq_dequeue_head(sch, f);
@@ -513,11 +518,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 	prefetch(&skb->end);
 	f->credit -= qdisc_pkt_len(skb);
 
-	if (!q->rate_enable)
-		goto out;
-
-	/* Do not pace locally generated ack packets */
-	if (skb_is_tcp_pure_ack(skb))
+	if (ktime_to_ns(skb->tstamp) || !q->rate_enable)
 		goto out;
 
 	rate = q->flow_max_rate;
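In short, the dequeue path now takes the later of the EDT carried in skb->tstamp and fq's own per-flow schedule, and a packet that carries a timestamp skips fq's internal rate computation entirely. A condensed sketch of that decision with simplified, illustrative types (only the max() combination, the throttling, and the pure-ACK exemption come from the hunks above):

/* Illustrative only: how an earliest departure time in skb->tstamp is
 * honored at dequeue. Types and names are simplified stand-ins.
 */
#include <stdbool.h>
#include <stdint.h>

struct fake_skb {
	uint64_t tstamp_ns;	/* EDT set by the sender; 0 if none */
	bool	 pure_ack;
};

struct fake_flow {
	uint64_t time_next_packet;	/* fq's own pacing schedule */
};

/* Return true if the packet must stay queued until its departure time. */
static bool edt_throttled(const struct fake_skb *skb, struct fake_flow *f,
			  uint64_t now_ns)
{
	uint64_t t;

	if (skb->pure_ack)
		return false;	/* ACKs are never delayed */

	t = skb->tstamp_ns > f->time_next_packet ? skb->tstamp_ns
						 : f->time_next_packet;
	if (now_ns < t) {
		f->time_next_packet = t;	/* remember the combined deadline */
		return true;
	}
	return false;
}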
