Skip to content

Commit 6d5274e

Browse files
committed
Merge branch 'tcp-sender-chronographs'
Yuchung Cheng says: ==================== tcp: sender chronographs instrumentation This patch set provides instrumentation on TCP sender limitations. While developing the BBR congestion control, we noticed that the TCP sending process is often limited by factors unrelated to congestion control: insufficient sender buffer and/or insufficient receive window/buffer to saturate the network bandwidth. Unfortunately these limits are not visible to the users and often the poor performance is attributed to the congestion control of choice. This patch set aims to help users get a high-level understanding of where the sending process is limited, similar to the TCP_INFO design. It is not meant to replace detailed kernel tracing and instrumentation facilities. In addition, this patch set provides a new option to the timestamping work to instrument these limits on an application data unit. For example, one can use SO_TIMESTAMPING and this patch set to measure how long a particular HTTP response is limited by a small receive window. The patch set was initially written by Francis Yan then polished by Yuchung Cheng, with lots of help from Eric Dumazet and Soheil Hassas Yeganeh. ==================== Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents a090994 + 1c88580 commit 6d5274e

File tree

23 files changed

+217
-13
lines changed

23 files changed

+217
-13
lines changed

Documentation/networking/timestamping.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,16 @@ SOF_TIMESTAMPING_OPT_TSONLY:
182182
the timestamp even if sysctl net.core.tstamp_allow_data is 0.
183183
This option disables SOF_TIMESTAMPING_OPT_CMSG.
184184

185+
SOF_TIMESTAMPING_OPT_STATS:
186+
187+
Optional stats that are obtained along with the transmit timestamps.
188+
It must be used together with SOF_TIMESTAMPING_OPT_TSONLY. When the
189+
transmit timestamp is available, the stats are available in a
190+
separate control message of type SCM_TIMESTAMPING_OPT_STATS, as a
191+
list of TLVs (struct nlattr) of types. These stats allow the
192+
application to associate various transport layer stats with
193+
the transmit timestamps, such as how long a certain block of
194+
data was limited by peer's receiver window.
185195

186196
New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to
187197
disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate

arch/alpha/include/uapi/asm/socket.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,4 +97,6 @@
9797

9898
#define SO_CNX_ADVICE 53
9999

100+
#define SCM_TIMESTAMPING_OPT_STATS 54
101+
100102
#endif /* _UAPI_ASM_SOCKET_H */

arch/frv/include/uapi/asm/socket.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,5 +90,7 @@
9090

9191
#define SO_CNX_ADVICE 53
9292

93+
#define SCM_TIMESTAMPING_OPT_STATS 54
94+
9395
#endif /* _ASM_SOCKET_H */
9496

arch/ia64/include/uapi/asm/socket.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,4 +99,6 @@
9999

100100
#define SO_CNX_ADVICE 53
101101

102+
#define SCM_TIMESTAMPING_OPT_STATS 54
103+
102104
#endif /* _ASM_IA64_SOCKET_H */

arch/m32r/include/uapi/asm/socket.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,4 +90,6 @@
9090

9191
#define SO_CNX_ADVICE 53
9292

93+
#define SCM_TIMESTAMPING_OPT_STATS 54
94+
9395
#endif /* _ASM_M32R_SOCKET_H */

arch/mips/include/uapi/asm/socket.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,4 +108,6 @@
108108

109109
#define SO_CNX_ADVICE 53
110110

111+
#define SCM_TIMESTAMPING_OPT_STATS 54
112+
111113
#endif /* _UAPI_ASM_SOCKET_H */

arch/mn10300/include/uapi/asm/socket.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,4 +90,6 @@
9090

9191
#define SO_CNX_ADVICE 53
9292

93+
#define SCM_TIMESTAMPING_OPT_STATS 54
94+
9395
#endif /* _ASM_SOCKET_H */

arch/parisc/include/uapi/asm/socket.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,4 +89,6 @@
8989

9090
#define SO_CNX_ADVICE 0x402E
9191

92+
#define SCM_TIMESTAMPING_OPT_STATS 0x402F
93+
9294
#endif /* _UAPI_ASM_SOCKET_H */

arch/powerpc/include/uapi/asm/socket.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,4 +97,6 @@
9797

9898
#define SO_CNX_ADVICE 53
9999

100+
#define SCM_TIMESTAMPING_OPT_STATS 54
101+
100102
#endif /* _ASM_POWERPC_SOCKET_H */

arch/s390/include/uapi/asm/socket.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,4 +96,6 @@
9696

9797
#define SO_CNX_ADVICE 53
9898

99+
#define SCM_TIMESTAMPING_OPT_STATS 54
100+
99101
#endif /* _ASM_SOCKET_H */

arch/sparc/include/uapi/asm/socket.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@
8686

8787
#define SO_CNX_ADVICE 0x0037
8888

89+
#define SCM_TIMESTAMPING_OPT_STATS 0x0038
90+
8991
/* Security levels - as per NRL IPv6 - don't actually do anything */
9092
#define SO_SECURITY_AUTHENTICATION 0x5001
9193
#define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002

arch/xtensa/include/uapi/asm/socket.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,4 +101,6 @@
101101

102102
#define SO_CNX_ADVICE 53
103103

104+
#define SCM_TIMESTAMPING_OPT_STATS 54
105+
104106
#endif /* _XTENSA_SOCKET_H */

include/linux/tcp.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -211,8 +211,11 @@ struct tcp_sock {
211211
u8 reord; /* reordering detected */
212212
} rack;
213213
u16 advmss; /* Advertised MSS */
214-
u8 rate_app_limited:1, /* rate_{delivered,interval_us} limited? */
215-
unused:7;
214+
u32 chrono_start; /* Start time in jiffies of a TCP chrono */
215+
u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
216+
u8 chrono_type:2, /* current chronograph type */
217+
rate_app_limited:1, /* rate_{delivered,interval_us} limited? */
218+
unused:5;
216219
u8 nonagle : 4,/* Disable Nagle algorithm? */
217220
thin_lto : 1,/* Use linear timeouts for thin streams */
218221
thin_dupack : 1,/* Fast retransmit on first dupack */
@@ -425,4 +428,6 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp)
425428
tp->saved_syn = NULL;
426429
}
427430

431+
struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk);
432+
428433
#endif /* _LINUX_TCP_H */

include/net/tcp.h

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1516,11 +1516,26 @@ struct tcp_fastopen_context {
15161516
struct rcu_head rcu;
15171517
};
15181518

1519+
/* Latencies incurred by various limits for a sender. They are
1520+
* chronograph-like stats that are mutually exclusive.
1521+
*/
1522+
enum tcp_chrono {
1523+
TCP_CHRONO_UNSPEC,
1524+
TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */
1525+
TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */
1526+
TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */
1527+
__TCP_CHRONO_MAX,
1528+
};
1529+
1530+
void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
1531+
void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);
1532+
15191533
/* write queue abstraction */
15201534
static inline void tcp_write_queue_purge(struct sock *sk)
15211535
{
15221536
struct sk_buff *skb;
15231537

1538+
tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
15241539
while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
15251540
sk_wmem_free_skb(sk, skb);
15261541
sk_mem_reclaim(sk);
@@ -1579,8 +1594,10 @@ static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *
15791594

15801595
static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
15811596
{
1582-
if (sk->sk_send_head == skb_unlinked)
1597+
if (sk->sk_send_head == skb_unlinked) {
15831598
sk->sk_send_head = NULL;
1599+
tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
1600+
}
15841601
if (tcp_sk(sk)->highest_sack == skb_unlinked)
15851602
tcp_sk(sk)->highest_sack = NULL;
15861603
}
@@ -1602,6 +1619,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb
16021619
/* Queue it, remembering where we must start sending. */
16031620
if (sk->sk_send_head == NULL) {
16041621
sk->sk_send_head = skb;
1622+
tcp_chrono_start(sk, TCP_CHRONO_BUSY);
16051623

16061624
if (tcp_sk(sk)->highest_sack == NULL)
16071625
tcp_sk(sk)->highest_sack = skb;

include/uapi/asm-generic/socket.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,4 +92,6 @@
9292

9393
#define SO_CNX_ADVICE 53
9494

95+
#define SCM_TIMESTAMPING_OPT_STATS 54
96+
9597
#endif /* __ASM_GENERIC_SOCKET_H */

include/uapi/linux/net_tstamp.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,9 @@ enum {
2525
SOF_TIMESTAMPING_TX_ACK = (1<<9),
2626
SOF_TIMESTAMPING_OPT_CMSG = (1<<10),
2727
SOF_TIMESTAMPING_OPT_TSONLY = (1<<11),
28+
SOF_TIMESTAMPING_OPT_STATS = (1<<12),
2829

29-
SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TSONLY,
30+
SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_STATS,
3031
SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) |
3132
SOF_TIMESTAMPING_LAST
3233
};

include/uapi/linux/tcp.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,18 @@ struct tcp_info {
214214
__u32 tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */
215215

216216
__u64 tcpi_delivery_rate;
217+
218+
__u64 tcpi_busy_time; /* Time (usec) busy sending data */
219+
__u64 tcpi_rwnd_limited; /* Time (usec) limited by receive window */
220+
__u64 tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
221+
};
222+
223+
/* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
224+
enum {
225+
TCP_NLA_PAD,
226+
TCP_NLA_BUSY, /* Time (usec) busy sending data */
227+
TCP_NLA_RWND_LIMITED, /* Time (usec) limited by receive window */
228+
TCP_NLA_SNDBUF_LIMITED, /* Time (usec) limited by send buffer */
217229
};
218230

219231
/* for TCP_MD5SIG socket option */

net/core/skbuff.c

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3839,10 +3839,18 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
38393839
if (!skb_may_tx_timestamp(sk, tsonly))
38403840
return;
38413841

3842-
if (tsonly)
3843-
skb = alloc_skb(0, GFP_ATOMIC);
3844-
else
3842+
if (tsonly) {
3843+
#ifdef CONFIG_INET
3844+
if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
3845+
sk->sk_protocol == IPPROTO_TCP &&
3846+
sk->sk_type == SOCK_STREAM)
3847+
skb = tcp_get_timestamping_opt_stats(sk);
3848+
else
3849+
#endif
3850+
skb = alloc_skb(0, GFP_ATOMIC);
3851+
} else {
38453852
skb = skb_clone(orig_skb, GFP_ATOMIC);
3853+
}
38463854
if (!skb)
38473855
return;
38483856

net/core/sock.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -854,6 +854,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
854854
sk->sk_tskey = 0;
855855
}
856856
}
857+
858+
if (val & SOF_TIMESTAMPING_OPT_STATS &&
859+
!(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
860+
ret = -EINVAL;
861+
break;
862+
}
863+
857864
sk->sk_tsflags = val;
858865
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
859866
sock_enable_timestamp(sk,

net/ipv4/tcp.c

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -996,8 +996,11 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
996996
goto out;
997997
out_err:
998998
/* make sure we wake any epoll edge trigger waiter */
999-
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
999+
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
1000+
err == -EAGAIN)) {
10001001
sk->sk_write_space(sk);
1002+
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1003+
}
10011004
return sk_stream_error(sk, flags, err);
10021005
}
10031006

@@ -1331,8 +1334,11 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
13311334
out_err:
13321335
err = sk_stream_error(sk, flags, err);
13331336
/* make sure we wake any epoll edge trigger waiter */
1334-
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
1337+
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
1338+
err == -EAGAIN)) {
13351339
sk->sk_write_space(sk);
1340+
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1341+
}
13361342
release_sock(sk);
13371343
return err;
13381344
}
@@ -2702,6 +2708,25 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
27022708
EXPORT_SYMBOL(compat_tcp_setsockopt);
27032709
#endif
27042710

2711+
static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
2712+
struct tcp_info *info)
2713+
{
2714+
u64 stats[__TCP_CHRONO_MAX], total = 0;
2715+
enum tcp_chrono i;
2716+
2717+
for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
2718+
stats[i] = tp->chrono_stat[i - 1];
2719+
if (i == tp->chrono_type)
2720+
stats[i] += tcp_time_stamp - tp->chrono_start;
2721+
stats[i] *= USEC_PER_SEC / HZ;
2722+
total += stats[i];
2723+
}
2724+
2725+
info->tcpi_busy_time = total;
2726+
info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
2727+
info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
2728+
}
2729+
27052730
/* Return information about state of tcp endpoint in API format. */
27062731
void tcp_get_info(struct sock *sk, struct tcp_info *info)
27072732
{
@@ -2794,6 +2819,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
27942819
info->tcpi_bytes_acked = tp->bytes_acked;
27952820
info->tcpi_bytes_received = tp->bytes_received;
27962821
info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
2822+
tcp_get_info_chrono_stats(tp, info);
27972823

27982824
unlock_sock_fast(sk, slow);
27992825

@@ -2815,6 +2841,26 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
28152841
}
28162842
EXPORT_SYMBOL_GPL(tcp_get_info);
28172843

2844+
struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
2845+
{
2846+
const struct tcp_sock *tp = tcp_sk(sk);
2847+
struct sk_buff *stats;
2848+
struct tcp_info info;
2849+
2850+
stats = alloc_skb(3 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
2851+
if (!stats)
2852+
return NULL;
2853+
2854+
tcp_get_info_chrono_stats(tp, &info);
2855+
nla_put_u64_64bit(stats, TCP_NLA_BUSY,
2856+
info.tcpi_busy_time, TCP_NLA_PAD);
2857+
nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
2858+
info.tcpi_rwnd_limited, TCP_NLA_PAD);
2859+
nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
2860+
info.tcpi_sndbuf_limited, TCP_NLA_PAD);
2861+
return stats;
2862+
}
2863+
28182864
static int do_tcp_getsockopt(struct sock *sk, int level,
28192865
int optname, char __user *optval, int __user *optlen)
28202866
{

net/ipv4/tcp_input.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3178,6 +3178,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
31783178
tp->lost_skb_hint = NULL;
31793179
}
31803180

3181+
if (!skb)
3182+
tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
3183+
31813184
if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
31823185
tp->snd_up = tp->snd_una;
31833186

@@ -5056,8 +5059,11 @@ static void tcp_check_space(struct sock *sk)
50565059
/* pairs with tcp_poll() */
50575060
smp_mb__after_atomic();
50585061
if (sk->sk_socket &&
5059-
test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
5062+
test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
50605063
tcp_new_space(sk);
5064+
if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
5065+
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
5066+
}
50615067
}
50625068
}
50635069

0 commit comments

Comments
 (0)