Commit a74f0fa

edumazet authored and davem330 committed
tcp: reduce POLLOUT events caused by TCP_NOTSENT_LOWAT
The TCP_NOTSENT_LOWAT socket option (and sysctl) was added in linux-3.12 as a step
toward enabling bigger TCP sndbuf limits.

It works reasonably well, but once the limit is reached, the TCP stack generates an
[E]POLLOUT event for every incoming ACK packet, which causes a high number of
context switches.

This patch implements the strategy David Miller added in sock_def_write_space():

 - If a TCP socket has a notsent_lowat constraint of X bytes, allow sendmsg()
   to fill up to X bytes, but send [E]POLLOUT only if the number of notsent
   bytes is below X/2.

This considerably reduces TCP_NOTSENT_LOWAT overhead while still keeping the
pipe full.

Tested: 100 ms RTT netem testbed between A and B, 100 concurrent TCP_STREAM

A:/# cat /proc/sys/net/ipv4/tcp_wmem
4096 262144 64000000

A:/# super_netperf 100 -H B -l 1000 -- -K bbr &

A:/# grep TCP /proc/net/sockstat
TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 1364904  # This is about 54 MB of memory per flow :/

A:/# vmstat 5 5
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
 r  b   swpd      free   buff  cache   si   so   bi   bo     in     cs us sy id wa st
 0  0      0 256220672  13532 694976    0    0   10    0     28     14  0  1 99  0  0
 2  0      0 256320016  13532 698480    0    0  512    0 715901   5927  0 10 90  0  0
 0  0      0 256197232  13532 700992    0    0  735   13 771161   5849  0 11 89  0  0
 1  0      0 256233824  13532 703320    0    0  512   23 719650   6635  0 11 89  0  0
 2  0      0 256226880  13532 705780    0    0  642    4 775650   6009  0 12 88  0  0

A:/# echo 2097152 >/proc/sys/net/ipv4/tcp_notsent_lowat

A:/# grep TCP /proc/net/sockstat
TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 86411  # 3.5 MB per flow

A:/# vmstat 5 5  # check that context switches have not inflated too much
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
 r  b   swpd      free   buff  cache   si   so   bi   bo     in     cs us sy id wa st
 2  0      0 260386512  13592 662148    0    0   10    0     17     14  0  1 99  0  0
 0  0      0 260519680  13592 604184    0    0  512   13 726843  12424  0 10 90  0  0
 1  1      0 260435424  13592 598360    0    0  512   25 764645  12925  0 10 90  0  0
 1  0      0 260855392  13592 578380    0    0  512    7 722943  13624  0 11 88  0  0
 1  0      0 260445008  13592 601176    0    0  614   34 772288  14317  0 10 90  0  0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
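To see where the fewer wakeups matter, here is a minimal userspace sketch (not part of this commit) of the usual TCP_NOTSENT_LOWAT pattern: cap unsent data with the socket option, then wait for [E]POLLOUT before queueing more. The writer_loop helper, the 2 MB lowat value and the connected non-blocking socket fd are illustrative assumptions, and error handling is omitted.

/*
 * Illustrative sketch only: queue data until the unsent cap is hit,
 * then sleep in epoll until the kernel reports the socket writable.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stddef.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <unistd.h>

static void writer_loop(int fd, const char *buf, size_t len)
{
	int lowat = 2097152;		/* 2 MB, matching the test above */
	struct epoll_event ev = { .events = EPOLLOUT, .data.fd = fd };
	int epfd = epoll_create1(0);
	size_t off = 0;

	/* Cap unsent bytes for this socket. */
	setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
	epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);

	while (off < len) {
		ssize_t n = send(fd, buf + off, len - off, MSG_DONTWAIT);

		if (n > 0) {
			off += n;
			continue;
		}
		/* Unsent data hit the cap (or the socket buffer is full).
		 * With this patch, EPOLLOUT fires only once unsent bytes
		 * drop below lowat / 2, instead of on every incoming ACK.
		 */
		epoll_wait(epfd, &ev, 1, -1);
	}
	close(epfd);
}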
1 parent 4dc88ce commit a74f0fa

3 files changed (+22, -8 lines)

include/net/sock.h

Lines changed: 15 additions & 5 deletions
@@ -1110,7 +1110,7 @@ struct proto {
 	unsigned int		inuse_idx;
 #endif
 
-	bool			(*stream_memory_free)(const struct sock *sk);
+	bool			(*stream_memory_free)(const struct sock *sk, int wake);
 	bool			(*stream_memory_read)(const struct sock *sk);
 	/* Memory pressure */
 	void			(*enter_memory_pressure)(struct sock *sk);
@@ -1192,19 +1192,29 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
 #define sk_refcnt_debug_release(sk) do { } while (0)
 #endif /* SOCK_REFCNT_DEBUG */
 
-static inline bool sk_stream_memory_free(const struct sock *sk)
+static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
 {
 	if (sk->sk_wmem_queued >= sk->sk_sndbuf)
 		return false;
 
 	return sk->sk_prot->stream_memory_free ?
-		sk->sk_prot->stream_memory_free(sk) : true;
+		sk->sk_prot->stream_memory_free(sk, wake) : true;
 }
 
-static inline bool sk_stream_is_writeable(const struct sock *sk)
+static inline bool sk_stream_memory_free(const struct sock *sk)
+{
+	return __sk_stream_memory_free(sk, 0);
+}
+
+static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake)
 {
 	return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) &&
-	       sk_stream_memory_free(sk);
+	       __sk_stream_memory_free(sk, wake);
+}
+
+static inline bool sk_stream_is_writeable(const struct sock *sk)
+{
+	return __sk_stream_is_writeable(sk, 0);
 }
 
 static inline int sk_under_cgroup_hierarchy(struct sock *sk,

include/net/tcp.h

Lines changed: 6 additions & 2 deletions
@@ -1870,12 +1870,16 @@ static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
 	return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat;
 }
 
-static inline bool tcp_stream_memory_free(const struct sock *sk)
+/* @wake is one when sk_stream_write_space() calls us.
+ * This sends EPOLLOUT only if notsent_bytes is half the limit.
+ * This mimics the strategy used in sock_def_write_space().
+ */
+static inline bool tcp_stream_memory_free(const struct sock *sk, int wake)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	u32 notsent_bytes = tp->write_seq - tp->snd_nxt;
 
-	return notsent_bytes < tcp_notsent_lowat(tp);
+	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
 }
 
 #ifdef CONFIG_PROC_FS
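For illustration only, a standalone sketch (not kernel code) of what the wake shift above does: with wake = 0 (the sendmsg()/poll path) the full notsent_lowat budget may be queued, while with wake = 1 (the write-space wakeup path, see net/core/stream.c below) the socket is reported writable only once notsent bytes fall below half the limit. The 2 MB lowat and the sample byte counts are assumed values.

/* Same comparison as the patched tcp_stream_memory_free(), lifted out
 * for demonstration: shifting notsent_bytes left by @wake halves the
 * effective threshold on the wakeup path.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool stream_memory_free(uint32_t notsent_bytes, uint32_t lowat, int wake)
{
	return (notsent_bytes << wake) < lowat;
}

int main(void)
{
	uint32_t lowat = 2097152;	/* 2 MB, as in the test above */

	/* 1.5 MB unsent: sendmsg() may still queue more data (wake = 0)... */
	printf("%d\n", stream_memory_free(1500000, lowat, 0));	/* prints 1 */
	/* ...but the wakeup path (wake = 1) stays quiet above lowat/2... */
	printf("%d\n", stream_memory_free(1500000, lowat, 1));	/* prints 0 */
	/* ...and reports the socket writable once unsent bytes drop below ~1 MB. */
	printf("%d\n", stream_memory_free(1000000, lowat, 1));	/* prints 1 */
	return 0;
}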

net/core/stream.c

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ void sk_stream_write_space(struct sock *sk)
 	struct socket *sock = sk->sk_socket;
 	struct socket_wq *wq;
 
-	if (sk_stream_is_writeable(sk) && sock) {
+	if (__sk_stream_is_writeable(sk, 1) && sock) {
 		clear_bit(SOCK_NOSPACE, &sock->flags);
 
 		rcu_read_lock();
