
Commit 99bbc70

wdebruij authored and davem330 committed
rps: selective flow shedding during softnet overflow
A cpu executing the network receive path sheds packets when its input queue grows to netdev_max_backlog. A single high-rate flow (such as a spoofed source DoS) can exceed a single cpu's processing rate and will degrade throughput of other flows hashed onto the same cpu.

This patch adds a more fine-grained hashtable. If the netdev backlog is above a threshold, IRQ cpus track the ratio of total traffic of each flow (using 4096 buckets, configurable). The ratio is measured by counting the number of packets per flow over the last 256 packets from the source cpu. Any flow that occupies a large fraction of this window (set at 50%) will see packet drop while above the threshold.

Tested: Setup is a multi-threaded UDP echo server with network rx IRQ on cpu0, kernel receive (RPS) on cpu0 and application threads on cpus 2--7, each handling 20k req/s. Throughput halves when hit with a 400 kpps antagonist storm. With this patch applied, the antagonist overload is dropped and the server processes its complete load.

The patch is effective when kernel receive processing is the bottleneck. The above RPS scenario is an extreme case, but the same state is reached with RFS and sufficient kernel processing (iptables, packet socket tap, ..).

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent 4a5bddf commit 99bbc70
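The accounting described in the commit message can be illustrated with a minimal user-space sketch. Assumptions: the helper name flow_limit_hit() and the synthetic two-to-one traffic mix are invented for the example; the kernel performs this bookkeeping per cpu on skb rxhashes under the backlog lock.

/* Minimal user-space sketch of the per-cpu accounting described above.
 * flow_limit_hit() is a hypothetical name, not a kernel symbol.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FLOW_LIMIT_HISTORY	(1 << 8)	/* last 256 packets, must be ^2 */
#define NUM_BUCKETS		(1 << 12)	/* 4096 flow buckets, configurable */

static uint16_t history[FLOW_LIMIT_HISTORY];
static unsigned int history_head;
static uint8_t buckets[NUM_BUCKETS];

/* Returns true when the flow owning rxhash exceeds half the window. */
static bool flow_limit_hit(uint32_t rxhash)
{
	unsigned int new_flow = rxhash & (NUM_BUCKETS - 1);
	unsigned int old_flow = history[history_head];

	/* Overwrite the oldest sample with the newest one. */
	history[history_head] = new_flow;
	history_head = (history_head + 1) & (FLOW_LIMIT_HISTORY - 1);

	if (buckets[old_flow])
		buckets[old_flow]--;

	return ++buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1);
}

int main(void)
{
	unsigned int i, flagged = 0;

	/* Two packets from an aggressive flow for every well-behaved one:
	 * the aggressor settles at ~2/3 of the window and trips the 50%
	 * threshold; the victim flow never does.
	 */
	for (i = 0; i < 3000; i++) {
		uint32_t hash = (i % 3 == 2) ? 0x1111 : 0xbeef;

		if (flow_limit_hit(hash))
			flagged++;
	}
	printf("flagged %u of 3000 packets\n", flagged);
	return 0;
}

Because every new sample evicts the oldest one, the bucket counters sum to at most FLOW_LIMIT_HISTORY, so the greater-than-half test is a direct share-of-recent-traffic check.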

File tree: 5 files changed, +194 −3 lines changed

include/linux/netdevice.h

Lines changed: 17 additions & 0 deletions
@@ -1778,6 +1778,19 @@ static inline int unregister_gifconf(unsigned int family)
 	return register_gifconf(family, NULL);
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+#define FLOW_LIMIT_HISTORY	(1 << 8)	/* must be ^2 */
+struct sd_flow_limit {
+	u64			count;
+	unsigned int		num_buckets;
+	unsigned int		history_head;
+	u16			history[FLOW_LIMIT_HISTORY];
+	u8			buckets[];
+};
+
+extern int netdev_flow_limit_table_len;
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 /*
  * Incoming packets are placed on per-cpu queues
  */
@@ -1807,6 +1820,10 @@ struct softnet_data {
 	unsigned int		dropped;
 	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit	*flow_limit;
+#endif
 };
 
 static inline void input_queue_head_incr(struct softnet_data *sd)
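For rough sizing: the per-cpu table allocated later by the sysctl handler is sizeof(struct sd_flow_limit) + netdev_flow_limit_table_len bytes. A small user-space mirror of the layout (illustrative only, not the kernel struct) puts the default at roughly 4.5 KB per enabled cpu:

/* Illustrative, user-space mirror of the layout above; shows the
 * per-cpu footprint with the default 4096-bucket table.
 */
#include <stdint.h>
#include <stdio.h>

#define FLOW_LIMIT_HISTORY (1 << 8)

struct sd_flow_limit_mirror {
	uint64_t count;
	unsigned int num_buckets;
	unsigned int history_head;
	uint16_t history[FLOW_LIMIT_HISTORY];
	uint8_t buckets[];
};

int main(void)
{
	unsigned int table_len = 1 << 12;	/* default netdev_flow_limit_table_len */

	/* Same arithmetic as the sysctl handler: header plus one u8 per bucket. */
	printf("per-cpu flow_limit table: %zu bytes\n",
	       sizeof(struct sd_flow_limit_mirror) + table_len);
	return 0;
}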

net/Kconfig

Lines changed: 12 additions & 0 deletions
@@ -259,6 +259,18 @@ config BPF_JIT
 	  packet sniffing (libpcap/tcpdump). Note : Admin should enable
 	  this feature changing /proc/sys/net/core/bpf_jit_enable
 
+config NET_FLOW_LIMIT
+	boolean
+	depends on RPS
+	default y
+	---help---
+	  The network stack has to drop packets when a receive processing CPU's
+	  backlog reaches netdev_max_backlog. If a few out of many active flows
+	  generate the vast majority of load, drop their traffic earlier to
+	  maintain capacity for the other flows. This feature provides servers
+	  with many clients some protection against DoS by a single (spoofed)
+	  flow that greatly exceeds average workload.
+
 menu "Network testing"
 
 config NET_PKTGEN

net/core/dev.c

Lines changed: 47 additions & 1 deletion
@@ -3064,6 +3064,46 @@ static int rps_ipi_queued(struct softnet_data *sd)
 	return 0;
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+	struct softnet_data *sd;
+	unsigned int old_flow, new_flow;
+
+	if (qlen < (netdev_max_backlog >> 1))
+		return false;
+
+	sd = &__get_cpu_var(softnet_data);
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl) {
+		new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
+		old_flow = fl->history[fl->history_head];
+		fl->history[fl->history_head] = new_flow;
+
+		fl->history_head++;
+		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+		if (likely(fl->buckets[old_flow]))
+			fl->buckets[old_flow]--;
+
+		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
+			fl->count++;
+			rcu_read_unlock();
+			return true;
+		}
+	}
+	rcu_read_unlock();
+#endif
+	return false;
+}
+
 /*
  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
  * queue (may be a remote CPU queue).
@@ -3073,13 +3113,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 {
 	struct softnet_data *sd;
 	unsigned long flags;
+	unsigned int qlen;
 
 	sd = &per_cpu(softnet_data, cpu);
 
 	local_irq_save(flags);
 
 	rps_lock(sd);
-	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+	qlen = skb_queue_len(&sd->input_pkt_queue);
+	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 		if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -6269,6 +6311,10 @@ static int __init net_dev_init(void)
 		sd->backlog.weight = weight_p;
 		sd->backlog.gro_list = NULL;
 		sd->backlog.gro_count = 0;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+		sd->flow_limit = NULL;
+#endif
 	}
 
 	dev_boot_phase = 0;

net/core/net-procfs.c

Lines changed: 14 additions & 2 deletions
@@ -146,11 +146,23 @@ static void softnet_seq_stop(struct seq_file *seq, void *v)
 static int softnet_seq_show(struct seq_file *seq, void *v)
 {
 	struct softnet_data *sd = v;
+	unsigned int flow_limit_count = 0;
 
-	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl)
+		flow_limit_count = fl->count;
+	rcu_read_unlock();
+#endif
+
+	seq_printf(seq,
+		   "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
 		   0, 0, 0, 0, /* was fastroute */
-		   sd->cpu_collision, sd->received_rps);
+		   sd->cpu_collision, sd->received_rps, flow_limit_count);
 	return 0;
 }
 
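With the extra column, each per-cpu line in /proc/net/softnet_stat now ends with the flow_limit_count. A minimal reader for the new eleven-column format, a sketch assuming this patched output (earlier kernels emit only ten columns):

/* Sketch: print the flow_limit_count (11th hex field) per cpu from
 * /proc/net/softnet_stat, following the format string added above.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/net/softnet_stat", "r");
	unsigned int v[11];
	int cpu = 0;

	if (!f) {
		perror("softnet_stat");
		return 1;
	}
	while (fscanf(f, "%x %x %x %x %x %x %x %x %x %x %x",
		      &v[0], &v[1], &v[2], &v[3], &v[4], &v[5],
		      &v[6], &v[7], &v[8], &v[9], &v[10]) == 11) {
		printf("cpu%d: processed %u dropped %u flow_limit_count %u\n",
		       cpu++, v[0], v[1], v[10]);
	}
	fclose(f);
	return 0;
}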

net/core/sysctl_net_core.c

Lines changed: 104 additions & 0 deletions
@@ -87,6 +87,96 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
 }
 #endif /* CONFIG_RPS */
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+static DEFINE_MUTEX(flow_limit_update_mutex);
+
+static int flow_limit_cpu_sysctl(ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *ppos)
+{
+	struct sd_flow_limit *cur;
+	struct softnet_data *sd;
+	cpumask_var_t mask;
+	int i, len, ret = 0;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	if (write) {
+		ret = cpumask_parse_user(buffer, *lenp, mask);
+		if (ret)
+			goto done;
+
+		mutex_lock(&flow_limit_update_mutex);
+		len = sizeof(*cur) + netdev_flow_limit_table_len;
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			cur = rcu_dereference_protected(sd->flow_limit,
+				     lockdep_is_held(&flow_limit_update_mutex));
+			if (cur && !cpumask_test_cpu(i, mask)) {
+				RCU_INIT_POINTER(sd->flow_limit, NULL);
+				synchronize_rcu();
+				kfree(cur);
+			} else if (!cur && cpumask_test_cpu(i, mask)) {
+				cur = kzalloc(len, GFP_KERNEL);
+				if (!cur) {
+					/* not unwinding previous changes */
+					ret = -ENOMEM;
+					goto write_unlock;
+				}
+				cur->num_buckets = netdev_flow_limit_table_len;
+				rcu_assign_pointer(sd->flow_limit, cur);
+			}
+		}
+write_unlock:
+		mutex_unlock(&flow_limit_update_mutex);
+	} else {
+		if (*ppos || !*lenp) {
+			*lenp = 0;
+			goto done;
+		}
+
+		cpumask_clear(mask);
+		rcu_read_lock();
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			if (rcu_dereference(sd->flow_limit))
+				cpumask_set_cpu(i, mask);
+		}
+		rcu_read_unlock();
+
+		len = cpumask_scnprintf(buffer, *lenp, mask);
+		*lenp = len + 1;
+		*ppos += len + 1;
+	}
+
+done:
+	free_cpumask_var(mask);
+	return ret;
+}
+
+static int flow_limit_table_len_sysctl(ctl_table *table, int write,
+				       void __user *buffer, size_t *lenp,
+				       loff_t *ppos)
+{
+	unsigned int old, *ptr;
+	int ret;
+
+	mutex_lock(&flow_limit_update_mutex);
+
+	ptr = table->data;
+	old = *ptr;
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!ret && write && !is_power_of_2(*ptr)) {
+		*ptr = old;
+		ret = -EINVAL;
+	}
+
+	mutex_unlock(&flow_limit_update_mutex);
+	return ret;
+}
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -180,6 +270,20 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= rps_sock_flow_sysctl
 	},
 #endif
+#ifdef CONFIG_NET_FLOW_LIMIT
+	{
+		.procname	= "flow_limit_cpu_bitmap",
+		.mode		= 0644,
+		.proc_handler	= flow_limit_cpu_sysctl
+	},
+	{
+		.procname	= "flow_limit_table_len",
+		.data		= &netdev_flow_limit_table_len,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= flow_limit_table_len_sysctl
+	},
+#endif /* CONFIG_NET_FLOW_LIMIT */
 #endif /* CONFIG_NET */
 	{
 		.procname	= "netdev_budget",
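Both knobs are registered in net_core_table, so they surface as /proc/sys/net/core/flow_limit_cpu_bitmap and /proc/sys/net/core/flow_limit_table_len. A hedged sketch of enabling the limiter from user space, assuming the standard sysctl procfs layout (needs root). Note that the table length must be a power of two and only applies to tables allocated after it is written, so set it before enabling cpus in the bitmap:

/* Sketch: enable flow limiting for cpus 0-3 (mask 0xf) and keep the
 * default 4096-bucket table.
 */
#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* Set the table length first: existing per-cpu tables are not resized. */
	if (write_str("/proc/sys/net/core/flow_limit_table_len", "4096"))
		perror("flow_limit_table_len");

	/* Then enable the cpus; a hex cpumask as parsed by cpumask_parse_user(). */
	if (write_str("/proc/sys/net/core/flow_limit_cpu_bitmap", "f"))
		perror("flow_limit_cpu_bitmap");

	return 0;
}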
