
Commit 11393cc

jrfastab authored and davem330 committed
xdp: Add batching support to redirect map
For performance reasons we want to avoid updating the tail pointer in the
driver tx ring as much as possible. To accomplish this we add batching
support to the redirect path in XDP. This adds another ndo op "xdp_flush"
that is used to inform the driver that it should bump the tail pointer on
the TX ring.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent 97f91a7 commit 11393cc
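
The batching contract can be summarized from the driver's point of view: redirect as many frames as needed during one napi poll without touching the target ring's tail pointer, then flush once at the end of the poll. The following is a minimal sketch of that calling pattern, not code from this commit; the example_* helpers and the driver are hypothetical, and the authoritative implementation is the ixgbe diff below.

/* Illustrative sketch only: example_* names are hypothetical; the real
 * reference is the ixgbe change in this commit.
 */
static int example_napi_poll(struct napi_struct *napi, int budget)
{
	struct net_device *dev = napi->dev;
	int work = 0;

	while (work < budget && example_rx_pending(dev)) {
		struct xdp_buff xdp;

		example_build_xdp_buff(dev, &xdp);

		/* On XDP_REDIRECT the frame is handed to the target's
		 * ndo_xdp_xmit(), but the target's tail pointer is not
		 * bumped yet; the devmap only records a flush_needed bit.
		 */
		if (example_run_xdp(dev, &xdp) == XDP_REDIRECT)
			xdp_do_redirect(dev, &xdp, example_xdp_prog(dev));

		work++;
	}

	/* One flush per poll: invokes ndo_xdp_flush() on every device that
	 * received redirected frames from this cpu during the loop.
	 */
	xdp_do_flush_map();

	return work;
}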

File tree

6 files changed: +166 −15 lines

drivers/net/ethernet/intel/ixgbe/ixgbe_main.c

Lines changed: 25 additions & 3 deletions

@@ -2415,6 +2415,8 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		 */
 		wmb();
 		writel(ring->next_to_use, ring->tail);
+
+		xdp_do_flush_map();
 	}
 
 	u64_stats_update_begin(&rx_ring->syncp);
@@ -5817,6 +5819,9 @@ void ixgbe_down(struct ixgbe_adapter *adapter)
 
 	usleep_range(10000, 20000);
 
+	/* synchronize_sched() needed for pending XDP buffers to drain */
+	if (adapter->xdp_ring[0])
+		synchronize_sched();
 	netif_tx_stop_all_queues(netdev);
 
 	/* call carrier off first to avoid false dev_watchdog timeouts */
@@ -9850,15 +9855,31 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 	if (err != IXGBE_XDP_TX)
 		return -ENOMEM;
 
+	return 0;
+}
+
+static void ixgbe_xdp_flush(struct net_device *dev)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_ring *ring;
+
+	/* Its possible the device went down between xdp xmit and flush so
+	 * we need to ensure device is still up.
+	 */
+	if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
+		return;
+
+	ring = adapter->xdp_prog ? adapter->xdp_ring[smp_processor_id()] : NULL;
+	if (unlikely(!ring))
+		return;
+
 	/* Force memory writes to complete before letting h/w know there
 	 * are new descriptors to fetch.
 	 */
 	wmb();
-
-	ring = adapter->xdp_ring[smp_processor_id()];
 	writel(ring->next_to_use, ring->tail);
 
-	return 0;
+	return;
 }
 
 static const struct net_device_ops ixgbe_netdev_ops = {
@@ -9908,6 +9929,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_features_check	= ixgbe_features_check,
 	.ndo_xdp		= ixgbe_xdp,
 	.ndo_xdp_xmit		= ixgbe_xdp_xmit,
+	.ndo_xdp_flush		= ixgbe_xdp_flush,
 };
 
 /**

include/linux/bpf.h

Lines changed: 2 additions & 0 deletions

@@ -381,5 +381,7 @@ u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 /* Map specifics */
 struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
+void __dev_map_flush(struct bpf_map *map);
 
 #endif /* _LINUX_BPF_H */

include/linux/filter.h

Lines changed: 7 additions & 0 deletions

@@ -712,10 +712,17 @@ bool bpf_helper_changes_pkt_data(void *func);
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
 
+/* The pair of xdp_do_redirect and xdp_do_flush_map MUST be called in the
+ * same cpu context. Further for best results no more than a single map
+ * for the do_redirect/do_flush pair should be used. This limitation is
+ * because we only track one map and force a flush when the map changes.
+ * This does not appear to be a real limiation for existing software.
+ */
 int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb);
 int xdp_do_redirect(struct net_device *dev,
 		    struct xdp_buff *xdp,
 		    struct bpf_prog *prog);
+void xdp_do_flush_map(void);
 
 void bpf_warn_invalid_xdp_action(u32 act);
 void bpf_warn_invalid_xdp_redirect(u32 ifindex);

include/linux/netdevice.h

Lines changed: 4 additions & 1 deletion

@@ -1142,7 +1142,9 @@ struct xfrmdev_ops {
 * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp);
 *	This function is used to submit a XDP packet for transmit on a
 *	netdevice.
- *
+ * void (*ndo_xdp_flush)(struct net_device *dev);
+ *	This function is used to inform the driver to flush a paticular
+ *	xpd tx queue. Must be called on same CPU as xdp_xmit.
 */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1329,6 +1331,7 @@ struct net_device_ops {
 					   struct netdev_xdp *xdp);
 	int			(*ndo_xdp_xmit)(struct net_device *dev,
 						struct xdp_buff *xdp);
+	void			(*ndo_xdp_flush)(struct net_device *dev);
 };
 
 /**
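
The new ndo documented above leaves the actual tail-pointer write to the driver, on the same CPU that queued the frames via ndo_xdp_xmit. As a rough, hypothetical sketch of what a conforming implementation might look like (example_priv and example_ring are invented names; the ixgbe hunk earlier in this commit is the real reference):

/* Hypothetical driver sketch; it mirrors the ixgbe implementation above. */
static void example_xdp_flush(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);	/* hypothetical priv */
	struct example_ring *ring = priv->xdp_ring[smp_processor_id()];

	if (unlikely(!ring))
		return;

	/* Make the queued descriptors visible before telling the hw to
	 * fetch them, then bump the tail pointer exactly once.
	 */
	wmb();
	writel(ring->next_to_use, ring->tail);
}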

kernel/bpf/devmap.c

Lines changed: 83 additions & 1 deletion

@@ -53,6 +53,7 @@ struct bpf_dtab_netdev {
 struct bpf_dtab {
 	struct bpf_map map;
 	struct bpf_dtab_netdev **netdev_map;
+	unsigned long int __percpu *flush_needed;
 };
 
 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
@@ -87,6 +88,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 
 	/* make sure page count doesn't overflow */
 	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
+	cost += BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
 	if (cost >= U32_MAX - PAGE_SIZE)
 		goto free_dtab;
 
@@ -97,6 +99,14 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	if (err)
 		goto free_dtab;
 
+	/* A per cpu bitfield with a bit per possible net device */
+	dtab->flush_needed = __alloc_percpu(
+				BITS_TO_LONGS(attr->max_entries) *
+				sizeof(unsigned long),
+				__alignof__(unsigned long));
+	if (!dtab->flush_needed)
+		goto free_dtab;
+
 	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
 					      sizeof(struct bpf_dtab_netdev *));
 	if (!dtab->netdev_map)
@@ -105,14 +115,15 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	return &dtab->map;
 
 free_dtab:
+	free_percpu(dtab->flush_needed);
 	kfree(dtab);
 	return ERR_PTR(err);
 }
 
 static void dev_map_free(struct bpf_map *map)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	int i;
+	int i, cpu;
 
 	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
 	 * so the programs (can be more than one that used this map) were
@@ -123,6 +134,18 @@ static void dev_map_free(struct bpf_map *map)
 	 */
 	synchronize_rcu();
 
+	/* To ensure all pending flush operations have completed wait for flush
+	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+	 * Because the above synchronize_rcu() ensures the map is disconnected
+	 * from the program we can assume no new bits will be set.
+	 */
+	for_each_online_cpu(cpu) {
+		unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
+
+		while (!bitmap_empty(bitmap, dtab->map.max_entries))
+			cpu_relax();
+	}
+
 	for (i = 0; i < dtab->map.max_entries; i++) {
 		struct bpf_dtab_netdev *dev;
 
@@ -137,6 +160,7 @@ static void dev_map_free(struct bpf_map *map)
 	/* At this point bpf program is detached and all pending operations
 	 * _must_ be complete
 	 */
+	free_percpu(dtab->flush_needed);
 	bpf_map_area_free(dtab->netdev_map);
 	kfree(dtab);
 }
@@ -159,6 +183,14 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return 0;
 }
 
+void __dev_map_insert_ctx(struct bpf_map *map, u32 key)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+
+	__set_bit(key, bitmap);
+}
+
 struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
@@ -171,6 +203,39 @@ struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 	return dev ? dev->dev : NULL;
 }
 
+/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
+ * from the driver before returning from its napi->poll() routine. The poll()
+ * routine is called either from busy_poll context or net_rx_action signaled
+ * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
+ * net device can be torn down. On devmap tear down we ensure the ctx bitmap
+ * is zeroed before completing to ensure all flush operations have completed.
+ */
+void __dev_map_flush(struct bpf_map *map)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+	u32 bit;
+
+	for_each_set_bit(bit, bitmap, map->max_entries) {
+		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
+		struct net_device *netdev;
+
+		/* This is possible if the dev entry is removed by user space
+		 * between xdp redirect and flush op.
+		 */
+		if (unlikely(!dev))
+			continue;
+
+		netdev = dev->dev;
+
+		__clear_bit(bit, bitmap);
+		if (unlikely(!netdev || !netdev->netdev_ops->ndo_xdp_flush))
+			continue;
+
+		netdev->netdev_ops->ndo_xdp_flush(netdev);
+	}
+}
+
 /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
  * update happens in parallel here a dev_put wont happen until after reading the
  * ifindex.
@@ -188,11 +253,28 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 	return dev ? &dev->dev->ifindex : NULL;
 }
 
+static void dev_map_flush_old(struct bpf_dtab_netdev *old_dev)
+{
+	if (old_dev->dev->netdev_ops->ndo_xdp_flush) {
+		struct net_device *fl = old_dev->dev;
+		unsigned long *bitmap;
+		int cpu;
+
+		for_each_online_cpu(cpu) {
+			bitmap = per_cpu_ptr(old_dev->dtab->flush_needed, cpu);
+			__clear_bit(old_dev->key, bitmap);
+
+			fl->netdev_ops->ndo_xdp_flush(old_dev->dev);
+		}
+	}
+}
+
 static void __dev_map_entry_free(struct rcu_head *rcu)
 {
 	struct bpf_dtab_netdev *old_dev;
 
 	old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
+	dev_map_flush_old(old_dev);
 	dev_put(old_dev->dev);
 	kfree(old_dev);
 }
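
The flush bitmap above is sized from max_entries, and each populated devmap slot simply holds an ifindex. As a hedged illustration of how such a map might be created and populated from user space, here is a sketch using the raw bpf(2) syscall directly; no wrapper library is assumed, and the map size of 64 and the helper name are arbitrary.

/* Hypothetical user-space sketch: create a devmap and store an ifindex
 * at slot 0 using the raw bpf(2) syscall.
 */
#include <linux/bpf.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <string.h>

static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int create_and_populate_devmap(unsigned int ifindex)
{
	union bpf_attr attr;
	__u32 key = 0, val = ifindex;
	int map_fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_DEVMAP;	/* added by this series */
	attr.key_size = sizeof(__u32);
	attr.value_size = sizeof(__u32);	/* value is an ifindex */
	attr.max_entries = 64;

	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0)
		return -1;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)&val;
	/* flags left at 0 == BPF_ANY */

	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)) < 0)
		return -1;

	return map_fd;
}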

net/core/filter.c

Lines changed: 45 additions & 10 deletions

@@ -1780,6 +1780,7 @@ struct redirect_info {
 	u32 ifindex;
 	u32 flags;
 	struct bpf_map *map;
+	struct bpf_map *map_to_flush;
 };
 
 static DEFINE_PER_CPU(struct redirect_info, redirect_info);
@@ -2438,34 +2439,68 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
-static int __bpf_tx_xdp(struct net_device *dev, struct xdp_buff *xdp)
+static int __bpf_tx_xdp(struct net_device *dev,
+			struct bpf_map *map,
+			struct xdp_buff *xdp,
+			u32 index)
 {
-	if (dev->netdev_ops->ndo_xdp_xmit) {
-		dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
-		return 0;
+	int err;
+
+	if (!dev->netdev_ops->ndo_xdp_xmit) {
+		bpf_warn_invalid_xdp_redirect(dev->ifindex);
+		return -EOPNOTSUPP;
 	}
-	bpf_warn_invalid_xdp_redirect(dev->ifindex);
-	return -EOPNOTSUPP;
+
+	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
+	if (err)
+		return err;
+
+	if (map)
+		__dev_map_insert_ctx(map, index);
+	else
+		dev->netdev_ops->ndo_xdp_flush(dev);
+
+	return err;
 }
 
+void xdp_do_flush_map(void)
+{
+	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_map *map = ri->map_to_flush;
+
+	ri->map = NULL;
+	ri->map_to_flush = NULL;
+
+	if (map)
+		__dev_map_flush(map);
+}
+EXPORT_SYMBOL_GPL(xdp_do_flush_map);
+
 int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 			struct bpf_prog *xdp_prog)
 {
 	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 	struct bpf_map *map = ri->map;
+	u32 index = ri->ifindex;
 	struct net_device *fwd;
 	int err = -EINVAL;
 
 	ri->ifindex = 0;
 	ri->map = NULL;
 
-	fwd = __dev_map_lookup_elem(map, ri->ifindex);
+	fwd = __dev_map_lookup_elem(map, index);
 	if (!fwd)
 		goto out;
 
-	trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT);
-	err = __bpf_tx_xdp(fwd, xdp);
+	if (ri->map_to_flush && (ri->map_to_flush != map))
+		xdp_do_flush_map();
+
+	err = __bpf_tx_xdp(fwd, map, xdp, index);
+	if (likely(!err))
+		ri->map_to_flush = map;
+
 out:
+	trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT);
 	return err;
 }
 
@@ -2488,7 +2523,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 
 	trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT);
 
-	return __bpf_tx_xdp(fwd, xdp);
+	return __bpf_tx_xdp(fwd, NULL, xdp, 0);
 }
 EXPORT_SYMBOL_GPL(xdp_do_redirect);
