
Commit 1c601d8

netoptimizer authored and davem330 committed
bpf: cpumap xdp_buff to skb conversion and allocation
This patch makes cpumap functional, by adding SKB allocation and invoking the network stack on the dequeuing CPU.

For constructing the SKB on the remote CPU, the xdp_buff is converted into a struct xdp_pkt, which is mapped into the top headroom of the packet to avoid allocating separate memory. For now, struct xdp_pkt is just a cpumap-internal data structure, with info carried between enqueue and dequeue.

If a driver doesn't have enough headroom, the packet is simply dropped, with return code -EOVERFLOW. This will be picked up by the xdp tracepoint infrastructure, to allow users to catch it.

V2: take into account xdp->data_meta
V4: - Drop busypoll tricks, keeping it more simple.
    - Skip RPS and Generic-XDP-recursive-reinjection, suggested by Alexei
V5: correct RCU read protection around __netif_receive_skb_core.
V6: Setting TASK_RUNNING vs TASK_INTERRUPTIBLE based on talk with Rik van Riel

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent 9c270af commit 1c601d8
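For context, cpumap is driven from an XDP program that returns XDP_REDIRECT via bpf_redirect_map(). A minimal sketch of such a program follows; it is not part of this commit, is written in current libbpf style, and the map name, size and fixed target CPU are illustrative assumptions:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Illustrative cpumap: each value is the per-CPU ptr_ring queue size
 * that the kthread made functional by this commit consumes from.
 */
struct {
        __uint(type, BPF_MAP_TYPE_CPUMAP);
        __type(key, __u32);    /* CPU number */
        __type(value, __u32);  /* queue size (qsize) */
        __uint(max_entries, 64);
} cpu_map SEC(".maps");

SEC("xdp")
int xdp_redirect_to_cpu(struct xdp_md *ctx)
{
        __u32 target_cpu = 0; /* real programs would hash or round-robin */

        /* On the remote CPU, the cpumap kthread converts the queued
         * xdp_pkt back into an SKB and injects it with
         * netif_receive_skb_core(), as added in this patch.
         */
        return bpf_redirect_map(&cpu_map, target_cpu, 0);
}

char _license[] SEC("license") = "GPL";

User space must populate cpu_map with a queue size for each CPU it wants to target (e.g. via bpf_map_update_elem()); packets redirected through a driver that leaves too little headroom are dropped with -EOVERFLOW, as described above.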

File tree

3 files changed, +158 -22 lines changed

include/linux/netdevice.h

Lines changed: 1 addition & 0 deletions
@@ -3260,6 +3260,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
 int netif_rx(struct sk_buff *skb);
 int netif_rx_ni(struct sk_buff *skb);
 int netif_receive_skb(struct sk_buff *skb);
+int netif_receive_skb_core(struct sk_buff *skb);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);

kernel/bpf/cpumap.c

Lines changed: 130 additions & 22 deletions
@@ -25,6 +25,9 @@
 #include <linux/kthread.h>
 #include <linux/capability.h>
 
+#include <linux/netdevice.h>   /* netif_receive_skb_core */
+#include <linux/etherdevice.h> /* eth_type_trans */
+
 /* General idea: XDP packets getting XDP redirected to another CPU,
  * will maximum be stored/queued for one driver ->poll() call. It is
  * guaranteed that setting flush bit and flush operation happen on
@@ -179,6 +182,92 @@ static void cpu_map_kthread_stop(struct work_struct *work)
         kthread_stop(rcpu->kthread);
 }
 
+/* For now, xdp_pkt is a cpumap internal data structure, with info
+ * carried between enqueue to dequeue. It is mapped into the top
+ * headroom of the packet, to avoid allocating separate mem.
+ */
+struct xdp_pkt {
+        void *data;
+        u16 len;
+        u16 headroom;
+        u16 metasize;
+        struct net_device *dev_rx;
+};
+
+/* Convert xdp_buff to xdp_pkt */
+static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
+{
+        struct xdp_pkt *xdp_pkt;
+        int metasize;
+        int headroom;
+
+        /* Assure headroom is available for storing info */
+        headroom = xdp->data - xdp->data_hard_start;
+        metasize = xdp->data - xdp->data_meta;
+        metasize = metasize > 0 ? metasize : 0;
+        if ((headroom - metasize) < sizeof(*xdp_pkt))
+                return NULL;
+
+        /* Store info in top of packet */
+        xdp_pkt = xdp->data_hard_start;
+
+        xdp_pkt->data = xdp->data;
+        xdp_pkt->len = xdp->data_end - xdp->data;
+        xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
+        xdp_pkt->metasize = metasize;
+
+        return xdp_pkt;
+}
+
+struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
+                                  struct xdp_pkt *xdp_pkt)
+{
+        unsigned int frame_size;
+        void *pkt_data_start;
+        struct sk_buff *skb;
+
+        /* build_skb need to place skb_shared_info after SKB end, and
+         * also want to know the memory "truesize". Thus, need to
+         * know the memory frame size backing xdp_buff.
+         *
+         * XDP was designed to have PAGE_SIZE frames, but this
+         * assumption is not longer true with ixgbe and i40e. It
+         * would be preferred to set frame_size to 2048 or 4096
+         * depending on the driver.
+         *   frame_size = 2048;
+         *   frame_len  = frame_size - sizeof(*xdp_pkt);
+         *
+         * Instead, with info avail, skb_shared_info in placed after
+         * packet len. This, unfortunately fakes the truesize.
+         * Another disadvantage of this approach, the skb_shared_info
+         * is not at a fixed memory location, with mixed length
+         * packets, which is bad for cache-line hotness.
+         */
+        frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+                SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+        pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+        skb = build_skb(pkt_data_start, frame_size);
+        if (!skb)
+                return NULL;
+
+        skb_reserve(skb, xdp_pkt->headroom);
+        __skb_put(skb, xdp_pkt->len);
+        if (xdp_pkt->metasize)
+                skb_metadata_set(skb, xdp_pkt->metasize);
+
+        /* Essential SKB info: protocol and skb->dev */
+        skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+
+        /* Optional SKB info, currently missing:
+         * - HW checksum info           (skb->ip_summed)
+         * - HW RX hash                 (skb_set_hash)
+         * - RX ring dev queue index    (skb_record_rx_queue)
+         */
+
+        return skb;
+}
+
 static int cpu_map_kthread_run(void *data)
 {
         struct bpf_cpu_map_entry *rcpu = data;
@@ -191,15 +280,45 @@ static int cpu_map_kthread_run(void *data)
          * kthread_stop signal until queue is empty.
          */
         while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+                unsigned int processed = 0, drops = 0;
                 struct xdp_pkt *xdp_pkt;
 
-                schedule();
-                /* Do work */
-                while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) {
-                        /* For now just "refcnt-free" */
-                        page_frag_free(xdp_pkt);
+                /* Release CPU reschedule checks */
+                if (__ptr_ring_empty(rcpu->queue)) {
+                        __set_current_state(TASK_INTERRUPTIBLE);
+                        schedule();
+                } else {
+                        cond_resched();
+                }
+                __set_current_state(TASK_RUNNING);
+
+                /* Process packets in rcpu->queue */
+                local_bh_disable();
+                /*
+                 * The bpf_cpu_map_entry is single consumer, with this
+                 * kthread CPU pinned. Lockless access to ptr_ring
+                 * consume side valid as no-resize allowed of queue.
+                 */
+                while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+                        struct sk_buff *skb;
+                        int ret;
+
+                        skb = cpu_map_build_skb(rcpu, xdp_pkt);
+                        if (!skb) {
+                                page_frag_free(xdp_pkt);
+                                continue;
+                        }
+
+                        /* Inject into network stack */
+                        ret = netif_receive_skb_core(skb);
+                        if (ret == NET_RX_DROP)
+                                drops++;
+
+                        /* Limit BH-disable period */
+                        if (++processed == 8)
+                                break;
                 }
-                __set_current_state(TASK_INTERRUPTIBLE);
+                local_bh_enable(); /* resched point, may call do_softirq() */
         }
         __set_current_state(TASK_RUNNING);
 
@@ -490,13 +609,6 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
         return 0;
 }
 
-/* Notice: Will change in later patch */
-struct xdp_pkt {
-        void *data;
-        u16 len;
-        u16 headroom;
-};
-
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
@@ -524,17 +636,13 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
                    struct net_device *dev_rx)
 {
         struct xdp_pkt *xdp_pkt;
-        int headroom;
 
-        /* For now this is just used as a void pointer to data_hard_start.
-         * Followup patch will generalize this.
-         */
-        xdp_pkt = xdp->data_hard_start;
+        xdp_pkt = convert_to_xdp_pkt(xdp);
+        if (!xdp_pkt)
+                return -EOVERFLOW;
 
-        /* Fake writing into xdp_pkt->data to measure overhead */
-        headroom = xdp->data - xdp->data_hard_start;
-        if (headroom < sizeof(*xdp_pkt))
-                xdp_pkt->data = xdp->data;
+        /* Info needed when constructing SKB on remote CPU */
+        xdp_pkt->dev_rx = dev_rx;
 
         bq_enqueue(rcpu, xdp_pkt);
         return 0;
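To make the headroom check in convert_to_xdp_pkt() and the frame_size calculation in cpu_map_build_skb() concrete, here is a small worked example with assumed values (typical 64-bit layout, a driver reserving XDP_PACKET_HEADROOM, a 64-byte packet; the numbers are illustrative, not from this commit):

/* Assumed: 256 bytes of headroom, no data_meta, 64-byte packet.
 * On 64-bit, struct xdp_pkt packs to roughly 24 bytes.
 *
 *   headroom            = xdp->data - xdp->data_hard_start = 256
 *   metasize            = 0
 *   headroom - metasize = 256 >= sizeof(*xdp_pkt)          -> accepted
 *   xdp_pkt->headroom   = 256 - 24 = 232
 *
 * The "faked" truesize of the built SKB then becomes:
 *
 *   frame_size = SKB_DATA_ALIGN(64) + 232
 *              + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
 *             ~= 64 + 232 + 320 = 616 bytes
 *
 * which understates the 2048/4096-byte frame the driver really backs
 * the packet with. A driver leaving less than sizeof(struct xdp_pkt)
 * of headroom (after metadata) makes convert_to_xdp_pkt() return NULL,
 * and cpu_map_enqueue() then returns -EOVERFLOW.
 */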

net/core/dev.c

Lines changed: 27 additions & 0 deletions
@@ -4492,6 +4492,33 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
         return ret;
 }
 
+/**
+ *      netif_receive_skb_core - special purpose version of netif_receive_skb
+ *      @skb: buffer to process
+ *
+ *      More direct receive version of netif_receive_skb(). It should
+ *      only be used by callers that have a need to skip RPS and Generic XDP.
+ *      Caller must also take care of handling if (page_is_)pfmemalloc.
+ *
+ *      This function may only be called from softirq context and interrupts
+ *      should be enabled.
+ *
+ *      Return values (usually ignored):
+ *      NET_RX_SUCCESS: no congestion
+ *      NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb_core(struct sk_buff *skb)
+{
+        int ret;
+
+        rcu_read_lock();
+        ret = __netif_receive_skb_core(skb, false);
+        rcu_read_unlock();
+
+        return ret;
+}
+EXPORT_SYMBOL(netif_receive_skb_core);
+
 static int __netif_receive_skb(struct sk_buff *skb)
 {
         int ret;
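The kernel-doc above spells out the calling constraints; a hedged sketch of the calling pattern, mirroring the cpumap kthread added in this commit (the helper name and SKB-array plumbing are hypothetical):

/* Hypothetical helper, not part of this commit: inject already-built
 * SKBs from process context. BH is disabled so the softirq-context
 * requirement of netif_receive_skb_core() is met; drops are counted
 * by the caller since RPS and generic XDP are skipped.
 */
static unsigned int inject_skbs(struct sk_buff **skbs, unsigned int n)
{
        unsigned int i, drops = 0;

        local_bh_disable();
        for (i = 0; i < n; i++) {
                if (netif_receive_skb_core(skbs[i]) == NET_RX_DROP)
                        drops++;
        }
        local_bh_enable();

        return drops;
}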
