@@ -25,6 +25,9 @@
 #include <linux/kthread.h>
 #include <linux/capability.h>
 
+#include <linux/netdevice.h>   /* netif_receive_skb_core */
+#include <linux/etherdevice.h> /* eth_type_trans */
+
 /* General idea: XDP packets getting XDP redirected to another CPU
  * will at most be stored/queued for one driver ->poll() call. It is
  * guaranteed that setting the flush bit and the flush operation happen on
@@ -179,6 +182,92 @@ static void cpu_map_kthread_stop(struct work_struct *work)
 	kthread_stop(rcpu->kthread);
 }
 
+/* For now, xdp_pkt is a cpumap-internal data structure, with info
+ * carried from enqueue to dequeue. It is mapped into the top
+ * headroom of the packet, to avoid allocating separate memory.
+ */
+struct xdp_pkt {
+	void *data;
+	u16 len;
+	u16 headroom;
+	u16 metasize;
+	struct net_device *dev_rx;
+};
+
+/* Convert xdp_buff to xdp_pkt */
+static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
+{
+	struct xdp_pkt *xdp_pkt;
+	int metasize;
+	int headroom;
+
+	/* Ensure headroom is available for storing info */
+	headroom = xdp->data - xdp->data_hard_start;
+	metasize = xdp->data - xdp->data_meta;
+	metasize = metasize > 0 ? metasize : 0;
+	if ((headroom - metasize) < sizeof(*xdp_pkt))
+		return NULL;
+
+	/* Store info in top of packet */
+	xdp_pkt = xdp->data_hard_start;
+
+	xdp_pkt->data = xdp->data;
+	xdp_pkt->len = xdp->data_end - xdp->data;
+	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
+	xdp_pkt->metasize = metasize;
+
+	return xdp_pkt;
+}
+
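The conversion above stashes per-packet metadata in the frame's own headroom instead of allocating a side structure. Below is a minimal userspace sketch of that trick; struct meta, stash_meta and the buffer sizes are invented for illustration and are not kernel APIs.

/* Userspace model of the headroom-stashing trick: metadata is written
 * into the unused headroom at the start of the frame, so no separate
 * allocation is needed. All names here are illustrative.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct meta {			/* plays the role of struct xdp_pkt */
	void *data;
	uint16_t len;
	uint16_t headroom;
};

/* Place struct meta at the top of the frame, given that the payload
 * starts 'headroom' bytes into the buffer. Returns NULL when the
 * headroom cannot hold the metadata, mirroring convert_to_xdp_pkt().
 */
static struct meta *stash_meta(void *frame, size_t headroom, size_t len)
{
	struct meta *m;

	if (headroom < sizeof(*m))
		return NULL;

	m = frame;			/* overlaps the headroom */
	m->data = (char *)frame + headroom;
	m->len = len;
	m->headroom = headroom - sizeof(*m);
	return m;
}

int main(void)
{
	char frame[2048];
	size_t headroom = 256;		/* e.g. XDP_PACKET_HEADROOM */
	struct meta *m;

	memcpy(frame + headroom, "payload", 7);
	m = stash_meta(frame, headroom, 7);
	if (m)
		printf("len=%u remaining-headroom=%u\n",
		       (unsigned)m->len, (unsigned)m->headroom);
	return 0;
}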
+struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
+				  struct xdp_pkt *xdp_pkt)
+{
+	unsigned int frame_size;
+	void *pkt_data_start;
+	struct sk_buff *skb;
+
+	/* build_skb() needs to place skb_shared_info after the SKB end,
+	 * and also wants to know the memory "truesize". Thus, we need
+	 * to know the memory frame size backing the xdp_buff.
+	 *
+	 * XDP was designed to have PAGE_SIZE frames, but this
+	 * assumption is no longer true with ixgbe and i40e. It
+	 * would be preferred to set frame_size to 2048 or 4096
+	 * depending on the driver:
+	 *   frame_size = 2048;
+	 *   frame_len  = frame_size - sizeof(*xdp_pkt);
+	 *
+	 * Instead, with the info available, skb_shared_info is placed
+	 * right after the packet len. This unfortunately fakes the
+	 * truesize. Another disadvantage of this approach is that
+	 * skb_shared_info is not at a fixed memory location; with
+	 * mixed-length packets this is bad for cache-line hotness.
+	 */
+	frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+	skb = build_skb(pkt_data_start, frame_size);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, xdp_pkt->headroom);
+	__skb_put(skb, xdp_pkt->len);
+	if (xdp_pkt->metasize)
+		skb_metadata_set(skb, xdp_pkt->metasize);
+
+	/* Essential SKB info: protocol and skb->dev */
+	skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+
+	/* Optional SKB info, currently missing:
+	 * - HW checksum info        (skb->ip_summed)
+	 * - HW RX hash              (skb_set_hash)
+	 * - RX ring dev queue index (skb_record_rx_queue)
+	 */
+
+	return skb;
+}
+
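The frame_size arithmetic above can be sanity-checked with a userspace model of SKB_DATA_ALIGN. The cache-line size and skb_shared_info size below are assumed example values (they depend on architecture and kernel config); only the formula mirrors the code.

/* Userspace model of the frame_size calculation in cpu_map_build_skb().
 * SMP_CACHE_BYTES and SHINFO_SIZE are assumed values for illustration.
 */
#include <stdio.h>

#define SMP_CACHE_BYTES		64	/* assumed cache-line size */
#define SHINFO_SIZE		320	/* assumed sizeof(struct skb_shared_info) */
#define SKB_DATA_ALIGN(x)	(((x) + (SMP_CACHE_BYTES - 1)) & \
				 ~(SMP_CACHE_BYTES - 1))

int main(void)
{
	unsigned int len = 1514;	/* Ethernet frame length */
	unsigned int headroom = 192;	/* headroom left after xdp_pkt */

	/* Mirrors: SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
	 *          SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
	 */
	unsigned int frame_size = SKB_DATA_ALIGN(len) + headroom +
				  SKB_DATA_ALIGN(SHINFO_SIZE);

	printf("frame_size = %u\n", frame_size); /* 1536 + 192 + 320 = 2048 */
	return 0;
}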
 static int cpu_map_kthread_run(void *data)
 {
 	struct bpf_cpu_map_entry *rcpu = data;
@@ -191,15 +280,45 @@ static int cpu_map_kthread_run(void *data)
 	 * kthread_stop signal until queue is empty.
 	 */
 	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+		unsigned int processed = 0, drops = 0;
 		struct xdp_pkt *xdp_pkt;
 
-		schedule();
-		/* Do work */
-		while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) {
-			/* For now just "refcnt-free" */
-			page_frag_free(xdp_pkt);
+		/* Release CPU reschedule checks */
+		if (__ptr_ring_empty(rcpu->queue)) {
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule();
+		} else {
+			cond_resched();
+		}
+		__set_current_state(TASK_RUNNING);
+
+		/* Process packets in rcpu->queue */
+		local_bh_disable();
+		/*
+		 * The bpf_cpu_map_entry is a single consumer, with this
+		 * kthread pinned to a CPU. Lockless access to the ptr_ring
+		 * consume side is valid, as resizing the queue is not
+		 * allowed.
+		 */
+		while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+			struct sk_buff *skb;
+			int ret;
+
+			skb = cpu_map_build_skb(rcpu, xdp_pkt);
+			if (!skb) {
+				page_frag_free(xdp_pkt);
+				continue;
+			}
+
+			/* Inject into network stack */
+			ret = netif_receive_skb_core(skb);
+			if (ret == NET_RX_DROP)
+				drops++;
+
+			/* Limit the BH-disabled period */
+			if (++processed == 8)
+				break;
 		}
-		__set_current_state(TASK_INTERRUPTIBLE);
+		local_bh_enable(); /* resched point, may call do_softirq() */
 	}
 	__set_current_state(TASK_RUNNING);
 
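The shape of this loop (sleep only when the ring is observed empty, drain at most 8 entries per BH-disabled section) can be modeled in userspace with a trivial single-consumer ring. Everything below is invented for illustration; it is not the kernel's ptr_ring.

/* Userspace sketch of the bounded drain pattern in cpu_map_kthread_run():
 * at most BATCH entries are processed per pass, limiting how long the
 * (modeled) BH-disabled section runs.
 */
#include <stdio.h>

#define RING_SIZE	16
#define BATCH		8	/* mirrors "if (++processed == 8) break;" */

static void *ring[RING_SIZE];
static unsigned int prod, cons;	/* single producer, single consumer */

static void *ring_consume(void)
{
	if (cons == prod)
		return NULL;			/* ring empty */
	return ring[cons++ % RING_SIZE];
}

static unsigned int drain_once(void)
{
	unsigned int processed = 0;
	void *pkt;

	/* kernel: local_bh_disable() here */
	while ((pkt = ring_consume())) {
		/* kernel: build the skb, netif_receive_skb_core(skb) */
		if (++processed == BATCH)
			break;			/* bound the critical section */
	}
	/* kernel: local_bh_enable() here -- resched point */
	return processed;
}

int main(void)
{
	static int pkts[12];

	for (int i = 0; i < 12; i++)		/* producer enqueues 12 pkts */
		ring[prod++ % RING_SIZE] = &pkts[i];

	printf("first pass drained %u\n", drain_once());	/* 8 */
	printf("second pass drained %u\n", drain_once());	/* 4 */
	return 0;
}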
@@ -490,13 +609,6 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 	return 0;
 }
 
-/* Notice: Will change in later patch */
-struct xdp_pkt {
-	void *data;
-	u16 len;
-	u16 headroom;
-};
-
 /* Runs under RCU read-side, plus in softirq under NAPI protection.
  * Thus, percpu variable access is safe.
  */
@@ -524,17 +636,13 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 			    struct net_device *dev_rx)
 {
 	struct xdp_pkt *xdp_pkt;
-	int headroom;
 
-	/* For now this is just used as a void pointer to data_hard_start.
-	 * Followup patch will generalize this.
-	 */
-	xdp_pkt = xdp->data_hard_start;
+	xdp_pkt = convert_to_xdp_pkt(xdp);
+	if (!xdp_pkt)
+		return -EOVERFLOW;
 
-	/* Fake writing into xdp_pkt->data to measure overhead */
-	headroom = xdp->data - xdp->data_hard_start;
-	if (headroom < sizeof(*xdp_pkt))
-		xdp_pkt->data = xdp->data;
+	/* Info needed when constructing SKB on remote CPU */
+	xdp_pkt->dev_rx = dev_rx;
 
 	bq_enqueue(rcpu, xdp_pkt);
 	return 0;
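Finally, a userspace sketch of the enqueue-side contract: when the headroom cannot hold the metadata, conversion fails and the call reports -EOVERFLOW rather than queueing the frame. The names are invented for illustration; only the error contract mirrors cpu_map_enqueue().

/* Userspace model of the error contract in cpu_map_enqueue(): conversion
 * failure (headroom too small) maps to -EOVERFLOW, signalling the caller
 * to drop the frame. Names are illustrative, not kernel code.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct meta {			/* stand-in for struct xdp_pkt */
	uint16_t len;
	uint16_t headroom;
};

static int enqueue(void *frame, size_t headroom, size_t len)
{
	struct meta *m;

	if (headroom < sizeof(*m))	/* convert_to_xdp_pkt() returned NULL */
		return -EOVERFLOW;

	m = frame;
	m->len = len;
	m->headroom = headroom - sizeof(*m);
	/* kernel: bq_enqueue(rcpu, xdp_pkt); */
	return 0;
}

int main(void)
{
	char frame[2048];

	printf("enough headroom: %d\n", enqueue(frame, 256, 1514));
	printf("too small:       %d\n", enqueue(frame, 2, 1514));
	return 0;
}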