@@ -187,9 +187,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
187
187
188
188
static void packet_flush_mclist (struct sock * sk );
189
189
190
+ struct packet_fanout ;
190
191
struct packet_sock {
191
192
/* struct sock has to be the first member of packet_sock */
192
193
struct sock sk ;
194
+ struct packet_fanout * fanout ;
193
195
struct tpacket_stats stats ;
194
196
struct packet_ring_buffer rx_ring ;
195
197
struct packet_ring_buffer tx_ring ;
@@ -212,6 +214,24 @@ struct packet_sock {
212
214
struct packet_type prot_hook ____cacheline_aligned_in_smp ;
213
215
};
214
216
217
+ #define PACKET_FANOUT_MAX 256
218
+
219
+ struct packet_fanout {
220
+ #ifdef CONFIG_NET_NS
221
+ struct net * net ;
222
+ #endif
223
+ unsigned int num_members ;
224
+ u16 id ;
225
+ u8 type ;
226
+ u8 pad ;
227
+ atomic_t rr_cur ;
228
+ struct list_head list ;
229
+ struct sock * arr [PACKET_FANOUT_MAX ];
230
+ spinlock_t lock ;
231
+ atomic_t sk_ref ;
232
+ struct packet_type prot_hook ____cacheline_aligned_in_smp ;
233
+ };
234
+
215
235
struct packet_skb_cb {
216
236
unsigned int origlen ;
217
237
union {
@@ -227,6 +247,9 @@ static inline struct packet_sock *pkt_sk(struct sock *sk)
227
247
return (struct packet_sock * )sk ;
228
248
}
229
249
250
+ static void __fanout_unlink (struct sock * sk , struct packet_sock * po );
251
+ static void __fanout_link (struct sock * sk , struct packet_sock * po );
252
+
230
253
/* register_prot_hook must be invoked with the po->bind_lock held,
231
254
* or from a context in which asynchronous accesses to the packet
232
255
* socket is not possible (packet_create()).
@@ -235,7 +258,10 @@ static void register_prot_hook(struct sock *sk)
235
258
{
236
259
struct packet_sock * po = pkt_sk (sk );
237
260
if (!po -> running ) {
238
- dev_add_pack (& po -> prot_hook );
261
+ if (po -> fanout )
262
+ __fanout_link (sk , po );
263
+ else
264
+ dev_add_pack (& po -> prot_hook );
239
265
sock_hold (sk );
240
266
po -> running = 1 ;
241
267
}
@@ -253,7 +279,10 @@ static void __unregister_prot_hook(struct sock *sk, bool sync)
253
279
struct packet_sock * po = pkt_sk (sk );
254
280
255
281
po -> running = 0 ;
256
- __dev_remove_pack (& po -> prot_hook );
282
+ if (po -> fanout )
283
+ __fanout_unlink (sk , po );
284
+ else
285
+ __dev_remove_pack (& po -> prot_hook );
257
286
__sock_put (sk );
258
287
259
288
if (sync ) {
@@ -388,6 +417,201 @@ static void packet_sock_destruct(struct sock *sk)
388
417
sk_refcnt_debug_dec (sk );
389
418
}
390
419
420
+ static int fanout_rr_next (struct packet_fanout * f , unsigned int num )
421
+ {
422
+ int x = atomic_read (& f -> rr_cur ) + 1 ;
423
+
424
+ if (x >= num )
425
+ x = 0 ;
426
+
427
+ return x ;
428
+ }
429
+
430
+ static struct sock * fanout_demux_hash (struct packet_fanout * f , struct sk_buff * skb , unsigned int num )
431
+ {
432
+ u32 idx , hash = skb -> rxhash ;
433
+
434
+ idx = ((u64 )hash * num ) >> 32 ;
435
+
436
+ return f -> arr [idx ];
437
+ }
438
+
439
+ static struct sock * fanout_demux_lb (struct packet_fanout * f , struct sk_buff * skb , unsigned int num )
440
+ {
441
+ int cur , old ;
442
+
443
+ cur = atomic_read (& f -> rr_cur );
444
+ while ((old = atomic_cmpxchg (& f -> rr_cur , cur ,
445
+ fanout_rr_next (f , num ))) != cur )
446
+ cur = old ;
447
+ return f -> arr [cur ];
448
+ }
449
+
450
+ static int packet_rcv_fanout_hash (struct sk_buff * skb , struct net_device * dev ,
451
+ struct packet_type * pt , struct net_device * orig_dev )
452
+ {
453
+ struct packet_fanout * f = pt -> af_packet_priv ;
454
+ unsigned int num = f -> num_members ;
455
+ struct packet_sock * po ;
456
+ struct sock * sk ;
457
+
458
+ if (!net_eq (dev_net (dev ), read_pnet (& f -> net )) ||
459
+ !num ) {
460
+ kfree_skb (skb );
461
+ return 0 ;
462
+ }
463
+
464
+ skb_get_rxhash (skb );
465
+
466
+ sk = fanout_demux_hash (f , skb , num );
467
+ po = pkt_sk (sk );
468
+
469
+ return po -> prot_hook .func (skb , dev , & po -> prot_hook , orig_dev );
470
+ }
471
+
472
+ static int packet_rcv_fanout_lb (struct sk_buff * skb , struct net_device * dev ,
473
+ struct packet_type * pt , struct net_device * orig_dev )
474
+ {
475
+ struct packet_fanout * f = pt -> af_packet_priv ;
476
+ unsigned int num = f -> num_members ;
477
+ struct packet_sock * po ;
478
+ struct sock * sk ;
479
+
480
+ if (!net_eq (dev_net (dev ), read_pnet (& f -> net )) ||
481
+ !num ) {
482
+ kfree_skb (skb );
483
+ return 0 ;
484
+ }
485
+
486
+ sk = fanout_demux_lb (f , skb , num );
487
+ po = pkt_sk (sk );
488
+
489
+ return po -> prot_hook .func (skb , dev , & po -> prot_hook , orig_dev );
490
+ }
491
+
492
/* Held by fanout_add() and fanout_release() while walking or changing fanout_list. */
static DEFINE_MUTEX(fanout_mutex);
/* All existing fanout groups, linked through packet_fanout.list. */
static LIST_HEAD(fanout_list);
494
+
495
+ static void __fanout_link (struct sock * sk , struct packet_sock * po )
496
+ {
497
+ struct packet_fanout * f = po -> fanout ;
498
+
499
+ spin_lock (& f -> lock );
500
+ f -> arr [f -> num_members ] = sk ;
501
+ smp_wmb ();
502
+ f -> num_members ++ ;
503
+ spin_unlock (& f -> lock );
504
+ }
505
+
506
+ static void __fanout_unlink (struct sock * sk , struct packet_sock * po )
507
+ {
508
+ struct packet_fanout * f = po -> fanout ;
509
+ int i ;
510
+
511
+ spin_lock (& f -> lock );
512
+ for (i = 0 ; i < f -> num_members ; i ++ ) {
513
+ if (f -> arr [i ] == sk )
514
+ break ;
515
+ }
516
+ BUG_ON (i >= f -> num_members );
517
+ f -> arr [i ] = f -> arr [f -> num_members - 1 ];
518
+ f -> num_members -- ;
519
+ spin_unlock (& f -> lock );
520
+ }
521
+
522
+ static int fanout_add (struct sock * sk , u16 id , u8 type )
523
+ {
524
+ struct packet_sock * po = pkt_sk (sk );
525
+ struct packet_fanout * f , * match ;
526
+ int err ;
527
+
528
+ switch (type ) {
529
+ case PACKET_FANOUT_HASH :
530
+ case PACKET_FANOUT_LB :
531
+ break ;
532
+ default :
533
+ return - EINVAL ;
534
+ }
535
+
536
+ if (!po -> running )
537
+ return - EINVAL ;
538
+
539
+ if (po -> fanout )
540
+ return - EALREADY ;
541
+
542
+ mutex_lock (& fanout_mutex );
543
+ match = NULL ;
544
+ list_for_each_entry (f , & fanout_list , list ) {
545
+ if (f -> id == id &&
546
+ read_pnet (& f -> net ) == sock_net (sk )) {
547
+ match = f ;
548
+ break ;
549
+ }
550
+ }
551
+ if (!match ) {
552
+ match = kzalloc (sizeof (* match ), GFP_KERNEL );
553
+ if (match ) {
554
+ write_pnet (& match -> net , sock_net (sk ));
555
+ match -> id = id ;
556
+ match -> type = type ;
557
+ atomic_set (& match -> rr_cur , 0 );
558
+ INIT_LIST_HEAD (& match -> list );
559
+ spin_lock_init (& match -> lock );
560
+ atomic_set (& match -> sk_ref , 0 );
561
+ match -> prot_hook .type = po -> prot_hook .type ;
562
+ match -> prot_hook .dev = po -> prot_hook .dev ;
563
+ switch (type ) {
564
+ case PACKET_FANOUT_HASH :
565
+ match -> prot_hook .func = packet_rcv_fanout_hash ;
566
+ break ;
567
+ case PACKET_FANOUT_LB :
568
+ match -> prot_hook .func = packet_rcv_fanout_lb ;
569
+ break ;
570
+ }
571
+ match -> prot_hook .af_packet_priv = match ;
572
+ dev_add_pack (& match -> prot_hook );
573
+ list_add (& match -> list , & fanout_list );
574
+ }
575
+ }
576
+ err = - ENOMEM ;
577
+ if (match ) {
578
+ err = - EINVAL ;
579
+ if (match -> type == type &&
580
+ match -> prot_hook .type == po -> prot_hook .type &&
581
+ match -> prot_hook .dev == po -> prot_hook .dev ) {
582
+ err = - ENOSPC ;
583
+ if (atomic_read (& match -> sk_ref ) < PACKET_FANOUT_MAX ) {
584
+ __dev_remove_pack (& po -> prot_hook );
585
+ po -> fanout = match ;
586
+ atomic_inc (& match -> sk_ref );
587
+ __fanout_link (sk , po );
588
+ err = 0 ;
589
+ }
590
+ }
591
+ }
592
+ mutex_unlock (& fanout_mutex );
593
+ return err ;
594
+ }
595
+
596
+ static void fanout_release (struct sock * sk )
597
+ {
598
+ struct packet_sock * po = pkt_sk (sk );
599
+ struct packet_fanout * f ;
600
+
601
+ f = po -> fanout ;
602
+ if (!f )
603
+ return ;
604
+
605
+ po -> fanout = NULL ;
606
+
607
+ mutex_lock (& fanout_mutex );
608
+ if (atomic_dec_and_test (& f -> sk_ref )) {
609
+ list_del (& f -> list );
610
+ dev_remove_pack (& f -> prot_hook );
611
+ kfree (f );
612
+ }
613
+ mutex_unlock (& fanout_mutex );
614
+ }
391
615
392
616
static const struct proto_ops packet_ops ;
393
617
@@ -1398,6 +1622,8 @@ static int packet_release(struct socket *sock)
1398
1622
if (po -> tx_ring .pg_vec )
1399
1623
packet_set_ring (sk , & req , 1 , 1 );
1400
1624
1625
+ fanout_release (sk );
1626
+
1401
1627
synchronize_net ();
1402
1628
/*
1403
1629
* Now the socket is dead. No more input will appear.
@@ -1421,9 +1647,9 @@ static int packet_release(struct socket *sock)
1421
1647
static int packet_do_bind (struct sock * sk , struct net_device * dev , __be16 protocol )
1422
1648
{
1423
1649
struct packet_sock * po = pkt_sk (sk );
1424
- /*
1425
- * Detach an existing hook if present.
1426
- */
1650
+
1651
+ if ( po -> fanout )
1652
+ return - EINVAL ;
1427
1653
1428
1654
lock_sock (sk );
1429
1655
@@ -2133,6 +2359,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
2133
2359
po -> tp_tstamp = val ;
2134
2360
return 0 ;
2135
2361
}
2362
+ case PACKET_FANOUT :
2363
+ {
2364
+ int val ;
2365
+
2366
+ if (optlen != sizeof (val ))
2367
+ return - EINVAL ;
2368
+ if (copy_from_user (& val , optval , sizeof (val )))
2369
+ return - EFAULT ;
2370
+
2371
+ return fanout_add (sk , val & 0xffff , val >> 16 );
2372
+ }
2136
2373
default :
2137
2374
return - ENOPROTOOPT ;
2138
2375
}
@@ -2231,6 +2468,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
2231
2468
val = po -> tp_tstamp ;
2232
2469
data = & val ;
2233
2470
break ;
2471
+ case PACKET_FANOUT :
2472
+ if (len > sizeof (int ))
2473
+ len = sizeof (int );
2474
+ val = (po -> fanout ?
2475
+ ((u32 )po -> fanout -> id |
2476
+ ((u32 )po -> fanout -> type << 16 )) :
2477
+ 0 );
2478
+ data = & val ;
2479
+ break ;
2234
2480
default :
2235
2481
return - ENOPROTOOPT ;
2236
2482
}
0 commit comments