Skip to content

Commit dc99f60

Browse files
committed
packet: Add fanout support.
Fanouts allow packet capturing to be demuxed to a set of AF_PACKET sockets. Two fanout policies are implemented: 1) hashing based upon skb->rxhash; 2) pure round-robin. An AF_PACKET socket must be fully bound before it tries to add itself to a fanout. All AF_PACKET sockets trying to join the same fanout must have the same bind settings. Fanouts are identified (within a network namespace) by a 16-bit ID. The first socket to try to add itself to a fanout with a particular ID creates that fanout. When the last socket leaves the fanout (which happens only when the socket is closed), that fanout is destroyed. Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent ce06b03 commit dc99f60

File tree

2 files changed

+255
-5
lines changed

2 files changed

+255
-5
lines changed

include/linux/if_packet.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@ struct sockaddr_ll {
4949
#define PACKET_VNET_HDR 15
5050
#define PACKET_TX_TIMESTAMP 16
5151
#define PACKET_TIMESTAMP 17
52+
#define PACKET_FANOUT 18
53+
54+
#define PACKET_FANOUT_HASH 0
55+
#define PACKET_FANOUT_LB 1
5256

5357
struct tpacket_stats {
5458
unsigned int tp_packets;

net/packet/af_packet.c

Lines changed: 251 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -187,9 +187,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
187187

188188
static void packet_flush_mclist(struct sock *sk);
189189

190+
struct packet_fanout;
190191
struct packet_sock {
191192
/* struct sock has to be the first member of packet_sock */
192193
struct sock sk;
194+
struct packet_fanout *fanout;
193195
struct tpacket_stats stats;
194196
struct packet_ring_buffer rx_ring;
195197
struct packet_ring_buffer tx_ring;
@@ -212,6 +214,24 @@ struct packet_sock {
212214
struct packet_type prot_hook ____cacheline_aligned_in_smp;
213215
};
214216

217+
/* Capacity of packet_fanout.arr[]; fanout_add() refuses to join once sk_ref reaches it. */
#define PACKET_FANOUT_MAX 256
218+
219+
/*
 * One fanout group: a set of packet sockets that share a single
 * packet_type hook, whose handler hands each incoming skb to one member.
 */
struct packet_fanout {
220+
#ifdef CONFIG_NET_NS
221+
/* Namespace of the creating socket; compared on lookup and on receive. */
struct net *net;
222+
#endif
223+
/* Number of occupied slots at the front of arr[]; changed under lock. */
unsigned int num_members;
224+
/* Group ID that fanout_add() matches against. */
u16 id;
225+
/* PACKET_FANOUT_HASH or PACKET_FANOUT_LB. */
u8 type;
226+
/* Not referenced by the code visible here. */
u8 pad;
227+
/* Round-robin cursor used by fanout_demux_lb(). */
atomic_t rr_cur;
228+
/* Linkage on the global fanout_list. */
struct list_head list;
229+
/* Member sockets; filled by __fanout_link(), compacted by __fanout_unlink(). */
struct sock *arr[PACKET_FANOUT_MAX];
230+
/* Taken by __fanout_link()/__fanout_unlink() around arr[] and num_members updates. */
spinlock_t lock;
231+
/* Incremented per join in fanout_add(); fanout_release() frees the group when it drops to zero. */
atomic_t sk_ref;
232+
/* Hook passed to dev_add_pack(); its af_packet_priv points back at this struct. */
struct packet_type prot_hook ____cacheline_aligned_in_smp;
233+
};
234+
215235
struct packet_skb_cb {
216236
unsigned int origlen;
217237
union {
@@ -227,6 +247,9 @@ static inline struct packet_sock *pkt_sk(struct sock *sk)
227247
return (struct packet_sock *)sk;
228248
}
229249

250+
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
251+
static void __fanout_link(struct sock *sk, struct packet_sock *po);
252+
230253
/* register_prot_hook must be invoked with the po->bind_lock held,
231254
* or from a context in which asynchronous accesses to the packet
232255
* socket is not possible (packet_create()).
@@ -235,7 +258,10 @@ static void register_prot_hook(struct sock *sk)
235258
{
236259
struct packet_sock *po = pkt_sk(sk);
237260
if (!po->running) {
238-
dev_add_pack(&po->prot_hook);
261+
if (po->fanout)
262+
__fanout_link(sk, po);
263+
else
264+
dev_add_pack(&po->prot_hook);
239265
sock_hold(sk);
240266
po->running = 1;
241267
}
@@ -253,7 +279,10 @@ static void __unregister_prot_hook(struct sock *sk, bool sync)
253279
struct packet_sock *po = pkt_sk(sk);
254280

255281
po->running = 0;
256-
__dev_remove_pack(&po->prot_hook);
282+
if (po->fanout)
283+
__fanout_unlink(sk, po);
284+
else
285+
__dev_remove_pack(&po->prot_hook);
257286
__sock_put(sk);
258287

259288
if (sync) {
@@ -388,6 +417,201 @@ static void packet_sock_destruct(struct sock *sk)
388417
sk_refcnt_debug_dec(sk);
389418
}
390419

420+
static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
421+
{
422+
int x = atomic_read(&f->rr_cur) + 1;
423+
424+
if (x >= num)
425+
x = 0;
426+
427+
return x;
428+
}
429+
430+
static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
431+
{
432+
u32 idx, hash = skb->rxhash;
433+
434+
idx = ((u64)hash * num) >> 32;
435+
436+
return f->arr[idx];
437+
}
438+
439+
static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
440+
{
441+
int cur, old;
442+
443+
cur = atomic_read(&f->rr_cur);
444+
while ((old = atomic_cmpxchg(&f->rr_cur, cur,
445+
fanout_rr_next(f, num))) != cur)
446+
cur = old;
447+
return f->arr[cur];
448+
}
449+
450+
static int packet_rcv_fanout_hash(struct sk_buff *skb, struct net_device *dev,
451+
struct packet_type *pt, struct net_device *orig_dev)
452+
{
453+
struct packet_fanout *f = pt->af_packet_priv;
454+
unsigned int num = f->num_members;
455+
struct packet_sock *po;
456+
struct sock *sk;
457+
458+
if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
459+
!num) {
460+
kfree_skb(skb);
461+
return 0;
462+
}
463+
464+
skb_get_rxhash(skb);
465+
466+
sk = fanout_demux_hash(f, skb, num);
467+
po = pkt_sk(sk);
468+
469+
return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
470+
}
471+
472+
static int packet_rcv_fanout_lb(struct sk_buff *skb, struct net_device *dev,
473+
struct packet_type *pt, struct net_device *orig_dev)
474+
{
475+
struct packet_fanout *f = pt->af_packet_priv;
476+
unsigned int num = f->num_members;
477+
struct packet_sock *po;
478+
struct sock *sk;
479+
480+
if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
481+
!num) {
482+
kfree_skb(skb);
483+
return 0;
484+
}
485+
486+
sk = fanout_demux_lb(f, skb, num);
487+
po = pkt_sk(sk);
488+
489+
return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
490+
}
491+
492+
static DEFINE_MUTEX(fanout_mutex);
493+
static LIST_HEAD(fanout_list);
494+
495+
static void __fanout_link(struct sock *sk, struct packet_sock *po)
496+
{
497+
struct packet_fanout *f = po->fanout;
498+
499+
spin_lock(&f->lock);
500+
f->arr[f->num_members] = sk;
501+
smp_wmb();
502+
f->num_members++;
503+
spin_unlock(&f->lock);
504+
}
505+
506+
static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
507+
{
508+
struct packet_fanout *f = po->fanout;
509+
int i;
510+
511+
spin_lock(&f->lock);
512+
for (i = 0; i < f->num_members; i++) {
513+
if (f->arr[i] == sk)
514+
break;
515+
}
516+
BUG_ON(i >= f->num_members);
517+
f->arr[i] = f->arr[f->num_members - 1];
518+
f->num_members--;
519+
spin_unlock(&f->lock);
520+
}
521+
522+
/*
 * Join socket @sk to the fanout group @id, creating the group if no group
 * with that ID exists yet in the socket's network namespace.
 *
 * @sk:   packet socket; its hook must currently be running
 * @id:   16-bit fanout group ID
 * @type: PACKET_FANOUT_HASH or PACKET_FANOUT_LB
 *
 * Returns 0 on success, or:
 *   -EINVAL    unknown @type, socket not running, or @type / hook
 *              protocol / hook device differ from the existing group
 *   -EALREADY  socket is already in a fanout group
 *   -ENOMEM    a new group could not be allocated
 *   -ENOSPC    the group already holds PACKET_FANOUT_MAX sockets
 */
static int fanout_add(struct sock *sk, u16 id, u8 type)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	int err;

	switch (type) {
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
		break;
	default:
		return -EINVAL;
	}

	mutex_lock(&fanout_mutex);

	/*
	 * These checks are made with fanout_mutex held so that two
	 * concurrent callers on the same socket cannot both see
	 * po->fanout == NULL and both join, which would leave a duplicate
	 * arr[] entry and an sk_ref that never drops back to zero.
	 */
	err = -EINVAL;
	if (!po->running)
		goto out;

	err = -EALREADY;
	if (po->fanout)
		goto out;

	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}

	if (!match) {
		/* First socket to use this ID: create and register the group. */
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;

		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		atomic_set(&match->rr_cur, 0);
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		atomic_set(&match->sk_ref, 0);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		switch (type) {
		case PACKET_FANOUT_HASH:
			match->prot_hook.func = packet_rcv_fanout_hash;
			break;
		case PACKET_FANOUT_LB:
			match->prot_hook.func = packet_rcv_fanout_lb;
			break;
		}
		match->prot_hook.af_packet_priv = match;
		dev_add_pack(&match->prot_hook);
		list_add(&match->list, &fanout_list);
	}

	/* Every member must agree on policy and on bind settings. */
	err = -EINVAL;
	if (match->type != type ||
	    match->prot_hook.type != po->prot_hook.type ||
	    match->prot_hook.dev != po->prot_hook.dev)
		goto out;

	err = -ENOSPC;
	if (atomic_read(&match->sk_ref) >= PACKET_FANOUT_MAX)
		goto out;

	/* Swap the socket's private hook for membership in the group. */
	__dev_remove_pack(&po->prot_hook);
	po->fanout = match;
	atomic_inc(&match->sk_ref);
	__fanout_link(sk, po);
	err = 0;

out:
	mutex_unlock(&fanout_mutex);
	return err;
}
595+
596+
static void fanout_release(struct sock *sk)
597+
{
598+
struct packet_sock *po = pkt_sk(sk);
599+
struct packet_fanout *f;
600+
601+
f = po->fanout;
602+
if (!f)
603+
return;
604+
605+
po->fanout = NULL;
606+
607+
mutex_lock(&fanout_mutex);
608+
if (atomic_dec_and_test(&f->sk_ref)) {
609+
list_del(&f->list);
610+
dev_remove_pack(&f->prot_hook);
611+
kfree(f);
612+
}
613+
mutex_unlock(&fanout_mutex);
614+
}
391615

392616
static const struct proto_ops packet_ops;
393617

@@ -1398,6 +1622,8 @@ static int packet_release(struct socket *sock)
13981622
if (po->tx_ring.pg_vec)
13991623
packet_set_ring(sk, &req, 1, 1);
14001624

1625+
fanout_release(sk);
1626+
14011627
synchronize_net();
14021628
/*
14031629
* Now the socket is dead. No more input will appear.
@@ -1421,9 +1647,9 @@ static int packet_release(struct socket *sock)
14211647
static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
14221648
{
14231649
struct packet_sock *po = pkt_sk(sk);
1424-
/*
1425-
* Detach an existing hook if present.
1426-
*/
1650+
1651+
if (po->fanout)
1652+
return -EINVAL;
14271653

14281654
lock_sock(sk);
14291655

@@ -2133,6 +2359,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
21332359
po->tp_tstamp = val;
21342360
return 0;
21352361
}
2362+
case PACKET_FANOUT:
2363+
{
2364+
int val;
2365+
2366+
if (optlen != sizeof(val))
2367+
return -EINVAL;
2368+
if (copy_from_user(&val, optval, sizeof(val)))
2369+
return -EFAULT;
2370+
2371+
return fanout_add(sk, val & 0xffff, val >> 16);
2372+
}
21362373
default:
21372374
return -ENOPROTOOPT;
21382375
}
@@ -2231,6 +2468,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
22312468
val = po->tp_tstamp;
22322469
data = &val;
22332470
break;
2471+
case PACKET_FANOUT:
2472+
if (len > sizeof(int))
2473+
len = sizeof(int);
2474+
val = (po->fanout ?
2475+
((u32)po->fanout->id |
2476+
((u32)po->fanout->type << 16)) :
2477+
0);
2478+
data = &val;
2479+
break;
22342480
default:
22352481
return -ENOPROTOOPT;
22362482
}

0 commit comments

Comments
 (0)