Skip to content

Commit 52900d2

Browse files
wdebruijdavem330
authored andcommitted
udp: elide zerocopy operation in hot path
With MSG_ZEROCOPY, each skb holds a reference to a struct ubuf_info. Release of its last reference triggers a completion notification. The TCP stack in tcp_sendmsg_locked holds an extra ref independent of the skbs, because it can build, send and free skbs within its loop, possibly reaching refcount zero and freeing the ubuf_info too soon. The UDP stack currently also takes this extra ref, but does not need it as all skbs are sent after return from __ip(6)_append_data. Avoid the extra refcount_inc and refcount_dec_and_test, and generally the sock_zerocopy_put in the common path, by passing the initial reference to the first skb. This approach is taken instead of initializing the refcount to 0, as that would generate error "refcount_t: increment on 0" on the next skb_zcopy_set. Changes v3 -> v4 - Move skb_zcopy_set below the only kfree_skb that might cause a premature uarg destroy before skb_zerocopy_put_abort - Move the entire skb_shinfo assignment block, to keep that cacheline access in one place Signed-off-by: Willem de Bruijn <willemb@google.com> Acked-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent b5947e5 commit 52900d2

File tree

5 files changed

+36
-31
lines changed

5 files changed

+36
-31
lines changed

include/linux/skbuff.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -481,7 +481,7 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg)
481481
}
482482

483483
void sock_zerocopy_put(struct ubuf_info *uarg);
484-
void sock_zerocopy_put_abort(struct ubuf_info *uarg);
484+
void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
485485

486486
void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
487487

@@ -1326,10 +1326,14 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
13261326
return is_zcopy ? skb_uarg(skb) : NULL;
13271327
}
13281328

1329-
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
1329+
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
1330+
bool *have_ref)
13301331
{
13311332
if (skb && uarg && !skb_zcopy(skb)) {
1332-
sock_zerocopy_get(uarg);
1333+
if (unlikely(have_ref && *have_ref))
1334+
*have_ref = false;
1335+
else
1336+
sock_zerocopy_get(uarg);
13331337
skb_shinfo(skb)->destructor_arg = uarg;
13341338
skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
13351339
}
@@ -1374,7 +1378,7 @@ static inline void skb_zcopy_abort(struct sk_buff *skb)
13741378
struct ubuf_info *uarg = skb_zcopy(skb);
13751379

13761380
if (uarg) {
1377-
sock_zerocopy_put_abort(uarg);
1381+
sock_zerocopy_put_abort(uarg, false);
13781382
skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
13791383
}
13801384
}

net/core/skbuff.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,15 +1089,16 @@ void sock_zerocopy_put(struct ubuf_info *uarg)
10891089
}
10901090
EXPORT_SYMBOL_GPL(sock_zerocopy_put);
10911091

1092-
void sock_zerocopy_put_abort(struct ubuf_info *uarg)
1092+
void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
10931093
{
10941094
if (uarg) {
10951095
struct sock *sk = skb_from_uarg(uarg)->sk;
10961096

10971097
atomic_dec(&sk->sk_zckey);
10981098
uarg->len--;
10991099

1100-
sock_zerocopy_put(uarg);
1100+
if (have_uref)
1101+
sock_zerocopy_put(uarg);
11011102
}
11021103
}
11031104
EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
@@ -1137,7 +1138,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
11371138
return err;
11381139
}
11391140

1140-
skb_zcopy_set(skb, uarg);
1141+
skb_zcopy_set(skb, uarg, NULL);
11411142
return skb->len - orig_len;
11421143
}
11431144
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
@@ -1157,7 +1158,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
11571158
if (skb_copy_ubufs(nskb, GFP_ATOMIC))
11581159
return -EIO;
11591160
}
1160-
skb_zcopy_set(nskb, skb_uarg(orig));
1161+
skb_zcopy_set(nskb, skb_uarg(orig), NULL);
11611162
}
11621163
return 0;
11631164
}

net/ipv4/ip_output.c

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -881,8 +881,8 @@ static int __ip_append_data(struct sock *sk,
881881
int csummode = CHECKSUM_NONE;
882882
struct rtable *rt = (struct rtable *)cork->dst;
883883
unsigned int wmem_alloc_delta = 0;
884+
bool paged, extra_uref;
884885
u32 tskey = 0;
885-
bool paged;
886886

887887
skb = skb_peek_tail(queue);
888888

@@ -921,12 +921,13 @@ static int __ip_append_data(struct sock *sk,
921921
uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
922922
if (!uarg)
923923
return -ENOBUFS;
924+
extra_uref = true;
924925
if (rt->dst.dev->features & NETIF_F_SG &&
925926
csummode == CHECKSUM_PARTIAL) {
926927
paged = true;
927928
} else {
928929
uarg->zerocopy = 0;
929-
skb_zcopy_set(skb, uarg);
930+
skb_zcopy_set(skb, uarg, &extra_uref);
930931
}
931932
}
932933

@@ -1015,13 +1016,6 @@ static int __ip_append_data(struct sock *sk,
10151016
skb->csum = 0;
10161017
skb_reserve(skb, hh_len);
10171018

1018-
/* only the initial fragment is time stamped */
1019-
skb_shinfo(skb)->tx_flags = cork->tx_flags;
1020-
cork->tx_flags = 0;
1021-
skb_shinfo(skb)->tskey = tskey;
1022-
tskey = 0;
1023-
skb_zcopy_set(skb, uarg);
1024-
10251019
/*
10261020
* Find where to start putting bytes.
10271021
*/
@@ -1054,6 +1048,13 @@ static int __ip_append_data(struct sock *sk,
10541048
exthdrlen = 0;
10551049
csummode = CHECKSUM_NONE;
10561050

1051+
/* only the initial fragment is time stamped */
1052+
skb_shinfo(skb)->tx_flags = cork->tx_flags;
1053+
cork->tx_flags = 0;
1054+
skb_shinfo(skb)->tskey = tskey;
1055+
tskey = 0;
1056+
skb_zcopy_set(skb, uarg, &extra_uref);
1057+
10571058
if ((flags & MSG_CONFIRM) && !skb_prev)
10581059
skb_set_dst_pending_confirm(skb, 1);
10591060

@@ -1124,13 +1125,12 @@ static int __ip_append_data(struct sock *sk,
11241125

11251126
if (wmem_alloc_delta)
11261127
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1127-
sock_zerocopy_put(uarg);
11281128
return 0;
11291129

11301130
error_efault:
11311131
err = -EFAULT;
11321132
error:
1133-
sock_zerocopy_put_abort(uarg);
1133+
sock_zerocopy_put_abort(uarg, extra_uref);
11341134
cork->length -= length;
11351135
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
11361136
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);

net/ipv4/tcp.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1423,7 +1423,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
14231423
if (copied + copied_syn)
14241424
goto out;
14251425
out_err:
1426-
sock_zerocopy_put_abort(uarg);
1426+
sock_zerocopy_put_abort(uarg, true);
14271427
err = sk_stream_error(sk, flags, err);
14281428
/* make sure we wake any epoll edge trigger waiter */
14291429
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&

net/ipv6/ip6_output.c

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1258,7 +1258,7 @@ static int __ip6_append_data(struct sock *sk,
12581258
int csummode = CHECKSUM_NONE;
12591259
unsigned int maxnonfragsize, headersize;
12601260
unsigned int wmem_alloc_delta = 0;
1261-
bool paged;
1261+
bool paged, extra_uref;
12621262

12631263
skb = skb_peek_tail(queue);
12641264
if (!skb) {
@@ -1327,12 +1327,13 @@ static int __ip6_append_data(struct sock *sk,
13271327
uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
13281328
if (!uarg)
13291329
return -ENOBUFS;
1330+
extra_uref = true;
13301331
if (rt->dst.dev->features & NETIF_F_SG &&
13311332
csummode == CHECKSUM_PARTIAL) {
13321333
paged = true;
13331334
} else {
13341335
uarg->zerocopy = 0;
1335-
skb_zcopy_set(skb, uarg);
1336+
skb_zcopy_set(skb, uarg, &extra_uref);
13361337
}
13371338
}
13381339

@@ -1454,13 +1455,6 @@ static int __ip6_append_data(struct sock *sk,
14541455
skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
14551456
dst_exthdrlen);
14561457

1457-
/* Only the initial fragment is time stamped */
1458-
skb_shinfo(skb)->tx_flags = cork->tx_flags;
1459-
cork->tx_flags = 0;
1460-
skb_shinfo(skb)->tskey = tskey;
1461-
tskey = 0;
1462-
skb_zcopy_set(skb, uarg);
1463-
14641458
/*
14651459
* Find where to start putting bytes
14661460
*/
@@ -1492,6 +1486,13 @@ static int __ip6_append_data(struct sock *sk,
14921486
exthdrlen = 0;
14931487
dst_exthdrlen = 0;
14941488

1489+
/* Only the initial fragment is time stamped */
1490+
skb_shinfo(skb)->tx_flags = cork->tx_flags;
1491+
cork->tx_flags = 0;
1492+
skb_shinfo(skb)->tskey = tskey;
1493+
tskey = 0;
1494+
skb_zcopy_set(skb, uarg, &extra_uref);
1495+
14951496
if ((flags & MSG_CONFIRM) && !skb_prev)
14961497
skb_set_dst_pending_confirm(skb, 1);
14971498

@@ -1562,13 +1563,12 @@ static int __ip6_append_data(struct sock *sk,
15621563

15631564
if (wmem_alloc_delta)
15641565
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1565-
sock_zerocopy_put(uarg);
15661566
return 0;
15671567

15681568
error_efault:
15691569
err = -EFAULT;
15701570
error:
1571-
sock_zerocopy_put_abort(uarg);
1571+
sock_zerocopy_put_abort(uarg, extra_uref);
15721572
cork->length -= length;
15731573
IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
15741574
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);

0 commit comments

Comments
 (0)