Skip to content

Commit 6e360f7

Browse files
committed
Merge branch 'udp-msg_zerocopy'
Willem de Bruijn says:

====================
udp msg_zerocopy

Enable MSG_ZEROCOPY for udp sockets

Patch 1/3 is the main patch, a rework of RFC patch
  http://patchwork.ozlabs.org/patch/899630/
  more details in the patch commit message

Patch 2/3 is an optimization to remove a branch from the UDP hot path
  and refcount_inc/refcount_dec_and_test pair when zerocopy is used.
  This used to be included in the first patch in v2.

Patch 3/3 runs the already existing udp zerocopy tests as part of
  kselftest

See also recent Linux Plumbers presentation
  https://linuxplumbersconf.org/event/2/contributions/106/attachments/104/128/willemdebruijn-lpc2018-udpgso-presentation-20181113.pdf

Changes:
  v1 -> v2
    - Fixup reverse christmas tree violation
  v2 -> v3
    - Split refcount avoidance optimization into separate patch
    - Fix refcount leak on error in fragmented case
      (thanks to Paolo Abeni for pointing this one out!)
    - Fix refcount inc on zero
  v3 -> v4
    - Move skb_zcopy_set below the only kfree_skb that might cause
      a premature uarg destroy before skb_zerocopy_put_abort
    - Move the entire skb_shinfo assignment block, to keep that
      cacheline access in one place
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents ce01a56 + db63e48 commit 6e360f7

File tree

9 files changed

+90
-27
lines changed

9 files changed

+90
-27
lines changed

include/linux/skbuff.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -481,10 +481,11 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg)
481481
}
482482

483483
void sock_zerocopy_put(struct ubuf_info *uarg);
484-
void sock_zerocopy_put_abort(struct ubuf_info *uarg);
484+
void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
485485

486486
void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
487487

488+
int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len);
488489
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
489490
struct msghdr *msg, int len,
490491
struct ubuf_info *uarg);
@@ -1325,10 +1326,14 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
13251326
return is_zcopy ? skb_uarg(skb) : NULL;
13261327
}
13271328

1328-
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
1329+
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
1330+
bool *have_ref)
13291331
{
13301332
if (skb && uarg && !skb_zcopy(skb)) {
1331-
sock_zerocopy_get(uarg);
1333+
if (unlikely(have_ref && *have_ref))
1334+
*have_ref = false;
1335+
else
1336+
sock_zerocopy_get(uarg);
13321337
skb_shinfo(skb)->destructor_arg = uarg;
13331338
skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
13341339
}
@@ -1373,7 +1378,7 @@ static inline void skb_zcopy_abort(struct sk_buff *skb)
13731378
struct ubuf_info *uarg = skb_zcopy(skb);
13741379

13751380
if (uarg) {
1376-
sock_zerocopy_put_abort(uarg);
1381+
sock_zerocopy_put_abort(uarg, false);
13771382
skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
13781383
}
13791384
}

net/core/skbuff.c

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,22 +1089,29 @@ void sock_zerocopy_put(struct ubuf_info *uarg)
10891089
}
10901090
EXPORT_SYMBOL_GPL(sock_zerocopy_put);
10911091

1092-
void sock_zerocopy_put_abort(struct ubuf_info *uarg)
1092+
void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
10931093
{
10941094
if (uarg) {
10951095
struct sock *sk = skb_from_uarg(uarg)->sk;
10961096

10971097
atomic_dec(&sk->sk_zckey);
10981098
uarg->len--;
10991099

1100-
sock_zerocopy_put(uarg);
1100+
if (have_uref)
1101+
sock_zerocopy_put(uarg);
11011102
}
11021103
}
11031104
EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
11041105

11051106
extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
11061107
struct iov_iter *from, size_t length);
11071108

1109+
int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
1110+
{
1111+
return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
1112+
}
1113+
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);
1114+
11081115
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
11091116
struct msghdr *msg, int len,
11101117
struct ubuf_info *uarg)
@@ -1131,7 +1138,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
11311138
return err;
11321139
}
11331140

1134-
skb_zcopy_set(skb, uarg);
1141+
skb_zcopy_set(skb, uarg, NULL);
11351142
return skb->len - orig_len;
11361143
}
11371144
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
@@ -1151,7 +1158,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
11511158
if (skb_copy_ubufs(nskb, GFP_ATOMIC))
11521159
return -EIO;
11531160
}
1154-
skb_zcopy_set(nskb, skb_uarg(orig));
1161+
skb_zcopy_set(nskb, skb_uarg(orig), NULL);
11551162
}
11561163
return 0;
11571164
}

net/core/sock.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1018,7 +1018,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
10181018

10191019
case SO_ZEROCOPY:
10201020
if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1021-
if (sk->sk_protocol != IPPROTO_TCP)
1021+
if (!((sk->sk_type == SOCK_STREAM &&
1022+
sk->sk_protocol == IPPROTO_TCP) ||
1023+
(sk->sk_type == SOCK_DGRAM &&
1024+
sk->sk_protocol == IPPROTO_UDP)))
10221025
ret = -ENOTSUPP;
10231026
} else if (sk->sk_family != PF_RDS) {
10241027
ret = -ENOTSUPP;

net/ipv4/ip_output.c

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -867,6 +867,7 @@ static int __ip_append_data(struct sock *sk,
867867
unsigned int flags)
868868
{
869869
struct inet_sock *inet = inet_sk(sk);
870+
struct ubuf_info *uarg = NULL;
870871
struct sk_buff *skb;
871872

872873
struct ip_options *opt = cork->opt;
@@ -880,8 +881,8 @@ static int __ip_append_data(struct sock *sk,
880881
int csummode = CHECKSUM_NONE;
881882
struct rtable *rt = (struct rtable *)cork->dst;
882883
unsigned int wmem_alloc_delta = 0;
884+
bool paged, extra_uref;
883885
u32 tskey = 0;
884-
bool paged;
885886

886887
skb = skb_peek_tail(queue);
887888

@@ -916,6 +917,20 @@ static int __ip_append_data(struct sock *sk,
916917
(!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
917918
csummode = CHECKSUM_PARTIAL;
918919

920+
if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
921+
uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
922+
if (!uarg)
923+
return -ENOBUFS;
924+
extra_uref = true;
925+
if (rt->dst.dev->features & NETIF_F_SG &&
926+
csummode == CHECKSUM_PARTIAL) {
927+
paged = true;
928+
} else {
929+
uarg->zerocopy = 0;
930+
skb_zcopy_set(skb, uarg, &extra_uref);
931+
}
932+
}
933+
919934
cork->length += length;
920935

921936
/* So, what's going on in the loop below?
@@ -1001,12 +1016,6 @@ static int __ip_append_data(struct sock *sk,
10011016
skb->csum = 0;
10021017
skb_reserve(skb, hh_len);
10031018

1004-
/* only the initial fragment is time stamped */
1005-
skb_shinfo(skb)->tx_flags = cork->tx_flags;
1006-
cork->tx_flags = 0;
1007-
skb_shinfo(skb)->tskey = tskey;
1008-
tskey = 0;
1009-
10101019
/*
10111020
* Find where to start putting bytes.
10121021
*/
@@ -1039,6 +1048,13 @@ static int __ip_append_data(struct sock *sk,
10391048
exthdrlen = 0;
10401049
csummode = CHECKSUM_NONE;
10411050

1051+
/* only the initial fragment is time stamped */
1052+
skb_shinfo(skb)->tx_flags = cork->tx_flags;
1053+
cork->tx_flags = 0;
1054+
skb_shinfo(skb)->tskey = tskey;
1055+
tskey = 0;
1056+
skb_zcopy_set(skb, uarg, &extra_uref);
1057+
10421058
if ((flags & MSG_CONFIRM) && !skb_prev)
10431059
skb_set_dst_pending_confirm(skb, 1);
10441060

@@ -1068,7 +1084,7 @@ static int __ip_append_data(struct sock *sk,
10681084
err = -EFAULT;
10691085
goto error;
10701086
}
1071-
} else {
1087+
} else if (!uarg || !uarg->zerocopy) {
10721088
int i = skb_shinfo(skb)->nr_frags;
10731089

10741090
err = -ENOMEM;
@@ -1098,6 +1114,10 @@ static int __ip_append_data(struct sock *sk,
10981114
skb->data_len += copy;
10991115
skb->truesize += copy;
11001116
wmem_alloc_delta += copy;
1117+
} else {
1118+
err = skb_zerocopy_iter_dgram(skb, from, copy);
1119+
if (err < 0)
1120+
goto error;
11011121
}
11021122
offset += copy;
11031123
length -= copy;
@@ -1110,6 +1130,7 @@ static int __ip_append_data(struct sock *sk,
11101130
error_efault:
11111131
err = -EFAULT;
11121132
error:
1133+
sock_zerocopy_put_abort(uarg, extra_uref);
11131134
cork->length -= length;
11141135
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
11151136
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);

net/ipv4/tcp.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1423,7 +1423,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
14231423
if (copied + copied_syn)
14241424
goto out;
14251425
out_err:
1426-
sock_zerocopy_put_abort(uarg);
1426+
sock_zerocopy_put_abort(uarg, true);
14271427
err = sk_stream_error(sk, flags, err);
14281428
/* make sure we wake any epoll edge trigger waiter */
14291429
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&

net/ipv6/ip6_output.c

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1245,6 +1245,7 @@ static int __ip6_append_data(struct sock *sk,
12451245
{
12461246
struct sk_buff *skb, *skb_prev = NULL;
12471247
unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1248+
struct ubuf_info *uarg = NULL;
12481249
int exthdrlen = 0;
12491250
int dst_exthdrlen = 0;
12501251
int hh_len;
@@ -1257,7 +1258,7 @@ static int __ip6_append_data(struct sock *sk,
12571258
int csummode = CHECKSUM_NONE;
12581259
unsigned int maxnonfragsize, headersize;
12591260
unsigned int wmem_alloc_delta = 0;
1260-
bool paged;
1261+
bool paged, extra_uref;
12611262

12621263
skb = skb_peek_tail(queue);
12631264
if (!skb) {
@@ -1322,6 +1323,20 @@ static int __ip6_append_data(struct sock *sk,
13221323
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
13231324
csummode = CHECKSUM_PARTIAL;
13241325

1326+
if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1327+
uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1328+
if (!uarg)
1329+
return -ENOBUFS;
1330+
extra_uref = true;
1331+
if (rt->dst.dev->features & NETIF_F_SG &&
1332+
csummode == CHECKSUM_PARTIAL) {
1333+
paged = true;
1334+
} else {
1335+
uarg->zerocopy = 0;
1336+
skb_zcopy_set(skb, uarg, &extra_uref);
1337+
}
1338+
}
1339+
13251340
/*
13261341
* Let's try using as much space as possible.
13271342
* Use MTU if total length of the message fits into the MTU.
@@ -1440,12 +1455,6 @@ static int __ip6_append_data(struct sock *sk,
14401455
skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
14411456
dst_exthdrlen);
14421457

1443-
/* Only the initial fragment is time stamped */
1444-
skb_shinfo(skb)->tx_flags = cork->tx_flags;
1445-
cork->tx_flags = 0;
1446-
skb_shinfo(skb)->tskey = tskey;
1447-
tskey = 0;
1448-
14491458
/*
14501459
* Find where to start putting bytes
14511460
*/
@@ -1477,6 +1486,13 @@ static int __ip6_append_data(struct sock *sk,
14771486
exthdrlen = 0;
14781487
dst_exthdrlen = 0;
14791488

1489+
/* Only the initial fragment is time stamped */
1490+
skb_shinfo(skb)->tx_flags = cork->tx_flags;
1491+
cork->tx_flags = 0;
1492+
skb_shinfo(skb)->tskey = tskey;
1493+
tskey = 0;
1494+
skb_zcopy_set(skb, uarg, &extra_uref);
1495+
14801496
if ((flags & MSG_CONFIRM) && !skb_prev)
14811497
skb_set_dst_pending_confirm(skb, 1);
14821498

@@ -1506,7 +1522,7 @@ static int __ip6_append_data(struct sock *sk,
15061522
err = -EFAULT;
15071523
goto error;
15081524
}
1509-
} else {
1525+
} else if (!uarg || !uarg->zerocopy) {
15101526
int i = skb_shinfo(skb)->nr_frags;
15111527

15121528
err = -ENOMEM;
@@ -1536,6 +1552,10 @@ static int __ip6_append_data(struct sock *sk,
15361552
skb->data_len += copy;
15371553
skb->truesize += copy;
15381554
wmem_alloc_delta += copy;
1555+
} else {
1556+
err = skb_zerocopy_iter_dgram(skb, from, copy);
1557+
if (err < 0)
1558+
goto error;
15391559
}
15401560
offset += copy;
15411561
length -= copy;
@@ -1548,6 +1568,7 @@ static int __ip6_append_data(struct sock *sk,
15481568
error_efault:
15491569
err = -EFAULT;
15501570
error:
1571+
sock_zerocopy_put_abort(uarg, extra_uref);
15511572
cork->length -= length;
15521573
IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
15531574
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);

tools/testing/selftests/net/msg_zerocopy.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -651,12 +651,13 @@ static void do_flush_datagram(int fd, int type)
651651

652652
static void do_rx(int domain, int type, int protocol)
653653
{
654+
const int cfg_receiver_wait_ms = 400;
654655
uint64_t tstop;
655656
int fd;
656657

657658
fd = do_setup_rx(domain, type, protocol);
658659

659-
tstop = gettimeofday_ms() + cfg_runtime_ms;
660+
tstop = gettimeofday_ms() + cfg_runtime_ms + cfg_receiver_wait_ms;
660661
do {
661662
if (type == SOCK_STREAM)
662663
do_flush_tcp(fd);

tools/testing/selftests/net/msg_zerocopy.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ readonly path_sysctl_mem="net.core.optmem_max"
2525
if [[ "$#" -eq "0" ]]; then
2626
$0 4 tcp -t 1
2727
$0 6 tcp -t 1
28+
$0 4 udp -t 1
29+
$0 6 udp -t 1
2830
echo "OK. All tests passed"
2931
exit 0
3032
fi

tools/testing/selftests/net/udpgso_bench.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ run_udp() {
3535

3636
echo "udp gso"
3737
run_in_netns ${args} -S 0
38+
39+
echo "udp gso zerocopy"
40+
run_in_netns ${args} -S 0 -z
3841
}
3942

4043
run_tcp() {

0 commit comments

Comments
 (0)