Skip to content

Commit 5226779

Browse files
wdebruij authored and davem330 committed
sock: add MSG_ZEROCOPY
The kernel supports zerocopy sendmsg in virtio and tap. Expand the infrastructure to support other socket types. Introduce a completion notification channel over the socket error queue. Notifications are returned with ee_origin SO_EE_ORIGIN_ZEROCOPY. ee_errno is 0 to avoid blocking the send/recv path on receiving notifications. Add reference counting, to support the skb split, merge, resize and clone operations possible with SOCK_STREAM and other socket types. The patch does not yet modify any datapaths. Signed-off-by: Willem de Bruijn <willemb@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent 3ece782 commit 5226779

File tree

7 files changed

+235
-21
lines changed

7 files changed

+235
-21
lines changed

include/linux/skbuff.h

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,7 @@ enum {
429429
SKBTX_SCHED_TSTAMP = 1 << 6,
430430
};
431431

432+
#define SKBTX_ZEROCOPY_FRAG (SKBTX_DEV_ZEROCOPY | SKBTX_SHARED_FRAG)
432433
#define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \
433434
SKBTX_SCHED_TSTAMP)
434435
#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP)
@@ -445,8 +446,28 @@ struct ubuf_info {
445446
void (*callback)(struct ubuf_info *, bool zerocopy_success);
446447
void *ctx;
447448
unsigned long desc;
449+
u16 zerocopy:1;
450+
atomic_t refcnt;
448451
};
449452

453+
#define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
454+
455+
struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
456+
457+
static inline void sock_zerocopy_get(struct ubuf_info *uarg)
458+
{
459+
atomic_inc(&uarg->refcnt);
460+
}
461+
462+
void sock_zerocopy_put(struct ubuf_info *uarg);
463+
void sock_zerocopy_put_abort(struct ubuf_info *uarg);
464+
465+
void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
466+
467+
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
468+
struct msghdr *msg, int len,
469+
struct ubuf_info *uarg);
470+
450471
/* This data is invariant across clones and lives at
451472
* the end of the header data, ie. at skb->end.
452473
*/
@@ -1214,6 +1235,45 @@ static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb)
12141235
return &skb_shinfo(skb)->hwtstamps;
12151236
}
12161237

1238+
static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
1239+
{
1240+
bool is_zcopy = skb && skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY;
1241+
1242+
return is_zcopy ? skb_uarg(skb) : NULL;
1243+
}
1244+
1245+
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
1246+
{
1247+
if (skb && uarg && !skb_zcopy(skb)) {
1248+
sock_zerocopy_get(uarg);
1249+
skb_shinfo(skb)->destructor_arg = uarg;
1250+
skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
1251+
}
1252+
}
1253+
1254+
/* Release a reference on a zerocopy structure */
1255+
static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy)
1256+
{
1257+
struct ubuf_info *uarg = skb_zcopy(skb);
1258+
1259+
if (uarg) {
1260+
uarg->zerocopy = uarg->zerocopy && zerocopy;
1261+
sock_zerocopy_put(uarg);
1262+
skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
1263+
}
1264+
}
1265+
1266+
/* Abort a zerocopy operation and revert zckey on error in send syscall */
1267+
static inline void skb_zcopy_abort(struct sk_buff *skb)
1268+
{
1269+
struct ubuf_info *uarg = skb_zcopy(skb);
1270+
1271+
if (uarg) {
1272+
sock_zerocopy_put_abort(uarg);
1273+
skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
1274+
}
1275+
}
1276+
12171277
/**
12181278
* skb_queue_empty - check if a queue is empty
12191279
* @list: queue head

include/linux/socket.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,7 @@ struct ucred {
287287
#define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */
288288
#define MSG_EOF MSG_FIN
289289

290+
#define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */
290291
#define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */
291292
#define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file
292293
descriptor received through

include/net/sock.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,7 @@ struct sock_common {
294294
* @sk_stamp: time stamp of last packet received
295295
* @sk_tsflags: SO_TIMESTAMPING socket options
296296
* @sk_tskey: counter to disambiguate concurrent tstamp requests
297+
* @sk_zckey: counter to order MSG_ZEROCOPY notifications
297298
* @sk_socket: Identd and reporting IO signals
298299
* @sk_user_data: RPC layer private data
299300
* @sk_frag: cached page frag
@@ -462,6 +463,7 @@ struct sock {
462463
u16 sk_tsflags;
463464
u8 sk_shutdown;
464465
u32 sk_tskey;
466+
atomic_t sk_zckey;
465467
struct socket *sk_socket;
466468
void *sk_user_data;
467469
#ifdef CONFIG_SECURITY

include/uapi/linux/errqueue.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,13 @@ struct sock_extended_err {
1818
#define SO_EE_ORIGIN_ICMP 2
1919
#define SO_EE_ORIGIN_ICMP6 3
2020
#define SO_EE_ORIGIN_TXSTATUS 4
21+
#define SO_EE_ORIGIN_ZEROCOPY 5
2122
#define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS
2223

2324
#define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1))
2425

26+
#define SO_EE_CODE_ZEROCOPY_COPIED 1
27+
2528
/**
2629
* struct scm_timestamping - timestamps exposed through cmsg
2730
*

net/core/datagram.c

Lines changed: 34 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -573,27 +573,12 @@ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
573573
}
574574
EXPORT_SYMBOL(skb_copy_datagram_from_iter);
575575

576-
/**
577-
* zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
578-
* @skb: buffer to copy
579-
* @from: the source to copy from
580-
*
581-
* The function will first copy up to headlen, and then pin the userspace
582-
* pages and build frags through them.
583-
*
584-
* Returns 0, -EFAULT or -EMSGSIZE.
585-
*/
586-
int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
576+
int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
577+
struct iov_iter *from, size_t length)
587578
{
588-
int len = iov_iter_count(from);
589-
int copy = min_t(int, skb_headlen(skb), len);
590-
int frag = 0;
579+
int frag = skb_shinfo(skb)->nr_frags;
591580

592-
/* copy up to skb headlen */
593-
if (skb_copy_datagram_from_iter(skb, 0, from, copy))
594-
return -EFAULT;
595-
596-
while (iov_iter_count(from)) {
581+
while (length && iov_iter_count(from)) {
597582
struct page *pages[MAX_SKB_FRAGS];
598583
size_t start;
599584
ssize_t copied;
@@ -603,18 +588,24 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
603588
if (frag == MAX_SKB_FRAGS)
604589
return -EMSGSIZE;
605590

606-
copied = iov_iter_get_pages(from, pages, ~0U,
591+
copied = iov_iter_get_pages(from, pages, length,
607592
MAX_SKB_FRAGS - frag, &start);
608593
if (copied < 0)
609594
return -EFAULT;
610595

611596
iov_iter_advance(from, copied);
597+
length -= copied;
612598

613599
truesize = PAGE_ALIGN(copied + start);
614600
skb->data_len += copied;
615601
skb->len += copied;
616602
skb->truesize += truesize;
617-
refcount_add(truesize, &skb->sk->sk_wmem_alloc);
603+
if (sk && sk->sk_type == SOCK_STREAM) {
604+
sk->sk_wmem_queued += truesize;
605+
sk_mem_charge(sk, truesize);
606+
} else {
607+
refcount_add(truesize, &skb->sk->sk_wmem_alloc);
608+
}
618609
while (copied) {
619610
int size = min_t(int, copied, PAGE_SIZE - start);
620611
skb_fill_page_desc(skb, frag++, pages[n], start, size);
@@ -625,6 +616,28 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
625616
}
626617
return 0;
627618
}
619+
EXPORT_SYMBOL(__zerocopy_sg_from_iter);
620+
621+
/**
622+
* zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
623+
* @skb: buffer to copy
624+
* @from: the source to copy from
625+
*
626+
* The function will first copy up to headlen, and then pin the userspace
627+
* pages and build frags through them.
628+
*
629+
* Returns 0, -EFAULT or -EMSGSIZE.
630+
*/
631+
int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
632+
{
633+
int copy = min_t(int, skb_headlen(skb), iov_iter_count(from));
634+
635+
/* copy up to skb headlen */
636+
if (skb_copy_datagram_from_iter(skb, 0, from, copy))
637+
return -EFAULT;
638+
639+
return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
640+
}
628641
EXPORT_SYMBOL(zerocopy_sg_from_iter);
629642

630643
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,

net/core/skbuff.c

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -915,6 +915,139 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
915915
}
916916
EXPORT_SYMBOL_GPL(skb_morph);
917917

918+
/* Allocate a zerocopy notification structure for @sk.
 *
 * The ubuf_info lives in the control buffer (cb) of an skb obtained from
 * sock_omalloc(). Its desc is set to the value of sk->sk_zckey before the
 * increment, its refcnt starts at zero, and a reference is taken on @sk.
 * @size is not used by this function.
 *
 * Returns the new ubuf_info, or NULL if the skb allocation fails.
 */
struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
{
	struct sk_buff *notif_skb;
	struct ubuf_info *uarg;

	WARN_ON_ONCE(!in_task());

	notif_skb = sock_omalloc(sk, 0, GFP_KERNEL);
	if (!notif_skb)
		return NULL;

	/* The ubuf_info must fit inside the skb control buffer. */
	BUILD_BUG_ON(sizeof(*uarg) > sizeof(notif_skb->cb));
	uarg = (void *)notif_skb->cb;

	uarg->callback = sock_zerocopy_callback;
	uarg->desc = atomic_inc_return(&sk->sk_zckey) - 1;
	uarg->zerocopy = 1;
	atomic_set(&uarg->refcnt, 0);

	sock_hold(sk);
	return uarg;
}
940+
EXPORT_SYMBOL_GPL(sock_zerocopy_alloc);
941+
942+
static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
943+
{
944+
return container_of((void *)uarg, struct sk_buff, cb);
945+
}
946+
947+
/* Completion callback for a zerocopy send.
 *
 * @uarg:    notification structure embedded in the cb of its own skb
 * @success: false if the data had to be copied instead of sent zerocopy
 *
 * Unless the socket is flagged SOCK_DEAD, the skb that carries @uarg is
 * turned into an error-queue notification with ee_errno 0, origin
 * SO_EE_ORIGIN_ZEROCOPY and the notification id in ee_data, queued on
 * sk_error_queue, and the socket's error report hook is invoked. If the
 * socket is dead the skb is freed instead. In both cases one socket
 * reference is dropped with sock_put().
 */
void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
{
	struct sk_buff *skb = skb_from_uarg(uarg);
	struct sock_exterr_skb *serr;
	struct sock *sk = skb->sk;
	/* Keep the full counter value: desc is filled from the atomic_t
	 * sk_zckey counter, and a u16 here would make notification ids
	 * repeat after 65536 sends.
	 */
	u32 id = uarg->desc;

	if (sock_flag(sk, SOCK_DEAD))
		goto release;

	serr = SKB_EXT_ERR(skb);
	memset(serr, 0, sizeof(*serr));
	serr->ee.ee_errno = 0;
	serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
	serr->ee.ee_data = id;
	if (!success)
		serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;

	skb_queue_tail(&sk->sk_error_queue, skb);
	/* The error queue now owns the skb; do not free it below. */
	skb = NULL;

	sk->sk_error_report(sk);

release:
	consume_skb(skb);
	sock_put(sk);
}
974+
EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
975+
976+
void sock_zerocopy_put(struct ubuf_info *uarg)
977+
{
978+
if (uarg && atomic_dec_and_test(&uarg->refcnt)) {
979+
if (uarg->callback)
980+
uarg->callback(uarg, uarg->zerocopy);
981+
else
982+
consume_skb(skb_from_uarg(uarg));
983+
}
984+
}
985+
EXPORT_SYMBOL_GPL(sock_zerocopy_put);
986+
987+
void sock_zerocopy_put_abort(struct ubuf_info *uarg)
988+
{
989+
if (uarg) {
990+
struct sock *sk = skb_from_uarg(uarg)->sk;
991+
992+
atomic_dec(&sk->sk_zckey);
993+
994+
/* sock_zerocopy_put expects a ref. Most sockets take one per
995+
* skb, which is zero on abort. tcp_sendmsg holds one extra, to
996+
* avoid an skb send inside the main loop triggering uarg free.
997+
*/
998+
if (sk->sk_type != SOCK_STREAM)
999+
atomic_inc(&uarg->refcnt);
1000+
1001+
sock_zerocopy_put(uarg);
1002+
}
1003+
}
1004+
EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
1005+
1006+
extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
1007+
struct iov_iter *from, size_t length);
1008+
1009+
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
1010+
struct msghdr *msg, int len,
1011+
struct ubuf_info *uarg)
1012+
{
1013+
struct iov_iter orig_iter = msg->msg_iter;
1014+
int err, orig_len = skb->len;
1015+
1016+
err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
1017+
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
1018+
/* Streams do not free skb on error. Reset to prev state. */
1019+
msg->msg_iter = orig_iter;
1020+
___pskb_trim(skb, orig_len);
1021+
return err;
1022+
}
1023+
1024+
skb_zcopy_set(skb, uarg);
1025+
return skb->len - orig_len;
1026+
}
1027+
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
1028+
1029+
/* unused only until next patch in the series; will remove attribute */
1030+
static int __attribute__((unused))
1031+
skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
1032+
gfp_t gfp_mask)
1033+
{
1034+
if (skb_zcopy(orig)) {
1035+
if (skb_zcopy(nskb)) {
1036+
/* !gfp_mask callers are verified to !skb_zcopy(nskb) */
1037+
if (!gfp_mask) {
1038+
WARN_ON_ONCE(1);
1039+
return -ENOMEM;
1040+
}
1041+
if (skb_uarg(nskb) == skb_uarg(orig))
1042+
return 0;
1043+
if (skb_copy_ubufs(nskb, GFP_ATOMIC))
1044+
return -EIO;
1045+
}
1046+
skb_zcopy_set(nskb, skb_uarg(orig));
1047+
}
1048+
return 0;
1049+
}
1050+
9181051
/**
9191052
* skb_copy_ubufs - copy userspace skb frags buffers to kernel
9201053
* @skb: the skb to modify

net/core/sock.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1670,6 +1670,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
16701670
atomic_set(&newsk->sk_drops, 0);
16711671
newsk->sk_send_head = NULL;
16721672
newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1673+
atomic_set(&newsk->sk_zckey, 0);
16731674

16741675
sock_reset_flag(newsk, SOCK_DONE);
16751676

@@ -2722,6 +2723,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
27222723
sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
27232724

27242725
sk->sk_stamp = SK_DEFAULT_STAMP;
2726+
atomic_set(&sk->sk_zckey, 0);
27252727

27262728
#ifdef CONFIG_NET_RX_BUSY_POLL
27272729
sk->sk_napi_id = 0;

0 commit comments

Comments
 (0)