diff options
author | David S. Miller <davem@davemloft.net> | 2018-02-17 00:04:18 +0300 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2018-02-17 00:04:18 +0300 |
commit | 80c6d2b8d83b6f2f72bd17d69e2e9fca76e4637a (patch) | |
tree | 484a63f508f2d92be4c2dd7796382748e7f75f57 | |
parent | ee99b2d8bf4ad6d03046a8c2f25bad7cfd9de64a (diff) | |
parent | dfb8434b0a94651dac7adf373900561d8f6499e4 (diff) | |
download | linux-80c6d2b8d83b6f2f72bd17d69e2e9fca76e4637a.tar.xz |
Merge branch 'RDS-zerocopy-support'
Sowmini Varadhan says:
====================
RDS: zerocopy support
This is version 3 of the series, following up on review comments for
http://patchwork.ozlabs.org/project/netdev/list/?series=28530
Review comments addressed
Patch 4
- fix fragile use of skb->cb[], do not set ee_code incorrectly.
Patch 5:
- remove needless bzero of skb->cb[], consolidate err cleanup
A brief overview of this feature follows.
This patch series provides support for MSG_ZERCOCOPY
on a PF_RDS socket based on the APIs and infrastructure added
by Commit f214f915e7db ("tcp: enable MSG_ZEROCOPY")
For single threaded rds-stress testing using rds-tcp with the
ixgbe driver using 1M message sizes (-a 1M -q 1M) preliminary
results show that there is a significant reduction in latency: about
90 usec with zerocopy, compared with 200 usec without zerocopy.
This patchset modifies the above for zerocopy in the following manner.
- if the MSG_ZEROCOPY flag is specified with rds_sendmsg(), and,
- if the SO_ZEROCOPY socket option has been set on the PF_RDS socket,
application pages sent down with rds_sendmsg are pinned. The pinning
uses the accounting infrastructure added by a91dbff551a6 ("sock: ulimit
on MSG_ZEROCOPY pages"). The message is unpinned when all references
to the message go down to 0, and the message is freed by rds_message_purge.
A multithreaded application using this infrastructure must send down
a unique 32 bit cookie as ancillary data with each sendmsg invocation.
The format of this ancillary data is described in Patch 5 of the series.
The cookie is passed up to the application on the sk_error_queue when
the message is unpinned, indicating to the application that it is now
safe to free/reuse the message buffer. The details of the completion
notification are provided in Patch 4 of this series.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/linux/skbuff.h | 3 | ||||
-rw-r--r-- | include/uapi/linux/errqueue.h | 2 | ||||
-rw-r--r-- | include/uapi/linux/rds.h | 1 | ||||
-rw-r--r-- | net/core/skbuff.c | 6 | ||||
-rw-r--r-- | net/core/sock.c | 25 | ||||
-rw-r--r-- | net/rds/af_rds.c | 2 | ||||
-rw-r--r-- | net/rds/message.c | 132 | ||||
-rw-r--r-- | net/rds/rds.h | 17 | ||||
-rw-r--r-- | net/rds/recv.c | 2 | ||||
-rw-r--r-- | net/rds/send.c | 51 | ||||
-rw-r--r-- | tools/testing/selftests/net/msg_zerocopy.c | 133 |
11 files changed, 339 insertions, 35 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5ebc0f869720..b1cc38af53e1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -466,6 +466,9 @@ struct ubuf_info { #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) +int mm_account_pinned_pages(struct mmpin *mmp, size_t size); +void mm_unaccount_pinned_pages(struct mmpin *mmp); + struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size); struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg); diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index dc64cfaf13da..28812eda4209 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -20,11 +20,13 @@ struct sock_extended_err { #define SO_EE_ORIGIN_ICMP6 3 #define SO_EE_ORIGIN_TXSTATUS 4 #define SO_EE_ORIGIN_ZEROCOPY 5 +#define SO_EE_ORIGIN_ZCOOKIE 6 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) #define SO_EE_CODE_ZEROCOPY_COPIED 1 +#define SO_EE_ORIGIN_MAX_ZCOOKIES 8 /** * struct scm_timestamping - timestamps exposed through cmsg diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index e71d4491f225..12e3bca32cad 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -103,6 +103,7 @@ #define RDS_CMSG_MASKED_ATOMIC_FADD 8 #define RDS_CMSG_MASKED_ATOMIC_CSWP 9 #define RDS_CMSG_RXPATH_LATENCY 11 +#define RDS_CMSG_ZCOPY_COOKIE 12 #define RDS_INFO_FIRST 10000 #define RDS_INFO_COUNTERS 10000 diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 09bd89c90a71..1a7485a2cdfa 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -890,7 +890,7 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) } EXPORT_SYMBOL_GPL(skb_morph); -static int mm_account_pinned_pages(struct mmpin *mmp, size_t size) +int mm_account_pinned_pages(struct mmpin *mmp, size_t size) { unsigned long max_pg, num_pg, new_pg, old_pg; struct user_struct *user; @@ -919,14 +919,16 @@ static int mm_account_pinned_pages(struct mmpin *mmp, size_t size) return 0; } +EXPORT_SYMBOL_GPL(mm_account_pinned_pages); -static void mm_unaccount_pinned_pages(struct mmpin *mmp) +void mm_unaccount_pinned_pages(struct mmpin *mmp) { if (mmp->user) { atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); free_uid(mmp->user); } } +EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) { diff --git a/net/core/sock.c b/net/core/sock.c index e90d461748f0..a1fa4a548f1b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1049,18 +1049,21 @@ set_rcvbuf: break; case SO_ZEROCOPY: - if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { + if (sk->sk_protocol != IPPROTO_TCP) + ret = -ENOTSUPP; + else if (sk->sk_state != TCP_CLOSE) + ret = -EBUSY; + } else if (sk->sk_family != PF_RDS) { ret = -ENOTSUPP; - else if (sk->sk_protocol != IPPROTO_TCP) - ret = -ENOTSUPP; - else if (sk->sk_state != TCP_CLOSE) - ret = -EBUSY; - else if (val < 0 || val > 1) - ret = -EINVAL; - else - sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); - break; - + } + if (!ret) { + if (val < 0 || val > 1) + ret = -EINVAL; + else + sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); + break; + } default: ret = -ENOPROTOOPT; break; diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 0a8eefd256b3..a937f18896ae 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -182,6 +182,8 @@ static __poll_t rds_poll(struct file *file, struct socket *sock, mask |= (EPOLLIN | EPOLLRDNORM); if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (EPOLLOUT | EPOLLWRNORM); + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; read_unlock_irqrestore(&rs->rs_recv_lock, flags); /* clear state any time we wake a seen-congested socket */ diff --git a/net/rds/message.c b/net/rds/message.c index 4318cc9b78f7..651834513481 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -33,6 +33,9 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/skbuff.h> +#include <linux/list.h> +#include <linux/errqueue.h> #include "rds.h" @@ -53,20 +56,92 @@ void rds_message_addref(struct rds_message *rm) } EXPORT_SYMBOL_GPL(rds_message_addref); +static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + int ncookies; + u32 *ptr; + + if (serr->ee.ee_origin != SO_EE_ORIGIN_ZCOOKIE) + return false; + ncookies = serr->ee.ee_data; + if (ncookies == SO_EE_ORIGIN_MAX_ZCOOKIES) + return false; + ptr = skb_put(skb, sizeof(u32)); + *ptr = cookie; + serr->ee.ee_data = ++ncookies; + return true; +} + +static void rds_rm_zerocopy_callback(struct rds_sock *rs, + struct rds_znotifier *znotif) +{ + struct sock *sk = rds_rs_to_sk(rs); + struct sk_buff *skb, *tail; + struct sock_exterr_skb *serr; + unsigned long flags; + struct sk_buff_head *q; + u32 cookie = znotif->z_cookie; + + q = &sk->sk_error_queue; + spin_lock_irqsave(&q->lock, flags); + tail = skb_peek_tail(q); + + if (tail && skb_zcookie_add(tail, cookie)) { + spin_unlock_irqrestore(&q->lock, flags); + mm_unaccount_pinned_pages(&znotif->z_mmp); + consume_skb(rds_skb_from_znotifier(znotif)); + sk->sk_error_report(sk); + return; + } + + skb = rds_skb_from_znotifier(znotif); + serr = SKB_EXT_ERR(skb); + memset(&serr->ee, 0, sizeof(serr->ee)); + serr->ee.ee_errno = 0; + serr->ee.ee_origin = SO_EE_ORIGIN_ZCOOKIE; + serr->ee.ee_info = 0; + WARN_ON(!skb_zcookie_add(skb, cookie)); + + __skb_queue_tail(q, skb); + + spin_unlock_irqrestore(&q->lock, flags); + sk->sk_error_report(sk); + + mm_unaccount_pinned_pages(&znotif->z_mmp); +} + /* * This relies on dma_map_sg() not touching sg[].page during merging. */ static void rds_message_purge(struct rds_message *rm) { - unsigned long i; + unsigned long i, flags; + bool zcopy = false; if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) return; + spin_lock_irqsave(&rm->m_rs_lock, flags); + if (rm->m_rs) { + struct rds_sock *rs = rm->m_rs; + + if (rm->data.op_mmp_znotifier) { + zcopy = true; + rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier); + rm->data.op_mmp_znotifier = NULL; + } + sock_put(rds_rs_to_sk(rs)); + rm->m_rs = NULL; + } + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + for (i = 0; i < rm->data.op_nents; i++) { - rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i])); /* XXX will have to put_page for page refs */ - __free_page(sg_page(&rm->data.op_sg[i])); + if (!zcopy) + __free_page(sg_page(&rm->data.op_sg[i])); + else + put_page(sg_page(&rm->data.op_sg[i])); } rm->data.op_nents = 0; @@ -266,12 +341,14 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in return rm; } -int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from) +int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, + bool zcopy) { unsigned long to_copy, nbytes; unsigned long sg_off; struct scatterlist *sg; int ret = 0; + int length = iov_iter_count(from); rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from)); @@ -281,6 +358,53 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from) sg = rm->data.op_sg; sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ + if (zcopy) { + int total_copied = 0; + struct sk_buff *skb; + + skb = alloc_skb(SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(u32), + GFP_KERNEL); + if (!skb) + return -ENOMEM; + rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb); + if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp, + length)) { + ret = -ENOMEM; + goto err; + } + while (iov_iter_count(from)) { + struct page *pages; + size_t start; + ssize_t copied; + + copied = iov_iter_get_pages(from, &pages, PAGE_SIZE, + 1, &start); + if (copied < 0) { + struct mmpin *mmp; + int i; + + for (i = 0; i < rm->data.op_nents; i++) + put_page(sg_page(&rm->data.op_sg[i])); + mmp = &rm->data.op_mmp_znotifier->z_mmp; + mm_unaccount_pinned_pages(mmp); + ret = -EFAULT; + goto err; + } + total_copied += copied; + iov_iter_advance(from, copied); + length -= copied; + sg_set_page(sg, pages, copied, start); + rm->data.op_nents++; + sg++; + } + WARN_ON_ONCE(length != 0); + return ret; +err: + consume_skb(skb); + rm->data.op_mmp_znotifier = NULL; + return ret; + } /* zcopy */ + while (iov_iter_count(from)) { if (!sg_page(sg)) { ret = rds_page_remainder_alloc(sg, iov_iter_count(from), diff --git a/net/rds/rds.h b/net/rds/rds.h index 7301b9b01890..31cd38852050 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -356,6 +356,19 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) #define RDS_MSG_PAGEVEC 7 #define RDS_MSG_FLUSH 8 +struct rds_znotifier { + struct list_head z_list; + struct mmpin z_mmp; + u32 z_cookie; +}; + +#define RDS_ZCOPY_SKB(__skb) ((struct rds_znotifier *)&((__skb)->cb[0])) + +static inline struct sk_buff *rds_skb_from_znotifier(struct rds_znotifier *z) +{ + return container_of((void *)z, struct sk_buff, cb); +} + struct rds_message { refcount_t m_refcount; struct list_head m_sock_item; @@ -436,6 +449,7 @@ struct rds_message { unsigned int op_count; unsigned int op_dmasg; unsigned int op_dmaoff; + struct rds_znotifier *op_mmp_znotifier; struct scatterlist *op_sg; } data; }; @@ -771,7 +785,8 @@ rds_conn_connecting(struct rds_connection *conn) /* message.c */ struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); -int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from); +int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, + bool zcopy); struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); void rds_message_populate_header(struct rds_header *hdr, __be16 sport, __be16 dport, u64 seq); diff --git a/net/rds/recv.c b/net/rds/recv.c index b25bcfe411ca..b080961464df 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -594,6 +594,8 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (msg_flags & MSG_OOB) goto out; + if (msg_flags & MSG_ERRQUEUE) + return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR); while (1) { /* If there are pending notifications, do those - and nothing else */ diff --git a/net/rds/send.c b/net/rds/send.c index b1b0022b8370..028ab598ac1b 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -649,7 +649,6 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status) rm->rdma.op_notifier = NULL; } was_on_sock = 1; - rm->m_rs = NULL; } spin_unlock(&rs->rs_lock); @@ -756,9 +755,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) */ if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { spin_unlock_irqrestore(&cp->cp_lock, flags); - spin_lock_irqsave(&rm->m_rs_lock, flags); - rm->m_rs = NULL; - spin_unlock_irqrestore(&rm->m_rs_lock, flags); continue; } list_del_init(&rm->m_conn_item); @@ -774,7 +770,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); spin_unlock(&rs->rs_lock); - rm->m_rs = NULL; spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); @@ -798,7 +793,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); spin_unlock(&rs->rs_lock); - rm->m_rs = NULL; spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); @@ -849,6 +843,7 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, list_add_tail(&rm->m_sock_item, &rs->rs_send_queue); set_bit(RDS_MSG_ON_SOCK, &rm->m_flags); rds_message_addref(rm); + sock_hold(rds_rs_to_sk(rs)); rm->m_rs = rs; /* The code ordering is a little weird, but we're @@ -880,12 +875,13 @@ out: * rds_message is getting to be quite complicated, and we'd like to allocate * it all in one go. This figures out how big it needs to be up front. */ -static int rds_rm_size(struct msghdr *msg, int data_len) +static int rds_rm_size(struct msghdr *msg, int num_sgs) { struct cmsghdr *cmsg; int size = 0; int cmsg_groups = 0; int retval; + bool zcopy_cookie = false; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) @@ -904,6 +900,8 @@ static int rds_rm_size(struct msghdr *msg, int data_len) break; + case RDS_CMSG_ZCOPY_COOKIE: + zcopy_cookie = true; case RDS_CMSG_RDMA_DEST: case RDS_CMSG_RDMA_MAP: cmsg_groups |= 2; @@ -924,7 +922,10 @@ static int rds_rm_size(struct msghdr *msg, int data_len) } - size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); + if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie) + return -EINVAL; + + size += num_sgs * sizeof(struct scatterlist); /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ if (cmsg_groups == 3) @@ -933,6 +934,18 @@ static int rds_rm_size(struct msghdr *msg, int data_len) return size; } +static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + u32 *cookie; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie))) + return -EINVAL; + cookie = CMSG_DATA(cmsg); + rm->data.op_mmp_znotifier->z_cookie = *cookie; + return 0; +} + static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, struct msghdr *msg, int *allocated_mr) { @@ -975,6 +988,10 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, ret = rds_cmsg_atomic(rs, rm, cmsg); break; + case RDS_CMSG_ZCOPY_COOKIE: + ret = rds_cmsg_zcopy(rs, rm, cmsg); + break; + default: return -EINVAL; } @@ -1045,10 +1062,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) long timeo = sock_sndtimeo(sk, nonblock); struct rds_conn_path *cpath; size_t total_payload_len = payload_len, rdma_payload_len = 0; + bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && + sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); + int num_sgs = ceil(payload_len, PAGE_SIZE); /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ - if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) { + if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT | MSG_ZEROCOPY)) { ret = -EOPNOTSUPP; goto out; } @@ -1092,8 +1112,15 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) goto out; } + if (zcopy) { + if (rs->rs_transport->t_type != RDS_TRANS_TCP) { + ret = -EOPNOTSUPP; + goto out; + } + num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX); + } /* size of rm including all sgs */ - ret = rds_rm_size(msg, payload_len); + ret = rds_rm_size(msg, num_sgs); if (ret < 0) goto out; @@ -1105,12 +1132,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) /* Attach data to the rm */ if (payload_len) { - rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); + rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); if (!rm->data.op_sg) { ret = -ENOMEM; goto out; } - ret = rds_message_copy_from_user(rm, &msg->msg_iter); + ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy); if (ret) goto out; } diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c index e11fe84de0fd..5cc2a53bb71c 100644 --- a/tools/testing/selftests/net/msg_zerocopy.c +++ b/tools/testing/selftests/net/msg_zerocopy.c @@ -14,6 +14,9 @@ * - SOCK_DGRAM * - SOCK_RAW * + * PF_RDS + * - SOCK_SEQPACKET + * * Start this program on two connected hosts, one in send mode and * the other with option '-r' to put it in receiver mode. * @@ -53,6 +56,7 @@ #include <sys/types.h> #include <sys/wait.h> #include <unistd.h> +#include <linux/rds.h> #ifndef SO_EE_ORIGIN_ZEROCOPY #define SO_EE_ORIGIN_ZEROCOPY 5 @@ -164,17 +168,39 @@ static int do_accept(int fd) return fd; } -static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy) +static void add_zcopy_cookie(struct msghdr *msg, uint32_t cookie) +{ + struct cmsghdr *cm; + + if (!msg->msg_control) + error(1, errno, "NULL cookie"); + cm = (void *)msg->msg_control; + cm->cmsg_len = CMSG_LEN(sizeof(cookie)); + cm->cmsg_level = SOL_RDS; + cm->cmsg_type = RDS_CMSG_ZCOPY_COOKIE; + memcpy(CMSG_DATA(cm), &cookie, sizeof(cookie)); +} + +static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy, int domain) { int ret, len, i, flags; + static uint32_t cookie; + char ckbuf[CMSG_SPACE(sizeof(cookie))]; len = 0; for (i = 0; i < msg->msg_iovlen; i++) len += msg->msg_iov[i].iov_len; flags = MSG_DONTWAIT; - if (do_zerocopy) + if (do_zerocopy) { flags |= MSG_ZEROCOPY; + if (domain == PF_RDS) { + memset(&msg->msg_control, 0, sizeof(msg->msg_control)); + msg->msg_controllen = CMSG_SPACE(sizeof(cookie)); + msg->msg_control = (struct cmsghdr *)ckbuf; + add_zcopy_cookie(msg, ++cookie); + } + } ret = sendmsg(fd, msg, flags); if (ret == -1 && errno == EAGAIN) @@ -190,6 +216,10 @@ static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy) if (do_zerocopy && ret) expected_completions++; } + if (do_zerocopy && domain == PF_RDS) { + msg->msg_control = NULL; + msg->msg_controllen = 0; + } return true; } @@ -216,7 +246,9 @@ static void do_sendmsg_corked(int fd, struct msghdr *msg) msg->msg_iov[0].iov_len = payload_len + extra_len; extra_len = 0; - do_sendmsg(fd, msg, do_zerocopy); + do_sendmsg(fd, msg, do_zerocopy, + (cfg_dst_addr.ss_family == AF_INET ? + PF_INET : PF_INET6)); } do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0); @@ -300,13 +332,38 @@ static int do_setup_tx(int domain, int type, int protocol) if (cfg_zerocopy) do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1); - if (domain != PF_PACKET) + if (domain != PF_PACKET && domain != PF_RDS) if (connect(fd, (void *) &cfg_dst_addr, cfg_alen)) error(1, errno, "connect"); + if (domain == PF_RDS) { + if (bind(fd, (void *) &cfg_src_addr, cfg_alen)) + error(1, errno, "bind"); + } + return fd; } +static int do_process_zerocopy_cookies(struct sock_extended_err *serr, + uint32_t *ckbuf, size_t nbytes) +{ + int ncookies, i; + + if (serr->ee_errno != 0) + error(1, 0, "serr: wrong error code: %u", serr->ee_errno); + ncookies = serr->ee_data; + if (ncookies > SO_EE_ORIGIN_MAX_ZCOOKIES) + error(1, 0, "Returned %d cookies, max expected %d\n", + ncookies, SO_EE_ORIGIN_MAX_ZCOOKIES); + if (nbytes != ncookies * sizeof(uint32_t)) + error(1, 0, "Expected %d cookies, got %ld\n", + ncookies, nbytes/sizeof(uint32_t)); + for (i = 0; i < ncookies; i++) + if (cfg_verbose >= 2) + fprintf(stderr, "%d\n", ckbuf[i]); + return ncookies; +} + static bool do_recv_completion(int fd) { struct sock_extended_err *serr; @@ -315,10 +372,17 @@ static bool do_recv_completion(int fd) uint32_t hi, lo, range; int ret, zerocopy; char control[100]; + uint32_t ckbuf[SO_EE_ORIGIN_MAX_ZCOOKIES]; + struct iovec iov; msg.msg_control = control; msg.msg_controllen = sizeof(control); + iov.iov_base = ckbuf; + iov.iov_len = (SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(ckbuf[0])); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + ret = recvmsg(fd, &msg, MSG_ERRQUEUE); if (ret == -1 && errno == EAGAIN) return false; @@ -337,6 +401,11 @@ static bool do_recv_completion(int fd) cm->cmsg_level, cm->cmsg_type); serr = (void *) CMSG_DATA(cm); + + if (serr->ee_origin == SO_EE_ORIGIN_ZCOOKIE) { + completions += do_process_zerocopy_cookies(serr, ckbuf, ret); + return true; + } if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) error(1, 0, "serr: wrong origin: %u", serr->ee_origin); if (serr->ee_errno != 0) @@ -444,6 +513,13 @@ static void do_tx(int domain, int type, int protocol) msg.msg_iovlen++; } + if (domain == PF_RDS) { + msg.msg_name = &cfg_dst_addr; + msg.msg_namelen = (cfg_dst_addr.ss_family == AF_INET ? + sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6)); + } + iov[2].iov_base = payload; iov[2].iov_len = cfg_payload_len; msg.msg_iovlen++; @@ -454,7 +530,7 @@ static void do_tx(int domain, int type, int protocol) if (cfg_cork) do_sendmsg_corked(fd, &msg); else - do_sendmsg(fd, &msg, cfg_zerocopy); + do_sendmsg(fd, &msg, cfg_zerocopy, domain); while (!do_poll(fd, POLLOUT)) { if (cfg_zerocopy) @@ -555,6 +631,40 @@ static void do_flush_datagram(int fd, int type) bytes += cfg_payload_len; } + +static void do_recvmsg(int fd) +{ + int ret, off = 0; + char *buf; + struct iovec iov; + struct msghdr msg; + struct sockaddr_storage din; + + buf = calloc(cfg_payload_len, sizeof(char)); + iov.iov_base = buf; + iov.iov_len = cfg_payload_len; + + memset(&msg, 0, sizeof(msg)); + msg.msg_name = &din; + msg.msg_namelen = sizeof(din); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ret = recvmsg(fd, &msg, MSG_TRUNC); + + if (ret == -1) + error(1, errno, "recv"); + if (ret != cfg_payload_len) + error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len); + + if (memcmp(buf + off, payload, ret)) + error(1, 0, "recv: data mismatch"); + + free(buf); + packets++; + bytes += cfg_payload_len; +} + static void do_rx(int domain, int type, int protocol) { uint64_t tstop; @@ -566,6 +676,8 @@ static void do_rx(int domain, int type, int protocol) do { if (type == SOCK_STREAM) do_flush_tcp(fd); + else if (domain == PF_RDS) + do_recvmsg(fd); else do_flush_datagram(fd, type); @@ -610,6 +722,7 @@ static void parse_opts(int argc, char **argv) 40 /* max tcp options */; int c; char *daddr = NULL, *saddr = NULL; + char *cfg_test; cfg_payload_len = max_payload_len; @@ -667,6 +780,14 @@ static void parse_opts(int argc, char **argv) break; } } + + cfg_test = argv[argc - 1]; + if (strcmp(cfg_test, "rds") == 0) { + if (!daddr) + error(1, 0, "-D <server addr> required for PF_RDS\n"); + if (!cfg_rx && !saddr) + error(1, 0, "-S <client addr> required for PF_RDS\n"); + } setup_sockaddr(cfg_family, daddr, &cfg_dst_addr); setup_sockaddr(cfg_family, saddr, &cfg_src_addr); @@ -699,6 +820,8 @@ int main(int argc, char **argv) do_test(cfg_family, SOCK_STREAM, 0); else if (!strcmp(cfg_test, "udp")) do_test(cfg_family, SOCK_DGRAM, 0); + else if (!strcmp(cfg_test, "rds")) + do_test(PF_RDS, SOCK_SEQPACKET, 0); else error(1, 0, "unknown cfg_test %s", cfg_test); |