From 719f8bcc883e7992615f4d5625922e24995e2d98 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 13 Aug 2012 17:03:00 -0400 Subject: svcrpc: fix xpt_list traversal locking on shutdown Server threads are not running at this point, but svc_age_temp_xprts still may be, so we need this locking. Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index bac973a31367..e1810b947dea 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -917,16 +917,18 @@ void svc_close_xprt(struct svc_xprt *xprt) } EXPORT_SYMBOL_GPL(svc_close_xprt); -static void svc_close_list(struct list_head *xprt_list, struct net *net) +static void svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, struct net *net) { struct svc_xprt *xprt; + spin_lock(&serv->sv_lock); list_for_each_entry(xprt, xprt_list, xpt_list) { if (xprt->xpt_net != net) continue; set_bit(XPT_CLOSE, &xprt->xpt_flags); set_bit(XPT_BUSY, &xprt->xpt_flags); } + spin_unlock(&serv->sv_lock); } static void svc_clear_pools(struct svc_serv *serv, struct net *net) @@ -949,24 +951,28 @@ static void svc_clear_pools(struct svc_serv *serv, struct net *net) } } -static void svc_clear_list(struct list_head *xprt_list, struct net *net) +static void svc_clear_list(struct svc_serv *serv, struct list_head *xprt_list, struct net *net) { struct svc_xprt *xprt; struct svc_xprt *tmp; + LIST_HEAD(victims); + spin_lock(&serv->sv_lock); list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) { if (xprt->xpt_net != net) continue; - svc_delete_xprt(xprt); + list_move(&xprt->xpt_list, &victims); } - list_for_each_entry(xprt, xprt_list, xpt_list) - BUG_ON(xprt->xpt_net == net); + spin_unlock(&serv->sv_lock); + + list_for_each_entry_safe(xprt, tmp, &victims, xpt_list) + svc_delete_xprt(xprt); } void svc_close_net(struct svc_serv *serv, struct net *net) { - svc_close_list(&serv->sv_tempsocks, net); - svc_close_list(&serv->sv_permsocks, net); + svc_close_list(serv, &serv->sv_tempsocks, net); + svc_close_list(serv, &serv->sv_permsocks, net); svc_clear_pools(serv, net); /* @@ -974,8 +980,8 @@ void svc_close_net(struct svc_serv *serv, struct net *net) * svc_xprt_enqueue will not add new entries without taking the * sp_lock and checking XPT_BUSY. */ - svc_clear_list(&serv->sv_tempsocks, net); - svc_clear_list(&serv->sv_permsocks, net); + svc_clear_list(serv, &serv->sv_tempsocks, net); + svc_clear_list(serv, &serv->sv_permsocks, net); } /* -- cgit v1.2.3 From 72c3537607e42928f13691d59579ec840014b19e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 13 Aug 2012 17:46:17 -0400 Subject: svcrpc: standardize svc_setup_socket return convention Use the kernel-standard ptr-or-error return convention instead of passing a pointer to the error. Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 998aa8c1807c..d028b51a69ad 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -59,7 +59,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, - int *errp, int flags); + int flags); static void svc_udp_data_ready(struct sock *, int); static int svc_udp_recvfrom(struct svc_rqst *); static int svc_udp_sendto(struct svc_rqst *); @@ -900,8 +900,9 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) */ newsock->sk->sk_sndtimeo = HZ*30; - if (!(newsvsk = svc_setup_socket(serv, newsock, &err, - (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY)))) + newsvsk = svc_setup_socket(serv, newsock, + (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY)); + if (IS_ERR(newsvsk)) goto failed; svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen); err = kernel_getsockname(newsock, sin, &slen); @@ -1383,29 +1384,29 @@ EXPORT_SYMBOL_GPL(svc_sock_update_bufs); */ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct socket *sock, - int *errp, int flags) + int flags) { struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); + int err = 0; dprintk("svc: svc_setup_socket %p\n", sock); - if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { - *errp = -ENOMEM; - return NULL; - } + svsk = kzalloc(sizeof(*svsk), GFP_KERNEL); + if (!svsk) + return ERR_PTR(-ENOMEM); inet = sock->sk; /* Register socket with portmapper */ - if (*errp >= 0 && pmap_register) - *errp = svc_register(serv, sock_net(sock->sk), inet->sk_family, + if (pmap_register) + err = svc_register(serv, sock_net(sock->sk), inet->sk_family, inet->sk_protocol, ntohs(inet_sk(inet)->inet_sport)); - if (*errp < 0) { + if (err < 0) { kfree(svsk); - return NULL; + return ERR_PTR(err); } inet->sk_user_data = svsk; @@ -1463,10 +1464,12 @@ int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, else { if (!try_module_get(THIS_MODULE)) err = -ENOENT; - else - svsk = svc_setup_socket(serv, so, &err, - SVC_SOCK_DEFAULTS); - if (svsk) { + else { + svsk = svc_setup_socket(serv, so, SVC_SOCK_DEFAULTS); + if (IS_ERR(svsk)) + err = PTR_ERR(svsk); + } + if (err == 0) { struct sockaddr_storage addr; struct sockaddr *sin = (struct sockaddr *)&addr; int salen; @@ -1563,11 +1566,12 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, goto bummer; } - if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { + svsk = svc_setup_socket(serv, sock, flags); + if (!IS_ERR(svsk)) { svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen); return (struct svc_xprt *)svsk; } - + error = PTR_ERR(svsk); bummer: dprintk("svc: svc_create_socket error = %d\n", -error); sock_release(sock); -- cgit v1.2.3 From a8e10078a87c8a2c3c8d0f9856c0f74272fc0f74 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 13 Aug 2012 18:01:03 -0400 Subject: svcrpc: clean up control flow Mainly, use the kernel standard err = -ERROR; if (something_bad) goto out; normal case; rather than if (something_bad) err = -ERROR else { normal case; } Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 69 ++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index d028b51a69ad..bf10b723f429 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1451,44 +1451,42 @@ int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, int err = 0; struct socket *so = sockfd_lookup(fd, &err); struct svc_sock *svsk = NULL; + struct sockaddr_storage addr; + struct sockaddr *sin = (struct sockaddr *)&addr; + int salen; if (!so) return err; + err = -EAFNOSUPPORT; if ((so->sk->sk_family != PF_INET) && (so->sk->sk_family != PF_INET6)) - err = -EAFNOSUPPORT; - else if (so->sk->sk_protocol != IPPROTO_TCP && + goto out; + err = -EPROTONOSUPPORT; + if (so->sk->sk_protocol != IPPROTO_TCP && so->sk->sk_protocol != IPPROTO_UDP) - err = -EPROTONOSUPPORT; - else if (so->state > SS_UNCONNECTED) - err = -EISCONN; - else { - if (!try_module_get(THIS_MODULE)) - err = -ENOENT; - else { - svsk = svc_setup_socket(serv, so, SVC_SOCK_DEFAULTS); - if (IS_ERR(svsk)) - err = PTR_ERR(svsk); - } - if (err == 0) { - struct sockaddr_storage addr; - struct sockaddr *sin = (struct sockaddr *)&addr; - int salen; - if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0) - svc_xprt_set_local(&svsk->sk_xprt, sin, salen); - clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); - spin_lock_bh(&serv->sv_lock); - list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks); - spin_unlock_bh(&serv->sv_lock); - svc_xprt_received(&svsk->sk_xprt); - err = 0; - } else - module_put(THIS_MODULE); - } - if (err) { - sockfd_put(so); - return err; + goto out; + err = -EISCONN; + if (so->state > SS_UNCONNECTED) + goto out; + err = -ENOENT; + if (!try_module_get(THIS_MODULE)) + goto out; + svsk = svc_setup_socket(serv, so, SVC_SOCK_DEFAULTS); + if (IS_ERR(svsk)) { + module_put(THIS_MODULE); + err = PTR_ERR(svsk); + goto out; } + if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0) + svc_xprt_set_local(&svsk->sk_xprt, sin, salen); + clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); + spin_lock_bh(&serv->sv_lock); + list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks); + spin_unlock_bh(&serv->sv_lock); + svc_xprt_received(&svsk->sk_xprt); return svc_one_sock_name(svsk, name_return, len); +out: + sockfd_put(so); + return err; } EXPORT_SYMBOL_GPL(svc_addsock); @@ -1567,11 +1565,12 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, } svsk = svc_setup_socket(serv, sock, flags); - if (!IS_ERR(svsk)) { - svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen); - return (struct svc_xprt *)svsk; + if (IS_ERR(svsk)) { + error = PTR_ERR(svsk); + goto bummer; } - error = PTR_ERR(svsk); + svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen); + return (struct svc_xprt *)svsk; bummer: dprintk("svc: svc_create_socket error = %d\n", -error); sock_release(sock); -- cgit v1.2.3 From c3341966943284ab3618a1814cefd693ad9aa736 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 14 Aug 2012 15:27:23 -0400 Subject: svcrpc: make svc_create_xprt enqueue on clearing XPT_BUSY Whenever we clear XPT_BUSY we should call svc_xprt_enqueue(). Without that we may fail to notice any events (such as new connections) that arrived while XPT_BUSY was set. Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index e1810b947dea..4801fdac2c9d 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -238,7 +238,7 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, list_add(&newxprt->xpt_list, &serv->sv_permsocks); spin_unlock_bh(&serv->sv_lock); newport = svc_xprt_local_port(newxprt); - clear_bit(XPT_BUSY, &newxprt->xpt_flags); + svc_xprt_received(newxprt); return newport; } err: -- cgit v1.2.3 From 39b553013719fe6495cf5e496b827b2d712e4265 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 14 Aug 2012 15:50:34 -0400 Subject: svcrpc: share some setup of listening sockets There's some duplicate code here. Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 1 + net/sunrpc/svc_xprt.c | 16 ++++++++++------ net/sunrpc/svcsock.c | 6 +----- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index b3f64b12f141..73c7a68667ea 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -124,6 +124,7 @@ struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, struct net *net, const sa_family_t af, const unsigned short port); int svc_xprt_names(struct svc_serv *serv, char *buf, const int buflen); +void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *xprt); static inline void svc_xprt_get(struct svc_xprt *xprt) { diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 4801fdac2c9d..ee15663798b3 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -208,6 +208,15 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, return xcl->xcl_ops->xpo_create(serv, net, sap, len, flags); } +void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new) +{ + clear_bit(XPT_TEMP, &new->xpt_flags); + spin_lock_bh(&serv->sv_lock); + list_add(&new->xpt_list, &serv->sv_permsocks); + spin_unlock_bh(&serv->sv_lock); + svc_xprt_received(new); +} + int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, struct net *net, const int family, const unsigned short port, int flags) @@ -232,13 +241,8 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, module_put(xcl->xcl_owner); return PTR_ERR(newxprt); } - - clear_bit(XPT_TEMP, &newxprt->xpt_flags); - spin_lock_bh(&serv->sv_lock); - list_add(&newxprt->xpt_list, &serv->sv_permsocks); - spin_unlock_bh(&serv->sv_lock); + svc_add_new_perm_xprt(serv, newxprt); newport = svc_xprt_local_port(newxprt); - svc_xprt_received(newxprt); return newport; } err: diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index bf10b723f429..c7a7b14f54ed 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1478,11 +1478,7 @@ int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, } if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0) svc_xprt_set_local(&svsk->sk_xprt, sin, salen); - clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); - spin_lock_bh(&serv->sv_lock); - list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks); - spin_unlock_bh(&serv->sv_lock); - svc_xprt_received(&svsk->sk_xprt); + svc_add_new_perm_xprt(serv, &svsk->sk_xprt); return svc_one_sock_name(svsk, name_return, len); out: sockfd_put(so); -- cgit v1.2.3 From f23abfdb94fda3108441530cb4a813088d3f9176 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 17 Aug 2012 20:32:27 -0400 Subject: svcrpc: minor udp code cleanup Order the code in a more boring way. Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c7a7b14f54ed..06ae8a755349 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -620,10 +620,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) if (!svc_udp_get_dest_address(rqstp, cmh)) { net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n", cmh->cmsg_level, cmh->cmsg_type); -out_free: - trace_kfree_skb(skb, svc_udp_recvfrom); - skb_free_datagram_locked(svsk->sk_sk, skb); - return 0; + goto out_free; } rqstp->rq_daddrlen = svc_addr_len(svc_daddr(rqstp)); @@ -662,6 +659,10 @@ out_free: serv->sv_stats->netudpcnt++; return len; +out_free: + trace_kfree_skb(skb, svc_udp_recvfrom); + skb_free_datagram_locked(svsk->sk_sk, skb); + return 0; } static int -- cgit v1.2.3 From af6d572134b012ca92c4efc8a2f1cadbe5d01064 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 21 Aug 2012 17:22:11 -0400 Subject: svcrpc: don't bother checking bad svc_addr_len result None of the callers should see an unsupported address family (only one of them even bothers to check for that case), so just check for the buggy case in svc_addr_len and don't bother elsewhere. Acked-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 3 +-- net/sunrpc/svcsock.c | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 73c7a68667ea..193dddab6511 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -167,8 +167,7 @@ static inline size_t svc_addr_len(const struct sockaddr *sa) case AF_INET6: return sizeof(struct sockaddr_in6); } - - return 0; + BUG(); } static inline unsigned short svc_xprt_local_port(const struct svc_xprt *xprt) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 06ae8a755349..406688baac57 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -601,8 +601,6 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) return -EAGAIN; } len = svc_addr_len(svc_addr(rqstp)); - if (len == 0) - return -EAFNOSUPPORT; rqstp->rq_addrlen = len; if (skb->tstamp.tv64 == 0) { skb->tstamp = ktime_get_real(); -- cgit v1.2.3 From 9f9d2ebe693a98d517257e1a39f61120b4473b96 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 17 Aug 2012 21:35:24 -0400 Subject: svcrpc: make xpo_recvfrom return only >=0 The only errors returned from xpo_recvfrom have been -EAGAIN and -EAFNOSUPPORT. The latter was removed by a previous patch. That leaves only -EAGAIN, which is treated just like 0 by the caller (svc_recv). So, just ditch -EAGAIN and return 0 instead. Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 2 +- net/sunrpc/svcsock.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index ee15663798b3..3e317307e288 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -743,7 +743,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) svc_xprt_received(xprt); /* No data, incomplete (TCP) read, or accept() */ - if (len == 0 || len == -EAGAIN) + if (len <= 0) goto out; clear_bit(XPT_OLD, &xprt->xpt_flags); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 406688baac57..7aee54c3fe46 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -598,7 +598,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) dprintk("svc: recvfrom returned error %d\n", -err); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); } - return -EAGAIN; + return 0; } len = svc_addr_len(svc_addr(rqstp)); rqstp->rq_addrlen = len; @@ -1174,13 +1174,13 @@ error: if (len != -EAGAIN) goto err_other; dprintk("RPC: TCP recvfrom got EAGAIN\n"); - return -EAGAIN; + return 0; err_other: printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", svsk->sk_xprt.xpt_server->sv_name, -len); set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); err_noclose: - return -EAGAIN; /* record not complete */ + return 0; /* record not complete */ } /* -- cgit v1.2.3 From 6741019c829ecfa6f7a504fae1305dcf5d5cf057 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 17 Aug 2012 22:12:19 -0400 Subject: svcrpc: make svc_xprt_received static Note this isn't used outside svc_xprt.c. May as well move it so we don't need a declaration while we're here. Also remove an outdated comment. Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 1 - net/sunrpc/svc_xprt.c | 41 ++++++++++++++++---------------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 4 ---- 3 files changed, 20 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 193dddab6511..b05963f09ebf 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -114,7 +114,6 @@ void svc_xprt_init(struct net *, struct svc_xprt_class *, struct svc_xprt *, int svc_create_xprt(struct svc_serv *, const char *, struct net *, const int, const unsigned short, int); void svc_xprt_enqueue(struct svc_xprt *xprt); -void svc_xprt_received(struct svc_xprt *); void svc_xprt_put(struct svc_xprt *xprt); void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt); void svc_close_xprt(struct svc_xprt *xprt); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 3e317307e288..295e6ed21ca0 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -208,6 +208,26 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, return xcl->xcl_ops->xpo_create(serv, net, sap, len, flags); } +/* + * svc_xprt_received conditionally queues the transport for processing + * by another thread. The caller must hold the XPT_BUSY bit and must + * not thereafter touch transport data. + * + * Note: XPT_DATA only gets cleared when a read-attempt finds no (or + * insufficient) data. + */ +static void svc_xprt_received(struct svc_xprt *xprt) +{ + BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags)); + /* As soon as we clear busy, the xprt could be closed and + * 'put', so we need a reference to call svc_xprt_enqueue with: + */ + svc_xprt_get(xprt); + clear_bit(XPT_BUSY, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); +} + void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new) { clear_bit(XPT_TEMP, &new->xpt_flags); @@ -398,27 +418,6 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) return xprt; } -/* - * svc_xprt_received conditionally queues the transport for processing - * by another thread. The caller must hold the XPT_BUSY bit and must - * not thereafter touch transport data. - * - * Note: XPT_DATA only gets cleared when a read-attempt finds no (or - * insufficient) data. - */ -void svc_xprt_received(struct svc_xprt *xprt) -{ - BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags)); - /* As soon as we clear busy, the xprt could be closed and - * 'put', so we need a reference to call svc_xprt_enqueue with: - */ - svc_xprt_get(xprt); - clear_bit(XPT_BUSY, &xprt->xpt_flags); - svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); -} -EXPORT_SYMBOL_GPL(svc_xprt_received); - /** * svc_reserve - change the space reserved for the reply to a request. * @rqstp: The request in question diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 73b428bef598..62e4f9bcc387 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -578,10 +578,6 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird) list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q); spin_unlock_bh(&listen_xprt->sc_lock); - /* - * Can't use svc_xprt_received here because we are not on a - * rqstp thread - */ set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags); svc_xprt_enqueue(&listen_xprt->sc_xprt); } -- cgit v1.2.3 From 6797fa5a018ff916a071c6265fbf043644abcd29 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 18 Aug 2012 15:33:51 -0400 Subject: svcrpc: break up svc_recv Matter of taste, I suppose, but svc_recv breaks up naturally into: allocate pages and setup arg dequeue (wait for, if necessary) next socket do something with that socket And I find it easier to read when it doesn't go on for pages and pages. Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 103 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 295e6ed21ca0..6ebc9a95bbab 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -568,33 +568,12 @@ static void svc_check_conn_limits(struct svc_serv *serv) } } -/* - * Receive the next request on any transport. This code is carefully - * organised not to touch any cachelines in the shared svc_serv - * structure, only cachelines in the local svc_pool. - */ -int svc_recv(struct svc_rqst *rqstp, long timeout) +int svc_alloc_arg(struct svc_rqst *rqstp) { - struct svc_xprt *xprt = NULL; - struct svc_serv *serv = rqstp->rq_server; - struct svc_pool *pool = rqstp->rq_pool; - int len, i; - int pages; - struct xdr_buf *arg; - DECLARE_WAITQUEUE(wait, current); - long time_left; - - dprintk("svc: server %p waiting for data (to = %ld)\n", - rqstp, timeout); - - if (rqstp->rq_xprt) - printk(KERN_ERR - "svc_recv: service %p, transport not NULL!\n", - rqstp); - if (waitqueue_active(&rqstp->rq_wait)) - printk(KERN_ERR - "svc_recv: service %p, wait queue active!\n", - rqstp); + struct svc_serv *serv = rqstp->rq_server; + struct xdr_buf *arg; + int pages; + int i; /* now allocate needed pages. If we get a failure, sleep briefly */ pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; @@ -624,11 +603,15 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) arg->page_len = (pages-2)*PAGE_SIZE; arg->len = (pages-1)*PAGE_SIZE; arg->tail[0].iov_len = 0; + return 0; +} - try_to_freeze(); - cond_resched(); - if (signalled() || kthread_should_stop()) - return -EINTR; +struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) +{ + struct svc_xprt *xprt; + struct svc_pool *pool = rqstp->rq_pool; + DECLARE_WAITQUEUE(wait, current); + long time_left; /* Normally we will wait up to 5 seconds for any required * cache information to be provided. @@ -666,7 +649,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (kthread_should_stop()) { set_current_state(TASK_RUNNING); spin_unlock_bh(&pool->sp_lock); - return -EINTR; + return ERR_PTR(-EINTR); } add_wait_queue(&rqstp->rq_wait, &wait); @@ -687,19 +670,25 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) spin_unlock_bh(&pool->sp_lock); dprintk("svc: server %p, no data yet\n", rqstp); if (signalled() || kthread_should_stop()) - return -EINTR; + return ERR_PTR(-EINTR); else - return -EAGAIN; + return ERR_PTR(-EAGAIN); } } spin_unlock_bh(&pool->sp_lock); + return xprt; +} + +static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) +{ + struct svc_serv *serv = rqstp->rq_server; + int len = 0; - len = 0; if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { dprintk("svc_recv: found XPT_CLOSE\n"); svc_delete_xprt(xprt); /* Leave XPT_BUSY set on the dead xprt: */ - goto out; + return 0; } if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) { struct svc_xprt *newxpt; @@ -727,8 +716,9 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) svc_xprt_received(newxpt); } } else if (xprt->xpt_ops->xpo_has_wspace(xprt)) { + /* XPT_DATA|XPT_DEFERRED case: */ dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", - rqstp, pool->sp_id, xprt, + rqstp, rqstp->rq_pool->sp_id, xprt, atomic_read(&xprt->xpt_ref.refcount)); rqstp->rq_deferred = svc_deferred_dequeue(xprt); if (rqstp->rq_deferred) @@ -739,7 +729,48 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); } + /* clear XPT_BUSY: */ svc_xprt_received(xprt); + return len; +} + +/* + * Receive the next request on any transport. This code is carefully + * organised not to touch any cachelines in the shared svc_serv + * structure, only cachelines in the local svc_pool. + */ +int svc_recv(struct svc_rqst *rqstp, long timeout) +{ + struct svc_xprt *xprt = NULL; + struct svc_serv *serv = rqstp->rq_server; + int len, err; + + dprintk("svc: server %p waiting for data (to = %ld)\n", + rqstp, timeout); + + if (rqstp->rq_xprt) + printk(KERN_ERR + "svc_recv: service %p, transport not NULL!\n", + rqstp); + if (waitqueue_active(&rqstp->rq_wait)) + printk(KERN_ERR + "svc_recv: service %p, wait queue active!\n", + rqstp); + + err = svc_alloc_arg(rqstp); + if (err) + return err; + + try_to_freeze(); + cond_resched(); + if (signalled() || kthread_should_stop()) + return -EINTR; + + xprt = svc_get_next_xprt(rqstp, timeout); + if (IS_ERR(xprt)) + return PTR_ERR(xprt); + + len = svc_handle_xprt(rqstp, xprt); /* No data, incomplete (TCP) read, or accept() */ if (len <= 0) -- cgit v1.2.3 From 65b2e6656bda2ad983727fcc725ac66b6d5035a7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 18 Aug 2012 15:44:33 -0400 Subject: svcrpc: split up svc_handle_xprt Move initialization of newly accepted socket into a helper. Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 47 +++++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 6ebc9a95bbab..194d865fae72 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -679,6 +679,23 @@ struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) return xprt; } +void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt) +{ + spin_lock_bh(&serv->sv_lock); + set_bit(XPT_TEMP, &newxpt->xpt_flags); + list_add(&newxpt->xpt_list, &serv->sv_tempsocks); + serv->sv_tmpcnt++; + if (serv->sv_temptimer.function == NULL) { + /* setup timer to age temp transports */ + setup_timer(&serv->sv_temptimer, svc_age_temp_xprts, + (unsigned long)serv); + mod_timer(&serv->sv_temptimer, + jiffies + svc_conn_age_period * HZ); + } + spin_unlock_bh(&serv->sv_lock); + svc_xprt_received(newxpt); +} + static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) { struct svc_serv *serv = rqstp->rq_server; @@ -692,29 +709,15 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) } if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) { struct svc_xprt *newxpt; + /* + * We know this module_get will succeed because the + * listener holds a reference too + */ + __module_get(xprt->xpt_class->xcl_owner); + svc_check_conn_limits(xprt->xpt_server); newxpt = xprt->xpt_ops->xpo_accept(xprt); - if (newxpt) { - /* - * We know this module_get will succeed because the - * listener holds a reference too - */ - __module_get(newxpt->xpt_class->xcl_owner); - svc_check_conn_limits(xprt->xpt_server); - spin_lock_bh(&serv->sv_lock); - set_bit(XPT_TEMP, &newxpt->xpt_flags); - list_add(&newxpt->xpt_list, &serv->sv_tempsocks); - serv->sv_tmpcnt++; - if (serv->sv_temptimer.function == NULL) { - /* setup timer to age temp transports */ - setup_timer(&serv->sv_temptimer, - svc_age_temp_xprts, - (unsigned long)serv); - mod_timer(&serv->sv_temptimer, - jiffies + svc_conn_age_period * HZ); - } - spin_unlock_bh(&serv->sv_lock); - svc_xprt_received(newxpt); - } + if (newxpt) + svc_add_new_temp_xprt(serv, newxpt); } else if (xprt->xpt_ops->xpo_has_wspace(xprt)) { /* XPT_DATA|XPT_DEFERRED case: */ dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", -- cgit v1.2.3 From 43def35c1030d91a7414936c7c1b416828b20afb Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Fri, 10 Aug 2012 15:52:06 +0200 Subject: net/9p: Check errno validity While working on a modified server I had the Linux clients crash a few times. This lead me to find this: Some error codes are directly extracted from the server replies. A malformed server reply could contain an invalid error code, with a very large value. If this value is then passed to ERR_PTR() it will not be properly detected as an error code by IS_ERR() and as a result the kernel will dereference an invalid pointer. This patch tries to avoid this. Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/client.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index 8260f132b32e..34d417670935 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -76,6 +76,20 @@ inline int p9_is_proto_dotu(struct p9_client *clnt) } EXPORT_SYMBOL(p9_is_proto_dotu); +/* + * Some error codes are taken directly from the server replies, + * make sure they are valid. + */ +static int safe_errno(int err) +{ + if ((err > 0) || (err < -MAX_ERRNO)) { + p9_debug(P9_DEBUG_ERROR, "Invalid error code %d\n", err); + return -EPROTO; + } + return err; +} + + /* Interpret mount option for protocol version */ static int get_protocol_version(char *s) { @@ -782,7 +796,7 @@ again: return req; reterr: p9_free_req(c, req); - return ERR_PTR(err); + return ERR_PTR(safe_errno(err)); } /** @@ -865,7 +879,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, return req; reterr: p9_free_req(c, req); - return ERR_PTR(err); + return ERR_PTR(safe_errno(err)); } static struct p9_fid *p9_fid_create(struct p9_client *clnt) -- cgit v1.2.3 From eccf50c129686de11358093839749c83f6cae5db Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 15 Aug 2012 18:07:43 -0400 Subject: nfsd: remove unused listener-removal interfaces You can use nfsd/portlist to give nfsd additional sockets to listen on. In theory you can also remove listening sockets this way. But nobody's ever done that as far as I can tell. Also this was partially broken in 2.6.25, by a217813f9067b785241cb7f31956e51d2071703a "knfsd: Support adding transports by writing portlist file". (Note that we decide whether to take the "delfd" case by checking for a digit--but what's actually expected in that case is something made by svc_one_sock_name(), which won't begin with a digit.) So, let's just rip out this stuff. Acked-by: NeilBrown Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 78 ------------------------------------------ include/linux/sunrpc/svcsock.h | 3 -- net/sunrpc/svcsock.c | 51 --------------------------- 3 files changed, 132 deletions(-) (limited to 'net') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index e41a08ffbe0a..dab350dfc376 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -682,25 +682,6 @@ static ssize_t __write_ports_addfd(char *buf) return err; } -/* - * A '-' followed by the 'name' of a socket means we close the socket. - */ -static ssize_t __write_ports_delfd(char *buf) -{ - char *toclose; - int len = 0; - - toclose = kstrdup(buf + 1, GFP_KERNEL); - if (toclose == NULL) - return -ENOMEM; - - if (nfsd_serv != NULL) - len = svc_sock_names(nfsd_serv, buf, - SIMPLE_TRANSACTION_LIMIT, toclose); - kfree(toclose); - return len; -} - /* * A transport listener is added by writing it's transport name and * a port number. @@ -746,31 +727,6 @@ out_err: return err; } -/* - * A transport listener is removed by writing a "-", it's transport - * name, and it's port number. - */ -static ssize_t __write_ports_delxprt(char *buf) -{ - struct svc_xprt *xprt; - char transport[16]; - int port; - - if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2) - return -EINVAL; - - if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL) - return -EINVAL; - - xprt = svc_find_xprt(nfsd_serv, transport, &init_net, AF_UNSPEC, port); - if (xprt == NULL) - return -ENOTCONN; - - svc_close_xprt(xprt); - svc_xprt_put(xprt); - return 0; -} - static ssize_t __write_ports(struct file *file, char *buf, size_t size) { if (size == 0) @@ -779,15 +735,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) if (isdigit(buf[0])) return __write_ports_addfd(buf); - if (buf[0] == '-' && isdigit(buf[1])) - return __write_ports_delfd(buf); - if (isalpha(buf[0])) return __write_ports_addxprt(buf); - if (buf[0] == '-' && isalpha(buf[1])) - return __write_ports_delxprt(buf); - return -EINVAL; } @@ -825,21 +775,6 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) * OR * * Input: - * buf: C string containing a "-" followed - * by an integer value representing a - * previously passed in socket file - * descriptor - * size: non-zero length of C string in @buf - * Output: - * On success: NFS service no longer listens on that socket; - * passed-in buffer filled with a '\n'-terminated C - * string containing a unique name of the listener; - * return code is the size in bytes of the string - * On error: return code is a negative errno value - * - * OR - * - * Input: * buf: C string containing a transport * name and an unsigned integer value * representing the port to listen on, @@ -848,19 +783,6 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) * Output: * On success: returns zero; NFS service is started * On error: return code is a negative errno value - * - * OR - * - * Input: - * buf: C string containing a "-" followed - * by a transport name and an unsigned - * integer value representing the port - * to listen on, separated by whitespace - * size: non-zero length of C string in @buf - * Output: - * On success: returns zero; NFS service no longer listens - * on that transport - * On error: return code is a negative errno value */ static ssize_t write_ports(struct file *file, char *buf, size_t size) { diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index cb4ac69e1f33..92ad02f0dcc0 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -39,9 +39,6 @@ int svc_recv(struct svc_rqst *, long); int svc_send(struct svc_rqst *); void svc_drop(struct svc_rqst *); void svc_sock_update_bufs(struct svc_serv *serv); -int svc_sock_names(struct svc_serv *serv, char *buf, - const size_t buflen, - const char *toclose); int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, const size_t len); void svc_init_xprt_sock(void); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 7aee54c3fe46..03827cef1fa7 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -305,57 +305,6 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining) return len; } -/** - * svc_sock_names - construct a list of listener names in a string - * @serv: pointer to RPC service - * @buf: pointer to a buffer to fill in with socket names - * @buflen: size of the buffer to be filled - * @toclose: pointer to '\0'-terminated C string containing the name - * of a listener to be closed - * - * Fills in @buf with a '\n'-separated list of names of listener - * sockets. If @toclose is not NULL, the socket named by @toclose - * is closed, and is not included in the output list. - * - * Returns positive length of the socket name string, or a negative - * errno value on error. - */ -int svc_sock_names(struct svc_serv *serv, char *buf, const size_t buflen, - const char *toclose) -{ - struct svc_sock *svsk, *closesk = NULL; - int len = 0; - - if (!serv) - return 0; - - spin_lock_bh(&serv->sv_lock); - list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) { - int onelen = svc_one_sock_name(svsk, buf + len, buflen - len); - if (onelen < 0) { - len = onelen; - break; - } - if (toclose && strcmp(toclose, buf + len) == 0) { - closesk = svsk; - svc_xprt_get(&closesk->sk_xprt); - } else - len += onelen; - } - spin_unlock_bh(&serv->sv_lock); - - if (closesk) { - /* Should unregister with portmap, but you cannot - * unregister just one protocol... - */ - svc_close_xprt(&closesk->sk_xprt); - svc_xprt_put(&closesk->sk_xprt); - } else if (toclose) - return -ENOENT; - return len; -} -EXPORT_SYMBOL_GPL(svc_sock_names); - /* * Check input queue length */ -- cgit v1.2.3 From 0462194d358c2e040282d4d1a4fd1aab84417e42 Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Mon, 17 Sep 2012 15:16:28 +0200 Subject: 9P: Fix race in p9_read_work() Race scenario between p9_read_work() and p9_poll_mux() Data arrive, Rworksched is set, p9_read_work() is called. thread A thread B p9_read_work() . reads data . checks if new data ready. No. . gets preempted . More data arrive, p9_poll_mux() is called. . . . p9_poll_mux() . . if (!test_and_set_bit(Rworksched, . &m->wsched)) { . schedule_work(&m->rq); . } . . -> does not schedule work because . Rworksched is set . . clear_bit(Rworksched, &m->wsched); return; No work has been scheduled, and yet data are waiting. Currently p9_read_work() checks if there is data to read, and if not, it clears Rworksched. I think it should clear Rworksched first, and then check if there is data to read. Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/trans_fd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 6449bae15702..de1bbad0c7de 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -316,8 +316,7 @@ static void p9_read_work(struct work_struct *work) m->rsize - m->rpos); p9_debug(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err); if (err == -EAGAIN) { - clear_bit(Rworksched, &m->wsched); - return; + goto end_clear; } if (err <= 0) @@ -379,19 +378,20 @@ static void p9_read_work(struct work_struct *work) m->req = NULL; } +end_clear: + clear_bit(Rworksched, &m->wsched); + if (!list_empty(&m->req_list)) { if (test_and_clear_bit(Rpending, &m->wsched)) n = POLLIN; else n = p9_fd_poll(m->client, NULL); - if (n & POLLIN) { + if ((n & POLLIN) && !test_and_set_bit(Rworksched, &m->wsched)) { p9_debug(P9_DEBUG_TRANS, "sched read work %p\n", m); schedule_work(&m->rq); - } else - clear_bit(Rworksched, &m->wsched); - } else - clear_bit(Rworksched, &m->wsched); + } + } return; error: -- cgit v1.2.3 From 1957b3a86f8eb5ceab32e3aae99e2822258aa530 Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Mon, 17 Sep 2012 15:16:29 +0200 Subject: 9P: fix test at the end of p9_write_work() At the end of p9_write_work() we want to test if there is still data to send. This means: - either the current request still has data to send (wsize != 0) - or there are requests in the unsent queue Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/trans_fd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index de1bbad0c7de..7088a94b2601 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -492,7 +492,7 @@ static void p9_write_work(struct work_struct *work) if (m->wpos == m->wsize) m->wpos = m->wsize = 0; - if (m->wsize == 0 && !list_empty(&m->unsent_req_list)) { + if (m->wsize || !list_empty(&m->unsent_req_list)) { if (test_and_clear_bit(Wpending, &m->wsched)) n = POLLOUT; else -- cgit v1.2.3 From 584a8c13d58423462680907d4cc40d9929c9030a Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Mon, 17 Sep 2012 15:16:30 +0200 Subject: 9P: Fix race in p9_write_work() See previous commit about p9_read_work() for details. This fixes a similar race between p9_write_work() and p9_poll_mux() Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/trans_fd.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 7088a94b2601..b2c308fffb8a 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -476,10 +476,9 @@ static void p9_write_work(struct work_struct *work) clear_bit(Wpending, &m->wsched); err = p9_fd_write(m->client, m->wbuf + m->wpos, m->wsize - m->wpos); p9_debug(P9_DEBUG_TRANS, "mux %p sent %d bytes\n", m, err); - if (err == -EAGAIN) { - clear_bit(Wworksched, &m->wsched); - return; - } + if (err == -EAGAIN) + goto end_clear; + if (err < 0) goto error; @@ -492,19 +491,21 @@ static void p9_write_work(struct work_struct *work) if (m->wpos == m->wsize) m->wpos = m->wsize = 0; +end_clear: + clear_bit(Wworksched, &m->wsched); + if (m->wsize || !list_empty(&m->unsent_req_list)) { if (test_and_clear_bit(Wpending, &m->wsched)) n = POLLOUT; else n = p9_fd_poll(m->client, NULL); - if (n & POLLOUT) { + if ((n & POLLOUT) && + !test_and_set_bit(Wworksched, &m->wsched)) { p9_debug(P9_DEBUG_TRANS, "sched write work %p\n", m); schedule_work(&m->wq); - } else - clear_bit(Wworksched, &m->wsched); - } else - clear_bit(Wworksched, &m->wsched); + } + } return; -- cgit v1.2.3 From a519fc7a70d1a918574bb826cc6905b87b482eb9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Sep 2012 16:49:15 -0400 Subject: SUNRPC: Ensure that the TCP socket is closed when in CLOSE_WAIT Instead of doing a shutdown() call, we need to do an actual close(). Ditto if/when the server is sending us junk RPC headers. Signed-off-by: Trond Myklebust Tested-by: Simon Kirby Cc: stable@vger.kernel.org --- net/sunrpc/xprtsock.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index a35b8e52e551..d1988cf8bf33 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1025,6 +1025,16 @@ static void xs_udp_data_ready(struct sock *sk, int len) read_unlock_bh(&sk->sk_callback_lock); } +/* + * Helper function to force a TCP close if the server is sending + * junk and/or it has put us in CLOSE_WAIT + */ +static void xs_tcp_force_close(struct rpc_xprt *xprt) +{ + set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); + xprt_force_disconnect(xprt); +} + static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -1051,7 +1061,7 @@ static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_rea /* Sanity check of the record length */ if (unlikely(transport->tcp_reclen < 8)) { dprintk("RPC: invalid TCP record fragment length\n"); - xprt_force_disconnect(xprt); + xs_tcp_force_close(xprt); return; } dprintk("RPC: reading TCP record fragment of length %d\n", @@ -1132,7 +1142,7 @@ static inline void xs_tcp_read_calldir(struct sock_xprt *transport, break; default: dprintk("RPC: invalid request message type\n"); - xprt_force_disconnect(&transport->xprt); + xs_tcp_force_close(&transport->xprt); } xs_tcp_check_fraghdr(transport); } @@ -1455,6 +1465,8 @@ static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) static void xs_sock_mark_closed(struct rpc_xprt *xprt) { smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); + clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); smp_mb__after_clear_bit(); @@ -1512,8 +1524,8 @@ static void xs_tcp_state_change(struct sock *sk) break; case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ - xprt_force_disconnect(xprt); xprt->connect_cookie++; + xs_tcp_force_close(xprt); case TCP_CLOSING: /* * If the server closed down the connection, make sure that @@ -2199,8 +2211,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) /* We're probably in TIME_WAIT. Get rid of existing socket, * and retry */ - set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); - xprt_force_disconnect(xprt); + xs_tcp_force_close(xprt); break; case -ECONNREFUSED: case -ECONNRESET: -- cgit v1.2.3 From 84e28a307e376f271505af65a7b7e212dd6f61f4 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Mon, 24 Sep 2012 13:39:01 -0400 Subject: SUNRPC: Set alloc_slot for backchannel tcp ops f39c1bfb5a03e2d255451bff05be0d7255298fa4 (SUNRPC: Fix a UDP transport regression) introduced the "alloc_slot" function for xprt operations, but never created one for the backchannel operations. This patch fixes a null pointer dereference when mounting NFS over v4.1. Call Trace: [] ? xprt_reserve+0x47/0x50 [sunrpc] [] call_reserve+0x34/0x60 [sunrpc] [] __rpc_execute+0x90/0x400 [sunrpc] [] rpc_async_schedule+0x2a/0x40 [sunrpc] [] process_one_work+0x139/0x500 [] ? alloc_worker+0x70/0x70 [] ? __rpc_execute+0x400/0x400 [sunrpc] [] worker_thread+0x15e/0x460 [] ? preempt_schedule+0x49/0x70 [] ? rescuer_thread+0x230/0x230 [] kthread+0x93/0xa0 [] kernel_thread_helper+0x4/0x10 [] ? kthread_freezable_should_stop+0x70/0x70 [] ? gs_change+0x13/0x13 Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index d1988cf8bf33..97f8918169ed 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2539,6 +2539,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { static struct rpc_xprt_ops bc_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, + .alloc_slot = xprt_alloc_slot, .rpcbind = xs_local_rpcbind, .buf_alloc = bc_malloc, .buf_free = bc_free, -- cgit v1.2.3 From 8a9a8b8332b92b13316cf49685b5dc5257cfe115 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Aug 2012 14:32:13 -0400 Subject: SUNRPC: Fix the return value of xdr_align_pages() The callers of xdr_align_pages() expect it to return the number of bytes of actual XDR data remaining in the pages. Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 0afba1b4b656..fbbd1c475b43 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -742,6 +742,8 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) /* Truncate page data and move it into the tail */ if (buf->page_len > len) xdr_shrink_pagelen(buf, buf->page_len - len); + else + len = buf->page_len; xdr->nwords = XDR_QUADLEN(buf->len - cur); return len; } -- cgit v1.2.3 From a11a2bf4de5679fa0b63474c7d39bea2dac7d061 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 2 Aug 2012 13:21:43 -0400 Subject: SUNRPC: Optimise away unnecessary data moves in xdr_align_pages We only have to call xdr_shrink_pagelen() if the remaining RPC message does not fit in the page buffer length that we supplied to xdr_align_pages(). Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index fbbd1c475b43..08f50afd5f2a 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -730,21 +730,24 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) if (xdr->nwords == 0) return 0; - if (nwords > xdr->nwords) { - nwords = xdr->nwords; - len = nwords << 2; - } /* Realign pages to current pointer position */ iov = buf->head; - if (iov->iov_len > cur) + if (iov->iov_len > cur) { xdr_shrink_bufhead(buf, iov->iov_len - cur); + xdr->nwords = XDR_QUADLEN(buf->len - cur); + } - /* Truncate page data and move it into the tail */ - if (buf->page_len > len) - xdr_shrink_pagelen(buf, buf->page_len - len); - else + if (nwords > xdr->nwords) { + nwords = xdr->nwords; + len = nwords << 2; + } + if (buf->page_len <= len) len = buf->page_len; - xdr->nwords = XDR_QUADLEN(buf->len - cur); + else if (nwords < xdr->nwords) { + /* Truncate page data and move it into the tail */ + xdr_shrink_pagelen(buf, buf->page_len - len); + xdr->nwords = XDR_QUADLEN(buf->len - cur); + } return len; } -- cgit v1.2.3 From d19751e7b9bd8a01d00372325439589886674f79 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 11 Sep 2012 17:21:25 -0400 Subject: SUNRPC: Get rid of the redundant xprt->shutdown bit field It is only set after everyone has dereferenced the transport, and serves no useful purpose: setting it is racy, so all the socket code, etc still needs to be able to cope with the cases where they miss reading it. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 +-- net/sunrpc/xprt.c | 8 ++------ net/sunrpc/xprtrdma/transport.c | 22 ++++++++-------------- net/sunrpc/xprtsock.c | 18 ------------------ 4 files changed, 11 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index bf8c49ff7530..951cb9b7d02b 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -173,8 +173,7 @@ struct rpc_xprt { unsigned int min_reqs; /* min number of slots */ atomic_t num_reqs; /* total slots */ unsigned long state; /* transport state */ - unsigned char shutdown : 1, /* being shut down */ - resvport : 1; /* use a reserved port */ + unsigned char resvport : 1; /* use a reserved port */ unsigned int swapper; /* we're swapping over this transport */ unsigned int bind_index; /* bind function index */ diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 5d7f61d7559c..bd462a532acf 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -231,7 +231,7 @@ EXPORT_SYMBOL_GPL(xprt_reserve_xprt); static void xprt_clear_locked(struct rpc_xprt *xprt) { xprt->snd_task = NULL; - if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state) || xprt->shutdown) { + if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { smp_mb__before_clear_bit(); clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); @@ -504,9 +504,6 @@ EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space); */ void xprt_write_space(struct rpc_xprt *xprt) { - if (unlikely(xprt->shutdown)) - return; - spin_lock_bh(&xprt->transport_lock); if (xprt->snd_task) { dprintk("RPC: write space: waking waiting task on " @@ -679,7 +676,7 @@ xprt_init_autodisconnect(unsigned long data) struct rpc_xprt *xprt = (struct rpc_xprt *)data; spin_lock(&xprt->transport_lock); - if (!list_empty(&xprt->recv) || xprt->shutdown) + if (!list_empty(&xprt->recv)) goto out_abort; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; @@ -1262,7 +1259,6 @@ out: static void xprt_destroy(struct rpc_xprt *xprt) { dprintk("RPC: destroying transport %p\n", xprt); - xprt->shutdown = 1; del_timer_sync(&xprt->timer); rpc_destroy_wait_queue(&xprt->binding); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 5d9202dc7cb1..c9aa7a35f3bf 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -199,21 +199,15 @@ xprt_rdma_connect_worker(struct work_struct *work) struct rpc_xprt *xprt = &r_xprt->xprt; int rc = 0; - if (!xprt->shutdown) { - current->flags |= PF_FSTRANS; - xprt_clear_connected(xprt); - - dprintk("RPC: %s: %sconnect\n", __func__, - r_xprt->rx_ep.rep_connected != 0 ? "re" : ""); - rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); - if (rc) - goto out; - } - goto out_clear; + current->flags |= PF_FSTRANS; + xprt_clear_connected(xprt); + + dprintk("RPC: %s: %sconnect\n", __func__, + r_xprt->rx_ep.rep_connected != 0 ? "re" : ""); + rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); + if (rc) + xprt_wake_pending_tasks(xprt, rc); -out: - xprt_wake_pending_tasks(xprt, rc); -out_clear: dprintk("RPC: %s: exit\n", __func__); xprt_clear_connecting(xprt); current->flags &= ~PF_FSTRANS; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 97f8918169ed..aaaadfbe36e9 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -917,9 +917,6 @@ static void xs_local_data_ready(struct sock *sk, int len) if (skb == NULL) goto out; - if (xprt->shutdown) - goto dropit; - repsize = skb->len - sizeof(rpc_fraghdr); if (repsize < 4) { dprintk("RPC: impossible RPC reply size %d\n", repsize); @@ -981,9 +978,6 @@ static void xs_udp_data_ready(struct sock *sk, int len) if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) goto out; - if (xprt->shutdown) - goto dropit; - repsize = skb->len - sizeof(struct udphdr); if (repsize < 4) { dprintk("RPC: impossible RPC reply size %d!\n", repsize); @@ -1412,9 +1406,6 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes) read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) goto out; - if (xprt->shutdown) - goto out; - /* Any data means we had a useful conversation, so * the we don't need to delay the next reconnect */ @@ -1901,9 +1892,6 @@ static void xs_local_setup_socket(struct work_struct *work) struct socket *sock; int status = -EIO; - if (xprt->shutdown) - goto out; - current->flags |= PF_FSTRANS; clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); @@ -2020,9 +2008,6 @@ static void xs_udp_setup_socket(struct work_struct *work) struct socket *sock = transport->sock; int status = -EIO; - if (xprt->shutdown) - goto out; - current->flags |= PF_FSTRANS; /* Start by resetting any existing state */ @@ -2168,9 +2153,6 @@ static void xs_tcp_setup_socket(struct work_struct *work) struct rpc_xprt *xprt = &transport->xprt; int status = -EIO; - if (xprt->shutdown) - goto out; - current->flags |= PF_FSTRANS; if (!sock) { -- cgit v1.2.3 From 9b96ce71974127af0304514d310abe596426c112 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 28 Sep 2012 20:24:16 -0400 Subject: SUNRPC: Limit the rpciod workqueue concurrency We shouldn't need more than 1 worker thread per cpu, since rpciod is designed to run without sleeping in most cases. Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 128494ec9a64..6357fcb00c7e 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -1022,7 +1022,7 @@ static int rpciod_start(void) * Create the rpciod thread and wait for it to start. */ dprintk("RPC: creating workqueue rpciod\n"); - wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0); + wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 1); rpciod_workqueue = wq; return rpciod_workqueue != NULL; } -- cgit v1.2.3 From 290e33593d76d1cebf873da50e036559c4820af9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 17 Aug 2012 09:47:49 -0700 Subject: libceph: remove unused monc->have_fsid This is unused; use monc->client->have_fsid. Signed-off-by: Sage Weil --- include/linux/ceph/mon_client.h | 1 - net/ceph/mon_client.c | 1 - 2 files changed, 2 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 2113e3850a4e..1dc508aeb73d 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -71,7 +71,6 @@ struct ceph_mon_client { int cur_mon; /* last monitor i contacted */ unsigned long sub_sent, sub_renew_after; struct ceph_connection con; - bool have_fsid; /* pending generic requests */ struct rb_root generic_request_tree; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 900ea0f043fc..e98f6070b5ae 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -769,7 +769,6 @@ static int build_initial_monmap(struct ceph_mon_client *monc) monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); } monc->monmap->num_mon = num_mon; - monc->have_fsid = false; return 0; } -- cgit v1.2.3 From 7698f2f5e0d7b7a062213fa970b7c4e121adf38e Mon Sep 17 00:00:00 2001 From: Iulius Curt Date: Thu, 23 Aug 2012 15:14:29 +0300 Subject: libceph: Fix sparse warning Make ceph_monc_do_poolop() static to remove the following sparse warning: * net/ceph/mon_client.c:616:5: warning: symbol 'ceph_monc_do_poolop' was not declared. Should it be static? Also drops the 'ceph_monc_' prefix, now being a private function. Signed-off-by: Iulius Curt Signed-off-by: Sage Weil --- net/ceph/mon_client.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index e98f6070b5ae..812eb3b46c1f 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -637,7 +637,7 @@ bad: /* * Do a synchronous pool op. */ -int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op, +static int do_poolop(struct ceph_mon_client *monc, u32 op, u32 pool, u64 snapid, char *buf, int len) { @@ -687,7 +687,7 @@ out: int ceph_monc_create_snapid(struct ceph_mon_client *monc, u32 pool, u64 *snapid) { - return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, + return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, pool, 0, (char *)snapid, sizeof(*snapid)); } @@ -696,7 +696,7 @@ EXPORT_SYMBOL(ceph_monc_create_snapid); int ceph_monc_delete_snapid(struct ceph_mon_client *monc, u32 pool, u64 snapid) { - return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, + return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, pool, snapid, 0, 0); } -- cgit v1.2.3 From cc4829e5967de577794b25dfcd1a65e509d171ed Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 5 Sep 2012 14:34:32 +0800 Subject: ceph: use list_move_tail instead of list_del/list_add_tail Using list_move_tail() instead of list_del() + list_add_tail(). Signed-off-by: Wei Yongjun Signed-off-by: Sage Weil --- net/ceph/pagelist.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c index 665cd23020ff..92866bebb65f 100644 --- a/net/ceph/pagelist.c +++ b/net/ceph/pagelist.c @@ -1,4 +1,3 @@ - #include #include #include @@ -134,8 +133,8 @@ int ceph_pagelist_truncate(struct ceph_pagelist *pl, ceph_pagelist_unmap_tail(pl); while (pl->head.prev != c->page_lru) { page = list_entry(pl->head.prev, struct page, lru); - list_del(&page->lru); /* remove from pagelist */ - list_add_tail(&page->lru, &pl->free_list); /* add to reserve */ + /* move from pagelist to reserve */ + list_move_tail(&page->lru, &pl->free_list); ++pl->num_pages_free; } pl->room = c->room; -- cgit v1.2.3 From d63b77f4c552cc3a20506871046ab0fcbc332609 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 24 Sep 2012 20:59:48 -0700 Subject: libceph: check for invalid mapping If we encounter an invalid (e.g., zeroed) mapping, return an error and avoid a divide by zero. Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- include/linux/ceph/osd_client.h | 2 +- include/linux/ceph/osdmap.h | 6 +++--- net/ceph/osd_client.c | 32 ++++++++++++++++++++------------ net/ceph/osdmap.c | 18 ++++++++++++++++-- 4 files changed, 40 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index cedfb1a8434a..d9b880e977e6 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -207,7 +207,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); -extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc, +extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, u64 snapid, u64 off, u64 *plen, u64 *bno, diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 311ef8d6aa9e..e88a620b9f8a 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -109,9 +109,9 @@ extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, extern void ceph_osdmap_destroy(struct ceph_osdmap *map); /* calculate mapping of a file extent to an object */ -extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, - u64 off, u64 *plen, - u64 *bno, u64 *oxoff, u64 *oxlen); +extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, + u64 off, u64 *plen, + u64 *bno, u64 *oxoff, u64 *oxlen); /* calculate mapping of object to a placement group */ extern int ceph_calc_object_layout(struct ceph_object_layout *ol, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 42119c05e82c..f7b56e23988c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -52,7 +52,7 @@ static int op_has_extent(int op) op == CEPH_OSD_OP_WRITE); } -void ceph_calc_raw_layout(struct ceph_osd_client *osdc, +int ceph_calc_raw_layout(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, u64 snapid, u64 off, u64 *plen, u64 *bno, @@ -62,12 +62,15 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc, struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; u64 orig_len = *plen; u64 objoff, objlen; /* extent in object */ + int r; reqhead->snapid = cpu_to_le64(snapid); /* object extent? */ - ceph_calc_file_object_mapping(layout, off, plen, bno, - &objoff, &objlen); + r = ceph_calc_file_object_mapping(layout, off, plen, bno, + &objoff, &objlen); + if (r < 0) + return r; if (*plen < orig_len) dout(" skipping last %llu, final file extent %llu~%llu\n", orig_len - *plen, off, *plen); @@ -83,7 +86,7 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc, dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", *bno, objoff, objlen, req->r_num_pages); - + return 0; } EXPORT_SYMBOL(ceph_calc_raw_layout); @@ -112,20 +115,25 @@ EXPORT_SYMBOL(ceph_calc_raw_layout); * * fill osd op in request message. */ -static void calc_layout(struct ceph_osd_client *osdc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - u64 off, u64 *plen, - struct ceph_osd_request *req, - struct ceph_osd_req_op *op) +static int calc_layout(struct ceph_osd_client *osdc, + struct ceph_vino vino, + struct ceph_file_layout *layout, + u64 off, u64 *plen, + struct ceph_osd_request *req, + struct ceph_osd_req_op *op) { u64 bno; + int r; - ceph_calc_raw_layout(osdc, layout, vino.snap, off, - plen, &bno, req, op); + r = ceph_calc_raw_layout(osdc, layout, vino.snap, off, + plen, &bno, req, op); + if (r < 0) + return r; snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); req->r_oid_len = strlen(req->r_oid); + + return r; } /* diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 3124b71a8883..5433fb0eb3c6 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -984,7 +984,7 @@ bad: * for now, we write only a single su, until we can * pass a stride back to the caller. */ -void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, +int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 *plen, u64 *ono, u64 *oxoff, u64 *oxlen) @@ -998,11 +998,17 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, osize, su); + if (su == 0 || sc == 0) + goto invalid; su_per_object = osize / su; + if (su_per_object == 0) + goto invalid; dout("osize %u / su %u = su_per_object %u\n", osize, su, su_per_object); - BUG_ON((su & ~PAGE_MASK) != 0); + if ((su & ~PAGE_MASK) != 0) + goto invalid; + /* bl = *off / su; */ t = off; do_div(t, su); @@ -1030,6 +1036,14 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, *plen = *oxlen; dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); + return 0; + +invalid: + dout(" invalid layout\n"); + *ono = 0; + *oxoff = 0; + *oxlen = 0; + return -EINVAL; } EXPORT_SYMBOL(ceph_calc_file_object_mapping); -- cgit v1.2.3 From 6816282dab3a72efe8c0d182c1bc2960d87f4322 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 24 Sep 2012 21:01:02 -0700 Subject: ceph: propagate layout error on osd request creation If we are creating an osd request and get an invalid layout, return an EINVAL to the caller. We switch up the return to have an error code instead of NULL implying -ENOMEM. Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- fs/ceph/addr.c | 8 ++++---- fs/ceph/file.c | 4 ++-- net/ceph/osd_client.c | 15 +++++++++------ 3 files changed, 15 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 452e71a1b753..4469b63c9b7b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -308,8 +308,8 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) NULL, 0, ci->i_truncate_seq, ci->i_truncate_size, NULL, false, 1, 0); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); /* build page vector */ nr_pages = len >> PAGE_CACHE_SHIFT; @@ -832,8 +832,8 @@ get_more_pages: ci->i_truncate_size, &inode->i_mtime, true, 1, 0); - if (!req) { - rc = -ENOMEM; + if (IS_ERR(req)) { + rc = PTR_ERR(req); unlock_page(page); break; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ecebbc09bfc7..5840d2aaed15 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -536,8 +536,8 @@ more: do_sync, ci->i_truncate_seq, ci->i_truncate_size, &mtime, false, 2, page_align); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); if (file->f_flags & O_DIRECT) { pages = ceph_get_direct_page_vector(data, num_pages, false); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f7b56e23988c..ccbdfbba9e53 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -464,6 +464,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, { struct ceph_osd_req_op ops[3]; struct ceph_osd_request *req; + int r; ops[0].op = opcode; ops[0].extent.truncate_seq = truncate_seq; @@ -482,10 +483,12 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, use_mempool, GFP_NOFS, NULL, NULL); if (!req) - return NULL; + return ERR_PTR(-ENOMEM); /* calculate max write size */ - calc_layout(osdc, vino, layout, off, plen, req, ops); + r = calc_layout(osdc, vino, layout, off, plen, req, ops); + if (r < 0) + return ERR_PTR(r); req->r_file_layout = *layout; /* keep a copy */ /* in case it differs from natural (file) alignment that @@ -1928,8 +1931,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, 0, truncate_seq, truncate_size, NULL, false, 1, page_align); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); /* it may be a short read due to an object boundary */ req->r_pages = pages; @@ -1971,8 +1974,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, snapc, do_sync, truncate_seq, truncate_size, mtime, nofail, 1, page_align); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); /* it may be a short write due to an object boundary */ req->r_pages = pages; -- cgit v1.2.3 From d8af9bc16c9e9506c10581111f897be04486218f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:23:34 -0400 Subject: SUNRPC: Clean up dprintk messages in rpc_pipe.c Clean up: The blank space in front of the message must be spaces. Tabs show up on the console as a graphical character. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpc_pipe.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 21fde99e5c56..80f5dd23417d 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1119,8 +1119,8 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL)) return -ENOMEM; - dprintk("RPC: sending pipefs MOUNT notification for net %p%s\n", net, - NET_NAME(net)); + dprintk("RPC: sending pipefs MOUNT notification for net %p%s\n", + net, NET_NAME(net)); sn->pipefs_sb = sb; err = blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_MOUNT, @@ -1155,8 +1155,8 @@ static void rpc_kill_sb(struct super_block *sb) sn->pipefs_sb = NULL; mutex_unlock(&sn->pipefs_sb_lock); put_net(net); - dprintk("RPC: sending pipefs UMOUNT notification for net %p%s\n", net, - NET_NAME(net)); + dprintk("RPC: sending pipefs UMOUNT notification for net %p%s\n", + net, NET_NAME(net)); blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_UMOUNT, sb); -- cgit v1.2.3 From 632f0d0503accb8ab749a1165af99d344579c37b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:23:43 -0400 Subject: SUNRPC: Use __func__ in dprintk() in auth_gss.c Clean up: Some function names have changed, but debugging messages were never updated. Automate the construction of the function name in debugging messages. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 58 +++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 34c522021004..909dc0c31aab 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -239,7 +239,7 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct } return q; err: - dprintk("RPC: gss_fill_context returning %ld\n", -PTR_ERR(p)); + dprintk("RPC: %s returning %ld\n", __func__, -PTR_ERR(p)); return p; } @@ -301,10 +301,10 @@ __gss_find_upcall(struct rpc_pipe *pipe, uid_t uid) if (pos->uid != uid) continue; atomic_inc(&pos->count); - dprintk("RPC: gss_find_upcall found msg %p\n", pos); + dprintk("RPC: %s found msg %p\n", __func__, pos); return pos; } - dprintk("RPC: gss_find_upcall found nothing\n"); + dprintk("RPC: %s found nothing\n", __func__); return NULL; } @@ -507,8 +507,8 @@ gss_refresh_upcall(struct rpc_task *task) struct rpc_pipe *pipe; int err = 0; - dprintk("RPC: %5u gss_refresh_upcall for uid %u\n", task->tk_pid, - cred->cr_uid); + dprintk("RPC: %5u %s for uid %u\n", + task->tk_pid, __func__, cred->cr_uid); gss_msg = gss_setup_upcall(task->tk_client, gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { /* XXX: warning on the first, under the assumption we @@ -539,8 +539,8 @@ gss_refresh_upcall(struct rpc_task *task) spin_unlock(&pipe->lock); gss_release_msg(gss_msg); out: - dprintk("RPC: %5u gss_refresh_upcall for uid %u result %d\n", - task->tk_pid, cred->cr_uid, err); + dprintk("RPC: %5u %s for uid %u result %d\n", + task->tk_pid, __func__, cred->cr_uid, err); return err; } @@ -553,7 +553,7 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) DEFINE_WAIT(wait); int err = 0; - dprintk("RPC: gss_upcall for uid %u\n", cred->cr_uid); + dprintk("RPC: %s for uid %u\n", __func__, cred->cr_uid); retry: gss_msg = gss_setup_upcall(gss_auth->client, gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { @@ -594,8 +594,8 @@ out_intr: finish_wait(&gss_msg->waitqueue, &wait); gss_release_msg(gss_msg); out: - dprintk("RPC: gss_create_upcall for uid %u result %d\n", - cred->cr_uid, err); + dprintk("RPC: %s for uid %u result %d\n", + __func__, cred->cr_uid, err); return err; } @@ -681,7 +681,7 @@ err_put_ctx: err: kfree(buf); out: - dprintk("RPC: gss_pipe_downcall returning %Zd\n", err); + dprintk("RPC: %s returning %Zd\n", __func__, err); return err; } @@ -747,8 +747,8 @@ gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); if (msg->errno < 0) { - dprintk("RPC: gss_pipe_destroy_msg releasing msg %p\n", - gss_msg); + dprintk("RPC: %s releasing msg %p\n", + __func__, gss_msg); atomic_inc(&gss_msg->count); gss_unhash_msg(gss_msg); if (msg->errno == -ETIMEDOUT) @@ -976,7 +976,7 @@ gss_destroying_context(struct rpc_cred *cred) static void gss_do_free_ctx(struct gss_cl_ctx *ctx) { - dprintk("RPC: gss_free_ctx\n"); + dprintk("RPC: %s\n", __func__); gss_delete_sec_context(&ctx->gc_gss_ctx); kfree(ctx->gc_wire_ctx.data); @@ -999,7 +999,7 @@ gss_free_ctx(struct gss_cl_ctx *ctx) static void gss_free_cred(struct gss_cred *gss_cred) { - dprintk("RPC: gss_free_cred %p\n", gss_cred); + dprintk("RPC: %s cred=%p\n", __func__, gss_cred); kfree(gss_cred); } @@ -1049,8 +1049,8 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) struct gss_cred *cred = NULL; int err = -ENOMEM; - dprintk("RPC: gss_create_cred for uid %d, flavor %d\n", - acred->uid, auth->au_flavor); + dprintk("RPC: %s for uid %d, flavor %d\n", + __func__, acred->uid, auth->au_flavor); if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS))) goto out_err; @@ -1069,7 +1069,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) return &cred->gc_base; out_err: - dprintk("RPC: gss_create_cred failed with error %d\n", err); + dprintk("RPC: %s failed with error %d\n", __func__, err); return ERR_PTR(err); } @@ -1127,7 +1127,7 @@ gss_marshal(struct rpc_task *task, __be32 *p) struct kvec iov; struct xdr_buf verf_buf; - dprintk("RPC: %5u gss_marshal\n", task->tk_pid); + dprintk("RPC: %5u %s\n", task->tk_pid, __func__); *p++ = htonl(RPC_AUTH_GSS); cred_len = p++; @@ -1253,7 +1253,7 @@ gss_validate(struct rpc_task *task, __be32 *p) u32 flav,len; u32 maj_stat; - dprintk("RPC: %5u gss_validate\n", task->tk_pid); + dprintk("RPC: %5u %s\n", task->tk_pid, __func__); flav = ntohl(*p++); if ((len = ntohl(*p++)) > RPC_MAX_AUTH_SIZE) @@ -1271,20 +1271,20 @@ gss_validate(struct rpc_task *task, __be32 *p) if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat) { - dprintk("RPC: %5u gss_validate: gss_verify_mic returned " - "error 0x%08x\n", task->tk_pid, maj_stat); + dprintk("RPC: %5u %s: gss_verify_mic returned error 0x%08x\n", + task->tk_pid, __func__, maj_stat); goto out_bad; } /* We leave it to unwrap to calculate au_rslack. For now we just * calculate the length of the verifier: */ cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2; gss_put_ctx(ctx); - dprintk("RPC: %5u gss_validate: gss_verify_mic succeeded.\n", - task->tk_pid); + dprintk("RPC: %5u %s: gss_verify_mic succeeded.\n", + task->tk_pid, __func__); return p + XDR_QUADLEN(len); out_bad: gss_put_ctx(ctx); - dprintk("RPC: %5u gss_validate failed.\n", task->tk_pid); + dprintk("RPC: %5u %s failed.\n", task->tk_pid, __func__); return NULL; } @@ -1466,7 +1466,7 @@ gss_wrap_req(struct rpc_task *task, struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); int status = -EIO; - dprintk("RPC: %5u gss_wrap_req\n", task->tk_pid); + dprintk("RPC: %5u %s\n", task->tk_pid, __func__); if (ctx->gc_proc != RPC_GSS_PROC_DATA) { /* The spec seems a little ambiguous here, but I think that not * wrapping context destruction requests makes the most sense. @@ -1489,7 +1489,7 @@ gss_wrap_req(struct rpc_task *task, } out: gss_put_ctx(ctx); - dprintk("RPC: %5u gss_wrap_req returning %d\n", task->tk_pid, status); + dprintk("RPC: %5u %s returning %d\n", task->tk_pid, __func__, status); return status; } @@ -1604,8 +1604,8 @@ out_decode: status = gss_unwrap_req_decode(decode, rqstp, p, obj); out: gss_put_ctx(ctx); - dprintk("RPC: %5u gss_unwrap_resp returning %d\n", task->tk_pid, - status); + dprintk("RPC: %5u %s returning %d\n", + task->tk_pid, __func__, status); return status; } -- cgit v1.2.3 From 1b63a75180c6c65c71655c250a4e6b578ba7d1c0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:23:52 -0400 Subject: SUNRPC: Refactor rpc_clone_client() rpc_clone_client() does most of the same tasks as rpc_new_client(), so there is an opportunity for code re-use. Create a generic helper that makes it easy to clone an RPC client while replacing any of the clnt's parameters. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 83 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 43 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index fa48c60aef23..afbeefab6600 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -490,59 +490,62 @@ EXPORT_SYMBOL_GPL(rpc_create); * same transport while varying parameters such as the authentication * flavour. */ -struct rpc_clnt * -rpc_clone_client(struct rpc_clnt *clnt) +static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, + struct rpc_clnt *clnt) { - struct rpc_clnt *new; struct rpc_xprt *xprt; - int err = -ENOMEM; + struct rpc_clnt *new; + int err; - new = kmemdup(clnt, sizeof(*new), GFP_KERNEL); - if (!new) - goto out_no_clnt; - new->cl_parent = clnt; - /* Turn off autobind on clones */ - new->cl_autobind = 0; - INIT_LIST_HEAD(&new->cl_tasks); - spin_lock_init(&new->cl_lock); - rpc_init_rtt(&new->cl_rtt_default, clnt->cl_timeout->to_initval); - new->cl_metrics = rpc_alloc_iostats(clnt); - if (new->cl_metrics == NULL) - goto out_no_stats; - if (clnt->cl_principal) { - new->cl_principal = kstrdup(clnt->cl_principal, GFP_KERNEL); - if (new->cl_principal == NULL) - goto out_no_principal; - } + err = -ENOMEM; rcu_read_lock(); xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); rcu_read_unlock(); if (xprt == NULL) - goto out_no_transport; - rcu_assign_pointer(new->cl_xprt, xprt); - atomic_set(&new->cl_count, 1); - err = rpc_setup_pipedir(new, clnt->cl_program->pipe_dir_name); - if (err != 0) - goto out_no_path; - rpc_clnt_set_nodename(new, utsname()->nodename); - if (new->cl_auth) - atomic_inc(&new->cl_auth->au_count); + goto out_err; + args->servername = xprt->servername; + + new = rpc_new_client(args, xprt); + if (IS_ERR(new)) { + err = PTR_ERR(new); + goto out_put; + } + atomic_inc(&clnt->cl_count); - rpc_register_client(new); - rpciod_up(); + new->cl_parent = clnt; + + /* Turn off autobind on clones */ + new->cl_autobind = 0; + new->cl_softrtry = clnt->cl_softrtry; + new->cl_discrtry = clnt->cl_discrtry; + new->cl_chatty = clnt->cl_chatty; return new; -out_no_path: + +out_put: xprt_put(xprt); -out_no_transport: - kfree(new->cl_principal); -out_no_principal: - rpc_free_iostats(new->cl_metrics); -out_no_stats: - kfree(new); -out_no_clnt: +out_err: dprintk("RPC: %s: returned error %d\n", __func__, err); return ERR_PTR(err); } + +/** + * rpc_clone_client - Clone an RPC client structure + * + * @clnt: RPC client whose parameters are copied + * + * Returns a fresh RPC client or an ERR_PTR. + */ +struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt) +{ + struct rpc_create_args args = { + .program = clnt->cl_program, + .prognumber = clnt->cl_prog, + .version = clnt->cl_vers, + .authflavor = clnt->cl_auth->au_flavor, + .client_name = clnt->cl_principal, + }; + return __rpc_clone_client(&args, clnt); +} EXPORT_SYMBOL_GPL(rpc_clone_client); /* -- cgit v1.2.3 From ba9b584c1dc37851d9c6ca6d0d2ccba55d9aad04 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:24:02 -0400 Subject: SUNRPC: Introduce rpc_clone_client_set_auth() An ULP is supposed to be able to replace a GSS rpc_auth object with another GSS rpc_auth object using rpcauth_create(). However, rpcauth_create() in 3.5 reliably fails with -EEXIST in this case. This is because when gss_create() attempts to create the upcall pipes, sometimes they are already there. For example if a pipe FS mount event occurs, or a previous GSS flavor was in use for this rpc_clnt. It turns out that's not the only problem here. While working on a fix for the above problem, we noticed that replacing an rpc_clnt's rpc_auth is not safe, since dereferencing the cl_auth field is not protected in any way. So we're deprecating the ability of rpcauth_create() to switch an rpc_clnt's security flavor during normal operation. Instead, let's add a fresh API that clones an rpc_clnt and gives the clone a new flavor before it's used. This makes immediate use of the new __rpc_clone_client() helper. This can be used in a similar fashion to rpcauth_create() when a client is hunting for the correct security flavor. Instead of replacing an rpc_clnt's security flavor in a loop, the ULP replaces the whole rpc_clnt. To fix the -EEXIST problem, any ULP logic that relies on replacing an rpc_clnt's rpc_auth with rpcauth_create() must be changed to use this API instead. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 13 ++----------- fs/nfs/nfs4namespace.c | 14 +------------- include/linux/sunrpc/clnt.h | 2 ++ net/sunrpc/clnt.c | 22 ++++++++++++++++++++++ 4 files changed, 27 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 99694442b93f..143149db3440 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -668,7 +668,8 @@ int nfs_init_server_rpcclient(struct nfs_server *server, { struct nfs_client *clp = server->nfs_client; - server->client = rpc_clone_client(clp->cl_rpcclient); + server->client = rpc_clone_client_set_auth(clp->cl_rpcclient, + pseudoflavour); if (IS_ERR(server->client)) { dprintk("%s: couldn't create rpc_client!\n", __func__); return PTR_ERR(server->client); @@ -678,16 +679,6 @@ int nfs_init_server_rpcclient(struct nfs_server *server, timeo, sizeof(server->client->cl_timeout_default)); server->client->cl_timeout = &server->client->cl_timeout_default; - - if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) { - struct rpc_auth *auth; - - auth = rpcauth_create(pseudoflavour, server->client); - if (IS_ERR(auth)) { - dprintk("%s: couldn't create credcache!\n", __func__); - return PTR_ERR(auth); - } - } server->client->cl_softrtry = 0; if (server->flags & NFS_MOUNT_SOFT) server->client->cl_softrtry = 1; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 4fdeb1b7042e..79fbb61ce202 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -192,25 +192,13 @@ out: struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *inode, struct qstr *name) { - struct rpc_clnt *clone; - struct rpc_auth *auth; rpc_authflavor_t flavor; flavor = nfs4_negotiate_security(inode, name); if ((int)flavor < 0) return ERR_PTR((int)flavor); - clone = rpc_clone_client(clnt); - if (IS_ERR(clone)) - return clone; - - auth = rpcauth_create(flavor, clone); - if (IS_ERR(auth)) { - rpc_shutdown_client(clone); - clone = ERR_PTR(-EIO); - } - - return clone; + return rpc_clone_client_set_auth(clnt, flavor); } static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 523547ecfee2..34206b84d8da 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -130,6 +130,8 @@ struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, const struct rpc_program *, u32); void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt); struct rpc_clnt *rpc_clone_client(struct rpc_clnt *); +struct rpc_clnt *rpc_clone_client_set_auth(struct rpc_clnt *, + rpc_authflavor_t); void rpc_shutdown_client(struct rpc_clnt *); void rpc_release_client(struct rpc_clnt *); void rpc_task_release_client(struct rpc_task *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index afbeefab6600..cdc7564b4512 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -548,6 +548,28 @@ struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_clone_client); +/** + * rpc_clone_client_set_auth - Clone an RPC client structure and set its auth + * + * @clnt: RPC client whose parameters are copied + * @auth: security flavor for new client + * + * Returns a fresh RPC client or an ERR_PTR. + */ +struct rpc_clnt * +rpc_clone_client_set_auth(struct rpc_clnt *clnt, rpc_authflavor_t flavor) +{ + struct rpc_create_args args = { + .program = clnt->cl_program, + .prognumber = clnt->cl_prog, + .version = clnt->cl_vers, + .authflavor = flavor, + .client_name = clnt->cl_principal, + }; + return __rpc_clone_client(&args, clnt); +} +EXPORT_SYMBOL_GPL(rpc_clone_client_set_auth); + /* * Kill all tasks for the given client. * XXX: kill their descendants as well? -- cgit v1.2.3 From 62b54dd91567686a1cb118f76a72d5f4764a86dd Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 1 Oct 2012 23:19:14 +0000 Subject: ipv6: don't add link local route when there is no link local address When an address is added on loopback (ip -6 a a 2002::1/128 dev lo), a route to fe80::/64 is added in the main table: unreachable fe80::/64 dev lo proto kernel metric 256 error -101 This route does not match any prefix (no fe80:: address on lo). In fact, addrconf_dev_config() will not add link local address because this function filters interfaces by type. If the link local address is added manually, the route to the link local prefix will be automatically added by addrconf_add_linklocal(). Note also, that this route is not deleted when the address is removed. After looking at the code, it seems that addrconf_add_lroute() is redundant with addrconf_add_linklocal(), because this function will add the link local route when the link local address is configured. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 480e68422efb..d7c56f8a5b4e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1769,14 +1769,6 @@ static void sit_route_add(struct net_device *dev) } #endif -static void addrconf_add_lroute(struct net_device *dev) -{ - struct in6_addr addr; - - ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - addrconf_prefix_route(&addr, 64, dev, 0, 0); -} - static struct inet6_dev *addrconf_add_dev(struct net_device *dev) { struct inet6_dev *idev; @@ -1794,8 +1786,6 @@ static struct inet6_dev *addrconf_add_dev(struct net_device *dev) if (!(dev->flags & IFF_LOOPBACK)) addrconf_add_mroute(dev); - /* Add link local route */ - addrconf_add_lroute(dev); return idev; } @@ -2474,10 +2464,9 @@ static void addrconf_sit_config(struct net_device *dev) sit_add_v4_addrs(idev); - if (dev->flags&IFF_POINTOPOINT) { + if (dev->flags&IFF_POINTOPOINT) addrconf_add_mroute(dev); - addrconf_add_lroute(dev); - } else + else sit_route_add(dev); } #endif -- cgit v1.2.3 From 5316cf9a5197eb80b2800e1acadde287924ca975 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 2 Oct 2012 06:14:17 +0000 Subject: 8021q: fix mac_len recomputation in vlan_untag() skb_reset_mac_len() relies on the value of the skb->network_header pointer, therefore we must wait for such pointer to be recalculated before computing the new mac_len value. Signed-off-by: Antonio Quartulli Signed-off-by: David S. Miller --- net/8021q/vlan_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index b258da88f675..add69d0fd99d 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -105,7 +105,6 @@ static struct sk_buff *vlan_reorder_header(struct sk_buff *skb) return NULL; memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); skb->mac_header += VLAN_HLEN; - skb_reset_mac_len(skb); return skb; } @@ -139,6 +138,8 @@ struct sk_buff *vlan_untag(struct sk_buff *skb) skb_reset_network_header(skb); skb_reset_transport_header(skb); + skb_reset_mac_len(skb); + return skb; err_free: -- cgit v1.2.3 From f4ef85bbda96324785097356336bc79cdd37db0a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Oct 2012 01:25:26 +0000 Subject: ipv4: add a fib_type to fib_info commit d2d68ba9fe8 (ipv4: Cache input routes in fib_info nexthops.) introduced a regression for forwarding. This was hard to reproduce but the symptom was that packets were delivered to local host instead of being forwarded. David suggested to add fib_type to fib_info so that we dont inadvertently share same fib_info for different purposes. With help from Julian Anastasov who provided very helpful hints, reproduced here : Can it be a problem related to fib_info reuse from different routes. For example, when local IP address is created for subnet we have: broadcast 192.168.0.255 dev DEV proto kernel scope link src 192.168.0.1 192.168.0.0/24 dev DEV proto kernel scope link src 192.168.0.1 local 192.168.0.1 dev DEV proto kernel scope host src 192.168.0.1 The "dev DEV proto kernel scope link src 192.168.0.1" is a reused fib_info structure where we put cached routes. The result can be same fib_info for 192.168.0.255 and 192.168.0.0/24. RTN_BROADCAST is cached only for input routes. Incoming broadcast to 192.168.0.255 can be cached and can cause problems for traffic forwarded to 192.168.0.0/24. So, this patch should solve the problem because it separates the broadcast from unicast traffic. And the ip_route_input_slow caching will work for local and broadcast input routes (above routes 1 and 3) just because they differ in scope and use different fib_info. Many thanks to Chris Clayton for his patience and help. Reported-by: Chris Clayton Bisected-by: Chris Clayton Reported-by: Dave Jones Signed-off-by: Eric Dumazet Cc: Julian Anastasov Tested-by: Chris Clayton Signed-off-by: David S. Miller --- include/net/ip_fib.h | 1 + net/ipv4/fib_semantics.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 926142ed8d7a..9497be1ad4c0 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -102,6 +102,7 @@ struct fib_info { unsigned char fib_dead; unsigned char fib_protocol; unsigned char fib_scope; + unsigned char fib_type; __be32 fib_prefsrc; u32 fib_priority; u32 *fib_metrics; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 3509065e409a..267753060ffc 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -314,6 +314,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) nfi->fib_scope == fi->fib_scope && nfi->fib_prefsrc == fi->fib_prefsrc && nfi->fib_priority == fi->fib_priority && + nfi->fib_type == fi->fib_type && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && @@ -833,6 +834,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_flags = cfg->fc_flags; fi->fib_priority = cfg->fc_priority; fi->fib_prefsrc = cfg->fc_prefsrc; + fi->fib_type = cfg->fc_type; fi->fib_nhs = nhs; change_nexthops(fi) { -- cgit v1.2.3 From 575659936f9d392b93b03ce97a58dbd4fce18abd Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 3 Oct 2012 05:43:21 +0000 Subject: sctp: fix a typo in prototype of __sctp_rcv_lookup() Just to avoid confusion when people only reads this prototype. Signed-off-by: Nicolas Dichtel Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 25dfe7380479..8bd3c279427e 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -68,8 +68,8 @@ static int sctp_rcv_ootb(struct sk_buff *); static struct sctp_association *__sctp_rcv_lookup(struct net *net, struct sk_buff *skb, - const union sctp_addr *laddr, const union sctp_addr *paddr, + const union sctp_addr *laddr, struct sctp_transport **transportp); static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net, const union sctp_addr *laddr); -- cgit v1.2.3 From edfee0339e681a784ebacec7e8c2dc97dc6d2839 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 3 Oct 2012 05:43:22 +0000 Subject: sctp: check src addr when processing SACK to update transport state Suppose we have an SCTP connection with two paths. After connection is established, path1 is not available, thus this path is marked as inactive. Then traffic goes through path2, but for some reasons packets are delayed (after rto.max). Because packets are delayed, the retransmit mechanism will switch again to path1. At this time, we receive a delayed SACK from path2. When we update the state of the path in sctp_check_transmitted(), we do not take into account the source address of the SACK, hence we update the wrong path. Signed-off-by: Nicolas Dichtel Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 2 +- net/sctp/outqueue.c | 15 ++++++++++----- net/sctp/sm_sideeffect.c | 4 ++-- net/sctp/sm_statefuns.c | 2 +- 4 files changed, 14 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 0fef00f5d3ce..64158aa1bb5f 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1068,7 +1068,7 @@ void sctp_outq_init(struct sctp_association *, struct sctp_outq *); void sctp_outq_teardown(struct sctp_outq *); void sctp_outq_free(struct sctp_outq*); int sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk); -int sctp_outq_sack(struct sctp_outq *, struct sctp_sackhdr *); +int sctp_outq_sack(struct sctp_outq *, struct sctp_chunk *); int sctp_outq_is_empty(const struct sctp_outq *); void sctp_outq_restart(struct sctp_outq *); diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index d16632e1503a..1b4a7f8ec3fd 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -63,6 +63,7 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn); static void sctp_check_transmitted(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, + union sctp_addr *saddr, struct sctp_sackhdr *sack, __u32 *highest_new_tsn); @@ -1139,9 +1140,10 @@ static void sctp_sack_update_unack_data(struct sctp_association *assoc, * Process the SACK against the outqueue. Mostly, this just frees * things off the transmitted queue. */ -int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack) +int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) { struct sctp_association *asoc = q->asoc; + struct sctp_sackhdr *sack = chunk->subh.sack_hdr; struct sctp_transport *transport; struct sctp_chunk *tchunk = NULL; struct list_head *lchunk, *transport_list, *temp; @@ -1210,7 +1212,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack) /* Run through the retransmit queue. Credit bytes received * and free those chunks that we can. */ - sctp_check_transmitted(q, &q->retransmit, NULL, sack, &highest_new_tsn); + sctp_check_transmitted(q, &q->retransmit, NULL, NULL, sack, &highest_new_tsn); /* Run through the transmitted queue. * Credit bytes received and free those chunks which we can. @@ -1219,7 +1221,8 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack) */ list_for_each_entry(transport, transport_list, transports) { sctp_check_transmitted(q, &transport->transmitted, - transport, sack, &highest_new_tsn); + transport, &chunk->source, sack, + &highest_new_tsn); /* * SFR-CACC algorithm: * C) Let count_of_newacks be the number of @@ -1326,6 +1329,7 @@ int sctp_outq_is_empty(const struct sctp_outq *q) static void sctp_check_transmitted(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, + union sctp_addr *saddr, struct sctp_sackhdr *sack, __u32 *highest_new_tsn_in_sack) { @@ -1633,8 +1637,9 @@ static void sctp_check_transmitted(struct sctp_outq *q, /* Mark the destination transport address as * active if it is not so marked. */ - if ((transport->state == SCTP_INACTIVE) || - (transport->state == SCTP_UNCONFIRMED)) { + if ((transport->state == SCTP_INACTIVE || + transport->state == SCTP_UNCONFIRMED) && + sctp_cmp_addr_exact(&transport->ipaddr, saddr)) { sctp_assoc_control_transport( transport->asoc, transport, diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index bcfebb91559d..57f7de839b03 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -752,11 +752,11 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, /* Helper function to process the process SACK command. */ static int sctp_cmd_process_sack(sctp_cmd_seq_t *cmds, struct sctp_association *asoc, - struct sctp_sackhdr *sackh) + struct sctp_chunk *chunk) { int err = 0; - if (sctp_outq_sack(&asoc->outqueue, sackh)) { + if (sctp_outq_sack(&asoc->outqueue, chunk)) { struct net *net = sock_net(asoc->base.sk); /* There are no more TSNs awaiting SACK. */ diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 094813b6c3c3..b6adef8a1e93 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3179,7 +3179,7 @@ sctp_disposition_t sctp_sf_eat_sack_6_2(struct net *net, return sctp_sf_violation_ctsn(net, ep, asoc, type, arg, commands); /* Return this SACK for further processing. */ - sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, SCTP_SACKH(sackh)); + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, SCTP_CHUNK(chunk)); /* Note: We do the rest of the work on the PROCESS_SACK * sideeffect. -- cgit v1.2.3 From 096895818cbf3382dc318f369e642fb7460ccb26 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 4 Oct 2012 03:36:13 +0000 Subject: silence some noisy printks in irda Fuzzing causes these printks to spew constantly. Changing them to DEBUG statements is consistent with other usage in the file, and makes them disappear when CONFIG_IRDA_DEBUG is disabled. Signed-off-by: Dave Jones Signed-off-by: David S. Miller --- net/irda/af_irda.c | 2 +- net/irda/irttp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index bb738c9f9146..b833677d83d6 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -468,7 +468,7 @@ static int irda_open_tsap(struct irda_sock *self, __u8 tsap_sel, char *name) notify_t notify; if (self->tsap) { - IRDA_WARNING("%s: busy!\n", __func__); + IRDA_DEBUG(0, "%s: busy!\n", __func__); return -EBUSY; } diff --git a/net/irda/irttp.c b/net/irda/irttp.c index 5c93f2952b08..1002e3396f72 100644 --- a/net/irda/irttp.c +++ b/net/irda/irttp.c @@ -440,7 +440,7 @@ struct tsap_cb *irttp_open_tsap(__u8 stsap_sel, int credit, notify_t *notify) */ lsap = irlmp_open_lsap(stsap_sel, &ttp_notify, 0); if (lsap == NULL) { - IRDA_WARNING("%s: unable to allocate LSAP!!\n", __func__); + IRDA_DEBUG(0, "%s: unable to allocate LSAP!!\n", __func__); return NULL; } -- cgit v1.2.3 From e57edf6b6dba975eceede20b4b13699d4e88cd78 Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Thu, 4 Oct 2012 05:00:43 +0000 Subject: tipc: prevent dropped connections due to rcvbuf overflow When large buffers are sent over connected TIPC sockets, it is likely that the sk_backlog will be filled up on the receiver side, but the TIPC flow control mechanism is happily unaware of this since that is based on message count. The sender will receive a TIPC_ERR_OVERLOAD message when this occurs and drop it's side of the connection, leaving it stale on the receiver end. By increasing the sk_rcvbuf to a 'worst case' value, we avoid the overload caused by a full backlog queue and the flow control will work properly. This worst case value is the max TIPC message size times the flow control window, multiplied by two because a sender will transmit up to double the window size before a port is marked congested. We multiply this by 2 to account for the sk_buff and other overheads. Signed-off-by: Erik Hugne Signed-off-by: David S. Miller --- net/tipc/socket.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 09dc5b97e079..fd5f042dbff4 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -220,6 +220,7 @@ static int tipc_create(struct net *net, struct socket *sock, int protocol, sock_init_data(sock, sk); sk->sk_backlog_rcv = backlog_rcv; + sk->sk_rcvbuf = TIPC_FLOW_CONTROL_WIN * 2 * TIPC_MAX_USER_MSG_SIZE * 2; tipc_sk(sk)->p = tp_ptr; tipc_sk(sk)->conn_timeout = CONN_TIMEOUT_DEFAULT; -- cgit v1.2.3 From 32418cfe495c95013be2e805c087db89dcefac6d Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 4 Oct 2012 09:51:11 +0000 Subject: Remove noisy printks from llcp_sock_connect Validation of userspace input shouldn't trigger dmesg spamming. Signed-off-by: Dave Jones Signed-off-by: David S. Miller --- net/nfc/llcp/sock.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/nfc/llcp/sock.c b/net/nfc/llcp/sock.c index 40f056debf9a..63e4cdc92376 100644 --- a/net/nfc/llcp/sock.c +++ b/net/nfc/llcp/sock.c @@ -497,15 +497,11 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, pr_debug("sock %p sk %p flags 0x%x\n", sock, sk, flags); if (!addr || len < sizeof(struct sockaddr_nfc) || - addr->sa_family != AF_NFC) { - pr_err("Invalid socket\n"); + addr->sa_family != AF_NFC) return -EINVAL; - } - if (addr->service_name_len == 0 && addr->dsap == 0) { - pr_err("Missing service name or dsap\n"); + if (addr->service_name_len == 0 && addr->dsap == 0) return -EINVAL; - } pr_debug("addr dev_idx=%u target_idx=%u protocol=%u\n", addr->dev_idx, addr->target_idx, addr->nfc_protocol); -- cgit v1.2.3 From 6825a26c2dc21eb4f8df9c06d3786ddec97cf53b Mon Sep 17 00:00:00 2001 From: Gao feng Date: Wed, 19 Sep 2012 19:25:34 +0000 Subject: ipv6: release reference of ip6_null_entry's dst entry in __ip6_del_rt as we hold dst_entry before we call __ip6_del_rt, so we should alse call dst_release not only return -ENOENT when the rt6_info is ip6_null_entry. and we already hold the dst entry, so I think it's safe to call dst_release out of the write-read lock. Signed-off-by: Gao feng Signed-off-by: David S. Miller --- net/ipv6/route.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d1ddbc6ddac5..7c7e963260e1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1593,17 +1593,18 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) struct fib6_table *table; struct net *net = dev_net(rt->dst.dev); - if (rt == net->ipv6.ip6_null_entry) - return -ENOENT; + if (rt == net->ipv6.ip6_null_entry) { + err = -ENOENT; + goto out; + } table = rt->rt6i_table; write_lock_bh(&table->tb6_lock); - err = fib6_del(rt, info); - dst_release(&rt->dst); - write_unlock_bh(&table->tb6_lock); +out: + dst_release(&rt->dst); return err; } -- cgit v1.2.3 From 6299b669b1340b9f7de2bc2bd565921a1494e7f7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 4 Oct 2012 17:12:08 -0700 Subject: sections: fix section conflicts in net/can Signed-off-by: Andi Kleen Cc: Oliver Hartkopp Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/can/af_can.c | 2 +- net/can/bcm.c | 2 +- net/can/gw.c | 2 +- net/can/raw.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/can/af_can.c b/net/can/af_can.c index 821022a7214f..ddac1ee2ed20 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -63,7 +63,7 @@ #include "af_can.h" -static __initdata const char banner[] = KERN_INFO +static __initconst const char banner[] = KERN_INFO "can: controller area network core (" CAN_VERSION_STRING ")\n"; MODULE_DESCRIPTION("Controller Area Network PF_CAN core"); diff --git a/net/can/bcm.c b/net/can/bcm.c index 151b7730c12c..6f747582718e 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -77,7 +77,7 @@ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) #define CAN_BCM_VERSION CAN_VERSION -static __initdata const char banner[] = KERN_INFO +static __initconst const char banner[] = KERN_INFO "can: broadcast manager protocol (rev " CAN_BCM_VERSION " t)\n"; MODULE_DESCRIPTION("PF_CAN broadcast manager protocol"); diff --git a/net/can/gw.c b/net/can/gw.c index 127879c55fb6..1f5c9785a262 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -58,7 +58,7 @@ #include #define CAN_GW_VERSION "20101209" -static __initdata const char banner[] = +static __initconst const char banner[] = KERN_INFO "can: netlink gateway (rev " CAN_GW_VERSION ")\n"; MODULE_DESCRIPTION("PF_CAN netlink gateway"); diff --git a/net/can/raw.c b/net/can/raw.c index 3e9c89356a93..5b0e3e330d97 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -55,7 +55,7 @@ #include #define CAN_RAW_VERSION CAN_VERSION -static __initdata const char banner[] = +static __initconst const char banner[] = KERN_INFO "can: raw protocol (rev " CAN_RAW_VERSION ")\n"; MODULE_DESCRIPTION("PF_CAN raw protocol"); -- cgit v1.2.3 From 04a6f82cf01aeef9fb058b2fca0ef1fe0a09c2fa Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 4 Oct 2012 17:12:11 -0700 Subject: sections: fix section conflicts in net Signed-off-by: Andi Kleen Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/net/net_namespace.h | 2 ++ net/decnet/dn_rules.c | 2 +- net/ipv4/fib_rules.c | 2 +- net/ipv4/ipmr.c | 2 +- net/ipv6/addrlabel.c | 2 +- net/ipv6/fib6_rules.c | 2 +- net/ipv6/ip6mr.c | 2 +- 7 files changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 4faf6612ecac..95e646641184 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -257,10 +257,12 @@ static inline struct net *read_pnet(struct net * const *pnet) #define __net_init #define __net_exit #define __net_initdata +#define __net_initconst #else #define __net_init __init #define __net_exit __exit_refok #define __net_initdata __initdata +#define __net_initconst __initconst #endif struct pernet_operations { diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c index e65f2c856e06..faf7cc3483fe 100644 --- a/net/decnet/dn_rules.c +++ b/net/decnet/dn_rules.c @@ -220,7 +220,7 @@ static void dn_fib_rule_flush_cache(struct fib_rules_ops *ops) dn_rt_cache_flush(-1); } -static const struct fib_rules_ops __net_initdata dn_fib_rules_ops_template = { +static const struct fib_rules_ops __net_initconst dn_fib_rules_ops_template = { .family = AF_DECnet, .rule_size = sizeof(struct dn_fib_rule), .addr_size = sizeof(u16), diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 274309d3aded..26aa65d1fce4 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -262,7 +262,7 @@ static void fib4_rule_flush_cache(struct fib_rules_ops *ops) rt_cache_flush(ops->fro_net); } -static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = { +static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { .family = AF_INET, .rule_size = sizeof(struct fib4_rule), .addr_size = sizeof(u32), diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 1daa95c2a0ba..6168c4dc58b1 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -221,7 +221,7 @@ static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, return 0; } -static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = { +static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = { .family = RTNL_FAMILY_IPMR, .rule_size = sizeof(struct ipmr_rule), .addr_size = sizeof(u32), diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 4be23da32b89..ff76eecfd622 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -79,7 +79,7 @@ struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl) #define IPV6_ADDR_LABEL_DEFAULT 0xffffffffUL -static const __net_initdata struct ip6addrlbl_init_table +static const __net_initconst struct ip6addrlbl_init_table { const struct in6_addr *prefix; int prefixlen; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 0ff1cfd55bc4..d9fb9110f607 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -238,7 +238,7 @@ static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) + nla_total_size(16); /* src */ } -static const struct fib_rules_ops __net_initdata fib6_rules_ops_template = { +static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .family = AF_INET6, .rule_size = sizeof(struct fib6_rule), .addr_size = sizeof(struct in6_addr), diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 08ea3f0b6e55..f7c7c6319720 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -205,7 +205,7 @@ static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, return 0; } -static const struct fib_rules_ops __net_initdata ip6mr_rules_ops_template = { +static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = { .family = RTNL_FAMILY_IP6MR, .rule_size = sizeof(struct ip6mr_rule), .addr_size = sizeof(struct in6_addr), -- cgit v1.2.3 From 6dc878a8ca39e93f70c42f3dd7260bde10c1e0f1 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Thu, 4 Oct 2012 20:15:48 +0000 Subject: netlink: add reference of module in netlink_dump_start I get a panic when I use ss -a and rmmod inet_diag at the same time. It's because netlink_dump uses inet_diag_dump which belongs to module inet_diag. I search the codes and find many modules have the same problem. We need to add a reference to the module which the cb->dump belongs to. Thanks for all help from Stephen,Jan,Eric,Steffen and Pablo. Change From v3: change netlink_dump_start to inline,suggestion from Pablo and Eric. Change From v2: delete netlink_dump_done,and call module_put in netlink_dump and netlink_sock_destruct. Signed-off-by: Gao feng Signed-off-by: David S. Miller --- include/linux/netlink.h | 20 ++++++++++++++++---- net/netlink/af_netlink.c | 29 +++++++++++++++++++++-------- 2 files changed, 37 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index f80c56ac4d82..6d3af05c107c 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -245,6 +245,8 @@ struct netlink_callback { struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); void *data; + /* the module that dump function belong to */ + struct module *module; u16 family; u16 min_dump_alloc; unsigned int prev_seq, seq; @@ -262,14 +264,24 @@ __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int fla struct netlink_dump_control { int (*dump)(struct sk_buff *skb, struct netlink_callback *); - int (*done)(struct netlink_callback*); + int (*done)(struct netlink_callback *); void *data; + struct module *module; u16 min_dump_alloc; }; -extern int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, - const struct nlmsghdr *nlh, - struct netlink_dump_control *control); +extern int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct netlink_dump_control *control); +static inline int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct netlink_dump_control *control) +{ + if (!control->module) + control->module = THIS_MODULE; + + return __netlink_dump_start(ssk, skb, nlh, control); +} #endif /* __KERNEL__ */ diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 0f2e3ad69c47..01e944a017a4 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -169,6 +169,8 @@ static void netlink_sock_destruct(struct sock *sk) if (nlk->cb) { if (nlk->cb->done) nlk->cb->done(nlk->cb); + + module_put(nlk->cb->module); netlink_destroy_callback(nlk->cb); } @@ -1758,6 +1760,7 @@ static int netlink_dump(struct sock *sk) nlk->cb = NULL; mutex_unlock(nlk->cb_mutex); + module_put(cb->module); netlink_consume_callback(cb); return 0; @@ -1767,9 +1770,9 @@ errout_skb: return err; } -int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, - const struct nlmsghdr *nlh, - struct netlink_dump_control *control) +int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct netlink_dump_control *control) { struct netlink_callback *cb; struct sock *sk; @@ -1784,6 +1787,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb->done = control->done; cb->nlh = nlh; cb->data = control->data; + cb->module = control->module; cb->min_dump_alloc = control->min_dump_alloc; atomic_inc(&skb->users); cb->skb = skb; @@ -1794,19 +1798,28 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, return -ECONNREFUSED; } nlk = nlk_sk(sk); - /* A dump is in progress... */ + mutex_lock(nlk->cb_mutex); + /* A dump is in progress... */ if (nlk->cb) { mutex_unlock(nlk->cb_mutex); netlink_destroy_callback(cb); - sock_put(sk); - return -EBUSY; + ret = -EBUSY; + goto out; } + /* add reference of module which cb->dump belongs to */ + if (!try_module_get(cb->module)) { + mutex_unlock(nlk->cb_mutex); + netlink_destroy_callback(cb); + ret = -EPROTONOSUPPORT; + goto out; + } + nlk->cb = cb; mutex_unlock(nlk->cb_mutex); ret = netlink_dump(sk); - +out: sock_put(sk); if (ret) @@ -1817,7 +1830,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, */ return -EINTR; } -EXPORT_SYMBOL(netlink_dump_start); +EXPORT_SYMBOL(__netlink_dump_start); void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) { -- cgit v1.2.3 From acb600def2110b1310466c0e485c0d26299898ae Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Oct 2012 06:23:55 +0000 Subject: net: remove skb recycling Over time, skb recycling infrastructure got litle interest and many bugs. Generic rx path skb allocation is now using page fragments for efficient GRO / TCP coalescing, and recyling a tx skb for rx path is not worth the pain. Last identified bug is that fat skbs can be recycled and it can endup using high order pages after few iterations. With help from Maxime Bizon, who pointed out that commit 87151b8689d (net: allow pskb_expand_head() to get maximum tailroom) introduced this regression for recycled skbs. Instead of fixing this bug, lets remove skb recycling. Drivers wanting really hot skbs should use build_skb() anyway, to allocate/populate sk_buff right before netif_receive_skb() Signed-off-by: Eric Dumazet Cc: Maxime Bizon Signed-off-by: David S. Miller --- drivers/net/ethernet/calxeda/xgmac.c | 19 +-------- drivers/net/ethernet/freescale/gianfar.c | 27 ++----------- drivers/net/ethernet/freescale/gianfar.h | 2 - drivers/net/ethernet/freescale/ucc_geth.c | 29 +++----------- drivers/net/ethernet/freescale/ucc_geth.h | 2 - drivers/net/ethernet/marvell/mv643xx_eth.c | 18 +-------- drivers/net/ethernet/stmicro/stmmac/stmmac.h | 1 - drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 20 +--------- include/linux/skbuff.h | 24 ------------ net/core/skbuff.c | 47 ----------------------- 10 files changed, 16 insertions(+), 173 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/calxeda/xgmac.c b/drivers/net/ethernet/calxeda/xgmac.c index 2b4b4f529ab4..16814b34d4b6 100644 --- a/drivers/net/ethernet/calxeda/xgmac.c +++ b/drivers/net/ethernet/calxeda/xgmac.c @@ -375,7 +375,6 @@ struct xgmac_priv { unsigned int tx_tail; void __iomem *base; - struct sk_buff_head rx_recycle; unsigned int dma_buf_sz; dma_addr_t dma_rx_phy; dma_addr_t dma_tx_phy; @@ -672,9 +671,7 @@ static void xgmac_rx_refill(struct xgmac_priv *priv) p = priv->dma_rx + entry; if (priv->rx_skbuff[entry] == NULL) { - skb = __skb_dequeue(&priv->rx_recycle); - if (skb == NULL) - skb = netdev_alloc_skb(priv->dev, priv->dma_buf_sz); + skb = netdev_alloc_skb(priv->dev, priv->dma_buf_sz); if (unlikely(skb == NULL)) break; @@ -887,17 +884,7 @@ static void xgmac_tx_complete(struct xgmac_priv *priv) desc_get_buf_len(p), DMA_TO_DEVICE); } - /* - * If there's room in the queue (limit it to size) - * we add this skb back into the pool, - * if it's the right size. - */ - if ((skb_queue_len(&priv->rx_recycle) < - DMA_RX_RING_SZ) && - skb_recycle_check(skb, priv->dma_buf_sz)) - __skb_queue_head(&priv->rx_recycle, skb); - else - dev_kfree_skb(skb); + dev_kfree_skb(skb); } if (dma_ring_space(priv->tx_head, priv->tx_tail, DMA_TX_RING_SZ) > @@ -1016,7 +1003,6 @@ static int xgmac_open(struct net_device *dev) dev->dev_addr); } - skb_queue_head_init(&priv->rx_recycle); memset(&priv->xstats, 0, sizeof(struct xgmac_extra_stats)); /* Initialize the XGMAC and descriptors */ @@ -1053,7 +1039,6 @@ static int xgmac_stop(struct net_device *dev) napi_disable(&priv->napi); writel(0, priv->base + XGMAC_DMA_INTR_ENA); - skb_queue_purge(&priv->rx_recycle); /* Disable the MAC core */ xgmac_mac_disable(priv->base); diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index a1b52ec3b930..1d03dcdd5e56 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -1765,7 +1765,6 @@ static void free_skb_resources(struct gfar_private *priv) sizeof(struct rxbd8) * priv->total_rx_ring_size, priv->tx_queue[0]->tx_bd_base, priv->tx_queue[0]->tx_bd_dma_base); - skb_queue_purge(&priv->rx_recycle); } void gfar_start(struct net_device *dev) @@ -1943,8 +1942,6 @@ static int gfar_enet_open(struct net_device *dev) enable_napi(priv); - skb_queue_head_init(&priv->rx_recycle); - /* Initialize a bunch of registers */ init_registers(dev); @@ -2533,16 +2530,7 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue) bytes_sent += skb->len; - /* If there's room in the queue (limit it to rx_buffer_size) - * we add this skb back into the pool, if it's the right size - */ - if (skb_queue_len(&priv->rx_recycle) < rx_queue->rx_ring_size && - skb_recycle_check(skb, priv->rx_buffer_size + - RXBUF_ALIGNMENT)) { - gfar_align_skb(skb); - skb_queue_head(&priv->rx_recycle, skb); - } else - dev_kfree_skb_any(skb); + dev_kfree_skb_any(skb); tx_queue->tx_skbuff[skb_dirtytx] = NULL; @@ -2608,7 +2596,7 @@ static void gfar_new_rxbdp(struct gfar_priv_rx_q *rx_queue, struct rxbd8 *bdp, static struct sk_buff *gfar_alloc_skb(struct net_device *dev) { struct gfar_private *priv = netdev_priv(dev); - struct sk_buff *skb = NULL; + struct sk_buff *skb; skb = netdev_alloc_skb(dev, priv->rx_buffer_size + RXBUF_ALIGNMENT); if (!skb) @@ -2621,14 +2609,7 @@ static struct sk_buff *gfar_alloc_skb(struct net_device *dev) struct sk_buff *gfar_new_skb(struct net_device *dev) { - struct gfar_private *priv = netdev_priv(dev); - struct sk_buff *skb = NULL; - - skb = skb_dequeue(&priv->rx_recycle); - if (!skb) - skb = gfar_alloc_skb(dev); - - return skb; + return gfar_alloc_skb(dev); } static inline void count_errors(unsigned short status, struct net_device *dev) @@ -2787,7 +2768,7 @@ int gfar_clean_rx_ring(struct gfar_priv_rx_q *rx_queue, int rx_work_limit) if (unlikely(!newskb)) newskb = skb; else if (skb) - skb_queue_head(&priv->rx_recycle, skb); + dev_kfree_skb(skb); } else { /* Increment the number of packets */ rx_queue->stats.rx_packets++; diff --git a/drivers/net/ethernet/freescale/gianfar.h b/drivers/net/ethernet/freescale/gianfar.h index 4141ef2ddafc..22eabc13ca99 100644 --- a/drivers/net/ethernet/freescale/gianfar.h +++ b/drivers/net/ethernet/freescale/gianfar.h @@ -1080,8 +1080,6 @@ struct gfar_private { u32 cur_filer_idx; - struct sk_buff_head rx_recycle; - /* RX queue filer rule set*/ struct ethtool_rx_list rx_list; struct mutex rx_queue_access; diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 164288439220..dfa0aaaab009 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -209,14 +209,12 @@ static struct list_head *dequeue(struct list_head *lh) static struct sk_buff *get_new_skb(struct ucc_geth_private *ugeth, u8 __iomem *bd) { - struct sk_buff *skb = NULL; + struct sk_buff *skb; - skb = __skb_dequeue(&ugeth->rx_recycle); + skb = netdev_alloc_skb(ugeth->ndev, + ugeth->ug_info->uf_info.max_rx_buf_length + + UCC_GETH_RX_DATA_BUF_ALIGNMENT); if (!skb) - skb = netdev_alloc_skb(ugeth->ndev, - ugeth->ug_info->uf_info.max_rx_buf_length + - UCC_GETH_RX_DATA_BUF_ALIGNMENT); - if (skb == NULL) return NULL; /* We need the data buffer to be aligned properly. We will reserve @@ -2020,8 +2018,6 @@ static void ucc_geth_memclean(struct ucc_geth_private *ugeth) iounmap(ugeth->ug_regs); ugeth->ug_regs = NULL; } - - skb_queue_purge(&ugeth->rx_recycle); } static void ucc_geth_set_multi(struct net_device *dev) @@ -2230,8 +2226,6 @@ static int ucc_struct_init(struct ucc_geth_private *ugeth) return -ENOMEM; } - skb_queue_head_init(&ugeth->rx_recycle); - return 0; } @@ -3274,12 +3268,7 @@ static int ucc_geth_rx(struct ucc_geth_private *ugeth, u8 rxQ, int rx_work_limit if (netif_msg_rx_err(ugeth)) ugeth_err("%s, %d: ERROR!!! skb - 0x%08x", __func__, __LINE__, (u32) skb); - if (skb) { - skb->data = skb->head + NET_SKB_PAD; - skb->len = 0; - skb_reset_tail_pointer(skb); - __skb_queue_head(&ugeth->rx_recycle, skb); - } + dev_free_skb(skb); ugeth->rx_skbuff[rxQ][ugeth->skb_currx[rxQ]] = NULL; dev->stats.rx_dropped++; @@ -3349,13 +3338,7 @@ static int ucc_geth_tx(struct net_device *dev, u8 txQ) dev->stats.tx_packets++; - if (skb_queue_len(&ugeth->rx_recycle) < RX_BD_RING_LEN && - skb_recycle_check(skb, - ugeth->ug_info->uf_info.max_rx_buf_length + - UCC_GETH_RX_DATA_BUF_ALIGNMENT)) - __skb_queue_head(&ugeth->rx_recycle, skb); - else - dev_kfree_skb(skb); + dev_kfree_skb(skb); ugeth->tx_skbuff[txQ][ugeth->skb_dirtytx[txQ]] = NULL; ugeth->skb_dirtytx[txQ] = diff --git a/drivers/net/ethernet/freescale/ucc_geth.h b/drivers/net/ethernet/freescale/ucc_geth.h index f71b3e7b12de..75f337163ce3 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.h +++ b/drivers/net/ethernet/freescale/ucc_geth.h @@ -1214,8 +1214,6 @@ struct ucc_geth_private { /* index of the first skb which hasn't been transmitted yet. */ u16 skb_dirtytx[NUM_TX_QUEUES]; - struct sk_buff_head rx_recycle; - struct ugeth_mii_info *mii_info; struct phy_device *phydev; phy_interface_t phy_interface; diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 087b9e0669f1..84c13263c514 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -412,7 +412,6 @@ struct mv643xx_eth_private { u8 work_rx_refill; int skb_size; - struct sk_buff_head rx_recycle; /* * RX state. @@ -673,9 +672,7 @@ static int rxq_refill(struct rx_queue *rxq, int budget) struct rx_desc *rx_desc; int size; - skb = __skb_dequeue(&mp->rx_recycle); - if (skb == NULL) - skb = netdev_alloc_skb(mp->dev, mp->skb_size); + skb = netdev_alloc_skb(mp->dev, mp->skb_size); if (skb == NULL) { mp->oom = 1; @@ -989,14 +986,7 @@ static int txq_reclaim(struct tx_queue *txq, int budget, int force) desc->byte_cnt, DMA_TO_DEVICE); } - if (skb != NULL) { - if (skb_queue_len(&mp->rx_recycle) < - mp->rx_ring_size && - skb_recycle_check(skb, mp->skb_size)) - __skb_queue_head(&mp->rx_recycle, skb); - else - dev_kfree_skb(skb); - } + dev_kfree_skb(skb); } __netif_tx_unlock(nq); @@ -2349,8 +2339,6 @@ static int mv643xx_eth_open(struct net_device *dev) napi_enable(&mp->napi); - skb_queue_head_init(&mp->rx_recycle); - mp->int_mask = INT_EXT; for (i = 0; i < mp->rxq_count; i++) { @@ -2445,8 +2433,6 @@ static int mv643xx_eth_stop(struct net_device *dev) mib_counters_update(mp); del_timer_sync(&mp->mib_counters_timer); - skb_queue_purge(&mp->rx_recycle); - for (i = 0; i < mp->rxq_count; i++) rxq_deinit(mp->rxq + i); for (i = 0; i < mp->txq_count; i++) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index e872e1da3137..7d51a65ab099 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -50,7 +50,6 @@ struct stmmac_priv { unsigned int dirty_rx; struct sk_buff **rx_skbuff; dma_addr_t *rx_skbuff_dma; - struct sk_buff_head rx_recycle; struct net_device *dev; dma_addr_t dma_rx_phy; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 3be88331d17a..c6cdbc4eb05e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -747,18 +747,7 @@ static void stmmac_tx(struct stmmac_priv *priv) priv->hw->ring->clean_desc3(p); if (likely(skb != NULL)) { - /* - * If there's room in the queue (limit it to size) - * we add this skb back into the pool, - * if it's the right size. - */ - if ((skb_queue_len(&priv->rx_recycle) < - priv->dma_rx_size) && - skb_recycle_check(skb, priv->dma_buf_sz)) - __skb_queue_head(&priv->rx_recycle, skb); - else - dev_kfree_skb(skb); - + dev_kfree_skb(skb); priv->tx_skbuff[entry] = NULL; } @@ -1169,7 +1158,6 @@ static int stmmac_open(struct net_device *dev) priv->eee_enabled = stmmac_eee_init(priv); napi_enable(&priv->napi); - skb_queue_head_init(&priv->rx_recycle); netif_start_queue(dev); return 0; @@ -1222,7 +1210,6 @@ static int stmmac_release(struct net_device *dev) kfree(priv->tm); #endif napi_disable(&priv->napi); - skb_queue_purge(&priv->rx_recycle); /* Free the IRQ lines */ free_irq(dev->irq, dev); @@ -1388,10 +1375,7 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv) if (likely(priv->rx_skbuff[entry] == NULL)) { struct sk_buff *skb; - skb = __skb_dequeue(&priv->rx_recycle); - if (skb == NULL) - skb = netdev_alloc_skb_ip_align(priv->dev, - bfsize); + skb = netdev_alloc_skb_ip_align(priv->dev, bfsize); if (unlikely(skb == NULL)) break; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b33a3a1f205e..6a2c34e6d962 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -589,9 +589,6 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE); } -extern void skb_recycle(struct sk_buff *skb); -extern bool skb_recycle_check(struct sk_buff *skb, int skb_size); - extern struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); extern int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); extern struct sk_buff *skb_clone(struct sk_buff *skb, @@ -2645,27 +2642,6 @@ static inline void skb_checksum_none_assert(const struct sk_buff *skb) bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); -static inline bool skb_is_recycleable(const struct sk_buff *skb, int skb_size) -{ - if (irqs_disabled()) - return false; - - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) - return false; - - if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE) - return false; - - skb_size = SKB_DATA_ALIGN(skb_size + NET_SKB_PAD); - if (skb_end_offset(skb) < skb_size) - return false; - - if (skb_shared(skb) || skb_cloned(skb)) - return false; - - return true; -} - /** * skb_head_is_locked - Determine if the skb->head is locked down * @skb: skb to check diff --git a/net/core/skbuff.c b/net/core/skbuff.c index cdc28598f4ef..6e04b1fa11f2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -655,53 +655,6 @@ void consume_skb(struct sk_buff *skb) } EXPORT_SYMBOL(consume_skb); -/** - * skb_recycle - clean up an skb for reuse - * @skb: buffer - * - * Recycles the skb to be reused as a receive buffer. This - * function does any necessary reference count dropping, and - * cleans up the skbuff as if it just came from __alloc_skb(). - */ -void skb_recycle(struct sk_buff *skb) -{ - struct skb_shared_info *shinfo; - - skb_release_head_state(skb); - - shinfo = skb_shinfo(skb); - memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); - atomic_set(&shinfo->dataref, 1); - - memset(skb, 0, offsetof(struct sk_buff, tail)); - skb->data = skb->head + NET_SKB_PAD; - skb_reset_tail_pointer(skb); -} -EXPORT_SYMBOL(skb_recycle); - -/** - * skb_recycle_check - check if skb can be reused for receive - * @skb: buffer - * @skb_size: minimum receive buffer size - * - * Checks that the skb passed in is not shared or cloned, and - * that it is linear and its head portion at least as large as - * skb_size so that it can be recycled as a receive buffer. - * If these conditions are met, this function does any necessary - * reference count dropping and cleans up the skbuff as if it - * just came from __alloc_skb(). - */ -bool skb_recycle_check(struct sk_buff *skb, int skb_size) -{ - if (!skb_is_recycleable(skb, skb_size)) - return false; - - skb_recycle(skb); - - return true; -} -EXPORT_SYMBOL(skb_recycle_check); - static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { new->tstamp = old->tstamp; -- cgit v1.2.3 From e1f165032c8bade3a6bdf546f8faf61fda4dd01c Mon Sep 17 00:00:00 2001 From: "ramesh.nagappa@gmail.com" Date: Fri, 5 Oct 2012 19:10:15 +0000 Subject: net: Fix skb_under_panic oops in neigh_resolve_output The retry loop in neigh_resolve_output() and neigh_connected_output() call dev_hard_header() with out reseting the skb to network_header. This causes the retry to fail with skb_under_panic. The fix is to reset the network_header within the retry loop. Signed-off-by: Ramesh Nagappa Reviewed-by: Shawn Lu Reviewed-by: Robert Coulson Reviewed-by: Billie Alsup Signed-off-by: David S. Miller --- net/core/neighbour.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index baca771caae2..22571488730a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1301,8 +1301,6 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) if (!dst) goto discard; - __skb_pull(skb, skb_network_offset(skb)); - if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; @@ -1312,6 +1310,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) neigh_hh_init(neigh, dst); do { + __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); @@ -1342,9 +1341,8 @@ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) unsigned int seq; int err; - __skb_pull(skb, skb_network_offset(skb)); - do { + __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); -- cgit v1.2.3 From 51ec04038c113a811b177baa85d293feff9ce995 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Oct 2012 20:43:30 +0000 Subject: ipv6: GRO should be ECN friendly IPv4 side of the problem was addressed in commit a9e050f4e7f9d (net: tcp: GRO should be ECN friendly) This patch does the same, but for IPv6 : A Traffic Class mismatch doesnt mean flows are different, but instead should force a flush of previous packets. This patch removes artificial packet reordering problem. Signed-off-by: Eric Dumazet Cc: Herbert Xu Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e22e6d88bac6..f757e3b7cfbf 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -880,22 +880,25 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, nlen = skb_network_header_len(skb); for (p = *head; p; p = p->next) { - struct ipv6hdr *iph2; + const struct ipv6hdr *iph2; + __be32 first_word; /* */ if (!NAPI_GRO_CB(p)->same_flow) continue; iph2 = ipv6_hdr(p); + first_word = *(__be32 *)iph ^ *(__be32 *)iph2 ; - /* All fields must match except length. */ + /* All fields must match except length and Traffic Class. */ if (nlen != skb_network_header_len(p) || - memcmp(iph, iph2, offsetof(struct ipv6hdr, payload_len)) || + (first_word & htonl(0xF00FFFFF)) || memcmp(&iph->nexthdr, &iph2->nexthdr, nlen - offsetof(struct ipv6hdr, nexthdr))) { NAPI_GRO_CB(p)->same_flow = 0; continue; } - + /* flush if Traffic Class fields are different */ + NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000)); NAPI_GRO_CB(p)->flush |= flush; } -- cgit v1.2.3 From ca07e43e288956a0ad5e6bd075f7aa1fca3bca00 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 6 Oct 2012 22:28:06 +0000 Subject: net: gro: fix a potential crash in skb_gro_reset_offset Before accessing skb first fragment, better make sure there is one. This is probably not needed for old kernels, since an ethernet frame cannot contain only an ethernet header, but the recent GRO addition to tunnels makes this patch needed. Also skb_gro_reset_offset() can be static, it actually allows compiler to inline it. Signed-off-by: Eric Dumazet Cc: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 - net/core/dev.c | 14 ++++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 01646aa53b0e..a659fd0ba965 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1663,7 +1663,6 @@ extern int netpoll_trap(void); #endif extern int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb); -extern void skb_gro_reset_offset(struct sk_buff *skb); static inline unsigned int skb_gro_offset(const struct sk_buff *skb) { diff --git a/net/core/dev.c b/net/core/dev.c index 1e0a1847c3bb..de2bad717d56 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3631,20 +3631,22 @@ gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) } EXPORT_SYMBOL(napi_skb_finish); -void skb_gro_reset_offset(struct sk_buff *skb) +static void skb_gro_reset_offset(struct sk_buff *skb) { + const struct skb_shared_info *pinfo = skb_shinfo(skb); + const skb_frag_t *frag0 = &pinfo->frags[0]; + NAPI_GRO_CB(skb)->data_offset = 0; NAPI_GRO_CB(skb)->frag0 = NULL; NAPI_GRO_CB(skb)->frag0_len = 0; if (skb->mac_header == skb->tail && - !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) { - NAPI_GRO_CB(skb)->frag0 = - skb_frag_address(&skb_shinfo(skb)->frags[0]); - NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]); + pinfo->nr_frags && + !PageHighMem(skb_frag_page(frag0))) { + NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); + NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); } } -EXPORT_SYMBOL(skb_gro_reset_offset); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { -- cgit v1.2.3 From cf7f601c067994f371ba77721d1e45fce61a4569 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 13 Sep 2012 13:06:29 +0100 Subject: KEYS: Add payload preparsing opportunity prior to key instantiate or update Give the key type the opportunity to preparse the payload prior to the instantiation and update routines being called. This is done with the provision of two new key type operations: int (*preparse)(struct key_preparsed_payload *prep); void (*free_preparse)(struct key_preparsed_payload *prep); If the first operation is present, then it is called before key creation (in the add/update case) or before the key semaphore is taken (in the update and instantiate cases). The second operation is called to clean up if the first was called. preparse() is given the opportunity to fill in the following structure: struct key_preparsed_payload { char *description; void *type_data[2]; void *payload; const void *data; size_t datalen; size_t quotalen; }; Before the preparser is called, the first three fields will have been cleared, the payload pointer and size will be stored in data and datalen and the default quota size from the key_type struct will be stored into quotalen. The preparser may parse the payload in any way it likes and may store data in the type_data[] and payload fields for use by the instantiate() and update() ops. The preparser may also propose a description for the key by attaching it as a string to the description field. This can be used by passing a NULL or "" description to the add_key() system call or the key_create_or_update() function. This cannot work with request_key() as that required the description to tell the upcall about the key to be created. This, for example permits keys that store PGP public keys to generate their own name from the user ID and public key fingerprint in the key. The instantiate() and update() operations are then modified to look like this: int (*instantiate)(struct key *key, struct key_preparsed_payload *prep); int (*update)(struct key *key, struct key_preparsed_payload *prep); and the new payload data is passed in *prep, whether or not it was preparsed. Signed-off-by: David Howells Signed-off-by: Rusty Russell --- Documentation/security/keys.txt | 50 +++++++++++++- fs/cifs/cifs_spnego.c | 6 +- fs/cifs/cifsacl.c | 8 +-- include/keys/user-type.h | 6 +- include/linux/key-type.h | 35 +++++++++- net/ceph/crypto.c | 9 +-- net/dns_resolver/dns_key.c | 6 +- net/rxrpc/ar-key.c | 40 +++++------ security/keys/encrypted-keys/encrypted.c | 16 +++-- security/keys/key.c | 114 ++++++++++++++++++++++--------- security/keys/keyctl.c | 18 +++-- security/keys/keyring.c | 6 +- security/keys/request_key_auth.c | 8 +-- security/keys/trusted.c | 16 +++-- security/keys/user_defined.c | 14 ++-- 15 files changed, 250 insertions(+), 102 deletions(-) (limited to 'net') diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt index aa0dbd74b71b..7d9ca92022d8 100644 --- a/Documentation/security/keys.txt +++ b/Documentation/security/keys.txt @@ -412,6 +412,10 @@ The main syscalls are: to the keyring. In this case, an error will be generated if the process does not have permission to write to the keyring. + If the key type supports it, if the description is NULL or an empty + string, the key type will try and generate a description from the content + of the payload. + The payload is optional, and the pointer can be NULL if not required by the type. The payload is plen in size, and plen can be zero for an empty payload. @@ -1114,12 +1118,53 @@ The structure has a number of fields, some of which are mandatory: it should return 0. - (*) int (*instantiate)(struct key *key, const void *data, size_t datalen); + (*) int (*preparse)(struct key_preparsed_payload *prep); + + This optional method permits the key type to attempt to parse payload + before a key is created (add key) or the key semaphore is taken (update or + instantiate key). The structure pointed to by prep looks like: + + struct key_preparsed_payload { + char *description; + void *type_data[2]; + void *payload; + const void *data; + size_t datalen; + size_t quotalen; + }; + + Before calling the method, the caller will fill in data and datalen with + the payload blob parameters; quotalen will be filled in with the default + quota size from the key type and the rest will be cleared. + + If a description can be proposed from the payload contents, that should be + attached as a string to the description field. This will be used for the + key description if the caller of add_key() passes NULL or "". + + The method can attach anything it likes to type_data[] and payload. These + are merely passed along to the instantiate() or update() operations. + + The method should return 0 if success ful or a negative error code + otherwise. + + + (*) void (*free_preparse)(struct key_preparsed_payload *prep); + + This method is only required if the preparse() method is provided, + otherwise it is unused. It cleans up anything attached to the + description, type_data and payload fields of the key_preparsed_payload + struct as filled in by the preparse() method. + + + (*) int (*instantiate)(struct key *key, struct key_preparsed_payload *prep); This method is called to attach a payload to a key during construction. The payload attached need not bear any relation to the data passed to this function. + The prep->data and prep->datalen fields will define the original payload + blob. If preparse() was supplied then other fields may be filled in also. + If the amount of data attached to the key differs from the size in keytype->def_datalen, then key_payload_reserve() should be called. @@ -1135,6 +1180,9 @@ The structure has a number of fields, some of which are mandatory: If this type of key can be updated, then this method should be provided. It is called to update a key's payload from the blob of data provided. + The prep->data and prep->datalen fields will define the original payload + blob. If preparse() was supplied then other fields may be filled in also. + key_payload_reserve() should be called if the data length might change before any changes are actually made. Note that if this succeeds, the type is committed to changing the key because it's already been altered, so all diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index e622863b292f..086f381d6489 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -31,18 +31,18 @@ /* create a new cifs key */ static int -cifs_spnego_key_instantiate(struct key *key, const void *data, size_t datalen) +cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { char *payload; int ret; ret = -ENOMEM; - payload = kmalloc(datalen, GFP_KERNEL); + payload = kmalloc(prep->datalen, GFP_KERNEL); if (!payload) goto error; /* attach the data */ - memcpy(payload, data, datalen); + memcpy(payload, prep->data, prep->datalen); key->payload.data = payload; ret = 0; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 05f4dc263a23..f3c60e264ca8 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -167,17 +167,17 @@ static struct shrinker cifs_shrinker = { }; static int -cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen) +cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { char *payload; - payload = kmalloc(datalen, GFP_KERNEL); + payload = kmalloc(prep->datalen, GFP_KERNEL); if (!payload) return -ENOMEM; - memcpy(payload, data, datalen); + memcpy(payload, prep->data, prep->datalen); key->payload.data = payload; - key->datalen = datalen; + key->datalen = prep->datalen; return 0; } diff --git a/include/keys/user-type.h b/include/keys/user-type.h index bc9ec1d7698c..5e452c84f1e6 100644 --- a/include/keys/user-type.h +++ b/include/keys/user-type.h @@ -35,8 +35,10 @@ struct user_key_payload { extern struct key_type key_type_user; extern struct key_type key_type_logon; -extern int user_instantiate(struct key *key, const void *data, size_t datalen); -extern int user_update(struct key *key, const void *data, size_t datalen); +struct key_preparsed_payload; + +extern int user_instantiate(struct key *key, struct key_preparsed_payload *prep); +extern int user_update(struct key *key, struct key_preparsed_payload *prep); extern int user_match(const struct key *key, const void *criterion); extern void user_revoke(struct key *key); extern void user_destroy(struct key *key); diff --git a/include/linux/key-type.h b/include/linux/key-type.h index f0c651cda7b0..518a53afb9ea 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -26,6 +26,27 @@ struct key_construction { struct key *authkey;/* authorisation for key being constructed */ }; +/* + * Pre-parsed payload, used by key add, update and instantiate. + * + * This struct will be cleared and data and datalen will be set with the data + * and length parameters from the caller and quotalen will be set from + * def_datalen from the key type. Then if the preparse() op is provided by the + * key type, that will be called. Then the struct will be passed to the + * instantiate() or the update() op. + * + * If the preparse() op is given, the free_preparse() op will be called to + * clear the contents. + */ +struct key_preparsed_payload { + char *description; /* Proposed key description (or NULL) */ + void *type_data[2]; /* Private key-type data */ + void *payload; /* Proposed payload */ + const void *data; /* Raw data */ + size_t datalen; /* Raw datalen */ + size_t quotalen; /* Quota length for proposed payload */ +}; + typedef int (*request_key_actor_t)(struct key_construction *key, const char *op, void *aux); @@ -45,18 +66,28 @@ struct key_type { /* vet a description */ int (*vet_description)(const char *description); + /* Preparse the data blob from userspace that is to be the payload, + * generating a proposed description and payload that will be handed to + * the instantiate() and update() ops. + */ + int (*preparse)(struct key_preparsed_payload *prep); + + /* Free a preparse data structure. + */ + void (*free_preparse)(struct key_preparsed_payload *prep); + /* instantiate a key of this type * - this method should call key_payload_reserve() to determine if the * user's quota will hold the payload */ - int (*instantiate)(struct key *key, const void *data, size_t datalen); + int (*instantiate)(struct key *key, struct key_preparsed_payload *prep); /* update a key of this type (optional) * - this method should call key_payload_reserve() to recalculate the * quota consumption * - the key must be locked against read when modifying */ - int (*update)(struct key *key, const void *data, size_t datalen); + int (*update)(struct key *key, struct key_preparsed_payload *prep); /* match a key against a description */ int (*match)(const struct key *key, const void *desc); diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 9da7fdd3cd8a..af14cb425164 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -423,14 +423,15 @@ int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, } } -int ceph_key_instantiate(struct key *key, const void *data, size_t datalen) +int ceph_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { struct ceph_crypto_key *ckey; + size_t datalen = prep->datalen; int ret; void *p; ret = -EINVAL; - if (datalen <= 0 || datalen > 32767 || !data) + if (datalen <= 0 || datalen > 32767 || !prep->data) goto err; ret = key_payload_reserve(key, datalen); @@ -443,8 +444,8 @@ int ceph_key_instantiate(struct key *key, const void *data, size_t datalen) goto err; /* TODO ceph_crypto_key_decode should really take const input */ - p = (void *)data; - ret = ceph_crypto_key_decode(ckey, &p, (char*)data+datalen); + p = (void *)prep->data; + ret = ceph_crypto_key_decode(ckey, &p, (char*)prep->data+datalen); if (ret < 0) goto err_ckey; diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index d9507dd05818..859ab8b6ec34 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -59,13 +59,13 @@ const struct cred *dns_resolver_cache; * "ip1,ip2,...#foo=bar" */ static int -dns_resolver_instantiate(struct key *key, const void *_data, size_t datalen) +dns_resolver_instantiate(struct key *key, struct key_preparsed_payload *prep) { struct user_key_payload *upayload; unsigned long derrno; int ret; - size_t result_len = 0; - const char *data = _data, *end, *opt; + size_t datalen = prep->datalen, result_len = 0; + const char *data = prep->data, *end, *opt; kenter("%%%d,%s,'%*.*s',%zu", key->serial, key->description, diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index 8b1f9f49960f..106c5a6b1ab2 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -26,8 +26,8 @@ #include "ar-internal.h" static int rxrpc_vet_description_s(const char *); -static int rxrpc_instantiate(struct key *, const void *, size_t); -static int rxrpc_instantiate_s(struct key *, const void *, size_t); +static int rxrpc_instantiate(struct key *, struct key_preparsed_payload *); +static int rxrpc_instantiate_s(struct key *, struct key_preparsed_payload *); static void rxrpc_destroy(struct key *); static void rxrpc_destroy_s(struct key *); static void rxrpc_describe(const struct key *, struct seq_file *); @@ -678,7 +678,7 @@ error: * * if no data is provided, then a no-security key is made */ -static int rxrpc_instantiate(struct key *key, const void *data, size_t datalen) +static int rxrpc_instantiate(struct key *key, struct key_preparsed_payload *prep) { const struct rxrpc_key_data_v1 *v1; struct rxrpc_key_token *token, **pp; @@ -686,26 +686,26 @@ static int rxrpc_instantiate(struct key *key, const void *data, size_t datalen) u32 kver; int ret; - _enter("{%x},,%zu", key_serial(key), datalen); + _enter("{%x},,%zu", key_serial(key), prep->datalen); /* handle a no-security key */ - if (!data && datalen == 0) + if (!prep->data && prep->datalen == 0) return 0; /* determine if the XDR payload format is being used */ - if (datalen > 7 * 4) { - ret = rxrpc_instantiate_xdr(key, data, datalen); + if (prep->datalen > 7 * 4) { + ret = rxrpc_instantiate_xdr(key, prep->data, prep->datalen); if (ret != -EPROTO) return ret; } /* get the key interface version number */ ret = -EINVAL; - if (datalen <= 4 || !data) + if (prep->datalen <= 4 || !prep->data) goto error; - memcpy(&kver, data, sizeof(kver)); - data += sizeof(kver); - datalen -= sizeof(kver); + memcpy(&kver, prep->data, sizeof(kver)); + prep->data += sizeof(kver); + prep->datalen -= sizeof(kver); _debug("KEY I/F VERSION: %u", kver); @@ -715,11 +715,11 @@ static int rxrpc_instantiate(struct key *key, const void *data, size_t datalen) /* deal with a version 1 key */ ret = -EINVAL; - if (datalen < sizeof(*v1)) + if (prep->datalen < sizeof(*v1)) goto error; - v1 = data; - if (datalen != sizeof(*v1) + v1->ticket_length) + v1 = prep->data; + if (prep->datalen != sizeof(*v1) + v1->ticket_length) goto error; _debug("SCIX: %u", v1->security_index); @@ -784,17 +784,17 @@ error: * instantiate a server secret key * data should be a pointer to the 8-byte secret key */ -static int rxrpc_instantiate_s(struct key *key, const void *data, - size_t datalen) +static int rxrpc_instantiate_s(struct key *key, + struct key_preparsed_payload *prep) { struct crypto_blkcipher *ci; - _enter("{%x},,%zu", key_serial(key), datalen); + _enter("{%x},,%zu", key_serial(key), prep->datalen); - if (datalen != 8) + if (prep->datalen != 8) return -EINVAL; - memcpy(&key->type_data, data, 8); + memcpy(&key->type_data, prep->data, 8); ci = crypto_alloc_blkcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(ci)) { @@ -802,7 +802,7 @@ static int rxrpc_instantiate_s(struct key *key, const void *data, return PTR_ERR(ci); } - if (crypto_blkcipher_setkey(ci, data, 8) < 0) + if (crypto_blkcipher_setkey(ci, prep->data, 8) < 0) BUG(); key->payload.data = ci; diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index 2d1bb8af7696..9e1e005c7596 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -773,8 +773,8 @@ static int encrypted_init(struct encrypted_key_payload *epayload, * * On success, return 0. Otherwise return errno. */ -static int encrypted_instantiate(struct key *key, const void *data, - size_t datalen) +static int encrypted_instantiate(struct key *key, + struct key_preparsed_payload *prep) { struct encrypted_key_payload *epayload = NULL; char *datablob = NULL; @@ -782,16 +782,17 @@ static int encrypted_instantiate(struct key *key, const void *data, char *master_desc = NULL; char *decrypted_datalen = NULL; char *hex_encoded_iv = NULL; + size_t datalen = prep->datalen; int ret; - if (datalen <= 0 || datalen > 32767 || !data) + if (datalen <= 0 || datalen > 32767 || !prep->data) return -EINVAL; datablob = kmalloc(datalen + 1, GFP_KERNEL); if (!datablob) return -ENOMEM; datablob[datalen] = 0; - memcpy(datablob, data, datalen); + memcpy(datablob, prep->data, datalen); ret = datablob_parse(datablob, &format, &master_desc, &decrypted_datalen, &hex_encoded_iv); if (ret < 0) @@ -834,16 +835,17 @@ static void encrypted_rcu_free(struct rcu_head *rcu) * * On success, return 0. Otherwise return errno. */ -static int encrypted_update(struct key *key, const void *data, size_t datalen) +static int encrypted_update(struct key *key, struct key_preparsed_payload *prep) { struct encrypted_key_payload *epayload = key->payload.data; struct encrypted_key_payload *new_epayload; char *buf; char *new_master_desc = NULL; const char *format = NULL; + size_t datalen = prep->datalen; int ret = 0; - if (datalen <= 0 || datalen > 32767 || !data) + if (datalen <= 0 || datalen > 32767 || !prep->data) return -EINVAL; buf = kmalloc(datalen + 1, GFP_KERNEL); @@ -851,7 +853,7 @@ static int encrypted_update(struct key *key, const void *data, size_t datalen) return -ENOMEM; buf[datalen] = 0; - memcpy(buf, data, datalen); + memcpy(buf, prep->data, datalen); ret = datablob_parse(buf, &format, &new_master_desc, NULL, NULL); if (ret < 0) goto out; diff --git a/security/keys/key.c b/security/keys/key.c index 50d96d4e06f2..1d039af99f50 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -412,8 +412,7 @@ EXPORT_SYMBOL(key_payload_reserve); * key_construction_mutex. */ static int __key_instantiate_and_link(struct key *key, - const void *data, - size_t datalen, + struct key_preparsed_payload *prep, struct key *keyring, struct key *authkey, unsigned long *_prealloc) @@ -431,7 +430,7 @@ static int __key_instantiate_and_link(struct key *key, /* can't instantiate twice */ if (!test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) { /* instantiate the key */ - ret = key->type->instantiate(key, data, datalen); + ret = key->type->instantiate(key, prep); if (ret == 0) { /* mark the key as being instantiated */ @@ -482,22 +481,37 @@ int key_instantiate_and_link(struct key *key, struct key *keyring, struct key *authkey) { + struct key_preparsed_payload prep; unsigned long prealloc; int ret; + memset(&prep, 0, sizeof(prep)); + prep.data = data; + prep.datalen = datalen; + prep.quotalen = key->type->def_datalen; + if (key->type->preparse) { + ret = key->type->preparse(&prep); + if (ret < 0) + goto error; + } + if (keyring) { ret = __key_link_begin(keyring, key->type, key->description, &prealloc); if (ret < 0) - return ret; + goto error_free_preparse; } - ret = __key_instantiate_and_link(key, data, datalen, keyring, authkey, + ret = __key_instantiate_and_link(key, &prep, keyring, authkey, &prealloc); if (keyring) __key_link_end(keyring, key->type, prealloc); +error_free_preparse: + if (key->type->preparse) + key->type->free_preparse(&prep); +error: return ret; } @@ -706,7 +720,7 @@ void key_type_put(struct key_type *ktype) * if we get an error. */ static inline key_ref_t __key_update(key_ref_t key_ref, - const void *payload, size_t plen) + struct key_preparsed_payload *prep) { struct key *key = key_ref_to_ptr(key_ref); int ret; @@ -722,7 +736,7 @@ static inline key_ref_t __key_update(key_ref_t key_ref, down_write(&key->sem); - ret = key->type->update(key, payload, plen); + ret = key->type->update(key, prep); if (ret == 0) /* updating a negative key instantiates it */ clear_bit(KEY_FLAG_NEGATIVE, &key->flags); @@ -774,6 +788,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, unsigned long flags) { unsigned long prealloc; + struct key_preparsed_payload prep; const struct cred *cred = current_cred(); struct key_type *ktype; struct key *keyring, *key = NULL; @@ -789,8 +804,9 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, } key_ref = ERR_PTR(-EINVAL); - if (!ktype->match || !ktype->instantiate) - goto error_2; + if (!ktype->match || !ktype->instantiate || + (!description && !ktype->preparse)) + goto error_put_type; keyring = key_ref_to_ptr(keyring_ref); @@ -798,18 +814,37 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, key_ref = ERR_PTR(-ENOTDIR); if (keyring->type != &key_type_keyring) - goto error_2; + goto error_put_type; + + memset(&prep, 0, sizeof(prep)); + prep.data = payload; + prep.datalen = plen; + prep.quotalen = ktype->def_datalen; + if (ktype->preparse) { + ret = ktype->preparse(&prep); + if (ret < 0) { + key_ref = ERR_PTR(ret); + goto error_put_type; + } + if (!description) + description = prep.description; + key_ref = ERR_PTR(-EINVAL); + if (!description) + goto error_free_prep; + } ret = __key_link_begin(keyring, ktype, description, &prealloc); - if (ret < 0) - goto error_2; + if (ret < 0) { + key_ref = ERR_PTR(ret); + goto error_free_prep; + } /* if we're going to allocate a new key, we're going to have * to modify the keyring */ ret = key_permission(keyring_ref, KEY_WRITE); if (ret < 0) { key_ref = ERR_PTR(ret); - goto error_3; + goto error_link_end; } /* if it's possible to update this type of key, search for an existing @@ -840,25 +875,27 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, perm, flags); if (IS_ERR(key)) { key_ref = ERR_CAST(key); - goto error_3; + goto error_link_end; } /* instantiate it and link it into the target keyring */ - ret = __key_instantiate_and_link(key, payload, plen, keyring, NULL, - &prealloc); + ret = __key_instantiate_and_link(key, &prep, keyring, NULL, &prealloc); if (ret < 0) { key_put(key); key_ref = ERR_PTR(ret); - goto error_3; + goto error_link_end; } key_ref = make_key_ref(key, is_key_possessed(keyring_ref)); - error_3: +error_link_end: __key_link_end(keyring, ktype, prealloc); - error_2: +error_free_prep: + if (ktype->preparse) + ktype->free_preparse(&prep); +error_put_type: key_type_put(ktype); - error: +error: return key_ref; found_matching_key: @@ -866,10 +903,9 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, * - we can drop the locks first as we have the key pinned */ __key_link_end(keyring, ktype, prealloc); - key_type_put(ktype); - key_ref = __key_update(key_ref, payload, plen); - goto error; + key_ref = __key_update(key_ref, &prep); + goto error_free_prep; } EXPORT_SYMBOL(key_create_or_update); @@ -888,6 +924,7 @@ EXPORT_SYMBOL(key_create_or_update); */ int key_update(key_ref_t key_ref, const void *payload, size_t plen) { + struct key_preparsed_payload prep; struct key *key = key_ref_to_ptr(key_ref); int ret; @@ -900,18 +937,31 @@ int key_update(key_ref_t key_ref, const void *payload, size_t plen) /* attempt to update it if supported */ ret = -EOPNOTSUPP; - if (key->type->update) { - down_write(&key->sem); - - ret = key->type->update(key, payload, plen); - if (ret == 0) - /* updating a negative key instantiates it */ - clear_bit(KEY_FLAG_NEGATIVE, &key->flags); + if (!key->type->update) + goto error; - up_write(&key->sem); + memset(&prep, 0, sizeof(prep)); + prep.data = payload; + prep.datalen = plen; + prep.quotalen = key->type->def_datalen; + if (key->type->preparse) { + ret = key->type->preparse(&prep); + if (ret < 0) + goto error; } - error: + down_write(&key->sem); + + ret = key->type->update(key, &prep); + if (ret == 0) + /* updating a negative key instantiates it */ + clear_bit(KEY_FLAG_NEGATIVE, &key->flags); + + up_write(&key->sem); + + if (key->type->preparse) + key->type->free_preparse(&prep); +error: return ret; } EXPORT_SYMBOL(key_update); diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 3364fbf46807..505d40be196c 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -46,6 +46,9 @@ static int key_get_type_from_user(char *type, * Extract the description of a new key from userspace and either add it as a * new key to the specified keyring or update a matching key in that keyring. * + * If the description is NULL or an empty string, the key type is asked to + * generate one from the payload. + * * The keyring must be writable so that we can attach the key to it. * * If successful, the new key's serial number is returned, otherwise an error @@ -72,10 +75,17 @@ SYSCALL_DEFINE5(add_key, const char __user *, _type, if (ret < 0) goto error; - description = strndup_user(_description, PAGE_SIZE); - if (IS_ERR(description)) { - ret = PTR_ERR(description); - goto error; + description = NULL; + if (_description) { + description = strndup_user(_description, PAGE_SIZE); + if (IS_ERR(description)) { + ret = PTR_ERR(description); + goto error; + } + if (!*description) { + kfree(description); + description = NULL; + } } /* pull the payload in if one was supplied */ diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 81e7852d281d..f04d8cf81f3c 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -66,7 +66,7 @@ static inline unsigned keyring_hash(const char *desc) * operations. */ static int keyring_instantiate(struct key *keyring, - const void *data, size_t datalen); + struct key_preparsed_payload *prep); static int keyring_match(const struct key *keyring, const void *criterion); static void keyring_revoke(struct key *keyring); static void keyring_destroy(struct key *keyring); @@ -121,12 +121,12 @@ static void keyring_publish_name(struct key *keyring) * Returns 0 on success, -EINVAL if given any data. */ static int keyring_instantiate(struct key *keyring, - const void *data, size_t datalen) + struct key_preparsed_payload *prep) { int ret; ret = -EINVAL; - if (datalen == 0) { + if (prep->datalen == 0) { /* make the keyring available by name if it has one */ keyring_publish_name(keyring); ret = 0; diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 60d4e3f5e4bb..85730d5a5a59 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -19,7 +19,8 @@ #include #include "internal.h" -static int request_key_auth_instantiate(struct key *, const void *, size_t); +static int request_key_auth_instantiate(struct key *, + struct key_preparsed_payload *); static void request_key_auth_describe(const struct key *, struct seq_file *); static void request_key_auth_revoke(struct key *); static void request_key_auth_destroy(struct key *); @@ -42,10 +43,9 @@ struct key_type key_type_request_key_auth = { * Instantiate a request-key authorisation key. */ static int request_key_auth_instantiate(struct key *key, - const void *data, - size_t datalen) + struct key_preparsed_payload *prep) { - key->payload.data = (struct request_key_auth *) data; + key->payload.data = (struct request_key_auth *)prep->data; return 0; } diff --git a/security/keys/trusted.c b/security/keys/trusted.c index 2d5d041f2049..42036c7a0856 100644 --- a/security/keys/trusted.c +++ b/security/keys/trusted.c @@ -927,22 +927,23 @@ static struct trusted_key_payload *trusted_payload_alloc(struct key *key) * * On success, return 0. Otherwise return errno. */ -static int trusted_instantiate(struct key *key, const void *data, - size_t datalen) +static int trusted_instantiate(struct key *key, + struct key_preparsed_payload *prep) { struct trusted_key_payload *payload = NULL; struct trusted_key_options *options = NULL; + size_t datalen = prep->datalen; char *datablob; int ret = 0; int key_cmd; - if (datalen <= 0 || datalen > 32767 || !data) + if (datalen <= 0 || datalen > 32767 || !prep->data) return -EINVAL; datablob = kmalloc(datalen + 1, GFP_KERNEL); if (!datablob) return -ENOMEM; - memcpy(datablob, data, datalen); + memcpy(datablob, prep->data, datalen); datablob[datalen] = '\0'; options = trusted_options_alloc(); @@ -1011,17 +1012,18 @@ static void trusted_rcu_free(struct rcu_head *rcu) /* * trusted_update - reseal an existing key with new PCR values */ -static int trusted_update(struct key *key, const void *data, size_t datalen) +static int trusted_update(struct key *key, struct key_preparsed_payload *prep) { struct trusted_key_payload *p = key->payload.data; struct trusted_key_payload *new_p; struct trusted_key_options *new_o; + size_t datalen = prep->datalen; char *datablob; int ret = 0; if (!p->migratable) return -EPERM; - if (datalen <= 0 || datalen > 32767 || !data) + if (datalen <= 0 || datalen > 32767 || !prep->data) return -EINVAL; datablob = kmalloc(datalen + 1, GFP_KERNEL); @@ -1038,7 +1040,7 @@ static int trusted_update(struct key *key, const void *data, size_t datalen) goto out; } - memcpy(datablob, data, datalen); + memcpy(datablob, prep->data, datalen); datablob[datalen] = '\0'; ret = datablob_parse(datablob, new_p, new_o); if (ret != Opt_update) { diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c index c7660a25a3e4..55dc88939185 100644 --- a/security/keys/user_defined.c +++ b/security/keys/user_defined.c @@ -58,13 +58,14 @@ EXPORT_SYMBOL_GPL(key_type_logon); /* * instantiate a user defined key */ -int user_instantiate(struct key *key, const void *data, size_t datalen) +int user_instantiate(struct key *key, struct key_preparsed_payload *prep) { struct user_key_payload *upayload; + size_t datalen = prep->datalen; int ret; ret = -EINVAL; - if (datalen <= 0 || datalen > 32767 || !data) + if (datalen <= 0 || datalen > 32767 || !prep->data) goto error; ret = key_payload_reserve(key, datalen); @@ -78,7 +79,7 @@ int user_instantiate(struct key *key, const void *data, size_t datalen) /* attach the data */ upayload->datalen = datalen; - memcpy(upayload->data, data, datalen); + memcpy(upayload->data, prep->data, datalen); rcu_assign_keypointer(key, upayload); ret = 0; @@ -92,13 +93,14 @@ EXPORT_SYMBOL_GPL(user_instantiate); * update a user defined key * - the key's semaphore is write-locked */ -int user_update(struct key *key, const void *data, size_t datalen) +int user_update(struct key *key, struct key_preparsed_payload *prep) { struct user_key_payload *upayload, *zap; + size_t datalen = prep->datalen; int ret; ret = -EINVAL; - if (datalen <= 0 || datalen > 32767 || !data) + if (datalen <= 0 || datalen > 32767 || !prep->data) goto error; /* construct a replacement payload */ @@ -108,7 +110,7 @@ int user_update(struct key *key, const void *data, size_t datalen) goto error; upayload->datalen = datalen; - memcpy(upayload->data, data, datalen); + memcpy(upayload->data, prep->data, datalen); /* check the quota and attach the new data */ zap = upayload; -- cgit v1.2.3 From d851c12b60471188e15e5c8405b289073e8dd025 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Sun, 7 Oct 2012 22:47:25 +0000 Subject: ipv4: Always invalidate or update the route on pmtu events Some protocols, like IPsec still cache routes. So we need to invalidate the old route on pmtu events to avoid the reuse of stale routes. We also need to update the mtu and expire time of the route if we already use a nh exception route, otherwise we ignore newly learned pmtu values after the first expiration. With this patch we always invalidate or update the route on pmtu events. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv4/route.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ff622069fcef..90ba8358a892 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -904,22 +904,29 @@ out: kfree_skb(skb); return 0; } -static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) +static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { + struct dst_entry *dst = &rt->dst; struct fib_result res; if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; + if (!rt->rt_pmtu) { + dst->obsolete = DST_OBSOLETE_KILL; + } else { + rt->rt_pmtu = mtu; + dst->expires = max(1UL, jiffies + ip_rt_mtu_expires); + } + rcu_read_lock(); - if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) { + if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, 0, mtu, jiffies + ip_rt_mtu_expires); } rcu_read_unlock(); - return mtu; } static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, @@ -929,14 +936,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); - mtu = __ip_rt_update_pmtu(rt, &fl4, mtu); - - if (!rt->rt_pmtu) { - dst->obsolete = DST_OBSOLETE_KILL; - } else { - rt->rt_pmtu = mtu; - rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires); - } + __ip_rt_update_pmtu(rt, &fl4, mtu); } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, -- cgit v1.2.3 From 7f92d334ba19a0d8e96f8f8f092219553367d921 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Sun, 7 Oct 2012 22:48:18 +0000 Subject: ipv4: Don't create nh exeption when the device mtu is smaller than the reported pmtu When a local tool like tracepath tries to send packets bigger than the device mtu, we create a nh exeption and set the pmtu to device mtu. The device mtu does not expire, so check if the device mtu is smaller than the reported pmtu and don't crerate a nh exeption in that case. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 90ba8358a892..741df67a81ec 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -909,6 +909,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) struct dst_entry *dst = &rt->dst; struct fib_result res; + if (dst->dev->mtu < mtu) + return; + if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; -- cgit v1.2.3 From ee9a8f7ab2edf801b8b514c310455c94acc232f6 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 8 Oct 2012 00:56:54 +0000 Subject: ipv4: Don't report stale pmtu values to userspace We report cached pmtu values even if they are already expired. Change this to not report these values after they are expired and fix a race in the expire time calculation, as suggested by Eric Dumazet. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv4/route.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 741df67a81ec..132e0dfee53a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2187,8 +2187,18 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) goto nla_put_failure; + expires = rt->dst.expires; + if (expires) { + unsigned long now = jiffies; + + if (time_before(now, expires)) + expires -= now; + else + expires = 0; + } + memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); - if (rt->rt_pmtu) + if (rt->rt_pmtu && expires) metrics[RTAX_MTU - 1] = rt->rt_pmtu; if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; @@ -2198,13 +2208,6 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, goto nla_put_failure; error = rt->dst.error; - expires = rt->dst.expires; - if (expires) { - if (time_before(jiffies, expires)) - expires -= jiffies; - else - expires = 0; - } if (rt_is_input_route(rt)) { if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) -- cgit v1.2.3 From 2e71a6f8084e7ac87166dd77d99c44190fb844fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 6 Oct 2012 08:08:49 +0000 Subject: net: gro: selective flush of packets Current GRO can hold packets in gro_list for almost unlimited time, in case napi->poll() handler consumes its budget over and over. In this case, napi_complete()/napi_gro_flush() are not called. Another problem is that gro_list is flushed in non friendly way : We scan the list and complete packets in the reverse order. (youngest packets first, oldest packets last) This defeats priorities that sender could have cooked. Since GRO currently only store TCP packets, we dont really notice the bug because of retransmits, but this behavior can add unexpected latencies, particularly on mice flows clamped by elephant flows. This patch makes sure no packet can stay more than 1 ms in queue, and only in stress situations. It also complete packets in the right order to minimize latencies. Signed-off-by: Eric Dumazet Cc: Herbert Xu Cc: Jesse Gross Cc: Tom Herbert Cc: Yuchung Cheng Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/skge.c | 2 +- drivers/net/ethernet/realtek/8139cp.c | 2 +- include/linux/netdevice.h | 15 ++++++++------ net/core/dev.c | 38 ++++++++++++++++++++++++++++------- 4 files changed, 42 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c index 3f7dab46626b..9b9c2ac5c4c2 100644 --- a/drivers/net/ethernet/marvell/skge.c +++ b/drivers/net/ethernet/marvell/skge.c @@ -3189,7 +3189,7 @@ static int skge_poll(struct napi_struct *napi, int to_do) if (work_done < to_do) { unsigned long flags; - napi_gro_flush(napi); + napi_gro_flush(napi, false); spin_lock_irqsave(&hw->hw_lock, flags); __napi_complete(napi); hw->intr_mask |= napimask[skge->port]; diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index 995d0cfc4c06..1c818254b7be 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -563,7 +563,7 @@ rx_next: if (cpr16(IntrStatus) & cp_rx_intr_mask) goto rx_status_loop; - napi_gro_flush(napi); + napi_gro_flush(napi, false); spin_lock_irqsave(&cp->lock, flags); __napi_complete(napi); cpw16_f(IntrMask, cp_intr_mask); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a659fd0ba965..0a36fff75bd5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1497,19 +1497,22 @@ struct napi_gro_cb { /* This indicates where we are processing relative to skb->data. */ int data_offset; - /* This is non-zero if the packet may be of the same flow. */ - int same_flow; - /* This is non-zero if the packet cannot be merged with the new skb. */ int flush; /* Number of segments aggregated. */ - int count; + u16 count; + + /* This is non-zero if the packet may be of the same flow. */ + u8 same_flow; /* Free the skb? */ - int free; + u8 free; #define NAPI_GRO_FREE 1 #define NAPI_GRO_FREE_STOLEN_HEAD 2 + + /* jiffies when first packet was created/queued */ + unsigned long age; }; #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) @@ -2156,7 +2159,7 @@ extern gro_result_t dev_gro_receive(struct napi_struct *napi, extern gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb); extern gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); -extern void napi_gro_flush(struct napi_struct *napi); +extern void napi_gro_flush(struct napi_struct *napi, bool flush_old); extern struct sk_buff * napi_get_frags(struct napi_struct *napi); extern gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, diff --git a/net/core/dev.c b/net/core/dev.c index de2bad717d56..d44668f63c88 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3471,17 +3471,31 @@ out: return netif_receive_skb(skb); } -inline void napi_gro_flush(struct napi_struct *napi) +/* napi->gro_list contains packets ordered by age. + * youngest packets at the head of it. + * Complete skbs in reverse order to reduce latencies. + */ +void napi_gro_flush(struct napi_struct *napi, bool flush_old) { - struct sk_buff *skb, *next; + struct sk_buff *skb, *prev = NULL; - for (skb = napi->gro_list; skb; skb = next) { - next = skb->next; + /* scan list and build reverse chain */ + for (skb = napi->gro_list; skb != NULL; skb = skb->next) { + skb->prev = prev; + prev = skb; + } + + for (skb = prev; skb; skb = prev) { skb->next = NULL; + + if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) + return; + + prev = skb->prev; napi_gro_complete(skb); + napi->gro_count--; } - napi->gro_count = 0; napi->gro_list = NULL; } EXPORT_SYMBOL(napi_gro_flush); @@ -3542,6 +3556,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) napi->gro_count++; NAPI_GRO_CB(skb)->count = 1; + NAPI_GRO_CB(skb)->age = jiffies; skb_shinfo(skb)->gso_size = skb_gro_len(skb); skb->next = napi->gro_list; napi->gro_list = skb; @@ -3878,7 +3893,7 @@ void napi_complete(struct napi_struct *n) if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) return; - napi_gro_flush(n); + napi_gro_flush(n, false); local_irq_save(flags); __napi_complete(n); local_irq_restore(flags); @@ -3983,8 +3998,17 @@ static void net_rx_action(struct softirq_action *h) local_irq_enable(); napi_complete(n); local_irq_disable(); - } else + } else { + if (n->gro_list) { + /* flush too old packets + * If HZ < 1000, flush all packets. + */ + local_irq_enable(); + napi_gro_flush(n, HZ >= 1000); + local_irq_disable(); + } list_move_tail(&n->poll_list, &sd->poll_list); + } } netpoll_poll_unlock(have); -- cgit v1.2.3 From 55fabefe3695241e6ccfa0cd4974f3fa497693dc Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Fri, 5 Oct 2012 17:57:39 -0700 Subject: mac80211: call drv_get_tsf() in sleepable context The call to drv_get/set_tsf() was put on the workqueue to perform tsf adjustments since that function might sleep. However it ended up inside a spinlock, whose critical section must be atomic. Do tsf adjustment outside the spinlock instead, and get rid of a warning. Signed-off-by: Thomas Pedersen Signed-off-by: John W. Linville --- net/mac80211/mesh_sync.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c index accfa00ffcdf..a16b7b4b1e02 100644 --- a/net/mac80211/mesh_sync.c +++ b/net/mac80211/mesh_sync.c @@ -56,7 +56,6 @@ void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata) u64 tsfdelta; spin_lock_bh(&ifmsh->sync_offset_lock); - if (ifmsh->sync_offset_clockdrift_max < beacon_int_fraction) { msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting\n", (long long) ifmsh->sync_offset_clockdrift_max); @@ -69,11 +68,11 @@ void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata) tsfdelta = -beacon_int_fraction; ifmsh->sync_offset_clockdrift_max -= beacon_int_fraction; } + spin_unlock_bh(&ifmsh->sync_offset_lock); tsf = drv_get_tsf(local, sdata); if (tsf != -1ULL) drv_set_tsf(local, sdata, tsf + tsfdelta); - spin_unlock_bh(&ifmsh->sync_offset_lock); } static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, -- cgit v1.2.3 From c3e7724b6bc2f25e46c38dbe68f09d71fafeafb8 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 8 Oct 2012 14:39:33 +0200 Subject: mac80211: use ieee80211_free_txskb to fix possible skb leaks A few places free skbs using dev_kfree_skb even though they're called after ieee80211_subif_start_xmit might have cloned it for tracking tx status. Use ieee80211_free_txskb here to prevent skb leaks. Signed-off-by: Felix Fietkau Cc: stable@vger.kernel.org Signed-off-by: John W. Linville --- net/mac80211/status.c | 4 ++-- net/mac80211/tx.c | 22 ++++++++++++---------- 2 files changed, 14 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 2ce89732d0f2..3af0cc4130f1 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -34,7 +34,7 @@ void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw, skb_queue_len(&local->skb_queue_unreliable); while (tmp > IEEE80211_IRQSAFE_QUEUE_LIMIT && (skb = skb_dequeue(&local->skb_queue_unreliable))) { - dev_kfree_skb_irq(skb); + ieee80211_free_txskb(hw, skb); tmp--; I802_DEBUG_INC(local->tx_status_drop); } @@ -159,7 +159,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, "dropped TX filtered frame, queue_len=%d PS=%d @%lu\n", skb_queue_len(&sta->tx_filtered[ac]), !!test_sta_flag(sta, WLAN_STA_PS_STA), jiffies); - dev_kfree_skb(skb); + ieee80211_free_txskb(&local->hw, skb); } static void ieee80211_check_pending_bar(struct sta_info *sta, u8 *addr, u8 tid) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index e0e0d1d0e830..c9bf83f36657 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -354,7 +354,7 @@ static void purge_old_ps_buffers(struct ieee80211_local *local) total += skb_queue_len(&sta->ps_tx_buf[ac]); if (skb) { purged++; - dev_kfree_skb(skb); + ieee80211_free_txskb(&local->hw, skb); break; } } @@ -466,7 +466,7 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx) ps_dbg(tx->sdata, "STA %pM TX buffer for AC %d full - dropping oldest frame\n", sta->sta.addr, ac); - dev_kfree_skb(old); + ieee80211_free_txskb(&local->hw, old); } else tx->local->total_ps_buffered++; @@ -1103,7 +1103,7 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx, spin_unlock(&tx->sta->lock); if (purge_skb) - dev_kfree_skb(purge_skb); + ieee80211_free_txskb(&tx->local->hw, purge_skb); } /* reset session timer */ @@ -1214,7 +1214,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, #ifdef CONFIG_MAC80211_VERBOSE_DEBUG if (WARN_ON_ONCE(q >= local->hw.queues)) { __skb_unlink(skb, skbs); - dev_kfree_skb(skb); + ieee80211_free_txskb(&local->hw, skb); continue; } #endif @@ -1356,7 +1356,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx) if (unlikely(res == TX_DROP)) { I802_DEBUG_INC(tx->local->tx_handlers_drop); if (tx->skb) - dev_kfree_skb(tx->skb); + ieee80211_free_txskb(&tx->local->hw, tx->skb); else __skb_queue_purge(&tx->skbs); return -1; @@ -1393,7 +1393,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, res_prepare = ieee80211_tx_prepare(sdata, &tx, skb); if (unlikely(res_prepare == TX_DROP)) { - dev_kfree_skb(skb); + ieee80211_free_txskb(&local->hw, skb); goto out; } else if (unlikely(res_prepare == TX_QUEUED)) { goto out; @@ -1465,7 +1465,7 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) headroom = max_t(int, 0, headroom); if (ieee80211_skb_resize(sdata, skb, headroom, may_encrypt)) { - dev_kfree_skb(skb); + ieee80211_free_txskb(&local->hw, skb); rcu_read_unlock(); return; } @@ -2050,8 +2050,10 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, head_need += IEEE80211_ENCRYPT_HEADROOM; head_need += local->tx_headroom; head_need = max_t(int, 0, head_need); - if (ieee80211_skb_resize(sdata, skb, head_need, true)) - goto fail; + if (ieee80211_skb_resize(sdata, skb, head_need, true)) { + ieee80211_free_txskb(&local->hw, skb); + return NETDEV_TX_OK; + } } if (encaps_data) { @@ -2184,7 +2186,7 @@ void ieee80211_tx_pending(unsigned long data) struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); if (WARN_ON(!info->control.vif)) { - kfree_skb(skb); + ieee80211_free_txskb(&local->hw, skb); continue; } -- cgit v1.2.3 From 48cc32d38a52d0b68f91a171a8d00531edc6a46e Mon Sep 17 00:00:00 2001 From: Florian Zumbiehl Date: Sun, 7 Oct 2012 15:51:58 +0000 Subject: vlan: don't deliver frames for unknown vlans to protocols 6a32e4f9dd9219261f8856f817e6655114cfec2f made the vlan code skip marking vlan-tagged frames for not locally configured vlans as PACKET_OTHERHOST if there was an rx_handler, as the rx_handler could cause the frame to be received on a different (virtual) vlan-capable interface where that vlan might be configured. As rx_handlers do not necessarily return RX_HANDLER_ANOTHER, this could cause frames for unknown vlans to be delivered to the protocol stack as if they had been received untagged. For example, if an ipv6 router advertisement that's tagged for a locally not configured vlan is received on an interface with macvlan interfaces attached, macvlan's rx_handler returns RX_HANDLER_PASS after delivering the frame to the macvlan interfaces, which caused it to be passed to the protocol stack, leading to ipv6 addresses for the announced prefix being configured even though those are completely unusable on the underlying interface. The fix moves marking as PACKET_OTHERHOST after the rx_handler so the rx_handler, if there is one, sees the frame unchanged, but afterwards, before the frame is delivered to the protocol stack, it gets marked whether there is an rx_handler or not. Signed-off-by: Florian Zumbiehl Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 8 ++++---- net/8021q/vlan_core.c | 10 ++-------- net/core/dev.c | 7 +++++-- 3 files changed, 11 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index e6ff12dd717b..c0ff748d0aa5 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -80,6 +80,8 @@ static inline int is_vlan_dev(struct net_device *dev) } #define vlan_tx_tag_present(__skb) ((__skb)->vlan_tci & VLAN_TAG_PRESENT) +#define vlan_tx_nonzero_tag_present(__skb) \ + (vlan_tx_tag_present(__skb) && ((__skb)->vlan_tci & VLAN_VID_MASK)) #define vlan_tx_tag_get(__skb) ((__skb)->vlan_tci & ~VLAN_TAG_PRESENT) #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) @@ -89,7 +91,7 @@ extern struct net_device *__vlan_find_dev_deep(struct net_device *real_dev, extern struct net_device *vlan_dev_real_dev(const struct net_device *dev); extern u16 vlan_dev_vlan_id(const struct net_device *dev); -extern bool vlan_do_receive(struct sk_buff **skb, bool last_handler); +extern bool vlan_do_receive(struct sk_buff **skb); extern struct sk_buff *vlan_untag(struct sk_buff *skb); extern int vlan_vid_add(struct net_device *dev, unsigned short vid); @@ -120,10 +122,8 @@ static inline u16 vlan_dev_vlan_id(const struct net_device *dev) return 0; } -static inline bool vlan_do_receive(struct sk_buff **skb, bool last_handler) +static inline bool vlan_do_receive(struct sk_buff **skb) { - if (((*skb)->vlan_tci & VLAN_VID_MASK) && last_handler) - (*skb)->pkt_type = PACKET_OTHERHOST; return false; } diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index add69d0fd99d..fbbf1fa00940 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -5,7 +5,7 @@ #include #include "vlan.h" -bool vlan_do_receive(struct sk_buff **skbp, bool last_handler) +bool vlan_do_receive(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK; @@ -13,14 +13,8 @@ bool vlan_do_receive(struct sk_buff **skbp, bool last_handler) struct vlan_pcpu_stats *rx_stats; vlan_dev = vlan_find_dev(skb->dev, vlan_id); - if (!vlan_dev) { - /* Only the last call to vlan_do_receive() should change - * pkt_type to PACKET_OTHERHOST - */ - if (vlan_id && last_handler) - skb->pkt_type = PACKET_OTHERHOST; + if (!vlan_dev) return false; - } skb = *skbp = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) diff --git a/net/core/dev.c b/net/core/dev.c index d44668f63c88..09cb3f6dc40c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3300,18 +3300,18 @@ ncls: && !skb_pfmemalloc_protocol(skb)) goto drop; - rx_handler = rcu_dereference(skb->dev->rx_handler); if (vlan_tx_tag_present(skb)) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } - if (vlan_do_receive(&skb, !rx_handler)) + if (vlan_do_receive(&skb)) goto another_round; else if (unlikely(!skb)) goto unlock; } + rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); @@ -3331,6 +3331,9 @@ ncls: } } + if (vlan_tx_nonzero_tag_present(skb)) + skb->pkt_type = PACKET_OTHERHOST; + /* deliver only exact match when indicated */ null_or_dev = deliver_exact ? skb->dev : NULL; -- cgit v1.2.3 From 863472454ce50d4ef0929c6aa738cc5d64b84679 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Oct 2012 21:38:50 +0200 Subject: ipv6: gro: fix PV6_GRO_CB(skb)->proto problem It seems IPV6_GRO_CB(skb)->proto can be destroyed in skb_gro_receive() if a new skb is allocated (to serve as an anchor for frag_list) We copy NAPI_GRO_CB() only (not the IPV6 specific part) in : *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); So we leave IPV6_GRO_CB(nskb)->proto to 0 (fresh skb allocation) instead of IPPROTO_TCP (6) ipv6_gro_complete() isnt able to call ops->gro_complete() [ tcp6_gro_complete() ] Fix this by moving proto in NAPI_GRO_CB() and getting rid of IPV6_GRO_CB Signed-off-by: Eric Dumazet Cc: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/ipv6/af_inet6.c | 11 ++--------- 2 files changed, 5 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0a36fff75bd5..561c8bc8976d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1513,6 +1513,9 @@ struct napi_gro_cb { /* jiffies when first packet was created/queued */ unsigned long age; + + /* Used in ipv6_gro_receive() */ + int proto; }; #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index f757e3b7cfbf..a974247a9ae4 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -822,13 +822,6 @@ out: return segs; } -struct ipv6_gro_cb { - struct napi_gro_cb napi; - int proto; -}; - -#define IPV6_GRO_CB(skb) ((struct ipv6_gro_cb *)(skb)->cb) - static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -874,7 +867,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, iph = ipv6_hdr(skb); } - IPV6_GRO_CB(skb)->proto = proto; + NAPI_GRO_CB(skb)->proto = proto; flush--; nlen = skb_network_header_len(skb); @@ -930,7 +923,7 @@ static int ipv6_gro_complete(struct sk_buff *skb) sizeof(*iph)); rcu_read_lock(); - ops = rcu_dereference(inet6_protos[IPV6_GRO_CB(skb)->proto]); + ops = rcu_dereference(inet6_protos[NAPI_GRO_CB(skb)->proto]); if (WARN_ON(!ops || !ops->gro_complete)) goto out_unlock; -- cgit v1.2.3 From e81da0e113a1b7fc7449ae6213f65f89ccac6d06 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 8 Oct 2012 11:41:15 +0000 Subject: ipv4: fix sending of redirects After "Cache input routes in fib_info nexthops" (commit d2d68ba9fe) and "Elide fib_validate_source() completely when possible" (commit 7a9bc9b81a) we can not send ICMP redirects. It seems we should not cache the RTCF_DOREDIRECT flag in nh_rth_input because the same fib_info can be used for traffic that is not redirected, eg. from other input devices or from sources that are not in same subnet. As result, we have to disable the caching of RTCF_DOREDIRECT flag and to force source validation for the case when forwarding traffic to the input device. If traffic comes from directly connected source we allow redirection as it was done before both changes. Avoid setting RTCF_DOREDIRECT if IN_DEV_TX_REDIRECTS is disabled, this can avoid source address validation and to help caching the routes. After the change "Adjust semantics of rt->rt_gateway" (commit f8126f1d51) we should make sure our ICMP_REDIR_HOST messages contain daddr instead of 0.0.0.0 when target is directly connected. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 3 ++- net/ipv4/route.c | 30 ++++++++++++++++-------------- 2 files changed, 18 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 68c93d1bb03a..825c608826de 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -322,7 +322,8 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); - if (!r && !fib_num_tclassid_users(dev_net(dev))) { + if (!r && !fib_num_tclassid_users(dev_net(dev)) && + (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { *itag = 0; return 0; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 132e0dfee53a..b90da1bc2704 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -802,7 +802,8 @@ void ip_rt_send_redirect(struct sk_buff *skb) net = dev_net(rt->dst.dev); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); if (!peer) { - icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, + rt_nexthop(rt, ip_hdr(skb)->daddr)); return; } @@ -827,7 +828,9 @@ void ip_rt_send_redirect(struct sk_buff *skb) time_after(jiffies, (peer->rate_last + (ip_rt_redirect_load << peer->rate_tokens)))) { - icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); + __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); + + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; ++peer->rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE @@ -835,7 +838,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) peer->rate_tokens == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", &ip_hdr(skb)->saddr, inet_iif(skb), - &ip_hdr(skb)->daddr, &rt->rt_gateway); + &ip_hdr(skb)->daddr, &gw); #endif } out_put_peer: @@ -1442,10 +1445,13 @@ static int __mkroute_input(struct sk_buff *skb, goto cleanup; } - if (out_dev == in_dev && err && + do_cache = res->fi && !itag; + if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && (IN_DEV_SHARED_MEDIA(out_dev) || - inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) + inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) { flags |= RTCF_DOREDIRECT; + do_cache = false; + } if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is @@ -1462,15 +1468,11 @@ static int __mkroute_input(struct sk_buff *skb, } } - do_cache = false; - if (res->fi) { - if (!itag) { - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); - if (rt_cache_valid(rth)) { - skb_dst_set_noref(skb, &rth->dst); - goto out; - } - do_cache = true; + if (do_cache) { + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + if (rt_cache_valid(rth)) { + skb_dst_set_noref(skb, &rth->dst); + goto out; } } -- cgit v1.2.3 From e0adef0f7456d5d3a3bfe8ea61c7dddf146b40e1 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 8 Oct 2012 11:41:16 +0000 Subject: ipv4: fix forwarding for strict source routes After the change "Adjust semantics of rt->rt_gateway" (commit f8126f1d51) rt_gateway can be 0 but ip_forward() compares it directly with nexthop. What we want here is to check if traffic is to directly connected nexthop and to fail if using gateway. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/ip_forward.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index ab09b126423c..7f35ac26a71a 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -85,7 +85,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && opt->nexthop != rt->rt_gateway) + if (opt->is_strictroute && rt->rt_gateway) goto sr_failed; if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && -- cgit v1.2.3 From f8a17175c63fd3e8b573719f7538816f8c96abf4 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 8 Oct 2012 11:41:17 +0000 Subject: ipv4: make sure nh_pcpu_rth_output is always allocated Avoid checking nh_pcpu_rth_output in fast path, abort fib_info creation on alloc_percpu failure. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 2 ++ net/ipv4/route.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 267753060ffc..71b125cd5db1 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -840,6 +840,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) change_nexthops(fi) { nexthop_nh->nh_parent = fi; nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *); + if (!nexthop_nh->nh_pcpu_rth_output) + goto failure; } endfor_nexthops(fi) if (cfg->fc_mx) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b90da1bc2704..5b0180f11b20 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1207,8 +1207,6 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) if (rt_is_input_route(rt)) { p = (struct rtable **)&nh->nh_rth_input; } else { - if (!nh->nh_pcpu_rth_output) - goto nocache; p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output); } orig = *p; @@ -1223,7 +1221,6 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) * unsuccessful at storing this route into the cache * we really need to set it. */ -nocache: rt->dst.flags |= DST_NOCACHE; ret = false; } -- cgit v1.2.3 From 155e8336c373d14d87a7f91e356d85ef4b93b8f9 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 8 Oct 2012 11:41:18 +0000 Subject: ipv4: introduce rt_uses_gateway Add new flag to remember when route is via gateway. We will use it to allow rt_gateway to contain address of directly connected host for the cases when DST_NOCACHE is used or when the NH exception caches per-destination route without DST_NOCACHE flag, i.e. when routes are not used for other destinations. By this way we force the neighbour resolving to work with the routed destination but we can use different address in the packet, feature needed for IPVS-DR where original packet for virtual IP is routed via route to real IP. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- include/net/route.h | 3 ++- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_output.c | 4 ++-- net/ipv4/route.c | 48 ++++++++++++++++++++++------------------- net/ipv4/xfrm4_policy.c | 1 + 6 files changed, 34 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/net/route.h b/include/net/route.h index da22243d2760..bc40b633a5c4 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -48,7 +48,8 @@ struct rtable { int rt_genid; unsigned int rt_flags; __u16 rt_type; - __u16 rt_is_input; + __u8 rt_is_input; + __u8 rt_uses_gateway; int rt_iif; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f0c5b9c1a957..d34ce2972c8f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -406,7 +406,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; return &rt->dst; @@ -442,7 +442,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; rcu_read_unlock(); return &rt->dst; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 7f35ac26a71a..694de3b7aebf 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -85,7 +85,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && rt->rt_gateway) + if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 24a29a39e9a8..6537a408a4fb 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -193,7 +193,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) } rcu_read_lock_bh(); - nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr; + nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); neigh = __ipv4_neigh_lookup_noref(dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); @@ -371,7 +371,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gateway) + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5b0180f11b20..3a116cb0991a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1126,7 +1126,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = dst->dev->mtu; if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { - if (rt->rt_gateway && mtu > 576) + if (rt->rt_uses_gateway && mtu > 576) mtu = 576; } @@ -1177,7 +1177,9 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; rt->rt_gateway = fnhe->fnhe_gw; - } + rt->rt_uses_gateway = 1; + } else if (!rt->rt_gateway) + rt->rt_gateway = daddr; orig = rcu_dereference(fnhe->fnhe_rth); rcu_assign_pointer(fnhe->fnhe_rth, rt); @@ -1186,13 +1188,6 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, fnhe->fnhe_stamp = jiffies; ret = true; - } else { - /* Routes we intend to cache in nexthop exception have - * the DST_NOCACHE bit clear. However, if we are - * unsuccessful at storing this route into the cache - * we really need to set it. - */ - rt->dst.flags |= DST_NOCACHE; } spin_unlock_bh(&fnhe_lock); @@ -1215,15 +1210,8 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) if (prev == orig) { if (orig) rt_free(orig); - } else { - /* Routes we intend to cache in the FIB nexthop have - * the DST_NOCACHE bit clear. However, if we are - * unsuccessful at storing this route into the cache - * we really need to set it. - */ - rt->dst.flags |= DST_NOCACHE; + } else ret = false; - } return ret; } @@ -1284,8 +1272,10 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, if (fi) { struct fib_nh *nh = &FIB_RES_NH(*res); - if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) + if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) { rt->rt_gateway = nh->nh_gw; + rt->rt_uses_gateway = 1; + } dst_init_metrics(&rt->dst, fi->fib_metrics, true); #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; @@ -1294,8 +1284,18 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, cached = rt_bind_exception(rt, fnhe, daddr); else if (!(rt->dst.flags & DST_NOCACHE)) cached = rt_cache_route(nh, rt); - } - if (unlikely(!cached)) + if (unlikely(!cached)) { + /* Routes we intend to cache in nexthop exception or + * FIB nexthop have the DST_NOCACHE bit clear. + * However, if we are unsuccessful at storing this + * route into the cache we really need to set it. + */ + rt->dst.flags |= DST_NOCACHE; + if (!rt->rt_gateway) + rt->rt_gateway = daddr; + rt_add_uncached_list(rt); + } + } else rt_add_uncached_list(rt); #ifdef CONFIG_IP_ROUTE_CLASSID @@ -1363,6 +1363,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); if (our) { rth->dst.input= ip_local_deliver; @@ -1432,7 +1433,6 @@ static int __mkroute_input(struct sk_buff *skb, return -EINVAL; } - err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); if (err < 0) { @@ -1488,6 +1488,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); rth->dst.input = ip_forward; @@ -1658,6 +1659,7 @@ local_input: rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; @@ -1826,6 +1828,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->rt_iif = orig_oif ? : 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); RT_CACHE_STAT_INC(out_slow_tot); @@ -2104,6 +2107,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_gateway = ort->rt_gateway; + rt->rt_uses_gateway = ort->rt_uses_gateway; INIT_LIST_HEAD(&rt->rt_uncached); @@ -2182,7 +2186,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } - if (rt->rt_gateway && + if (rt->rt_uses_gateway && nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) goto nla_put_failure; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 681ea2f413e2..05c5ab8d983c 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -91,6 +91,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; xdst->u.rt.rt_gateway = rt->rt_gateway; + xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_pmtu = rt->rt_pmtu; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); -- cgit v1.2.3 From c92b96553a80c1dbe2ebe128bbe37c8f98f148bf Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 8 Oct 2012 11:41:19 +0000 Subject: ipv4: Add FLOWI_FLAG_KNOWN_NH Add flag to request that output route should be returned with known rt_gateway, in case we want to use it as nexthop for neighbour resolving. The returned route can be cached as follows: - in NH exception: because the cached routes are not shared with other destinations - in FIB NH: when using gateway because all destinations for NH share same gateway As last option, to return rt_gateway!=0 we have to set DST_NOCACHE. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- include/net/flow.h | 1 + net/ipv4/route.c | 21 +++++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/flow.h b/include/net/flow.h index e1dd5082ec7e..628e11b98c58 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -21,6 +21,7 @@ struct flowi_common { __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_CAN_SLEEP 0x02 +#define FLOWI_FLAG_KNOWN_NH 0x04 __u32 flowic_secid; }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3a116cb0991a..1a0da8dc8180 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1762,6 +1762,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, struct in_device *in_dev; u16 type = res->type; struct rtable *rth; + bool do_cache; in_dev = __in_dev_get_rcu(dev_out); if (!in_dev) @@ -1798,24 +1799,36 @@ static struct rtable *__mkroute_output(const struct fib_result *res, } fnhe = NULL; + do_cache = fi != NULL; if (fi) { struct rtable __rcu **prth; + struct fib_nh *nh = &FIB_RES_NH(*res); - fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); + fnhe = find_exception(nh, fl4->daddr); if (fnhe) prth = &fnhe->fnhe_rth; - else - prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output); + else { + if (unlikely(fl4->flowi4_flags & + FLOWI_FLAG_KNOWN_NH && + !(nh->nh_gw && + nh->nh_scope == RT_SCOPE_LINK))) { + do_cache = false; + goto add; + } + prth = __this_cpu_ptr(nh->nh_pcpu_rth_output); + } rth = rcu_dereference(*prth); if (rt_cache_valid(rth)) { dst_hold(&rth->dst); return rth; } } + +add: rth = rt_dst_alloc(dev_out, IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(in_dev, NOXFRM), - fi); + do_cache); if (!rth) return ERR_PTR(-ENOBUFS); -- cgit v1.2.3 From ad4d3ef8b7eb527cca478dc08c02c10936e64115 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 8 Oct 2012 11:41:20 +0000 Subject: ipvs: fix ARP resolving for direct routing mode After the change "Make neigh lookups directly in output packet path" (commit a263b30936) IPVS can not reach the real server for DR mode because we resolve the destination address from IP header, not from route neighbour. Use the new FLOWI_FLAG_KNOWN_NH flag to request output routes with known nexthop, so that it has preference on resolving. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_xmit.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 56f6d5d81a77..cc4c8095681a 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -50,6 +50,7 @@ enum { * local */ IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */ + IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */ }; /* @@ -113,6 +114,8 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr, fl4.daddr = daddr; fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0; fl4.flowi4_tos = rtos; + fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? + FLOWI_FLAG_KNOWN_NH : 0; retry: rt = ip_route_output_key(net, &fl4); @@ -1061,7 +1064,8 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, RT_TOS(iph->tos), IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL, NULL))) + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_KNOWN_NH, NULL))) goto tx_error_icmp; if (rt->rt_flags & RTCF_LOCAL) { ip_rt_put(rt); -- cgit v1.2.3 From 4c199a93a2d36b277a9fd209a0f2793f8460a215 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:32 -0700 Subject: rbtree: empty nodes have no color Empty nodes have no color. We can make use of this property to simplify the code emitted by the RB_EMPTY_NODE and RB_CLEAR_NODE macros. Also, we can get rid of the rb_init_node function which had been introduced by commit 88d19cf37952 ("timers: Add rb_init_node() to allow for stack allocated rb nodes") to avoid some issue with the empty node's color not being initialized. I'm not sure what the RB_EMPTY_NODE checks in rb_prev() / rb_next() are doing there, though. axboe introduced them in commit 10fd48f2376d ("rbtree: fixed reversed RB_EMPTY_NODE and rb_next/prev"). The way I see it, the 'empty node' abstraction is only used by rbtree users to flag nodes that they haven't inserted in any rbtree, so asking the predecessor or successor of such nodes doesn't make any sense. One final rb_init_node() caller was recently added in sysctl code to implement faster sysctl name lookups. This code doesn't make use of RB_EMPTY_NODE at all, and from what I could see it only called rb_init_node() under the mistaken assumption that such initialization was required before node insertion. [sfr@canb.auug.org.au: fix net/ceph/osd_client.c build] Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Cc: John Stultz Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 4 +--- include/linux/rbtree.h | 15 +++++---------- include/linux/timerqueue.h | 2 +- lib/rbtree.c | 4 ++-- net/ceph/osd_client.c | 1 - 5 files changed, 9 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index dcd56f84db7e..fddc50729632 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -168,10 +168,8 @@ static void init_header(struct ctl_table_header *head, head->node = node; if (node) { struct ctl_table *entry; - for (entry = table; entry->procname; entry++, node++) { - rb_init_node(&node->node); + for (entry = table; entry->procname; entry++, node++) node->header = head; - } } } diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index e6a807720ded..2049087c43b7 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -67,17 +67,12 @@ static inline void rb_set_color(struct rb_node *rb, int color) #define RB_ROOT (struct rb_root) { NULL, } #define rb_entry(ptr, type, member) container_of(ptr, type, member) -#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) -#define RB_EMPTY_NODE(node) (rb_parent(node) == node) -#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) +#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) + +/* 'empty' nodes are nodes that are known not to be inserted in an rbree */ +#define RB_EMPTY_NODE(node) ((node)->rb_parent_color == (unsigned long)(node)) +#define RB_CLEAR_NODE(node) ((node)->rb_parent_color = (unsigned long)(node)) -static inline void rb_init_node(struct rb_node *rb) -{ - rb->rb_parent_color = 0; - rb->rb_right = NULL; - rb->rb_left = NULL; - RB_CLEAR_NODE(rb); -} extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index 5088727478fd..a520fd70a59f 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -39,7 +39,7 @@ struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) static inline void timerqueue_init(struct timerqueue_node *node) { - rb_init_node(&node->node); + RB_CLEAR_NODE(&node->node); } static inline void timerqueue_init_head(struct timerqueue_head *head) diff --git a/lib/rbtree.c b/lib/rbtree.c index d4175565dc2c..fe43c8c5f527 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -387,7 +387,7 @@ struct rb_node *rb_next(const struct rb_node *node) { struct rb_node *parent; - if (rb_parent(node) == node) + if (RB_EMPTY_NODE(node)) return NULL; /* If we have a right-hand child, go down and then left as far @@ -416,7 +416,7 @@ struct rb_node *rb_prev(const struct rb_node *node) { struct rb_node *parent; - if (rb_parent(node) == node) + if (RB_EMPTY_NODE(node)) return NULL; /* If we have a left-hand child, go down and then right as far diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ccbdfbba9e53..c1d756cc7448 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -221,7 +221,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, kref_init(&req->r_kref); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); - rb_init_node(&req->r_node); INIT_LIST_HEAD(&req->r_unsafe_item); INIT_LIST_HEAD(&req->r_linger_item); INIT_LIST_HEAD(&req->r_linger_osd); -- cgit v1.2.3 From 5175a5e76bbdf20a614fb47ce7a38f0f39e70226 Mon Sep 17 00:00:00 2001 From: "jeff.liu" Date: Mon, 8 Oct 2012 18:57:27 +0000 Subject: RDS: fix rds-ping spinlock recursion This is the revised patch for fixing rds-ping spinlock recursion according to Venkat's suggestions. RDS ping/pong over TCP feature has been broken for years(2.6.39 to 3.6.0) since we have to set TCP cork and call kernel_sendmsg() between ping/pong which both need to lock "struct sock *sk". However, this lock has already been hold before rds_tcp_data_ready() callback is triggerred. As a result, we always facing spinlock resursion which would resulting in system panic. Given that RDS ping is only used to test the connectivity and not for serious performance measurements, we can queue the pong transmit to rds_wq as a delayed response. Reported-by: Dan Carpenter CC: Venkat Venkatsubra CC: David S. Miller CC: James Morris Signed-off-by: Jie Liu Signed-off-by: David S. Miller --- net/rds/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 96531d4033a2..88eace57dd6b 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1122,7 +1122,7 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) rds_stats_inc(s_send_pong); if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - rds_send_xmit(conn); + queue_delayed_work(rds_wq, &conn->c_send_w, 0); rds_message_put(rm); return 0; -- cgit v1.2.3 From 5aa8b572007c4bca1e6d3dd4c4820f1ae49d6bb2 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 9 Oct 2012 17:48:16 +0000 Subject: pktgen: fix crash when generating IPv6 packets For IPv6, sizeof(struct ipv6hdr) = 40, thus the following expression will result negative: datalen = pkt_dev->cur_pkt_size - 14 - sizeof(struct ipv6hdr) - sizeof(struct udphdr) - pkt_dev->pkt_overhead; And, the check "if (datalen < sizeof(struct pktgen_hdr))" will be passed as "datalen" is promoted to unsigned, therefore will cause a crash later. This is a quick fix by checking if "datalen" is negative. The following patch will increase the default value of 'min_pkt_size' for IPv6. This bug should exist for a long time, so Cc -stable too. Cc: Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 148e73d2c451..e356b8d52bad 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2927,7 +2927,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, sizeof(struct ipv6hdr) - sizeof(struct udphdr) - pkt_dev->pkt_overhead; - if (datalen < sizeof(struct pktgen_hdr)) { + if (datalen < 0 || datalen < sizeof(struct pktgen_hdr)) { datalen = sizeof(struct pktgen_hdr); net_info_ratelimited("increased datalen to %d\n", datalen); } -- cgit v1.2.3 From 68bf9f0b91ed4440951312cf7d4ffa70b9e8cf73 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 9 Oct 2012 17:48:17 +0000 Subject: pktgen: set different default min_pkt_size for different protocols ETH_ZLEN is too small for IPv6, so this default value is not suitable. Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/pktgen.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index e356b8d52bad..98ee54963553 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -248,8 +248,8 @@ struct pktgen_dev { int removal_mark; /* non-zero => the device is marked for * removal by worker thread */ - int min_pkt_size; /* = ETH_ZLEN; */ - int max_pkt_size; /* = ETH_ZLEN; */ + int min_pkt_size; + int max_pkt_size; int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ int nfrags; struct page *page; @@ -2036,10 +2036,14 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) /* Set up Dest MAC */ memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN); - /* Set up pkt size */ - pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size; - if (pkt_dev->flags & F_IPV6) { + if (pkt_dev->min_pkt_size == 0) { + pkt_dev->min_pkt_size = 14 + sizeof(struct ipv6hdr) + + sizeof(struct udphdr) + + sizeof(struct pktgen_hdr) + + pkt_dev->pkt_overhead; + } + /* * Skip this automatic address setting until locks or functions * gets exported @@ -2086,6 +2090,13 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) } #endif } else { + if (pkt_dev->min_pkt_size == 0) { + pkt_dev->min_pkt_size = 14 + sizeof(struct iphdr) + + sizeof(struct udphdr) + + sizeof(struct pktgen_hdr) + + pkt_dev->pkt_overhead; + } + pkt_dev->saddr_min = 0; pkt_dev->saddr_max = 0; if (strlen(pkt_dev->src_min) == 0) { @@ -2111,6 +2122,10 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); } /* Initialize current values. */ + pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size; + if (pkt_dev->min_pkt_size > pkt_dev->max_pkt_size) + pkt_dev->max_pkt_size = pkt_dev->min_pkt_size; + pkt_dev->cur_dst_mac_offset = 0; pkt_dev->cur_src_mac_offset = 0; pkt_dev->cur_saddr = pkt_dev->saddr_min; @@ -3548,8 +3563,6 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) } pkt_dev->removal_mark = 0; - pkt_dev->min_pkt_size = ETH_ZLEN; - pkt_dev->max_pkt_size = ETH_ZLEN; pkt_dev->nfrags = 0; pkt_dev->delay = pg_delay_d; pkt_dev->count = pg_count_d; -- cgit v1.2.3 From 0373a94671be1f5c823dbfc03617418d8effd5ce Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 9 Oct 2012 17:48:18 +0000 Subject: pktgen: display IPv4 address in human-readable format It is weird to display IPv4 address in %x format, what's more, IPv6 address is disaplayed in human-readable format too. So, make it human-readable. Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/pktgen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 98ee54963553..f9b4637e9f24 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -702,8 +702,8 @@ static int pktgen_if_show(struct seq_file *seq, void *v) &pkt_dev->cur_in6_saddr, &pkt_dev->cur_in6_daddr); } else - seq_printf(seq, " cur_saddr: 0x%x cur_daddr: 0x%x\n", - pkt_dev->cur_saddr, pkt_dev->cur_daddr); + seq_printf(seq, " cur_saddr: %pI4 cur_daddr: %pI4\n", + &pkt_dev->cur_saddr, &pkt_dev->cur_daddr); seq_printf(seq, " cur_udp_dst: %d cur_udp_src: %d\n", pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src); -- cgit v1.2.3 From 4c139b8ccebaecdfad58eb068d61ef386f1a58ed Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 9 Oct 2012 17:48:19 +0000 Subject: pktgen: enable automatic IPv6 address setting Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/pktgen.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index f9b4637e9f24..47fe18e6a8e4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2037,6 +2037,9 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN); if (pkt_dev->flags & F_IPV6) { + int i, set = 0, err = 1; + struct inet6_dev *idev; + if (pkt_dev->min_pkt_size == 0) { pkt_dev->min_pkt_size = 14 + sizeof(struct ipv6hdr) + sizeof(struct udphdr) @@ -2044,15 +2047,6 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) + pkt_dev->pkt_overhead; } - /* - * Skip this automatic address setting until locks or functions - * gets exported - */ - -#ifdef NOTNOW - int i, set = 0, err = 1; - struct inet6_dev *idev; - for (i = 0; i < IN6_ADDR_HSIZE; i++) if (pkt_dev->cur_in6_saddr.s6_addr[i]) { set = 1; @@ -2073,9 +2067,8 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) struct inet6_ifaddr *ifp; read_lock_bh(&idev->lock); - for (ifp = idev->addr_list; ifp; - ifp = ifp->if_next) { - if (ifp->scope == IFA_LINK && + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if ((ifp->scope & IFA_LINK) && !(ifp->flags & IFA_F_TENTATIVE)) { pkt_dev->cur_in6_saddr = ifp->addr; err = 0; @@ -2088,7 +2081,6 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) if (err) pr_err("ERROR: IPv6 link address not available\n"); } -#endif } else { if (pkt_dev->min_pkt_size == 0) { pkt_dev->min_pkt_size = 14 + sizeof(struct iphdr) -- cgit v1.2.3 From c468fb1375f1b4de851e3e0dbe9d1293d414a160 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 9 Oct 2012 17:48:20 +0000 Subject: pktgen: replace scan_ip6() with in6_pton() Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/pktgen.c | 101 +++--------------------------------------------------- 1 file changed, 4 insertions(+), 97 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 47fe18e6a8e4..d1dc14c2aac4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -449,8 +449,6 @@ static void pktgen_stop_all_threads_ifs(void); static void pktgen_stop(struct pktgen_thread *t); static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); -static unsigned int scan_ip6(const char *s, char ip[16]); - /* Module parameters, defaults. */ static int pg_count_d __read_mostly = 1000; static int pg_delay_d __read_mostly; @@ -1299,7 +1297,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; - scan_ip6(buf, pkt_dev->in6_daddr.s6_addr); + in6_pton(buf, -1, pkt_dev->in6_daddr.s6_addr, -1, NULL); snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr); pkt_dev->cur_in6_daddr = pkt_dev->in6_daddr; @@ -1322,7 +1320,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; - scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); + in6_pton(buf, -1, pkt_dev->min_in6_daddr.s6_addr, -1, NULL); snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr); pkt_dev->cur_in6_daddr = pkt_dev->min_in6_daddr; @@ -1344,7 +1342,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; - scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr); + in6_pton(buf, -1, pkt_dev->max_in6_daddr.s6_addr, -1, NULL); snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_daddr); if (debug) @@ -1365,7 +1363,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; - scan_ip6(buf, pkt_dev->in6_saddr.s6_addr); + in6_pton(buf, -1, pkt_dev->in6_saddr.s6_addr, -1, NULL); snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr); pkt_dev->cur_in6_saddr = pkt_dev->in6_saddr; @@ -2765,97 +2763,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, return skb; } -/* - * scan_ip6, fmt_ip taken from dietlibc-0.21 - * Author Felix von Leitner - * - * Slightly modified for kernel. - * Should be candidate for net/ipv4/utils.c - * --ro - */ - -static unsigned int scan_ip6(const char *s, char ip[16]) -{ - unsigned int i; - unsigned int len = 0; - unsigned long u; - char suffix[16]; - unsigned int prefixlen = 0; - unsigned int suffixlen = 0; - __be32 tmp; - char *pos; - - for (i = 0; i < 16; i++) - ip[i] = 0; - - for (;;) { - if (*s == ':') { - len++; - if (s[1] == ':') { /* Found "::", skip to part 2 */ - s += 2; - len++; - break; - } - s++; - } - - u = simple_strtoul(s, &pos, 16); - i = pos - s; - if (!i) - return 0; - if (prefixlen == 12 && s[i] == '.') { - - /* the last 4 bytes may be written as IPv4 address */ - - tmp = in_aton(s); - memcpy((struct in_addr *)(ip + 12), &tmp, sizeof(tmp)); - return i + len; - } - ip[prefixlen++] = (u >> 8); - ip[prefixlen++] = (u & 255); - s += i; - len += i; - if (prefixlen == 16) - return len; - } - -/* part 2, after "::" */ - for (;;) { - if (*s == ':') { - if (suffixlen == 0) - break; - s++; - len++; - } else if (suffixlen != 0) - break; - - u = simple_strtol(s, &pos, 16); - i = pos - s; - if (!i) { - if (*s) - len--; - break; - } - if (suffixlen + prefixlen <= 12 && s[i] == '.') { - tmp = in_aton(s); - memcpy((struct in_addr *)(suffix + suffixlen), &tmp, - sizeof(tmp)); - suffixlen += 4; - len += strlen(s); - break; - } - suffix[suffixlen++] = (u >> 8); - suffix[suffixlen++] = (u & 255); - s += i; - len += i; - if (prefixlen + suffixlen == 16) - break; - } - for (i = 0; i < suffixlen; i++) - ip[16 - suffixlen + i] = suffix[i]; - return len; -} - static struct sk_buff *fill_packet_ipv6(struct net_device *odev, struct pktgen_dev *pkt_dev) { -- cgit v1.2.3 From 6caab7b0544e83e6c160b5e80f5a4a7dd69545c7 Mon Sep 17 00:00:00 2001 From: Sarveshwar Bandi Date: Wed, 10 Oct 2012 01:15:01 +0000 Subject: bridge: Pull ip header into skb->data before looking into ip header. If lower layer driver leaves the ip header in the skb fragment, it needs to be first pulled into skb->data before inspecting ip header length or ip version number. Signed-off-by: Sarveshwar Bandi Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 68e8f364bbf8..fe43bc7b063f 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -265,6 +265,9 @@ static int br_parse_ip_options(struct sk_buff *skb) struct net_device *dev = skb->dev; u32 len; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto inhdr_error; + iph = ip_hdr(skb); opt = &(IPCB(skb)->opt); -- cgit v1.2.3 From 68aaed54e7682aef261d5c2cf99e85a9dbf33a84 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 10 Oct 2012 08:27:25 +0000 Subject: ipv4: fix route mark sparse warning Sparse complains about RTA_MARK which is should be host order according to include file and usage in iproute. net/ipv4/route.c:2223:46: warning: incorrect type in argument 3 (different base types) net/ipv4/route.c:2223:46: expected restricted __be32 [usertype] value net/ipv4/route.c:2223:46: got unsigned int [unsigned] [usertype] flowic_mark Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1a0da8dc8180..432f4bb77238 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2220,7 +2220,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, goto nla_put_failure; if (fl4->flowi4_mark && - nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark)) + nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; error = rt->dst.error; -- cgit v1.2.3 From 759f42987f98915764bad922ee123acb0eadbe33 Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Mon, 17 Sep 2012 15:16:31 +0200 Subject: 9P: Fix race between p9_write_work() and p9_fd_request() Race scenario: thread A thread B p9_write_work() p9_fd_request() if (list_empty (&m->unsent_req_list)) ... spin_lock(&client->lock); req->status = REQ_STATUS_UNSENT; list_add_tail(..., &m->unsent_req_list); spin_unlock(&client->lock); .... if (n & POLLOUT && !test_and_set_bit(Wworksched, &m->wsched) schedule_work(&m->wq); --> not done because Wworksched is set clear_bit(Wworksched, &m->wsched); return; --> nobody will take care of sending the new request. This is not very likely to happen though, because p9_write_work() being called with an empty unsent_req_list is not frequent. But this also means that taking the lock earlier will not be costly. Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/trans_fd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index b2c308fffb8a..0031a8cf145d 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -453,12 +453,13 @@ static void p9_write_work(struct work_struct *work) } if (!m->wsize) { + spin_lock(&m->client->lock); if (list_empty(&m->unsent_req_list)) { clear_bit(Wworksched, &m->wsched); + spin_unlock(&m->client->lock); return; } - spin_lock(&m->client->lock); req = list_entry(m->unsent_req_list.next, struct p9_req_t, req_list); req->status = REQ_STATUS_SENT; -- cgit v1.2.3 From 0e24c4fc52b16f0a1102a933f636d2f350c41098 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 11 Oct 2012 06:24:14 +0000 Subject: tcp: sysctl interface leaks 16 bytes of kernel memory If the rc_dereference of tcp_fastopen_ctx ever fails then we copy 16 bytes of kernel stack into the proc result. Signed-off-by: Alan Cox Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9205e492dc9d..63d4eccc674d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -248,6 +248,8 @@ int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer, ctxt = rcu_dereference(tcp_fastopen_ctx); if (ctxt) memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); + else + memset(user_key, 0, sizeof(user_key)); rcu_read_unlock(); snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x", -- cgit v1.2.3 From 4c67525849e0b7f4bd4fab2487ec9e43ea52ef29 Mon Sep 17 00:00:00 2001 From: Alexey Kuznetsov Date: Fri, 12 Oct 2012 04:34:17 +0000 Subject: tcp: resets are misrouted After commit e2446eaa ("tcp_v4_send_reset: binding oif to iif in no sock case").. tcp resets are always lost, when routing is asymmetric. Yes, backing out that patch will result in misrouting of resets for dead connections which used interface binding when were alive, but we actually cannot do anything here. What's died that's died and correct handling normal unbound connections is obviously a priority. Comment to comment: > This has few benefits: > 1. tcp_v6_send_reset already did that. It was done to route resets for IPv6 link local addresses. It was a mistake to do so for global addresses. The patch fixes this as well. Actually, the problem appears to be even more serious than guaranteed loss of resets. As reported by Sergey Soloviev , those misrouted resets create a lot of arp traffic and huge amount of unresolved arp entires putting down to knees NAT firewalls which use asymmetric routing. Signed-off-by: Alexey Kuznetsov --- net/ipv4/tcp_ipv4.c | 7 ++++--- net/ipv6/tcp_ipv6.c | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 75735c9a6a9d..ef998b008a57 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -708,10 +708,11 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.csumoffset = offsetof(struct tcphdr, check) / 2; arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; /* When socket is gone, all binding information is lost. - * routing might fail in this case. using iif for oif to - * make sure we can deliver it + * routing might fail in this case. No choice here, if we choose to force + * input interface, we will misroute in case of asymmetric route. */ - arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb); + if (sk) + arg.bound_dev_if = sk->sk_bound_dev_if; net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 49c890386ce9..26175bffbaa0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -877,7 +877,8 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); fl6.flowi6_proto = IPPROTO_TCP; - fl6.flowi6_oif = inet6_iif(skb); + if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL) + fl6.flowi6_oif = inet6_iif(skb); fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); -- cgit v1.2.3 From 8437e7610c2d3e06f87f71fb82e10ed4b291812a Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 11 Oct 2012 12:51:28 +0000 Subject: vti: fix sparse bit endian warnings Use be32_to_cpu instead of htonl to keep sparse happy. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/ip_vti.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 978bca4818ae..1831092f999f 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -374,7 +374,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) memset(&fl4, 0, sizeof(fl4)); flowi4_init_output(&fl4, tunnel->parms.link, - htonl(tunnel->parms.i_key), RT_TOS(tos), + be32_to_cpu(tunnel->parms.i_key), RT_TOS(tos), RT_SCOPE_UNIVERSE, IPPROTO_IPIP, 0, dst, tiph->saddr, 0, 0); @@ -441,7 +441,7 @@ static int vti_tunnel_bind_dev(struct net_device *dev) struct flowi4 fl4; memset(&fl4, 0, sizeof(fl4)); flowi4_init_output(&fl4, tunnel->parms.link, - htonl(tunnel->parms.i_key), + be32_to_cpu(tunnel->parms.i_key), RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, IPPROTO_IPIP, 0, iph->daddr, iph->saddr, 0, 0); -- cgit v1.2.3 From 28194fcdc150e4d5b418d01db3c29058f60ef32c Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Thu, 11 Oct 2012 21:06:16 +0000 Subject: net: add doc for in6_pton() It is not easy to use in6_pton() correctly without reading its definition, so add some doc for it. Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/utils.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/core/utils.c b/net/core/utils.c index f5613d569c23..30f3879faecc 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -161,6 +161,18 @@ out: } EXPORT_SYMBOL(in4_pton); +/** + * in6_pton - convert an IPv6 address from literal to binary representation + * @src: the start of the IPv6 address string + * @srclen: the length of the string, -1 means strlen(src) + * @dst: the binary (u8[16] array) representation of the IPv6 address + * @delim: the delimiter of the IPv6 address in @src, -1 means no delimiter + * @end: A pointer to the end of the parsed string will be placed here + * + * Return one on success, return zero when any error occurs + * and @end will point to the end of the parsed string. + * + */ int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char **end) -- cgit v1.2.3 From 93ac0ef016a1b223d23fbb5e0397cab75a8f7d34 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Thu, 11 Oct 2012 21:06:17 +0000 Subject: net: add doc for in4_pton() It is not easy to use in4_pton() correctly without reading its definition, so add some doc for it. Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/utils.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/core/utils.c b/net/core/utils.c index 30f3879faecc..e3487e461939 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -107,6 +107,18 @@ static inline int xdigit2bin(char c, int delim) return IN6PTON_UNKNOWN; } +/** + * in4_pton - convert an IPv4 address from literal to binary representation + * @src: the start of the IPv4 address string + * @srclen: the length of the string, -1 means strlen(src) + * @dst: the binary (u8[4] array) representation of the IPv4 address + * @delim: the delimiter of the IPv4 address in @src, -1 means no delimiter + * @end: A pointer to the end of the parsed string will be placed here + * + * Return one on success, return zero when any error occurs + * and @end will point to the end of the parsed string. + * + */ int in4_pton(const char *src, int srclen, u8 *dst, int delim, const char **end) -- cgit v1.2.3 From 1bbb3095a5912be4b9c90397ef2182a5a328865b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 3 Oct 2012 20:32:17 -0700 Subject: userns: Properly print bluetooth socket uids MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With user namespace support enabled building bluetooth generated the warning. net/bluetooth/af_bluetooth.c: In function ‘bt_seq_show’: net/bluetooth/af_bluetooth.c:598:7: warning: format ‘%u’ expects argument of type ‘unsigned int’, but argument 7 has type ‘kuid_t’ [-Wformat] Convert sock_i_uid from a kuid_t to a uid_t before printing, to avoid this problem. Reported-by: Fengguang Wu Cc: Masatake YAMATO Cc: Gustavo Padovan Signed-off-by: "Eric W. Biederman" --- net/bluetooth/af_bluetooth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 9d49ee6d7219..ba033f09196e 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -591,7 +591,7 @@ static int bt_seq_show(struct seq_file *seq, void *v) atomic_read(&sk->sk_refcnt), sk_rmem_alloc_get(sk), sk_wmem_alloc_get(sk), - sock_i_uid(sk), + from_kuid(seq_user_ns(seq), sock_i_uid(sk)), sock_i_ino(sk), &src_baswapped, &dst_baswapped, -- cgit v1.2.3