From cd3bafc73d11eb51cb2d3691629718431e1768ce Mon Sep 17 00:00:00 2001 From: Hajime Tazaki Date: Wed, 4 Feb 2015 23:31:10 +0900 Subject: xfrm6: Fix a offset value for network header in _decode_session6 When a network-layer header has multiple IPv6 extension headers, then offset for mobility header goes wrong. This regression breaks an xfrm policy lookup for a particular receive packet. Binding update packets of Mobile IPv6 are all discarded without this fix. Fixes: de3b7a06dfe1 ("xfrm6: Fix transport header offset in _decode_session6.") Signed-off-by: Hajime Tazaki Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_policy.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 48bf5a06847b..8d2d01b4800a 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -200,6 +200,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPPROTO_MH: + offset += ipv6_optlen(exthdr); if (!onlyproto && pskb_may_pull(skb, nh + offset + 3 - skb->data)) { struct ip6_mh *mh; -- cgit v1.2.3 From dd3733b3e798daf778a1ec08557f388f00fdc2f6 Mon Sep 17 00:00:00 2001 From: Alexey Andriyanov Date: Fri, 6 Feb 2015 22:32:20 +0300 Subject: ipvs: fix inability to remove a mixed-family RS The current code prevents any operation with a mixed-family dest unless IP_VS_CONN_F_TUNNEL flag is set. The problem is that it's impossible for the client to follow this rule, because ip_vs_genl_parse_dest does not even read the destination conn_flags when cmd = IPVS_CMD_DEL_DEST (need_full_dest = 0). Also, not every client can pass this flag when removing a dest. ipvsadm, for example, does not support the "-i" command line option together with the "-d" option. This change disables any checks for mixed-family on IPVS_CMD_DEL_DEST command. Signed-off-by: Alexey Andriyanov Fixes: bc18d37f676f ("ipvs: Allow heterogeneous pools now that we support them") Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_ctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index b8295a430a56..fdcda8be1f0f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3399,7 +3399,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) if (udest.af == 0) udest.af = svc->af; - if (udest.af != svc->af) { + if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) { /* The synchronization protocol is incompatible * with mixed family services */ -- cgit v1.2.3 From 044a832a7779c0638bea2d0fea901c055b995f4a Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 12 Jan 2015 13:38:49 +0100 Subject: xfrm: Fix local error reporting crash with interfamily tunnels We set the outer mode protocol too early. As a result, the local error handler might dispatch to the wrong address family and report the error to a wrong socket type. We fix this by setting the outer protocol to the skb after we accessed the inner mode for the last time, right before we do the atcual encapsulation where we switch finally to the outer mode. Reported-by: Chris Ruehl Tested-by: Chris Ruehl Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_output.c | 2 +- net/ipv6/xfrm6_output.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index d5f6bd9a210a..dab73813cb92 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -63,6 +63,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) return err; IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; + skb->protocol = htons(ETH_P_IP); return x->outer_mode->output2(x, skb); } @@ -71,7 +72,6 @@ EXPORT_SYMBOL(xfrm4_prepare_output); int xfrm4_output_finish(struct sk_buff *skb) { memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - skb->protocol = htons(ETH_P_IP); #ifdef CONFIG_NETFILTER IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index ca3f29b98ae5..010f8bd2d577 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -114,6 +114,7 @@ int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) return err; skb->ignore_df = 1; + skb->protocol = htons(ETH_P_IPV6); return x->outer_mode->output2(x, skb); } @@ -122,7 +123,6 @@ EXPORT_SYMBOL(xfrm6_prepare_output); int xfrm6_output_finish(struct sk_buff *skb) { memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); - skb->protocol = htons(ETH_P_IPV6); #ifdef CONFIG_NETFILTER IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; -- cgit v1.2.3 From ac37e2515c1a89c477459a2020b6bfdedabdb91b Mon Sep 17 00:00:00 2001 From: huaibin Wang Date: Wed, 11 Feb 2015 18:10:36 +0100 Subject: xfrm: release dst_orig in case of error in xfrm_lookup() dst_orig should be released on error. Function like __xfrm_route_forward() expects that behavior. Since a recent commit, xfrm_lookup() may also be called by xfrm_lookup_route(), which expects the opposite. Let's introduce a new flag (XFRM_LOOKUP_KEEP_DST_REF) to tell what should be done in case of error. Fixes: f92ee61982d("xfrm: Generate blackhole routes only from route lookup functions") Signed-off-by: huaibin Wang Signed-off-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- include/net/dst.h | 1 + net/xfrm/xfrm_policy.c | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/dst.h b/include/net/dst.h index a8ae4e760778..0fb99a26e973 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -481,6 +481,7 @@ void dst_init(void); enum { XFRM_LOOKUP_ICMP = 1 << 0, XFRM_LOOKUP_QUEUE = 1 << 1, + XFRM_LOOKUP_KEEP_DST_REF = 1 << 2, }; struct flowi; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index cee479bc655c..638af0655aaf 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2269,11 +2269,9 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, * have the xfrm_state's. We need to wait for KM to * negotiate new SA's or bail out with error.*/ if (net->xfrm.sysctl_larval_drop) { - dst_release(dst); - xfrm_pols_put(pols, drop_pols); XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); - - return ERR_PTR(-EREMOTE); + err = -EREMOTE; + goto error; } err = -EAGAIN; @@ -2324,7 +2322,8 @@ nopol: error: dst_release(dst); dropdst: - dst_release(dst_orig); + if (!(flags & XFRM_LOOKUP_KEEP_DST_REF)) + dst_release(dst_orig); xfrm_pols_put(pols, drop_pols); return ERR_PTR(err); } @@ -2338,7 +2337,8 @@ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, struct sock *sk, int flags) { struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, - flags | XFRM_LOOKUP_QUEUE); + flags | XFRM_LOOKUP_QUEUE | + XFRM_LOOKUP_KEEP_DST_REF); if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE) return make_blackhole(net, dst_orig->ops->family, dst_orig); -- cgit v1.2.3 From 520aa7414bb590f39d0d1591b06018e60cbc7cf4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 12 Feb 2015 22:15:31 +0100 Subject: netfilter: nft_compat: fix module refcount underflow Feb 12 18:20:42 nfdev kernel: ------------[ cut here ]------------ Feb 12 18:20:42 nfdev kernel: WARNING: CPU: 4 PID: 4359 at kernel/module.c:963 module_put+0x9b/0xba() Feb 12 18:20:42 nfdev kernel: CPU: 4 PID: 4359 Comm: ebtables-compat Tainted: G W 3.19.0-rc6+ #43 [...] Feb 12 18:20:42 nfdev kernel: Call Trace: Feb 12 18:20:42 nfdev kernel: [] dump_stack+0x4c/0x65 Feb 12 18:20:42 nfdev kernel: [] warn_slowpath_common+0x9c/0xb6 Feb 12 18:20:42 nfdev kernel: [] ? module_put+0x9b/0xba Feb 12 18:20:42 nfdev kernel: [] warn_slowpath_null+0x15/0x17 Feb 12 18:20:42 nfdev kernel: [] module_put+0x9b/0xba Feb 12 18:20:42 nfdev kernel: [] nft_match_destroy+0x45/0x4c Feb 12 18:20:42 nfdev kernel: [] nf_tables_rule_destroy+0x28/0x70 Reported-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso Tested-by: Arturo Borrero Gonzalez --- net/netfilter/nft_compat.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 265e190f2218..b6364869c2e0 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -578,8 +578,12 @@ nft_match_select_ops(const struct nft_ctx *ctx, struct xt_match *match = nft_match->ops.data; if (strcmp(match->name, mt_name) == 0 && - match->revision == rev && match->family == family) + match->revision == rev && match->family == family) { + if (!try_module_get(match->me)) + return ERR_PTR(-ENOENT); + return &nft_match->ops; + } } match = xt_request_find_match(family, mt_name, rev); @@ -648,8 +652,12 @@ nft_target_select_ops(const struct nft_ctx *ctx, struct xt_target *target = nft_target->ops.data; if (strcmp(target->name, tg_name) == 0 && - target->revision == rev && target->family == family) + target->revision == rev && target->family == family) { + if (!try_module_get(target->me)) + return ERR_PTR(-ENOENT); + return &nft_target->ops; + } } target = xt_request_find_target(family, tg_name, rev); -- cgit v1.2.3 From cef9ed86ed62eeffcd017882278bbece32001f86 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 13 Feb 2015 12:47:50 +0100 Subject: netfilter: xt_recent: don't reject rule if new hitcount exceeds table max given: -A INPUT -m recent --update --seconds 30 --hitcount 4 and iptables-save > foo then iptables-restore < foo will fail with: kernel: xt_recent: hitcount (4) is larger than packets to be remembered (4) for table DEFAULT Even when the check is fixed, the restore won't work if the hitcount is increased to e.g. 6, since by the time checkentry runs it will find the 'old' incarnation of the table. We can avoid this by increasing the maximum threshold silently; we only have to rm all the current entries of the table (these entries would not have enough room to handle the increased hitcount). This even makes (not-very-useful) -A INPUT -m recent --update --seconds 30 --hitcount 4 -A INPUT -m recent --update --seconds 30 --hitcount 42 work. Fixes: abc86d0f99242b7f142b (netfilter: xt_recent: relax ip_pkt_list_tot restrictions) Tracked-down-by: Chris Vine Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_recent.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 30dbe34915ae..45e1b30e4fb2 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -378,12 +378,11 @@ static int recent_mt_check(const struct xt_mtchk_param *par, mutex_lock(&recent_mutex); t = recent_table_lookup(recent_net, info->name); if (t != NULL) { - if (info->hit_count > t->nstamps_max_mask) { - pr_info("hitcount (%u) is larger than packets to be remembered (%u) for table %s\n", - info->hit_count, t->nstamps_max_mask + 1, - info->name); - ret = -EINVAL; - goto out; + if (nstamp_mask > t->nstamps_max_mask) { + spin_lock_bh(&recent_lock); + recent_table_flush(t); + t->nstamps_max_mask = nstamp_mask; + spin_unlock_bh(&recent_lock); } t->refcnt++; -- cgit v1.2.3 From 78296c97ca1fd3b104f12e1f1fbc06c46635990b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 Feb 2015 19:03:45 -0800 Subject: netfilter: xt_socket: fix a stack corruption bug As soon as extract_icmp6_fields() returns, its local storage (automatic variables) is deallocated and can be overwritten. Lets add an additional parameter to make sure storage is valid long enough. While we are at it, adds some const qualifiers. Signed-off-by: Eric Dumazet Fixes: b64c9256a9b76 ("tproxy: added IPv6 support to the socket match") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_socket.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 1ba67931eb1b..13332dbf291d 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -243,12 +243,13 @@ static int extract_icmp6_fields(const struct sk_buff *skb, unsigned int outside_hdrlen, int *protocol, - struct in6_addr **raddr, - struct in6_addr **laddr, + const struct in6_addr **raddr, + const struct in6_addr **laddr, __be16 *rport, - __be16 *lport) + __be16 *lport, + struct ipv6hdr *ipv6_var) { - struct ipv6hdr *inside_iph, _inside_iph; + const struct ipv6hdr *inside_iph; struct icmp6hdr *icmph, _icmph; __be16 *ports, _ports[2]; u8 inside_nexthdr; @@ -263,12 +264,14 @@ extract_icmp6_fields(const struct sk_buff *skb, if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK) return 1; - inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph), sizeof(_inside_iph), &_inside_iph); + inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph), + sizeof(*ipv6_var), ipv6_var); if (inside_iph == NULL) return 1; inside_nexthdr = inside_iph->nexthdr; - inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), + inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + + sizeof(*ipv6_var), &inside_nexthdr, &inside_fragoff); if (inside_hdrlen < 0) return 1; /* hjm: Packet has no/incomplete transport layer headers. */ @@ -315,10 +318,10 @@ xt_socket_get_sock_v6(struct net *net, const u8 protocol, static bool socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) { - struct ipv6hdr *iph = ipv6_hdr(skb); + struct ipv6hdr ipv6_var, *iph = ipv6_hdr(skb); struct udphdr _hdr, *hp = NULL; struct sock *sk = skb->sk; - struct in6_addr *daddr = NULL, *saddr = NULL; + const struct in6_addr *daddr = NULL, *saddr = NULL; __be16 uninitialized_var(dport), uninitialized_var(sport); int thoff = 0, uninitialized_var(tproto); const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; @@ -342,7 +345,7 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) } else if (tproto == IPPROTO_ICMPV6) { if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr, - &sport, &dport)) + &sport, &dport, &ipv6_var)) return false; } else { return false; -- cgit v1.2.3 From a1d1e9be5a1dafe0ddc2181a9201c2ae29c71eff Mon Sep 17 00:00:00 2001 From: David Ramos Date: Fri, 13 Feb 2015 13:11:51 -0800 Subject: svcrpc: fix memory leak in gssp_accept_sec_context_upcall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Our UC-KLEE tool found a kernel memory leak of 512 bytes (on x86_64) for each call to gssp_accept_sec_context_upcall() (net/sunrpc/auth_gss/gss_rpc_upcall.c). Since it appears that this call can be triggered by remote connections (at least, from a cursory a glance at the call chain), it may be exploitable to cause kernel memory exhaustion. We found the bug in kernel 3.16.3, but it appears to date back to commit 9dfd87da1aeb0fd364167ad199f40fe96a6a87be (2013-08-20). The gssp_accept_sec_context_upcall() function performs a pair of calls to gssp_alloc_receive_pages() and gssp_free_receive_pages(). The first allocates memory for arg->pages. The second then frees the pages pointed to by the arg->pages array, but not the array itself. Reported-by: David A. Ramos Fixes: 9dfd87da1aeb ("rpc: fix huge kmalloc's in gss-proxy”) Signed-off-by: David A. Ramos Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/gss_rpc_upcall.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index abbb7dcd1689..59eeed43eda2 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -217,6 +217,8 @@ static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg) for (i = 0; i < arg->npages && arg->pages[i]; i++) __free_page(arg->pages[i]); + + kfree(arg->pages); } static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg) -- cgit v1.2.3 From 1c4cff0cf55011792125b6041bc4e9713e46240f Mon Sep 17 00:00:00 2001 From: Ignacy Gawędzki Date: Fri, 13 Feb 2015 14:47:05 -0800 Subject: gen_stats.c: Duplicate xstats buffer for later use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gnet_stats_copy_app() function gets called, more often than not, with its second argument a pointer to an automatic variable in the caller's stack. Therefore, to avoid copying garbage afterwards when calling gnet_stats_finish_copy(), this data is better copied to a dynamically allocated memory that gets freed after use. [xiyou.wangcong@gmail.com: remove a useless kfree()] Signed-off-by: Ignacy Gawędzki Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/gen_stats.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 0c08062d1796..1e2f46a69d50 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -32,6 +32,9 @@ gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size) return 0; nla_put_failure: + kfree(d->xstats); + d->xstats = NULL; + d->xstats_len = 0; spin_unlock_bh(d->lock); return -1; } @@ -305,7 +308,9 @@ int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) { if (d->compat_xstats) { - d->xstats = st; + d->xstats = kmemdup(st, len, GFP_ATOMIC); + if (!d->xstats) + goto err_out; d->xstats_len = len; } @@ -313,6 +318,11 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) return gnet_stats_copy(d, TCA_STATS_APP, st, len); return 0; + +err_out: + d->xstats_len = 0; + spin_unlock_bh(d->lock); + return -1; } EXPORT_SYMBOL(gnet_stats_copy_app); @@ -345,6 +355,9 @@ gnet_stats_finish_copy(struct gnet_dump *d) return -1; } + kfree(d->xstats); + d->xstats = NULL; + d->xstats_len = 0; spin_unlock_bh(d->lock); return 0; } -- cgit v1.2.3 From fba04a9e0c869498889b6445fd06cbe7da9bb834 Mon Sep 17 00:00:00 2001 From: Alexander Drozdov Date: Tue, 17 Feb 2015 13:33:46 +0300 Subject: ipv4: ip_check_defrag should correctly check return value of skb_copy_bits skb_copy_bits() returns zero on success and negative value on error, so it is needed to invert the condition in ip_check_defrag(). Fixes: 1bf3751ec90c ("ipv4: ip_check_defrag must not modify skb before unsharing") Signed-off-by: Alexander Drozdov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index e5b6d0ddcb58..2c8d98e728c0 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -664,7 +664,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) if (skb->protocol != htons(ETH_P_IP)) return skb; - if (!skb_copy_bits(skb, 0, &iph, sizeof(iph))) + if (skb_copy_bits(skb, 0, &iph, sizeof(iph)) < 0) return skb; if (iph.ihl < 5 || iph.version != 4) -- cgit v1.2.3 From 34eea79e2664b314cab6a30fc582fdfa7a1bb1df Mon Sep 17 00:00:00 2001 From: Ignacy Gawędzki Date: Tue, 17 Feb 2015 20:15:20 +0100 Subject: ematch: Fix auto-loading of ematch modules. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In tcf_em_validate(), after calling request_module() to load the kind-specific module, set em->ops to NULL before returning -EAGAIN, so that module_put() is not called again by tcf_em_tree_destroy(). Signed-off-by: Ignacy Gawędzki Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/ematch.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 6742200b1307..fbb7ebfc58c6 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -228,6 +228,7 @@ static int tcf_em_validate(struct tcf_proto *tp, * to replay the request. */ module_put(em->ops->owner); + em->ops = NULL; err = -EAGAIN; } #endif -- cgit v1.2.3 From 7b4577a9da3702049650f7095506e9afd9f68849 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 17 Feb 2015 11:23:10 -0800 Subject: openvswitch: Fix net exit. Open vSwitch allows moving internal vport to different namespace while still connected to the bridge. But when namespace deleted OVS does not detach these vports, that results in dangling pointer to netdevice which causes kernel panic as follows. This issue is fixed by detaching all ovs ports from the deleted namespace at net-exit. BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: [] ovs_vport_locate+0x35/0x80 [openvswitch] Oops: 0000 [#1] SMP Call Trace: [] lookup_vport+0x21/0xd0 [openvswitch] [] ovs_vport_cmd_get+0x59/0xf0 [openvswitch] [] genl_family_rcv_msg+0x1bc/0x3e0 [] genl_rcv_msg+0x79/0xc0 [] netlink_rcv_skb+0xb9/0xe0 [] genl_rcv+0x2c/0x40 [] netlink_unicast+0x12d/0x1c0 [] netlink_sendmsg+0x34a/0x6b0 [] sock_sendmsg+0xa0/0xe0 [] ___sys_sendmsg+0x408/0x420 [] __sys_sendmsg+0x51/0x90 [] SyS_sendmsg+0x12/0x20 [] system_call_fastpath+0x12/0x17 Reported-by: Assaf Muller Fixes: 46df7b81454("openvswitch: Add support for network namespaces.") Signed-off-by: Pravin B Shelar Reviewed-by: Thomas Graf Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- net/openvswitch/vport.h | 2 ++ 2 files changed, 45 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ae5e77cdc0ca..5bae7243c577 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2194,14 +2194,55 @@ static int __net_init ovs_init_net(struct net *net) return 0; } -static void __net_exit ovs_exit_net(struct net *net) +static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, + struct list_head *head) { - struct datapath *dp, *dp_next; struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + struct datapath *dp; + + list_for_each_entry(dp, &ovs_net->dps, list_node) { + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + struct vport *vport; + + hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) { + struct netdev_vport *netdev_vport; + + if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL) + continue; + + netdev_vport = netdev_vport_priv(vport); + if (dev_net(netdev_vport->dev) == dnet) + list_add(&vport->detach_list, head); + } + } + } +} + +static void __net_exit ovs_exit_net(struct net *dnet) +{ + struct datapath *dp, *dp_next; + struct ovs_net *ovs_net = net_generic(dnet, ovs_net_id); + struct vport *vport, *vport_next; + struct net *net; + LIST_HEAD(head); ovs_lock(); list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) __dp_destroy(dp); + + rtnl_lock(); + for_each_net(net) + list_vports_from_net(net, dnet, &head); + rtnl_unlock(); + + /* Detach all vports from given namespace. */ + list_for_each_entry_safe(vport, vport_next, &head, detach_list) { + list_del(&vport->detach_list); + ovs_dp_detach_port(vport); + } + ovs_unlock(); cancel_work_sync(&ovs_net->dp_notify_work); diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index f8ae295fb001..bc85331a6c60 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -103,6 +103,7 @@ struct vport_portids { * @ops: Class structure. * @percpu_stats: Points to per-CPU statistics used and maintained by vport * @err_stats: Points to error statistics used and maintained by vport + * @detach_list: list used for detaching vport in net-exit call. */ struct vport { struct rcu_head rcu; @@ -117,6 +118,7 @@ struct vport { struct pcpu_sw_netstats __percpu *percpu_stats; struct vport_err_stats err_stats; + struct list_head detach_list; }; /** -- cgit v1.2.3 From 997d5c3f4427f38562cbe207ce05bb25fdcb993b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Feb 2015 05:47:55 -0800 Subject: sock: sock_dequeue_err_skb() needs hard irq safety Non NAPI drivers can call skb_tstamp_tx() and then sock_queue_err_skb() from hard IRQ context. Therefore, sock_dequeue_err_skb() needs to block hard irq or corruptions or hangs can happen. Signed-off-by: Eric Dumazet Fixes: 364a9e93243d1 ("sock: deduplicate errqueue dequeue") Fixes: cb820f8e4b7f7 ("net: Provide a generic socket error queue delivery method for Tx time stamps.") Signed-off-by: David S. Miller --- net/core/skbuff.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 88c613eab142..f80507823531 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3621,13 +3621,14 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk) { struct sk_buff_head *q = &sk->sk_error_queue; struct sk_buff *skb, *skb_next; + unsigned long flags; int err = 0; - spin_lock_bh(&q->lock); + spin_lock_irqsave(&q->lock, flags); skb = __skb_dequeue(q); if (skb && (skb_next = skb_peek(q))) err = SKB_EXT_ERR(skb_next)->ee.ee_errno; - spin_unlock_bh(&q->lock); + spin_unlock_irqrestore(&q->lock, flags); sk->sk_err = err; if (err) -- cgit v1.2.3 From 65e9256c4ec569ed62bb89023daab7b72368b89f Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Fri, 20 Feb 2015 21:34:58 +0200 Subject: ethtool: Add hw-switch-offload to netdev_features_strings. commit aafb3e98b279 (netdev: introduce new NETIF_F_HW_SWITCH_OFFLOAD feature flag for switch device offloads) add a new feature without adding it to netdev_features_strings array; this patch fixes this. Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- net/core/ethtool.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 91f74f3eb204..aa378ecef186 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -98,6 +98,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", [NETIF_F_BUSY_POLL_BIT] = "busy-poll", + [NETIF_F_HW_SWITCH_OFFLOAD_BIT] = "hw-switch-offload", }; static const char -- cgit v1.2.3 From 278f7b4fffce9ad267406cf8800df271d14f4a16 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 19 Feb 2015 12:13:13 +0300 Subject: caif: fix a signedness bug in cfpkt_iterate() The cfpkt_iterate() function can return -EPROTO on error, but the function is a u16 so the negative value gets truncated to a positive unsigned short. This causes a static checker warning. The only caller which might care is cffrml_receive(), when it's checking the frame checksum. I modified cffrml_receive() so that it never says -EPROTO is a valid checksum. Also this isn't ever going to be inlined so I removed the "inline". Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- include/net/caif/cfpkt.h | 2 +- net/caif/cffrml.c | 2 +- net/caif/cfpkt_skbuff.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/caif/cfpkt.h b/include/net/caif/cfpkt.h index 1c1ad46250d5..fe328c52c46b 100644 --- a/include/net/caif/cfpkt.h +++ b/include/net/caif/cfpkt.h @@ -171,7 +171,7 @@ struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos); * @return Checksum of buffer. */ -u16 cfpkt_iterate(struct cfpkt *pkt, +int cfpkt_iterate(struct cfpkt *pkt, u16 (*iter_func)(u16 chks, void *buf, u16 len), u16 data); diff --git a/net/caif/cffrml.c b/net/caif/cffrml.c index 8bc7caa28e64..434ba8557826 100644 --- a/net/caif/cffrml.c +++ b/net/caif/cffrml.c @@ -84,7 +84,7 @@ static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt) u16 tmp; u16 len; u16 hdrchks; - u16 pktchks; + int pktchks; struct cffrml *this; this = container_obj(layr); diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c index 1be0b521ac49..f6c3b2137eea 100644 --- a/net/caif/cfpkt_skbuff.c +++ b/net/caif/cfpkt_skbuff.c @@ -255,9 +255,9 @@ inline u16 cfpkt_getlen(struct cfpkt *pkt) return skb->len; } -inline u16 cfpkt_iterate(struct cfpkt *pkt, - u16 (*iter_func)(u16, void *, u16), - u16 data) +int cfpkt_iterate(struct cfpkt *pkt, + u16 (*iter_func)(u16, void *, u16), + u16 data) { /* * Don't care about the performance hit of linearizing, -- cgit v1.2.3 From a4176a9391868bfa87705bcd2e3b49e9b9dd2996 Mon Sep 17 00:00:00 2001 From: Matthew Thode Date: Tue, 17 Feb 2015 18:31:57 -0600 Subject: net: reject creation of netdev names with colons colons are used as a separator in netdev device lookup in dev_ioctl.c Specific functions are SIOCGIFTXQLEN SIOCETHTOOL SIOCSIFNAME Signed-off-by: Matthew Thode Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 8f9710c62e20..962ee9d71964 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -946,7 +946,7 @@ bool dev_valid_name(const char *name) return false; while (*name) { - if (*name == '/' || isspace(*name)) + if (*name == '/' || *name == ':' || isspace(*name)) return false; name++; } -- cgit v1.2.3 From 3f34b24a732bab9635c4b32823268c37c01b40f0 Mon Sep 17 00:00:00 2001 From: Alexander Drozdov Date: Fri, 20 Feb 2015 08:24:27 +0300 Subject: af_packet: allow packets defragmentation not only for hash fanout type Packets defragmentation was introduced for PACKET_FANOUT_HASH only, see 7736d33f4262 ("packet: Add pre-defragmentation support for ipv4 fanouts") It may be useful to have defragmentation enabled regardless of fanout type. Without that, the AF_PACKET user may have to: 1. Collect fragments from different rings 2. Defragment by itself Signed-off-by: Alexander Drozdov Signed-off-by: David S. Miller --- net/packet/af_packet.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9c28cec1a083..99fc628f7372 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1349,14 +1349,14 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, return 0; } + if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { + skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET); + if (!skb) + return 0; + } switch (f->type) { case PACKET_FANOUT_HASH: default: - if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { - skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET); - if (!skb) - return 0; - } idx = fanout_demux_hash(f, skb, num); break; case PACKET_FANOUT_LB: -- cgit v1.2.3 From 2156d321b879cdadb95a633d046169cfebdbf784 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Sat, 21 Feb 2015 19:30:55 +0100 Subject: netfilter: nft_compat: don't truncate ethernet protocol type to u8 Use u16 for protocol and then cast it to __be16 >> net/netfilter/nft_compat.c:140:37: sparse: incorrect type in assignment (different base types) net/netfilter/nft_compat.c:140:37: expected restricted __be16 [usertype] ethproto net/netfilter/nft_compat.c:140:37: got unsigned char [unsigned] [usertype] proto >> net/netfilter/nft_compat.c:351:37: sparse: incorrect type in assignment (different base types) net/netfilter/nft_compat.c:351:37: expected restricted __be16 [usertype] ethproto net/netfilter/nft_compat.c:351:37: got unsigned char [unsigned] [usertype] proto Reported-by: kbuild test robot Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 1279cd85663e..213584cf04b3 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -123,7 +123,7 @@ static void nft_target_set_tgchk_param(struct xt_tgchk_param *par, const struct nft_ctx *ctx, struct xt_target *target, void *info, - union nft_entry *entry, u8 proto, bool inv) + union nft_entry *entry, u16 proto, bool inv) { par->net = ctx->net; par->table = ctx->table->name; @@ -137,7 +137,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0; break; case NFPROTO_BRIDGE: - entry->ebt.ethproto = proto; + entry->ebt.ethproto = (__force __be16)proto; entry->ebt.invflags = inv ? EBT_IPROTO : 0; break; } @@ -171,7 +171,7 @@ static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] [NFTA_RULE_COMPAT_FLAGS] = { .type = NLA_U32 }, }; -static int nft_parse_compat(const struct nlattr *attr, u8 *proto, bool *inv) +static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) { struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1]; u32 flags; @@ -203,7 +203,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, struct xt_target *target = expr->ops->data; struct xt_tgchk_param par; size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO])); - u8 proto = 0; + u16 proto = 0; bool inv = false; union nft_entry e = {}; int ret; @@ -334,7 +334,7 @@ static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = { static void nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, struct xt_match *match, void *info, - union nft_entry *entry, u8 proto, bool inv) + union nft_entry *entry, u16 proto, bool inv) { par->net = ctx->net; par->table = ctx->table->name; @@ -348,7 +348,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0; break; case NFPROTO_BRIDGE: - entry->ebt.ethproto = proto; + entry->ebt.ethproto = (__force __be16)proto; entry->ebt.invflags = inv ? EBT_IPROTO : 0; break; } @@ -385,7 +385,7 @@ nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, struct xt_match *match = expr->ops->data; struct xt_mtchk_param par; size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO])); - u8 proto = 0; + u16 proto = 0; bool inv = false; union nft_entry e = {}; int ret; -- cgit v1.2.3 From 02263db00b6cb98701332aa257c07ca549c2324b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 20 Feb 2015 17:11:10 +0100 Subject: netfilter: nf_tables: fix addition/deletion of elements from commit/abort We have several problems in this path: 1) There is a use-after-free when removing individual elements from the commit path. 2) We have to uninit() the data part of the element from the abort path to avoid a chain refcount leak. 3) We have to check for set->flags to see if there's a mapping, instead of the element flags. 4) We have to check for !(flags & NFT_SET_ELEM_INTERVAL_END) to skip elements that are part of the interval that have no data part, so they don't need to be uninit(). Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 199fd0f27b0e..a8c94620f20e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3612,12 +3612,11 @@ static int nf_tables_commit(struct sk_buff *skb) &te->elem, NFT_MSG_DELSETELEM, 0); te->set->ops->get(te->set, &te->elem); - te->set->ops->remove(te->set, &te->elem); nft_data_uninit(&te->elem.key, NFT_DATA_VALUE); - if (te->elem.flags & NFT_SET_MAP) { - nft_data_uninit(&te->elem.data, - te->set->dtype); - } + if (te->set->flags & NFT_SET_MAP && + !(te->elem.flags & NFT_SET_ELEM_INTERVAL_END)) + nft_data_uninit(&te->elem.data, te->set->dtype); + te->set->ops->remove(te->set, &te->elem); nft_trans_destroy(trans); break; } @@ -3658,7 +3657,7 @@ static int nf_tables_abort(struct sk_buff *skb) { struct net *net = sock_net(skb->sk); struct nft_trans *trans, *next; - struct nft_set *set; + struct nft_trans_elem *te; list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { switch (trans->msg_type) { @@ -3719,9 +3718,13 @@ static int nf_tables_abort(struct sk_buff *skb) break; case NFT_MSG_NEWSETELEM: nft_trans_elem_set(trans)->nelems--; - set = nft_trans_elem_set(trans); - set->ops->get(set, &nft_trans_elem(trans)); - set->ops->remove(set, &nft_trans_elem(trans)); + te = (struct nft_trans_elem *)trans->data; + te->set->ops->get(te->set, &te->elem); + nft_data_uninit(&te->elem.key, NFT_DATA_VALUE); + if (te->set->flags & NFT_SET_MAP && + !(te->elem.flags & NFT_SET_ELEM_INTERVAL_END)) + nft_data_uninit(&te->elem.data, te->set->dtype); + te->set->ops->remove(te->set, &te->elem); nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: -- cgit v1.2.3 From 528c943f3bb919aef75ab2fff4f00176f09a4019 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 21 Feb 2015 21:03:10 +0200 Subject: ipvs: add missing ip_vs_pe_put in sync code ip_vs_conn_fill_param_sync() gets in param.pe a module reference for persistence engine from __ip_vs_pe_getbyname() but forgets to put it. Problem occurs in backup for sync protocol v1 (2.6.39). Also, pe_data usually comes in sync messages for connection templates and ip_vs_conn_new() copies the pointer only in this case. Make sure pe_data is not leaked if it comes unexpectedly for normal connections. Leak can happen only if bogus messages are sent to backup server. Fixes: fe5e7a1efb66 ("IPVS: Backup, Adding Version 1 receive capability") Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_sync.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index c47ffd7a0a70..d93ceeb3ef04 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -896,6 +896,8 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); return; } + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + kfree(param->pe_data); } if (opt) @@ -1169,6 +1171,7 @@ static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) ); #endif + ip_vs_pe_put(param.pe); return 0; /* Error exit */ out: -- cgit v1.2.3 From 52d6c8c6ca125872459054daa70f2f1c698c8e75 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 22 Feb 2015 17:03:41 -0800 Subject: net: pktgen: disable xmit_clone on virtual devices Trying to use burst capability (aka xmit_more) on a virtual device like bonding is not supported. For example, skb might be queued multiple times on a qdisc, with various list corruptions. Fixes: 38b2cf2982dc ("net: pktgen: packet bursting via skb->xmit_more") Signed-off-by: Eric Dumazet Cc: Alexei Starovoitov Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/pktgen.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b4899f5b7388..508155b283dd 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1134,6 +1134,9 @@ static ssize_t pktgen_if_write(struct file *file, return len; i += len; + if ((value > 1) && + (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) + return -ENOTSUPP; pkt_dev->burst = value < 1 ? 1 : value; sprintf(pg_result, "OK: burst=%d", pkt_dev->burst); return count; -- cgit v1.2.3 From 6514890f7aa8dd75fa46e282a1c7a8615e86f363 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 20 Feb 2015 13:33:16 -0500 Subject: tcp: fix tcp_should_expand_sndbuf() to use tcp_packets_in_flight() tcp_should_expand_sndbuf() does not expand the send buffer if we have filled the congestion window. However, it should use tcp_packets_in_flight() instead of tp->packets_out to make this check. Testing has established that the difference matters a lot if there are many SACKed packets, causing a needless performance shortfall. Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Nandita Dukkipati Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8fdd27b17306..fb4cf8b8e121 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4770,7 +4770,7 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk) return false; /* If we filled the congestion window, do not expand. */ - if (tp->packets_out >= tp->snd_cwnd) + if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) return false; return true; -- cgit v1.2.3 From 46b9e4bb76ee26f1e024e048bb95af41b763f48f Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 23 Feb 2015 12:02:51 +0100 Subject: decnet: Fix obvious o/0 typo Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- net/decnet/dn_route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 1d7c1256e845..3b81092771f8 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1062,7 +1062,7 @@ source_ok: if (decnet_debug_level & 16) printk(KERN_DEBUG "dn_route_output_slow: initial checks complete." - " dst=%o4x src=%04x oif=%d try_hard=%d\n", + " dst=%04x src=%04x oif=%d try_hard=%d\n", le16_to_cpu(fld.daddr), le16_to_cpu(fld.saddr), fld.flowidn_oif, try_hard); -- cgit v1.2.3 From 9b1dcbc8cf4679ecf090b343ac31bda6e55ddabe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 12 Feb 2015 10:14:51 -0500 Subject: xprtrdma: Store RDMA credits in unsigned variables Dan Carpenter's static checker pointed out: net/sunrpc/xprtrdma/rpc_rdma.c:879 rpcrdma_reply_handler() warn: can 'credits' be negative? "credits" is defined as an int. The credits value comes from the server as a 32-bit unsigned integer. A malicious or broken server can plant a large unsigned integer in that field which would result in an underflow in the following logic, potentially triggering a deadlock of the mount point by blocking the client from issuing more RPC requests. net/sunrpc/xprtrdma/rpc_rdma.c: 876 credits = be32_to_cpu(headerp->rm_credit); 877 if (credits == 0) 878 credits = 1; /* don't deadlock */ 879 else if (credits > r_xprt->rx_buf.rb_max_requests) 880 credits = r_xprt->rx_buf.rb_max_requests; 881 882 cwnd = xprt->cwnd; 883 xprt->cwnd = credits << RPC_CWNDSHIFT; 884 if (xprt->cwnd > cwnd) 885 xprt_release_rqst_cong(rqst->rq_task); Reported-by: Dan Carpenter Fixes: eba8ff660b2d ("xprtrdma: Move credit update to RPC . . .") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 3 ++- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 7e9acd9361c5..91ffde82fa0c 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -738,8 +738,9 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) struct rpc_xprt *xprt = rep->rr_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); __be32 *iptr; - int credits, rdmalen, status; + int rdmalen, status; unsigned long cwnd; + u32 credits; /* Check status. If bad, signal disconnect and return rep to pool */ if (rep->rr_len == ~0U) { diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index d1b70397c60f..0a16fb6f0885 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -285,7 +285,7 @@ rpcr_to_rdmar(struct rpc_rqst *rqst) */ struct rpcrdma_buffer { spinlock_t rb_lock; /* protects indexes */ - int rb_max_requests;/* client max requests */ + u32 rb_max_requests;/* client max requests */ struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ struct list_head rb_all; int rb_send_index; -- cgit v1.2.3 From a948f8ce771a1f07c17ed8bcb51f59f69129a51c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 23 Feb 2015 18:38:24 +0100 Subject: irda: replace current->state by set_current_state() Use helper functions to access current->state. Direct assignments are prone to races and therefore buggy. current->state = TASK_RUNNING can be replaced by __set_current_state() Thanks to Peter Zijlstra for the exact definition of the problem. Suggested-By: Peter Zijlstra Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/irda/ircomm/ircomm_tty.c | 2 +- net/irda/irnet/irnet_ppp.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index 40695b9751c1..9940a41efca1 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -811,7 +811,7 @@ static void ircomm_tty_wait_until_sent(struct tty_struct *tty, int timeout) break; } spin_unlock_irqrestore(&self->spinlock, flags); - current->state = TASK_RUNNING; + __set_current_state(TASK_RUNNING); } /* diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index 3c83a1e5ab03..1215693fdd22 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -305,7 +305,7 @@ irnet_ctrl_read(irnet_socket * ap, /* Put ourselves on the wait queue to be woken up */ add_wait_queue(&irnet_events.rwait, &wait); - current->state = TASK_INTERRUPTIBLE; + set_current_state(TASK_INTERRUPTIBLE); for(;;) { /* If there is unread events */ @@ -321,7 +321,7 @@ irnet_ctrl_read(irnet_socket * ap, /* Yield and wait to be woken up */ schedule(); } - current->state = TASK_RUNNING; + __set_current_state(TASK_RUNNING); remove_wait_queue(&irnet_events.rwait, &wait); /* Did we got it ? */ -- cgit v1.2.3 From d720d8cec563ce4e4fa44a613d4f2dcb1caf2998 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 23 Feb 2015 18:12:56 +0000 Subject: net: compat: Ignore MSG_CMSG_COMPAT in compat_sys_{send, recv}msg With commit a7526eb5d06b (net: Unbreak compat_sys_{send,recv}msg), the MSG_CMSG_COMPAT flag is blocked at the compat syscall entry points, changing the kernel compat behaviour from the one before the commit it was trying to fix (1be374a0518a, net: Block MSG_CMSG_COMPAT in send(m)msg and recv(m)msg). On 32-bit kernels (!CONFIG_COMPAT), MSG_CMSG_COMPAT is 0 and the native 32-bit sys_sendmsg() allows flag 0x80000000 to be set (it is ignored by the kernel). However, on a 64-bit kernel, the compat ABI is different with commit a7526eb5d06b. This patch changes the compat_sys_{send,recv}msg behaviour to the one prior to commit 1be374a0518a. The problem was found running 32-bit LTP (sendmsg01) binary on an arm64 kernel. Arguably, LTP should not pass 0xffffffff as flags to sendmsg() but the general rule is not to break user ABI (even when the user behaviour is not entirely sane). Fixes: a7526eb5d06b (net: Unbreak compat_sys_{send,recv}msg) Cc: Andy Lutomirski Cc: David S. Miller Signed-off-by: Catalin Marinas Signed-off-by: David S. Miller --- net/compat.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'net') diff --git a/net/compat.c b/net/compat.c index 3236b4167a32..94d3d5e97883 100644 --- a/net/compat.c +++ b/net/compat.c @@ -711,24 +711,18 @@ static unsigned char nas[21] = { COMPAT_SYSCALL_DEFINE3(sendmsg, int, fd, struct compat_msghdr __user *, msg, unsigned int, flags) { - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; return __sys_sendmsg(fd, (struct user_msghdr __user *)msg, flags | MSG_CMSG_COMPAT); } COMPAT_SYSCALL_DEFINE4(sendmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags) { - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; return __sys_sendmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT); } COMPAT_SYSCALL_DEFINE3(recvmsg, int, fd, struct compat_msghdr __user *, msg, unsigned int, flags) { - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; return __sys_recvmsg(fd, (struct user_msghdr __user *)msg, flags | MSG_CMSG_COMPAT); } @@ -751,9 +745,6 @@ COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, int datagrams; struct timespec ktspec; - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; - if (timeout == NULL) return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT, NULL); -- cgit v1.2.3 From 77751427a1ff25b27d47a4c36b12c3c8667855ac Mon Sep 17 00:00:00 2001 From: Marcelo Leitner Date: Mon, 23 Feb 2015 11:17:13 -0300 Subject: ipv6: addrconf: validate new MTU before applying it Currently we don't check if the new MTU is valid or not and this allows one to configure a smaller than minimum allowed by RFCs or even bigger than interface own MTU, which is a problem as it may lead to packet drops. If you have a daemon like NetworkManager running, this may be exploited by remote attackers by forging RA packets with an invalid MTU, possibly leading to a DoS. (NetworkManager currently only validates for values too small, but not for too big ones.) The fix is just to make sure the new value is valid. That is, between IPV6_MIN_MTU and interface's MTU. Note that similar check is already performed at ndisc_router_discovery(), for when kernel itself parses the RA. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 98e4a63d72bb..b6030025f411 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4903,6 +4903,21 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write, return ret; } +static +int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct inet6_dev *idev = ctl->extra1; + int min_mtu = IPV6_MIN_MTU; + struct ctl_table lctl; + + lctl = *ctl; + lctl.extra1 = &min_mtu; + lctl.extra2 = idev ? &idev->dev->mtu : NULL; + + return proc_dointvec_minmax(&lctl, write, buffer, lenp, ppos); +} + static void dev_disable_change(struct inet6_dev *idev) { struct netdev_notifier_info info; @@ -5054,7 +5069,7 @@ static struct addrconf_sysctl_table .data = &ipv6_devconf.mtu6, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = addrconf_sysctl_mtu, }, { .procname = "accept_ra", -- cgit v1.2.3 From 104f5a6206f4b3133c675e3d41eca2ca4c41406b Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Sun, 8 Feb 2015 12:36:07 +0200 Subject: mac80211: clear sdata->radar_required If ieee80211_vif_use_channel() fails, we have to clear sdata->radar_required (which we might have just set). Failing to do it results in stale radar_required field which prevents starting new scan requests. Reported-by: Jouni Malinen Signed-off-by: Eliad Peller [use false instead of 0] Signed-off-by: Johannes Berg --- net/mac80211/chan.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index ff0d2db09df9..5bcd4e5589d3 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -1508,6 +1508,8 @@ static void __ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata) if (ieee80211_chanctx_refcount(local, ctx) == 0) ieee80211_free_chanctx(local, ctx); + sdata->radar_required = false; + /* Unreserving may ready an in-place reservation. */ if (use_reserved_switch) ieee80211_vif_use_reserved_switch(local); @@ -1566,6 +1568,9 @@ int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata, ieee80211_recalc_smps_chanctx(local, ctx); ieee80211_recalc_radar_chanctx(local, ctx); out: + if (ret) + sdata->radar_required = false; + mutex_unlock(&local->chanctx_mtx); return ret; } -- cgit v1.2.3 From 5528fae88697268a7176dd6c8d7ab368c04368be Mon Sep 17 00:00:00 2001 From: Samuel Tan Date: Mon, 9 Feb 2015 21:29:15 +0200 Subject: nl80211: use loop index as type for net detect frequency results We currently add nested members of the NL80211_ATTR_SCAN_FREQUENCIES as NLA_U32 attributes of type NL80211_ATTR_WIPHY_FREQ in cfg80211_net_detect_results. However, since there can be an arbitrary number of frequency results, we should use the loop index of the loop used to add the frequency results to NL80211_ATTR_SCAN_FREQUENCIES as the type (i.e. nla_type) for each result attribute, rather than a fixed type. This change is in line with how nested members are added to NL80211_ATTR_SCAN_FREQUENCIES in the functions nl80211_send_wowlan_nd and nl80211_add_scan_req. Signed-off-by: Samuel Tan Signed-off-by: Luciano Coelho Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d78fd8b54515..3c7fb0459e58 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12528,9 +12528,7 @@ static int cfg80211_net_detect_results(struct sk_buff *msg, } for (j = 0; j < match->n_channels; j++) { - if (nla_put_u32(msg, - NL80211_ATTR_WIPHY_FREQ, - match->channels[j])) { + if (nla_put_u32(msg, j, match->channels[j])) { nla_nest_cancel(msg, nl_freqs); nla_nest_cancel(msg, nl_match); goto out; -- cgit v1.2.3 From a18c7192aabac73078f35e5ef4ce28ad5f8cfe8e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 24 Feb 2015 10:56:42 +0100 Subject: nl80211: fix memory leak in monitor flags parsing If monitor flags parsing results in active monitor but that isn't supported, the already allocated message is leaked. Fix this by moving the allocation after this check. Reported-by: Christian Engelmayer Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3c7fb0459e58..be2501538011 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2654,10 +2654,6 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) return err; } - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - err = parse_monitor_flags(type == NL80211_IFTYPE_MONITOR ? info->attrs[NL80211_ATTR_MNTR_FLAGS] : NULL, &flags); @@ -2666,6 +2662,10 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) !(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR)) return -EOPNOTSUPP; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + wdev = rdev_add_virtual_intf(rdev, nla_data(info->attrs[NL80211_ATTR_IFNAME]), type, err ? NULL : &flags, ¶ms); -- cgit v1.2.3 From 28981e5eb45fe13e1d2c9e0e2e189dc5c8682006 Mon Sep 17 00:00:00 2001 From: Jason Abele Date: Tue, 17 Feb 2015 16:10:41 -0800 Subject: cfg80211: fix n_reg_rules to match world_regdom There are currently 8 rules in the world_regdom, but only the first 6 are applied due to an incorrect value for n_reg_rules. This causes channels 149-165 and 60GHz to be disabled. Signed-off-by: Jason Abele Signed-off-by: Johannes Berg --- net/wireless/reg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index b586d0dcb09e..48dfc7b4e981 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -228,7 +228,7 @@ static DECLARE_DELAYED_WORK(reg_timeout, reg_timeout_work); /* We keep a static world regulatory domain in case of the absence of CRDA */ static const struct ieee80211_regdomain world_regdom = { - .n_reg_rules = 6, + .n_reg_rules = 8, .alpha2 = "00", .reg_rules = { /* IEEE 802.11b/g, channels 1..11 */ -- cgit v1.2.3 From 81daf735f9fe35ef6bc4073068748b221d64fb47 Mon Sep 17 00:00:00 2001 From: Junjie Mao Date: Wed, 28 Jan 2015 10:03:26 +0800 Subject: cfg80211: calls nl80211_exit on error nl80211_exit should be called in cfg80211_init if nl80211_init succeeds but regulatory_init or create_singlethread_workqueue fails. Signed-off-by: Junjie Mao Signed-off-by: Johannes Berg --- net/wireless/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 3af0ecf1cc16..2a0bbd22854b 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1199,6 +1199,7 @@ out_fail_wq: regulatory_exit(); out_fail_reg: debugfs_remove(ieee80211_debugfs_dir); + nl80211_exit(); out_fail_nl80211: unregister_netdevice_notifier(&cfg80211_netdev_notifier); out_fail_notifier: -- cgit v1.2.3 From 17dce15801d5602719936045a7e84ff7dc6b6da2 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 19 Feb 2015 12:57:40 +0100 Subject: mac80211/minstrel: fix !x!=0 confusion Commit 06d961a8e210 ("mac80211/minstrel: use the new rate control API") inverted the condition 'if (msr->sample_limit != 0)' to 'if (!msr->sample_limit != 0)'. But it is confusing both to people and compilers (gcc5): net/mac80211/rc80211_minstrel.c: In function 'minstrel_get_rate': net/mac80211/rc80211_minstrel.c:376:26: warning: logical not is only applied to the left hand side of comparison if (!msr->sample_limit != 0) ^ Let there be only 'if (!msr->sample_limit)'. Fixes: 06d961a8e210 ("mac80211/minstrel: use the new rate control API") Signed-off-by: Jiri Slaby Signed-off-by: Johannes Berg --- net/mac80211/rc80211_minstrel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 7c86a002df95..ef6e8a6c4253 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -373,7 +373,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, rate++; mi->sample_deferred++; } else { - if (!msr->sample_limit != 0) + if (!msr->sample_limit) return; mi->sample_packets++; -- cgit v1.2.3 From 4e10fd5b4a7f4100007147558c304da3e73b25cf Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 24 Feb 2015 14:14:35 -0500 Subject: rtnetlink: avoid 0 sized arrays Arrays (when not in a struct) "shall have a value greater than zero". GCC complains when it's not the case here. Fixes: ba7d49b1f0 ("rtnetlink: provide api for getting and setting slave info") Signed-off-by: Sasha Levin Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ab293a3066b3..1385de0fa080 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2012,8 +2012,8 @@ replay: } if (1) { - struct nlattr *attr[ops ? ops->maxtype + 1 : 0]; - struct nlattr *slave_attr[m_ops ? m_ops->slave_maxtype + 1 : 0]; + struct nlattr *attr[ops ? ops->maxtype + 1 : 1]; + struct nlattr *slave_attr[m_ops ? m_ops->slave_maxtype + 1 : 1]; struct nlattr **data = NULL; struct nlattr **slave_data = NULL; struct net *dest_net, *link_net = NULL; -- cgit v1.2.3 From 41a50d621a321b4c15273cc1b5ed41437f4acdfb Mon Sep 17 00:00:00 2001 From: Alexander Drozdov Date: Tue, 24 Feb 2015 08:18:28 +0300 Subject: af_packet: don't pass empty blocks for PACKET_V3 Before da413eec729d ("packet: Fixed TPACKET V3 to signal poll when block is closed rather than every packet") poll listening for an af_packet socket was not signaled if there was no packets to process. After the patch poll is signaled evety time when block retire timer expires. That happens because af_packet closes the current block on timeout even if the block is empty. Passing empty blocks to the user not only wastes CPU but also wastes ring buffer space increasing probability of packets dropping on small timeouts. Signed-off-by: Alexander Drozdov Cc: Dan Collins Cc: Willem de Bruijn Cc: Guy Harris Signed-off-by: David S. Miller --- net/packet/af_packet.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 99fc628f7372..5bf1e968a728 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -698,6 +698,10 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data) if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { if (!frozen) { + if (!BLOCK_NUM_PKTS(pbd)) { + /* An empty block. Just refresh the timer. */ + goto refresh_timer; + } prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); if (!prb_dispatch_next_block(pkc, po)) goto refresh_timer; @@ -798,7 +802,11 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1, h1->ts_last_pkt.ts_sec = last_pkt->tp_sec; h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec; } else { - /* Ok, we tmo'd - so get the current time */ + /* Ok, we tmo'd - so get the current time. + * + * It shouldn't really happen as we don't close empty + * blocks. See prb_retire_rx_blk_timer_expired(). + */ struct timespec ts; getnstimeofday(&ts); h1->ts_last_pkt.ts_sec = ts.tv_sec; -- cgit v1.2.3 From 9c1c98a3bb7b7593b60264b9a07e001e68b46697 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 26 Feb 2015 15:50:50 +0200 Subject: mac80211: Send EAPOL frames at lowest rate The current minstrel_ht rate control behavior is somewhat optimistic in trying to find optimum TX rate. While this is usually fine for normal Data frames, there are cases where a more conservative set of retry parameters would be beneficial to make the connection more robust. EAPOL frames are critical to the authentication and especially the EAPOL-Key message 4/4 (the last message in the 4-way handshake) is important to get through to the AP. If that message is lost, the only recovery mechanism in many cases is to reassociate with the AP and start from scratch. This can often be avoided by trying to send the frame with more conservative rate and/or with more link layer retries. In most cases, minstrel_ht is currently using the initial EAPOL-Key frames for probing higher rates and this results in only five link layer transmission attempts (one at high(ish) MCS and four at MCS0). While this works with most APs, it looks like there are some deployed APs that may have issues with the EAPOL frames using HT MCS immediately after association. Similarly, there may be issues in cases where the signal strength or radio environment is not good enough to be able to get frames through even at couple of MCS 0 tries. The best approach for this would likely to be to reduce the TX rate for the last rate (3rd rate parameter in the set) to a low basic rate (say, 6 Mbps on 5 GHz and 2 or 5.5 Mbps on 2.4 GHz), but doing that cleanly requires some more effort. For now, we can start with a simple one-liner that forces the minimum rate to be used for EAPOL frames similarly how the TX rate is selected for the IEEE 802.11 Management frames. This does result in a small extra latency added to the cases where the AP would be able to receive the higher rate, but taken into account how small number of EAPOL frames are used, this is likely to be insignificant. A future optimization in the minstrel_ht design can also allow this patch to be reverted to get back to the more optimized initial TX rate. It should also be noted that many drivers that do not use minstrel as the rate control algorithm are already doing similar workarounds by forcing the lowest TX rate to be used for EAPOL frames. Cc: stable@vger.kernel.org Reported-by: Linus Torvalds Tested-by: Linus Torvalds Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 88a18ffe2975..07bd8db00af8 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -566,6 +566,7 @@ ieee80211_tx_h_check_control_port_protocol(struct ieee80211_tx_data *tx) if (tx->sdata->control_port_no_encrypt) info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; info->control.flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO; + info->flags |= IEEE80211_TX_CTL_USE_MINRATE; } return TX_CONTINUE; -- cgit v1.2.3 From 76cb4be993c03bf9ec65a58b13f12c679bb041e4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 24 Feb 2015 18:34:01 +0300 Subject: sunrpc: integer underflow in rsc_parse() If we call groups_alloc() with invalid values then it's might lead to memory corruption. For example, with a negative value then we might not allocate enough for sizeof(struct group_info). (We're doing this in the caller for consistency with other callers of groups_alloc(). The other alternative might be to move the check out of all the callers into groups_alloc().) Signed-off-by: Dan Carpenter Reviewed-by: Simo Sorce Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/svcauth_gss.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 224a82f24d3c..1095be9c80ab 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -463,6 +463,8 @@ static int rsc_parse(struct cache_detail *cd, /* number of additional gid's */ if (get_int(&mesg, &N)) goto out; + if (N < 0 || N > NGROUPS_MAX) + goto out; status = -ENOMEM; rsci.cred.cr_group_info = groups_alloc(N); if (rsci.cred.cr_group_info == NULL) -- cgit v1.2.3 From 4c4b52d9b2df45e8216d3e30b5452e4a364d2cac Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 25 Feb 2015 16:31:54 +0100 Subject: rhashtable: remove indirection for grow/shrink decision functions Currently, all real users of rhashtable default their grow and shrink decision functions to rht_grow_above_75() and rht_shrink_below_30(), so that there's currently no need to have this explicitly selectable. It can/should be generic and private inside rhashtable until a real use case pops up. Since we can make this private, we'll save us this additional indirection layer and can improve insertion/deletion time as well. Reference: http://patchwork.ozlabs.org/patch/443040/ Suggested-by: David S. Miller Signed-off-by: Daniel Borkmann Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 13 ----------- lib/rhashtable.c | 56 ++++++++++++++-------------------------------- lib/test_rhashtable.c | 1 + net/netfilter/nft_hash.c | 2 -- net/netlink/af_netlink.c | 2 -- net/tipc/socket.c | 2 -- 6 files changed, 18 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index cb2104be2135..d438eeb08bff 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -79,12 +79,6 @@ struct rhashtable; * @locks_mul: Number of bucket locks to allocate per cpu (default: 128) * @hashfn: Function to hash key * @obj_hashfn: Function to hash object - * @grow_decision: If defined, may return true if table should expand - * @shrink_decision: If defined, may return true if table should shrink - * - * Note: when implementing the grow and shrink decision function, min/max - * shift must be enforced, otherwise, resizing watermarks they set may be - * useless. */ struct rhashtable_params { size_t nelem_hint; @@ -98,10 +92,6 @@ struct rhashtable_params { size_t locks_mul; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; - bool (*grow_decision)(const struct rhashtable *ht, - size_t new_size); - bool (*shrink_decision)(const struct rhashtable *ht, - size_t new_size); }; /** @@ -193,9 +183,6 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params); void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node); bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node); -bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size); -bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size); - int rhashtable_expand(struct rhashtable *ht); int rhashtable_shrink(struct rhashtable *ht); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index bcf119bfdef4..090641db4c0d 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -247,26 +247,24 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, * @ht: hash table * @new_size: new table size */ -bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) +static bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) { /* Expand table when exceeding 75% load */ return atomic_read(&ht->nelems) > (new_size / 4 * 3) && (!ht->p.max_shift || atomic_read(&ht->shift) < ht->p.max_shift); } -EXPORT_SYMBOL_GPL(rht_grow_above_75); /** * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size * @ht: hash table * @new_size: new table size */ -bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) +static bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) { /* Shrink table beneath 30% load */ return atomic_read(&ht->nelems) < (new_size * 3 / 10) && (atomic_read(&ht->shift) > ht->p.min_shift); } -EXPORT_SYMBOL_GPL(rht_shrink_below_30); static void lock_buckets(struct bucket_table *new_tbl, struct bucket_table *old_tbl, unsigned int hash) @@ -528,40 +526,19 @@ static void rht_deferred_worker(struct work_struct *work) list_for_each_entry(walker, &ht->walkers, list) walker->resize = true; - if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) + if (rht_grow_above_75(ht, tbl->size)) rhashtable_expand(ht); - else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size)) + else if (rht_shrink_below_30(ht, tbl->size)) rhashtable_shrink(ht); - unlock: mutex_unlock(&ht->mutex); } -static void rhashtable_probe_expand(struct rhashtable *ht) -{ - const struct bucket_table *new_tbl = rht_dereference_rcu(ht->future_tbl, ht); - const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); - - /* Only adjust the table if no resizing is currently in progress. */ - if (tbl == new_tbl && ht->p.grow_decision && - ht->p.grow_decision(ht, tbl->size)) - schedule_work(&ht->run_work); -} - -static void rhashtable_probe_shrink(struct rhashtable *ht) -{ - const struct bucket_table *new_tbl = rht_dereference_rcu(ht->future_tbl, ht); - const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); - - /* Only adjust the table if no resizing is currently in progress. */ - if (tbl == new_tbl && ht->p.shrink_decision && - ht->p.shrink_decision(ht, tbl->size)) - schedule_work(&ht->run_work); -} - static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, - struct bucket_table *tbl, u32 hash) + struct bucket_table *tbl, + const struct bucket_table *old_tbl, u32 hash) { + bool no_resize_running = tbl == old_tbl; struct rhash_head *head; hash = rht_bucket_index(tbl, hash); @@ -577,8 +554,8 @@ static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, rcu_assign_pointer(tbl->buckets[hash], obj); atomic_inc(&ht->nelems); - - rhashtable_probe_expand(ht); + if (no_resize_running && rht_grow_above_75(ht, tbl->size)) + schedule_work(&ht->run_work); } /** @@ -608,7 +585,7 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) hash = obj_raw_hashfn(ht, rht_obj(ht, obj)); lock_buckets(tbl, old_tbl, hash); - __rhashtable_insert(ht, obj, tbl, hash); + __rhashtable_insert(ht, obj, tbl, old_tbl, hash); unlock_buckets(tbl, old_tbl, hash); rcu_read_unlock(); @@ -690,8 +667,11 @@ found: unlock_buckets(new_tbl, old_tbl, new_hash); if (ret) { + bool no_resize_running = new_tbl == old_tbl; + atomic_dec(&ht->nelems); - rhashtable_probe_shrink(ht); + if (no_resize_running && rht_shrink_below_30(ht, new_tbl->size)) + schedule_work(&ht->run_work); } rcu_read_unlock(); @@ -861,7 +841,7 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht, goto exit; } - __rhashtable_insert(ht, obj, new_tbl, new_hash); + __rhashtable_insert(ht, obj, new_tbl, old_tbl, new_hash); exit: unlock_buckets(new_tbl, old_tbl, new_hash); @@ -1123,8 +1103,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) if (!ht->p.hash_rnd) get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd)); - if (ht->p.grow_decision || ht->p.shrink_decision) - INIT_WORK(&ht->run_work, rht_deferred_worker); + INIT_WORK(&ht->run_work, rht_deferred_worker); return 0; } @@ -1142,8 +1121,7 @@ void rhashtable_destroy(struct rhashtable *ht) { ht->being_destroyed = true; - if (ht->p.grow_decision || ht->p.shrink_decision) - cancel_work_sync(&ht->run_work); + cancel_work_sync(&ht->run_work); mutex_lock(&ht->mutex); bucket_table_free(rht_dereference(ht->tbl, ht)); diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index f9e9d734446a..67c7593d1dd6 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -201,6 +201,7 @@ static int __init test_rht_init(void) .key_offset = offsetof(struct test_obj, value), .key_len = sizeof(int), .hashfn = jhash, + .max_shift = 1, /* we expand/shrink manually here */ .nulls_base = (3U << RHT_BASE_SHIFT), }; int err; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 61e6c407476a..c82df0a48fcd 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -192,8 +192,6 @@ static int nft_hash_init(const struct nft_set *set, .key_offset = offsetof(struct nft_hash_elem, key), .key_len = set->klen, .hashfn = jhash, - .grow_decision = rht_grow_above_75, - .shrink_decision = rht_shrink_below_30, }; return rhashtable_init(priv, ¶ms); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 2702673f0f23..05919bf3f670 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -3126,8 +3126,6 @@ static int __init netlink_proto_init(void) .key_len = sizeof(u32), /* portid */ .hashfn = jhash, .max_shift = 16, /* 64K */ - .grow_decision = rht_grow_above_75, - .shrink_decision = rht_shrink_below_30, }; if (err != 0) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index f73e975af80b..b4d4467d0bb0 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2364,8 +2364,6 @@ int tipc_sk_rht_init(struct net *net) .hashfn = jhash, .max_shift = 20, /* 1M */ .min_shift = 8, /* 256 */ - .grow_decision = rht_grow_above_75, - .shrink_decision = rht_shrink_below_30, }; return rhashtable_init(&tn->sk_rht, &rht_params); -- cgit v1.2.3 From 505ce4154ac86c250aa4a84a536dd9fc56479bb5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Feb 2015 16:19:00 -0600 Subject: net: Verify permission to dest_net in newlink When applicable verify that the caller has permision to create a network device in another network namespace. This check is already present when moving a network device between network namespaces in setlink so all that is needed is to duplicate that check in newlink. This change almost backports cleanly, but there are context conflicts as the code that follows was added in v4.0-rc1 Fixes: b51642f6d77b net: Enable a userns root rtnl calls that are safe for unprivilged users Signed-off-by: "Eric W. Biederman" Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1385de0fa080..b237959c7497 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2122,6 +2122,10 @@ replay: if (IS_ERR(dest_net)) return PTR_ERR(dest_net); + err = -EPERM; + if (!netlink_ns_capable(skb, dest_net->user_ns, CAP_NET_ADMIN)) + goto out; + if (tb[IFLA_LINK_NETNSID]) { int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); -- cgit v1.2.3 From 06615bed60c1fb7c37adddb75bdc80da873b5edb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Feb 2015 16:20:07 -0600 Subject: net: Verify permission to link_net in newlink When applicable verify that the caller has permisson to the underlying network namespace for a newly created network device. Similary checks exist for the network namespace a network device will be created in. Fixes: 317f4810e45e ("rtnl: allow to create device with IFLA_LINK_NETNSID set") Signed-off-by: "Eric W. Biederman" Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b237959c7497..2c49355d16c2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2134,6 +2134,9 @@ replay: err = -EINVAL; goto out; } + err = -EPERM; + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) + goto out; } dev = rtnl_create_link(link_net ? : dest_net, ifname, -- cgit v1.2.3 From cac5e65e8a7ea074f2626d2eaa53aa308452dec4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Feb 2015 09:42:50 -0800 Subject: net: do not use rcu in rtnl_dump_ifinfo() We did a failed attempt in the past to only use rcu in rtnl dump operations (commit e67f88dd12f6 "net: dont hold rtnl mutex during netlink dump callbacks") Now that dumps are holding RTNL anyway, there is no need to also use rcu locking, as it forbids any scheduling ability, like GFP_KERNEL allocations that controlling path should use instead of GFP_ATOMIC whenever possible. This should fix following splat Cong Wang reported : [ INFO: suspicious RCU usage. ] 3.19.0+ #805 Tainted: G W include/linux/rcupdate.h:538 Illegal context switch in RCU read-side critical section! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 2 locks held by ip/771: #0: (rtnl_mutex){+.+.+.}, at: [] netlink_dump+0x21/0x26c #1: (rcu_read_lock){......}, at: [] rcu_read_lock+0x0/0x6e stack backtrace: CPU: 3 PID: 771 Comm: ip Tainted: G W 3.19.0+ #805 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 0000000000000001 ffff8800d51e7718 ffffffff81a27457 0000000029e729e6 ffff8800d6108000 ffff8800d51e7748 ffffffff810b539b ffffffff820013dd 00000000000001c8 0000000000000000 ffff8800d7448088 ffff8800d51e7758 Call Trace: [] dump_stack+0x4c/0x65 [] lockdep_rcu_suspicious+0x107/0x110 [] rcu_preempt_sleep_check+0x45/0x47 [] ___might_sleep+0x1d/0x1cb [] __might_sleep+0x78/0x80 [] idr_alloc+0x45/0xd1 [] ? rcu_read_lock_held+0x3b/0x3d [] ? idr_for_each+0x53/0x101 [] alloc_netid+0x61/0x69 [] __peernet2id+0x79/0x8d [] peernet2id+0x13/0x1f [] rtnl_fill_ifinfo+0xa8d/0xc20 [] ? __lock_is_held+0x39/0x52 [] rtnl_dump_ifinfo+0x149/0x213 [] netlink_dump+0xef/0x26c [] netlink_recvmsg+0x17b/0x2c5 [] __sock_recvmsg+0x4e/0x59 [] sock_recvmsg+0x3f/0x51 [] ___sys_recvmsg+0xf6/0x1d9 [] ? handle_pte_fault+0x6e1/0xd3d [] ? native_sched_clock+0x35/0x37 [] ? sched_clock_local+0x12/0x72 [] ? sched_clock_cpu+0x9e/0xb7 [] ? rcu_read_lock_held+0x3b/0x3d [] ? __fcheck_files+0x4c/0x58 [] ? __fget_light+0x2d/0x52 [] __sys_recvmsg+0x42/0x60 [] SyS_recvmsg+0x12/0x1c Signed-off-by: Eric Dumazet Fixes: 0c7aecd4bde4b7302 ("netns: add rtnl cmd to add and get peer netns ids") Cc: Nicolas Dichtel Reported-by: Cong Wang Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2c49355d16c2..25b4b5d23485 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1300,7 +1300,6 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) s_h = cb->args[0]; s_idx = cb->args[1]; - rcu_read_lock(); cb->seq = net->dev_base_seq; /* A hack to preserve kernel<->userspace interface. @@ -1322,7 +1321,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; - hlist_for_each_entry_rcu(dev, head, index_hlist) { + hlist_for_each_entry(dev, head, index_hlist) { if (idx < s_idx) goto cont; err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, @@ -1344,7 +1343,6 @@ cont: } } out: - rcu_read_unlock(); cb->args[1] = idx; cb->args[0] = h; -- cgit v1.2.3 From 56b08fdcf637955d3023d769afd6cdabc526ba22 Mon Sep 17 00:00:00 2001 From: Arvid Brodin Date: Fri, 27 Feb 2015 21:26:03 +0100 Subject: net/hsr: Fix NULL pointer dereference and refcnt bugs when deleting a HSR interface. To repeat: $ sudo ip link del hsr0 BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 IP: [] hsr_del_port+0x15/0xa0 etc... Bug description: As part of the hsr master device destruction, hsr_del_port() is called for each of the hsr ports. At each such call, the master device is updated regarding features and mtu. When the master device is freed before the slave interfaces, master will be NULL in hsr_del_port(), which led to a NULL pointer dereference. Additionally, dev_put() was called on the master device itself in hsr_del_port(), causing a refcnt error. A third bug in the same code path was that the rtnl lock was not taken before hsr_del_port() was called as part of hsr_dev_destroy(). The reporter (Nicolas Dichtel) also said: "hsr_netdev_notify() supposes that the port will always be available when the notification is for an hsr interface. It's wrong. For example, netdev_wait_allrefs() may resend NETDEV_UNREGISTER.". As a precaution against this, a check for port == NULL was added in hsr_dev_notify(). Reported-by: Nicolas Dichtel Fixes: 51f3c605318b056a ("net/hsr: Move slave init to hsr_slave.c.") Signed-off-by: Arvid Brodin Signed-off-by: David S. Miller --- net/hsr/hsr_device.c | 3 +++ net/hsr/hsr_main.c | 4 ++++ net/hsr/hsr_slave.c | 10 +++++++--- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index a138d75751df..44d27469ae55 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -359,8 +359,11 @@ static void hsr_dev_destroy(struct net_device *hsr_dev) struct hsr_port *port; hsr = netdev_priv(hsr_dev); + + rtnl_lock(); hsr_for_each_port(hsr, port) hsr_del_port(port); + rtnl_unlock(); del_timer_sync(&hsr->prune_timer); del_timer_sync(&hsr->announce_timer); diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 779d28b65417..cd37d0011b42 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -36,6 +36,10 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, return NOTIFY_DONE; /* Not an HSR device */ hsr = netdev_priv(dev); port = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + if (port == NULL) { + /* Resend of notification concerning removed device? */ + return NOTIFY_DONE; + } } else { hsr = port->hsr; } diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index a348dcbcd683..7d37366cc695 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -181,8 +181,10 @@ void hsr_del_port(struct hsr_port *port) list_del_rcu(&port->port_list); if (port != master) { - netdev_update_features(master->dev); - dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); + if (master != NULL) { + netdev_update_features(master->dev); + dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); + } netdev_rx_handler_unregister(port->dev); dev_set_promiscuity(port->dev, -1); } @@ -192,5 +194,7 @@ void hsr_del_port(struct hsr_port *port) */ synchronize_rcu(); - dev_put(port->dev); + + if (port != master) + dev_put(port->dev); } -- cgit v1.2.3 From c03ae533a9c4de83a35105f9bfd7152d916b4680 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 28 Feb 2015 11:51:36 +0100 Subject: rxrpc: terminate retrans loop when sending of skb fails Typo, 'stop' is never set to true. Seems intent is to not attempt to retransmit more packets after sendmsg returns an error. This change is based on code inspection only. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/rxrpc/ar-ack.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c index c6be17a959a6..40404183a5da 100644 --- a/net/rxrpc/ar-ack.c +++ b/net/rxrpc/ar-ack.c @@ -218,7 +218,8 @@ static void rxrpc_resend(struct rxrpc_call *call) struct rxrpc_header *hdr; struct sk_buff *txb; unsigned long *p_txb, resend_at; - int loop, stop; + bool stop; + int loop; u8 resend; _enter("{%d,%d,%d,%d},", @@ -226,7 +227,7 @@ static void rxrpc_resend(struct rxrpc_call *call) atomic_read(&call->sequence), CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); - stop = 0; + stop = false; resend = 0; resend_at = 0; @@ -255,7 +256,7 @@ static void rxrpc_resend(struct rxrpc_call *call) _proto("Tx DATA %%%u { #%d }", ntohl(sp->hdr.serial), ntohl(sp->hdr.seq)); if (rxrpc_send_packet(call->conn->trans, txb) < 0) { - stop = 0; + stop = true; sp->resend_at = jiffies + 3; } else { sp->resend_at = -- cgit v1.2.3 From 765dd3bb44711b4ba36c1e06f9c4b7bfe73ffef7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 28 Feb 2015 11:51:37 +0100 Subject: rxrpc: don't multiply with HZ twice rxrpc_resend_timeout has an initial value of 4 * HZ; use it as-is. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/rxrpc/ar-ack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c index 40404183a5da..e0547f521f20 100644 --- a/net/rxrpc/ar-ack.c +++ b/net/rxrpc/ar-ack.c @@ -260,7 +260,7 @@ static void rxrpc_resend(struct rxrpc_call *call) sp->resend_at = jiffies + 3; } else { sp->resend_at = - jiffies + rxrpc_resend_timeout * HZ; + jiffies + rxrpc_resend_timeout; } } -- cgit v1.2.3 From acf8dd0a9d0b9e4cdb597c2f74802f79c699e802 Mon Sep 17 00:00:00 2001 From: Michal Kubeček Date: Mon, 2 Mar 2015 18:27:11 +0100 Subject: udp: only allow UFO for packets from SOCK_DGRAM sockets If an over-MTU UDP datagram is sent through a SOCK_RAW socket to a UFO-capable device, ip_ufo_append_data() sets skb->ip_summed to CHECKSUM_PARTIAL unconditionally as all GSO code assumes transport layer checksum is to be computed on segmentation. However, in this case, skb->csum_start and skb->csum_offset are never set as raw socket transmit path bypasses udp_send_skb() where they are usually set. As a result, driver may access invalid memory when trying to calculate the checksum and store the result (as observed in virtio_net driver). Moreover, the very idea of modifying the userspace provided UDP header is IMHO against raw socket semantics (I wasn't able to find a document clearly stating this or the opposite, though). And while allowing CHECKSUM_NONE in the UFO case would be more efficient, it would be a bit too intrusive change just to handle a corner case like this. Therefore disallowing UFO for packets from SOCK_DGRAM seems to be the best option. Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 3 ++- net/ipv6/ip6_output.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index d68199d9b2b0..a7aea2048a0d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -888,7 +888,8 @@ static int __ip_append_data(struct sock *sk, cork->length += length; if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && - (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) { + (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && + (sk->sk_type == SOCK_DGRAM)) { err = ip_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, maxfraglen, flags); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7deebf102cba..0a04a37305d5 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1298,7 +1298,8 @@ emsgsize: if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && - (rt->dst.dev->features & NETIF_F_UFO)) { + (rt->dst.dev->features & NETIF_F_UFO) && + (sk->sk_type == SOCK_DGRAM)) { err = ip6_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, mtu, flags, rt); -- cgit v1.2.3 From d0c22119f574b851e63360c6b8660fe9593bbc3c Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Mon, 2 Mar 2015 14:28:52 -0500 Subject: mac80211: drop unencrypted frames in mesh fwding The mesh forwarding path was not checking that data frames were protected when running an encrypted network; add the necessary check. Cc: stable@vger.kernel.org Reported-by: Johannes Berg Signed-off-by: Bob Copeland Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 1101563357ea..944bdc04e913 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2214,6 +2214,9 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) hdr = (struct ieee80211_hdr *) skb->data; mesh_hdr = (struct ieee80211s_hdr *) (skb->data + hdrlen); + if (ieee80211_drop_unencrypted(rx, hdr->frame_control)) + return RX_DROP_MONITOR; + /* frame is in RMC, don't forward */ if (ieee80211_is_data(hdr->frame_control) && is_multicast_ether_addr(hdr->addr1) && -- cgit v1.2.3 From aa75ebc275b2a91b193654a177daf900ad6703f0 Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Tue, 10 Feb 2015 12:48:44 +0100 Subject: mac80211: disable u-APSD queues by default Some APs experience problems when working with U-APSD. Decreasing the probability of that happening by using legacy mode for all ACs but VO isn't enough. Cisco 4410N originally forced us to enable VO by default only because it treated non-VO ACs as legacy. However some APs (notably Netgear R7000) silently reclassify packets to different ACs. Since u-APSD ACs require trigger frames for frame retrieval clients would never see some frames (e.g. ARP responses) or would fetch them accidentally after a long time. It makes little sense to enable u-APSD queues by default because it needs userspace applications to be aware of it to actually take advantage of the possible additional powersavings. Implicitly depending on driver autotrigger frame support doesn't make much sense. Cc: stable@vger.kernel.org Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 3afe36824703..c0e089c194f1 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -58,13 +58,24 @@ struct ieee80211_local; #define IEEE80211_UNSET_POWER_LEVEL INT_MIN /* - * Some APs experience problems when working with U-APSD. Decrease the - * probability of that happening by using legacy mode for all ACs but VO. - * The AP that caused us trouble was a Cisco 4410N. It ignores our - * setting, and always treats non-VO ACs as legacy. + * Some APs experience problems when working with U-APSD. Decreasing the + * probability of that happening by using legacy mode for all ACs but VO isn't + * enough. + * + * Cisco 4410N originally forced us to enable VO by default only because it + * treated non-VO ACs as legacy. + * + * However some APs (notably Netgear R7000) silently reclassify packets to + * different ACs. Since u-APSD ACs require trigger frames for frame retrieval + * clients would never see some frames (e.g. ARP responses) or would fetch them + * accidentally after a long time. + * + * It makes little sense to enable u-APSD queues by default because it needs + * userspace applications to be aware of it to actually take advantage of the + * possible additional powersavings. Implicitly depending on driver autotrigger + * frame support doesn't make much sense. */ -#define IEEE80211_DEFAULT_UAPSD_QUEUES \ - IEEE80211_WMM_IE_STA_QOSINFO_AC_VO +#define IEEE80211_DEFAULT_UAPSD_QUEUES 0 #define IEEE80211_DEFAULT_MAX_SP_LEN \ IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL -- cgit v1.2.3 From 71e168b151babf4334e2e26c92230a6bda3b1f24 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 3 Mar 2015 13:53:31 +0100 Subject: net: bridge: add compile-time assert for cb struct size make build fail if structure no longer fits into ->cb storage. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/bridge/br.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bridge/br.c b/net/bridge/br.c index fb57ab6b24f9..02c24cf63c34 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -190,6 +190,8 @@ static int __init br_init(void) { int err; + BUILD_BUG_ON(sizeof(struct br_input_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); + err = stp_proto_register(&br_stp_proto); if (err < 0) { pr_err("bridge: can't register sap for STP\n"); -- cgit v1.2.3 From f4f8e73850589008095b1da3c8c17cf68bd1c62a Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Mon, 2 Mar 2015 18:49:56 -0800 Subject: openvswitch: Fix serialization of non-masked set actions. Set actions consist of a regular OVS_KEY_ATTR_* attribute nested inside of a OVS_ACTION_ATTR_SET action attribute. When converting masked actions back to regular set actions, the inner attribute length was not changed, ie, double the length being serialized. This patch fixes the bug. Fixes: 83d2b9b ("net: openvswitch: Support masked set actions.") Signed-off-by: Joe Stringer Acked-by: Jarno Rajahalme Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 216f20b90aa5..22b18c145c92 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2253,14 +2253,20 @@ static int masked_set_action_to_set_action_attr(const struct nlattr *a, struct sk_buff *skb) { const struct nlattr *ovs_key = nla_data(a); + struct nlattr *nla; size_t key_len = nla_len(ovs_key) / 2; /* Revert the conversion we did from a non-masked set action to * masked set action. */ - if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a) - key_len, ovs_key)) + nla = nla_nest_start(skb, OVS_ACTION_ATTR_SET); + if (!nla) return -EMSGSIZE; + if (nla_put(skb, nla_type(ovs_key), key_len, nla_data(ovs_key))) + return -EMSGSIZE; + + nla_nest_end(skb, nla); return 0; } -- cgit v1.2.3 From 8670c3a55e91cb27a4b4d4d4c4fa35b0149e1abf Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 3 Mar 2015 20:04:18 +0000 Subject: netfilter: nf_tables: fix transaction race condition A race condition exists in the rule transaction code for rules that get added and removed within the same transaction. The new rule starts out as inactive in the current and active in the next generation and is inserted into the ruleset. When it is deleted, it is additionally set to inactive in the next generation as well. On commit the next generation is begun, then the actions are finalized. For the new rule this would mean clearing out the inactive bit for the previously current, now next generation. However nft_rule_clear() clears out the bits for *both* generations, activating the rule in the current generation, where it should be deactivated due to being deleted. The rule will thus be active until the deletion is finalized, removing the rule from the ruleset. Similarly, when aborting a transaction for the same case, the undo of insertion will remove it from the RCU protected rule list, the deletion will clear out all bits. However until the next RCU synchronization after all operations have been undone, the rule is active on CPUs which can still see the rule on the list. Generally, there may never be any modifications of the current generations' inactive bit since this defeats the entire purpose of atomicity. Change nft_rule_clear() to only touch the next generations bit to fix this. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a8c94620f20e..6fb532bf0fdb 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -227,7 +227,7 @@ nft_rule_deactivate_next(struct net *net, struct nft_rule *rule) static inline void nft_rule_clear(struct net *net, struct nft_rule *rule) { - rule->genmask = 0; + rule->genmask &= ~(1 << gencursor_next(net)); } static int -- cgit v1.2.3 From 9889840f5988ecfd43b00c9abb83c1804e21406b Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 3 Mar 2015 20:04:19 +0000 Subject: netfilter: nf_tables: check for overflow of rule dlen field Check that the space required for the expressions doesn't exceed the size of the dlen field, which would lead to the iterators crashing. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6fb532bf0fdb..7baafd5ab520 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1968,6 +1968,10 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, n++; } } + /* Check for overflow of dlen field */ + err = -EFBIG; + if (size >= 1 << 12) + goto err1; if (nla[NFTA_RULE_USERDATA]) ulen = nla_len(nla[NFTA_RULE_USERDATA]); -- cgit v1.2.3 From 86f1ec32318159a24de349f0a38e79b9d2b3131a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 3 Mar 2015 20:04:20 +0000 Subject: netfilter: nf_tables: fix userdata length overflow The NFT_USERDATA_MAXLEN is defined to 256, however we only have a u8 to store its size. Introduce a struct nft_userdata which contains a length field and indicate its presence using a single bit in the rule. The length field of struct nft_userdata is also a u8, however we don't store zero sized data, so the actual length is udata->len + 1. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 22 +++++++++++++++++++--- net/netfilter/nf_tables_api.c | 28 +++++++++++++++++++--------- 2 files changed, 38 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 9eaaa7884586..decb9a095ae7 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -119,6 +119,22 @@ int nft_validate_data_load(const struct nft_ctx *ctx, enum nft_registers reg, const struct nft_data *data, enum nft_data_types type); + +/** + * struct nft_userdata - user defined data associated with an object + * + * @len: length of the data + * @data: content + * + * The presence of user data is indicated in an object specific fashion, + * so a length of zero can't occur and the value "len" indicates data + * of length len + 1. + */ +struct nft_userdata { + u8 len; + unsigned char data[0]; +}; + /** * struct nft_set_elem - generic representation of set elements * @@ -380,7 +396,7 @@ static inline void *nft_expr_priv(const struct nft_expr *expr) * @handle: rule handle * @genmask: generation mask * @dlen: length of expression data - * @ulen: length of user data (used for comments) + * @udata: user data is appended to the rule * @data: expression data */ struct nft_rule { @@ -388,7 +404,7 @@ struct nft_rule { u64 handle:42, genmask:2, dlen:12, - ulen:8; + udata:1; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; @@ -476,7 +492,7 @@ static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule) return (struct nft_expr *)&rule->data[rule->dlen]; } -static inline void *nft_userdata(const struct nft_rule *rule) +static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) { return (void *)&rule->data[rule->dlen]; } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7baafd5ab520..74e4b876c96e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1711,9 +1711,12 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, } nla_nest_end(skb, list); - if (rule->ulen && - nla_put(skb, NFTA_RULE_USERDATA, rule->ulen, nft_userdata(rule))) - goto nla_put_failure; + if (rule->udata) { + struct nft_userdata *udata = nft_userdata(rule); + if (nla_put(skb, NFTA_RULE_USERDATA, udata->len + 1, + udata->data) < 0) + goto nla_put_failure; + } nlmsg_end(skb, nlh); return 0; @@ -1896,11 +1899,12 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule, *old_rule = NULL; + struct nft_userdata *udata; struct nft_trans *trans = NULL; struct nft_expr *expr; struct nft_ctx ctx; struct nlattr *tmp; - unsigned int size, i, n, ulen = 0; + unsigned int size, i, n, ulen = 0, usize = 0; int err, rem; bool create; u64 handle, pos_handle; @@ -1973,11 +1977,14 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, if (size >= 1 << 12) goto err1; - if (nla[NFTA_RULE_USERDATA]) + if (nla[NFTA_RULE_USERDATA]) { ulen = nla_len(nla[NFTA_RULE_USERDATA]); + if (ulen > 0) + usize = sizeof(struct nft_userdata) + ulen; + } err = -ENOMEM; - rule = kzalloc(sizeof(*rule) + size + ulen, GFP_KERNEL); + rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL); if (rule == NULL) goto err1; @@ -1985,10 +1992,13 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, rule->handle = handle; rule->dlen = size; - rule->ulen = ulen; + rule->udata = ulen ? 1 : 0; - if (ulen) - nla_memcpy(nft_userdata(rule), nla[NFTA_RULE_USERDATA], ulen); + if (ulen) { + udata = nft_userdata(rule); + udata->len = ulen - 1; + nla_memcpy(udata->data, nla[NFTA_RULE_USERDATA], ulen); + } expr = nft_expr_first(rule); for (i = 0; i < n; i++) { -- cgit v1.2.3 From 59900e0a019e7c2bdb7809a03ed5742d311b15b3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 4 Mar 2015 17:55:27 +0100 Subject: netfilter: nf_tables: fix error handling of rule replacement In general, if a transaction object is added to the list successfully, we can rely on the abort path to undo what we've done. This allows us to simplify the error handling of the rule replacement path in nf_tables_newrule(). This implicitly fixes an unnecessary removal of the old rule, which needs to be left in place if we fail to replace. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 74e4b876c96e..6ab777912237 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2045,12 +2045,6 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, err3: list_del_rcu(&rule->list); - if (trans) { - list_del_rcu(&nft_trans_rule(trans)->list); - nft_rule_clear(net, nft_trans_rule(trans)); - nft_trans_destroy(trans); - chain->use++; - } err2: nf_tables_rule_destroy(&ctx, rule); err1: -- cgit v1.2.3 From 9145736d4862145684009d6a72a6e61324a9439e Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Tue, 3 Mar 2015 23:16:16 +0900 Subject: net: ping: Return EAFNOSUPPORT when appropriate. 1. For an IPv4 ping socket, ping_check_bind_addr does not check the family of the socket address that's passed in. Instead, make it behave like inet_bind, which enforces either that the address family is AF_INET, or that the family is AF_UNSPEC and the address is 0.0.0.0. 2. For an IPv6 ping socket, ping_check_bind_addr returns EINVAL if the socket family is not AF_INET6. Return EAFNOSUPPORT instead, for consistency with inet6_bind. 3. Make ping_v4_sendmsg and ping_v6_sendmsg return EAFNOSUPPORT instead of EINVAL if an incorrect socket address structure is passed in. 4. Make IPv6 ping sockets be IPv6-only. The code does not support IPv4, and it cannot easily be made to support IPv4 because the protocol numbers for ICMP and ICMPv6 are different. This makes connect(::ffff:192.0.2.1) fail with EAFNOSUPPORT instead of making the socket unusable. Among other things, this fixes an oops that can be triggered by: int s = socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP); struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6, .sin6_addr = in6addr_any, }; bind(s, (struct sockaddr *) &sin6, sizeof(sin6)); Change-Id: If06ca86d9f1e4593c0d6df174caca3487c57a241 Signed-off-by: Lorenzo Colitti Signed-off-by: David S. Miller --- net/ipv4/ping.c | 12 ++++++++++-- net/ipv6/ping.c | 5 +++-- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index e9f66e1cda50..208d5439e59b 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -259,6 +259,9 @@ int ping_init_sock(struct sock *sk) kgid_t low, high; int ret = 0; + if (sk->sk_family == AF_INET6) + sk->sk_ipv6only = 1; + inet_get_ping_group_range_net(net, &low, &high); if (gid_lte(low, group) && gid_lte(group, high)) return 0; @@ -305,6 +308,11 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, if (addr_len < sizeof(*addr)) return -EINVAL; + if (addr->sin_family != AF_INET && + !(addr->sin_family == AF_UNSPEC && + addr->sin_addr.s_addr == htonl(INADDR_ANY))) + return -EAFNOSUPPORT; + pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); @@ -330,7 +338,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, return -EINVAL; if (addr->sin6_family != AF_INET6) - return -EINVAL; + return -EAFNOSUPPORT; pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n", sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port)); @@ -716,7 +724,7 @@ static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) - return -EINVAL; + return -EAFNOSUPPORT; daddr = usin->sin_addr.s_addr; /* no remote port */ } else { diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index bd46f736f61d..a2dfff6ff227 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -102,9 +102,10 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_in6 *, u, msg->msg_name); - if (msg->msg_namelen < sizeof(struct sockaddr_in6) || - u->sin6_family != AF_INET6) { + if (msg->msg_namelen < sizeof(*u)) return -EINVAL; + if (u->sin6_family != AF_INET6) { + return -EAFNOSUPPORT; } if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != u->sin6_scope_id) { -- cgit v1.2.3 From 3e32e733d1bbb3f227259dc782ef01d5706bdae0 Mon Sep 17 00:00:00 2001 From: Alexander Drozdov Date: Thu, 5 Mar 2015 10:29:39 +0300 Subject: ipv4: ip_check_defrag should not assume that skb_network_offset is zero ip_check_defrag() may be used by af_packet to defragment outgoing packets. skb_network_offset() of af_packet's outgoing packets is not zero. Signed-off-by: Alexander Drozdov Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 2c8d98e728c0..145a50c4d566 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -659,27 +659,30 @@ EXPORT_SYMBOL(ip_defrag); struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) { struct iphdr iph; + int netoff; u32 len; if (skb->protocol != htons(ETH_P_IP)) return skb; - if (skb_copy_bits(skb, 0, &iph, sizeof(iph)) < 0) + netoff = skb_network_offset(skb); + + if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0) return skb; if (iph.ihl < 5 || iph.version != 4) return skb; len = ntohs(iph.tot_len); - if (skb->len < len || len < (iph.ihl * 4)) + if (skb->len < netoff + len || len < (iph.ihl * 4)) return skb; if (ip_is_fragment(&iph)) { skb = skb_share_check(skb, GFP_ATOMIC); if (skb) { - if (!pskb_may_pull(skb, iph.ihl*4)) + if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) return skb; - if (pskb_trim_rcsum(skb, len)) + if (pskb_trim_rcsum(skb, netoff + len)) return skb; memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); if (ip_defrag(skb, user)) -- cgit v1.2.3 From 6c09fa09d468d730eecd7122122175da772d3b09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Mar 2015 08:03:06 -0800 Subject: tcp: align tcp_xmit_size_goal() on tcp_tso_autosize() With some mss values, it is possible tcp_xmit_size_goal() puts one segment more in TSO packet than tcp_tso_autosize(). We send then one TSO packet followed by one single MSS. It is not a serious bug, but we can do slightly better, especially for drivers using netif_set_gso_max_size() to lower gso_max_size. Using same formula avoids these corner cases and makes tcp_xmit_size_goal() a bit faster. Signed-off-by: Eric Dumazet Fixes: 605ad7f184b6 ("tcp: refine TSO autosizing") Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9d72a0fcd928..995a2259bcfc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -835,17 +835,13 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); - u32 new_size_goal, size_goal, hlen; + u32 new_size_goal, size_goal; if (!large_allowed || !sk_can_gso(sk)) return mss_now; - /* Maybe we should/could use sk->sk_prot->max_header here ? */ - hlen = inet_csk(sk)->icsk_af_ops->net_header_len + - inet_csk(sk)->icsk_ext_hdr_len + - tp->tcp_header_len; - - new_size_goal = sk->sk_gso_max_size - 1 - hlen; + /* Note : tcp_tso_autosize() will eventually split this later */ + new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER; new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal); /* We try hard to avoid divides here */ -- cgit v1.2.3 From 2c3fbe3cf28fbd7001545a92a83b4f8acfd9fa36 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 4 Mar 2015 10:39:03 +0100 Subject: net: irda: fix wait_until_sent poll timeout In case an infinite timeout (0) is requested, the irda wait_until_sent implementation would use a zero poll timeout rather than the default 200ms. Note that wait_until_sent is currently never called with a 0-timeout argument due to a bug in tty_wait_until_sent. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable # v2.6.12 Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- net/irda/ircomm/ircomm_tty.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index 40695b9751c1..4efe486baee6 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -798,7 +798,9 @@ static void ircomm_tty_wait_until_sent(struct tty_struct *tty, int timeout) orig_jiffies = jiffies; /* Set poll time to 200 ms */ - poll_time = IRDA_MIN(timeout, msecs_to_jiffies(200)); + poll_time = msecs_to_jiffies(200); + if (timeout) + poll_time = min_t(unsigned long, timeout, poll_time); spin_lock_irqsave(&self->spinlock, flags); while (self->tx_skb && self->tx_skb->len) { -- cgit v1.2.3 From 1711fd9addf214823b993468567cab1f8254fc51 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 7 Mar 2015 21:08:46 +0000 Subject: sunrpc: fix braino in ->poll() POLL_OUT isn't what callers of ->poll() are expecting to see; it's actually __SI_POLL | 2 and it's a siginfo code, not a poll bitmap bit... Signed-off-by: Al Viro Cc: stable@vger.kernel.org Cc: Bruce Fields Signed-off-by: Linus Torvalds --- net/sunrpc/cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 33fb105d4352..5199bb1a017e 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -921,7 +921,7 @@ static unsigned int cache_poll(struct file *filp, poll_table *wait, poll_wait(filp, &queue_wait, wait); /* alway allow write */ - mask = POLL_OUT | POLLWRNORM; + mask = POLLOUT | POLLWRNORM; if (!rp) return mask; -- cgit v1.2.3 From c247f0534cc5a5a547a343903f42295a471844e2 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sat, 7 Mar 2015 20:33:22 -0500 Subject: ip: fix error queue empty skb handling When reading from the error queue, msg_name and msg_control are only populated for some errors. A new exception for empty timestamp skbs added a false positive on icmp errors without payload. `traceroute -M udpconn` only displayed gateways that return payload with the icmp error: the embedded network headers are pulled before sock_queue_err_skb, leaving an skb with skb->len == 0 otherwise. Fix this regression by refining when msg_name and msg_control branches are taken. The solutions for the two fields are independent. msg_name only makes sense for errors that configure serr->port and serr->addr_offset. Test the first instead of skb->len. This also fixes another issue. saddr could hold the wrong data, as serr->addr_offset is not initialized in some code paths, pointing to the start of the network header. It is only valid when serr->port is set (non-zero). msg_control support differs between IPv4 and IPv6. IPv4 only honors requests for ICMP and timestamps with SOF_TIMESTAMPING_OPT_CMSG. The skb->len test can simply be removed, because skb->dev is also tested and never true for empty skbs. IPv6 honors requests for all errors aside from local errors and timestamps on empty skbs. In both cases, make the policy more explicit by moving this logic to a new function that decides whether to process msg_control and that optionally prepares the necessary fields in skb->cb[]. After this change, the IPv4 and IPv6 paths are more similar. The last case is rxrpc. Here, simply refine to only match timestamps. Fixes: 49ca0d8bfaf3 ("net-timestamp: no-payload option") Reported-by: Jan Niehusmann Signed-off-by: Willem de Bruijn ---- Changes v1->v2 - fix local origin test inversion in ip6_datagram_support_cmsg - make v4 and v6 code paths more similar by introducing analogous ipv4_datagram_support_cmsg - fix compile bug in rxrpc Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 33 +++++++++++++++++++++++---------- net/ipv6/datagram.c | 39 ++++++++++++++++++++++++++++----------- net/rxrpc/ar-error.c | 4 ++-- 3 files changed, 53 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 31d8c71986b4..5cd99271d3a6 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -432,17 +432,32 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf kfree_skb(skb); } -static bool ipv4_pktinfo_prepare_errqueue(const struct sock *sk, - const struct sk_buff *skb, - int ee_origin) +/* IPv4 supports cmsg on all imcp errors and some timestamps + * + * Timestamp code paths do not initialize the fields expected by cmsg: + * the PKTINFO fields in skb->cb[]. Fill those in here. + */ +static bool ipv4_datagram_support_cmsg(const struct sock *sk, + struct sk_buff *skb, + int ee_origin) { - struct in_pktinfo *info = PKTINFO_SKB_CB(skb); + struct in_pktinfo *info; + + if (ee_origin == SO_EE_ORIGIN_ICMP) + return true; - if ((ee_origin != SO_EE_ORIGIN_TIMESTAMPING) || - (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) || + if (ee_origin == SO_EE_ORIGIN_LOCAL) + return false; + + /* Support IP_PKTINFO on tstamp packets if requested, to correlate + * timestamp with egress dev. Not possible for packets without dev + * or without payload (SOF_TIMESTAMPING_OPT_TSONLY). + */ + if ((!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) || (!skb->dev)) return false; + info = PKTINFO_SKB_CB(skb); info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr; info->ipi_ifindex = skb->dev->ifindex; return true; @@ -483,7 +498,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - if (sin && skb->len) { + if (sin && serr->port) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); @@ -496,9 +511,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); - if (skb->len && - (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || - ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin))) { + if (ipv4_datagram_support_cmsg(sk, skb, serr->ee.ee_origin)) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; if (inet_sk(sk)->cmsg_flags) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index c215be70cac0..ace8daca5c83 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -325,14 +325,34 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) kfree_skb(skb); } -static void ip6_datagram_prepare_pktinfo_errqueue(struct sk_buff *skb) +/* IPv6 supports cmsg on all origins aside from SO_EE_ORIGIN_LOCAL. + * + * At one point, excluding local errors was a quick test to identify icmp/icmp6 + * errors. This is no longer true, but the test remained, so the v6 stack, + * unlike v4, also honors cmsg requests on all wifi and timestamp errors. + * + * Timestamp code paths do not initialize the fields expected by cmsg: + * the PKTINFO fields in skb->cb[]. Fill those in here. + */ +static bool ip6_datagram_support_cmsg(struct sk_buff *skb, + struct sock_exterr_skb *serr) { - int ifindex = skb->dev ? skb->dev->ifindex : -1; + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) + return true; + + if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) + return false; + + if (!skb->dev) + return false; if (skb->protocol == htons(ETH_P_IPV6)) - IP6CB(skb)->iif = ifindex; + IP6CB(skb)->iif = skb->dev->ifindex; else - PKTINFO_SKB_CB(skb)->ipi_ifindex = ifindex; + PKTINFO_SKB_CB(skb)->ipi_ifindex = skb->dev->ifindex; + + return true; } /* @@ -369,7 +389,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - if (sin && skb->len) { + if (sin && serr->port) { const unsigned char *nh = skb_network_header(skb); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; @@ -394,14 +414,11 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); - if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL && skb->len) { + + if (ip6_datagram_support_cmsg(skb, serr)) { sin->sin6_family = AF_INET6; - if (np->rxopt.all) { - if (serr->ee.ee_origin != SO_EE_ORIGIN_ICMP && - serr->ee.ee_origin != SO_EE_ORIGIN_ICMP6) - ip6_datagram_prepare_pktinfo_errqueue(skb); + if (np->rxopt.all) ip6_datagram_recv_common_ctl(sk, msg, skb); - } if (skb->protocol == htons(ETH_P_IPV6)) { sin->sin6_addr = ipv6_hdr(skb)->saddr; if (np->rxopt.all) diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c index 5394b6be46ec..0610efa83d72 100644 --- a/net/rxrpc/ar-error.c +++ b/net/rxrpc/ar-error.c @@ -42,7 +42,8 @@ void rxrpc_UDP_error_report(struct sock *sk) _leave("UDP socket errqueue empty"); return; } - if (!skb->len) { + serr = SKB_EXT_ERR(skb); + if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { _leave("UDP empty message"); kfree_skb(skb); return; @@ -50,7 +51,6 @@ void rxrpc_UDP_error_report(struct sock *sk) rxrpc_new_skb(skb); - serr = SKB_EXT_ERR(skb); addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); port = serr->port; -- cgit v1.2.3 From 969439016d2cf61fef53a973d7e6d2061c3793b1 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 23 Feb 2015 20:37:54 +0100 Subject: can: add missing initialisations in CAN related skbuffs When accessing CAN network interfaces with AF_PACKET sockets e.g. by dhclient this can lead to a skb_under_panic due to missing skb initialisations. Add the missing initialisations at the CAN skbuff creation times on driver level (rx path) and in the network layer (tx path). Reported-by: Austin Schuh Reported-by: Daniel Steer Signed-off-by: Oliver Hartkopp Cc: linux-stable Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev.c | 8 ++++++++ net/can/af_can.c | 3 +++ 2 files changed, 11 insertions(+) (limited to 'net') diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 3c82e02e3dae..b0f69248cb71 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -579,6 +579,10 @@ struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf) skb->pkt_type = PACKET_BROADCAST; skb->ip_summed = CHECKSUM_UNNECESSARY; + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; @@ -603,6 +607,10 @@ struct sk_buff *alloc_canfd_skb(struct net_device *dev, skb->pkt_type = PACKET_BROADCAST; skb->ip_summed = CHECKSUM_UNNECESSARY; + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; diff --git a/net/can/af_can.c b/net/can/af_can.c index 66e08040ced7..32d710eaf1fc 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -259,6 +259,9 @@ int can_send(struct sk_buff *skb, int loop) goto inval_skb; } + skb->ip_summed = CHECKSUM_UNNECESSARY; + + skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); -- cgit v1.2.3 From 82f17091e68254d1612b42cf23291cad63cfaf04 Mon Sep 17 00:00:00 2001 From: Francesco Ruggeri Date: Mon, 9 Mar 2015 11:51:04 -0700 Subject: net: delete stale packet_mclist entries When an interface is deleted from a net namespace the ifindex in the corresponding entries in PF_PACKET sockets' mclists becomes stale. This can create inconsistencies if later an interface with the same ifindex is moved from a different namespace (not that unlikely since ifindexes are per-namespace). In particular we saw problems with dev->promiscuity, resulting in "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken" warnings and EOVERFLOW failures of setsockopt(PACKET_ADD_MEMBERSHIP). This patch deletes the mclist entries for interfaces that are deleted. Since this now causes setsockopt(PACKET_DROP_MEMBERSHIP) to fail with EADDRNOTAVAIL if called after the interface is deleted, also make packet_mc_drop not fail. Signed-off-by: Francesco Ruggeri Signed-off-by: David S. Miller --- net/packet/af_packet.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5bf1e968a728..f8db7064d81c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3123,11 +3123,18 @@ static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, return 0; } -static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what) +static void packet_dev_mclist_delete(struct net_device *dev, + struct packet_mclist **mlp) { - for ( ; i; i = i->next) { - if (i->ifindex == dev->ifindex) - packet_dev_mc(dev, i, what); + struct packet_mclist *ml; + + while ((ml = *mlp) != NULL) { + if (ml->ifindex == dev->ifindex) { + packet_dev_mc(dev, ml, -1); + *mlp = ml->next; + kfree(ml); + } else + mlp = &ml->next; } } @@ -3204,12 +3211,11 @@ static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) packet_dev_mc(dev, ml, -1); kfree(ml); } - rtnl_unlock(); - return 0; + break; } } rtnl_unlock(); - return -EADDRNOTAVAIL; + return 0; } static void packet_flush_mclist(struct sock *sk) @@ -3559,7 +3565,7 @@ static int packet_notifier(struct notifier_block *this, switch (msg) { case NETDEV_UNREGISTER: if (po->mclist) - packet_dev_mclist(dev, po->mclist, -1); + packet_dev_mclist_delete(dev, &po->mclist); /* fallthrough */ case NETDEV_DOWN: -- cgit v1.2.3 From e6441bae326271090755e1707196ad05aa1dc703 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Mon, 9 Mar 2015 16:16:22 -0400 Subject: tipc: fix bug in link failover handling In commit c637c1035534867b85b78b453c38c495b58e2c5a ("tipc: resolve race problem at unicast message reception") we introduced a new mechanism for delivering buffers upwards from link to socket layer. That code contains a bug in how we handle the new link input queue during failover. When a link is reset, some of its users may be blocked because of congestion, and in order to resolve this, we add any pending wakeup pseudo messages to the link's input queue, and deliver them to the socket. This misses the case where the other, remaining link also may have congested users. Currently, the owner node's reference to the remaining link's input queue is unconditionally overwritten by the reset link's input queue. This has the effect that wakeup events from the remaining link may be unduely delayed (but not lost) for a potentially long period. We fix this by adding the pending events from the reset link to the input queue that is currently referenced by the node, whichever one it is. This commit should be applied to both net and net-next. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index a4cf364316de..14f09b3cb87c 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -464,10 +464,11 @@ void tipc_link_reset(struct tipc_link *l_ptr) /* Clean up all queues, except inputq: */ __skb_queue_purge(&l_ptr->outqueue); __skb_queue_purge(&l_ptr->deferred_queue); - skb_queue_splice_init(&l_ptr->wakeupq, &l_ptr->inputq); - if (!skb_queue_empty(&l_ptr->inputq)) + if (!owner->inputq) + owner->inputq = &l_ptr->inputq; + skb_queue_splice_init(&l_ptr->wakeupq, owner->inputq); + if (!skb_queue_empty(owner->inputq)) owner->action_flags |= TIPC_MSG_EVT; - owner->inputq = &l_ptr->inputq; l_ptr->next_out = NULL; l_ptr->unacked_window = 0; l_ptr->checkpoint = 1; -- cgit v1.2.3 From 5778d39d070b4ac5f889928175b7f2d53ae7504e Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 9 Mar 2015 17:03:40 -0700 Subject: net_sched: fix struct tc_u_hnode layout in u32 We dynamically allocate divisor+1 entries for ->ht[] in tc_u_hnode: ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL); So ->ht is supposed to be the last field of this struct, however this is broken, since an rcu head is appended after it. Fixes: 1ce87720d456 ("net: sched: make cls_u32 lockless") Cc: Jamal Hadi Salim Cc: John Fastabend Signed-off-by: Cong Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 09487afbfd51..95fdf4e40051 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -78,8 +78,11 @@ struct tc_u_hnode { struct tc_u_common *tp_c; int refcnt; unsigned int divisor; - struct tc_u_knode __rcu *ht[1]; struct rcu_head rcu; + /* The 'ht' field MUST be the last field in structure to allow for + * more entries allocated at end of structure. + */ + struct tc_u_knode __rcu *ht[1]; }; struct tc_u_common { -- cgit v1.2.3 From 7768eed8bf1d2e5eefa38c573f15f737a9824052 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 10 Mar 2015 19:03:46 +0100 Subject: net: add comment for sock_efree() usage Signed-off-by: Oliver Hartkopp Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/sock.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 93c8b20c91e4..78e89eb7eb70 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1655,6 +1655,10 @@ void sock_rfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_rfree); +/* + * Buffer destructor for skbs that are not used directly in read or write + * path, e.g. for error handler skbs. Automatically called from kfree_skb. + */ void sock_efree(struct sk_buff *skb) { sock_put(skb->sk); -- cgit v1.2.3 From 4363890079674db7b00cf1bb0e6fa430e846e86b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 10 Mar 2015 21:58:32 -0400 Subject: net: Handle unregister properly when netdev namespace change fails. If rtnl_newlink() fails on it's call to dev_change_net_namespace(), we have to make use of the ->dellink() method, if present, just like we do when rtnl_configure_link() fails. Fixes: 317f4810e45e ("rtnl: allow to create device with IFLA_LINK_NETNSID set") Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 25b4b5d23485..ee0608bb3bc0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2166,28 +2166,28 @@ replay: } } err = rtnl_configure_link(dev, ifm); - if (err < 0) { - if (ops->newlink) { - LIST_HEAD(list_kill); - - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - } else { - unregister_netdevice(dev); - } - goto out; - } - + if (err < 0) + goto out_unregister; if (link_net) { err = dev_change_net_namespace(dev, dest_net, ifname); if (err < 0) - unregister_netdevice(dev); + goto out_unregister; } out: if (link_net) put_net(link_net); put_net(dest_net); return err; +out_unregister: + if (ops->newlink) { + LIST_HEAD(list_kill); + + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + } else { + unregister_netdevice(dev); + } + goto out; } } -- cgit v1.2.3 From 9949afa42be0b76f5832db112ce51bb6b35b2abb Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 10 Mar 2015 17:17:03 -0400 Subject: tcp: fix tcp_cong_avoid_ai() credit accumulation bug with decreases in w The recent change to tcp_cong_avoid_ai() to handle stretch ACKs introduced a bug where snd_cwnd_cnt could accumulate a very large value while w was large, and then if w was reduced snd_cwnd could be incremented by a large delta, leading to a large burst and high packet loss. This was tickled when CUBIC's bictcp_update() sets "ca->cnt = 100 * cwnd". This bug crept in while preparing the upstream version of 814d488c6126. Testing: This patch has been tested in datacenter netperf transfers and live youtube.com and google.com servers. Fixes: 814d488c6126 ("tcp: fix the timid additive increase on stretch ACKs") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_cong.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index d694088214cd..62856e185a93 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -378,6 +378,12 @@ EXPORT_SYMBOL_GPL(tcp_slow_start); */ void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { + /* If credits accumulated at a higher w, apply them gently now. */ + if (tp->snd_cwnd_cnt >= w) { + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd++; + } + tp->snd_cwnd_cnt += acked; if (tp->snd_cwnd_cnt >= w) { u32 delta = tp->snd_cwnd_cnt / w; -- cgit v1.2.3 From d578e18ce93f5d33a7120fd57c453e22a4c0fc37 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 10 Mar 2015 17:17:04 -0400 Subject: tcp: restore 1.5x per RTT limit to CUBIC cwnd growth in congestion avoidance Commit 814d488c6126 ("tcp: fix the timid additive increase on stretch ACKs") fixed a bug where tcp_cong_avoid_ai() would either credit a connection with an increase of snd_cwnd_cnt, or increase snd_cwnd, but not both, resulting in cwnd increasing by 1 packet on at most every alternate invocation of tcp_cong_avoid_ai(). Although the commit correctly implemented the CUBIC algorithm, which can increase cwnd by as much as 1 packet per 1 packet ACKed (2x per RTT), in practice that could be too aggressive: in tests on network paths with small buffers, YouTube server retransmission rates nearly doubled. This commit restores CUBIC to a maximum cwnd growth rate of 1 packet per 2 packets ACKed (1.5x per RTT). In YouTube tests this restored retransmit rates to low levels. Testing: This patch has been tested in datacenter netperf transfers and live youtube.com and google.com servers. Fixes: 9cd981dcf174 ("tcp: fix stretch ACK bugs in CUBIC") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 4b276d1ed980..06d3d665a9fd 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -306,8 +306,10 @@ tcp_friendliness: } } - if (ca->cnt == 0) /* cannot be zero */ - ca->cnt = 1; + /* The maximum rate of cwnd increase CUBIC allows is 1 packet per + * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT. + */ + ca->cnt = max(ca->cnt, 2U); } static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) -- cgit v1.2.3 From b1cb59cf2efe7971d3d72a7b963d09a512d994c9 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Wed, 11 Mar 2015 14:29:17 +0300 Subject: net: sysctl_net_core: check SNDBUF and RCVBUF for min length sysctl has sysctl.net.core.rmem_*/wmem_* parameters which can be set to incorrect values. Given that 'struct sk_buff' allocates from rcvbuf, incorrectly set buffer length could result to memory allocation failures. For example, set them as follows: # sysctl net.core.rmem_default=64 net.core.wmem_default = 64 # sysctl net.core.wmem_default=64 net.core.wmem_default = 64 # ping localhost -s 1024 -i 0 > /dev/null This could result to the following failure: skbuff: skb_over_panic: text:ffffffff81628db4 len:-32 put:-32 head:ffff88003a1cc200 data:ffff88003a1cc200 tail:0xffffffe0 end:0xc0 dev: kernel BUG at net/core/skbuff.c:102! invalid opcode: 0000 [#1] SMP ... task: ffff88003b7f5550 ti: ffff88003ae88000 task.ti: ffff88003ae88000 RIP: 0010:[] [] skb_put+0xa1/0xb0 RSP: 0018:ffff88003ae8bc68 EFLAGS: 00010296 RAX: 000000000000008d RBX: 00000000ffffffe0 RCX: 0000000000000000 RDX: ffff88003fdcf598 RSI: ffff88003fdcd9c8 RDI: ffff88003fdcd9c8 RBP: ffff88003ae8bc88 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: 00000000000002b2 R12: 0000000000000000 R13: 0000000000000000 R14: ffff88003d3f7300 R15: ffff88000012a900 FS: 00007fa0e2b4a840(0000) GS:ffff88003fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000d0f7e0 CR3: 000000003b8fb000 CR4: 00000000000006f0 Stack: ffff88003a1cc200 00000000ffffffe0 00000000000000c0 ffffffff818cab1d ffff88003ae8bd68 ffffffff81628db4 ffff88003ae8bd48 ffff88003b7f5550 ffff880031a09408 ffff88003b7f5550 ffff88000012aa48 ffff88000012ab00 Call Trace: [] unix_stream_sendmsg+0x2c4/0x470 [] sock_write_iter+0x146/0x160 [] new_sync_write+0x92/0xd0 [] vfs_write+0xd6/0x180 [] SyS_write+0x59/0xd0 [] system_call_fastpath+0x12/0x17 Code: 00 00 48 89 44 24 10 8b 87 c8 00 00 00 48 89 44 24 08 48 8b 87 d8 00 00 00 48 c7 c7 30 db 91 81 48 89 04 24 31 c0 e8 4f a8 0e 00 <0f> 0b eb fe 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 RIP [] skb_put+0xa1/0xb0 RSP Kernel panic - not syncing: Fatal exception Moreover, the possible minimum is 1, so we can get another kernel panic: ... BUG: unable to handle kernel paging request at ffff88013caee5c0 IP: [] __alloc_skb+0x12f/0x1f0 ... Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller --- net/core/sysctl_net_core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 433424804284..8ce351ffceb1 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -25,6 +25,8 @@ static int zero = 0; static int one = 1; static int ushort_max = USHRT_MAX; +static int min_sndbuf = SOCK_MIN_SNDBUF; +static int min_rcvbuf = SOCK_MIN_RCVBUF; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -237,7 +239,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_sndbuf, }, { .procname = "rmem_max", @@ -245,7 +247,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_rcvbuf, }, { .procname = "wmem_default", @@ -253,7 +255,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_sndbuf, }, { .procname = "rmem_default", @@ -261,7 +263,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_rcvbuf, }, { .procname = "dev_weight", -- cgit v1.2.3 From c29390c6dfeee0944ac6b5610ebbe403944378fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Mar 2015 18:42:02 -0700 Subject: xps: must clear sender_cpu before forwarding John reported that my previous commit added a regression on his router. This is because sender_cpu & napi_id share a common location, so get_xps_queue() can see garbage and perform an out of bound access. We need to make sure sender_cpu is cleared before doing the transmit, otherwise any NIC busy poll enabled (skb_mark_napi_id()) can trigger this bug. Signed-off-by: Eric Dumazet Reported-by: John Bisected-by: John Fixes: 2bd82484bb4c ("xps: fix xps for stacked devices") Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 +++++++ net/core/skbuff.c | 2 +- net/ipv4/ip_forward.c | 1 + net/ipv6/ip6_output.c | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 30007afe70b3..f54d6659713a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -948,6 +948,13 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) to->l4_hash = from->l4_hash; }; +static inline void skb_sender_cpu_clear(struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + skb->sender_cpu = 0; +#endif +} + #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f80507823531..434e78e5254d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4173,7 +4173,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->ignore_df = 0; skb_dst_drop(skb); skb->mark = 0; - skb->sender_cpu = 0; + skb_sender_cpu_clear(skb); skb_init_secmark(skb); secpath_reset(skb); nf_reset(skb); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 787b3c294ce6..d9bc28ac5d1b 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -67,6 +67,7 @@ static int ip_forward_finish(struct sk_buff *skb) if (unlikely(opt->optlen)) ip_forward_options(skb); + skb_sender_cpu_clear(skb); return dst_output(skb); } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0a04a37305d5..7e80b61b51ff 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -318,6 +318,7 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) static inline int ip6_forward_finish(struct sk_buff *skb) { + skb_sender_cpu_clear(skb); return dst_output(skb); } -- cgit v1.2.3 From 3a8dd9711e0792f64394edafadd66c2d1f1904df Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 11 Mar 2015 15:43:55 -0400 Subject: sock: fix possible NULL sk dereference in __skb_tstamp_tx Test that sk != NULL before reading sk->sk_tsflags. Fixes: 49ca0d8bfaf3 ("net-timestamp: no-payload option") Reported-by: One Thousand Gnomes Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 434e78e5254d..8e4ac97c8477 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3733,9 +3733,13 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, struct sock *sk, int tstype) { struct sk_buff *skb; - bool tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; + bool tsonly; - if (!sk || !skb_may_tx_timestamp(sk, tsonly)) + if (!sk) + return; + + tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; + if (!skb_may_tx_timestamp(sk, tsonly)) return; if (tsonly) -- cgit v1.2.3 From f862e07cf95d5b62a5fc5e981dd7d0dbaf33a501 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Mar 2015 22:46:59 +0100 Subject: rds: avoid potential stack overflow The rds_iw_update_cm_id function stores a large 'struct rds_sock' object on the stack in order to pass a pair of addresses. This happens to just fit withint the 1024 byte stack size warning limit on x86, but just exceed that limit on ARM, which gives us this warning: net/rds/iw_rdma.c:200:1: warning: the frame size of 1056 bytes is larger than 1024 bytes [-Wframe-larger-than=] As the use of this large variable is basically bogus, we can rearrange the code to not do that. Instead of passing an rds socket into rds_iw_get_device, we now just pass the two addresses that we have available in rds_iw_update_cm_id, and we change rds_iw_get_mr accordingly, to create two address structures on the stack there. Signed-off-by: Arnd Bergmann Acked-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/iw_rdma.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c index a817705ce2d0..dba8d0864f18 100644 --- a/net/rds/iw_rdma.c +++ b/net/rds/iw_rdma.c @@ -88,7 +88,9 @@ static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, int *unpinned); static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); -static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id) +static int rds_iw_get_device(struct sockaddr_in *src, struct sockaddr_in *dst, + struct rds_iw_device **rds_iwdev, + struct rdma_cm_id **cm_id) { struct rds_iw_device *iwdev; struct rds_iw_cm_id *i_cm_id; @@ -112,15 +114,15 @@ static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwd src_addr->sin_port, dst_addr->sin_addr.s_addr, dst_addr->sin_port, - rs->rs_bound_addr, - rs->rs_bound_port, - rs->rs_conn_addr, - rs->rs_conn_port); + src->sin_addr.s_addr, + src->sin_port, + dst->sin_addr.s_addr, + dst->sin_port); #ifdef WORKING_TUPLE_DETECTION - if (src_addr->sin_addr.s_addr == rs->rs_bound_addr && - src_addr->sin_port == rs->rs_bound_port && - dst_addr->sin_addr.s_addr == rs->rs_conn_addr && - dst_addr->sin_port == rs->rs_conn_port) { + if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr && + src_addr->sin_port == src->sin_port && + dst_addr->sin_addr.s_addr == dst->sin_addr.s_addr && + dst_addr->sin_port == dst->sin_port) { #else /* FIXME - needs to compare the local and remote * ipaddr/port tuple, but the ipaddr is the only @@ -128,7 +130,7 @@ static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwd * zero'ed. It doesn't appear to be properly populated * during connection setup... */ - if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) { + if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr) { #endif spin_unlock_irq(&iwdev->spinlock); *rds_iwdev = iwdev; @@ -180,19 +182,13 @@ int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_i { struct sockaddr_in *src_addr, *dst_addr; struct rds_iw_device *rds_iwdev_old; - struct rds_sock rs; struct rdma_cm_id *pcm_id; int rc; src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr; dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr; - rs.rs_bound_addr = src_addr->sin_addr.s_addr; - rs.rs_bound_port = src_addr->sin_port; - rs.rs_conn_addr = dst_addr->sin_addr.s_addr; - rs.rs_conn_port = dst_addr->sin_port; - - rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id); + rc = rds_iw_get_device(src_addr, dst_addr, &rds_iwdev_old, &pcm_id); if (rc) rds_iw_remove_cm_id(rds_iwdev, cm_id); @@ -598,9 +594,17 @@ void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_iw_device *rds_iwdev; struct rds_iw_mr *ibmr = NULL; struct rdma_cm_id *cm_id; + struct sockaddr_in src = { + .sin_addr.s_addr = rs->rs_bound_addr, + .sin_port = rs->rs_bound_port, + }; + struct sockaddr_in dst = { + .sin_addr.s_addr = rs->rs_conn_addr, + .sin_port = rs->rs_conn_port, + }; int ret; - ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id); + ret = rds_iw_get_device(&src, &dst, &rds_iwdev, &cm_id); if (ret || !cm_id) { ret = -ENODEV; goto out; -- cgit v1.2.3 From 78146572b9cd20452da47951812f35b1ad4906be Mon Sep 17 00:00:00 2001 From: Ian Wilson Date: Thu, 12 Mar 2015 09:37:58 +0000 Subject: netfilter: Zero the tuple in nfnl_cthelper_parse_tuple() nfnl_cthelper_parse_tuple() is called from nfnl_cthelper_new(), nfnl_cthelper_get() and nfnl_cthelper_del(). In each case they pass a pointer to an nf_conntrack_tuple data structure local variable: struct nf_conntrack_tuple tuple; ... ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); The problem is that this local variable is not initialized, and nfnl_cthelper_parse_tuple() only initializes two fields: src.l3num and dst.protonum. This leaves all other fields with undefined values based on whatever is on the stack: tuple->src.l3num = ntohs(nla_get_be16(tb[NFCTH_TUPLE_L3PROTONUM])); tuple->dst.protonum = nla_get_u8(tb[NFCTH_TUPLE_L4PROTONUM]); The symptom observed was that when the rpc and tns helpers were added then traffic to port 1536 was being sent to user-space. Signed-off-by: Ian Wilson Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cthelper.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index a5599fc51a6f..54330fb5efaf 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -77,6 +77,9 @@ nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple, if (!tb[NFCTH_TUPLE_L3PROTONUM] || !tb[NFCTH_TUPLE_L4PROTONUM]) return -EINVAL; + /* Not all fields are initialized so first zero the tuple */ + memset(tuple, 0, sizeof(struct nf_conntrack_tuple)); + tuple->src.l3num = ntohs(nla_get_be16(tb[NFCTH_TUPLE_L3PROTONUM])); tuple->dst.protonum = nla_get_u8(tb[NFCTH_TUPLE_L4PROTONUM]); -- cgit v1.2.3 From 8051a2a518fcf3827a143470083ad6008697ff17 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 12 Mar 2015 11:53:41 +1030 Subject: 9p/trans_virtio: fix hot-unplug On device hot-unplug, 9p/virtio currently will kfree channel while it might still be in use. Of course, it might stay used forever, so it's an extremely ugly hack, but it seems better than use-after-free that we have now. [ Unused variable removed, whitespace cleanup, msg single-lined --RR ] Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- net/9p/trans_virtio.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index d8e376a5f0f1..36a1a739ad68 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -658,14 +658,30 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args) static void p9_virtio_remove(struct virtio_device *vdev) { struct virtio_chan *chan = vdev->priv; - - if (chan->inuse) - p9_virtio_close(chan->client); - vdev->config->del_vqs(vdev); + unsigned long warning_time; mutex_lock(&virtio_9p_lock); + + /* Remove self from list so we don't get new users. */ list_del(&chan->chan_list); + warning_time = jiffies; + + /* Wait for existing users to close. */ + while (chan->inuse) { + mutex_unlock(&virtio_9p_lock); + msleep(250); + if (time_after(jiffies, warning_time + 10 * HZ)) { + dev_emerg(&vdev->dev, + "p9_virtio_remove: waiting for device in use.\n"); + warning_time = jiffies; + } + mutex_lock(&virtio_9p_lock); + } + mutex_unlock(&virtio_9p_lock); + + vdev->config->del_vqs(vdev); + sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE); kfree(chan->tag); -- cgit v1.2.3 From d8bdff59cea141d2e5f7e98c1b11d3e0271640bd Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 13 Mar 2015 10:52:14 +1100 Subject: netfilter: Fix potential crash in nft_hash walker When we get back an EAGAIN from rhashtable_walk_next we were treating it as a valid object which obviously doesn't work too well. Luckily this is hard to trigger so it seems nobody has run into it yet. This patch fixes it by redoing the next call when we get an EAGAIN. Signed-off-by: Herbert Xu Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_hash.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index c82df0a48fcd..37c15e674884 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -153,6 +153,8 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, iter->err = err; goto out; } + + continue; } if (iter->count < iter->skip) -- cgit v1.2.3 From c8e2c80d7ec00d020320f905822bf49c5ad85250 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Mar 2015 09:49:59 -0700 Subject: inet_diag: fix possible overflow in inet_diag_dump_one_icsk() inet_diag_dump_one_icsk() allocates too small skb. Add inet_sk_attr_size() helper right before inet_sk_diag_fill() so that it can be updated if/when new attributes are added. iproute2/ss currently does not use this dump_one() interface, this might explain nobody noticed this problem yet. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 81751f12645f..592aff37366b 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -71,6 +71,20 @@ static inline void inet_diag_unlock_handler( mutex_unlock(&inet_diag_table_mutex); } +static size_t inet_sk_attr_size(void) +{ + return nla_total_size(sizeof(struct tcp_info)) + + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + + nla_total_size(1) /* INET_DIAG_TOS */ + + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(sizeof(struct inet_diag_meminfo)) + + nla_total_size(sizeof(struct inet_diag_msg)) + + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) + + nla_total_size(TCP_CA_NAME_MAX) + + nla_total_size(sizeof(struct tcpvegas_info)) + + 64; +} + int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct inet_diag_req_v2 *req, struct user_namespace *user_ns, @@ -326,9 +340,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s if (err) goto out; - rep = nlmsg_new(sizeof(struct inet_diag_msg) + - sizeof(struct inet_diag_meminfo) + - sizeof(struct tcp_info) + 64, GFP_KERNEL); + rep = nlmsg_new(inet_sk_attr_size(), GFP_KERNEL); if (!rep) { err = -ENOMEM; goto out; -- cgit v1.2.3 From 4c906c279886550d2aaac6facf71d709158e4e3c Mon Sep 17 00:00:00 2001 From: Venkat Venkatsubra Date: Fri, 13 Mar 2015 07:08:22 -0700 Subject: bridge: reset bridge mtu after deleting an interface On adding an interface br_add_if() sets the MTU to the min of all the interfaces. Do the same thing on removing an interface too in br_del_if. Signed-off-by: Venkat Venkatsubra Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/bridge/br_if.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index b087d278c679..1849d96b3c91 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -563,6 +563,8 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) */ del_nbp(p); + dev_set_mtu(br->dev, br_min_mtu(br)); + spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); spin_unlock_bh(&br->lock); -- cgit v1.2.3 From 3eeff778e00c956875c70b145c52638c313dfb23 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 14 Mar 2015 05:22:21 +0000 Subject: caif: fix MSG_OOB test in caif_seqpkt_recvmsg() It should be checking flags, not msg->msg_flags. It's ->sendmsg() instances that need to look for that in ->msg_flags, ->recvmsg() ones (including the other ->recvmsg() instance in that file, as well as unix_dgram_recvmsg() this one claims to be imitating) check in flags. Braino had been introduced in commit dcda13 ("caif: Bugfix - use MSG_TRUNC in receive") back in 2010, so it goes quite a while back. Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/caif/caif_socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 769b185fefbd..a6e2da0bc718 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -281,7 +281,7 @@ static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock, int copylen; ret = -EOPNOTSUPP; - if (m->msg_flags&MSG_OOB) + if (flags & MSG_OOB) goto read_error; skb = skb_recv_datagram(sk, flags, 0 , &ret); -- cgit v1.2.3 From 7d985ed1dca5c90535d67ce92ef6ca520302340a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 14 Mar 2015 05:34:56 +0000 Subject: rxrpc: bogus MSG_PEEK test in rxrpc_recvmsg() [I would really like an ACK on that one from dhowells; it appears to be quite straightforward, but...] MSG_PEEK isn't passed to ->recvmsg() via msg->msg_flags; as the matter of fact, neither the kernel users of rxrpc, nor the syscalls ever set that bit in there. It gets passed via flags; in fact, another such check in the same function is done correctly - as flags & MSG_PEEK. It had been that way (effectively disabled) for 8 years, though, so the patch needs beating up - that case had never been tested. If it is correct, it's -stable fodder. Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/rxrpc/ar-recvmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index 4575485ad1b4..19a560626dc4 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -87,7 +87,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock, if (!skb) { /* nothing remains on the queue */ if (copied && - (msg->msg_flags & MSG_PEEK || timeo == 0)) + (flags & MSG_PEEK || timeo == 0)) goto out; /* wait for a message to turn up */ -- cgit v1.2.3 From 0f611d28fc2e13cfec64e1c544c16a086886805a Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Thu, 12 Mar 2015 08:53:30 +0200 Subject: mac80211: count interfaces correctly for combination checks Since moving the interface combination checks to mac80211, it's broken because it now only considers interfaces with an assigned channel context, so for example any interface that isn't active can still be up, which is clearly an issue; also, in particular P2P-Device wdevs are an issue since they never have a chanctx. Fix this by counting running interfaces instead the ones with a channel context assigned. Cc: stable@vger.kernel.org [3.16+] Fixes: 73de86a38962b ("cfg80211/mac80211: move interface counting for combination check to mac80211") Signed-off-by: Andrei Otcheretianski Signed-off-by: Emmanuel Grumbach [rewrite commit message, dig out the commit it fixes] Signed-off-by: Johannes Berg --- net/mac80211/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 8428f4a95479..747bdcf72e92 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3178,7 +3178,7 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, wdev_iter = &sdata_iter->wdev; if (sdata_iter == sdata || - rcu_access_pointer(sdata_iter->vif.chanctx_conf) == NULL || + !ieee80211_sdata_running(sdata_iter) || local->hw.wiphy->software_iftypes & BIT(wdev_iter->iftype)) continue; -- cgit v1.2.3 From 70a3fd6c61c46c07c63cab935dca9a17d8de1709 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 12 Mar 2015 08:53:29 +0200 Subject: mac80211: ask for ECSA IE to be considered for beacon parse CRC When a beacon from the AP contains only the ECSA IE, and not a CSA IE as well, this ECSA IE is not considered for calculating the CRC and the beacon might be dropped as not being interesting. This is clearly wrong, it should be handled and the channel switch should be executed. Fix this by including the ECSA IE ID in the bitmap of interesting IEs. Reported-by: Gil Tribush Reviewed-by: Luciano Coelho Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 10ac6324c1d0..cde8cd3d6595 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3204,7 +3204,8 @@ static const u64 care_about_ies = (1ULL << WLAN_EID_CHANNEL_SWITCH) | (1ULL << WLAN_EID_PWR_CONSTRAINT) | (1ULL << WLAN_EID_HT_CAPABILITY) | - (1ULL << WLAN_EID_HT_OPERATION); + (1ULL << WLAN_EID_HT_OPERATION) | + (1ULL << WLAN_EID_EXT_CHANSWITCH_ANN); static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, -- cgit v1.2.3 From 496fcc294daab18799e190c0264863d653588d1f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 12 Mar 2015 08:53:27 +0200 Subject: nl80211: ignore HT/VHT capabilities without QoS/WMM As HT/VHT depend heavily on QoS/WMM, it's not a good idea to let userspace add clients that have HT/VHT but not QoS/WMM. Since it does so in certain cases we've observed (client is using HT IEs but not QoS/WMM) just ignore the HT/VHT info at this point and don't pass it down to the drivers which might unconditionally use it. Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index be2501538011..b6f84f6a2a09 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4400,6 +4400,16 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) if (parse_station_flags(info, dev->ieee80211_ptr->iftype, ¶ms)) return -EINVAL; + /* HT/VHT requires QoS, but if we don't have that just ignore HT/VHT + * as userspace might just pass through the capabilities from the IEs + * directly, rather than enforcing this restriction and returning an + * error in this case. + */ + if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) { + params.ht_capa = NULL; + params.vht_capa = NULL; + } + /* When you run into this, adjust the code below for the new flag */ BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7); -- cgit v1.2.3 From f84eaa1068315409ffbef57e6fea312180787db3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 12 Mar 2015 08:53:26 +0200 Subject: mac80211: ignore CSA to same channel If the AP is confused and starts doing a CSA to the same channel, just ignore that request instead of trying to act it out since it was likely sent in error anyway. In the case of the bug I was investigating the GO was misbehaving and sending out a beacon with CSA IEs still included after having actually done the channel switch. Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 1 + net/mac80211/mlme.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index c0e089c194f1..8d53d65bd2ab 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -464,6 +464,7 @@ struct ieee80211_if_managed { unsigned int flags; bool csa_waiting_bcn; + bool csa_ignored_same_chan; bool beacon_crc_valid; u32 beacon_crc; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index cde8cd3d6595..142f66aece18 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1150,6 +1150,17 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, return; } + if (cfg80211_chandef_identical(&csa_ie.chandef, + &sdata->vif.bss_conf.chandef)) { + if (ifmgd->csa_ignored_same_chan) + return; + sdata_info(sdata, + "AP %pM tries to chanswitch to same channel, ignore\n", + ifmgd->associated->bssid); + ifmgd->csa_ignored_same_chan = true; + return; + } + mutex_lock(&local->mtx); mutex_lock(&local->chanctx_mtx); conf = rcu_dereference_protected(sdata->vif.chanctx_conf, @@ -1210,6 +1221,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, sdata->vif.csa_active = true; sdata->csa_chandef = csa_ie.chandef; sdata->csa_block_tx = csa_ie.mode; + ifmgd->csa_ignored_same_chan = false; if (sdata->csa_block_tx) ieee80211_stop_vif_queues(local, sdata, @@ -2090,6 +2102,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, sdata->vif.csa_active = false; ifmgd->csa_waiting_bcn = false; + ifmgd->csa_ignored_same_chan = false; if (sdata->csa_block_tx) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); -- cgit v1.2.3 From d6b6cb1d3e6f78d55c2d4043d77d0d8def3f3b99 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 17 Mar 2015 13:21:42 +0100 Subject: netfilter: nf_tables: allow to change chain policy without hook if it exists If there's an existing base chain, we have to allow to change the default policy without indicating the hook information. However, if the chain doesn't exists, we have to enforce the presence of the hook attribute. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6ab777912237..ac1a9528dbf2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1225,7 +1225,10 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_CHAIN_POLICY]) { if ((chain != NULL && - !(chain->flags & NFT_BASE_CHAIN)) || + !(chain->flags & NFT_BASE_CHAIN))) + return -EOPNOTSUPP; + + if (chain == NULL && nla[NFTA_CHAIN_HOOK] == NULL) return -EOPNOTSUPP; -- cgit v1.2.3 From 37355565ba57fd45f78f0934305be2761b641f8f Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 16 Mar 2015 15:56:05 +0100 Subject: ip6_tunnel: fix error code when tunnel exists After commit 2b0bb01b6edb, the kernel returns -ENOBUFS when user tries to add an existing tunnel with ioctl API: $ ip -6 tunnel add ip6tnl1 mode ip6ip6 dev eth1 add tunnel "ip6tnl0" failed: No buffer space available It's confusing, the right error is EEXIST. This patch also change a bit the code returned: - ENOBUFS -> ENOMEM - ENOENT -> ENODEV Fixes: 2b0bb01b6edb ("ip6_tunnel: Return an error when adding an existing tunnel.") CC: Steffen Klassert Reported-by: Pierre Cheynier Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 266a264ec212..ddd94eca19b3 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -314,7 +314,7 @@ out: * Create tunnel matching given parameters. * * Return: - * created tunnel or NULL + * created tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) @@ -322,7 +322,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; - int err; + int err = -ENOMEM; if (p->name[0]) strlcpy(name, p->name, IFNAMSIZ); @@ -348,7 +348,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) failed_free: ip6_dev_free(dev); failed: - return NULL; + return ERR_PTR(err); } /** @@ -362,7 +362,7 @@ failed: * tunnel device is created and registered for use. * * Return: - * matching tunnel or NULL + * matching tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_locate(struct net *net, @@ -380,13 +380,13 @@ static struct ip6_tnl *ip6_tnl_locate(struct net *net, if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr)) { if (create) - return NULL; + return ERR_PTR(-EEXIST); return t; } } if (!create) - return NULL; + return ERR_PTR(-ENODEV); return ip6_tnl_create(net, p); } @@ -1420,7 +1420,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); - if (t == NULL) + if (IS_ERR(t)) t = netdev_priv(dev); } else { memset(&p, 0, sizeof(p)); @@ -1445,7 +1445,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL); if (cmd == SIOCCHGTUNNEL) { - if (t != NULL) { + if (!IS_ERR(t)) { if (t->dev != dev) { err = -EEXIST; break; @@ -1457,14 +1457,15 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) else err = ip6_tnl_update(t, &p1); } - if (t) { + if (!IS_ERR(t)) { err = 0; ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) err = -EFAULT; - } else - err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + } else { + err = PTR_ERR(t); + } break; case SIOCDELTUNNEL: err = -EPERM; @@ -1478,7 +1479,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); - if (t == NULL) + if (IS_ERR(t)) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) @@ -1672,12 +1673,13 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct net *net = dev_net(dev); - struct ip6_tnl *nt; + struct ip6_tnl *nt, *t; nt = netdev_priv(dev); ip6_tnl_netlink_parms(data, &nt->parms); - if (ip6_tnl_locate(net, &nt->parms, 0)) + t = ip6_tnl_locate(net, &nt->parms, 0); + if (!IS_ERR(t)) return -EEXIST; return ip6_tnl_create2(dev); @@ -1697,8 +1699,7 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], ip6_tnl_netlink_parms(data, &p); t = ip6_tnl_locate(net, &p, 0); - - if (t) { + if (!IS_ERR(t)) { if (t->dev != dev) return -EEXIST; } else -- cgit v1.2.3 From cb7cf8a33ff73cf638481d1edf883d8968f934f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Mar 2015 12:19:24 -0700 Subject: inet: Clean up inet_csk_wait_for_connect() vs. might_sleep() I got the following trace with current net-next kernel : [14723.885290] WARNING: CPU: 26 PID: 22658 at kernel/sched/core.c:7285 __might_sleep+0x89/0xa0() [14723.885325] do not call blocking ops when !TASK_RUNNING; state=1 set at [] prepare_to_wait_exclusive+0x34/0xa0 [14723.885355] CPU: 26 PID: 22658 Comm: netserver Not tainted 4.0.0-dbg-DEV #1379 [14723.885359] ffffffff81a223a8 ffff881fae9e7ca8 ffffffff81650b5d 0000000000000001 [14723.885364] ffff881fae9e7cf8 ffff881fae9e7ce8 ffffffff810a72e7 0000000000000000 [14723.885367] ffffffff81a57620 000000000000093a 0000000000000000 ffff881fae9e7e64 [14723.885371] Call Trace: [14723.885377] [] dump_stack+0x4c/0x65 [14723.885382] [] warn_slowpath_common+0x97/0xe0 [14723.885386] [] warn_slowpath_fmt+0x46/0x50 [14723.885390] [] ? trace_hardirqs_on_caller+0x10d/0x1d0 [14723.885393] [] ? prepare_to_wait_exclusive+0x34/0xa0 [14723.885396] [] ? prepare_to_wait_exclusive+0x34/0xa0 [14723.885399] [] __might_sleep+0x89/0xa0 [14723.885403] [] lock_sock_nested+0x36/0xb0 [14723.885406] [] ? release_sock+0x173/0x1c0 [14723.885411] [] inet_csk_accept+0x157/0x2a0 [14723.885415] [] ? abort_exclusive_wait+0xc0/0xc0 [14723.885419] [] inet_accept+0x2d/0x150 [14723.885424] [] SYSC_accept4+0xff/0x210 [14723.885428] [] ? retint_swapgs+0xe/0x44 [14723.885431] [] ? trace_hardirqs_on_caller+0x10d/0x1d0 [14723.885437] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [14723.885441] [] SyS_accept+0x10/0x20 [14723.885444] [] system_call_fastpath+0x12/0x17 [14723.885447] ---[ end trace ff74cd83355b1873 ]--- In commit 26cabd31259ba43f68026ce3f62b78094124333f Peter added a sched_annotate_sleep() in sk_wait_event() Is the following patch needed as well ? Alternative would be to use sk_wait_event() from inet_csk_wait_for_connect() Signed-off-by: Eric Dumazet Acked-by: Peter Zijlstra (Intel) Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 14d02ea905b6..3e44b9b0b78e 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -268,6 +268,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) release_sock(sk); if (reqsk_queue_empty(&icsk->icsk_accept_queue)) timeo = schedule_timeout(timeo); + sched_annotate_sleep(); lock_sock(sk); err = 0; if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) -- cgit v1.2.3 From ced585c83b27deca427c606a34dd3eaa6b96d82b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 17 Mar 2015 20:25:57 +0100 Subject: act_bpf: allow non-default TC_ACT opcodes as BPF exec outcome Revisiting commit d23b8ad8ab23 ("tc: add BPF based action") with regards to eBPF support, I was thinking that it might be better to improve return semantics from a BPF program invoked through BPF_PROG_RUN(). Currently, in case filter_res is 0, we overwrite the default action opcode with TC_ACT_SHOT. A default action opcode configured through tc's m_bpf can be: TC_ACT_RECLASSIFY, TC_ACT_PIPE, TC_ACT_SHOT, TC_ACT_UNSPEC, TC_ACT_OK. In cls_bpf, we have the possibility to overwrite the default class associated with the classifier in case filter_res is _not_ 0xffffffff (-1). That allows us to fold multiple [e]BPF programs into a single one, where they would otherwise need to be defined as a separate classifier with its own classid, needlessly redoing parsing work, etc. Similarly, we could do better in act_bpf: Since above TC_ACT* opcodes are exported to UAPI anyway, we reuse them for return-code-to-tc-opcode mapping, where we would allow above possibilities. Thus, like in cls_bpf, a filter_res of 0xffffffff (-1) means that the configured _default_ action is used. Any unkown return code from the BPF program would fail in tcf_bpf() with TC_ACT_UNSPEC. Should we one day want to make use of TC_ACT_STOLEN or TC_ACT_QUEUED, which both have the same semantics, we have the option to either use that as a default action (filter_res of 0xffffffff) or non-default BPF return code. All that will allow us to transparently use tcf_bpf() for both BPF flavours. Signed-off-by: Daniel Borkmann Cc: Jiri Pirko Cc: Alexei Starovoitov Cc: Jamal Hadi Salim Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_bpf.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 82c5d7fc1988..5f6288fa3f12 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -25,21 +25,41 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_bpf *b = a->priv; - int action; - int filter_res; + int action, filter_res; spin_lock(&b->tcf_lock); + b->tcf_tm.lastuse = jiffies; bstats_update(&b->tcf_bstats, skb); - action = b->tcf_action; filter_res = BPF_PROG_RUN(b->filter, skb); - if (filter_res == 0) { - /* Return code 0 from the BPF program - * is being interpreted as a drop here. - */ - action = TC_ACT_SHOT; + + /* A BPF program may overwrite the default action opcode. + * Similarly as in cls_bpf, if filter_res == -1 we use the + * default action specified from tc. + * + * In case a different well-known TC_ACT opcode has been + * returned, it will overwrite the default one. + * + * For everything else that is unkown, TC_ACT_UNSPEC is + * returned. + */ + switch (filter_res) { + case TC_ACT_PIPE: + case TC_ACT_RECLASSIFY: + case TC_ACT_OK: + action = filter_res; + break; + case TC_ACT_SHOT: + action = filter_res; b->tcf_qstats.drops++; + break; + case TC_ACT_UNSPEC: + action = b->tcf_action; + break; + default: + action = TC_ACT_UNSPEC; + break; } spin_unlock(&b->tcf_lock); -- cgit v1.2.3 From 4017a7ee693d1cae6735c0dac21594a7c6416c4c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 2 Mar 2015 01:10:28 +0100 Subject: netfilter: restore rule tracing via nfnetlink_log MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since fab4085 ("netfilter: log: nf_log_packet() as real unified interface"), the loginfo structure that is passed to nf_log_packet() is used to explicitly indicate the logger type you want to use. This is a problem for people tracing rules through nfnetlink_log since packets are always routed to the NF_LOG_TYPE logger after the aforementioned patch. We can fix this by removing the trace loginfo structures, but that still changes the log level from 4 to 5 for tracing messages and there may be someone relying on this outthere. So let's just introduce a new nf_log_trace() function that restores the former behaviour. Reported-by: Markus Kötter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_log.h | 10 ++++++++++ net/ipv4/netfilter/ip_tables.c | 6 +++--- net/ipv6/netfilter/ip6_tables.c | 6 +++--- net/netfilter/nf_log.c | 24 ++++++++++++++++++++++++ net/netfilter/nf_tables_core.c | 8 ++++---- 5 files changed, 44 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index 534e1f2ac4fc..57639fca223a 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -79,6 +79,16 @@ void nf_log_packet(struct net *net, const struct nf_loginfo *li, const char *fmt, ...); +__printf(8, 9) +void nf_log_trace(struct net *net, + u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *li, + const char *fmt, ...); + struct nf_log_buf; struct nf_log_buf *nf_log_buf_open(void); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 99e810f84671..cf5e82f39d3b 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -272,9 +272,9 @@ static void trace_packet(const struct sk_buff *skb, &chainname, &comment, &rulenum) != 0) break; - nf_log_packet(net, AF_INET, hook, skb, in, out, &trace_loginfo, - "TRACE: %s:%s:%s:%u ", - tablename, chainname, comment, rulenum); + nf_log_trace(net, AF_INET, hook, skb, in, out, &trace_loginfo, + "TRACE: %s:%s:%s:%u ", + tablename, chainname, comment, rulenum); } #endif diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index e080fbbbc0e5..bb00c6f2a885 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -298,9 +298,9 @@ static void trace_packet(const struct sk_buff *skb, &chainname, &comment, &rulenum) != 0) break; - nf_log_packet(net, AF_INET6, hook, skb, in, out, &trace_loginfo, - "TRACE: %s:%s:%s:%u ", - tablename, chainname, comment, rulenum); + nf_log_trace(net, AF_INET6, hook, skb, in, out, &trace_loginfo, + "TRACE: %s:%s:%s:%u ", + tablename, chainname, comment, rulenum); } #endif diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 0d8448f19dfe..675d12c69e32 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -212,6 +212,30 @@ void nf_log_packet(struct net *net, } EXPORT_SYMBOL(nf_log_packet); +void nf_log_trace(struct net *net, + u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, const char *fmt, ...) +{ + va_list args; + char prefix[NF_LOG_PREFIXLEN]; + const struct nf_logger *logger; + + rcu_read_lock(); + logger = rcu_dereference(net->nf.nf_loggers[pf]); + if (logger) { + va_start(args, fmt); + vsnprintf(prefix, sizeof(prefix), fmt, args); + va_end(args); + logger->logfn(net, pf, hooknum, skb, in, out, loginfo, prefix); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(nf_log_trace); + #define S_SIZE (1024 - (sizeof(unsigned int) + 1)) struct nf_log_buf { diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 3b90eb2b2c55..2d298dccb6dd 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -94,10 +94,10 @@ static void nft_trace_packet(const struct nft_pktinfo *pkt, { struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); - nf_log_packet(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in, - pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", - chain->table->name, chain->name, comments[type], - rulenum); + nf_log_trace(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in, + pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", + chain->table->name, chain->name, comments[type], + rulenum); } unsigned int -- cgit v1.2.3 From 3d8c6dce53a349df8878d078e56bf429bad572f9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 20 Mar 2015 13:56:06 +0100 Subject: netfilter: xt_TPROXY: fix invflags check in tproxy_tg6_check() We have to check for IP6T_INV_PROTO in invflags, instead of flags. Signed-off-by: Pablo Neira Ayuso Acked-by: Balazs Scheidler --- net/netfilter/xt_TPROXY.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index ef8a926752a9..50e1e5aaf4ce 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -513,8 +513,8 @@ static int tproxy_tg6_check(const struct xt_tgchk_param *par) { const struct ip6t_ip6 *i = par->entryinfo; - if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) - && !(i->flags & IP6T_INV_PROTO)) + if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) && + !(i->invflags & IP6T_INV_PROTO)) return 0; pr_info("Can be used only in combination with " -- cgit v1.2.3 From 8e199dfd82ee097b522b00344af6448715d8ee0c Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 19 Mar 2015 11:22:32 +0100 Subject: ipv6: call ipv6_proxy_select_ident instead of ipv6_select_ident in udp6_ufo_fragment Matt Grant reported frequent crashes in ipv6_select_ident when udp6_ufo_fragment is called from openvswitch on a skb that doesn't have a dst_entry set. ipv6_proxy_select_ident generates the frag_id without using the dst associated with the skb. This approach was suggested by Vladislav Yasevich. Fixes: 0508c07f5e0c ("ipv6: Select fragment id during UFO segmentation if not set.") Cc: Vladislav Yasevich Reported-by: Matt Grant Tested-by: Matt Grant Signed-off-by: Sabrina Dubroca Acked-by: Vladislav Yasevich Signed-off-by: David S. Miller --- net/ipv6/udp_offload.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index ab889bb16b3c..be2c0ba82c85 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -112,11 +112,9 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); fptr->nexthdr = nexthdr; fptr->reserved = 0; - if (skb_shinfo(skb)->ip6_frag_id) - fptr->identification = skb_shinfo(skb)->ip6_frag_id; - else - ipv6_select_ident(fptr, - (struct rt6_info *)skb_dst(skb)); + if (!skb_shinfo(skb)->ip6_frag_id) + ipv6_proxy_select_ident(skb); + fptr->identification = skb_shinfo(skb)->ip6_frag_id; /* Fragment the skb. ipv6 header and the remaining fields of the * fragment header are updated in ipv6_gso_segment() -- cgit v1.2.3 From 73ba57bfae4a1914f6a6dac71e3168dd900e00af Mon Sep 17 00:00:00 2001 From: Steven Barth Date: Thu, 19 Mar 2015 16:16:04 +0100 Subject: ipv6: fix backtracking for throw routes for throw routes to trigger evaluation of other policy rules EAGAIN needs to be propagated up to fib_rules_lookup similar to how its done for IPv4 A simple testcase for verification is: ip -6 rule add lookup 33333 priority 33333 ip -6 route add throw 2001:db8::1 ip -6 route add 2001:db8::1 via fe80::1 dev wlan0 table 33333 ip route get 2001:db8::1 Signed-off-by: Steven Barth Signed-off-by: David S. Miller --- net/ipv6/fib6_rules.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index b4d5e1d97c1b..27ca79682efb 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -104,6 +104,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto again; flp6->saddr = saddr; } + err = rt->dst.error; goto out; } again: -- cgit v1.2.3 From d22e1537181188e5dc8cbc51451832625035bdc2 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Thu, 19 Mar 2015 19:19:30 -0400 Subject: tcp: fix tcp fin memory accounting tcp_send_fin() does not account for the memory it allocates properly, so sk_forward_alloc can be negative in cases where we've sent a FIN: ss example output (ss -amn | grep -B1 f4294): tcp FIN-WAIT-1 0 1 192.168.0.1:45520 192.0.2.1:8080 skmem:(r0,rb87380,t0,tb87380,f4294966016,w1280,o0,bl0) Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a2a796c5536b..1db253e36045 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2773,15 +2773,11 @@ void tcp_send_fin(struct sock *sk) } else { /* Socket is locked, keep trying until memory is available. */ for (;;) { - skb = alloc_skb_fclone(MAX_TCP_HEADER, - sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); if (skb) break; yield(); } - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, MAX_TCP_HEADER); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); -- cgit v1.2.3 From 91edd096e224941131f896b86838b1e59553696a Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 20 Mar 2015 16:48:13 +0000 Subject: net: compat: Update get_compat_msghdr() to match copy_msghdr_from_user() behaviour Commit db31c55a6fb2 (net: clamp ->msg_namelen instead of returning an error) introduced the clamping of msg_namelen when the unsigned value was larger than sizeof(struct sockaddr_storage). This caused a msg_namelen of -1 to be valid. The native code was subsequently fixed by commit dbb490b96584 (net: socket: error on a negative msg_namelen). In addition, the native code sets msg_namelen to 0 when msg_name is NULL. This was done in commit (6a2a2b3ae075 net:socket: set msg_namelen to 0 if msg_name is passed as NULL in msghdr struct from userland) and subsequently updated by 08adb7dabd48 (fold verify_iovec() into copy_msghdr_from_user()). This patch brings the get_compat_msghdr() in line with copy_msghdr_from_user(). Fixes: db31c55a6fb2 (net: clamp ->msg_namelen instead of returning an error) Cc: David S. Miller Cc: Dan Carpenter Signed-off-by: Catalin Marinas Signed-off-by: David S. Miller --- net/compat.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/compat.c b/net/compat.c index 94d3d5e97883..f7bd286a8280 100644 --- a/net/compat.c +++ b/net/compat.c @@ -49,6 +49,13 @@ ssize_t get_compat_msghdr(struct msghdr *kmsg, __get_user(kmsg->msg_controllen, &umsg->msg_controllen) || __get_user(kmsg->msg_flags, &umsg->msg_flags)) return -EFAULT; + + if (!uaddr) + kmsg->msg_namelen = 0; + + if (kmsg->msg_namelen < 0) + return -EINVAL; + if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) kmsg->msg_namelen = sizeof(struct sockaddr_storage); kmsg->msg_control = compat_ptr(tmp3); -- cgit v1.2.3 From 4de930efc23b92ddf88ce91c405ee645fe6e27ea Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Mar 2015 17:41:43 +0000 Subject: net: validate the range we feed to iov_iter_init() in sys_sendto/sys_recvfrom Cc: stable@vger.kernel.org # v3.19 Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/socket.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index bbedbfcb42c2..245330ca0015 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1702,6 +1702,8 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, if (len > INT_MAX) len = INT_MAX; + if (unlikely(!access_ok(VERIFY_READ, buff, len))) + return -EFAULT; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; @@ -1760,6 +1762,8 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, if (size > INT_MAX) size = INT_MAX; + if (unlikely(!access_ok(VERIFY_WRITE, ubuf, size))) + return -EFAULT; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; -- cgit v1.2.3 From 749177ccc74f9c6d0f51bd78a15c652a2134aa11 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 21 Mar 2015 19:25:05 +0100 Subject: netfilter: nft_compat: set IP6T_F_PROTO flag if protocol is set ip6tables extensions check for this flag to restrict match/target to a given protocol. Without this flag set, SYNPROXY6 returns an error. Signed-off-by: Pablo Neira Ayuso Acked-by: Patrick McHardy --- net/netfilter/nft_compat.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 213584cf04b3..65f3e2b6be44 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -133,6 +133,9 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; break; case AF_INET6: + if (proto) + entry->e6.ipv6.flags |= IP6T_F_PROTO; + entry->e6.ipv6.proto = proto; entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0; break; @@ -344,6 +347,9 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; break; case AF_INET6: + if (proto) + entry->e6.ipv6.flags |= IP6T_F_PROTO; + entry->e6.ipv6.proto = proto; entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0; break; -- cgit v1.2.3 From d0c294c53a771ae7e84506dfbd8c18c30f078735 Mon Sep 17 00:00:00 2001 From: Michal Kubeček Date: Mon, 23 Mar 2015 15:14:00 +0100 Subject: tcp: prevent fetching dst twice in early demux code On s390x, gcc 4.8 compiles this part of tcp_v6_early_demux() struct dst_entry *dst = sk->sk_rx_dst; if (dst) dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie); to code reading sk->sk_rx_dst twice, once for the test and once for the argument of ip6_dst_check() (dst_check() is inline). This allows ip6_dst_check() to be called with null first argument, causing a crash. Protect sk->sk_rx_dst access by READ_ONCE() both in IPv4 and IPv6 TCP early demux code. Fixes: 41063e9dd119 ("ipv4: Early TCP socket demux.") Fixes: c7109986db3c ("ipv6: Early TCP socket demux") Signed-off-by: Michal Kubecek Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5a2dfed4783b..f1756ee02207 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1518,7 +1518,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_edemux; if (sk->sk_state != TCP_TIME_WAIT) { - struct dst_entry *dst = sk->sk_rx_dst; + struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); if (dst) dst = dst_check(dst, 0); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5d46832c6f72..b283a498f7a4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1585,7 +1585,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_edemux; if (sk->sk_state != TCP_TIME_WAIT) { - struct dst_entry *dst = sk->sk_rx_dst; + struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); if (dst) dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie); -- cgit v1.2.3 From d079535d5e1bf5e2e7c856bae2483414ea21e137 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 23 Mar 2015 16:31:09 -0700 Subject: net: use for_each_netdev_safe() in rtnl_group_changelink() In case we move the whole dev group to another netns, we should call for_each_netdev_safe(), otherwise we get a soft lockup: NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [ip:798] irq event stamp: 255424 hardirqs last enabled at (255423): [] restore_args+0x0/0x30 hardirqs last disabled at (255424): [] apic_timer_interrupt+0x6a/0x80 softirqs last enabled at (255422): [] __do_softirq+0x2c1/0x3a9 softirqs last disabled at (255417): [] irq_exit+0x41/0x95 CPU: 0 PID: 798 Comm: ip Not tainted 4.0.0-rc4+ #881 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff8800d1b88000 ti: ffff880119530000 task.ti: ffff880119530000 RIP: 0010:[] [] debug_lockdep_rcu_enabled+0x28/0x30 RSP: 0018:ffff880119533778 EFLAGS: 00000246 RAX: ffff8800d1b88000 RBX: 0000000000000002 RCX: 0000000000000038 RDX: 0000000000000000 RSI: ffff8800d1b888c8 RDI: ffff8800d1b888c8 RBP: ffff880119533778 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 000000000000b5c2 R12: 0000000000000246 R13: ffff880119533708 R14: 00000000001d5a40 R15: ffff88011a7d5a40 FS: 00007fc01315f740(0000) GS:ffff88011a600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f367a120988 CR3: 000000011849c000 CR4: 00000000000007f0 Stack: ffff880119533798 ffffffff811ac868 ffffffff811ac831 ffffffff811ac828 ffff8801195337c8 ffffffff811ac8c9 ffff8801195339b0 ffff8801197633e0 0000000000000000 ffff8801195339b0 ffff8801195337d8 ffffffff811ad2d7 Call Trace: [] rcu_read_lock+0x37/0x6e [] ? rcu_read_unlock+0x5f/0x5f [] ? rcu_read_unlock+0x56/0x5f [] __fget+0x2a/0x7a [] fget+0x13/0x15 [] proc_ns_fget+0xe/0x38 [] get_net_ns_by_fd+0x11/0x59 [] rtnl_link_get_net+0x33/0x3e [] do_setlink+0x73/0x87b [] ? trace_hardirqs_off+0xd/0xf [] ? retint_restore_args+0xe/0xe [] rtnl_newlink+0x40c/0x699 [] ? rtnl_newlink+0xeb/0x699 [] ? _raw_spin_unlock+0x28/0x33 [] ? security_capable+0x18/0x1a [] ? ns_capable+0x4d/0x65 [] rtnetlink_rcv_msg+0x181/0x194 [] ? rtnl_lock+0x17/0x19 [] ? rtnl_lock+0x17/0x19 [] ? __rtnl_unlock+0x17/0x17 [] netlink_rcv_skb+0x4d/0x93 [] rtnetlink_rcv+0x26/0x2d [] netlink_unicast+0xcb/0x150 [] netlink_sendmsg+0x501/0x523 [] ? might_fault+0x59/0xa9 [] ? copy_from_user+0x2a/0x2c [] sock_sendmsg+0x34/0x3c [] ___sys_sendmsg+0x1b8/0x255 [] ? handle_pte_fault+0xbd5/0xd4a [] ? native_sched_clock+0x35/0x37 [] ? sched_clock_local+0x12/0x72 [] ? sched_clock_cpu+0x9e/0xb7 [] ? rcu_read_lock_held+0x3b/0x3d [] ? __fcheck_files+0x4c/0x58 [] ? __fget_light+0x2d/0x52 [] __sys_sendmsg+0x42/0x60 [] SyS_sendmsg+0x12/0x1c [] system_call_fastpath+0x12/0x17 Fixes: e7ed828f10bd8 ("netlink: support setting devgroup parameters") Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ee0608bb3bc0..7ebed55b5f7d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1932,10 +1932,10 @@ static int rtnl_group_changelink(const struct sk_buff *skb, struct ifinfomsg *ifm, struct nlattr **tb) { - struct net_device *dev; + struct net_device *dev, *aux; int err; - for_each_netdev(net, dev) { + for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { err = do_setlink(skb, dev, ifm, tb, NULL, 0); if (err < 0) -- cgit v1.2.3 From 6fd99094de2b83d1d4c8457f2c83483b2828e75a Mon Sep 17 00:00:00 2001 From: "D.S. Ljungmark" Date: Wed, 25 Mar 2015 09:28:15 +0100 Subject: ipv6: Don't reduce hop limit for an interface A local route may have a lower hop_limit set than global routes do. RFC 3756, Section 4.2.7, "Parameter Spoofing" > 1. The attacker includes a Current Hop Limit of one or another small > number which the attacker knows will cause legitimate packets to > be dropped before they reach their destination. > As an example, one possible approach to mitigate this threat is to > ignore very small hop limits. The nodes could implement a > configurable minimum hop limit, and ignore attempts to set it below > said limit. Signed-off-by: D.S. Ljungmark Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 471ed24aabae..14ecdaf06bf7 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1218,7 +1218,14 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (rt) rt6_set_expires(rt, jiffies + (HZ * lifetime)); if (ra_msg->icmph.icmp6_hop_limit) { - in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; + /* Only set hop_limit on the interface if it is higher than + * the current hop_limit. + */ + if (in6_dev->cnf.hop_limit < ra_msg->icmph.icmp6_hop_limit) { + in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; + } else { + ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than current\n"); + } if (rt) dst_metric_set(&rt->dst, RTAX_HOPLIMIT, ra_msg->icmph.icmp6_hop_limit); -- cgit v1.2.3 From f243e5a7859a24d10975afb9a1708cac624ba6f1 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Wed, 25 Mar 2015 14:45:03 -0700 Subject: ipmr,ip6mr: call ip6mr_free_table() on failure path Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 2 +- net/ipv6/ip6mr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9d78427652d2..92825443fad6 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -268,7 +268,7 @@ static int __net_init ipmr_rules_init(struct net *net) return 0; err2: - kfree(mrt); + ipmr_free_table(mrt); err1: fib_rules_unregister(ops); return err; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 34b682617f50..52028f449a89 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -252,7 +252,7 @@ static int __net_init ip6mr_rules_init(struct net *net) return 0; err2: - kfree(mrt); + ip6mr_free_table(mrt); err1: fib_rules_unregister(ops); return err; -- cgit v1.2.3 From 4217291e592da0e4258b652e82e5428639d29acc Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 26 Mar 2015 17:56:38 +0100 Subject: netns: don't clear nsid too early on removal With the current code, ids are removed too early. Suppose you have an ipip interface that stands in the netns foo and its link part in the netns bar (so the netns bar has an nsid into the netns foo). Now, you remove the netns bar: - the bar nsid into the netns foo is removed - the netns exit method of ipip is called, thus our ipip iface is removed: => a netlink message is sent in the netns foo to advertise this deletion => this netlink message requests an nsid for bar, thus a new nsid is allocated for bar and never removed. We must remove nsids when we are sure that nobody will refer to netns currently cleaned. Fixes: 0c7aecd4bde4 ("netns: add rtnl cmd to add and get peer netns ids") Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/net_namespace.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index cb5290b8c428..5221f975a4cc 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -349,7 +349,7 @@ static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ static void cleanup_net(struct work_struct *work) { const struct pernet_operations *ops; - struct net *net, *tmp; + struct net *net, *tmp, *peer; struct list_head net_kill_list; LIST_HEAD(net_exit_list); @@ -365,14 +365,6 @@ static void cleanup_net(struct work_struct *work) list_for_each_entry(net, &net_kill_list, cleanup_list) { list_del_rcu(&net->list); list_add_tail(&net->exit_list, &net_exit_list); - for_each_net(tmp) { - int id = __peernet2id(tmp, net, false); - - if (id >= 0) - idr_remove(&tmp->netns_ids, id); - } - idr_destroy(&net->netns_ids); - } rtnl_unlock(); @@ -398,12 +390,26 @@ static void cleanup_net(struct work_struct *work) */ rcu_barrier(); + rtnl_lock(); /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { + /* Unreference net from all peers (no need to loop over + * net_exit_list because idr_destroy() will be called for each + * element of this list. + */ + for_each_net(peer) { + int id = __peernet2id(peer, net, false); + + if (id >= 0) + idr_remove(&peer->netns_ids, id); + } + idr_destroy(&net->netns_ids); + list_del_init(&net->exit_list); put_user_ns(net->user_ns); net_drop_ns(net); } + rtnl_unlock(); } static DECLARE_WORK(net_cleanup_work, cleanup_net); -- cgit v1.2.3 From 4ad19de8774e2a7b075b3e8ea48db85adcf33fa6 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Fri, 27 Mar 2015 12:24:22 +0300 Subject: net: tcp6: fix double call of tcp_v6_fill_cb() tcp_v6_fill_cb() will be called twice if socket's state changes from TCP_TIME_WAIT to TCP_LISTEN. That can result in control buffer data corruption because in the second tcp_v6_fill_cb() call it's not copying IP6CB(skb) anymore, but 'seq', 'end_seq', etc., so we can get weird and unpredictable results. Performance loss of up to 1200% has been observed in LTP/vxlan03 test. This can be fixed by copying inet6_skb_parm to the beginning of 'cb' only if xfrm6_policy_check() and tcp_v6_fill_cb() are going to be called again. Fixes: 2dc49d1680b53 ("tcp6: don't move IP6CB before xfrm6_policy_check()") Signed-off-by: Alexey Kodanev Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b283a498f7a4..1f5e62229aaa 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1411,6 +1411,15 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, TCP_SKB_CB(skb)->sacked = 0; } +static void tcp_v6_restore_cb(struct sk_buff *skb) +{ + /* We need to move header back to the beginning if xfrm6_policy_check() + * and tcp_v6_fill_cb() are going to be called again. + */ + memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6, + sizeof(struct inet6_skb_parm)); +} + static int tcp_v6_rcv(struct sk_buff *skb) { const struct tcphdr *th; @@ -1543,6 +1552,7 @@ do_time_wait: inet_twsk_deschedule(tw, &tcp_death_row); inet_twsk_put(tw); sk = sk2; + tcp_v6_restore_cb(skb); goto process; } /* Fall through to ACK */ @@ -1551,6 +1561,7 @@ do_time_wait: tcp_v6_timewait_ack(sk, skb); break; case TCP_TW_RST: + tcp_v6_restore_cb(skb); goto no_tcp_socket; case TCP_TW_SUCCESS: ; -- cgit v1.2.3 From f9c72d10d6fbf949558cd088389a42213ed7b12d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 31 Mar 2015 12:03:28 -0400 Subject: sunrpc: make debugfs file creation failure non-fatal We currently have a problem that SELinux policy is being enforced when creating debugfs files. If a debugfs file is created as a side effect of doing some syscall, then that creation can fail if the SELinux policy for that process prevents it. This seems wrong. We don't do that for files under /proc, for instance, so Bruce has proposed a patch to fix that. While discussing that patch however, Greg K.H. stated: "No kernel code should care / fail if a debugfs function fails, so please fix up the sunrpc code first." This patch converts all of the sunrpc debugfs setup code to be void return functins, and the callers to not look for errors from those functions. This should allow rpc_clnt and rpc_xprt creation to work, even if the kernel fails to create debugfs files for some reason. Symptoms were failing krb5 mounts on systems using gss-proxy and selinux. Fixes: 388f0c776781 "sunrpc: add a debugfs rpc_xprt directory..." Cc: stable@vger.kernel.org Signed-off-by: Jeff Layton Acked-by: Greg Kroah-Hartman Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/debug.h | 18 +++++++-------- net/sunrpc/clnt.c | 4 +--- net/sunrpc/debugfs.c | 52 ++++++++++++++++++++++++-------------------- net/sunrpc/sunrpc_syms.c | 7 +----- net/sunrpc/xprt.c | 7 +----- 5 files changed, 41 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index c57d8ea0716c..59a7889e15db 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -60,17 +60,17 @@ struct rpc_xprt; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) void rpc_register_sysctl(void); void rpc_unregister_sysctl(void); -int sunrpc_debugfs_init(void); +void sunrpc_debugfs_init(void); void sunrpc_debugfs_exit(void); -int rpc_clnt_debugfs_register(struct rpc_clnt *); +void rpc_clnt_debugfs_register(struct rpc_clnt *); void rpc_clnt_debugfs_unregister(struct rpc_clnt *); -int rpc_xprt_debugfs_register(struct rpc_xprt *); +void rpc_xprt_debugfs_register(struct rpc_xprt *); void rpc_xprt_debugfs_unregister(struct rpc_xprt *); #else -static inline int +static inline void sunrpc_debugfs_init(void) { - return 0; + return; } static inline void @@ -79,10 +79,10 @@ sunrpc_debugfs_exit(void) return; } -static inline int +static inline void rpc_clnt_debugfs_register(struct rpc_clnt *clnt) { - return 0; + return; } static inline void @@ -91,10 +91,10 @@ rpc_clnt_debugfs_unregister(struct rpc_clnt *clnt) return; } -static inline int +static inline void rpc_xprt_debugfs_register(struct rpc_xprt *xprt) { - return 0; + return; } static inline void diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 612aa73bbc60..e6ce1517367f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -303,9 +303,7 @@ static int rpc_client_register(struct rpc_clnt *clnt, struct super_block *pipefs_sb; int err; - err = rpc_clnt_debugfs_register(clnt); - if (err) - return err; + rpc_clnt_debugfs_register(clnt); pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index e811f390f9f6..82962f7e6e88 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -129,48 +129,52 @@ static const struct file_operations tasks_fops = { .release = tasks_release, }; -int +void rpc_clnt_debugfs_register(struct rpc_clnt *clnt) { - int len, err; + int len; char name[24]; /* enough for "../../rpc_xprt/ + 8 hex digits + NULL */ + struct rpc_xprt *xprt; /* Already registered? */ - if (clnt->cl_debugfs) - return 0; + if (clnt->cl_debugfs || !rpc_clnt_dir) + return; len = snprintf(name, sizeof(name), "%x", clnt->cl_clid); if (len >= sizeof(name)) - return -EINVAL; + return; /* make the per-client dir */ clnt->cl_debugfs = debugfs_create_dir(name, rpc_clnt_dir); if (!clnt->cl_debugfs) - return -ENOMEM; + return; /* make tasks file */ - err = -ENOMEM; if (!debugfs_create_file("tasks", S_IFREG | S_IRUSR, clnt->cl_debugfs, clnt, &tasks_fops)) goto out_err; - err = -EINVAL; rcu_read_lock(); + xprt = rcu_dereference(clnt->cl_xprt); + /* no "debugfs" dentry? Don't bother with the symlink. */ + if (!xprt->debugfs) { + rcu_read_unlock(); + return; + } len = snprintf(name, sizeof(name), "../../rpc_xprt/%s", - rcu_dereference(clnt->cl_xprt)->debugfs->d_name.name); + xprt->debugfs->d_name.name); rcu_read_unlock(); + if (len >= sizeof(name)) goto out_err; - err = -ENOMEM; if (!debugfs_create_symlink("xprt", clnt->cl_debugfs, name)) goto out_err; - return 0; + return; out_err: debugfs_remove_recursive(clnt->cl_debugfs); clnt->cl_debugfs = NULL; - return err; } void @@ -226,33 +230,33 @@ static const struct file_operations xprt_info_fops = { .release = xprt_info_release, }; -int +void rpc_xprt_debugfs_register(struct rpc_xprt *xprt) { int len, id; static atomic_t cur_id; char name[9]; /* 8 hex digits + NULL term */ + if (!rpc_xprt_dir) + return; + id = (unsigned int)atomic_inc_return(&cur_id); len = snprintf(name, sizeof(name), "%x", id); if (len >= sizeof(name)) - return -EINVAL; + return; /* make the per-client dir */ xprt->debugfs = debugfs_create_dir(name, rpc_xprt_dir); if (!xprt->debugfs) - return -ENOMEM; + return; /* make tasks file */ if (!debugfs_create_file("info", S_IFREG | S_IRUSR, xprt->debugfs, xprt, &xprt_info_fops)) { debugfs_remove_recursive(xprt->debugfs); xprt->debugfs = NULL; - return -ENOMEM; } - - return 0; } void @@ -266,14 +270,17 @@ void __exit sunrpc_debugfs_exit(void) { debugfs_remove_recursive(topdir); + topdir = NULL; + rpc_clnt_dir = NULL; + rpc_xprt_dir = NULL; } -int __init +void __init sunrpc_debugfs_init(void) { topdir = debugfs_create_dir("sunrpc", NULL); if (!topdir) - goto out; + return; rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir); if (!rpc_clnt_dir) @@ -283,10 +290,9 @@ sunrpc_debugfs_init(void) if (!rpc_xprt_dir) goto out_remove; - return 0; + return; out_remove: debugfs_remove_recursive(topdir); topdir = NULL; -out: - return -ENOMEM; + rpc_clnt_dir = NULL; } diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index e37fbed87956..ee5d3d253102 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -98,10 +98,7 @@ init_sunrpc(void) if (err) goto out4; - err = sunrpc_debugfs_init(); - if (err) - goto out5; - + sunrpc_debugfs_init(); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) rpc_register_sysctl(); #endif @@ -109,8 +106,6 @@ init_sunrpc(void) init_socket_xprt(); /* clnt sock transport */ return 0; -out5: - unregister_rpc_pipefs(); out4: unregister_pernet_subsys(&sunrpc_net_ops); out3: diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index e3015aede0d9..9949722d99ce 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1331,7 +1331,6 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) */ struct rpc_xprt *xprt_create_transport(struct xprt_create *args) { - int err; struct rpc_xprt *xprt; struct xprt_class *t; @@ -1372,11 +1371,7 @@ found: return ERR_PTR(-ENOMEM); } - err = rpc_xprt_debugfs_register(xprt); - if (err) { - xprt_destroy(xprt); - return ERR_PTR(err); - } + rpc_xprt_debugfs_register(xprt); dprintk("RPC: created transport %p with %u slots\n", xprt, xprt->max_reqs); -- cgit v1.2.3 From fa2d8ff4e3522b4e05f590575d3eb8087f3a8cdc Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Mon, 30 Mar 2015 13:57:41 +0200 Subject: openvswitch: Return vport module ref before destruction Return module reference before invoking the respective vport ->destroy() function. This is needed as ovs_vport_del() is not invoked inside an RCU read side critical section so the kfree can occur immediately before returning to ovs_vport_del(). Returning the module reference before ->destroy() is safe because the module unregistration is blocked on ovs_lock which we hold while destroying the datapath. Fixes: 62b9c8d0372d ("ovs: Turn vports with dependencies into separate modules") Reported-by: Pravin Shelar Signed-off-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/vport.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index ec2954ffc690..067a3fff1d2c 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -274,10 +274,8 @@ void ovs_vport_del(struct vport *vport) ASSERT_OVSL(); hlist_del_rcu(&vport->hash_node); - - vport->ops->destroy(vport); - module_put(vport->ops->owner); + vport->ops->destroy(vport); } /** -- cgit v1.2.3 From ed4ac4221776a5103faf71a4032ca00178d6e66b Mon Sep 17 00:00:00 2001 From: Eugene Crosser Date: Mon, 30 Mar 2015 15:40:42 +0200 Subject: af_iucv: fix AF_IUCV sendmsg() errno When sending over AF_IUCV socket, errno was incorrectly set to ENOMEM even when other values where appropriate, notably EAGAIN. With this patch, error indicator returned by sock_alloc_send_skb() is passed to the caller, rather than being overwritten with ENOMEM. Signed-off-by: Eugene Crosser Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 2e9953b2db84..53d931172088 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1114,10 +1114,8 @@ static int iucv_sock_sendmsg(struct kiocb *iocb, struct socket *sock, noblock, &err); else skb = sock_alloc_send_skb(sk, len, noblock, &err); - if (!skb) { - err = -ENOMEM; + if (!skb) goto out; - } if (iucv->transport == AF_IUCV_TRANS_HIPER) skb_reserve(skb, sizeof(struct af_iucv_trans_hdr) + ETH_HLEN); if (memcpy_from_msg(skb_put(skb, len), msg, len)) { -- cgit v1.2.3 From 7e436905780659d6dc12d0581944934bf91a9919 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 1 Apr 2015 09:42:50 +0800 Subject: tipc: fix a slab object leak When remove TIPC module, there is a warning to remind us that a slab object is leaked like: root@localhost:~# rmmod tipc [ 19.056226] ============================================================================= [ 19.057549] BUG TIPC (Not tainted): Objects remaining in TIPC on kmem_cache_close() [ 19.058736] ----------------------------------------------------------------------------- [ 19.058736] [ 19.060287] INFO: Slab 0xffffea0000519a00 objects=23 used=1 fp=0xffff880014668b00 flags=0x100000000004080 [ 19.061915] INFO: Object 0xffff880014668000 @offset=0 [ 19.062717] kmem_cache_destroy TIPC: Slab cache still has objects This is because the listening socket of TIPC topology server is not closed before TIPC proto handler is unregistered with proto_unregister(). However, as the socket is closed in tipc_exit_net() which is called by unregister_pernet_subsys() during unregistering TIPC namespace operation, the warning can be eliminated if calling unregister_pernet_subsys() is moved before calling proto_unregister(). Fixes: e05b31f4bf89 ("tipc: make tipc socket support net namespace") Reviewed-by: Erik Hugne Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/core.c b/net/tipc/core.c index 935205e6bcfe..be1c9fa60b09 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -152,11 +152,11 @@ out_netlink: static void __exit tipc_exit(void) { tipc_bearer_cleanup(); + unregister_pernet_subsys(&tipc_net_ops); tipc_netlink_stop(); tipc_netlink_compat_stop(); tipc_socket_stop(); tipc_unregister_sysctl(); - unregister_pernet_subsys(&tipc_net_ops); pr_info("Deactivated\n"); } -- cgit v1.2.3 From 788211d81bfdf9b6a547d0530f206ba6ee76b107 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 1 Apr 2015 14:20:42 +0200 Subject: mac80211: fix RX A-MPDU session reorder timer deletion There's an issue with the way the RX A-MPDU reorder timer is deleted that can cause a kernel crash like this: * tid_rx is removed - call_rcu(ieee80211_free_tid_rx) * station is destroyed * reorder timer fires before ieee80211_free_tid_rx() runs, accessing the station, thus potentially crashing due to the use-after-free The station deletion is protected by synchronize_net(), but that isn't enough -- ieee80211_free_tid_rx() need not have run when that returns (it deletes the timer.) We could use rcu_barrier() instead of synchronize_net(), but that's much more expensive. Instead, to fix this, add a field tracking that the session is being deleted. In this case, the only re-arming of the timer happens with the reorder spinlock held, so make that code not rearm it if the session is being deleted and also delete the timer after setting that field. This ensures the timer cannot fire after ___ieee80211_stop_rx_ba_session() returns, which fixes the problem. Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 8 ++++++-- net/mac80211/rx.c | 7 ++++--- net/mac80211/sta_info.h | 2 ++ 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index a48bad468880..7702978a4c99 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -49,8 +49,6 @@ static void ieee80211_free_tid_rx(struct rcu_head *h) container_of(h, struct tid_ampdu_rx, rcu_head); int i; - del_timer_sync(&tid_rx->reorder_timer); - for (i = 0; i < tid_rx->buf_size; i++) __skb_queue_purge(&tid_rx->reorder_buf[i]); kfree(tid_rx->reorder_buf); @@ -93,6 +91,12 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, del_timer_sync(&tid_rx->session_timer); + /* make sure ieee80211_sta_reorder_release() doesn't re-arm the timer */ + spin_lock_bh(&tid_rx->reorder_lock); + tid_rx->removed = true; + spin_unlock_bh(&tid_rx->reorder_lock); + del_timer_sync(&tid_rx->reorder_timer); + call_rcu(&tid_rx->rcu_head, ieee80211_free_tid_rx); } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 944bdc04e913..1eb730bf8752 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -873,9 +873,10 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, set_release_timer: - mod_timer(&tid_agg_rx->reorder_timer, - tid_agg_rx->reorder_time[j] + 1 + - HT_RX_REORDER_BUF_TIMEOUT); + if (!tid_agg_rx->removed) + mod_timer(&tid_agg_rx->reorder_timer, + tid_agg_rx->reorder_time[j] + 1 + + HT_RX_REORDER_BUF_TIMEOUT); } else { del_timer(&tid_agg_rx->reorder_timer); } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 925e68fe64c7..fb0fc1302a58 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -175,6 +175,7 @@ struct tid_ampdu_tx { * @reorder_lock: serializes access to reorder buffer, see below. * @auto_seq: used for offloaded BA sessions to automatically pick head_seq_and * and ssn. + * @removed: this session is removed (but might have been found due to RCU) * * This structure's lifetime is managed by RCU, assignments to * the array holding it must hold the aggregation mutex. @@ -199,6 +200,7 @@ struct tid_ampdu_rx { u16 timeout; u8 dialog_token; bool auto_seq; + bool removed; }; /** -- cgit v1.2.3 From 666b805150efd62f05810ff0db08f44a2370c937 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 1 Apr 2015 20:26:46 -0400 Subject: tcp: fix FRTO undo on cumulative ACK of SACKed range On processing cumulative ACKs, the FRTO code was not checking the SACKed bit, meaning that there could be a spurious FRTO undo on a cumulative ACK of a previously SACKed skb. The FRTO code should only consider a cumulative ACK to indicate that an original/unretransmitted skb is newly ACKed if the skb was not yet SACKed. The effect of the spurious FRTO undo would typically be to make the connection think that all previously-sent packets were in flight when they really weren't, leading to a stall and an RTO. Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Fixes: e33099f96d99c ("tcp: implement RFC5682 F-RTO") Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fb4cf8b8e121..f501ac048366 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3105,10 +3105,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (!first_ackt.v64) first_ackt = last_ackt; - if (!(sacked & TCPCB_SACKED_ACKED)) + if (!(sacked & TCPCB_SACKED_ACKED)) { reord = min(pkts_acked, reord); - if (!after(scb->end_seq, tp->high_seq)) - flag |= FLAG_ORIG_SACK_ACKED; + if (!after(scb->end_seq, tp->high_seq)) + flag |= FLAG_ORIG_SACK_ACKED; + } } if (sacked & TCPCB_SACKED_ACKED) -- cgit v1.2.3 From ed785309c94445dd90e242370e1f7bb034e008fd Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 31 Mar 2015 11:01:45 -0700 Subject: ipv4: take rtnl_lock and mark mrt table as freed on namespace cleanup This is the IPv4 part for commit 905a6f96a1b1 (ipv6: take rtnl_lock and mark mrt6 table as freed on namespace cleanup). Cc: Hannes Frederic Sowa Acked-by: Hannes Frederic Sowa Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 92825443fad6..bc40115bc394 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -278,10 +278,12 @@ static void __net_exit ipmr_rules_exit(struct net *net) { struct mr_table *mrt, *next; + rtnl_lock(); list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { list_del(&mrt->list); ipmr_free_table(mrt); } + rtnl_unlock(); fib_rules_unregister(net->ipv4.mr_rules_ops); } #else @@ -308,7 +310,10 @@ static int __net_init ipmr_rules_init(struct net *net) static void __net_exit ipmr_rules_exit(struct net *net) { + rtnl_lock(); ipmr_free_table(net->ipv4.mrt); + net->ipv4.mrt = NULL; + rtnl_unlock(); } #endif -- cgit v1.2.3 From 419df12fb5fa558451319276838c1842f2b11f8f Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 31 Mar 2015 11:01:46 -0700 Subject: net: move fib_rules_unregister() under rtnl lock We have to hold rtnl lock for fib_rules_unregister() otherwise the following race could happen: fib_rules_unregister(): fib_nl_delrule(): ... ... ... ops = lookup_rules_ops(); list_del_rcu(&ops->list); list_for_each_entry(ops->rules) { fib_rules_cleanup_ops(ops); ... list_del_rcu(); list_del_rcu(); } Note, net->rules_mod_lock is actually not needed at all, either upper layer netns code or rtnl lock guarantees we are safe. Cc: Alexander Duyck Cc: Thomas Graf Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/fib_rules.c | 2 +- net/decnet/dn_rules.c | 2 ++ net/ipv4/fib_frontend.c | 3 +-- net/ipv4/ipmr.c | 2 +- net/ipv6/fib6_rules.c | 2 ++ net/ipv6/ip6mr.c | 2 +- 6 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 44706e81b2e0..e4fdc9dfb2c7 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -175,9 +175,9 @@ void fib_rules_unregister(struct fib_rules_ops *ops) spin_lock(&net->rules_mod_lock); list_del_rcu(&ops->list); - fib_rules_cleanup_ops(ops); spin_unlock(&net->rules_mod_lock); + fib_rules_cleanup_ops(ops); call_rcu(&ops->rcu, fib_rules_put_rcu); } EXPORT_SYMBOL_GPL(fib_rules_unregister); diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c index faf7cc3483fe..9d66a0f72f90 100644 --- a/net/decnet/dn_rules.c +++ b/net/decnet/dn_rules.c @@ -248,7 +248,9 @@ void __init dn_fib_rules_init(void) void __exit dn_fib_rules_cleanup(void) { + rtnl_lock(); fib_rules_unregister(dn_fib_rules_ops); + rtnl_unlock(); rcu_barrier(); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 57be71dd6a9e..23b9b3e86f4c 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1111,11 +1111,10 @@ static void ip_fib_net_exit(struct net *net) { unsigned int i; + rtnl_lock(); #ifdef CONFIG_IP_MULTIPLE_TABLES fib4_rules_exit(net); #endif - - rtnl_lock(); for (i = 0; i < FIB_TABLE_HASHSZ; i++) { struct fib_table *tb; struct hlist_head *head; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index bc40115bc394..fe54eba6d00d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -283,8 +283,8 @@ static void __net_exit ipmr_rules_exit(struct net *net) list_del(&mrt->list); ipmr_free_table(mrt); } - rtnl_unlock(); fib_rules_unregister(net->ipv4.mr_rules_ops); + rtnl_unlock(); } #else #define ipmr_for_each_table(mrt, net) \ diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 27ca79682efb..70bc6abc0639 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -322,7 +322,9 @@ out_fib6_rules_ops: static void __net_exit fib6_rules_net_exit(struct net *net) { + rtnl_lock(); fib_rules_unregister(net->ipv6.fib6_rules_ops); + rtnl_unlock(); } static struct pernet_operations fib6_rules_net_ops = { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 52028f449a89..2f1fd9ffcb34 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -267,8 +267,8 @@ static void __net_exit ip6mr_rules_exit(struct net *net) list_del(&mrt->list); ip6mr_free_table(mrt); } - rtnl_unlock(); fib_rules_unregister(net->ipv6.mr6_rules_ops); + rtnl_unlock(); } #else #define ip6mr_for_each_table(mrt, net) \ -- cgit v1.2.3 From 7ba0c47c34a1ea5bc7a24ca67309996cce0569b5 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 31 Mar 2015 11:01:47 -0700 Subject: ip6mr: call del_timer_sync() in ip6mr_free_table() We need to wait for the flying timers, since we are going to free the mrtable right after it. Cc: Hannes Frederic Sowa Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv6/ip6mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 2f1fd9ffcb34..312e0ff47339 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -336,7 +336,7 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id) static void ip6mr_free_table(struct mr6_table *mrt) { - del_timer(&mrt->ipmr_expire_timer); + del_timer_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt); kfree(mrt); } -- cgit v1.2.3 From 6d458f5b4ece8542a5c2281e40008823fec91814 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 3 Apr 2015 12:02:36 +0200 Subject: Revert "netns: don't clear nsid too early on removal" This reverts commit 4217291e592d ("netns: don't clear nsid too early on removal"). This is not the right fix, it introduces races. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/net_namespace.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 5221f975a4cc..cb5290b8c428 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -349,7 +349,7 @@ static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ static void cleanup_net(struct work_struct *work) { const struct pernet_operations *ops; - struct net *net, *tmp, *peer; + struct net *net, *tmp; struct list_head net_kill_list; LIST_HEAD(net_exit_list); @@ -365,6 +365,14 @@ static void cleanup_net(struct work_struct *work) list_for_each_entry(net, &net_kill_list, cleanup_list) { list_del_rcu(&net->list); list_add_tail(&net->exit_list, &net_exit_list); + for_each_net(tmp) { + int id = __peernet2id(tmp, net, false); + + if (id >= 0) + idr_remove(&tmp->netns_ids, id); + } + idr_destroy(&net->netns_ids); + } rtnl_unlock(); @@ -390,26 +398,12 @@ static void cleanup_net(struct work_struct *work) */ rcu_barrier(); - rtnl_lock(); /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { - /* Unreference net from all peers (no need to loop over - * net_exit_list because idr_destroy() will be called for each - * element of this list. - */ - for_each_net(peer) { - int id = __peernet2id(peer, net, false); - - if (id >= 0) - idr_remove(&peer->netns_ids, id); - } - idr_destroy(&net->netns_ids); - list_del_init(&net->exit_list); put_user_ns(net->user_ns); net_drop_ns(net); } - rtnl_unlock(); } static DECLARE_WORK(net_cleanup_work, cleanup_net); -- cgit v1.2.3 From 576b7cd2f6ff1e90b3fc0a000d2fe74f8a50a4bb Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 3 Apr 2015 12:02:37 +0200 Subject: netns: don't allocate an id for dead netns First, let's explain the problem. Suppose you have an ipip interface that stands in the netns foo and its link part in the netns bar (so the netns bar has an nsid into the netns foo). Now, you remove the netns bar: - the bar nsid into the netns foo is removed - the netns exit method of ipip is called, thus our ipip iface is removed: => a netlink message is built in the netns foo to advertise this deletion => this netlink message requests an nsid for bar, thus a new nsid is allocated for bar and never removed. This patch adds a check in peernet2id() so that an id cannot be allocated for a netns which is currently destroyed. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/net_namespace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index cb5290b8c428..70d3450588b2 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -198,8 +198,10 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc) */ int peernet2id(struct net *net, struct net *peer) { - int id = __peernet2id(net, peer, true); + bool alloc = atomic_read(&peer->count) == 0 ? false : true; + int id; + id = __peernet2id(net, peer, alloc); return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; } EXPORT_SYMBOL(peernet2id); -- cgit v1.2.3 From f60e5990d9c1424af9dbca60a23ba2a1c7c1ce90 Mon Sep 17 00:00:00 2001 From: "hannes@stressinduktion.org" Date: Wed, 1 Apr 2015 17:07:44 +0200 Subject: ipv6: protect skb->sk accesses from recursive dereference inside the stack We should not consult skb->sk for output decisions in xmit recursion levels > 0 in the stack. Otherwise local socket settings could influence the result of e.g. tunnel encapsulation process. ipv6 does not conform with this in three places: 1) ip6_fragment: we do consult ipv6_npinfo for frag_size 2) sk_mc_loop in ipv6 uses skb->sk and checks if we should loop the packet back to the local socket 3) ip6_skb_dst_mtu could query the settings from the user socket and force a wrong MTU Furthermore: In sk_mc_loop we could potentially land in WARN_ON(1) if we use a PF_PACKET socket ontop of an IPv6-backed vxlan device. Reuse xmit_recursion as we are currently only interested in protecting tunnel devices. Cc: Jiri Pirko Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ include/net/ip.h | 16 ---------------- include/net/ip6_route.h | 3 ++- include/net/sock.h | 2 ++ net/core/dev.c | 4 +++- net/core/sock.c | 19 +++++++++++++++++++ net/ipv6/ip6_output.c | 3 ++- 7 files changed, 34 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dcf6ec27739b..278738873703 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2185,6 +2185,12 @@ void netdev_freemem(struct net_device *dev); void synchronize_net(void); int init_dummy_netdev(struct net_device *dev); +DECLARE_PER_CPU(int, xmit_recursion); +static inline int dev_recursion_level(void) +{ + return this_cpu_read(xmit_recursion); +} + struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); diff --git a/include/net/ip.h b/include/net/ip.h index 025c61c0dffb..6cc1eafb153a 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -453,22 +453,6 @@ static __inline__ void inet_reset_saddr(struct sock *sk) #endif -static inline int sk_mc_loop(struct sock *sk) -{ - if (!sk) - return 1; - switch (sk->sk_family) { - case AF_INET: - return inet_sk(sk)->mc_loop; -#if IS_ENABLED(CONFIG_IPV6) - case AF_INET6: - return inet6_sk(sk)->mc_loop; -#endif - } - WARN_ON(1); - return 1; -} - bool ip_call_ra_chain(struct sk_buff *skb); /* diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 1d09b46c1e48..eda131d179d9 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -174,7 +174,8 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); static inline int ip6_skb_dst_mtu(struct sk_buff *skb) { - struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; + struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? + inet6_sk(skb->sk) : NULL; return (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) ? skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); diff --git a/include/net/sock.h b/include/net/sock.h index ab186b1d31ff..e4079c28e6b8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1762,6 +1762,8 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); +bool sk_mc_loop(struct sock *sk); + static inline bool sk_can_gso(const struct sock *sk) { return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); diff --git a/net/core/dev.c b/net/core/dev.c index 962ee9d71964..45109b70664e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2848,7 +2848,9 @@ static void skb_update_prio(struct sk_buff *skb) #define skb_update_prio(skb) #endif -static DEFINE_PER_CPU(int, xmit_recursion); +DEFINE_PER_CPU(int, xmit_recursion); +EXPORT_SYMBOL(xmit_recursion); + #define RECURSION_LIMIT 10 /** diff --git a/net/core/sock.c b/net/core/sock.c index 78e89eb7eb70..71e3e5f1eaa0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -653,6 +653,25 @@ static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) sock_reset_flag(sk, bit); } +bool sk_mc_loop(struct sock *sk) +{ + if (dev_recursion_level()) + return false; + if (!sk) + return true; + switch (sk->sk_family) { + case AF_INET: + return inet_sk(sk)->mc_loop; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return inet6_sk(sk)->mc_loop; +#endif + } + WARN_ON(1); + return true; +} +EXPORT_SYMBOL(sk_mc_loop); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7e80b61b51ff..36cf0ab685a0 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -542,7 +542,8 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct sk_buff *frag; struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); - struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; + struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? + inet6_sk(skb->sk) : NULL; struct ipv6hdr *tmp_hdr; struct frag_hdr *fh; unsigned int mtu, hlen, left, len; -- cgit v1.2.3 From 67e04c29ec0daad9ba29341b4dab4b89526994cf Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 3 Apr 2015 13:46:09 -0700 Subject: l2tp: unregister l2tp_net_ops on failure path Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 895348e44c7d..a29a504492af 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1871,6 +1871,7 @@ static int __init l2tp_init(void) l2tp_wq = alloc_workqueue("l2tp", WQ_UNBOUND, 0); if (!l2tp_wq) { pr_err("alloc_workqueue failed\n"); + unregister_pernet_device(&l2tp_net_ops); rc = -ENOMEM; goto out; } -- cgit v1.2.3 From 303038135afbd0520d1e241c02592be6e4ea7204 Mon Sep 17 00:00:00 2001 From: Pavel Nakonechny Date: Sun, 5 Apr 2015 00:46:21 +0300 Subject: net: dsa: fix filling routing table from OF description According to description in 'include/net/dsa.h', in cascade switches configurations where there are more than one interconnected devices, 'rtable' array in 'dsa_chip_data' structure is used to indicate which port on this switch should be used to send packets to that are destined for corresponding switch. However, dsa_of_setup_routing_table() fills 'rtable' with port numbers of the _target_ switch, but not current one. This commit removes redundant devicetree parsing and adds needed port number as a function argument. So dsa_of_setup_routing_table() now just looks for target switch number by parsing parent of 'link' device node. To remove possible misunderstandings with the way of determining target switch number, a corresponding comment was added to the source code and to the DSA device tree bindings documentation file. This was tested on a custom board with two Marvell 88E6095 switches with following corresponding routing tables: { -1, 10 } and { 8, -1 }. Signed-off-by: Pavel Nakonechny Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/dsa/dsa.txt | 4 +++- net/dsa/dsa.c | 23 +++++++---------------- 2 files changed, 10 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/Documentation/devicetree/bindings/net/dsa/dsa.txt b/Documentation/devicetree/bindings/net/dsa/dsa.txt index e124847443f8..f0b4cd72411d 100644 --- a/Documentation/devicetree/bindings/net/dsa/dsa.txt +++ b/Documentation/devicetree/bindings/net/dsa/dsa.txt @@ -19,7 +19,9 @@ the parent DSA node. The maximum number of allowed child nodes is 4 (DSA_MAX_SWITCHES). Each of these switch child nodes should have the following required properties: -- reg : Describes the switch address on the MII bus +- reg : Contains two fields. The first one describes the + address on the MII bus. The second is the switch + number that must be unique in cascaded configurations - #address-cells : Must be 1 - #size-cells : Must be 0 diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 2173402d87e0..4dea2e0681d1 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -501,12 +501,10 @@ static struct net_device *dev_to_net_device(struct device *dev) #ifdef CONFIG_OF static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, struct dsa_chip_data *cd, - int chip_index, + int chip_index, int port_index, struct device_node *link) { - int ret; const __be32 *reg; - int link_port_addr; int link_sw_addr; struct device_node *parent_sw; int len; @@ -519,6 +517,10 @@ static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, if (!reg || (len != sizeof(*reg) * 2)) return -EINVAL; + /* + * Get the destination switch number from the second field of its 'reg' + * property, i.e. for "reg = <0x19 1>" sw_addr is '1'. + */ link_sw_addr = be32_to_cpup(reg + 1); if (link_sw_addr >= pd->nr_chips) @@ -535,20 +537,9 @@ static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, memset(cd->rtable, -1, pd->nr_chips * sizeof(s8)); } - reg = of_get_property(link, "reg", NULL); - if (!reg) { - ret = -EINVAL; - goto out; - } - - link_port_addr = be32_to_cpup(reg); - - cd->rtable[link_sw_addr] = link_port_addr; + cd->rtable[link_sw_addr] = port_index; return 0; -out: - kfree(cd->rtable); - return ret; } static void dsa_of_free_platform_data(struct dsa_platform_data *pd) @@ -658,7 +649,7 @@ static int dsa_of_probe(struct platform_device *pdev) if (!strcmp(port_name, "dsa") && link && pd->nr_chips > 1) { ret = dsa_of_setup_routing_table(pd, cd, - chip_index, link); + chip_index, port_index, link); if (ret) goto out_free_chip; } -- cgit v1.2.3