From de3b7a06dfe15bda3e66a52285d422b954bb4832 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Thu, 4 Dec 2014 09:46:20 +0100 Subject: xfrm6: Fix transport header offset in _decode_session6. skb->transport_header might not be valid when we do a reverse decode because the ipv6 tunnel error handlers don't update it to the inner transport header. This leads to a wrong offset calculation and to wrong layer 4 informations. We fix this by using the size of the ipv6 header as the first offset. Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_policy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 5f983644373a..aa48302f00a1 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -130,8 +130,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) { struct flowi6 *fl6 = &fl->u.ip6; int onlyproto = 0; - u16 offset = skb_network_header_len(skb); const struct ipv6hdr *hdr = ipv6_hdr(skb); + u16 offset = sizeof(*hdr); struct ipv6_opt_hdr *exthdr; const unsigned char *nh = skb_network_header(skb); u8 nexthdr = nh[IP6CB(skb)->nhoff]; -- cgit v1.2.3 From f855691975bb06373a98711e4cfe2c224244b536 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 8 Dec 2014 07:56:18 +0100 Subject: xfrm6: Fix the nexthdr offset in _decode_session6. xfrm_decode_session() was originally designed for the usage in the receive path where the correct nexthdr offset is stored in IP6CB(skb)->nhoff. Over time this function spread to code that is used in the output path (netfilter, vti) where IP6CB(skb)->nhoff is not set. As a result, we get a wrong nexthdr and the upper layer flow informations are wrong. This can leed to incorrect policy lookups. Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_policy.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index aa48302f00a1..48bf5a06847b 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -134,8 +134,14 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) u16 offset = sizeof(*hdr); struct ipv6_opt_hdr *exthdr; const unsigned char *nh = skb_network_header(skb); - u8 nexthdr = nh[IP6CB(skb)->nhoff]; + u16 nhoff = IP6CB(skb)->nhoff; int oif = 0; + u8 nexthdr; + + if (!nhoff) + nhoff = offsetof(struct ipv6hdr, nexthdr); + + nexthdr = nh[nhoff]; if (skb_dst(skb)) oif = skb_dst(skb)->dev->ifindex; -- cgit v1.2.3 From cc72f6e227b8091e0b8297a6be266bedcb20a5aa Mon Sep 17 00:00:00 2001 From: John Linville Date: Tue, 6 Jan 2015 14:39:33 -0500 Subject: mac80211: uninitialized return val in __ieee80211_sta_handle_tspec_ac_params The return value should be initialized to false so that there's a valid return value when there are no sessions that need work to be done on them. Luckily, the side effect of using the uninitialized value is an extra harmless driver call. Coverity: CID 1260096 Fixes: 02219b3abca59 ("mac80211: add WMM admission control support") Signed-off-by: John W. Linville [extend commit message] Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 2c36c4765f47..837a406a9dd6 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1643,7 +1643,7 @@ __ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - bool ret; + bool ret = false; int ac; if (local->hw.queues < IEEE80211_NUM_ACS) -- cgit v1.2.3 From 20658702e08ecd693236b443837d28863b93e872 Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Mon, 29 Dec 2014 11:59:59 +0200 Subject: cfg80211: fix deadlock during reg chan check If a P2P GO is active, the cfg80211_reg_can_beacon function will take the wdev lock, in its call to cfg80211_go_permissive_chan. But the wdev lock is already taken by the parent channel-checking function, causing a deadlock. Split the checking code into two parts. The first part will check if the wdev is active and saves the channel under the wdev lock. The second part will check actual channel validity according to type. Signed-off-by: Arik Nemtsov Reviewed-by: Ilan Peer Reviewed-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/wireless/reg.c | 56 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 7b8309840d4e..d39d1cbc86b1 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1530,45 +1530,40 @@ static void reg_call_notifier(struct wiphy *wiphy, static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev) { - struct ieee80211_channel *ch; struct cfg80211_chan_def chandef; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - bool ret = true; + enum nl80211_iftype iftype; wdev_lock(wdev); + iftype = wdev->iftype; + /* make sure the interface is active */ if (!wdev->netdev || !netif_running(wdev->netdev)) - goto out; + goto wdev_inactive_unlock; - switch (wdev->iftype) { + switch (iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: if (!wdev->beacon_interval) - goto out; - - ret = cfg80211_reg_can_beacon(wiphy, - &wdev->chandef, wdev->iftype); + goto wdev_inactive_unlock; + chandef = wdev->chandef; break; case NL80211_IFTYPE_ADHOC: if (!wdev->ssid_len) - goto out; - - ret = cfg80211_reg_can_beacon(wiphy, - &wdev->chandef, wdev->iftype); + goto wdev_inactive_unlock; + chandef = wdev->chandef; break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: if (!wdev->current_bss || !wdev->current_bss->pub.channel) - goto out; + goto wdev_inactive_unlock; - ch = wdev->current_bss->pub.channel; - if (rdev->ops->get_channel && - !rdev_get_channel(rdev, wdev, &chandef)) - ret = cfg80211_chandef_usable(wiphy, &chandef, - IEEE80211_CHAN_DISABLED); - else - ret = !(ch->flags & IEEE80211_CHAN_DISABLED); + if (!rdev->ops->get_channel || + rdev_get_channel(rdev, wdev, &chandef)) + cfg80211_chandef_create(&chandef, + wdev->current_bss->pub.channel, + NL80211_CHAN_NO_HT); break; case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: @@ -1581,9 +1576,26 @@ static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev) break; } -out: wdev_unlock(wdev); - return ret; + + switch (iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_ADHOC: + return cfg80211_reg_can_beacon(wiphy, &chandef, iftype); + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + return cfg80211_chandef_usable(wiphy, &chandef, + IEEE80211_CHAN_DISABLED); + default: + break; + } + + return true; + +wdev_inactive_unlock: + wdev_unlock(wdev); + return true; } static void reg_leave_invalid_chans(struct wiphy *wiphy) -- cgit v1.2.3 From f812116b174e59a350acc8e4856213a166a91222 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 15 Jan 2015 13:18:40 -0500 Subject: ip: zero sockaddr returned on error queue The sockaddr is returned in IP(V6)_RECVERR as part of errhdr. That structure is defined and allocated on the stack as struct { struct sock_extended_err ee; struct sockaddr_in(6) offender; } errhdr; The second part is only initialized for certain SO_EE_ORIGIN values. Always initialize it completely. An MTU exceeded error on a SOCK_RAW/IPPROTO_RAW is one example that would return uninitialized bytes. Signed-off-by: Willem de Bruijn ---- Also verified that there is no padding between errhdr.ee and errhdr.offender that could leak additional kernel data. Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 8 ++------ net/ipv6/datagram.c | 10 +++------- 2 files changed, 5 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 8a89c738b7a3..6b85adb05003 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -461,17 +461,13 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; - sin->sin_family = AF_UNSPEC; + memset(sin, 0, sizeof(*sin)); if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin)) { - struct inet_sock *inet = inet_sk(sk); - sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; - sin->sin_port = 0; - memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); - if (inet->cmsg_flags) + if (inet_sk(sk)->cmsg_flags) ip_cmsg_recv(msg, skb); } diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 100c589a2a6c..49f5e73db122 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -393,11 +393,10 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; - sin->sin6_family = AF_UNSPEC; + memset(sin, 0, sizeof(*sin)); + if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { sin->sin6_family = AF_INET6; - sin->sin6_flowinfo = 0; - sin->sin6_port = 0; if (np->rxopt.all) { if (serr->ee.ee_origin != SO_EE_ORIGIN_ICMP && serr->ee.ee_origin != SO_EE_ORIGIN_ICMP6) @@ -412,12 +411,9 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) ipv6_iface_scope_id(&sin->sin6_addr, IP6CB(skb)->iif); } else { - struct inet_sock *inet = inet_sk(sk); - ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &sin->sin6_addr); - sin->sin6_scope_id = 0; - if (inet->cmsg_flags) + if (inet_sk(sk)->cmsg_flags) ip_cmsg_recv(msg, skb); } } -- cgit v1.2.3 From ac64da0b83d82abe62f78b3d0e21cca31aea24fa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Jan 2015 17:04:22 -0800 Subject: net: rps: fix cpu unplug softnet_data.input_pkt_queue is protected by a spinlock that we must hold when transferring packets from victim queue to an active one. This is because other cpus could still be trying to enqueue packets into victim queue. A second problem is that when we transfert the NAPI poll_list from victim to current cpu, we absolutely need to special case the percpu backlog, because we do not want to add complex locking to protect process_queue : Only owner cpu is allowed to manipulate it, unless cpu is offline. Based on initial patch from Prasad Sodagudi & Subash Abhinov Kasiviswanathan. This version is better because we do not slow down packet processing, only make migration safer. Reported-by: Prasad Sodagudi Reported-by: Subash Abhinov Kasiviswanathan Signed-off-by: Eric Dumazet Cc: Tom Herbert Signed-off-by: David S. Miller --- net/core/dev.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 683d493aa1bf..171420e75b03 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7072,10 +7072,20 @@ static int dev_cpu_callback(struct notifier_block *nfb, oldsd->output_queue = NULL; oldsd->output_queue_tailp = &oldsd->output_queue; } - /* Append NAPI poll list from offline CPU. */ - if (!list_empty(&oldsd->poll_list)) { - list_splice_init(&oldsd->poll_list, &sd->poll_list); - raise_softirq_irqoff(NET_RX_SOFTIRQ); + /* Append NAPI poll list from offline CPU, with one exception : + * process_backlog() must be called by cpu owning percpu backlog. + * We properly handle process_queue & input_pkt_queue later. + */ + while (!list_empty(&oldsd->poll_list)) { + struct napi_struct *napi = list_first_entry(&oldsd->poll_list, + struct napi_struct, + poll_list); + + list_del_init(&napi->poll_list); + if (napi->poll == process_backlog) + napi->state = 0; + else + ____napi_schedule(sd, napi); } raise_softirq_irqoff(NET_TX_SOFTIRQ); @@ -7086,7 +7096,7 @@ static int dev_cpu_callback(struct notifier_block *nfb, netif_rx_internal(skb); input_queue_head_incr(oldsd); } - while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { + while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx_internal(skb); input_queue_head_incr(oldsd); } -- cgit v1.2.3 From 5ad6300524c0332ac67e912c20d6e5cf262ba58f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 16 Jan 2015 11:37:13 +0100 Subject: genetlink: disallow subscribing to unknown mcast groups Jeff Layton reported that he could trigger the multicast unbind warning in generic netlink using trinity. I originally thought it was a race condition between unregistering the generic netlink family and closing the socket, but there's a far simpler explanation: genetlink currently allows subscribing to groups that don't (yet) exist, and the warning is triggered when unsubscribing again while the group still doesn't exist. Originally, I had a warning in the subscribe case and accepted it out of userspace API concerns, but the warning was of course wrong and removed later. However, I now think that allowing userspace to subscribe to groups that don't exist is wrong and could possibly become a security problem: Consider a (new) genetlink family implementing a permission check in the mcast_bind() function similar to the like the audit code does today; it would be possible to bypass the permission check by guessing the ID and subscribing to the group it exists. This is only possible in case a family like that would be dynamically loaded, but it doesn't seem like a huge stretch, for example wireless may be loaded when you plug in a USB device. To avoid this reject such subscription attempts. If this ends up causing userspace issues we may need to add a workaround in af_netlink to deny such requests but not return an error. Reported-by: Jeff Layton Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 2e11061ef885..c18d3f5624b2 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -985,7 +985,7 @@ static struct genl_multicast_group genl_ctrl_groups[] = { static int genl_bind(struct net *net, int group) { - int i, err = 0; + int i, err = -ENOENT; down_read(&cb_lock); for (i = 0; i < GENL_FAM_TAB_SIZE; i++) { -- cgit v1.2.3 From ee1c244219fd652964710a6cc3e4f922e86aa492 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 16 Jan 2015 11:37:14 +0100 Subject: genetlink: synchronize socket closing and family removal In addition to the problem Jeff Layton reported, I looked at the code and reproduced the same warning by subscribing and removing the genl family with a socket still open. This is a fairly tricky race which originates in the fact that generic netlink allows the family to go away while sockets are still open - unlike regular netlink which has a module refcount for every open socket so in general this cannot be triggered. Trying to resolve this issue by the obvious locking isn't possible as it will result in deadlocks between unregistration and group unbind notification (which incidentally lockdep doesn't find due to the home grown locking in the netlink table.) To really resolve this, introduce a "closing socket" reference counter (for generic netlink only, as it's the only affected family) in the core netlink code and use that in generic netlink to wait for all the sockets that are being closed at the same time as a generic netlink family is removed. This fixes the race that when a socket is closed, it will should call the unbind, but if the family is removed at the same time the unbind will not find it, leading to the warning. The real problem though is that in this case the unbind could actually find a new family that is registered to have a multicast group with the same ID, and call its mcast_unbind() leading to confusing. Also remove the warning since it would still trigger, but is now no longer a problem. This also moves the code in af_netlink.c to before unreferencing the module to avoid having the same problem in the normal non-genl case. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/linux/genetlink.h | 4 ++++ include/net/genetlink.h | 5 ++++- net/netlink/af_netlink.c | 24 +++++++++++++++++------- net/netlink/af_netlink.h | 1 + net/netlink/genetlink.c | 16 +++++++++------- 5 files changed, 35 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h index 55b685719d52..09460d6d6682 100644 --- a/include/linux/genetlink.h +++ b/include/linux/genetlink.h @@ -11,6 +11,10 @@ extern void genl_unlock(void); extern int lockdep_genl_is_held(void); #endif +/* for synchronisation between af_netlink and genetlink */ +extern atomic_t genl_sk_destructing_cnt; +extern wait_queue_head_t genl_sk_destructing_waitq; + /** * rcu_dereference_genl - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 2ea2c55bdc87..6c92415311ca 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -35,7 +35,10 @@ struct genl_info; * undo operations done by pre_doit, for example release locks * @mcast_bind: a socket bound to the given multicast group (which * is given as the offset into the groups array) - * @mcast_unbind: a socket was unbound from the given multicast group + * @mcast_unbind: a socket was unbound from the given multicast group. + * Note that unbind() will not be called symmetrically if the + * generic netlink family is removed while there are still open + * sockets. * @attrbuf: buffer to store parsed attributes * @family_list: family list * @mcgrps: multicast groups used by this family (private) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 84ea76ca3f1f..02fdde28dada 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include @@ -1095,6 +1096,8 @@ static void netlink_remove(struct sock *sk) __sk_del_bind_node(sk); netlink_update_listeners(sk); } + if (sk->sk_protocol == NETLINK_GENERIC) + atomic_inc(&genl_sk_destructing_cnt); netlink_table_ungrab(); } @@ -1211,6 +1214,20 @@ static int netlink_release(struct socket *sock) * will be purged. */ + /* must not acquire netlink_table_lock in any way again before unbind + * and notifying genetlink is done as otherwise it might deadlock + */ + if (nlk->netlink_unbind) { + int i; + + for (i = 0; i < nlk->ngroups; i++) + if (test_bit(i, nlk->groups)) + nlk->netlink_unbind(sock_net(sk), i + 1); + } + if (sk->sk_protocol == NETLINK_GENERIC && + atomic_dec_return(&genl_sk_destructing_cnt) == 0) + wake_up(&genl_sk_destructing_waitq); + sock->sk = NULL; wake_up_interruptible_all(&nlk->wait); @@ -1246,13 +1263,6 @@ static int netlink_release(struct socket *sock) netlink_table_ungrab(); } - if (nlk->netlink_unbind) { - int i; - - for (i = 0; i < nlk->ngroups; i++) - if (test_bit(i, nlk->groups)) - nlk->netlink_unbind(sock_net(sk), i + 1); - } kfree(nlk->groups); nlk->groups = NULL; diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index f123a88496f8..f1c31b39aa3e 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -2,6 +2,7 @@ #define _AF_NETLINK_H #include +#include #include #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index c18d3f5624b2..ee57459fc258 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -23,6 +23,9 @@ static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */ static DECLARE_RWSEM(cb_lock); +atomic_t genl_sk_destructing_cnt = ATOMIC_INIT(0); +DECLARE_WAIT_QUEUE_HEAD(genl_sk_destructing_waitq); + void genl_lock(void) { mutex_lock(&genl_mutex); @@ -435,15 +438,18 @@ int genl_unregister_family(struct genl_family *family) genl_lock_all(); - genl_unregister_mc_groups(family); - list_for_each_entry(rc, genl_family_chain(family->id), family_list) { if (family->id != rc->id || strcmp(rc->name, family->name)) continue; + genl_unregister_mc_groups(family); + list_del(&rc->family_list); family->n_ops = 0; - genl_unlock_all(); + up_write(&cb_lock); + wait_event(genl_sk_destructing_waitq, + atomic_read(&genl_sk_destructing_cnt) == 0); + genl_unlock(); kfree(family->attrbuf); genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0); @@ -1014,7 +1020,6 @@ static int genl_bind(struct net *net, int group) static void genl_unbind(struct net *net, int group) { int i; - bool found = false; down_read(&cb_lock); for (i = 0; i < GENL_FAM_TAB_SIZE; i++) { @@ -1027,14 +1032,11 @@ static void genl_unbind(struct net *net, int group) if (f->mcast_unbind) f->mcast_unbind(net, fam_grp); - found = true; break; } } } up_read(&cb_lock); - - WARN_ON(!found); } static int __net_init genl_pernet_init(struct net *net) -- cgit v1.2.3 From 2061dcd6bff8b774b4fac8b0739b6be3f87bc9f2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 15 Jan 2015 16:34:35 +0100 Subject: net: sctp: fix race for one-to-many sockets in sendmsg's auto associate I.e. one-to-many sockets in SCTP are not required to explicitly call into connect(2) or sctp_connectx(2) prior to data exchange. Instead, they can directly invoke sendmsg(2) and the SCTP stack will automatically trigger connection establishment through 4WHS via sctp_primitive_ASSOCIATE(). However, this in its current implementation is racy: INIT is being sent out immediately (as it cannot be bundled anyway) and the rest of the DATA chunks are queued up for later xmit when connection is established, meaning sendmsg(2) will return successfully. This behaviour can result in an undesired side-effect that the kernel made the application think the data has already been transmitted, although none of it has actually left the machine, worst case even after close(2)'ing the socket. Instead, when the association from client side has been shut down e.g. first gracefully through SCTP_EOF and then close(2), the client could afterwards still receive the server's INIT_ACK due to a connection with higher latency. This INIT_ACK is then considered out of the blue and hence responded with ABORT as there was no alive assoc found anymore. This can be easily reproduced f.e. with sctp_test application from lksctp. One way to fix this race is to wait for the handshake to actually complete. The fix defers waiting after sctp_primitive_ASSOCIATE() and sctp_primitive_SEND() succeeded, so that DATA chunks cooked up from sctp_sendmsg() have already been placed into the output queue through the side-effect interpreter, and therefore can then be bundeled together with COOKIE_ECHO control chunks. strace from example application (shortened): socket(PF_INET, SOCK_SEQPACKET, IPPROTO_SCTP) = 3 sendmsg(3, {msg_name(28)={sa_family=AF_INET, sin_port=htons(8888), sin_addr=inet_addr("192.168.1.115")}, msg_iov(1)=[{"hello", 5}], msg_controllen=0, msg_flags=0}, 0) = 5 sendmsg(3, {msg_name(28)={sa_family=AF_INET, sin_port=htons(8888), sin_addr=inet_addr("192.168.1.115")}, msg_iov(1)=[{"hello", 5}], msg_controllen=0, msg_flags=0}, 0) = 5 sendmsg(3, {msg_name(28)={sa_family=AF_INET, sin_port=htons(8888), sin_addr=inet_addr("192.168.1.115")}, msg_iov(1)=[{"hello", 5}], msg_controllen=0, msg_flags=0}, 0) = 5 sendmsg(3, {msg_name(28)={sa_family=AF_INET, sin_port=htons(8888), sin_addr=inet_addr("192.168.1.115")}, msg_iov(1)=[{"hello", 5}], msg_controllen=0, msg_flags=0}, 0) = 5 sendmsg(3, {msg_name(28)={sa_family=AF_INET, sin_port=htons(8888), sin_addr=inet_addr("192.168.1.115")}, msg_iov(0)=[], msg_controllen=48, {cmsg_len=48, cmsg_level=0x84 /* SOL_??? */, cmsg_type=, ...}, msg_flags=0}, 0) = 0 // graceful shutdown for SOCK_SEQPACKET via SCTP_EOF close(3) = 0 tcpdump before patch (fooling the application): 22:33:36.306142 IP 192.168.1.114.41462 > 192.168.1.115.8888: sctp (1) [INIT] [init tag: 3879023686] [rwnd: 106496] [OS: 10] [MIS: 65535] [init TSN: 3139201684] 22:33:36.316619 IP 192.168.1.115.8888 > 192.168.1.114.41462: sctp (1) [INIT ACK] [init tag: 3345394793] [rwnd: 106496] [OS: 10] [MIS: 10] [init TSN: 3380109591] 22:33:36.317600 IP 192.168.1.114.41462 > 192.168.1.115.8888: sctp (1) [ABORT] tcpdump after patch: 14:28:58.884116 IP 192.168.1.114.35846 > 192.168.1.115.8888: sctp (1) [INIT] [init tag: 438593213] [rwnd: 106496] [OS: 10] [MIS: 65535] [init TSN: 3092969729] 14:28:58.888414 IP 192.168.1.115.8888 > 192.168.1.114.35846: sctp (1) [INIT ACK] [init tag: 381429855] [rwnd: 106496] [OS: 10] [MIS: 10] [init TSN: 2141904492] 14:28:58.888638 IP 192.168.1.114.35846 > 192.168.1.115.8888: sctp (1) [COOKIE ECHO] , (2) [DATA] (B)(E) [TSN: 3092969729] [...] 14:28:58.893278 IP 192.168.1.115.8888 > 192.168.1.114.35846: sctp (1) [COOKIE ACK] , (2) [SACK] [cum ack 3092969729] [a_rwnd 106491] [#gap acks 0] [#dup tsns 0] 14:28:58.893591 IP 192.168.1.114.35846 > 192.168.1.115.8888: sctp (1) [DATA] (B)(E) [TSN: 3092969730] [...] 14:28:59.096963 IP 192.168.1.115.8888 > 192.168.1.114.35846: sctp (1) [SACK] [cum ack 3092969730] [a_rwnd 106496] [#gap acks 0] [#dup tsns 0] 14:28:59.097086 IP 192.168.1.114.35846 > 192.168.1.115.8888: sctp (1) [DATA] (B)(E) [TSN: 3092969731] [...] , (2) [DATA] (B)(E) [TSN: 3092969732] [...] 14:28:59.103218 IP 192.168.1.115.8888 > 192.168.1.114.35846: sctp (1) [SACK] [cum ack 3092969732] [a_rwnd 106486] [#gap acks 0] [#dup tsns 0] 14:28:59.103330 IP 192.168.1.114.35846 > 192.168.1.115.8888: sctp (1) [SHUTDOWN] 14:28:59.107793 IP 192.168.1.115.8888 > 192.168.1.114.35846: sctp (1) [SHUTDOWN ACK] 14:28:59.107890 IP 192.168.1.114.35846 > 192.168.1.115.8888: sctp (1) [SHUTDOWN COMPLETE] Looks like this bug is from the pre-git history museum. ;) Fixes: 08707d5482df ("lksctp-2_5_31-0_5_1.patch") Signed-off-by: Daniel Borkmann Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/socket.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 2625eccb77d5..aafe94bf292e 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1603,7 +1603,7 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, sctp_assoc_t associd = 0; sctp_cmsgs_t cmsgs = { NULL }; sctp_scope_t scope; - bool fill_sinfo_ttl = false; + bool fill_sinfo_ttl = false, wait_connect = false; struct sctp_datamsg *datamsg; int msg_flags = msg->msg_flags; __u16 sinfo_flags = 0; @@ -1943,6 +1943,7 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, if (err < 0) goto out_free; + wait_connect = true; pr_debug("%s: we associated primitively\n", __func__); } @@ -1980,6 +1981,11 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, sctp_datamsg_put(datamsg); err = msg_len; + if (unlikely(wait_connect)) { + timeo = sock_sndtimeo(sk, msg_flags & MSG_DONTWAIT); + sctp_wait_for_connect(asoc, &timeo); + } + /* If we are already past ASSOCIATE, the lower * layers are responsible for association cleanup. */ -- cgit v1.2.3 From 9d289715eb5c252ae15bd547cb252ca547a3c4f2 Mon Sep 17 00:00:00 2001 From: Hagen Paul Pfeifer Date: Thu, 15 Jan 2015 22:34:25 +0100 Subject: ipv6: stop sending PTB packets for MTU < 1280 Reduce the attack vector and stop generating IPv6 Fragment Header for paths with an MTU smaller than the minimum required IPv6 MTU size (1280 byte) - called atomic fragments. See IETF I-D "Deprecating the Generation of IPv6 Atomic Fragments" [1] for more information and how this "feature" can be misused. [1] https://tools.ietf.org/html/draft-ietf-6man-deprecate-atomfrag-generation-00 Signed-off-by: Fernando Gont Signed-off-by: Hagen Paul Pfeifer Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/route.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c91083156edb..166e33bed222 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1160,12 +1160,9 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct net *net = dev_net(dst->dev); rt6->rt6i_flags |= RTF_MODIFIED; - if (mtu < IPV6_MIN_MTU) { - u32 features = dst_metric(dst, RTAX_FEATURES); + if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - features |= RTAX_FEATURE_ALLFRAG; - dst_metric_set(dst, RTAX_FEATURES, features); - } + dst_metric_set(dst, RTAX_MTU, mtu); rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires); } -- cgit v1.2.3 From 2af81d6718f5ec92b1d787e0fe79b0d3b6f78601 Mon Sep 17 00:00:00 2001 From: Luciano Coelho Date: Wed, 21 Jan 2015 22:19:34 +0200 Subject: mac80211: only roll back station states for WDS when suspending In normal cases (i.e. when we are fully associated), cfg80211 takes care of removing all the stations before calling suspend in mac80211. But in the corner case when we suspend during authentication or association, mac80211 needs to roll back the station states. But we shouldn't roll back the station states in the suspend function, because this is taken care of in other parts of the code, except for WDS interfaces. For AP types of interfaces, cfg80211 takes care of disconnecting all stations before calling the driver's suspend code. For station interfaces, this is done in the quiesce code. For WDS interfaces we still need to do it here, so move the code into a new switch case for WDS. Cc: stable@kernel.org [3.15+] Signed-off-by: Luciano Coelho Signed-off-by: Johannes Berg --- net/mac80211/pm.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index 4c5192e0d66c..4a95fe3cffbc 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -86,20 +86,6 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) } } - /* tear down aggregation sessions and remove STAs */ - mutex_lock(&local->sta_mtx); - list_for_each_entry(sta, &local->sta_list, list) { - if (sta->uploaded) { - enum ieee80211_sta_state state; - - state = sta->sta_state; - for (; state > IEEE80211_STA_NOTEXIST; state--) - WARN_ON(drv_sta_state(local, sta->sdata, sta, - state, state - 1)); - } - } - mutex_unlock(&local->sta_mtx); - /* remove all interfaces that were created in the driver */ list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) @@ -111,6 +97,21 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) case NL80211_IFTYPE_STATION: ieee80211_mgd_quiesce(sdata); break; + case NL80211_IFTYPE_WDS: + /* tear down aggregation sessions and remove STAs */ + mutex_lock(&local->sta_mtx); + sta = sdata->u.wds.sta; + if (sta && sta->uploaded) { + enum ieee80211_sta_state state; + + state = sta->sta_state; + for (; state > IEEE80211_STA_NOTEXIST; state--) + WARN_ON(drv_sta_state(local, sta->sdata, + sta, state, + state - 1)); + } + mutex_unlock(&local->sta_mtx); + break; default: break; } -- cgit v1.2.3 From fb142f4bbb7d718b3d9cc8f27c909b4809545f5c Mon Sep 17 00:00:00 2001 From: Fred Chou Date: Tue, 20 Jan 2015 10:17:27 +0800 Subject: mac80211: correct header length calculation HT Control field may also be present in management frames, as defined in 8.2.4.1.10 of 802.11-2012. Account for this in calculation of header length. Signed-off-by: Fred Chou Signed-off-by: Johannes Berg --- net/wireless/util.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index d0ac795445b7..5488c3662f7d 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -308,6 +308,12 @@ unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc) goto out; } + if (ieee80211_is_mgmt(fc)) { + if (ieee80211_has_order(fc)) + hdrlen += IEEE80211_HT_CTL_LEN; + goto out; + } + if (ieee80211_is_ctl(fc)) { /* * ACK and CTS are 10 bytes, all others 16. To see how -- cgit v1.2.3 From 3a5c5e81d8128a9e43abc52b75dd21d3da7a0cfc Mon Sep 17 00:00:00 2001 From: Mathy Vanhoef Date: Tue, 20 Jan 2015 15:05:08 +0100 Subject: mac80211: properly set CCK flag in radiotap Fix a regression introduced by commit a5e70697d0c4 ("mac80211: add radiotap flag and handling for 5/10 MHz") where the IEEE80211_CHAN_CCK channel type flag was incorrectly replaced by the IEEE80211_CHAN_OFDM flag. This commit fixes that by using the CCK flag again. Cc: stable@vger.kernel.org Fixes: a5e70697d0c4 ("mac80211: add radiotap flag and handling for 5/10 MHz") Signed-off-by: Mathy Vanhoef Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 683b10f46505..d69ca513848e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -272,7 +272,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, else if (rate && rate->flags & IEEE80211_RATE_ERP_G) channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_2GHZ; else if (rate) - channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_2GHZ; + channel_flags |= IEEE80211_CHAN_CCK | IEEE80211_CHAN_2GHZ; else channel_flags |= IEEE80211_CHAN_2GHZ; put_unaligned_le16(channel_flags, pos); -- cgit v1.2.3 From 0fa7b39131576dd1baa6ca17fca53c65d7f62249 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 23 Jan 2015 11:10:12 +0100 Subject: nl80211: fix per-station group key get/del and memory leak In case userspace attempts to obtain key information for or delete a unicast key, this is currently erroneously rejected unless the driver sets the WIPHY_FLAG_IBSS_RSN flag. Apparently enough drivers do so it was never noticed. Fix that, and while at it fix a potential memory leak: the error path in the get_key() function was placed after allocating a message but didn't free it - move it to a better place. Luckily admin permissions are needed to call this operation. Cc: stable@vger.kernel.org Fixes: e31b82136d1ad ("cfg80211/mac80211: allow per-station GTKs") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7ca4b5133123..8887c6e5fca8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2854,6 +2854,9 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->get_key) return -EOPNOTSUPP; + if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) + return -ENOENT; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -2873,10 +2876,6 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr)) goto nla_put_failure; - if (pairwise && mac_addr && - !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) - return -ENOENT; - err = rdev_get_key(rdev, dev, key_idx, pairwise, mac_addr, &cookie, get_key_callback); @@ -3047,7 +3046,7 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) wdev_lock(dev->ieee80211_ptr); err = nl80211_key_allowed(dev->ieee80211_ptr); - if (key.type == NL80211_KEYTYPE_PAIRWISE && mac_addr && + if (key.type == NL80211_KEYTYPE_GROUP && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) err = -ENOENT; -- cgit v1.2.3 From 6b8d9117ccb4f81b1244aafa7bc70ef8fa45fc49 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Fri, 23 Jan 2015 20:47:00 -0500 Subject: net: llc: use correct size for sysctl timeout entries The timeout entries are sizeof(int) rather than sizeof(long), which means that when they were getting read we'd also leak kernel memory to userspace along with the timeout values. Signed-off-by: Sasha Levin Signed-off-by: David S. Miller --- net/llc/sysctl_net_llc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/llc/sysctl_net_llc.c b/net/llc/sysctl_net_llc.c index 612a5ddaf93b..799bafc2af39 100644 --- a/net/llc/sysctl_net_llc.c +++ b/net/llc/sysctl_net_llc.c @@ -18,28 +18,28 @@ static struct ctl_table llc2_timeout_table[] = { { .procname = "ack", .data = &sysctl_llc2_ack_timeout, - .maxlen = sizeof(long), + .maxlen = sizeof(sysctl_llc2_ack_timeout), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "busy", .data = &sysctl_llc2_busy_timeout, - .maxlen = sizeof(long), + .maxlen = sizeof(sysctl_llc2_busy_timeout), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "p", .data = &sysctl_llc2_p_timeout, - .maxlen = sizeof(long), + .maxlen = sizeof(sysctl_llc2_p_timeout), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "rej", .data = &sysctl_llc2_rej_timeout, - .maxlen = sizeof(long), + .maxlen = sizeof(sysctl_llc2_rej_timeout), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, -- cgit v1.2.3 From 24df8986f36b9a5d8ae08236498d92267bac454b Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 20 Jan 2015 19:13:32 -0500 Subject: net: dsa: set slave MII bus PHY mask When registering a mdio bus, Linux assumes than every port has a PHY and tries to scan it. If a switch port has no PHY registered, DSA will fail to register the slave MII bus. To fix this, set the slave MII bus PHY mask to the switch PHYs mask. As an example, if we use a Marvell MV88E6352 (which is a 7-port switch with no registered PHYs for port 5 and port 6), with the following declared names: static struct dsa_chip_data switch_cdata = { [...] .port_names[0] = "sw0", .port_names[1] = "sw1", .port_names[2] = "sw2", .port_names[3] = "sw3", .port_names[4] = "sw4", .port_names[5] = "cpu", }; DSA will fail to create the switch instance. With the PHY mask set for the slave MII bus, only the PHY for ports 0-4 will be scanned and the instance will be successfully created. Signed-off-by: Vivien Didelot Tested-by: Florian Fainelli Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 515569ffde8a..589aafd01fc5 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -46,6 +46,7 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds) snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d:%.2x", ds->index, ds->pd->sw_addr); ds->slave_mii_bus->parent = ds->master_dev; + ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask; } -- cgit v1.2.3 From b0a1ba59921eaaa9cb8f97bb35f2e6870fcdfedc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 20 Jan 2015 19:16:02 -0800 Subject: ipv6: Fix __ip6_route_redirect In my last commit (a3c00e4: ipv6: Remove BACKTRACK macro), the changes in __ip6_route_redirect is incorrect. The following case is missed: 1. The for loop tries to find a valid gateway rt. If it fails to find one, rt will be NULL. 2. When rt is NULL, it is set to the ip6_null_entry. 3. The newly added 'else if', from a3c00e4, will stop the backtrack from happening. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 166e33bed222..495965358d22 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1242,12 +1242,16 @@ restart: rt = net->ipv6.ip6_null_entry; else if (rt->dst.error) { rt = net->ipv6.ip6_null_entry; - } else if (rt == net->ipv6.ip6_null_entry) { + goto out; + } + + if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; } +out: dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); -- cgit v1.2.3 From 7913ecf69e24bd7575e0d0325eda3b43c8cfa749 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 22 Jan 2015 10:41:01 +0100 Subject: net: cls_bpf: fix size mismatch on filter preparation In cls_bpf_modify_existing(), we read out the number of filter blocks, do some sanity checks, allocate a block on that size, and copy over the BPF instruction blob from user space, then pass everything through the classic BPF checker prior to installation of the classifier. We should reject mismatches here, there are 2 scenarios: the number of filter blocks could be smaller than the provided instruction blob, so we do a partial copy of the BPF program, and thus the instructions will either be rejected from the verifier or a valid BPF program will be run; in the other case, we'll end up copying more than we're supposed to, and most likely the trailing garbage will be rejected by the verifier as well (i.e. we need to fit instruction pattern, ret {A,K} needs to be last instruction, load/stores must be correct, etc); in case not, we would leak memory when dumping back instruction patterns. The code should have only used nla_len() as Dave noted to avoid this from the beginning. Anyway, lets fix it by rejecting such load attempts. Fixes: 7d1d65cb84e1 ("net: sched: cls_bpf: add BPF-based classifier") Signed-off-by: Daniel Borkmann Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 84c8219c3e1c..49e5fa8795ae 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -180,6 +180,11 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, } bpf_size = bpf_len * sizeof(*bpf_ops); + if (bpf_size != nla_len(tb[TCA_BPF_OPS])) { + ret = -EINVAL; + goto errout; + } + bpf_ops = kzalloc(bpf_size, GFP_KERNEL); if (bpf_ops == NULL) { ret = -ENOMEM; -- cgit v1.2.3 From 3f2ab135946dcd4eb6af92a53d6d4bd35e7526ca Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 22 Jan 2015 10:41:02 +0100 Subject: net: cls_bpf: fix auto generation of per list handles When creating a bpf classifier in tc with priority collisions and invoking automatic unique handle assignment, cls_bpf_grab_new_handle() will return a wrong handle id which in fact is non-unique. Usually altering of specific filters is being addressed over major id, but in case of collisions we result in a filter chain, where handle ids address individual cls_bpf_progs inside the classifier. Issue is, in cls_bpf_grab_new_handle() we probe for head->hgen handle in cls_bpf_get() and in case we found a free handle, we're supposed to use exactly head->hgen. In case of insufficient numbers of handles, we bail out later as handle id 0 is not allowed. Fixes: 7d1d65cb84e1 ("net: sched: cls_bpf: add BPF-based classifier") Signed-off-by: Daniel Borkmann Acked-by: Jiri Pirko Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 49e5fa8795ae..f59adf8a4cd7 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -220,15 +220,21 @@ static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, struct cls_bpf_head *head) { unsigned int i = 0x80000000; + u32 handle; do { if (++head->hgen == 0x7FFFFFFF) head->hgen = 1; } while (--i > 0 && cls_bpf_get(tp, head->hgen)); - if (i == 0) + + if (unlikely(i == 0)) { pr_err("Insufficient number of handles\n"); + handle = 0; + } else { + handle = head->hgen; + } - return i; + return handle; } static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, -- cgit v1.2.3 From 600ddd6825543962fb807884169e57b580dba208 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 22 Jan 2015 18:26:54 +0100 Subject: net: sctp: fix slab corruption from use after free on INIT collisions When hitting an INIT collision case during the 4WHS with AUTH enabled, as already described in detail in commit 1be9a950c646 ("net: sctp: inherit auth_capable on INIT collisions"), it can happen that we occasionally still remotely trigger the following panic on server side which seems to have been uncovered after the fix from commit 1be9a950c646 ... [ 533.876389] BUG: unable to handle kernel paging request at 00000000ffffffff [ 533.913657] IP: [] __kmalloc+0x95/0x230 [ 533.940559] PGD 5030f2067 PUD 0 [ 533.957104] Oops: 0000 [#1] SMP [ 533.974283] Modules linked in: sctp mlx4_en [...] [ 534.939704] Call Trace: [ 534.951833] [] ? crypto_init_shash_ops+0x60/0xf0 [ 534.984213] [] crypto_init_shash_ops+0x60/0xf0 [ 535.015025] [] __crypto_alloc_tfm+0x6d/0x170 [ 535.045661] [] crypto_alloc_base+0x4c/0xb0 [ 535.074593] [] ? _raw_spin_lock_bh+0x12/0x50 [ 535.105239] [] sctp_inet_listen+0x161/0x1e0 [sctp] [ 535.138606] [] SyS_listen+0x9d/0xb0 [ 535.166848] [] system_call_fastpath+0x16/0x1b ... or depending on the the application, for example this one: [ 1370.026490] BUG: unable to handle kernel paging request at 00000000ffffffff [ 1370.026506] IP: [] kmem_cache_alloc+0x75/0x1d0 [ 1370.054568] PGD 633c94067 PUD 0 [ 1370.070446] Oops: 0000 [#1] SMP [ 1370.085010] Modules linked in: sctp kvm_amd kvm [...] [ 1370.963431] Call Trace: [ 1370.974632] [] ? SyS_epoll_ctl+0x53f/0x960 [ 1371.000863] [] SyS_epoll_ctl+0x53f/0x960 [ 1371.027154] [] ? anon_inode_getfile+0xd3/0x170 [ 1371.054679] [] ? __alloc_fd+0xa7/0x130 [ 1371.080183] [] system_call_fastpath+0x16/0x1b With slab debugging enabled, we can see that the poison has been overwritten: [ 669.826368] BUG kmalloc-128 (Tainted: G W ): Poison overwritten [ 669.826385] INFO: 0xffff880228b32e50-0xffff880228b32e50. First byte 0x6a instead of 0x6b [ 669.826414] INFO: Allocated in sctp_auth_create_key+0x23/0x50 [sctp] age=3 cpu=0 pid=18494 [ 669.826424] __slab_alloc+0x4bf/0x566 [ 669.826433] __kmalloc+0x280/0x310 [ 669.826453] sctp_auth_create_key+0x23/0x50 [sctp] [ 669.826471] sctp_auth_asoc_create_secret+0xcb/0x1e0 [sctp] [ 669.826488] sctp_auth_asoc_init_active_key+0x68/0xa0 [sctp] [ 669.826505] sctp_do_sm+0x29d/0x17c0 [sctp] [...] [ 669.826629] INFO: Freed in kzfree+0x31/0x40 age=1 cpu=0 pid=18494 [ 669.826635] __slab_free+0x39/0x2a8 [ 669.826643] kfree+0x1d6/0x230 [ 669.826650] kzfree+0x31/0x40 [ 669.826666] sctp_auth_key_put+0x19/0x20 [sctp] [ 669.826681] sctp_assoc_update+0x1ee/0x2d0 [sctp] [ 669.826695] sctp_do_sm+0x674/0x17c0 [sctp] Since this only triggers in some collision-cases with AUTH, the problem at heart is that sctp_auth_key_put() on asoc->asoc_shared_key is called twice when having refcnt 1, once directly in sctp_assoc_update() and yet again from within sctp_auth_asoc_init_active_key() via sctp_assoc_update() on the already kzfree'd memory, which is also consistent with the observation of the poison decrease from 0x6b to 0x6a (note: the overwrite is detected at a later point in time when poison is checked on new allocation). Reference counting of auth keys revisited: Shared keys for AUTH chunks are being stored in endpoints and associations in endpoint_shared_keys list. On endpoint creation, a null key is being added; on association creation, all endpoint shared keys are being cached and thus cloned over to the association. struct sctp_shared_key only holds a pointer to the actual key bytes, that is, struct sctp_auth_bytes which keeps track of users internally through refcounting. Naturally, on assoc or enpoint destruction, sctp_shared_key are being destroyed directly and the reference on sctp_auth_bytes dropped. User space can add keys to either list via setsockopt(2) through struct sctp_authkey and by passing that to sctp_auth_set_key() which replaces or adds a new auth key. There, sctp_auth_create_key() creates a new sctp_auth_bytes with refcount 1 and in case of replacement drops the reference on the old sctp_auth_bytes. A key can be set active from user space through setsockopt() on the id via sctp_auth_set_active_key(), which iterates through either endpoint_shared_keys and in case of an assoc, invokes (one of various places) sctp_auth_asoc_init_active_key(). sctp_auth_asoc_init_active_key() computes the actual secret from local's and peer's random, hmac and shared key parameters and returns a new key directly as sctp_auth_bytes, that is asoc->asoc_shared_key, plus drops the reference if there was a previous one. The secret, which where we eventually double drop the ref comes from sctp_auth_asoc_set_secret() with intitial refcount of 1, which also stays unchanged eventually in sctp_assoc_update(). This key is later being used for crypto layer to set the key for the hash in crypto_hash_setkey() from sctp_auth_calculate_hmac(). To close the loop: asoc->asoc_shared_key is freshly allocated secret material and independant of the sctp_shared_key management keeping track of only shared keys in endpoints and assocs. Hence, also commit 4184b2a79a76 ("net: sctp: fix memory leak in auth key management") is independant of this bug here since it concerns a different layer (though same structures being used eventually). asoc->asoc_shared_key is reference dropped correctly on assoc destruction in sctp_association_free() and when active keys are being replaced in sctp_auth_asoc_init_active_key(), it always has a refcount of 1. Hence, it's freed prematurely in sctp_assoc_update(). Simple fix is to remove that sctp_auth_key_put() from there which fixes these panics. Fixes: 730fc3d05cd4 ("[SCTP]: Implete SCTP-AUTH parameter processing") Signed-off-by: Daniel Borkmann Acked-by: Vlad Yasevich Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/associola.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index f791edd64d6c..26d06dbcc1c8 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1182,7 +1182,6 @@ void sctp_assoc_update(struct sctp_association *asoc, asoc->peer.peer_hmacs = new->peer.peer_hmacs; new->peer.peer_hmacs = NULL; - sctp_auth_key_put(asoc->asoc_shared_key); sctp_auth_asoc_init_active_key(asoc, GFP_ATOMIC); } -- cgit v1.2.3 From df4d92549f23e1c037e83323aff58a21b3de7fe0 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Fri, 23 Jan 2015 12:01:26 +0100 Subject: ipv4: try to cache dst_entries which would cause a redirect Not caching dst_entries which cause redirects could be exploited by hosts on the same subnet, causing a severe DoS attack. This effect aggravated since commit f88649721268999 ("ipv4: fix dst race in sk_dst_get()"). Lookups causing redirects will be allocated with DST_NOCACHE set which will force dst_release to free them via RCU. Unfortunately waiting for RCU grace period just takes too long, we can end up with >1M dst_entries waiting to be released and the system will run OOM. rcuos threads cannot catch up under high softirq load. Attaching the flag to emit a redirect later on to the specific skb allows us to cache those dst_entries thus reducing the pressure on allocation and deallocation. This issue was discovered by Marcelo Leitner. Cc: Julian Anastasov Signed-off-by: Marcelo Leitner Signed-off-by: Florian Westphal Signed-off-by: Hannes Frederic Sowa Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- include/net/ip.h | 11 ++++++----- net/ipv4/ip_forward.c | 3 ++- net/ipv4/route.c | 9 +++++---- 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index 0bb620702929..f7cbd703d15d 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -39,11 +39,12 @@ struct inet_skb_parm { struct ip_options opt; /* Compiled IP options */ unsigned char flags; -#define IPSKB_FORWARDED 1 -#define IPSKB_XFRM_TUNNEL_SIZE 2 -#define IPSKB_XFRM_TRANSFORMED 4 -#define IPSKB_FRAG_COMPLETE 8 -#define IPSKB_REROUTED 16 +#define IPSKB_FORWARDED BIT(0) +#define IPSKB_XFRM_TUNNEL_SIZE BIT(1) +#define IPSKB_XFRM_TRANSFORMED BIT(2) +#define IPSKB_FRAG_COMPLETE BIT(3) +#define IPSKB_REROUTED BIT(4) +#define IPSKB_DOREDIRECT BIT(5) u16 frag_max_size; }; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 3a83ce5efa80..787b3c294ce6 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -129,7 +129,8 @@ int ip_forward(struct sk_buff *skb) * We now generate an ICMP HOST REDIRECT giving the route * we calculated. */ - if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb)) + if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr && + !skb_sec_path(skb)) ip_rt_send_redirect(skb); skb->priority = rt_tos2priority(iph->tos); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6a2155b02602..d58dd0ec3e53 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1554,11 +1554,10 @@ static int __mkroute_input(struct sk_buff *skb, do_cache = res->fi && !itag; if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && + skb->protocol == htons(ETH_P_IP) && (IN_DEV_SHARED_MEDIA(out_dev) || - inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) { - flags |= RTCF_DOREDIRECT; - do_cache = false; - } + inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) + IPCB(skb)->flags |= IPSKB_DOREDIRECT; if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is @@ -2303,6 +2302,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; + if (IPCB(skb)->flags & IPSKB_DOREDIRECT) + r->rtm_flags |= RTCF_DOREDIRECT; if (nla_put_be32(skb, RTA_DST, dst)) goto nla_put_failure; -- cgit v1.2.3 From 86f3cddbc3037882414c7308973530167906b7e9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 24 Jan 2015 08:02:40 +1100 Subject: udp_diag: Fix socket skipping within chain While working on rhashtable walking I noticed that the UDP diag dumping code is buggy. In particular, the socket skipping within a chain never happens, even though we record the number of sockets that should be skipped. As this code was supposedly copied from TCP, this patch does what TCP does and resets num before we walk a chain. Signed-off-by: Herbert Xu Acked-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/udp_diag.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 7927db0a9279..4a000f1dd757 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -99,11 +99,13 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin s_slot = cb->args[0]; num = s_num = cb->args[1]; - for (slot = s_slot; slot <= table->mask; num = s_num = 0, slot++) { + for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) { struct sock *sk; struct hlist_nulls_node *node; struct udp_hslot *hslot = &table->hash[slot]; + num = 0; + if (hlist_nulls_empty(&hslot->head)) continue; -- cgit v1.2.3 From fc752f1f43c1c038a2c6ae58cc739ebb5953ccb0 Mon Sep 17 00:00:00 2001 From: "subashab@codeaurora.org" Date: Fri, 23 Jan 2015 22:26:02 +0000 Subject: ping: Fix race in free in receive path An exception is seen in ICMP ping receive path where the skb destructor sock_rfree() tries to access a freed socket. This happens because ping_rcv() releases socket reference with sock_put() and this internally frees up the socket. Later icmp_rcv() will try to free the skb and as part of this, skb destructor is called and which leads to a kernel panic as the socket is freed already in ping_rcv(). -->|exception -007|sk_mem_uncharge -007|sock_rfree -008|skb_release_head_state -009|skb_release_all -009|__kfree_skb -010|kfree_skb -011|icmp_rcv -012|ip_local_deliver_finish Fix this incorrect free by cloning this skb and processing this cloned skb instead. This patch was suggested by Eric Dumazet Signed-off-by: Subash Abhinov Kasiviswanathan Cc: Eric Dumazet Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ping.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index c0d82f78d364..2a3720fb5a5f 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -966,8 +966,11 @@ bool ping_rcv(struct sk_buff *skb) sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); if (sk != NULL) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + pr_debug("rcv on socket %p\n", sk); - ping_queue_rcv_skb(sk, skb_get(skb)); + if (skb2) + ping_queue_rcv_skb(sk, skb2); sock_put(sk); return true; } -- cgit v1.2.3 From 6e9e16e6143b725662e47026a1d0f270721cdd24 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 26 Jan 2015 15:11:17 +0100 Subject: ipv6: replacing a rt6_info needs to purge possible propagated rt6_infos too Lubomir Rintel reported that during replacing a route the interface reference counter isn't correctly decremented. To quote bug : | [root@rhel7-5 lkundrak]# sh -x lal | + ip link add dev0 type dummy | + ip link set dev0 up | + ip link add dev1 type dummy | + ip link set dev1 up | + ip addr add 2001:db8:8086::2/64 dev dev0 | + ip route add 2001:db8:8086::/48 dev dev0 proto static metric 20 | + ip route add 2001:db8:8088::/48 dev dev1 proto static metric 10 | + ip route replace 2001:db8:8086::/48 dev dev1 proto static metric 20 | + ip link del dev0 type dummy | Message from syslogd@rhel7-5 at Jan 23 10:54:41 ... | kernel:unregister_netdevice: waiting for dev0 to become free. Usage count = 2 | | Message from syslogd@rhel7-5 at Jan 23 10:54:51 ... | kernel:unregister_netdevice: waiting for dev0 to become free. Usage count = 2 During replacement of a rt6_info we must walk all parent nodes and check if the to be replaced rt6_info got propagated. If so, replace it with an alive one. Fixes: 4a287eba2de3957 ("IPv6 routing, NLM_F_* flag support: REPLACE and EXCL flags support, warn about missing CREATE flag") Reported-by: Lubomir Rintel Signed-off-by: Hannes Frederic Sowa Tested-by: Lubomir Rintel Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index b2d1838897c9..f1c6d5e98322 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -659,6 +659,29 @@ static int fib6_commit_metrics(struct dst_entry *dst, return 0; } +static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, + struct net *net) +{ + if (atomic_read(&rt->rt6i_ref) != 1) { + /* This route is used as dummy address holder in some split + * nodes. It is not leaked, but it still holds other resources, + * which must be released in time. So, scan ascendant nodes + * and replace dummy references to this route with references + * to still alive ones. + */ + while (fn) { + if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) { + fn->leaf = fib6_find_prefix(net, fn); + atomic_inc(&fn->leaf->rt6i_ref); + rt6_release(rt); + } + fn = fn->parent; + } + /* No more references are possible at this point. */ + BUG_ON(atomic_read(&rt->rt6i_ref) != 1); + } +} + /* * Insert routing information in a node. */ @@ -807,11 +830,12 @@ add: rt->dst.rt6_next = iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); inet6_rt_notify(RTM_NEWROUTE, rt, info); - rt6_release(iter); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } + fib6_purge_rt(iter, fn, info->nl_net); + rt6_release(iter); } return 0; @@ -1322,24 +1346,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, fn = fib6_repair_tree(net, fn); } - if (atomic_read(&rt->rt6i_ref) != 1) { - /* This route is used as dummy address holder in some split - * nodes. It is not leaked, but it still holds other resources, - * which must be released in time. So, scan ascendant nodes - * and replace dummy references to this route with references - * to still alive ones. - */ - while (fn) { - if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) { - fn->leaf = fib6_find_prefix(net, fn); - atomic_inc(&fn->leaf->rt6i_ref); - rt6_release(rt); - } - fn = fn->parent; - } - /* No more references are possible at this point. */ - BUG_ON(atomic_read(&rt->rt6i_ref) != 1); - } + fib6_purge_rt(rt, fn, net); inet6_rt_notify(RTM_DELROUTE, rt, info); rt6_release(rt); -- cgit v1.2.3 From 06539d3071067ff146a9bffd1c801fa56d290909 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jan 2015 12:25:33 -0800 Subject: net: don't OOPS on socket aio Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/socket.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index a2c33a4dc7ba..418795caa897 100644 --- a/net/socket.c +++ b/net/socket.c @@ -869,9 +869,6 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos, static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb, struct sock_iocb *siocb) { - if (!is_sync_kiocb(iocb)) - BUG(); - siocb->kiocb = iocb; iocb->private = siocb; return siocb; -- cgit v1.2.3