From c319d50bfcf678c2857038276d9fab3c6646f3bf Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 30 Jul 2013 22:34:28 +0200 Subject: nl80211: fix another nl80211_fam.attrbuf race This is similar to the race Linus had reported, but in this case it's an older bug: nl80211_prepare_wdev_dump() uses the wiphy index in cb->args[0] as it is and thus parses the message over and over again instead of just once because 0 is the first valid wiphy index. Similar code in nl80211_testmode_dump() correctly offsets the wiphy_index by 1, do that here as well. Cc: stable@vger.kernel.org Reported-by: Ben Hutchings Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 25d217d90807..3fcba69817e5 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -441,10 +441,12 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb, goto out_unlock; } *rdev = wiphy_to_dev((*wdev)->wiphy); - cb->args[0] = (*rdev)->wiphy_idx; + /* 0 is the first index - add 1 to parse only once */ + cb->args[0] = (*rdev)->wiphy_idx + 1; cb->args[1] = (*wdev)->identifier; } else { - struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0]); + /* subtract the 1 again here */ + struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1); struct wireless_dev *tmp; if (!wiphy) { -- cgit v1.2.3 From b56e4b857c5210e848bfb80e074e5756a36cd523 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Wed, 31 Jul 2013 12:12:24 -0700 Subject: mac80211: fix infinite loop in ieee80211_determine_chantype Commit "3d9646d mac80211: fix channel selection bug" introduced a possible infinite loop by moving the out target above the chandef_downgrade while loop. When we downgrade to NL80211_CHAN_WIDTH_20_NOHT, we jump back up to re-run the while loop...indefinitely. Replace goto with break and carry on. This may not be sufficient to connect to the AP, but will at least keep the cpu from livelocking. Thanks to Derek Atkins as an extra pair of debugging eyes. Cc: stable@kernel.org Signed-off-by: Chris Wright Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index ae31968d42d3..e3e7d2be9e41 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -338,7 +338,7 @@ out: if (WARN_ON(chandef->width == NL80211_CHAN_WIDTH_20_NOHT)) { ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; - goto out; + break; } ret |= chandef_downgrade(chandef); -- cgit v1.2.3 From cb236d2d713cff83d024a82b836757d9e2b50715 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Jul 2013 23:07:43 +0200 Subject: mac80211: don't wait for TX status forever TX status notification can get lost, or the frames could get stuck on the queue, so don't wait for the callback from the driver forever and instead time out after half a second. Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index e3e7d2be9e41..e5c3cf405060 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -31,10 +31,12 @@ #include "led.h" #define IEEE80211_AUTH_TIMEOUT (HZ / 5) +#define IEEE80211_AUTH_TIMEOUT_LONG (HZ / 2) #define IEEE80211_AUTH_TIMEOUT_SHORT (HZ / 10) #define IEEE80211_AUTH_MAX_TRIES 3 #define IEEE80211_AUTH_WAIT_ASSOC (HZ * 5) #define IEEE80211_ASSOC_TIMEOUT (HZ / 5) +#define IEEE80211_ASSOC_TIMEOUT_LONG (HZ / 2) #define IEEE80211_ASSOC_TIMEOUT_SHORT (HZ / 10) #define IEEE80211_ASSOC_MAX_TRIES 3 @@ -3394,10 +3396,13 @@ static int ieee80211_probe_auth(struct ieee80211_sub_if_data *sdata) if (tx_flags == 0) { auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT; - ifmgd->auth_data->timeout_started = true; + auth_data->timeout_started = true; run_again(sdata, auth_data->timeout); } else { - auth_data->timeout_started = false; + auth_data->timeout = + round_jiffies_up(jiffies + IEEE80211_AUTH_TIMEOUT_LONG); + auth_data->timeout_started = true; + run_again(sdata, auth_data->timeout); } return 0; @@ -3434,7 +3439,11 @@ static int ieee80211_do_assoc(struct ieee80211_sub_if_data *sdata) assoc_data->timeout_started = true; run_again(sdata, assoc_data->timeout); } else { - assoc_data->timeout_started = false; + assoc_data->timeout = + round_jiffies_up(jiffies + + IEEE80211_ASSOC_TIMEOUT_LONG); + assoc_data->timeout_started = true; + run_again(sdata, assoc_data->timeout); } return 0; -- cgit v1.2.3 From 5cdaed1e878d723d56d04ae0be1738124acf9f46 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 31 Jul 2013 11:23:06 +0200 Subject: mac80211: ignore HT primary channel while connected While we're connected, the AP shouldn't change the primary channel in the HT information. We checked this, and dropped the connection if it did change it. Unfortunately, this is causing problems on some APs, e.g. on the Netgear WRT610NL: the beacons seem to always contain a bad channel and if we made a connection using a probe response (correct data) we drop the connection immediately and can basically not connect properly at all. Work around this by ignoring the HT primary channel information in beacons if we're already connected. Also print out more verbose messages in the other situations to help diagnose similar bugs quicker in the future. Cc: stable@vger.kernel.org [3.10] Acked-by: Andy Isaacson Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index e5c3cf405060..077a95360830 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -211,8 +211,9 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, - struct cfg80211_chan_def *chandef, bool verbose) + struct cfg80211_chan_def *chandef, bool tracking) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct cfg80211_chan_def vht_chandef; u32 ht_cfreq, ret; @@ -231,7 +232,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, ht_cfreq = ieee80211_channel_to_frequency(ht_oper->primary_chan, channel->band); /* check that channel matches the right operating channel */ - if (channel->center_freq != ht_cfreq) { + if (!tracking && channel->center_freq != ht_cfreq) { /* * It's possible that some APs are confused here; * Netgear WNDR3700 sometimes reports 4 higher than @@ -239,11 +240,10 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, * since we look at probe response/beacon data here * it should be OK. */ - if (verbose) - sdata_info(sdata, - "Wrong control channel: center-freq: %d ht-cfreq: %d ht->primary_chan: %d band: %d - Disabling HT\n", - channel->center_freq, ht_cfreq, - ht_oper->primary_chan, channel->band); + sdata_info(sdata, + "Wrong control channel: center-freq: %d ht-cfreq: %d ht->primary_chan: %d band: %d - Disabling HT\n", + channel->center_freq, ht_cfreq, + ht_oper->primary_chan, channel->band); ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; goto out; } @@ -297,7 +297,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, channel->band); break; default: - if (verbose) + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) sdata_info(sdata, "AP VHT operation IE has invalid channel width (%d), disable VHT\n", vht_oper->chan_width); @@ -306,7 +306,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, } if (!cfg80211_chandef_valid(&vht_chandef)) { - if (verbose) + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) sdata_info(sdata, "AP VHT information is invalid, disable VHT\n"); ret = IEEE80211_STA_DISABLE_VHT; @@ -319,7 +319,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, } if (!cfg80211_chandef_compatible(chandef, &vht_chandef)) { - if (verbose) + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) sdata_info(sdata, "AP VHT information doesn't match HT, disable VHT\n"); ret = IEEE80211_STA_DISABLE_VHT; @@ -346,7 +346,7 @@ out: ret |= chandef_downgrade(chandef); } - if (chandef->width != vht_chandef.width && verbose) + if (chandef->width != vht_chandef.width && !tracking) sdata_info(sdata, "capabilities/regulatory prevented using AP HT/VHT configuration, downgraded\n"); @@ -386,7 +386,7 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, /* calculate new channel (type) based on HT/VHT operation IEs */ flags = ieee80211_determine_chantype(sdata, sband, chan, ht_oper, - vht_oper, &chandef, false); + vht_oper, &chandef, true); /* * Downgrade the new channel if we associated with restricted @@ -3838,7 +3838,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, ht_oper, vht_oper, - &chandef, true); + &chandef, false); sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss), local->rx_chains); -- cgit v1.2.3 From 74418edec915d0f446debebde08d170c7b8ba0ee Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 30 Jul 2013 10:11:25 +0200 Subject: cfg80211: fix P2P GO interface teardown When a P2P GO interface goes down, cfg80211 doesn't properly tear it down, leading to warnings later. Add the GO interface type to the enumeration to tear it down like AP interfaces. Otherwise, we leave it pending and mac80211's state can get very confused, leading to warnings later. Cc: stable@vger.kernel.org Reported-by: Ilan Peer Tested-by: Ilan Peer Reviewed-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/wireless/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 4f9f216665e9..a8c29fa4f1b3 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -765,6 +765,7 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev, cfg80211_leave_mesh(rdev, dev); break; case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, dev); break; default: -- cgit v1.2.3 From ddfe49b42d8ad4bfdf92d63d4a74f162660d878d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 31 Jul 2013 20:52:03 +0200 Subject: mac80211: continue using disabled channels while connected In case the AP has different regulatory information than we do, it can happen that we connect to an AP based on e.g. the world roaming regulatory data, and then update our database with the AP's country information disables the channel the AP is using. If this happens on an HT AP, the bandwidth tracking code will hit the WARN_ON() and disconnect. Since that's not very useful, ignore the channel-disable flag in bandwidth tracking. Cc: stable@vger.kernel.org Reported-by: Chris Wright Tested-by: Chris Wright Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 077a95360830..cc9e02d79b55 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -335,8 +335,17 @@ out: if (ret & IEEE80211_STA_DISABLE_VHT) vht_chandef = *chandef; + /* + * Ignore the DISABLED flag when we're already connected and only + * tracking the APs beacon for bandwidth changes - otherwise we + * might get disconnected here if we connect to an AP, update our + * regulatory information based on the AP's country IE and the + * information we have is wrong/outdated and disables the channel + * that we're actually using for the connection to the AP. + */ while (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef, - IEEE80211_CHAN_DISABLED)) { + tracking ? 0 : + IEEE80211_CHAN_DISABLED)) { if (WARN_ON(chandef->width == NL80211_CHAN_WIDTH_20_NOHT)) { ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; -- cgit v1.2.3 From 71ffe9c77dd7a2b62207953091efa8dafec958dd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 25 Jul 2013 10:37:49 +0200 Subject: netfilter: xt_TCPMSS: fix handling of malformed TCP header and options Make sure the packet has enough room for the TCP header and that it is not malformed. While at it, store tcph->doff*4 in a variable, as it is used several times. This patch also fixes a possible off by one in case of malformed TCP options. Reported-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_TCPMSS.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 7011c71646f0..6113cc7efffc 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -52,7 +52,8 @@ tcpmss_mangle_packet(struct sk_buff *skb, { const struct xt_tcpmss_info *info = par->targinfo; struct tcphdr *tcph; - unsigned int tcplen, i; + int len, tcp_hdrlen; + unsigned int i; __be16 oldval; u16 newmss; u8 *opt; @@ -64,11 +65,14 @@ tcpmss_mangle_packet(struct sk_buff *skb, if (!skb_make_writable(skb, skb->len)) return -1; - tcplen = skb->len - tcphoff; + len = skb->len - tcphoff; + if (len < (int)sizeof(struct tcphdr)) + return -1; + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); + tcp_hdrlen = tcph->doff * 4; - /* Header cannot be larger than the packet */ - if (tcplen < tcph->doff*4) + if (len < tcp_hdrlen) return -1; if (info->mss == XT_TCPMSS_CLAMP_PMTU) { @@ -87,9 +91,8 @@ tcpmss_mangle_packet(struct sk_buff *skb, newmss = info->mss; opt = (u_int8_t *)tcph; - for (i = sizeof(struct tcphdr); i < tcph->doff*4; i += optlen(opt, i)) { - if (opt[i] == TCPOPT_MSS && tcph->doff*4 - i >= TCPOLEN_MSS && - opt[i+1] == TCPOLEN_MSS) { + for (i = sizeof(struct tcphdr); i <= tcp_hdrlen - TCPOLEN_MSS; i += optlen(opt, i)) { + if (opt[i] == TCPOPT_MSS && opt[i+1] == TCPOLEN_MSS) { u_int16_t oldmss; oldmss = (opt[i+2] << 8) | opt[i+3]; @@ -112,9 +115,10 @@ tcpmss_mangle_packet(struct sk_buff *skb, } /* There is data after the header so the option can't be added - without moving it, and doing so may make the SYN packet - itself too large. Accept the packet unmodified instead. */ - if (tcplen > tcph->doff*4) + * without moving it, and doing so may make the SYN packet + * itself too large. Accept the packet unmodified instead. + */ + if (len > tcp_hdrlen) return 0; /* @@ -143,10 +147,10 @@ tcpmss_mangle_packet(struct sk_buff *skb, newmss = min(newmss, (u16)1220); opt = (u_int8_t *)tcph + sizeof(struct tcphdr); - memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr)); + memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr)); inet_proto_csum_replace2(&tcph->check, skb, - htons(tcplen), htons(tcplen + TCPOLEN_MSS), 1); + htons(len), htons(len + TCPOLEN_MSS), 1); opt[0] = TCPOPT_MSS; opt[1] = TCPOLEN_MSS; opt[2] = (newmss & 0xff00) >> 8; -- cgit v1.2.3 From a206bcb3b02025b23137f3228109d72e0f835c05 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 25 Jul 2013 10:46:46 +0200 Subject: netfilter: xt_TCPOPTSTRIP: fix possible off by one access Fix a possible off by one access since optlen() touches opt[offset+1] unsafely when i == tcp_hdrlen(skb) - 1. This patch replaces tcp_hdrlen() by the local variable tcp_hdrlen that stores the TCP header length, to save some cycles. Reported-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_TCPOPTSTRIP.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index b68fa191710f..625fa1d636a0 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -38,7 +38,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, struct tcphdr *tcph; u_int16_t n, o; u_int8_t *opt; - int len; + int len, tcp_hdrlen; /* This is a fragment, no TCP header is available */ if (par->fragoff != 0) @@ -52,7 +52,9 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, return NF_DROP; tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); - if (tcph->doff * 4 > len) + tcp_hdrlen = tcph->doff * 4; + + if (len < tcp_hdrlen) return NF_DROP; opt = (u_int8_t *)tcph; @@ -61,10 +63,10 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, * Walk through all TCP options - if we find some option to remove, * set all octets to %TCPOPT_NOP and adjust checksum. */ - for (i = sizeof(struct tcphdr); i < tcp_hdrlen(skb); i += optl) { + for (i = sizeof(struct tcphdr); i < tcp_hdrlen - 1; i += optl) { optl = optlen(opt, i); - if (i + optl > tcp_hdrlen(skb)) + if (i + optl > tcp_hdrlen) break; if (!tcpoptstrip_test_bit(info->strip_bmap, opt[i])) -- cgit v1.2.3 From e4d091d7bf787cd303383725b8071d0bae76f981 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 1 Aug 2013 12:36:57 +0300 Subject: netfilter: nfnetlink_{log,queue}: fix information leaks in netlink message These structs have a "_pad" member. Also the "phw" structs have an 8 byte "hw_addr[]" array but sometimes only the first 6 bytes are initialized. Signed-off-by: Dan Carpenter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_log.c | 6 +++++- net/netfilter/nfnetlink_queue_core.c | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 962e9792e317..d92cc317bf8b 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -419,6 +419,7 @@ __build_packet_message(struct nfnl_log_net *log, nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(inst->group_num); + memset(&pmsg, 0, sizeof(pmsg)); pmsg.hw_protocol = skb->protocol; pmsg.hook = hooknum; @@ -498,7 +499,10 @@ __build_packet_message(struct nfnl_log_net *log, if (indev && skb->dev && skb->mac_header != skb->network_header) { struct nfulnl_msg_packet_hw phw; - int len = dev_parse_header(skb, phw.hw_addr); + int len; + + memset(&phw, 0, sizeof(phw)); + len = dev_parse_header(skb, phw.hw_addr); if (len > 0) { phw.hw_addrlen = htons(len); if (nla_put(inst->skb, NFULA_HWADDR, sizeof(phw), &phw)) diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 971ea145ab3e..8a703c3dd318 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -463,7 +463,10 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, if (indev && entskb->dev && entskb->mac_header != entskb->network_header) { struct nfqnl_msg_packet_hw phw; - int len = dev_parse_header(entskb, phw.hw_addr); + int len; + + memset(&phw, 0, sizeof(phw)); + len = dev_parse_header(entskb, phw.hw_addr); if (len) { phw.hw_addrlen = htons(len); if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw)) -- cgit v1.2.3 From d9af2d67e490b48f0d36f448d34e7bab9425f142 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Mon, 5 Aug 2013 16:47:38 +0200 Subject: net/vmw_vsock/af_vsock.c: drop unneeded semicolon Drop the semicolon at the end of the list_for_each_entry loop header. Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 593071dabd1c..4d9334683f84 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -347,7 +347,7 @@ void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)) for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) { struct vsock_sock *vsk; list_for_each_entry(vsk, &vsock_connected_table[i], - connected_table); + connected_table) fn(sk_vsock(vsk)); } -- cgit v1.2.3 From 0369722f024cd374f74eac6d261014403aa27ea2 Mon Sep 17 00:00:00 2001 From: "nikolay@redhat.com" Date: Sat, 3 Aug 2013 22:07:46 +0200 Subject: vlan: make vlan_dev_real_dev work over stacked vlans Sometimes we might have stacked vlans on top of each other, and we're interested in the first non-vlan real device on the path, so transform vlan_dev_real_dev to go over the stacked vlans and extract the first non-vlan device. Signed-off-by: Nikolay Aleksandrov Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/8021q/vlan_core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 4a78c4de9f20..6ee48aac776f 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -91,7 +91,12 @@ EXPORT_SYMBOL(__vlan_find_dev_deep); struct net_device *vlan_dev_real_dev(const struct net_device *dev) { - return vlan_dev_priv(dev)->real_dev; + struct net_device *ret = vlan_dev_priv(dev)->real_dev; + + while (is_vlan_dev(ret)) + ret = vlan_dev_priv(ret)->real_dev; + + return ret; } EXPORT_SYMBOL(vlan_dev_real_dev); -- cgit v1.2.3 From 07ce76aa9bcf8bc106a53c67548c5602f1598595 Mon Sep 17 00:00:00 2001 From: "nikolay@redhat.com" Date: Sat, 3 Aug 2013 22:07:47 +0200 Subject: net_sched: make dev_trans_start return vlan's real dev trans_start Vlan devices are LLTX and don't update their own trans_start, so if dev_trans_start has to be called with a vlan device then 0 or a stale value will be returned. Currently the bonding is the only such user, and it's needed for proper arp monitoring when the slaves are vlans. Fix this by extracting the vlan's real device trans_start. Suggested-by: David Miller Signed-off-by: Nikolay Aleksandrov Acked-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 4626cef4b76e..eeb8276d7a89 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -207,15 +208,19 @@ void __qdisc_run(struct Qdisc *q) unsigned long dev_trans_start(struct net_device *dev) { - unsigned long val, res = dev->trans_start; + unsigned long val, res; unsigned int i; + if (is_vlan_dev(dev)) + dev = vlan_dev_real_dev(dev); + res = dev->trans_start; for (i = 0; i < dev->num_tx_queues; i++) { val = netdev_get_tx_queue(dev, i)->trans_start; if (val && time_after(val, res)) res = val; } dev->trans_start = res; + return res; } EXPORT_SYMBOL(dev_trans_start); -- cgit v1.2.3 From 7921895a5e852fc99de347bc0600659997de9298 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 5 Aug 2013 12:49:35 +0200 Subject: net: esp{4,6}: fix potential MTU calculation overflows Commit 91657eafb ("xfrm: take net hdr len into account for esp payload size calculation") introduced a possible interger overflow in esp{4,6}_get_mtu() handlers in case of x->props.mode equals XFRM_MODE_TUNNEL. Thus, the following expression will overflow unsigned int net_adj; ... net_adj = 0; ... return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - net_adj) & ~(align - 1)) + (net_adj - 2); where (net_adj - 2) would be evaluated as + (0 - 2) in an unsigned context. Fix it by simply removing brackets as those operations here do not need to have special precedence. Signed-off-by: Daniel Borkmann Cc: Benjamin Poirier Cc: Steffen Klassert Acked-by: Benjamin Poirier Signed-off-by: David S. Miller --- net/ipv4/esp4.c | 2 +- net/ipv6/esp6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index ab3d814bc80a..109ee89f123e 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -477,7 +477,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) } return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - - net_adj) & ~(align - 1)) + (net_adj - 2); + net_adj) & ~(align - 1)) + net_adj - 2; } static void esp4_err(struct sk_buff *skb, u32 info) diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 40ffd72243a4..aeac0dc3635d 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -425,7 +425,7 @@ static u32 esp6_get_mtu(struct xfrm_state *x, int mtu) net_adj = 0; return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - - net_adj) & ~(align - 1)) + (net_adj - 2); + net_adj) & ~(align - 1)) + net_adj - 2; } static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, -- cgit v1.2.3 From fc7f8f5c53fdb82d4689e24df3da1a88bc3859f7 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Fri, 2 Aug 2013 19:07:38 +0200 Subject: neighbour: populate neigh_parms on alloc before calling ndo_neigh_setup dev->ndo_neigh_setup() might need some of the values of neigh_parms, so populate them before calling it. Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/neighbour.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 9232c68941ab..60533db8b72d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1441,16 +1441,18 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, atomic_set(&p->refcnt, 1); p->reachable_time = neigh_rand_reach_time(p->base_reachable_time); + dev_hold(dev); + p->dev = dev; + write_pnet(&p->net, hold_net(net)); + p->sysctl_table = NULL; if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { + release_net(net); + dev_put(dev); kfree(p); return NULL; } - dev_hold(dev); - p->dev = dev; - write_pnet(&p->net, hold_net(net)); - p->sysctl_table = NULL; write_lock_bh(&tbl->lock); p->next = tbl->parms.next; tbl->parms.next = p; -- cgit v1.2.3 From aab515d7c32a34300312416c50314e755ea6f765 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Aug 2013 11:18:49 -0700 Subject: fib_trie: remove potential out of bound access AddressSanitizer [1] dynamic checker pointed a potential out of bound access in leaf_walk_rcu() We could allocate one more slot in tnode_new() to leave the prefetch() in-place but it looks not worth the pain. Bug added in commit 82cfbb008572b ("[IPV4] fib_trie: iterator recode") [1] : https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerForKernel Reported-by: Andrey Konovalov Signed-off-by: Eric Dumazet Cc: Dmitry Vyukov Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 108a1e9c9eac..3df6d3edb2a1 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -71,7 +71,6 @@ #include #include #include -#include #include #include #include @@ -1761,10 +1760,8 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) if (!c) continue; - if (IS_LEAF(c)) { - prefetch(rcu_dereference_rtnl(p->child[idx])); + if (IS_LEAF(c)) return (struct leaf *) c; - } /* Rescan start scanning in new node */ p = (struct tnode *) c; -- cgit v1.2.3 From 248ba8ec05a2c3b118c2224e57eb10c128176ab1 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 6 Aug 2013 00:32:05 +0200 Subject: bridge: don't try to update timers in case of broken MLD queries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we are reading an uninitialized value for the max_delay variable when snooping an MLD query message of invalid length and would update our timers with that. Fixing this by simply ignoring such broken MLD queries (just like we do for IGMP already). This is a regression introduced by: "bridge: disable snooping if there is no querier" (b00589af3b04) Reported-by: Paul Bolle Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 61c5e819380e..08e576ada0b2 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1195,7 +1195,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, max_delay = msecs_to_jiffies(ntohs(mld->mld_maxdelay)); if (max_delay) group = &mld->mld_mca; - } else if (skb->len >= sizeof(*mld2q)) { + } else { if (!pskb_may_pull(skb, sizeof(*mld2q))) { err = -EINVAL; goto out; -- cgit v1.2.3 From 2ed0edf9090bf4afa2c6fc4f38575a85a80d4b20 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Aug 2013 17:10:15 -0700 Subject: tcp: cubic: fix overflow error in bictcp_update() commit 17a6e9f1aa9 ("tcp_cubic: fix clock dependency") added an overflow error in bictcp_update() in following code : /* change the unit from HZ to bictcp_HZ */ t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) - ca->epoch_start) << BICTCP_HZ) / HZ; Because msecs_to_jiffies() being unsigned long, compiler does implicit type promotion. We really want to constrain (tcp_time_stamp - ca->epoch_start) to a signed 32bit value, or else 't' has unexpected high values. This bugs triggers an increase of retransmit rates ~24 days after boot [1], as the high order bit of tcp_time_stamp flips. [1] for hosts with HZ=1000 Big thanks to Van Jacobson for spotting this problem. Diagnosed-by: Van Jacobson Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Stephen Hemminger Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index a9077f441cb2..b6b591f0a788 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -206,8 +206,8 @@ static u32 cubic_root(u64 a) */ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) { - u64 offs; - u32 delta, t, bic_target, max_cnt; + u32 delta, bic_target, max_cnt; + u64 offs, t; ca->ack_cnt++; /* count the number of ACKs */ @@ -250,9 +250,11 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) * if the cwnd < 1 million packets !!! */ + t = (s32)(tcp_time_stamp - ca->epoch_start); + t += msecs_to_jiffies(ca->delay_min >> 3); /* change the unit from HZ to bictcp_HZ */ - t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) - - ca->epoch_start) << BICTCP_HZ) / HZ; + t <<= BICTCP_HZ; + do_div(t, HZ); if (t < ca->bic_K) /* t - K */ offs = ca->bic_K - t; -- cgit v1.2.3 From 15401946f9b720efdd20bda3ae79725e9c586897 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Tue, 6 Aug 2013 08:44:46 +0800 Subject: bridge: correct the comment for file br_sysfs_br.c br_sysfs_if.c is for sysfs attributes of bridge ports, while br_sysfs_br.c is for sysfs attributes of bridge itself. Correct the comment here. Signed-off-by: Wang Sheng-Hui Signed-off-by: David S. Miller --- net/bridge/br_sysfs_br.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 394bb96b6087..3b9637fb7939 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -1,5 +1,5 @@ /* - * Sysfs attributes of bridge ports + * Sysfs attributes of bridge * Linux ethernet bridge * * Authors: -- cgit v1.2.3 From cd6b423afd3c08b27e1fed52db828ade0addbc6b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Aug 2013 20:05:12 -0700 Subject: tcp: cubic: fix bug in bictcp_acked() While investigating about strange increase of retransmit rates on hosts ~24 days after boot, Van found hystart was disabled if ca->epoch_start was 0, as following condition is true when tcp_time_stamp high order bit is set. (s32)(tcp_time_stamp - ca->epoch_start) < HZ Quoting Van : At initialization & after every loss ca->epoch_start is set to zero so I believe that the above line will turn off hystart as soon as the 2^31 bit is set in tcp_time_stamp & hystart will stay off for 24 days. I think we've observed that cubic's restart is too aggressive without hystart so this might account for the higher drop rate we observe. Diagnosed-by: Van Jacobson Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index b6b591f0a788..b6ae92a51f58 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -416,7 +416,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) return; /* Discard delay samples right after fast recovery */ - if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) + if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ) return; delay = (rtt_us << 3) / USEC_PER_MSEC; -- cgit v1.2.3 From 3e3be275851bc6fc90bfdcd732cd95563acd982b Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 7 Aug 2013 02:34:31 +0200 Subject: ipv6: don't stop backtracking in fib6_lookup_1 if subtree does not match In case a subtree did not match we currently stop backtracking and return NULL (root table from fib_lookup). This could yield in invalid routing table lookups when using subtrees. Instead continue to backtrack until a valid subtree or node is found and return this match. Also remove unneeded NULL check. Reported-by: Teco Boot Cc: YOSHIFUJI Hideaki Cc: David Lamparter Cc: Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index bff3d821c7eb..c4ff5bbb45c4 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -993,14 +993,22 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root, if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES - if (fn->subtree) - fn = fib6_lookup_1(fn->subtree, args + 1); + if (fn->subtree) { + struct fib6_node *sfn; + sfn = fib6_lookup_1(fn->subtree, + args + 1); + if (!sfn) + goto backtrack; + fn = sfn; + } #endif - if (!fn || fn->fn_flags & RTN_RTINFO) + if (fn->fn_flags & RTN_RTINFO) return fn; } } - +#ifdef CONFIG_IPV6_SUBTREES +backtrack: +#endif if (fn->fn_flags & RTN_ROOT) break; -- cgit v1.2.3 From 77a482bdb2e68d13fae87541b341905ba70d572b Mon Sep 17 00:00:00 2001 From: Timo Teräs Date: Tue, 6 Aug 2013 13:45:43 +0300 Subject: ip_gre: fix ipgre_header to return correct offset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix ipgre_header() (header_ops->create) to return the correct amount of bytes pushed. Most callers of dev_hard_header() seem to care only if it was success, but af_packet.c uses it as offset to the skb to copy from userspace only once. In practice this fixes packet socket sendto()/sendmsg() to gre tunnels. Regression introduced in c54419321455631079c7d6e60bc732dd0c5914c5 ("GRE: Refactor GRE tunneling code.") Cc: Pravin B Shelar Signed-off-by: Timo Teräs Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 1f6eab66f7ce..8d6939eeb492 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -383,7 +383,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, if (daddr) memcpy(&iph->daddr, daddr, 4); if (iph->daddr) - return t->hlen; + return t->hlen + sizeof(*iph); return -(t->hlen + sizeof(*iph)); } -- cgit v1.2.3 From e11aada32b39a060e26fa4091cb968bd42e3bcbf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 6 Aug 2013 04:35:06 -0700 Subject: net: flow_dissector: add 802.1ad support Same behavior than 802.1q : finds the encapsulated protocol and skip 32bit header. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 00ee068efc1c..b84a1b155bc1 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -65,6 +65,7 @@ ipv6: nhoff += sizeof(struct ipv6hdr); break; } + case __constant_htons(ETH_P_8021AD): case __constant_htons(ETH_P_8021Q): { const struct vlan_hdr *vlan; struct vlan_hdr _vlan; -- cgit v1.2.3 From 288a9376371d425edeeea41a0310922c5fb2092d Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Wed, 7 Aug 2013 11:33:25 +0300 Subject: net: rename busy poll MIB counter Rename mib counter from "low latency" to "busy poll" v1 also moved the counter to the ip MIB (suggested by Shawn Bohrer) Eric Dumazet suggested that the current location is better. So v2 just renames the counter to fit the new naming convention. Signed-off-by: Eliezer Tamir Signed-off-by: David S. Miller --- include/net/busy_poll.h | 2 +- include/uapi/linux/snmp.h | 2 +- net/ipv4/proc.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 8e2dfc106aed..8a358a2c97e6 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -122,7 +122,7 @@ static inline bool sk_busy_loop(struct sock *sk, int nonblock) if (rc > 0) /* local bh are disabled so it is ok to use _BH */ NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_LOWLATENCYRXPACKETS, rc); + LINUX_MIB_BUSYPOLLRXPACKETS, rc); } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && !need_resched() && !busy_loop_timeout(end_time)); diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index af0a674cc677..a1356d3b54df 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -253,7 +253,7 @@ enum LINUX_MIB_TCPFASTOPENLISTENOVERFLOW, /* TCPFastOpenListenOverflow */ LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */ - LINUX_MIB_LOWLATENCYRXPACKETS, /* LowLatencyRxPackets */ + LINUX_MIB_BUSYPOLLRXPACKETS, /* BusyPollRxPackets */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 6577a1149a47..463bd1273346 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -273,7 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), - SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS), + SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), SNMP_MIB_SENTINEL }; -- cgit v1.2.3 From 645359930231d5e78fd3296a38b98c1a658a7ade Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Thu, 8 Aug 2013 15:19:48 -0700 Subject: rtnetlink: Fix inverted check in ndo_dflt_fdb_del() Fix inverted check when deleting an fdb entry. Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3de740834d1f..82d968527121 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2156,7 +2156,7 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm, /* If aging addresses are supported device will need to * implement its own handler for this. */ - if (ndm->ndm_state & NUD_PERMANENT) { + if (!(ndm->ndm_state & NUD_PERMANENT)) { pr_info("%s: FDB only supports static addresses\n", dev->name); return -EINVAL; } -- cgit v1.2.3 From 356d7d88e088687b6578ca64601b0a2c9d145296 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 9 Aug 2013 17:21:27 -0700 Subject: netfilter: nf_conntrack: fix tcp_in_window for Fast Open Currently the conntrack checks if the ending sequence of a packet falls within the observed receive window. However it does so even if it has not observe any packet from the remote yet and uses an uninitialized receive window (td_maxwin). If a connection uses Fast Open to send a SYN-data packet which is dropped afterward in the network. The subsequent SYNs retransmits will all fail this check and be discarded, leading to a connection timeout. This is because the SYN retransmit does not contain data payload so end == initial sequence number (isn) + 1 sender->td_end == isn + syn_data_len receiver->td_maxwin == 0 The fix is to only apply this check after td_maxwin is initialized. Reported-by: Michael Chan Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 7dcc376eea5f..2f8010707d01 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -526,7 +526,7 @@ static bool tcp_in_window(const struct nf_conn *ct, const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; __u32 seq, ack, sack, end, win, swin; s16 receiver_offset; - bool res; + bool res, in_recv_win; /* * Get the required data from the packet. @@ -649,14 +649,18 @@ static bool tcp_in_window(const struct nf_conn *ct, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); + /* Is the ending sequence in the receive window (if available)? */ + in_recv_win = !receiver->td_maxwin || + after(end, sender->td_end - receiver->td_maxwin - 1); + pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n", before(seq, sender->td_maxend + 1), - after(end, sender->td_end - receiver->td_maxwin - 1), + (in_recv_win ? 1 : 0), before(sack, receiver->td_end + 1), after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)); if (before(seq, sender->td_maxend + 1) && - after(end, sender->td_end - receiver->td_maxwin - 1) && + in_recv_win && before(sack, receiver->td_end + 1) && after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) { /* @@ -725,7 +729,7 @@ static bool tcp_in_window(const struct nf_conn *ct, nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: %s ", before(seq, sender->td_maxend + 1) ? - after(end, sender->td_end - receiver->td_maxwin - 1) ? + in_recv_win ? before(sack, receiver->td_end + 1) ? after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG" : "ACK is under the lower bound (possible overly delayed ACK)" -- cgit v1.2.3 From 9d2c9488cedb666bc8206fbdcdc1575e0fbc5929 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 6 Aug 2013 20:21:15 +0200 Subject: batman-adv: fix potential kernel paging errors for unicast transmissions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are several functions which might reallocate skb data. Currently some places keep reusing their old ethhdr pointer regardless of whether they became invalid after such a reallocation or not. This potentially leads to kernel paging errors. This patch fixes these by refetching the ethdr pointer after the potential reallocations. Signed-off-by: Linus Lüssing Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/bridge_loop_avoidance.c | 2 ++ net/batman-adv/gateway_client.c | 13 ++++++++++++- net/batman-adv/gateway_client.h | 3 +-- net/batman-adv/soft-interface.c | 9 ++++++++- net/batman-adv/unicast.c | 13 ++++++++++--- 5 files changed, 33 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index e14531f1ce1c..264de88db320 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1529,6 +1529,8 @@ out: * in these cases, the skb is further handled by this function and * returns 1, otherwise it returns 0 and the caller shall further * process the skb. + * + * This call might reallocate skb data. */ int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index f105219f4a4b..7614af31daff 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -508,6 +508,7 @@ out: return 0; } +/* this call might reallocate skb data */ static bool batadv_is_type_dhcprequest(struct sk_buff *skb, int header_len) { int ret = false; @@ -568,6 +569,7 @@ out: return ret; } +/* this call might reallocate skb data */ bool batadv_gw_is_dhcp_target(struct sk_buff *skb, unsigned int *header_len) { struct ethhdr *ethhdr; @@ -619,6 +621,12 @@ bool batadv_gw_is_dhcp_target(struct sk_buff *skb, unsigned int *header_len) if (!pskb_may_pull(skb, *header_len + sizeof(*udphdr))) return false; + + /* skb->data might have been reallocated by pskb_may_pull() */ + ethhdr = (struct ethhdr *)skb->data; + if (ntohs(ethhdr->h_proto) == ETH_P_8021Q) + ethhdr = (struct ethhdr *)(skb->data + VLAN_HLEN); + udphdr = (struct udphdr *)(skb->data + *header_len); *header_len += sizeof(*udphdr); @@ -634,12 +642,14 @@ bool batadv_gw_is_dhcp_target(struct sk_buff *skb, unsigned int *header_len) return true; } +/* this call might reallocate skb data */ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, - struct sk_buff *skb, struct ethhdr *ethhdr) + struct sk_buff *skb) { struct batadv_neigh_node *neigh_curr = NULL, *neigh_old = NULL; struct batadv_orig_node *orig_dst_node = NULL; struct batadv_gw_node *curr_gw = NULL; + struct ethhdr *ethhdr; bool ret, out_of_range = false; unsigned int header_len = 0; uint8_t curr_tq_avg; @@ -648,6 +658,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, if (!ret) goto out; + ethhdr = (struct ethhdr *)skb->data; orig_dst_node = batadv_transtable_search(bat_priv, ethhdr->h_source, ethhdr->h_dest); if (!orig_dst_node) diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 039902dca4a6..1037d75da51f 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -34,7 +34,6 @@ void batadv_gw_node_delete(struct batadv_priv *bat_priv, void batadv_gw_node_purge(struct batadv_priv *bat_priv); int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset); bool batadv_gw_is_dhcp_target(struct sk_buff *skb, unsigned int *header_len); -bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, - struct sk_buff *skb, struct ethhdr *ethhdr); +bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb); #endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */ diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 700d0b49742d..0f04e1c302b4 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -180,6 +180,9 @@ static int batadv_interface_tx(struct sk_buff *skb, if (batadv_bla_tx(bat_priv, skb, vid)) goto dropped; + /* skb->data might have been reallocated by batadv_bla_tx() */ + ethhdr = (struct ethhdr *)skb->data; + /* Register the client MAC in the transtable */ if (!is_multicast_ether_addr(ethhdr->h_source)) batadv_tt_local_add(soft_iface, ethhdr->h_source, skb->skb_iif); @@ -220,6 +223,10 @@ static int batadv_interface_tx(struct sk_buff *skb, default: break; } + + /* reminder: ethhdr might have become unusable from here on + * (batadv_gw_is_dhcp_target() might have reallocated skb data) + */ } /* ethernet packet should be broadcasted */ @@ -266,7 +273,7 @@ static int batadv_interface_tx(struct sk_buff *skb, /* unicast packet */ } else { if (atomic_read(&bat_priv->gw_mode) != BATADV_GW_MODE_OFF) { - ret = batadv_gw_out_of_range(bat_priv, skb, ethhdr); + ret = batadv_gw_out_of_range(bat_priv, skb); if (ret) goto dropped; } diff --git a/net/batman-adv/unicast.c b/net/batman-adv/unicast.c index dc8b5d4dd636..688a0419756b 100644 --- a/net/batman-adv/unicast.c +++ b/net/batman-adv/unicast.c @@ -326,7 +326,9 @@ static bool batadv_unicast_push_and_fill_skb(struct sk_buff *skb, int hdr_size, * @skb: the skb containing the payload to encapsulate * @orig_node: the destination node * - * Returns false if the payload could not be encapsulated or true otherwise + * Returns false if the payload could not be encapsulated or true otherwise. + * + * This call might reallocate skb data. */ static bool batadv_unicast_prepare_skb(struct sk_buff *skb, struct batadv_orig_node *orig_node) @@ -343,7 +345,9 @@ static bool batadv_unicast_prepare_skb(struct sk_buff *skb, * @orig_node: the destination node * @packet_subtype: the batman 4addr packet subtype to use * - * Returns false if the payload could not be encapsulated or true otherwise + * Returns false if the payload could not be encapsulated or true otherwise. + * + * This call might reallocate skb data. */ bool batadv_unicast_4addr_prepare_skb(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -401,7 +405,7 @@ int batadv_unicast_generic_send_skb(struct batadv_priv *bat_priv, struct batadv_neigh_node *neigh_node; int data_len = skb->len; int ret = NET_RX_DROP; - unsigned int dev_mtu; + unsigned int dev_mtu, header_len; /* get routing information */ if (is_multicast_ether_addr(ethhdr->h_dest)) { @@ -429,10 +433,12 @@ find_router: switch (packet_type) { case BATADV_UNICAST: batadv_unicast_prepare_skb(skb, orig_node); + header_len = sizeof(struct batadv_unicast_packet); break; case BATADV_UNICAST_4ADDR: batadv_unicast_4addr_prepare_skb(bat_priv, skb, orig_node, packet_subtype); + header_len = sizeof(struct batadv_unicast_4addr_packet); break; default: /* this function supports UNICAST and UNICAST_4ADDR only. It @@ -441,6 +447,7 @@ find_router: goto out; } + ethhdr = (struct ethhdr *)(skb->data + header_len); unicast_packet = (struct batadv_unicast_packet *)skb->data; /* inform the destination node that we are still missing a correct route -- cgit v1.2.3 From d4cca39d90fca21c04315095de5d0e734e839a8b Mon Sep 17 00:00:00 2001 From: dingtianhong Date: Fri, 9 Aug 2013 17:12:58 +0800 Subject: tipc: avoid possible deadlock while enable and disable bearer We met lockdep warning when enable and disable the bearer for commands such as: tipc-config -netid=1234 -addr=1.1.3 -be=eth:eth0 tipc-config -netid=1234 -addr=1.1.3 -bd=eth:eth0 --------------------------------------------------- [ 327.693595] ====================================================== [ 327.693994] [ INFO: possible circular locking dependency detected ] [ 327.694519] 3.11.0-rc3-wwd-default #4 Tainted: G O [ 327.694882] ------------------------------------------------------- [ 327.695385] tipc-config/5825 is trying to acquire lock: [ 327.695754] (((timer))#2){+.-...}, at: [] del_timer_sync+0x0/0xd0 [ 327.696018] [ 327.696018] but task is already holding lock: [ 327.696018] (&(&b_ptr->lock)->rlock){+.-...}, at: [] bearer_disable+ 0xdd/0x120 [tipc] [ 327.696018] [ 327.696018] which lock already depends on the new lock. [ 327.696018] [ 327.696018] [ 327.696018] the existing dependency chain (in reverse order) is: [ 327.696018] [ 327.696018] -> #1 (&(&b_ptr->lock)->rlock){+.-...}: [ 327.696018] [] validate_chain+0x6dd/0x870 [ 327.696018] [] __lock_acquire+0x3db/0x670 [ 327.696018] [] lock_acquire+0x103/0x130 [ 327.696018] [] _raw_spin_lock_bh+0x41/0x80 [ 327.696018] [] disc_timeout+0x18/0xd0 [tipc] [ 327.696018] [] call_timer_fn+0xda/0x1e0 [ 327.696018] [] run_timer_softirq+0x2a7/0x2d0 [ 327.696018] [] __do_softirq+0x16a/0x2e0 [ 327.696018] [] irq_exit+0xd5/0xe0 [ 327.696018] [] smp_apic_timer_interrupt+0x45/0x60 [ 327.696018] [] apic_timer_interrupt+0x6f/0x80 [ 327.696018] [] arch_cpu_idle+0x1e/0x30 [ 327.696018] [] cpu_idle_loop+0x1fd/0x280 [ 327.696018] [] cpu_startup_entry+0x1e/0x20 [ 327.696018] [] start_secondary+0x89/0x90 [ 327.696018] [ 327.696018] -> #0 (((timer))#2){+.-...}: [ 327.696018] [] check_prev_add+0x43e/0x4b0 [ 327.696018] [] validate_chain+0x6dd/0x870 [ 327.696018] [] __lock_acquire+0x3db/0x670 [ 327.696018] [] lock_acquire+0x103/0x130 [ 327.696018] [] del_timer_sync+0x3d/0xd0 [ 327.696018] [] tipc_disc_delete+0x15/0x30 [tipc] [ 327.696018] [] bearer_disable+0xef/0x120 [tipc] [ 327.696018] [] tipc_disable_bearer+0x2f/0x60 [tipc] [ 327.696018] [] tipc_cfg_do_cmd+0x2e2/0x550 [tipc] [ 327.696018] [] handle_cmd+0x49/0xe0 [tipc] [ 327.696018] [] genl_family_rcv_msg+0x268/0x340 [ 327.696018] [] genl_rcv_msg+0x70/0xd0 [ 327.696018] [] netlink_rcv_skb+0x89/0xb0 [ 327.696018] [] genl_rcv+0x27/0x40 [ 327.696018] [] netlink_unicast+0x15e/0x1b0 [ 327.696018] [] netlink_sendmsg+0x22f/0x400 [ 327.696018] [] __sock_sendmsg+0x66/0x80 [ 327.696018] [] sock_aio_write+0x107/0x120 [ 327.696018] [] do_sync_write+0x7d/0xc0 [ 327.696018] [] vfs_write+0x186/0x190 [ 327.696018] [] SyS_write+0x60/0xb0 [ 327.696018] [] system_call_fastpath+0x16/0x1b [ 327.696018] [ 327.696018] other info that might help us debug this: [ 327.696018] [ 327.696018] Possible unsafe locking scenario: [ 327.696018] [ 327.696018] CPU0 CPU1 [ 327.696018] ---- ---- [ 327.696018] lock(&(&b_ptr->lock)->rlock); [ 327.696018] lock(((timer))#2); [ 327.696018] lock(&(&b_ptr->lock)->rlock); [ 327.696018] lock(((timer))#2); [ 327.696018] [ 327.696018] *** DEADLOCK *** [ 327.696018] [ 327.696018] 5 locks held by tipc-config/5825: [ 327.696018] #0: (cb_lock){++++++}, at: [] genl_rcv+0x18/0x40 [ 327.696018] #1: (genl_mutex){+.+.+.}, at: [] genl_rcv_msg+0xa6/0xd0 [ 327.696018] #2: (config_mutex){+.+.+.}, at: [] tipc_cfg_do_cmd+0x39/ 0x550 [tipc] [ 327.696018] #3: (tipc_net_lock){++.-..}, at: [] tipc_disable_bearer+ 0x18/0x60 [tipc] [ 327.696018] #4: (&(&b_ptr->lock)->rlock){+.-...}, at: [] bearer_disable+0xdd/0x120 [tipc] [ 327.696018] [ 327.696018] stack backtrace: [ 327.696018] CPU: 2 PID: 5825 Comm: tipc-config Tainted: G O 3.11.0-rc3-wwd- default #4 [ 327.696018] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007 [ 327.696018] 00000000ffffffff ffff880037fa77a8 ffffffff814d03dd 0000000000000000 [ 327.696018] ffff880037fa7808 ffff880037fa77e8 ffffffff810b1c4f 0000000037fa77e8 [ 327.696018] ffff880037fa7808 ffff880037e4db40 0000000000000000 ffff880037e4e318 [ 327.696018] Call Trace: [ 327.696018] [] dump_stack+0x4d/0xa0 [ 327.696018] [] print_circular_bug+0x10f/0x120 [ 327.696018] [] check_prev_add+0x43e/0x4b0 [ 327.696018] [] validate_chain+0x6dd/0x870 [ 327.696018] [] ? sched_clock_cpu+0xd8/0x110 [ 327.696018] [] __lock_acquire+0x3db/0x670 [ 327.696018] [] lock_acquire+0x103/0x130 [ 327.696018] [] ? try_to_del_timer_sync+0x70/0x70 [ 327.696018] [] del_timer_sync+0x3d/0xd0 [ 327.696018] [] ? try_to_del_timer_sync+0x70/0x70 [ 327.696018] [] tipc_disc_delete+0x15/0x30 [tipc] [ 327.696018] [] bearer_disable+0xef/0x120 [tipc] [ 327.696018] [] tipc_disable_bearer+0x2f/0x60 [tipc] [ 327.696018] [] tipc_cfg_do_cmd+0x2e2/0x550 [tipc] [ 327.696018] [] ? security_capable+0x13/0x20 [ 327.696018] [] handle_cmd+0x49/0xe0 [tipc] [ 327.696018] [] genl_family_rcv_msg+0x268/0x340 [ 327.696018] [] genl_rcv_msg+0x70/0xd0 [ 327.696018] [] ? genl_lock+0x20/0x20 [ 327.696018] [] netlink_rcv_skb+0x89/0xb0 [ 327.696018] [] ? genl_rcv+0x18/0x40 [ 327.696018] [] genl_rcv+0x27/0x40 [ 327.696018] [] netlink_unicast+0x15e/0x1b0 [ 327.696018] [] ? memcpy_fromiovec+0x6c/0x90 [ 327.696018] [] netlink_sendmsg+0x22f/0x400 [ 327.696018] [] __sock_sendmsg+0x66/0x80 [ 327.696018] [] sock_aio_write+0x107/0x120 [ 327.696018] [] ? release_sock+0x8c/0xa0 [ 327.696018] [] do_sync_write+0x7d/0xc0 [ 327.696018] [] ? rw_verify_area+0x54/0x100 [ 327.696018] [] vfs_write+0x186/0x190 [ 327.696018] [] SyS_write+0x60/0xb0 [ 327.696018] [] system_call_fastpath+0x16/0x1b ----------------------------------------------------------------------- The problem is that the tipc_link_delete() will cancel the timer disc_timeout() when the b_ptr->lock is hold, but the disc_timeout() still call b_ptr->lock to finish the work, so the dead lock occurs. We should unlock the b_ptr->lock when del the disc_timeout(). Remove link_timeout() still met the same problem, the patch: http://article.gmane.org/gmane.network.tipc.general/4380 fix the problem, so no need to send patch for fix link_timeout() deadlock warming. Signed-off-by: Wang Weidong Signed-off-by: Ding Tianhong Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bearer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index cb29ef7ba2f0..609c30c80816 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -460,6 +460,7 @@ static void bearer_disable(struct tipc_bearer *b_ptr) { struct tipc_link *l_ptr; struct tipc_link *temp_l_ptr; + struct tipc_link_req *temp_req; pr_info("Disabling bearer <%s>\n", b_ptr->name); spin_lock_bh(&b_ptr->lock); @@ -468,9 +469,13 @@ static void bearer_disable(struct tipc_bearer *b_ptr) list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) { tipc_link_delete(l_ptr); } - if (b_ptr->link_req) - tipc_disc_delete(b_ptr->link_req); + temp_req = b_ptr->link_req; + b_ptr->link_req = NULL; spin_unlock_bh(&b_ptr->lock); + + if (temp_req) + tipc_disc_delete(temp_req); + memset(b_ptr, 0, sizeof(struct tipc_bearer)); } -- cgit v1.2.3 From ac4f9599362475662efb6efbb334cbcec98d4778 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Aug 2013 15:09:08 +0200 Subject: net: sctp: sctp_assoc_control_transport: fix MTU size in SCTP_PF state The SCTP Quick failover draft [1] section 5.1, point 5 says that the cwnd should be 1 MTU. So, instead of 1, set it to 1 MTU. [1] https://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05 Reported-by: Karl Heiss Signed-off-by: Daniel Borkmann Cc: Neil Horman Acked-by: Vlad Yasevich Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/associola.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index bce5b79662a6..ab67efc64b24 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -846,12 +846,12 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, else spc_state = SCTP_ADDR_AVAILABLE; /* Don't inform ULP about transition from PF to - * active state and set cwnd to 1, see SCTP + * active state and set cwnd to 1 MTU, see SCTP * Quick failover draft section 5.1, point 5 */ if (transport->state == SCTP_PF) { ulp_notify = false; - transport->cwnd = 1; + transport->cwnd = asoc->pathmtu; } transport->state = SCTP_ACTIVE; break; -- cgit v1.2.3 From 771085d6bf3c52de29fc213e5bad07a82e57c23e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Aug 2013 16:25:21 +0200 Subject: net: sctp: sctp_transport_destroy{, _rcu}: fix potential pointer corruption Probably this one is quite unlikely to be triggered, but it's more safe to do the call_rcu() at the end after we have dropped the reference on the asoc and freed sctp packet chunks. The reason why is because in sctp_transport_destroy_rcu() the transport is being kfree()'d, and if we're unlucky enough we could run into corrupted pointers. Probably that's more of theoretical nature, but it's safer to have this simple fix. Introduced by commit 8c98653f ("sctp: sctp_close: fix release of bindings for deferred call_rcu's"). I also did the 8c98653f regression test and it's fine that way. Signed-off-by: Daniel Borkmann Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/transport.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/transport.c b/net/sctp/transport.c index bdbbc3fd7c14..8fdd16046d66 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -181,12 +181,12 @@ static void sctp_transport_destroy(struct sctp_transport *transport) return; } - call_rcu(&transport->rcu, sctp_transport_destroy_rcu); - sctp_packet_free(&transport->packet); if (transport->asoc) sctp_association_put(transport->asoc); + + call_rcu(&transport->rcu, sctp_transport_destroy_rcu); } /* Start T3_rtx timer if it is not already running and update the heartbeat -- cgit v1.2.3 From 58ad436fcf49810aa006016107f494c9ac9013db Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 13 Aug 2013 09:04:05 +0200 Subject: genetlink: fix family dump race When dumping generic netlink families, only the first dump call is locked with genl_lock(), which protects the list of families, and thus subsequent calls can access the data without locking, racing against family addition/removal. This can cause a crash. Fix it - the locking needs to be conditional because the first time around it's already locked. A similar bug was reported to me on an old kernel (3.4.47) but the exact scenario that happened there is no longer possible, on those kernels the first round wasn't locked either. Looking at the current code I found the race described above, which had also existed on the old kernel. Cc: stable@vger.kernel.org Reported-by: Andrei Otcheretianski Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 512718adb0d5..f85f8a2ad6cf 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -789,6 +789,10 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) struct net *net = sock_net(skb->sk); int chains_to_skip = cb->args[0]; int fams_to_skip = cb->args[1]; + bool need_locking = chains_to_skip || fams_to_skip; + + if (need_locking) + genl_lock(); for (i = chains_to_skip; i < GENL_FAM_TAB_SIZE; i++) { n = 0; @@ -810,6 +814,9 @@ errout: cb->args[0] = i; cb->args[1] = n; + if (need_locking) + genl_unlock(); + return skb->len; } -- cgit v1.2.3 From 4221f40513233fa8edeef7fc82e44163fde03b9b Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 13 Aug 2013 01:41:06 -0700 Subject: ip_tunnel: Do not use inner ip-header-id for tunnel ip-header-id. Using inner-id for tunnel id is not safe in some rare cases. E.g. packets coming from multiple sources entering same tunnel can have same id. Therefore on tunnel packet receive we could have packets from two different stream but with same source and dst IP with same ip-id which could confuse ip packet reassembly. Following patch reverts optimization from commit 490ab08127 (IP_GRE: Fix IP-Identification.) CC: Jarno Rajahalme CC: Ansis Atteka Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 14 -------------- net/ipv4/ip_tunnel_core.c | 4 +--- 2 files changed, 1 insertion(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 781b3cf86a2f..a354db5b7662 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -145,20 +145,6 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, return INET_ECN_encapsulate(tos, inner); } -static inline void tunnel_ip_select_ident(struct sk_buff *skb, - const struct iphdr *old_iph, - struct dst_entry *dst) -{ - struct iphdr *iph = ip_hdr(skb); - - /* Use inner packet iph-id if possible. */ - if (skb->protocol == htons(ETH_P_IP) && old_iph->id) - iph->id = old_iph->id; - else - __ip_select_ident(iph, dst, - (skb_shinfo(skb)->gso_segs ?: 1) - 1); -} - int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); int iptunnel_xmit(struct net *net, struct rtable *rt, struct sk_buff *skb, diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 7167b08977df..850525b34899 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -76,9 +76,7 @@ int iptunnel_xmit(struct net *net, struct rtable *rt, iph->daddr = dst; iph->saddr = src; iph->ttl = ttl; - tunnel_ip_select_ident(skb, - (const struct iphdr *)skb_inner_network_header(skb), - &rt->dst); + __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); err = ip_local_out(skb); if (unlikely(net_xmit_eval(err))) -- cgit v1.2.3 From 3e805ad288c524bb65aad3f1e004402223d3d504 Mon Sep 17 00:00:00 2001 From: Asbjoern Sloth Toennesen Date: Mon, 12 Aug 2013 16:30:09 +0000 Subject: rtnetlink: rtnl_bridge_getlink: Call nlmsg_find_attr() with ifinfomsg header Fix the iproute2 command `bridge vlan show`, after switching from rtgenmsg to ifinfomsg. Let's start with a little history: Feb 20: Vlad Yasevich got his VLAN-aware bridge patchset included in the 3.9 merge window. In the kernel commit 6cbdceeb, he added attribute support to bridge GETLINK requests sent with rtgenmsg. Mar 6th: Vlad got this iproute2 reference implementation of the bridge vlan netlink interface accepted (iproute2 9eff0e5c) Apr 25th: iproute2 switched from using rtgenmsg to ifinfomsg (63338dca) http://patchwork.ozlabs.org/patch/239602/ http://marc.info/?t=136680900700007 Apr 28th: Linus released 3.9 Apr 30th: Stephen released iproute2 3.9.0 The `bridge vlan show` command haven't been working since the switch to ifinfomsg, or in a released version of iproute2. Since the kernel side only supports rtgenmsg, which iproute2 switched away from just prior to the iproute2 3.9.0 release. I haven't been able to find any documentation, about neither rtgenmsg nor ifinfomsg, and in which situation to use which, but kernel commit 88c5b5ce seams to suggest that ifinfomsg should be used. Fixing this in kernel will break compatibility, but I doubt that anybody have been using it due to this bug in the user space reference implementation, at least not without noticing this bug. That said the functionality is still fully functional in 3.9, when reversing iproute2 commit 63338dca. This could also be fixed in iproute2, but thats an ugly patch that would reintroduce rtgenmsg in iproute2, and from searching in netdev it seams like rtgenmsg usage is discouraged. I'm assuming that the only reason that Vlad implemented the kernel side to use rtgenmsg, was because iproute2 was using it at the time. Signed-off-by: Asbjoern Sloth Toennesen Reviewed-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 82d968527121..ca198c1d1d30 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2384,7 +2384,7 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) struct nlattr *extfilt; u32 filter_mask = 0; - extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct rtgenmsg), + extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg), IFLA_EXT_MASK); if (extfilt) filter_mask = nla_get_u32(extfilt); -- cgit v1.2.3 From 628e341f319f1a64a4639088faba952e4ec8f0a8 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 14 Aug 2013 13:05:23 +0200 Subject: xfrm: make local error reporting more robust In xfrm4 and xfrm6 we need to take care about sockets of the other address family. This could happen because a 6in4 or 4in6 tunnel could get protected by ipsec. Because we don't want to have a run-time dependency on ipv6 when only using ipv4 xfrm we have to embed a pointer to the correct local_error function in xfrm_state_afinet and look it up when returning an error depending on the socket address family. Thanks to vi0ss for the great bug report: v2: a) fix two more unsafe interpretations of skb->sk as ipv6 socket (xfrm6_local_dontfrag and __xfrm6_output) v3: a) add an EXPORT_SYMBOL_GPL(xfrm_local_error) to fix a link error when building ipv6 as a module (thanks to Steffen Klassert) Reported-by: Cc: Steffen Klassert Signed-off-by: Hannes Frederic Sowa Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 6 ++++++ net/ipv4/xfrm4_output.c | 12 ++++++++++-- net/ipv4/xfrm4_state.c | 1 + net/ipv6/xfrm6_output.c | 10 ++++++---- net/ipv6/xfrm6_state.c | 1 + net/xfrm/xfrm_output.c | 13 +++++++++++++ net/xfrm/xfrm_state.c | 7 ++----- 7 files changed, 39 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 94ce082b29dc..e823786e7c66 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -341,10 +341,13 @@ struct xfrm_state_afinfo { struct sk_buff *skb); int (*transport_finish)(struct sk_buff *skb, int async); + void (*local_error)(struct sk_buff *skb, u32 mtu); }; extern int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo); extern int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo); +extern struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family); +extern void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo); extern void xfrm_state_delete_tunnel(struct xfrm_state *x); @@ -1477,6 +1480,7 @@ extern int xfrm_input_resume(struct sk_buff *skb, int nexthdr); extern int xfrm_output_resume(struct sk_buff *skb, int err); extern int xfrm_output(struct sk_buff *skb); extern int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb); +extern void xfrm_local_error(struct sk_buff *skb, int mtu); extern int xfrm4_extract_header(struct sk_buff *skb); extern int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, @@ -1497,6 +1501,7 @@ extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short fam extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family); extern int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler); extern int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler); +extern void xfrm4_local_error(struct sk_buff *skb, u32 mtu); extern int xfrm6_extract_header(struct sk_buff *skb); extern int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi); @@ -1514,6 +1519,7 @@ extern int xfrm6_output(struct sk_buff *skb); extern int xfrm6_output_finish(struct sk_buff *skb); extern int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, u8 **prevhdr); +extern void xfrm6_local_error(struct sk_buff *skb, u32 mtu); #ifdef CONFIG_XFRM extern int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 327a617d594c..7a5491ffa4de 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -33,8 +33,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) mtu = dst_mtu(dst); if (skb->len > mtu) { if (skb->sk) - ip_local_error(skb->sk, EMSGSIZE, ip_hdr(skb)->daddr, - inet_sk(skb->sk)->inet_dport, mtu); + xfrm_local_error(skb, mtu); else icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); @@ -99,3 +98,12 @@ int xfrm4_output(struct sk_buff *skb) x->outer_mode->afinfo->output_finish, !(IPCB(skb)->flags & IPSKB_REROUTED)); } + +void xfrm4_local_error(struct sk_buff *skb, u32 mtu) +{ + struct iphdr *hdr; + + hdr = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb); + ip_local_error(skb->sk, EMSGSIZE, hdr->daddr, + inet_sk(skb->sk)->inet_dport, mtu); +} diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 9258e751baba..0b2a0641526a 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -83,6 +83,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = { .extract_input = xfrm4_extract_input, .extract_output = xfrm4_extract_output, .transport_finish = xfrm4_transport_finish, + .local_error = xfrm4_local_error, }; void __init xfrm4_state_init(void) diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 8755a3079d0f..b64fff30eb06 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -34,8 +34,10 @@ static int xfrm6_local_dontfrag(struct sk_buff *skb) struct sock *sk = skb->sk; if (sk) { - proto = sk->sk_protocol; + if (sk->sk_family != AF_INET6) + return 0; + proto = sk->sk_protocol; if (proto == IPPROTO_UDP || proto == IPPROTO_RAW) return inet6_sk(sk)->dontfrag; } @@ -54,7 +56,7 @@ static void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) ipv6_local_rxpmtu(sk, &fl6, mtu); } -static void xfrm6_local_error(struct sk_buff *skb, u32 mtu) +void xfrm6_local_error(struct sk_buff *skb, u32 mtu) { struct flowi6 fl6; struct sock *sk = skb->sk; @@ -80,7 +82,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) if (xfrm6_local_dontfrag(skb)) xfrm6_local_rxpmtu(skb, mtu); else if (skb->sk) - xfrm6_local_error(skb, mtu); + xfrm_local_error(skb, mtu); else icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ret = -EMSGSIZE; @@ -142,7 +144,7 @@ static int __xfrm6_output(struct sk_buff *skb) xfrm6_local_rxpmtu(skb, mtu); return -EMSGSIZE; } else if (!skb->local_df && skb->len > mtu && skb->sk) { - xfrm6_local_error(skb, mtu); + xfrm_local_error(skb, mtu); return -EMSGSIZE; } diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index d8c70b8efc24..3fc970135fc6 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -183,6 +183,7 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = { .extract_input = xfrm6_extract_input, .extract_output = xfrm6_extract_output, .transport_finish = xfrm6_transport_finish, + .local_error = xfrm6_local_error, }; int __init xfrm6_state_init(void) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index eb4a84288648..6f5fc612b162 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -214,5 +214,18 @@ int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) return inner_mode->afinfo->extract_output(x, skb); } +void xfrm_local_error(struct sk_buff *skb, int mtu) +{ + struct xfrm_state_afinfo *afinfo; + + afinfo = xfrm_state_get_afinfo(skb->sk->sk_family); + if (!afinfo) + return; + + afinfo->local_error(skb, mtu); + xfrm_state_put_afinfo(afinfo); +} + EXPORT_SYMBOL_GPL(xfrm_output); EXPORT_SYMBOL_GPL(xfrm_inner_extract_output); +EXPORT_SYMBOL_GPL(xfrm_local_error); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 78f66fa92449..54c0acd29468 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -39,9 +39,6 @@ static DEFINE_SPINLOCK(xfrm_state_lock); static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024; -static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family); -static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo); - static inline unsigned int xfrm_dst_hash(struct net *net, const xfrm_address_t *daddr, const xfrm_address_t *saddr, @@ -1860,7 +1857,7 @@ int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo) } EXPORT_SYMBOL(xfrm_state_unregister_afinfo); -static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) +struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) { struct xfrm_state_afinfo *afinfo; if (unlikely(family >= NPROTO)) @@ -1872,7 +1869,7 @@ static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) return afinfo; } -static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) +void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) { rcu_read_unlock(); } -- cgit v1.2.3 From 0ea9d5e3e0e03a63b11392f5613378977dae7eca Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Tue, 13 Aug 2013 04:35:58 +0200 Subject: xfrm: introduce helper for safe determination of mtu skb->sk socket can be of AF_INET or AF_INET6 address family. Thus we always have to make sure we a referring to the correct interpretation of skb->sk. We only depend on header defines to query the mtu, so we don't introduce a new dependency to ipv6 by this change. Cc: Steffen Klassert Signed-off-by: Hannes Frederic Sowa Signed-off-by: Steffen Klassert --- include/net/route.h | 8 ++++++++ include/net/xfrm.h | 12 ++++++++++++ net/ipv4/ip_output.c | 8 -------- net/ipv4/xfrm4_output.c | 4 +--- net/ipv6/xfrm6_output.c | 5 ++++- 5 files changed, 25 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/route.h b/include/net/route.h index 2ea40c1b5e00..afdeeb5bec25 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -317,4 +317,12 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst) return hoplimit; } +static inline int ip_skb_dst_mtu(struct sk_buff *skb) +{ + struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL; + + return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ? + skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); +} + #endif /* _ROUTE_H */ diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e823786e7c66..b41d2d10ff0e 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -1723,4 +1724,15 @@ static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m) return ret; } +static inline int xfrm_skb_dst_mtu(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + if (sk && sk->sk_family == AF_INET6) + return ip6_skb_dst_mtu(skb); + else if (sk && sk->sk_family == AF_INET) + return ip_skb_dst_mtu(skb); + return dst_mtu(skb_dst(skb)); +} + #endif /* _NET_XFRM_H */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 4bcabf3ab4ca..9ee17e3d11c3 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -211,14 +211,6 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } -static inline int ip_skb_dst_mtu(struct sk_buff *skb) -{ - struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL; - - return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ? - skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); -} - static int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 7a5491ffa4de..80baf4a3b1b5 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -21,7 +21,6 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; - struct dst_entry *dst; if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) goto out; @@ -29,8 +28,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) goto out; - dst = skb_dst(skb); - mtu = dst_mtu(dst); + mtu = xfrm_skb_dst_mtu(skb); if (skb->len > mtu) { if (skb->sk) xfrm_local_error(skb, mtu); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index b64fff30eb06..3ac5ab264fed 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -138,7 +138,10 @@ static int __xfrm6_output(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct xfrm_state *x = dst->xfrm; - int mtu = ip6_skb_dst_mtu(skb); + int mtu = xfrm_skb_dst_mtu(skb); + + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; if (skb->len > mtu && xfrm6_local_dontfrag(skb)) { xfrm6_local_rxpmtu(skb, mtu); -- cgit v1.2.3 From cb35fba360dfc3496e5d8a47e23ec5ccdfd90925 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 14 Aug 2013 14:50:01 +0300 Subject: nl80211: nl80211hdr_put() doesn't return an ERR_PTR There are a few places which check nl80211hdr_put() for an ERR_PTR but actually it returns NULL on error and never error values. In nl80211_testmode_dump() the return wasn't checked at all so I have added one. Signed-off-by: Dan Carpenter [some whitespace changes] Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3fcba69817e5..5f6e982cdcf4 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2622,8 +2622,8 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0, NL80211_CMD_NEW_KEY); - if (IS_ERR(hdr)) - return PTR_ERR(hdr); + if (!hdr) + return -ENOBUFS; cookie.msg = msg; cookie.idx = key_idx; @@ -6507,6 +6507,9 @@ static int nl80211_testmode_dump(struct sk_buff *skb, NL80211_CMD_TESTMODE); struct nlattr *tmdata; + if (!hdr) + break; + if (nla_put_u32(skb, NL80211_ATTR_WIPHY, phy_idx)) { genlmsg_cancel(skb, hdr); break; @@ -6951,9 +6954,8 @@ static int nl80211_remain_on_channel(struct sk_buff *skb, hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0, NL80211_CMD_REMAIN_ON_CHANNEL); - - if (IS_ERR(hdr)) { - err = PTR_ERR(hdr); + if (!hdr) { + err = -ENOBUFS; goto free_msg; } @@ -7251,9 +7253,8 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0, NL80211_CMD_FRAME); - - if (IS_ERR(hdr)) { - err = PTR_ERR(hdr); + if (!hdr) { + err = -ENOBUFS; goto free_msg; } } @@ -8132,9 +8133,8 @@ static int nl80211_probe_client(struct sk_buff *skb, hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0, NL80211_CMD_PROBE_CLIENT); - - if (IS_ERR(hdr)) { - err = PTR_ERR(hdr); + if (!hdr) { + err = -ENOBUFS; goto free_msg; } -- cgit v1.2.3 From dee8a9732e713480075adbbca8eb220c5b8d1216 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 13 Aug 2013 09:23:57 +0200 Subject: cfg80211: don't request disconnect if not connected Neil Brown reports that with libertas, my recent cfg80211 SME changes in commit ceca7b7121795ef81bd598a240d53a92566 ("cfg80211: separate internal SME implementation") broke libertas suspend because it we now asked it to disconnect while already disconnected. The problematic change is in cfg80211_disconnect() as it previously checked the SME state and now calls the driver disconnect operation unconditionally. Fix this by checking if there's a current_bss indicating a connection, and do nothing if not. Reported-and-tested-by: Neil Brown Signed-off-by: Johannes Berg --- net/wireless/sme.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 81c8a10d743c..20e86a95dc4e 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -976,21 +976,19 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 reason, bool wextev) { struct wireless_dev *wdev = dev->ieee80211_ptr; - int err; + int err = 0; ASSERT_WDEV_LOCK(wdev); kfree(wdev->connect_keys); wdev->connect_keys = NULL; - if (wdev->conn) { + if (wdev->conn) err = cfg80211_sme_disconnect(wdev, reason); - } else if (!rdev->ops->disconnect) { + else if (!rdev->ops->disconnect) cfg80211_mlme_down(rdev, dev); - err = 0; - } else { + else if (wdev->current_bss) err = rdev_disconnect(rdev, dev, reason); - } return err; } -- cgit v1.2.3 From 30444e981ba28e892c439017fbc011d867f02a7d Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 13 May 2013 08:41:06 -0700 Subject: openvswitch: Fix bad merge resolution. git silently included an extra hunk in vport_cmd_set() during automatic merging. This code is unreachable so it does not actually introduce a problem but it is clearly incorrect. Signed-off-by: Jesse Gross --- net/openvswitch/datapath.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f7e3a0d84c40..f2ed7600084e 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2076,9 +2076,6 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) ovs_notify(reply, info, &ovs_dp_vport_multicast_group); return 0; - rtnl_unlock(); - return 0; - exit_free: kfree_skb(reply); exit_unlock: -- cgit v1.2.3 From 42415c90ceaf50c792e29823e359463bc6d4ee05 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 30 Jul 2013 15:44:14 -0700 Subject: openvswitch: Use correct type while allocating flex array. Flex array is used to allocate hash buckets which is type struct hlist_head, but we use `struct hlist_head *` to calculate array size. Since hlist_head is of size pointer it works fine. Following patch use correct type. Signed-off-by: Pravin B Shelar Signed-off-by: Jesse Gross --- net/openvswitch/flow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 5c519b121e1b..1aa84dc58777 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -240,7 +240,7 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets) struct flex_array *buckets; int i, err; - buckets = flex_array_alloc(sizeof(struct hlist_head *), + buckets = flex_array_alloc(sizeof(struct hlist_head), n_buckets, GFP_KERNEL); if (!buckets) return NULL; -- cgit v1.2.3 From 36bf5cc66d60868bcc10aff209defed5a7b71c1d Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 14 Aug 2013 15:50:36 -0700 Subject: openvswitch: Reset tunnel key between input and output. It doesn't make sense to output a tunnel packet using the same parameters that it was received with since that will generally just result in the packet going back to us. As a result, userspace assumes that the tunnel key is cleared when transitioning through the switch. In the majority of cases this doesn't matter since a packet is either going to a tunnel port (in which the key is overwritten with new values) or to a non-tunnel port (in which case the key is ignored). However, it's theoreticaly possible that userspace could rely on the documented behavior, so this corrects it. Signed-off-by: Jesse Gross --- net/openvswitch/actions.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 22c5f399f1cf..ab101f715447 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -535,6 +535,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb) { struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); + OVS_CB(skb)->tun_key = NULL; return do_execute_actions(dp, skb, acts->actions, acts->actions_len, false); } -- cgit v1.2.3 From 8a8e3d84b1719a56f9151909e80ea6ebc5b8e318 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 14 Aug 2013 23:47:11 +0200 Subject: net_sched: restore "linklayer atm" handling commit 56b765b79 ("htb: improved accuracy at high rates") broke the "linklayer atm" handling. tc class add ... htb rate X ceil Y linklayer atm The linklayer setting is implemented by modifying the rate table which is send to the kernel. No direct parameter were transferred to the kernel indicating the linklayer setting. The commit 56b765b79 ("htb: improved accuracy at high rates") removed the use of the rate table system. To keep compatible with older iproute2 utils, this patch detects the linklayer by parsing the rate table. It also supports future versions of iproute2 to send this linklayer parameter to the kernel directly. This is done by using the __reserved field in struct tc_ratespec, to convey the choosen linklayer option, but only using the lower 4 bits of this field. Linklayer detection is limited to speeds below 100Mbit/s, because at high rates the rtab is gets too inaccurate, so bad that several fields contain the same values, this resembling the ATM detect. Fields even start to contain "0" time to send, e.g. at 1000Mbit/s sending a 96 bytes packet cost "0", thus the rtab have been more broken than we first realized. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/sch_generic.h | 9 ++++++++- include/uapi/linux/pkt_sched.h | 10 +++++++++- net/sched/sch_api.c | 41 +++++++++++++++++++++++++++++++++++++++++ net/sched/sch_generic.c | 1 + net/sched/sch_htb.c | 13 +++++++++++++ 5 files changed, 72 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 6eab63363e59..e5ae0c50fa9c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -683,13 +683,19 @@ struct psched_ratecfg { u64 rate_bytes_ps; /* bytes per second */ u32 mult; u16 overhead; + u8 linklayer; u8 shift; }; static inline u64 psched_l2t_ns(const struct psched_ratecfg *r, unsigned int len) { - return ((u64)(len + r->overhead) * r->mult) >> r->shift; + len += r->overhead; + + if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) + return ((u64)(DIV_ROUND_UP(len,48)*53) * r->mult) >> r->shift; + + return ((u64)len * r->mult) >> r->shift; } extern void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf); @@ -700,6 +706,7 @@ static inline void psched_ratecfg_getrate(struct tc_ratespec *res, memset(res, 0, sizeof(*res)); res->rate = r->rate_bytes_ps; res->overhead = r->overhead; + res->linklayer = (r->linklayer & TC_LINKLAYER_MASK); } #endif diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index dbd71b0c7d8c..09d62b9228ff 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -73,9 +73,17 @@ struct tc_estimator { #define TC_H_ROOT (0xFFFFFFFFU) #define TC_H_INGRESS (0xFFFFFFF1U) +/* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */ +enum tc_link_layer { + TC_LINKLAYER_UNAWARE, /* Indicate unaware old iproute2 util */ + TC_LINKLAYER_ETHERNET, + TC_LINKLAYER_ATM, +}; +#define TC_LINKLAYER_MASK 0x0F /* limit use to lower 4 bits */ + struct tc_ratespec { unsigned char cell_log; - unsigned char __reserved; + __u8 linklayer; /* lower 4 bits */ unsigned short overhead; short cell_align; unsigned short mpu; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 281c1bded1f6..51b968d3febb 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -285,6 +285,45 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) return q; } +/* The linklayer setting were not transferred from iproute2, in older + * versions, and the rate tables lookup systems have been dropped in + * the kernel. To keep backward compatible with older iproute2 tc + * utils, we detect the linklayer setting by detecting if the rate + * table were modified. + * + * For linklayer ATM table entries, the rate table will be aligned to + * 48 bytes, thus some table entries will contain the same value. The + * mpu (min packet unit) is also encoded into the old rate table, thus + * starting from the mpu, we find low and high table entries for + * mapping this cell. If these entries contain the same value, when + * the rate tables have been modified for linklayer ATM. + * + * This is done by rounding mpu to the nearest 48 bytes cell/entry, + * and then roundup to the next cell, calc the table entry one below, + * and compare. + */ +static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) +{ + int low = roundup(r->mpu, 48); + int high = roundup(low+1, 48); + int cell_low = low >> r->cell_log; + int cell_high = (high >> r->cell_log) - 1; + + /* rtab is too inaccurate at rates > 100Mbit/s */ + if ((r->rate > (100000000/8)) || (rtab[0] == 0)) { + pr_debug("TC linklayer: Giving up ATM detection\n"); + return TC_LINKLAYER_ETHERNET; + } + + if ((cell_high > cell_low) && (cell_high < 256) + && (rtab[cell_low] == rtab[cell_high])) { + pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n", + cell_low, cell_high, rtab[cell_high]); + return TC_LINKLAYER_ATM; + } + return TC_LINKLAYER_ETHERNET; +} + static struct qdisc_rate_table *qdisc_rtab_list; struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab) @@ -308,6 +347,8 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *ta rtab->rate = *r; rtab->refcnt = 1; memcpy(rtab->data, nla_data(tab), 1024); + if (r->linklayer == TC_LINKLAYER_UNAWARE) + r->linklayer = __detect_linklayer(r, rtab->data); rtab->next = qdisc_rtab_list; qdisc_rtab_list = rtab; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index eeb8276d7a89..48be3d5c0d92 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -909,6 +909,7 @@ void psched_ratecfg_precompute(struct psched_ratecfg *r, memset(r, 0, sizeof(*r)); r->overhead = conf->overhead; r->rate_bytes_ps = conf->rate; + r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); r->mult = 1; /* * The deal here is to replace a divide by a reciprocal one diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 45e751527dfc..c2178b15ca6e 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1329,6 +1329,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)*arg, *parent; struct nlattr *opt = tca[TCA_OPTIONS]; + struct qdisc_rate_table *rtab = NULL, *ctab = NULL; struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_opt *hopt; @@ -1350,6 +1351,18 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!hopt->rate.rate || !hopt->ceil.rate) goto failure; + /* Keeping backward compatible with rate_table based iproute2 tc */ + if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) { + rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]); + if (rtab) + qdisc_put_rtab(rtab); + } + if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE) { + ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]); + if (ctab) + qdisc_put_rtab(ctab); + } + if (!cl) { /* new class */ struct Qdisc *new_q; int prio; -- cgit v1.2.3 From 50fa3b31f4700deb1a860fa1a04016b889765323 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Sat, 27 Jul 2013 03:24:44 +0200 Subject: batman-adv: check return type of unicast packet preparations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit batadv_unicast(_4addr)_prepare_skb might reallocate the skb's data. And if it tries to do so then this can potentially fail. We shouldn't continue working on this skb in such a case. Signed-off-by: Linus Lüssing Signed-off-by: Marek Lindner Acked-by: Antonio Quartulli Signed-off-by: Antonio Quartulli --- net/batman-adv/unicast.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/batman-adv/unicast.c b/net/batman-adv/unicast.c index 688a0419756b..857e1b8349ee 100644 --- a/net/batman-adv/unicast.c +++ b/net/batman-adv/unicast.c @@ -432,12 +432,16 @@ find_router: switch (packet_type) { case BATADV_UNICAST: - batadv_unicast_prepare_skb(skb, orig_node); + if (!batadv_unicast_prepare_skb(skb, orig_node)) + goto out; + header_len = sizeof(struct batadv_unicast_packet); break; case BATADV_UNICAST_4ADDR: - batadv_unicast_4addr_prepare_skb(bat_priv, skb, orig_node, - packet_subtype); + if (!batadv_unicast_4addr_prepare_skb(bat_priv, skb, orig_node, + packet_subtype)) + goto out; + header_len = sizeof(struct batadv_unicast_4addr_packet); break; default: -- cgit v1.2.3 From 3d483058c8c8b87a167155ca9ddd776dd730bc39 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sun, 18 Aug 2013 13:46:52 +0200 Subject: ipv6: wire up skb->encapsulation When pushing a new header before current one call skb_reset_inner_headers to record the position of the inner headers in the various ipv6 tunnel protocols. We later need this to correctly identify the addresses needed to send back an error in the xfrm layer. This change is safe, because skb->protocol is always checked before dereferencing data from the inner protocol. Cc: Steffen Klassert Cc: YOSHIFUJI Hideaki Cc: Nicolas Dichtel Acked-by: Eric Dumazet Signed-off-by: Hannes Frederic Sowa Signed-off-by: Steffen Klassert --- net/ipv6/ip6_gre.c | 5 +++++ net/ipv6/ip6_tunnel.c | 6 ++++++ net/ipv6/sit.c | 5 +++++ 3 files changed, 16 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index ecd60733e5e2..90747f1973fe 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -724,6 +724,11 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL); } + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + skb_push(skb, gre_hlen); skb_reset_network_header(skb); skb_set_transport_header(skb, sizeof(*ipv6h)); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 1e55866cead7..46ba243605a3 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1027,6 +1027,12 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, init_tel_txopt(&opt, encap_limit); ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL); } + + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index a3437a4cd07e..fbfc5a83867f 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -888,6 +888,11 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, ttl = iph6->hop_limit; tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + err = iptunnel_xmit(dev_net(dev), rt, skb, fl4.saddr, fl4.daddr, IPPROTO_IPV6, tos, ttl, df); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); -- cgit v1.2.3 From 5d0ff542d0264f61dc4bdb34eba39ffb4ea3bc23 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sun, 18 Aug 2013 13:46:57 +0200 Subject: ipv6: xfrm: dereference inner ipv6 header if encapsulated In xfrm6_local_error use inner_header if the packet was encapsulated. Cc: Steffen Klassert Acked-by: Eric Dumazet Signed-off-by: Hannes Frederic Sowa Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_output.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 3ac5ab264fed..e092e306882d 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -59,10 +59,12 @@ static void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) void xfrm6_local_error(struct sk_buff *skb, u32 mtu) { struct flowi6 fl6; + const struct ipv6hdr *hdr; struct sock *sk = skb->sk; + hdr = skb->encapsulation ? inner_ipv6_hdr(skb) : ipv6_hdr(skb); fl6.fl6_dport = inet_sk(sk)->inet_dport; - fl6.daddr = ipv6_hdr(skb)->daddr; + fl6.daddr = hdr->daddr; ipv6_local_error(sk, EMSGSIZE, &fl6, mtu); } -- cgit v1.2.3 From 844d48746e4b281a933aedc0428048a1219b42f4 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sun, 18 Aug 2013 13:47:01 +0200 Subject: xfrm: choose protocol family by skb protocol We need to choose the protocol family by skb->protocol. Otherwise we call the wrong xfrm{4,6}_local_error handler in case an ipv6 sockets is used in ipv4 mode, in which case we should call down to xfrm4_local_error (ip6 sockets are a superset of ip4 ones). We are called before before ip_output functions, so skb->protocol is not reset. Cc: Steffen Klassert Acked-by: Eric Dumazet Signed-off-by: Hannes Frederic Sowa Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 4 ++-- net/xfrm/xfrm_output.c | 10 +++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b41d2d10ff0e..ac5b02515355 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1728,9 +1728,9 @@ static inline int xfrm_skb_dst_mtu(struct sk_buff *skb) { struct sock *sk = skb->sk; - if (sk && sk->sk_family == AF_INET6) + if (sk && skb->protocol == htons(ETH_P_IPV6)) return ip6_skb_dst_mtu(skb); - else if (sk && sk->sk_family == AF_INET) + else if (sk && skb->protocol == htons(ETH_P_IP)) return ip_skb_dst_mtu(skb); return dst_mtu(skb_dst(skb)); } diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 6f5fc612b162..3bb2cdc13b46 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -216,9 +216,17 @@ int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) void xfrm_local_error(struct sk_buff *skb, int mtu) { + unsigned int proto; struct xfrm_state_afinfo *afinfo; - afinfo = xfrm_state_get_afinfo(skb->sk->sk_family); + if (skb->protocol == htons(ETH_P_IP)) + proto = AF_INET; + else if (skb->protocol == htons(ETH_P_IPV6)) + proto = AF_INET6; + else + return; + + afinfo = xfrm_state_get_afinfo(proto); if (!afinfo) return; -- cgit v1.2.3 From 4b08a8f1bd8cb4541c93ec170027b4d0782dab52 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Fri, 16 Aug 2013 13:02:27 +0200 Subject: ipv6: remove max_addresses check from ipv6_create_tempaddr Because of the max_addresses check attackers were able to disable privacy extensions on an interface by creating enough autoconfigured addresses: But the check is not actually needed: max_addresses protects the kernel to install too many ipv6 addresses on an interface and guards addrconf_prefix_rcv to install further addresses as soon as this limit is reached. We only generate temporary addresses in direct response of a new address showing up. As soon as we filled up the maximum number of addresses of an interface, we stop installing more addresses and thus also stop generating more temp addresses. Even if the attacker tries to generate a lot of temporary addresses by announcing a prefix and removing it again (lifetime == 0) we won't install more temp addresses, because the temporary addresses do count to the maximum number of addresses, thus we would stop installing new autoconfigured addresses when the limit is reached. This patch fixes CVE-2013-0343 (but other layer-2 attacks are still possible). Thanks to Ding Tianhong to bring this topic up again. Cc: Ding Tianhong Cc: George Kargiotakis Cc: P J P Cc: YOSHIFUJI Hideaki Signed-off-by: Hannes Frederic Sowa Acked-by: Ding Tianhong Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index da4241c8c7da..498ea99194af 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1126,12 +1126,10 @@ retry: if (ifp->flags & IFA_F_OPTIMISTIC) addr_flags |= IFA_F_OPTIMISTIC; - ift = !max_addresses || - ipv6_count_addresses(idev) < max_addresses ? - ipv6_add_addr(idev, &addr, NULL, tmp_plen, - ipv6_addr_scope(&addr), addr_flags, - tmp_valid_lft, tmp_prefered_lft) : NULL; - if (IS_ERR_OR_NULL(ift)) { + ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen, + ipv6_addr_scope(&addr), addr_flags, + tmp_valid_lft, tmp_prefered_lft); + if (IS_ERR(ift)) { in6_ifa_put(ifp); in6_dev_put(idev); pr_info("%s: retry temporary address regeneration\n", __func__); -- cgit v1.2.3 From f46078cfcd77fa5165bf849f5e568a7ac5fa569c Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Fri, 16 Aug 2013 13:30:07 +0200 Subject: ipv6: drop packets with multiple fragmentation headers It is not allowed for an ipv6 packet to contain multiple fragmentation headers. So discard packets which were already reassembled by fragmentation logic and send back a parameter problem icmp. The updates for RFC 6980 will come in later, I have to do a bit more research here. Cc: YOSHIFUJI Hideaki Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + net/ipv6/reassembly.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'net') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 850e95bc766c..b8b7dc755752 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -101,6 +101,7 @@ struct inet6_skb_parm { #define IP6SKB_FORWARDED 2 #define IP6SKB_REROUTED 4 #define IP6SKB_ROUTERALERT 8 +#define IP6SKB_FRAGMENTED 16 }; #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 790d9f4b8b0b..1aeb473b2cc6 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -490,6 +490,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, ipv6_hdr(head)->payload_len = htons(payload_len); ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); IP6CB(head)->nhoff = nhoff; + IP6CB(head)->flags |= IP6SKB_FRAGMENTED; /* Yes, and fold redundant checksum back. 8) */ if (head->ip_summed == CHECKSUM_COMPLETE) @@ -524,6 +525,9 @@ static int ipv6_frag_rcv(struct sk_buff *skb) struct net *net = dev_net(skb_dst(skb)->dev); int evicted; + if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) + goto fail_hdr; + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); /* Jumbo payload inhibits frag. header */ @@ -544,6 +548,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS); IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb); + IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; return 1; } -- cgit v1.2.3 From 7ed5c5ae96d23da22de95e1c7a239537acd378b1 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Fri, 16 Aug 2013 19:04:36 +0400 Subject: tcp: set timestamps for restored skb-s When the repair mode is turned off, the write queue seqs are updated so that the whole queue is considered to be 'already sent. The "when" field must be set for such skb. It's used in tcp_rearm_rto for example. If the "when" field isn't set, the retransmit timeout can be calculated incorrectly and a tcp connected can stop for two minutes (TCP_RTO_MAX). Acked-by: Pavel Emelyanov Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Signed-off-by: Andrey Vagin Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5423223e93c2..b2f6c74861af 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1120,6 +1120,13 @@ new_segment: if (!skb) goto wait_for_memory; + /* + * All packets are restored as if they have + * already been sent. + */ + if (tp->repair) + TCP_SKB_CB(skb)->when = tcp_time_stamp; + /* * Check whether we can use HW checksum. */ -- cgit v1.2.3 From 8bcdeaff5ed544704a9a691d4aef0adb3f9c5b8f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 19 Aug 2013 16:40:22 -0400 Subject: packet: restore packet statistics tp_packets to include drops getsockopt PACKET_STATISTICS returns tp_packets + tp_drops. Commit ee80fbf301 ("packet: account statistics only in tpacket_stats_u") cleaned up the getsockopt PACKET_STATISTICS code. This also changed semantics. Historically, tp_packets included tp_drops on return. The commit removed the line that adds tp_drops into tp_packets. This patch reinstates the old semantics. Signed-off-by: Willem de Bruijn Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/packet/af_packet.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 4b66c752eae5..75c8bbf598c8 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3259,9 +3259,11 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, if (po->tp_version == TPACKET_V3) { lv = sizeof(struct tpacket_stats_v3); + st.stats3.tp_packets += st.stats3.tp_drops; data = &st.stats3; } else { lv = sizeof(struct tpacket_stats); + st.stats1.tp_packets += st.stats1.tp_drops; data = &st.stats1; } -- cgit v1.2.3 From ef40b7ef181b7b1a24df2ef2d1ef84956bffa635 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Tue, 20 Aug 2013 17:10:18 +0900 Subject: bridge: Use the correct bit length for bitmap functions in the VLAN code The VLAN code needs to know the length of the per-port VLAN bitmap to perform its most basic operations (retrieving VLAN informations, removing VLANs, forwarding database manipulation, etc). Unfortunately, in the current implementation we are using a macro that indicates the bitmap size in longs in places where the size in bits is expected, which in some cases can cause what appear to be random failures. Use the correct macro. Signed-off-by: Toshiaki Makita Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 10 +++++----- net/bridge/br_netlink.c | 4 ++-- net/bridge/br_vlan.c | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 60aca9109a50..ffd5874f2592 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -161,7 +161,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) if (!pv) return; - for_each_set_bit_from(vid, pv->vlan_bitmap, BR_VLAN_BITMAP_LEN) { + for_each_set_bit_from(vid, pv->vlan_bitmap, VLAN_N_VID) { f = __br_fdb_get(br, br->dev->dev_addr, vid); if (f && f->is_local && !f->dst) fdb_delete(br, f); @@ -730,7 +730,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], /* VID was specified, so use it. */ err = __br_fdb_add(ndm, p, addr, nlh_flags, vid); } else { - if (!pv || bitmap_empty(pv->vlan_bitmap, BR_VLAN_BITMAP_LEN)) { + if (!pv || bitmap_empty(pv->vlan_bitmap, VLAN_N_VID)) { err = __br_fdb_add(ndm, p, addr, nlh_flags, 0); goto out; } @@ -739,7 +739,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], * specify a VLAN. To be nice, add/update entry for every * vlan on this port. */ - for_each_set_bit(vid, pv->vlan_bitmap, BR_VLAN_BITMAP_LEN) { + for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { err = __br_fdb_add(ndm, p, addr, nlh_flags, vid); if (err) goto out; @@ -817,7 +817,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], err = __br_fdb_delete(p, addr, vid); } else { - if (!pv || bitmap_empty(pv->vlan_bitmap, BR_VLAN_BITMAP_LEN)) { + if (!pv || bitmap_empty(pv->vlan_bitmap, VLAN_N_VID)) { err = __br_fdb_delete(p, addr, 0); goto out; } @@ -827,7 +827,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], * vlan on this port. */ err = -ENOENT; - for_each_set_bit(vid, pv->vlan_bitmap, BR_VLAN_BITMAP_LEN) { + for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { err &= __br_fdb_delete(p, addr, vid); } } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 1fc30abd3a52..b9259efa636e 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -132,7 +132,7 @@ static int br_fill_ifinfo(struct sk_buff *skb, else pv = br_get_vlan_info(br); - if (!pv || bitmap_empty(pv->vlan_bitmap, BR_VLAN_BITMAP_LEN)) + if (!pv || bitmap_empty(pv->vlan_bitmap, VLAN_N_VID)) goto done; af = nla_nest_start(skb, IFLA_AF_SPEC); @@ -140,7 +140,7 @@ static int br_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; pvid = br_get_pvid(pv); - for_each_set_bit(vid, pv->vlan_bitmap, BR_VLAN_BITMAP_LEN) { + for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { vinfo.vid = vid; vinfo.flags = 0; if (vid == pvid) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index bd58b45f5f90..9a9ffe7e4019 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -108,7 +108,7 @@ static int __vlan_del(struct net_port_vlans *v, u16 vid) clear_bit(vid, v->vlan_bitmap); v->num_vlans--; - if (bitmap_empty(v->vlan_bitmap, BR_VLAN_BITMAP_LEN)) { + if (bitmap_empty(v->vlan_bitmap, VLAN_N_VID)) { if (v->port_idx) rcu_assign_pointer(v->parent.port->vlan_info, NULL); else @@ -122,7 +122,7 @@ static void __vlan_flush(struct net_port_vlans *v) { smp_wmb(); v->pvid = 0; - bitmap_zero(v->vlan_bitmap, BR_VLAN_BITMAP_LEN); + bitmap_zero(v->vlan_bitmap, VLAN_N_VID); if (v->port_idx) rcu_assign_pointer(v->parent.port->vlan_info, NULL); else -- cgit v1.2.3 From 2a3ba63c235fdcd37f6451bdf4a0c7865a3930cf Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 20 Aug 2013 11:28:50 +0200 Subject: mac80211: add missing channel context release IBSS needs to release the channel context when leaving but I evidently missed that. Fix it. Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/ibss.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index ea7b9c2c7e66..5e8bb3bee3c2 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1138,6 +1138,7 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED | BSS_CHANGED_IBSS); + ieee80211_vif_release_channel(sdata); synchronize_rcu(); kfree(presp); -- cgit v1.2.3 From 2dfca312a91631311c1cf7c090246cc8103de038 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 20 Aug 2013 19:43:54 +0200 Subject: mac80211: add a flag to indicate CCK support for HT clients brcm80211 cannot handle sending frames with CCK rates as part of an A-MPDU session. Other drivers may have issues too. Set the flag in all drivers that have been tested with CCK rates. This fixes a reported brcmsmac regression introduced in commit ef47a5e4f1aaf1d0e2e6875e34b2c9595897bef6 "mac80211/minstrel_ht: fix cck rate sampling" Cc: stable@vger.kernel.org # 3.10 Reported-by: Tom Gundersen Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath9k/init.c | 3 ++- drivers/net/wireless/ath/carl9170/main.c | 3 ++- drivers/net/wireless/rt2x00/rt2800lib.c | 3 ++- include/net/mac80211.h | 1 + net/mac80211/rc80211_minstrel_ht.c | 3 +++ 5 files changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c index 16f8b201642b..026a2a067b46 100644 --- a/drivers/net/wireless/ath/ath9k/init.c +++ b/drivers/net/wireless/ath/ath9k/init.c @@ -802,7 +802,8 @@ void ath9k_set_hw_capab(struct ath_softc *sc, struct ieee80211_hw *hw) IEEE80211_HW_PS_NULLFUNC_STACK | IEEE80211_HW_SPECTRUM_MGMT | IEEE80211_HW_REPORTS_TX_ACK_STATUS | - IEEE80211_HW_SUPPORTS_RC_TABLE; + IEEE80211_HW_SUPPORTS_RC_TABLE | + IEEE80211_HW_SUPPORTS_HT_CCK_RATES; if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_HT) { hw->flags |= IEEE80211_HW_AMPDU_AGGREGATION; diff --git a/drivers/net/wireless/ath/carl9170/main.c b/drivers/net/wireless/ath/carl9170/main.c index 4a33c6e39ca2..349fa22a921a 100644 --- a/drivers/net/wireless/ath/carl9170/main.c +++ b/drivers/net/wireless/ath/carl9170/main.c @@ -1860,7 +1860,8 @@ void *carl9170_alloc(size_t priv_size) IEEE80211_HW_PS_NULLFUNC_STACK | IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC | IEEE80211_HW_SUPPORTS_RC_TABLE | - IEEE80211_HW_SIGNAL_DBM; + IEEE80211_HW_SIGNAL_DBM | + IEEE80211_HW_SUPPORTS_HT_CCK_RATES; if (!modparam_noht) { /* diff --git a/drivers/net/wireless/rt2x00/rt2800lib.c b/drivers/net/wireless/rt2x00/rt2800lib.c index 1f80ea5e29dd..1b41c8eda12d 100644 --- a/drivers/net/wireless/rt2x00/rt2800lib.c +++ b/drivers/net/wireless/rt2x00/rt2800lib.c @@ -6133,7 +6133,8 @@ static int rt2800_probe_hw_mode(struct rt2x00_dev *rt2x00dev) IEEE80211_HW_SUPPORTS_PS | IEEE80211_HW_PS_NULLFUNC_STACK | IEEE80211_HW_AMPDU_AGGREGATION | - IEEE80211_HW_REPORTS_TX_ACK_STATUS; + IEEE80211_HW_REPORTS_TX_ACK_STATUS | + IEEE80211_HW_SUPPORTS_HT_CCK_RATES; /* * Don't set IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING for USB devices diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 5b7a3dadadde..551ba6a6a073 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1499,6 +1499,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_RC_TABLE = 1<<24, IEEE80211_HW_P2P_DEV_ADDR_FOR_INTF = 1<<25, IEEE80211_HW_TIMING_BEACON_ONLY = 1<<26, + IEEE80211_HW_SUPPORTS_HT_CCK_RATES = 1<<27, }; /** diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index f5aed963b22e..f3bbea1eb9e7 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -828,6 +828,9 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, if (sband->band != IEEE80211_BAND_2GHZ) return; + if (!(mp->hw->flags & IEEE80211_HW_SUPPORTS_HT_CCK_RATES)) + return; + mi->cck_supported = 0; mi->cck_supported_short = 0; for (i = 0; i < 4; i++) { -- cgit v1.2.3 From 75a423f493ffdf741acae27bf179cd560f7813d7 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Wed, 21 Aug 2013 15:30:25 +0200 Subject: mac80211: ibss: fix ignored channel parameter my earlier patch "mac80211: change IBSS channel state to chandef" created a regression by ignoring the channel parameter in __ieee80211_sta_join_ibss, which breaks IBSS channel selection. This patch fixes this situation by using the right channel and adopting the selected bandwidth mode. Cc: stable@vger.kernel.org Signed-off-by: Simon Wunderlich Signed-off-by: Johannes Berg --- net/mac80211/ibss.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 5e8bb3bee3c2..2d45643c964e 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -36,7 +36,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, const u8 *bssid, const int beacon_int, - struct ieee80211_channel *chan, + struct cfg80211_chan_def *req_chandef, const u32 basic_rates, const u16 capability, u64 tsf, bool creator) @@ -51,6 +51,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, u32 bss_change; u8 supp_rates[IEEE80211_MAX_SUPP_RATES]; struct cfg80211_chan_def chandef; + struct ieee80211_channel *chan; struct beacon_data *presp; int frame_len; @@ -81,7 +82,9 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, sdata->drop_unencrypted = capability & WLAN_CAPABILITY_PRIVACY ? 1 : 0; - chandef = ifibss->chandef; + /* make a copy of the chandef, it could be modified below. */ + chandef = *req_chandef; + chan = chandef.chan; if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chandef)) { chandef.width = NL80211_CHAN_WIDTH_20; chandef.center_freq1 = chan->center_freq; @@ -259,10 +262,12 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss = container_of((void *)bss, struct cfg80211_bss, priv); struct ieee80211_supported_band *sband; + struct cfg80211_chan_def chandef; u32 basic_rates; int i, j; u16 beacon_int = cbss->beacon_interval; const struct cfg80211_bss_ies *ies; + enum nl80211_channel_type chan_type; u64 tsf; sdata_assert_lock(sdata); @@ -270,6 +275,26 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, if (beacon_int < 10) beacon_int = 10; + switch (sdata->u.ibss.chandef.width) { + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + case NL80211_CHAN_WIDTH_40: + chan_type = cfg80211_get_chandef_type(&sdata->u.ibss.chandef); + cfg80211_chandef_create(&chandef, cbss->channel, chan_type); + break; + case NL80211_CHAN_WIDTH_5: + case NL80211_CHAN_WIDTH_10: + cfg80211_chandef_create(&chandef, cbss->channel, + NL80211_CHAN_WIDTH_20_NOHT); + chandef.width = sdata->u.ibss.chandef.width; + break; + default: + /* fall back to 20 MHz for unsupported modes */ + cfg80211_chandef_create(&chandef, cbss->channel, + NL80211_CHAN_WIDTH_20_NOHT); + break; + } + sband = sdata->local->hw.wiphy->bands[cbss->channel->band]; basic_rates = 0; @@ -294,7 +319,7 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, __ieee80211_sta_join_ibss(sdata, cbss->bssid, beacon_int, - cbss->channel, + &chandef, basic_rates, cbss->capability, tsf, false); @@ -736,7 +761,7 @@ static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata) sdata->drop_unencrypted = 0; __ieee80211_sta_join_ibss(sdata, bssid, sdata->vif.bss_conf.beacon_int, - ifibss->chandef.chan, ifibss->basic_rates, + &ifibss->chandef, ifibss->basic_rates, capability, 0, true); } -- cgit v1.2.3 From 9d47b380563174e5c15776ce6ca9bab4ee7d59e3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 21 Aug 2013 16:08:02 +0200 Subject: Revert "genetlink: fix family dump race" This reverts commit 58ad436fcf49810aa006016107f494c9ac9013db. It turns out that the change introduced a potential deadlock by causing a locking dependency with netlink's cb_mutex. I can't seem to find a way to resolve this without doing major changes to the locking, so revert this. Signed-off-by: Johannes Berg Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index f85f8a2ad6cf..512718adb0d5 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -789,10 +789,6 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) struct net *net = sock_net(skb->sk); int chains_to_skip = cb->args[0]; int fams_to_skip = cb->args[1]; - bool need_locking = chains_to_skip || fams_to_skip; - - if (need_locking) - genl_lock(); for (i = chains_to_skip; i < GENL_FAM_TAB_SIZE; i++) { n = 0; @@ -814,9 +810,6 @@ errout: cb->args[0] = i; cb->args[1] = n; - if (need_locking) - genl_unlock(); - return skb->len; } -- cgit v1.2.3 From c92a59eca86f5d13ae4d481c3bae6b54609fe006 Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Thu, 22 Aug 2013 12:07:35 +0800 Subject: ipv6: handle Redirect ICMP Message with no Redirected Header option rfc 4861 says the Redirected Header option is optional, so the kernel should not drop the Redirect Message that has no Redirected Header option. In this patch, the function ip6_redirect_no_header() is introduced to deal with that condition. Signed-off-by: Duan Jiong Acked-by: Hannes Frederic Sowa --- include/net/ip6_route.h | 2 ++ net/ipv6/ndisc.c | 4 +++- net/ipv6/route.c | 21 +++++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 260f83f16bcf..f667248202b6 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -135,6 +135,8 @@ extern void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, extern void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu); extern void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark); +extern void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, + u32 mark); extern void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk); struct netlink_callback; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 79aa9652ed86..04d31c2fbef1 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1369,8 +1369,10 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) return; - if (!ndopts.nd_opts_rh) + if (!ndopts.nd_opts_rh) { + ip6_redirect_no_header(skb, dev_net(skb->dev), 0, 0); return; + } hdr = (u8 *)ndopts.nd_opts_rh; hdr += 8; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b70f8979003b..8d9a93ed9c59 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1178,6 +1178,27 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) } EXPORT_SYMBOL_GPL(ip6_redirect); +void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, + u32 mark) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = oif; + fl6.flowi6_mark = mark; + fl6.flowi6_flags = 0; + fl6.daddr = msg->dest; + fl6.saddr = iph->daddr; + + dst = ip6_route_output(net, NULL, &fl6); + if (!dst->error) + rt6_do_redirect(dst, NULL, skb); + dst_release(dst); +} + void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) { ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark); -- cgit v1.2.3 From 5a25cf1e310888eb333f9e034be84a8117111d30 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 26 Aug 2013 12:31:19 +0200 Subject: xfrm: revert ipv4 mtu determination to dst_mtu In commit 0ea9d5e3e0e03a63b11392f5613378977dae7eca ("xfrm: introduce helper for safe determination of mtu") I switched the determination of ipv4 mtus from dst_mtu to ip_skb_dst_mtu. This was an error because in case of IP_PMTUDISC_PROBE we fall back to the interface mtu, which is never correct for ipv4 ipsec. This patch partly reverts 0ea9d5e3e0e03a63b11392f5613378977dae7eca ("xfrm: introduce helper for safe determination of mtu"). Cc: Steffen Klassert Signed-off-by: Hannes Frederic Sowa Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 12 ------------ net/ipv4/xfrm4_output.c | 2 +- net/ipv6/xfrm6_output.c | 8 +++++--- 3 files changed, 6 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index ac5b02515355..e823786e7c66 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -1724,15 +1723,4 @@ static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m) return ret; } -static inline int xfrm_skb_dst_mtu(struct sk_buff *skb) -{ - struct sock *sk = skb->sk; - - if (sk && skb->protocol == htons(ETH_P_IPV6)) - return ip6_skb_dst_mtu(skb); - else if (sk && skb->protocol == htons(ETH_P_IP)) - return ip_skb_dst_mtu(skb); - return dst_mtu(skb_dst(skb)); -} - #endif /* _NET_XFRM_H */ diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 80baf4a3b1b5..baa0f63731fd 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -28,7 +28,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) goto out; - mtu = xfrm_skb_dst_mtu(skb); + mtu = dst_mtu(skb_dst(skb)); if (skb->len > mtu) { if (skb->sk) xfrm_local_error(skb, mtu); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index e092e306882d..6cd625e37706 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -140,10 +140,12 @@ static int __xfrm6_output(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct xfrm_state *x = dst->xfrm; - int mtu = xfrm_skb_dst_mtu(skb); + int mtu; - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; + if (skb->protocol == htons(ETH_P_IPV6)) + mtu = ip6_skb_dst_mtu(skb); + else + mtu = dst_mtu(skb_dst(skb)); if (skb->len > mtu && xfrm6_local_dontfrag(skb)) { xfrm6_local_rxpmtu(skb, mtu); -- cgit v1.2.3 From 9c9c9ad5fae7e9ef56a38acb508a01919b225e9a Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 26 Aug 2013 12:31:23 +0200 Subject: ipv6: set skb->protocol on tcp, raw and ip6_append_data genereated skbs Currently we don't initialize skb->protocol when transmitting data via tcp, raw(with and without inclhdr) or udp+ufo or appending data directly to the socket transmit queue (via ip6_append_data). This needs to be done so that we can get the correct mtu in the xfrm layer. Setting of skb->protocol happens only in functions where we also have a transmitting socket and a new skb, so we don't overwrite old values. Cc: Steffen Klassert Cc: Eric Dumazet Acked-by: Eric Dumazet Signed-off-by: Hannes Frederic Sowa Signed-off-by: Steffen Klassert --- net/ipv6/ip6_output.c | 3 +++ net/ipv6/raw.c | 1 + 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 6e3ddf806ec2..e7ceb6c871d1 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -238,6 +238,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, hdr->saddr = fl6->saddr; hdr->daddr = *first_hop; + skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; @@ -1057,6 +1058,7 @@ static inline int ip6_ufo_append_data(struct sock *sk, /* initialize protocol header pointer */ skb->transport_header = skb->network_header + fragheaderlen; + skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; } @@ -1359,6 +1361,7 @@ alloc_new_skb: /* * Fill in the control structures */ + skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = CHECKSUM_NONE; skb->csum = 0; /* reserve for fragmentation and ipsec header */ diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index c45f7a5c36e9..cdaed47ba932 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -628,6 +628,7 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, goto error; skb_reserve(skb, hlen); + skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; skb_dst_set(skb, &rt->dst); -- cgit v1.2.3 From d661684cf6820331feae71146c35da83d794467e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 22 Aug 2013 11:39:15 -0700 Subject: net: Check the correct namespace when spoofing pid over SCM_RIGHTS This is a security bug. The follow-up will fix nsproxy to discourage this type of issue from happening again. Cc: stable@vger.kernel.org Signed-off-by: Andy Lutomirski Reviewed-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/core/scm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/scm.c b/net/core/scm.c index 03795d0147f2..b4da80b1cc07 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -54,7 +54,7 @@ static __inline__ int scm_check_creds(struct ucred *creds) return -EINVAL; if ((creds->pid == task_tgid_vnr(current) || - ns_capable(current->nsproxy->pid_ns->user_ns, CAP_SYS_ADMIN)) && + ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) && ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) || uid_eq(uid, cred->suid)) || nsown_capable(CAP_SETUID)) && ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) || -- cgit v1.2.3 From 302a50bc941010d7a67f288fd0db31981e4d722d Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 28 Aug 2013 08:47:14 +0200 Subject: xfrm: Fix potential null pointer dereference in xdst_queue_output The net_device might be not set on the skb when we try refcounting. This leads to a null pointer dereference in xdst_queue_output(). It turned out that the refcount to the net_device is not needed after all. The dst_entry has a refcount to the net_device before we queue the skb, so it can't go away. Therefore we can remove the refcount on queueing to fix the null pointer dereference. Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e52cab3591dd..f77c371ea72b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -320,10 +320,8 @@ static void xfrm_queue_purge(struct sk_buff_head *list) { struct sk_buff *skb; - while ((skb = skb_dequeue(list)) != NULL) { - dev_put(skb->dev); + while ((skb = skb_dequeue(list)) != NULL) kfree_skb(skb); - } } /* Rule must be locked. Release descentant resources, announce @@ -1758,7 +1756,6 @@ static void xfrm_policy_queue_process(unsigned long arg) struct sk_buff *skb; struct sock *sk; struct dst_entry *dst; - struct net_device *dev; struct xfrm_policy *pol = (struct xfrm_policy *)arg; struct xfrm_policy_queue *pq = &pol->polq; struct flowi fl; @@ -1805,7 +1802,6 @@ static void xfrm_policy_queue_process(unsigned long arg) dst = xfrm_lookup(xp_net(pol), skb_dst(skb)->path, &fl, skb->sk, 0); if (IS_ERR(dst)) { - dev_put(skb->dev); kfree_skb(skb); continue; } @@ -1814,9 +1810,7 @@ static void xfrm_policy_queue_process(unsigned long arg) skb_dst_drop(skb); skb_dst_set(skb, dst); - dev = skb->dev; err = dst_output(skb); - dev_put(dev); } return; @@ -1839,7 +1833,6 @@ static int xdst_queue_output(struct sk_buff *skb) } skb_dst_force(skb); - dev_hold(skb->dev); spin_lock_bh(&pq->hold_queue.lock); -- cgit v1.2.3 From 347e2233b7667e336d9f671f1a52dfa3f0416e2c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 28 Aug 2013 13:35:13 -0400 Subject: SUNRPC: Fix memory corruption issue on 32-bit highmem systems Some architectures, such as ARM-32 do not return the same base address when you call kmap_atomic() twice on the same page. This causes problems for the memmove() call in the XDR helper routine "_shift_data_right_pages()", since it defeats the detection of overlapping memory ranges, and has been seen to corrupt memory. The fix is to distinguish between the case where we're doing an inter-page copy or not. In the former case of we know that the memory ranges cannot possibly overlap, so we can additionally micro-optimise by replacing memmove() with memcpy(). Reported-by: Mark Young Reported-by: Matt Craighead Cc: Bruce Fields Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust Tested-by: Matt Craighead --- net/sunrpc/xdr.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 75edcfad6e26..1504bb11e4f3 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -207,10 +207,13 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base, pgfrom_base -= copy; vto = kmap_atomic(*pgto); - vfrom = kmap_atomic(*pgfrom); - memmove(vto + pgto_base, vfrom + pgfrom_base, copy); + if (*pgto != *pgfrom) { + vfrom = kmap_atomic(*pgfrom); + memcpy(vto + pgto_base, vfrom + pgfrom_base, copy); + kunmap_atomic(vfrom); + } else + memmove(vto + pgto_base, vto + pgfrom_base, copy); flush_dcache_page(*pgto); - kunmap_atomic(vfrom); kunmap_atomic(vto); } while ((len -= copy) != 0); -- cgit v1.2.3 From 9b96309c5b0b9e466773c07a5bc8b7b68fcf010a Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 23 Aug 2013 12:44:55 -0700 Subject: genl: Fix genl dumpit() locking. In case of genl-family with parallel ops off, dumpif() callback is expected to run under genl_lock, But commit def3117493eafd9df (genl: Allow concurrent genl callbacks.) changed this behaviour where only first dumpit() op was called under genl-lock. For subsequent dump, only nlk->cb_lock was taken. Following patch fixes it by defining locked dumpit() and done() callback which takes care of genl-locking. CC: Jesse Gross CC: Johannes Berg Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 51 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 512718adb0d5..7e0f4d199ade 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -544,6 +544,30 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, } EXPORT_SYMBOL(genlmsg_put); +static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct genl_ops *ops = cb->data; + int rc; + + genl_lock(); + rc = ops->dumpit(skb, cb); + genl_unlock(); + return rc; +} + +static int genl_lock_done(struct netlink_callback *cb) +{ + struct genl_ops *ops = cb->data; + int rc = 0; + + if (ops->done) { + genl_lock(); + rc = ops->done(cb); + genl_unlock(); + } + return rc; +} + static int genl_family_rcv_msg(struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh) @@ -572,15 +596,32 @@ static int genl_family_rcv_msg(struct genl_family *family, return -EPERM; if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) { - struct netlink_dump_control c = { - .dump = ops->dumpit, - .done = ops->done, - }; + int rc; if (ops->dumpit == NULL) return -EOPNOTSUPP; - return netlink_dump_start(net->genl_sock, skb, nlh, &c); + if (!family->parallel_ops) { + struct netlink_dump_control c = { + .data = ops, + .dump = genl_lock_dumpit, + .done = genl_lock_done, + }; + + genl_unlock(); + rc = netlink_dump_start(net->genl_sock, skb, nlh, &c); + genl_lock(); + + } else { + struct netlink_dump_control c = { + .dump = ops->dumpit, + .done = ops->done, + }; + + rc = netlink_dump_start(net->genl_sock, skb, nlh, &c); + } + + return rc; } if (ops->doit == NULL) -- cgit v1.2.3 From 33c6b1f6b154894321f5734e50c66621e9134e7e Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 23 Aug 2013 12:45:04 -0700 Subject: genl: Hold reference on correct module while netlink-dump. netlink dump operations take module as parameter to hold reference for entire netlink dump duration. Currently it holds ref only on genl module which is not correct when we use ops registered to genl from another module. Following patch adds module pointer to genl_ops so that netlink can hold ref count on it. CC: Jesse Gross CC: Johannes Berg Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/genetlink.h | 20 ++++++++++++++++++-- net/netlink/genetlink.c | 20 +++++++++++--------- 2 files changed, 29 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 93024a47e0e2..8e0b6c856a13 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -61,6 +61,7 @@ struct genl_family { struct list_head ops_list; /* private */ struct list_head family_list; /* private */ struct list_head mcast_groups; /* private */ + struct module *module; }; /** @@ -121,9 +122,24 @@ struct genl_ops { struct list_head ops_list; }; -extern int genl_register_family(struct genl_family *family); -extern int genl_register_family_with_ops(struct genl_family *family, +extern int __genl_register_family(struct genl_family *family); + +static inline int genl_register_family(struct genl_family *family) +{ + family->module = THIS_MODULE; + return __genl_register_family(family); +} + +extern int __genl_register_family_with_ops(struct genl_family *family, struct genl_ops *ops, size_t n_ops); + +static inline int genl_register_family_with_ops(struct genl_family *family, + struct genl_ops *ops, size_t n_ops) +{ + family->module = THIS_MODULE; + return __genl_register_family_with_ops(family, ops, n_ops); +} + extern int genl_unregister_family(struct genl_family *family); extern int genl_register_ops(struct genl_family *, struct genl_ops *ops); extern int genl_unregister_ops(struct genl_family *, struct genl_ops *ops); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 7e0f4d199ade..0c741cec4d0d 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -364,7 +364,7 @@ int genl_unregister_ops(struct genl_family *family, struct genl_ops *ops) EXPORT_SYMBOL(genl_unregister_ops); /** - * genl_register_family - register a generic netlink family + * __genl_register_family - register a generic netlink family * @family: generic netlink family * * Registers the specified family after validating it first. Only one @@ -374,7 +374,7 @@ EXPORT_SYMBOL(genl_unregister_ops); * * Return 0 on success or a negative error code. */ -int genl_register_family(struct genl_family *family) +int __genl_register_family(struct genl_family *family) { int err = -EINVAL; @@ -430,10 +430,10 @@ errout_locked: errout: return err; } -EXPORT_SYMBOL(genl_register_family); +EXPORT_SYMBOL(__genl_register_family); /** - * genl_register_family_with_ops - register a generic netlink family + * __genl_register_family_with_ops - register a generic netlink family * @family: generic netlink family * @ops: operations to be registered * @n_ops: number of elements to register @@ -457,12 +457,12 @@ EXPORT_SYMBOL(genl_register_family); * * Return 0 on success or a negative error code. */ -int genl_register_family_with_ops(struct genl_family *family, +int __genl_register_family_with_ops(struct genl_family *family, struct genl_ops *ops, size_t n_ops) { int err, i; - err = genl_register_family(family); + err = __genl_register_family(family); if (err) return err; @@ -476,7 +476,7 @@ err_out: genl_unregister_family(family); return err; } -EXPORT_SYMBOL(genl_register_family_with_ops); +EXPORT_SYMBOL(__genl_register_family_with_ops); /** * genl_unregister_family - unregister generic netlink family @@ -603,22 +603,24 @@ static int genl_family_rcv_msg(struct genl_family *family, if (!family->parallel_ops) { struct netlink_dump_control c = { + .module = family->module, .data = ops, .dump = genl_lock_dumpit, .done = genl_lock_done, }; genl_unlock(); - rc = netlink_dump_start(net->genl_sock, skb, nlh, &c); + rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c); genl_lock(); } else { struct netlink_dump_control c = { + .module = family->module, .dump = ops->dumpit, .done = ops->done, }; - rc = netlink_dump_start(net->genl_sock, skb, nlh, &c); + rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c); } return rc; -- cgit v1.2.3 From c7781a6e3c4a9a17e144ec2db00ebfea327bd627 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Tue, 27 Aug 2013 12:20:40 +0400 Subject: tcp: initialize rcv_tstamp for restored sockets u32 rcv_tstamp; /* timestamp of last received ACK */ Its value used in tcp_retransmit_timer, which closes socket if the last ack was received more then TCP_RTO_MAX ago. Currently rcv_tstamp is initialized to zero and if tcp_retransmit_timer is called before receiving a first ack, the connection is closed. This patch initializes rcv_tstamp to a timestamp, when a socket was restored. Cc: Pavel Emelyanov Cc: Eric Dumazet Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Reported-by: Cyrill Gorcunov Signed-off-by: Andrey Vagin Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 92fde8d1aa82..e2972993c671 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2814,6 +2814,8 @@ void tcp_connect_init(struct sock *sk) if (likely(!tp->repair)) tp->rcv_nxt = 0; + else + tp->rcv_tstamp = tcp_time_stamp; tp->rcv_wup = tp->rcv_nxt; tp->copied_seq = tp->rcv_nxt; -- cgit v1.2.3 From e3e12028315749b7fa2edbc37328e5847be9ede9 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Tue, 27 Aug 2013 12:21:55 +0400 Subject: tcp: don't apply tsoffset if rcv_tsecr is zero The zero value means that tsecr is not valid, so it's a special case. tsoffset is used to customize tcp_time_stamp for one socket. tsoffset is usually zero, it's used when a socket was moved from one host to another host. Currently this issue affects logic of tcp_rcv_rtt_measure_ts. Due to incorrect value of rcv_tsecr, tcp_rcv_rtt_measure_ts sets rto to TCP_RTO_MAX. Cc: Pavel Emelyanov Cc: Eric Dumazet Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Reported-by: Cyrill Gorcunov Signed-off-by: Andrey Vagin Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 28af45abe062..3ca2139a130b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3535,7 +3535,10 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr ++ptr; tp->rx_opt.rcv_tsval = ntohl(*ptr); ++ptr; - tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; + if (*ptr) + tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; + else + tp->rx_opt.rcv_tsecr = 0; return true; } return false; @@ -3560,7 +3563,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb, } tcp_parse_options(skb, &tp->rx_opt, 1, NULL); - if (tp->rx_opt.saw_tstamp) + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; return true; @@ -5316,7 +5319,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, int saved_clamp = tp->rx_opt.mss_clamp; tcp_parse_options(skb, &tp->rx_opt, 0, &foc); - if (tp->rx_opt.saw_tstamp) + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; if (th->ack) { -- cgit v1.2.3 From c27c9322d015dc1d9dfdf31724fca71c0476c4d1 Mon Sep 17 00:00:00 2001 From: Chris Clark Date: Tue, 27 Aug 2013 12:02:15 -0600 Subject: ipv4: sendto/hdrincl: don't use destination address found in header ipv4: raw_sendmsg: don't use header's destination address A sendto() regression was bisected and found to start with commit f8126f1d5136be1 (ipv4: Adjust semantics of rt->rt_gateway.) The problem is that it tries to ARP-lookup the constructed packet's destination address rather than the explicitly provided address. Fix this using FLOWI_FLAG_KNOWN_NH so that given nexthop is used. cf. commit 2ad5b9e4bd314fc685086b99e90e5de3bc59e26b Reported-by: Chris Clark Bisected-by: Chris Clark Tested-by: Chris Clark Suggested-by: Julian Anastasov Signed-off-by: Chris Clark Signed-off-by: David S. Miller --- net/ipv4/raw.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index dd44e0ab600c..61e60d67adca 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -571,7 +571,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, - inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP, + inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP | + (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); if (!inet->hdrincl) { -- cgit v1.2.3 From 1f324e38870cc09659cf23bc626f1b8869e201f2 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 28 Aug 2013 01:07:25 +0200 Subject: ipv6: Don't depend on per socket memory for neighbour discovery messages Allocating skbs when sending out neighbour discovery messages currently uses sock_alloc_send_skb() based on a per net namespace socket and thus share a socket wmem buffer space. If a netdevice is temporarily unable to transmit due to carrier loss or for other reasons, the queued up ndisc messages will cosnume all of the wmem space and will thus prevent from any more skbs to be allocated even for netdevices that are able to transmit packets. The number of neighbour discovery messages sent is very limited, simply use alloc_skb() and don't depend on any socket wmem space any longer. This patch has orginally been posted by Eric Dumazet in a modified form. Signed-off-by: Thomas Graf Cc: Eric Dumazet Acked-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 04d31c2fbef1..5cb98df966c2 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -370,16 +370,12 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, { int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; - int err; - skb = sock_alloc_send_skb(sk, - hlen + sizeof(struct ipv6hdr) + len + tlen, - 1, &err); + skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); if (!skb) { - ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb, err=%d\n", - __func__, err); + ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", + __func__); return NULL; } -- cgit v1.2.3 From cc0fdd802859eaeb00e1c87dbb655594bed2844c Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Fri, 30 Aug 2013 17:28:17 +0200 Subject: bridge: separate querier and query timer into IGMP/IPv4 and MLD/IPv6 ones MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we would still potentially suffer multicast packet loss if there is just either an IGMP or an MLD querier: For the former case, we would possibly drop IPv6 multicast packets, for the latter IPv4 ones. This is because we are currently assuming that if either an IGMP or MLD querier is present that the other one is present, too. This patch makes the behaviour and fix added in "bridge: disable snooping if there is no querier" (b00589af3b04) to also work if there is either just an IGMP or an MLD querier on the link: It refines the deactivation of the snooping to be protocol specific by using separate timers for the snooped IGMP and MLD queries as well as separate timers for our internal IGMP and MLD queriers. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_device.c | 2 +- net/bridge/br_input.c | 2 +- net/bridge/br_mdb.c | 14 ++- net/bridge/br_multicast.c | 258 ++++++++++++++++++++++++++++++++-------------- net/bridge/br_private.h | 57 ++++++++-- 5 files changed, 240 insertions(+), 93 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 69363bd37f64..89659d4ed1f9 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -71,7 +71,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) mdst = br_mdb_get(br, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && - br_multicast_querier_exists(br)) + br_multicast_querier_exists(br, eth_hdr(skb))) br_multicast_deliver(mdst, skb); else br_flood_deliver(br, skb, false); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 8c561c0aa636..a2fd37ec35f7 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -102,7 +102,7 @@ int br_handle_frame_finish(struct sk_buff *skb) } else if (is_multicast_ether_addr(dest)) { mdst = br_mdb_get(br, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && - br_multicast_querier_exists(br)) { + br_multicast_querier_exists(br, eth_hdr(skb))) { if ((mdst && mdst->mglist) || br_multicast_is_router(br)) skb2 = skb; diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 0daae3ec2355..6319c4333c39 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -414,16 +414,20 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (!netif_running(br->dev) || br->multicast_disabled) return -EINVAL; - if (timer_pending(&br->multicast_querier_timer)) - return -EBUSY; - ip.proto = entry->addr.proto; - if (ip.proto == htons(ETH_P_IP)) + if (ip.proto == htons(ETH_P_IP)) { + if (timer_pending(&br->ip4_querier.timer)) + return -EBUSY; + ip.u.ip4 = entry->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) - else + } else { + if (timer_pending(&br->ip6_querier.timer)) + return -EBUSY; + ip.u.ip6 = entry->addr.u.ip6; #endif + } spin_lock_bh(&br->multicast_lock); mdb = mlock_dereference(br->mdb, br); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 08e576ada0b2..9d1d0e66c357 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -33,7 +33,8 @@ #include "br_private.h" -static void br_multicast_start_querier(struct net_bridge *br); +static void br_multicast_start_querier(struct net_bridge *br, + struct bridge_mcast_query *query); unsigned int br_mdb_rehash_seq; static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b) @@ -755,20 +756,35 @@ static void br_multicast_local_router_expired(unsigned long data) { } -static void br_multicast_querier_expired(unsigned long data) +static void br_multicast_querier_expired(struct net_bridge *br, + struct bridge_mcast_query *query) { - struct net_bridge *br = (void *)data; - spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || br->multicast_disabled) goto out; - br_multicast_start_querier(br); + br_multicast_start_querier(br, query); out: spin_unlock(&br->multicast_lock); } +static void br_ip4_multicast_querier_expired(unsigned long data) +{ + struct net_bridge *br = (void *)data; + + br_multicast_querier_expired(br, &br->ip4_query); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_querier_expired(unsigned long data) +{ + struct net_bridge *br = (void *)data; + + br_multicast_querier_expired(br, &br->ip6_query); +} +#endif + static void __br_multicast_send_query(struct net_bridge *br, struct net_bridge_port *port, struct br_ip *ip) @@ -789,37 +805,45 @@ static void __br_multicast_send_query(struct net_bridge *br, } static void br_multicast_send_query(struct net_bridge *br, - struct net_bridge_port *port, u32 sent) + struct net_bridge_port *port, + struct bridge_mcast_query *query) { unsigned long time; struct br_ip br_group; + struct bridge_mcast_querier *querier = NULL; if (!netif_running(br->dev) || br->multicast_disabled || - !br->multicast_querier || - timer_pending(&br->multicast_querier_timer)) + !br->multicast_querier) return; memset(&br_group.u, 0, sizeof(br_group.u)); - br_group.proto = htons(ETH_P_IP); - __br_multicast_send_query(br, port, &br_group); - + if (port ? (query == &port->ip4_query) : + (query == &br->ip4_query)) { + querier = &br->ip4_querier; + br_group.proto = htons(ETH_P_IP); #if IS_ENABLED(CONFIG_IPV6) - br_group.proto = htons(ETH_P_IPV6); - __br_multicast_send_query(br, port, &br_group); + } else { + querier = &br->ip6_querier; + br_group.proto = htons(ETH_P_IPV6); #endif + } + + if (!querier || timer_pending(&querier->timer)) + return; + + __br_multicast_send_query(br, port, &br_group); time = jiffies; - time += sent < br->multicast_startup_query_count ? + time += query->startup_sent < br->multicast_startup_query_count ? br->multicast_startup_query_interval : br->multicast_query_interval; - mod_timer(port ? &port->multicast_query_timer : - &br->multicast_query_timer, time); + mod_timer(&query->timer, time); } -static void br_multicast_port_query_expired(unsigned long data) +static void br_multicast_port_query_expired(struct net_bridge_port *port, + struct bridge_mcast_query *query) { - struct net_bridge_port *port = (void *)data; struct net_bridge *br = port->br; spin_lock(&br->multicast_lock); @@ -827,25 +851,43 @@ static void br_multicast_port_query_expired(unsigned long data) port->state == BR_STATE_BLOCKING) goto out; - if (port->multicast_startup_queries_sent < - br->multicast_startup_query_count) - port->multicast_startup_queries_sent++; + if (query->startup_sent < br->multicast_startup_query_count) + query->startup_sent++; - br_multicast_send_query(port->br, port, - port->multicast_startup_queries_sent); + br_multicast_send_query(port->br, port, query); out: spin_unlock(&br->multicast_lock); } +static void br_ip4_multicast_port_query_expired(unsigned long data) +{ + struct net_bridge_port *port = (void *)data; + + br_multicast_port_query_expired(port, &port->ip4_query); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_port_query_expired(unsigned long data) +{ + struct net_bridge_port *port = (void *)data; + + br_multicast_port_query_expired(port, &port->ip6_query); +} +#endif + void br_multicast_add_port(struct net_bridge_port *port) { port->multicast_router = 1; setup_timer(&port->multicast_router_timer, br_multicast_router_expired, (unsigned long)port); - setup_timer(&port->multicast_query_timer, - br_multicast_port_query_expired, (unsigned long)port); + setup_timer(&port->ip4_query.timer, br_ip4_multicast_port_query_expired, + (unsigned long)port); +#if IS_ENABLED(CONFIG_IPV6) + setup_timer(&port->ip6_query.timer, br_ip6_multicast_port_query_expired, + (unsigned long)port); +#endif } void br_multicast_del_port(struct net_bridge_port *port) @@ -853,13 +895,13 @@ void br_multicast_del_port(struct net_bridge_port *port) del_timer_sync(&port->multicast_router_timer); } -static void __br_multicast_enable_port(struct net_bridge_port *port) +static void br_multicast_enable(struct bridge_mcast_query *query) { - port->multicast_startup_queries_sent = 0; + query->startup_sent = 0; - if (try_to_del_timer_sync(&port->multicast_query_timer) >= 0 || - del_timer(&port->multicast_query_timer)) - mod_timer(&port->multicast_query_timer, jiffies); + if (try_to_del_timer_sync(&query->timer) >= 0 || + del_timer(&query->timer)) + mod_timer(&query->timer, jiffies); } void br_multicast_enable_port(struct net_bridge_port *port) @@ -870,7 +912,10 @@ void br_multicast_enable_port(struct net_bridge_port *port) if (br->multicast_disabled || !netif_running(br->dev)) goto out; - __br_multicast_enable_port(port); + br_multicast_enable(&port->ip4_query); +#if IS_ENABLED(CONFIG_IPV6) + br_multicast_enable(&port->ip6_query); +#endif out: spin_unlock(&br->multicast_lock); @@ -889,7 +934,10 @@ void br_multicast_disable_port(struct net_bridge_port *port) if (!hlist_unhashed(&port->rlist)) hlist_del_init_rcu(&port->rlist); del_timer(&port->multicast_router_timer); - del_timer(&port->multicast_query_timer); + del_timer(&port->ip4_query.timer); +#if IS_ENABLED(CONFIG_IPV6) + del_timer(&port->ip6_query.timer); +#endif spin_unlock(&br->multicast_lock); } @@ -1014,14 +1062,15 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, } #endif -static void br_multicast_update_querier_timer(struct net_bridge *br, - unsigned long max_delay) +static void +br_multicast_update_querier_timer(struct net_bridge *br, + struct bridge_mcast_querier *querier, + unsigned long max_delay) { - if (!timer_pending(&br->multicast_querier_timer)) - br->multicast_querier_delay_time = jiffies + max_delay; + if (!timer_pending(&querier->timer)) + querier->delay_time = jiffies + max_delay; - mod_timer(&br->multicast_querier_timer, - jiffies + br->multicast_querier_interval); + mod_timer(&querier->timer, jiffies + br->multicast_querier_interval); } /* @@ -1074,12 +1123,13 @@ timer: static void br_multicast_query_received(struct net_bridge *br, struct net_bridge_port *port, + struct bridge_mcast_querier *querier, int saddr, unsigned long max_delay) { if (saddr) - br_multicast_update_querier_timer(br, max_delay); - else if (timer_pending(&br->multicast_querier_timer)) + br_multicast_update_querier_timer(br, querier, max_delay); + else if (timer_pending(&querier->timer)) return; br_multicast_mark_router(br, port); @@ -1129,7 +1179,8 @@ static int br_ip4_multicast_query(struct net_bridge *br, IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1; } - br_multicast_query_received(br, port, !!iph->saddr, max_delay); + br_multicast_query_received(br, port, &br->ip4_querier, !!iph->saddr, + max_delay); if (!group) goto out; @@ -1206,8 +1257,8 @@ static int br_ip6_multicast_query(struct net_bridge *br, max_delay = mld2q->mld2q_mrc ? MLDV2_MRC(ntohs(mld2q->mld2q_mrc)) : 1; } - br_multicast_query_received(br, port, !ipv6_addr_any(&ip6h->saddr), - max_delay); + br_multicast_query_received(br, port, &br->ip6_querier, + !ipv6_addr_any(&ip6h->saddr), max_delay); if (!group) goto out; @@ -1244,7 +1295,9 @@ out: static void br_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group) + struct br_ip *group, + struct bridge_mcast_querier *querier, + struct bridge_mcast_query *query) { struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; @@ -1255,7 +1308,7 @@ static void br_multicast_leave_group(struct net_bridge *br, spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || (port && port->state == BR_STATE_DISABLED) || - timer_pending(&br->multicast_querier_timer)) + timer_pending(&querier->timer)) goto out; mdb = mlock_dereference(br->mdb, br); @@ -1263,14 +1316,13 @@ static void br_multicast_leave_group(struct net_bridge *br, if (!mp) goto out; - if (br->multicast_querier && - !timer_pending(&br->multicast_querier_timer)) { + if (br->multicast_querier) { __br_multicast_send_query(br, port, &mp->addr); time = jiffies + br->multicast_last_member_count * br->multicast_last_member_interval; - mod_timer(port ? &port->multicast_query_timer : - &br->multicast_query_timer, time); + + mod_timer(&query->timer, time); for (p = mlock_dereference(mp->ports, br); p != NULL; @@ -1323,7 +1375,6 @@ static void br_multicast_leave_group(struct net_bridge *br, mod_timer(&mp->timer, time); } } - out: spin_unlock(&br->multicast_lock); } @@ -1334,6 +1385,8 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br, __u16 vid) { struct br_ip br_group; + struct bridge_mcast_query *query = port ? &port->ip4_query : + &br->ip4_query; if (ipv4_is_local_multicast(group)) return; @@ -1342,7 +1395,7 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br, br_group.proto = htons(ETH_P_IP); br_group.vid = vid; - br_multicast_leave_group(br, port, &br_group); + br_multicast_leave_group(br, port, &br_group, &br->ip4_querier, query); } #if IS_ENABLED(CONFIG_IPV6) @@ -1352,6 +1405,9 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, __u16 vid) { struct br_ip br_group; + struct bridge_mcast_query *query = port ? &port->ip6_query : + &br->ip6_query; + if (!ipv6_is_transient_multicast(group)) return; @@ -1360,7 +1416,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; - br_multicast_leave_group(br, port, &br_group); + br_multicast_leave_group(br, port, &br_group, &br->ip6_querier, query); } #endif @@ -1622,19 +1678,32 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, return 0; } -static void br_multicast_query_expired(unsigned long data) +static void br_multicast_query_expired(struct net_bridge *br, + struct bridge_mcast_query *query) +{ + spin_lock(&br->multicast_lock); + if (query->startup_sent < br->multicast_startup_query_count) + query->startup_sent++; + + br_multicast_send_query(br, NULL, query); + spin_unlock(&br->multicast_lock); +} + +static void br_ip4_multicast_query_expired(unsigned long data) { struct net_bridge *br = (void *)data; - spin_lock(&br->multicast_lock); - if (br->multicast_startup_queries_sent < - br->multicast_startup_query_count) - br->multicast_startup_queries_sent++; + br_multicast_query_expired(br, &br->ip4_query); +} - br_multicast_send_query(br, NULL, br->multicast_startup_queries_sent); +#if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_query_expired(unsigned long data) +{ + struct net_bridge *br = (void *)data; - spin_unlock(&br->multicast_lock); + br_multicast_query_expired(br, &br->ip6_query); } +#endif void br_multicast_init(struct net_bridge *br) { @@ -1654,25 +1723,43 @@ void br_multicast_init(struct net_bridge *br) br->multicast_querier_interval = 255 * HZ; br->multicast_membership_interval = 260 * HZ; - br->multicast_querier_delay_time = 0; + br->ip4_querier.delay_time = 0; +#if IS_ENABLED(CONFIG_IPV6) + br->ip6_querier.delay_time = 0; +#endif spin_lock_init(&br->multicast_lock); setup_timer(&br->multicast_router_timer, br_multicast_local_router_expired, 0); - setup_timer(&br->multicast_querier_timer, - br_multicast_querier_expired, (unsigned long)br); - setup_timer(&br->multicast_query_timer, br_multicast_query_expired, + setup_timer(&br->ip4_querier.timer, br_ip4_multicast_querier_expired, + (unsigned long)br); + setup_timer(&br->ip4_query.timer, br_ip4_multicast_query_expired, (unsigned long)br); +#if IS_ENABLED(CONFIG_IPV6) + setup_timer(&br->ip6_querier.timer, br_ip6_multicast_querier_expired, + (unsigned long)br); + setup_timer(&br->ip6_query.timer, br_ip6_multicast_query_expired, + (unsigned long)br); +#endif } -void br_multicast_open(struct net_bridge *br) +static void __br_multicast_open(struct net_bridge *br, + struct bridge_mcast_query *query) { - br->multicast_startup_queries_sent = 0; + query->startup_sent = 0; if (br->multicast_disabled) return; - mod_timer(&br->multicast_query_timer, jiffies); + mod_timer(&query->timer, jiffies); +} + +void br_multicast_open(struct net_bridge *br) +{ + __br_multicast_open(br, &br->ip4_query); +#if IS_ENABLED(CONFIG_IPV6) + __br_multicast_open(br, &br->ip6_query); +#endif } void br_multicast_stop(struct net_bridge *br) @@ -1684,8 +1771,12 @@ void br_multicast_stop(struct net_bridge *br) int i; del_timer_sync(&br->multicast_router_timer); - del_timer_sync(&br->multicast_querier_timer); - del_timer_sync(&br->multicast_query_timer); + del_timer_sync(&br->ip4_querier.timer); + del_timer_sync(&br->ip4_query.timer); +#if IS_ENABLED(CONFIG_IPV6) + del_timer_sync(&br->ip6_querier.timer); + del_timer_sync(&br->ip6_query.timer); +#endif spin_lock_bh(&br->multicast_lock); mdb = mlock_dereference(br->mdb, br); @@ -1788,18 +1879,24 @@ unlock: return err; } -static void br_multicast_start_querier(struct net_bridge *br) +static void br_multicast_start_querier(struct net_bridge *br, + struct bridge_mcast_query *query) { struct net_bridge_port *port; - br_multicast_open(br); + __br_multicast_open(br, query); list_for_each_entry(port, &br->port_list, list) { if (port->state == BR_STATE_DISABLED || port->state == BR_STATE_BLOCKING) continue; - __br_multicast_enable_port(port); + if (query == &br->ip4_query) + br_multicast_enable(&port->ip4_query); +#if IS_ENABLED(CONFIG_IPV6) + else + br_multicast_enable(&port->ip6_query); +#endif } } @@ -1834,7 +1931,10 @@ rollback: goto rollback; } - br_multicast_start_querier(br); + br_multicast_start_querier(br, &br->ip4_query); +#if IS_ENABLED(CONFIG_IPV6) + br_multicast_start_querier(br, &br->ip6_query); +#endif unlock: spin_unlock_bh(&br->multicast_lock); @@ -1857,10 +1957,18 @@ int br_multicast_set_querier(struct net_bridge *br, unsigned long val) goto unlock; max_delay = br->multicast_query_response_interval; - if (!timer_pending(&br->multicast_querier_timer)) - br->multicast_querier_delay_time = jiffies + max_delay; - br_multicast_start_querier(br); + if (!timer_pending(&br->ip4_querier.timer)) + br->ip4_querier.delay_time = jiffies + max_delay; + + br_multicast_start_querier(br, &br->ip4_query); + +#if IS_ENABLED(CONFIG_IPV6) + if (!timer_pending(&br->ip6_querier.timer)) + br->ip6_querier.delay_time = jiffies + max_delay; + + br_multicast_start_querier(br, &br->ip6_query); +#endif unlock: spin_unlock_bh(&br->multicast_lock); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 2f7da41851bf..263ba9034468 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -66,6 +66,20 @@ struct br_ip __u16 vid; }; +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING +/* our own querier */ +struct bridge_mcast_query { + struct timer_list timer; + u32 startup_sent; +}; + +/* other querier */ +struct bridge_mcast_querier { + struct timer_list timer; + unsigned long delay_time; +}; +#endif + struct net_port_vlans { u16 port_idx; u16 pvid; @@ -162,10 +176,12 @@ struct net_bridge_port #define BR_FLOOD 0x00000040 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING - u32 multicast_startup_queries_sent; + struct bridge_mcast_query ip4_query; +#if IS_ENABLED(CONFIG_IPV6) + struct bridge_mcast_query ip6_query; +#endif /* IS_ENABLED(CONFIG_IPV6) */ unsigned char multicast_router; struct timer_list multicast_router_timer; - struct timer_list multicast_query_timer; struct hlist_head mglist; struct hlist_node rlist; #endif @@ -258,7 +274,6 @@ struct net_bridge u32 hash_max; u32 multicast_last_member_count; - u32 multicast_startup_queries_sent; u32 multicast_startup_query_count; unsigned long multicast_last_member_interval; @@ -267,15 +282,18 @@ struct net_bridge unsigned long multicast_query_interval; unsigned long multicast_query_response_interval; unsigned long multicast_startup_query_interval; - unsigned long multicast_querier_delay_time; spinlock_t multicast_lock; struct net_bridge_mdb_htable __rcu *mdb; struct hlist_head router_list; struct timer_list multicast_router_timer; - struct timer_list multicast_querier_timer; - struct timer_list multicast_query_timer; + struct bridge_mcast_querier ip4_querier; + struct bridge_mcast_query ip4_query; +#if IS_ENABLED(CONFIG_IPV6) + struct bridge_mcast_querier ip6_querier; + struct bridge_mcast_query ip6_query; +#endif /* IS_ENABLED(CONFIG_IPV6) */ #endif struct timer_list hello_timer; @@ -503,11 +521,27 @@ static inline bool br_multicast_is_router(struct net_bridge *br) timer_pending(&br->multicast_router_timer)); } -static inline bool br_multicast_querier_exists(struct net_bridge *br) +static inline bool +__br_multicast_querier_exists(struct net_bridge *br, + struct bridge_mcast_querier *querier) +{ + return time_is_before_jiffies(querier->delay_time) && + (br->multicast_querier || timer_pending(&querier->timer)); +} + +static inline bool br_multicast_querier_exists(struct net_bridge *br, + struct ethhdr *eth) { - return time_is_before_jiffies(br->multicast_querier_delay_time) && - (br->multicast_querier || - timer_pending(&br->multicast_querier_timer)); + switch (eth->h_proto) { + case (htons(ETH_P_IP)): + return __br_multicast_querier_exists(br, &br->ip4_querier); +#if IS_ENABLED(CONFIG_IPV6) + case (htons(ETH_P_IPV6)): + return __br_multicast_querier_exists(br, &br->ip6_querier); +#endif + default: + return false; + } } #else static inline int br_multicast_rcv(struct net_bridge *br, @@ -565,7 +599,8 @@ static inline bool br_multicast_is_router(struct net_bridge *br) { return 0; } -static inline bool br_multicast_querier_exists(struct net_bridge *br) +static inline bool br_multicast_querier_exists(struct net_bridge *br, + struct ethhdr *eth) { return false; } -- cgit v1.2.3 From eb8895debe1baba41fcb62c78a16f0c63c21662a Mon Sep 17 00:00:00 2001 From: Phil Oester Date: Tue, 27 Aug 2013 16:41:40 -0700 Subject: tcp: tcp_make_synack() should use sock_wmalloc In commit 90ba9b19 (tcp: tcp_make_synack() can use alloc_skb()), Eric changed the call to sock_wmalloc in tcp_make_synack to alloc_skb. In doing so, the netfilter owner match lost its ability to block the SYNACK packet on outbound listening sockets. Revert the change, restoring the owner match functionality. This closes netfilter bugzilla #847. Signed-off-by: Phil Oester Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e2972993c671..170737a9d56d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2670,7 +2670,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, int tcp_header_size; int mss; - skb = alloc_skb(MAX_TCP_HEADER + 15, sk_gfp_atomic(sk, GFP_ATOMIC)); + skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; -- cgit v1.2.3 From 2c8d85182348021fc0a1bed193a4be4161dc8364 Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Wed, 28 Aug 2013 09:29:58 +0200 Subject: tipc: set sk_err correctly when connection fails Should a connect fail, if the publication/server is unavailable or due to some other error, a positive value will be returned and errno is never set. If the application code checks for an explicit zero return from connect (success) or a negative return (failure), it will not catch the error and subsequent send() calls will fail as shown from the strace snippet below. socket(0x1e /* PF_??? */, SOCK_SEQPACKET, 0) = 3 connect(3, {sa_family=0x1e /* AF_??? */, sa_data="\2\1\322\4\0\0\322\4\0\0\0\0\0\0"}, 16) = 111 sendto(3, "test", 4, 0, NULL, 0) = -1 EPIPE (Broken pipe) The reason for this behaviour is that TIPC wrongly inverts error codes set in sk_err. Signed-off-by: Erik Hugne Signed-off-by: David S. Miller --- net/tipc/socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ce8249c76827..6cc7ddd2fb7c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1257,7 +1257,7 @@ static u32 filter_connect(struct tipc_sock *tsock, struct sk_buff **buf) /* Accept only ACK or NACK message */ if (unlikely(msg_errcode(msg))) { sock->state = SS_DISCONNECTING; - sk->sk_err = -ECONNREFUSED; + sk->sk_err = ECONNREFUSED; retval = TIPC_OK; break; } @@ -1268,7 +1268,7 @@ static u32 filter_connect(struct tipc_sock *tsock, struct sk_buff **buf) res = auto_connect(sock, msg); if (res) { sock->state = SS_DISCONNECTING; - sk->sk_err = res; + sk->sk_err = -res; retval = TIPC_OK; break; } -- cgit v1.2.3 From 737e828bdbdaf2f9d7de07f20a0308ac46ce5178 Mon Sep 17 00:00:00 2001 From: Li Hongjun Date: Wed, 28 Aug 2013 11:54:50 +0200 Subject: ipv4 tunnels: fix an oops when using ipip/sit with IPsec Since commit 3d7b46cd20e3 (ip_tunnel: push generic protocol handling to ip_tunnel module.), an Oops is triggered when an xfrm policy is configured on an IPv4 over IPv4 tunnel. xfrm4_policy_check() calls __xfrm_policy_check2(), which uses skb_dst(skb). But this field is NULL because iptunnel_pull_header() calls skb_dst_drop(skb). Signed-off-by: Li Hongjun Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 5 ++--- net/ipv6/sit.c | 6 ++---- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 51fc2a1dcdd3..b3ac3c3f6219 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -190,15 +190,14 @@ static int ipip_rcv(struct sk_buff *skb) struct ip_tunnel *tunnel; const struct iphdr *iph; - if (iptunnel_pull_header(skb, 0, tpi.proto)) - goto drop; - iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); if (tunnel) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; + if (iptunnel_pull_header(skb, 0, tpi.proto)) + goto drop; return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index fbfc5a83867f..21b25dd8466b 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -645,11 +645,7 @@ static int ipip_rcv(struct sk_buff *skb) const struct iphdr *iph; struct ip_tunnel *tunnel; - if (iptunnel_pull_header(skb, 0, tpi.proto)) - goto drop; - iph = ip_hdr(skb); - tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr); if (tunnel != NULL) { @@ -659,6 +655,8 @@ static int ipip_rcv(struct sk_buff *skb) if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; + if (iptunnel_pull_header(skb, 0, tpi.proto)) + goto drop; return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); } -- cgit v1.2.3 From 25ad6117e73656071b38fd19fa67ae325471c758 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 30 Aug 2013 17:39:33 -0400 Subject: Revert "ipv6: Don't depend on per socket memory for neighbour discovery messages" This reverts commit 1f324e38870cc09659cf23bc626f1b8869e201f2. It seems to cause regressions, and in particular the output path really depends upon there being a socket attached to skb->sk for checks such as sk_mc_loop(skb->sk) for example. See ip6_output_finish2(). Reported-by: Stephen Warren Reported-by: Fabio Estevam Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 5cb98df966c2..04d31c2fbef1 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -370,12 +370,16 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, { int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; + struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; + int err; - skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); + skb = sock_alloc_send_skb(sk, + hlen + sizeof(struct ipv6hdr) + len + tlen, + 1, &err); if (!skb) { - ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", - __func__); + ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb, err=%d\n", + __func__, err); return NULL; } -- cgit v1.2.3 From 702821f4ea6f68db18aa1de7d8ed62c6ba586a64 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 28 Aug 2013 18:10:43 -0700 Subject: net: revert 8728c544a9c ("net: dev_pick_tx() fix") commit 8728c544a9cbdc ("net: dev_pick_tx() fix") and commit b6fe83e9525a ("bonding: refine IFF_XMIT_DST_RELEASE capability") are quite incompatible : Queue selection is disabled because skb dst was dropped before entering bonding device. This causes major performance regression, mainly because TCP packets for a given flow can be sent to multiple queues. This is particularly visible when using the new FQ packet scheduler with MQ + FQ setup on the slaves. We can safely revert the first commit now that 416186fbf8c5b ("net: Split core bits of netdev_pick_tx into __netdev_pick_tx") properly caps the queue_index. Reported-by: Xi Wang Diagnosed-by: Xi Wang Signed-off-by: Eric Dumazet Cc: Tom Herbert Cc: Alexander Duyck Cc: Denys Fedorysychenko Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b84a1b155bc1..d12e3a9a5356 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -346,14 +346,9 @@ u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) if (new_index < 0) new_index = skb_tx_hash(dev, skb); - if (queue_index != new_index && sk) { - struct dst_entry *dst = - rcu_dereference_check(sk->sk_dst_cache, 1); - - if (dst && skb_dst(skb) == dst) - sk_tx_queue_set(sk, queue_index); - - } + if (queue_index != new_index && sk && + rcu_access_pointer(sk->sk_dst_cache)) + sk_tx_queue_set(sk, queue_index); queue_index = new_index; } -- cgit v1.2.3 From 2d98c29b6fb3de44d9eaa73c09f9cf7209346383 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 29 Aug 2013 23:55:05 +0200 Subject: net: bridge: convert MLDv2 Query MRC into msecs_to_jiffies for max_delay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While looking into MLDv1/v2 code, I noticed that bridging code does not convert it's max delay into jiffies for MLDv2 messages as we do in core IPv6' multicast code. RFC3810, 5.1.3. Maximum Response Code says: The Maximum Response Code field specifies the maximum time allowed before sending a responding Report. The actual time allowed, called the Maximum Response Delay, is represented in units of milliseconds, and is derived from the Maximum Response Code as follows: [...] As we update timers that work with jiffies, we need to convert it. Signed-off-by: Daniel Borkmann Cc: Linus Lüssing Cc: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 9d1d0e66c357..bbcb43582496 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1254,7 +1254,8 @@ static int br_ip6_multicast_query(struct net_bridge *br, mld2q = (struct mld2_query *)icmp6_hdr(skb); if (!mld2q->mld2q_nsrcs) group = &mld2q->mld2q_mca; - max_delay = mld2q->mld2q_mrc ? MLDV2_MRC(ntohs(mld2q->mld2q_mrc)) : 1; + + max_delay = max(msecs_to_jiffies(MLDV2_MRC(ntohs(mld2q->mld2q_mrc))), 1UL); } br_multicast_query_received(br, port, &br->ip6_querier, -- cgit v1.2.3