From 07bf7908950a8b14e81aa1807e3c667eab39287a Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 1 Aug 2018 13:45:11 +0200 Subject: xfrm: Validate address prefix lengths in the xfrm selector. We don't validate the address prefix lengths in the xfrm selector we got from userspace. This can lead to undefined behaviour in the address matching functions if the prefix is too big for the given address family. Fix this by checking the prefixes and refuse SA/policy insertation when a prefix is invalid. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Air Icy Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 33878e6e0d0a..5151b3ebf068 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -151,10 +151,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, err = -EINVAL; switch (p->family) { case AF_INET: + if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) + goto out; + break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) + if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) + goto out; + break; #else err = -EAFNOSUPPORT; @@ -1359,10 +1365,16 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) switch (p->sel.family) { case AF_INET: + if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) + return -EINVAL; + break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) + if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) + return -EINVAL; + break; #else return -EAFNOSUPPORT; -- cgit v1.2.3 From 215ab0f021c9fea3c18b75e7d522400ee6a49990 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Fri, 31 Aug 2018 08:38:49 -0300 Subject: xfrm6: call kfree_skb when skb is toobig After commit d6990976af7c5d8f55903bfb4289b6fb030bf754 ("vti6: fix PMTU caching and reporting on xmit"), some too big skbs might be potentially passed down to __xfrm6_output, causing it to fail to transmit but not free the skb, causing a leak of skb, and consequentially a leak of dst references. After running pmtu.sh, that shows as failure to unregister devices in a namespace: [ 311.397671] unregister_netdevice: waiting for veth_b to become free. Usage count = 1 The fix is to call kfree_skb in case of transmit failures. Fixes: dd767856a36e ("xfrm6: Don't call icmpv6_send on local error") Signed-off-by: Thadeu Lima de Souza Cascardo Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_output.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 5959ce9620eb..6a74080005cf 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -170,9 +170,11 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (toobig && xfrm6_local_dontfrag(skb)) { xfrm6_local_rxpmtu(skb, mtu); + kfree_skb(skb); return -EMSGSIZE; } else if (!skb->ignore_df && toobig && skb->sk) { xfrm_local_error(skb, mtu); + kfree_skb(skb); return -EMSGSIZE; } -- cgit v1.2.3 From bfc0698bebcb16d19ecfc89574ad4d696955e5d3 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 3 Sep 2018 04:36:52 -0700 Subject: xfrm: reset transport header back to network header after all input transforms ahave been applied A policy may have been set up with multiple transforms (e.g., ESP and ipcomp). In this situation, the ingress IPsec processing iterates in xfrm_input() and applies each transform in turn, processing the nexthdr to find any additional xfrm that may apply. This patch resets the transport header back to network header only after the last transformation so that subsequent xfrms can find the correct transport header. Fixes: 7785bba299a8 ("esp: Add a software GRO codepath") Suggested-by: Steffen Klassert Signed-off-by: Sowmini Varadhan Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_input.c | 1 + net/ipv4/xfrm4_mode_transport.c | 4 +--- net/ipv6/xfrm6_input.c | 1 + net/ipv6/xfrm6_mode_transport.c | 4 +--- 4 files changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index bcfc00e88756..f8de2482a529 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -67,6 +67,7 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async) if (xo && (xo->flags & XFRM_GRO)) { skb_mac_header_rebuild(skb); + skb_reset_transport_header(skb); return 0; } diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index 3d36644890bb..1ad2c2c4e250 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -46,7 +46,6 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) { int ihl = skb->data - skb_transport_header(skb); - struct xfrm_offload *xo = xfrm_offload(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), @@ -54,8 +53,7 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) skb->network_header = skb->transport_header; } ip_hdr(skb)->tot_len = htons(skb->len + ihl); - if (!xo || !(xo->flags & XFRM_GRO)) - skb_reset_transport_header(skb); + skb_reset_transport_header(skb); return 0; } diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 841f4a07438e..9ef490dddcea 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -59,6 +59,7 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) if (xo && (xo->flags & XFRM_GRO)) { skb_mac_header_rebuild(skb); + skb_reset_transport_header(skb); return -1; } diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c index 9ad07a91708e..3c29da5defe6 100644 --- a/net/ipv6/xfrm6_mode_transport.c +++ b/net/ipv6/xfrm6_mode_transport.c @@ -51,7 +51,6 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) { int ihl = skb->data - skb_transport_header(skb); - struct xfrm_offload *xo = xfrm_offload(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), @@ -60,8 +59,7 @@ static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) } ipv6_hdr(skb)->payload_len = htons(skb->len + ihl - sizeof(struct ipv6hdr)); - if (!xo || !(xo->flags & XFRM_GRO)) - skb_reset_transport_header(skb); + skb_reset_transport_header(skb); return 0; } -- cgit v1.2.3 From 782710e333a526780d65918d669cb96646983ba2 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 3 Sep 2018 04:36:53 -0700 Subject: xfrm: reset crypto_done when iterating over multiple input xfrms We only support one offloaded xfrm (we do not have devices that can handle more than one offload), so reset crypto_done in xfrm_input() when iterating over multiple transforms in xfrm_input, so that we can invoke the appropriate x->type->input for the non-offloaded transforms Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Sowmini Varadhan Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 352abca2605f..86f5afbd0a0c 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -453,6 +453,7 @@ resume: XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } + crypto_done = false; } while (!err); err = xfrm_rcv_cb(skb, family, x->type->proto, 0); -- cgit v1.2.3 From 8682250b3c1b75a45feb7452bc413d004cfe3778 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Wed, 5 Sep 2018 08:06:13 +0300 Subject: mac80211: Always report TX status If a frame is dropped for any reason, mac80211 wouldn't report the TX status back to user space. As the user space may rely on the TX_STATUS to kick its state machines, resends etc, it's better to just report this frame as not acked instead. Signed-off-by: Andrei Otcheretianski Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/status.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 9a6d7208bf4f..001a869c059c 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -479,11 +479,6 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, if (!skb) return; - if (dropped) { - dev_kfree_skb_any(skb); - return; - } - if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) { u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie; struct ieee80211_sub_if_data *sdata; @@ -506,6 +501,8 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, } rcu_read_unlock(); + dev_kfree_skb_any(skb); + } else if (dropped) { dev_kfree_skb_any(skb); } else { /* consumes skb */ -- cgit v1.2.3 From 94a5b3acd0aef83c0e38b5117eda7b2abf4a05a4 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Wed, 5 Sep 2018 08:06:14 +0300 Subject: mac80211: Don't wake up from PS for offchannel TX Otherwise the offchannel frame might be queued due to IEEE80211_QUEUE_STOP_REASON_PS and later dropped (in ieee80211_tx_frags()). Anyway, it doesn't make much sense to wake up the device during ROC. Signed-off-by: Andrei Otcheretianski Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index f353d9db54bc..131542513c8f 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -214,6 +214,7 @@ ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx) { struct ieee80211_local *local = tx->local; struct ieee80211_if_managed *ifmgd; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); /* driver doesn't support power save */ if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) @@ -242,6 +243,9 @@ ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx) if (tx->sdata->vif.type != NL80211_IFTYPE_STATION) return TX_CONTINUE; + if (unlikely(info->flags & IEEE80211_TX_INTFL_OFFCHAN_TX_OK)) + return TX_CONTINUE; + ifmgd = &tx->sdata->u.mgd; /* -- cgit v1.2.3 From 24f33e64fcd0d50a4b1a8e5b41bd0257aa66b0e8 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Wed, 5 Sep 2018 08:06:12 +0300 Subject: cfg80211: reg: Init wiphy_idx in regulatory_hint_core() Core regulatory hints didn't set wiphy_idx to WIPHY_IDX_INVALID. Since the regulatory request is zeroed, wiphy_idx was always implicitly set to 0. This resulted in updating only phy #0. Fix that. Fixes: 806a9e39670b ("cfg80211: make regulatory_request use wiphy_idx instead of wiphy") Signed-off-by: Andrei Otcheretianski Signed-off-by: Luca Coelho [add fixes tag] Signed-off-by: Johannes Berg --- net/wireless/reg.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 2f702adf2912..765dedb12361 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2867,6 +2867,7 @@ static int regulatory_hint_core(const char *alpha2) request->alpha2[0] = alpha2[0]; request->alpha2[1] = alpha2[1]; request->initiator = NL80211_REGDOM_SET_BY_CORE; + request->wiphy_idx = WIPHY_IDX_INVALID; queue_regulatory_request(request); -- cgit v1.2.3 From 6eae4a6c2be387fec41b0d2782c4fffb57159498 Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Wed, 5 Sep 2018 06:22:59 -0400 Subject: mac80211: fix pending queue hang due to TX_DROP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In our environment running lots of mesh nodes, we are seeing the pending queue hang periodically, with the debugfs queues file showing lines such as: 00: 0x00000000/348 i.e. there are a large number of frames but no stop reason set. One way this could happen is if queue processing from the pending tasklet exited early without processing all frames, and without having some future event (incoming frame, stop reason flag, ...) to reschedule it. Exactly this can occur today if ieee80211_tx() returns false due to packet drops or power-save buffering in the tx handlers. In the past, this function would return true in such cases, and the change to false doesn't seem to be intentional. Fix this case by reverting to the previous behavior. Fixes: bb42f2d13ffc ("mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue") Signed-off-by: Bob Copeland Acked-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 131542513c8f..25ba24bef8f5 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1894,7 +1894,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; if (invoke_tx_handlers_early(&tx)) - return false; + return true; if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb)) return true; -- cgit v1.2.3 From 119f94a6fefcc76d47075b83d2b73d04c895df78 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 5 Sep 2018 18:52:22 +0300 Subject: cfg80211: Address some corner cases in scan result channel updating cfg80211_get_bss_channel() is used to update the RX channel based on the available frame payload information (channel number from DSSS Parameter Set element or HT Operation element). This is needed on 2.4 GHz channels where frames may be received on neighboring channels due to overlapping frequency range. This might of some use on the 5 GHz band in some corner cases, but things are more complex there since there is no n:1 or 1:n mapping between channel numbers and frequencies due to multiple different starting frequencies in different operating classes. This could result in ieee80211_channel_to_frequency() returning incorrect frequency and ieee80211_get_channel() returning incorrect channel information (or indication of no match). In the previous implementation, this could result in some scan results being dropped completely, e.g., for the 4.9 GHz channels. That prevented connection to such BSSs. Fix this by using the driver-provided channel pointer if ieee80211_get_channel() does not find matching channel data for the channel number in the frame payload and if the scan is done with 5 MHz or 10 MHz channel bandwidth. While doing this, also add comments describing what the function is trying to achieve to make it easier to understand what happens here and why. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- net/wireless/scan.c | 58 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index d36c3eb7b931..d0e7472dd9fd 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1058,13 +1058,23 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, return NULL; } +/* + * Update RX channel information based on the available frame payload + * information. This is mainly for the 2.4 GHz band where frames can be received + * from neighboring channels and the Beacon frames use the DSSS Parameter Set + * element to indicate the current (transmitting) channel, but this might also + * be needed on other bands if RX frequency does not match with the actual + * operating channel of a BSS. + */ static struct ieee80211_channel * cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, - struct ieee80211_channel *channel) + struct ieee80211_channel *channel, + enum nl80211_bss_scan_width scan_width) { const u8 *tmp; u32 freq; int channel_number = -1; + struct ieee80211_channel *alt_channel; tmp = cfg80211_find_ie(WLAN_EID_DS_PARAMS, ie, ielen); if (tmp && tmp[1] == 1) { @@ -1078,16 +1088,45 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, } } - if (channel_number < 0) + if (channel_number < 0) { + /* No channel information in frame payload */ return channel; + } freq = ieee80211_channel_to_frequency(channel_number, channel->band); - channel = ieee80211_get_channel(wiphy, freq); - if (!channel) - return NULL; - if (channel->flags & IEEE80211_CHAN_DISABLED) + alt_channel = ieee80211_get_channel(wiphy, freq); + if (!alt_channel) { + if (channel->band == NL80211_BAND_2GHZ) { + /* + * Better not allow unexpected channels when that could + * be going beyond the 1-11 range (e.g., discovering + * BSS on channel 12 when radio is configured for + * channel 11. + */ + return NULL; + } + + /* No match for the payload channel number - ignore it */ + return channel; + } + + if (scan_width == NL80211_BSS_CHAN_WIDTH_10 || + scan_width == NL80211_BSS_CHAN_WIDTH_5) { + /* + * Ignore channel number in 5 and 10 MHz channels where there + * may not be an n:1 or 1:n mapping between frequencies and + * channel numbers. + */ + return channel; + } + + /* + * Use the channel determined through the payload channel number + * instead of the RX channel reported by the driver. + */ + if (alt_channel->flags & IEEE80211_CHAN_DISABLED) return NULL; - return channel; + return alt_channel; } /* Returned bss is reference counted and must be cleaned up appropriately. */ @@ -1112,7 +1151,8 @@ cfg80211_inform_bss_data(struct wiphy *wiphy, (data->signal < 0 || data->signal > 100))) return NULL; - channel = cfg80211_get_bss_channel(wiphy, ie, ielen, data->chan); + channel = cfg80211_get_bss_channel(wiphy, ie, ielen, data->chan, + data->scan_width); if (!channel) return NULL; @@ -1210,7 +1250,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, return NULL; channel = cfg80211_get_bss_channel(wiphy, mgmt->u.beacon.variable, - ielen, data->chan); + ielen, data->chan, data->scan_width); if (!channel) return NULL; -- cgit v1.2.3 From cb59bc14e830028d2244861216df038165d7625d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Sep 2018 13:34:02 +0200 Subject: mac80211: TDLS: fix skb queue/priority assignment If the TDLS setup happens over a connection to an AP that doesn't have QoS, we nevertheless assign a non-zero TID (skb->priority) and queue mapping, which may confuse us or drivers later. Fix it by just assigning the special skb->priority and then using ieee80211_select_queue() just like other data frames would go through. Signed-off-by: Johannes Berg --- net/mac80211/tdls.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 5cd5e6e5834e..6c647f425e05 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -16,6 +16,7 @@ #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" +#include "wme.h" /* give usermode some time for retries in setting up the TDLS session */ #define TDLS_PEER_SETUP_TIMEOUT (15 * HZ) @@ -1010,14 +1011,13 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, switch (action_code) { case WLAN_TDLS_SETUP_REQUEST: case WLAN_TDLS_SETUP_RESPONSE: - skb_set_queue_mapping(skb, IEEE80211_AC_BK); - skb->priority = 2; + skb->priority = 256 + 2; break; default: - skb_set_queue_mapping(skb, IEEE80211_AC_VI); - skb->priority = 5; + skb->priority = 256 + 5; break; } + skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, skb)); /* * Set the WLAN_TDLS_TEARDOWN flag to indicate a teardown in progress. -- cgit v1.2.3 From c42055105785580563535e6d3143cad95c7ac7ee Mon Sep 17 00:00:00 2001 From: Yuan-Chi Pang Date: Thu, 6 Sep 2018 16:57:48 +0800 Subject: mac80211: fix TX status reporting for ieee80211s TX status reporting to ieee80211s is through ieee80211s_update_metric. There are two problems about ieee80211s_update_metric: 1. The purpose is to estimate the fail probability to a specific link. No need to restrict to data frame. 2. Current implementation does not work if wireless driver does not pass tx_status with skb. Fix this by removing ieee80211_is_data condition, passing ieee80211_tx_status directly to ieee80211s_update_metric, and putting it in both __ieee80211_tx_status and ieee80211_tx_status_ext. Signed-off-by: Yuan-Chi Pang Signed-off-by: Johannes Berg --- net/mac80211/mesh.h | 3 ++- net/mac80211/mesh_hwmp.c | 9 +++------ net/mac80211/status.c | 4 +++- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index ee56f18cad3f..21526630bf65 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -217,7 +217,8 @@ void mesh_rmc_free(struct ieee80211_sub_if_data *sdata); int mesh_rmc_init(struct ieee80211_sub_if_data *sdata); void ieee80211s_init(void); void ieee80211s_update_metric(struct ieee80211_local *local, - struct sta_info *sta, struct sk_buff *skb); + struct sta_info *sta, + struct ieee80211_tx_status *st); void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata); void ieee80211_mesh_teardown_sdata(struct ieee80211_sub_if_data *sdata); int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index daf9db3c8f24..6950cd0bf594 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -295,15 +295,12 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata, } void ieee80211s_update_metric(struct ieee80211_local *local, - struct sta_info *sta, struct sk_buff *skb) + struct sta_info *sta, + struct ieee80211_tx_status *st) { - struct ieee80211_tx_info *txinfo = IEEE80211_SKB_CB(skb); - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_tx_info *txinfo = st->info; int failed; - if (!ieee80211_is_data(hdr->frame_control)) - return; - failed = !(txinfo->flags & IEEE80211_TX_STAT_ACK); /* moving average, scaled to 100. diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 001a869c059c..91d7c0cd1882 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -808,7 +808,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, rate_control_tx_status(local, sband, status); if (ieee80211_vif_is_mesh(&sta->sdata->vif)) - ieee80211s_update_metric(local, sta, skb); + ieee80211s_update_metric(local, sta, status); if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked) ieee80211_frame_acked(sta, skb); @@ -969,6 +969,8 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, } rate_control_tx_status(local, sband, status); + if (ieee80211_vif_is_mesh(&sta->sdata->vif)) + ieee80211s_update_metric(local, sta, status); } if (acked || noack_success) { -- cgit v1.2.3 From 9e1437937807b0122e8da1ca8765be2adca9aee6 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 11 Sep 2018 10:31:15 +0200 Subject: xfrm: Fix NULL pointer dereference when skb_dst_force clears the dst_entry. Since commit 222d7dbd258d ("net: prevent dst uses after free") skb_dst_force() might clear the dst_entry attached to the skb. The xfrm code don't expect this to happen, so we crash with a NULL pointer dereference in this case. Fix it by checking skb_dst(skb) for NULL after skb_dst_force() and drop the packet in cast the dst_entry was cleared. Fixes: 222d7dbd258d ("net: prevent dst uses after free") Reported-by: Tobias Hommel Reported-by: Kristian Evensen Reported-by: Wolfgang Walter Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 4 ++++ net/xfrm/xfrm_policy.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 89b178a78dc7..36d15a38ce5e 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -101,6 +101,10 @@ static int xfrm_output_one(struct sk_buff *skb, int err) spin_unlock_bh(&x->lock); skb_dst_force(skb); + if (!skb_dst(skb)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + goto error_nolock; + } if (xfrm_offload(skb)) { x->type_offload->encap(x, skb); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7c5e8978aeaa..626e0f4d1749 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2548,6 +2548,10 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) } skb_dst_force(skb); + if (!skb_dst(skb)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); + return 0; + } dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) { -- cgit v1.2.3 From 32bf94fb5c2ec4ec842152d0e5937cd4bb6738fa Mon Sep 17 00:00:00 2001 From: Sean Tranchetti Date: Wed, 19 Sep 2018 13:54:56 -0600 Subject: xfrm: validate template mode XFRM mode parameters passed as part of the user templates in the IP_XFRM_POLICY are never properly validated. Passing values other than valid XFRM modes can cause stack-out-of-bounds reads to occur later in the XFRM processing: [ 140.535608] ================================================================ [ 140.543058] BUG: KASAN: stack-out-of-bounds in xfrm_state_find+0x17e4/0x1cc4 [ 140.550306] Read of size 4 at addr ffffffc0238a7a58 by task repro/5148 [ 140.557369] [ 140.558927] Call trace: [ 140.558936] dump_backtrace+0x0/0x388 [ 140.558940] show_stack+0x24/0x30 [ 140.558946] __dump_stack+0x24/0x2c [ 140.558949] dump_stack+0x8c/0xd0 [ 140.558956] print_address_description+0x74/0x234 [ 140.558960] kasan_report+0x240/0x264 [ 140.558963] __asan_report_load4_noabort+0x2c/0x38 [ 140.558967] xfrm_state_find+0x17e4/0x1cc4 [ 140.558971] xfrm_resolve_and_create_bundle+0x40c/0x1fb8 [ 140.558975] xfrm_lookup+0x238/0x1444 [ 140.558977] xfrm_lookup_route+0x48/0x11c [ 140.558984] ip_route_output_flow+0x88/0xc4 [ 140.558991] raw_sendmsg+0xa74/0x266c [ 140.558996] inet_sendmsg+0x258/0x3b0 [ 140.559002] sock_sendmsg+0xbc/0xec [ 140.559005] SyS_sendto+0x3a8/0x5a8 [ 140.559008] el0_svc_naked+0x34/0x38 [ 140.559009] [ 140.592245] page dumped because: kasan: bad access detected [ 140.597981] page_owner info is not active (free page?) [ 140.603267] [ 140.653503] ================================================================ Signed-off-by: Sean Tranchetti Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 5151b3ebf068..d0672c400c2f 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1455,6 +1455,9 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) (ut[i].family != prev_family)) return -EINVAL; + if (ut[i].mode >= XFRM_MODE_MAX) + return -EINVAL; + prev_family = ut[i].family; switch (ut[i].family) { -- cgit v1.2.3 From a173f066c7cfc031acb8f541708041e009fc9812 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 17 Sep 2018 08:20:36 -0700 Subject: netfilter: bridge: Don't sabotage nf_hook calls from an l3mdev For starters, the bridge netfilter code registers operations that are invoked any time nh_hook is called. Specifically, ip_sabotage_in watches for nested calls for NF_INET_PRE_ROUTING when a bridge is in the stack. Packet wise, the bridge netfilter hook runs first. br_nf_pre_routing allocates nf_bridge, sets in_prerouting to 1 and calls NF_HOOK for NF_INET_PRE_ROUTING. It's finish function, br_nf_pre_routing_finish, then resets in_prerouting flag to 0 and the packet continues up the stack. The packet eventually makes it to the VRF driver and it invokes nf_hook for NF_INET_PRE_ROUTING in case any rules have been added against the vrf device. Because of the registered operations the call to nf_hook causes ip_sabotage_in to be invoked. That function sees the nf_bridge on the skb and that in_prerouting is not set. Thinking it is an invalid nested call it steals (drops) the packet. Update ip_sabotage_in to recognize that the bridge or one of its upper devices (e.g., vlan) can be enslaved to a VRF (L3 master device) and allow the packet to go through the nf_hook a second time. Fixes: 73e20b761acf ("net: vrf: Add support for PREROUTING rules on vrf device") Reported-by: D'Souza, Nelson Signed-off-by: David Ahern Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter_hooks.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 6e0dc6bcd32a..37278dc280eb 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -835,7 +835,8 @@ static unsigned int ip_sabotage_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) { + if (skb->nf_bridge && !skb->nf_bridge->in_prerouting && + !netif_is_l3_master(skb->dev)) { state->okfn(state->net, state->sk, skb); return NF_STOLEN; } -- cgit v1.2.3 From bab4344975fe2c719eda32de59298d6de26fe126 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Mon, 17 Sep 2018 22:21:36 -0700 Subject: netfilter: nft_osf: use enum nft_data_types for nft_validate_register_store The function nft_validate_register_store requires a struct of type struct nft_data_types. NFTA_DATA_VALUE is of type enum nft_verdict_attributes. Pass the correct enum type. This fixes a warning seen with Clang: net/netfilter/nft_osf.c:52:8: warning: implicit conversion from enumeration type 'enum nft_data_attributes' to different enumeration type 'enum nft_data_types' [-Wenum-conversion] NFTA_DATA_VALUE, NFT_OSF_MAXGENRELEN); ^~~~~~~~~~~~~~~ Fixes: b96af92d6eaf ("netfilter: nf_tables: implement Passive OS fingerprint module in nft_osf") Link: https://github.com/ClangBuiltLinux/linux/issues/103 Signed-off-by: Stefan Agner Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_osf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index 5af74b37f423..a35fb59ace73 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -49,7 +49,7 @@ static int nft_osf_init(const struct nft_ctx *ctx, priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]); err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFTA_DATA_VALUE, NFT_OSF_MAXGENRELEN); + NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN); if (err < 0) return err; -- cgit v1.2.3 From 346fa83d10934cf206e2fd0f514bf8ce186f08fe Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 19 Sep 2018 20:21:11 +0800 Subject: netfilter: conntrack: get rid of double sizeof sizeof(sizeof()) is quite strange and does not seem to be what is wanted here. The issue is detected with the help of Coccinelle. Fixes: 39215846740a ("netfilter: conntrack: remove nlattr_size pointer from l4proto trackers") Signed-off-by: zhong jiang Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b4bdf9eda7b7..247b89784a6f 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1213,8 +1213,8 @@ static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = { #define TCP_NLATTR_SIZE ( \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + 1) + \ - NLA_ALIGN(NLA_HDRLEN + sizeof(sizeof(struct nf_ct_tcp_flags))) + \ - NLA_ALIGN(NLA_HDRLEN + sizeof(sizeof(struct nf_ct_tcp_flags)))) + NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)) + \ + NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags))) static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) { -- cgit v1.2.3 From 92ef12b32feab8f277b69e9fb89ede2796777f4d Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 25 Sep 2018 18:21:58 +0200 Subject: tipc: fix flow control accounting for implicit connect In the case of implicit connect message with data > 1K, the flow control accounting is incorrect. At this state, the socket does not know the peer nodes capability and falls back to legacy flow control by return 1, however the receiver of this message will perform the new block accounting. This leads to a slack and eventually traffic disturbance. In this commit, we perform tipc_node_get_capabilities() at implicit connect and perform accounting based on the peer's capability. Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3f03ddd0e35b..b6f99b021d09 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1419,8 +1419,10 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) /* Handle implicit connection setup */ if (unlikely(dest)) { rc = __tipc_sendmsg(sock, m, dlen); - if (dlen && (dlen == rc)) + if (dlen && dlen == rc) { + tsk->peer_caps = tipc_node_get_capabilities(net, dnode); tsk->snt_unacked = tsk_inc(tsk, dlen + msg_hdr_sz(hdr)); + } return rc; } -- cgit v1.2.3 From 94b6ddce71780575fbbf9d2c36afc8440e61a281 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 25 Sep 2018 21:56:57 +0200 Subject: tipc: reset bearer if device carrier not ok If we detect that under lying carrier detects errors and goes down, we reset the bearer. Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bearer.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 418f03d0be90..645c16052052 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -609,16 +609,18 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, switch (evt) { case NETDEV_CHANGE: - if (netif_carrier_ok(dev)) + if (netif_carrier_ok(dev) && netif_oper_up(dev)) { + test_and_set_bit_lock(0, &b->up); break; - /* else: fall through */ - case NETDEV_UP: - test_and_set_bit_lock(0, &b->up); - break; + } + /* fall through */ case NETDEV_GOING_DOWN: clear_bit_unlock(0, &b->up); tipc_reset_bearer(net, b); break; + case NETDEV_UP: + test_and_set_bit_lock(0, &b->up); + break; case NETDEV_CHANGEMTU: if (tipc_mtu_bad(dev, 0)) { bearer_disable(net, b); -- cgit v1.2.3 From 3f32d0be6c16b902b687453c962d17eea5b8ea19 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 25 Sep 2018 22:09:10 +0200 Subject: tipc: lock wakeup & inputq at tipc_link_reset() In tipc_link_reset() we copy the wakeup queue to input queue using skb_queue_splice_init(link->wakeupq, link->inputq). This is performed without holding any locks. The lists might be simultaneously be accessed by other cpu threads in tipc_sk_rcv(), something leading to to random missing packets. Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index b1f0bee54eac..26cc033ee167 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -841,9 +841,14 @@ void tipc_link_reset(struct tipc_link *l) l->in_session = false; l->session++; l->mtu = l->advertised_mtu; + spin_lock_bh(&l->wakeupq.lock); + spin_lock_bh(&l->inputq->lock); + skb_queue_splice_init(&l->wakeupq, l->inputq); + spin_unlock_bh(&l->inputq->lock); + spin_unlock_bh(&l->wakeupq.lock); + __skb_queue_purge(&l->transmq); __skb_queue_purge(&l->deferdq); - skb_queue_splice_init(&l->wakeupq, l->inputq); __skb_queue_purge(&l->backlogq); l->backlog[TIPC_LOW_IMPORTANCE].len = 0; l->backlog[TIPC_MEDIUM_IMPORTANCE].len = 0; -- cgit v1.2.3 From 8105f9b8a8879bff7f1d43d0720c993a99c9d135 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 22 Sep 2018 18:35:31 +0200 Subject: mac80211: allocate TXQs for active monitor interfaces Monitor mode interfaces with the active flag are passed down to the driver. Drivers using TXQ expect that all interfaces have allocated TXQs before they get added. Fixes: 79af1f866193d ("mac80211: avoid allocating TXQs that won't be used") Cc: stable@vger.kernel.org Reported-by: Catrinel Catrinescu Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 5e6cf2cee965..5836ddeac9e3 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1756,7 +1756,8 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, if (local->ops->wake_tx_queue && type != NL80211_IFTYPE_AP_VLAN && - type != NL80211_IFTYPE_MONITOR) + (type != NL80211_IFTYPE_MONITOR || + (params->flags & MONITOR_FLAG_ACTIVE))) txq_size += sizeof(struct txq_info) + local->hw.txq_data_size; -- cgit v1.2.3 From 30fe6d50eb088783c8729c7d930f65296b2b3fa7 Mon Sep 17 00:00:00 2001 From: Masashi Honma Date: Tue, 25 Sep 2018 11:15:00 +0900 Subject: nl80211: Fix possible Spectre-v1 for NL80211_TXRATE_HT Use array_index_nospec() to sanitize ridx with respect to speculation. Signed-off-by: Masashi Honma Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4b8ec659e797..bd26230de63e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3756,6 +3756,7 @@ static bool ht_rateset_to_mask(struct ieee80211_supported_band *sband, return false; /* check availability */ + ridx = array_index_nospec(ridx, IEEE80211_HT_MCS_MASK_LEN); if (sband->ht_cap.mcs.rx_mask[ridx] & rbit) mcs[ridx] |= rbit; else -- cgit v1.2.3 From cb28c306b93b71f2741ce1a5a66289db26715f4d Mon Sep 17 00:00:00 2001 From: Matias Karhumaa Date: Wed, 26 Sep 2018 09:13:46 +0300 Subject: Bluetooth: SMP: fix crash in unpairing In case unpair_device() was called through mgmt interface at the same time when pairing was in progress, Bluetooth kernel module crash was seen. [ 600.351225] general protection fault: 0000 [#1] SMP PTI [ 600.351235] CPU: 1 PID: 11096 Comm: btmgmt Tainted: G OE 4.19.0-rc1+ #1 [ 600.351238] Hardware name: Dell Inc. Latitude E5440/08RCYC, BIOS A18 05/14/2017 [ 600.351272] RIP: 0010:smp_chan_destroy.isra.10+0xce/0x2c0 [bluetooth] [ 600.351276] Code: c0 0f 84 b4 01 00 00 80 78 28 04 0f 84 53 01 00 00 4d 85 ed 0f 85 ab 00 00 00 48 8b 08 48 8b 50 08 be 10 00 00 00 48 89 51 08 <48> 89 0a 48 b9 00 02 00 00 00 00 ad de 48 89 48 08 48 8b 83 00 01 [ 600.351279] RSP: 0018:ffffa9be839b3b50 EFLAGS: 00010246 [ 600.351282] RAX: ffff9c999ac565a0 RBX: ffff9c9996e98c00 RCX: ffff9c999aa28b60 [ 600.351285] RDX: dead000000000200 RSI: 0000000000000010 RDI: ffff9c999e403500 [ 600.351287] RBP: ffffa9be839b3b70 R08: 0000000000000000 R09: ffffffff92a25c00 [ 600.351290] R10: ffffa9be839b3ae8 R11: 0000000000000001 R12: ffff9c995375b800 [ 600.351292] R13: 0000000000000000 R14: ffff9c99619a5000 R15: ffff9c9962a01c00 [ 600.351295] FS: 00007fb2be27c700(0000) GS:ffff9c999e880000(0000) knlGS:0000000000000000 [ 600.351298] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 600.351300] CR2: 00007fb2bdadbad0 CR3: 000000041c328001 CR4: 00000000001606e0 [ 600.351302] Call Trace: [ 600.351325] smp_failure+0x4f/0x70 [bluetooth] [ 600.351345] smp_cancel_pairing+0x74/0x80 [bluetooth] [ 600.351370] unpair_device+0x1c1/0x330 [bluetooth] [ 600.351399] hci_sock_sendmsg+0x960/0x9f0 [bluetooth] [ 600.351409] ? apparmor_socket_sendmsg+0x1e/0x20 [ 600.351417] sock_sendmsg+0x3e/0x50 [ 600.351422] sock_write_iter+0x85/0xf0 [ 600.351429] do_iter_readv_writev+0x12b/0x1b0 [ 600.351434] do_iter_write+0x87/0x1a0 [ 600.351439] vfs_writev+0x98/0x110 [ 600.351443] ? ep_poll+0x16d/0x3d0 [ 600.351447] ? ep_modify+0x73/0x170 [ 600.351451] do_writev+0x61/0xf0 [ 600.351455] ? do_writev+0x61/0xf0 [ 600.351460] __x64_sys_writev+0x1c/0x20 [ 600.351465] do_syscall_64+0x5a/0x110 [ 600.351471] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 600.351474] RIP: 0033:0x7fb2bdb62fe0 [ 600.351477] Code: 73 01 c3 48 8b 0d b8 6e 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 69 c7 2c 00 00 75 10 b8 14 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 de 80 01 00 48 89 04 24 [ 600.351479] RSP: 002b:00007ffe062cb8f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000014 [ 600.351484] RAX: ffffffffffffffda RBX: 000000000255b3d0 RCX: 00007fb2bdb62fe0 [ 600.351487] RDX: 0000000000000001 RSI: 00007ffe062cb920 RDI: 0000000000000004 [ 600.351490] RBP: 00007ffe062cb920 R08: 000000000255bd80 R09: 0000000000000000 [ 600.351494] R10: 0000000000000353 R11: 0000000000000246 R12: 0000000000000001 [ 600.351497] R13: 00007ffe062cbbe0 R14: 0000000000000000 R15: 0000000000000000 [ 600.351501] Modules linked in: algif_hash algif_skcipher af_alg cmac ipt_MASQUERADE nf_conntrack_netlink nfnetlink xfrm_user xfrm_algo iptable_nat nf_nat_ipv4 xt_addrtype iptable_filter ip_tables xt_conntrack x_tables nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c br_netfilter bridge stp llc overlay arc4 nls_iso8859_1 dm_crypt intel_rapl x86_pkg_temp_thermal intel_powerclamp coretemp dell_laptop kvm_intel crct10dif_pclmul dell_smm_hwmon crc32_pclmul ghash_clmulni_intel pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper intel_cstate intel_rapl_perf uvcvideo videobuf2_vmalloc videobuf2_memops videobuf2_v4l2 videobuf2_common videodev media hid_multitouch input_leds joydev serio_raw dell_wmi snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic dell_smbios dcdbas sparse_keymap [ 600.351569] snd_hda_intel btusb snd_hda_codec btrtl btbcm btintel snd_hda_core bluetooth(OE) snd_hwdep snd_pcm iwlmvm ecdh_generic wmi_bmof dell_wmi_descriptor snd_seq_midi mac80211 snd_seq_midi_event lpc_ich iwlwifi snd_rawmidi snd_seq snd_seq_device snd_timer cfg80211 snd soundcore mei_me mei dell_rbtn dell_smo8800 mac_hid parport_pc ppdev lp parport autofs4 hid_generic usbhid hid i915 nouveau kvmgt vfio_mdev mdev vfio_iommu_type1 vfio kvm irqbypass i2c_algo_bit ttm drm_kms_helper syscopyarea sysfillrect sysimgblt mxm_wmi psmouse ahci sdhci_pci cqhci libahci fb_sys_fops sdhci drm e1000e video wmi [ 600.351637] ---[ end trace e49e9f1df09c94fb ]--- [ 600.351664] RIP: 0010:smp_chan_destroy.isra.10+0xce/0x2c0 [bluetooth] [ 600.351666] Code: c0 0f 84 b4 01 00 00 80 78 28 04 0f 84 53 01 00 00 4d 85 ed 0f 85 ab 00 00 00 48 8b 08 48 8b 50 08 be 10 00 00 00 48 89 51 08 <48> 89 0a 48 b9 00 02 00 00 00 00 ad de 48 89 48 08 48 8b 83 00 01 [ 600.351669] RSP: 0018:ffffa9be839b3b50 EFLAGS: 00010246 [ 600.351672] RAX: ffff9c999ac565a0 RBX: ffff9c9996e98c00 RCX: ffff9c999aa28b60 [ 600.351674] RDX: dead000000000200 RSI: 0000000000000010 RDI: ffff9c999e403500 [ 600.351676] RBP: ffffa9be839b3b70 R08: 0000000000000000 R09: ffffffff92a25c00 [ 600.351679] R10: ffffa9be839b3ae8 R11: 0000000000000001 R12: ffff9c995375b800 [ 600.351681] R13: 0000000000000000 R14: ffff9c99619a5000 R15: ffff9c9962a01c00 [ 600.351684] FS: 00007fb2be27c700(0000) GS:ffff9c999e880000(0000) knlGS:0000000000000000 [ 600.351686] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 600.351689] CR2: 00007fb2bdadbad0 CR3: 000000041c328001 CR4: 00000000001606e0 Crash happened because list_del_rcu() was called twice for smp->ltk. This was possible if unpair_device was called right after ltk was generated but before keys were distributed. In this commit smp_cancel_pairing was refactored to cancel pairing if it is in progress and otherwise just removes keys. Once keys are removed from rcu list, pointers to smp context's keys are set to NULL to make sure removed list items are not accessed later. This commit also adjusts the functionality of mgmt unpair_device() little bit. Previously pairing was canceled only if pairing was in state that keys were already generated. With this commit unpair_device() cancels pairing already in earlier states. Bug was found by fuzzing kernel SMP implementation using Synopsys Defensics. Reported-by: Pekka Oikarainen Signed-off-by: Matias Karhumaa Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 7 ++----- net/bluetooth/smp.c | 29 +++++++++++++++++++++++++---- net/bluetooth/smp.h | 3 ++- 3 files changed, 29 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 3bdc8f3ca259..ccce954f8146 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2434,9 +2434,8 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, /* LE address type */ addr_type = le_addr_type(cp->addr.type); - hci_remove_irk(hdev, &cp->addr.bdaddr, addr_type); - - err = hci_remove_ltk(hdev, &cp->addr.bdaddr, addr_type); + /* Abort any ongoing SMP pairing. Removes ltk and irk if they exist. */ + err = smp_cancel_and_remove_pairing(hdev, &cp->addr.bdaddr, addr_type); if (err < 0) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, MGMT_STATUS_NOT_PAIRED, &rp, @@ -2450,8 +2449,6 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, goto done; } - /* Abort any ongoing SMP pairing */ - smp_cancel_pairing(conn); /* Defer clearing up the connection parameters until closing to * give a chance of keeping them if a repairing happens. diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 3a7b0773536b..73f7211d0431 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2422,30 +2422,51 @@ unlock: return ret; } -void smp_cancel_pairing(struct hci_conn *hcon) +int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr, + u8 addr_type) { - struct l2cap_conn *conn = hcon->l2cap_data; + struct hci_conn *hcon; + struct l2cap_conn *conn; struct l2cap_chan *chan; struct smp_chan *smp; + int err; + + err = hci_remove_ltk(hdev, bdaddr, addr_type); + hci_remove_irk(hdev, bdaddr, addr_type); + + hcon = hci_conn_hash_lookup_le(hdev, bdaddr, addr_type); + if (!hcon) + goto done; + conn = hcon->l2cap_data; if (!conn) - return; + goto done; chan = conn->smp; if (!chan) - return; + goto done; l2cap_chan_lock(chan); smp = chan->data; if (smp) { + /* Set keys to NULL to make sure smp_failure() does not try to + * remove and free already invalidated rcu list entries. */ + smp->ltk = NULL; + smp->slave_ltk = NULL; + smp->remote_irk = NULL; + if (test_bit(SMP_FLAG_COMPLETE, &smp->flags)) smp_failure(conn, 0); else smp_failure(conn, SMP_UNSPECIFIED); + err = 0; } l2cap_chan_unlock(chan); + +done: + return err; } static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index 0ff6247eaa6c..121edadd5f8d 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -181,7 +181,8 @@ enum smp_key_pref { }; /* SMP Commands */ -void smp_cancel_pairing(struct hci_conn *hcon); +int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr, + u8 addr_type); bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level, enum smp_key_pref key_pref); int smp_conn_security(struct hci_conn *hcon, __u8 sec_level); -- cgit v1.2.3 From 36f19d5b4f99fa9fa8263877e5f8e669d7fddc14 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 26 Sep 2018 17:35:14 -0700 Subject: net/ipv6: Remove extra call to ip6_convert_metrics for multipath case The change to move metrics from the dst to rt6_info moved the call to ip6_convert_metrics from ip6_route_add to ip6_route_info_create. In doing so it makes the call in ip6_route_info_append redundant and actually leaks the metrics installed as part of the ip6_route_info_create. Remove the now unnecessary call. Fixes: d4ead6b34b67f ("net/ipv6: move metrics from dst to rt6_info") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 826b14de7dbb..a366c05a239d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4321,11 +4321,6 @@ static int ip6_route_info_append(struct net *net, if (!nh) return -ENOMEM; nh->fib6_info = rt; - err = ip6_convert_metrics(net, rt, r_cfg); - if (err) { - kfree(nh); - return err; - } memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); list_add_tail(&nh->next, rt6_nh_list); -- cgit v1.2.3 From 6194114324139dc16f3251c67ed853bd6d4ae056 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 24 Sep 2018 21:58:59 +0200 Subject: net: core: add member wol_enabled to struct net_device Add flag wol_enabled to struct net_device indicating whether Wake-on-LAN is enabled. As first user phy_suspend() will use it to decide whether PHY can be suspended or not. Fixes: f1e911d5d0df ("r8169: add basic phylib support") Fixes: e8cfd9d6c772 ("net: phy: call state machine synchronously in phy_stop") Signed-off-by: Heiner Kallweit Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/core/ethtool.c | 9 ++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ca5ab98053c8..c7861e4b402c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1730,6 +1730,8 @@ enum netdev_priv_flags { * switch driver and used to set the phys state of the * switch port. * + * @wol_enabled: Wake-on-LAN is enabled + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2014,6 +2016,7 @@ struct net_device { struct lock_class_key *qdisc_tx_busylock; struct lock_class_key *qdisc_running_key; bool proto_down; + unsigned wol_enabled:1; }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 234a0ec2e932..0762aaf8e964 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1483,6 +1483,7 @@ static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) { struct ethtool_wolinfo wol; + int ret; if (!dev->ethtool_ops->set_wol) return -EOPNOTSUPP; @@ -1490,7 +1491,13 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) if (copy_from_user(&wol, useraddr, sizeof(wol))) return -EFAULT; - return dev->ethtool_ops->set_wol(dev, &wol); + ret = dev->ethtool_ops->set_wol(dev, &wol); + if (ret) + return ret; + + dev->wol_enabled = !!wol.wolopts; + + return 0; } static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) -- cgit v1.2.3 From d4ce58082f206bf6e7d697380c7bc5480a8b0264 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Tue, 25 Sep 2018 21:59:28 -0700 Subject: net-tcp: /proc/sys/net/ipv4/tcp_probe_interval is a u32 not int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (fix documentation and sysctl access to treat it as such) Tested: # zcat /proc/config.gz | egrep ^CONFIG_HZ CONFIG_HZ_1000=y CONFIG_HZ=1000 # echo $[(1<<32)/1000 + 1] | tee /proc/sys/net/ipv4/tcp_probe_interval 4294968 tee: /proc/sys/net/ipv4/tcp_probe_interval: Invalid argument # echo $[(1<<32)/1000] | tee /proc/sys/net/ipv4/tcp_probe_interval 4294967 # echo 0 | tee /proc/sys/net/ipv4/tcp_probe_interval # echo -1 | tee /proc/sys/net/ipv4/tcp_probe_interval -1 tee: /proc/sys/net/ipv4/tcp_probe_interval: Invalid argument Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 2 +- net/ipv4/sysctl_net_ipv4.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 8313a636dd53..960de8fe3f40 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -425,7 +425,7 @@ tcp_mtu_probing - INTEGER 1 - Disabled by default, enabled when an ICMP black hole detected 2 - Always enabled, use initial MSS of tcp_base_mss. -tcp_probe_interval - INTEGER +tcp_probe_interval - UNSIGNED INTEGER Controls how often to start TCP Packetization-Layer Path MTU Discovery reprobe. The default is reprobing every 10 minutes as per RFC4821. diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b92f422f2fa8..891ed2f91467 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -48,6 +48,7 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static int comp_sack_nr_max = 255; +static u32 u32_max_div_HZ = UINT_MAX / HZ; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -745,9 +746,10 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_probe_interval", .data = &init_net.ipv4.sysctl_tcp_probe_interval, - .maxlen = sizeof(int), + .maxlen = sizeof(u32), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_douintvec_minmax, + .extra2 = &u32_max_div_HZ, }, { .procname = "igmp_link_local_mcast_reports", -- cgit v1.2.3 From 1222a16014888ed9733c11e221730d4a8196222b Mon Sep 17 00:00:00 2001 From: Masashi Honma Date: Tue, 25 Sep 2018 11:15:01 +0900 Subject: nl80211: Fix possible Spectre-v1 for CQM RSSI thresholds Use array_index_nospec() to sanitize i with respect to speculation. Note that the user doesn't control i directly, but can make it out of bounds by not finding a threshold in the array. Signed-off-by: Masashi Honma [add note about user control, as explained by Masashi] Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index bd26230de63e..176edfefcbaa 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10231,7 +10231,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev = dev->ieee80211_ptr; s32 last, low, high; u32 hyst; - int i, n; + int i, n, low_index; int err; /* RSSI reporting disabled? */ @@ -10268,10 +10268,19 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, if (last < wdev->cqm_config->rssi_thresholds[i]) break; - low = i > 0 ? - (wdev->cqm_config->rssi_thresholds[i - 1] - hyst) : S32_MIN; - high = i < n ? - (wdev->cqm_config->rssi_thresholds[i] + hyst - 1) : S32_MAX; + low_index = i - 1; + if (low_index >= 0) { + low_index = array_index_nospec(low_index, n); + low = wdev->cqm_config->rssi_thresholds[low_index] - hyst; + } else { + low = S32_MIN; + } + if (i < n) { + i = array_index_nospec(i, n); + high = wdev->cqm_config->rssi_thresholds[i] + hyst - 1; + } else { + high = S32_MAX; + } return rdev_set_cqm_rssi_range_config(rdev, dev, low, high); } -- cgit v1.2.3 From 5aac49378742a52bbe8af3d25bc51b487be7b17f Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Tue, 14 Aug 2018 20:41:06 +0200 Subject: Bluetooth: Remove unnecessary smp_mb__{before,after}_atomic The barriers are unneeded; wait_woken() and woken_wake_function() already provide us with the required synchronization: remove them and document that we're relying on the (implicit) synchronization provided by wait_woken() and woken_wake_function(). Signed-off-by: Andrea Parri Reviewed-by: Brian Norris Signed-off-by: Marcel Holtmann --- net/bluetooth/bnep/core.c | 7 ++++--- net/bluetooth/cmtp/core.c | 14 ++++++++------ net/bluetooth/hidp/core.c | 13 ++++++++----- 3 files changed, 20 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index 7b3965861013..43c284158f63 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -489,9 +489,6 @@ static int bnep_session(void *arg) add_wait_queue(sk_sleep(sk), &wait); while (1) { - /* Ensure session->terminate is updated */ - smp_mb__before_atomic(); - if (atomic_read(&s->terminate)) break; /* RX */ @@ -512,6 +509,10 @@ static int bnep_session(void *arg) break; netif_wake_queue(dev); + /* + * wait_woken() performs the necessary memory barriers + * for us; see the header comment for this primitive. + */ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(sk_sleep(sk), &wait); diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 7f26a5a19ff6..07cfa3249f83 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -288,9 +288,6 @@ static int cmtp_session(void *arg) add_wait_queue(sk_sleep(sk), &wait); while (1) { - /* Ensure session->terminate is updated */ - smp_mb__before_atomic(); - if (atomic_read(&session->terminate)) break; if (sk->sk_state != BT_CONNECTED) @@ -306,6 +303,10 @@ static int cmtp_session(void *arg) cmtp_process_transmit(session); + /* + * wait_woken() performs the necessary memory barriers + * for us; see the header comment for this primitive. + */ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(sk_sleep(sk), &wait); @@ -431,9 +432,10 @@ int cmtp_del_connection(struct cmtp_conndel_req *req) /* Stop session thread */ atomic_inc(&session->terminate); - /* Ensure session->terminate is updated */ - smp_mb__after_atomic(); - + /* + * See the comment preceding the call to wait_woken() + * in cmtp_session(). + */ wake_up_interruptible(sk_sleep(session->sock->sk)); } else err = -ENOENT; diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 253975cce943..3734dc1788b4 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -1074,6 +1074,10 @@ static int hidp_session_start_sync(struct hidp_session *session) static void hidp_session_terminate(struct hidp_session *session) { atomic_inc(&session->terminate); + /* + * See the comment preceding the call to wait_woken() + * in hidp_session_run(). + */ wake_up_interruptible(&hidp_session_wq); } @@ -1193,8 +1197,6 @@ static void hidp_session_run(struct hidp_session *session) * thread is woken up by ->sk_state_changed(). */ - /* Ensure session->terminate is updated */ - smp_mb__before_atomic(); if (atomic_read(&session->terminate)) break; @@ -1228,14 +1230,15 @@ static void hidp_session_run(struct hidp_session *session) hidp_process_transmit(session, &session->ctrl_transmit, session->ctrl_sock); + /* + * wait_woken() performs the necessary memory barriers + * for us; see the header comment for this primitive. + */ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&hidp_session_wq, &wait); atomic_inc(&session->terminate); - - /* Ensure session->terminate is updated */ - smp_mb__after_atomic(); } static int hidp_session_wake_function(wait_queue_entry_t *wait, -- cgit v1.2.3 From b950aa88638c52a013504f025e0b8f99bf2dc26e Mon Sep 17 00:00:00 2001 From: Ankit Navik Date: Fri, 17 Aug 2018 07:29:19 +0530 Subject: Bluetooth: Add definitions and track LE resolve list modification Add the definitions for adding entries to the LE resolve list and removing entries from the LE resolve list. When the LE resolve list gets changed via HCI commands make sure that the internal storage of the resolve list entries gets updated. Signed-off-by: Ankit Navik Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 14 +++++++++ include/net/bluetooth/hci_core.h | 15 ++++++++++ net/bluetooth/hci_core.c | 63 ++++++++++++++++++++++++++++++++++++++++ net/bluetooth/hci_event.c | 47 ++++++++++++++++++++++++++++++ 4 files changed, 139 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index cdd9f1fe7cfa..c36dc1e20556 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1517,6 +1517,20 @@ struct hci_cp_le_write_def_data_len { __le16 tx_time; } __packed; +#define HCI_OP_LE_ADD_TO_RESOLV_LIST 0x2027 +struct hci_cp_le_add_to_resolv_list { + __u8 bdaddr_type; + bdaddr_t bdaddr; + __u8 peer_irk[16]; + __u8 local_irk[16]; +} __packed; + +#define HCI_OP_LE_DEL_FROM_RESOLV_LIST 0x2028 +struct hci_cp_le_del_from_resolv_list { + __u8 bdaddr_type; + bdaddr_t bdaddr; +} __packed; + #define HCI_OP_LE_CLEAR_RESOLV_LIST 0x2029 #define HCI_OP_LE_READ_RESOLV_LIST_SIZE 0x202a diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 0db1b9b428b7..9b0f821b2d3a 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -103,6 +103,14 @@ struct bdaddr_list { u8 bdaddr_type; }; +struct bdaddr_list_with_irk { + struct list_head list; + bdaddr_t bdaddr; + u8 bdaddr_type; + u8 peer_irk[16]; + u8 local_irk[16]; +}; + struct bt_uuid { struct list_head list; u8 uuid[16]; @@ -1058,8 +1066,15 @@ int hci_inquiry(void __user *arg); struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *list, bdaddr_t *bdaddr, u8 type); +struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk( + struct list_head *list, bdaddr_t *bdaddr, + u8 type); int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type); +int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr, + u8 type, u8 *peer_irk, u8 *local_irk); int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type); +int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr, + u8 type); void hci_bdaddr_list_clear(struct list_head *list); struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 74b29c7d841c..0f1a8820d75c 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2839,6 +2839,20 @@ struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list, return NULL; } +struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk( + struct list_head *bdaddr_list, bdaddr_t *bdaddr, + u8 type) +{ + struct bdaddr_list_with_irk *b; + + list_for_each_entry(b, bdaddr_list, list) { + if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) + return b; + } + + return NULL; +} + void hci_bdaddr_list_clear(struct list_head *bdaddr_list) { struct bdaddr_list *b, *n; @@ -2871,6 +2885,35 @@ int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type) return 0; } +int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr, + u8 type, u8 *peer_irk, u8 *local_irk) +{ + struct bdaddr_list_with_irk *entry; + + if (!bacmp(bdaddr, BDADDR_ANY)) + return -EBADF; + + if (hci_bdaddr_list_lookup(list, bdaddr, type)) + return -EEXIST; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + bacpy(&entry->bdaddr, bdaddr); + entry->bdaddr_type = type; + + if (peer_irk) + memcpy(entry->peer_irk, peer_irk, 16); + + if (local_irk) + memcpy(entry->local_irk, local_irk, 16); + + list_add(&entry->list, list); + + return 0; +} + int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *entry; @@ -2890,6 +2933,26 @@ int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type) return 0; } +int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr, + u8 type) +{ + struct bdaddr_list_with_irk *entry; + + if (!bacmp(bdaddr, BDADDR_ANY)) { + hci_bdaddr_list_clear(list); + return 0; + } + + entry = hci_bdaddr_list_lookup_with_irk(list, bdaddr, type); + if (!entry) + return -ENOENT; + + list_del(&entry->list); + kfree(entry); + + return 0; +} + /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index f12555f23a49..f47f8fad757a 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1454,6 +1454,45 @@ static void hci_cc_le_write_def_data_len(struct hci_dev *hdev, hdev->le_def_tx_time = le16_to_cpu(sent->tx_time); } +static void hci_cc_le_add_to_resolv_list(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_cp_le_add_to_resolv_list *sent; + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_RESOLV_LIST); + if (!sent) + return; + + hci_bdaddr_list_add_with_irk(&hdev->le_resolv_list, &sent->bdaddr, + sent->bdaddr_type, sent->peer_irk, + sent->local_irk); +} + +static void hci_cc_le_del_from_resolv_list(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_cp_le_del_from_resolv_list *sent; + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_DEL_FROM_RESOLV_LIST); + if (!sent) + return; + + hci_bdaddr_list_del_with_irk(&hdev->le_resolv_list, &sent->bdaddr, + sent->bdaddr_type); +} + static void hci_cc_le_clear_resolv_list(struct hci_dev *hdev, struct sk_buff *skb) { @@ -3279,6 +3318,14 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_write_def_data_len(hdev, skb); break; + case HCI_OP_LE_ADD_TO_RESOLV_LIST: + hci_cc_le_add_to_resolv_list(hdev, skb); + break; + + case HCI_OP_LE_DEL_FROM_RESOLV_LIST: + hci_cc_le_del_from_resolv_list(hdev, skb); + break; + case HCI_OP_LE_CLEAR_RESOLV_LIST: hci_cc_le_clear_resolv_list(hdev, skb); break; -- cgit v1.2.3 From fe1493101ac1313cbdbef1af65342fb17d944e71 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 4 Sep 2018 13:39:20 +0300 Subject: Bluetooth: L2CAP: Derive MPS from connection MTU This ensures the MPS can fit in a single HCI fragment so each segment don't have to be reassembled at HCI level, in addition to that also remove the debugfs entry to configure the MPS. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 1 - net/bluetooth/l2cap_core.c | 14 +++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 0697fd413087..17296675a0b1 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -456,7 +456,6 @@ struct l2cap_conn_param_update_rsp { #define L2CAP_CONN_PARAM_REJECTED 0x0001 #define L2CAP_LE_MAX_CREDITS 10 -#define L2CAP_LE_DEFAULT_MPS 230 struct l2cap_le_conn_req { __le16 psm; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index d17a4736e47c..8a60db6261f7 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -52,7 +52,6 @@ static LIST_HEAD(chan_list); static DEFINE_RWLOCK(chan_list_lock); static u16 le_max_credits = L2CAP_LE_MAX_CREDITS; -static u16 le_default_mps = L2CAP_LE_DEFAULT_MPS; static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, u8 code, u8 ident, u16 dlen, void *data); @@ -520,7 +519,8 @@ static void l2cap_le_flowctl_init(struct l2cap_chan *chan) chan->sdu_len = 0; chan->tx_credits = 0; chan->rx_credits = le_max_credits; - chan->mps = min_t(u16, chan->imtu, le_default_mps); + /* Derive MPS from connection MTU to stop HCI fragmentation */ + chan->mps = min_t(u16, chan->imtu, chan->conn->mtu - L2CAP_HDR_SIZE); skb_queue_head_init(&chan->tx_q); } @@ -1282,6 +1282,8 @@ static void l2cap_le_connect(struct l2cap_chan *chan) if (test_and_set_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags)) return; + l2cap_le_flowctl_init(chan); + req.psm = chan->psm; req.scid = cpu_to_le16(chan->scid); req.mtu = cpu_to_le16(chan->imtu); @@ -5493,8 +5495,6 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, goto response_unlock; } - l2cap_le_flowctl_init(chan); - bacpy(&chan->src, &conn->hcon->src); bacpy(&chan->dst, &conn->hcon->dst); chan->src_type = bdaddr_src_type(conn->hcon); @@ -5506,6 +5506,9 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, chan->tx_credits = __le16_to_cpu(req->credits); __l2cap_chan_add(conn, chan); + + l2cap_le_flowctl_init(chan); + dcid = chan->scid; credits = chan->rx_credits; @@ -7102,7 +7105,6 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, case L2CAP_MODE_BASIC: break; case L2CAP_MODE_LE_FLOWCTL: - l2cap_le_flowctl_init(chan); break; case L2CAP_MODE_ERTM: case L2CAP_MODE_STREAMING: @@ -7647,8 +7649,6 @@ int __init l2cap_init(void) debugfs_create_u16("l2cap_le_max_credits", 0644, bt_debugfs, &le_max_credits); - debugfs_create_u16("l2cap_le_default_mps", 0644, bt_debugfs, - &le_default_mps); return 0; } -- cgit v1.2.3 From 96cd8eaa131f0ffd4cfae09e1b4bdfafb9570907 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 4 Sep 2018 13:39:21 +0300 Subject: Bluetooth: L2CAP: Derive rx credits from MTU and MPS Give enough rx credits for a full packet instead of using an arbitrary number which may not be enough depending on the MTU and MPS which can cause interruptions while waiting for more credits, also remove debugfs entry for l2cap_le_max_credits. With these changes the credits are restored after each SDU is received instead of using fixed threshold, this way it is garanteed that there will always be enough credits to send a packet without waiting more credits to arrive. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 2 -- net/bluetooth/l2cap_core.c | 42 +++++++++++++++++++++++++++--------------- 2 files changed, 27 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 17296675a0b1..3555440e14fc 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -455,8 +455,6 @@ struct l2cap_conn_param_update_rsp { #define L2CAP_CONN_PARAM_ACCEPTED 0x0000 #define L2CAP_CONN_PARAM_REJECTED 0x0001 -#define L2CAP_LE_MAX_CREDITS 10 - struct l2cap_le_conn_req { __le16 psm; __le16 scid; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 8a60db6261f7..3fb2d757df88 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -51,8 +51,6 @@ static u32 l2cap_feat_mask = L2CAP_FEAT_FIXED_CHAN | L2CAP_FEAT_UCD; static LIST_HEAD(chan_list); static DEFINE_RWLOCK(chan_list_lock); -static u16 le_max_credits = L2CAP_LE_MAX_CREDITS; - static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, u8 code, u8 ident, u16 dlen, void *data); static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, @@ -518,9 +516,10 @@ static void l2cap_le_flowctl_init(struct l2cap_chan *chan) chan->sdu_last_frag = NULL; chan->sdu_len = 0; chan->tx_credits = 0; - chan->rx_credits = le_max_credits; /* Derive MPS from connection MTU to stop HCI fragmentation */ chan->mps = min_t(u16, chan->imtu, chan->conn->mtu - L2CAP_HDR_SIZE); + /* Give enough credits for a full packet */ + chan->rx_credits = (chan->imtu / chan->mps) + 1; skb_queue_head_init(&chan->tx_q); } @@ -6702,13 +6701,10 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan) struct l2cap_le_credits pkt; u16 return_credits; - /* We return more credits to the sender only after the amount of - * credits falls below half of the initial amount. - */ - if (chan->rx_credits >= (le_max_credits + 1) / 2) - return; + return_credits = ((chan->imtu / chan->mps) + 1) - chan->rx_credits; - return_credits = le_max_credits - chan->rx_credits; + if (!return_credits) + return; BT_DBG("chan %p returning %u credits to sender", chan, return_credits); @@ -6722,6 +6718,21 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan) l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CREDITS, sizeof(pkt), &pkt); } +static int l2cap_le_recv(struct l2cap_chan *chan, struct sk_buff *skb) +{ + int err; + + BT_DBG("SDU reassemble complete: chan %p skb->len %u", chan, skb->len); + + /* Wait recv to confirm reception before updating the credits */ + err = chan->ops->recv(chan, skb); + + /* Update credits whenever an SDU is received */ + l2cap_chan_le_send_credits(chan); + + return err; +} + static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) { int err; @@ -6740,7 +6751,11 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) chan->rx_credits--; BT_DBG("rx_credits %u -> %u", chan->rx_credits + 1, chan->rx_credits); - l2cap_chan_le_send_credits(chan); + /* Update if remote had run out of credits, this should only happens + * if the remote is not using the entire MPS. + */ + if (!chan->rx_credits) + l2cap_chan_le_send_credits(chan); err = 0; @@ -6766,7 +6781,7 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) } if (skb->len == sdu_len) - return chan->ops->recv(chan, skb); + return l2cap_le_recv(chan, skb); chan->sdu = skb; chan->sdu_len = sdu_len; @@ -6788,7 +6803,7 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) skb = NULL; if (chan->sdu->len == chan->sdu_len) { - err = chan->ops->recv(chan, chan->sdu); + err = l2cap_le_recv(chan, chan->sdu); if (!err) { chan->sdu = NULL; chan->sdu_last_frag = NULL; @@ -7647,9 +7662,6 @@ int __init l2cap_init(void) l2cap_debugfs = debugfs_create_file("l2cap", 0444, bt_debugfs, NULL, &l2cap_debugfs_fops); - debugfs_create_u16("l2cap_le_max_credits", 0644, bt_debugfs, - &le_max_credits); - return 0; } -- cgit v1.2.3 From a5c3021bb62b970713550db3f7fd08aa70665d7e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 4 Sep 2018 13:39:22 +0300 Subject: Bluetooth: L2CAP: Detect if remote is not able to use the whole MPS If the remote is not able to fully utilize the MPS choosen recalculate the credits based on the actual amount it is sending that way it can still send packets of MTU size without credits dropping to 0. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 3fb2d757df88..514899f7f0d4 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -6787,6 +6787,16 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) chan->sdu_len = sdu_len; chan->sdu_last_frag = skb; + /* Detect if remote is not able to use the selected MPS */ + if (skb->len + L2CAP_SDULEN_SIZE < chan->mps) { + u16 mps_len = skb->len + L2CAP_SDULEN_SIZE; + + /* Adjust the number of credits */ + BT_DBG("chan->mps %u -> %u", chan->mps, mps_len); + chan->mps = mps_len; + l2cap_chan_le_send_credits(chan); + } + return 0; } -- cgit v1.2.3 From 092ffc51fb3f9b8369e737c9320bf0bffb2c898f Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:07 +0100 Subject: rxrpc: Remove dup code from rxrpc_find_connection_rcu() rxrpc_find_connection_rcu() initialises variable k twice with the same information. Remove one of the initialisations. Signed-off-by: David Howells --- net/rxrpc/conn_object.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 77440a356b14..1746b48cb165 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -85,9 +85,6 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, if (rxrpc_extract_addr_from_skb(local, &srx, skb) < 0) goto not_found; - k.epoch = sp->hdr.epoch; - k.cid = sp->hdr.cid & RXRPC_CIDMASK; - /* We may have to handle mixing IPv4 and IPv6 */ if (srx.transport.family != local->srx.transport.family) { pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n", -- cgit v1.2.3 From dc71db34e4f3c06b8277c8f3c2ff014610607a8c Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:08 +0100 Subject: rxrpc: Fix checks as to whether we should set up a new call There's a check in rxrpc_data_ready() that's checking the CLIENT_INITIATED flag in the packet type field rather than in the packet flags field. Fix this by creating a pair of helper functions to check whether the packet is going to the client or to the server and use them generally. Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 10 ++++++++++ net/rxrpc/conn_object.c | 2 +- net/rxrpc/input.c | 12 ++++-------- 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c97558710421..9fcb3e197b14 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -463,6 +463,16 @@ struct rxrpc_connection { u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ }; +static inline bool rxrpc_to_server(const struct rxrpc_skb_priv *sp) +{ + return sp->hdr.flags & RXRPC_CLIENT_INITIATED; +} + +static inline bool rxrpc_to_client(const struct rxrpc_skb_priv *sp) +{ + return !rxrpc_to_server(sp); +} + /* * Flags in call->flags. */ diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 1746b48cb165..390ba50cfab4 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -96,7 +96,7 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, k.epoch = sp->hdr.epoch; k.cid = sp->hdr.cid & RXRPC_CIDMASK; - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) { + if (rxrpc_to_server(sp)) { /* We need to look up service connections by the full protocol * parameter set. We look up the peer first as an intermediate * step and then the connection from the peer's tree. diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index cfdc199c6351..ec299c627f77 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1177,10 +1177,6 @@ void rxrpc_data_ready(struct sock *udp_sk) trace_rxrpc_rx_packet(sp); - _net("Rx RxRPC %s ep=%x call=%x:%x", - sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient", - sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber); - if (sp->hdr.type >= RXRPC_N_PACKET_TYPES || !((RXRPC_SUPPORTED_PACKET_TYPES >> sp->hdr.type) & 1)) { _proto("Rx Bad Packet Type %u", sp->hdr.type); @@ -1189,13 +1185,13 @@ void rxrpc_data_ready(struct sock *udp_sk) switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: - if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED)) + if (rxrpc_to_client(sp)) goto discard; rxrpc_post_packet_to_local(local, skb); goto out; case RXRPC_PACKET_TYPE_BUSY: - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) + if (rxrpc_to_server(sp)) goto discard; /* Fall through */ @@ -1280,7 +1276,7 @@ void rxrpc_data_ready(struct sock *udp_sk) call = rcu_dereference(chan->call); if (sp->hdr.callNumber > chan->call_id) { - if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED)) { + if (rxrpc_to_client(sp)) { rcu_read_unlock(); goto reject_packet; } @@ -1303,7 +1299,7 @@ void rxrpc_data_ready(struct sock *udp_sk) } if (!call || atomic_read(&call->usage) == 0) { - if (!(sp->hdr.type & RXRPC_CLIENT_INITIATED) || + if (rxrpc_to_client(sp) || sp->hdr.callNumber == 0 || sp->hdr.type != RXRPC_PACKET_TYPE_DATA) goto bad_message_unlock; -- cgit v1.2.3 From b604dd9883f783a94020d772e4fe03160f455372 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:08 +0100 Subject: rxrpc: Fix RTT gathering Fix RTT information gathering in AF_RXRPC by the following means: (1) Enable Rx timestamping on the transport socket with SO_TIMESTAMPNS. (2) If the sk_buff doesn't have a timestamp set when rxrpc_data_ready() collects it, set it at that point. (3) Allow ACKs to be requested on the last packet of a client call, but not a service call. We need to be careful lest we undo: bf7d620abf22c321208a4da4f435e7af52551a21 Author: David Howells Date: Thu Oct 6 08:11:51 2016 +0100 rxrpc: Don't request an ACK on the last DATA packet of a call's Tx phase but that only really applies to service calls that we're handling, since the client side gets to send the final ACK (or not). (4) When about to transmit an ACK or DATA packet, record the Tx timestamp before only; don't update the timestamp afterwards. (5) Switch the ordering between recording the serial and recording the timestamp to always set the serial number first. The serial number shouldn't be seen referenced by an ACK packet until we've transmitted the packet bearing it - so in the Rx path, we don't need the timestamp until we've checked the serial number. Fixes: cf1a6474f807 ("rxrpc: Add per-peer RTT tracker") Signed-off-by: David Howells --- net/rxrpc/input.c | 8 ++++++-- net/rxrpc/local_object.c | 9 +++++++++ net/rxrpc/output.c | 31 ++++++++++++++++++------------- 3 files changed, 33 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index ec299c627f77..7f9ed3a60b9a 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -622,13 +622,14 @@ static void rxrpc_input_requested_ack(struct rxrpc_call *call, if (!skb) continue; + sent_at = skb->tstamp; + smp_rmb(); /* Read timestamp before serial. */ sp = rxrpc_skb(skb); if (sp->hdr.serial != orig_serial) continue; - smp_rmb(); - sent_at = skb->tstamp; goto found; } + return; found: @@ -1143,6 +1144,9 @@ void rxrpc_data_ready(struct sock *udp_sk) return; } + if (skb->tstamp == 0) + skb->tstamp = ktime_get_real(); + rxrpc_new_skb(skb, rxrpc_skb_rx_received); _net("recv skb %p", skb); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 777c3ed4cfc0..81de7d889ffa 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -173,6 +173,15 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) _debug("setsockopt failed"); goto error; } + + /* We want receive timestamps. */ + opt = 1; + ret = kernel_setsockopt(local->socket, SOL_SOCKET, SO_TIMESTAMPNS, + (char *)&opt, sizeof(opt)); + if (ret < 0) { + _debug("setsockopt failed"); + goto error; + } break; default: diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index ccf5de160444..8a4da3fe96df 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -124,7 +124,6 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, struct kvec iov[2]; rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; - ktime_t now; size_t len, n; int ret; u8 reason; @@ -196,9 +195,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, /* We need to stick a time in before we send the packet in case * the reply gets back before kernel_sendmsg() completes - but * asking UDP to send the packet can take a relatively long - * time, so we update the time after, on the assumption that - * the packet transmission is more likely to happen towards the - * end of the kernel_sendmsg() call. + * time. */ call->ping_time = ktime_get_real(); set_bit(RXRPC_CALL_PINGING, &call->flags); @@ -206,9 +203,6 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, } ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); - now = ktime_get_real(); - if (ping) - call->ping_time = now; conn->params.peer->last_tx_at = ktime_get_seconds(); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, @@ -363,8 +357,14 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. + * + * However, we mustn't request an ACK on the last reply packet of a + * service call, lest OpenAFS incorrectly send us an ACK with some + * soft-ACKs in it and then never follow up with a proper hard ACK. */ - if (!(sp->hdr.flags & RXRPC_LAST_PACKET) && + if ((!(sp->hdr.flags & RXRPC_LAST_PACKET) || + rxrpc_to_server(sp) + ) && (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events) || retrans || call->cong_mode == RXRPC_CALL_SLOW_START || @@ -390,6 +390,11 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, goto send_fragmentable; down_read(&conn->params.local->defrag_sem); + + sp->hdr.serial = serial; + smp_wmb(); /* Set serial before timestamp */ + skb->tstamp = ktime_get_real(); + /* send the packet by UDP * - returns -EMSGSIZE if UDP would have to fragment the packet * to go out of the interface @@ -413,12 +418,8 @@ done: trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, retrans, lost); if (ret >= 0) { - ktime_t now = ktime_get_real(); - skb->tstamp = now; - smp_wmb(); - sp->hdr.serial = serial; if (whdr.flags & RXRPC_REQUEST_ACK) { - call->peer->rtt_last_req = now; + call->peer->rtt_last_req = skb->tstamp; trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial); if (call->peer->rtt_usage > 1) { unsigned long nowj = jiffies, ack_lost_at; @@ -457,6 +458,10 @@ send_fragmentable: down_write(&conn->params.local->defrag_sem); + sp->hdr.serial = serial; + smp_wmb(); /* Set serial before timestamp */ + skb->tstamp = ktime_get_real(); + switch (conn->params.local->srx.transport.family) { case AF_INET: opt = IP_PMTUDISC_DONT; -- cgit v1.2.3 From ece64fec164f523bfbe874abdef2a0e6ff376251 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:08 +0100 Subject: rxrpc: Emit BUSY packets when supposed to rather than ABORTs In the input path, a received sk_buff can be marked for rejection by setting RXRPC_SKB_MARK_* in skb->mark and, if needed, some auxiliary data (such as an abort code) in skb->priority. The rejection is handled by queueing the sk_buff up for dealing with in process context. The output code reads the mark and priority and, theoretically, generates an appropriate response packet. However, if RXRPC_SKB_MARK_BUSY is set, this isn't noticed and an ABORT message with a random abort code is generated (since skb->priority wasn't set to anything). Fix this by outputting the appropriate sort of packet. Also, whilst we're at it, most of the marks are no longer used, so remove them and rename the remaining two to something more obvious. Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 13 ++++--------- net/rxrpc/call_accept.c | 6 +++--- net/rxrpc/input.c | 2 +- net/rxrpc/output.c | 23 ++++++++++++++++++----- 4 files changed, 26 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 9fcb3e197b14..e8861cb78070 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -40,17 +40,12 @@ struct rxrpc_crypt { struct rxrpc_connection; /* - * Mark applied to socket buffers. + * Mark applied to socket buffers in skb->mark. skb->priority is used + * to pass supplementary information. */ enum rxrpc_skb_mark { - RXRPC_SKB_MARK_DATA, /* data message */ - RXRPC_SKB_MARK_FINAL_ACK, /* final ACK received message */ - RXRPC_SKB_MARK_BUSY, /* server busy message */ - RXRPC_SKB_MARK_REMOTE_ABORT, /* remote abort message */ - RXRPC_SKB_MARK_LOCAL_ABORT, /* local abort message */ - RXRPC_SKB_MARK_NET_ERROR, /* network error message */ - RXRPC_SKB_MARK_LOCAL_ERROR, /* local error message */ - RXRPC_SKB_MARK_NEW_CALL, /* local error message */ + RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */ + RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */ }; /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 9d1e298b784c..e88f131c1d7f 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -353,7 +353,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_INVALID_OPERATION, EOPNOTSUPP); - skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; skb->priority = RX_INVALID_OPERATION; _leave(" = NULL [service]"); return NULL; @@ -364,7 +364,7 @@ found_service: rx->sk.sk_state == RXRPC_CLOSE) { trace_rxrpc_abort(0, "CLS", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); - skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; skb->priority = RX_INVALID_OPERATION; _leave(" = NULL [close]"); call = NULL; @@ -373,7 +373,7 @@ found_service: call = rxrpc_alloc_incoming_call(rx, local, conn, skb); if (!call) { - skb->mark = RXRPC_SKB_MARK_BUSY; + skb->mark = RXRPC_SKB_MARK_REJECT_BUSY; _leave(" = NULL [busy]"); call = NULL; goto out; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 7f9ed3a60b9a..b0f12471f5e7 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1354,7 +1354,7 @@ bad_message: protocol_error: skb->priority = RX_PROTOCOL_ERROR; post_abort: - skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; reject_packet: trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_reject_packet(local, skb); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 8a4da3fe96df..e8fb8922bca8 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -524,7 +524,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) struct kvec iov[2]; size_t size; __be32 code; - int ret; + int ret, ioc; _enter("%d", local->debug_id); @@ -532,7 +532,6 @@ void rxrpc_reject_packets(struct rxrpc_local *local) iov[0].iov_len = sizeof(whdr); iov[1].iov_base = &code; iov[1].iov_len = sizeof(code); - size = sizeof(whdr) + sizeof(code); msg.msg_name = &srx.transport; msg.msg_control = NULL; @@ -540,17 +539,31 @@ void rxrpc_reject_packets(struct rxrpc_local *local) msg.msg_flags = 0; memset(&whdr, 0, sizeof(whdr)); - whdr.type = RXRPC_PACKET_TYPE_ABORT; while ((skb = skb_dequeue(&local->reject_queue))) { rxrpc_see_skb(skb, rxrpc_skb_rx_seen); sp = rxrpc_skb(skb); + switch (skb->mark) { + case RXRPC_SKB_MARK_REJECT_BUSY: + whdr.type = RXRPC_PACKET_TYPE_BUSY; + size = sizeof(whdr); + ioc = 1; + break; + case RXRPC_SKB_MARK_REJECT_ABORT: + whdr.type = RXRPC_PACKET_TYPE_ABORT; + code = htonl(skb->priority); + size = sizeof(whdr) + sizeof(code); + ioc = 2; + break; + default: + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + continue; + } + if (rxrpc_extract_addr_from_skb(local, &srx, skb) == 0) { msg.msg_namelen = srx.transport_len; - code = htonl(skb->priority); - whdr.epoch = htonl(sp->hdr.epoch); whdr.cid = htonl(sp->hdr.cid); whdr.callNumber = htonl(sp->hdr.callNumber); -- cgit v1.2.3 From 403fc2a138457f1071b186786a7589ef7382c8bc Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:08 +0100 Subject: rxrpc: Improve up-front incoming packet checking Do more up-front checking on incoming packets to weed out invalid ones and also ones aimed at services that we don't support. Whilst we're at it, replace the clearing of call and skew if we don't find a connection with just initialising the variables to zero at the top of the function. Signed-off-by: David Howells --- net/rxrpc/input.c | 63 +++++++++++++++++++++++++++++++++++++++++----------- net/rxrpc/protocol.h | 15 ------------- 2 files changed, 50 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index b0f12471f5e7..a569e9e010d1 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1125,12 +1125,13 @@ void rxrpc_data_ready(struct sock *udp_sk) { struct rxrpc_connection *conn; struct rxrpc_channel *chan; - struct rxrpc_call *call; + struct rxrpc_call *call = NULL; struct rxrpc_skb_priv *sp; struct rxrpc_local *local = udp_sk->sk_user_data; + struct rxrpc_sock *rx; struct sk_buff *skb; unsigned int channel; - int ret, skew; + int ret, skew = 0; _enter("%p", udp_sk); @@ -1181,12 +1182,6 @@ void rxrpc_data_ready(struct sock *udp_sk) trace_rxrpc_rx_packet(sp); - if (sp->hdr.type >= RXRPC_N_PACKET_TYPES || - !((RXRPC_SUPPORTED_PACKET_TYPES >> sp->hdr.type) & 1)) { - _proto("Rx Bad Packet Type %u", sp->hdr.type); - goto bad_message; - } - switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: if (rxrpc_to_client(sp)) @@ -1198,24 +1193,63 @@ void rxrpc_data_ready(struct sock *udp_sk) if (rxrpc_to_server(sp)) goto discard; /* Fall through */ + case RXRPC_PACKET_TYPE_ACK: + case RXRPC_PACKET_TYPE_ACKALL: + if (sp->hdr.callNumber == 0) + goto bad_message; + /* Fall through */ + case RXRPC_PACKET_TYPE_ABORT: + break; case RXRPC_PACKET_TYPE_DATA: - if (sp->hdr.callNumber == 0) + if (sp->hdr.callNumber == 0 || + sp->hdr.seq == 0) goto bad_message; if (sp->hdr.flags & RXRPC_JUMBO_PACKET && !rxrpc_validate_jumbo(skb)) goto bad_message; break; + case RXRPC_PACKET_TYPE_CHALLENGE: + if (rxrpc_to_server(sp)) + goto discard; + break; + case RXRPC_PACKET_TYPE_RESPONSE: + if (rxrpc_to_client(sp)) + goto discard; + break; + /* Packet types 9-11 should just be ignored. */ case RXRPC_PACKET_TYPE_PARAMS: case RXRPC_PACKET_TYPE_10: case RXRPC_PACKET_TYPE_11: goto discard; + + default: + _proto("Rx Bad Packet Type %u", sp->hdr.type); + goto bad_message; } + if (sp->hdr.serviceId == 0) + goto bad_message; + rcu_read_lock(); + if (rxrpc_to_server(sp)) { + /* Weed out packets to services we're not offering. Packets + * that would begin a call are explicitly rejected and the rest + * are just discarded. + */ + rx = rcu_dereference(local->service); + if (!rx || (sp->hdr.serviceId != rx->srx.srx_service && + sp->hdr.serviceId != rx->second_service)) { + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && + sp->hdr.seq == 1) + goto unsupported_service; + goto discard_unlock; + } + } + conn = rxrpc_find_connection_rcu(local, skb); if (conn) { if (sp->hdr.securityIndex != conn->security_ix) @@ -1297,14 +1331,10 @@ void rxrpc_data_ready(struct sock *udp_sk) if (!test_bit(RXRPC_CALL_RX_HEARD, &call->flags)) set_bit(RXRPC_CALL_RX_HEARD, &call->flags); } - } else { - skew = 0; - call = NULL; } if (!call || atomic_read(&call->usage) == 0) { if (rxrpc_to_client(sp) || - sp->hdr.callNumber == 0 || sp->hdr.type != RXRPC_PACKET_TYPE_DATA) goto bad_message_unlock; if (sp->hdr.seq != 1) @@ -1340,6 +1370,13 @@ wrong_security: skb->priority = RXKADINCONSISTENCY; goto post_abort; +unsupported_service: + rcu_read_unlock(); + trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_INVALID_OPERATION, EOPNOTSUPP); + skb->priority = RX_INVALID_OPERATION; + goto post_abort; + reupgrade: rcu_read_unlock(); trace_rxrpc_abort(0, "UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index 93da73bf7098..f9cb83c938f3 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -50,7 +50,6 @@ struct rxrpc_wire_header { #define RXRPC_PACKET_TYPE_10 10 /* Ignored */ #define RXRPC_PACKET_TYPE_11 11 /* Ignored */ #define RXRPC_PACKET_TYPE_VERSION 13 /* version string request */ -#define RXRPC_N_PACKET_TYPES 14 /* number of packet types (incl type 0) */ uint8_t flags; /* packet flags */ #define RXRPC_CLIENT_INITIATED 0x01 /* signifies a packet generated by a client */ @@ -72,20 +71,6 @@ struct rxrpc_wire_header { } __packed; -#define RXRPC_SUPPORTED_PACKET_TYPES ( \ - (1 << RXRPC_PACKET_TYPE_DATA) | \ - (1 << RXRPC_PACKET_TYPE_ACK) | \ - (1 << RXRPC_PACKET_TYPE_BUSY) | \ - (1 << RXRPC_PACKET_TYPE_ABORT) | \ - (1 << RXRPC_PACKET_TYPE_ACKALL) | \ - (1 << RXRPC_PACKET_TYPE_CHALLENGE) | \ - (1 << RXRPC_PACKET_TYPE_RESPONSE) | \ - /*(1 << RXRPC_PACKET_TYPE_DEBUG) | */ \ - (1 << RXRPC_PACKET_TYPE_PARAMS) | \ - (1 << RXRPC_PACKET_TYPE_10) | \ - (1 << RXRPC_PACKET_TYPE_11) | \ - (1 << RXRPC_PACKET_TYPE_VERSION)) - /*****************************************************************************/ /* * jumbo packet secondary header -- cgit v1.2.3 From 0099dc589bfa7caf6f2608c4cbc1181cfee22b0c Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:09 +0100 Subject: rxrpc: Make service call handling more robust Make the following changes to improve the robustness of the code that sets up a new service call: (1) Cache the rxrpc_sock struct obtained in rxrpc_data_ready() to do a service ID check and pass that along to rxrpc_new_incoming_call(). This means that I can remove the check from rxrpc_new_incoming_call() without the need to worry about the socket attached to the local endpoint getting replaced - which would invalidate the check. (2) Cache the rxrpc_peer struct, thereby allowing the peer search to be done once. The peer is passed to rxrpc_new_incoming_call(), thereby saving the need to repeat the search. This also reduces the possibility of rxrpc_publish_service_conn() BUG()'ing due to the detection of a duplicate connection, despite the initial search done by rxrpc_find_connection_rcu() having turned up nothing. This BUG() shouldn't ever get hit since rxrpc_data_ready() *should* be non-reentrant and the result of the initial search should still hold true, but it has proven possible to hit. I *think* this may be due to __rxrpc_lookup_peer_rcu() cutting short the iteration over the hash table if it finds a matching peer with a zero usage count, but I don't know for sure since it's only ever been hit once that I know of. Another possibility is that a bug in rxrpc_data_ready() that checked the wrong byte in the header for the RXRPC_CLIENT_INITIATED flag might've let through a packet that caused a spurious and invalid call to be set up. That is addressed in another patch. (3) Fix __rxrpc_lookup_peer_rcu() to skip peer records that have a zero usage count rather than stopping and returning not found, just in case there's another peer record behind it in the bucket. (4) Don't search the peer records in rxrpc_alloc_incoming_call(), but rather either use the peer cached in (2) or, if one wasn't found, preemptively install a new one. Fixes: 8496af50eb38 ("rxrpc: Use RCU to access a peer's service connection tree") Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 8 +++++--- net/rxrpc/call_accept.c | 41 ++++++++++++----------------------------- net/rxrpc/conn_object.c | 7 ++++++- net/rxrpc/input.c | 7 ++++--- net/rxrpc/peer_object.c | 35 +++++++++++------------------------ 5 files changed, 38 insertions(+), 60 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e8861cb78070..c72686193d83 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -722,6 +722,8 @@ extern struct workqueue_struct *rxrpc_workqueue; int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); void rxrpc_discard_prealloc(struct rxrpc_sock *); struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *, + struct rxrpc_sock *, + struct rxrpc_peer *, struct rxrpc_connection *, struct sk_buff *); void rxrpc_accept_incoming_calls(struct rxrpc_local *); @@ -913,7 +915,8 @@ extern unsigned int rxrpc_closed_conn_expiry; struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, - struct sk_buff *); + struct sk_buff *, + struct rxrpc_peer **); void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_kill_connection(struct rxrpc_connection *); @@ -1049,8 +1052,7 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *, struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *, struct sockaddr_rxrpc *, gfp_t); struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t); -struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *, - struct rxrpc_peer *); +void rxrpc_new_incoming_peer(struct rxrpc_local *, struct rxrpc_peer *); void rxrpc_destroy_all_peers(struct rxrpc_net *); struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *); struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index e88f131c1d7f..9c7f26d06a52 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -249,11 +249,11 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) */ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_local *local, + struct rxrpc_peer *peer, struct rxrpc_connection *conn, struct sk_buff *skb) { struct rxrpc_backlog *b = rx->backlog; - struct rxrpc_peer *peer, *xpeer; struct rxrpc_call *call; unsigned short call_head, conn_head, peer_head; unsigned short call_tail, conn_tail, peer_tail; @@ -276,21 +276,18 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, return NULL; if (!conn) { - /* No connection. We're going to need a peer to start off - * with. If one doesn't yet exist, use a spare from the - * preallocation set. We dump the address into the spare in - * anticipation - and to save on stack space. - */ - xpeer = b->peer_backlog[peer_tail]; - if (rxrpc_extract_addr_from_skb(local, &xpeer->srx, skb) < 0) - return NULL; - - peer = rxrpc_lookup_incoming_peer(local, xpeer); - if (peer == xpeer) { + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (!peer) { + peer = b->peer_backlog[peer_tail]; + if (rxrpc_extract_addr_from_skb(local, &peer->srx, skb) < 0) + return NULL; b->peer_backlog[peer_tail] = NULL; smp_store_release(&b->peer_backlog_tail, (peer_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); + + rxrpc_new_incoming_peer(local, peer); } /* Now allocate and set up the connection */ @@ -335,30 +332,16 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, * The call is returned with the user access mutex held. */ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, + struct rxrpc_sock *rx, + struct rxrpc_peer *peer, struct rxrpc_connection *conn, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_sock *rx; struct rxrpc_call *call; - u16 service_id = sp->hdr.serviceId; _enter(""); - /* Get the socket providing the service */ - rx = rcu_dereference(local->service); - if (rx && (service_id == rx->srx.srx_service || - service_id == rx->second_service)) - goto found_service; - - trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_INVALID_OPERATION, EOPNOTSUPP); - skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; - skb->priority = RX_INVALID_OPERATION; - _leave(" = NULL [service]"); - return NULL; - -found_service: spin_lock(&rx->incoming_lock); if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED || rx->sk.sk_state == RXRPC_CLOSE) { @@ -371,7 +354,7 @@ found_service: goto out; } - call = rxrpc_alloc_incoming_call(rx, local, conn, skb); + call = rxrpc_alloc_incoming_call(rx, local, peer, conn, skb); if (!call) { skb->mark = RXRPC_SKB_MARK_REJECT_BUSY; _leave(" = NULL [busy]"); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 390ba50cfab4..b4438f98dc5c 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -69,10 +69,14 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) * If successful, a pointer to the connection is returned, but no ref is taken. * NULL is returned if there is no match. * + * When searching for a service call, if we find a peer but no connection, we + * return that through *_peer in case we need to create a new service call. + * * The caller must be holding the RCU read lock. */ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, - struct sk_buff *skb) + struct sk_buff *skb, + struct rxrpc_peer **_peer) { struct rxrpc_connection *conn; struct rxrpc_conn_proto k; @@ -104,6 +108,7 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, peer = rxrpc_lookup_peer_rcu(local, &srx); if (!peer) goto not_found; + *_peer = peer; conn = rxrpc_find_service_conn_rcu(peer, skb); if (!conn || atomic_read(&conn->usage) == 0) goto not_found; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index a569e9e010d1..800f5b8a1baa 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1128,7 +1128,8 @@ void rxrpc_data_ready(struct sock *udp_sk) struct rxrpc_call *call = NULL; struct rxrpc_skb_priv *sp; struct rxrpc_local *local = udp_sk->sk_user_data; - struct rxrpc_sock *rx; + struct rxrpc_peer *peer = NULL; + struct rxrpc_sock *rx = NULL; struct sk_buff *skb; unsigned int channel; int ret, skew = 0; @@ -1250,7 +1251,7 @@ void rxrpc_data_ready(struct sock *udp_sk) } } - conn = rxrpc_find_connection_rcu(local, skb); + conn = rxrpc_find_connection_rcu(local, skb, &peer); if (conn) { if (sp->hdr.securityIndex != conn->security_ix) goto wrong_security; @@ -1339,7 +1340,7 @@ void rxrpc_data_ready(struct sock *udp_sk) goto bad_message_unlock; if (sp->hdr.seq != 1) goto discard_unlock; - call = rxrpc_new_incoming_call(local, conn, skb); + call = rxrpc_new_incoming_call(local, rx, peer, conn, skb); if (!call) { rcu_read_unlock(); goto reject_packet; diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 1dc7648e3eff..70083e8fb6e5 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -124,11 +124,9 @@ static struct rxrpc_peer *__rxrpc_lookup_peer_rcu( struct rxrpc_net *rxnet = local->rxnet; hash_for_each_possible_rcu(rxnet->peer_hash, peer, hash_link, hash_key) { - if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0) { - if (atomic_read(&peer->usage) == 0) - return NULL; + if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0 && + atomic_read(&peer->usage) > 0) return peer; - } } return NULL; @@ -299,34 +297,23 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, } /* - * Set up a new incoming peer. The address is prestored in the preallocated - * peer. + * Set up a new incoming peer. There shouldn't be any other matching peers + * since we've already done a search in the list from the non-reentrant context + * (the data_ready handler) that is the only place we can add new peers. */ -struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local, - struct rxrpc_peer *prealloc) +void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer) { - struct rxrpc_peer *peer; struct rxrpc_net *rxnet = local->rxnet; unsigned long hash_key; - hash_key = rxrpc_peer_hash_key(local, &prealloc->srx); - prealloc->local = local; - rxrpc_init_peer(prealloc, hash_key); + hash_key = rxrpc_peer_hash_key(local, &peer->srx); + peer->local = local; + rxrpc_init_peer(peer, hash_key); spin_lock(&rxnet->peer_hash_lock); - - /* Need to check that we aren't racing with someone else */ - peer = __rxrpc_lookup_peer_rcu(local, &prealloc->srx, hash_key); - if (peer && !rxrpc_get_peer_maybe(peer)) - peer = NULL; - if (!peer) { - peer = prealloc; - hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); - list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new); - } - + hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); + list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new); spin_unlock(&rxnet->peer_hash_lock); - return peer; } /* -- cgit v1.2.3 From 37a675e768d7606fe8a53e0c459c9b53e121ac20 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:09 +0100 Subject: rxrpc: Fix transport sockopts to get IPv4 errors on an IPv6 socket It seems that enabling IPV6_RECVERR on an IPv6 socket doesn't also turn on IP_RECVERR, so neither local errors nor ICMP-transported remote errors from IPv4 peer addresses are returned to the AF_RXRPC protocol. Make the sockopt setting code in rxrpc_open_socket() fall through from the AF_INET6 case to the AF_INET case to turn on all the AF_INET options too in the AF_INET6 case. Fixes: f2aeed3a591f ("rxrpc: Fix error reception on AF_INET6 sockets") Signed-off-by: David Howells --- net/rxrpc/local_object.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 81de7d889ffa..94d234e9c685 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -135,10 +135,10 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } switch (local->srx.transport.family) { - case AF_INET: - /* we want to receive ICMP errors */ + case AF_INET6: + /* we want to receive ICMPv6 errors */ opt = 1; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR, + ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_RECVERR, (char *) &opt, sizeof(opt)); if (ret < 0) { _debug("setsockopt failed"); @@ -146,19 +146,22 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } /* we want to set the don't fragment bit */ - opt = IP_PMTUDISC_DO; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER, + opt = IPV6_PMTUDISC_DO; + ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_MTU_DISCOVER, (char *) &opt, sizeof(opt)); if (ret < 0) { _debug("setsockopt failed"); goto error; } - break; - case AF_INET6: + /* Fall through and set IPv4 options too otherwise we don't get + * errors from IPv4 packets sent through the IPv6 socket. + */ + + case AF_INET: /* we want to receive ICMP errors */ opt = 1; - ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_RECVERR, + ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR, (char *) &opt, sizeof(opt)); if (ret < 0) { _debug("setsockopt failed"); @@ -166,8 +169,8 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } /* we want to set the don't fragment bit */ - opt = IPV6_PMTUDISC_DO; - ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_MTU_DISCOVER, + opt = IP_PMTUDISC_DO; + ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER, (char *) &opt, sizeof(opt)); if (ret < 0) { _debug("setsockopt failed"); -- cgit v1.2.3 From f334430316e7fd37c4821ebec627e27714bb5d76 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:09 +0100 Subject: rxrpc: Fix error distribution Fix error distribution by immediately delivering the errors to all the affected calls rather than deferring them to a worker thread. The problem with the latter is that retries and things can happen in the meantime when we want to stop that sooner. To this end: (1) Stop the error distributor from removing calls from the error_targets list so that peer->lock isn't needed to synchronise against other adds and removals. (2) Require the peer's error_targets list to be accessed with RCU, thereby avoiding the need to take peer->lock over distribution. (3) Don't attempt to affect a call's state if it is already marked complete. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 4 +--- net/rxrpc/ar-internal.h | 5 ----- net/rxrpc/call_object.c | 2 +- net/rxrpc/conn_client.c | 4 ++-- net/rxrpc/conn_object.c | 2 +- net/rxrpc/peer_event.c | 46 +++++++++++--------------------------------- net/rxrpc/peer_object.c | 17 ---------------- 7 files changed, 16 insertions(+), 64 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 196587b8f204..837393fa897b 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -56,7 +56,6 @@ enum rxrpc_peer_trace { rxrpc_peer_new, rxrpc_peer_processing, rxrpc_peer_put, - rxrpc_peer_queued_error, }; enum rxrpc_conn_trace { @@ -257,8 +256,7 @@ enum rxrpc_tx_point { EM(rxrpc_peer_got, "GOT") \ EM(rxrpc_peer_new, "NEW") \ EM(rxrpc_peer_processing, "PRO") \ - EM(rxrpc_peer_put, "PUT") \ - E_(rxrpc_peer_queued_error, "QER") + E_(rxrpc_peer_put, "PUT") #define rxrpc_conn_traces \ EM(rxrpc_conn_got, "GOT") \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c72686193d83..ef9554131434 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -288,7 +288,6 @@ struct rxrpc_peer { struct hlist_node hash_link; struct rxrpc_local *local; struct hlist_head error_targets; /* targets for net error distribution */ - struct work_struct error_distributor; struct rb_root service_conns; /* Service connections */ struct list_head keepalive_link; /* Link in net->peer_keepalive[] */ time64_t last_tx_at; /* Last time packet sent here */ @@ -299,8 +298,6 @@ struct rxrpc_peer { unsigned int maxdata; /* data size (MTU - hdrsize) */ unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ int debug_id; /* debug ID for printks */ - int error_report; /* Net (+0) or local (+1000000) to distribute */ -#define RXRPC_LOCAL_ERROR_OFFSET 1000000 struct sockaddr_rxrpc srx; /* remote address */ /* calculated RTT cache */ @@ -1039,7 +1036,6 @@ void rxrpc_send_keepalive(struct rxrpc_peer *); * peer_event.c */ void rxrpc_error_report(struct sock *); -void rxrpc_peer_error_distributor(struct work_struct *); void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); void rxrpc_peer_keepalive_worker(struct work_struct *); @@ -1057,7 +1053,6 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *); struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *); struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *); void rxrpc_put_peer(struct rxrpc_peer *); -void __rxrpc_queue_peer_error(struct rxrpc_peer *); /* * proc.c diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 9486293fef5c..799f75b6900d 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -400,7 +400,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, rcu_assign_pointer(conn->channels[chan].call, call); spin_lock(&conn->params.peer->lock); - hlist_add_head(&call->error_link, &conn->params.peer->error_targets); + hlist_add_head_rcu(&call->error_link, &conn->params.peer->error_targets); spin_unlock(&conn->params.peer->lock); _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index f8f37188a932..8acf74fe24c0 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -710,8 +710,8 @@ int rxrpc_connect_call(struct rxrpc_call *call, } spin_lock_bh(&call->conn->params.peer->lock); - hlist_add_head(&call->error_link, - &call->conn->params.peer->error_targets); + hlist_add_head_rcu(&call->error_link, + &call->conn->params.peer->error_targets); spin_unlock_bh(&call->conn->params.peer->lock); out: diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index b4438f98dc5c..885dae829f4a 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -216,7 +216,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) call->peer->cong_cwnd = call->cong_cwnd; spin_lock_bh(&conn->params.peer->lock); - hlist_del_init(&call->error_link); + hlist_del_rcu(&call->error_link); spin_unlock_bh(&conn->params.peer->lock); if (rxrpc_is_client_call(call)) diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 4f9da2f51c69..f3e6fc670da2 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -23,6 +23,8 @@ #include "ar-internal.h" static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *); +static void rxrpc_distribute_error(struct rxrpc_peer *, int, + enum rxrpc_call_completion); /* * Find the peer associated with an ICMP packet. @@ -194,8 +196,6 @@ void rxrpc_error_report(struct sock *sk) rcu_read_unlock(); rxrpc_free_skb(skb, rxrpc_skb_rx_freed); - /* The ref we obtained is passed off to the work item */ - __rxrpc_queue_peer_error(peer); _leave(""); } @@ -205,6 +205,7 @@ void rxrpc_error_report(struct sock *sk) static void rxrpc_store_error(struct rxrpc_peer *peer, struct sock_exterr_skb *serr) { + enum rxrpc_call_completion compl = RXRPC_CALL_NETWORK_ERROR; struct sock_extended_err *ee; int err; @@ -255,7 +256,7 @@ static void rxrpc_store_error(struct rxrpc_peer *peer, case SO_EE_ORIGIN_NONE: case SO_EE_ORIGIN_LOCAL: _proto("Rx Received local error { error=%d }", err); - err += RXRPC_LOCAL_ERROR_OFFSET; + compl = RXRPC_CALL_LOCAL_ERROR; break; case SO_EE_ORIGIN_ICMP6: @@ -264,48 +265,23 @@ static void rxrpc_store_error(struct rxrpc_peer *peer, break; } - peer->error_report = err; + rxrpc_distribute_error(peer, err, compl); } /* - * Distribute an error that occurred on a peer + * Distribute an error that occurred on a peer. */ -void rxrpc_peer_error_distributor(struct work_struct *work) +static void rxrpc_distribute_error(struct rxrpc_peer *peer, int error, + enum rxrpc_call_completion compl) { - struct rxrpc_peer *peer = - container_of(work, struct rxrpc_peer, error_distributor); struct rxrpc_call *call; - enum rxrpc_call_completion compl; - int error; - - _enter(""); - - error = READ_ONCE(peer->error_report); - if (error < RXRPC_LOCAL_ERROR_OFFSET) { - compl = RXRPC_CALL_NETWORK_ERROR; - } else { - compl = RXRPC_CALL_LOCAL_ERROR; - error -= RXRPC_LOCAL_ERROR_OFFSET; - } - _debug("ISSUE ERROR %s %d", rxrpc_call_completions[compl], error); - - spin_lock_bh(&peer->lock); - - while (!hlist_empty(&peer->error_targets)) { - call = hlist_entry(peer->error_targets.first, - struct rxrpc_call, error_link); - hlist_del_init(&call->error_link); + hlist_for_each_entry_rcu(call, &peer->error_targets, error_link) { rxrpc_see_call(call); - - if (rxrpc_set_call_completion(call, compl, 0, -error)) + if (call->state < RXRPC_CALL_COMPLETE && + rxrpc_set_call_completion(call, compl, 0, -error)) rxrpc_notify_socket(call); } - - spin_unlock_bh(&peer->lock); - - rxrpc_put_peer(peer); - _leave(""); } /* diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 70083e8fb6e5..01a9febfa367 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -220,8 +220,6 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) atomic_set(&peer->usage, 1); peer->local = local; INIT_HLIST_HEAD(&peer->error_targets); - INIT_WORK(&peer->error_distributor, - &rxrpc_peer_error_distributor); peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); @@ -402,21 +400,6 @@ struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) return peer; } -/* - * Queue a peer record. This passes the caller's ref to the workqueue. - */ -void __rxrpc_queue_peer_error(struct rxrpc_peer *peer) -{ - const void *here = __builtin_return_address(0); - int n; - - n = atomic_read(&peer->usage); - if (rxrpc_queue_work(&peer->error_distributor)) - trace_rxrpc_peer(peer, rxrpc_peer_queued_error, n, here); - else - rxrpc_put_peer(peer); -} - /* * Discard a peer record. */ -- cgit v1.2.3 From a13f814a67b12a2f29d1decf4b4f4e700658a517 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 30 Aug 2018 17:56:52 +0900 Subject: netfilter: nft_set_rbtree: add missing rb_erase() in GC routine The nft_set_gc_batch_check() checks whether gc buffer is full. If gc buffer is full, gc buffer is released by the nft_set_gc_batch_complete() internally. In case of rbtree, the rb_erase() should be called before calling the nft_set_gc_batch_complete(). therefore the rb_erase() should be called before calling the nft_set_gc_batch_check() too. test commands: table ip filter { set set1 { type ipv4_addr; flags interval, timeout; gc-interval 10s; timeout 1s; elements = { 1-2, 3-4, 5-6, ... 10000-10001, } } } %nft -f test.nft splat looks like: [ 430.273885] kasan: GPF could be caused by NULL-ptr deref or user memory access [ 430.282158] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI [ 430.283116] CPU: 1 PID: 190 Comm: kworker/1:2 Tainted: G B 4.18.0+ #7 [ 430.283116] Workqueue: events_power_efficient nft_rbtree_gc [nf_tables_set] [ 430.313559] RIP: 0010:rb_next+0x81/0x130 [ 430.313559] Code: 08 49 bd 00 00 00 00 00 fc ff df 48 bb 00 00 00 00 00 fc ff df 48 85 c0 75 05 eb 58 48 89 d4 [ 430.313559] RSP: 0018:ffff88010cdb7680 EFLAGS: 00010207 [ 430.313559] RAX: 0000000000b84854 RBX: dffffc0000000000 RCX: ffffffff83f01973 [ 430.313559] RDX: 000000000017090c RSI: 0000000000000008 RDI: 0000000000b84864 [ 430.313559] RBP: ffff8801060d4588 R08: fffffbfff09bc349 R09: fffffbfff09bc349 [ 430.313559] R10: 0000000000000001 R11: fffffbfff09bc348 R12: ffff880100f081a8 [ 430.313559] R13: dffffc0000000000 R14: ffff880100ff8688 R15: dffffc0000000000 [ 430.313559] FS: 0000000000000000(0000) GS:ffff88011b400000(0000) knlGS:0000000000000000 [ 430.313559] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 430.313559] CR2: 0000000001551008 CR3: 000000005dc16000 CR4: 00000000001006e0 [ 430.313559] Call Trace: [ 430.313559] nft_rbtree_gc+0x112/0x5c0 [nf_tables_set] [ 430.313559] process_one_work+0xc13/0x1ec0 [ 430.313559] ? _raw_spin_unlock_irq+0x29/0x40 [ 430.313559] ? pwq_dec_nr_in_flight+0x3c0/0x3c0 [ 430.313559] ? set_load_weight+0x270/0x270 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x40/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x40/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x40/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x40/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __schedule+0x6d3/0x1f50 [ 430.313559] ? find_held_lock+0x39/0x1c0 [ 430.313559] ? __sched_text_start+0x8/0x8 [ 430.313559] ? cyc2ns_read_end+0x10/0x10 [ 430.313559] ? save_trace+0x300/0x300 [ 430.313559] ? sched_clock_local+0xd4/0x140 [ 430.313559] ? find_held_lock+0x39/0x1c0 [ 430.313559] ? worker_thread+0x353/0x1120 [ 430.313559] ? worker_thread+0x353/0x1120 [ 430.313559] ? lock_contended+0xe70/0xe70 [ 430.313559] ? __lock_acquire+0x4500/0x4500 [ 430.535635] ? do_raw_spin_unlock+0xa5/0x330 [ 430.535635] ? do_raw_spin_trylock+0x101/0x1a0 [ 430.535635] ? do_raw_spin_lock+0x1f0/0x1f0 [ 430.535635] ? _raw_spin_lock_irq+0x10/0x70 [ 430.535635] worker_thread+0x15d/0x1120 [ ... ] Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 55e2d9215c0d..0e5ec126f6ad 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -355,12 +355,11 @@ cont: static void nft_rbtree_gc(struct work_struct *work) { + struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL; struct nft_set_gc_batch *gcb = NULL; - struct rb_node *node, *prev = NULL; - struct nft_rbtree_elem *rbe; struct nft_rbtree *priv; + struct rb_node *node; struct nft_set *set; - int i; priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); @@ -371,7 +370,7 @@ static void nft_rbtree_gc(struct work_struct *work) rbe = rb_entry(node, struct nft_rbtree_elem, node); if (nft_rbtree_interval_end(rbe)) { - prev = node; + rbe_end = rbe; continue; } if (!nft_set_elem_expired(&rbe->ext)) @@ -379,29 +378,30 @@ static void nft_rbtree_gc(struct work_struct *work) if (nft_set_elem_mark_busy(&rbe->ext)) continue; + if (rbe_prev) { + rb_erase(&rbe_prev->node, &priv->root); + rbe_prev = NULL; + } gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); if (!gcb) break; atomic_dec(&set->nelems); nft_set_gc_batch_add(gcb, rbe); + rbe_prev = rbe; - if (prev) { - rbe = rb_entry(prev, struct nft_rbtree_elem, node); + if (rbe_end) { atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe); - prev = NULL; + nft_set_gc_batch_add(gcb, rbe_end); + rb_erase(&rbe_end->node, &priv->root); + rbe_end = NULL; } node = rb_next(node); if (!node) break; } - if (gcb) { - for (i = 0; i < gcb->head.cnt; i++) { - rbe = gcb->elems[i]; - rb_erase(&rbe->node, &priv->root); - } - } + if (rbe_prev) + rb_erase(&rbe_prev->node, &priv->root); write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); -- cgit v1.2.3 From 40e4f26e6a14fc1496eabb8b0004a547303114e6 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Thu, 27 Sep 2018 19:36:28 -0300 Subject: netfilter: xt_socket: check sk before checking for netns. Only check for the network namespace if the socket is available. Fixes: f564650106a6 ("netfilter: check if the socket netns is correct.") Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Flavio Leitner Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 0472f3472842..ada144e5645b 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -56,7 +56,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; - if (!net_eq(xt_net(par), sock_net(sk))) + if (sk && !net_eq(xt_net(par), sock_net(sk))) sk = NULL; if (!sk) @@ -117,7 +117,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; - if (!net_eq(xt_net(par), sock_net(sk))) + if (sk && !net_eq(xt_net(par), sock_net(sk))) sk = NULL; if (!sk) -- cgit v1.2.3 From c24498c6827b71f80fecc9fb1b70a792053d41a9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Sep 2018 09:31:51 -0700 Subject: netpoll: do not test NAPI_STATE_SCHED in poll_one_napi() Since we do no longer require NAPI drivers to provide an ndo_poll_controller(), napi_schedule() has not been done before poll_one_napi() invocation. So testing NAPI_STATE_SCHED is likely to cause early returns. While we are at it, remove outdated comment. Note to future bisections : This change might surface prior bugs in drivers. See commit 73f21c653f93 ("bnxt_en: Fix TX timeout during netpoll.") for one occurrence. Fixes: ac3d9dd034e5 ("netpoll: make ndo_poll_controller() optional") Signed-off-by: Eric Dumazet Tested-by: Song Liu Cc: Michael Chan Signed-off-by: David S. Miller --- net/core/netpoll.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 3219a2932463..3ae899805f8b 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -135,27 +135,9 @@ static void queue_process(struct work_struct *work) } } -/* - * Check whether delayed processing was scheduled for our NIC. If so, - * we attempt to grab the poll lock and use ->poll() to pump the card. - * If this fails, either we've recursed in ->poll() or it's already - * running on another CPU. - * - * Note: we don't mask interrupts with this lock because we're using - * trylock here and interrupts are already disabled in the softirq - * case. Further, we test the poll_owner to avoid recursion on UP - * systems where the lock doesn't exist. - */ static void poll_one_napi(struct napi_struct *napi) { - int work = 0; - - /* net_rx_action's ->poll() invocations and our's are - * synchronized by this test which is only made while - * holding the napi->poll_lock. - */ - if (!test_bit(NAPI_STATE_SCHED, &napi->state)) - return; + int work; /* If we set this bit but see that it has already been set, * that indicates that napi has been disabled and we need -- cgit v1.2.3 From 30d65e0804d58a03d1a8ea4e12c6fc07ed08218b Mon Sep 17 00:00:00 2001 From: Matias Karhumaa Date: Fri, 28 Sep 2018 21:54:30 +0300 Subject: Bluetooth: Fix debugfs NULL pointer dereference Fix crash caused by NULL pointer dereference when debugfs functions le_max_key_read, le_max_key_size_write, le_min_key_size_read or le_min_key_size_write and Bluetooth adapter was powered off. Fix is to move max_key_size and min_key_size from smp_dev to hci_dev. At the same time they were renamed to le_max_key_size and le_min_key_size. BUG: unable to handle kernel NULL pointer dereference at 00000000000002e8 PGD 0 P4D 0 Oops: 0000 [#24] SMP PTI CPU: 2 PID: 6255 Comm: cat Tainted: G D OE 4.18.9-200.fc28.x86_64 #1 Hardware name: LENOVO 4286CTO/4286CTO, BIOS 8DET76WW (1.46 ) 06/21/2018 RIP: 0010:le_max_key_size_read+0x45/0xb0 [bluetooth] Code: 00 00 00 48 83 ec 10 65 48 8b 04 25 28 00 00 00 48 89 44 24 08 31 c0 48 8b 87 c8 00 00 00 48 8d 7c 24 04 48 8b 80 48 0a 00 00 <48> 8b 80 e8 02 00 00 0f b6 48 52 e8 fb b6 b3 ed be 04 00 00 00 48 RSP: 0018:ffffab23c3ff3df0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 00007f0b4ca2e000 RCX: ffffab23c3ff3f08 RDX: ffffffffc0ddb033 RSI: 0000000000000004 RDI: ffffab23c3ff3df4 RBP: 0000000000020000 R08: 0000000000000000 R09: 0000000000000000 R10: ffffab23c3ff3ed8 R11: 0000000000000000 R12: ffffab23c3ff3f08 R13: 00007f0b4ca2e000 R14: 0000000000020000 R15: ffffab23c3ff3f08 FS: 00007f0b4ca0f540(0000) GS:ffff91bd5e280000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000002e8 CR3: 00000000629fa006 CR4: 00000000000606e0 Call Trace: full_proxy_read+0x53/0x80 __vfs_read+0x36/0x180 vfs_read+0x8a/0x140 ksys_read+0x4f/0xb0 do_syscall_64+0x5b/0x160 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Signed-off-by: Matias Karhumaa Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 ++ net/bluetooth/hci_core.c | 2 ++ net/bluetooth/smp.c | 23 +++++++++-------------- 3 files changed, 13 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 9b0f821b2d3a..e5ea633ea368 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -267,6 +267,8 @@ struct hci_dev { __u16 le_max_tx_time; __u16 le_max_rx_len; __u16 le_max_rx_time; + __u8 le_max_key_size; + __u8 le_min_key_size; __u16 discov_interleaved_timeout; __u16 conn_info_min_age; __u16 conn_info_max_age; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 0f1a8820d75c..7352fe85674b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3147,6 +3147,8 @@ struct hci_dev *hci_alloc_dev(void) hdev->le_max_tx_time = 0x0148; hdev->le_max_rx_len = 0x001b; hdev->le_max_rx_time = 0x0148; + hdev->le_max_key_size = SMP_MAX_ENC_KEY_SIZE; + hdev->le_min_key_size = SMP_MIN_ENC_KEY_SIZE; hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 3a7b0773536b..090670fe385f 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -88,9 +88,6 @@ struct smp_dev { u8 local_rand[16]; bool debug_key; - u8 min_key_size; - u8 max_key_size; - struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; struct crypto_kpp *tfm_ecdh; @@ -720,7 +717,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn, if (rsp == NULL) { req->io_capability = conn->hcon->io_capability; req->oob_flag = oob_flag; - req->max_key_size = SMP_DEV(hdev)->max_key_size; + req->max_key_size = hdev->le_max_key_size; req->init_key_dist = local_dist; req->resp_key_dist = remote_dist; req->auth_req = (authreq & AUTH_REQ_MASK(hdev)); @@ -731,7 +728,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn, rsp->io_capability = conn->hcon->io_capability; rsp->oob_flag = oob_flag; - rsp->max_key_size = SMP_DEV(hdev)->max_key_size; + rsp->max_key_size = hdev->le_max_key_size; rsp->init_key_dist = req->init_key_dist & remote_dist; rsp->resp_key_dist = req->resp_key_dist & local_dist; rsp->auth_req = (authreq & AUTH_REQ_MASK(hdev)); @@ -745,7 +742,7 @@ static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) struct hci_dev *hdev = conn->hcon->hdev; struct smp_chan *smp = chan->data; - if (max_key_size > SMP_DEV(hdev)->max_key_size || + if (max_key_size > hdev->le_max_key_size || max_key_size < SMP_MIN_ENC_KEY_SIZE) return SMP_ENC_KEY_SIZE; @@ -3243,8 +3240,6 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) smp->tfm_aes = tfm_aes; smp->tfm_cmac = tfm_cmac; smp->tfm_ecdh = tfm_ecdh; - smp->min_key_size = SMP_MIN_ENC_KEY_SIZE; - smp->max_key_size = SMP_MAX_ENC_KEY_SIZE; create_chan: chan = l2cap_chan_create(); @@ -3370,7 +3365,7 @@ static ssize_t le_min_key_size_read(struct file *file, struct hci_dev *hdev = file->private_data; char buf[4]; - snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->min_key_size); + snprintf(buf, sizeof(buf), "%2u\n", hdev->le_min_key_size); return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); } @@ -3391,11 +3386,11 @@ static ssize_t le_min_key_size_write(struct file *file, sscanf(buf, "%hhu", &key_size); - if (key_size > SMP_DEV(hdev)->max_key_size || + if (key_size > hdev->le_max_key_size || key_size < SMP_MIN_ENC_KEY_SIZE) return -EINVAL; - SMP_DEV(hdev)->min_key_size = key_size; + hdev->le_min_key_size = key_size; return count; } @@ -3414,7 +3409,7 @@ static ssize_t le_max_key_size_read(struct file *file, struct hci_dev *hdev = file->private_data; char buf[4]; - snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->max_key_size); + snprintf(buf, sizeof(buf), "%2u\n", hdev->le_max_key_size); return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); } @@ -3436,10 +3431,10 @@ static ssize_t le_max_key_size_write(struct file *file, sscanf(buf, "%hhu", &key_size); if (key_size > SMP_MAX_ENC_KEY_SIZE || - key_size < SMP_DEV(hdev)->min_key_size) + key_size < hdev->le_min_key_size) return -EINVAL; - SMP_DEV(hdev)->max_key_size = key_size; + hdev->le_max_key_size = key_size; return count; } -- cgit v1.2.3 From c140eb166d681f66bd7e99fb121357db1a503e7f Mon Sep 17 00:00:00 2001 From: LUU Duc Canh Date: Wed, 26 Sep 2018 21:00:54 +0200 Subject: tipc: fix failover problem We see the following scenario: 1) Link endpoint B on node 1 discovers that its peer endpoint is gone. Since there is a second working link, failover procedure is started. 2) Link endpoint A on node 1 sends a FAILOVER message to peer endpoint A on node 2. The node item 1->2 goes to state FAILINGOVER. 3) Linke endpoint A/2 receives the failover, and is supposed to take down its parallell link endpoint B/2, while producing a FAILOVER message to send back to A/1. 4) However, B/2 has already been deleted, so no FAILOVER message can created. 5) Node 1->2 remains in state FAILINGOVER forever, refusing to receive any messages that can bring B/1 up again. We are left with a non- redundant link between node 1 and 2. We fix this with letting endpoint A/2 build a dummy FAILOVER message to send to back to A/1, so that the situation can be resolved. Signed-off-by: LUU Duc Canh Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 35 +++++++++++++++++++++++++++++++++++ net/tipc/link.h | 3 +++ net/tipc/node.c | 11 +++++++++++ 3 files changed, 49 insertions(+) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 26cc033ee167..4ed650ce6e61 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -410,6 +410,11 @@ char *tipc_link_name(struct tipc_link *l) return l->name; } +u32 tipc_link_state(struct tipc_link *l) +{ + return l->state; +} + /** * tipc_link_create - create a new link * @n: pointer to associated node @@ -1385,6 +1390,36 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, __skb_queue_tail(xmitq, skb); } +void tipc_link_create_dummy_tnl_msg(struct tipc_link *l, + struct sk_buff_head *xmitq) +{ + u32 onode = tipc_own_addr(l->net); + struct tipc_msg *hdr, *ihdr; + struct sk_buff_head tnlq; + struct sk_buff *skb; + u32 dnode = l->addr; + + skb_queue_head_init(&tnlq); + skb = tipc_msg_create(TUNNEL_PROTOCOL, FAILOVER_MSG, + INT_H_SIZE, BASIC_H_SIZE, + dnode, onode, 0, 0, 0); + if (!skb) { + pr_warn("%sunable to create tunnel packet\n", link_co_err); + return; + } + + hdr = buf_msg(skb); + msg_set_msgcnt(hdr, 1); + msg_set_bearer_id(hdr, l->peer_bearer_id); + + ihdr = (struct tipc_msg *)msg_data(hdr); + tipc_msg_init(onode, ihdr, TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, + BASIC_H_SIZE, dnode); + msg_set_errcode(ihdr, TIPC_ERR_NO_PORT); + __skb_queue_tail(&tnlq, skb); + tipc_link_xmit(l, &tnlq, xmitq); +} + /* tipc_link_tnl_prepare(): prepare and return a list of tunnel packets * with contents of the link's transmit and backlog queues. */ diff --git a/net/tipc/link.h b/net/tipc/link.h index 7bc494a33fdf..90488c538a4e 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -88,6 +88,8 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, struct tipc_link **link); void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, int mtyp, struct sk_buff_head *xmitq); +void tipc_link_create_dummy_tnl_msg(struct tipc_link *tnl, + struct sk_buff_head *xmitq); void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq); int tipc_link_fsm_evt(struct tipc_link *l, int evt); bool tipc_link_is_up(struct tipc_link *l); @@ -107,6 +109,7 @@ u16 tipc_link_rcv_nxt(struct tipc_link *l); u16 tipc_link_acked(struct tipc_link *l); u32 tipc_link_id(struct tipc_link *l); char *tipc_link_name(struct tipc_link *l); +u32 tipc_link_state(struct tipc_link *l); char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); int tipc_link_window(struct tipc_link *l); diff --git a/net/tipc/node.c b/net/tipc/node.c index 68014f1b6976..b0ee25f1f2e6 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -111,6 +111,7 @@ struct tipc_node { int action_flags; struct list_head list; int state; + bool failover_sent; u16 sync_point; int link_cnt; u16 working_links; @@ -680,6 +681,7 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, *slot0 = bearer_id; *slot1 = bearer_id; tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT); + n->failover_sent = false; n->action_flags |= TIPC_NOTIFY_NODE_UP; tipc_link_set_active(nl, true); tipc_bcast_add_peer(n->net, nl, xmitq); @@ -1615,6 +1617,15 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl), tipc_link_inputq(l)); } + /* If parallel link was already down, and this happened before + * the tunnel link came up, FAILOVER was never sent. Ensure that + * FAILOVER is sent to get peer out of NODE_FAILINGOVER state. + */ + if (n->state != NODE_FAILINGOVER && !n->failover_sent) { + tipc_link_create_dummy_tnl_msg(l, xmitq); + n->failover_sent = true; + } + /* If pkts arrive out of order, use lowest calculated syncpt */ if (less(syncpt, n->sync_point)) n->sync_point = syncpt; -- cgit v1.2.3 From 848e616e66d4592fe9afc40743d3504deb7632b4 Mon Sep 17 00:00:00 2001 From: Stefan Seyfried Date: Sun, 30 Sep 2018 12:53:00 +0200 Subject: cfg80211: fix wext-compat memory leak cfg80211_wext_giwrate and sinfo.pertid might allocate sinfo.pertid via rdev_get_station(), but never release it. Fix that. Fixes: 8689c051a201 ("cfg80211: dynamically allocate per-tid stats for station info") Signed-off-by: Stefan Seyfried [johannes: fix error path, use cfg80211_sinfo_release_content(), add Fixes] Signed-off-by: Johannes Berg --- net/wireless/wext-compat.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 167f7025ac98..06943d9c9835 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -1278,12 +1278,16 @@ static int cfg80211_wext_giwrate(struct net_device *dev, if (err) return err; - if (!(sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) - return -EOPNOTSUPP; + if (!(sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) { + err = -EOPNOTSUPP; + goto free; + } rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate); - return 0; +free: + cfg80211_sinfo_release_content(&sinfo); + return err; } /* Get wireless statistics. Called by /proc/net/wireless and by SIOCGIWSTATS */ @@ -1293,7 +1297,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); /* we are under RTNL - globally locked - so can use static structs */ static struct iw_statistics wstats; - static struct station_info sinfo; + static struct station_info sinfo = {}; u8 bssid[ETH_ALEN]; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) @@ -1352,6 +1356,8 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED)) wstats.discard.retries = sinfo.tx_failed; + cfg80211_sinfo_release_content(&sinfo); + return &wstats; } -- cgit v1.2.3 From 211710ca74adf790b46ab3867fcce8047b573cd1 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 29 Sep 2018 16:01:58 +0200 Subject: mac80211: fix setting IEEE80211_KEY_FLAG_RX_MGMT for AP mode keys key->sta is only valid after ieee80211_key_link, which is called later in this function. Because of that, the IEEE80211_KEY_FLAG_RX_MGMT is never set when management frame protection is enabled. Fixes: e548c49e6dc6b ("mac80211: add key flag for management keys") Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index d25da0e66da1..5d22eda8a6b1 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -427,7 +427,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: /* Keys without a station are used for TX only */ - if (key->sta && test_sta_flag(key->sta, WLAN_STA_MFP)) + if (sta && test_sta_flag(sta, WLAN_STA_MFP)) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; break; case NL80211_IFTYPE_ADHOC: -- cgit v1.2.3 From 1db58529454742f67ebd96e3588315e880b72837 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 27 Sep 2018 17:05:04 -0600 Subject: cfg80211: fix use-after-free in reg_process_hint() reg_process_hint_country_ie() can free regulatory_request and return REG_REQ_ALREADY_SET. We shouldn't use regulatory_request after it's called. KASAN error was observed when this happens. BUG: KASAN: use-after-free in reg_process_hint+0x839/0x8aa [cfg80211] Read of size 4 at addr ffff8800c430d434 by task kworker/1:3/89 Workqueue: events reg_todo [cfg80211] Call Trace: dump_stack+0xc1/0x10c ? _atomic_dec_and_lock+0x1ad/0x1ad ? _raw_spin_lock_irqsave+0xa0/0xd2 print_address_description+0x86/0x26f ? reg_process_hint+0x839/0x8aa [cfg80211] kasan_report+0x241/0x29b reg_process_hint+0x839/0x8aa [cfg80211] reg_todo+0x204/0x5b9 [cfg80211] process_one_work+0x55f/0x8d0 ? worker_detach_from_pool+0x1b5/0x1b5 ? _raw_spin_unlock_irq+0x65/0xdd ? _raw_spin_unlock_irqrestore+0xf3/0xf3 worker_thread+0x5dd/0x841 ? kthread_parkme+0x1d/0x1d kthread+0x270/0x285 ? pr_cont_work+0xe3/0xe3 ? rcu_read_unlock_sched_notrace+0xca/0xca ret_from_fork+0x22/0x40 Allocated by task 2718: set_track+0x63/0xfa __kmalloc+0x119/0x1ac regulatory_hint_country_ie+0x38/0x329 [cfg80211] __cfg80211_connect_result+0x854/0xadd [cfg80211] cfg80211_rx_assoc_resp+0x3bc/0x4f0 [cfg80211] smsc95xx v1.0.6 ieee80211_sta_rx_queued_mgmt+0x1803/0x7ed5 [mac80211] ieee80211_iface_work+0x411/0x696 [mac80211] process_one_work+0x55f/0x8d0 worker_thread+0x5dd/0x841 kthread+0x270/0x285 ret_from_fork+0x22/0x40 Freed by task 89: set_track+0x63/0xfa kasan_slab_free+0x6a/0x87 kfree+0xdc/0x470 reg_process_hint+0x31e/0x8aa [cfg80211] reg_todo+0x204/0x5b9 [cfg80211] process_one_work+0x55f/0x8d0 worker_thread+0x5dd/0x841 kthread+0x270/0x285 ret_from_fork+0x22/0x40 Signed-off-by: Yu Zhao Signed-off-by: Johannes Berg --- net/wireless/reg.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 765dedb12361..24cfa2776f50 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2661,11 +2661,12 @@ static void reg_process_hint(struct regulatory_request *reg_request) { struct wiphy *wiphy = NULL; enum reg_request_treatment treatment; + enum nl80211_reg_initiator initiator = reg_request->initiator; if (reg_request->wiphy_idx != WIPHY_IDX_INVALID) wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx); - switch (reg_request->initiator) { + switch (initiator) { case NL80211_REGDOM_SET_BY_CORE: treatment = reg_process_hint_core(reg_request); break; @@ -2683,7 +2684,7 @@ static void reg_process_hint(struct regulatory_request *reg_request) treatment = reg_process_hint_country_ie(wiphy, reg_request); break; default: - WARN(1, "invalid initiator %d\n", reg_request->initiator); + WARN(1, "invalid initiator %d\n", initiator); goto out_free; } @@ -2698,7 +2699,7 @@ static void reg_process_hint(struct regulatory_request *reg_request) */ if (treatment == REG_REQ_ALREADY_SET && wiphy && wiphy->regulatory_flags & REGULATORY_STRICT_REG) { - wiphy_update_regulatory(wiphy, reg_request->initiator); + wiphy_update_regulatory(wiphy, initiator); wiphy_all_share_dfs_chan_state(wiphy); reg_check_channels(); } -- cgit v1.2.3 From 1ad98e9d1bdf4724c0a8532fabd84bf3c457c2bc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 1 Oct 2018 15:02:26 -0700 Subject: tcp/dccp: fix lockdep issue when SYN is backlogged In normal SYN processing, packets are handled without listener lock and in RCU protected ingress path. But syzkaller is known to be able to trick us and SYN packets might be processed in process context, after being queued into socket backlog. In commit 06f877d613be ("tcp/dccp: fix other lockdep splats accessing ireq_opt") I made a very stupid fix, that happened to work mostly because of the regular path being RCU protected. Really the thing protecting ireq->ireq_opt is RCU read lock, and the pseudo request refcnt is not relevant. This patch extends what I did in commit 449809a66c1d ("tcp/dccp: block BH for SYN processing") by adding an extra rcu_read_{lock|unlock} pair in the paths that might be taken when processing SYN from socket backlog (thus possibly in process context) Fixes: 06f877d613be ("tcp/dccp: fix other lockdep splats accessing ireq_opt") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/inet_sock.h | 3 +-- net/dccp/input.c | 4 +++- net/ipv4/tcp_input.c | 4 +++- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index e03b93360f33..a8cd5cf9ff5b 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -132,8 +132,7 @@ static inline int inet_request_bound_dev_if(const struct sock *sk, static inline struct ip_options_rcu *ireq_opt_deref(const struct inet_request_sock *ireq) { - return rcu_dereference_check(ireq->ireq_opt, - refcount_read(&ireq->req.rsk_refcnt) > 0); + return rcu_dereference(ireq->ireq_opt); } struct inet_cork { diff --git a/net/dccp/input.c b/net/dccp/input.c index d28d46bff6ab..85d6c879383d 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -606,11 +606,13 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (sk->sk_state == DCCP_LISTEN) { if (dh->dccph_type == DCCP_PKT_REQUEST) { /* It is possible that we process SYN packets from backlog, - * so we need to make sure to disable BH right there. + * so we need to make sure to disable BH and RCU right there. */ + rcu_read_lock(); local_bh_disable(); acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0; local_bh_enable(); + rcu_read_unlock(); if (!acceptable) return 1; consume_skb(skb); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4cf2f7bb2802..47e08c1b5bc3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6009,11 +6009,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (th->fin) goto discard; /* It is possible that we process SYN packets from backlog, - * so we need to make sure to disable BH right there. + * so we need to make sure to disable BH and RCU right there. */ + rcu_read_lock(); local_bh_disable(); acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0; local_bh_enable(); + rcu_read_unlock(); if (!acceptable) return 1; -- cgit v1.2.3 From aeadd93f2b0a609f603ac33e574b97a9832d1b90 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 22 Sep 2018 16:46:48 +0300 Subject: net: sched: act_ipt: check for underflow in __tcf_ipt_init() If "td->u.target_size" is larger than sizeof(struct xt_entry_target) we return -EINVAL. But we don't check whether it's smaller than sizeof(struct xt_entry_target) and that could lead to an out of bounds read. Fixes: 7ba699c604ab ("[NET_SCHED]: Convert actions from rtnetlink to new netlink API") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/sched/act_ipt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 23273b5303fd..8525de811616 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -135,7 +135,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, } td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]); - if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) { + if (nla_len(tb[TCA_IPT_TARG]) != td->u.target_size) { if (exists) tcf_idr_release(*a, bind); else -- cgit v1.2.3 From d949cfedbcbab4e91590576cbace2671924ad69c Mon Sep 17 00:00:00 2001 From: LUU Duc Canh Date: Wed, 26 Sep 2018 22:28:52 +0200 Subject: tipc: ignore STATE_MSG on wrong link session The initial session number when a link is created is based on a random value, taken from struct tipc_net->random. It is then incremented for each link reset to avoid mixing protocol messages from different link sessions. However, when a bearer is reset all its links are deleted, and will later be re-created using the same random value as the first time. This means that if the link never went down between creation and deletion we will still sometimes have two subsequent sessions with the same session number. In virtual environments with potentially long transmission times this has turned out to be a real problem. We now fix this by randomizing the session number each time a link is created. With a session number size of 16 bits this gives a risk of session collision of 1/64k. To reduce this further, we also introduce a sanity check on the very first STATE message arriving at a link. If this has an acknowledge value differing from 0, which is logically impossible, we ignore the message. The final risk for session collision is hence reduced to 1/4G, which should be sufficient. Signed-off-by: LUU Duc Canh Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 3 +++ net/tipc/node.c | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 4ed650ce6e61..fb886b525d95 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1516,6 +1516,9 @@ bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr) return false; if (session != curr_session) return false; + /* Extra sanity check */ + if (!link_is_up(l) && msg_ack(hdr)) + return false; if (!(l->peer_caps & TIPC_LINK_PROTO_SEQNO)) return true; /* Accept only STATE with new sequence number */ diff --git a/net/tipc/node.c b/net/tipc/node.c index b0ee25f1f2e6..2afc4f8c37a7 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -913,6 +913,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, bool reset = true; char *if_name; unsigned long intv; + u16 session; *dupl_addr = false; *respond = false; @@ -999,9 +1000,10 @@ void tipc_node_check_dest(struct net *net, u32 addr, goto exit; if_name = strchr(b->name, ':') + 1; + get_random_bytes(&session, sizeof(u16)); if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, - b->window, mod(tipc_net(net)->random), + b->window, session, tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, @@ -1625,7 +1627,6 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, tipc_link_create_dummy_tnl_msg(l, xmitq); n->failover_sent = true; } - /* If pkts arrive out of order, use lowest calculated syncpt */ if (less(syncpt, n->sync_point)) n->sync_point = syncpt; -- cgit v1.2.3 From 7f6d6558ae44bc193eb28df3617c364d3bb6df39 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Fri, 28 Sep 2018 14:55:34 -0300 Subject: Revert "openvswitch: Fix template leak in error cases." This reverts commit 90c7afc96cbbd77f44094b5b651261968e97de67. When the commit was merged, the code used nf_ct_put() to free the entry, but later on commit 76644232e612 ("openvswitch: Free tmpl with tmpl_free.") replaced that with nf_ct_tmpl_free which is a more appropriate. Now the original problem is removed. Then 44d6e2f27328 ("net: Replace NF_CT_ASSERT() with WARN_ON().") replaced a debug assert with a WARN_ON() which is trigged now. Signed-off-by: Flavio Leitner Acked-by: Joe Stringer Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 86a75105af1a..0aeb34c6389d 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1624,10 +1624,6 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, OVS_NLERR(log, "Failed to allocate conntrack template"); return -ENOMEM; } - - __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); - nf_conntrack_get(&ct_info.ct->ct_general); - if (helper) { err = ovs_ct_add_helper(&ct_info, helper, key, log); if (err) @@ -1639,6 +1635,8 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, if (err) goto err_free_ct; + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + nf_conntrack_get(&ct_info.ct->ct_general); return 0; err_free_ct: __ovs_ct_free_action(&ct_info); -- cgit v1.2.3 From 893626d6a353d1356528f94e081246ecf233d77a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 28 Sep 2018 12:28:41 -0700 Subject: rtnetlink: Fail dump if target netnsid is invalid Link dumps can return results from a target namespace. If the namespace id is invalid, then the dump request should fail if get_target_net fails rather than continuing with a dump of the current namespace. Fixes: 79e1ad148c844 ("rtnetlink: use netnsid to query interface") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 63ce2283a456..7f37fe9c65a5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1898,10 +1898,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) if (tb[IFLA_IF_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); tgt_net = get_target_net(skb->sk, netnsid); - if (IS_ERR(tgt_net)) { - tgt_net = net; - netnsid = -1; - } + if (IS_ERR(tgt_net)) + return PTR_ERR(tgt_net); } if (tb[IFLA_EXT_MASK]) -- cgit v1.2.3 From 6fe9487892b32cb1c8b8b0d552ed7222a527fe30 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Fri, 28 Sep 2018 16:26:08 -0400 Subject: bond: take rcu lock in netpoll_send_skb_on_dev The bonding driver lacks the rcu lock when it calls down into netdev_lower_get_next_private_rcu from bond_poll_controller, which results in a trace like: WARNING: CPU: 2 PID: 179 at net/core/dev.c:6567 netdev_lower_get_next_private_rcu+0x34/0x40 CPU: 2 PID: 179 Comm: kworker/u16:15 Not tainted 4.19.0-rc5-backup+ #1 Workqueue: bond0 bond_mii_monitor RIP: 0010:netdev_lower_get_next_private_rcu+0x34/0x40 Code: 48 89 fb e8 fe 29 63 ff 85 c0 74 1e 48 8b 45 00 48 81 c3 c0 00 00 00 48 8b 00 48 39 d8 74 0f 48 89 45 00 48 8b 40 f8 5b 5d c3 <0f> 0b eb de 31 c0 eb f5 0f 1f 40 00 0f 1f 44 00 00 48 8> RSP: 0018:ffffc9000087fa68 EFLAGS: 00010046 RAX: 0000000000000000 RBX: ffff880429614560 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 00000000ffffffff RDI: ffffffffa184ada0 RBP: ffffc9000087fa80 R08: 0000000000000001 R09: 0000000000000000 R10: ffffc9000087f9f0 R11: ffff880429798040 R12: ffff8804289d5980 R13: ffffffffa1511f60 R14: 00000000000000c8 R15: 00000000ffffffff FS: 0000000000000000(0000) GS:ffff88042f880000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f4b78fce180 CR3: 000000018180f006 CR4: 00000000001606e0 Call Trace: bond_poll_controller+0x52/0x170 netpoll_poll_dev+0x79/0x290 netpoll_send_skb_on_dev+0x158/0x2c0 netpoll_send_udp+0x2d5/0x430 write_ext_msg+0x1e0/0x210 console_unlock+0x3c4/0x630 vprintk_emit+0xfa/0x2f0 printk+0x52/0x6e ? __netdev_printk+0x12b/0x220 netdev_info+0x64/0x80 ? bond_3ad_set_carrier+0xe9/0x180 bond_select_active_slave+0x1fc/0x310 bond_mii_monitor+0x709/0x9b0 process_one_work+0x221/0x5e0 worker_thread+0x4f/0x3b0 kthread+0x100/0x140 ? process_one_work+0x5e0/0x5e0 ? kthread_delayed_work_timer_fn+0x90/0x90 ret_from_fork+0x24/0x30 We're also doing rcu dereferences a layer up in netpoll_send_skb_on_dev before we call down into netpoll_poll_dev, so just take the lock there. Suggested-by: Cong Wang Signed-off-by: Dave Jones Signed-off-by: David S. Miller --- net/core/netpoll.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 3ae899805f8b..de1d1ba92f2d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -312,6 +312,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; + rcu_read_lock_bh(); lockdep_assert_irqs_disabled(); npinfo = rcu_dereference_bh(np->dev->npinfo); @@ -356,6 +357,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } + rcu_read_unlock_bh(); } EXPORT_SYMBOL(netpoll_send_skb_on_dev); -- cgit v1.2.3 From cc16567e5a8a7bb9439ef61ab80069acdd33f76f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 2 Oct 2018 11:03:40 +0200 Subject: net: drop unused skb_append_datato_frags() This helper is unused since commit 988cf74deb45 ("inet: Stop generating UFO packets.") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 ----- net/core/skbuff.c | 58 -------------------------------------------------- 2 files changed, 63 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 87e29710373f..119d092c6b13 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1082,11 +1082,6 @@ static inline int skb_pad(struct sk_buff *skb, int pad) } #define dev_kfree_skb(a) consume_skb(a) -int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, - int getfrag(void *from, char *to, int offset, - int len, int odd, struct sk_buff *skb), - void *from, int length); - int skb_append_pagefrags(struct sk_buff *skb, struct page *page, int offset, size_t size); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b2c807f67aba..0e937d3d85b5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3381,64 +3381,6 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, } EXPORT_SYMBOL(skb_find_text); -/** - * skb_append_datato_frags - append the user data to a skb - * @sk: sock structure - * @skb: skb structure to be appended with user data. - * @getfrag: call back function to be used for getting the user data - * @from: pointer to user message iov - * @length: length of the iov message - * - * Description: This procedure append the user data in the fragment part - * of the skb if any page alloc fails user this procedure returns -ENOMEM - */ -int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, - int (*getfrag)(void *from, char *to, int offset, - int len, int odd, struct sk_buff *skb), - void *from, int length) -{ - int frg_cnt = skb_shinfo(skb)->nr_frags; - int copy; - int offset = 0; - int ret; - struct page_frag *pfrag = ¤t->task_frag; - - do { - /* Return error if we don't have space for new frag */ - if (frg_cnt >= MAX_SKB_FRAGS) - return -EMSGSIZE; - - if (!sk_page_frag_refill(sk, pfrag)) - return -ENOMEM; - - /* copy the user data to page */ - copy = min_t(int, length, pfrag->size - pfrag->offset); - - ret = getfrag(from, page_address(pfrag->page) + pfrag->offset, - offset, copy, 0, skb); - if (ret < 0) - return -EFAULT; - - /* copy was successful so update the size parameters */ - skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset, - copy); - frg_cnt++; - pfrag->offset += copy; - get_page(pfrag->page); - - skb->truesize += copy; - refcount_add(copy, &sk->sk_wmem_alloc); - skb->len += copy; - skb->data_len += copy; - offset += copy; - length -= copy; - - } while (length > 0); - - return 0; -} -EXPORT_SYMBOL(skb_append_datato_frags); - int skb_append_pagefrags(struct sk_buff *skb, struct page *page, int offset, size_t size) { -- cgit v1.2.3 From 2ab2ddd301a22ca3c5f0b743593e4ad2953dfa53 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Oct 2018 12:35:05 -0700 Subject: inet: make sure to grab rcu_read_lock before using ireq->ireq_opt Timer handlers do not imply rcu_read_lock(), so my recent fix triggered a LOCKDEP warning when SYNACK is retransmit. Lets add rcu_read_lock()/rcu_read_unlock() pairs around ireq->ireq_opt usages instead of guessing what is done by callers, since it is not worth the pain. Get rid of ireq_opt_deref() helper since it hides the logic without real benefit, since it is now a standard rcu_dereference(). Fixes: 1ad98e9d1bdf ("tcp/dccp: fix lockdep issue when SYN is backlogged") Signed-off-by: Eric Dumazet Reported-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/inet_sock.h | 5 ----- net/dccp/ipv4.c | 4 +++- net/ipv4/inet_connection_sock.c | 5 ++++- net/ipv4/tcp_ipv4.c | 4 +++- 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index a8cd5cf9ff5b..a80fd0ac4563 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -130,11 +130,6 @@ static inline int inet_request_bound_dev_if(const struct sock *sk, return sk->sk_bound_dev_if; } -static inline struct ip_options_rcu *ireq_opt_deref(const struct inet_request_sock *ireq) -{ - return rcu_dereference(ireq->ireq_opt); -} - struct inet_cork { unsigned int flags; __be32 addr; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b08feb219b44..8e08cea6f178 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -493,9 +493,11 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); + rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - ireq_opt_deref(ireq)); + rcu_dereference(ireq->ireq_opt)); + rcu_read_unlock(); err = net_xmit_eval(err); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index dfd5009f96ef..15e7f7915a21 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -544,7 +544,8 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, struct ip_options_rcu *opt; struct rtable *rt; - opt = ireq_opt_deref(ireq); + rcu_read_lock(); + opt = rcu_dereference(ireq->ireq_opt); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, @@ -558,11 +559,13 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; + rcu_read_unlock(); return &rt->dst; route_err: ip_rt_put(rt); no_route: + rcu_read_unlock(); __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 44c09eddbb78..cd426313a298 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -943,9 +943,11 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); + rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - ireq_opt_deref(ireq)); + rcu_dereference(ireq->ireq_opt)); + rcu_read_unlock(); err = net_xmit_eval(err); } -- cgit v1.2.3 From 0e1d6eca5113858ed2caea61a5adc03c595f6096 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Oct 2018 15:47:35 -0700 Subject: rtnl: limit IFLA_NUM_TX_QUEUES and IFLA_NUM_RX_QUEUES to 4096 We have an impressive number of syzkaller bugs that are linked to the fact that syzbot was able to create a networking device with millions of TX (or RX) queues. Let's limit the number of RX/TX queues to 4096, this really should cover all known cases. A separate patch will add various cond_resched() in the loops handling sysfs entries at device creation and dismantle. Tested: lpaa6:~# ip link add gre-4097 numtxqueues 4097 numrxqueues 4097 type ip6gretap RTNETLINK answers: Invalid argument lpaa6:~# time ip link add gre-4096 numtxqueues 4096 numrxqueues 4096 type ip6gretap real 0m0.180s user 0m0.000s sys 0m0.107s Fixes: 76ff5cc91935 ("rtnl: allow to specify number of rx and tx queues on device creation") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7f37fe9c65a5..448703312fed 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2835,6 +2835,12 @@ struct net_device *rtnl_create_link(struct net *net, else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); + if (num_tx_queues < 1 || num_tx_queues > 4096) + return ERR_PTR(-EINVAL); + + if (num_rx_queues < 1 || num_rx_queues > 4096) + return ERR_PTR(-EINVAL); + dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); if (!dev) -- cgit v1.2.3 From e351bb6227fbe2bb5da6f38a4cf5bd18810b0557 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sat, 29 Sep 2018 23:44:46 -0700 Subject: net: ip_rt_get_source() - use new style struct initializer instead of memset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (allows for better compiler optimization) Signed-off-by: Maciej Żenczykowski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/route.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index dce2ed66ebe1..02482b71498b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1217,18 +1217,15 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) src = ip_hdr(skb)->saddr; else { struct fib_result res; - struct flowi4 fl4; - struct iphdr *iph; - - iph = ip_hdr(skb); - - memset(&fl4, 0, sizeof(fl4)); - fl4.daddr = iph->daddr; - fl4.saddr = iph->saddr; - fl4.flowi4_tos = RT_TOS(iph->tos); - fl4.flowi4_oif = rt->dst.dev->ifindex; - fl4.flowi4_iif = skb->dev->ifindex; - fl4.flowi4_mark = skb->mark; + struct iphdr *iph = ip_hdr(skb); + struct flowi4 fl4 = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowi4_tos = RT_TOS(iph->tos), + .flowi4_oif = rt->dst.dev->ifindex, + .flowi4_iif = skb->dev->ifindex, + .flowi4_mark = skb->mark, + }; rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) -- cgit v1.2.3 From e8e3fbe92c49d1e031eca72542956606c43122ba Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sat, 29 Sep 2018 23:44:47 -0700 Subject: net: inet_rtm_getroute() - use new style struct initializer instead of memset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maciej Żenczykowski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 02482b71498b..048919713f4e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2780,7 +2780,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct rtable *rt = NULL; struct sk_buff *skb; struct rtmsg *rtm; - struct flowi4 fl4; + struct flowi4 fl4 = {}; __be32 dst = 0; __be32 src = 0; kuid_t uid; @@ -2820,7 +2820,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!skb) return -ENOBUFS; - memset(&fl4, 0, sizeof(fl4)); fl4.daddr = dst; fl4.saddr = src; fl4.flowi4_tos = rtm->rtm_tos; -- cgit v1.2.3 From 1f7f10ac4aab5aadd89bdf6db177e3e753e0a69c Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sat, 29 Sep 2018 23:44:48 -0700 Subject: net: ip6_redirect() - use new style struct initializer instead of memset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (allows for better compiler optimization) Signed-off-by: Maciej Żenczykowski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d28f83e01593..6f252fa914c2 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2508,16 +2508,15 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_iif = LOOPBACK_IFINDEX; - fl6.flowi6_oif = oif; - fl6.flowi6_mark = mark; - fl6.daddr = iph->daddr; - fl6.saddr = iph->saddr; - fl6.flowlabel = ip6_flowinfo(iph); - fl6.flowi6_uid = uid; + struct flowi6 fl6 = { + .flowi6_iif = LOOPBACK_IFINDEX, + .flowi6_oif = oif, + .flowi6_mark = mark, + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowlabel = ip6_flowinfo(iph), + .flowi6_uid = uid, + }; dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); rt6_do_redirect(dst, NULL, skb); -- cgit v1.2.3 From 0b26fb17cadf9df8f2e5a8feec4c2a850eabede0 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sat, 29 Sep 2018 23:44:49 -0700 Subject: net: ip6_redirect_no_header() - use new style struct initializer instead of memset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (allows for better compiler optimization) Signed-off-by: Maciej Żenczykowski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6f252fa914c2..dff80697c033 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2530,15 +2530,14 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, const struct ipv6hdr *iph = ipv6_hdr(skb); const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); struct dst_entry *dst; - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_iif = LOOPBACK_IFINDEX; - fl6.flowi6_oif = oif; - fl6.flowi6_mark = mark; - fl6.daddr = msg->dest; - fl6.saddr = iph->daddr; - fl6.flowi6_uid = sock_net_uid(net, NULL); + struct flowi6 fl6 = { + .flowi6_iif = LOOPBACK_IFINDEX, + .flowi6_oif = oif, + .flowi6_mark = mark, + .daddr = msg->dest, + .saddr = iph->daddr, + .flowi6_uid = sock_net_uid(net, NULL), + }; dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); rt6_do_redirect(dst, NULL, skb); -- cgit v1.2.3 From d456336d164886d9339aaa112d6595e1c142f8bc Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sat, 29 Sep 2018 23:44:50 -0700 Subject: net: remove 1 always zero parameter from ip6_redirect_no_header() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (the parameter in question is mark) Signed-off-by: Maciej Żenczykowski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 3 +-- net/ipv6/ndisc.c | 2 +- net/ipv6/route.c | 4 +--- 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 7b9c82de11cc..cef186dbd2ce 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -165,8 +165,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu); void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, kuid_t uid); -void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, - u32 mark); +void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif); void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk); struct netlink_callback; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 0ec273997d1d..51863ada15a4 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1533,7 +1533,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) if (!ndopts.nd_opts_rh) { ip6_redirect_no_header(skb, dev_net(skb->dev), - skb->dev->ifindex, 0); + skb->dev->ifindex); return; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dff80697c033..e50525a95a09 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2524,8 +2524,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, } EXPORT_SYMBOL_GPL(ip6_redirect); -void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, - u32 mark) +void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); @@ -2533,7 +2532,6 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_oif = oif, - .flowi6_mark = mark, .daddr = msg->dest, .saddr = iph->daddr, .flowi6_uid = sock_net_uid(net, NULL), -- cgit v1.2.3 From dc92095dd92a955761b4779fed9618d407f8123b Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sat, 29 Sep 2018 23:44:51 -0700 Subject: net: ip6_update_pmtu() - use new style struct initializer instead of memset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (allows for better compiler optimization) Signed-off-by: Maciej Żenczykowski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e50525a95a09..dd19cf8dbcc1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2349,15 +2349,14 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = oif; - fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); - fl6.daddr = iph->daddr; - fl6.saddr = iph->saddr; - fl6.flowlabel = ip6_flowinfo(iph); - fl6.flowi6_uid = uid; + struct flowi6 fl6 = { + .flowi6_oif = oif, + .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowlabel = ip6_flowinfo(iph), + .flowi6_uid = uid, + }; dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) -- cgit v1.2.3 From 8823a3acfd903d1558f45f155570f8eb3eb5a76f Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sat, 29 Sep 2018 23:44:52 -0700 Subject: net: rtmsg_to_fib6_config() - use new style struct initializer instead of memset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (allows for better compiler optimization) Signed-off-by: Maciej Żenczykowski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dd19cf8dbcc1..c312ad4046d1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3604,23 +3604,23 @@ static void rtmsg_to_fib6_config(struct net *net, struct in6_rtmsg *rtmsg, struct fib6_config *cfg) { - memset(cfg, 0, sizeof(*cfg)); + *cfg = (struct fib6_config){ + .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? + : RT6_TABLE_MAIN, + .fc_ifindex = rtmsg->rtmsg_ifindex, + .fc_metric = rtmsg->rtmsg_metric, + .fc_expires = rtmsg->rtmsg_info, + .fc_dst_len = rtmsg->rtmsg_dst_len, + .fc_src_len = rtmsg->rtmsg_src_len, + .fc_flags = rtmsg->rtmsg_flags, + .fc_type = rtmsg->rtmsg_type, + + .fc_nlinfo.nl_net = net, - cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? - : RT6_TABLE_MAIN; - cfg->fc_ifindex = rtmsg->rtmsg_ifindex; - cfg->fc_metric = rtmsg->rtmsg_metric; - cfg->fc_expires = rtmsg->rtmsg_info; - cfg->fc_dst_len = rtmsg->rtmsg_dst_len; - cfg->fc_src_len = rtmsg->rtmsg_src_len; - cfg->fc_flags = rtmsg->rtmsg_flags; - cfg->fc_type = rtmsg->rtmsg_type; - - cfg->fc_nlinfo.nl_net = net; - - cfg->fc_dst = rtmsg->rtmsg_dst; - cfg->fc_src = rtmsg->rtmsg_src; - cfg->fc_gateway = rtmsg->rtmsg_gateway; + .fc_dst = rtmsg->rtmsg_dst, + .fc_src = rtmsg->rtmsg_src, + .fc_gateway = rtmsg->rtmsg_gateway, + }; } int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) -- cgit v1.2.3 From 84db84071505e0e46f2faa05dedfa7a313d41859 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sat, 29 Sep 2018 23:44:53 -0700 Subject: net: rtm_to_fib6_config() - use new style struct initializer instead of memset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (allows for better compiler optimization) Signed-off-by: Maciej Żenczykowski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c312ad4046d1..be5f7a15bc38 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4143,14 +4143,19 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, err = -EINVAL; rtm = nlmsg_data(nlh); - memset(cfg, 0, sizeof(*cfg)); - cfg->fc_table = rtm->rtm_table; - cfg->fc_dst_len = rtm->rtm_dst_len; - cfg->fc_src_len = rtm->rtm_src_len; - cfg->fc_flags = RTF_UP; - cfg->fc_protocol = rtm->rtm_protocol; - cfg->fc_type = rtm->rtm_type; + *cfg = (struct fib6_config){ + .fc_table = rtm->rtm_table, + .fc_dst_len = rtm->rtm_dst_len, + .fc_src_len = rtm->rtm_src_len, + .fc_flags = RTF_UP, + .fc_protocol = rtm->rtm_protocol, + .fc_type = rtm->rtm_type, + + .fc_nlinfo.portid = NETLINK_CB(skb).portid, + .fc_nlinfo.nlh = nlh, + .fc_nlinfo.nl_net = sock_net(skb->sk), + }; if (rtm->rtm_type == RTN_UNREACHABLE || rtm->rtm_type == RTN_BLACKHOLE || @@ -4166,10 +4171,6 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); - cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; - cfg->fc_nlinfo.nlh = nlh; - cfg->fc_nlinfo.nl_net = sock_net(skb->sk); - if (tb[RTA_GATEWAY]) { cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); cfg->fc_flags |= RTF_GATEWAY; -- cgit v1.2.3 From 744486d426dce4e4dc25fd2ce750116a0025736c Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sat, 29 Sep 2018 23:44:54 -0700 Subject: net: inet6_rtm_getroute() - use new style struct initializer instead of memset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maciej Żenczykowski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index be5f7a15bc38..6311a7fc5f63 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4823,7 +4823,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct rt6_info *rt; struct sk_buff *skb; struct rtmsg *rtm; - struct flowi6 fl6; + struct flowi6 fl6 = {}; bool fibmatch; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, @@ -4832,7 +4832,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; err = -EINVAL; - memset(&fl6, 0, sizeof(fl6)); rtm = nlmsg_data(nlh); fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); -- cgit v1.2.3 From 8873c064d1de579ea23412a6d3eee972593f142b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 1 Oct 2018 23:24:26 -0700 Subject: tcp: do not release socket ownership in tcp_close() syzkaller was able to hit the WARN_ON(sock_owned_by_user(sk)); in tcp_close() While a socket is being closed, it is very possible other threads find it in rtnetlink dump. tcp_get_info() will acquire the socket lock for a short amount of time (slow = lock_sock_fast(sk)/unlock_sock_fast(sk, slow);), enough to trigger the warning. Fixes: 67db3e4bfbc9 ("tcp: no longer hold ehash lock while calling tcp_get_info()") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/core/sock.c | 2 +- net/ipv4/tcp.c | 11 +++-------- 3 files changed, 5 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 38cae35f6e16..751549ac0a84 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1492,6 +1492,7 @@ static inline void lock_sock(struct sock *sk) lock_sock_nested(sk, 0); } +void __release_sock(struct sock *sk); void release_sock(struct sock *sk); /* BH context may only use the following locking interface. */ diff --git a/net/core/sock.c b/net/core/sock.c index 8537b6ca72c5..7e8796a6a089 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2317,7 +2317,7 @@ static void __lock_sock(struct sock *sk) finish_wait(&sk->sk_lock.wq, &wait); } -static void __release_sock(struct sock *sk) +void __release_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2827fa5643bd..43ef83b2330e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2416,16 +2416,10 @@ adjudge_to_death: sock_hold(sk); sock_orphan(sk); - /* It is the last release_sock in its life. It will remove backlog. */ - release_sock(sk); - - - /* Now socket is owned by kernel and we acquire BH lock - * to finish close. No need to check for user refs. - */ local_bh_disable(); bh_lock_sock(sk); - WARN_ON(sock_owned_by_user(sk)); + /* remove backlog if any, without releasing ownership. */ + __release_sock(sk); percpu_counter_inc(sk->sk_prot->orphan_count); @@ -2494,6 +2488,7 @@ adjudge_to_death: out: bh_unlock_sock(sk); local_bh_enable(); + release_sock(sk); sock_put(sk); } EXPORT_SYMBOL(tcp_close); -- cgit v1.2.3 From 6919622af3a8d92ee73886ebfe22c69ecf4c650c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 1 Oct 2018 11:57:01 +0300 Subject: bridge: mcast: Default back to multicast enabled state Commit 13cefad2f2c1 ("net: bridge: convert and rename mcast disabled") converted the 'multicast_disabled' field to an option bit named 'BROPT_MULTICAST_ENABLED'. While the old field was implicitly initialized to 0, the new field is not initialized, resulting in the bridge defaulting to multicast disabled state and breaking existing applications. Fix this by explicitly initializing the option. Fixes: 13cefad2f2c1 ("net: bridge: convert and rename mcast disabled") Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 928024d8360d..024139b51d3a 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1976,6 +1976,7 @@ void br_multicast_init(struct net_bridge *br) br->ip6_other_query.delay_time = 0; br->ip6_querier.port = NULL; #endif + br_opt_toggle(br, BROPT_MULTICAST_ENABLED, true); br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true); spin_lock_init(&br->multicast_lock); -- cgit v1.2.3 From 854da991733d1b4f60042423875e32be7f8f0421 Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Mon, 1 Oct 2018 09:40:23 +0100 Subject: ipv4: Allow sending multicast packets on specific i/f using VRF socket It is useful to be able to use the same socket for listening in a specific VRF, as for sending multicast packets out of a specific interface. However, the bound device on the socket currently takes precedence and results in the packets not being sent. Relax the condition on overriding the output interface to use for sending packets out of UDP, raw and ping sockets to allow multicast packets to be sent using the specified multicast interface. Signed-off-by: Robert Shearman Signed-off-by: Mike Manning Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/datagram.c | 2 +- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/udp.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index f915abff1350..300921417f89 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -42,7 +42,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len oif = sk->sk_bound_dev_if; saddr = inet->inet_saddr; if (ipv4_is_multicast(usin->sin_addr.s_addr)) { - if (!oif) + if (!oif || netif_index_is_l3_master(sock_net(sk), oif)) oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 8d7aaf118a30..7ccb5f87f70b 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -779,7 +779,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 33df4d76db2d..8ca3eb06ba04 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -608,7 +608,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) tos |= RTO_ONLINK; if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7d69dd6fa7e8..5fc4beb1c336 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1042,7 +1042,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; -- cgit v1.2.3 From e4a38c0c4b2761b2b6bc9d64352f1dc6f8402eee Mon Sep 17 00:00:00 2001 From: Patrick Ruddy Date: Mon, 1 Oct 2018 09:41:27 +0100 Subject: ipv6: add vrf table handling code for ipv6 mcast The code to obtain the correct table for the incoming interface was missing for IPv6. This has been added along with the table creation notification to fib rules for the RTNL_FAMILY_IP6MR address family. Signed-off-by: Patrick Ruddy Signed-off-by: Mike Manning Reviewed-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 11 +++++++++++ net/ipv6/ip6mr.c | 48 ++++++++++++++++++++++++++++++++++++------------ 2 files changed, 47 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index f93547f257fb..69b7227c637e 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1215,8 +1215,19 @@ static int vrf_add_fib_rules(const struct net_device *dev) goto ipmr_err; #endif +#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) + err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true); + if (err < 0) + goto ip6mr_err; +#endif + return 0; +#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) +ip6mr_err: + vrf_fib_rule(dev, RTNL_FAMILY_IPMR, false); +#endif + #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) ipmr_err: vrf_fib_rule(dev, AF_INET6, false); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index d0b7e0249c13..6f07b8380425 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -85,7 +85,8 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id); static void ip6mr_free_table(struct mr_table *mrt); static void ip6_mr_forward(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc6_cache *cache); + struct net_device *dev, struct sk_buff *skb, + struct mfc6_cache *cache); static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert); static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, @@ -138,6 +139,9 @@ static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, .flags = FIB_LOOKUP_NOREF, }; + /* update flow if oif or iif point to device enslaved to l3mdev */ + l3mdev_update_flow(net, flowi6_to_flowi(flp6)); + err = fib_rules_lookup(net->ipv6.mr6_rules_ops, flowi6_to_flowi(flp6), 0, &arg); if (err < 0) @@ -164,7 +168,9 @@ static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, return -EINVAL; } - mrt = ip6mr_get_table(rule->fr_net, rule->table); + arg->table = fib_rule_get_table(rule, arg); + + mrt = ip6mr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; @@ -1014,7 +1020,7 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else - ip6_mr_forward(net, mrt, skb, c); + ip6_mr_forward(net, mrt, skb->dev, skb, c); } } @@ -1120,7 +1126,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, /* Queue a packet for resolution. It gets locked cache entry! */ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, - struct sk_buff *skb) + struct sk_buff *skb, struct net_device *dev) { struct mfc6_cache *c; bool found = false; @@ -1180,6 +1186,10 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, kfree_skb(skb); err = -ENOBUFS; } else { + if (dev) { + skb->dev = dev; + skb->skb_iif = dev->ifindex; + } skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } @@ -2043,11 +2053,12 @@ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) } static void ip6_mr_forward(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc6_cache *c) + struct net_device *dev, struct sk_buff *skb, + struct mfc6_cache *c) { int psend = -1; int vif, ct; - int true_vifi = ip6mr_find_vif(mrt, skb->dev); + int true_vifi = ip6mr_find_vif(mrt, dev); vif = c->_c.mfc_parent; c->_c.mfc_un.res.pkt++; @@ -2073,7 +2084,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, /* * Wrong interface: drop packet and (maybe) send PIM assert. */ - if (mrt->vif_table[vif].dev != skb->dev) { + if (mrt->vif_table[vif].dev != dev) { c->_c.mfc_un.res.wrong_if++; if (true_vifi >= 0 && mrt->mroute_do_assert && @@ -2154,6 +2165,19 @@ int ip6_mr_input(struct sk_buff *skb) .flowi6_mark = skb->mark, }; int err; + struct net_device *dev; + + /* skb->dev passed in is the master dev for vrfs. + * Get the proper interface that does have a vif associated with it. + */ + dev = skb->dev; + if (netif_is_l3_master(skb->dev)) { + dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); + if (!dev) { + kfree_skb(skb); + return -ENODEV; + } + } err = ip6mr_fib_lookup(net, &fl6, &mrt); if (err < 0) { @@ -2165,7 +2189,7 @@ int ip6_mr_input(struct sk_buff *skb) cache = ip6mr_cache_find(mrt, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); if (!cache) { - int vif = ip6mr_find_vif(mrt, skb->dev); + int vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, @@ -2179,9 +2203,9 @@ int ip6_mr_input(struct sk_buff *skb) if (!cache) { int vif; - vif = ip6mr_find_vif(mrt, skb->dev); + vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) { - int err = ip6mr_cache_unresolved(mrt, vif, skb); + int err = ip6mr_cache_unresolved(mrt, vif, skb, dev); read_unlock(&mrt_lock); return err; @@ -2191,7 +2215,7 @@ int ip6_mr_input(struct sk_buff *skb) return -ENODEV; } - ip6_mr_forward(net, mrt, skb, cache); + ip6_mr_forward(net, mrt, dev, skb, cache); read_unlock(&mrt_lock); @@ -2257,7 +2281,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, iph->saddr = rt->rt6i_src.addr; iph->daddr = rt->rt6i_dst.addr; - err = ip6mr_cache_unresolved(mrt, vif, skb2); + err = ip6mr_cache_unresolved(mrt, vif, skb2, dev); read_unlock(&mrt_lock); return err; -- cgit v1.2.3 From 64199fc0a46ba211362472f7f942f900af9492fd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 30 Sep 2018 11:33:39 -0700 Subject: ipv4: fix use-after-free in ip_cmsg_recv_dstaddr() Caching ip_hdr(skb) before a call to pskb_may_pull() is buggy, do not do it. Fixes: 2efd4fca703a ("ip: in cmsg IP(V6)_ORIGDSTADDR call pskb_may_pull") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reported-by: syzbot Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c0fe5ad996f2..26c36cccabdc 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -149,7 +149,6 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) { struct sockaddr_in sin; - const struct iphdr *iph = ip_hdr(skb); __be16 *ports; int end; @@ -164,7 +163,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) ports = (__be16 *)skb_transport_header(skb); sin.sin_family = AF_INET; - sin.sin_addr.s_addr = iph->daddr; + sin.sin_addr.s_addr = ip_hdr(skb)->daddr; sin.sin_port = ports[1]; memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); -- cgit v1.2.3 From 4e6d47206c32d1bbb4931f1d851dae3870e0df81 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Sun, 30 Sep 2018 08:04:35 +0530 Subject: tls: Add support for inplace records encryption Presently, for non-zero copy case, separate pages are allocated for storing plaintext and encrypted text of records. These pages are stored in sg_plaintext_data and sg_encrypted_data scatterlists inside record structure. Further, sg_plaintext_data & sg_encrypted_data are passed to cryptoapis for record encryption. Allocating separate pages for plaintext and encrypted text is inefficient from both required memory and performance point of view. This patch adds support of inplace encryption of records. For non-zero copy case, we reuse the pages from sg_encrypted_data scatterlist to copy the application's plaintext data. For the movement of pages from sg_encrypted_data to sg_plaintext_data scatterlists, we introduce a new function move_to_plaintext_sg(). This function add pages into sg_plaintext_data from sg_encrypted_data scatterlists. tls_do_encryption() is modified to pass the same scatterlist as both source and destination into aead_request_set_crypt() if inplace crypto has been enabled. A new ariable 'inplace_crypto' has been introduced in record structure to signify whether the same scatterlist can be used. By default, the inplace_crypto is enabled in get_rec(). If zero-copy is used (i.e. plaintext data is not copied), inplace_crypto is set to '0'. Signed-off-by: Vakul Garg Reviewed-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 1 + net/tls/tls_sw.c | 91 ++++++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 74 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index 262420cdad10..5e853835597e 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -101,6 +101,7 @@ struct tls_rec { struct list_head list; int tx_ready; int tx_flags; + int inplace_crypto; /* AAD | sg_plaintext_data | sg_tag */ struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS + 1]; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 1d4c354d5516..aa9fdce272b6 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -281,24 +281,72 @@ static int alloc_encrypted_sg(struct sock *sk, int len) return rc; } -static int alloc_plaintext_sg(struct sock *sk, int len) +static int move_to_plaintext_sg(struct sock *sk, int required_size) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; - int rc = 0; + struct scatterlist *plain_sg = &rec->sg_plaintext_data[1]; + struct scatterlist *enc_sg = &rec->sg_encrypted_data[1]; + int enc_sg_idx = 0; + int skip, len; - rc = sk_alloc_sg(sk, len, - &rec->sg_plaintext_data[1], 0, - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size, - tls_ctx->pending_open_record_frags); + if (rec->sg_plaintext_num_elem == MAX_SKB_FRAGS) + return -ENOSPC; - if (rc == -ENOSPC) - rec->sg_plaintext_num_elem = - ARRAY_SIZE(rec->sg_plaintext_data) - 1; + /* We add page references worth len bytes from enc_sg at the + * end of plain_sg. It is guaranteed that sg_encrypted_data + * has enough required room (ensured by caller). + */ + len = required_size - rec->sg_plaintext_size; - return rc; + /* Skip initial bytes in sg_encrypted_data to be able + * to use same offset of both plain and encrypted data. + */ + skip = tls_ctx->tx.prepend_size + rec->sg_plaintext_size; + + while (enc_sg_idx < rec->sg_encrypted_num_elem) { + if (enc_sg[enc_sg_idx].length > skip) + break; + + skip -= enc_sg[enc_sg_idx].length; + enc_sg_idx++; + } + + /* unmark the end of plain_sg*/ + sg_unmark_end(plain_sg + rec->sg_plaintext_num_elem - 1); + + while (len) { + struct page *page = sg_page(&enc_sg[enc_sg_idx]); + int bytes = enc_sg[enc_sg_idx].length - skip; + int offset = enc_sg[enc_sg_idx].offset + skip; + + if (bytes > len) + bytes = len; + else + enc_sg_idx++; + + /* Skipping is required only one time */ + skip = 0; + + /* Increment page reference */ + get_page(page); + + sg_set_page(&plain_sg[rec->sg_plaintext_num_elem], page, + bytes, offset); + + sk_mem_charge(sk, bytes); + + len -= bytes; + rec->sg_plaintext_size += bytes; + + rec->sg_plaintext_num_elem++; + + if (rec->sg_plaintext_num_elem == MAX_SKB_FRAGS) + return -ENOSPC; + } + + return 0; } static void free_sg(struct sock *sk, struct scatterlist *sg, @@ -459,16 +507,21 @@ static int tls_do_encryption(struct sock *sk, size_t data_len) { struct tls_rec *rec = ctx->open_rec; + struct scatterlist *plain_sg = rec->sg_plaintext_data; + struct scatterlist *enc_sg = rec->sg_encrypted_data; int rc; /* Skip the first index as it contains AAD data */ rec->sg_encrypted_data[1].offset += tls_ctx->tx.prepend_size; rec->sg_encrypted_data[1].length -= tls_ctx->tx.prepend_size; + /* If it is inplace crypto, then pass same SG list as both src, dst */ + if (rec->inplace_crypto) + plain_sg = enc_sg; + aead_request_set_tfm(aead_req, ctx->aead_send); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); - aead_request_set_crypt(aead_req, rec->sg_plaintext_data, - rec->sg_encrypted_data, + aead_request_set_crypt(aead_req, plain_sg, enc_sg, data_len, tls_ctx->tx.iv); aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, @@ -666,6 +719,7 @@ static struct tls_rec *get_rec(struct sock *sk) sizeof(rec->aad_space)); ctx->open_rec = rec; + rec->inplace_crypto = 1; return rec; } @@ -763,6 +817,8 @@ alloc_encrypted: if (ret) goto fallback_to_reg_send; + rec->inplace_crypto = 0; + num_zc++; copied += try_to_copy; ret = tls_push_record(sk, msg->msg_flags, record_type); @@ -782,11 +838,11 @@ fallback_to_reg_send: } required_size = rec->sg_plaintext_size + try_to_copy; -alloc_plaintext: - ret = alloc_plaintext_sg(sk, required_size); + + ret = move_to_plaintext_sg(sk, required_size); if (ret) { if (ret != -ENOSPC) - goto wait_for_memory; + goto send_end; /* Adjust try_to_copy according to the amount that was * actually allocated. The difference is due @@ -831,8 +887,6 @@ trim_sgl: if (rec->sg_encrypted_size < required_size) goto alloc_encrypted; - - goto alloc_plaintext; } if (!num_async) { @@ -958,6 +1012,7 @@ alloc_payload: if (full_record || eor || rec->sg_plaintext_num_elem == ARRAY_SIZE(rec->sg_plaintext_data) - 1) { + rec->inplace_crypto = 0; ret = tls_push_record(sk, flags, record_type); if (ret) { if (ret == -EINPROGRESS) -- cgit v1.2.3 From 2cc543f5cd6deda27ef463686fa08c16c8c0990b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 3 Oct 2018 12:45:56 +0200 Subject: sctp: fix fall-through annotation Replace "fallthru" with a proper "fall through" annotation. This fix is part of the ongoing efforts to enabling -Wimplicit-fallthrough Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index d74d00b29942..42191ed9902b 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1048,7 +1048,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx, if (!ctx->packet || !ctx->packet->has_cookie_echo) return; - /* fallthru */ + /* fall through */ case SCTP_STATE_ESTABLISHED: case SCTP_STATE_SHUTDOWN_PENDING: case SCTP_STATE_SHUTDOWN_RECEIVED: -- cgit v1.2.3 From db7ff19e7b119adb4618fbc6410b441d1c3b55c5 Mon Sep 17 00:00:00 2001 From: Eli Britstein Date: Wed, 15 Aug 2018 16:02:18 +0300 Subject: devlink: Add extack for eswitch operations Add extack argument to the eswitch related operations. Signed-off-by: Eli Britstein Reviewed-by: Or Gerlitz Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c | 3 ++- drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.h | 3 ++- drivers/net/ethernet/cavium/liquidio/lio_main.c | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 9 ++++++--- .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 19 ++++++++++++------- drivers/net/ethernet/netronome/nfp/nfp_devlink.c | 3 ++- include/net/devlink.h | 9 ++++++--- net/core/devlink.c | 8 +++++--- 8 files changed, 37 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c index b574fe8e974e..9a25c05aa571 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c @@ -521,7 +521,8 @@ int bnxt_dl_eswitch_mode_get(struct devlink *devlink, u16 *mode) return 0; } -int bnxt_dl_eswitch_mode_set(struct devlink *devlink, u16 mode) +int bnxt_dl_eswitch_mode_set(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack) { struct bnxt *bp = bnxt_get_bp_from_dl(devlink); int rc = 0; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.h index 38b9a75ad724..d7287651422f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.h @@ -30,7 +30,8 @@ static inline u16 bnxt_vf_rep_get_fid(struct net_device *dev) bool bnxt_dev_is_vf_rep(struct net_device *dev); int bnxt_dl_eswitch_mode_get(struct devlink *devlink, u16 *mode); -int bnxt_dl_eswitch_mode_set(struct devlink *devlink, u16 mode); +int bnxt_dl_eswitch_mode_set(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack); #else diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c index 9d70e5c6157f..3d24133e5e49 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c @@ -3144,7 +3144,8 @@ liquidio_eswitch_mode_get(struct devlink *devlink, u16 *mode) } static int -liquidio_eswitch_mode_set(struct devlink *devlink, u16 mode) +liquidio_eswitch_mode_set(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack) { struct lio_devlink_priv *priv; struct octeon_device *oct; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 0b05bf2b91f6..dfc642de4e6d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -269,12 +269,15 @@ struct mlx5_esw_flow_attr { struct mlx5e_tc_flow_parse_attr *parse_attr; }; -int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode); +int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack); int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode); -int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode); +int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode, + struct netlink_ext_ack *extack); int mlx5_devlink_eswitch_inline_mode_get(struct devlink *devlink, u8 *mode); int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, int nvfs, u8 *mode); -int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap); +int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap, + struct netlink_ext_ack *extack); int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, u8 *encap); void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 21e957083f65..eee34ffcf242 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -810,7 +810,8 @@ out: return flow_rule; } -static int esw_offloads_start(struct mlx5_eswitch *esw) +static int esw_offloads_start(struct mlx5_eswitch *esw, + struct netlink_ext_ack *extack) { int err, err1, num_vfs = esw->dev->priv.sriov.num_vfs; @@ -973,7 +974,8 @@ create_ft_err: return err; } -static int esw_offloads_stop(struct mlx5_eswitch *esw) +static int esw_offloads_stop(struct mlx5_eswitch *esw, + struct netlink_ext_ack *extack) { int err, err1, num_vfs = esw->dev->priv.sriov.num_vfs; @@ -1092,7 +1094,8 @@ static int mlx5_devlink_eswitch_check(struct devlink *devlink) return 0; } -int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode) +int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_priv(devlink); u16 cur_mlx5_mode, mlx5_mode = 0; @@ -1111,9 +1114,9 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode) return 0; if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV) - return esw_offloads_start(dev->priv.eswitch); + return esw_offloads_start(dev->priv.eswitch, extack); else if (mode == DEVLINK_ESWITCH_MODE_LEGACY) - return esw_offloads_stop(dev->priv.eswitch); + return esw_offloads_stop(dev->priv.eswitch, extack); else return -EINVAL; } @@ -1130,7 +1133,8 @@ int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode) return esw_mode_to_devlink(dev->priv.eswitch->mode, mode); } -int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode) +int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode, + struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_priv(devlink); struct mlx5_eswitch *esw = dev->priv.eswitch; @@ -1232,7 +1236,8 @@ out: return 0; } -int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap) +int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap, + struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_priv(devlink); struct mlx5_eswitch *esw = dev->priv.eswitch; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c index db463e20a876..4213fe42ac4d 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c @@ -177,7 +177,8 @@ static int nfp_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode) return nfp_app_eswitch_mode_get(pf->app, mode); } -static int nfp_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode) +static int nfp_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack) { struct nfp_pf *pf = devlink_priv(devlink); int ret; diff --git a/include/net/devlink.h b/include/net/devlink.h index b9b89d6604d4..70671f0d4c30 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -451,11 +451,14 @@ struct devlink_ops { u32 *p_cur, u32 *p_max); int (*eswitch_mode_get)(struct devlink *devlink, u16 *p_mode); - int (*eswitch_mode_set)(struct devlink *devlink, u16 mode); + int (*eswitch_mode_set)(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack); int (*eswitch_inline_mode_get)(struct devlink *devlink, u8 *p_inline_mode); - int (*eswitch_inline_mode_set)(struct devlink *devlink, u8 inline_mode); + int (*eswitch_inline_mode_set)(struct devlink *devlink, u8 inline_mode, + struct netlink_ext_ack *extack); int (*eswitch_encap_mode_get)(struct devlink *devlink, u8 *p_encap_mode); - int (*eswitch_encap_mode_set)(struct devlink *devlink, u8 encap_mode); + int (*eswitch_encap_mode_set)(struct devlink *devlink, u8 encap_mode, + struct netlink_ext_ack *extack); }; static inline void *devlink_priv(struct devlink *devlink) diff --git a/net/core/devlink.c b/net/core/devlink.c index 8c0ed225e280..de6adad7ccbe 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1626,7 +1626,7 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, if (!ops->eswitch_mode_set) return -EOPNOTSUPP; mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); - err = ops->eswitch_mode_set(devlink, mode); + err = ops->eswitch_mode_set(devlink, mode, info->extack); if (err) return err; } @@ -1636,7 +1636,8 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, return -EOPNOTSUPP; inline_mode = nla_get_u8( info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]); - err = ops->eswitch_inline_mode_set(devlink, inline_mode); + err = ops->eswitch_inline_mode_set(devlink, inline_mode, + info->extack); if (err) return err; } @@ -1645,7 +1646,8 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, if (!ops->eswitch_encap_mode_set) return -EOPNOTSUPP; encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]); - err = ops->eswitch_encap_mode_set(devlink, encap_mode); + err = ops->eswitch_encap_mode_set(devlink, encap_mode, + info->extack); if (err) return err; } -- cgit v1.2.3 From d2944b1c66a502ada8aa67f508cd29ecbf035892 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 4 Oct 2018 09:32:27 +0100 Subject: rxrpc: Use rxrpc_free_skb() rather than rxrpc_lose_skb() rxrpc_lose_skb() is now exactly the same as rxrpc_free_skb(), so remove it and use the latter instead. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 1 - net/rxrpc/input.c | 2 +- net/rxrpc/skbuff.c | 15 --------------- 3 files changed, 1 insertion(+), 17 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ef9554131434..9ba87e4d15c7 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1095,7 +1095,6 @@ void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace); -void rxrpc_lose_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_purge_queue(struct sk_buff_head *); /* diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 4b86b6be6c1f..5b2626929822 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1176,7 +1176,7 @@ void rxrpc_data_ready(struct sock *udp_sk) static int lose; if ((lose++ & 7) == 7) { trace_rxrpc_rx_lose(sp); - rxrpc_lose_skb(skb, rxrpc_skb_rx_lost); + rxrpc_free_skb(skb, rxrpc_skb_rx_lost); return; } } diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index b8985d01876a..913dca65cc65 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -68,21 +68,6 @@ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) } } -/* - * Note the injected loss of a socket buffer. - */ -void rxrpc_lose_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) -{ - const void *here = __builtin_return_address(0); - if (skb) { - int n; - CHECK_SLAB_OKAY(&skb->users); - n = atomic_dec_return(select_skb_count(op)); - trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here); - kfree_skb(skb); - } -} - /* * Clear a queue of socket buffers. */ -- cgit v1.2.3 From b3cfb6f567be00450d33b68f743c066af017a012 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 4 Oct 2018 09:32:27 +0100 Subject: rxrpc: Emit the data Tx trace line before transmitting Print the data Tx trace line before transmitting so that it appears before the trace lines indicating success or failure of the transmission. This makes the trace log less confusing. Signed-off-by: David Howells --- net/rxrpc/output.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index e8fb8922bca8..993d4cd247f9 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -378,11 +378,13 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, if ((lose++ & 7) == 7) { ret = 0; lost = true; - goto done; } } - _proto("Tx DATA %%%u { #%u }", serial, sp->hdr.seq); + trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, + retrans, lost); + if (lost) + goto done; /* send the packet with the don't fragment bit set if we currently * think it's small enough */ @@ -415,8 +417,6 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, goto send_fragmentable; done: - trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, - retrans, lost); if (ret >= 0) { if (whdr.flags & RXRPC_REQUEST_ACK) { call->peer->rtt_last_req = skb->tstamp; -- cgit v1.2.3 From 46894a13599a977ac35411b536fb3e0b2feefa95 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 4 Oct 2018 09:32:28 +0100 Subject: rxrpc: Use IPv4 addresses throught the IPv6 AF_RXRPC opens an IPv6 socket through which to send and receive network packets, both IPv6 and IPv4. It currently turns AF_INET addresses into AF_INET-as-AF_INET6 addresses based on an assumption that this was necessary; on further inspection of the code, however, it turns out that the IPv6 code just farms packets aimed at AF_INET addresses out to the IPv4 code. Fix AF_RXRPC to use AF_INET addresses directly when given them. Fixes: 7b674e390e51 ("rxrpc: Fix IPv6 support") Signed-off-by: David Howells --- fs/afs/addr_list.c | 29 +++++++++++++++-------------- net/rxrpc/af_rxrpc.c | 3 ++- net/rxrpc/conn_object.c | 5 +++-- net/rxrpc/peer_event.c | 12 +++++++----- net/rxrpc/utils.c | 19 +++++-------------- 5 files changed, 32 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index 89374de4785c..55a756c60746 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -232,7 +232,7 @@ struct afs_addr_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry) */ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) { - struct sockaddr_in6 *p; + struct sockaddr_rxrpc *srx; u32 addr = ntohl(xdr); int i; @@ -240,9 +240,9 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) return; for (i = 0; i < alist->nr_ipv4; i++) { - struct sockaddr_in6 *a = &alist->addrs[i].transport.sin6; - u32 a_addr = ntohl(a->sin6_addr.s6_addr32[3]); - u16 a_port = ntohs(a->sin6_port); + struct sockaddr_in *a = &alist->addrs[i].transport.sin; + u32 a_addr = ntohl(a->sin_addr.s_addr); + u16 a_port = ntohs(a->sin_port); if (addr == a_addr && port == a_port) return; @@ -257,12 +257,11 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) alist->addrs + i, sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); - p = &alist->addrs[i].transport.sin6; - p->sin6_port = htons(port); - p->sin6_addr.s6_addr32[0] = 0; - p->sin6_addr.s6_addr32[1] = 0; - p->sin6_addr.s6_addr32[2] = htonl(0xffff); - p->sin6_addr.s6_addr32[3] = xdr; + srx = &alist->addrs[i]; + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.sin.sin_family = AF_INET; + srx->transport.sin.sin_port = htons(port); + srx->transport.sin.sin_addr.s_addr = xdr; alist->nr_ipv4++; alist->nr_addrs++; } @@ -272,7 +271,7 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) */ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) { - struct sockaddr_in6 *p; + struct sockaddr_rxrpc *srx; int i, diff; if (alist->nr_addrs >= alist->max_addrs) @@ -296,9 +295,11 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) alist->addrs + i, sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); - p = &alist->addrs[i].transport.sin6; - p->sin6_port = htons(port); - memcpy(&p->sin6_addr, xdr, 16); + srx = &alist->addrs[i]; + srx->transport_len = sizeof(srx->transport.sin6); + srx->transport.sin6.sin6_family = AF_INET6; + srx->transport.sin6.sin6_port = htons(port); + memcpy(&srx->transport.sin6.sin6_addr, xdr, 16); alist->nr_addrs++; } diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index ac44d8afffb1..2fdd276f6842 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -97,7 +97,8 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, srx->transport_len > len) return -EINVAL; - if (srx->transport.family != rx->family) + if (srx->transport.family != rx->family && + srx->transport.family == AF_INET && rx->family != AF_INET6) return -EAFNOSUPPORT; switch (srx->transport.family) { diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 885dae829f4a..fb856a1ccda8 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -89,8 +89,9 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, if (rxrpc_extract_addr_from_skb(local, &srx, skb) < 0) goto not_found; - /* We may have to handle mixing IPv4 and IPv6 */ - if (srx.transport.family != local->srx.transport.family) { + if (srx.transport.family != local->srx.transport.family && + (srx.transport.family == AF_INET && + local->srx.transport.family != AF_INET6)) { pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n", srx.transport.family, local->srx.transport.family); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index f3e6fc670da2..81a7869325a6 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -47,6 +47,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, */ switch (srx->transport.family) { case AF_INET: + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.family = AF_INET; srx->transport.sin.sin_port = serr->port; switch (serr->ee.ee_origin) { case SO_EE_ORIGIN_ICMP: @@ -70,20 +72,20 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: - srx->transport.sin6.sin6_port = serr->port; switch (serr->ee.ee_origin) { case SO_EE_ORIGIN_ICMP6: _net("Rx ICMP6"); + srx->transport.sin6.sin6_port = serr->port; memcpy(&srx->transport.sin6.sin6_addr, skb_network_header(skb) + serr->addr_offset, sizeof(struct in6_addr)); break; case SO_EE_ORIGIN_ICMP: _net("Rx ICMP on v6 sock"); - srx->transport.sin6.sin6_addr.s6_addr32[0] = 0; - srx->transport.sin6.sin6_addr.s6_addr32[1] = 0; - srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff); - memcpy(srx->transport.sin6.sin6_addr.s6_addr + 12, + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.family = AF_INET; + srx->transport.sin.sin_port = serr->port; + memcpy(&srx->transport.sin.sin_addr, skb_network_header(skb) + serr->addr_offset, sizeof(struct in_addr)); break; diff --git a/net/rxrpc/utils.c b/net/rxrpc/utils.c index e801171fa351..017adaa54e90 100644 --- a/net/rxrpc/utils.c +++ b/net/rxrpc/utils.c @@ -25,20 +25,11 @@ int rxrpc_extract_addr_from_skb(struct rxrpc_local *local, switch (ntohs(skb->protocol)) { case ETH_P_IP: - if (local->srx.transport.family == AF_INET6) { - srx->transport_type = SOCK_DGRAM; - srx->transport_len = sizeof(srx->transport.sin6); - srx->transport.sin6.sin6_family = AF_INET6; - srx->transport.sin6.sin6_port = udp_hdr(skb)->source; - srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff); - srx->transport.sin6.sin6_addr.s6_addr32[3] = ip_hdr(skb)->saddr; - } else { - srx->transport_type = SOCK_DGRAM; - srx->transport_len = sizeof(srx->transport.sin); - srx->transport.sin.sin_family = AF_INET; - srx->transport.sin.sin_port = udp_hdr(skb)->source; - srx->transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; - } + srx->transport_type = SOCK_DGRAM; + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.sin.sin_family = AF_INET; + srx->transport.sin.sin_port = udp_hdr(skb)->source; + srx->transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; return 0; #ifdef CONFIG_AF_RXRPC_IPV6 -- cgit v1.2.3 From 5a790b7375414cffb0f7e8ab0f175d2e02a0af0e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 4 Oct 2018 09:32:28 +0100 Subject: rxrpc: Drop the local endpoint arg from rxrpc_extract_addr_from_skb() rxrpc_extract_addr_from_skb() doesn't use the argument that points to the local endpoint, so remove the argument. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 3 +-- net/rxrpc/call_accept.c | 2 +- net/rxrpc/conn_object.c | 2 +- net/rxrpc/local_event.c | 2 +- net/rxrpc/output.c | 2 +- net/rxrpc/utils.c | 4 +--- 6 files changed, 6 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 9ba87e4d15c7..76569c178915 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1111,8 +1111,7 @@ static inline void rxrpc_sysctl_exit(void) {} /* * utils.c */ -int rxrpc_extract_addr_from_skb(struct rxrpc_local *, struct sockaddr_rxrpc *, - struct sk_buff *); +int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); static inline bool before(u32 seq1, u32 seq2) { diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 9c7f26d06a52..8354cadbb839 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -280,7 +280,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, peer = NULL; if (!peer) { peer = b->peer_backlog[peer_tail]; - if (rxrpc_extract_addr_from_skb(local, &peer->srx, skb) < 0) + if (rxrpc_extract_addr_from_skb(&peer->srx, skb) < 0) return NULL; b->peer_backlog[peer_tail] = NULL; smp_store_release(&b->peer_backlog_tail, diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index fb856a1ccda8..c332722820c2 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -86,7 +86,7 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, _enter(",%x", sp->hdr.cid & RXRPC_CIDMASK); - if (rxrpc_extract_addr_from_skb(local, &srx, skb) < 0) + if (rxrpc_extract_addr_from_skb(&srx, skb) < 0) goto not_found; if (srx.transport.family != local->srx.transport.family && diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index 13bd8a4dfac7..927ead43df42 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -39,7 +39,7 @@ static void rxrpc_send_version_request(struct rxrpc_local *local, _enter(""); - if (rxrpc_extract_addr_from_skb(local, &srx, skb) < 0) + if (rxrpc_extract_addr_from_skb(&srx, skb) < 0) return; msg.msg_name = &srx.transport; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 993d4cd247f9..0f0b499d1202 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -561,7 +561,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) continue; } - if (rxrpc_extract_addr_from_skb(local, &srx, skb) == 0) { + if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) { msg.msg_namelen = srx.transport_len; whdr.epoch = htonl(sp->hdr.epoch); diff --git a/net/rxrpc/utils.c b/net/rxrpc/utils.c index 017adaa54e90..ff7af71c4b49 100644 --- a/net/rxrpc/utils.c +++ b/net/rxrpc/utils.c @@ -17,9 +17,7 @@ /* * Fill out a peer address from a socket buffer containing a packet. */ -int rxrpc_extract_addr_from_skb(struct rxrpc_local *local, - struct sockaddr_rxrpc *srx, - struct sk_buff *skb) +int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *srx, struct sk_buff *skb) { memset(srx, 0, sizeof(*srx)); -- cgit v1.2.3 From 2070a3e44962212d6ef02c5def821b1b9744e496 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 4 Oct 2018 09:42:29 +0100 Subject: rxrpc: Allow the reply time to be obtained on a client call Allow the timestamp on the sk_buff holding the first DATA packet of a reply to be queried. This can then be used as a base for the expiry time calculation on the callback promise duration indicated by an operation result. Signed-off-by: David Howells --- Documentation/networking/rxrpc.txt | 11 ++++++++++ include/net/af_rxrpc.h | 3 +++ net/rxrpc/recvmsg.c | 43 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+) (limited to 'net') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index b5407163d53b..67879992b4c2 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -1069,6 +1069,17 @@ The kernel interface functions are as follows: This function may transmit a PING ACK. + (*) Get reply timestamp. + + bool rxrpc_kernel_get_reply_time(struct socket *sock, + struct rxrpc_call *call, + ktime_t *_ts) + + This allows the timestamp on the first DATA packet of the reply of a + client call to be queried, provided that it is still in the Rx ring. If + successful, the timestamp will be stored into *_ts and true will be + returned; false will be returned otherwise. + ======================= CONFIGURABLE PARAMETERS diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index f53edb3754bc..c4c912554dee 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -13,6 +13,7 @@ #define _NET_RXRPC_H #include +#include struct key; struct sock; @@ -77,5 +78,7 @@ int rxrpc_kernel_retry_call(struct socket *, struct rxrpc_call *, int rxrpc_kernel_check_call(struct socket *, struct rxrpc_call *, enum rxrpc_call_completion *, u32 *); u32 rxrpc_kernel_check_life(struct socket *, struct rxrpc_call *); +bool rxrpc_kernel_get_reply_time(struct socket *, struct rxrpc_call *, + ktime_t *); #endif /* _NET_RXRPC_H */ diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 816b19a78809..eaf19ebaa964 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -715,3 +715,46 @@ call_complete: goto out; } EXPORT_SYMBOL(rxrpc_kernel_recv_data); + +/** + * rxrpc_kernel_get_reply_time - Get timestamp on first reply packet + * @sock: The socket that the call exists on + * @call: The call to query + * @_ts: Where to put the timestamp + * + * Retrieve the timestamp from the first DATA packet of the reply if it is + * in the ring. Returns true if successful, false if not. + */ +bool rxrpc_kernel_get_reply_time(struct socket *sock, struct rxrpc_call *call, + ktime_t *_ts) +{ + struct sk_buff *skb; + rxrpc_seq_t hard_ack, top, seq; + bool success = false; + + mutex_lock(&call->user_mutex); + + if (READ_ONCE(call->state) != RXRPC_CALL_CLIENT_RECV_REPLY) + goto out; + + hard_ack = call->rx_hard_ack; + if (hard_ack != 0) + goto out; + + seq = hard_ack + 1; + top = smp_load_acquire(&call->rx_top); + if (after(seq, top)) + goto out; + + skb = call->rxtx_buffer[seq & RXRPC_RXTX_BUFF_MASK]; + if (!skb) + goto out; + + *_ts = skb_get_ktime(skb); + success = true; + +out: + mutex_unlock(&call->user_mutex); + return success; +} +EXPORT_SYMBOL(rxrpc_kernel_get_reply_time); -- cgit v1.2.3 From e908bcf4f1a271e7c264dcbffc5881ced8bfacee Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 4 Oct 2018 09:54:29 +0100 Subject: rxrpc: Allow the reply time to be obtained on a client call Allow the epoch value to be queried on a server connection. This is in the rxrpc header of every packet for use in routing and is derived from the client's state. It's also not supposed to change unless the client gets restarted. AFS can make use of this information to deduce whether a fileserver has been restarted because the fileserver makes client calls to the filesystem driver's cache manager to send notifications (ie. callback breaks) about conflicting changes from other clients. These convey the fileserver's own epoch value back to the filesystem. Signed-off-by: David Howells --- Documentation/networking/rxrpc.txt | 14 ++++++++++++++ include/net/af_rxrpc.h | 1 + net/rxrpc/af_rxrpc.c | 14 ++++++++++++++ 3 files changed, 29 insertions(+) (limited to 'net') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index 67879992b4c2..605e00cdd6be 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -1080,6 +1080,20 @@ The kernel interface functions are as follows: successful, the timestamp will be stored into *_ts and true will be returned; false will be returned otherwise. + (*) Get remote client epoch. + + u32 rxrpc_kernel_get_epoch(struct socket *sock, + struct rxrpc_call *call) + + This allows the epoch that's contained in packets of an incoming client + call to be queried. This value is returned. The function always + successful if the call is still in progress. It shouldn't be called once + the call has expired. Note that calling this on a local client call only + returns the local epoch. + + This value can be used to determine if the remote client has been + restarted as it shouldn't change otherwise. + ======================= CONFIGURABLE PARAMETERS diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index c4c912554dee..de587948042a 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -78,6 +78,7 @@ int rxrpc_kernel_retry_call(struct socket *, struct rxrpc_call *, int rxrpc_kernel_check_call(struct socket *, struct rxrpc_call *, enum rxrpc_call_completion *, u32 *); u32 rxrpc_kernel_check_life(struct socket *, struct rxrpc_call *); +u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); bool rxrpc_kernel_get_reply_time(struct socket *, struct rxrpc_call *, ktime_t *); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 2fdd276f6842..013dbcb052e5 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -385,6 +385,20 @@ u32 rxrpc_kernel_check_life(struct socket *sock, struct rxrpc_call *call) } EXPORT_SYMBOL(rxrpc_kernel_check_life); +/** + * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call. + * @sock: The socket the call is on + * @call: The call to query + * + * Allow a kernel service to retrieve the epoch value from a service call to + * see if the client at the other end rebooted. + */ +u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call) +{ + return call->conn->proto.epoch; +} +EXPORT_SYMBOL(rxrpc_kernel_get_epoch); + /** * rxrpc_kernel_check_call - Check a call's state * @sock: The socket the call is on -- cgit v1.2.3 From bbb4c4323a4d9cb5ca04db904aa3050a7586839a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 4 Oct 2018 14:27:55 +0100 Subject: dns: Allow the dns resolver to retrieve a server set Allow the DNS resolver to retrieve a set of servers and their associated addresses, ports, preference and weight ratings. In terms of communication with userspace, "srv=1" is added to the callout string (the '1' indicating the maximum data version supported by the kernel) to ask the userspace side for this. If the userspace side doesn't recognise it, it will ignore the option and return the usual text address list. If the userspace side does recognise it, it will return some binary data that begins with a zero byte that would cause the string parsers to give an error. The second byte contains the version of the data in the blob (this may be between 1 and the version specified in the callout data). The remainder of the payload is version-specific. In version 1, the payload looks like (note that this is packed): u8 Non-string marker (ie. 0) u8 Content (0 => Server list) u8 Version (ie. 1) u8 Source (eg. DNS_RECORD_FROM_DNS_SRV) u8 Status (eg. DNS_LOOKUP_GOOD) u8 Number of servers foreach-server { u16 Name length (LE) u16 Priority (as per SRV record) (LE) u16 Weight (as per SRV record) (LE) u16 Port (LE) u8 Source (eg. DNS_RECORD_FROM_NSS) u8 Status (eg. DNS_LOOKUP_GOT_NOT_FOUND) u8 Protocol (eg. DNS_SERVER_PROTOCOL_UDP) u8 Number of addresses char[] Name (not NUL-terminated) foreach-address { u8 Family (AF_INET{,6}) union { u8[4] ipv4_addr u8[16] ipv6_addr } } } This can then be used to fetch a whole cell's VL-server configuration for AFS, for example. Signed-off-by: David Howells Signed-off-by: David S. Miller --- include/linux/dns_resolver.h | 4 +- include/uapi/linux/dns_resolver.h | 116 ++++++++++++++++++++++++++++++++++++++ net/dns_resolver/dns_key.c | 67 +++++++++++++++++++++- net/dns_resolver/dns_query.c | 5 +- 4 files changed, 182 insertions(+), 10 deletions(-) create mode 100644 include/uapi/linux/dns_resolver.h (limited to 'net') diff --git a/include/linux/dns_resolver.h b/include/linux/dns_resolver.h index 6ac3cad9aef1..34a744a1bafc 100644 --- a/include/linux/dns_resolver.h +++ b/include/linux/dns_resolver.h @@ -24,11 +24,9 @@ #ifndef _LINUX_DNS_RESOLVER_H #define _LINUX_DNS_RESOLVER_H -#ifdef __KERNEL__ +#include extern int dns_query(const char *type, const char *name, size_t namelen, const char *options, char **_result, time64_t *_expiry); -#endif /* KERNEL */ - #endif /* _LINUX_DNS_RESOLVER_H */ diff --git a/include/uapi/linux/dns_resolver.h b/include/uapi/linux/dns_resolver.h new file mode 100644 index 000000000000..129745f9c794 --- /dev/null +++ b/include/uapi/linux/dns_resolver.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* DNS resolver interface definitions. + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_DNS_RESOLVER_H +#define _UAPI_LINUX_DNS_RESOLVER_H + +#include + +/* + * Type of payload. + */ +enum dns_payload_content_type { + DNS_PAYLOAD_IS_SERVER_LIST = 0, /* List of servers, requested by srv=1 */ +}; + +/* + * Type of address that might be found in an address record. + */ +enum dns_payload_address_type { + DNS_ADDRESS_IS_IPV4 = 0, /* 4-byte AF_INET address */ + DNS_ADDRESS_IS_IPV6 = 1, /* 16-byte AF_INET6 address */ +}; + +/* + * Type of protocol used to access a server. + */ +enum dns_payload_protocol_type { + DNS_SERVER_PROTOCOL_UNSPECIFIED = 0, + DNS_SERVER_PROTOCOL_UDP = 1, /* Use UDP to talk to the server */ + DNS_SERVER_PROTOCOL_TCP = 2, /* Use TCP to talk to the server */ +}; + +/* + * Source of record included in DNS resolver payload. + */ +enum dns_record_source { + DNS_RECORD_UNAVAILABLE = 0, /* No source available (empty record) */ + DNS_RECORD_FROM_CONFIG = 1, /* From local configuration data */ + DNS_RECORD_FROM_DNS_A = 2, /* From DNS A or AAAA record */ + DNS_RECORD_FROM_DNS_AFSDB = 3, /* From DNS AFSDB record */ + DNS_RECORD_FROM_DNS_SRV = 4, /* From DNS SRV record */ + DNS_RECORD_FROM_NSS = 5, /* From NSS */ + NR__dns_record_source +}; + +/* + * Status of record included in DNS resolver payload. + */ +enum dns_lookup_status { + DNS_LOOKUP_NOT_DONE = 0, /* No lookup has been made */ + DNS_LOOKUP_GOOD = 1, /* Good records obtained */ + DNS_LOOKUP_GOOD_WITH_BAD = 2, /* Good records, some decoding errors */ + DNS_LOOKUP_BAD = 3, /* Couldn't decode results */ + DNS_LOOKUP_GOT_NOT_FOUND = 4, /* Got a "Not Found" result */ + DNS_LOOKUP_GOT_LOCAL_FAILURE = 5, /* Local failure during lookup */ + DNS_LOOKUP_GOT_TEMP_FAILURE = 6, /* Temporary failure during lookup */ + DNS_LOOKUP_GOT_NS_FAILURE = 7, /* Name server failure */ + NR__dns_lookup_status +}; + +/* + * Header at the beginning of binary format payload. + */ +struct dns_payload_header { + __u8 zero; /* Zero byte: marks this as not being text */ + __u8 content; /* enum dns_payload_content_type */ + __u8 version; /* Encoding version */ +} __packed; + +/* + * Header at the beginning of a V1 server list. This is followed directly by + * the server records. Each server records begins with a struct of type + * dns_server_list_v1_server. + */ +struct dns_server_list_v1_header { + struct dns_payload_header hdr; + __u8 source; /* enum dns_record_source */ + __u8 status; /* enum dns_lookup_status */ + __u8 nr_servers; /* Number of server records following this */ +} __packed; + +/* + * Header at the beginning of each V1 server record. This is followed by the + * characters of the name with no NUL-terminator, followed by the address + * records for that server. Each address record begins with a struct of type + * struct dns_server_list_v1_address. + */ +struct dns_server_list_v1_server { + __u16 name_len; /* Length of name (LE) */ + __u16 priority; /* Priority (as SRV record) (LE) */ + __u16 weight; /* Weight (as SRV record) (LE) */ + __u16 port; /* UDP/TCP port number (LE) */ + __u8 source; /* enum dns_record_source */ + __u8 status; /* enum dns_lookup_status */ + __u8 protocol; /* enum dns_payload_protocol_type */ + __u8 nr_addrs; +} __packed; + +/* + * Header at the beginning of each V1 address record. This is followed by the + * bytes of the address, 4 for IPV4 and 16 for IPV6. + */ +struct dns_server_list_v1_address { + __u8 address_type; /* enum dns_payload_address_type */ +} __packed; + +#endif /* _UAPI_LINUX_DNS_RESOLVER_H */ diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index 7f4534828f6c..a65d553e730d 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -48,27 +49,86 @@ const struct cred *dns_resolver_cache; /* * Preparse instantiation data for a dns_resolver key. * - * The data must be a NUL-terminated string, with the NUL char accounted in - * datalen. + * For normal hostname lookups, the data must be a NUL-terminated string, with + * the NUL char accounted in datalen. * * If the data contains a '#' characters, then we take the clause after each * one to be an option of the form 'key=value'. The actual data of interest is * the string leading up to the first '#'. For instance: * * "ip1,ip2,...#foo=bar" + * + * For server list requests, the data must begin with a NUL char and be + * followed by a byte indicating the version of the data format. Version 1 + * looks something like (note this is packed): + * + * u8 Non-string marker (ie. 0) + * u8 Content (DNS_PAYLOAD_IS_*) + * u8 Version (e.g. 1) + * u8 Source of server list + * u8 Lookup status of server list + * u8 Number of servers + * foreach-server { + * __le16 Name length + * __le16 Priority (as per SRV record, low first) + * __le16 Weight (as per SRV record, higher first) + * __le16 Port + * u8 Source of address list + * u8 Lookup status of address list + * u8 Protocol (DNS_SERVER_PROTOCOL_*) + * u8 Number of addresses + * char[] Name (not NUL-terminated) + * foreach-address { + * u8 Family (DNS_ADDRESS_IS_*) + * union { + * u8[4] ipv4_addr + * u8[16] ipv6_addr + * } + * } + * } + * */ static int dns_resolver_preparse(struct key_preparsed_payload *prep) { + const struct dns_payload_header *bin; struct user_key_payload *upayload; unsigned long derrno; int ret; int datalen = prep->datalen, result_len = 0; const char *data = prep->data, *end, *opt; + if (datalen <= 1 || !data) + return -EINVAL; + + if (data[0] == 0) { + /* It may be a server list. */ + if (datalen <= sizeof(*bin)) + return -EINVAL; + + bin = (const struct dns_payload_header *)data; + kenter("[%u,%u],%u", bin->content, bin->version, datalen); + if (bin->content != DNS_PAYLOAD_IS_SERVER_LIST) { + pr_warn_ratelimited( + "dns_resolver: Unsupported content type (%u)\n", + bin->content); + return -EINVAL; + } + + if (bin->version != 1) { + pr_warn_ratelimited( + "dns_resolver: Unsupported server list version (%u)\n", + bin->version); + return -EINVAL; + } + + result_len = datalen; + goto store_result; + } + kenter("'%*.*s',%u", datalen, datalen, data, datalen); - if (datalen <= 1 || !data || data[datalen - 1] != '\0') + if (!data || data[datalen - 1] != '\0') return -EINVAL; datalen--; @@ -144,6 +204,7 @@ dns_resolver_preparse(struct key_preparsed_payload *prep) return 0; } +store_result: kdebug("store result"); prep->quotalen = result_len; diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index 49da67034f29..76338c38738a 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -148,12 +148,9 @@ int dns_query(const char *type, const char *name, size_t namelen, if (_result) { ret = -ENOMEM; - *_result = kmalloc(len + 1, GFP_KERNEL); + *_result = kmemdup_nul(upayload->data, len, GFP_KERNEL); if (!*_result) goto put; - - memcpy(*_result, upayload->data, len); - (*_result)[len] = '\0'; } if (_expiry) -- cgit v1.2.3 From e3b5106162a3f73c7633ae6051fbf244584ab584 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Thu, 4 Oct 2018 11:13:44 +0530 Subject: devlink: Add generic parameter ignore_ari ignore_ari - Device ignores ARI(Alternate Routing ID) capability, even when platforms has the support and creates same number of partitions when platform does not support ARI capability. Cc: Jiri Pirko Cc: Michael Chan Signed-off-by: Vasundhara Volam Signed-off-by: David S. Miller --- include/net/devlink.h | 4 ++++ net/core/devlink.c | 5 +++++ 2 files changed, 9 insertions(+) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 70671f0d4c30..ae28ccbd6843 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -362,6 +362,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_MAX_MACS, DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, + DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -380,6 +381,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME "region_snapshot_enable" #define DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE DEVLINK_PARAM_TYPE_BOOL +#define DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME "ignore_ari" +#define DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE DEVLINK_PARAM_TYPE_BOOL + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/core/devlink.c b/net/core/devlink.c index de6adad7ccbe..4a72fede11b4 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2677,6 +2677,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME, .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, + .name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME, + .type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3 From f61cba4291c06c201b1b855a341b036caefdc2d6 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Thu, 4 Oct 2018 11:13:45 +0530 Subject: devlink: Add generic parameter msix_vec_per_pf_max msix_vec_per_pf_max - This param sets the number of MSIX vectors that the device requests from the host on driver initialization. This value is set in the device which is applicable per PF. Cc: Jiri Pirko Cc: Michael Chan Signed-off-by: Vasundhara Volam Signed-off-by: David S. Miller --- include/net/devlink.h | 4 ++++ net/core/devlink.c | 5 +++++ 2 files changed, 9 insertions(+) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index ae28ccbd6843..c9b08b49957c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -363,6 +363,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, + DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -384,6 +385,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME "ignore_ari" #define DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE DEVLINK_PARAM_TYPE_BOOL +#define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME "msix_vec_per_pf_max" +#define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE DEVLINK_PARAM_TYPE_U32 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/core/devlink.c b/net/core/devlink.c index 4a72fede11b4..d7501a588ad2 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2682,6 +2682,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME, .type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, + .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME, + .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3 From 16511789b9cc0a946611b1f9575b7a5b2b566301 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Thu, 4 Oct 2018 11:13:46 +0530 Subject: devlink: Add generic parameter msix_vec_per_pf_min msix_vec_per_pf_min - This param sets the number of minimal MSIX vectors required for the device initialization. This value is set in the device which limits MSIX vectors per PF. Cc: Jiri Pirko Cc: Michael Chan Signed-off-by: Vasundhara Volam Signed-off-by: David S. Miller --- include/net/devlink.h | 4 ++++ net/core/devlink.c | 5 +++++ 2 files changed, 9 insertions(+) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index c9b08b49957c..9a70755ad1c2 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -364,6 +364,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, + DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -388,6 +389,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME "msix_vec_per_pf_max" #define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE DEVLINK_PARAM_TYPE_U32 +#define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME "msix_vec_per_pf_min" +#define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE DEVLINK_PARAM_TYPE_U32 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/core/devlink.c b/net/core/devlink.c index d7501a588ad2..938f68ee92f0 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2687,6 +2687,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME, .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, + .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME, + .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3 From 5a781ccbd19e4664babcbe4b4ead7aa2b9283d22 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Fri, 28 Sep 2018 17:59:43 -0700 Subject: tc: Add support for configuring the taprio scheduler This traffic scheduler allows traffic classes states (transmission allowed/not allowed, in the simplest case) to be scheduled, according to a pre-generated time sequence. This is the basis of the IEEE 802.1Qbv specification. Example configuration: tc qdisc replace dev enp3s0 parent root handle 100 taprio \ num_tc 3 \ map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \ queues 1@0 1@1 2@2 \ base-time 1528743495910289987 \ sched-entry S 01 300000 \ sched-entry S 02 300000 \ sched-entry S 04 300000 \ clockid CLOCK_TAI The configuration format is similar to mqprio. The main difference is the presence of a schedule, built by multiple "sched-entry" definitions, each entry has the following format: sched-entry The only supported is "S", which means "SetGateStates", following the IEEE 802.1Qbv-2015 definition (Table 8-6). is a bitmask where each bit is a associated with a traffic class, so bit 0 (the least significant bit) being "on" means that traffic class 0 is "active" for that schedule entry. is a time duration in nanoseconds that specifies for how long that state defined by and should be held before moving to the next entry. This schedule is circular, that is, after the last entry is executed it starts from the first one, indefinitely. The other parameters can be defined as follows: - base-time: specifies the instant when the schedule starts, if 'base-time' is a time in the past, the schedule will start at base-time + (N * cycle-time) where N is the smallest integer so the resulting time is greater than "now", and "cycle-time" is the sum of all the intervals of the entries in the schedule; - clockid: specifies the reference clock to be used; The parameters should be similar to what the IEEE 802.1Q family of specification defines. Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 46 ++ net/sched/Kconfig | 11 + net/sched/Makefile | 1 + net/sched/sch_taprio.c | 962 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1020 insertions(+) create mode 100644 net/sched/sch_taprio.c (limited to 'net') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index e9b7244ac381..89ee47c2f17d 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1084,4 +1084,50 @@ enum { CAKE_ATM_MAX }; + +/* TAPRIO */ +enum { + TC_TAPRIO_CMD_SET_GATES = 0x00, + TC_TAPRIO_CMD_SET_AND_HOLD = 0x01, + TC_TAPRIO_CMD_SET_AND_RELEASE = 0x02, +}; + +enum { + TCA_TAPRIO_SCHED_ENTRY_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY_INDEX, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_CMD, /* u8 */ + TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_INTERVAL, /* u32 */ + __TCA_TAPRIO_SCHED_ENTRY_MAX, +}; +#define TCA_TAPRIO_SCHED_ENTRY_MAX (__TCA_TAPRIO_SCHED_ENTRY_MAX - 1) + +/* The format for schedule entry list is: + * [TCA_TAPRIO_SCHED_ENTRY_LIST] + * [TCA_TAPRIO_SCHED_ENTRY] + * [TCA_TAPRIO_SCHED_ENTRY_CMD] + * [TCA_TAPRIO_SCHED_ENTRY_GATES] + * [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] + */ +enum { + TCA_TAPRIO_SCHED_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY, + __TCA_TAPRIO_SCHED_MAX, +}; + +#define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1) + +enum { + TCA_TAPRIO_ATTR_UNSPEC, + TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */ + TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST, /* nested of entry */ + TCA_TAPRIO_ATTR_SCHED_BASE_TIME, /* s64 */ + TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */ + TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */ + TCA_TAPRIO_PAD, + __TCA_TAPRIO_ATTR_MAX, +}; + +#define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1) + #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index e95741388311..1b9afdee5ba9 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -194,6 +194,17 @@ config NET_SCH_ETF To compile this code as a module, choose M here: the module will be called sch_etf. +config NET_SCH_TAPRIO + tristate "Time Aware Priority (taprio) Scheduler" + help + Say Y here if you want to use the Time Aware Priority (taprio) packet + scheduling algorithm. + + See the top of for more details. + + To compile this code as a module, choose M here: the + module will be called sch_taprio. + config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index f0403f49edcb..8a40431d7b5c 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -57,6 +57,7 @@ obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o +obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c new file mode 100644 index 000000000000..206e4dbed12f --- /dev/null +++ b/net/sched/sch_taprio.c @@ -0,0 +1,962 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* net/sched/sch_taprio.c Time Aware Priority Scheduler + * + * Authors: Vinicius Costa Gomes + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TAPRIO_ALL_GATES_OPEN -1 + +struct sched_entry { + struct list_head list; + + /* The instant that this entry "closes" and the next one + * should open, the qdisc will make some effort so that no + * packet leaves after this time. + */ + ktime_t close_time; + atomic_t budget; + int index; + u32 gate_mask; + u32 interval; + u8 command; +}; + +struct taprio_sched { + struct Qdisc **qdiscs; + struct Qdisc *root; + s64 base_time; + int clockid; + int picos_per_byte; /* Using picoseconds because for 10Gbps+ + * speeds it's sub-nanoseconds per byte + */ + size_t num_entries; + + /* Protects the update side of the RCU protected current_entry */ + spinlock_t current_entry_lock; + struct sched_entry __rcu *current_entry; + struct list_head entries; + ktime_t (*get_time)(void); + struct hrtimer advance_timer; +}; + +static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct Qdisc *child; + int queue; + + queue = skb_get_queue_mapping(skb); + + child = q->qdiscs[queue]; + if (unlikely(!child)) + return qdisc_drop(skb, sch, to_free); + + qdisc_qstats_backlog_inc(sch, skb); + sch->q.qlen++; + + return qdisc_enqueue(skb, child, to_free); +} + +static struct sk_buff *taprio_peek(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct sched_entry *entry; + struct sk_buff *skb; + u32 gate_mask; + int i; + + rcu_read_lock(); + entry = rcu_dereference(q->current_entry); + gate_mask = entry ? entry->gate_mask : -1; + rcu_read_unlock(); + + if (!gate_mask) + return NULL; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct Qdisc *child = q->qdiscs[i]; + int prio; + u8 tc; + + if (unlikely(!child)) + continue; + + skb = child->ops->peek(child); + if (!skb) + continue; + + prio = skb->priority; + tc = netdev_get_prio_tc_map(dev, prio); + + if (!(gate_mask & BIT(tc))) + return NULL; + + return skb; + } + + return NULL; +} + +static inline int length_to_duration(struct taprio_sched *q, int len) +{ + return (len * q->picos_per_byte) / 1000; +} + +static struct sk_buff *taprio_dequeue(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct sched_entry *entry; + struct sk_buff *skb; + u32 gate_mask; + int i; + + rcu_read_lock(); + entry = rcu_dereference(q->current_entry); + /* if there's no entry, it means that the schedule didn't + * start yet, so force all gates to be open, this is in + * accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5 + * "AdminGateSates" + */ + gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; + rcu_read_unlock(); + + if (!gate_mask) + return NULL; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct Qdisc *child = q->qdiscs[i]; + ktime_t guard; + int prio; + int len; + u8 tc; + + if (unlikely(!child)) + continue; + + skb = child->ops->peek(child); + if (!skb) + continue; + + prio = skb->priority; + tc = netdev_get_prio_tc_map(dev, prio); + + if (!(gate_mask & BIT(tc))) + continue; + + len = qdisc_pkt_len(skb); + guard = ktime_add_ns(q->get_time(), + length_to_duration(q, len)); + + /* In the case that there's no gate entry, there's no + * guard band ... + */ + if (gate_mask != TAPRIO_ALL_GATES_OPEN && + ktime_after(guard, entry->close_time)) + return NULL; + + /* ... and no budget. */ + if (gate_mask != TAPRIO_ALL_GATES_OPEN && + atomic_sub_return(len, &entry->budget) < 0) + return NULL; + + skb = child->ops->dequeue(child); + if (unlikely(!skb)) + return NULL; + + qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; + + return skb; + } + + return NULL; +} + +static bool should_restart_cycle(const struct taprio_sched *q, + const struct sched_entry *entry) +{ + WARN_ON(!entry); + + return list_is_last(&entry->list, &q->entries); +} + +static enum hrtimer_restart advance_sched(struct hrtimer *timer) +{ + struct taprio_sched *q = container_of(timer, struct taprio_sched, + advance_timer); + struct sched_entry *entry, *next; + struct Qdisc *sch = q->root; + ktime_t close_time; + + spin_lock(&q->current_entry_lock); + entry = rcu_dereference_protected(q->current_entry, + lockdep_is_held(&q->current_entry_lock)); + + /* This is the case that it's the first time that the schedule + * runs, so it only happens once per schedule. The first entry + * is pre-calculated during the schedule initialization. + */ + if (unlikely(!entry)) { + next = list_first_entry(&q->entries, struct sched_entry, + list); + close_time = next->close_time; + goto first_run; + } + + if (should_restart_cycle(q, entry)) + next = list_first_entry(&q->entries, struct sched_entry, + list); + else + next = list_next_entry(entry, list); + + close_time = ktime_add_ns(entry->close_time, next->interval); + + next->close_time = close_time; + atomic_set(&next->budget, + (next->interval * 1000) / q->picos_per_byte); + +first_run: + rcu_assign_pointer(q->current_entry, next); + spin_unlock(&q->current_entry_lock); + + hrtimer_set_expires(&q->advance_timer, close_time); + + rcu_read_lock(); + __netif_schedule(sch); + rcu_read_unlock(); + + return HRTIMER_RESTART; +} + +static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { + [TCA_TAPRIO_SCHED_ENTRY_INDEX] = { .type = NLA_U32 }, + [TCA_TAPRIO_SCHED_ENTRY_CMD] = { .type = NLA_U8 }, + [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 }, + [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 }, +}; + +static const struct nla_policy entry_list_policy[TCA_TAPRIO_SCHED_MAX + 1] = { + [TCA_TAPRIO_SCHED_ENTRY] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { + [TCA_TAPRIO_ATTR_PRIOMAP] = { + .len = sizeof(struct tc_mqprio_qopt) + }, + [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED }, + [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 }, + [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED }, + [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, +}; + +static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry, + struct netlink_ext_ack *extack) +{ + u32 interval = 0; + + if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD]) + entry->command = nla_get_u8( + tb[TCA_TAPRIO_SCHED_ENTRY_CMD]); + + if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]) + entry->gate_mask = nla_get_u32( + tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]); + + if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]) + interval = nla_get_u32( + tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]); + + if (interval == 0) { + NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); + return -EINVAL; + } + + entry->interval = interval; + + return 0; +} + +static int parse_sched_entry(struct nlattr *n, struct sched_entry *entry, + int index, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; + int err; + + err = nla_parse_nested(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n, + entry_policy, NULL); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Could not parse nested entry"); + return -EINVAL; + } + + entry->index = index; + + return fill_sched_entry(tb, entry, extack); +} + +/* Returns the number of entries in case of success */ +static int parse_sched_single_entry(struct nlattr *n, + struct taprio_sched *q, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb_entry[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; + struct nlattr *tb_list[TCA_TAPRIO_SCHED_MAX + 1] = { }; + struct sched_entry *entry; + bool found = false; + u32 index; + int err; + + err = nla_parse_nested(tb_list, TCA_TAPRIO_SCHED_MAX, + n, entry_list_policy, NULL); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Could not parse nested entry"); + return -EINVAL; + } + + if (!tb_list[TCA_TAPRIO_SCHED_ENTRY]) { + NL_SET_ERR_MSG(extack, "Single-entry must include an entry"); + return -EINVAL; + } + + err = nla_parse_nested(tb_entry, TCA_TAPRIO_SCHED_ENTRY_MAX, + tb_list[TCA_TAPRIO_SCHED_ENTRY], + entry_policy, NULL); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Could not parse nested entry"); + return -EINVAL; + } + + if (!tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]) { + NL_SET_ERR_MSG(extack, "Entry must specify an index\n"); + return -EINVAL; + } + + index = nla_get_u32(tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]); + if (index >= q->num_entries) { + NL_SET_ERR_MSG(extack, "Index for single entry exceeds number of entries in schedule"); + return -EINVAL; + } + + list_for_each_entry(entry, &q->entries, list) { + if (entry->index == index) { + found = true; + break; + } + } + + if (!found) { + NL_SET_ERR_MSG(extack, "Could not find entry"); + return -ENOENT; + } + + err = fill_sched_entry(tb_entry, entry, extack); + if (err < 0) + return err; + + return q->num_entries; +} + +static int parse_sched_list(struct nlattr *list, + struct taprio_sched *q, + struct netlink_ext_ack *extack) +{ + struct nlattr *n; + int err, rem; + int i = 0; + + if (!list) + return -EINVAL; + + nla_for_each_nested(n, list, rem) { + struct sched_entry *entry; + + if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) { + NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'"); + continue; + } + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + NL_SET_ERR_MSG(extack, "Not enough memory for entry"); + return -ENOMEM; + } + + err = parse_sched_entry(n, entry, i, extack); + if (err < 0) { + kfree(entry); + return err; + } + + list_add_tail(&entry->list, &q->entries); + i++; + } + + q->num_entries = i; + + return i; +} + +/* Returns the number of entries in case of success */ +static int parse_taprio_opt(struct nlattr **tb, struct taprio_sched *q, + struct netlink_ext_ack *extack) +{ + int err = 0; + int clockid; + + if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] && + tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) + return -EINVAL; + + if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] && q->num_entries == 0) + return -EINVAL; + + if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) + return -EINVAL; + + if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) + q->base_time = nla_get_s64( + tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); + + if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { + clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); + + /* We only support static clockids and we don't allow + * for it to be modified after the first init. + */ + if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid)) + return -EINVAL; + + q->clockid = clockid; + } + + if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]) + err = parse_sched_list( + tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], q, extack); + else if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) + err = parse_sched_single_entry( + tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY], q, extack); + + /* parse_sched_* return the number of entries in the schedule, + * a schedule with zero entries is an error. + */ + if (err == 0) { + NL_SET_ERR_MSG(extack, "The schedule should contain at least one entry"); + return -EINVAL; + } + + return err; +} + +static int taprio_parse_mqprio_opt(struct net_device *dev, + struct tc_mqprio_qopt *qopt, + struct netlink_ext_ack *extack) +{ + int i, j; + + if (!qopt) { + NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); + return -EINVAL; + } + + /* Verify num_tc is not out of max range */ + if (qopt->num_tc > TC_MAX_QUEUE) { + NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range"); + return -EINVAL; + } + + /* taprio imposes that traffic classes map 1:n to tx queues */ + if (qopt->num_tc > dev->num_tx_queues) { + NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues"); + return -EINVAL; + } + + /* Verify priority mapping uses valid tcs */ + for (i = 0; i < TC_BITMASK + 1; i++) { + if (qopt->prio_tc_map[i] >= qopt->num_tc) { + NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping"); + return -EINVAL; + } + } + + for (i = 0; i < qopt->num_tc; i++) { + unsigned int last = qopt->offset[i] + qopt->count[i]; + + /* Verify the queue count is in tx range being equal to the + * real_num_tx_queues indicates the last queue is in use. + */ + if (qopt->offset[i] >= dev->num_tx_queues || + !qopt->count[i] || + last > dev->real_num_tx_queues) { + NL_SET_ERR_MSG(extack, "Invalid queue in traffic class to queue mapping"); + return -EINVAL; + } + + /* Verify that the offset and counts do not overlap */ + for (j = i + 1; j < qopt->num_tc; j++) { + if (last > qopt->offset[j]) { + NL_SET_ERR_MSG(extack, "Detected overlap in the traffic class to queue mapping"); + return -EINVAL; + } + } + } + + return 0; +} + +static ktime_t taprio_get_start_time(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct sched_entry *entry; + ktime_t now, base, cycle; + s64 n; + + base = ns_to_ktime(q->base_time); + cycle = 0; + + /* Calculate the cycle_time, by summing all the intervals. + */ + list_for_each_entry(entry, &q->entries, list) + cycle = ktime_add_ns(cycle, entry->interval); + + if (!cycle) + return base; + + now = q->get_time(); + + if (ktime_after(base, now)) + return base; + + /* Schedule the start time for the beginning of the next + * cycle. + */ + n = div64_s64(ktime_sub_ns(now, base), cycle); + + return ktime_add_ns(base, (n + 1) * cycle); +} + +static void taprio_start_sched(struct Qdisc *sch, ktime_t start) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct sched_entry *first; + unsigned long flags; + + spin_lock_irqsave(&q->current_entry_lock, flags); + + first = list_first_entry(&q->entries, struct sched_entry, + list); + + first->close_time = ktime_add_ns(start, first->interval); + atomic_set(&first->budget, + (first->interval * 1000) / q->picos_per_byte); + rcu_assign_pointer(q->current_entry, NULL); + + spin_unlock_irqrestore(&q->current_entry_lock, flags); + + hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS); +} + +static int taprio_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { }; + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_mqprio_qopt *mqprio = NULL; + struct ethtool_link_ksettings ecmd; + int i, err, size; + s64 link_speed; + ktime_t start; + + err = nla_parse_nested(tb, TCA_TAPRIO_ATTR_MAX, opt, + taprio_policy, extack); + if (err < 0) + return err; + + err = -EINVAL; + if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) + mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); + + err = taprio_parse_mqprio_opt(dev, mqprio, extack); + if (err < 0) + return err; + + /* A schedule with less than one entry is an error */ + size = parse_taprio_opt(tb, q, extack); + if (size < 0) + return size; + + hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); + q->advance_timer.function = advance_sched; + + switch (q->clockid) { + case CLOCK_REALTIME: + q->get_time = ktime_get_real; + break; + case CLOCK_MONOTONIC: + q->get_time = ktime_get; + break; + case CLOCK_BOOTTIME: + q->get_time = ktime_get_boottime; + break; + case CLOCK_TAI: + q->get_time = ktime_get_clocktai; + break; + default: + return -ENOTSUPP; + } + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *dev_queue; + struct Qdisc *qdisc; + + dev_queue = netdev_get_tx_queue(dev, i); + qdisc = qdisc_create_dflt(dev_queue, + &pfifo_qdisc_ops, + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(i + 1)), + extack); + if (!qdisc) + return -ENOMEM; + + if (i < dev->real_num_tx_queues) + qdisc_hash_add(qdisc, false); + + q->qdiscs[i] = qdisc; + } + + if (mqprio) { + netdev_set_num_tc(dev, mqprio->num_tc); + for (i = 0; i < mqprio->num_tc; i++) + netdev_set_tc_queue(dev, i, + mqprio->count[i], + mqprio->offset[i]); + + /* Always use supplied priority mappings */ + for (i = 0; i < TC_BITMASK + 1; i++) + netdev_set_prio_tc_map(dev, i, + mqprio->prio_tc_map[i]); + } + + if (!__ethtool_get_link_ksettings(dev, &ecmd)) + link_speed = ecmd.base.speed; + else + link_speed = SPEED_1000; + + q->picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8, + link_speed * 1000 * 1000); + + start = taprio_get_start_time(sch); + if (!start) + return 0; + + taprio_start_sched(sch, start); + + return 0; +} + +static void taprio_destroy(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct sched_entry *entry, *n; + unsigned int i; + + hrtimer_cancel(&q->advance_timer); + + if (q->qdiscs) { + for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++) + qdisc_put(q->qdiscs[i]); + + kfree(q->qdiscs); + } + q->qdiscs = NULL; + + netdev_set_num_tc(dev, 0); + + list_for_each_entry_safe(entry, n, &q->entries, list) { + list_del(&entry->list); + kfree(entry); + } +} + +static int taprio_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + + INIT_LIST_HEAD(&q->entries); + spin_lock_init(&q->current_entry_lock); + + /* We may overwrite the configuration later */ + hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS); + + q->root = sch; + + /* We only support static clockids. Use an invalid value as default + * and get the valid one on taprio_change(). + */ + q->clockid = -1; + + if (sch->parent != TC_H_ROOT) + return -EOPNOTSUPP; + + if (!netif_is_multiqueue(dev)) + return -EOPNOTSUPP; + + /* pre-allocate qdisc, attachment can't fail */ + q->qdiscs = kcalloc(dev->num_tx_queues, + sizeof(q->qdiscs[0]), + GFP_KERNEL); + + if (!q->qdiscs) + return -ENOMEM; + + if (!opt) + return -EINVAL; + + return taprio_change(sch, opt, extack); +} + +static struct netdev_queue *taprio_queue_get(struct Qdisc *sch, + unsigned long cl) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned long ntx = cl - 1; + + if (ntx >= dev->num_tx_queues) + return NULL; + + return netdev_get_tx_queue(dev, ntx); +} + +static int taprio_graft(struct Qdisc *sch, unsigned long cl, + struct Qdisc *new, struct Qdisc **old, + struct netlink_ext_ack *extack) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + + if (!dev_queue) + return -EINVAL; + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + *old = q->qdiscs[cl - 1]; + q->qdiscs[cl - 1] = new; + + if (new) + new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; + + if (dev->flags & IFF_UP) + dev_activate(dev); + + return 0; +} + +static int dump_entry(struct sk_buff *msg, + const struct sched_entry *entry) +{ + struct nlattr *item; + + item = nla_nest_start(msg, TCA_TAPRIO_SCHED_ENTRY); + if (!item) + return -ENOSPC; + + if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index)) + goto nla_put_failure; + + if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command)) + goto nla_put_failure; + + if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, + entry->gate_mask)) + goto nla_put_failure; + + if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL, + entry->interval)) + goto nla_put_failure; + + return nla_nest_end(msg, item); + +nla_put_failure: + nla_nest_cancel(msg, item); + return -1; +} + +static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_mqprio_qopt opt = { 0 }; + struct nlattr *nest, *entry_list; + struct sched_entry *entry; + unsigned int i; + + opt.num_tc = netdev_get_num_tc(dev); + memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); + + for (i = 0; i < netdev_get_num_tc(dev); i++) { + opt.count[i] = dev->tc_to_txq[i].count; + opt.offset[i] = dev->tc_to_txq[i].offset; + } + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + return -ENOSPC; + + if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt)) + goto options_error; + + if (nla_put_s64(skb, TCA_TAPRIO_ATTR_SCHED_BASE_TIME, + q->base_time, TCA_TAPRIO_PAD)) + goto options_error; + + if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) + goto options_error; + + entry_list = nla_nest_start(skb, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST); + if (!entry_list) + goto options_error; + + list_for_each_entry(entry, &q->entries, list) { + if (dump_entry(skb, entry) < 0) + goto options_error; + } + + nla_nest_end(skb, entry_list); + + return nla_nest_end(skb, nest); + +options_error: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) +{ + struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + + if (!dev_queue) + return NULL; + + return dev_queue->qdisc_sleeping; +} + +static unsigned long taprio_find(struct Qdisc *sch, u32 classid) +{ + unsigned int ntx = TC_H_MIN(classid); + + if (!taprio_queue_get(sch, ntx)) + return 0; + return ntx; +} + +static int taprio_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle |= TC_H_MIN(cl); + tcm->tcm_info = dev_queue->qdisc_sleeping->handle; + + return 0; +} + +static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) + __releases(d->lock) + __acquires(d->lock) +{ + struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + + sch = dev_queue->qdisc_sleeping; + if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || + gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0) + return -1; + return 0; +} + +static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned long ntx; + + if (arg->stop) + return; + + arg->count = arg->skip; + for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { + if (arg->fn(sch, ntx + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct netdev_queue *taprio_select_queue(struct Qdisc *sch, + struct tcmsg *tcm) +{ + return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); +} + +static const struct Qdisc_class_ops taprio_class_ops = { + .graft = taprio_graft, + .leaf = taprio_leaf, + .find = taprio_find, + .walk = taprio_walk, + .dump = taprio_dump_class, + .dump_stats = taprio_dump_class_stats, + .select_queue = taprio_select_queue, +}; + +static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { + .cl_ops = &taprio_class_ops, + .id = "taprio", + .priv_size = sizeof(struct taprio_sched), + .init = taprio_init, + .destroy = taprio_destroy, + .peek = taprio_peek, + .dequeue = taprio_dequeue, + .enqueue = taprio_enqueue, + .dump = taprio_dump, + .owner = THIS_MODULE, +}; + +static int __init taprio_module_init(void) +{ + return register_qdisc(&taprio_qdisc_ops); +} + +static void __exit taprio_module_exit(void) +{ + unregister_qdisc(&taprio_qdisc_ops); +} + +module_init(taprio_module_init); +module_exit(taprio_module_exit); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 17c357efe5eceebdc3971a48b3d4d61a03c1178b Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Fri, 28 Sep 2018 14:51:28 -0300 Subject: openvswitch: load NAT helper Load the respective NAT helper module if the flow uses it. Signed-off-by: Flavio Leitner Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 0aeb34c6389d..35ae64cbef33 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1312,6 +1312,10 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, rcu_assign_pointer(help->helper, helper); info->helper = helper; + + if (info->nat) + request_module("ip_nat_%s", name); + return 0; } -- cgit v1.2.3 From 767a2217533fed696af0d06bee7746d34c4e00aa Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 4 Oct 2018 20:07:51 -0700 Subject: net: common metrics init helper for FIB entries Consolidate initialization of ipv4 and ipv6 metrics when fib entries are created into a single helper, ip_fib_metrics_init, that handles the call to ip_metrics_convert. If no metrics are defined for the fib entry, then the metrics is set to dst_default_metrics. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip.h | 4 ++-- net/ipv4/fib_semantics.c | 27 +++++++-------------------- net/ipv4/metrics.c | 30 +++++++++++++++++++++++++++--- net/ipv6/ip6_fib.c | 2 -- net/ipv6/route.c | 29 +++++++---------------------- 5 files changed, 43 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index e44b1a44f67a..8cbe7e8c9e1e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -420,8 +420,8 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk, return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); } -int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, - u32 *metrics); +struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, + int fc_mx_len); u32 ip_idents_reserve(u32 hash, int segs); void __ip_select_ident(struct net *net, struct iphdr *iph, int segs); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index bee8db979195..e8f4add597ed 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1020,13 +1020,6 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) return true; } -static int -fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) -{ - return ip_metrics_convert(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len, - fi->fib_metrics->metrics); -} - struct fib_info *fib_create_info(struct fib_config *cfg, struct netlink_ext_ack *extack) { @@ -1084,16 +1077,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg, fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); if (!fi) goto failure; - if (cfg->fc_mx) { - fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); - if (unlikely(!fi->fib_metrics)) { - kfree(fi); - return ERR_PTR(err); - } - refcount_set(&fi->fib_metrics->refcnt, 1); - } else { - fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; + fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx, + cfg->fc_mx_len); + if (unlikely(IS_ERR(fi->fib_metrics))) { + err = PTR_ERR(fi->fib_metrics); + kfree(fi); + return ERR_PTR(err); } + fib_info_cnt++; fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; @@ -1112,10 +1103,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto failure; } endfor_nexthops(fi) - err = fib_convert_metrics(fi, cfg); - if (err) - goto failure; - if (cfg->fc_mp) { #ifdef CONFIG_IP_ROUTE_MULTIPATH err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 04311f7067e2..6d218f5a2e71 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -5,8 +5,8 @@ #include #include -int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, - u32 *metrics) +static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, + int fc_mx_len, u32 *metrics) { bool ecn_ca = false; struct nlattr *nla; @@ -52,4 +52,28 @@ int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, return 0; } -EXPORT_SYMBOL_GPL(ip_metrics_convert); + +struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, + int fc_mx_len) +{ + struct dst_metrics *fib_metrics; + int err; + + if (!fc_mx) + return (struct dst_metrics *)&dst_default_metrics; + + fib_metrics = kzalloc(sizeof(*fib_metrics), GFP_KERNEL); + if (unlikely(!fib_metrics)) + return ERR_PTR(-ENOMEM); + + err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics); + if (!err) { + refcount_set(&fib_metrics->refcnt, 1); + } else { + kfree(fib_metrics); + fib_metrics = ERR_PTR(err); + } + + return fib_metrics; +} +EXPORT_SYMBOL_GPL(ip_fib_metrics_init); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5516f55e214b..de063780a175 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -160,8 +160,6 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) } INIT_LIST_HEAD(&f6i->fib6_siblings); - f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; - atomic_inc(&f6i->fib6_ref); return f6i; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3adf107b42d2..b62b7aa53bbe 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2705,24 +2705,6 @@ out: return entries > rt_max_size; } -static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, - struct fib6_config *cfg) -{ - struct dst_metrics *p; - - if (!cfg->fc_mx) - return 0; - - p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL); - if (unlikely(!p)) - return -ENOMEM; - - refcount_set(&p->refcnt, 1); - rt->fib6_metrics = p; - - return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics); -} - static struct rt6_info *ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, const struct in6_addr *gw_addr, @@ -2998,13 +2980,15 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (!rt) goto out; + rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len); + if (IS_ERR(rt->fib6_metrics)) { + err = PTR_ERR(rt->fib6_metrics); + goto out; + } + if (cfg->fc_flags & RTF_ADDRCONF) rt->dst_nocount = true; - err = ip6_convert_metrics(net, rt, cfg); - if (err < 0) - goto out; - if (cfg->fc_flags & RTF_EXPIRES) fib6_set_expires(rt, jiffies + clock_t_to_jiffies(cfg->fc_expires)); @@ -3727,6 +3711,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, if (!f6i) return ERR_PTR(-ENOMEM); + f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0); f6i->dst_nocount = true; f6i->dst_host = true; f6i->fib6_protocol = RTPROT_KERNEL; -- cgit v1.2.3 From cc5f0eb2164f9aa11fe631f8d905192e0233e262 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 4 Oct 2018 20:07:52 -0700 Subject: net: Move free of fib_metrics to helper Move the refcounting and potential free of dst metrics associated with a fib entry to a helper and use it in both ipv4 and ipv6. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip.h | 6 ++++++ net/ipv4/fib_semantics.c | 6 ++---- net/ipv6/ip6_fib.c | 6 ++---- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index 8cbe7e8c9e1e..8fdd58ce580d 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -422,6 +422,12 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk, struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, int fc_mx_len); +static inline void ip_fib_metrics_put(struct dst_metrics *fib_metrics) +{ + if (fib_metrics != &dst_default_metrics && + refcount_dec_and_test(&fib_metrics->refcnt)) + kfree(fib_metrics); +} u32 ip_idents_reserve(u32 hash, int segs); void __ip_select_ident(struct net *net, struct iphdr *iph, int segs); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e8f4add597ed..f8c7ec8171a8 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -208,7 +208,6 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); - struct dst_metrics *m; change_nexthops(fi) { if (nexthop_nh->nh_dev) @@ -219,9 +218,8 @@ static void free_fib_info_rcu(struct rcu_head *head) rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); - m = fi->fib_metrics; - if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt)) - kfree(m); + ip_fib_metrics_put(fi->fib_metrics); + kfree(fi); } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index de063780a175..cf709eadc932 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -169,7 +170,6 @@ void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); struct rt6_exception_bucket *bucket; - struct dst_metrics *m; WARN_ON(f6i->fib6_node); @@ -201,9 +201,7 @@ void fib6_info_destroy_rcu(struct rcu_head *head) if (f6i->fib6_nh.nh_dev) dev_put(f6i->fib6_nh.nh_dev); - m = f6i->fib6_metrics; - if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt)) - kfree(m); + ip_fib_metrics_put(f6i->fib6_metrics); kfree(f6i); } -- cgit v1.2.3 From e1255ed4b6dafd9966c99cde5105891cc1ac70df Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 4 Oct 2018 20:07:53 -0700 Subject: net: common metrics init helper for dst_entry ipv4 and ipv6 both use refcounted metrics if FIB entries have metrics set. Move the common initialization code to a helper and use for both protocols. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip.h | 12 ++++++++++++ net/ipv4/route.c | 7 ++----- net/ipv6/route.c | 6 +----- 3 files changed, 15 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index 8fdd58ce580d..f9a7125b4bda 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -429,6 +429,18 @@ static inline void ip_fib_metrics_put(struct dst_metrics *fib_metrics) kfree(fib_metrics); } +/* ipv4 and ipv6 both use refcounted metrics if it is not the default */ +static inline +void ip_dst_init_metrics(struct dst_entry *dst, struct dst_metrics *fib_metrics) +{ + dst_init_metrics(dst, fib_metrics->metrics, true); + + if (fib_metrics != &dst_default_metrics) { + dst->_metrics |= DST_METRICS_REFCOUNTED; + refcount_inc(&fib_metrics->refcnt); + } +} + u32 ip_idents_reserve(u32 hash, int segs); void __ip_select_ident(struct net *net, struct iphdr *iph, int segs); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 048919713f4e..8ccbc8f2c2cc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1528,11 +1528,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, rt->rt_gateway = nh->nh_gw; rt->rt_uses_gateway = 1; } - dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true); - if (fi->fib_metrics != &dst_default_metrics) { - rt->dst._metrics |= DST_METRICS_REFCOUNTED; - refcount_inc(&fi->fib_metrics->refcnt); - } + ip_dst_init_metrics(&rt->dst, fi->fib_metrics); + #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b62b7aa53bbe..b91a9d3cf288 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -978,11 +978,7 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) { rt->rt6i_flags &= ~RTF_EXPIRES; rcu_assign_pointer(rt->from, from); - dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); - if (from->fib6_metrics != &dst_default_metrics) { - rt->dst._metrics |= DST_METRICS_REFCOUNTED; - refcount_inc(&from->fib6_metrics->refcnt); - } + ip_dst_init_metrics(&rt->dst, from->fib6_metrics); } /* Caller must already hold reference to @ort */ -- cgit v1.2.3 From 1620a33695d81611360d813a47ebde9386714036 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 4 Oct 2018 20:07:54 -0700 Subject: net: Move free of dst_metrics to helper Move the refcounting and potential free of dst metrics associated for ipv4 and ipv6 to a common helper. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip.h | 9 +++++++++ net/ipv4/route.c | 5 +---- net/ipv6/route.c | 5 +---- 3 files changed, 11 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index f9a7125b4bda..72593e171d14 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -441,6 +441,15 @@ void ip_dst_init_metrics(struct dst_entry *dst, struct dst_metrics *fib_metrics) } } +static inline +void ip_dst_metrics_put(struct dst_entry *dst) +{ + struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); + + if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) + kfree(p); +} + u32 ip_idents_reserve(u32 hash, int segs); void __ip_select_ident(struct net *net, struct iphdr *iph, int segs); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8ccbc8f2c2cc..f71d2395c428 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1476,12 +1476,9 @@ void rt_del_uncached_list(struct rtable *rt) static void ipv4_dst_destroy(struct dst_entry *dst) { - struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rtable *rt = (struct rtable *)dst; - if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) - kfree(p); - + ip_dst_metrics_put(dst); rt_del_uncached_list(rt); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b91a9d3cf288..6c1d817151ca 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -364,14 +364,11 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { - struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rt6_info *rt = (struct rt6_info *)dst; struct fib6_info *from; struct inet6_dev *idev; - if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) - kfree(p); - + ip_dst_metrics_put(dst); rt6_uncached_list_del(rt); idev = rt->rt6i_idev; -- cgit v1.2.3 From 9d2f67e43b73e8af7438be219b66a5de0cfa8bd9 Mon Sep 17 00:00:00 2001 From: Jianfeng Tan Date: Sat, 29 Sep 2018 15:41:27 +0000 Subject: net/packet: fix packet drop as of virtio gso When we use raw socket as the vhost backend, a packet from virito with gso offloading information, cannot be sent out in later validaton at xmit path, as we did not set correct skb->protocol which is further used for looking up the gso function. To fix this, we set this field according to virito hdr information. Fixes: e858fae2b0b8f4 ("virtio_net: use common code for virtio_net_hdr and skb GSO conversion") Signed-off-by: Jianfeng Tan Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 18 ++++++++++++++++++ net/packet/af_packet.c | 11 +++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 9397628a1967..cb462f9ab7dd 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -5,6 +5,24 @@ #include #include +static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, + const struct virtio_net_hdr *hdr) +{ + switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + case VIRTIO_NET_HDR_GSO_UDP: + skb->protocol = cpu_to_be16(ETH_P_IP); + break; + case VIRTIO_NET_HDR_GSO_TCPV6: + skb->protocol = cpu_to_be16(ETH_P_IPV6); + break; + default: + return -EINVAL; + } + + return 0; +} + static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, const struct virtio_net_hdr *hdr, bool little_endian) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 75c92a87e7b2..d6e94dc7e290 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2715,10 +2715,12 @@ tpacket_error: } } - if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr, - vio_le())) { - tp_len = -EINVAL; - goto tpacket_error; + if (po->has_vnet_hdr) { + if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { + tp_len = -EINVAL; + goto tpacket_error; + } + virtio_net_hdr_set_proto(skb, vnet_hdr); } skb->destructor = tpacket_destruct_skb; @@ -2915,6 +2917,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (err) goto out_free; len += sizeof(vnet_hdr); + virtio_net_hdr_set_proto(skb, &vnet_hdr); } skb_probe_transport_header(skb, reserve); -- cgit v1.2.3 From 6f52f80e85309738121f2db51a3cac91b8195743 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 3 Oct 2018 15:33:12 -0700 Subject: net/neigh: Extend dump filter to proxy neighbor dumps Move the attribute parsing from neigh_dump_table to neigh_dump_info, and pass the filter arguments down to neigh_dump_table in a new struct. Add the filter option to proxy neigh dumps as well to make them consistent. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 72 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 20e0d3308148..fb023df48b83 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2329,35 +2329,24 @@ static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) return false; } +struct neigh_dump_filter { + int master_idx; + int dev_idx; +}; + static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, - struct netlink_callback *cb) + struct netlink_callback *cb, + struct neigh_dump_filter *filter) { struct net *net = sock_net(skb->sk); - const struct nlmsghdr *nlh = cb->nlh; - struct nlattr *tb[NDA_MAX + 1]; struct neighbour *n; int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; - int filter_master_idx = 0, filter_idx = 0; unsigned int flags = NLM_F_MULTI; - int err; - err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, NULL); - if (!err) { - if (tb[NDA_IFINDEX]) { - if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) - return -EINVAL; - filter_idx = nla_get_u32(tb[NDA_IFINDEX]); - } - if (tb[NDA_MASTER]) { - if (nla_len(tb[NDA_MASTER]) != sizeof(u32)) - return -EINVAL; - filter_master_idx = nla_get_u32(tb[NDA_MASTER]); - } - if (filter_idx || filter_master_idx) - flags |= NLM_F_DUMP_FILTERED; - } + if (filter->dev_idx || filter->master_idx) + flags |= NLM_F_DUMP_FILTERED; rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); @@ -2370,8 +2359,8 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, n = rcu_dereference_bh(n->next)) { if (idx < s_idx || !net_eq(dev_net(n->dev), net)) goto next; - if (neigh_ifindex_filtered(n->dev, filter_idx) || - neigh_master_filtered(n->dev, filter_master_idx)) + if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || + neigh_master_filtered(n->dev, filter->master_idx)) goto next; if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -2393,12 +2382,17 @@ out: } static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, - struct netlink_callback *cb) + struct netlink_callback *cb, + struct neigh_dump_filter *filter) { struct pneigh_entry *n; struct net *net = sock_net(skb->sk); int rc, h, s_h = cb->args[3]; int idx, s_idx = idx = cb->args[4]; + unsigned int flags = NLM_F_MULTI; + + if (filter->dev_idx || filter->master_idx) + flags |= NLM_F_DUMP_FILTERED; read_lock_bh(&tbl->lock); @@ -2408,10 +2402,12 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { if (idx < s_idx || pneigh_net(n) != net) goto next; + if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || + neigh_master_filtered(n->dev, filter->master_idx)) + goto next; if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI, tbl) < 0) { + RTM_NEWNEIGH, flags, tbl) < 0) { read_unlock_bh(&tbl->lock); rc = -1; goto out; @@ -2432,20 +2428,36 @@ out: static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; + struct neigh_dump_filter filter = {}; + struct nlattr *tb[NDA_MAX + 1]; struct neigh_table *tbl; int t, family, s_t; int proxy = 0; int err; - family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; + family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; /* check for full ndmsg structure presence, family member is * the same for both structures */ - if (nlmsg_len(cb->nlh) >= sizeof(struct ndmsg) && - ((struct ndmsg *) nlmsg_data(cb->nlh))->ndm_flags == NTF_PROXY) + if (nlmsg_len(nlh) >= sizeof(struct ndmsg) && + ((struct ndmsg *)nlmsg_data(nlh))->ndm_flags == NTF_PROXY) proxy = 1; + err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, NULL); + if (!err) { + if (tb[NDA_IFINDEX]) { + if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) + return -EINVAL; + filter.dev_idx = nla_get_u32(tb[NDA_IFINDEX]); + } + if (tb[NDA_MASTER]) { + if (nla_len(tb[NDA_MASTER]) != sizeof(u32)) + return -EINVAL; + filter.master_idx = nla_get_u32(tb[NDA_MASTER]); + } + } s_t = cb->args[0]; for (t = 0; t < NEIGH_NR_TABLES; t++) { @@ -2459,9 +2471,9 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (proxy) - err = pneigh_dump_table(tbl, skb, cb); + err = pneigh_dump_table(tbl, skb, cb, &filter); else - err = neigh_dump_table(tbl, skb, cb); + err = neigh_dump_table(tbl, skb, cb, &filter); if (err < 0) break; } -- cgit v1.2.3 From 95278ddaa15cfa23e4a06ee9ed7b6ee0197c500b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 2 Oct 2018 12:50:19 -0700 Subject: net_sched: convert idrinfo->lock from spinlock to a mutex In commit ec3ed293e766 ("net_sched: change tcf_del_walker() to take idrinfo->lock") we move fl_hw_destroy_tmplt() to a workqueue to avoid blocking with the spinlock held. Unfortunately, this causes a lot of troubles here: 1. tcf_chain_destroy() could be called right after we queue the work but before the work runs. This is a use-after-free. 2. The chain refcnt is already 0, we can't even just hold it again. We can check refcnt==1 but it is ugly. 3. The chain with refcnt 0 is still visible in its block, which means it could be still found and used! 4. The block has a refcnt too, we can't hold it without introducing a proper API either. We can make it working but the end result is ugly. Instead of wasting time on reviewing it, let's just convert the troubling spinlock to a mutex, which allows us to use non-atomic allocations too. Fixes: ec3ed293e766 ("net_sched: change tcf_del_walker() to take idrinfo->lock") Reported-by: Ido Schimmel Cc: Jamal Hadi Salim Cc: Vlad Buslov Cc: Jiri Pirko Signed-off-by: Cong Wang Tested-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/act_api.h | 4 ++-- net/sched/act_api.c | 44 ++++++++++++++++++++++---------------------- net/sched/cls_flower.c | 13 ++----------- 3 files changed, 26 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 1ddff3360592..05c7df41d737 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -13,7 +13,7 @@ #include struct tcf_idrinfo { - spinlock_t lock; + struct mutex lock; struct idr action_idr; }; @@ -117,7 +117,7 @@ int tc_action_net_init(struct tc_action_net *tn, if (!tn->idrinfo) return -ENOMEM; tn->ops = ops; - spin_lock_init(&tn->idrinfo->lock); + mutex_init(&tn->idrinfo->lock); idr_init(&tn->idrinfo->action_idr); return err; } diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 3c7c23421885..55153da00278 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -104,11 +104,11 @@ static int __tcf_action_put(struct tc_action *p, bool bind) { struct tcf_idrinfo *idrinfo = p->idrinfo; - if (refcount_dec_and_lock(&p->tcfa_refcnt, &idrinfo->lock)) { + if (refcount_dec_and_mutex_lock(&p->tcfa_refcnt, &idrinfo->lock)) { if (bind) atomic_dec(&p->tcfa_bindcnt); idr_remove(&idrinfo->action_idr, p->tcfa_index); - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); tcf_action_cleanup(p); return 1; @@ -200,7 +200,7 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct tc_action *p; unsigned long id = 1; - spin_lock(&idrinfo->lock); + mutex_lock(&idrinfo->lock); s_i = cb->args[0]; @@ -235,7 +235,7 @@ done: if (index >= 0) cb->args[0] = index + 1; - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); if (n_i) { if (act_flags & TCA_FLAG_LARGE_DUMP_ON) cb->args[1] = n_i; @@ -277,18 +277,18 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, if (nla_put_string(skb, TCA_KIND, ops->kind)) goto nla_put_failure; - spin_lock(&idrinfo->lock); + mutex_lock(&idrinfo->lock); idr_for_each_entry_ul(idr, p, id) { ret = tcf_idr_release_unsafe(p); if (ret == ACT_P_DELETED) { module_put(ops->owner); n_i++; } else if (ret < 0) { - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); goto nla_put_failure; } } - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); if (nla_put_u32(skb, TCA_FCNT, n_i)) goto nla_put_failure; @@ -324,13 +324,13 @@ int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index) struct tcf_idrinfo *idrinfo = tn->idrinfo; struct tc_action *p; - spin_lock(&idrinfo->lock); + mutex_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); if (IS_ERR(p)) p = NULL; else if (p) refcount_inc(&p->tcfa_refcnt); - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); if (p) { *a = p; @@ -345,10 +345,10 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index) struct tc_action *p; int ret = 0; - spin_lock(&idrinfo->lock); + mutex_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); if (!p) { - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); return -ENOENT; } @@ -358,7 +358,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index) WARN_ON(p != idr_remove(&idrinfo->action_idr, p->tcfa_index)); - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); tcf_action_cleanup(p); module_put(owner); @@ -369,7 +369,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index) ret = -EPERM; } - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); return ret; } @@ -431,10 +431,10 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a) { struct tcf_idrinfo *idrinfo = tn->idrinfo; - spin_lock(&idrinfo->lock); + mutex_lock(&idrinfo->lock); /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index))); - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); } EXPORT_SYMBOL(tcf_idr_insert); @@ -444,10 +444,10 @@ void tcf_idr_cleanup(struct tc_action_net *tn, u32 index) { struct tcf_idrinfo *idrinfo = tn->idrinfo; - spin_lock(&idrinfo->lock); + mutex_lock(&idrinfo->lock); /* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index))); - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); } EXPORT_SYMBOL(tcf_idr_cleanup); @@ -465,14 +465,14 @@ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, int ret; again: - spin_lock(&idrinfo->lock); + mutex_lock(&idrinfo->lock); if (*index) { p = idr_find(&idrinfo->action_idr, *index); if (IS_ERR(p)) { /* This means that another process allocated * index but did not assign the pointer yet. */ - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); goto again; } @@ -485,7 +485,7 @@ again: } else { *a = NULL; ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, - *index, GFP_ATOMIC); + *index, GFP_KERNEL); if (!ret) idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY), *index); @@ -494,12 +494,12 @@ again: *index = 1; *a = NULL; ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, - UINT_MAX, GFP_ATOMIC); + UINT_MAX, GFP_KERNEL); if (!ret) idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY), *index); } - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); return ret; } EXPORT_SYMBOL(tcf_idr_check_alloc); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 92dd5071a708..9aada2d0ef06 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -79,7 +79,6 @@ struct fl_flow_tmplt { struct fl_flow_key mask; struct flow_dissector dissector; struct tcf_chain *chain; - struct rcu_work rwork; }; struct cls_fl_head { @@ -1438,20 +1437,12 @@ errout_tb: return ERR_PTR(err); } -static void fl_tmplt_destroy_work(struct work_struct *work) -{ - struct fl_flow_tmplt *tmplt = container_of(to_rcu_work(work), - struct fl_flow_tmplt, rwork); - - fl_hw_destroy_tmplt(tmplt->chain, tmplt); - kfree(tmplt); -} - static void fl_tmplt_destroy(void *tmplt_priv) { struct fl_flow_tmplt *tmplt = tmplt_priv; - tcf_queue_work(&tmplt->rwork, fl_tmplt_destroy_work); + fl_hw_destroy_tmplt(tmplt->chain, tmplt); + kfree(tmplt); } static int fl_dump_key_val(struct sk_buff *skb, -- cgit v1.2.3 From 33aa8da1f8a7dc050b9d68f1db761ab787621065 Mon Sep 17 00:00:00 2001 From: Shanthosh RK Date: Fri, 5 Oct 2018 20:57:48 +0530 Subject: net: bpfilter: Fix type cast and pointer warnings Fixes the following Sparse warnings: net/bpfilter/bpfilter_kern.c:62:21: warning: cast removes address space of expression net/bpfilter/bpfilter_kern.c:101:49: warning: Using plain integer as NULL pointer Signed-off-by: Shanthosh RK Signed-off-by: David S. Miller --- net/bpfilter/bpfilter_kern.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index f0fc182d3db7..b64e1649993b 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -59,7 +59,7 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, req.is_set = is_set; req.pid = current->pid; req.cmd = optname; - req.addr = (long)optval; + req.addr = (long __force __user)optval; req.len = optlen; mutex_lock(&bpfilter_lock); if (!info.pid) @@ -98,7 +98,7 @@ static int __init load_umh(void) pr_info("Loaded bpfilter_umh pid %d\n", info.pid); /* health check that usermode process started correctly */ - if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) { + if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) { stop_umh(); return -EFAULT; } -- cgit v1.2.3 From f2e9de210d50187d206989e60bc5a99c2b692209 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 5 Oct 2018 11:31:40 -0400 Subject: udp: gro behind static key Avoid the socket lookup cost in udp_gro_receive if no socket has a udp tunnel callback configured. udp_sk(sk)->gro_receive requires a registration with setup_udp_tunnel_sock, which enables the static key. Signed-off-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/udp.h | 2 ++ net/ipv4/udp.c | 2 +- net/ipv4/udp_offload.c | 2 +- net/ipv6/udp.c | 2 +- net/ipv6/udp_offload.c | 2 +- 5 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/udp.h b/include/net/udp.h index 8482a990b0bb..9e82cb391dea 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -443,8 +443,10 @@ int udpv4_offload_init(void); void udp_init(void); +DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key); void udp_encap_enable(void); #if IS_ENABLED(CONFIG_IPV6) +DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void); #endif diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5fc4beb1c336..1bec2203d558 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1889,7 +1889,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return 0; } -static DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); +DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); void udp_encap_enable(void) { static_branch_enable(&udp_encap_needed_key); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 0c0522b79b43..802f2bc00d69 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -405,7 +405,7 @@ static struct sk_buff *udp4_gro_receive(struct list_head *head, { struct udphdr *uh = udp_gro_udphdr(skb); - if (unlikely(!uh)) + if (unlikely(!uh) || !static_branch_unlikely(&udp_encap_needed_key)) goto flush; /* Don't bother verifying checksum if we're going to flush anyway. */ diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 28c4aa5078fc..374e7d302f26 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -548,7 +548,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb, __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); } -static DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void) { static_branch_enable(&udpv6_encap_needed_key); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 95dee9ca8d22..1b8e161ac527 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -119,7 +119,7 @@ static struct sk_buff *udp6_gro_receive(struct list_head *head, { struct udphdr *uh = udp_gro_udphdr(skb); - if (unlikely(!uh)) + if (unlikely(!uh) || !static_branch_unlikely(&udpv6_encap_needed_key)) goto flush; /* Don't bother verifying checksum if we're going to flush anyway. */ -- cgit v1.2.3 From fda21d46cce2879dbce981978f8cb07b36035369 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Oct 2018 09:17:50 -0700 Subject: ipv6: do not leave garbage in rt->fib6_metrics In case ip_fib_metrics_init() returns an error, we better rewrite rt->fib6_metrics with &dst_default_metrics so that we do not crash later in ip_fib_metrics_put() Fixes: 767a2217533f ("net: common metrics init helper for FIB entries") Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6c1d817151ca..74d97addf1af 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2976,6 +2976,8 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len); if (IS_ERR(rt->fib6_metrics)) { err = PTR_ERR(rt->fib6_metrics); + /* Do not leave garbage there. */ + rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; goto out; } -- cgit v1.2.3 From bd961c9bc66497f0c63f4ba1d02900bb85078366 Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Mon, 1 Oct 2018 22:46:40 -0300 Subject: rtnetlink: fix rtnl_fdb_dump() for ndmsg header Currently, rtnl_fdb_dump() assumes the family header is 'struct ifinfomsg', which is not always true -- 'struct ndmsg' is used by iproute2 ('ip neigh'). The problem is, the function bails out early if nlmsg_parse() fails, which does occur for iproute2 usage of 'struct ndmsg' because the payload length is shorter than the family header alone (as 'struct ifinfomsg' is assumed). This breaks backward compatibility with userspace -- nothing is sent back. Some examples with iproute2 and netlink library for go [1]: 1) $ bridge fdb show 33:33:00:00:00:01 dev ens3 self permanent 01:00:5e:00:00:01 dev ens3 self permanent 33:33:ff:15:98:30 dev ens3 self permanent This one works, as it uses 'struct ifinfomsg'. fdb_show() @ iproute2/bridge/fdb.c """ .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), ... if (rtnl_dump_request(&rth, RTM_GETNEIGH, [...] """ 2) $ ip --family bridge neigh RTNETLINK answers: Invalid argument Dump terminated This one fails, as it uses 'struct ndmsg'. do_show_or_flush() @ iproute2/ip/ipneigh.c """ .n.nlmsg_type = RTM_GETNEIGH, .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)), """ 3) $ ./neighlist < no output > This one fails, as it uses 'struct ndmsg'-based. neighList() @ netlink/neigh_linux.go """ req := h.newNetlinkRequest(unix.RTM_GETNEIGH, [...] msg := Ndmsg{ """ The actual breakage was introduced by commit 0ff50e83b512 ("net: rtnetlink: bail out from rtnl_fdb_dump() on parse error"), because nlmsg_parse() fails if the payload length (with the _actual_ family header) is less than the family header length alone (which is assumed, in parameter 'hdrlen'). This is true in the examples above with struct ndmsg, with size and payload length shorter than struct ifinfomsg. However, that commit just intends to fix something under the assumption the family header is indeed an 'struct ifinfomsg' - by preventing access to the payload as such (via 'ifm' pointer) if the payload length is not sufficient to actually contain it. The assumption was introduced by commit 5e6d24358799 ("bridge: netlink dump interface at par with brctl"), to support iproute2's 'bridge fdb' command (not 'ip neigh') which indeed uses 'struct ifinfomsg', thus is not broken. So, in order to unbreak the 'struct ndmsg' family headers and still allow 'struct ifinfomsg' to continue to work, check for the known message sizes used with 'struct ndmsg' in iproute2 (with zero or one attribute which is not used in this function anyway) then do not parse the data as ifinfomsg. Same examples with this patch applied (or revert/before the original fix): $ bridge fdb show 33:33:00:00:00:01 dev ens3 self permanent 01:00:5e:00:00:01 dev ens3 self permanent 33:33:ff:15:98:30 dev ens3 self permanent $ ip --family bridge neigh dev ens3 lladdr 33:33:00:00:00:01 PERMANENT dev ens3 lladdr 01:00:5e:00:00:01 PERMANENT dev ens3 lladdr 33:33:ff:15:98:30 PERMANENT $ ./neighlist netlink.Neigh{LinkIndex:2, Family:7, State:128, Type:0, Flags:2, IP:net.IP(nil), HardwareAddr:net.HardwareAddr{0x33, 0x33, 0x0, 0x0, 0x0, 0x1}, LLIPAddr:net.IP(nil), Vlan:0, VNI:0} netlink.Neigh{LinkIndex:2, Family:7, State:128, Type:0, Flags:2, IP:net.IP(nil), HardwareAddr:net.HardwareAddr{0x1, 0x0, 0x5e, 0x0, 0x0, 0x1}, LLIPAddr:net.IP(nil), Vlan:0, VNI:0} netlink.Neigh{LinkIndex:2, Family:7, State:128, Type:0, Flags:2, IP:net.IP(nil), HardwareAddr:net.HardwareAddr{0x33, 0x33, 0xff, 0x15, 0x98, 0x30}, LLIPAddr:net.IP(nil), Vlan:0, VNI:0} Tested on mainline (v4.19-rc6) and net-next (3bd09b05b068). References: [1] netlink library for go (test-case) https://github.com/vishvananda/netlink $ cat ~/go/src/neighlist/main.go package main import ("fmt"; "syscall"; "github.com/vishvananda/netlink") func main() { neighs, _ := netlink.NeighList(0, syscall.AF_BRIDGE) for _, neigh := range neighs { fmt.Printf("%#v\n", neigh) } } $ export GOPATH=~/go $ go get github.com/vishvananda/netlink $ go build neighlist $ ~/go/src/neighlist/neighlist Thanks to David Ahern for suggestions to improve this patch. Fixes: 0ff50e83b512 ("net: rtnetlink: bail out from rtnl_fdb_dump() on parse error") Fixes: 5e6d24358799 ("bridge: netlink dump interface at par with brctl") Reported-by: Aidan Obley Signed-off-by: Mauricio Faria de Oliveira Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 448703312fed..37c7936124e6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3748,16 +3748,27 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) int err = 0; int fidx = 0; - err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, - IFLA_MAX, ifla_policy, NULL); - if (err < 0) { - return -EINVAL; - } else if (err == 0) { - if (tb[IFLA_MASTER]) - br_idx = nla_get_u32(tb[IFLA_MASTER]); - } + /* A hack to preserve kernel<->userspace interface. + * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0. + * However, ndmsg is shorter than ifinfomsg thus nlmsg_parse() bails. + * So, check for ndmsg with an optional u32 attribute (not used here). + * Fortunately these sizes don't conflict with the size of ifinfomsg + * with an optional attribute. + */ + if (nlmsg_len(cb->nlh) != sizeof(struct ndmsg) && + (nlmsg_len(cb->nlh) != sizeof(struct ndmsg) + + nla_attr_size(sizeof(u32)))) { + err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, NULL); + if (err < 0) { + return -EINVAL; + } else if (err == 0) { + if (tb[IFLA_MASTER]) + br_idx = nla_get_u32(tb[IFLA_MASTER]); + } - brport_idx = ifm->ifi_index; + brport_idx = ifm->ifi_index; + } if (br_idx) { br_dev = __dev_get_by_index(net, br_idx); -- cgit v1.2.3 From 8b4c3cdd9dd8290343ce959a132d3b334062c5b9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 3 Oct 2018 15:05:36 -0700 Subject: net: sched: Add policy validation for tc attributes A number of TC attributes are processed without proper validation (e.g., length checks). Add a tca policy for all input attributes and use when invoking nlmsg_parse. The 2 Fixes tags below cover the latest additions. The other attributes are a string (KIND), nested attribute (OPTIONS which does seem to have validation in most cases), for dumps only or a flag. Fixes: 5bc1701881e39 ("net: sched: introduce multichain support for filters") Fixes: d47a6b0e7c492 ("net: sched: introduce ingress/egress block index attributes for qdisc") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/sched/sch_api.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 98541c6399db..85e73f48e48f 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1311,6 +1311,18 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) * Delete/get qdisc. */ +const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { + [TCA_KIND] = { .type = NLA_STRING }, + [TCA_OPTIONS] = { .type = NLA_NESTED }, + [TCA_RATE] = { .type = NLA_BINARY, + .len = sizeof(struct tc_estimator) }, + [TCA_STAB] = { .type = NLA_NESTED }, + [TCA_DUMP_INVISIBLE] = { .type = NLA_FLAG }, + [TCA_CHAIN] = { .type = NLA_U32 }, + [TCA_INGRESS_BLOCK] = { .type = NLA_U32 }, + [TCA_EGRESS_BLOCK] = { .type = NLA_U32 }, +}; + static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { @@ -1327,7 +1339,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, + extack); if (err < 0) return err; @@ -1411,7 +1424,8 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, replay: /* Reinit, just in case something touches this. */ - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, + extack); if (err < 0) return err; @@ -1645,7 +1659,8 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) idx = 0; ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL); + err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, + rtm_tca_policy, NULL); if (err < 0) return err; @@ -1864,7 +1879,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, + extack); if (err < 0) return err; -- cgit v1.2.3 From 068b88cc175d594393f501a137c7a4af9f356c39 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 4 Oct 2018 11:09:40 +0200 Subject: socket: Tighten no-error check in bind() move_addr_to_kernel() returns only negative values on error, or zero on success. Rewrite the error check to an idiomatic form to avoid confusing the reader. Signed-off-by: Jakub Sitnicki Signed-off-by: David S. Miller --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 01f3f8f32d6f..713dc4833d40 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1475,7 +1475,7 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { err = move_addr_to_kernel(umyaddr, addrlen, &address); - if (err >= 0) { + if (!err) { err = security_socket_bind(sock, (struct sockaddr *)&address, addrlen); -- cgit v1.2.3 From a688caa34beb2fd2a92f1b6d33e40cde433ba160 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 4 Oct 2018 10:12:37 -0700 Subject: ipv6: take rcu lock in rawv6_send_hdrinc() In rawv6_send_hdrinc(), in order to avoid an extra dst_hold(), we directly assign the dst to skb and set passed in dst to NULL to avoid double free. However, in error case, we free skb and then do stats update with the dst pointer passed in. This causes use-after-free on the dst. Fix it by taking rcu read lock right before dst could get released to make sure dst does not get freed until the stats update is done. Note: we don't have this issue in ipv4 cause dst is not used for stats update in v4. Syzkaller reported following crash: BUG: KASAN: use-after-free in rawv6_send_hdrinc net/ipv6/raw.c:692 [inline] BUG: KASAN: use-after-free in rawv6_sendmsg+0x4421/0x4630 net/ipv6/raw.c:921 Read of size 8 at addr ffff8801d95ba730 by task syz-executor0/32088 CPU: 1 PID: 32088 Comm: syz-executor0 Not tainted 4.19.0-rc2+ #93 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c4/0x2b4 lib/dump_stack.c:113 print_address_description.cold.8+0x9/0x1ff mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.9+0x242/0x309 mm/kasan/report.c:412 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433 rawv6_send_hdrinc net/ipv6/raw.c:692 [inline] rawv6_sendmsg+0x4421/0x4630 net/ipv6/raw.c:921 inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg+0xd5/0x120 net/socket.c:631 ___sys_sendmsg+0x7fd/0x930 net/socket.c:2114 __sys_sendmsg+0x11d/0x280 net/socket.c:2152 __do_sys_sendmsg net/socket.c:2161 [inline] __se_sys_sendmsg net/socket.c:2159 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2159 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457099 Code: fd b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f83756edc78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f83756ee6d4 RCX: 0000000000457099 RDX: 0000000000000000 RSI: 0000000020003840 RDI: 0000000000000004 RBP: 00000000009300a0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 00000000004d4b30 R14: 00000000004c90b1 R15: 0000000000000000 Allocated by task 32088: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] kasan_kmalloc+0xc7/0xe0 mm/kasan/kasan.c:553 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:490 kmem_cache_alloc+0x12e/0x730 mm/slab.c:3554 dst_alloc+0xbb/0x1d0 net/core/dst.c:105 ip6_dst_alloc+0x35/0xa0 net/ipv6/route.c:353 ip6_rt_cache_alloc+0x247/0x7b0 net/ipv6/route.c:1186 ip6_pol_route+0x8f8/0xd90 net/ipv6/route.c:1895 ip6_pol_route_output+0x54/0x70 net/ipv6/route.c:2093 fib6_rule_lookup+0x277/0x860 net/ipv6/fib6_rules.c:122 ip6_route_output_flags+0x2c5/0x350 net/ipv6/route.c:2121 ip6_route_output include/net/ip6_route.h:88 [inline] ip6_dst_lookup_tail+0xe27/0x1d60 net/ipv6/ip6_output.c:951 ip6_dst_lookup_flow+0xc8/0x270 net/ipv6/ip6_output.c:1079 rawv6_sendmsg+0x12d9/0x4630 net/ipv6/raw.c:905 inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg+0xd5/0x120 net/socket.c:631 ___sys_sendmsg+0x7fd/0x930 net/socket.c:2114 __sys_sendmsg+0x11d/0x280 net/socket.c:2152 __do_sys_sendmsg net/socket.c:2161 [inline] __se_sys_sendmsg net/socket.c:2159 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2159 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 5356: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/kasan.c:521 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528 __cache_free mm/slab.c:3498 [inline] kmem_cache_free+0x83/0x290 mm/slab.c:3756 dst_destroy+0x267/0x3c0 net/core/dst.c:141 dst_destroy_rcu+0x16/0x19 net/core/dst.c:154 __rcu_reclaim kernel/rcu/rcu.h:236 [inline] rcu_do_batch kernel/rcu/tree.c:2576 [inline] invoke_rcu_callbacks kernel/rcu/tree.c:2880 [inline] __rcu_process_callbacks kernel/rcu/tree.c:2847 [inline] rcu_process_callbacks+0xf23/0x2670 kernel/rcu/tree.c:2864 __do_softirq+0x30b/0xad8 kernel/softirq.c:292 Fixes: 1789a640f556 ("raw: avoid two atomics in xmit") Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/raw.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 413d98bf24f4..5e0efd3954e9 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -651,8 +651,6 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; skb->tstamp = sockc->transmit_time; - skb_dst_set(skb, &rt->dst); - *dstp = NULL; skb_put(skb, length); skb_reset_network_header(skb); @@ -665,8 +663,14 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->transport_header = skb->network_header; err = memcpy_from_msg(iph, msg, length); - if (err) - goto error_fault; + if (err) { + err = -EFAULT; + kfree_skb(skb); + goto error; + } + + skb_dst_set(skb, &rt->dst); + *dstp = NULL; /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -675,21 +679,28 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, if (unlikely(!skb)) return 0; + /* Acquire rcu_read_lock() in case we need to use rt->rt6i_idev + * in the error path. Since skb has been freed, the dst could + * have been queued for deletion. + */ + rcu_read_lock(); IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); - if (err) - goto error; + if (err) { + IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); + goto error_check; + } + rcu_read_unlock(); out: return 0; -error_fault: - err = -EFAULT; - kfree_skb(skb); error: IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); +error_check: if (err == -ENOBUFS && !np->recverr) err = 0; return err; -- cgit v1.2.3 From fb4ee67529ff3e4c5874768477887c2df5714c96 Mon Sep 17 00:00:00 2001 From: Vijay Khemka Date: Fri, 5 Oct 2018 10:46:01 -0700 Subject: net/ncsi: Add NCSI OEM command support This patch adds OEM commands and response handling. It also defines OEM command and response structure as per NCSI specification along with its handlers. ncsi_cmd_handler_oem: This is a generic command request handler for OEM commands ncsi_rsp_handler_oem: This is a generic response handler for OEM commands Signed-off-by: Vijay Khemka Reviewed-by: Justin Lee Reviewed-by: Samuel Mendoza-Jonas Signed-off-by: David S. Miller --- net/ncsi/internal.h | 5 +++++ net/ncsi/ncsi-cmd.c | 30 +++++++++++++++++++++++++++--- net/ncsi/ncsi-pkt.h | 14 ++++++++++++++ net/ncsi/ncsi-rsp.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 88 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 8055e3965cef..3d0a33b874f5 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -68,6 +68,10 @@ enum { NCSI_MODE_MAX }; +/* OEM Vendor Manufacture ID */ +#define NCSI_OEM_MFR_MLX_ID 0x8119 +#define NCSI_OEM_MFR_BCM_ID 0x113d + struct ncsi_channel_version { u32 version; /* Supported BCD encoded NCSI version */ u32 alpha2; /* Supported BCD encoded NCSI version */ @@ -305,6 +309,7 @@ struct ncsi_cmd_arg { unsigned short words[8]; unsigned int dwords[4]; }; + unsigned char *data; /* NCSI OEM data */ }; extern struct list_head ncsi_dev_list; diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c index 7567ca63aae2..82b7d9201db8 100644 --- a/net/ncsi/ncsi-cmd.c +++ b/net/ncsi/ncsi-cmd.c @@ -211,6 +211,25 @@ static int ncsi_cmd_handler_snfc(struct sk_buff *skb, return 0; } +static int ncsi_cmd_handler_oem(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_oem_pkt *cmd; + unsigned int len; + + len = sizeof(struct ncsi_cmd_pkt_hdr) + 4; + if (nca->payload < 26) + len += 26; + else + len += nca->payload; + + cmd = skb_put_zero(skb, len); + memcpy(&cmd->mfr_id, nca->data, nca->payload); + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + static struct ncsi_cmd_handler { unsigned char type; int payload; @@ -244,7 +263,7 @@ static struct ncsi_cmd_handler { { NCSI_PKT_CMD_GNS, 0, ncsi_cmd_handler_default }, { NCSI_PKT_CMD_GNPTS, 0, ncsi_cmd_handler_default }, { NCSI_PKT_CMD_GPS, 0, ncsi_cmd_handler_default }, - { NCSI_PKT_CMD_OEM, 0, NULL }, + { NCSI_PKT_CMD_OEM, -1, ncsi_cmd_handler_oem }, { NCSI_PKT_CMD_PLDM, 0, NULL }, { NCSI_PKT_CMD_GPUUID, 0, ncsi_cmd_handler_default } }; @@ -316,8 +335,13 @@ int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca) return -ENOENT; } - /* Get packet payload length and allocate the request */ - nca->payload = nch->payload; + /* Get packet payload length and allocate the request + * It is expected that if length set as negative in + * handler structure means caller is initializing it + * and setting length in nca before calling xmit function + */ + if (nch->payload >= 0) + nca->payload = nch->payload; nr = ncsi_alloc_command(nca); if (!nr) return -ENOMEM; diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index 91b4b66438df..0f2087c8d42a 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -151,6 +151,20 @@ struct ncsi_cmd_snfc_pkt { unsigned char pad[22]; }; +/* OEM Request Command as per NCSI Specification */ +struct ncsi_cmd_oem_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + __be32 mfr_id; /* Manufacture ID */ + unsigned char data[]; /* OEM Payload Data */ +}; + +/* OEM Response Packet as per NCSI Specification */ +struct ncsi_rsp_oem_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Command header */ + __be32 mfr_id; /* Manufacture ID */ + unsigned char data[]; /* Payload data */ +}; + /* Get Link Status */ struct ncsi_rsp_gls_pkt { struct ncsi_rsp_pkt_hdr rsp; /* Response header */ diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 930c1d3796f0..d66b34749027 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -596,6 +596,47 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr) return 0; } +static struct ncsi_rsp_oem_handler { + unsigned int mfr_id; + int (*handler)(struct ncsi_request *nr); +} ncsi_rsp_oem_handlers[] = { + { NCSI_OEM_MFR_MLX_ID, NULL }, + { NCSI_OEM_MFR_BCM_ID, NULL } +}; + +/* Response handler for OEM command */ +static int ncsi_rsp_handler_oem(struct ncsi_request *nr) +{ + struct ncsi_rsp_oem_pkt *rsp; + struct ncsi_rsp_oem_handler *nrh = NULL; + unsigned int mfr_id, i; + + /* Get the response header */ + rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); + mfr_id = ntohl(rsp->mfr_id); + + /* Check for manufacturer id and Find the handler */ + for (i = 0; i < ARRAY_SIZE(ncsi_rsp_oem_handlers); i++) { + if (ncsi_rsp_oem_handlers[i].mfr_id == mfr_id) { + if (ncsi_rsp_oem_handlers[i].handler) + nrh = &ncsi_rsp_oem_handlers[i]; + else + nrh = NULL; + + break; + } + } + + if (!nrh) { + netdev_err(nr->ndp->ndev.dev, "Received unrecognized OEM packet with MFR-ID (0x%x)\n", + mfr_id); + return -ENOENT; + } + + /* Process the packet */ + return nrh->handler(nr); +} + static int ncsi_rsp_handler_gvi(struct ncsi_request *nr) { struct ncsi_rsp_gvi_pkt *rsp; @@ -932,7 +973,7 @@ static struct ncsi_rsp_handler { { NCSI_PKT_RSP_GNS, 172, ncsi_rsp_handler_gns }, { NCSI_PKT_RSP_GNPTS, 172, ncsi_rsp_handler_gnpts }, { NCSI_PKT_RSP_GPS, 8, ncsi_rsp_handler_gps }, - { NCSI_PKT_RSP_OEM, 0, NULL }, + { NCSI_PKT_RSP_OEM, -1, ncsi_rsp_handler_oem }, { NCSI_PKT_RSP_PLDM, 0, NULL }, { NCSI_PKT_RSP_GPUUID, 20, ncsi_rsp_handler_gpuuid } }; -- cgit v1.2.3 From ac4a02c5ab86971b8c2f82b986d37df8e4c4a436 Mon Sep 17 00:00:00 2001 From: Leslie Monis Date: Sun, 7 Oct 2018 01:22:45 +0530 Subject: net: sched: pie: fix coding style issues Fix 5 warnings and 14 checks issued by checkpatch.pl: CHECK: Logical continuations should be on the previous line + if ((q->vars.qdelay < q->params.target / 2) + && (q->vars.prob < MAX_PROB / 5)) WARNING: line over 80 characters + q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); CHECK: Blank lines aren't necessary after an open brace '{' +{ + CHECK: braces {} should be used on all arms of this statement + if (qlen < QUEUE_THRESHOLD) [...] + else { [...] CHECK: Unbalanced braces around else statement + else { CHECK: No space is necessary after a cast + if (delta > (s32) (MAX_PROB / (100 / 2)) && CHECK: Unnecessary parentheses around 'qdelay == 0' + if ((qdelay == 0) && (qdelay_old == 0) && update_prob) CHECK: Unnecessary parentheses around 'qdelay_old == 0' + if ((qdelay == 0) && (qdelay_old == 0) && update_prob) CHECK: Unnecessary parentheses around 'q->vars.prob == 0' + if ((q->vars.qdelay < q->params.target / 2) && + (q->vars.qdelay_old < q->params.target / 2) && + (q->vars.prob == 0) && + (q->vars.avg_dq_rate > 0)) CHECK: Unnecessary parentheses around 'q->vars.avg_dq_rate > 0' + if ((q->vars.qdelay < q->params.target / 2) && + (q->vars.qdelay_old < q->params.target / 2) && + (q->vars.prob == 0) && + (q->vars.avg_dq_rate > 0)) CHECK: Blank lines aren't necessary before a close brace '}' + +} CHECK: Comparison to NULL could be written "!opts" + if (opts == NULL) CHECK: No space is necessary after a cast + ((u32) PSCHED_TICKS2NS(q->params.target)) / WARNING: line over 80 characters + nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) || CHECK: Blank lines aren't necessary before a close brace '}' + +} CHECK: No space is necessary after a cast + .delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) / WARNING: Missing a blank line after declarations + struct sk_buff *skb; + skb = qdisc_dequeue_head(sch); WARNING: Missing a blank line after declarations + struct pie_sched_data *q = qdisc_priv(sch); + qdisc_reset_queue(sch); WARNING: Missing a blank line after declarations + struct pie_sched_data *q = qdisc_priv(sch); + q->params.tupdate = 0; Signed-off-by: Leslie Monis Signed-off-by: David S. Miller --- net/sched/sch_pie.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 18d30bb86881..d1429371592f 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -110,8 +110,8 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size) /* If current delay is less than half of target, and * if drop prob is low already, disable early_drop */ - if ((q->vars.qdelay < q->params.target / 2) - && (q->vars.prob < MAX_PROB / 5)) + if ((q->vars.qdelay < q->params.target / 2) && + (q->vars.prob < MAX_PROB / 5)) return false; /* If we have fewer than 2 mtu-sized packets, disable drop_early, @@ -209,7 +209,8 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, /* tupdate is in jiffies */ if (tb[TCA_PIE_TUPDATE]) - q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); + q->params.tupdate = + usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); if (tb[TCA_PIE_LIMIT]) { u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]); @@ -247,7 +248,6 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) { - struct pie_sched_data *q = qdisc_priv(sch); int qlen = sch->qstats.backlog; /* current queue size in bytes */ @@ -294,9 +294,9 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) * dq_count to 0 to re-enter the if block when the next * packet is dequeued */ - if (qlen < QUEUE_THRESHOLD) + if (qlen < QUEUE_THRESHOLD) { q->vars.dq_count = DQCOUNT_INVALID; - else { + } else { q->vars.dq_count = 0; q->vars.dq_tstamp = psched_get_time(); } @@ -370,7 +370,7 @@ static void calculate_probability(struct Qdisc *sch) oldprob = q->vars.prob; /* to ensure we increase probability in steps of no more than 2% */ - if (delta > (s32) (MAX_PROB / (100 / 2)) && + if (delta > (s32)(MAX_PROB / (100 / 2)) && q->vars.prob >= MAX_PROB / 10) delta = (MAX_PROB / 100) * 2; @@ -405,7 +405,7 @@ static void calculate_probability(struct Qdisc *sch) * delay is 0 for 2 consecutive Tupdate periods. */ - if ((qdelay == 0) && (qdelay_old == 0) && update_prob) + if (qdelay == 0 && qdelay_old == 0 && update_prob) q->vars.prob = (q->vars.prob * 98) / 100; q->vars.qdelay = qdelay; @@ -419,8 +419,8 @@ static void calculate_probability(struct Qdisc *sch) */ if ((q->vars.qdelay < q->params.target / 2) && (q->vars.qdelay_old < q->params.target / 2) && - (q->vars.prob == 0) && - (q->vars.avg_dq_rate > 0)) + q->vars.prob == 0 && + q->vars.avg_dq_rate > 0) pie_vars_init(&q->vars); } @@ -437,7 +437,6 @@ static void pie_timer(struct timer_list *t) if (q->params.tupdate) mod_timer(&q->adapt_timer, jiffies + q->params.tupdate); spin_unlock(root_lock); - } static int pie_init(struct Qdisc *sch, struct nlattr *opt, @@ -469,15 +468,16 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) struct nlattr *opts; opts = nla_nest_start(skb, TCA_OPTIONS); - if (opts == NULL) + if (!opts) goto nla_put_failure; /* convert target from pschedtime to us */ if (nla_put_u32(skb, TCA_PIE_TARGET, - ((u32) PSCHED_TICKS2NS(q->params.target)) / + ((u32)PSCHED_TICKS2NS(q->params.target)) / NSEC_PER_USEC) || nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) || - nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) || + nla_put_u32(skb, TCA_PIE_TUPDATE, + jiffies_to_usecs(q->params.tupdate)) || nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || @@ -489,7 +489,6 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_failure: nla_nest_cancel(skb, opts); return -1; - } static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) @@ -497,7 +496,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) struct pie_sched_data *q = qdisc_priv(sch); struct tc_pie_xstats st = { .prob = q->vars.prob, - .delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) / + .delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) / NSEC_PER_USEC, /* unscale and return dq_rate in bytes per sec */ .avg_dq_rate = q->vars.avg_dq_rate * @@ -514,8 +513,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) { - struct sk_buff *skb; - skb = qdisc_dequeue_head(sch); + struct sk_buff *skb = qdisc_dequeue_head(sch); if (!skb) return NULL; @@ -527,6 +525,7 @@ static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) static void pie_reset(struct Qdisc *sch) { struct pie_sched_data *q = qdisc_priv(sch); + qdisc_reset_queue(sch); pie_vars_init(&q->vars); } @@ -534,6 +533,7 @@ static void pie_reset(struct Qdisc *sch) static void pie_destroy(struct Qdisc *sch) { struct pie_sched_data *q = qdisc_priv(sch); + q->params.tupdate = 0; del_timer_sync(&q->adapt_timer); } -- cgit v1.2.3