From bf1ecd210541ef5f3a110e88e8ca5d33b4aa5c23 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <jouni@qca.qualcomm.com>
Date: Tue, 31 May 2016 00:16:50 +0300
Subject: cfg80211: Allow cfg80211_connect_result() errors to be distinguished

Previously, the status parameter to cfg80211_connect_result() was
documented as using WLAN_STATUS_UNSPECIFIED_FAILURE (1) when the real
status code for the failure is not known. This value can be used by an
AP (and often is) and as such, user space cannot distinguish between
explicitly rejected authentication/association and not being able to
even try to associate or not receiving a response from the AP.

Add a new inline function, cfg80211_connect_timeout(), to be used when
the driver knows that the connection attempt failed due to a reason
where connection could not be attempt or no response was received from
the AP. The internal functions now allow a negative status value (-1) to
be used as an indication of this special case. This results in the
NL80211_ATTR_TIMED_OUT to be added to the NL80211_CMD_CONNECT event to
allow user space to determine this case was hit. For backwards
compatibility, NL80211_STATUS_CODE with the value
WLAN_STATUS_UNSPECIFIED_FAILURE is still indicated in the event in such
a case.

Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
[johannes: fix cfg80211_connect_bss() prototype to use int for status,
 add cfg80211_connect_timeout() to docbook, fix docbook]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 53 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 13 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 63921672bed0..537f010cf5e1 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2367,19 +2367,23 @@ struct cfg80211_qos_map {
  *	(invoked with the wireless_dev mutex held)
  *
  * @connect: Connect to the ESS with the specified parameters. When connected,
- *	call cfg80211_connect_result() with status code %WLAN_STATUS_SUCCESS.
- *	If the connection fails for some reason, call cfg80211_connect_result()
- *	with the status from the AP. The driver is allowed to roam to other
- *	BSSes within the ESS when the other BSS matches the connect parameters.
- *	When such roaming is initiated by the driver, the driver is expected to
- *	verify that the target matches the configured security parameters and
- *	to use Reassociation Request frame instead of Association Request frame.
- *	The connect function can also be used to request the driver to perform
- *	a specific roam when connected to an ESS. In that case, the prev_bssid
+ *	call cfg80211_connect_result()/cfg80211_connect_bss() with status code
+ *	%WLAN_STATUS_SUCCESS. If the connection fails for some reason, call
+ *	cfg80211_connect_result()/cfg80211_connect_bss() with the status code
+ *	from the AP or cfg80211_connect_timeout() if no frame with status code
+ *	was received.
+ *	The driver is allowed to roam to other BSSes within the ESS when the
+ *	other BSS matches the connect parameters. When such roaming is initiated
+ *	by the driver, the driver is expected to verify that the target matches
+ *	the configured security parameters and to use Reassociation Request
+ *	frame instead of Association Request frame.
+ *	The connect function can also be used to request the driver to perform a
+ *	specific roam when connected to an ESS. In that case, the prev_bssid
  *	parameter is set to the BSSID of the currently associated BSS as an
- *	indication of requesting reassociation. In both the driver-initiated and
- *	new connect() call initiated roaming cases, the result of roaming is
- *	indicated with a call to cfg80211_roamed() or cfg80211_roamed_bss().
+ *	indication of requesting reassociation.
+ *	In both the driver-initiated and new connect() call initiated roaming
+ *	cases, the result of roaming is indicated with a call to
+ *	cfg80211_roamed() or cfg80211_roamed_bss().
  *	(invoked with the wireless_dev mutex held)
  * @disconnect: Disconnect from the BSS/ESS.
  *	(invoked with the wireless_dev mutex held)
@@ -4680,7 +4684,7 @@ static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp)
 void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
 			  struct cfg80211_bss *bss, const u8 *req_ie,
 			  size_t req_ie_len, const u8 *resp_ie,
-			  size_t resp_ie_len, u16 status, gfp_t gfp);
+			  size_t resp_ie_len, int status, gfp_t gfp);
 
 /**
  * cfg80211_connect_result - notify cfg80211 of connection result
@@ -4709,6 +4713,29 @@ cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 			     resp_ie_len, status, gfp);
 }
 
+/**
+ * cfg80211_connect_timeout - notify cfg80211 of connection timeout
+ *
+ * @dev: network device
+ * @bssid: the BSSID of the AP
+ * @req_ie: association request IEs (maybe be %NULL)
+ * @req_ie_len: association request IEs length
+ * @gfp: allocation flags
+ *
+ * It should be called by the underlying driver whenever connect() has failed
+ * in a sequence where no explicit authentication/association rejection was
+ * received from the AP. This could happen, e.g., due to not being able to send
+ * out the Authentication or Association Request frame or timing out while
+ * waiting for the response.
+ */
+static inline void
+cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid,
+			 const u8 *req_ie, size_t req_ie_len, gfp_t gfp)
+{
+	cfg80211_connect_bss(dev, bssid, NULL, req_ie, req_ie_len, NULL, 0, -1,
+			     gfp);
+}
+
 /**
  * cfg80211_roamed - notify cfg80211 of roaming
  *
-- 
cgit v1.2.3


From 019ae3a918811715192b22c400ac78d54acc26a9 Mon Sep 17 00:00:00 2001
From: "Kanchanapally, Vidyullatha" <vkanchan@qti.qualcomm.com>
Date: Mon, 16 May 2016 10:41:04 +0530
Subject: cfg80211: Advertise extended capabilities per interface type to
 userspace

The driver extended capabilities may differ for different
interface types which the userspace needs to know (for
example the fine timing measurement initiator and responder
bits might differ for a station and AP). Add a new nl80211
attribute to provide extended capabilities per interface type
to userspace.

Signed-off-by: Vidyullatha Kanchanapally <vkanchan@qti.qualcomm.com>
Reviewed-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 28 +++++++++++++++++++++++++++-
 include/uapi/linux/nl80211.h |  7 +++++++
 net/wireless/core.c          | 30 ++++++++++++++++++++++++++++++
 net/wireless/nl80211.c       | 43 ++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 106 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 537f010cf5e1..7bbb00d8b2cd 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3083,6 +3083,24 @@ struct wiphy_vendor_command {
 		      unsigned long *storage);
 };
 
+/**
+ * struct wiphy_iftype_ext_capab - extended capabilities per interface type
+ * @iftype: interface type
+ * @extended_capabilities: extended capabilities supported by the driver,
+ *	additional capabilities might be supported by userspace; these are the
+ *	802.11 extended capabilities ("Extended Capabilities element") and are
+ *	in the same format as in the information element. See IEEE Std
+ *	802.11-2012 8.4.2.29 for the defined fields.
+ * @extended_capabilities_mask: mask of the valid values
+ * @extended_capabilities_len: length of the extended capabilities
+ */
+struct wiphy_iftype_ext_capab {
+	enum nl80211_iftype iftype;
+	const u8 *extended_capabilities;
+	const u8 *extended_capabilities_mask;
+	u8 extended_capabilities_len;
+};
+
 /**
  * struct wiphy - wireless hardware description
  * @reg_notifier: the driver's regulatory notification callback,
@@ -3203,9 +3221,14 @@ struct wiphy_vendor_command {
  *	additional capabilities might be supported by userspace; these are
  *	the 802.11 extended capabilities ("Extended Capabilities element")
  *	and are in the same format as in the information element. See
- *	802.11-2012 8.4.2.29 for the defined fields.
+ *	802.11-2012 8.4.2.29 for the defined fields. These are the default
+ *	extended capabilities to be used if the capabilities are not specified
+ *	for a specific interface type in iftype_ext_capab.
  * @extended_capabilities_mask: mask of the valid values
  * @extended_capabilities_len: length of the extended capabilities
+ * @iftype_ext_capab: array of extended capabilities per interface type
+ * @num_iftype_ext_capab: number of interface types for which extended
+ *	capabilities are specified separately.
  * @coalesce: packet coalescing support information
  *
  * @vendor_commands: array of vendor commands supported by the hardware
@@ -3305,6 +3328,9 @@ struct wiphy {
 	const u8 *extended_capabilities, *extended_capabilities_mask;
 	u8 extended_capabilities_len;
 
+	const struct wiphy_iftype_ext_capab *iftype_ext_capab;
+	unsigned int num_iftype_ext_capab;
+
 	/* If multiple wiphys are registered and you're handed e.g.
 	 * a regular netdev with assigned ieee80211_ptr, you won't
 	 * know whether it points to a wiphy your driver has registered
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 8d995316aadb..53c8278827a0 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1824,6 +1824,11 @@ enum nl80211_commands {
  *
  * @NL80211_ATTR_PAD: attribute used for padding for 64-bit alignment
  *
+ * @NL80211_ATTR_IFTYPE_EXT_CAPA: Nested attribute of the following attributes:
+ *	%NL80211_ATTR_IFTYPE, %NL80211_ATTR_EXT_CAPA,
+ *	%NL80211_ATTR_EXT_CAPA_MASK, to specify the extended capabilities per
+ *	interface type.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2206,6 +2211,8 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_PAD,
 
+	NL80211_ATTR_IFTYPE_EXT_CAPA,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/core.c b/net/wireless/core.c
index d25c82bc1bbe..b8e10a952111 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -750,6 +750,36 @@ int wiphy_register(struct wiphy *wiphy)
 		nl80211_send_reg_change_event(&request);
 	}
 
+	/* Check that nobody globally advertises any capabilities they do not
+	 * advertise on all possible interface types.
+	 */
+	if (wiphy->extended_capabilities_len &&
+	    wiphy->num_iftype_ext_capab &&
+	    wiphy->iftype_ext_capab) {
+		u8 supported_on_all, j;
+		const struct wiphy_iftype_ext_capab *capab;
+
+		capab = wiphy->iftype_ext_capab;
+		for (j = 0; j < wiphy->extended_capabilities_len; j++) {
+			if (capab[0].extended_capabilities_len > j)
+				supported_on_all =
+					capab[0].extended_capabilities[j];
+			else
+				supported_on_all = 0x00;
+			for (i = 1; i < wiphy->num_iftype_ext_capab; i++) {
+				if (j >= capab[i].extended_capabilities_len) {
+					supported_on_all = 0x00;
+					break;
+				}
+				supported_on_all &=
+					capab[i].extended_capabilities[j];
+			}
+			if (WARN_ON(wiphy->extended_capabilities[j] &
+				    ~supported_on_all))
+				break;
+		}
+	}
+
 	rdev->wiphy.registered = true;
 	rtnl_unlock();
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 03ac2ba8b174..d12044996a0e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1264,7 +1264,7 @@ nl80211_send_mgmt_stypes(struct sk_buff *msg,
 struct nl80211_dump_wiphy_state {
 	s64 filter_wiphy;
 	long start;
-	long split_start, band_start, chan_start;
+	long split_start, band_start, chan_start, capa_start;
 	bool split;
 };
 
@@ -1761,6 +1761,47 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 			nla_nest_end(msg, nested);
 		}
 
+		state->split_start++;
+		break;
+	case 13:
+		if (rdev->wiphy.num_iftype_ext_capab &&
+		    rdev->wiphy.iftype_ext_capab) {
+			struct nlattr *nested_ext_capab, *nested;
+
+			nested = nla_nest_start(msg,
+						NL80211_ATTR_IFTYPE_EXT_CAPA);
+			if (!nested)
+				goto nla_put_failure;
+
+			for (i = state->capa_start;
+			     i < rdev->wiphy.num_iftype_ext_capab; i++) {
+				const struct wiphy_iftype_ext_capab *capab;
+
+				capab = &rdev->wiphy.iftype_ext_capab[i];
+
+				nested_ext_capab = nla_nest_start(msg, i);
+				if (!nested_ext_capab ||
+				    nla_put_u32(msg, NL80211_ATTR_IFTYPE,
+						capab->iftype) ||
+				    nla_put(msg, NL80211_ATTR_EXT_CAPA,
+					    capab->extended_capabilities_len,
+					    capab->extended_capabilities) ||
+				    nla_put(msg, NL80211_ATTR_EXT_CAPA_MASK,
+					    capab->extended_capabilities_len,
+					    capab->extended_capabilities_mask))
+					goto nla_put_failure;
+
+				nla_nest_end(msg, nested_ext_capab);
+				if (state->split)
+					break;
+			}
+			nla_nest_end(msg, nested);
+			if (i < rdev->wiphy.num_iftype_ext_capab) {
+				state->capa_start = i + 1;
+				break;
+			}
+		}
+
 		/* done */
 		state->split_start = 0;
 		break;
-- 
cgit v1.2.3


From 595d0b29463343c3be995d3948930b8231e5b8cd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 31 May 2016 15:22:41 -0700
Subject: udp: avoid csum_partial() for validated skb

In commit e6afc8ace6dd5 ("udp: remove headers from UDP packets before
queueing"), udp_csum_pull_header() helper was added but missed fact
that CHECKSUM_UNNECESSARY packets were now converted to CHECKSUM_NONE
and skb->csum_valid was set to 1 for them.

Since csum_partial() is quite expensive, even for 8-byte area, it is
worth adding a test.

We also can use skb->data instead of udp_hdr() as we are pulling
UDP headers, as it is sightly faster.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/udp.h b/include/net/udp.h
index ae07f375370d..8894d7144189 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -160,8 +160,8 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
 
 static inline void udp_csum_pull_header(struct sk_buff *skb)
 {
-	if (skb->ip_summed == CHECKSUM_NONE)
-		skb->csum = csum_partial(udp_hdr(skb), sizeof(struct udphdr),
+	if (!skb->csum_valid && skb->ip_summed == CHECKSUM_NONE)
+		skb->csum = csum_partial(skb->data, sizeof(struct udphdr),
 					 skb->csum);
 	skb_pull_rcsum(skb, sizeof(struct udphdr));
 	UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr);
-- 
cgit v1.2.3


From 90017accff61ae89283ad9a51f9ac46ca01633fb Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 2 Jun 2016 15:05:43 -0300
Subject: sctp: Add GSO support

SCTP has this pecualiarity that its packets cannot be just segmented to
(P)MTU. Its chunks must be contained in IP segments, padding respected.
So we can't just generate a big skb, set gso_size to the fragmentation
point and deliver it to IP layer.

This patch takes a different approach. SCTP will now build a skb as it
would be if it was received using GRO. That is, there will be a cover
skb with protocol headers and children ones containing the actual
segments, already segmented to a way that respects SCTP RFCs.

With that, we can tell skb_segment() to just split based on frag_list,
trusting its sizes are already in accordance.

This way SCTP can benefit from GSO and instead of passing several
packets through the stack, it can pass a single large packet.

v2:
- Added support for receiving GSO frames, as requested by Dave Miller.
- Clear skb->cb if packet is GSO (otherwise it's not used by SCTP)
- Added heuristics similar to what we have in TCP for not generating
  single GSO packets that fills cwnd.
v3:
- consider sctphdr size in skb_gso_transport_seglen()
- rebased due to 5c7cdf339af5 ("gso: Remove arbitrary checks for
  unsupported GSO")

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Tested-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h |   7 +-
 include/linux/netdevice.h       |   1 +
 include/linux/skbuff.h          |   2 +
 include/net/sctp/sctp.h         |   4 +
 include/net/sctp/structs.h      |   5 +
 net/core/ethtool.c              |   1 +
 net/core/skbuff.c               |   3 +
 net/sctp/Makefile               |   3 +-
 net/sctp/input.c                |  12 +-
 net/sctp/inqueue.c              |  51 +++++-
 net/sctp/offload.c              |  98 +++++++++++
 net/sctp/output.c               | 363 +++++++++++++++++++++++++++-------------
 net/sctp/protocol.c             |   3 +
 net/sctp/socket.c               |   2 +
 14 files changed, 429 insertions(+), 126 deletions(-)
 create mode 100644 net/sctp/offload.c

(limited to 'include/net')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index aa7b2400f98c..9c6c8ef2e9e7 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -53,8 +53,9 @@ enum {
 					 *     headers in software.
 					 */
 	NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
+	NETIF_F_GSO_SCTP_BIT,		/* ... SCTP fragmentation */
 	/**/NETIF_F_GSO_LAST =		/* last bit, see GSO_MASK */
-		NETIF_F_GSO_TUNNEL_REMCSUM_BIT,
+		NETIF_F_GSO_SCTP_BIT,
 
 	NETIF_F_FCOE_CRC_BIT,		/* FCoE CRC32 */
 	NETIF_F_SCTP_CRC_BIT,		/* SCTP checksum offload */
@@ -128,6 +129,7 @@ enum {
 #define NETIF_F_TSO_MANGLEID	__NETIF_F(TSO_MANGLEID)
 #define NETIF_F_GSO_PARTIAL	 __NETIF_F(GSO_PARTIAL)
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
+#define NETIF_F_GSO_SCTP	__NETIF_F(GSO_SCTP)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX	__NETIF_F(HW_VLAN_STAG_RX)
 #define NETIF_F_HW_VLAN_STAG_TX	__NETIF_F(HW_VLAN_STAG_TX)
@@ -166,7 +168,8 @@ enum {
 				 NETIF_F_FSO)
 
 /* List of features with software fallbacks. */
-#define NETIF_F_GSO_SOFTWARE	(NETIF_F_ALL_TSO | NETIF_F_UFO)
+#define NETIF_F_GSO_SOFTWARE	(NETIF_F_ALL_TSO | NETIF_F_UFO | \
+				 NETIF_F_GSO_SCTP)
 
 /*
  * If one device supports one of these features, then enable them
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f45929ce8157..fa6df2699532 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4012,6 +4012,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
+	BUILD_BUG_ON(SKB_GSO_SCTP    != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT));
 
 	return (features & feature) == feature;
 }
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index aa3f9d7e8d5c..dc0fca747c5e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -487,6 +487,8 @@ enum {
 	SKB_GSO_PARTIAL = 1 << 13,
 
 	SKB_GSO_TUNNEL_REMCSUM = 1 << 14,
+
+	SKB_GSO_SCTP = 1 << 15,
 };
 
 #if BITS_PER_LONG > 32
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index b392ac8382f2..632e205ca54b 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -186,6 +186,10 @@ void sctp_assocs_proc_exit(struct net *net);
 int sctp_remaddr_proc_init(struct net *net);
 void sctp_remaddr_proc_exit(struct net *net);
 
+/*
+ * sctp/offload.c
+ */
+int sctp_offload_init(void);
 
 /*
  * Module global variables
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 16b013a6191c..83c5ec58b93a 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -566,6 +566,9 @@ struct sctp_chunk {
 	/* This points to the sk_buff containing the actual data.  */
 	struct sk_buff *skb;
 
+	/* In case of GSO packets, this will store the head one */
+	struct sk_buff *head_skb;
+
 	/* These are the SCTP headers by reverse order in a packet.
 	 * Note that some of these may happen more than once.  In that
 	 * case, we point at the "current" one, whatever that means
@@ -696,6 +699,8 @@ struct sctp_packet {
 	size_t overhead;
 	/* This is the total size of all chunks INCLUDING padding.  */
 	size_t size;
+	/* This is the maximum size this packet may have */
+	size_t max_size;
 
 	/* The packet is destined for this transport address.
 	 * The function we finally use to pass down to the next lower
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index f4034817d255..977489820eb9 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -89,6 +89,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_GSO_UDP_TUNNEL_BIT] =	 "tx-udp_tnl-segmentation",
 	[NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
 	[NETIF_F_GSO_PARTIAL_BIT] =	 "tx-gso-partial",
+	[NETIF_F_GSO_SCTP_BIT] =	 "tx-sctp-segmentation",
 
 	[NETIF_F_FCOE_CRC_BIT] =         "tx-checksum-fcoe-crc",
 	[NETIF_F_SCTP_CRC_BIT] =        "tx-checksum-sctp",
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5ca562b56ec3..b6e0f95bef36 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -49,6 +49,7 @@
 #include <linux/slab.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
+#include <linux/sctp.h>
 #include <linux/netdevice.h>
 #ifdef CONFIG_NET_CLS_ACT
 #include <net/pkt_sched.h>
@@ -4383,6 +4384,8 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
 			thlen += inner_tcp_hdrlen(skb);
 	} else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
 		thlen = tcp_hdrlen(skb);
+	} else if (unlikely(shinfo->gso_type & SKB_GSO_SCTP)) {
+		thlen = sizeof(struct sctphdr);
 	}
 	/* UFO sets gso_size to the size of the fragmentation
 	 * payload, i.e. the size of the L4 (UDP) header is already
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 0fca5824ad0e..6c4f7496cec6 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -11,7 +11,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  transport.o chunk.o sm_make_chunk.o ulpevent.o \
 	  inqueue.o outqueue.o ulpqueue.o \
 	  tsnmap.o bind_addr.o socket.o primitive.o \
-	  output.o input.o debug.o ssnmap.o auth.o
+	  output.o input.o debug.o ssnmap.o auth.o \
+	  offload.o
 
 sctp_probe-y := probe.o
 
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 5cff2546c3dd..6f8e676d285e 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -139,7 +139,9 @@ int sctp_rcv(struct sk_buff *skb)
 	skb->csum_valid = 0; /* Previous value not applicable */
 	if (skb_csum_unnecessary(skb))
 		__skb_decr_checksum_unnecessary(skb);
-	else if (!sctp_checksum_disable && sctp_rcv_checksum(net, skb) < 0)
+	else if (!sctp_checksum_disable &&
+		 !(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) &&
+		 sctp_rcv_checksum(net, skb) < 0)
 		goto discard_it;
 	skb->csum_valid = 1;
 
@@ -1175,6 +1177,14 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net,
 {
 	sctp_chunkhdr_t *ch;
 
+	/* We do not allow GSO frames here as we need to linearize and
+	 * then cannot guarantee frame boundaries. This shouldn't be an
+	 * issue as packets hitting this are mostly INIT or INIT-ACK and
+	 * those cannot be on GSO-style anyway.
+	 */
+	if ((skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP)
+		return NULL;
+
 	if (skb_linearize(skb))
 		return NULL;
 
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index 5ba08ceda3ab..edabbbdfca54 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -138,6 +138,17 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue)
 		if (chunk->singleton ||
 		    chunk->end_of_packet ||
 		    chunk->pdiscard) {
+			if (chunk->head_skb == chunk->skb) {
+				chunk->skb = skb_shinfo(chunk->skb)->frag_list;
+				goto new_skb;
+			}
+			if (chunk->skb->next) {
+				chunk->skb = chunk->skb->next;
+				goto new_skb;
+			}
+
+			if (chunk->head_skb)
+				chunk->skb = chunk->head_skb;
 			sctp_chunk_free(chunk);
 			chunk = queue->in_progress = NULL;
 		} else {
@@ -155,15 +166,15 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue)
 
 next_chunk:
 		/* Is the queue empty?  */
-		if (list_empty(&queue->in_chunk_list))
+		entry = sctp_list_dequeue(&queue->in_chunk_list);
+		if (!entry)
 			return NULL;
 
-		entry = queue->in_chunk_list.next;
 		chunk = list_entry(entry, struct sctp_chunk, list);
-		list_del_init(entry);
 
 		/* Linearize if it's not GSO */
-		if (skb_is_nonlinear(chunk->skb)) {
+		if ((skb_shinfo(chunk->skb)->gso_type & SKB_GSO_SCTP) != SKB_GSO_SCTP &&
+		    skb_is_nonlinear(chunk->skb)) {
 			if (skb_linearize(chunk->skb)) {
 				__SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS);
 				sctp_chunk_free(chunk);
@@ -174,15 +185,39 @@ next_chunk:
 			chunk->sctp_hdr = sctp_hdr(chunk->skb);
 		}
 
+		if ((skb_shinfo(chunk->skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP) {
+			/* GSO-marked skbs but without frags, handle
+			 * them normally
+			 */
+			if (skb_shinfo(chunk->skb)->frag_list)
+				chunk->head_skb = chunk->skb;
+
+			/* skbs with "cover letter" */
+			if (chunk->head_skb && chunk->skb->data_len == chunk->skb->len)
+				chunk->skb = skb_shinfo(chunk->skb)->frag_list;
+
+			if (WARN_ON(!chunk->skb)) {
+				__SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS);
+				sctp_chunk_free(chunk);
+				goto next_chunk;
+			}
+		}
+
+		if (chunk->asoc)
+			sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb);
+
 		queue->in_progress = chunk;
 
+new_skb:
 		/* This is the first chunk in the packet.  */
-		chunk->singleton = 1;
 		ch = (sctp_chunkhdr_t *) chunk->skb->data;
+		chunk->singleton = 1;
 		chunk->data_accepted = 0;
-
-		if (chunk->asoc)
-			sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb);
+		chunk->pdiscard = 0;
+		chunk->auth = 0;
+		chunk->has_asconf = 0;
+		chunk->end_of_packet = 0;
+		chunk->ecn_ce_done = 0;
 	}
 
 	chunk->chunk_hdr = ch;
diff --git a/net/sctp/offload.c b/net/sctp/offload.c
new file mode 100644
index 000000000000..a37887b373a7
--- /dev/null
+++ b/net/sctp/offload.c
@@ -0,0 +1,98 @@
+/*
+ * sctp_offload - GRO/GSO Offloading for SCTP
+ *
+ * Copyright (C) 2015, Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/socket.h>
+#include <linux/sctp.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/kfifo.h>
+#include <linux/time.h>
+#include <net/net_namespace.h>
+
+#include <linux/skbuff.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/checksum.h>
+#include <net/protocol.h>
+
+static __le32 sctp_gso_make_checksum(struct sk_buff *skb)
+{
+	skb->ip_summed = CHECKSUM_NONE;
+	return sctp_compute_cksum(skb, skb_transport_offset(skb));
+}
+
+static struct sk_buff *sctp_gso_segment(struct sk_buff *skb,
+					netdev_features_t features)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	struct sctphdr *sh;
+
+	sh = sctp_hdr(skb);
+	if (!pskb_may_pull(skb, sizeof(*sh)))
+		goto out;
+
+	__skb_pull(skb, sizeof(*sh));
+
+	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+		/* Packet is from an untrusted source, reset gso_segs. */
+		struct skb_shared_info *pinfo = skb_shinfo(skb);
+		struct sk_buff *frag_iter;
+
+		pinfo->gso_segs = 0;
+		if (skb->len != skb->data_len) {
+			/* Means we have chunks in here too */
+			pinfo->gso_segs++;
+		}
+
+		skb_walk_frags(skb, frag_iter)
+			pinfo->gso_segs++;
+
+		segs = NULL;
+		goto out;
+	}
+
+	segs = skb_segment(skb, features | NETIF_F_HW_CSUM);
+	if (IS_ERR(segs))
+		goto out;
+
+	/* All that is left is update SCTP CRC if necessary */
+	if (!(features & NETIF_F_SCTP_CRC)) {
+		for (skb = segs; skb; skb = skb->next) {
+			if (skb->ip_summed == CHECKSUM_PARTIAL) {
+				sh = sctp_hdr(skb);
+				sh->checksum = sctp_gso_make_checksum(skb);
+			}
+		}
+	}
+
+out:
+	return segs;
+}
+
+static const struct net_offload sctp_offload = {
+	.callbacks = {
+		.gso_segment = sctp_gso_segment,
+	},
+};
+
+int __init sctp_offload_init(void)
+{
+	return inet_add_offload(&sctp_offload, IPPROTO_SCTP);
+}
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 9844fe573029..60499a69179d 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -84,18 +84,42 @@ static void sctp_packet_reset(struct sctp_packet *packet)
 struct sctp_packet *sctp_packet_config(struct sctp_packet *packet,
 				       __u32 vtag, int ecn_capable)
 {
-	struct sctp_chunk *chunk = NULL;
+	struct sctp_transport *tp = packet->transport;
+	struct sctp_association *asoc = tp->asoc;
 
 	pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag);
 
 	packet->vtag = vtag;
 
+	if (asoc && tp->dst) {
+		struct sock *sk = asoc->base.sk;
+
+		rcu_read_lock();
+		if (__sk_dst_get(sk) != tp->dst) {
+			dst_hold(tp->dst);
+			sk_setup_caps(sk, tp->dst);
+		}
+
+		if (sk_can_gso(sk)) {
+			struct net_device *dev = tp->dst->dev;
+
+			packet->max_size = dev->gso_max_size;
+		} else {
+			packet->max_size = asoc->pathmtu;
+		}
+		rcu_read_unlock();
+
+	} else {
+		packet->max_size = tp->pathmtu;
+	}
+
 	if (ecn_capable && sctp_packet_empty(packet)) {
-		chunk = sctp_get_ecne_prepend(packet->transport->asoc);
+		struct sctp_chunk *chunk;
 
 		/* If there a is a prepend chunk stick it on the list before
 		 * any other chunks get appended.
 		 */
+		chunk = sctp_get_ecne_prepend(asoc);
 		if (chunk)
 			sctp_packet_append_chunk(packet, chunk);
 	}
@@ -381,12 +405,15 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
 	struct sctp_transport *tp = packet->transport;
 	struct sctp_association *asoc = tp->asoc;
 	struct sctphdr *sh;
-	struct sk_buff *nskb;
+	struct sk_buff *nskb = NULL, *head = NULL;
 	struct sctp_chunk *chunk, *tmp;
 	struct sock *sk;
 	int err = 0;
 	int padding;		/* How much padding do we need?  */
+	int pkt_size;
 	__u8 has_data = 0;
+	int gso = 0;
+	int pktcount = 0;
 	struct dst_entry *dst;
 	unsigned char *auth = NULL;	/* pointer to auth in skb data */
 
@@ -400,18 +427,37 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
 	chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
 	sk = chunk->skb->sk;
 
-	/* Allocate the new skb.  */
-	nskb = alloc_skb(packet->size + MAX_HEADER, gfp);
-	if (!nskb)
+	/* Allocate the head skb, or main one if not in GSO */
+	if (packet->size > tp->pathmtu && !packet->ipfragok) {
+		if (sk_can_gso(sk)) {
+			gso = 1;
+			pkt_size = packet->overhead;
+		} else {
+			/* If this happens, we trash this packet and try
+			 * to build a new one, hopefully correct this
+			 * time. Application may notice this error.
+			 */
+			pr_err_once("Trying to GSO but underlying device doesn't support it.");
+			goto nomem;
+		}
+	} else {
+		pkt_size = packet->size;
+	}
+	head = alloc_skb(pkt_size + MAX_HEADER, gfp);
+	if (!head)
 		goto nomem;
+	if (gso) {
+		NAPI_GRO_CB(head)->last = head;
+		skb_shinfo(head)->gso_type = sk->sk_gso_type;
+	}
 
 	/* Make sure the outbound skb has enough header room reserved. */
-	skb_reserve(nskb, packet->overhead + MAX_HEADER);
+	skb_reserve(head, packet->overhead + MAX_HEADER);
 
 	/* Set the owning socket so that we know where to get the
 	 * destination IP address.
 	 */
-	sctp_packet_set_owner_w(nskb, sk);
+	sctp_packet_set_owner_w(head, sk);
 
 	if (!sctp_transport_dst_check(tp)) {
 		sctp_transport_route(tp, NULL, sctp_sk(sk));
@@ -422,11 +468,11 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
 	dst = dst_clone(tp->dst);
 	if (!dst)
 		goto no_route;
-	skb_dst_set(nskb, dst);
+	skb_dst_set(head, dst);
 
 	/* Build the SCTP header.  */
-	sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr));
-	skb_reset_transport_header(nskb);
+	sh = (struct sctphdr *)skb_push(head, sizeof(struct sctphdr));
+	skb_reset_transport_header(head);
 	sh->source = htons(packet->source_port);
 	sh->dest   = htons(packet->destination_port);
 
@@ -441,90 +487,133 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
 	sh->vtag     = htonl(packet->vtag);
 	sh->checksum = 0;
 
-	/**
-	 * 6.10 Bundling
-	 *
-	 *    An endpoint bundles chunks by simply including multiple
-	 *    chunks in one outbound SCTP packet.  ...
-	 */
-
-	/**
-	 * 3.2  Chunk Field Descriptions
-	 *
-	 * The total length of a chunk (including Type, Length and
-	 * Value fields) MUST be a multiple of 4 bytes.  If the length
-	 * of the chunk is not a multiple of 4 bytes, the sender MUST
-	 * pad the chunk with all zero bytes and this padding is not
-	 * included in the chunk length field.  The sender should
-	 * never pad with more than 3 bytes.
-	 *
-	 * [This whole comment explains WORD_ROUND() below.]
-	 */
-
 	pr_debug("***sctp_transmit_packet***\n");
 
-	list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
-		list_del_init(&chunk->list);
-		if (sctp_chunk_is_data(chunk)) {
-			/* 6.3.1 C4) When data is in flight and when allowed
-			 * by rule C5, a new RTT measurement MUST be made each
-			 * round trip.  Furthermore, new RTT measurements
-			 * SHOULD be made no more than once per round-trip
-			 * for a given destination transport address.
-			 */
+	do {
+		/* Set up convenience variables... */
+		chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
+		pktcount++;
 
-			if (!chunk->resent && !tp->rto_pending) {
-				chunk->rtt_in_progress = 1;
-				tp->rto_pending = 1;
+		/* Calculate packet size, so it fits in PMTU. Leave
+		 * other chunks for the next packets.
+		 */
+		if (gso) {
+			pkt_size = packet->overhead;
+			list_for_each_entry(chunk, &packet->chunk_list, list) {
+				int padded = WORD_ROUND(chunk->skb->len);
+
+				if (pkt_size + padded > tp->pathmtu)
+					break;
+				pkt_size += padded;
 			}
 
-			has_data = 1;
-		}
+			/* Allocate a new skb. */
+			nskb = alloc_skb(pkt_size + MAX_HEADER, gfp);
+			if (!nskb)
+				goto nomem;
 
-		padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len;
-		if (padding)
-			memset(skb_put(chunk->skb, padding), 0, padding);
+			/* Make sure the outbound skb has enough header
+			 * room reserved.
+			 */
+			skb_reserve(nskb, packet->overhead + MAX_HEADER);
+		} else {
+			nskb = head;
+		}
 
-		/* if this is the auth chunk that we are adding,
-		 * store pointer where it will be added and put
-		 * the auth into the packet.
+		/**
+		 * 3.2  Chunk Field Descriptions
+		 *
+		 * The total length of a chunk (including Type, Length and
+		 * Value fields) MUST be a multiple of 4 bytes.  If the length
+		 * of the chunk is not a multiple of 4 bytes, the sender MUST
+		 * pad the chunk with all zero bytes and this padding is not
+		 * included in the chunk length field.  The sender should
+		 * never pad with more than 3 bytes.
+		 *
+		 * [This whole comment explains WORD_ROUND() below.]
 		 */
-		if (chunk == packet->auth)
-			auth = skb_tail_pointer(nskb);
 
-		memcpy(skb_put(nskb, chunk->skb->len),
+		pkt_size -= packet->overhead;
+		list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
+			list_del_init(&chunk->list);
+			if (sctp_chunk_is_data(chunk)) {
+				/* 6.3.1 C4) When data is in flight and when allowed
+				 * by rule C5, a new RTT measurement MUST be made each
+				 * round trip.  Furthermore, new RTT measurements
+				 * SHOULD be made no more than once per round-trip
+				 * for a given destination transport address.
+				 */
+
+				if (!chunk->resent && !tp->rto_pending) {
+					chunk->rtt_in_progress = 1;
+					tp->rto_pending = 1;
+				}
+
+				has_data = 1;
+			}
+
+			padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len;
+			if (padding)
+				memset(skb_put(chunk->skb, padding), 0, padding);
+
+			/* if this is the auth chunk that we are adding,
+			 * store pointer where it will be added and put
+			 * the auth into the packet.
+			 */
+			if (chunk == packet->auth)
+				auth = skb_tail_pointer(nskb);
+
+			memcpy(skb_put(nskb, chunk->skb->len),
 			       chunk->skb->data, chunk->skb->len);
 
-		pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, "
-			 "rtt_in_progress:%d\n", chunk,
-			 sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)),
-			 chunk->has_tsn ? "TSN" : "No TSN",
-			 chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0,
-			 ntohs(chunk->chunk_hdr->length), chunk->skb->len,
-			 chunk->rtt_in_progress);
-
-		/*
-		 * If this is a control chunk, this is our last
-		 * reference. Free data chunks after they've been
-		 * acknowledged or have failed.
-		 */
-		if (!sctp_chunk_is_data(chunk))
-			sctp_chunk_free(chunk);
-	}
+			pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, rtt_in_progress:%d\n",
+				 chunk,
+				 sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)),
+				 chunk->has_tsn ? "TSN" : "No TSN",
+				 chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0,
+				 ntohs(chunk->chunk_hdr->length), chunk->skb->len,
+				 chunk->rtt_in_progress);
+
+			/* If this is a control chunk, this is our last
+			 * reference. Free data chunks after they've been
+			 * acknowledged or have failed.
+			 * Re-queue auth chunks if needed.
+			 */
+			pkt_size -= WORD_ROUND(chunk->skb->len);
 
-	/* SCTP-AUTH, Section 6.2
-	 *    The sender MUST calculate the MAC as described in RFC2104 [2]
-	 *    using the hash function H as described by the MAC Identifier and
-	 *    the shared association key K based on the endpoint pair shared key
-	 *    described by the shared key identifier.  The 'data' used for the
-	 *    computation of the AUTH-chunk is given by the AUTH chunk with its
-	 *    HMAC field set to zero (as shown in Figure 6) followed by all
-	 *    chunks that are placed after the AUTH chunk in the SCTP packet.
-	 */
-	if (auth)
-		sctp_auth_calculate_hmac(asoc, nskb,
-					 (struct sctp_auth_chunk *)auth,
-					 gfp);
+			if (chunk == packet->auth && !list_empty(&packet->chunk_list))
+				list_add(&chunk->list, &packet->chunk_list);
+			else if (!sctp_chunk_is_data(chunk))
+				sctp_chunk_free(chunk);
+
+			if (!pkt_size)
+				break;
+		}
+
+		/* SCTP-AUTH, Section 6.2
+		 *    The sender MUST calculate the MAC as described in RFC2104 [2]
+		 *    using the hash function H as described by the MAC Identifier and
+		 *    the shared association key K based on the endpoint pair shared key
+		 *    described by the shared key identifier.  The 'data' used for the
+		 *    computation of the AUTH-chunk is given by the AUTH chunk with its
+		 *    HMAC field set to zero (as shown in Figure 6) followed by all
+		 *    chunks that are placed after the AUTH chunk in the SCTP packet.
+		 */
+		if (auth)
+			sctp_auth_calculate_hmac(asoc, nskb,
+						 (struct sctp_auth_chunk *)auth,
+						 gfp);
+
+		if (!gso)
+			break;
+
+		if (skb_gro_receive(&head, nskb))
+			goto nomem;
+		nskb = NULL;
+		if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >=
+				 sk->sk_gso_max_segs))
+			goto nomem;
+	} while (!list_empty(&packet->chunk_list));
 
 	/* 2) Calculate the Adler-32 checksum of the whole packet,
 	 *    including the SCTP common header and all the
@@ -532,16 +621,18 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
 	 *
 	 * Note: Adler-32 is no longer applicable, as has been replaced
 	 * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>.
+	 *
+	 * If it's a GSO packet, it's postponed to sctp_skb_segment.
 	 */
-	if (!sctp_checksum_disable) {
-		if (!(dst->dev->features & NETIF_F_SCTP_CRC) ||
-		    (dst_xfrm(dst) != NULL) || packet->ipfragok) {
-			sh->checksum = sctp_compute_cksum(nskb, 0);
+	if (!sctp_checksum_disable || gso) {
+		if (!gso && (!(dst->dev->features & NETIF_F_SCTP_CRC) ||
+			     dst_xfrm(dst) || packet->ipfragok)) {
+			sh->checksum = sctp_compute_cksum(head, 0);
 		} else {
 			/* no need to seed pseudo checksum for SCTP */
-			nskb->ip_summed = CHECKSUM_PARTIAL;
-			nskb->csum_start = skb_transport_header(nskb) - nskb->head;
-			nskb->csum_offset = offsetof(struct sctphdr, checksum);
+			head->ip_summed = CHECKSUM_PARTIAL;
+			head->csum_start = skb_transport_header(head) - head->head;
+			head->csum_offset = offsetof(struct sctphdr, checksum);
 		}
 	}
 
@@ -557,7 +648,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
 	 * Note: The works for IPv6 layer checks this bit too later
 	 * in transmission.  See IP6_ECN_flow_xmit().
 	 */
-	tp->af_specific->ecn_capable(nskb->sk);
+	tp->af_specific->ecn_capable(sk);
 
 	/* Set up the IP options.  */
 	/* BUG: not implemented
@@ -566,7 +657,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
 
 	/* Dump that on IP!  */
 	if (asoc) {
-		asoc->stats.opackets++;
+		asoc->stats.opackets += pktcount;
 		if (asoc->peer.last_sent_to != tp)
 			/* Considering the multiple CPU scenario, this is a
 			 * "correcter" place for last_sent_to.  --xguo
@@ -589,16 +680,36 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
 		}
 	}
 
-	pr_debug("***sctp_transmit_packet*** skb->len:%d\n", nskb->len);
+	pr_debug("***sctp_transmit_packet*** skb->len:%d\n", head->len);
+
+	if (gso) {
+		/* Cleanup our debris for IP stacks */
+		memset(head->cb, 0, max(sizeof(struct inet_skb_parm),
+					sizeof(struct inet6_skb_parm)));
 
-	nskb->ignore_df = packet->ipfragok;
-	tp->af_specific->sctp_xmit(nskb, tp);
+		skb_shinfo(head)->gso_segs = pktcount;
+		skb_shinfo(head)->gso_size = GSO_BY_FRAGS;
+
+		/* We have to refresh this in case we are xmiting to
+		 * more than one transport at a time
+		 */
+		rcu_read_lock();
+		if (__sk_dst_get(sk) != tp->dst) {
+			dst_hold(tp->dst);
+			sk_setup_caps(sk, tp->dst);
+		}
+		rcu_read_unlock();
+	}
+	head->ignore_df = packet->ipfragok;
+	tp->af_specific->sctp_xmit(head, tp);
 
 out:
 	sctp_packet_reset(packet);
 	return err;
 no_route:
-	kfree_skb(nskb);
+	kfree_skb(head);
+	if (nskb != head)
+		kfree_skb(nskb);
 
 	if (asoc)
 		IP_INC_STATS(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES);
@@ -751,39 +862,63 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet,
 					struct sctp_chunk *chunk,
 					u16 chunk_len)
 {
-	size_t psize;
-	size_t pmtu;
-	int too_big;
+	size_t psize, pmtu;
 	sctp_xmit_t retval = SCTP_XMIT_OK;
 
 	psize = packet->size;
-	pmtu  = ((packet->transport->asoc) ?
-		(packet->transport->asoc->pathmtu) :
-		(packet->transport->pathmtu));
-
-	too_big = (psize + chunk_len > pmtu);
+	if (packet->transport->asoc)
+		pmtu = packet->transport->asoc->pathmtu;
+	else
+		pmtu = packet->transport->pathmtu;
 
 	/* Decide if we need to fragment or resubmit later. */
-	if (too_big) {
-		/* It's OK to fragmet at IP level if any one of the following
+	if (psize + chunk_len > pmtu) {
+		/* It's OK to fragment at IP level if any one of the following
 		 * is true:
-		 * 	1. The packet is empty (meaning this chunk is greater
-		 * 	   the MTU)
-		 * 	2. The chunk we are adding is a control chunk
-		 * 	3. The packet doesn't have any data in it yet and data
-		 * 	requires authentication.
+		 *	1. The packet is empty (meaning this chunk is greater
+		 *	   the MTU)
+		 *	2. The packet doesn't have any data in it yet and data
+		 *	   requires authentication.
 		 */
-		if (sctp_packet_empty(packet) || !sctp_chunk_is_data(chunk) ||
+		if (sctp_packet_empty(packet) ||
 		    (!packet->has_data && chunk->auth)) {
 			/* We no longer do re-fragmentation.
 			 * Just fragment at the IP layer, if we
 			 * actually hit this condition
 			 */
 			packet->ipfragok = 1;
-		} else {
-			retval = SCTP_XMIT_PMTU_FULL;
+			goto out;
 		}
+
+		/* It is also okay to fragment if the chunk we are
+		 * adding is a control chunk, but only if current packet
+		 * is not a GSO one otherwise it causes fragmentation of
+		 * a large frame. So in this case we allow the
+		 * fragmentation by forcing it to be in a new packet.
+		 */
+		if (!sctp_chunk_is_data(chunk) && packet->has_data)
+			retval = SCTP_XMIT_PMTU_FULL;
+
+		if (psize + chunk_len > packet->max_size)
+			/* Hit GSO/PMTU limit, gotta flush */
+			retval = SCTP_XMIT_PMTU_FULL;
+
+		if (!packet->transport->burst_limited &&
+		    psize + chunk_len > (packet->transport->cwnd >> 1))
+			/* Do not allow a single GSO packet to use more
+			 * than half of cwnd.
+			 */
+			retval = SCTP_XMIT_PMTU_FULL;
+
+		if (packet->transport->burst_limited &&
+		    psize + chunk_len > (packet->transport->burst_limited >> 1))
+			/* Do not allow a single GSO packet to use more
+			 * than half of original cwnd.
+			 */
+			retval = SCTP_XMIT_PMTU_FULL;
+		/* Otherwise it will fit in the GSO packet */
 	}
 
+out:
 	return retval;
 }
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index d3d50daa248b..40022ee885d7 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1516,6 +1516,9 @@ static __init int sctp_init(void)
 	if (status)
 		goto err_v6_add_protocol;
 
+	if (sctp_offload_init() < 0)
+		pr_crit("%s: Cannot add SCTP protocol offload\n", __func__);
+
 out:
 	return status;
 err_v6_add_protocol:
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 67154b848aa9..712fb2339baa 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4003,6 +4003,8 @@ static int sctp_init_sock(struct sock *sk)
 		return -ESOCKTNOSUPPORT;
 	}
 
+	sk->sk_gso_type = SKB_GSO_SCTP;
+
 	/* Initialize default send parameters. These parameters can be
 	 * modified with the SCTP_DEFAULT_SEND_PARAM socket option.
 	 */
-- 
cgit v1.2.3


From c8b098086b4c744084350d2757a637ad756adf34 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 4 Jun 2016 21:16:57 +0200
Subject: net: dsa: Add a ports structure and use it in the switch structure

There are going to be more per-port members added to the switch
structure. So add a port structure and move the netdev into it.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/bcm_sf2.c   |  4 ++--
 drivers/net/dsa/mv88e6xxx.c | 27 ++++++++++++++++-----------
 include/net/dsa.h           |  8 ++++++--
 net/dsa/dsa.c               |  8 ++++----
 net/dsa/slave.c             |  4 ++--
 net/dsa/tag_brcm.c          |  4 ++--
 net/dsa/tag_dsa.c           |  4 ++--
 net/dsa/tag_edsa.c          |  4 ++--
 net/dsa/tag_trailer.c       |  4 ++--
 9 files changed, 38 insertions(+), 29 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 10ddd5a5dfb6..73df91bb0466 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -804,7 +804,7 @@ static int bcm_sf2_sw_fdb_dump(struct dsa_switch *ds, int port,
 			       int (*cb)(struct switchdev_obj *obj))
 {
 	struct bcm_sf2_priv *priv = ds_to_priv(ds);
-	struct net_device *dev = ds->ports[port];
+	struct net_device *dev = ds->ports[port].netdev;
 	struct bcm_sf2_arl_entry results[2];
 	unsigned int count = 0;
 	int ret;
@@ -1248,7 +1248,7 @@ static void bcm_sf2_sw_fixed_link_update(struct dsa_switch *ds, int port,
 		 * state machine and make it go in PHY_FORCING state instead.
 		 */
 		if (!status->link)
-			netif_carrier_off(ds->ports[port]);
+			netif_carrier_off(ds->ports[port].netdev);
 		status->duplex = 1;
 	} else {
 		status->link = 1;
diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index c361036e7f9c..85332d9a245a 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -1327,7 +1327,7 @@ static int _mv88e6xxx_port_state(struct mv88e6xxx_priv_state *ps, int port,
 		if (ret)
 			return ret;
 
-		netdev_dbg(ds->ports[port], "PortState %s (was %s)\n",
+		netdev_dbg(ds->ports[port].netdev, "PortState %s (was %s)\n",
 			   mv88e6xxx_port_state_names[state],
 			   mv88e6xxx_port_state_names[oldstate]);
 	}
@@ -1405,7 +1405,8 @@ static void mv88e6xxx_port_stp_state_set(struct dsa_switch *ds, int port,
 	mutex_unlock(&ps->smi_mutex);
 
 	if (err)
-		netdev_err(ds->ports[port], "failed to update state to %s\n",
+		netdev_err(ds->ports[port].netdev,
+			   "failed to update state to %s\n",
 			   mv88e6xxx_port_state_names[stp_state]);
 }
 
@@ -1431,8 +1432,8 @@ static int _mv88e6xxx_port_pvid(struct mv88e6xxx_priv_state *ps, int port,
 		if (ret < 0)
 			return ret;
 
-		netdev_dbg(ds->ports[port], "DefaultVID %d (was %d)\n", *new,
-			   pvid);
+		netdev_dbg(ds->ports[port].netdev,
+			   "DefaultVID %d (was %d)\n", *new, pvid);
 	}
 
 	if (old)
@@ -1847,7 +1848,8 @@ static int _mv88e6xxx_port_fid(struct mv88e6xxx_priv_state *ps, int port,
 		if (ret < 0)
 			return ret;
 
-		netdev_dbg(ds->ports[port], "FID %d (was %d)\n", *new, fid);
+		netdev_dbg(ds->ports[port].netdev,
+			   "FID %d (was %d)\n", *new, fid);
 	}
 
 	if (old)
@@ -2028,7 +2030,7 @@ static int mv88e6xxx_port_check_hw_vlan(struct dsa_switch *ds, int port,
 			    ps->ports[port].bridge_dev)
 				break; /* same bridge, check next VLAN */
 
-			netdev_warn(ds->ports[port],
+			netdev_warn(ds->ports[port].netdev,
 				    "hardware VLAN %d already used by %s\n",
 				    vlan.vid,
 				    netdev_name(ps->ports[i].bridge_dev));
@@ -2078,7 +2080,7 @@ static int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port,
 		if (ret < 0)
 			goto unlock;
 
-		netdev_dbg(ds->ports[port], "802.1Q Mode %s (was %s)\n",
+		netdev_dbg(ds->ports[port].netdev, "802.1Q Mode %s (was %s)\n",
 			   mv88e6xxx_port_8021q_mode_names[new],
 			   mv88e6xxx_port_8021q_mode_names[old]);
 	}
@@ -2147,11 +2149,12 @@ static void mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port,
 
 	for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid)
 		if (_mv88e6xxx_port_vlan_add(ps, port, vid, untagged))
-			netdev_err(ds->ports[port], "failed to add VLAN %d%c\n",
+			netdev_err(ds->ports[port].netdev,
+				   "failed to add VLAN %d%c\n",
 				   vid, untagged ? 'u' : 't');
 
 	if (pvid && _mv88e6xxx_port_pvid_set(ps, port, vlan->vid_end))
-		netdev_err(ds->ports[port], "failed to set PVID %d\n",
+		netdev_err(ds->ports[port].netdev, "failed to set PVID %d\n",
 			   vlan->vid_end);
 
 	mutex_unlock(&ps->smi_mutex);
@@ -2336,7 +2339,8 @@ static void mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port,
 
 	mutex_lock(&ps->smi_mutex);
 	if (_mv88e6xxx_port_fdb_load(ps, port, fdb->addr, fdb->vid, state))
-		netdev_err(ds->ports[port], "failed to load MAC address\n");
+		netdev_err(ds->ports[port].netdev,
+			   "failed to load MAC address\n");
 	mutex_unlock(&ps->smi_mutex);
 }
 
@@ -2537,7 +2541,8 @@ static void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port)
 	for (i = 0; i < ps->info->num_ports; ++i)
 		if (i == port || ps->ports[i].bridge_dev == bridge)
 			if (_mv88e6xxx_port_based_vlan_map(ps, i))
-				netdev_warn(ds->ports[i], "failed to remap\n");
+				netdev_warn(ds->ports[i].netdev,
+					    "failed to remap\n");
 
 	mutex_unlock(&ps->smi_mutex);
 }
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 17c3d37b6779..9aed8572037c 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -119,6 +119,10 @@ struct dsa_switch_tree {
 	struct dsa_switch	*ds[DSA_MAX_SWITCHES];
 };
 
+struct dsa_port {
+	struct net_device	*netdev;
+};
+
 struct dsa_switch {
 	struct device *dev;
 
@@ -158,8 +162,8 @@ struct dsa_switch {
 	u32			dsa_port_mask;
 	u32			enabled_port_mask;
 	u32			phys_mii_mask;
+	struct dsa_port		ports[DSA_MAX_PORTS];
 	struct mii_bus		*slave_mii_bus;
-	struct net_device	*ports[DSA_MAX_PORTS];
 };
 
 static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p)
@@ -174,7 +178,7 @@ static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p)
 
 static inline bool dsa_is_port_initialized(struct dsa_switch *ds, int p)
 {
-	return ds->enabled_port_mask & (1 << p) && ds->ports[p];
+	return ds->enabled_port_mask & (1 << p) && ds->ports[p].netdev;
 }
 
 static inline u8 dsa_upstream_port(struct dsa_switch *ds)
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index eff5dfc2e33f..18086e0cc617 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -437,10 +437,10 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 		if (!(ds->enabled_port_mask & (1 << port)))
 			continue;
 
-		if (!ds->ports[port])
+		if (!ds->ports[port].netdev)
 			continue;
 
-		dsa_slave_destroy(ds->ports[port]);
+		dsa_slave_destroy(ds->ports[port].netdev);
 	}
 
 	/* Remove any fixed link PHYs */
@@ -469,7 +469,7 @@ static int dsa_switch_suspend(struct dsa_switch *ds)
 		if (!dsa_is_port_initialized(ds, i))
 			continue;
 
-		ret = dsa_slave_suspend(ds->ports[i]);
+		ret = dsa_slave_suspend(ds->ports[i].netdev);
 		if (ret)
 			return ret;
 	}
@@ -495,7 +495,7 @@ static int dsa_switch_resume(struct dsa_switch *ds)
 		if (!dsa_is_port_initialized(ds, i))
 			continue;
 
-		ret = dsa_slave_resume(ds->ports[i]);
+		ret = dsa_slave_resume(ds->ports[i].netdev);
 		if (ret)
 			return ret;
 	}
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index f640a48a6ff3..169abacbc6ce 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1183,12 +1183,12 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 	p->old_link = -1;
 	p->old_duplex = -1;
 
-	ds->ports[port] = slave_dev;
+	ds->ports[port].netdev = slave_dev;
 	ret = register_netdev(slave_dev);
 	if (ret) {
 		netdev_err(master, "error %d registering interface %s\n",
 			   ret, slave_dev->name);
-		ds->ports[port] = NULL;
+		ds->ports[port].netdev = NULL;
 		free_netdev(slave_dev);
 		return ret;
 	}
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index e2aadb73111d..21bffde6e4bf 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -127,7 +127,7 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 	source_port = brcm_tag[3] & BRCM_EG_PID_MASK;
 
 	/* Validate port against switch setup, either the port is totally */
-	if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+	if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
 		goto out_drop;
 
 	/* Remove Broadcom tag and update checksum */
@@ -140,7 +140,7 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	skb_push(skb, ETH_HLEN);
 	skb->pkt_type = PACKET_HOST;
-	skb->dev = ds->ports[source_port];
+	skb->dev = ds->ports[source_port].netdev;
 	skb->protocol = eth_type_trans(skb, skb->dev);
 
 	skb->dev->stats.rx_packets++;
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index f9832f097681..bce79ffe342b 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -114,7 +114,7 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (!ds)
 		goto out_drop;
 
-	if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+	if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
 		goto out_drop;
 
 	/*
@@ -163,7 +163,7 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
 			2 * ETH_ALEN);
 	}
 
-	skb->dev = ds->ports[source_port];
+	skb->dev = ds->ports[source_port].netdev;
 	skb_push(skb, ETH_HLEN);
 	skb->pkt_type = PACKET_HOST;
 	skb->protocol = eth_type_trans(skb, skb->dev);
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
index 3890aac8190f..6c1720e88537 100644
--- a/net/dsa/tag_edsa.c
+++ b/net/dsa/tag_edsa.c
@@ -127,7 +127,7 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (!ds)
 		goto out_drop;
 
-	if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+	if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
 		goto out_drop;
 
 	/*
@@ -182,7 +182,7 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
 			2 * ETH_ALEN);
 	}
 
-	skb->dev = ds->ports[source_port];
+	skb->dev = ds->ports[source_port].netdev;
 	skb_push(skb, ETH_HLEN);
 	skb->pkt_type = PACKET_HOST;
 	skb->protocol = eth_type_trans(skb, skb->dev);
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index b6ca0890d018..5e3903eb1afa 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -82,12 +82,12 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
 		goto out_drop;
 
 	source_port = trailer[1] & 7;
-	if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+	if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
 		goto out_drop;
 
 	pskb_trim_rcsum(skb, skb->len - 4);
 
-	skb->dev = ds->ports[source_port];
+	skb->dev = ds->ports[source_port].netdev;
 	skb_push(skb, ETH_HLEN);
 	skb->pkt_type = PACKET_HOST;
 	skb->protocol = eth_type_trans(skb, skb->dev);
-- 
cgit v1.2.3


From 189b0d93ec61e1f991e96d7bc03b03cf929d164c Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 4 Jun 2016 21:16:58 +0200
Subject: net: dsa: Move port device node into port structure

Move the port device node structure into the port structure, from the
chip data. This information is needed in the next step of implementing
the new binding.

The chip data structure is used while parsing the whole old binding,
before the individual switch structures exist. With the new bindings,
this is reversed, the switches exist first, and the interconnections
between the switches is derived from the individual switch
bindings. Thus this chip data structure becomes unneeded.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
eviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 1 +
 net/dsa/dsa.c     | 8 ++++----
 net/dsa/slave.c   | 5 ++---
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 9aed8572037c..8314197d028f 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -121,6 +121,7 @@ struct dsa_switch_tree {
 
 struct dsa_port {
 	struct net_device	*netdev;
+	struct device_node	*dn;
 };
 
 struct dsa_switch {
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 18086e0cc617..5907f8cd13b6 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -182,7 +182,6 @@ __ATTRIBUTE_GROUPS(dsa_hwmon);
 /* basic switch operations **************************************************/
 static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master)
 {
-	struct dsa_chip_data *cd = ds->cd;
 	struct device_node *port_dn;
 	struct phy_device *phydev;
 	int ret, port, mode;
@@ -191,7 +190,7 @@ static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master)
 		if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
 			continue;
 
-		port_dn = cd->port_dn[port];
+		port_dn = ds->ports[port].dn;
 		if (of_phy_is_fixed_link(port_dn)) {
 			ret = of_phy_register_fixed_link(port_dn);
 			if (ret) {
@@ -325,6 +324,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 	 * Create network devices for physical switch ports.
 	 */
 	for (i = 0; i < DSA_MAX_PORTS; i++) {
+		ds->ports[i].dn = cd->port_dn[i];
+
 		if (!(ds->enabled_port_mask & (1 << i)))
 			continue;
 
@@ -424,7 +425,6 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 {
 	struct device_node *port_dn;
 	struct phy_device *phydev;
-	struct dsa_chip_data *cd = ds->cd;
 	int port;
 
 #ifdef CONFIG_NET_DSA_HWMON
@@ -445,7 +445,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 
 	/* Remove any fixed link PHYs */
 	for (port = 0; port < DSA_MAX_PORTS; port++) {
-		port_dn = cd->port_dn[port];
+		port_dn = ds->ports[port].dn;
 		if (of_phy_is_fixed_link(port_dn)) {
 			phydev = of_phy_find_device(port_dn);
 			if (phydev) {
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 169abacbc6ce..52f1183c42a0 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -998,13 +998,12 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
 				struct net_device *slave_dev)
 {
 	struct dsa_switch *ds = p->parent;
-	struct dsa_chip_data *cd = ds->cd;
 	struct device_node *phy_dn, *port_dn;
 	bool phy_is_fixed = false;
 	u32 phy_flags = 0;
 	int mode, ret;
 
-	port_dn = cd->port_dn[p->port];
+	port_dn = ds->ports[p->port].dn;
 	mode = of_get_phy_mode(port_dn);
 	if (mode < 0)
 		mode = PHY_INTERFACE_MODE_NA;
@@ -1146,7 +1145,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 				 NULL);
 
 	SET_NETDEV_DEV(slave_dev, parent);
-	slave_dev->dev.of_node = ds->cd->port_dn[port];
+	slave_dev->dev.of_node = ds->ports[port].dn;
 	slave_dev->vlan_features = master->vlan_features;
 
 	p = netdev_priv(slave_dev);
-- 
cgit v1.2.3


From 4a7704ffa86705b0580b6473c407b7b7618e072d Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 4 Jun 2016 21:16:59 +0200
Subject: net: dsa: Remove dynamic allocate of routing table

With a maximum of four switches, the size of the routing table is the
same as the pointer to it. Removing it makes the code simpler.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx.c |  3 +--
 include/net/dsa.h           |  9 ++++-----
 net/dsa/dsa.c               | 12 ------------
 3 files changed, 5 insertions(+), 19 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 85332d9a245a..d622c0fb76cc 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -3024,8 +3024,7 @@ static int mv88e6xxx_setup_global(struct mv88e6xxx_priv_state *ps)
 	for (i = 0; i < 32; i++) {
 		int nexthop = 0x1f;
 
-		if (ps->ds->cd->rtable &&
-		    i != ps->ds->index && i < ps->ds->dst->pd->nr_chips)
+		if (i != ps->ds->index && i < ps->ds->dst->pd->nr_chips)
 			nexthop = ps->ds->cd->rtable[i] & 0x1f;
 
 		err = _mv88e6xxx_reg_write(
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 8314197d028f..4e3afa9648ca 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -58,12 +58,11 @@ struct dsa_chip_data {
 	struct device_node *port_dn[DSA_MAX_PORTS];
 
 	/*
-	 * An array (with nr_chips elements) of which element [a]
-	 * indicates which port on this switch should be used to
-	 * send packets to that are destined for switch a.  Can be
-	 * NULL if there is only one switch chip.
+	 * An array of which element [a] indicates which port on this
+	 * switch should be used to send packets to that are destined
+	 * for switch a. Can be NULL if there is only one switch chip.
 	 */
-	s8		*rtable;
+	s8		rtable[DSA_MAX_SWITCHES];
 };
 
 struct dsa_platform_data {
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 5907f8cd13b6..6177dd750847 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -587,17 +587,6 @@ static int dsa_of_setup_routing_table(struct dsa_platform_data *pd,
 	if (link_sw_addr >= pd->nr_chips)
 		return -EINVAL;
 
-	/* First time routing table allocation */
-	if (!cd->rtable) {
-		cd->rtable = kmalloc_array(pd->nr_chips, sizeof(s8),
-					   GFP_KERNEL);
-		if (!cd->rtable)
-			return -ENOMEM;
-
-		/* default to no valid uplink/downlink */
-		memset(cd->rtable, -1, pd->nr_chips * sizeof(s8));
-	}
-
 	cd->rtable[link_sw_addr] = port_index;
 
 	return 0;
@@ -639,7 +628,6 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd)
 			kfree(pd->chip[i].port_names[port_index]);
 			port_index++;
 		}
-		kfree(pd->chip[i].rtable);
 
 		/* Drop our reference to the MDIO bus device */
 		if (pd->chip[i].host_dev)
-- 
cgit v1.2.3


From 66472fc04e8be62858f29c7798ed17e984c1ab3b Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 4 Jun 2016 21:17:00 +0200
Subject: net: dsa: Copy the routing table into the switch structure

The new binding will not have a chip data structure, it will place the
routing directly into the switch structure. To enable backwards
compatibility, copy the routing from the chip data into the switch
structure.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx.c | 4 ++--
 include/net/dsa.h           | 9 ++++++++-
 net/dsa/dsa.c               | 2 ++
 3 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index d622c0fb76cc..492801a6398c 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -3024,8 +3024,8 @@ static int mv88e6xxx_setup_global(struct mv88e6xxx_priv_state *ps)
 	for (i = 0; i < 32; i++) {
 		int nexthop = 0x1f;
 
-		if (i != ps->ds->index && i < ps->ds->dst->pd->nr_chips)
-			nexthop = ps->ds->cd->rtable[i] & 0x1f;
+		if (i != ds->index && i < DSA_MAX_SWITCHES)
+			nexthop = ds->rtable[i] & 0x1f;
 
 		err = _mv88e6xxx_reg_write(
 			ps, REG_GLOBAL2,
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 4e3afa9648ca..b666f27b3daa 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -148,6 +148,13 @@ struct dsa_switch {
 	 */
 	struct dsa_switch_driver	*drv;
 
+	/*
+	 * An array of which element [a] indicates which port on this
+	 * switch should be used to send packets to that are destined
+	 * for switch a. Can be NULL if there is only one switch chip.
+	 */
+	s8		rtable[DSA_MAX_SWITCHES];
+
 #ifdef CONFIG_NET_DSA_HWMON
 	/*
 	 * Hardware monitoring information
@@ -194,7 +201,7 @@ static inline u8 dsa_upstream_port(struct dsa_switch *ds)
 	if (dst->cpu_switch == ds->index)
 		return dst->cpu_port;
 	else
-		return ds->cd->rtable[dst->cpu_switch];
+		return ds->rtable[dst->cpu_switch];
 }
 
 struct switchdev_trans;
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 6177dd750847..bfe1d03d4730 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -297,6 +297,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 		dst->tag_protocol = drv->tag_protocol;
 	}
 
+	memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable));
+
 	/*
 	 * Do basic register setup.
 	 */
-- 
cgit v1.2.3


From 39a7f2a4eb496c0c68cc93fcb403190b48605168 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 4 Jun 2016 21:17:03 +0200
Subject: net: dsa: Refactor selection of tag ops into a function

Replace the two switch statements with an array lookup, and store the
result in the dsa tree structure. The drivers no longer need to know
the selected tag protocol, so remove it from the dsa switch structure.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h  |  8 +++++-
 net/dsa/dsa.c      | 71 ++++++++++++++++++++++++++++++++++--------------------
 net/dsa/dsa_priv.h |  1 +
 net/dsa/slave.c    | 35 +--------------------------
 4 files changed, 54 insertions(+), 61 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index b666f27b3daa..bd6ecaa0de96 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -26,6 +26,7 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_TRAILER,
 	DSA_TAG_PROTO_EDSA,
 	DSA_TAG_PROTO_BRCM,
+	DSA_TAG_LAST,		/* MUST BE LAST */
 };
 
 #define DSA_MAX_SWITCHES	4
@@ -99,7 +100,6 @@ struct dsa_switch_tree {
 				       struct net_device *dev,
 				       struct packet_type *pt,
 				       struct net_device *orig_dev);
-	enum dsa_tag_protocol	tag_protocol;
 
 	/*
 	 * Original copy of the master netdev ethtool_ops
@@ -116,6 +116,12 @@ struct dsa_switch_tree {
 	 * Data for the individual switch chips.
 	 */
 	struct dsa_switch	*ds[DSA_MAX_SWITCHES];
+
+	/*
+	 * Tagging protocol operations for adding and removing an
+	 * encapsulation tag.
+	 */
+	const struct dsa_device_ops *tag_ops;
 };
 
 struct dsa_port {
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 7140de475c07..221ebde4318d 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -29,6 +29,33 @@
 
 char dsa_driver_version[] = "0.1";
 
+static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
+					    struct net_device *dev)
+{
+	/* Just return the original SKB */
+	return skb;
+}
+
+static const struct dsa_device_ops none_ops = {
+	.xmit	= dsa_slave_notag_xmit,
+	.rcv	= NULL,
+};
+
+const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
+#ifdef CONFIG_NET_DSA_TAG_DSA
+	[DSA_TAG_PROTO_DSA] = &dsa_netdev_ops,
+#endif
+#ifdef CONFIG_NET_DSA_TAG_EDSA
+	[DSA_TAG_PROTO_EDSA] = &edsa_netdev_ops,
+#endif
+#ifdef CONFIG_NET_DSA_TAG_TRAILER
+	[DSA_TAG_PROTO_TRAILER] = &trailer_netdev_ops,
+#endif
+#ifdef CONFIG_NET_DSA_TAG_BRCM
+	[DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops,
+#endif
+	[DSA_TAG_PROTO_NONE] = &none_ops,
+};
 
 /* switch driver registration ***********************************************/
 static DEFINE_MUTEX(dsa_switch_drivers_mutex);
@@ -225,6 +252,20 @@ static int dsa_cpu_dsa_setups(struct dsa_switch *ds, struct device *dev)
 	return 0;
 }
 
+const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol)
+{
+	const struct dsa_device_ops *ops;
+
+	if (tag_protocol >= DSA_TAG_LAST)
+		return ERR_PTR(-EINVAL);
+	ops = dsa_device_ops[tag_protocol];
+
+	if (!ops)
+		return ERR_PTR(-ENOPROTOOPT);
+
+	return ops;
+}
+
 static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 {
 	struct dsa_switch_driver *drv = ds->drv;
@@ -277,35 +318,13 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 	 * switch.
 	 */
 	if (dst->cpu_switch == index) {
-		switch (drv->tag_protocol) {
-#ifdef CONFIG_NET_DSA_TAG_DSA
-		case DSA_TAG_PROTO_DSA:
-			dst->rcv = dsa_netdev_ops.rcv;
-			break;
-#endif
-#ifdef CONFIG_NET_DSA_TAG_EDSA
-		case DSA_TAG_PROTO_EDSA:
-			dst->rcv = edsa_netdev_ops.rcv;
-			break;
-#endif
-#ifdef CONFIG_NET_DSA_TAG_TRAILER
-		case DSA_TAG_PROTO_TRAILER:
-			dst->rcv = trailer_netdev_ops.rcv;
-			break;
-#endif
-#ifdef CONFIG_NET_DSA_TAG_BRCM
-		case DSA_TAG_PROTO_BRCM:
-			dst->rcv = brcm_netdev_ops.rcv;
-			break;
-#endif
-		case DSA_TAG_PROTO_NONE:
-			break;
-		default:
-			ret = -ENOPROTOOPT;
+		dst->tag_ops = dsa_resolve_tag_protocol(drv->tag_protocol);
+		if (IS_ERR(dst->tag_ops)) {
+			ret = PTR_ERR(dst->tag_ops);
 			goto out;
 		}
 
-		dst->tag_protocol = drv->tag_protocol;
+		dst->rcv = dst->tag_ops->rcv;
 	}
 
 	memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable));
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index dbea5d9e7f75..72f7b8989cfb 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -53,6 +53,7 @@ extern char dsa_driver_version[];
 int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
 		      struct device_node *port_dn, int port);
 void dsa_cpu_dsa_destroy(struct device_node *port_dn);
+const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
 
 /* slave.c */
 extern const struct dsa_device_ops notag_netdev_ops;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 52f1183c42a0..35e5f0f6688b 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -521,14 +521,6 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
 	return NETDEV_TX_OK;
 }
 
-static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
-					    struct net_device *dev)
-{
-	/* Just return the original SKB */
-	return skb;
-}
-
-
 /* ethtool operations *******************************************************/
 static int
 dsa_slave_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
@@ -1151,32 +1143,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 	p = netdev_priv(slave_dev);
 	p->parent = ds;
 	p->port = port;
-
-	switch (ds->dst->tag_protocol) {
-#ifdef CONFIG_NET_DSA_TAG_DSA
-	case DSA_TAG_PROTO_DSA:
-		p->xmit = dsa_netdev_ops.xmit;
-		break;
-#endif
-#ifdef CONFIG_NET_DSA_TAG_EDSA
-	case DSA_TAG_PROTO_EDSA:
-		p->xmit = edsa_netdev_ops.xmit;
-		break;
-#endif
-#ifdef CONFIG_NET_DSA_TAG_TRAILER
-	case DSA_TAG_PROTO_TRAILER:
-		p->xmit = trailer_netdev_ops.xmit;
-		break;
-#endif
-#ifdef CONFIG_NET_DSA_TAG_BRCM
-	case DSA_TAG_PROTO_BRCM:
-		p->xmit = brcm_netdev_ops.xmit;
-		break;
-#endif
-	default:
-		p->xmit	= dsa_slave_notag_xmit;
-		break;
-	}
+	p->xmit = dst->tag_ops->xmit;
 
 	p->old_pause = -1;
 	p->old_link = -1;
-- 
cgit v1.2.3


From 83c0afaec7b730b16c518aecc8e6246ec91b265e Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 4 Jun 2016 21:17:07 +0200
Subject: net: dsa: Add new binding implementation

The existing DSA binding has a number of limitations and problems. The
main problem is that it cannot represent a switch as a linux device,
hanging off some bus. It is limited to one CPU port. The DSA platform
device is artificial, and does not really represent hardware.

Implement a new binding which can be embedded into any type of node on
a bus to represent one switch device, and its links to other switches.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx.c |   7 +
 include/net/dsa.h           |  20 ++
 net/dsa/Makefile            |   2 +-
 net/dsa/dsa.c               |   5 +
 net/dsa/dsa2.c              | 654 ++++++++++++++++++++++++++++++++++++++++++++
 net/dsa/dsa_priv.h          |   2 +-
 net/dsa/slave.c             |   8 +-
 7 files changed, 694 insertions(+), 4 deletions(-)
 create mode 100644 net/dsa/dsa2.c

(limited to 'include/net')

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 192b39cbdbcd..ee06055444f9 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -3748,6 +3748,12 @@ int mv88e6xxx_probe(struct mdio_device *mdiodev)
 
 	dev_set_drvdata(dev, ds);
 
+	err = dsa_register_switch(ds, mdiodev->dev.of_node);
+	if (err) {
+		mv88e6xxx_mdio_unregister(ps);
+		return err;
+	}
+
 	dev_info(dev, "switch 0x%x probed: %s, revision %u\n",
 		 prod_num, ps->info->name, rev);
 
@@ -3759,6 +3765,7 @@ static void mv88e6xxx_remove(struct mdio_device *mdiodev)
 	struct dsa_switch *ds = dev_get_drvdata(&mdiodev->dev);
 	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
 
+	dsa_unregister_switch(ds);
 	put_device(&ps->bus->dev);
 
 	mv88e6xxx_mdio_unregister(ps);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index bd6ecaa0de96..cca7ef230742 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -85,6 +85,17 @@ struct dsa_platform_data {
 struct packet_type;
 
 struct dsa_switch_tree {
+	struct list_head	list;
+
+	/* Tree identifier */
+	u32 tree;
+
+	/* Number of switches attached to this tree */
+	struct kref refcount;
+
+	/* Has this tree been applied to the hardware? */
+	bool applied;
+
 	/*
 	 * Configuration data for the platform device that owns
 	 * this dsa switch tree instance.
@@ -169,10 +180,16 @@ struct dsa_switch {
 	struct device		*hwmon_dev;
 #endif
 
+	/*
+	 * The lower device this switch uses to talk to the host
+	 */
+	struct net_device *master_netdev;
+
 	/*
 	 * Slave mii_bus and devices for the individual ports.
 	 */
 	u32			dsa_port_mask;
+	u32			cpu_port_mask;
 	u32			enabled_port_mask;
 	u32			phys_mii_mask;
 	struct dsa_port		ports[DSA_MAX_PORTS];
@@ -361,4 +378,7 @@ static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst)
 {
 	return dst->rcv != NULL;
 }
+
+void dsa_unregister_switch(struct dsa_switch *ds);
+int dsa_register_switch(struct dsa_switch *ds, struct device_node *np);
 #endif
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index da06ed1df620..8af4ded70f1c 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,6 +1,6 @@
 # the core
 obj-$(CONFIG_NET_DSA) += dsa_core.o
-dsa_core-y += dsa.o slave.o
+dsa_core-y += dsa.o slave.o dsa2.o
 
 # tagging formats
 dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 6c314f300424..ce3b942dce76 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -294,6 +294,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 			}
 			dst->cpu_switch = index;
 			dst->cpu_port = i;
+			ds->cpu_port_mask |= 1 << i;
 		} else if (!strcmp(name, "dsa")) {
 			ds->dsa_port_mask |= 1 << i;
 		} else {
@@ -492,6 +493,10 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 		if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
 			continue;
 		dsa_cpu_dsa_destroy(ds->ports[port].dn);
+
+		/* Clearing a bit which is not set does no harm */
+		ds->cpu_port_mask |= ~(1 << port);
+		ds->dsa_port_mask |= ~(1 << port);
 	}
 
 	if (ds->slave_mii_bus && ds->drv->phy_read)
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
new file mode 100644
index 000000000000..80dfe08db825
--- /dev/null
+++ b/net/dsa/dsa2.c
@@ -0,0 +1,654 @@
+/*
+ * net/dsa/dsa2.c - Hardware switch handling, binding version 2
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org>
+ * Copyright (c) 2016 Andrew Lunn <andrew@lunn.ch>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/rtnetlink.h>
+#include <net/dsa.h>
+#include <linux/of.h>
+#include <linux/of_net.h>
+#include "dsa_priv.h"
+
+static LIST_HEAD(dsa_switch_trees);
+static DEFINE_MUTEX(dsa2_mutex);
+
+static struct dsa_switch_tree *dsa_get_dst(u32 tree)
+{
+	struct dsa_switch_tree *dst;
+
+	list_for_each_entry(dst, &dsa_switch_trees, list)
+		if (dst->tree == tree)
+			return dst;
+	return NULL;
+}
+
+static void dsa_free_dst(struct kref *ref)
+{
+	struct dsa_switch_tree *dst = container_of(ref, struct dsa_switch_tree,
+						   refcount);
+
+	list_del(&dst->list);
+	kfree(dst);
+}
+
+static void dsa_put_dst(struct dsa_switch_tree *dst)
+{
+	kref_put(&dst->refcount, dsa_free_dst);
+}
+
+static struct dsa_switch_tree *dsa_add_dst(u32 tree)
+{
+	struct dsa_switch_tree *dst;
+
+	dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+	if (!dst)
+		return NULL;
+	dst->tree = tree;
+	dst->cpu_switch = -1;
+	INIT_LIST_HEAD(&dst->list);
+	list_add_tail(&dsa_switch_trees, &dst->list);
+	kref_init(&dst->refcount);
+
+	return dst;
+}
+
+static void dsa_dst_add_ds(struct dsa_switch_tree *dst,
+			   struct dsa_switch *ds, u32 index)
+{
+	kref_get(&dst->refcount);
+	dst->ds[index] = ds;
+}
+
+static void dsa_dst_del_ds(struct dsa_switch_tree *dst,
+			   struct dsa_switch *ds, u32 index)
+{
+	dst->ds[index] = NULL;
+	kref_put(&dst->refcount, dsa_free_dst);
+}
+
+static bool dsa_port_is_dsa(struct device_node *port)
+{
+	const char *name;
+
+	name = of_get_property(port, "label", NULL);
+	if (!name)
+		return false;
+
+	if (!strcmp(name, "dsa"))
+		return true;
+
+	return false;
+}
+
+static bool dsa_port_is_cpu(struct device_node *port)
+{
+	const char *name;
+
+	name = of_get_property(port, "label", NULL);
+	if (!name)
+		return false;
+
+	if (!strcmp(name, "cpu"))
+		return true;
+
+	return false;
+}
+
+static bool dsa_ds_find_port(struct dsa_switch *ds,
+			     struct device_node *port)
+{
+	u32 index;
+
+	for (index = 0; index < DSA_MAX_PORTS; index++)
+		if (ds->ports[index].dn == port)
+			return true;
+	return false;
+}
+
+static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst,
+					    struct device_node *port)
+{
+	struct dsa_switch *ds;
+	u32 index;
+
+	for (index = 0; index < DSA_MAX_SWITCHES; index++) {
+		ds = dst->ds[index];
+		if (!ds)
+			continue;
+
+		if (dsa_ds_find_port(ds, port))
+			return ds;
+	}
+
+	return NULL;
+}
+
+static int dsa_port_complete(struct dsa_switch_tree *dst,
+			     struct dsa_switch *src_ds,
+			     struct device_node *port,
+			     u32 src_port)
+{
+	struct device_node *link;
+	int index;
+	struct dsa_switch *dst_ds;
+
+	for (index = 0;; index++) {
+		link = of_parse_phandle(port, "link", index);
+		if (!link)
+			break;
+
+		dst_ds = dsa_dst_find_port(dst, link);
+		of_node_put(link);
+
+		if (!dst_ds)
+			return 1;
+
+		src_ds->rtable[dst_ds->index] = src_port;
+	}
+
+	return 0;
+}
+
+/* A switch is complete if all the DSA ports phandles point to ports
+ * known in the tree. A return value of 1 means the tree is not
+ * complete. This is not an error condition. A value of 0 is
+ * success.
+ */
+static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds)
+{
+	struct device_node *port;
+	u32 index;
+	int err;
+
+	for (index = 0; index < DSA_MAX_PORTS; index++) {
+		port = ds->ports[index].dn;
+		if (!port)
+			continue;
+
+		if (!dsa_port_is_dsa(port))
+			continue;
+
+		err = dsa_port_complete(dst, ds, port, index);
+		if (err != 0)
+			return err;
+
+		ds->dsa_port_mask |= BIT(index);
+	}
+
+	return 0;
+}
+
+/* A tree is complete if all the DSA ports phandles point to ports
+ * known in the tree. A return value of 1 means the tree is not
+ * complete. This is not an error condition. A value of 0 is
+ * success.
+ */
+static int dsa_dst_complete(struct dsa_switch_tree *dst)
+{
+	struct dsa_switch *ds;
+	u32 index;
+	int err;
+
+	for (index = 0; index < DSA_MAX_SWITCHES; index++) {
+		ds = dst->ds[index];
+		if (!ds)
+			continue;
+
+		err = dsa_ds_complete(dst, ds);
+		if (err != 0)
+			return err;
+	}
+
+	return 0;
+}
+
+static int dsa_dsa_port_apply(struct device_node *port, u32 index,
+			      struct dsa_switch *ds)
+{
+	int err;
+
+	err = dsa_cpu_dsa_setup(ds, ds->dev, port, index);
+	if (err) {
+		dev_warn(ds->dev, "Failed to setup dsa port %d: %d\n",
+			 index, err);
+		return err;
+	}
+
+	return 0;
+}
+
+static void dsa_dsa_port_unapply(struct device_node *port, u32 index,
+				 struct dsa_switch *ds)
+{
+	dsa_cpu_dsa_destroy(port);
+}
+
+static int dsa_cpu_port_apply(struct device_node *port, u32 index,
+			      struct dsa_switch *ds)
+{
+	int err;
+
+	err = dsa_cpu_dsa_setup(ds, ds->dev, port, index);
+	if (err) {
+		dev_warn(ds->dev, "Failed to setup cpu port %d: %d\n",
+			 index, err);
+		return err;
+	}
+
+	ds->cpu_port_mask |= BIT(index);
+
+	return 0;
+}
+
+static void dsa_cpu_port_unapply(struct device_node *port, u32 index,
+				 struct dsa_switch *ds)
+{
+	dsa_cpu_dsa_destroy(port);
+	ds->cpu_port_mask &= ~BIT(index);
+
+}
+
+static int dsa_user_port_apply(struct device_node *port, u32 index,
+			       struct dsa_switch *ds)
+{
+	const char *name;
+	int err;
+
+	name = of_get_property(port, "label", NULL);
+
+	err = dsa_slave_create(ds, ds->dev, index, name);
+	if (err) {
+		dev_warn(ds->dev, "Failed to create slave %d: %d\n",
+			 index, err);
+		return err;
+	}
+
+	return 0;
+}
+
+static void dsa_user_port_unapply(struct device_node *port, u32 index,
+				  struct dsa_switch *ds)
+{
+	if (ds->ports[index].netdev) {
+		dsa_slave_destroy(ds->ports[index].netdev);
+		ds->ports[index].netdev = NULL;
+	}
+}
+
+static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
+{
+	struct device_node *port;
+	u32 index;
+	int err;
+
+	err = ds->drv->setup(ds);
+	if (err < 0)
+		return err;
+
+	err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr);
+	if (err < 0)
+		return err;
+
+	err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr);
+	if (err < 0)
+		return err;
+
+	for (index = 0; index < DSA_MAX_PORTS; index++) {
+		port = ds->ports[index].dn;
+		if (!port)
+			continue;
+
+		if (dsa_port_is_dsa(port)) {
+			err = dsa_dsa_port_apply(port, index, ds);
+			if (err)
+				return err;
+			continue;
+		}
+
+		if (dsa_port_is_cpu(port)) {
+			err = dsa_cpu_port_apply(port, index, ds);
+			if (err)
+				return err;
+			continue;
+		}
+
+		err = dsa_user_port_apply(port, index, ds);
+		if (err)
+			continue;
+	}
+
+	return 0;
+}
+
+static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
+{
+	struct device_node *port;
+	u32 index;
+
+	for (index = 0; index < DSA_MAX_PORTS; index++) {
+		port = ds->ports[index].dn;
+		if (!port)
+			continue;
+
+		if (dsa_port_is_dsa(port)) {
+			dsa_dsa_port_unapply(port, index, ds);
+			continue;
+		}
+
+		if (dsa_port_is_cpu(port)) {
+			dsa_cpu_port_unapply(port, index, ds);
+			continue;
+		}
+
+		dsa_user_port_unapply(port, index, ds);
+	}
+}
+
+static int dsa_dst_apply(struct dsa_switch_tree *dst)
+{
+	struct dsa_switch *ds;
+	u32 index;
+	int err;
+
+	for (index = 0; index < DSA_MAX_SWITCHES; index++) {
+		ds = dst->ds[index];
+		if (!ds)
+			continue;
+
+		err = dsa_ds_apply(dst, ds);
+		if (err)
+			return err;
+	}
+
+	/* If we use a tagging format that doesn't have an ethertype
+	 * field, make sure that all packets from this point on get
+	 * sent to the tag format's receive function.
+	 */
+	wmb();
+	dst->master_netdev->dsa_ptr = (void *)dst;
+	dst->applied = true;
+
+	return 0;
+}
+
+static void dsa_dst_unapply(struct dsa_switch_tree *dst)
+{
+	struct dsa_switch *ds;
+	u32 index;
+
+	if (!dst->applied)
+		return;
+
+	dst->master_netdev->dsa_ptr = NULL;
+
+	/* If we used a tagging format that doesn't have an ethertype
+	 * field, make sure that all packets from this point get sent
+	 * without the tag and go through the regular receive path.
+	 */
+	wmb();
+
+	for (index = 0; index < DSA_MAX_SWITCHES; index++) {
+		ds = dst->ds[index];
+		if (!ds)
+			continue;
+
+		dsa_ds_unapply(dst, ds);
+	}
+
+	pr_info("DSA: tree %d unapplied\n", dst->tree);
+	dst->applied = false;
+}
+
+static int dsa_cpu_parse(struct device_node *port, u32 index,
+			 struct dsa_switch_tree *dst,
+			 struct dsa_switch *ds)
+{
+	struct net_device *ethernet_dev;
+	struct device_node *ethernet;
+
+	ethernet = of_parse_phandle(port, "ethernet", 0);
+	if (!ethernet)
+		return -EINVAL;
+
+	ethernet_dev = of_find_net_device_by_node(ethernet);
+	if (!ethernet_dev)
+		return -EPROBE_DEFER;
+
+	if (!ds->master_netdev)
+		ds->master_netdev = ethernet_dev;
+
+	if (!dst->master_netdev)
+		dst->master_netdev = ethernet_dev;
+
+	if (dst->cpu_switch == -1) {
+		dst->cpu_switch = ds->index;
+		dst->cpu_port = index;
+	}
+
+	dst->tag_ops = dsa_resolve_tag_protocol(ds->drv->tag_protocol);
+	if (IS_ERR(dst->tag_ops)) {
+		dev_warn(ds->dev, "No tagger for this switch\n");
+		return PTR_ERR(dst->tag_ops);
+	}
+
+	dst->rcv = dst->tag_ops->rcv;
+
+	return 0;
+}
+
+static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds)
+{
+	struct device_node *port;
+	u32 index;
+	int err;
+
+	for (index = 0; index < DSA_MAX_PORTS; index++) {
+		port = ds->ports[index].dn;
+		if (!port)
+			continue;
+
+		if (dsa_port_is_cpu(port)) {
+			err = dsa_cpu_parse(port, index, dst, ds);
+			if (err)
+				return err;
+		}
+	}
+
+	pr_info("DSA: switch %d %d parsed\n", dst->tree, ds->index);
+
+	return 0;
+}
+
+static int dsa_dst_parse(struct dsa_switch_tree *dst)
+{
+	struct dsa_switch *ds;
+	u32 index;
+	int err;
+
+	for (index = 0; index < DSA_MAX_SWITCHES; index++) {
+		ds = dst->ds[index];
+		if (!ds)
+			continue;
+
+		err = dsa_ds_parse(dst, ds);
+		if (err)
+			return err;
+	}
+
+	if (!dst->master_netdev) {
+		pr_warn("Tree has no master device\n");
+		return -EINVAL;
+	}
+
+	pr_info("DSA: tree %d parsed\n", dst->tree);
+
+	return 0;
+}
+
+static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds)
+{
+	struct device_node *port;
+	int err;
+	u32 reg;
+
+	for_each_available_child_of_node(ports, port) {
+		err = of_property_read_u32(port, "reg", &reg);
+		if (err)
+			return err;
+
+		if (reg >= DSA_MAX_PORTS)
+			return -EINVAL;
+
+		ds->ports[reg].dn = port;
+	}
+
+	return 0;
+}
+
+static int dsa_parse_member(struct device_node *np, u32 *tree, u32 *index)
+{
+	int err;
+
+	*tree = *index = 0;
+
+	err = of_property_read_u32_index(np, "dsa,member", 0, tree);
+	if (err) {
+		/* Does not exist, but it is optional */
+		if (err == -EINVAL)
+			return 0;
+		return err;
+	}
+
+	err = of_property_read_u32_index(np, "dsa,member", 1, index);
+	if (err)
+		return err;
+
+	if (*index >= DSA_MAX_SWITCHES)
+		return -EINVAL;
+
+	return 0;
+}
+
+static struct device_node *dsa_get_ports(struct dsa_switch *ds,
+					 struct device_node *np)
+{
+	struct device_node *ports;
+
+	ports = of_get_child_by_name(np, "ports");
+	if (!ports) {
+		dev_err(ds->dev, "no ports child node found\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	return ports;
+}
+
+static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
+{
+	struct device_node *ports = dsa_get_ports(ds, np);
+	struct dsa_switch_tree *dst;
+	u32 tree, index;
+	int err;
+
+	err = dsa_parse_member(np, &tree, &index);
+	if (err)
+		return err;
+
+	if (IS_ERR(ports))
+		return PTR_ERR(ports);
+
+	err = dsa_parse_ports_dn(ports, ds);
+	if (err)
+		return err;
+
+	dst = dsa_get_dst(tree);
+	if (!dst) {
+		dst = dsa_add_dst(tree);
+		if (!dst)
+			return -ENOMEM;
+	}
+
+	if (dst->ds[index]) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	ds->dst = dst;
+	ds->index = index;
+	dsa_dst_add_ds(dst, ds, index);
+
+	err = dsa_dst_complete(dst);
+	if (err < 0)
+		goto out_del_dst;
+
+	if (err == 1) {
+		/* Not all switches registered yet */
+		err = 0;
+		goto out;
+	}
+
+	if (dst->applied) {
+		pr_info("DSA: Disjoint trees?\n");
+		return -EINVAL;
+	}
+
+	err = dsa_dst_parse(dst);
+	if (err)
+		goto out_del_dst;
+
+	err = dsa_dst_apply(dst);
+	if (err) {
+		dsa_dst_unapply(dst);
+		goto out_del_dst;
+	}
+
+	dsa_put_dst(dst);
+	return 0;
+
+out_del_dst:
+	dsa_dst_del_ds(dst, ds, ds->index);
+out:
+	dsa_put_dst(dst);
+
+	return err;
+}
+
+int dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
+{
+	int err;
+
+	mutex_lock(&dsa2_mutex);
+	err = _dsa_register_switch(ds, np);
+	mutex_unlock(&dsa2_mutex);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(dsa_register_switch);
+
+void _dsa_unregister_switch(struct dsa_switch *ds)
+{
+	struct dsa_switch_tree *dst = ds->dst;
+
+	dsa_dst_unapply(dst);
+
+	dsa_dst_del_ds(dst, ds, ds->index);
+}
+
+void dsa_unregister_switch(struct dsa_switch *ds)
+{
+	mutex_lock(&dsa2_mutex);
+	_dsa_unregister_switch(ds);
+	mutex_unlock(&dsa2_mutex);
+}
+EXPORT_SYMBOL_GPL(dsa_unregister_switch);
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 72f7b8989cfb..b42f1a5f95f3 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -59,7 +59,7 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
 extern const struct dsa_device_ops notag_netdev_ops;
 void dsa_slave_mii_bus_init(struct dsa_switch *ds);
 int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
-		     int port, char *name);
+		     int port, const char *name);
 void dsa_slave_destroy(struct net_device *slave_dev);
 int dsa_slave_suspend(struct net_device *slave_dev);
 int dsa_slave_resume(struct net_device *slave_dev);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 35e5f0f6688b..15a492261895 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1099,14 +1099,18 @@ int dsa_slave_resume(struct net_device *slave_dev)
 }
 
 int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
-		     int port, char *name)
+		     int port, const char *name)
 {
-	struct net_device *master = ds->dst->master_netdev;
 	struct dsa_switch_tree *dst = ds->dst;
+	struct net_device *master;
 	struct net_device *slave_dev;
 	struct dsa_slave_priv *p;
 	int ret;
 
+	master = ds->dst->master_netdev;
+	if (ds->master_netdev)
+		master = ds->master_netdev;
+
 	slave_dev = alloc_netdev(sizeof(struct dsa_slave_priv), name,
 				 NET_NAME_UNKNOWN, ether_setup);
 	if (slave_dev == NULL)
-- 
cgit v1.2.3


From 53eb440f4ada034ea43b295891feec3df0fa7a29 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Mon, 6 Jun 2016 06:32:54 -0400
Subject: net sched actions: introduce timestamp for firsttime use

Useful to know when the action was first used for accounting
(and debugging)

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h        | 2 ++
 include/uapi/linux/pkt_cls.h | 1 +
 net/sched/act_api.c          | 1 +
 net/sched/act_bpf.c          | 1 +
 net/sched/act_connmark.c     | 1 +
 net/sched/act_csum.c         | 1 +
 net/sched/act_gact.c         | 1 +
 net/sched/act_ipt.c          | 1 +
 net/sched/act_mirred.c       | 1 +
 net/sched/act_nat.c          | 1 +
 net/sched/act_pedit.c        | 1 +
 net/sched/act_police.c       | 2 ++
 net/sched/act_simple.c       | 1 +
 net/sched/act_skbedit.c      | 1 +
 net/sched/act_vlan.c         | 1 +
 15 files changed, 17 insertions(+)

(limited to 'include/net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 9a9a8edc138f..8389c007076f 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -76,6 +76,8 @@ static inline void tcf_lastuse_update(struct tcf_t *tm)
 
 	if (tm->lastuse != now)
 		tm->lastuse = now;
+	if (unlikely(!tm->firstuse))
+		tm->firstuse = now;
 }
 
 struct tc_action {
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index f4297c8a42fe..9ba1410bd21d 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -124,6 +124,7 @@ struct tcf_t {
 	__u64   install;
 	__u64   lastuse;
 	__u64   expires;
+	__u64   firstuse;
 };
 
 struct tc_cnt {
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 336774a535c3..5ebf6d6f85f6 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -283,6 +283,7 @@ err2:
 	p->tcfc_index = index ? index : tcf_hash_new_index(tn);
 	p->tcfc_tm.install = jiffies;
 	p->tcfc_tm.lastuse = jiffies;
+	p->tcfc_tm.firstuse = 0;
 	if (est) {
 		err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats,
 					&p->tcfc_rate_est,
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index c7123e01c2ca..e4b877f9d322 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -156,6 +156,7 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act,
 
 	tm.install = jiffies_to_clock_t(jiffies - prog->tcf_tm.install);
 	tm.lastuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.lastuse);
+	tm.firstuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.firstuse);
 	tm.expires = jiffies_to_clock_t(prog->tcf_tm.expires);
 
 	if (nla_put_64bit(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm,
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index e0e6c6876bc7..e3f64f2d6206 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -163,6 +163,7 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
 	t.install = jiffies_to_clock_t(jiffies - ci->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(ci->tcf_tm.expires);
+	t.firstuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.firstuse);
 	if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t,
 			  TCA_CONNMARK_PAD))
 		goto nla_put_failure;
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 065f71618276..7725eafbe581 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -548,6 +548,7 @@ static int tcf_csum_dump(struct sk_buff *skb,
 		goto nla_put_failure;
 	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
+	t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse);
 	t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
 	if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD))
 		goto nla_put_failure;
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index ec5cc8435238..c9d59f38a3f8 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -190,6 +190,7 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
 #endif
 	t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse);
+	t.firstuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.firstuse);
 	t.expires = jiffies_to_clock_t(gact->tcf_tm.expires);
 	if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD))
 		goto nla_put_failure;
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 30e9087b3536..47525ee201a9 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -279,6 +279,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
 		goto nla_put_failure;
 	tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install);
 	tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse);
+	tm.firstuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.firstuse);
 	tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires);
 	if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD))
 		goto nla_put_failure;
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index d3ac73e90f00..1b06093f8ef8 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -220,6 +220,7 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i
 		goto nla_put_failure;
 	t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse);
+	t.firstuse = jiffies_to_clock_t(jiffies - m->tcf_tm.firstuse);
 	t.expires = jiffies_to_clock_t(m->tcf_tm.expires);
 	if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD))
 		goto nla_put_failure;
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 9135aa8f2970..9fbf780a04c7 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -266,6 +266,7 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
 		goto nla_put_failure;
 	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
+	t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse);
 	t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
 	if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD))
 		goto nla_put_failure;
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 67a17265c967..fb89275bc595 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -202,6 +202,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
 		goto nla_put_failure;
 	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
+	t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse);
 	t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
 	if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD))
 		goto nla_put_failure;
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index b884dae692a1..820b11686f85 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -241,6 +241,7 @@ override:
 		tcf_hash_new_index(tn);
 	police->tcf_tm.install = jiffies;
 	police->tcf_tm.lastuse = jiffies;
+	police->tcf_tm.firstuse = 0;
 	h = tcf_hash(police->tcf_index, POL_TAB_MASK);
 	spin_lock_bh(&hinfo->lock);
 	hlist_add_head(&police->tcf_head, &hinfo->htab[h]);
@@ -347,6 +348,7 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 
 	t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse);
+	t.firstuse = jiffies_to_clock_t(jiffies - police->tcf_tm.firstuse);
 	t.expires = jiffies_to_clock_t(police->tcf_tm.expires);
 	if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD))
 		goto nla_put_failure;
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index f95d1c596986..81040f1cc3bb 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -160,6 +160,7 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
 		goto nla_put_failure;
 	t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
+	t.firstuse = jiffies_to_clock_t(jiffies - d->tcf_tm.firstuse);
 	t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
 	if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD))
 		goto nla_put_failure;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 82105c8cb60f..cf34f31f4106 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -170,6 +170,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 		goto nla_put_failure;
 	t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
+	t.firstuse = jiffies_to_clock_t(jiffies - d->tcf_tm.firstuse);
 	t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
 	if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
 		goto nla_put_failure;
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index da5120f5513f..978ec4c89684 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -184,6 +184,7 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
 
 	t.install = jiffies_to_clock_t(jiffies - v->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - v->tcf_tm.lastuse);
+	t.firstuse = jiffies_to_clock_t(jiffies - v->tcf_tm.firstuse);
 	t.expires = jiffies_to_clock_t(v->tcf_tm.expires);
 	if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD))
 		goto nla_put_failure;
-- 
cgit v1.2.3


From 48d8ee1694dd1ab25614b58f968123a4598f887e Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Mon, 6 Jun 2016 06:32:55 -0400
Subject: net sched actions: aggregate dumping of actions timeinfo

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h    | 8 ++++++++
 net/sched/act_bpf.c      | 6 +-----
 net/sched/act_connmark.c | 5 +----
 net/sched/act_csum.c     | 6 ++----
 net/sched/act_gact.c     | 5 +----
 net/sched/act_ife.c      | 4 +---
 net/sched/act_ipt.c      | 7 +++----
 net/sched/act_mirred.c   | 6 ++----
 net/sched/act_nat.c      | 6 ++----
 net/sched/act_pedit.c    | 7 +++----
 net/sched/act_simple.c   | 6 ++----
 net/sched/act_skbedit.c  | 6 ++----
 net/sched/act_vlan.c     | 5 +----
 13 files changed, 29 insertions(+), 48 deletions(-)

(limited to 'include/net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 8389c007076f..a891978310e9 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -80,6 +80,14 @@ static inline void tcf_lastuse_update(struct tcf_t *tm)
 		tm->firstuse = now;
 }
 
+static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm)
+{
+	dtm->install = jiffies_to_clock_t(jiffies - stm->install);
+	dtm->lastuse = jiffies_to_clock_t(jiffies - stm->lastuse);
+	dtm->firstuse = jiffies_to_clock_t(jiffies - stm->firstuse);
+	dtm->expires = jiffies_to_clock_t(stm->expires);
+}
+
 struct tc_action {
 	void			*priv;
 	const struct tc_action_ops	*ops;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index e4b877f9d322..ae0e7cbe488c 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -154,11 +154,7 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act,
 	if (ret)
 		goto nla_put_failure;
 
-	tm.install = jiffies_to_clock_t(jiffies - prog->tcf_tm.install);
-	tm.lastuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.lastuse);
-	tm.firstuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.firstuse);
-	tm.expires = jiffies_to_clock_t(prog->tcf_tm.expires);
-
+	tcf_tm_dump(&tm, &prog->tcf_tm);
 	if (nla_put_64bit(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm,
 			  TCA_ACT_BPF_PAD))
 		goto nla_put_failure;
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index e3f64f2d6206..35a5270f289d 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -160,10 +160,7 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
 	if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
 
-	t.install = jiffies_to_clock_t(jiffies - ci->tcf_tm.install);
-	t.lastuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.lastuse);
-	t.expires = jiffies_to_clock_t(ci->tcf_tm.expires);
-	t.firstuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.firstuse);
+	tcf_tm_dump(&t, &ci->tcf_tm);
 	if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t,
 			  TCA_CONNMARK_PAD))
 		goto nla_put_failure;
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 7725eafbe581..dcd9ababd351 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -546,10 +546,8 @@ static int tcf_csum_dump(struct sk_buff *skb,
 
 	if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
-	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
-	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
-	t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse);
-	t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
+
+	tcf_tm_dump(&t, &p->tcf_tm);
 	if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD))
 		goto nla_put_failure;
 
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index c9d59f38a3f8..4c6e0085054a 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -188,10 +188,7 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
 			goto nla_put_failure;
 	}
 #endif
-	t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install);
-	t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse);
-	t.firstuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.firstuse);
-	t.expires = jiffies_to_clock_t(gact->tcf_tm.expires);
+	tcf_tm_dump(&t, &gact->tcf_tm);
 	if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD))
 		goto nla_put_failure;
 	return skb->len;
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 649157624f46..02f5a8ba95d7 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -553,9 +553,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
 	if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
 
-	t.install = jiffies_to_clock_t(jiffies - ife->tcf_tm.install);
-	t.lastuse = jiffies_to_clock_t(jiffies - ife->tcf_tm.lastuse);
-	t.expires = jiffies_to_clock_t(ife->tcf_tm.expires);
+	tcf_tm_dump(&t, &ife->tcf_tm);
 	if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD))
 		goto nla_put_failure;
 
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 47525ee201a9..3fcde44b8f4d 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -277,12 +277,11 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
 	    nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) ||
 	    nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname))
 		goto nla_put_failure;
-	tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install);
-	tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse);
-	tm.firstuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.firstuse);
-	tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires);
+
+	tcf_tm_dump(&tm, &ipt->tcf_tm);
 	if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD))
 		goto nla_put_failure;
+
 	kfree(t);
 	return skb->len;
 
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 1b06093f8ef8..787751a7981a 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -218,10 +218,8 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i
 
 	if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
-	t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install);
-	t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse);
-	t.firstuse = jiffies_to_clock_t(jiffies - m->tcf_tm.firstuse);
-	t.expires = jiffies_to_clock_t(m->tcf_tm.expires);
+
+	tcf_tm_dump(&t, &m->tcf_tm);
 	if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD))
 		goto nla_put_failure;
 	return skb->len;
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 9fbf780a04c7..06ccb03f25da 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -264,10 +264,8 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
 
 	if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
-	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
-	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
-	t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse);
-	t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
+
+	tcf_tm_dump(&t, &p->tcf_tm);
 	if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD))
 		goto nla_put_failure;
 
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index fb89275bc595..82d3c1479029 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -200,12 +200,11 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
 
 	if (nla_put(skb, TCA_PEDIT_PARMS, s, opt))
 		goto nla_put_failure;
-	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
-	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
-	t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse);
-	t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
+
+	tcf_tm_dump(&t, &p->tcf_tm);
 	if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD))
 		goto nla_put_failure;
+
 	kfree(opt);
 	return skb->len;
 
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 81040f1cc3bb..be5fbb51cfed 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -158,10 +158,8 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
 	if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) ||
 	    nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata))
 		goto nla_put_failure;
-	t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
-	t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
-	t.firstuse = jiffies_to_clock_t(jiffies - d->tcf_tm.firstuse);
-	t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
+
+	tcf_tm_dump(&t, &d->tcf_tm);
 	if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD))
 		goto nla_put_failure;
 	return skb->len;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index cf34f31f4106..7e2bc3c2b6da 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -168,10 +168,8 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 	    nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark),
 		    &d->mark))
 		goto nla_put_failure;
-	t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
-	t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
-	t.firstuse = jiffies_to_clock_t(jiffies - d->tcf_tm.firstuse);
-	t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
+
+	tcf_tm_dump(&t, &d->tcf_tm);
 	if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
 		goto nla_put_failure;
 	return skb->len;
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 978ec4c89684..f0a08a11f54f 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -182,10 +182,7 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
 	     nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, v->tcfv_push_proto)))
 		goto nla_put_failure;
 
-	t.install = jiffies_to_clock_t(jiffies - v->tcf_tm.install);
-	t.lastuse = jiffies_to_clock_t(jiffies - v->tcf_tm.lastuse);
-	t.firstuse = jiffies_to_clock_t(jiffies - v->tcf_tm.firstuse);
-	t.expires = jiffies_to_clock_t(v->tcf_tm.expires);
+	tcf_tm_dump(&t, &v->tcf_tm);
 	if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD))
 		goto nla_put_failure;
 	return skb->len;
-- 
cgit v1.2.3


From 0b0f43fe2e7291aa97b1febeaa5a0de453d007ca Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Sun, 5 Jun 2016 10:41:32 -0400
Subject: net sched: indentation and other OCD stylistic fixes

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 include/net/act_api.h          | 14 ++++++++------
 include/net/tc_act/tc_defact.h |  4 ++--
 include/uapi/linux/pkt_cls.h   |  6 +++---
 net/sched/act_api.c            | 19 +++++++++++--------
 net/sched/act_bpf.c            |  3 ++-
 net/sched/act_gact.c           |  3 ++-
 net/sched/act_ipt.c            |  6 ++++--
 net/sched/act_vlan.c           |  3 ++-
 net/sched/cls_api.c            | 11 +++++++----
 9 files changed, 41 insertions(+), 28 deletions(-)

(limited to 'include/net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index a891978310e9..db218a12efb5 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -2,8 +2,8 @@
 #define __NET_ACT_API_H
 
 /*
- * Public police action API for classifiers/qdiscs
- */
+ * Public action API for classifiers/qdiscs
+*/
 
 #include <net/sch_generic.h>
 #include <net/pkt_sched.h>
@@ -107,7 +107,8 @@ struct tc_action_ops {
 	char    kind[IFNAMSIZ];
 	__u32   type; /* TBD to match kind */
 	struct module		*owner;
-	int     (*act)(struct sk_buff *, const struct tc_action *, struct tcf_result *);
+	int     (*act)(struct sk_buff *, const struct tc_action *,
+		       struct tcf_result *);
 	int     (*dump)(struct sk_buff *, struct tc_action *, int, int);
 	void	(*cleanup)(struct tc_action *, int bind);
 	int     (*lookup)(struct net *, struct tc_action *, u32);
@@ -125,8 +126,8 @@ struct tc_action_net {
 };
 
 static inline
-int tc_action_net_init(struct tc_action_net *tn, const struct tc_action_ops *ops,
-		       unsigned int mask)
+int tc_action_net_init(struct tc_action_net *tn,
+		       const struct tc_action_ops *ops, unsigned int mask)
 {
 	int err = 0;
 
@@ -169,7 +170,8 @@ static inline int tcf_hash_release(struct tc_action *a, bool bind)
 }
 
 int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops);
-int tcf_unregister_action(struct tc_action_ops *a, struct pernet_operations *ops);
+int tcf_unregister_action(struct tc_action_ops *a,
+			  struct pernet_operations *ops);
 int tcf_action_destroy(struct list_head *actions, int bind);
 int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions,
 		    struct tcf_result *res);
diff --git a/include/net/tc_act/tc_defact.h b/include/net/tc_act/tc_defact.h
index 9763dcbb9bc3..ab9b5d6be67b 100644
--- a/include/net/tc_act/tc_defact.h
+++ b/include/net/tc_act/tc_defact.h
@@ -5,8 +5,8 @@
 
 struct tcf_defact {
 	struct tcf_common	common;
-	u32     		tcfd_datalen;
-	void    		*tcfd_defdata;
+	u32		tcfd_datalen;
+	void		*tcfd_defdata;
 };
 #define to_defact(a) \
 	container_of(a->priv, struct tcf_defact, common)
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 9ba1410bd21d..5702e933dc07 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -115,8 +115,8 @@ struct tc_police {
 	__u32			mtu;
 	struct tc_ratespec	rate;
 	struct tc_ratespec	peakrate;
-	int 			refcnt;
-	int 			bindcnt;
+	int			refcnt;
+	int			bindcnt;
 	__u32			capab;
 };
 
@@ -128,7 +128,7 @@ struct tcf_t {
 };
 
 struct tc_cnt {
-	int                   refcnt; 
+	int                   refcnt;
 	int                   bindcnt;
 };
 
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 5ebf6d6f85f6..719bc2e85852 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -504,8 +504,8 @@ nla_put_failure:
 }
 EXPORT_SYMBOL(tcf_action_dump_1);
 
-int
-tcf_action_dump(struct sk_buff *skb, struct list_head *actions, int bind, int ref)
+int tcf_action_dump(struct sk_buff *skb, struct list_head *actions,
+		    int bind, int ref)
 {
 	struct tc_action *a;
 	int err = -EINVAL;
@@ -688,9 +688,9 @@ errout:
 	return -1;
 }
 
-static int
-tca_get_fill(struct sk_buff *skb, struct list_head *actions, u32 portid, u32 seq,
-	     u16 flags, int event, int bind, int ref)
+static int tca_get_fill(struct sk_buff *skb, struct list_head *actions,
+			u32 portid, u32 seq, u16 flags, int event, int bind,
+			int ref)
 {
 	struct tcamsg *t;
 	struct nlmsghdr *nlh;
@@ -731,7 +731,8 @@ act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb)
 		return -ENOBUFS;
-	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) {
+	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event,
+			 0, 0) <= 0) {
 		kfree_skb(skb);
 		return -EINVAL;
 	}
@@ -839,7 +840,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 	if (a.ops == NULL) /*some idjot trying to flush unknown action */
 		goto err_out;
 
-	nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0);
+	nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION,
+			sizeof(*t), 0);
 	if (!nlh)
 		goto out_module_put;
 	t = nlmsg_data(nlh);
@@ -1002,7 +1004,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n)
 	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
 	int ret = 0, ovr = 0;
 
-	if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN))
+	if ((n->nlmsg_type != RTM_GETACTION) &&
+	    !netlink_capable(skb, CAP_NET_ADMIN))
 		return -EPERM;
 
 	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL);
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index ae0e7cbe488c..f7b6cf49ea6f 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -169,7 +169,8 @@ nla_put_failure:
 static const struct nla_policy act_bpf_policy[TCA_ACT_BPF_MAX + 1] = {
 	[TCA_ACT_BPF_PARMS]	= { .len = sizeof(struct tc_act_bpf) },
 	[TCA_ACT_BPF_FD]	= { .type = NLA_U32 },
-	[TCA_ACT_BPF_NAME]	= { .type = NLA_NUL_STRING, .len = ACT_BPF_NAME_LEN },
+	[TCA_ACT_BPF_NAME]	= { .type = NLA_NUL_STRING,
+				    .len = ACT_BPF_NAME_LEN },
 	[TCA_ACT_BPF_OPS_LEN]	= { .type = NLA_U16 },
 	[TCA_ACT_BPF_OPS]	= { .type = NLA_BINARY,
 				    .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 4c6e0085054a..19058a7f3e5c 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -162,7 +162,8 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
 	tm->lastuse = lastuse;
 }
 
-static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a,
+			 int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tcf_gact *gact = a->priv;
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 3fcde44b8f4d..e7c0f4d944a2 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -34,7 +34,8 @@ static int ipt_net_id;
 
 static int xt_net_id;
 
-static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)
+static int ipt_init_target(struct xt_entry_target *t, char *table,
+			   unsigned int hook)
 {
 	struct xt_tgchk_param par;
 	struct xt_target *target;
@@ -250,7 +251,8 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
 
 }
 
-static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind,
+			int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tcf_ipt *ipt = a->priv;
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index f0a08a11f54f..b075d50e0fc3 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -179,7 +179,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
 
 	if (v->tcfv_action == TCA_VLAN_ACT_PUSH &&
 	    (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) ||
-	     nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, v->tcfv_push_proto)))
+	     nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL,
+			  v->tcfv_push_proto)))
 		goto nla_put_failure;
 
 	tcf_tm_dump(&t, &v->tcf_tm);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index a75864d93142..aafa6bce173e 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -351,8 +351,9 @@ errout:
 	return err;
 }
 
-static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp,
-			 unsigned long fh, u32 portid, u32 seq, u16 flags, int event)
+static int tcf_fill_node(struct net *net, struct sk_buff *skb,
+			 struct tcf_proto *tp, unsigned long fh, u32 portid,
+			 u32 seq, u16 flags, int event)
 {
 	struct tcmsg *tcm;
 	struct nlmsghdr  *nlh;
@@ -474,9 +475,11 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 		    TC_H_MIN(tcm->tcm_info) != tp->protocol)
 			continue;
 		if (t > s_t)
-			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
+			memset(&cb->args[1], 0,
+			       sizeof(cb->args)-sizeof(cb->args[0]));
 		if (cb->args[1] == 0) {
-			if (tcf_fill_node(net, skb, tp, 0, NETLINK_CB(cb->skb).portid,
+			if (tcf_fill_node(net, skb, tp, 0,
+					  NETLINK_CB(cb->skb).portid,
 					  cb->nlh->nlmsg_seq, NLM_F_MULTI,
 					  RTM_NEWTFILTER) <= 0)
 				break;
-- 
cgit v1.2.3


From f9eb8aea2a1e12fc2f584d1627deeb957435a801 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 6 Jun 2016 09:37:15 -0700
Subject: net_sched: transform qdisc running bit into a seqcount

Instead of using a single bit (__QDISC___STATE_RUNNING)
in sch->__state, use a seqcount.

This adds lockdep support, but more importantly it will allow us
to sample qdisc/class statistics without having to grab qdisc root lock.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c |  2 ++
 drivers/net/ppp/ppp_generic.c   |  3 +++
 drivers/net/team/team.c         |  2 ++
 include/linux/netdevice.h       |  1 +
 include/net/sch_generic.h       | 15 ++++-----------
 net/bluetooth/6lowpan.c         |  2 ++
 net/core/dev.c                  |  2 +-
 net/ieee802154/6lowpan/core.c   |  3 +++
 net/l2tp/l2tp_eth.c             |  4 ++++
 net/sched/sch_generic.c         | 14 ++++++++++----
 10 files changed, 32 insertions(+), 16 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 941ec99cd3b6..681af31a60ed 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4610,6 +4610,7 @@ static int bond_check_params(struct bond_params *params)
 static struct lock_class_key bonding_netdev_xmit_lock_key;
 static struct lock_class_key bonding_netdev_addr_lock_key;
 static struct lock_class_key bonding_tx_busylock_key;
+static struct lock_class_key bonding_qdisc_running_key;
 
 static void bond_set_lockdep_class_one(struct net_device *dev,
 				       struct netdev_queue *txq,
@@ -4625,6 +4626,7 @@ static void bond_set_lockdep_class(struct net_device *dev)
 			  &bonding_netdev_addr_lock_key);
 	netdev_for_each_tx_queue(dev, bond_set_lockdep_class_one, NULL);
 	dev->qdisc_tx_busylock = &bonding_tx_busylock_key;
+	dev->qdisc_running_key = &bonding_qdisc_running_key;
 }
 
 /* Called from registration process */
diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index 8dedafa1a95d..aeabaa42317f 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -1313,9 +1313,12 @@ ppp_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats64)
 }
 
 static struct lock_class_key ppp_tx_busylock;
+static struct lock_class_key ppp_qdisc_running_key;
+
 static int ppp_dev_init(struct net_device *dev)
 {
 	dev->qdisc_tx_busylock = &ppp_tx_busylock;
+	dev->qdisc_running_key = &ppp_qdisc_running_key;
 	return 0;
 }
 
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 2ace126533cd..00eb38956a2c 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1577,6 +1577,7 @@ static const struct team_option team_options[] = {
 static struct lock_class_key team_netdev_xmit_lock_key;
 static struct lock_class_key team_netdev_addr_lock_key;
 static struct lock_class_key team_tx_busylock_key;
+static struct lock_class_key team_qdisc_running_key;
 
 static void team_set_lockdep_class_one(struct net_device *dev,
 				       struct netdev_queue *txq,
@@ -1590,6 +1591,7 @@ static void team_set_lockdep_class(struct net_device *dev)
 	lockdep_set_class(&dev->addr_list_lock, &team_netdev_addr_lock_key);
 	netdev_for_each_tx_queue(dev, team_set_lockdep_class_one, NULL);
 	dev->qdisc_tx_busylock = &team_tx_busylock_key;
+	dev->qdisc_running_key = &team_qdisc_running_key;
 }
 
 static int team_init(struct net_device *dev)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fa6df2699532..59d7e06d88d5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1862,6 +1862,7 @@ struct net_device {
 #endif
 	struct phy_device	*phydev;
 	struct lock_class_key	*qdisc_tx_busylock;
+	struct lock_class_key	*qdisc_running_key;
 	bool			proto_down;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index a1fd76c22a59..bff8d895ef8a 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -29,13 +29,6 @@ enum qdisc_state_t {
 	__QDISC_STATE_THROTTLED,
 };
 
-/*
- * following bits are only changed while qdisc lock is held
- */
-enum qdisc___state_t {
-	__QDISC___STATE_RUNNING = 1,
-};
-
 struct qdisc_size_table {
 	struct rcu_head		rcu;
 	struct list_head	list;
@@ -93,7 +86,7 @@ struct Qdisc {
 	unsigned long		state;
 	struct sk_buff_head	q;
 	struct gnet_stats_basic_packed bstats;
-	unsigned int		__state;
+	seqcount_t		running;
 	struct gnet_stats_queue	qstats;
 	struct rcu_head		rcu_head;
 	int			padded;
@@ -104,20 +97,20 @@ struct Qdisc {
 
 static inline bool qdisc_is_running(const struct Qdisc *qdisc)
 {
-	return (qdisc->__state & __QDISC___STATE_RUNNING) ? true : false;
+	return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
 }
 
 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 {
 	if (qdisc_is_running(qdisc))
 		return false;
-	qdisc->__state |= __QDISC___STATE_RUNNING;
+	write_seqcount_begin(&qdisc->running);
 	return true;
 }
 
 static inline void qdisc_run_end(struct Qdisc *qdisc)
 {
-	qdisc->__state &= ~__QDISC___STATE_RUNNING;
+	write_seqcount_end(&qdisc->running);
 }
 
 static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 780089d75915..977a11e418d0 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -629,6 +629,7 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev)
 
 static struct lock_class_key bt_tx_busylock;
 static struct lock_class_key bt_netdev_xmit_lock_key;
+static struct lock_class_key bt_qdisc_running_key;
 
 static void bt_set_lockdep_class_one(struct net_device *dev,
 				     struct netdev_queue *txq,
@@ -641,6 +642,7 @@ static int bt_dev_init(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, bt_set_lockdep_class_one, NULL);
 	dev->qdisc_tx_busylock = &bt_tx_busylock;
+	dev->qdisc_running_key = &bt_qdisc_running_key;
 
 	return 0;
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index 896b686d1966..e0bcc39f4a7d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3075,7 +3075,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	/*
 	 * Heuristic to force contended enqueues to serialize on a
 	 * separate lock before trying to get qdisc main lock.
-	 * This permits __QDISC___STATE_RUNNING owner to get the lock more
+	 * This permits qdisc->running owner to get the lock more
 	 * often and dequeue packets faster.
 	 */
 	contended = qdisc_is_running(q);
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index dd085db8580e..14aa5effd29a 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -60,6 +60,7 @@ static struct header_ops lowpan_header_ops = {
 
 static struct lock_class_key lowpan_tx_busylock;
 static struct lock_class_key lowpan_netdev_xmit_lock_key;
+static struct lock_class_key lowpan_qdisc_running_key;
 
 static void lowpan_set_lockdep_class_one(struct net_device *ldev,
 					 struct netdev_queue *txq,
@@ -73,6 +74,8 @@ static int lowpan_dev_init(struct net_device *ldev)
 {
 	netdev_for_each_tx_queue(ldev, lowpan_set_lockdep_class_one, NULL);
 	ldev->qdisc_tx_busylock = &lowpan_tx_busylock;
+	ldev->qdisc_running_key = &lowpan_qdisc_running_key;
+
 	return 0;
 }
 
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index e253c26f31ac..c00d72d182fa 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -68,6 +68,8 @@ static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net)
 }
 
 static struct lock_class_key l2tp_eth_tx_busylock;
+static struct lock_class_key l2tp_qdisc_running_key;
+
 static int l2tp_eth_dev_init(struct net_device *dev)
 {
 	struct l2tp_eth *priv = netdev_priv(dev);
@@ -76,6 +78,8 @@ static int l2tp_eth_dev_init(struct net_device *dev)
 	eth_hw_addr_random(dev);
 	eth_broadcast_addr(dev->broadcast);
 	dev->qdisc_tx_busylock = &l2tp_eth_tx_busylock;
+	dev->qdisc_running_key = &l2tp_qdisc_running_key;
+
 	return 0;
 }
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 269dd71b3828..cebea73e70ac 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -110,7 +110,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 
 /*
  * Transmit possibly several skbs, and handle the return status as
- * required. Holding the __QDISC___STATE_RUNNING bit guarantees that
+ * required. Owning running seqcount bit guarantees that
  * only one CPU can execute this function.
  *
  * Returns to the caller:
@@ -137,10 +137,10 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
 		HARD_TX_UNLOCK(dev, txq);
 	} else {
-		spin_lock(root_lock);
+		spin_lock_nested(root_lock, SINGLE_DEPTH_NESTING);
 		return qdisc_qlen(q);
 	}
-	spin_lock(root_lock);
+	spin_lock_nested(root_lock, SINGLE_DEPTH_NESTING);
 
 	if (dev_xmit_complete(ret)) {
 		/* Driver sent out skb successfully or skb was consumed */
@@ -163,7 +163,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 /*
  * NOTE: Called under qdisc_lock(q) with locally disabled BH.
  *
- * __QDISC___STATE_RUNNING guarantees only one CPU can process
+ * running seqcount guarantees only one CPU can process
  * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
  * this queue.
  *
@@ -379,6 +379,7 @@ struct Qdisc noop_qdisc = {
 	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
 	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
 	.dev_queue	=	&noop_netdev_queue,
+	.running	=	SEQCNT_ZERO(noop_qdisc.running),
 	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
 };
 EXPORT_SYMBOL(noop_qdisc);
@@ -537,6 +538,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 EXPORT_SYMBOL(pfifo_fast_ops);
 
 static struct lock_class_key qdisc_tx_busylock;
+static struct lock_class_key qdisc_running_key;
 
 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 			  const struct Qdisc_ops *ops)
@@ -570,6 +572,10 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	lockdep_set_class(&sch->busylock,
 			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
 
+	seqcount_init(&sch->running);
+	lockdep_set_class(&sch->running,
+			  dev->qdisc_running_key ?: &qdisc_running_key);
+
 	sch->ops = ops;
 	sch->enqueue = ops->enqueue;
 	sch->dequeue = ops->dequeue;
-- 
cgit v1.2.3


From edb09eb17ed89eaa82a52dd306beac93e292b485 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 6 Jun 2016 09:37:16 -0700
Subject: net: sched: do not acquire qdisc spinlock in qdisc/class stats dump

Large tc dumps (tc -s {qdisc|class} sh dev ethX) done by Google BwE host
agent [1] are problematic at scale :

For each qdisc/class found in the dump, we currently lock the root qdisc
spinlock in order to get stats. Sampling stats every 5 seconds from
thousands of HTB classes is a challenge when the root qdisc spinlock is
under high pressure. Not only the dumps take time, they also slow
down the fast path (queue/dequeue packets) by 10 % to 20 % in some cases.

An audit of existing qdiscs showed that sch_fq_codel is the only qdisc
that might need the qdisc lock in fq_codel_dump_stats() and
fq_codel_dump_class_stats()

In v2 of this patch, I now use the Qdisc running seqcount to provide
consistent reads of packets/bytes counters, regardless of 32/64 bit arches.

I also changed rate estimators to use the same infrastructure
so that they no longer need to lock root qdisc lock.

[1]
http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43838.pdf

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Kevin Athey <kda@google.com>
Cc: Xiaotian Pei <xiaotian@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/gen_stats.txt |  2 +-
 include/net/gen_stats.h                | 12 ++++++++----
 include/net/sch_generic.h              |  8 ++++++++
 net/core/gen_estimator.c               | 24 ++++++++++++++++--------
 net/core/gen_stats.c                   | 34 +++++++++++++++++++++++-----------
 net/netfilter/xt_RATEEST.c             |  2 +-
 net/sched/act_api.c                    |  4 ++--
 net/sched/act_police.c                 |  3 ++-
 net/sched/sch_api.c                    | 21 +++++++++++----------
 net/sched/sch_atm.c                    |  3 ++-
 net/sched/sch_cbq.c                    |  9 ++++++---
 net/sched/sch_drr.c                    |  9 ++++++---
 net/sched/sch_fq_codel.c               | 15 +++++++++++----
 net/sched/sch_hfsc.c                   | 10 +++++-----
 net/sched/sch_htb.c                    | 11 ++++++-----
 net/sched/sch_mq.c                     |  2 +-
 net/sched/sch_mqprio.c                 | 11 +++++++----
 net/sched/sch_multiq.c                 |  3 ++-
 net/sched/sch_prio.c                   |  3 ++-
 net/sched/sch_qfq.c                    |  9 ++++++---
 20 files changed, 126 insertions(+), 69 deletions(-)

(limited to 'include/net')

diff --git a/Documentation/networking/gen_stats.txt b/Documentation/networking/gen_stats.txt
index ff630a87b511..179b18ce45ff 100644
--- a/Documentation/networking/gen_stats.txt
+++ b/Documentation/networking/gen_stats.txt
@@ -21,7 +21,7 @@ struct mystruct {
 	...
 };
 
-Update statistics:
+Update statistics, in dequeue() methods only, (while owning qdisc->running)
 mystruct->tstats.packet++;
 mystruct->qstats.backlog += skb->pkt_len;
 
diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 610cd397890e..231e121cc7d9 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -33,10 +33,12 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
 				 spinlock_t *lock, struct gnet_dump *d,
 				 int padattr);
 
-int gnet_stats_copy_basic(struct gnet_dump *d,
+int gnet_stats_copy_basic(const seqcount_t *running,
+			  struct gnet_dump *d,
 			  struct gnet_stats_basic_cpu __percpu *cpu,
 			  struct gnet_stats_basic_packed *b);
-void __gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
+void __gnet_stats_copy_basic(const seqcount_t *running,
+			     struct gnet_stats_basic_packed *bstats,
 			     struct gnet_stats_basic_cpu __percpu *cpu,
 			     struct gnet_stats_basic_packed *b);
 int gnet_stats_copy_rate_est(struct gnet_dump *d,
@@ -52,13 +54,15 @@ int gnet_stats_finish_copy(struct gnet_dump *d);
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 		      struct gnet_stats_rate_est64 *rate_est,
-		      spinlock_t *stats_lock, struct nlattr *opt);
+		      spinlock_t *stats_lock,
+		      seqcount_t *running, struct nlattr *opt);
 void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
 			struct gnet_stats_rate_est64 *rate_est);
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 			  struct gnet_stats_rate_est64 *rate_est,
-			  spinlock_t *stats_lock, struct nlattr *opt);
+			  spinlock_t *stats_lock,
+			  seqcount_t *running, struct nlattr *opt);
 bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
 			  const struct gnet_stats_rate_est64 *rate_est);
 #endif
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index bff8d895ef8a..c4f5749342ec 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -314,6 +314,14 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
 	return qdisc_lock(root);
 }
 
+static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
+{
+	struct Qdisc *root = qdisc_root_sleeping(qdisc);
+
+	ASSERT_RTNL();
+	return &root->running;
+}
+
 static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc)
 {
 	return qdisc->dev_queue->dev;
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 4573d81093fe..cad8e791f28e 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -84,6 +84,7 @@ struct gen_estimator
 	struct gnet_stats_basic_packed	*bstats;
 	struct gnet_stats_rate_est64	*rate_est;
 	spinlock_t		*stats_lock;
+	seqcount_t		*running;
 	int			ewma_log;
 	u32			last_packets;
 	unsigned long		avpps;
@@ -121,26 +122,28 @@ static void est_timer(unsigned long arg)
 		unsigned long rate;
 		u64 brate;
 
-		spin_lock(e->stats_lock);
+		if (e->stats_lock)
+			spin_lock(e->stats_lock);
 		read_lock(&est_lock);
 		if (e->bstats == NULL)
 			goto skip;
 
-		__gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats);
+		__gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);
 
 		brate = (b.bytes - e->last_bytes)<<(7 - idx);
 		e->last_bytes = b.bytes;
 		e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
-		e->rate_est->bps = (e->avbps+0xF)>>5;
+		WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);
 
 		rate = b.packets - e->last_packets;
 		rate <<= (7 - idx);
 		e->last_packets = b.packets;
 		e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
-		e->rate_est->pps = (e->avpps + 0xF) >> 5;
+		WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
 skip:
 		read_unlock(&est_lock);
-		spin_unlock(e->stats_lock);
+		if (e->stats_lock)
+			spin_unlock(e->stats_lock);
 	}
 
 	if (!list_empty(&elist[idx].list))
@@ -194,6 +197,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
  * @cpu_bstats: bstats per cpu
  * @rate_est: rate estimator statistics
  * @stats_lock: statistics lock
+ * @running: qdisc running seqcount
  * @opt: rate estimator configuration TLV
  *
  * Creates a new rate estimator with &bstats as source and &rate_est
@@ -209,6 +213,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 		      struct gnet_stats_rate_est64 *rate_est,
 		      spinlock_t *stats_lock,
+		      seqcount_t *running,
 		      struct nlattr *opt)
 {
 	struct gen_estimator *est;
@@ -226,12 +231,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 	if (est == NULL)
 		return -ENOBUFS;
 
-	__gnet_stats_copy_basic(&b, cpu_bstats, bstats);
+	__gnet_stats_copy_basic(running, &b, cpu_bstats, bstats);
 
 	idx = parm->interval + 2;
 	est->bstats = bstats;
 	est->rate_est = rate_est;
 	est->stats_lock = stats_lock;
+	est->running  = running;
 	est->ewma_log = parm->ewma_log;
 	est->last_bytes = b.bytes;
 	est->avbps = rate_est->bps<<5;
@@ -291,6 +297,7 @@ EXPORT_SYMBOL(gen_kill_estimator);
  * @cpu_bstats: bstats per cpu
  * @rate_est: rate estimator statistics
  * @stats_lock: statistics lock
+ * @running: qdisc running seqcount (might be NULL)
  * @opt: rate estimator configuration TLV
  *
  * Replaces the configuration of a rate estimator by calling
@@ -301,10 +308,11 @@ EXPORT_SYMBOL(gen_kill_estimator);
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 			  struct gnet_stats_rate_est64 *rate_est,
-			  spinlock_t *stats_lock, struct nlattr *opt)
+			  spinlock_t *stats_lock,
+			  seqcount_t *running, struct nlattr *opt)
 {
 	gen_kill_estimator(bstats, rate_est);
-	return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt);
+	return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
 }
 EXPORT_SYMBOL(gen_replace_estimator);
 
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index f96ee8b9478d..d9c210caff32 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -32,10 +32,11 @@ gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr)
 	return 0;
 
 nla_put_failure:
+	if (d->lock)
+		spin_unlock_bh(d->lock);
 	kfree(d->xstats);
 	d->xstats = NULL;
 	d->xstats_len = 0;
-	spin_unlock_bh(d->lock);
 	return -1;
 }
 
@@ -65,15 +66,16 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
 {
 	memset(d, 0, sizeof(*d));
 
-	spin_lock_bh(lock);
-	d->lock = lock;
 	if (type)
 		d->tail = (struct nlattr *)skb_tail_pointer(skb);
 	d->skb = skb;
 	d->compat_tc_stats = tc_stats_type;
 	d->compat_xstats = xstats_type;
 	d->padattr = padattr;
-
+	if (lock) {
+		d->lock = lock;
+		spin_lock_bh(lock);
+	}
 	if (d->tail)
 		return gnet_stats_copy(d, type, NULL, 0, padattr);
 
@@ -126,16 +128,23 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
 }
 
 void
-__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
+__gnet_stats_copy_basic(const seqcount_t *running,
+			struct gnet_stats_basic_packed *bstats,
 			struct gnet_stats_basic_cpu __percpu *cpu,
 			struct gnet_stats_basic_packed *b)
 {
+	unsigned int seq;
+
 	if (cpu) {
 		__gnet_stats_copy_basic_cpu(bstats, cpu);
-	} else {
+		return;
+	}
+	do {
+		if (running)
+			seq = read_seqcount_begin(running);
 		bstats->bytes = b->bytes;
 		bstats->packets = b->packets;
-	}
+	} while (running && read_seqcount_retry(running, seq));
 }
 EXPORT_SYMBOL(__gnet_stats_copy_basic);
 
@@ -152,13 +161,14 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
  * if the room in the socket buffer was not sufficient.
  */
 int
-gnet_stats_copy_basic(struct gnet_dump *d,
+gnet_stats_copy_basic(const seqcount_t *running,
+		      struct gnet_dump *d,
 		      struct gnet_stats_basic_cpu __percpu *cpu,
 		      struct gnet_stats_basic_packed *b)
 {
 	struct gnet_stats_basic_packed bstats = {0};
 
-	__gnet_stats_copy_basic(&bstats, cpu, b);
+	__gnet_stats_copy_basic(running, &bstats, cpu, b);
 
 	if (d->compat_tc_stats) {
 		d->tc_stats.bytes = bstats.bytes;
@@ -328,8 +338,9 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
 	return 0;
 
 err_out:
+	if (d->lock)
+		spin_unlock_bh(d->lock);
 	d->xstats_len = 0;
-	spin_unlock_bh(d->lock);
 	return -1;
 }
 EXPORT_SYMBOL(gnet_stats_copy_app);
@@ -363,10 +374,11 @@ gnet_stats_finish_copy(struct gnet_dump *d)
 			return -1;
 	}
 
+	if (d->lock)
+		spin_unlock_bh(d->lock);
 	kfree(d->xstats);
 	d->xstats = NULL;
 	d->xstats_len = 0;
-	spin_unlock_bh(d->lock);
 	return 0;
 }
 EXPORT_SYMBOL(gnet_stats_finish_copy);
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 604df6fae6fc..515131f9e021 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -137,7 +137,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
 	cfg.est.ewma_log	= info->ewma_log;
 
 	ret = gen_new_estimator(&est->bstats, NULL, &est->rstats,
-				&est->lock, &cfg.opt);
+				&est->lock, NULL, &cfg.opt);
 	if (ret < 0)
 		goto err2;
 
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 719bc2e85852..b6db56ec8117 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -287,7 +287,7 @@ err2:
 	if (est) {
 		err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats,
 					&p->tcfc_rate_est,
-					&p->tcfc_lock, est);
+					&p->tcfc_lock, NULL, est);
 		if (err) {
 			free_percpu(p->cpu_qstats);
 			goto err2;
@@ -671,7 +671,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
 	if (err < 0)
 		goto errout;
 
-	if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfc_bstats) < 0 ||
+	if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfc_bstats) < 0 ||
 	    gnet_stats_copy_rate_est(&d, &p->tcfc_bstats,
 				     &p->tcfc_rate_est) < 0 ||
 	    gnet_stats_copy_queue(&d, p->cpu_qstats,
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 820b11686f85..bcb31142556b 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -185,7 +185,8 @@ override:
 	if (est) {
 		err = gen_replace_estimator(&police->tcf_bstats, NULL,
 					    &police->tcf_rate_est,
-					    &police->tcf_lock, est);
+					    &police->tcf_lock,
+					    NULL, est);
 		if (err)
 			goto failure_unlock;
 	} else if (tb[TCA_POLICE_AVRATE] &&
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index ddf047df5361..d4a8bbfcc953 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -982,7 +982,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 			rcu_assign_pointer(sch->stab, stab);
 		}
 		if (tca[TCA_RATE]) {
-			spinlock_t *root_lock;
+			seqcount_t *running;
 
 			err = -EOPNOTSUPP;
 			if (sch->flags & TCQ_F_MQROOT)
@@ -991,14 +991,15 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 			if ((sch->parent != TC_H_ROOT) &&
 			    !(sch->flags & TCQ_F_INGRESS) &&
 			    (!p || !(p->flags & TCQ_F_MQROOT)))
-				root_lock = qdisc_root_sleeping_lock(sch);
+				running = qdisc_root_sleeping_running(sch);
 			else
-				root_lock = qdisc_lock(sch);
+				running = &sch->running;
 
 			err = gen_new_estimator(&sch->bstats,
 						sch->cpu_bstats,
 						&sch->rate_est,
-						root_lock,
+						NULL,
+						running,
 						tca[TCA_RATE]);
 			if (err)
 				goto err_out4;
@@ -1061,7 +1062,8 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
 		gen_replace_estimator(&sch->bstats,
 				      sch->cpu_bstats,
 				      &sch->rate_est,
-				      qdisc_root_sleeping_lock(sch),
+				      NULL,
+				      qdisc_root_sleeping_running(sch),
 				      tca[TCA_RATE]);
 	}
 out:
@@ -1369,8 +1371,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 		goto nla_put_failure;
 
 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
-					 qdisc_root_sleeping_lock(q), &d,
-					 TCA_PAD) < 0)
+					 NULL, &d, TCA_PAD) < 0)
 		goto nla_put_failure;
 
 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
@@ -1381,7 +1382,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 		cpu_qstats = q->cpu_qstats;
 	}
 
-	if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 ||
+	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
+				  &d, cpu_bstats, &q->bstats) < 0 ||
 	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
 	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
 		goto nla_put_failure;
@@ -1684,8 +1686,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
 		goto nla_put_failure;
 
 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
-					 qdisc_root_sleeping_lock(q), &d,
-					 TCA_PAD) < 0)
+					 NULL, &d, TCA_PAD) < 0)
 		goto nla_put_failure;
 
 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 1911af3ca7c0..34f8f79e56d5 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -637,7 +637,8 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 {
 	struct atm_flow_data *flow = (struct atm_flow_data *)arg;
 
-	if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 ||
+	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+				  d, NULL, &flow->bstats) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0)
 		return -1;
 
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index baafddf229ce..1b8128fb845d 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1600,7 +1600,8 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 	if (cl->undertime != PSCHED_PASTPERFECT)
 		cl->xstats.undertime = cl->undertime - q->now;
 
-	if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
+	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+				  d, NULL, &cl->bstats) < 0 ||
 	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
 		return -1;
@@ -1755,7 +1756,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 		if (tca[TCA_RATE]) {
 			err = gen_replace_estimator(&cl->bstats, NULL,
 						    &cl->rate_est,
-						    qdisc_root_sleeping_lock(sch),
+						    NULL,
+						    qdisc_root_sleeping_running(sch),
 						    tca[TCA_RATE]);
 			if (err) {
 				qdisc_put_rtab(rtab);
@@ -1848,7 +1850,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 
 	if (tca[TCA_RATE]) {
 		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
-					qdisc_root_sleeping_lock(sch),
+					NULL,
+					qdisc_root_sleeping_running(sch),
 					tca[TCA_RATE]);
 		if (err) {
 			kfree(cl);
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index a63e879e8975..1b7e1a27773d 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -91,7 +91,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 		if (tca[TCA_RATE]) {
 			err = gen_replace_estimator(&cl->bstats, NULL,
 						    &cl->rate_est,
-						    qdisc_root_sleeping_lock(sch),
+						    NULL,
+						    qdisc_root_sleeping_running(sch),
 						    tca[TCA_RATE]);
 			if (err)
 				return err;
@@ -119,7 +120,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 
 	if (tca[TCA_RATE]) {
 		err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est,
-					    qdisc_root_sleeping_lock(sch),
+					    NULL,
+					    qdisc_root_sleeping_running(sch),
 					    tca[TCA_RATE]);
 		if (err) {
 			qdisc_destroy(cl->qdisc);
@@ -279,7 +281,8 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 	if (qlen)
 		xstats.deficit = cl->deficit;
 
-	if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
+	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+				  d, NULL, &cl->bstats) < 0 ||
 	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
 		return -1;
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 6883a8971562..1daa54237f4e 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -566,11 +566,13 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 	st.qdisc_stats.memory_usage  = q->memory_usage;
 	st.qdisc_stats.drop_overmemory = q->drop_overmemory;
 
+	sch_tree_lock(sch);
 	list_for_each(pos, &q->new_flows)
 		st.qdisc_stats.new_flows_len++;
 
 	list_for_each(pos, &q->old_flows)
 		st.qdisc_stats.old_flows_len++;
+	sch_tree_unlock(sch);
 
 	return gnet_stats_copy_app(d, &st, sizeof(st));
 }
@@ -624,7 +626,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 
 	if (idx < q->flows_cnt) {
 		const struct fq_codel_flow *flow = &q->flows[idx];
-		const struct sk_buff *skb = flow->head;
+		const struct sk_buff *skb;
 
 		memset(&xstats, 0, sizeof(xstats));
 		xstats.type = TCA_FQ_CODEL_XSTATS_CLASS;
@@ -642,9 +644,14 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 				codel_time_to_us(delta) :
 				-codel_time_to_us(-delta);
 		}
-		while (skb) {
-			qs.qlen++;
-			skb = skb->next;
+		if (flow->head) {
+			sch_tree_lock(sch);
+			skb = flow->head;
+			while (skb) {
+				qs.qlen++;
+				skb = skb->next;
+			}
+			sch_tree_unlock(sch);
 		}
 		qs.backlog = q->backlogs[idx];
 		qs.drops = flow->dropped;
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index d783d7cc3348..74813dd49053 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1015,11 +1015,10 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 		cur_time = psched_get_time();
 
 		if (tca[TCA_RATE]) {
-			spinlock_t *lock = qdisc_root_sleeping_lock(sch);
-
 			err = gen_replace_estimator(&cl->bstats, NULL,
 						    &cl->rate_est,
-						    lock,
+						    NULL,
+						    qdisc_root_sleeping_running(sch),
 						    tca[TCA_RATE]);
 			if (err)
 				return err;
@@ -1068,7 +1067,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 
 	if (tca[TCA_RATE]) {
 		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
-					qdisc_root_sleeping_lock(sch),
+					NULL,
+					qdisc_root_sleeping_running(sch),
 					tca[TCA_RATE]);
 		if (err) {
 			kfree(cl);
@@ -1373,7 +1373,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 	xstats.work    = cl->cl_total;
 	xstats.rtwork  = cl->cl_cumul;
 
-	if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
+	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 ||
 	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0)
 		return -1;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index d4b4218af6b1..2b057649f24b 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1141,7 +1141,8 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
 	cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens);
 	cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens);
 
-	if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
+	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+				  d, NULL, &cl->bstats) < 0 ||
 	    gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
 		return -1;
@@ -1395,7 +1396,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		if (htb_rate_est || tca[TCA_RATE]) {
 			err = gen_new_estimator(&cl->bstats, NULL,
 						&cl->rate_est,
-						qdisc_root_sleeping_lock(sch),
+						NULL,
+						qdisc_root_sleeping_running(sch),
 						tca[TCA_RATE] ? : &est.nla);
 			if (err) {
 				kfree(cl);
@@ -1457,11 +1459,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 			parent->children++;
 	} else {
 		if (tca[TCA_RATE]) {
-			spinlock_t *lock = qdisc_root_sleeping_lock(sch);
-
 			err = gen_replace_estimator(&cl->bstats, NULL,
 						    &cl->rate_est,
-						    lock,
+						    NULL,
+						    qdisc_root_sleeping_running(sch),
 						    tca[TCA_RATE]);
 			if (err)
 				return err;
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 56a77b878eb3..b9439827c172 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -199,7 +199,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 	struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
 
 	sch = dev_queue->qdisc_sleeping;
-	if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 ||
+	if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0)
 		return -1;
 	return 0;
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index b8002ce3d010..549c66359924 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -342,7 +342,8 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 		 * hold here is the look on dev_queue->qdisc_sleeping
 		 * also acquired below.
 		 */
-		spin_unlock_bh(d->lock);
+		if (d->lock)
+			spin_unlock_bh(d->lock);
 
 		for (i = tc.offset; i < tc.offset + tc.count; i++) {
 			struct netdev_queue *q = netdev_get_tx_queue(dev, i);
@@ -359,15 +360,17 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 			spin_unlock_bh(qdisc_lock(qdisc));
 		}
 		/* Reclaim root sleeping lock before completing stats */
-		spin_lock_bh(d->lock);
-		if (gnet_stats_copy_basic(d, NULL, &bstats) < 0 ||
+		if (d->lock)
+			spin_lock_bh(d->lock);
+		if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 ||
 		    gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0)
 			return -1;
 	} else {
 		struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
 
 		sch = dev_queue->qdisc_sleeping;
-		if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 ||
+		if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+					  d, NULL, &sch->bstats) < 0 ||
 		    gnet_stats_copy_queue(d, NULL,
 					  &sch->qstats, sch->q.qlen) < 0)
 			return -1;
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index bcdd54bb101c..21e69d2e8347 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -356,7 +356,8 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 	struct Qdisc *cl_q;
 
 	cl_q = q->queues[cl - 1];
-	if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 ||
+	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+				  d, NULL, &cl_q->bstats) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
 		return -1;
 
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index fee1b15506b2..06eca7060683 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -319,7 +319,8 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 	struct Qdisc *cl_q;
 
 	cl_q = q->queues[cl - 1];
-	if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 ||
+	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+				  d, NULL, &cl_q->bstats) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
 		return -1;
 
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 8d2d8d953432..85d41979d825 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -460,7 +460,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 		if (tca[TCA_RATE]) {
 			err = gen_replace_estimator(&cl->bstats, NULL,
 						    &cl->rate_est,
-						    qdisc_root_sleeping_lock(sch),
+						    NULL,
+						    qdisc_root_sleeping_running(sch),
 						    tca[TCA_RATE]);
 			if (err)
 				return err;
@@ -486,7 +487,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	if (tca[TCA_RATE]) {
 		err = gen_new_estimator(&cl->bstats, NULL,
 					&cl->rate_est,
-					qdisc_root_sleeping_lock(sch),
+					NULL,
+					qdisc_root_sleeping_running(sch),
 					tca[TCA_RATE]);
 		if (err)
 			goto destroy_class;
@@ -663,7 +665,8 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 	xstats.weight = cl->agg->class_weight;
 	xstats.lmax = cl->agg->lmax;
 
-	if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
+	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+				  d, NULL, &cl->bstats) < 0 ||
 	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, NULL,
 				  &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
-- 
cgit v1.2.3


From 0c73c523cf737b5d446705392e0e14ee0411a351 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 7 Jun 2016 16:32:42 -0700
Subject: net: dsa: Initialize CPU port ethtool ops per tree

Now that we can properly support multiple distinct trees in the system,
using a global variable: dsa_cpu_port_ethtool_ops is getting clobbered
as soon as the second switch tree gets probed, and we don't want that.

We need to move this to be dynamically allocated, and since we can't
really be comparing addresses anymore to determine first time
initialization versus any other times, just move this to dsa.c and
dsa2.c where the remainder of the dst/ds initialization happens.

The operations teardown restores the master netdev's ethtool_ops to its
original ethtool_ops pointer (typically within the Ethernet driver)

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h  |  1 +
 net/dsa/dsa.c      | 41 +++++++++++++++++++++++++++++++++++++++++
 net/dsa/dsa2.c     |  6 ++++++
 net/dsa/dsa_priv.h |  2 ++
 net/dsa/slave.c    | 10 ----------
 5 files changed, 50 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index cca7ef230742..20b3087ad193 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -116,6 +116,7 @@ struct dsa_switch_tree {
 	 * Original copy of the master netdev ethtool_ops
 	 */
 	struct ethtool_ops	master_ethtool_ops;
+	const struct ethtool_ops *master_orig_ethtool_ops;
 
 	/*
 	 * The switch and port to which the CPU is attached.
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index ce3b942dce76..766d2a525ada 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -266,6 +266,41 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol)
 	return ops;
 }
 
+int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds)
+{
+	struct net_device *master;
+	struct ethtool_ops *cpu_ops;
+
+	master = ds->dst->master_netdev;
+	if (ds->master_netdev)
+		master = ds->master_netdev;
+
+	cpu_ops = devm_kzalloc(ds->dev, sizeof(*cpu_ops), GFP_KERNEL);
+	if (!cpu_ops)
+		return -ENOMEM;
+
+	memcpy(&ds->dst->master_ethtool_ops, master->ethtool_ops,
+	       sizeof(struct ethtool_ops));
+	ds->dst->master_orig_ethtool_ops = master->ethtool_ops;
+	memcpy(cpu_ops, &ds->dst->master_ethtool_ops,
+	       sizeof(struct ethtool_ops));
+	dsa_cpu_port_ethtool_init(cpu_ops);
+	master->ethtool_ops = cpu_ops;
+
+	return 0;
+}
+
+void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds)
+{
+	struct net_device *master;
+
+	master = ds->dst->master_netdev;
+	if (ds->master_netdev)
+		master = ds->master_netdev;
+
+	master->ethtool_ops = ds->dst->master_orig_ethtool_ops;
+}
+
 static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 {
 	struct dsa_switch_driver *drv = ds->drv;
@@ -379,6 +414,10 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 		ret = 0;
 	}
 
+	ret = dsa_cpu_port_ethtool_setup(ds);
+	if (ret)
+		return ret;
+
 #ifdef CONFIG_NET_DSA_HWMON
 	/* If the switch provides a temperature sensor,
 	 * register with hardware monitoring subsystem.
@@ -963,6 +1002,8 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)
 			dsa_switch_destroy(ds);
 	}
 
+	dsa_cpu_port_ethtool_restore(dst->ds[0]);
+
 	dev_put(dst->master_netdev);
 }
 
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 4e0f3c268103..83b95fc4cede 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -394,6 +394,10 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst)
 			return err;
 	}
 
+	err = dsa_cpu_port_ethtool_setup(dst->ds[0]);
+	if (err)
+		return err;
+
 	/* If we use a tagging format that doesn't have an ethertype
 	 * field, make sure that all packets from this point on get
 	 * sent to the tag format's receive function.
@@ -429,6 +433,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst)
 		dsa_ds_unapply(dst, ds);
 	}
 
+	dsa_cpu_port_ethtool_restore(dst->ds[0]);
+
 	pr_info("DSA: tree %d unapplied\n", dst->tree);
 	dst->applied = false;
 }
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 106a9f067f94..00077a9c97f4 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -54,6 +54,8 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
 		      struct device_node *port_dn, int port);
 void dsa_cpu_dsa_destroy(struct device_node *port_dn);
 const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
+int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds);
+void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds);
 
 /* slave.c */
 extern const struct dsa_device_ops notag_netdev_ops;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 8d159932e082..7236eb26dc97 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -892,8 +892,6 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.get_eee		= dsa_slave_get_eee,
 };
 
-static struct ethtool_ops dsa_cpu_port_ethtool_ops;
-
 static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_open	 	= dsa_slave_open,
 	.ndo_stop		= dsa_slave_close,
@@ -1126,14 +1124,6 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 
 	slave_dev->features = master->vlan_features;
 	slave_dev->ethtool_ops = &dsa_slave_ethtool_ops;
-	if (master->ethtool_ops != &dsa_cpu_port_ethtool_ops) {
-		memcpy(&dst->master_ethtool_ops, master->ethtool_ops,
-		       sizeof(struct ethtool_ops));
-		memcpy(&dsa_cpu_port_ethtool_ops, &dst->master_ethtool_ops,
-		       sizeof(struct ethtool_ops));
-		dsa_cpu_port_ethtool_init(&dsa_cpu_port_ethtool_ops);
-		master->ethtool_ops = &dsa_cpu_port_ethtool_ops;
-	}
 	eth_hw_addr_inherit(slave_dev, master);
 	slave_dev->priv_flags |= IFF_NO_QUEUE;
 	slave_dev->netdev_ops = &dsa_slave_netdev_ops;
-- 
cgit v1.2.3


From 96c63fa7393d0a346acfe5a91e0c7d4c7782641b Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Wed, 8 Jun 2016 10:55:39 -0700
Subject: net: Add l3mdev rule

Currently, VRFs require 1 oif and 1 iif rule per address family per
VRF. As the number of VRF devices increases it brings scalability
issues with the increasing rule list. All of the VRF rules have the
same format with the exception of the specific table id to direct the
lookup. Since the table id is available from the oif or iif in the
loopup, the VRF rules can be consolidated to a single rule that pulls
the table from the VRF device.

This patch introduces a new rule attribute l3mdev. The l3mdev rule
means the table id used for the lookup is pulled from the L3 master
device (e.g., VRF) rather than being statically defined. With the
l3mdev rule all of the basic VRF FIB rules are reduced to 1 l3mdev
rule per address family (IPv4 and IPv6).

If an admin wishes to insert higher priority rules for specific VRFs
those rules will co-exist with the l3mdev rule. This capability means
current VRF scripts will co-exist with this new simpler implementation.

Currently, the rules list for both ipv4 and ipv6 look like this:
    $ ip  ru ls
    1000:       from all oif vrf1 lookup 1001
    1000:       from all iif vrf1 lookup 1001
    1000:       from all oif vrf2 lookup 1002
    1000:       from all iif vrf2 lookup 1002
    1000:       from all oif vrf3 lookup 1003
    1000:       from all iif vrf3 lookup 1003
    1000:       from all oif vrf4 lookup 1004
    1000:       from all iif vrf4 lookup 1004
    1000:       from all oif vrf5 lookup 1005
    1000:       from all iif vrf5 lookup 1005
    1000:       from all oif vrf6 lookup 1006
    1000:       from all iif vrf6 lookup 1006
    1000:       from all oif vrf7 lookup 1007
    1000:       from all iif vrf7 lookup 1007
    1000:       from all oif vrf8 lookup 1008
    1000:       from all iif vrf8 lookup 1008
    ...
    32765:      from all lookup local
    32766:      from all lookup main
    32767:      from all lookup default

With the l3mdev rule the list is just the following regardless of the
number of VRFs:
    $ ip ru ls
    1000:       from all lookup [l3mdev table]
    32765:      from all lookup local
    32766:      from all lookup main
    32767:      from all lookup default

(Note: the above pretty print of the rule is based on an iproute2
       prototype. Actual verbage may change)

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_rules.h        | 24 ++++++++++++++++++++++--
 include/net/l3mdev.h           | 12 ++++++++++++
 include/uapi/linux/fib_rules.h |  1 +
 net/core/fib_rules.c           | 33 ++++++++++++++++++++++++++++-----
 net/ipv4/fib_rules.c           |  6 ++++--
 net/ipv6/fib6_rules.c          |  6 ++++--
 net/l3mdev/l3mdev.c            | 38 ++++++++++++++++++++++++++++++++++++++
 7 files changed, 109 insertions(+), 11 deletions(-)

(limited to 'include/net')

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 59160de702b6..456e4a6006ab 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -17,7 +17,8 @@ struct fib_rule {
 	u32			flags;
 	u32			table;
 	u8			action;
-	/* 3 bytes hole, try to use */
+	u8			l3mdev;
+	/* 2 bytes hole, try to use */
 	u32			target;
 	__be64			tun_id;
 	struct fib_rule __rcu	*ctarget;
@@ -36,6 +37,7 @@ struct fib_lookup_arg {
 	void			*lookup_ptr;
 	void			*result;
 	struct fib_rule		*rule;
+	u32			table;
 	int			flags;
 #define FIB_LOOKUP_NOREF		1
 #define FIB_LOOKUP_IGNORE_LINKSTATE	2
@@ -89,7 +91,8 @@ struct fib_rules_ops {
 	[FRA_TABLE]     = { .type = NLA_U32 }, \
 	[FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \
 	[FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \
-	[FRA_GOTO]	= { .type = NLA_U32 }
+	[FRA_GOTO]	= { .type = NLA_U32 }, \
+	[FRA_L3MDEV]	= { .type = NLA_U8 }
 
 static inline void fib_rule_get(struct fib_rule *rule)
 {
@@ -102,6 +105,20 @@ static inline void fib_rule_put(struct fib_rule *rule)
 		kfree_rcu(rule, rcu);
 }
 
+#ifdef CONFIG_NET_L3_MASTER_DEV
+static inline u32 fib_rule_get_table(struct fib_rule *rule,
+				     struct fib_lookup_arg *arg)
+{
+	return rule->l3mdev ? arg->table : rule->table;
+}
+#else
+static inline u32 fib_rule_get_table(struct fib_rule *rule,
+				     struct fib_lookup_arg *arg)
+{
+	return rule->table;
+}
+#endif
+
 static inline u32 frh_get_table(struct fib_rule_hdr *frh, struct nlattr **nla)
 {
 	if (nla[FRA_TABLE])
@@ -117,4 +134,7 @@ int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags,
 		     struct fib_lookup_arg *);
 int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table,
 			 u32 flags);
+
+int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh);
+int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh);
 #endif
diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
index 374388dc01c8..34f33eb96a5e 100644
--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -11,6 +11,8 @@
 #ifndef _NET_L3MDEV_H_
 #define _NET_L3MDEV_H_
 
+#include <net/fib_rules.h>
+
 /**
  * struct l3mdev_ops - l3mdev operations
  *
@@ -41,6 +43,9 @@ struct l3mdev_ops {
 
 #ifdef CONFIG_NET_L3_MASTER_DEV
 
+int l3mdev_fib_rule_match(struct net *net, struct flowi *fl,
+			  struct fib_lookup_arg *arg);
+
 int l3mdev_master_ifindex_rcu(const struct net_device *dev);
 static inline int l3mdev_master_ifindex(struct net_device *dev)
 {
@@ -236,6 +241,13 @@ struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
 {
 	return skb;
 }
+
+static inline
+int l3mdev_fib_rule_match(struct net *net, struct flowi *fl,
+			  struct fib_lookup_arg *arg)
+{
+	return 1;
+}
 #endif
 
 #endif /* _NET_L3MDEV_H_ */
diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h
index 620c8a5ddc00..14404b3ebb89 100644
--- a/include/uapi/linux/fib_rules.h
+++ b/include/uapi/linux/fib_rules.h
@@ -50,6 +50,7 @@ enum {
 	FRA_FWMASK,	/* mask for netfilter mark */
 	FRA_OIFNAME,
 	FRA_PAD,
+	FRA_L3MDEV,	/* iif or oif is l3mdev goto its table */
 	__FRA_MAX
 };
 
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 840acebbb80c..98298b11f534 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -173,7 +173,8 @@ void fib_rules_unregister(struct fib_rules_ops *ops)
 EXPORT_SYMBOL_GPL(fib_rules_unregister);
 
 static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
-			  struct flowi *fl, int flags)
+			  struct flowi *fl, int flags,
+			  struct fib_lookup_arg *arg)
 {
 	int ret = 0;
 
@@ -189,6 +190,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
 	if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id))
 		goto out;
 
+	if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg))
+		goto out;
+
 	ret = ops->match(rule, fl, flags);
 out:
 	return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
@@ -204,7 +208,7 @@ int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl,
 
 	list_for_each_entry_rcu(rule, &ops->rules_list, list) {
 jumped:
-		if (!fib_rule_match(rule, ops, fl, flags))
+		if (!fib_rule_match(rule, ops, fl, flags, arg))
 			continue;
 
 		if (rule->action == FR_ACT_GOTO) {
@@ -265,7 +269,7 @@ errout:
 	return err;
 }
 
-static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
+int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
 	struct fib_rule_hdr *frh = nlmsg_data(nlh);
@@ -336,6 +340,14 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 	if (tb[FRA_TUN_ID])
 		rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
 
+	if (tb[FRA_L3MDEV]) {
+#ifdef CONFIG_NET_L3_MASTER_DEV
+		rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]);
+		if (rule->l3mdev != 1)
+#endif
+			goto errout_free;
+	}
+
 	rule->action = frh->action;
 	rule->flags = frh->flags;
 	rule->table = frh_get_table(frh, tb);
@@ -371,6 +383,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 	} else if (rule->action == FR_ACT_GOTO)
 		goto errout_free;
 
+	if (rule->l3mdev && rule->table)
+		goto errout_free;
+
 	err = ops->configure(rule, skb, frh, tb);
 	if (err < 0)
 		goto errout_free;
@@ -424,8 +439,9 @@ errout:
 	rules_ops_put(ops);
 	return err;
 }
+EXPORT_SYMBOL_GPL(fib_nl_newrule);
 
-static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
+int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
 	struct fib_rule_hdr *frh = nlmsg_data(nlh);
@@ -483,6 +499,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 		    (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID])))
 			continue;
 
+		if (tb[FRA_L3MDEV] &&
+		    (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV])))
+			continue;
+
 		if (!ops->compare(rule, frh, tb))
 			continue;
 
@@ -536,6 +556,7 @@ errout:
 	rules_ops_put(ops);
 	return err;
 }
+EXPORT_SYMBOL_GPL(fib_nl_delrule);
 
 static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
 					 struct fib_rule *rule)
@@ -607,7 +628,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 	    (rule->target &&
 	     nla_put_u32(skb, FRA_GOTO, rule->target)) ||
 	    (rule->tun_id &&
-	     nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)))
+	     nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) ||
+	    (rule->l3mdev &&
+	     nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)))
 		goto nla_put_failure;
 
 	if (rule->suppress_ifgroup != -1) {
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index f2bda9e89c61..6e9ea69e5f75 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -76,6 +76,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
 {
 	int err = -EAGAIN;
 	struct fib_table *tbl;
+	u32 tb_id;
 
 	switch (rule->action) {
 	case FR_ACT_TO_TBL:
@@ -94,7 +95,8 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
 
 	rcu_read_lock();
 
-	tbl = fib_get_table(rule->fr_net, rule->table);
+	tb_id = fib_rule_get_table(rule, arg);
+	tbl = fib_get_table(rule->fr_net, tb_id);
 	if (tbl)
 		err = fib_table_lookup(tbl, &flp->u.ip4,
 				       (struct fib_result *)arg->result,
@@ -180,7 +182,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 	if (err)
 		goto errout;
 
-	if (rule->table == RT_TABLE_UNSPEC) {
+	if (rule->table == RT_TABLE_UNSPEC && !rule->l3mdev) {
 		if (rule->action == FR_ACT_TO_TBL) {
 			struct fib_table *table;
 
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index ed33abf57abd..5857c1fc8b67 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -67,6 +67,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 	struct net *net = rule->fr_net;
 	pol_lookup_t lookup = arg->lookup_ptr;
 	int err = 0;
+	u32 tb_id;
 
 	switch (rule->action) {
 	case FR_ACT_TO_TBL:
@@ -86,7 +87,8 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 		goto discard_pkt;
 	}
 
-	table = fib6_get_table(net, rule->table);
+	tb_id = fib_rule_get_table(rule, arg);
+	table = fib6_get_table(net, tb_id);
 	if (!table) {
 		err = -EAGAIN;
 		goto out;
@@ -199,7 +201,7 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 	struct net *net = sock_net(skb->sk);
 	struct fib6_rule *rule6 = (struct fib6_rule *) rule;
 
-	if (rule->action == FR_ACT_TO_TBL) {
+	if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
 		if (rule->table == RT6_TABLE_UNSPEC)
 			goto errout;
 
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index 6651a78e100c..7da97809a7e8 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/netdevice.h>
+#include <net/fib_rules.h>
 #include <net/l3mdev.h>
 
 /**
@@ -160,3 +161,40 @@ int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4)
 	return rc;
 }
 EXPORT_SYMBOL_GPL(l3mdev_get_saddr);
+
+/**
+ *	l3mdev_fib_rule_match - Determine if flowi references an
+ *				L3 master device
+ *	@net: network namespace for device index lookup
+ *	@fl:  flow struct
+ */
+
+int l3mdev_fib_rule_match(struct net *net, struct flowi *fl,
+			  struct fib_lookup_arg *arg)
+{
+	struct net_device *dev;
+	int rc = 0;
+
+	rcu_read_lock();
+
+	dev = dev_get_by_index_rcu(net, fl->flowi_oif);
+	if (dev && netif_is_l3_master(dev) &&
+	    dev->l3mdev_ops->l3mdev_fib_table) {
+		arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev);
+		rc = 1;
+		goto out;
+	}
+
+	dev = dev_get_by_index_rcu(net, fl->flowi_iif);
+	if (dev && netif_is_l3_master(dev) &&
+	    dev->l3mdev_ops->l3mdev_fib_table) {
+		arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev);
+		rc = 1;
+		goto out;
+	}
+
+out:
+	rcu_read_unlock();
+
+	return rc;
+}
-- 
cgit v1.2.3


From dd47c1fa776cda48531b651c88341e951140b0a7 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 9 Jun 2016 00:27:40 +0200
Subject: cbq: remove TCA_CBQ_POLICE support

iproute2 doesn't implement any cbq option that results in this attribute
being sent to kernel.

To make use of it, user would have to

- patch iproute2
- add a class
- attach a qdisc to the class (default pfifo doesn't work as
  q->handle is 0 and cbq_set_police() is a no-op in this case)
- re-'add' the same class (tc class change ...) again
- user must also specifiy a defmap (e.g. 'split 1:0 defmap 3f'), since
  this 'police' feature relies on its presence
- the added qdisc must be one of bfifo, pfifo or netem

If all of these conditions are met and _some_ leaf qdiscs, namely
p/bfifo, netem, plug or tbf would drop a packet, kernel calls back into
cbq, which will attempt to re-queue the skb into a different class
as indicated by the parents' defmap entry for TC_PRIO_BESTEFFORT.

[ i.e. we behave as if tc_classify returned TC_ACT_RECLASSIFY ].

This feature, which isn't documented or implemented in iproute2,
and isn't implemented consistently (most qdiscs like sfq, codel, etc
drop right away instead of attempting this reclassification) is the
sole reason for the reshape_fail and __parent member in Qdisc struct.

So remove TCA_CBQ_POLICE support from the kernel, reject it via EOPNOTSUPP
so userspace knows we don't support it, and then remove no-longer needed
infrastructure in followup commit.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h |  4 --
 net/sched/sch_cbq.c       | 95 +----------------------------------------------
 2 files changed, 1 insertion(+), 98 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index c4f5749342ec..c069ac1dd75d 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -68,10 +68,6 @@ struct Qdisc {
 
 	void			*u32_node;
 
-	/* This field is deprecated, but it is still used by CBQ
-	 * and it will live until better solution will be invented.
-	 */
-	struct Qdisc		*__parent;
 	struct netdev_queue	*dev_queue;
 
 	struct gnet_stats_rate_est64	rate_est;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index fdca45e34230..7f4d6c5a0efe 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -80,9 +80,6 @@ struct cbq_class {
 	unsigned char		priority;	/* class priority */
 	unsigned char		priority2;	/* priority to be used after overlimit */
 	unsigned char		ewma_log;	/* time constant for idle time calculation */
-#ifdef CONFIG_NET_CLS_ACT
-	unsigned char		police;
-#endif
 
 	u32			defmap;
 
@@ -377,9 +374,6 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		return ret;
 	}
 
-#ifdef CONFIG_NET_CLS_ACT
-	cl->q->__parent = sch;
-#endif
 	ret = qdisc_enqueue(skb, cl->q);
 	if (ret == NET_XMIT_SUCCESS) {
 		sch->q.qlen++;
@@ -524,40 +518,6 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-#ifdef CONFIG_NET_CLS_ACT
-static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
-{
-	struct Qdisc *sch = child->__parent;
-	struct cbq_sched_data *q = qdisc_priv(sch);
-	struct cbq_class *cl = q->rx_class;
-
-	q->rx_class = NULL;
-
-	if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
-		int ret;
-
-		cbq_mark_toplevel(q, cl);
-
-		q->rx_class = cl;
-		cl->q->__parent = sch;
-
-		ret = qdisc_enqueue(skb, cl->q);
-		if (ret == NET_XMIT_SUCCESS) {
-			sch->q.qlen++;
-			if (!cl->next_alive)
-				cbq_activate_class(cl);
-			return 0;
-		}
-		if (net_xmit_drop_count(ret))
-			qdisc_qstats_drop(sch);
-		return 0;
-	}
-
-	qdisc_qstats_drop(sch);
-	return -1;
-}
-#endif
-
 /*
  * It is mission critical procedure.
  *
@@ -1179,21 +1139,6 @@ static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
 	return 0;
 }
 
-#ifdef CONFIG_NET_CLS_ACT
-static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
-{
-	cl->police = p->police;
-
-	if (cl->q->handle) {
-		if (p->police == TC_POLICE_RECLASSIFY)
-			cl->q->reshape_fail = cbq_reshape_fail;
-		else
-			cl->q->reshape_fail = NULL;
-	}
-	return 0;
-}
-#endif
-
 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
 {
 	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
@@ -1350,35 +1295,11 @@ nla_put_failure:
 	return -1;
 }
 
-#ifdef CONFIG_NET_CLS_ACT
-static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
-{
-	unsigned char *b = skb_tail_pointer(skb);
-	struct tc_cbq_police opt;
-
-	if (cl->police) {
-		opt.police = cl->police;
-		opt.__res1 = 0;
-		opt.__res2 = 0;
-		if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt))
-			goto nla_put_failure;
-	}
-	return skb->len;
-
-nla_put_failure:
-	nlmsg_trim(skb, b);
-	return -1;
-}
-#endif
-
 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
 {
 	if (cbq_dump_lss(skb, cl) < 0 ||
 	    cbq_dump_rate(skb, cl) < 0 ||
 	    cbq_dump_wrr(skb, cl) < 0 ||
-#ifdef CONFIG_NET_CLS_ACT
-	    cbq_dump_police(skb, cl) < 0 ||
-#endif
 	    cbq_dump_fopt(skb, cl) < 0)
 		return -1;
 	return 0;
@@ -1468,11 +1389,6 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 					&pfifo_qdisc_ops, cl->common.classid);
 		if (new == NULL)
 			return -ENOBUFS;
-	} else {
-#ifdef CONFIG_NET_CLS_ACT
-		if (cl->police == TC_POLICE_RECLASSIFY)
-			new->reshape_fail = cbq_reshape_fail;
-#endif
 	}
 
 	*old = qdisc_replace(sch, new, &cl->q);
@@ -1585,7 +1501,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 	if (err < 0)
 		return err;
 
-	if (tb[TCA_CBQ_OVL_STRATEGY])
+	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE])
 		return -EOPNOTSUPP;
 
 	if (cl) {
@@ -1636,11 +1552,6 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
 		}
 
-#ifdef CONFIG_NET_CLS_ACT
-		if (tb[TCA_CBQ_POLICE])
-			cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
-#endif
-
 		if (tb[TCA_CBQ_FOPT])
 			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
 
@@ -1736,10 +1647,6 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 		cl->maxidle = q->link.maxidle;
 	if (cl->avpkt == 0)
 		cl->avpkt = q->link.avpkt;
-#ifdef CONFIG_NET_CLS_ACT
-	if (tb[TCA_CBQ_POLICE])
-		cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
-#endif
 	if (tb[TCA_CBQ_FOPT])
 		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
 	sch_tree_unlock(sch);
-- 
cgit v1.2.3


From c3a173d7dba2d7c74dd4ab871b8f22bf56ac10b2 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 9 Jun 2016 00:27:41 +0200
Subject: sched: remove qdisc_rehape_fail

After the removal of TCA_CBQ_POLICE in cbq scheduler qdisc->reshape_fail
is always NULL, i.e. qdisc_rehape_fail is now the same as qdisc_drop.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 19 -------------------
 net/sched/sch_fifo.c      |  4 ++--
 net/sched/sch_netem.c     |  4 ++--
 net/sched/sch_plug.c      |  2 +-
 net/sched/sch_tbf.c       |  4 ++--
 5 files changed, 7 insertions(+), 26 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index c069ac1dd75d..a9aec633d467 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -63,9 +63,6 @@ struct Qdisc {
 	struct list_head	list;
 	u32			handle;
 	u32			parent;
-	int			(*reshape_fail)(struct sk_buff *skb,
-					struct Qdisc *q);
-
 	void			*u32_node;
 
 	struct netdev_queue	*dev_queue;
@@ -771,22 +768,6 @@ static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
 	return NET_XMIT_DROP;
 }
 
-static inline int qdisc_reshape_fail(struct sk_buff *skb, struct Qdisc *sch)
-{
-	qdisc_qstats_drop(sch);
-
-#ifdef CONFIG_NET_CLS_ACT
-	if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch))
-		goto drop;
-
-	return NET_XMIT_SUCCESS;
-
-drop:
-#endif
-	kfree_skb(skb);
-	return NET_XMIT_DROP;
-}
-
 /* Length to Time (L2T) lookup in a qdisc_rate_table, to determine how
    long it will take to send a packet given its size.
  */
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 2177eac0a61e..7857f748a6e4 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -24,7 +24,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit))
 		return qdisc_enqueue_tail(skb, sch);
 
-	return qdisc_reshape_fail(skb, sch);
+	return qdisc_drop(skb, sch);
 }
 
 static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
@@ -32,7 +32,7 @@ static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (likely(skb_queue_len(&sch->q) < sch->limit))
 		return qdisc_enqueue_tail(skb, sch);
 
-	return qdisc_reshape_fail(skb, sch);
+	return qdisc_drop(skb, sch);
 }
 
 static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 205bed00dd34..31984c708382 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -407,7 +407,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch)
 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 
 	if (IS_ERR_OR_NULL(segs)) {
-		qdisc_reshape_fail(skb, sch);
+		qdisc_drop(skb, sch);
 		return NULL;
 	}
 	consume_skb(skb);
@@ -499,7 +499,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	}
 
 	if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
-		return qdisc_reshape_fail(skb, sch);
+		return qdisc_drop(skb, sch);
 
 	qdisc_qstats_backlog_inc(sch, skb);
 
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
index 5abfe44678d4..ff0d968750df 100644
--- a/net/sched/sch_plug.c
+++ b/net/sched/sch_plug.c
@@ -96,7 +96,7 @@ static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		return qdisc_enqueue_tail(skb, sch);
 	}
 
-	return qdisc_reshape_fail(skb, sch);
+	return qdisc_drop(skb, sch);
 }
 
 static struct sk_buff *plug_dequeue(struct Qdisc *sch)
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 83b90b584fae..801148c8b6ac 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -166,7 +166,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 
 	if (IS_ERR_OR_NULL(segs))
-		return qdisc_reshape_fail(skb, sch);
+		return qdisc_drop(skb, sch);
 
 	nb = 0;
 	while (segs) {
@@ -198,7 +198,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (qdisc_pkt_len(skb) > q->max_size) {
 		if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size)
 			return tbf_segment(skb, sch);
-		return qdisc_reshape_fail(skb, sch);
+		return qdisc_drop(skb, sch);
 	}
 	ret = qdisc_enqueue(skb, q->qdisc);
 	if (ret != NET_XMIT_SUCCESS) {
-- 
cgit v1.2.3


From a09ceb0e08140a1eec05b49b4c232d3481339cb0 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 9 Jun 2016 00:27:42 +0200
Subject: sched: remove qdisc->drop

after removal of TCA_CBQ_OVL_STRATEGY from cbq scheduler, there are no
more callers of ->drop() outside of other ->drop functions, i.e.
nothing calls them.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 36 ------------------------------------
 net/sched/sch_atm.c       | 15 ---------------
 net/sched/sch_cbq.c       | 26 --------------------------
 net/sched/sch_choke.c     | 17 -----------------
 net/sched/sch_drr.c       | 21 ---------------------
 net/sched/sch_dsmark.c    | 18 ------------------
 net/sched/sch_fifo.c      |  3 ---
 net/sched/sch_fq_codel.c  | 10 ----------
 net/sched/sch_gred.c      | 35 -----------------------------------
 net/sched/sch_hfsc.c      | 26 --------------------------
 net/sched/sch_hhf.c       | 10 ----------
 net/sched/sch_htb.c       | 26 --------------------------
 net/sched/sch_multiq.c    | 22 ----------------------
 net/sched/sch_netem.c     | 30 ------------------------------
 net/sched/sch_prio.c      | 19 -------------------
 net/sched/sch_qfq.c       | 47 -----------------------------------------------
 net/sched/sch_red.c       | 20 --------------------
 net/sched/sch_sfq.c       |  1 -
 net/sched/sch_tbf.c       | 13 -------------
 19 files changed, 395 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index a9aec633d467..4dedb7f12ed5 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -174,7 +174,6 @@ struct Qdisc_ops {
 	int 			(*enqueue)(struct sk_buff *, struct Qdisc *);
 	struct sk_buff *	(*dequeue)(struct Qdisc *);
 	struct sk_buff *	(*peek)(struct Qdisc *);
-	unsigned int		(*drop)(struct Qdisc *);
 
 	int			(*init)(struct Qdisc *, struct nlattr *arg);
 	void			(*reset)(struct Qdisc *);
@@ -658,22 +657,6 @@ static inline unsigned int qdisc_queue_drop_head(struct Qdisc *sch)
 	return __qdisc_queue_drop_head(sch, &sch->q);
 }
 
-static inline struct sk_buff *__qdisc_dequeue_tail(struct Qdisc *sch,
-						   struct sk_buff_head *list)
-{
-	struct sk_buff *skb = __skb_dequeue_tail(list);
-
-	if (likely(skb != NULL))
-		qdisc_qstats_backlog_dec(sch, skb);
-
-	return skb;
-}
-
-static inline struct sk_buff *qdisc_dequeue_tail(struct Qdisc *sch)
-{
-	return __qdisc_dequeue_tail(sch, &sch->q);
-}
-
 static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch)
 {
 	return skb_peek(&sch->q);
@@ -741,25 +724,6 @@ static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new,
 	return old;
 }
 
-static inline unsigned int __qdisc_queue_drop(struct Qdisc *sch,
-					      struct sk_buff_head *list)
-{
-	struct sk_buff *skb = __qdisc_dequeue_tail(sch, list);
-
-	if (likely(skb != NULL)) {
-		unsigned int len = qdisc_pkt_len(skb);
-		kfree_skb(skb);
-		return len;
-	}
-
-	return 0;
-}
-
-static inline unsigned int qdisc_queue_drop(struct Qdisc *sch)
-{
-	return __qdisc_queue_drop(sch, &sch->q);
-}
-
 static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
 {
 	kfree_skb(skb);
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 34f8f79e56d5..7e6c12dfc66a 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -519,20 +519,6 @@ static struct sk_buff *atm_tc_peek(struct Qdisc *sch)
 	return p->link.q->ops->peek(p->link.q);
 }
 
-static unsigned int atm_tc_drop(struct Qdisc *sch)
-{
-	struct atm_qdisc_data *p = qdisc_priv(sch);
-	struct atm_flow_data *flow;
-	unsigned int len;
-
-	pr_debug("atm_tc_drop(sch %p,[qdisc %p])\n", sch, p);
-	list_for_each_entry(flow, &p->flows, list) {
-		if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q)))
-			return len;
-	}
-	return 0;
-}
-
 static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct atm_qdisc_data *p = qdisc_priv(sch);
@@ -672,7 +658,6 @@ static struct Qdisc_ops atm_qdisc_ops __read_mostly = {
 	.enqueue	= atm_tc_enqueue,
 	.dequeue	= atm_tc_dequeue,
 	.peek		= atm_tc_peek,
-	.drop		= atm_tc_drop,
 	.init		= atm_tc_init,
 	.reset		= atm_tc_reset,
 	.destroy	= atm_tc_destroy,
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 7f4d6c5a0efe..f2af31be6370 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1025,31 +1025,6 @@ static void cbq_link_class(struct cbq_class *this)
 	}
 }
 
-static unsigned int cbq_drop(struct Qdisc *sch)
-{
-	struct cbq_sched_data *q = qdisc_priv(sch);
-	struct cbq_class *cl, *cl_head;
-	int prio;
-	unsigned int len;
-
-	for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
-		cl_head = q->active[prio];
-		if (!cl_head)
-			continue;
-
-		cl = cl_head;
-		do {
-			if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
-				sch->q.qlen--;
-				if (!cl->q->q.qlen)
-					cbq_deactivate_class(cl);
-				return len;
-			}
-		} while ((cl = cl->next_alive) != cl_head);
-	}
-	return 0;
-}
-
 static void
 cbq_reset(struct Qdisc *sch)
 {
@@ -1791,7 +1766,6 @@ static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
 	.enqueue	=	cbq_enqueue,
 	.dequeue	=	cbq_dequeue,
 	.peek		=	qdisc_peek_dequeued,
-	.drop		=	cbq_drop,
 	.init		=	cbq_init,
 	.reset		=	cbq_reset,
 	.destroy	=	cbq_destroy,
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 0a08c860eee4..04e0b0583e00 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -365,22 +365,6 @@ static struct sk_buff *choke_dequeue(struct Qdisc *sch)
 	return skb;
 }
 
-static unsigned int choke_drop(struct Qdisc *sch)
-{
-	struct choke_sched_data *q = qdisc_priv(sch);
-	unsigned int len;
-
-	len = qdisc_queue_drop(sch);
-	if (len > 0)
-		q->stats.other++;
-	else {
-		if (!red_is_idling(&q->vars))
-			red_start_of_idle_period(&q->vars);
-	}
-
-	return len;
-}
-
 static void choke_reset(struct Qdisc *sch)
 {
 	struct choke_sched_data *q = qdisc_priv(sch);
@@ -569,7 +553,6 @@ static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
 	.enqueue	=	choke_enqueue,
 	.dequeue	=	choke_dequeue,
 	.peek		=	choke_peek_head,
-	.drop		=	choke_drop,
 	.init		=	choke_init,
 	.destroy	=	choke_destroy,
 	.reset		=	choke_reset,
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 1b7e1a27773d..2bec94cf697c 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -421,26 +421,6 @@ out:
 	return NULL;
 }
 
-static unsigned int drr_drop(struct Qdisc *sch)
-{
-	struct drr_sched *q = qdisc_priv(sch);
-	struct drr_class *cl;
-	unsigned int len;
-
-	list_for_each_entry(cl, &q->active, alist) {
-		if (cl->qdisc->ops->drop) {
-			len = cl->qdisc->ops->drop(cl->qdisc);
-			if (len > 0) {
-				sch->q.qlen--;
-				if (cl->qdisc->q.qlen == 0)
-					list_del(&cl->alist);
-				return len;
-			}
-		}
-	}
-	return 0;
-}
-
 static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct drr_sched *q = qdisc_priv(sch);
@@ -509,7 +489,6 @@ static struct Qdisc_ops drr_qdisc_ops __read_mostly = {
 	.enqueue	= drr_enqueue,
 	.dequeue	= drr_dequeue,
 	.peek		= qdisc_peek_dequeued,
-	.drop		= drr_drop,
 	.init		= drr_init_qdisc,
 	.reset		= drr_reset_qdisc,
 	.destroy	= drr_destroy_qdisc,
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 34b4ddaca27c..b9ba5f658528 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -320,23 +320,6 @@ static struct sk_buff *dsmark_peek(struct Qdisc *sch)
 	return p->q->ops->peek(p->q);
 }
 
-static unsigned int dsmark_drop(struct Qdisc *sch)
-{
-	struct dsmark_qdisc_data *p = qdisc_priv(sch);
-	unsigned int len;
-
-	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
-
-	if (p->q->ops->drop == NULL)
-		return 0;
-
-	len = p->q->ops->drop(p->q);
-	if (len)
-		sch->q.qlen--;
-
-	return len;
-}
-
 static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct dsmark_qdisc_data *p = qdisc_priv(sch);
@@ -489,7 +472,6 @@ static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = {
 	.enqueue	=	dsmark_enqueue,
 	.dequeue	=	dsmark_dequeue,
 	.peek		=	dsmark_peek,
-	.drop		=	dsmark_drop,
 	.init		=	dsmark_init,
 	.reset		=	dsmark_reset,
 	.destroy	=	dsmark_destroy,
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 7857f748a6e4..dea70e3ef0ba 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -99,7 +99,6 @@ struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
 	.enqueue	=	pfifo_enqueue,
 	.dequeue	=	qdisc_dequeue_head,
 	.peek		=	qdisc_peek_head,
-	.drop		=	qdisc_queue_drop,
 	.init		=	fifo_init,
 	.reset		=	qdisc_reset_queue,
 	.change		=	fifo_init,
@@ -114,7 +113,6 @@ struct Qdisc_ops bfifo_qdisc_ops __read_mostly = {
 	.enqueue	=	bfifo_enqueue,
 	.dequeue	=	qdisc_dequeue_head,
 	.peek		=	qdisc_peek_head,
-	.drop		=	qdisc_queue_drop,
 	.init		=	fifo_init,
 	.reset		=	qdisc_reset_queue,
 	.change		=	fifo_init,
@@ -129,7 +127,6 @@ struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = {
 	.enqueue	=	pfifo_tail_enqueue,
 	.dequeue	=	qdisc_dequeue_head,
 	.peek		=	qdisc_peek_head,
-	.drop		=	qdisc_queue_drop_head,
 	.init		=	fifo_init,
 	.reset		=	qdisc_reset_queue,
 	.change		=	fifo_init,
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 1daa54237f4e..176a7e2561d7 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -184,15 +184,6 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets)
 	return idx;
 }
 
-static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch)
-{
-	unsigned int prev_backlog;
-
-	prev_backlog = sch->qstats.backlog;
-	fq_codel_drop(sch, 1U);
-	return prev_backlog - sch->qstats.backlog;
-}
-
 static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
@@ -704,7 +695,6 @@ static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
 	.enqueue	=	fq_codel_enqueue,
 	.dequeue	=	fq_codel_dequeue,
 	.peek		=	qdisc_peek_dequeued,
-	.drop		=	fq_codel_qdisc_drop,
 	.init		=	fq_codel_init,
 	.reset		=	fq_codel_reset,
 	.destroy	=	fq_codel_destroy,
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 80105109f756..b5fb63c7be02 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -276,40 +276,6 @@ static struct sk_buff *gred_dequeue(struct Qdisc *sch)
 	return NULL;
 }
 
-static unsigned int gred_drop(struct Qdisc *sch)
-{
-	struct sk_buff *skb;
-	struct gred_sched *t = qdisc_priv(sch);
-
-	skb = qdisc_dequeue_tail(sch);
-	if (skb) {
-		unsigned int len = qdisc_pkt_len(skb);
-		struct gred_sched_data *q;
-		u16 dp = tc_index_to_dp(skb);
-
-		if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
-			net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x while dropping, screwing up backlog\n",
-					     tc_index_to_dp(skb));
-		} else {
-			q->backlog -= len;
-			q->stats.other++;
-
-			if (gred_wred_mode(t)) {
-				if (!sch->qstats.backlog)
-					red_start_of_idle_period(&t->wred_set);
-			} else {
-				if (!q->backlog)
-					red_start_of_idle_period(&q->vars);
-			}
-		}
-
-		qdisc_drop(skb, sch);
-		return len;
-	}
-
-	return 0;
-}
-
 static void gred_reset(struct Qdisc *sch)
 {
 	int i;
@@ -623,7 +589,6 @@ static struct Qdisc_ops gred_qdisc_ops __read_mostly = {
 	.enqueue	=	gred_enqueue,
 	.dequeue	=	gred_dequeue,
 	.peek		=	qdisc_peek_head,
-	.drop		=	gred_drop,
 	.init		=	gred_init,
 	.reset		=	gred_reset,
 	.destroy	=	gred_destroy,
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 74813dd49053..155e3ce81270 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1677,31 +1677,6 @@ hfsc_dequeue(struct Qdisc *sch)
 	return skb;
 }
 
-static unsigned int
-hfsc_drop(struct Qdisc *sch)
-{
-	struct hfsc_sched *q = qdisc_priv(sch);
-	struct hfsc_class *cl;
-	unsigned int len;
-
-	list_for_each_entry(cl, &q->droplist, dlist) {
-		if (cl->qdisc->ops->drop != NULL &&
-		    (len = cl->qdisc->ops->drop(cl->qdisc)) > 0) {
-			if (cl->qdisc->q.qlen == 0) {
-				update_vf(cl, 0, 0);
-				set_passive(cl);
-			} else {
-				list_move_tail(&cl->dlist, &q->droplist);
-			}
-			cl->qstats.drops++;
-			qdisc_qstats_drop(sch);
-			sch->q.qlen--;
-			return len;
-		}
-	}
-	return 0;
-}
-
 static const struct Qdisc_class_ops hfsc_class_ops = {
 	.change		= hfsc_change_class,
 	.delete		= hfsc_delete_class,
@@ -1728,7 +1703,6 @@ static struct Qdisc_ops hfsc_qdisc_ops __read_mostly = {
 	.enqueue	= hfsc_enqueue,
 	.dequeue	= hfsc_dequeue,
 	.peek		= qdisc_peek_dequeued,
-	.drop		= hfsc_drop,
 	.cl_ops		= &hfsc_class_ops,
 	.priv_size	= sizeof(struct hfsc_sched),
 	.owner		= THIS_MODULE
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index 13d6f83ec491..c51791848a38 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -368,15 +368,6 @@ static unsigned int hhf_drop(struct Qdisc *sch)
 	return bucket - q->buckets;
 }
 
-static unsigned int hhf_qdisc_drop(struct Qdisc *sch)
-{
-	unsigned int prev_backlog;
-
-	prev_backlog = sch->qstats.backlog;
-	hhf_drop(sch);
-	return prev_backlog - sch->qstats.backlog;
-}
-
 static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct hhf_sched_data *q = qdisc_priv(sch);
@@ -709,7 +700,6 @@ static struct Qdisc_ops hhf_qdisc_ops __read_mostly = {
 	.enqueue	=	hhf_enqueue,
 	.dequeue	=	hhf_dequeue,
 	.peek		=	qdisc_peek_dequeued,
-	.drop		=	hhf_qdisc_drop,
 	.init		=	hhf_init,
 	.reset		=	hhf_reset,
 	.destroy	=	hhf_destroy,
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 2b057649f24b..b74d06668ab4 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -936,31 +936,6 @@ fin:
 	return skb;
 }
 
-/* try to drop from each class (by prio) until one succeed */
-static unsigned int htb_drop(struct Qdisc *sch)
-{
-	struct htb_sched *q = qdisc_priv(sch);
-	int prio;
-
-	for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
-		struct list_head *p;
-		list_for_each(p, q->drops + prio) {
-			struct htb_class *cl = list_entry(p, struct htb_class,
-							  un.leaf.drop_list);
-			unsigned int len;
-			if (cl->un.leaf.q->ops->drop &&
-			    (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
-				sch->qstats.backlog -= len;
-				sch->q.qlen--;
-				if (!cl->un.leaf.q->q.qlen)
-					htb_deactivate(q, cl);
-				return len;
-			}
-		}
-	}
-	return 0;
-}
-
 /* reset all classes */
 /* always caled under BH & queue lock */
 static void htb_reset(struct Qdisc *sch)
@@ -1600,7 +1575,6 @@ static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
 	.enqueue	=	htb_enqueue,
 	.dequeue	=	htb_dequeue,
 	.peek		=	qdisc_peek_dequeued,
-	.drop		=	htb_drop,
 	.init		=	htb_init,
 	.reset		=	htb_reset,
 	.destroy	=	htb_destroy,
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 21e69d2e8347..5ea93305d705 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -151,27 +151,6 @@ static struct sk_buff *multiq_peek(struct Qdisc *sch)
 
 }
 
-static unsigned int multiq_drop(struct Qdisc *sch)
-{
-	struct multiq_sched_data *q = qdisc_priv(sch);
-	int band;
-	unsigned int len;
-	struct Qdisc *qdisc;
-
-	for (band = q->bands - 1; band >= 0; band--) {
-		qdisc = q->queues[band];
-		if (qdisc->ops->drop) {
-			len = qdisc->ops->drop(qdisc);
-			if (len != 0) {
-				sch->q.qlen--;
-				return len;
-			}
-		}
-	}
-	return 0;
-}
-
-
 static void
 multiq_reset(struct Qdisc *sch)
 {
@@ -416,7 +395,6 @@ static struct Qdisc_ops multiq_qdisc_ops __read_mostly = {
 	.enqueue	=	multiq_enqueue,
 	.dequeue	=	multiq_dequeue,
 	.peek		=	multiq_peek,
-	.drop		=	multiq_drop,
 	.init		=	multiq_init,
 	.reset		=	multiq_reset,
 	.destroy	=	multiq_destroy,
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 31984c708382..9ca7947ab643 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -576,35 +576,6 @@ finish_segs:
 	return NET_XMIT_SUCCESS;
 }
 
-static unsigned int netem_drop(struct Qdisc *sch)
-{
-	struct netem_sched_data *q = qdisc_priv(sch);
-	unsigned int len;
-
-	len = qdisc_queue_drop(sch);
-
-	if (!len) {
-		struct rb_node *p = rb_first(&q->t_root);
-
-		if (p) {
-			struct sk_buff *skb = netem_rb_to_skb(p);
-
-			rb_erase(p, &q->t_root);
-			sch->q.qlen--;
-			skb->next = NULL;
-			skb->prev = NULL;
-			qdisc_qstats_backlog_dec(sch, skb);
-			kfree_skb(skb);
-		}
-	}
-	if (!len && q->qdisc && q->qdisc->ops->drop)
-	    len = q->qdisc->ops->drop(q->qdisc);
-	if (len)
-		qdisc_qstats_drop(sch);
-
-	return len;
-}
-
 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
@@ -1143,7 +1114,6 @@ static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
 	.enqueue	=	netem_enqueue,
 	.dequeue	=	netem_dequeue,
 	.peek		=	qdisc_peek_dequeued,
-	.drop		=	netem_drop,
 	.init		=	netem_init,
 	.reset		=	netem_reset,
 	.destroy	=	netem_destroy,
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 06eca7060683..4c77780b55c3 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -125,24 +125,6 @@ static struct sk_buff *prio_dequeue(struct Qdisc *sch)
 
 }
 
-static unsigned int prio_drop(struct Qdisc *sch)
-{
-	struct prio_sched_data *q = qdisc_priv(sch);
-	int prio;
-	unsigned int len;
-	struct Qdisc *qdisc;
-
-	for (prio = q->bands-1; prio >= 0; prio--) {
-		qdisc = q->queues[prio];
-		if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) {
-			sch->q.qlen--;
-			return len;
-		}
-	}
-	return 0;
-}
-
-
 static void
 prio_reset(struct Qdisc *sch)
 {
@@ -379,7 +361,6 @@ static struct Qdisc_ops prio_qdisc_ops __read_mostly = {
 	.enqueue	=	prio_enqueue,
 	.dequeue	=	prio_dequeue,
 	.peek		=	prio_peek,
-	.drop		=	prio_drop,
 	.init		=	prio_init,
 	.reset		=	prio_reset,
 	.destroy	=	prio_destroy,
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 85d41979d825..cdb3966572f9 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1423,52 +1423,6 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
 		qfq_deactivate_class(q, cl);
 }
 
-static unsigned int qfq_drop_from_slot(struct qfq_sched *q,
-				       struct hlist_head *slot)
-{
-	struct qfq_aggregate *agg;
-	struct qfq_class *cl;
-	unsigned int len;
-
-	hlist_for_each_entry(agg, slot, next) {
-		list_for_each_entry(cl, &agg->active, alist) {
-
-			if (!cl->qdisc->ops->drop)
-				continue;
-
-			len = cl->qdisc->ops->drop(cl->qdisc);
-			if (len > 0) {
-				if (cl->qdisc->q.qlen == 0)
-					qfq_deactivate_class(q, cl);
-
-				return len;
-			}
-		}
-	}
-	return 0;
-}
-
-static unsigned int qfq_drop(struct Qdisc *sch)
-{
-	struct qfq_sched *q = qdisc_priv(sch);
-	struct qfq_group *grp;
-	unsigned int i, j, len;
-
-	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
-		grp = &q->groups[i];
-		for (j = 0; j < QFQ_MAX_SLOTS; j++) {
-			len = qfq_drop_from_slot(q, &grp->slots[j]);
-			if (len > 0) {
-				sch->q.qlen--;
-				return len;
-			}
-		}
-
-	}
-
-	return 0;
-}
-
 static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
@@ -1563,7 +1517,6 @@ static struct Qdisc_ops qfq_qdisc_ops __read_mostly = {
 	.enqueue	= qfq_enqueue,
 	.dequeue	= qfq_dequeue,
 	.peek		= qdisc_peek_dequeued,
-	.drop		= qfq_drop,
 	.init		= qfq_init_qdisc,
 	.reset		= qfq_reset_qdisc,
 	.destroy	= qfq_destroy_qdisc,
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 8c0508c0e287..235942f3464e 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -134,25 +134,6 @@ static struct sk_buff *red_peek(struct Qdisc *sch)
 	return child->ops->peek(child);
 }
 
-static unsigned int red_drop(struct Qdisc *sch)
-{
-	struct red_sched_data *q = qdisc_priv(sch);
-	struct Qdisc *child = q->qdisc;
-	unsigned int len;
-
-	if (child->ops->drop && (len = child->ops->drop(child)) > 0) {
-		q->stats.other++;
-		qdisc_qstats_drop(sch);
-		sch->q.qlen--;
-		return len;
-	}
-
-	if (!red_is_idling(&q->vars))
-		red_start_of_idle_period(&q->vars);
-
-	return 0;
-}
-
 static void red_reset(struct Qdisc *sch)
 {
 	struct red_sched_data *q = qdisc_priv(sch);
@@ -361,7 +342,6 @@ static struct Qdisc_ops red_qdisc_ops __read_mostly = {
 	.enqueue	=	red_enqueue,
 	.dequeue	=	red_dequeue,
 	.peek		=	red_peek,
-	.drop		=	red_drop,
 	.init		=	red_init,
 	.reset		=	red_reset,
 	.destroy	=	red_destroy,
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 498f0a2cb47f..a2e0b855d1c8 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -896,7 +896,6 @@ static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
 	.enqueue	=	sfq_enqueue,
 	.dequeue	=	sfq_dequeue,
 	.peek		=	qdisc_peek_dequeued,
-	.drop		=	sfq_drop,
 	.init		=	sfq_init,
 	.reset		=	sfq_reset,
 	.destroy	=	sfq_destroy,
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 801148c8b6ac..1342e46574b2 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -211,18 +211,6 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	return NET_XMIT_SUCCESS;
 }
 
-static unsigned int tbf_drop(struct Qdisc *sch)
-{
-	struct tbf_sched_data *q = qdisc_priv(sch);
-	unsigned int len = 0;
-
-	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
-		sch->q.qlen--;
-		qdisc_qstats_drop(sch);
-	}
-	return len;
-}
-
 static bool tbf_peak_present(const struct tbf_sched_data *q)
 {
 	return q->peak.rate_bytes_ps;
@@ -555,7 +543,6 @@ static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
 	.enqueue	=	tbf_enqueue,
 	.dequeue	=	tbf_dequeue,
 	.peek		=	qdisc_peek_dequeued,
-	.drop		=	tbf_drop,
 	.init		=	tbf_init,
 	.reset		=	tbf_reset,
 	.destroy	=	tbf_destroy,
-- 
cgit v1.2.3


From c8945043cdc687388b7a43fc6f474bddd9607e80 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 9 Jun 2016 00:27:43 +0200
Subject: sched: place state, next_sched and gso_skb in same cacheline again

Earlier commits removed two members from struct Qdisc which places
next_sched/gso_skb into a different cacheline than ->state.

This restores the struct layout to what it was before the removal.
Move the two members, then add an annotation so they all reside in the
same cacheline.

This adds a 16 byte hole after cpu_qstats.

The hole could be closed but as it doesn't decrease total struct size just
do it this way.

Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 4dedb7f12ed5..49534e28824b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -71,11 +71,11 @@ struct Qdisc {
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
 	struct gnet_stats_queue	__percpu *cpu_qstats;
 
-	struct Qdisc		*next_sched;
-	struct sk_buff		*gso_skb;
 	/*
 	 * For performance sake on SMP, we put highly modified fields at the end
 	 */
+	struct Qdisc		*next_sched ____cacheline_aligned_in_smp;
+	struct sk_buff		*gso_skb;
 	unsigned long		state;
 	struct sk_buff_head	q;
 	struct gnet_stats_basic_packed bstats;
-- 
cgit v1.2.3


From 80a83cfc434b1e3afe38974570b460db4898bec6 Mon Sep 17 00:00:00 2001
From: Michal Kazior <michal.kazior@tieto.com>
Date: Thu, 19 May 2016 10:37:48 +0200
Subject: mac80211: skip netdev queue control with software queuing

Qdiscs are designed with no regard to 802.11
aggregation requirements and hand out
packet-by-packet with no guarantee they are
destined to the same tid. This does more bad than
good no matter how fairly a given qdisc may behave
on an ethernet interface.

Software queuing used per-AC netdev subqueue
congestion control whenever a global AC limit was
hit. This meant in practice a single station or
tid queue could starve others rather easily. This
could resonate with qdiscs in a bad way or could
just end up with poor aggregation performance.
Increasing the AC limit would increase induced
latency which is also bad.

Disabling qdiscs by default and performing
taildrop instead of netdev subqueue congestion
control on the other hand makes it possible for
tid queues to fill up "in the meantime" while
preventing stations starving each other.

This increases aggregation opportunities and
should allow software queuing based drivers
achieve better performance by utilizing airtime
more efficiently with big aggregates.

Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  4 ---
 net/mac80211/ieee80211_i.h |  2 +-
 net/mac80211/iface.c       | 18 +++++++++--
 net/mac80211/main.c        |  3 --
 net/mac80211/sta_info.c    |  2 +-
 net/mac80211/tx.c          | 75 ++++++++++++++++++++++++++--------------------
 net/mac80211/util.c        | 11 +++----
 7 files changed, 66 insertions(+), 49 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index be30b0549b88..a8683aec6dbe 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2147,9 +2147,6 @@ enum ieee80211_hw_flags {
  * @n_cipher_schemes: a size of an array of cipher schemes definitions.
  * @cipher_schemes: a pointer to an array of cipher scheme definitions
  *	supported by HW.
- *
- * @txq_ac_max_pending: maximum number of frames per AC pending in all txq
- *	entries for a vif.
  */
 struct ieee80211_hw {
 	struct ieee80211_conf conf;
@@ -2180,7 +2177,6 @@ struct ieee80211_hw {
 	u8 uapsd_max_sp_len;
 	u8 n_cipher_schemes;
 	const struct ieee80211_cipher_scheme *cipher_schemes;
-	int txq_ac_max_pending;
 };
 
 static inline bool _ieee80211_hw_check(struct ieee80211_hw *hw,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 9438c9406687..634603320374 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -856,7 +856,7 @@ struct ieee80211_sub_if_data {
 	bool control_port_no_encrypt;
 	int encrypt_headroom;
 
-	atomic_t txqs_len[IEEE80211_NUM_ACS];
+	atomic_t num_tx_queued;
 	struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS];
 	struct mac80211_qos_map __rcu *qos_map;
 
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index c59af3eb9fa4..609c5174d798 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -976,13 +976,13 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
 
 	if (sdata->vif.txq) {
 		struct txq_info *txqi = to_txq_info(sdata->vif.txq);
+		int n = skb_queue_len(&txqi->queue);
 
 		spin_lock_bh(&txqi->queue.lock);
 		ieee80211_purge_tx_queue(&local->hw, &txqi->queue);
+		atomic_sub(n, &sdata->num_tx_queued);
 		txqi->byte_cnt = 0;
 		spin_unlock_bh(&txqi->queue.lock);
-
-		atomic_set(&sdata->txqs_len[txqi->txq.ac], 0);
 	}
 
 	if (local->open_count == 0)
@@ -1198,6 +1198,12 @@ static void ieee80211_if_setup(struct net_device *dev)
 	dev->destructor = ieee80211_if_free;
 }
 
+static void ieee80211_if_setup_no_queue(struct net_device *dev)
+{
+	ieee80211_if_setup(dev);
+	dev->priv_flags |= IFF_NO_QUEUE;
+}
+
 static void ieee80211_iface_work(struct work_struct *work)
 {
 	struct ieee80211_sub_if_data *sdata =
@@ -1707,6 +1713,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
 	struct net_device *ndev = NULL;
 	struct ieee80211_sub_if_data *sdata = NULL;
 	struct txq_info *txqi;
+	void (*if_setup)(struct net_device *dev);
 	int ret, i;
 	int txqs = 1;
 
@@ -1734,12 +1741,17 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
 			txq_size += sizeof(struct txq_info) +
 				    local->hw.txq_data_size;
 
+		if (local->ops->wake_tx_queue)
+			if_setup = ieee80211_if_setup_no_queue;
+		else
+			if_setup = ieee80211_if_setup;
+
 		if (local->hw.queues >= IEEE80211_NUM_ACS)
 			txqs = IEEE80211_NUM_ACS;
 
 		ndev = alloc_netdev_mqs(size + txq_size,
 					name, name_assign_type,
-					ieee80211_if_setup, txqs, 1);
+					if_setup, txqs, 1);
 		if (!ndev)
 			return -ENOMEM;
 		dev_net_set(ndev, wiphy_net(local->hw.wiphy));
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 7ee91d6151d1..160ac6b8b9a1 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -1055,9 +1055,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 
 	local->dynamic_ps_forced_timeout = -1;
 
-	if (!local->hw.txq_ac_max_pending)
-		local->hw.txq_ac_max_pending = 64;
-
 	result = ieee80211_wep_init(local);
 	if (result < 0)
 		wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n",
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 5ccfdbd406bd..177cc6cd6416 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -116,7 +116,7 @@ static void __cleanup_single_sta(struct sta_info *sta)
 			int n = skb_queue_len(&txqi->queue);
 
 			ieee80211_purge_tx_queue(&local->hw, &txqi->queue);
-			atomic_sub(n, &sdata->txqs_len[txqi->txq.ac]);
+			atomic_sub(n, &sdata->num_tx_queued);
 			txqi->byte_cnt = 0;
 		}
 	}
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 203044379ce0..3e77da195ce8 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1236,27 +1236,21 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
 	return TX_CONTINUE;
 }
 
-static void ieee80211_drv_tx(struct ieee80211_local *local,
-			     struct ieee80211_vif *vif,
-			     struct ieee80211_sta *pubsta,
-			     struct sk_buff *skb)
+static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
+					  struct ieee80211_vif *vif,
+					  struct ieee80211_sta *pubsta,
+					  struct sk_buff *skb)
 {
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
-	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
-	struct ieee80211_tx_control control = {
-		.sta = pubsta,
-	};
 	struct ieee80211_txq *txq = NULL;
-	struct txq_info *txqi;
-	u8 ac;
 
 	if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) ||
 	    (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
-		goto tx_normal;
+		return NULL;
 
 	if (!ieee80211_is_data(hdr->frame_control))
-		goto tx_normal;
+		return NULL;
 
 	if (pubsta) {
 		u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK;
@@ -1267,25 +1261,28 @@ static void ieee80211_drv_tx(struct ieee80211_local *local,
 	}
 
 	if (!txq)
-		goto tx_normal;
+		return NULL;
 
-	ac = txq->ac;
-	txqi = to_txq_info(txq);
-	atomic_inc(&sdata->txqs_len[ac]);
-	if (atomic_read(&sdata->txqs_len[ac]) >= local->hw.txq_ac_max_pending)
-		netif_stop_subqueue(sdata->dev, ac);
+	return to_txq_info(txq);
+}
 
-	spin_lock_bh(&txqi->queue.lock);
-	txqi->byte_cnt += skb->len;
-	__skb_queue_tail(&txqi->queue, skb);
-	spin_unlock_bh(&txqi->queue.lock);
+static void ieee80211_txq_enqueue(struct ieee80211_local *local,
+				  struct txq_info *txqi,
+				  struct sk_buff *skb)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(txqi->txq.vif);
 
-	drv_wake_tx_queue(local, txqi);
+	lockdep_assert_held(&txqi->queue.lock);
 
-	return;
+	if (atomic_read(&sdata->num_tx_queued) >= TOTAL_MAX_TX_BUFFER ||
+	    txqi->queue.qlen >= STA_MAX_TX_BUFFER) {
+		ieee80211_free_txskb(&local->hw, skb);
+		return;
+	}
 
-tx_normal:
-	drv_tx(local, &control, skb);
+	atomic_inc(&sdata->num_tx_queued);
+	txqi->byte_cnt += skb->len;
+	__skb_queue_tail(&txqi->queue, skb);
 }
 
 struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
@@ -1296,7 +1293,6 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 	struct txq_info *txqi = container_of(txq, struct txq_info, txq);
 	struct ieee80211_hdr *hdr;
 	struct sk_buff *skb = NULL;
-	u8 ac = txq->ac;
 
 	spin_lock_bh(&txqi->queue.lock);
 
@@ -1307,12 +1303,9 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 	if (!skb)
 		goto out;
 
+	atomic_dec(&sdata->num_tx_queued);
 	txqi->byte_cnt -= skb->len;
 
-	atomic_dec(&sdata->txqs_len[ac]);
-	if (__netif_subqueue_stopped(sdata->dev, ac))
-		ieee80211_propagate_queue_wake(local, sdata->vif.hw_queue[ac]);
-
 	hdr = (struct ieee80211_hdr *)skb->data;
 	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
 		struct sta_info *sta = container_of(txq->sta, struct sta_info,
@@ -1343,7 +1336,9 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct sk_buff_head *skbs,
 			       bool txpending)
 {
+	struct ieee80211_tx_control control = {};
 	struct sk_buff *skb, *tmp;
+	struct txq_info *txqi;
 	unsigned long flags;
 
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1358,6 +1353,21 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 		}
 #endif
 
+		txqi = ieee80211_get_txq(local, vif, sta, skb);
+		if (txqi) {
+			info->control.vif = vif;
+
+			__skb_unlink(skb, skbs);
+
+			spin_lock_bh(&txqi->queue.lock);
+			ieee80211_txq_enqueue(local, txqi, skb);
+			spin_unlock_bh(&txqi->queue.lock);
+
+			drv_wake_tx_queue(local, txqi);
+
+			continue;
+		}
+
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1400,9 +1410,10 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 		spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
 
 		info->control.vif = vif;
+		control.sta = sta;
 
 		__skb_unlink(skb, skbs);
-		ieee80211_drv_tx(local, vif, sta, skb);
+		drv_tx(local, &control, skb);
 	}
 
 	return true;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 905003f75c4d..0db46442bdcf 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -244,6 +244,9 @@ void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue)
 	struct ieee80211_sub_if_data *sdata;
 	int n_acs = IEEE80211_NUM_ACS;
 
+	if (local->ops->wake_tx_queue)
+		return;
+
 	if (local->hw.queues < IEEE80211_NUM_ACS)
 		n_acs = 1;
 
@@ -260,11 +263,6 @@ void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue)
 		for (ac = 0; ac < n_acs; ac++) {
 			int ac_queue = sdata->vif.hw_queue[ac];
 
-			if (local->ops->wake_tx_queue &&
-			    (atomic_read(&sdata->txqs_len[ac]) >
-			     local->hw.txq_ac_max_pending))
-				continue;
-
 			if (ac_queue == queue ||
 			    (sdata->vif.cab_queue == queue &&
 			     local->queue_stop_reasons[ac_queue] == 0 &&
@@ -352,6 +350,9 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
 	if (__test_and_set_bit(reason, &local->queue_stop_reasons[queue]))
 		return;
 
+	if (local->ops->wake_tx_queue)
+		return;
+
 	if (local->hw.queues < IEEE80211_NUM_ACS)
 		n_acs = 1;
 
-- 
cgit v1.2.3


From 5caa328e3811b7cfa33fd02c93280ffa622deb0e Mon Sep 17 00:00:00 2001
From: Michal Kazior <michal.kazior@tieto.com>
Date: Thu, 19 May 2016 10:37:51 +0200
Subject: mac80211: implement codel on fair queuing flows

There is no other limit other than a global
packet count limit when using software queuing.
This means a single flow queue can grow insanely
long. This is particularly bad for TCP congestion
algorithms which requires a little more
sophisticated frame dropping scheme than a mere
headdrop on limit overflow.

Hence apply (a slighly modified, to fit the knobs)
CoDel5 on flow queues. This improves TCP
convergence and stability when combined with
wireless driver which keeps its own tx queue/fifo
at a minimum fill level for given link conditions.

Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  14 +++++-
 net/mac80211/ieee80211_i.h |   5 +++
 net/mac80211/tx.c          | 109 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 126 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a8683aec6dbe..a52009ffc19f 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -21,6 +21,7 @@
 #include <linux/skbuff.h>
 #include <linux/ieee80211.h>
 #include <net/cfg80211.h>
+#include <net/codel.h>
 #include <asm/unaligned.h>
 
 /**
@@ -895,7 +896,18 @@ struct ieee80211_tx_info {
 				unsigned long jiffies;
 			};
 			/* NB: vif can be NULL for injected frames */
-			struct ieee80211_vif *vif;
+			union {
+				/* NB: vif can be NULL for injected frames */
+				struct ieee80211_vif *vif;
+
+				/* When packets are enqueued on txq it's easy
+				 * to re-construct the vif pointer. There's no
+				 * more space in tx_info so it can be used to
+				 * store the necessary enqueue time for packet
+				 * sojourn time computation.
+				 */
+				codel_time_t enqueue_time;
+			};
 			struct ieee80211_key_conf *hw_key;
 			u32 flags;
 			/* 4 bytes free */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 6f8375f1df88..54edfb6fc1d1 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -812,10 +812,12 @@ enum txq_info_flags {
  * @tin: contains packets split into multiple flows
  * @def_flow: used as a fallback flow when a packet destined to @tin hashes to
  *	a fq_flow which is already owned by a different tin
+ * @def_cvars: codel vars for @def_flow
  */
 struct txq_info {
 	struct fq_tin tin;
 	struct fq_flow def_flow;
+	struct codel_vars def_cvars;
 	unsigned long flags;
 
 	/* keep last! */
@@ -1108,6 +1110,9 @@ struct ieee80211_local {
 	struct ieee80211_hw hw;
 
 	struct fq fq;
+	struct codel_vars *cvars;
+	struct codel_params cparams;
+	struct codel_stats cstats;
 
 	const struct ieee80211_ops *ops;
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1d8343fca6d4..44ec605a5682 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -24,6 +24,8 @@
 #include <net/ieee80211_radiotap.h>
 #include <net/cfg80211.h>
 #include <net/mac80211.h>
+#include <net/codel.h>
+#include <net/codel_impl.h>
 #include <asm/unaligned.h>
 #include <net/fq_impl.h>
 
@@ -1267,11 +1269,92 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
 	return to_txq_info(txq);
 }
 
+static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb)
+{
+	IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time();
+}
+
+static void ieee80211_set_skb_vif(struct sk_buff *skb, struct txq_info *txqi)
+{
+	IEEE80211_SKB_CB(skb)->control.vif = txqi->txq.vif;
+}
+
+static u32 codel_skb_len_func(const struct sk_buff *skb)
+{
+	return skb->len;
+}
+
+static codel_time_t codel_skb_time_func(const struct sk_buff *skb)
+{
+	const struct ieee80211_tx_info *info;
+
+	info = (const struct ieee80211_tx_info *)skb->cb;
+	return info->control.enqueue_time;
+}
+
+static struct sk_buff *codel_dequeue_func(struct codel_vars *cvars,
+					  void *ctx)
+{
+	struct ieee80211_local *local;
+	struct txq_info *txqi;
+	struct fq *fq;
+	struct fq_flow *flow;
+
+	txqi = ctx;
+	local = vif_to_sdata(txqi->txq.vif)->local;
+	fq = &local->fq;
+
+	if (cvars == &txqi->def_cvars)
+		flow = &txqi->def_flow;
+	else
+		flow = &fq->flows[cvars - local->cvars];
+
+	return fq_flow_dequeue(fq, flow);
+}
+
+static void codel_drop_func(struct sk_buff *skb,
+			    void *ctx)
+{
+	struct ieee80211_local *local;
+	struct ieee80211_hw *hw;
+	struct txq_info *txqi;
+
+	txqi = ctx;
+	local = vif_to_sdata(txqi->txq.vif)->local;
+	hw = &local->hw;
+
+	ieee80211_free_txskb(hw, skb);
+}
+
 static struct sk_buff *fq_tin_dequeue_func(struct fq *fq,
 					   struct fq_tin *tin,
 					   struct fq_flow *flow)
 {
-	return fq_flow_dequeue(fq, flow);
+	struct ieee80211_local *local;
+	struct txq_info *txqi;
+	struct codel_vars *cvars;
+	struct codel_params *cparams;
+	struct codel_stats *cstats;
+
+	local = container_of(fq, struct ieee80211_local, fq);
+	txqi = container_of(tin, struct txq_info, tin);
+	cparams = &local->cparams;
+	cstats = &local->cstats;
+
+	if (flow == &txqi->def_flow)
+		cvars = &txqi->def_cvars;
+	else
+		cvars = &local->cvars[flow - fq->flows];
+
+	return codel_dequeue(txqi,
+			     &flow->backlog,
+			     cparams,
+			     cvars,
+			     cstats,
+			     codel_skb_len_func,
+			     codel_skb_time_func,
+			     codel_drop_func,
+			     codel_dequeue_func);
 }
 
 static void fq_skb_free_func(struct fq *fq,
@@ -1303,6 +1386,7 @@ static void ieee80211_txq_enqueue(struct ieee80211_local *local,
 	struct fq *fq = &local->fq;
 	struct fq_tin *tin = &txqi->tin;
 
+	ieee80211_set_skb_enqueue_time(skb);
 	fq_tin_enqueue(fq, tin, skb,
 		       fq_skb_free_func,
 		       fq_flow_get_default_func);
@@ -1314,6 +1398,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 {
 	fq_tin_init(&txqi->tin);
 	fq_flow_init(&txqi->def_flow);
+	codel_vars_init(&txqi->def_cvars);
 
 	txqi->txq.vif = &sdata->vif;
 
@@ -1342,6 +1427,7 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local)
 {
 	struct fq *fq = &local->fq;
 	int ret;
+	int i;
 
 	if (!local->ops->wake_tx_queue)
 		return 0;
@@ -1350,6 +1436,22 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local)
 	if (ret)
 		return ret;
 
+	codel_params_init(&local->cparams);
+	codel_stats_init(&local->cstats);
+	local->cparams.interval = MS2TIME(100);
+	local->cparams.target = MS2TIME(20);
+	local->cparams.ecn = true;
+
+	local->cvars = kcalloc(fq->flows_cnt, sizeof(local->cvars[0]),
+			       GFP_KERNEL);
+	if (!local->cvars) {
+		fq_reset(fq, fq_skb_free_func);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < fq->flows_cnt; i++)
+		codel_vars_init(&local->cvars[i]);
+
 	return 0;
 }
 
@@ -1360,6 +1462,9 @@ void ieee80211_txq_teardown_flows(struct ieee80211_local *local)
 	if (!local->ops->wake_tx_queue)
 		return;
 
+	kfree(local->cvars);
+	local->cvars = NULL;
+
 	fq_reset(fq, fq_skb_free_func);
 }
 
@@ -1382,6 +1487,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 	if (!skb)
 		goto out;
 
+	ieee80211_set_skb_vif(skb, txqi);
+
 	hdr = (struct ieee80211_hdr *)skb->data;
 	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
 		struct sta_info *sta = container_of(txq->sta, struct sta_info,
-- 
cgit v1.2.3


From 52fbb2907988aa0583c6d9d53a56aee090b2df7e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 9 Jun 2016 07:45:11 -0700
Subject: net: sched: fix qdisc->running lockdep annotations

1) qdisc_run_begin() is really using the equivalent of a trylock.
  Instead of using write_seqcount_begin(), use a combination of
  raw_write_seqcount_begin() and correct lockdep annotation.

2) sch_direct_xmit() should use regular spin_lock(root_lock)

Fixes: f9eb8aea2a1e ("net_sched: transform qdisc running bit into a seqcount")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 6 +++++-
 net/sched/sch_generic.c   | 4 ++--
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 49534e28824b..a4c0f1649e2b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -97,7 +97,11 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 {
 	if (qdisc_is_running(qdisc))
 		return false;
-	write_seqcount_begin(&qdisc->running);
+	/* Variant of write_seqcount_begin() telling lockdep a trylock
+	 * was attempted.
+	 */
+	raw_write_seqcount_begin(&qdisc->running);
+	seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
 	return true;
 }
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index cebea73e70ac..cad810b45e31 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -137,10 +137,10 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
 		HARD_TX_UNLOCK(dev, txq);
 	} else {
-		spin_lock_nested(root_lock, SINGLE_DEPTH_NESTING);
+		spin_lock(root_lock);
 		return qdisc_qlen(q);
 	}
-	spin_lock_nested(root_lock, SINGLE_DEPTH_NESTING);
+	spin_lock(root_lock);
 
 	if (dev_xmit_complete(ret)) {
 		/* Driver sent out skb successfully or skb was consumed */
-- 
cgit v1.2.3


From 6f094b9ec680209c5b7314feee983b2f4c910b1b Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Wed, 8 Jun 2016 21:16:44 -0700
Subject: tcp: add in_flight to tcp_skb_cb

Add in_flight (bytes in flight when packet was sent) field
to tx component of tcp_skb_cb and make it available to
congestion modules' pkts_acked() function through the
ack_sample function argument.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 2 ++
 net/ipv4/tcp_input.c  | 5 ++++-
 net/ipv4/tcp_output.c | 4 +++-
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0bcc70f4e1fb..a79894b66726 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -767,6 +767,7 @@ struct tcp_skb_cb {
 	union {
 		struct {
 			/* There is space for up to 20 bytes */
+			__u32 in_flight;/* Bytes in flight when packet sent */
 		} tx;   /* only used for outgoing skbs */
 		union {
 			struct inet_skb_parm	h4;
@@ -859,6 +860,7 @@ union tcp_cc_info;
 struct ack_sample {
 	u32 pkts_acked;
 	s32 rtt_us;
+	u32 in_flight;
 };
 
 struct tcp_congestion_ops {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 89dd8d82826f..94d4aff97523 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3115,6 +3115,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	long ca_rtt_us = -1L;
 	struct sk_buff *skb;
 	u32 pkts_acked = 0;
+	u32 last_in_flight = 0;
 	bool rtt_update;
 	int flag = 0;
 
@@ -3154,6 +3155,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			if (!first_ackt.v64)
 				first_ackt = last_ackt;
 
+			last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
 			reord = min(pkts_acked, reord);
 			if (!after(scb->end_seq, tp->high_seq))
 				flag |= FLAG_ORIG_SACK_ACKED;
@@ -3250,7 +3252,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 	if (icsk->icsk_ca_ops->pkts_acked) {
 		struct ack_sample sample = { .pkts_acked = pkts_acked,
-					     .rtt_us = ca_rtt_us };
+					     .rtt_us = ca_rtt_us,
+					     .in_flight = last_in_flight };
 
 		icsk->icsk_ca_ops->pkts_acked(sk, &sample);
 	}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8bd9911fdd16..b1bcba0563f2 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -911,9 +911,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	int err;
 
 	BUG_ON(!skb || !tcp_skb_pcount(skb));
+	tp = tcp_sk(sk);
 
 	if (clone_it) {
 		skb_mstamp_get(&skb->skb_mstamp);
+		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
+			- tp->snd_una;
 
 		if (unlikely(skb_cloned(skb)))
 			skb = pskb_copy(skb, gfp_mask);
@@ -924,7 +927,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	}
 
 	inet = inet_sk(sk);
-	tp = tcp_sk(sk);
 	tcb = TCP_SKB_CB(skb);
 	memset(&opts, 0, sizeof(opts));
 
-- 
cgit v1.2.3


From 45f50bed1d808794e514e9eed0e579a8756ce2ba Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 10 Jun 2016 16:41:39 -0700
Subject: net_sched: remove generic throttled management

__QDISC_STATE_THROTTLED bit manipulation is rather expensive
for HTB and few others.

I already removed it for sch_fq in commit f2600cf02b5b
("net: sched: avoid costly atomic operation in fq_dequeue()")
and so far nobody complained.

When one ore more packets are stuck in one or more throttled
HTB class, a htb dequeue() performs two atomic operations
to clear/set __QDISC_STATE_THROTTLED bit, while root qdisc
lock is held.

Removing this pair of atomic operations bring me a 8 % performance
increase on 200 TCP_RR tests, in presence of throttled classes.

This patch has no side effect, since nothing actually uses
disc_is_throttled() anymore.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_sched.h   |  4 ++--
 include/net/sch_generic.h | 16 ----------------
 net/sched/sch_api.c       |  7 +------
 net/sched/sch_cbq.c       |  2 --
 net/sched/sch_fq.c        |  3 +--
 net/sched/sch_hfsc.c      |  1 -
 net/sched/sch_htb.c       |  3 +--
 net/sched/sch_netem.c     |  1 -
 net/sched/sch_tbf.c       |  4 +---
 9 files changed, 6 insertions(+), 35 deletions(-)

(limited to 'include/net')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index fea53f4d92ca..7caa99b482c6 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -67,12 +67,12 @@ struct qdisc_watchdog {
 };
 
 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc);
-void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle);
+void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires);
 
 static inline void qdisc_watchdog_schedule(struct qdisc_watchdog *wd,
 					   psched_time_t expires)
 {
-	qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires), true);
+	qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires));
 }
 
 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 9f3581980c15..9a0d177884c6 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -26,7 +26,6 @@ struct qdisc_rate_table {
 enum qdisc_state_t {
 	__QDISC_STATE_SCHED,
 	__QDISC_STATE_DEACTIVATED,
-	__QDISC_STATE_THROTTLED,
 };
 
 struct qdisc_size_table {
@@ -125,21 +124,6 @@ static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq)
 #endif
 }
 
-static inline bool qdisc_is_throttled(const struct Qdisc *qdisc)
-{
-	return test_bit(__QDISC_STATE_THROTTLED, &qdisc->state) ? true : false;
-}
-
-static inline void qdisc_throttled(struct Qdisc *qdisc)
-{
-	set_bit(__QDISC_STATE_THROTTLED, &qdisc->state);
-}
-
-static inline void qdisc_unthrottled(struct Qdisc *qdisc)
-{
-	clear_bit(__QDISC_STATE_THROTTLED, &qdisc->state);
-}
-
 struct Qdisc_class_ops {
 	/* Child qdisc manipulation */
 	struct netdev_queue *	(*select_queue)(struct Qdisc *, struct tcmsg *);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index d4a8bbfcc953..401eda6de682 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -583,7 +583,6 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 						 timer);
 
 	rcu_read_lock();
-	qdisc_unthrottled(wd->qdisc);
 	__netif_schedule(qdisc_root(wd->qdisc));
 	rcu_read_unlock();
 
@@ -598,15 +597,12 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 }
 EXPORT_SYMBOL(qdisc_watchdog_init);
 
-void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle)
+void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
 {
 	if (test_bit(__QDISC_STATE_DEACTIVATED,
 		     &qdisc_root_sleeping(wd->qdisc)->state))
 		return;
 
-	if (throttle)
-		qdisc_throttled(wd->qdisc);
-
 	if (wd->last_expires == expires)
 		return;
 
@@ -620,7 +616,6 @@ EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 {
 	hrtimer_cancel(&wd->timer);
-	qdisc_unthrottled(wd->qdisc);
 }
 EXPORT_SYMBOL(qdisc_watchdog_cancel);
 
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 6e61f9aa8783..a29fd811d7b9 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -513,7 +513,6 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
 		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
 	}
 
-	qdisc_unthrottled(sch);
 	__netif_schedule(qdisc_root(sch));
 	return HRTIMER_NORESTART;
 }
@@ -819,7 +818,6 @@ cbq_dequeue(struct Qdisc *sch)
 		if (skb) {
 			qdisc_bstats_update(sch, skb);
 			sch->q.qlen--;
-			qdisc_unthrottled(sch);
 			return skb;
 		}
 
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 3c6a47d66a04..f49c81e91acd 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -445,8 +445,7 @@ begin:
 		if (!head->first) {
 			if (q->time_next_delayed_flow != ~0ULL)
 				qdisc_watchdog_schedule_ns(&q->watchdog,
-							   q->time_next_delayed_flow,
-							   false);
+							   q->time_next_delayed_flow);
 			return NULL;
 		}
 	}
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index eb3d3f5aba80..bd08c363a26d 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1664,7 +1664,6 @@ hfsc_dequeue(struct Qdisc *sch)
 		set_passive(cl);
 	}
 
-	qdisc_unthrottled(sch);
 	qdisc_bstats_update(sch, skb);
 	qdisc_qstats_backlog_dec(sch, skb);
 	sch->q.qlen--;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index b74d06668ab4..07dcd2933f01 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -889,7 +889,6 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
 	if (skb != NULL) {
 ok:
 		qdisc_bstats_update(sch, skb);
-		qdisc_unthrottled(sch);
 		qdisc_qstats_backlog_dec(sch, skb);
 		sch->q.qlen--;
 		return skb;
@@ -929,7 +928,7 @@ ok:
 	}
 	qdisc_qstats_overlimit(sch);
 	if (likely(next_event > q->now))
-		qdisc_watchdog_schedule_ns(&q->watchdog, next_event, true);
+		qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
 	else
 		schedule_work(&q->work);
 fin:
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 2dbe732ca135..876df13c745a 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -587,7 +587,6 @@ tfifo_dequeue:
 	if (skb) {
 		qdisc_qstats_backlog_dec(sch, skb);
 deliver:
-		qdisc_unthrottled(sch);
 		qdisc_bstats_update(sch, skb);
 		return skb;
 	}
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 7fa3d6e1291c..c12df84d1078 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -254,14 +254,12 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
 			q->ptokens = ptoks;
 			qdisc_qstats_backlog_dec(sch, skb);
 			sch->q.qlen--;
-			qdisc_unthrottled(sch);
 			qdisc_bstats_update(sch, skb);
 			return skb;
 		}
 
 		qdisc_watchdog_schedule_ns(&q->watchdog,
-					   now + max_t(long, -toks, -ptoks),
-					   true);
+					   now + max_t(long, -toks, -ptoks));
 
 		/* Maybe we have a shorter packet in the queue,
 		   which can be sent now. It sounds cool,
-- 
cgit v1.2.3


From cd2a9e62c8a3c5cae7691982667d79a0edc65283 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Mon, 13 Jun 2016 13:44:17 -0700
Subject: net: l3mdev: Remove const from flowi6 arg to get_rt6_dst

Allow drivers to pass flow arg to functions where the arg is not const
and allow the driver to make updates as needed (eg., setting oif).

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c    | 2 +-
 include/net/l3mdev.h | 6 +++---
 net/l3mdev/l3mdev.c  | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index b4d746943bc5..d2ce76c9dc64 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -861,7 +861,7 @@ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev,
 
 #if IS_ENABLED(CONFIG_IPV6)
 static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
-					 const struct flowi6 *fl6)
+					 struct flowi6 *fl6)
 {
 	struct dst_entry *dst = NULL;
 
diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
index 34f33eb96a5e..f8a416ec674c 100644
--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -38,7 +38,7 @@ struct l3mdev_ops {
 
 	/* IPv6 ops */
 	struct dst_entry * (*l3mdev_get_rt6_dst)(const struct net_device *dev,
-						 const struct flowi6 *fl6);
+						 struct flowi6 *fl6);
 };
 
 #ifdef CONFIG_NET_L3_MASTER_DEV
@@ -139,7 +139,7 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex)
 
 int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4);
 
-struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6);
+struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6);
 
 static inline
 struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto)
@@ -225,7 +225,7 @@ static inline int l3mdev_get_saddr(struct net *net, int ifindex,
 }
 
 static inline
-struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6)
+struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6)
 {
 	return NULL;
 }
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index 7da97809a7e8..d90e4ef09e85 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -108,7 +108,7 @@ EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index);
  */
 
 struct dst_entry *l3mdev_get_rt6_dst(struct net *net,
-				     const struct flowi6 *fl6)
+				     struct flowi6 *fl6)
 {
 	struct dst_entry *dst = NULL;
 	struct net_device *dev;
-- 
cgit v1.2.3


From 9ff74384600aeecba34ebdacbbde0627489ff601 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Mon, 13 Jun 2016 13:44:19 -0700
Subject: net: vrf: Handle ipv6 multicast and link-local addresses

IPv6 multicast and link-local addresses require special handling by the
VRF driver:
1. Rather than using the VRF device index and full FIB lookups,
   packets to/from these addresses should use direct FIB lookups based on
   the VRF device table.

2. fail sends/receives on a VRF device to/from a multicast address
   (e.g, make ping6 ff02::1%<vrf> fail)

3. move the setting of the flow oif to the first dst lookup and revert
   the change in icmpv6_echo_reply made in ca254490c8dfd ("net: Add VRF
   support to IPv6 stack"). Linklocal/mcast addresses require use of the
   skb->dev.

With this change connections into and out of a VRF enslaved device work
for multicast and link-local addresses work (icmp, tcp, and udp)
e.g.,

1. packets into VM with VRF config:
    ping6 -c3 fe80::e0:f9ff:fe1c:b974%br1
    ping6 -c3 ff02::1%br1

    ssh -6 fe80::e0:f9ff:fe1c:b974%br1

2. packets going out a VRF enslaved device:
    ping6 -c3 fe80::18f8:83ff:fe4b:7a2e%eth1
    ping6 -c3 ff02::1%eth1
    ssh -6 root@fe80::18f8:83ff:fe4b:7a2e%eth1

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c       | 98 ++++++++++++++++++++++++++++++++++++++++++++++---
 include/net/ip6_route.h |  2 +
 net/ipv6/icmp.c         |  2 +-
 net/ipv6/route.c        |  5 ++-
 4 files changed, 99 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index d2ce76c9dc64..0b5b3c258c2b 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -785,9 +785,63 @@ out:
 	return rc;
 }
 
+static struct rt6_info *vrf_ip6_route_lookup(struct net *net,
+					     const struct net_device *dev,
+					     struct flowi6 *fl6,
+					     int ifindex,
+					     int flags)
+{
+	struct net_vrf *vrf = netdev_priv(dev);
+	struct fib6_table *table = NULL;
+	struct rt6_info *rt6;
+
+	rcu_read_lock();
+
+	/* fib6_table does not have a refcnt and can not be freed */
+	rt6 = rcu_dereference(vrf->rt6);
+	if (likely(rt6))
+		table = rt6->rt6i_table;
+
+	rcu_read_unlock();
+
+	if (!table)
+		return NULL;
+
+	return ip6_pol_route(net, table, ifindex, fl6, flags);
+}
+
+static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev,
+			      int ifindex)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct flowi6 fl6 = {
+		.daddr          = iph->daddr,
+		.saddr          = iph->saddr,
+		.flowlabel      = ip6_flowinfo(iph),
+		.flowi6_mark    = skb->mark,
+		.flowi6_proto   = iph->nexthdr,
+		.flowi6_iif     = ifindex,
+	};
+	struct net *net = dev_net(vrf_dev);
+	struct rt6_info *rt6;
+
+	rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex,
+				   RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE);
+	if (unlikely(!rt6))
+		return;
+
+	if (unlikely(&rt6->dst == &net->ipv6.ip6_null_entry->dst))
+		return;
+
+	skb_dst_set(skb, &rt6->dst);
+}
+
 static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
 				   struct sk_buff *skb)
 {
+	int orig_iif = skb->skb_iif;
+	bool need_strict;
+
 	/* loopback traffic; do not push through packet taps again.
 	 * Reset pkt_type for upper layers to process skb
 	 */
@@ -798,8 +852,11 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
 		goto out;
 	}
 
-	/* if packet is NDISC keep the ingress interface */
-	if (!ipv6_ndisc_frame(skb)) {
+	/* if packet is NDISC or addressed to multicast or link-local
+	 * then keep the ingress interface
+	 */
+	need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr);
+	if (!ipv6_ndisc_frame(skb) && !need_strict) {
 		skb->dev = vrf_dev;
 		skb->skb_iif = vrf_dev->ifindex;
 
@@ -810,6 +867,9 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
 		IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
 	}
 
+	if (need_strict)
+		vrf_ip6_input_dst(skb, vrf_dev, orig_iif);
+
 out:
 	return skb;
 }
@@ -863,11 +923,35 @@ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev,
 static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
 					 struct flowi6 *fl6)
 {
+	bool need_strict = rt6_need_strict(&fl6->daddr);
+	struct net_vrf *vrf = netdev_priv(dev);
+	struct net *net = dev_net(dev);
 	struct dst_entry *dst = NULL;
+	struct rt6_info *rt;
 
-	if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) {
-		struct net_vrf *vrf = netdev_priv(dev);
-		struct rt6_info *rt;
+	/* send to link-local or multicast address */
+	if (need_strict) {
+		int flags = RT6_LOOKUP_F_IFACE;
+
+		/* VRF device does not have a link-local address and
+		 * sending packets to link-local or mcast addresses over
+		 * a VRF device does not make sense
+		 */
+		if (fl6->flowi6_oif == dev->ifindex) {
+			struct dst_entry *dst = &net->ipv6.ip6_null_entry->dst;
+
+			dst_hold(dst);
+			return dst;
+		}
+
+		if (!ipv6_addr_any(&fl6->saddr))
+			flags |= RT6_LOOKUP_F_HAS_SADDR;
+
+		rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, flags);
+		if (rt)
+			dst = &rt->dst;
+
+	} else if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) {
 
 		rcu_read_lock();
 
@@ -880,6 +964,10 @@ static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
 		rcu_read_unlock();
 	}
 
+	/* make sure oif is set to VRF device for lookup */
+	if (!need_strict)
+		fl6->flowi6_oif = dev->ifindex;
+
 	return dst;
 }
 #endif
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 54c779416eec..f55bf3d294aa 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -76,6 +76,8 @@ static inline struct dst_entry *ip6_route_output(struct net *net,
 
 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
 				   int flags);
+struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
+			       int ifindex, struct flowi6 *fl6, int flags);
 
 int ip6_route_init(void);
 void ip6_route_cleanup(void);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 40454bfb534e..e32a72fb9982 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -587,7 +587,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 	fl6.daddr = ipv6_hdr(skb)->saddr;
 	if (saddr)
 		fl6.saddr = *saddr;
-	fl6.flowi6_oif = l3mdev_fib_oif(skb->dev);
+	fl6.flowi6_oif = skb->dev->ifindex;
 	fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
 	fl6.flowi6_mark = mark;
 	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c6ae6f9b5fe3..d51a1a48b839 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1042,8 +1042,8 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
 	return pcpu_rt;
 }
 
-static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
-				      struct flowi6 *fl6, int flags)
+struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
+			       int oif, struct flowi6 *fl6, int flags)
 {
 	struct fib6_node *fn, *saved_fn;
 	struct rt6_info *rt;
@@ -1139,6 +1139,7 @@ redo_rt6_select:
 
 	}
 }
+EXPORT_SYMBOL_GPL(ip6_pol_route);
 
 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
 					    struct flowi6 *fl6, int flags)
-- 
cgit v1.2.3


From b2313077ed0db35ee186905d8076a737248edd24 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Mon, 13 Jun 2016 13:46:28 -0700
Subject: net_sched: make tcf_hash_check() boolean

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h   | 4 ++--
 net/sched/act_api.c     | 8 ++++----
 net/sched/act_ife.c     | 3 ++-
 net/sched/act_ipt.c     | 3 ++-
 net/sched/act_mirred.c  | 3 ++-
 net/sched/act_simple.c  | 3 ++-
 net/sched/act_skbedit.c | 3 ++-
 net/sched/act_vlan.c    | 4 ++--
 8 files changed, 18 insertions(+), 13 deletions(-)

(limited to 'include/net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index db218a12efb5..fb82b5b5d9e7 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -155,8 +155,8 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
 		       struct tc_action *a);
 int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index);
 u32 tcf_hash_new_index(struct tc_action_net *tn);
-int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a,
-		   int bind);
+bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a,
+		    int bind);
 int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 		    struct tc_action *a, int size, int bind, bool cpustats);
 void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est);
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index b6db56ec8117..f8c61d2a7963 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -224,8 +224,8 @@ int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index)
 }
 EXPORT_SYMBOL(tcf_hash_search);
 
-int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a,
-		   int bind)
+bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a,
+		    int bind)
 {
 	struct tcf_hashinfo *hinfo = tn->hinfo;
 	struct tcf_common *p = NULL;
@@ -235,9 +235,9 @@ int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a,
 		p->tcfc_refcnt++;
 		a->priv = p;
 		a->hinfo = hinfo;
-		return 1;
+		return true;
 	}
-	return 0;
+	return false;
 }
 EXPORT_SYMBOL(tcf_hash_check);
 
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 02f5a8ba95d7..b7fa96926c90 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -423,7 +423,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
 	u16 ife_type = 0;
 	u8 *daddr = NULL;
 	u8 *saddr = NULL;
-	int ret = 0, exists = 0;
+	bool exists = false;
+	int ret = 0;
 	int err;
 
 	err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy);
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 8998a3594e86..6148e323ed93 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -97,7 +97,8 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
 	struct tcf_ipt *ipt;
 	struct xt_entry_target *td, *t;
 	char *tname;
-	int ret = 0, err, exists = 0;
+	bool exists = false;
+	int ret = 0, err;
 	u32 hook = 0;
 	u32 index = 0;
 
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 787751a7981a..5b135d357e1e 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -62,7 +62,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	struct tc_mirred *parm;
 	struct tcf_mirred *m;
 	struct net_device *dev;
-	int ret, ok_push = 0, exists = 0;
+	int ret, ok_push = 0;
+	bool exists = false;
 
 	if (nla == NULL)
 		return -EINVAL;
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index be5fbb51cfed..318328d34d12 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -86,8 +86,9 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 	struct nlattr *tb[TCA_DEF_MAX + 1];
 	struct tc_defact *parm;
 	struct tcf_defact *d;
+	bool exists = false;
+	int ret = 0, err;
 	char *defdata;
-	int ret = 0, err, exists = 0;
 
 	if (nla == NULL)
 		return -EINVAL;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 7e2bc3c2b6da..53d1486cddf7 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -69,7 +69,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 	struct tcf_skbedit *d;
 	u32 flags = 0, *priority = NULL, *mark = NULL;
 	u16 *queue_mapping = NULL;
-	int ret = 0, err, exists = 0;
+	bool exists = false;
+	int ret = 0, err;
 
 	if (nla == NULL)
 		return -EINVAL;
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index b075d50e0fc3..db9b7ed570ba 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -77,8 +77,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 	int action;
 	__be16 push_vid = 0;
 	__be16 push_proto = 0;
-	int ret = 0, exists = 0;
-	int err;
+	bool exists = false;
+	int ret = 0, err;
 
 	if (!nla)
 		return -EINVAL;
-- 
cgit v1.2.3


From 1b5c5493e3e68181be344cb51bf9df192d05ffc2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 13 Jun 2016 20:21:50 -0700
Subject: net_sched: add the ability to defer skb freeing

qdisc are changed under RTNL protection and often
while blocking BH and root qdisc spinlock.

When lots of skbs need to be dropped, we free
them under these locks causing TX/RX freezes,
and more generally latency spikes.

This commit adds rtnl_kfree_skbs(), used to queue
skbs for deferred freeing.

Actual freeing happens right after RTNL is released,
with appropriate scheduling points.

rtnl_qdisc_drop() can also be used in place
of disc_drop() when RTNL is held.

qdisc_reset_queue() and __qdisc_reset_queue() get
the new behavior, so standard qdiscs like pfifo, pfifo_fast...
have their ->reset() method automatically handled.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h |  5 +++--
 include/net/sch_generic.h | 16 ++++++++++++----
 net/core/rtnetlink.c      | 22 ++++++++++++++++++++++
 net/sched/sch_generic.c   |  2 +-
 4 files changed, 38 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index c006cc900c44..2daece8979f7 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -89,8 +89,9 @@ void net_inc_egress_queue(void);
 void net_dec_egress_queue(void);
 #endif
 
-extern void rtnetlink_init(void);
-extern void __rtnl_unlock(void);
+void rtnetlink_init(void);
+void __rtnl_unlock(void);
+void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail);
 
 #define ASSERT_RTNL() do { \
 	if (unlikely(!rtnl_is_locked())) { \
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 9a0d177884c6..4f7cee8344c4 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -683,19 +683,21 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
 	return skb;
 }
 
-static inline void __qdisc_reset_queue(struct Qdisc *sch,
-				       struct sk_buff_head *list)
+static inline void __qdisc_reset_queue(struct sk_buff_head *list)
 {
 	/*
 	 * We do not know the backlog in bytes of this list, it
 	 * is up to the caller to correct it
 	 */
-	__skb_queue_purge(list);
+	if (!skb_queue_empty(list)) {
+		rtnl_kfree_skbs(list->next, list->prev);
+		__skb_queue_head_init(list);
+	}
 }
 
 static inline void qdisc_reset_queue(struct Qdisc *sch)
 {
-	__qdisc_reset_queue(sch, &sch->q);
+	__qdisc_reset_queue(&sch->q);
 	sch->qstats.backlog = 0;
 }
 
@@ -716,6 +718,12 @@ static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new,
 	return old;
 }
 
+static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
+{
+	rtnl_kfree_skbs(skb, skb);
+	qdisc_qstats_drop(sch);
+}
+
 static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
 {
 	kfree_skb(skb);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d69c4644f8f2..eb49ca24274a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -71,9 +71,31 @@ void rtnl_lock(void)
 }
 EXPORT_SYMBOL(rtnl_lock);
 
+static struct sk_buff *defer_kfree_skb_list;
+void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
+{
+	if (head && tail) {
+		tail->next = defer_kfree_skb_list;
+		defer_kfree_skb_list = head;
+	}
+}
+EXPORT_SYMBOL(rtnl_kfree_skbs);
+
 void __rtnl_unlock(void)
 {
+	struct sk_buff *head = defer_kfree_skb_list;
+
+	defer_kfree_skb_list = NULL;
+
 	mutex_unlock(&rtnl_mutex);
+
+	while (head) {
+		struct sk_buff *next = head->next;
+
+		kfree_skb(head);
+		cond_resched();
+		head = next;
+	}
 }
 
 void rtnl_unlock(void)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 0c9cb516f2e3..773b632e1e33 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -493,7 +493,7 @@ static void pfifo_fast_reset(struct Qdisc *qdisc)
 	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
 
 	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
-		__qdisc_reset_queue(qdisc, band2list(priv, prio));
+		__qdisc_reset_queue(band2list(priv, prio));
 
 	priv->bitmap = 0;
 	qdisc->qstats.backlog = 0;
-- 
cgit v1.2.3


From 8626a0c83b0d471d859bcd908d016874df951fc3 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Wed, 15 Jun 2016 21:20:16 +0200
Subject: 6lowpan: add private neighbour data

This patch will introduce a 6lowpan neighbour private data. Like the
interface private data we handle private data for generic 6lowpan and
for link-layer specific 6lowpan.

The current first use case if to save the short address for a 802.15.4
6lowpan neighbour.

Cc: David S. Miller <davem@davemloft.net>
Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h     |  3 +--
 include/net/6lowpan.h         | 10 ++++++++++
 net/ieee802154/6lowpan/core.c | 12 ++++++++++++
 3 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d101e4d904ba..36e43bd422f8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1483,8 +1483,7 @@ enum netdev_priv_flags {
  * 	@perm_addr:		Permanent hw address
  * 	@addr_assign_type:	Hw address assignment type
  * 	@addr_len:		Hardware address length
- * 	@neigh_priv_len;	Used in neigh_alloc(),
- * 				initialized only in atm/clip.c
+ *	@neigh_priv_len:	Used in neigh_alloc()
  * 	@dev_id:		Used to differentiate devices that share
  * 				the same link layer address
  * 	@dev_port:		Used to differentiate devices that share
diff --git a/include/net/6lowpan.h b/include/net/6lowpan.h
index da84cf920b78..2d9b9d39221e 100644
--- a/include/net/6lowpan.h
+++ b/include/net/6lowpan.h
@@ -141,6 +141,16 @@ struct lowpan_dev {
 	u8 priv[0] __aligned(sizeof(void *));
 };
 
+struct lowpan_802154_neigh {
+	__le16 short_addr;
+};
+
+static inline
+struct lowpan_802154_neigh *lowpan_802154_neigh(void *neigh_priv)
+{
+	return neigh_priv;
+}
+
 static inline
 struct lowpan_dev *lowpan_dev(const struct net_device *dev)
 {
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index 4e2b30894224..8c004a0c8d64 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -81,11 +81,21 @@ static int lowpan_stop(struct net_device *dev)
 	return 0;
 }
 
+static int lowpan_neigh_construct(struct neighbour *n)
+{
+	struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n));
+
+	/* default no short_addr is available for a neighbour */
+	neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
+	return 0;
+}
+
 static const struct net_device_ops lowpan_netdev_ops = {
 	.ndo_init		= lowpan_dev_init,
 	.ndo_start_xmit		= lowpan_xmit,
 	.ndo_open		= lowpan_open,
 	.ndo_stop		= lowpan_stop,
+	.ndo_neigh_construct    = lowpan_neigh_construct,
 };
 
 static void lowpan_setup(struct net_device *ldev)
@@ -150,6 +160,8 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev,
 				wdev->needed_headroom;
 	ldev->needed_tailroom = wdev->needed_tailroom;
 
+	ldev->neigh_priv_len = sizeof(struct lowpan_802154_neigh);
+
 	ret = lowpan_register_netdevice(ldev, LOWPAN_LLTYPE_IEEE802154);
 	if (ret < 0) {
 		dev_put(wdev);
-- 
cgit v1.2.3


From 2ad3ed59198c5404c34515cfcfd9a2b3c54d964f Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Wed, 15 Jun 2016 21:20:17 +0200
Subject: 6lowpan: add 802.15.4 short addr slaac

This patch adds the autoconfiguration if a valid 802.15.4 short address
is available for 802.15.4 6LoWPAN interfaces.

Cc: David S. Miller <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/6lowpan.h  |  6 ++++++
 include/net/addrconf.h |  3 +++
 net/6lowpan/core.c     | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/addrconf.c    |  5 +++--
 4 files changed, 58 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/6lowpan.h b/include/net/6lowpan.h
index 2d9b9d39221e..5ab4c9901ccc 100644
--- a/include/net/6lowpan.h
+++ b/include/net/6lowpan.h
@@ -254,6 +254,12 @@ static inline bool lowpan_fetch_skb(struct sk_buff *skb, void *data,
 	return false;
 }
 
+static inline bool lowpan_802154_is_valid_src_short_addr(__le16 addr)
+{
+	/* First bit of addr is multicast, reserved or 802.15.4 specific */
+	return !(addr & cpu_to_le16(0x8000));
+}
+
 static inline void lowpan_push_hc_data(u8 **hc_ptr, const void *data,
 				       const size_t len)
 {
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 730d856683e5..b1774eb03f37 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -94,6 +94,9 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
 void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);
 
+void addrconf_add_linklocal(struct inet6_dev *idev,
+			    const struct in6_addr *addr, u32 flags);
+
 static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
 {
 	if (dev->addr_len != ETH_ALEN)
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c
index 7a240b3eaed1..801404ceea23 100644
--- a/net/6lowpan/core.c
+++ b/net/6lowpan/core.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 
 #include <net/6lowpan.h>
+#include <net/addrconf.h>
 
 #include "6lowpan_i.h"
 
@@ -72,16 +73,61 @@ void lowpan_unregister_netdev(struct net_device *dev)
 }
 EXPORT_SYMBOL(lowpan_unregister_netdev);
 
+static int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev)
+{
+	struct wpan_dev *wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr;
+
+	/* Set short_addr autoconfiguration if short_addr is present only */
+	if (!lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr))
+		return -1;
+
+	/* For either address format, all zero addresses MUST NOT be used */
+	if (wpan_dev->pan_id == cpu_to_le16(0x0000) &&
+	    wpan_dev->short_addr == cpu_to_le16(0x0000))
+		return -1;
+
+	/* Alternatively, if no PAN ID is known, 16 zero bits may be used */
+	if (wpan_dev->pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST))
+		memset(eui, 0, 2);
+	else
+		ieee802154_le16_to_be16(eui, &wpan_dev->pan_id);
+
+	/* The "Universal/Local" (U/L) bit shall be set to zero */
+	eui[0] &= ~2;
+	eui[2] = 0;
+	eui[3] = 0xFF;
+	eui[4] = 0xFE;
+	eui[5] = 0;
+	ieee802154_le16_to_be16(&eui[6], &wpan_dev->short_addr);
+	return 0;
+}
+
 static int lowpan_event(struct notifier_block *unused,
 			unsigned long event, void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct inet6_dev *idev;
+	struct in6_addr addr;
 	int i;
 
 	if (dev->type != ARPHRD_6LOWPAN)
 		return NOTIFY_DONE;
 
+	idev = __in6_dev_get(dev);
+	if (!idev)
+		return NOTIFY_DONE;
+
 	switch (event) {
+	case NETDEV_UP:
+	case NETDEV_CHANGE:
+		/* (802.15.4 6LoWPAN short address slaac handling */
+		if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) &&
+		    addrconf_ifid_802154_6lowpan(addr.s6_addr + 8, dev) == 0) {
+			__ipv6_addr_set_half(&addr.s6_addr32[0],
+					     htonl(0xFE800000), 0);
+			addrconf_add_linklocal(idev, &addr, 0);
+		}
+		break;
 	case NETDEV_DOWN:
 		for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
 			clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index b12553905e42..1ce4048d1b5e 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2947,8 +2947,8 @@ static void init_loopback(struct net_device *dev)
 	}
 }
 
-static void addrconf_add_linklocal(struct inet6_dev *idev,
-				   const struct in6_addr *addr, u32 flags)
+void addrconf_add_linklocal(struct inet6_dev *idev,
+			    const struct in6_addr *addr, u32 flags)
 {
 	struct inet6_ifaddr *ifp;
 	u32 addr_flags = flags | IFA_F_PERMANENT;
@@ -2967,6 +2967,7 @@ static void addrconf_add_linklocal(struct inet6_dev *idev,
 		in6_ifa_put(ifp);
 	}
 }
+EXPORT_SYMBOL_GPL(addrconf_add_linklocal);
 
 static bool ipv6_reserved_interfaceid(struct in6_addr address)
 {
-- 
cgit v1.2.3


From 1e82f961ac8e94c50a933e89ee08071fc81a4bbd Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Wed, 15 Jun 2016 21:20:19 +0200
Subject: ndisc: add __ndisc_opt_addr_space function

This patch adds __ndisc_opt_addr_space as low-level function for
ndisc_opt_addr_space which doesn't depend on net_device parameter.

Cc: David S. Miller <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ndisc.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 2d8edaad29cb..4cee82654fe5 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -127,10 +127,15 @@ static inline int ndisc_addr_option_pad(unsigned short type)
 	}
 }
 
+static inline int __ndisc_opt_addr_space(unsigned char addr_len, int pad)
+{
+	return NDISC_OPT_SPACE(addr_len + pad);
+}
+
 static inline int ndisc_opt_addr_space(struct net_device *dev)
 {
-	return NDISC_OPT_SPACE(dev->addr_len +
-			       ndisc_addr_option_pad(dev->type));
+	return __ndisc_opt_addr_space(dev->addr_len,
+				      ndisc_addr_option_pad(dev->type));
 }
 
 static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p,
-- 
cgit v1.2.3


From 4f36ce84c54c971cd67c882035d9bde7b1a2ee53 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Wed, 15 Jun 2016 21:20:20 +0200
Subject: ndisc: add __ndisc_opt_addr_data function

This patch adds __ndisc_opt_addr_data as low-level function for
ndisc_opt_addr_data which doesn't depend on net_device parameter.

Cc: David S. Miller <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ndisc.h | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 4cee82654fe5..c8962adf24d0 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -138,17 +138,23 @@ static inline int ndisc_opt_addr_space(struct net_device *dev)
 				      ndisc_addr_option_pad(dev->type));
 }
 
-static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p,
-				      struct net_device *dev)
+static inline u8 *__ndisc_opt_addr_data(struct nd_opt_hdr *p,
+					unsigned char addr_len, int prepad)
 {
 	u8 *lladdr = (u8 *)(p + 1);
 	int lladdrlen = p->nd_opt_len << 3;
-	int prepad = ndisc_addr_option_pad(dev->type);
-	if (lladdrlen != ndisc_opt_addr_space(dev))
+	if (lladdrlen != __ndisc_opt_addr_space(addr_len, prepad))
 		return NULL;
 	return lladdr + prepad;
 }
 
+static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p,
+				      struct net_device *dev)
+{
+	return __ndisc_opt_addr_data(p, dev->addr_len,
+				     ndisc_addr_option_pad(dev->type));
+}
+
 static inline u32 ndisc_hashfn(const void *pkey, const struct net_device *dev, __u32 *hash_rnd)
 {
 	const u32 *p32 = pkey;
-- 
cgit v1.2.3


From f997c55c1dc8841b3ee4df0493d0ac7966d42165 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Wed, 15 Jun 2016 21:20:23 +0200
Subject: ipv6: introduce neighbour discovery ops

This patch introduces neighbour discovery ops callback structure. The
idea is to separate the handling for 6LoWPAN into the 6lowpan module.

These callback offers 6lowpan different handling, such as 802.15.4 short
address handling or RFC6775 (Neighbor Discovery Optimization for IPv6
over 6LoWPANs).

Cc: David S. Miller <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |   5 ++
 include/net/ndisc.h       | 197 +++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv6/addrconf.c       |  13 ++-
 net/ipv6/ndisc.c          | 101 ++++++++++++++++--------
 net/ipv6/route.c          |   8 +-
 5 files changed, 284 insertions(+), 40 deletions(-)

(limited to 'include/net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 36e43bd422f8..890158e99159 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1456,6 +1456,8 @@ enum netdev_priv_flags {
  *	@netdev_ops:	Includes several pointers to callbacks,
  *			if one wants to override the ndo_*() functions
  *	@ethtool_ops:	Management operations
+ *	@ndisc_ops:	Includes callbacks for different IPv6 neighbour
+ *			discovery handling. Necessary for e.g. 6LoWPAN.
  *	@header_ops:	Includes callbacks for creating,parsing,caching,etc
  *			of Layer 2 headers.
  *
@@ -1672,6 +1674,9 @@ struct net_device {
 #ifdef CONFIG_NET_L3_MASTER_DEV
 	const struct l3mdev_ops	*l3mdev_ops;
 #endif
+#if IS_ENABLED(CONFIG_IPV6)
+	const struct ndisc_ops *ndisc_ops;
+#endif
 
 	const struct header_ops *header_ops;
 
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index c8962adf24d0..a5e276703cb3 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -58,6 +58,7 @@ struct inet6_dev;
 struct net_device;
 struct net_proto_family;
 struct sk_buff;
+struct prefix_info;
 
 extern struct neigh_table nd_tbl;
 
@@ -110,9 +111,182 @@ struct ndisc_options {
 
 #define NDISC_OPT_SPACE(len) (((len)+2+7)&~7)
 
-struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
+struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
+					  u8 *opt, int opt_len,
 					  struct ndisc_options *ndopts);
 
+#define NDISC_OPS_REDIRECT_DATA_SPACE	2
+
+/*
+ * This structure defines the hooks for IPv6 neighbour discovery.
+ * The following hooks can be defined; unless noted otherwise, they are
+ * optional and can be filled with a null pointer.
+ *
+ * int (*is_useropt)(u8 nd_opt_type):
+ *     This function is called when IPv6 decide RA userspace options. if
+ *     this function returns 1 then the option given by nd_opt_type will
+ *     be handled as userspace option additional to the IPv6 options.
+ *
+ * int (*parse_options)(const struct net_device *dev,
+ *			struct nd_opt_hdr *nd_opt,
+ *			struct ndisc_options *ndopts):
+ *     This function is called while parsing ndisc ops and put each position
+ *     as pointer into ndopts. If this function return unequal 0, then this
+ *     function took care about the ndisc option, if 0 then the IPv6 ndisc
+ *     option parser will take care about that option.
+ *
+ * void (*update)(const struct net_device *dev, struct neighbour *n,
+ *		  u32 flags, u8 icmp6_type,
+ *		  const struct ndisc_options *ndopts):
+ *     This function is called when IPv6 ndisc updates the neighbour cache
+ *     entry. Additional options which can be updated may be previously
+ *     parsed by parse_opts callback and accessible over ndopts parameter.
+ *
+ * int (*opt_addr_space)(const struct net_device *dev, u8 icmp6_type,
+ *			 struct neighbour *neigh, u8 *ha_buf,
+ *			 u8 **ha):
+ *     This function is called when the necessary option space will be
+ *     calculated before allocating a skb. The parameters neigh, ha_buf
+ *     abd ha are available on NDISC_REDIRECT messages only.
+ *
+ * void (*fill_addr_option)(const struct net_device *dev,
+ *			    struct sk_buff *skb, u8 icmp6_type,
+ *			    const u8 *ha):
+ *     This function is called when the skb will finally fill the option
+ *     fields inside skb. NOTE: this callback should fill the option
+ *     fields to the skb which are previously indicated by opt_space
+ *     parameter. That means the decision to add such option should
+ *     not lost between these two callbacks, e.g. protected by interface
+ *     up state.
+ *
+ * void (*prefix_rcv_add_addr)(struct net *net, struct net_device *dev,
+ *			       const struct prefix_info *pinfo,
+ *			       struct inet6_dev *in6_dev,
+ *			       struct in6_addr *addr,
+ *			       int addr_type, u32 addr_flags,
+ *			       bool sllao, bool tokenized,
+ *			       __u32 valid_lft, u32 prefered_lft,
+ *			       bool dev_addr_generated):
+ *     This function is called when a RA messages is received with valid
+ *     PIO option fields and an IPv6 address will be added to the interface
+ *     for autoconfiguration. The parameter dev_addr_generated reports about
+ *     if the address was based on dev->dev_addr or not. This can be used
+ *     to add a second address if link-layer operates with two link layer
+ *     addresses. E.g. 802.15.4 6LoWPAN.
+ */
+struct ndisc_ops {
+	int	(*is_useropt)(u8 nd_opt_type);
+	int	(*parse_options)(const struct net_device *dev,
+				 struct nd_opt_hdr *nd_opt,
+				 struct ndisc_options *ndopts);
+	void	(*update)(const struct net_device *dev, struct neighbour *n,
+			  u32 flags, u8 icmp6_type,
+			  const struct ndisc_options *ndopts);
+	int	(*opt_addr_space)(const struct net_device *dev, u8 icmp6_type,
+				  struct neighbour *neigh, u8 *ha_buf,
+				  u8 **ha);
+	void	(*fill_addr_option)(const struct net_device *dev,
+				    struct sk_buff *skb, u8 icmp6_type,
+				    const u8 *ha);
+	void	(*prefix_rcv_add_addr)(struct net *net, struct net_device *dev,
+				       const struct prefix_info *pinfo,
+				       struct inet6_dev *in6_dev,
+				       struct in6_addr *addr,
+				       int addr_type, u32 addr_flags,
+				       bool sllao, bool tokenized,
+				       __u32 valid_lft, u32 prefered_lft,
+				       bool dev_addr_generated);
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static inline int ndisc_ops_is_useropt(const struct net_device *dev,
+				       u8 nd_opt_type)
+{
+	if (dev->ndisc_ops && dev->ndisc_ops->is_useropt)
+		return dev->ndisc_ops->is_useropt(nd_opt_type);
+	else
+		return 0;
+}
+
+static inline int ndisc_ops_parse_options(const struct net_device *dev,
+					  struct nd_opt_hdr *nd_opt,
+					  struct ndisc_options *ndopts)
+{
+	if (dev->ndisc_ops && dev->ndisc_ops->parse_options)
+		return dev->ndisc_ops->parse_options(dev, nd_opt, ndopts);
+	else
+		return 0;
+}
+
+static inline void ndisc_ops_update(const struct net_device *dev,
+					  struct neighbour *n, u32 flags,
+					  u8 icmp6_type,
+					  const struct ndisc_options *ndopts)
+{
+	if (dev->ndisc_ops && dev->ndisc_ops->update)
+		dev->ndisc_ops->update(dev, n, flags, icmp6_type, ndopts);
+}
+
+static inline int ndisc_ops_opt_addr_space(const struct net_device *dev,
+					   u8 icmp6_type)
+{
+	if (dev->ndisc_ops && dev->ndisc_ops->opt_addr_space &&
+	    icmp6_type != NDISC_REDIRECT)
+		return dev->ndisc_ops->opt_addr_space(dev, icmp6_type, NULL,
+						      NULL, NULL);
+	else
+		return 0;
+}
+
+static inline int ndisc_ops_redirect_opt_addr_space(const struct net_device *dev,
+						    struct neighbour *neigh,
+						    u8 *ha_buf, u8 **ha)
+{
+	if (dev->ndisc_ops && dev->ndisc_ops->opt_addr_space)
+		return dev->ndisc_ops->opt_addr_space(dev, NDISC_REDIRECT,
+						      neigh, ha_buf, ha);
+	else
+		return 0;
+}
+
+static inline void ndisc_ops_fill_addr_option(const struct net_device *dev,
+					      struct sk_buff *skb,
+					      u8 icmp6_type)
+{
+	if (dev->ndisc_ops && dev->ndisc_ops->fill_addr_option &&
+	    icmp6_type != NDISC_REDIRECT)
+		dev->ndisc_ops->fill_addr_option(dev, skb, icmp6_type, NULL);
+}
+
+static inline void ndisc_ops_fill_redirect_addr_option(const struct net_device *dev,
+						       struct sk_buff *skb,
+						       const u8 *ha)
+{
+	if (dev->ndisc_ops && dev->ndisc_ops->fill_addr_option)
+		dev->ndisc_ops->fill_addr_option(dev, skb, NDISC_REDIRECT, ha);
+}
+
+static inline void ndisc_ops_prefix_rcv_add_addr(struct net *net,
+						 struct net_device *dev,
+						 const struct prefix_info *pinfo,
+						 struct inet6_dev *in6_dev,
+						 struct in6_addr *addr,
+						 int addr_type, u32 addr_flags,
+						 bool sllao, bool tokenized,
+						 __u32 valid_lft,
+						 u32 prefered_lft,
+						 bool dev_addr_generated)
+{
+	if (dev->ndisc_ops && dev->ndisc_ops->prefix_rcv_add_addr)
+		dev->ndisc_ops->prefix_rcv_add_addr(net, dev, pinfo, in6_dev,
+						    addr, addr_type,
+						    addr_flags, sllao,
+						    tokenized, valid_lft,
+						    prefered_lft,
+						    dev_addr_generated);
+}
+#endif
+
 /*
  * Return the padding between the option length and the start of the
  * link addr.  Currently only IP-over-InfiniBand needs this, although
@@ -132,11 +306,25 @@ static inline int __ndisc_opt_addr_space(unsigned char addr_len, int pad)
 	return NDISC_OPT_SPACE(addr_len + pad);
 }
 
-static inline int ndisc_opt_addr_space(struct net_device *dev)
+#if IS_ENABLED(CONFIG_IPV6)
+static inline int ndisc_opt_addr_space(struct net_device *dev, u8 icmp6_type)
+{
+	return __ndisc_opt_addr_space(dev->addr_len,
+				      ndisc_addr_option_pad(dev->type)) +
+		ndisc_ops_opt_addr_space(dev, icmp6_type);
+}
+
+static inline int ndisc_redirect_opt_addr_space(struct net_device *dev,
+						struct neighbour *neigh,
+						u8 *ops_data_buf,
+						u8 **ops_data)
 {
 	return __ndisc_opt_addr_space(dev->addr_len,
-				      ndisc_addr_option_pad(dev->type));
+				      ndisc_addr_option_pad(dev->type)) +
+		ndisc_ops_redirect_opt_addr_space(dev, neigh, ops_data_buf,
+						  ops_data);
 }
+#endif
 
 static inline u8 *__ndisc_opt_addr_data(struct nd_opt_hdr *p,
 					unsigned char addr_len, int prepad)
@@ -205,6 +393,9 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target);
 int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev,
 		 int dir);
 
+void ndisc_update(const struct net_device *dev, struct neighbour *neigh,
+		  const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type,
+		  struct ndisc_options *ndopts);
 
 /*
  *	IGMP
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 44c1cefdb299..b6e9bdc610f2 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2531,7 +2531,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
 
 	if (pinfo->autoconf && in6_dev->cnf.autoconf) {
 		struct in6_addr addr;
-		bool tokenized = false;
+		bool tokenized = false, dev_addr_generated = false;
 
 		if (pinfo->prefix_len == 64) {
 			memcpy(&addr, &pinfo->prefix, 8);
@@ -2550,6 +2550,8 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
 			} else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) &&
 				   ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) {
 				goto put;
+			} else {
+				dev_addr_generated = true;
 			}
 			goto ok;
 		}
@@ -2565,6 +2567,15 @@ ok:
 						   prefered_lft);
 		if (err)
 			goto put;
+
+		/* Ignore error case here because previous prefix add addr was
+		 * successful which will be notified.
+		 */
+		ndisc_ops_prefix_rcv_add_addr(net, dev, pinfo, in6_dev, &addr,
+					      addr_type, addr_flags, sllao,
+					      tokenized, valid_lft,
+					      prefered_lft,
+					      dev_addr_generated);
 	}
 	inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo);
 put:
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index a7b9468e684a..2f4afd17954b 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -172,10 +172,19 @@ static void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data,
 }
 
 static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type,
-					  void *data)
+					  void *data, u8 icmp6_type)
 {
 	__ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len,
 				 ndisc_addr_option_pad(skb->dev->type));
+	ndisc_ops_fill_addr_option(skb->dev, skb, icmp6_type);
+}
+
+static inline void ndisc_fill_redirect_addr_option(struct sk_buff *skb,
+						   void *ha,
+						   const u8 *ops_data)
+{
+	ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha, NDISC_REDIRECT);
+	ndisc_ops_fill_redirect_addr_option(skb->dev, skb, ops_data);
 }
 
 static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
@@ -191,24 +200,28 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
 	return cur <= end && cur->nd_opt_type == type ? cur : NULL;
 }
 
-static inline int ndisc_is_useropt(struct nd_opt_hdr *opt)
+static inline int ndisc_is_useropt(const struct net_device *dev,
+				   struct nd_opt_hdr *opt)
 {
 	return opt->nd_opt_type == ND_OPT_RDNSS ||
-		opt->nd_opt_type == ND_OPT_DNSSL;
+		opt->nd_opt_type == ND_OPT_DNSSL ||
+		ndisc_ops_is_useropt(dev, opt->nd_opt_type);
 }
 
-static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur,
+static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev,
+					     struct nd_opt_hdr *cur,
 					     struct nd_opt_hdr *end)
 {
 	if (!cur || !end || cur >= end)
 		return NULL;
 	do {
 		cur = ((void *)cur) + (cur->nd_opt_len << 3);
-	} while (cur < end && !ndisc_is_useropt(cur));
-	return cur <= end && ndisc_is_useropt(cur) ? cur : NULL;
+	} while (cur < end && !ndisc_is_useropt(dev, cur));
+	return cur <= end && ndisc_is_useropt(dev, cur) ? cur : NULL;
 }
 
-struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
+struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
+					  u8 *opt, int opt_len,
 					  struct ndisc_options *ndopts)
 {
 	struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt;
@@ -223,6 +236,8 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
 		l = nd_opt->nd_opt_len << 3;
 		if (opt_len < l || l == 0)
 			return NULL;
+		if (ndisc_ops_parse_options(dev, nd_opt, ndopts))
+			goto next_opt;
 		switch (nd_opt->nd_opt_type) {
 		case ND_OPT_SOURCE_LL_ADDR:
 		case ND_OPT_TARGET_LL_ADDR:
@@ -249,7 +264,7 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
 			break;
 #endif
 		default:
-			if (ndisc_is_useropt(nd_opt)) {
+			if (ndisc_is_useropt(dev, nd_opt)) {
 				ndopts->nd_useropts_end = nd_opt;
 				if (!ndopts->nd_useropts)
 					ndopts->nd_useropts = nd_opt;
@@ -266,6 +281,7 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
 					  nd_opt->nd_opt_len);
 			}
 		}
+next_opt:
 		opt_len -= l;
 		nd_opt = ((void *)nd_opt) + l;
 	}
@@ -515,7 +531,8 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
 	if (!dev->addr_len)
 		inc_opt = 0;
 	if (inc_opt)
-		optlen += ndisc_opt_addr_space(dev);
+		optlen += ndisc_opt_addr_space(dev,
+					       NDISC_NEIGHBOUR_ADVERTISEMENT);
 
 	skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
 	if (!skb)
@@ -534,8 +551,8 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
 
 	if (inc_opt)
 		ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR,
-				       dev->dev_addr);
-
+				       dev->dev_addr,
+				       NDISC_NEIGHBOUR_ADVERTISEMENT);
 
 	ndisc_send_skb(skb, daddr, src_addr);
 }
@@ -580,7 +597,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
 	if (ipv6_addr_any(saddr))
 		inc_opt = false;
 	if (inc_opt)
-		optlen += ndisc_opt_addr_space(dev);
+		optlen += ndisc_opt_addr_space(dev,
+					       NDISC_NEIGHBOUR_SOLICITATION);
 
 	skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
 	if (!skb)
@@ -596,7 +614,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
 
 	if (inc_opt)
 		ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
-				       dev->dev_addr);
+				       dev->dev_addr,
+				       NDISC_NEIGHBOUR_SOLICITATION);
 
 	ndisc_send_skb(skb, daddr, saddr);
 }
@@ -632,7 +651,7 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr,
 	}
 #endif
 	if (send_sllao)
-		optlen += ndisc_opt_addr_space(dev);
+		optlen += ndisc_opt_addr_space(dev, NDISC_ROUTER_SOLICITATION);
 
 	skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
 	if (!skb)
@@ -647,7 +666,8 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr,
 
 	if (send_sllao)
 		ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
-				       dev->dev_addr);
+				       dev->dev_addr,
+				       NDISC_ROUTER_SOLICITATION);
 
 	ndisc_send_skb(skb, daddr, saddr);
 }
@@ -708,6 +728,15 @@ static int pndisc_is_router(const void *pkey,
 	return ret;
 }
 
+void ndisc_update(const struct net_device *dev, struct neighbour *neigh,
+		  const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type,
+		  struct ndisc_options *ndopts)
+{
+	neigh_update(neigh, lladdr, new, flags);
+	/* report ndisc ops about neighbour update */
+	ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts);
+}
+
 static void ndisc_recv_ns(struct sk_buff *skb)
 {
 	struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
@@ -744,7 +773,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 		return;
 	}
 
-	if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
+	if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) {
 		ND_PRINTK(2, warn, "NS: invalid ND options\n");
 		return;
 	}
@@ -862,9 +891,10 @@ have_ifp:
 	neigh = __neigh_lookup(&nd_tbl, saddr, dev,
 			       !inc || lladdr || !dev->addr_len);
 	if (neigh)
-		neigh_update(neigh, lladdr, NUD_STALE,
+		ndisc_update(dev, neigh, lladdr, NUD_STALE,
 			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
-			     NEIGH_UPDATE_F_OVERRIDE);
+			     NEIGH_UPDATE_F_OVERRIDE,
+			     NDISC_NEIGHBOUR_SOLICITATION, &ndopts);
 	if (neigh || !dev->header_ops) {
 		ndisc_send_na(dev, saddr, &msg->target, !!is_router,
 			      true, (ifp != NULL && inc), inc);
@@ -917,7 +947,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
 	    idev->cnf.drop_unsolicited_na)
 		return;
 
-	if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
+	if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) {
 		ND_PRINTK(2, warn, "NS: invalid ND option\n");
 		return;
 	}
@@ -973,12 +1003,13 @@ static void ndisc_recv_na(struct sk_buff *skb)
 			goto out;
 		}
 
-		neigh_update(neigh, lladdr,
+		ndisc_update(dev, neigh, lladdr,
 			     msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE,
 			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
 			     (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)|
 			     NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
-			     (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0));
+			     (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0),
+			     NDISC_NEIGHBOUR_ADVERTISEMENT, &ndopts);
 
 		if ((old_flags & ~neigh->flags) & NTF_ROUTER) {
 			/*
@@ -1023,7 +1054,7 @@ static void ndisc_recv_rs(struct sk_buff *skb)
 		goto out;
 
 	/* Parse ND options */
-	if (!ndisc_parse_options(rs_msg->opt, ndoptlen, &ndopts)) {
+	if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) {
 		ND_PRINTK(2, notice, "NS: invalid ND option, ignored\n");
 		goto out;
 	}
@@ -1037,10 +1068,11 @@ static void ndisc_recv_rs(struct sk_buff *skb)
 
 	neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1);
 	if (neigh) {
-		neigh_update(neigh, lladdr, NUD_STALE,
+		ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
 			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
 			     NEIGH_UPDATE_F_OVERRIDE|
-			     NEIGH_UPDATE_F_OVERRIDE_ISROUTER);
+			     NEIGH_UPDATE_F_OVERRIDE_ISROUTER,
+			     NDISC_ROUTER_SOLICITATION, &ndopts);
 		neigh_release(neigh);
 	}
 out:
@@ -1141,7 +1173,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 		return;
 	}
 
-	if (!ndisc_parse_options(opt, optlen, &ndopts)) {
+	if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) {
 		ND_PRINTK(2, warn, "RA: invalid ND options\n");
 		return;
 	}
@@ -1335,11 +1367,12 @@ skip_linkparms:
 				goto out;
 			}
 		}
-		neigh_update(neigh, lladdr, NUD_STALE,
+		ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
 			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
 			     NEIGH_UPDATE_F_OVERRIDE|
 			     NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
-			     NEIGH_UPDATE_F_ISROUTER);
+			     NEIGH_UPDATE_F_ISROUTER,
+			     NDISC_ROUTER_ADVERTISEMENT, &ndopts);
 	}
 
 	if (!ipv6_accept_ra(in6_dev)) {
@@ -1427,7 +1460,8 @@ skip_routeinfo:
 		struct nd_opt_hdr *p;
 		for (p = ndopts.nd_useropts;
 		     p;
-		     p = ndisc_next_useropt(p, ndopts.nd_useropts_end)) {
+		     p = ndisc_next_useropt(skb->dev, p,
+					    ndopts.nd_useropts_end)) {
 			ndisc_ra_useropt(skb, p);
 		}
 	}
@@ -1465,7 +1499,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb)
 		return;
 	}
 
-	if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts))
+	if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts))
 		return;
 
 	if (!ndopts.nd_opts_rh) {
@@ -1510,7 +1544,8 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 	struct dst_entry *dst;
 	struct flowi6 fl6;
 	int rd_len;
-	u8 ha_buf[MAX_ADDR_LEN], *ha = NULL;
+	u8 ha_buf[MAX_ADDR_LEN], *ha = NULL,
+	   ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL;
 	int oif = l3mdev_fib_oif(dev);
 	bool ret;
 
@@ -1569,7 +1604,9 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 			memcpy(ha_buf, neigh->ha, dev->addr_len);
 			read_unlock_bh(&neigh->lock);
 			ha = ha_buf;
-			optlen += ndisc_opt_addr_space(dev);
+			optlen += ndisc_redirect_opt_addr_space(dev, neigh,
+								ops_data_buf,
+								&ops_data);
 		} else
 			read_unlock_bh(&neigh->lock);
 
@@ -1600,7 +1637,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 	 */
 
 	if (ha)
-		ndisc_fill_addr_option(buff, ND_OPT_TARGET_LL_ADDR, ha);
+		ndisc_fill_redirect_addr_option(buff, ha, ops_data);
 
 	/*
 	 *	build redirect option and copy skb over to the new packet.
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d51a1a48b839..9e1516785dac 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2201,7 +2201,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 	 *	first-hop router for the specified ICMP Destination Address.
 	 */
 
-	if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
+	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
 		return;
 	}
@@ -2236,12 +2236,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 	 *	We have finally decided to accept it.
 	 */
 
-	neigh_update(neigh, lladdr, NUD_STALE,
+	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
 		     NEIGH_UPDATE_F_OVERRIDE|
 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
-				     NEIGH_UPDATE_F_ISROUTER))
-		     );
+				     NEIGH_UPDATE_F_ISROUTER)),
+		     NDISC_REDIRECT, &ndopts);
 
 	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
 	if (!nrt)
-- 
cgit v1.2.3


From cc84b3c6b48ae81748c5e25d3558872385196162 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Wed, 15 Jun 2016 21:20:24 +0200
Subject: ipv6: export several functions

This patch exports some neighbour discovery functions which can be used
by 6lowpan neighbour discovery ops functionality then.

Cc: David S. Miller <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h |  7 +++++++
 include/net/ndisc.h    | 12 ++++++++++++
 net/ipv6/addrconf.c    | 15 +++++++--------
 net/ipv6/ndisc.c       | 14 +++-----------
 4 files changed, 29 insertions(+), 19 deletions(-)

(limited to 'include/net')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index b1774eb03f37..9826d3a9464c 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -97,6 +97,13 @@ void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);
 void addrconf_add_linklocal(struct inet6_dev *idev,
 			    const struct in6_addr *addr, u32 flags);
 
+int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
+				 const struct prefix_info *pinfo,
+				 struct inet6_dev *in6_dev,
+				 const struct in6_addr *addr, int addr_type,
+				 u32 addr_flags, bool sllao, bool tokenized,
+				 __u32 valid_lft, u32 prefered_lft);
+
 static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
 {
 	if (dev->addr_len != ETH_ALEN)
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index a5e276703cb3..3f0f41ddbeb0 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -53,6 +53,15 @@ enum {
 
 #include <net/neighbour.h>
 
+/* Set to 3 to get tracing... */
+#define ND_DEBUG 1
+
+#define ND_PRINTK(val, level, fmt, ...)				\
+do {								\
+	if (val <= ND_DEBUG)					\
+		net_##level##_ratelimited(fmt, ##__VA_ARGS__);	\
+} while (0)
+
 struct ctl_table;
 struct inet6_dev;
 struct net_device;
@@ -115,6 +124,9 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
 					  u8 *opt, int opt_len,
 					  struct ndisc_options *ndopts);
 
+void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data,
+			      int data_len, int pad);
+
 #define NDISC_OPS_REDIRECT_DATA_SPACE	2
 
 /*
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index b6e9bdc610f2..6c8fc3f96b11 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2333,14 +2333,12 @@ static bool is_addr_mode_generate_stable(struct inet6_dev *idev)
 	       idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM;
 }
 
-static int addrconf_prefix_rcv_add_addr(struct net *net,
-					struct net_device *dev,
-					const struct prefix_info *pinfo,
-					struct inet6_dev *in6_dev,
-					const struct in6_addr *addr,
-					int addr_type, u32 addr_flags,
-					bool sllao, bool tokenized,
-					__u32 valid_lft, u32 prefered_lft)
+int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
+				 const struct prefix_info *pinfo,
+				 struct inet6_dev *in6_dev,
+				 const struct in6_addr *addr, int addr_type,
+				 u32 addr_flags, bool sllao, bool tokenized,
+				 __u32 valid_lft, u32 prefered_lft)
 {
 	struct inet6_ifaddr *ifp = ipv6_get_ifaddr(net, addr, dev, 1);
 	int create = 0, update_lft = 0;
@@ -2430,6 +2428,7 @@ static int addrconf_prefix_rcv_add_addr(struct net *net,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(addrconf_prefix_rcv_add_addr);
 
 void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
 {
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 2f4afd17954b..fe65cdc28a45 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -73,15 +73,6 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
 
-/* Set to 3 to get tracing... */
-#define ND_DEBUG 1
-
-#define ND_PRINTK(val, level, fmt, ...)				\
-do {								\
-	if (val <= ND_DEBUG)					\
-		net_##level##_ratelimited(fmt, ##__VA_ARGS__);	\
-} while (0)
-
 static u32 ndisc_hash(const void *pkey,
 		      const struct net_device *dev,
 		      __u32 *hash_rnd);
@@ -150,8 +141,8 @@ struct neigh_table nd_tbl = {
 };
 EXPORT_SYMBOL_GPL(nd_tbl);
 
-static void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data,
-				     int data_len, int pad)
+void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data,
+			      int data_len, int pad)
 {
 	int space = __ndisc_opt_addr_space(data_len, pad);
 	u8 *opt = skb_put(skb, space);
@@ -170,6 +161,7 @@ static void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data,
 	if (space > 0)
 		memset(opt, 0, space);
 }
+EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option);
 
 static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type,
 					  void *data, u8 icmp6_type)
-- 
cgit v1.2.3


From bbe5f5cefe2818eda0392c178de141ffc5734d90 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Wed, 15 Jun 2016 21:20:25 +0200
Subject: 6lowpan: introduce 6lowpan-nd

This patch introduce different 6lowpan handling for receive and transmit
NS/NA messages for the ipv6 neighbour discovery. The first use-case is
for supporting 802.15.4 short addresses inside the option fields and
handling for RFC6775 6CO option field as userspace option.

Cc: David S. Miller <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ndisc.h     |  18 ++--
 net/6lowpan/6lowpan_i.h |   4 +
 net/6lowpan/Makefile    |   2 +-
 net/6lowpan/core.c      |   4 +-
 net/6lowpan/ndisc.c     | 234 ++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 254 insertions(+), 8 deletions(-)
 create mode 100644 net/6lowpan/ndisc.c

(limited to 'include/net')

diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 3f0f41ddbeb0..be1fe2283254 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -35,6 +35,7 @@ enum {
 	ND_OPT_ROUTE_INFO = 24,		/* RFC4191 */
 	ND_OPT_RDNSS = 25,		/* RFC5006 */
 	ND_OPT_DNSSL = 31,		/* RFC6106 */
+	ND_OPT_6CO = 34,		/* RFC6775 */
 	__ND_OPT_MAX
 };
 
@@ -109,14 +110,19 @@ struct ndisc_options {
 #endif
 	struct nd_opt_hdr *nd_useropts;
 	struct nd_opt_hdr *nd_useropts_end;
+#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
+	struct nd_opt_hdr *nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR + 1];
+#endif
 };
 
-#define nd_opts_src_lladdr	nd_opt_array[ND_OPT_SOURCE_LL_ADDR]
-#define nd_opts_tgt_lladdr	nd_opt_array[ND_OPT_TARGET_LL_ADDR]
-#define nd_opts_pi		nd_opt_array[ND_OPT_PREFIX_INFO]
-#define nd_opts_pi_end		nd_opt_array[__ND_OPT_PREFIX_INFO_END]
-#define nd_opts_rh		nd_opt_array[ND_OPT_REDIRECT_HDR]
-#define nd_opts_mtu		nd_opt_array[ND_OPT_MTU]
+#define nd_opts_src_lladdr		nd_opt_array[ND_OPT_SOURCE_LL_ADDR]
+#define nd_opts_tgt_lladdr		nd_opt_array[ND_OPT_TARGET_LL_ADDR]
+#define nd_opts_pi			nd_opt_array[ND_OPT_PREFIX_INFO]
+#define nd_opts_pi_end			nd_opt_array[__ND_OPT_PREFIX_INFO_END]
+#define nd_opts_rh			nd_opt_array[ND_OPT_REDIRECT_HDR]
+#define nd_opts_mtu			nd_opt_array[ND_OPT_MTU]
+#define nd_802154_opts_src_lladdr	nd_802154_opt_array[ND_OPT_SOURCE_LL_ADDR]
+#define nd_802154_opts_tgt_lladdr	nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR]
 
 #define NDISC_OPT_SPACE(len) (((len)+2+7)&~7)
 
diff --git a/net/6lowpan/6lowpan_i.h b/net/6lowpan/6lowpan_i.h
index 97ecc27aeca6..a67caee11929 100644
--- a/net/6lowpan/6lowpan_i.h
+++ b/net/6lowpan/6lowpan_i.h
@@ -12,6 +12,10 @@ static inline bool lowpan_is_ll(const struct net_device *dev,
 	return lowpan_dev(dev)->lltype == lltype;
 }
 
+extern const struct ndisc_ops lowpan_ndisc_ops;
+
+int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev);
+
 #ifdef CONFIG_6LOWPAN_DEBUGFS
 int lowpan_dev_debugfs_init(struct net_device *dev);
 void lowpan_dev_debugfs_exit(struct net_device *dev);
diff --git a/net/6lowpan/Makefile b/net/6lowpan/Makefile
index e44f3bf2dd42..12d131ab2324 100644
--- a/net/6lowpan/Makefile
+++ b/net/6lowpan/Makefile
@@ -1,6 +1,6 @@
 obj-$(CONFIG_6LOWPAN) += 6lowpan.o
 
-6lowpan-y := core.o iphc.o nhc.o
+6lowpan-y := core.o iphc.o nhc.o ndisc.o
 6lowpan-$(CONFIG_6LOWPAN_DEBUGFS) += debugfs.o
 
 #rfc6282 nhcs
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c
index 1c7a42b48524..5945f7e19c67 100644
--- a/net/6lowpan/core.c
+++ b/net/6lowpan/core.c
@@ -34,6 +34,8 @@ int lowpan_register_netdevice(struct net_device *dev,
 	for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
 		lowpan_dev(dev)->ctx.table[i].id = i;
 
+	dev->ndisc_ops = &lowpan_ndisc_ops;
+
 	ret = register_netdevice(dev);
 	if (ret < 0)
 		return ret;
@@ -73,7 +75,7 @@ void lowpan_unregister_netdev(struct net_device *dev)
 }
 EXPORT_SYMBOL(lowpan_unregister_netdev);
 
-static int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev)
+int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev)
 {
 	struct wpan_dev *wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr;
 
diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c
new file mode 100644
index 000000000000..ae1d4199aa4c
--- /dev/null
+++ b/net/6lowpan/ndisc.c
@@ -0,0 +1,234 @@
+/* This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors:
+ * (C) 2016 Pengutronix, Alexander Aring <aar@pengutronix.de>
+ */
+
+#include <net/6lowpan.h>
+#include <net/addrconf.h>
+#include <net/ndisc.h>
+
+#include "6lowpan_i.h"
+
+static int lowpan_ndisc_is_useropt(u8 nd_opt_type)
+{
+	return nd_opt_type == ND_OPT_6CO;
+}
+
+#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
+#define NDISC_802154_SHORT_ADDR_LENGTH	1
+static int lowpan_ndisc_parse_802154_options(const struct net_device *dev,
+					     struct nd_opt_hdr *nd_opt,
+					     struct ndisc_options *ndopts)
+{
+	switch (nd_opt->nd_opt_len) {
+	case NDISC_802154_SHORT_ADDR_LENGTH:
+		if (ndopts->nd_802154_opt_array[nd_opt->nd_opt_type])
+			ND_PRINTK(2, warn,
+				  "%s: duplicated short addr ND6 option found: type=%d\n",
+				  __func__, nd_opt->nd_opt_type);
+		else
+			ndopts->nd_802154_opt_array[nd_opt->nd_opt_type] = nd_opt;
+		return 1;
+	default:
+		/* all others will be handled by ndisc IPv6 option parsing */
+		return 0;
+	}
+}
+
+static int lowpan_ndisc_parse_options(const struct net_device *dev,
+				      struct nd_opt_hdr *nd_opt,
+				      struct ndisc_options *ndopts)
+{
+	switch (nd_opt->nd_opt_type) {
+	case ND_OPT_SOURCE_LL_ADDR:
+	case ND_OPT_TARGET_LL_ADDR:
+		return lowpan_ndisc_parse_802154_options(dev, nd_opt, ndopts);
+	default:
+		return 0;
+	}
+}
+
+static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags,
+				       u8 icmp6_type,
+				       const struct ndisc_options *ndopts)
+{
+	struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n));
+	u8 *lladdr_short = NULL;
+
+	switch (icmp6_type) {
+	case NDISC_ROUTER_SOLICITATION:
+	case NDISC_ROUTER_ADVERTISEMENT:
+	case NDISC_NEIGHBOUR_SOLICITATION:
+		if (ndopts->nd_802154_opts_src_lladdr) {
+			lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_src_lladdr,
+							     IEEE802154_SHORT_ADDR_LEN, 0);
+			if (!lladdr_short) {
+				ND_PRINTK(2, warn,
+					  "NA: invalid short link-layer address length\n");
+				return;
+			}
+		}
+		break;
+	case NDISC_REDIRECT:
+	case NDISC_NEIGHBOUR_ADVERTISEMENT:
+		if (ndopts->nd_802154_opts_tgt_lladdr) {
+			lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_tgt_lladdr,
+							     IEEE802154_SHORT_ADDR_LEN, 0);
+			if (!lladdr_short) {
+				ND_PRINTK(2, warn,
+					  "NA: invalid short link-layer address length\n");
+				return;
+			}
+		}
+		break;
+	default:
+		break;
+	}
+
+	write_lock_bh(&n->lock);
+	if (lladdr_short)
+		ieee802154_be16_to_le16(&neigh->short_addr, lladdr_short);
+	else
+		neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
+	write_unlock_bh(&n->lock);
+}
+
+static void lowpan_ndisc_update(const struct net_device *dev,
+				struct neighbour *n, u32 flags, u8 icmp6_type,
+				const struct ndisc_options *ndopts)
+{
+	if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
+		return;
+
+	/* react on overrides only. TODO check if this is really right. */
+	if (flags & NEIGH_UPDATE_F_OVERRIDE)
+		lowpan_ndisc_802154_update(n, flags, icmp6_type, ndopts);
+}
+
+static int lowpan_ndisc_opt_addr_space(const struct net_device *dev,
+				       u8 icmp6_type, struct neighbour *neigh,
+				       u8 *ha_buf, u8 **ha)
+{
+	struct lowpan_802154_neigh *n;
+	struct wpan_dev *wpan_dev;
+	int addr_space = 0;
+
+	if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
+		return 0;
+
+	switch (icmp6_type) {
+	case NDISC_REDIRECT:
+		n = lowpan_802154_neigh(neighbour_priv(neigh));
+
+		read_lock_bh(&neigh->lock);
+		if (lowpan_802154_is_valid_src_short_addr(n->short_addr)) {
+			memcpy(ha_buf, &n->short_addr,
+			       IEEE802154_SHORT_ADDR_LEN);
+			read_unlock_bh(&neigh->lock);
+			addr_space += __ndisc_opt_addr_space(IEEE802154_SHORT_ADDR_LEN, 0);
+			*ha = ha_buf;
+		}
+		read_unlock_bh(&neigh->lock);
+		break;
+	case NDISC_NEIGHBOUR_ADVERTISEMENT:
+	case NDISC_NEIGHBOUR_SOLICITATION:
+	case NDISC_ROUTER_SOLICITATION:
+		wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr;
+
+		if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr))
+			addr_space = __ndisc_opt_addr_space(IEEE802154_SHORT_ADDR_LEN, 0);
+		break;
+	default:
+		break;
+	}
+
+	return addr_space;
+}
+
+static void lowpan_ndisc_fill_addr_option(const struct net_device *dev,
+					  struct sk_buff *skb, u8 icmp6_type,
+					  const u8 *ha)
+{
+	struct wpan_dev *wpan_dev;
+	__be16 short_addr;
+	u8 opt_type;
+
+	if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
+		return;
+
+	switch (icmp6_type) {
+	case NDISC_REDIRECT:
+		if (ha) {
+			ieee802154_le16_to_be16(&short_addr, ha);
+			__ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR,
+						 &short_addr,
+						 IEEE802154_SHORT_ADDR_LEN, 0);
+		}
+		return;
+	case NDISC_NEIGHBOUR_ADVERTISEMENT:
+		opt_type = ND_OPT_TARGET_LL_ADDR;
+		break;
+	case NDISC_ROUTER_SOLICITATION:
+	case NDISC_NEIGHBOUR_SOLICITATION:
+		opt_type = ND_OPT_SOURCE_LL_ADDR;
+		break;
+	default:
+		return;
+	}
+
+	wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr;
+
+	if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) {
+		ieee802154_le16_to_be16(&short_addr,
+					&wpan_dev->short_addr);
+		__ndisc_fill_addr_option(skb, opt_type, &short_addr,
+					 IEEE802154_SHORT_ADDR_LEN, 0);
+	}
+}
+
+static void lowpan_ndisc_prefix_rcv_add_addr(struct net *net,
+					     struct net_device *dev,
+					     const struct prefix_info *pinfo,
+					     struct inet6_dev *in6_dev,
+					     struct in6_addr *addr,
+					     int addr_type, u32 addr_flags,
+					     bool sllao, bool tokenized,
+					     __u32 valid_lft,
+					     u32 prefered_lft,
+					     bool dev_addr_generated)
+{
+	int err;
+
+	/* generates short based address for RA PIO's */
+	if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) && dev_addr_generated &&
+	    !addrconf_ifid_802154_6lowpan(addr->s6_addr + 8, dev)) {
+		err = addrconf_prefix_rcv_add_addr(net, dev, pinfo, in6_dev,
+						   addr, addr_type, addr_flags,
+						   sllao, tokenized, valid_lft,
+						   prefered_lft);
+		if (err)
+			ND_PRINTK(2, warn,
+				  "RA: could not add a short address based address for prefix: %pI6c\n",
+				  &pinfo->prefix);
+	}
+}
+#endif
+
+const struct ndisc_ops lowpan_ndisc_ops = {
+	.is_useropt		= lowpan_ndisc_is_useropt,
+#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
+	.parse_options		= lowpan_ndisc_parse_options,
+	.update			= lowpan_ndisc_update,
+	.opt_addr_space		= lowpan_ndisc_opt_addr_space,
+	.fill_addr_option	= lowpan_ndisc_fill_addr_option,
+	.prefix_rcv_add_addr	= lowpan_ndisc_prefix_rcv_add_addr,
+#endif
+};
-- 
cgit v1.2.3


From 22a59be8b7693eb2d0897a9638f5991f2f8e4ddd Mon Sep 17 00:00:00 2001
From: Philip Prindeville <philipp@redfish-solutions.com>
Date: Tue, 14 Jun 2016 15:53:02 -0600
Subject: net: ipv4: Add ability to have GRE ignore DF bit in IPv4 payloads

    In the presence of firewalls which improperly block ICMP Unreachable
    (including Fragmentation Required) messages, Path MTU Discovery is
    prevented from working.

    A workaround is to handle IPv4 payloads opaquely, ignoring the DF bit--as
    is done for other payloads like AppleTalk--and doing transparent
    fragmentation and reassembly.

    Redux includes the enforcement of mutual exclusion between this feature
    and Path MTU Discovery as suggested by Alexander Duyck.

    Cc: Alexander Duyck <alexander.duyck@gmail.com>
    Reviewed-by: Stephen Hemminger <stephen@networkplumber.org>
    Signed-off-by: Philip Prindeville <philipp@redfish-solutions.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h       |  1 +
 include/uapi/linux/if_tunnel.h |  1 +
 net/ipv4/ip_gre.c              | 42 +++++++++++++++++++++++++++++++++---------
 net/ipv4/ip_tunnel.c           |  2 +-
 4 files changed, 36 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index dbf444428437..9222678426a1 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -132,6 +132,7 @@ struct ip_tunnel {
 	int			ip_tnl_net_id;
 	struct gro_cells	gro_cells;
 	bool			collect_md;
+	bool			ignore_df;
 };
 
 #define TUNNEL_CSUM		__cpu_to_be16(0x01)
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index af4de90ba27d..1046f5515174 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -113,6 +113,7 @@ enum {
 	IFLA_GRE_ENCAP_SPORT,
 	IFLA_GRE_ENCAP_DPORT,
 	IFLA_GRE_COLLECT_METADATA,
+	IFLA_GRE_IGNORE_DF,
 	__IFLA_GRE_MAX,
 };
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 4d2025f7ec57..0f8ca3fca00a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -841,17 +841,19 @@ out:
 	return ipgre_tunnel_validate(tb, data);
 }
 
-static void ipgre_netlink_parms(struct net_device *dev,
+static int ipgre_netlink_parms(struct net_device *dev,
 				struct nlattr *data[],
 				struct nlattr *tb[],
 				struct ip_tunnel_parm *parms)
 {
+	struct ip_tunnel *t = netdev_priv(dev);
+
 	memset(parms, 0, sizeof(*parms));
 
 	parms->iph.protocol = IPPROTO_GRE;
 
 	if (!data)
-		return;
+		return 0;
 
 	if (data[IFLA_GRE_LINK])
 		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
@@ -880,16 +882,26 @@ static void ipgre_netlink_parms(struct net_device *dev,
 	if (data[IFLA_GRE_TOS])
 		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
 
-	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
+	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
+		if (t->ignore_df)
+			return -EINVAL;
 		parms->iph.frag_off = htons(IP_DF);
+	}
 
 	if (data[IFLA_GRE_COLLECT_METADATA]) {
-		struct ip_tunnel *t = netdev_priv(dev);
-
 		t->collect_md = true;
 		if (dev->type == ARPHRD_IPGRE)
 			dev->type = ARPHRD_NONE;
 	}
+
+	if (data[IFLA_GRE_IGNORE_DF]) {
+		if (nla_get_u8(data[IFLA_GRE_IGNORE_DF])
+		  && (parms->iph.frag_off & htons(IP_DF)))
+			return -EINVAL;
+		t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
+	}
+
+	return 0;
 }
 
 /* This function returns true when ENCAP attributes are present in the nl msg */
@@ -960,16 +972,19 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev,
 {
 	struct ip_tunnel_parm p;
 	struct ip_tunnel_encap ipencap;
+	int err;
 
 	if (ipgre_netlink_encap_parms(data, &ipencap)) {
 		struct ip_tunnel *t = netdev_priv(dev);
-		int err = ip_tunnel_encap_setup(t, &ipencap);
+		err = ip_tunnel_encap_setup(t, &ipencap);
 
 		if (err < 0)
 			return err;
 	}
 
-	ipgre_netlink_parms(dev, data, tb, &p);
+	err = ipgre_netlink_parms(dev, data, tb, &p);
+	if (err < 0)
+		return err;
 	return ip_tunnel_newlink(dev, tb, &p);
 }
 
@@ -978,16 +993,19 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
 {
 	struct ip_tunnel_parm p;
 	struct ip_tunnel_encap ipencap;
+	int err;
 
 	if (ipgre_netlink_encap_parms(data, &ipencap)) {
 		struct ip_tunnel *t = netdev_priv(dev);
-		int err = ip_tunnel_encap_setup(t, &ipencap);
+		err = ip_tunnel_encap_setup(t, &ipencap);
 
 		if (err < 0)
 			return err;
 	}
 
-	ipgre_netlink_parms(dev, data, tb, &p);
+	err = ipgre_netlink_parms(dev, data, tb, &p);
+	if (err < 0)
+		return err;
 	return ip_tunnel_changelink(dev, tb, &p);
 }
 
@@ -1024,6 +1042,8 @@ static size_t ipgre_get_size(const struct net_device *dev)
 		nla_total_size(2) +
 		/* IFLA_GRE_COLLECT_METADATA */
 		nla_total_size(0) +
+		/* IFLA_GRE_IGNORE_DF */
+		nla_total_size(1) +
 		0;
 }
 
@@ -1057,6 +1077,9 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			t->encap.flags))
 		goto nla_put_failure;
 
+	if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
+		goto nla_put_failure;
+
 	if (t->collect_md) {
 		if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
 			goto nla_put_failure;
@@ -1084,6 +1107,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_ENCAP_SPORT]	= { .type = NLA_U16 },
 	[IFLA_GRE_ENCAP_DPORT]	= { .type = NLA_U16 },
 	[IFLA_GRE_COLLECT_METADATA]	= { .type = NLA_FLAG },
+	[IFLA_GRE_IGNORE_DF]	= { .type = NLA_U8 },
 };
 
 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index d8f5e0a269f5..95649ebd2874 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -682,7 +682,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	}
 
 	df = tnl_params->frag_off;
-	if (skb->protocol == htons(ETH_P_IP))
+	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
 		df |= (inner_iph->frag_off&htons(IP_DF));
 
 	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
-- 
cgit v1.2.3


From 86a98057256020e75e1be0f88d7617491a06e8f1 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Thu, 16 Jun 2016 12:20:44 -0700
Subject: vxlan/geneve: Include udp_tunnel.h in vxlan/geneve.h and fixup
 includes

This patch makes it so that we add udp_tunnel.h to vxlan.h and geneve.h
header files.  This is useful as I plan to move the generic handlers for
the port offloads into the udp_tunnel header file and leave the vxlan and
geneve headers to be a bit more protocol specific.

I also went through and cleaned out a number of redundant includes that
where in the .h and .c files for these drivers.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c     |  1 -
 drivers/net/vxlan.c      | 17 -----------------
 include/net/geneve.h     |  3 ---
 include/net/udp_tunnel.h |  2 ++
 include/net/vxlan.h      |  6 +-----
 5 files changed, 3 insertions(+), 26 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index cadefe4fdaa2..e5e33cd01082 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -12,7 +12,6 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/hash.h>
 #include <net/dst_metadata.h>
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index f999db2f97b4..10ad41d60652 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -11,32 +11,18 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/kernel.h>
-#include <linux/types.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
-#include <linux/skbuff.h>
-#include <linux/rculist.h>
-#include <linux/netdevice.h>
-#include <linux/in.h>
-#include <linux/ip.h>
 #include <linux/udp.h>
 #include <linux/igmp.h>
-#include <linux/etherdevice.h>
 #include <linux/if_ether.h>
-#include <linux/if_vlan.h>
-#include <linux/hash.h>
 #include <linux/ethtool.h>
 #include <net/arp.h>
 #include <net/ndisc.h>
 #include <net/ip.h>
-#include <net/ip_tunnels.h>
 #include <net/icmp.h>
-#include <net/udp.h>
-#include <net/udp_tunnel.h>
 #include <net/rtnetlink.h>
-#include <net/route.h>
-#include <net/dsfield.h>
 #include <net/inet_ecn.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
@@ -44,12 +30,9 @@
 #include <net/protocol.h>
 
 #if IS_ENABLED(CONFIG_IPV6)
-#include <net/ipv6.h>
-#include <net/addrconf.h>
 #include <net/ip6_tunnel.h>
 #include <net/ip6_checksum.h>
 #endif
-#include <net/dst_metadata.h>
 
 #define VXLAN_VERSION	"0.1"
 
diff --git a/include/net/geneve.h b/include/net/geneve.h
index cb544a530146..f8aff18d6702 100644
--- a/include/net/geneve.h
+++ b/include/net/geneve.h
@@ -1,10 +1,7 @@
 #ifndef __NET_GENEVE_H
 #define __NET_GENEVE_H  1
 
-#ifdef CONFIG_INET
 #include <net/udp_tunnel.h>
-#endif
-
 
 /* Geneve Header:
  *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 9d14f707e534..59019562d14c 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -105,12 +105,14 @@ struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
 				    __be16 flags, __be64 tunnel_id,
 				    int md_size);
 
+#ifdef CONFIG_INET
 static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum)
 {
 	int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
 
 	return iptunnel_handle_offloads(skb, type);
 }
+#endif
 
 static inline void udp_tunnel_encap_enable(struct socket *sock)
 {
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index b8803165df91..7d944941f32f 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -1,12 +1,8 @@
 #ifndef __NET_VXLAN_H
 #define __NET_VXLAN_H 1
 
-#include <linux/ip.h>
-#include <linux/ipv6.h>
 #include <linux/if_vlan.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/udp.h>
+#include <net/udp_tunnel.h>
 #include <net/dst_metadata.h>
 
 /* VXLAN protocol (RFC 7348) header:
-- 
cgit v1.2.3


From e7b3db5e60e8f471c3f5ef93b497bafe5863e56a Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Thu, 16 Jun 2016 12:20:52 -0700
Subject: net: Combine GENEVE and VXLAN port notifiers into single functions

This patch merges the GENEVE and VXLAN code so that both functions pass
through a shared code path.  This way we can start the effort of using a
single function on the network device drivers to handle both of these
tunnel types.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c     |  58 ++++-----------------------
 drivers/net/vxlan.c      |  60 +++++-----------------------
 include/net/udp_tunnel.h |  33 +++++++++++++++
 net/ipv4/udp_tunnel.c    | 102 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 151 insertions(+), 102 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index e5e33cd01082..af6f676ca444 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -396,23 +396,6 @@ static struct socket *geneve_create_sock(struct net *net, bool ipv6,
 	return sock;
 }
 
-static void geneve_notify_add_rx_port(struct geneve_sock *gs)
-{
-	struct net_device *dev;
-	struct sock *sk = gs->sock->sk;
-	struct net *net = sock_net(sk);
-	sa_family_t sa_family = geneve_get_sk_family(gs);
-	__be16 port = inet_sk(sk)->inet_sport;
-
-	rcu_read_lock();
-	for_each_netdev_rcu(net, dev) {
-		if (dev->netdev_ops->ndo_add_geneve_port)
-			dev->netdev_ops->ndo_add_geneve_port(dev, sa_family,
-							     port);
-	}
-	rcu_read_unlock();
-}
-
 static int geneve_hlen(struct genevehdr *gh)
 {
 	return sizeof(*gh) + gh->opt_len * 4;
@@ -532,7 +515,7 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
 		INIT_HLIST_HEAD(&gs->vni_list[h]);
 
 	/* Initialize the geneve udp offloads structure */
-	geneve_notify_add_rx_port(gs);
+	udp_tunnel_notify_add_rx_port(gs->sock, UDP_TUNNEL_TYPE_GENEVE);
 
 	/* Mark socket as an encapsulation socket */
 	memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
@@ -547,31 +530,13 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
 	return gs;
 }
 
-static void geneve_notify_del_rx_port(struct geneve_sock *gs)
-{
-	struct net_device *dev;
-	struct sock *sk = gs->sock->sk;
-	struct net *net = sock_net(sk);
-	sa_family_t sa_family = geneve_get_sk_family(gs);
-	__be16 port = inet_sk(sk)->inet_sport;
-
-	rcu_read_lock();
-	for_each_netdev_rcu(net, dev) {
-		if (dev->netdev_ops->ndo_del_geneve_port)
-			dev->netdev_ops->ndo_del_geneve_port(dev, sa_family,
-							     port);
-	}
-
-	rcu_read_unlock();
-}
-
 static void __geneve_sock_release(struct geneve_sock *gs)
 {
 	if (!gs || --gs->refcnt)
 		return;
 
 	list_del(&gs->list);
-	geneve_notify_del_rx_port(gs);
+	udp_tunnel_notify_del_rx_port(gs->sock, UDP_TUNNEL_TYPE_GENEVE);
 	udp_tunnel_sock_release(gs->sock);
 	kfree_rcu(gs, rcu);
 }
@@ -1164,29 +1129,20 @@ static struct device_type geneve_type = {
 	.name = "geneve",
 };
 
-/* Calls the ndo_add_geneve_port of the caller in order to
+/* Calls the ndo_add_udp_enc_port of the caller in order to
  * supply the listening GENEVE udp ports. Callers are expected
- * to implement the ndo_add_geneve_port.
+ * to implement the ndo_add_udp_enc_port.
  */
 static void geneve_push_rx_ports(struct net_device *dev)
 {
 	struct net *net = dev_net(dev);
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_sock *gs;
-	sa_family_t sa_family;
-	struct sock *sk;
-	__be16 port;
-
-	if (!dev->netdev_ops->ndo_add_geneve_port)
-		return;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(gs, &gn->sock_list, list) {
-		sk = gs->sock->sk;
-		sa_family = sk->sk_family;
-		port = inet_sk(sk)->inet_sport;
-		dev->netdev_ops->ndo_add_geneve_port(dev, sa_family, port);
-	}
+	list_for_each_entry_rcu(gs, &gn->sock_list, list)
+		udp_tunnel_push_rx_port(dev, gs->sock,
+					UDP_TUNNEL_TYPE_GENEVE);
 	rcu_read_unlock();
 }
 
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 10ad41d60652..adbd979da22d 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -602,42 +602,6 @@ static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
 	return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
 }
 
-/* Notify netdevs that UDP port started listening */
-static void vxlan_notify_add_rx_port(struct vxlan_sock *vs)
-{
-	struct net_device *dev;
-	struct sock *sk = vs->sock->sk;
-	struct net *net = sock_net(sk);
-	sa_family_t sa_family = vxlan_get_sk_family(vs);
-	__be16 port = inet_sk(sk)->inet_sport;
-
-	rcu_read_lock();
-	for_each_netdev_rcu(net, dev) {
-		if (dev->netdev_ops->ndo_add_vxlan_port)
-			dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family,
-							    port);
-	}
-	rcu_read_unlock();
-}
-
-/* Notify netdevs that UDP port is no more listening */
-static void vxlan_notify_del_rx_port(struct vxlan_sock *vs)
-{
-	struct net_device *dev;
-	struct sock *sk = vs->sock->sk;
-	struct net *net = sock_net(sk);
-	sa_family_t sa_family = vxlan_get_sk_family(vs);
-	__be16 port = inet_sk(sk)->inet_sport;
-
-	rcu_read_lock();
-	for_each_netdev_rcu(net, dev) {
-		if (dev->netdev_ops->ndo_del_vxlan_port)
-			dev->netdev_ops->ndo_del_vxlan_port(dev, sa_family,
-							    port);
-	}
-	rcu_read_unlock();
-}
-
 /* Add new entry to forwarding table -- assumes lock held */
 static int vxlan_fdb_create(struct vxlan_dev *vxlan,
 			    const u8 *mac, union vxlan_addr *ip,
@@ -1033,7 +997,8 @@ static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
 	vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id);
 	spin_lock(&vn->sock_lock);
 	hlist_del_rcu(&vs->hlist);
-	vxlan_notify_del_rx_port(vs);
+	udp_tunnel_notify_del_rx_port(vs->sock,
+				      UDP_TUNNEL_TYPE_VXLAN);
 	spin_unlock(&vn->sock_lock);
 
 	return true;
@@ -2508,30 +2473,22 @@ static struct device_type vxlan_type = {
 	.name = "vxlan",
 };
 
-/* Calls the ndo_add_vxlan_port of the caller in order to
+/* Calls the ndo_add_udp_enc_port of the caller in order to
  * supply the listening VXLAN udp ports. Callers are expected
- * to implement the ndo_add_vxlan_port.
+ * to implement the ndo_add_udp_enc_port.
  */
 static void vxlan_push_rx_ports(struct net_device *dev)
 {
 	struct vxlan_sock *vs;
 	struct net *net = dev_net(dev);
 	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
-	sa_family_t sa_family;
-	__be16 port;
 	unsigned int i;
 
-	if (!dev->netdev_ops->ndo_add_vxlan_port)
-		return;
-
 	spin_lock(&vn->sock_lock);
 	for (i = 0; i < PORT_HASH_SIZE; ++i) {
-		hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) {
-			port = inet_sk(vs->sock->sk)->inet_sport;
-			sa_family = vxlan_get_sk_family(vs);
-			dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family,
-							    port);
-		}
+		hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist)
+			udp_tunnel_push_rx_port(dev, vs->sock,
+						UDP_TUNNEL_TYPE_VXLAN);
 	}
 	spin_unlock(&vn->sock_lock);
 }
@@ -2733,7 +2690,8 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
 
 	spin_lock(&vn->sock_lock);
 	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
-	vxlan_notify_add_rx_port(vs);
+	udp_tunnel_notify_add_rx_port(sock,
+				      UDP_TUNNEL_TYPE_VXLAN);
 	spin_unlock(&vn->sock_lock);
 
 	/* Mark socket as an encapsulation socket. */
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 59019562d14c..71afbea873a0 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -84,6 +84,39 @@ struct udp_tunnel_sock_cfg {
 void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
 			   struct udp_tunnel_sock_cfg *sock_cfg);
 
+/* -- List of parsable UDP tunnel types --
+ *
+ * Adding to this list will result in serious debate.  The main issue is
+ * that this list is essentially a list of workarounds for either poorly
+ * designed tunnels, or poorly designed device offloads.
+ *
+ * The parsing supported via these types should really be used for Rx
+ * traffic only as the network stack will have already inserted offsets for
+ * the location of the headers in the skb.  In addition any ports that are
+ * pushed should be kept within the namespace without leaking to other
+ * devices such as VFs or other ports on the same device.
+ *
+ * It is strongly encouraged to use CHECKSUM_COMPLETE for Rx to avoid the
+ * need to use this for Rx checksum offload.  It should not be necessary to
+ * call this function to perform Tx offloads on outgoing traffic.
+ */
+enum udp_parsable_tunnel_type {
+	UDP_TUNNEL_TYPE_VXLAN,		/* RFC 7348 */
+	UDP_TUNNEL_TYPE_GENEVE,		/* draft-ietf-nvo3-geneve */
+};
+
+struct udp_tunnel_info {
+	unsigned short type;
+	sa_family_t sa_family;
+	__be16 port;
+};
+
+/* Notify network devices of offloadable types */
+void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock,
+			     unsigned short type);
+void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type);
+void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type);
+
 /* Transmit the skb using UDP encapsulation. */
 void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
 			 __be32 src, __be32 dst, __u8 tos, __u8 ttl,
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 47f12c73d959..8174753e6494 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -76,6 +76,108 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
 }
 EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
 
+static void __udp_tunnel_push_rx_port(struct net_device *dev,
+				      struct udp_tunnel_info *ti)
+{
+	switch (ti->type) {
+	case UDP_TUNNEL_TYPE_VXLAN:
+		if (!dev->netdev_ops->ndo_add_vxlan_port)
+			break;
+
+		dev->netdev_ops->ndo_add_vxlan_port(dev,
+						    ti->sa_family,
+						    ti->port);
+		break;
+	case UDP_TUNNEL_TYPE_GENEVE:
+		if (!dev->netdev_ops->ndo_add_geneve_port)
+			break;
+
+		dev->netdev_ops->ndo_add_geneve_port(dev,
+						     ti->sa_family,
+						     ti->port);
+		break;
+	default:
+		break;
+	}
+}
+
+void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock,
+			     unsigned short type)
+{
+	struct sock *sk = sock->sk;
+	struct udp_tunnel_info ti;
+
+	ti.type = type;
+	ti.sa_family = sk->sk_family;
+	ti.port = inet_sk(sk)->inet_sport;
+
+	__udp_tunnel_push_rx_port(dev, &ti);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port);
+
+/* Notify netdevs that UDP port started listening */
+void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type)
+{
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	struct udp_tunnel_info ti;
+	struct net_device *dev;
+
+	ti.type = type;
+	ti.sa_family = sk->sk_family;
+	ti.port = inet_sk(sk)->inet_sport;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev)
+		__udp_tunnel_push_rx_port(dev, &ti);
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port);
+
+static void __udp_tunnel_pull_rx_port(struct net_device *dev,
+				      struct udp_tunnel_info *ti)
+{
+	switch (ti->type) {
+	case UDP_TUNNEL_TYPE_VXLAN:
+		if (!dev->netdev_ops->ndo_del_vxlan_port)
+			break;
+
+		dev->netdev_ops->ndo_del_vxlan_port(dev,
+						    ti->sa_family,
+						    ti->port);
+		break;
+	case UDP_TUNNEL_TYPE_GENEVE:
+		if (!dev->netdev_ops->ndo_del_geneve_port)
+			break;
+
+		dev->netdev_ops->ndo_del_geneve_port(dev,
+						     ti->sa_family,
+						     ti->port);
+		break;
+	default:
+		break;
+	}
+}
+
+/* Notify netdevs that UDP port is no more listening */
+void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type)
+{
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	struct udp_tunnel_info ti;
+	struct net_device *dev;
+
+	ti.type = type;
+	ti.sa_family = sk->sk_family;
+	ti.port = inet_sk(sk)->inet_sport;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev)
+		__udp_tunnel_pull_rx_port(dev, &ti);
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port);
+
 void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
 			 __be32 src, __be32 dst, __u8 tos, __u8 ttl,
 			 __be16 df, __be16 src_port, __be16 dst_port,
-- 
cgit v1.2.3


From 7c46a640de6fcc4f35d0702710356a024eadf68f Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Thu, 16 Jun 2016 12:21:00 -0700
Subject: net: Merge VXLAN and GENEVE push notifiers into a single notifier

This patch merges the notifiers for VXLAN and GENEVE into a single UDP
tunnel notifier.  The idea is that we will want to only have to make one
notifier call to receive the list of ports for VXLAN and GENEVE tunnels
that need to be offloaded.

In addition we add a new set of ndo functions named ndo_udp_tunnel_add and
ndo_udp_tunnel_del that are meant to allow us to track the tunnel meta-data
such as port and address family as tunnels are added and removed.  The
tunnel meta-data is now transported in a structure named udp_tunnel_info
which for now carries the type, address family, and port number.  In the
future this could be updated so that we can include a tuple of values
including things such as the destination IP address and other fields.

I also ended up going with a naming scheme that consisted of using the
prefix udp_tunnel on function names.  I applied this to the notifier and
ndo ops as well so that it hopefully points to the fact that these are
primarily used in the udp_tunnel functions.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c      |  2 +-
 drivers/net/vxlan.c       |  2 +-
 include/linux/netdevice.h | 22 ++++++++++++++++++++--
 include/net/geneve.h      |  3 +--
 include/net/udp_tunnel.h  |  6 ++++++
 include/net/vxlan.h       |  4 ++--
 net/ipv4/udp_tunnel.c     | 10 ++++++++++
 7 files changed, 41 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index af6f676ca444..aa61708bea69 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1497,7 +1497,7 @@ static int geneve_netdevice_event(struct notifier_block *unused,
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 
-	if (event == NETDEV_OFFLOAD_PUSH_GENEVE)
+	if (event == NETDEV_UDP_TUNNEL_PUSH_INFO)
 		geneve_push_rx_ports(dev);
 
 	return NOTIFY_DONE;
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index adbd979da22d..31aeec967175 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -3239,7 +3239,7 @@ static int vxlan_netdevice_event(struct notifier_block *unused,
 
 	if (event == NETDEV_UNREGISTER)
 		vxlan_handle_lowerdev_unregister(vn, dev);
-	else if (event == NETDEV_OFFLOAD_PUSH_VXLAN)
+	else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO)
 		vxlan_push_rx_ports(dev);
 
 	return NOTIFY_DONE;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 890158e99159..577d2a1814b1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -61,6 +61,8 @@ struct wireless_dev;
 /* 802.15.4 specific */
 struct wpan_dev;
 struct mpls_dev;
+/* UDP Tunnel offloads */
+struct udp_tunnel_info;
 
 void netdev_set_default_ethtool_ops(struct net_device *dev,
 				    const struct ethtool_ops *ops);
@@ -1050,6 +1052,19 @@ struct tc_to_netdev {
  *	address family that vxlan is not listening to anymore. The operation
  *	is protected by the vxlan_net->sock_lock.
  *
+ * void (*ndo_udp_tunnel_add)(struct net_device *dev,
+ *			      struct udp_tunnel_info *ti);
+ *	Called by UDP tunnel to notify a driver about the UDP port and socket
+ *	address family that a UDP tunnel is listnening to. It is called only
+ *	when a new port starts listening. The operation is protected by the
+ *	RTNL.
+ *
+ * void (*ndo_udp_tunnel_del)(struct net_device *dev,
+ *			      struct udp_tunnel_info *ti);
+ *	Called by UDP tunnel to notify the driver about a UDP port and socket
+ *	address family that the UDP tunnel is not listening to anymore. The
+ *	operation is protected by the RTNL.
+ *
  * void* (*ndo_dfwd_add_station)(struct net_device *pdev,
  *				 struct net_device *dev)
  *	Called by upper layer devices to accelerate switching or other
@@ -1269,6 +1284,10 @@ struct net_device_ops {
 	void			(*ndo_del_geneve_port)(struct  net_device *dev,
 						       sa_family_t sa_family,
 						       __be16 port);
+	void			(*ndo_udp_tunnel_add)(struct net_device *dev,
+						      struct udp_tunnel_info *ti);
+	void			(*ndo_udp_tunnel_del)(struct net_device *dev,
+						      struct udp_tunnel_info *ti);
 	void*			(*ndo_dfwd_add_station)(struct net_device *pdev,
 							struct net_device *dev);
 	void			(*ndo_dfwd_del_station)(struct net_device *pdev,
@@ -2255,8 +2274,7 @@ struct netdev_lag_lower_state_info {
 #define NETDEV_BONDING_INFO	0x0019
 #define NETDEV_PRECHANGEUPPER	0x001A
 #define NETDEV_CHANGELOWERSTATE	0x001B
-#define NETDEV_OFFLOAD_PUSH_VXLAN	0x001C
-#define NETDEV_OFFLOAD_PUSH_GENEVE	0x001D
+#define NETDEV_UDP_TUNNEL_PUSH_INFO	0x001C
 
 int register_netdevice_notifier(struct notifier_block *nb);
 int unregister_netdevice_notifier(struct notifier_block *nb);
diff --git a/include/net/geneve.h b/include/net/geneve.h
index f8aff18d6702..3410c4b5a382 100644
--- a/include/net/geneve.h
+++ b/include/net/geneve.h
@@ -61,8 +61,7 @@ struct genevehdr {
 
 static inline void geneve_get_rx_port(struct net_device *netdev)
 {
-	ASSERT_RTNL();
-	call_netdevice_notifiers(NETDEV_OFFLOAD_PUSH_GENEVE, netdev);
+	udp_tunnel_get_rx_info(netdev);
 }
 
 #ifdef CONFIG_INET
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 71afbea873a0..1c9408a04213 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -117,6 +117,12 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock,
 void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type);
 void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type);
 
+static inline void udp_tunnel_get_rx_info(struct net_device *dev)
+{
+	ASSERT_RTNL();
+	call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev);
+}
+
 /* Transmit the skb using UDP encapsulation. */
 void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
 			 __be32 src, __be32 dst, __u8 tos, __u8 ttl,
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 7d944941f32f..c62e2ed1c3af 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -4,6 +4,7 @@
 #include <linux/if_vlan.h>
 #include <net/udp_tunnel.h>
 #include <net/dst_metadata.h>
+#include <net/udp_tunnel.h>
 
 /* VXLAN protocol (RFC 7348) header:
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
@@ -390,8 +391,7 @@ static inline __be32 vxlan_compute_rco(unsigned int start, unsigned int offset)
 
 static inline void vxlan_get_rx_port(struct net_device *netdev)
 {
-	ASSERT_RTNL();
-	call_netdevice_notifiers(NETDEV_OFFLOAD_PUSH_VXLAN, netdev);
+	udp_tunnel_get_rx_info(netdev);
 }
 
 static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs)
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 8174753e6494..683e494d9000 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -79,6 +79,11 @@ EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
 static void __udp_tunnel_push_rx_port(struct net_device *dev,
 				      struct udp_tunnel_info *ti)
 {
+	if (dev->netdev_ops->ndo_udp_tunnel_add) {
+		dev->netdev_ops->ndo_udp_tunnel_add(dev, ti);
+		return;
+	}
+
 	switch (ti->type) {
 	case UDP_TUNNEL_TYPE_VXLAN:
 		if (!dev->netdev_ops->ndo_add_vxlan_port)
@@ -137,6 +142,11 @@ EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port);
 static void __udp_tunnel_pull_rx_port(struct net_device *dev,
 				      struct udp_tunnel_info *ti)
 {
+	if (dev->netdev_ops->ndo_udp_tunnel_del) {
+		dev->netdev_ops->ndo_udp_tunnel_del(dev, ti);
+		return;
+	}
+
 	switch (ti->type) {
 	case UDP_TUNNEL_TYPE_VXLAN:
 		if (!dev->netdev_ops->ndo_del_vxlan_port)
-- 
cgit v1.2.3


From 1938ee1fd3de74d761a60806b048df652666afec Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Thu, 16 Jun 2016 12:23:12 -0700
Subject: net: Remove deprecated tunnel specific UDP offload functions

Now that we have all the drivers using udp_tunnel_get_rx_ports,
ndo_add_udp_enc_rx_port, and ndo_del_udp_enc_rx_port we can drop the
function calls that were specific to VXLAN and GENEVE.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 38 -----------------------
 include/net/geneve.h      |  5 ---
 include/net/vxlan.h       |  5 ---
 net/ipv4/udp_tunnel.c     | 79 +++++++++--------------------------------------
 4 files changed, 14 insertions(+), 113 deletions(-)

(limited to 'include/net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 577d2a1814b1..e84d9d23c2d5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1026,32 +1026,6 @@ struct tc_to_netdev {
  *	not implement this, it is assumed that the hw is not able to have
  *	multiple net devices on single physical port.
  *
- * void (*ndo_add_vxlan_port)(struct  net_device *dev,
- *			      sa_family_t sa_family, __be16 port);
- *	Called by vxlan to notify a driver about the UDP port and socket
- *	address family that vxlan is listening to. It is called only when
- *	a new port starts listening. The operation is protected by the
- *	vxlan_net->sock_lock.
- *
- * void (*ndo_add_geneve_port)(struct net_device *dev,
- *			       sa_family_t sa_family, __be16 port);
- *	Called by geneve to notify a driver about the UDP port and socket
- *	address family that geneve is listnening to. It is called only when
- *	a new port starts listening. The operation is protected by the
- *	geneve_net->sock_lock.
- *
- * void (*ndo_del_geneve_port)(struct net_device *dev,
- *			       sa_family_t sa_family, __be16 port);
- *	Called by geneve to notify the driver about a UDP port and socket
- *	address family that geneve is not listening to anymore. The operation
- *	is protected by the geneve_net->sock_lock.
- *
- * void (*ndo_del_vxlan_port)(struct  net_device *dev,
- *			      sa_family_t sa_family, __be16 port);
- *	Called by vxlan to notify the driver about a UDP port and socket
- *	address family that vxlan is not listening to anymore. The operation
- *	is protected by the vxlan_net->sock_lock.
- *
  * void (*ndo_udp_tunnel_add)(struct net_device *dev,
  *			      struct udp_tunnel_info *ti);
  *	Called by UDP tunnel to notify a driver about the UDP port and socket
@@ -1272,18 +1246,6 @@ struct net_device_ops {
 							struct netdev_phys_item_id *ppid);
 	int			(*ndo_get_phys_port_name)(struct net_device *dev,
 							  char *name, size_t len);
-	void			(*ndo_add_vxlan_port)(struct  net_device *dev,
-						      sa_family_t sa_family,
-						      __be16 port);
-	void			(*ndo_del_vxlan_port)(struct  net_device *dev,
-						      sa_family_t sa_family,
-						      __be16 port);
-	void			(*ndo_add_geneve_port)(struct  net_device *dev,
-						       sa_family_t sa_family,
-						       __be16 port);
-	void			(*ndo_del_geneve_port)(struct  net_device *dev,
-						       sa_family_t sa_family,
-						       __be16 port);
 	void			(*ndo_udp_tunnel_add)(struct net_device *dev,
 						      struct udp_tunnel_info *ti);
 	void			(*ndo_udp_tunnel_del)(struct net_device *dev,
diff --git a/include/net/geneve.h b/include/net/geneve.h
index 3410c4b5a382..ec0327d4331b 100644
--- a/include/net/geneve.h
+++ b/include/net/geneve.h
@@ -59,11 +59,6 @@ struct genevehdr {
 	struct geneve_opt options[];
 };
 
-static inline void geneve_get_rx_port(struct net_device *netdev)
-{
-	udp_tunnel_get_rx_info(netdev);
-}
-
 #ifdef CONFIG_INET
 struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
 					u8 name_assign_type, u16 dst_port);
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index c62e2ed1c3af..b96d0360c095 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -389,11 +389,6 @@ static inline __be32 vxlan_compute_rco(unsigned int start, unsigned int offset)
 	return vni_field;
 }
 
-static inline void vxlan_get_rx_port(struct net_device *netdev)
-{
-	udp_tunnel_get_rx_info(netdev);
-}
-
 static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs)
 {
 	return vs->sock->sk->sk_family;
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 683e494d9000..58bd39fb14b4 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -76,47 +76,20 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
 }
 EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
 
-static void __udp_tunnel_push_rx_port(struct net_device *dev,
-				      struct udp_tunnel_info *ti)
-{
-	if (dev->netdev_ops->ndo_udp_tunnel_add) {
-		dev->netdev_ops->ndo_udp_tunnel_add(dev, ti);
-		return;
-	}
-
-	switch (ti->type) {
-	case UDP_TUNNEL_TYPE_VXLAN:
-		if (!dev->netdev_ops->ndo_add_vxlan_port)
-			break;
-
-		dev->netdev_ops->ndo_add_vxlan_port(dev,
-						    ti->sa_family,
-						    ti->port);
-		break;
-	case UDP_TUNNEL_TYPE_GENEVE:
-		if (!dev->netdev_ops->ndo_add_geneve_port)
-			break;
-
-		dev->netdev_ops->ndo_add_geneve_port(dev,
-						     ti->sa_family,
-						     ti->port);
-		break;
-	default:
-		break;
-	}
-}
-
 void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock,
 			     unsigned short type)
 {
 	struct sock *sk = sock->sk;
 	struct udp_tunnel_info ti;
 
+	if (!dev->netdev_ops->ndo_udp_tunnel_add)
+		return;
+
 	ti.type = type;
 	ti.sa_family = sk->sk_family;
 	ti.port = inet_sk(sk)->inet_sport;
 
-	__udp_tunnel_push_rx_port(dev, &ti);
+	dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti);
 }
 EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port);
 
@@ -133,42 +106,15 @@ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type)
 	ti.port = inet_sk(sk)->inet_sport;
 
 	rcu_read_lock();
-	for_each_netdev_rcu(net, dev)
-		__udp_tunnel_push_rx_port(dev, &ti);
+	for_each_netdev_rcu(net, dev) {
+		if (!dev->netdev_ops->ndo_udp_tunnel_add)
+			continue;
+		dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti);
+	}
 	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port);
 
-static void __udp_tunnel_pull_rx_port(struct net_device *dev,
-				      struct udp_tunnel_info *ti)
-{
-	if (dev->netdev_ops->ndo_udp_tunnel_del) {
-		dev->netdev_ops->ndo_udp_tunnel_del(dev, ti);
-		return;
-	}
-
-	switch (ti->type) {
-	case UDP_TUNNEL_TYPE_VXLAN:
-		if (!dev->netdev_ops->ndo_del_vxlan_port)
-			break;
-
-		dev->netdev_ops->ndo_del_vxlan_port(dev,
-						    ti->sa_family,
-						    ti->port);
-		break;
-	case UDP_TUNNEL_TYPE_GENEVE:
-		if (!dev->netdev_ops->ndo_del_geneve_port)
-			break;
-
-		dev->netdev_ops->ndo_del_geneve_port(dev,
-						     ti->sa_family,
-						     ti->port);
-		break;
-	default:
-		break;
-	}
-}
-
 /* Notify netdevs that UDP port is no more listening */
 void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type)
 {
@@ -182,8 +128,11 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type)
 	ti.port = inet_sk(sk)->inet_sport;
 
 	rcu_read_lock();
-	for_each_netdev_rcu(net, dev)
-		__udp_tunnel_pull_rx_port(dev, &ti);
+	for_each_netdev_rcu(net, dev) {
+		if (!dev->netdev_ops->ndo_udp_tunnel_del)
+			continue;
+		dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti);
+	}
 	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port);
-- 
cgit v1.2.3


From b9adcd69bd7b41625201686b4cfec7ff13357afc Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Thu, 16 Jun 2016 12:23:19 -0700
Subject: vxlan: Add new UDP encapsulation offload type for VXLAN-GPE

The fact is VXLAN with Generic Protocol Extensions cannot be supported by
the same hardware parsers that support VXLAN.  The protocol extensions
allow for things like a Next Protocol field which in turn allows for things
other than Ethernet to be passed over the tunnel.  Most existing parsers
will not know how to interpret this.

To resolve this I am giving VXLAN-GPE its own UDP encapsulation offload
type.  This way hardware that does support GPE can simply add this type to
the switch statement for VXLAN, and if they don't support it then this will
fix any issues where headers might be interpreted incorrectly.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c      | 6 ++++++
 include/net/udp_tunnel.h | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include/net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 31aeec967175..abb9cd2df9e9 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -998,6 +998,8 @@ static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
 	spin_lock(&vn->sock_lock);
 	hlist_del_rcu(&vs->hlist);
 	udp_tunnel_notify_del_rx_port(vs->sock,
+				      (vs->flags & VXLAN_F_GPE) ?
+				      UDP_TUNNEL_TYPE_VXLAN_GPE :
 				      UDP_TUNNEL_TYPE_VXLAN);
 	spin_unlock(&vn->sock_lock);
 
@@ -2488,6 +2490,8 @@ static void vxlan_push_rx_ports(struct net_device *dev)
 	for (i = 0; i < PORT_HASH_SIZE; ++i) {
 		hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist)
 			udp_tunnel_push_rx_port(dev, vs->sock,
+						(vs->flags & VXLAN_F_GPE) ?
+						UDP_TUNNEL_TYPE_VXLAN_GPE :
 						UDP_TUNNEL_TYPE_VXLAN);
 	}
 	spin_unlock(&vn->sock_lock);
@@ -2691,6 +2695,8 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
 	spin_lock(&vn->sock_lock);
 	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
 	udp_tunnel_notify_add_rx_port(sock,
+				      (vs->flags & VXLAN_F_GPE) ?
+				      UDP_TUNNEL_TYPE_VXLAN_GPE :
 				      UDP_TUNNEL_TYPE_VXLAN);
 	spin_unlock(&vn->sock_lock);
 
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 1c9408a04213..02c5be037451 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -103,6 +103,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
 enum udp_parsable_tunnel_type {
 	UDP_TUNNEL_TYPE_VXLAN,		/* RFC 7348 */
 	UDP_TUNNEL_TYPE_GENEVE,		/* draft-ietf-nvo3-geneve */
+	UDP_TUNNEL_TYPE_VXLAN_GPE,	/* draft-ietf-nvo3-vxlan-gpe */
 };
 
 struct udp_tunnel_info {
-- 
cgit v1.2.3


From a2e2ff560f5113e8ca31432fbd985f5f1889efdc Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 16 Jun 2016 16:24:24 -0700
Subject: net: ipv6: Move ip6_route_get_saddr to inline

VRF driver needs access to ip6_route_get_saddr code. Since it does
little beyond ipv6_dev_get_saddr and ipv6_dev_get_saddr is already
exported for modules move ip6_route_get_saddr to the header as an
inline.

Code move only; no functional change.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 21 ++++++++++++++++++---
 net/ipv6/route.c        | 17 -----------------
 2 files changed, 18 insertions(+), 20 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index f55bf3d294aa..d97305d0e71f 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -18,6 +18,7 @@ struct route_info {
 	__u8			prefix[0];	/* 0,8 or 16 */
 };
 
+#include <net/addrconf.h>
 #include <net/flow.h>
 #include <net/ip6_fib.h>
 #include <net/sock.h>
@@ -88,9 +89,23 @@ int ip6_route_add(struct fib6_config *cfg);
 int ip6_ins_rt(struct rt6_info *);
 int ip6_del_rt(struct rt6_info *);
 
-int ip6_route_get_saddr(struct net *net, struct rt6_info *rt,
-			const struct in6_addr *daddr, unsigned int prefs,
-			struct in6_addr *saddr);
+static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt,
+				      const struct in6_addr *daddr,
+				      unsigned int prefs,
+				      struct in6_addr *saddr)
+{
+	struct inet6_dev *idev =
+			rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
+	int err = 0;
+
+	if (rt && rt->rt6i_prefsrc.plen)
+		*saddr = rt->rt6i_prefsrc.addr;
+	else
+		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
+					 daddr, prefs, saddr);
+
+	return err;
+}
 
 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
 			    const struct in6_addr *saddr, int oif, int flags);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9e1516785dac..08b77f421268 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2586,23 +2586,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 	return rt;
 }
 
-int ip6_route_get_saddr(struct net *net,
-			struct rt6_info *rt,
-			const struct in6_addr *daddr,
-			unsigned int prefs,
-			struct in6_addr *saddr)
-{
-	struct inet6_dev *idev =
-		rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
-	int err = 0;
-	if (rt && rt->rt6i_prefsrc.plen)
-		*saddr = rt->rt6i_prefsrc.addr;
-	else
-		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
-					 daddr, prefs, saddr);
-	return err;
-}
-
 /* remove deleted ip from prefsrc entries */
 struct arg_dev_net_ip {
 	struct net_device *dev;
-- 
cgit v1.2.3


From 0d240e7811c4ec1965760ee4643b5bbc9cfacbb3 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 16 Jun 2016 16:24:25 -0700
Subject: net: vrf: Implement get_saddr for IPv6

IPv6 source address selection needs to consider the real egress route.
Similar to IPv4 implement a get_saddr6 method which is called if
source address has not been set.  The get_saddr6 method does a full
lookup which means pulling a route from the VRF FIB table and properly
considering linklocal/multicast destination addresses. Lookup failures
(eg., unreachable) then cause the source address selection to fail
which gets propagated back to the caller.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c     | 41 +++++++++++++++++++++++++++++++++++++++++
 include/net/l3mdev.h  | 11 +++++++++++
 net/ipv6/ip6_output.c | 12 ++++++++++--
 net/l3mdev/l3mdev.c   | 24 ++++++++++++++++++++++++
 4 files changed, 86 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 32173aa9208e..b3762822b653 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -999,6 +999,46 @@ static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
 
 	return dst;
 }
+
+/* called under rcu_read_lock */
+static int vrf_get_saddr6(struct net_device *dev, const struct sock *sk,
+			  struct flowi6 *fl6)
+{
+	struct net *net = dev_net(dev);
+	struct dst_entry *dst;
+	struct rt6_info *rt;
+	int err;
+
+	if (rt6_need_strict(&fl6->daddr)) {
+		rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif,
+					  RT6_LOOKUP_F_IFACE);
+		if (unlikely(!rt))
+			return 0;
+
+		dst = &rt->dst;
+	} else {
+		__u8 flags = fl6->flowi6_flags;
+
+		fl6->flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC;
+		fl6->flowi6_flags |= FLOWI_FLAG_SKIP_NH_OIF;
+
+		dst = ip6_route_output(net, sk, fl6);
+		rt = (struct rt6_info *)dst;
+
+		fl6->flowi6_flags = flags;
+	}
+
+	err = dst->error;
+	if (!err) {
+		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
+					  sk ? inet6_sk(sk)->srcprefs : 0,
+					  &fl6->saddr);
+	}
+
+	dst_release(dst);
+
+	return err;
+}
 #endif
 
 static const struct l3mdev_ops vrf_l3mdev_ops = {
@@ -1008,6 +1048,7 @@ static const struct l3mdev_ops vrf_l3mdev_ops = {
 	.l3mdev_l3_rcv		= vrf_l3_rcv,
 #if IS_ENABLED(CONFIG_IPV6)
 	.l3mdev_get_rt6_dst	= vrf_get_rt6_dst,
+	.l3mdev_get_saddr6	= vrf_get_saddr6,
 #endif
 };
 
diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
index f8a416ec674c..818fd4f100fc 100644
--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -39,6 +39,9 @@ struct l3mdev_ops {
 	/* IPv6 ops */
 	struct dst_entry * (*l3mdev_get_rt6_dst)(const struct net_device *dev,
 						 struct flowi6 *fl6);
+	int		   (*l3mdev_get_saddr6)(struct net_device *dev,
+						const struct sock *sk,
+						struct flowi6 *fl6);
 };
 
 #ifdef CONFIG_NET_L3_MASTER_DEV
@@ -140,6 +143,8 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex)
 int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4);
 
 struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6);
+int l3mdev_get_saddr6(struct net *net, const struct sock *sk,
+		      struct flowi6 *fl6);
 
 static inline
 struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto)
@@ -230,6 +235,12 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6)
 	return NULL;
 }
 
+static inline int l3mdev_get_saddr6(struct net *net, const struct sock *sk,
+				    struct flowi6 *fl6)
+{
+	return 0;
+}
+
 static inline
 struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
 {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index fd3217579b65..1dfc402d9ad1 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -910,6 +910,13 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 	int err;
 	int flags = 0;
 
+	if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif &&
+	    (!*dst || !(*dst)->error)) {
+		err = l3mdev_get_saddr6(net, sk, fl6);
+		if (err)
+			goto out_err;
+	}
+
 	/* The correct way to handle this would be to do
 	 * ip6_route_get_saddr, and then ip6_route_output; however,
 	 * the route-specific preferred source forces the
@@ -999,10 +1006,11 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 	return 0;
 
 out_err_release:
-	if (err == -ENETUNREACH)
-		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
 	dst_release(*dst);
 	*dst = NULL;
+out_err:
+	if (err == -ENETUNREACH)
+		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
 	return err;
 }
 
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index d90e4ef09e85..c4a1c3e84e12 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -162,6 +162,30 @@ int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4)
 }
 EXPORT_SYMBOL_GPL(l3mdev_get_saddr);
 
+int l3mdev_get_saddr6(struct net *net, const struct sock *sk,
+		      struct flowi6 *fl6)
+{
+	struct net_device *dev;
+	int rc = 0;
+
+	if (fl6->flowi6_oif) {
+		rcu_read_lock();
+
+		dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
+		if (dev && netif_is_l3_slave(dev))
+			dev = netdev_master_upper_dev_get_rcu(dev);
+
+		if (dev && netif_is_l3_master(dev) &&
+		    dev->l3mdev_ops->l3mdev_get_saddr6)
+			rc = dev->l3mdev_ops->l3mdev_get_saddr6(dev, sk, fl6);
+
+		rcu_read_unlock();
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(l3mdev_get_saddr6);
+
 /**
  *	l3mdev_fib_rule_match - Determine if flowi references an
  *				L3 master device
-- 
cgit v1.2.3


From afbac6010aec514998214fb19a1f37732b7a1d77 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 16 Jun 2016 16:24:26 -0700
Subject: net: ipv6: Address selection needs to consider L3 domains

IPv6 version of 3f2fb9a834cb ("net: l3mdev: address selection should only
consider devices in L3 domain") and the follow up commit, a17b693cdd876
("net: l3mdev: prefer VRF master for source address selection").

That is, if outbound device is given then the address preference order
is an address from that device, an address from the master device if it
is enslaved, and then an address from a device in the same L3 domain.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/l3mdev.h | 31 +++++++++++++++++++++++++++++++
 net/ipv6/addrconf.c  | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)

(limited to 'include/net')

diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
index 818fd4f100fc..e90095091aa0 100644
--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -79,6 +79,31 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex)
 	return rc;
 }
 
+static inline
+const struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev)
+{
+	/* netdev_master_upper_dev_get_rcu calls
+	 * list_first_or_null_rcu to walk the upper dev list.
+	 * list_first_or_null_rcu does not handle a const arg. We aren't
+	 * making changes, just want the master device from that list so
+	 * typecast to remove the const
+	 */
+	struct net_device *dev = (struct net_device *)_dev;
+	const struct net_device *master;
+
+	if (!dev)
+		return NULL;
+
+	if (netif_is_l3_master(dev))
+		master = dev;
+	else if (netif_is_l3_slave(dev))
+		master = netdev_master_upper_dev_get_rcu(dev);
+	else
+		master = NULL;
+
+	return master;
+}
+
 /* get index of an interface to use for FIB lookups. For devices
  * enslaved to an L3 master device FIB lookups are based on the
  * master index
@@ -190,6 +215,12 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex)
 	return 0;
 }
 
+static inline
+const struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev)
+{
+	return NULL;
+}
+
 static inline int l3mdev_fib_oif_rcu(struct net_device *dev)
 {
 	return dev ? dev->ifindex : 0;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 6c8fc3f96b11..a1f6b7b31531 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1524,6 +1524,28 @@ out:
 	return hiscore_idx;
 }
 
+static int ipv6_get_saddr_master(struct net *net,
+				 const struct net_device *dst_dev,
+				 const struct net_device *master,
+				 struct ipv6_saddr_dst *dst,
+				 struct ipv6_saddr_score *scores,
+				 int hiscore_idx)
+{
+	struct inet6_dev *idev;
+
+	idev = __in6_dev_get(dst_dev);
+	if (idev)
+		hiscore_idx = __ipv6_dev_get_saddr(net, dst, idev,
+						   scores, hiscore_idx);
+
+	idev = __in6_dev_get(master);
+	if (idev)
+		hiscore_idx = __ipv6_dev_get_saddr(net, dst, idev,
+						   scores, hiscore_idx);
+
+	return hiscore_idx;
+}
+
 int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
 		       const struct in6_addr *daddr, unsigned int prefs,
 		       struct in6_addr *saddr)
@@ -1577,13 +1599,39 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
 		if (idev)
 			hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
 	} else {
+		const struct net_device *master;
+		int master_idx = 0;
+
+		/* if dst_dev exists and is enslaved to an L3 device, then
+		 * prefer addresses from dst_dev and then the master over
+		 * any other enslaved devices in the L3 domain.
+		 */
+		master = l3mdev_master_dev_rcu(dst_dev);
+		if (master) {
+			master_idx = master->ifindex;
+
+			hiscore_idx = ipv6_get_saddr_master(net, dst_dev,
+							    master, &dst,
+							    scores, hiscore_idx);
+
+			if (scores[hiscore_idx].ifa)
+				goto out;
+		}
+
 		for_each_netdev_rcu(net, dev) {
+			/* only consider addresses on devices in the
+			 * same L3 domain
+			 */
+			if (l3mdev_master_ifindex_rcu(dev) != master_idx)
+				continue;
 			idev = __in6_dev_get(dev);
 			if (!idev)
 				continue;
 			hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
 		}
 	}
+
+out:
 	rcu_read_unlock();
 
 	hiscore = &scores[hiscore_idx];
-- 
cgit v1.2.3


From 9b8c6d7bf2e08a7d3eb6660a2bfaf29b8b49c329 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 18 Jun 2016 21:52:05 -0700
Subject: gre: better support for ICMP messages for gre+ipv6

ipgre_err() can call ip6_err_gen_icmpv6_unreach() for proper
support of ipv4+gre+icmp+ipv6+... frames, used for example
by traceroute/mtr.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h | 1 +
 net/ipv4/gre_demux.c     | 1 +
 net/ipv4/ip_gre.c        | 6 ++++++
 3 files changed, 8 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 9222678426a1..a5e7035fb93f 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -157,6 +157,7 @@ struct tnl_ptk_info {
 	__be16 proto;
 	__be32 key;
 	__be32 seq;
+	int hdr_len;
 };
 
 #define PACKET_RCVD	0
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 4c39f4fd332a..c4c3e439f424 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -117,6 +117,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 		if ((*(u8 *)options & 0xF0) != 0x40)
 			hdr_len += 4;
 	}
+	tpi->hdr_len = hdr_len;
 	return hdr_len;
 }
 EXPORT_SYMBOL(gre_parse_header);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 0f8ca3fca00a..ab4cff8e563d 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -187,6 +187,12 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
 	if (!t)
 		return;
 
+#if IS_ENABLED(CONFIG_IPV6)
+       if (tpi->proto == htons(ETH_P_IPV6) &&
+           !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, type))
+               return;
+#endif
+
 	if (t->parms.iph.daddr == 0 ||
 	    ipv4_is_multicast(t->parms.iph.daddr))
 		return;
-- 
cgit v1.2.3


From 506e65df52f2bf250aa9b4264efd180d1646bdec Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 10 Jun 2016 23:09:01 +0200
Subject: netfilter: make comparision helpers stub functions in ZONES=n case

Those comparisions are useless in case of ZONES=n; all conntracks
will reside in the same zone by definition.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_zones.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h
index 4e32512cef32..bd4692690914 100644
--- a/include/net/netfilter/nf_conntrack_zones.h
+++ b/include/net/netfilter/nf_conntrack_zones.h
@@ -68,22 +68,34 @@ static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone,
 static inline u16 nf_ct_zone_id(const struct nf_conntrack_zone *zone,
 				enum ip_conntrack_dir dir)
 {
+#ifdef CONFIG_NF_CONNTRACK_ZONES
 	return nf_ct_zone_matches_dir(zone, dir) ?
 	       zone->id : NF_CT_DEFAULT_ZONE_ID;
+#else
+	return NF_CT_DEFAULT_ZONE_ID;
+#endif
 }
 
 static inline bool nf_ct_zone_equal(const struct nf_conn *a,
 				    const struct nf_conntrack_zone *b,
 				    enum ip_conntrack_dir dir)
 {
+#ifdef CONFIG_NF_CONNTRACK_ZONES
 	return nf_ct_zone_id(nf_ct_zone(a), dir) ==
 	       nf_ct_zone_id(b, dir);
+#else
+	return true;
+#endif
 }
 
 static inline bool nf_ct_zone_equal_any(const struct nf_conn *a,
 					const struct nf_conntrack_zone *b)
 {
+#ifdef CONFIG_NF_CONNTRACK_ZONES
 	return nf_ct_zone(a)->id == b->id;
+#else
+	return true;
+#endif
 }
 #endif /* IS_ENABLED(CONFIG_NF_CONNTRACK) */
 #endif /* _NF_CONNTRACK_ZONES_H */
-- 
cgit v1.2.3


From 6c8dee9842461e6ee6eb46081478999b3d5cb297 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 11 Jun 2016 21:57:35 +0200
Subject: netfilter: move zone info into struct nf_conn

Curently we store zone information as a conntrack extension.
This has one drawback: for every lookup we need to fetch the zone data
from the extension area.

This change place the zone data directly into the main conntrack object
structure and then removes the zone conntrack extension.

The zone data is just 4 bytes, it fits into a padding hole before
the tuplehash info, so we do not even increase the nf_conn structure size.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h        |  3 +++
 include/net/netfilter/nf_conntrack_extend.h |  4 ----
 include/net/netfilter/nf_conntrack_zones.h  | 33 ++++++++++-------------------
 net/netfilter/nf_conntrack_core.c           | 33 ++---------------------------
 4 files changed, 16 insertions(+), 57 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index dd78bea227c8..9c0ed3d7af89 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -85,6 +85,9 @@ struct nf_conn {
 	spinlock_t	lock;
 	u16		cpu;
 
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	struct nf_conntrack_zone zone;
+#endif
 	/* XXX should I move this to the tail ? - Y.K */
 	/* These are my tuples; original and reply */
 	struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];
diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index 55d15049ab2f..b925395fa5ed 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -15,9 +15,6 @@ enum nf_ct_ext_id {
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 	NF_CT_EXT_ECACHE,
 #endif
-#ifdef CONFIG_NF_CONNTRACK_ZONES
-	NF_CT_EXT_ZONE,
-#endif
 #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
 	NF_CT_EXT_TSTAMP,
 #endif
@@ -38,7 +35,6 @@ enum nf_ct_ext_id {
 #define NF_CT_EXT_SEQADJ_TYPE struct nf_conn_seqadj
 #define NF_CT_EXT_ACCT_TYPE struct nf_conn_acct
 #define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache
-#define NF_CT_EXT_ZONE_TYPE struct nf_conntrack_zone
 #define NF_CT_EXT_TSTAMP_TYPE struct nf_conn_tstamp
 #define NF_CT_EXT_TIMEOUT_TYPE struct nf_conn_timeout
 #define NF_CT_EXT_LABELS_TYPE struct nf_conn_labels
diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h
index bd4692690914..64a718b60839 100644
--- a/include/net/netfilter/nf_conntrack_zones.h
+++ b/include/net/netfilter/nf_conntrack_zones.h
@@ -9,12 +9,11 @@
 static inline const struct nf_conntrack_zone *
 nf_ct_zone(const struct nf_conn *ct)
 {
-	const struct nf_conntrack_zone *nf_ct_zone = NULL;
-
 #ifdef CONFIG_NF_CONNTRACK_ZONES
-	nf_ct_zone = nf_ct_ext_find(ct, NF_CT_EXT_ZONE);
+	return &ct->zone;
+#else
+	return &nf_ct_zone_dflt;
 #endif
-	return nf_ct_zone ? nf_ct_zone : &nf_ct_zone_dflt;
 }
 
 static inline const struct nf_conntrack_zone *
@@ -31,32 +30,22 @@ static inline const struct nf_conntrack_zone *
 nf_ct_zone_tmpl(const struct nf_conn *tmpl, const struct sk_buff *skb,
 		struct nf_conntrack_zone *tmp)
 {
-	const struct nf_conntrack_zone *zone;
-
+#ifdef CONFIG_NF_CONNTRACK_ZONES
 	if (!tmpl)
 		return &nf_ct_zone_dflt;
 
-	zone = nf_ct_zone(tmpl);
-	if (zone->flags & NF_CT_FLAG_MARK)
-		zone = nf_ct_zone_init(tmp, skb->mark, zone->dir, 0);
-
-	return zone;
+	if (tmpl->zone.flags & NF_CT_FLAG_MARK)
+		return nf_ct_zone_init(tmp, skb->mark, tmpl->zone.dir, 0);
+#endif
+	return nf_ct_zone(tmpl);
 }
 
-static inline int nf_ct_zone_add(struct nf_conn *ct, gfp_t flags,
-				 const struct nf_conntrack_zone *info)
+static inline void nf_ct_zone_add(struct nf_conn *ct,
+				  const struct nf_conntrack_zone *zone)
 {
 #ifdef CONFIG_NF_CONNTRACK_ZONES
-	struct nf_conntrack_zone *nf_ct_zone;
-
-	nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, flags);
-	if (!nf_ct_zone)
-		return -ENOMEM;
-
-	nf_ct_zone_init(nf_ct_zone, info->id, info->dir,
-			info->flags);
+	ct->zone = *zone;
 #endif
-	return 0;
 }
 
 static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone,
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 2903bb43547c..a459176c3253 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -327,16 +327,10 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
 
 	tmpl->status = IPS_TEMPLATE;
 	write_pnet(&tmpl->ct_net, net);
-
-	if (nf_ct_zone_add(tmpl, flags, zone) < 0)
-		goto out_free;
-
+	nf_ct_zone_add(tmpl, zone);
 	atomic_set(&tmpl->ct_general.use, 0);
 
 	return tmpl;
-out_free:
-	kfree(tmpl);
-	return NULL;
 }
 EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
 
@@ -929,16 +923,13 @@ __nf_conntrack_alloc(struct net *net,
 	       offsetof(struct nf_conn, proto) -
 	       offsetof(struct nf_conn, __nfct_init_offset[0]));
 
-	if (zone && nf_ct_zone_add(ct, GFP_ATOMIC, zone) < 0)
-		goto out_free;
+	nf_ct_zone_add(ct, zone);
 
 	/* Because we use RCU lookups, we set ct_general.use to zero before
 	 * this is inserted in any list.
 	 */
 	atomic_set(&ct->ct_general.use, 0);
 	return ct;
-out_free:
-	kmem_cache_free(nf_conntrack_cachep, ct);
 out:
 	atomic_dec(&net->ct.count);
 	return ERR_PTR(-ENOMEM);
@@ -1342,14 +1333,6 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,
 }
 EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
 
-#ifdef CONFIG_NF_CONNTRACK_ZONES
-static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = {
-	.len	= sizeof(struct nf_conntrack_zone),
-	.align	= __alignof__(struct nf_conntrack_zone),
-	.id	= NF_CT_EXT_ZONE,
-};
-#endif
-
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 
 #include <linux/netfilter/nfnetlink.h>
@@ -1532,9 +1515,6 @@ void nf_conntrack_cleanup_end(void)
 
 	nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
 
-#ifdef CONFIG_NF_CONNTRACK_ZONES
-	nf_ct_extend_unregister(&nf_ct_zone_extend);
-#endif
 	nf_conntrack_proto_fini();
 	nf_conntrack_seqadj_fini();
 	nf_conntrack_labels_fini();
@@ -1771,11 +1751,6 @@ int nf_conntrack_init_start(void)
 	if (ret < 0)
 		goto err_seqadj;
 
-#ifdef CONFIG_NF_CONNTRACK_ZONES
-	ret = nf_ct_extend_register(&nf_ct_zone_extend);
-	if (ret < 0)
-		goto err_extend;
-#endif
 	ret = nf_conntrack_proto_init();
 	if (ret < 0)
 		goto err_proto;
@@ -1791,10 +1766,6 @@ int nf_conntrack_init_start(void)
 	return 0;
 
 err_proto:
-#ifdef CONFIG_NF_CONNTRACK_ZONES
-	nf_ct_extend_unregister(&nf_ct_zone_extend);
-err_extend:
-#endif
 	nf_conntrack_seqadj_fini();
 err_seqadj:
 	nf_conntrack_labels_fini();
-- 
cgit v1.2.3


From 7643507fe8b5bd8ab7522f6a81058cc1209d2585 Mon Sep 17 00:00:00 2001
From: Vishwanath Pai <vpai@akamai.com>
Date: Tue, 21 Jun 2016 14:58:46 -0400
Subject: netfilter: xt_NFLOG: nflog-range does not truncate packets

li->u.ulog.copy_len is currently ignored by the kernel, we should truncate
the packet to either li->u.ulog.copy_len (if set) or copy_range before
sending it to userspace. 0 is a valid input for copy_len, so add a new
flag to indicate whether this was option was specified by the user or not.

Add two flags to indicate whether nflog-size/copy_len was set or not.
XT_NFLOG_F_COPY_LEN is for XT_NFLOG and NFLOG_F_COPY_LEN for nfnetlink_log

On the userspace side, this was initially represented by the option
nflog-range, this will be replaced by --nflog-size now. --nflog-range would
still exist but does not do anything.

Reported-by: Joe Dollard <jdollard@akamai.com>
Reviewed-by: Josh Hunt <johunt@akamai.com>
Signed-off-by: Vishwanath Pai <vpai@akamai.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_log.h          | 7 +++++++
 include/uapi/linux/netfilter/xt_NFLOG.h | 6 +++++-
 net/netfilter/nfnetlink_log.c           | 9 ++++++---
 net/netfilter/xt_NFLOG.c                | 3 +++
 4 files changed, 21 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index 57639fca223a..83d855ba6af1 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -12,6 +12,9 @@
 #define NF_LOG_UID		0x08	/* Log UID owning local socket */
 #define NF_LOG_MASK		0x0f
 
+/* This flag indicates that copy_len field in nf_loginfo is set */
+#define NF_LOG_F_COPY_LEN	0x1
+
 enum nf_log_type {
 	NF_LOG_TYPE_LOG		= 0,
 	NF_LOG_TYPE_ULOG,
@@ -22,9 +25,13 @@ struct nf_loginfo {
 	u_int8_t type;
 	union {
 		struct {
+			/* copy_len will be used iff you set
+			 * NF_LOG_F_COPY_LEN in flags
+			 */
 			u_int32_t copy_len;
 			u_int16_t group;
 			u_int16_t qthreshold;
+			u_int16_t flags;
 		} ulog;
 		struct {
 			u_int8_t level;
diff --git a/include/uapi/linux/netfilter/xt_NFLOG.h b/include/uapi/linux/netfilter/xt_NFLOG.h
index 87b58311ce6b..f33070730fc8 100644
--- a/include/uapi/linux/netfilter/xt_NFLOG.h
+++ b/include/uapi/linux/netfilter/xt_NFLOG.h
@@ -6,9 +6,13 @@
 #define XT_NFLOG_DEFAULT_GROUP		0x1
 #define XT_NFLOG_DEFAULT_THRESHOLD	0
 
-#define XT_NFLOG_MASK			0x0
+#define XT_NFLOG_MASK			0x1
+
+/* This flag indicates that 'len' field in xt_nflog_info is set*/
+#define XT_NFLOG_F_COPY_LEN		0x1
 
 struct xt_nflog_info {
+	/* 'len' will be used iff you set XT_NFLOG_F_COPY_LEN in flags */
 	__u32	len;
 	__u16	group;
 	__u16	threshold;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 11f81c8385fc..cbcfdfb586a6 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -700,10 +700,13 @@ nfulnl_log_packet(struct net *net,
 		break;
 
 	case NFULNL_COPY_PACKET:
-		if (inst->copy_range > skb->len)
+		data_len = inst->copy_range;
+		if ((li->u.ulog.flags & NF_LOG_F_COPY_LEN) &&
+		    (li->u.ulog.copy_len < data_len))
+			data_len = li->u.ulog.copy_len;
+
+		if (data_len > skb->len)
 			data_len = skb->len;
-		else
-			data_len = inst->copy_range;
 
 		size += nla_total_size(data_len);
 		break;
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index a1fa2c800cb9..018eed7e1ff1 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -33,6 +33,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	li.u.ulog.group	     = info->group;
 	li.u.ulog.qthreshold = info->threshold;
 
+	if (info->flags & XT_NFLOG_F_COPY_LEN)
+		li.u.ulog.flags |= NF_LOG_F_COPY_LEN;
+
 	nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in,
 			  par->out, &li, info->prefix);
 	return XT_CONTINUE;
-- 
cgit v1.2.3


From 889f7ee7c6e84251215d43cbc856ea116c72d3f2 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 12 Jun 2016 18:07:07 +0200
Subject: netfilter: nf_tables: add generic macros to check for generation mask

Thus, we can reuse these to check the genmask of any object type, not
only rules. This is required now that tables, chain and sets will get a
generation mask field too in follow up patches.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 24 ++++++++++++++++++++
 net/netfilter/nf_tables_api.c     | 46 +++++++--------------------------------
 2 files changed, 32 insertions(+), 38 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 092235458691..d0778cbaf7f1 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -969,6 +969,30 @@ static inline u8 nft_genmask_cur(const struct net *net)
 
 #define NFT_GENMASK_ANY		((1 << 0) | (1 << 1))
 
+/*
+ * Generic transaction helpers
+ */
+
+/* Check if this object is currently active. */
+#define nft_is_active(__net, __obj)				\
+	(((__obj)->genmask & nft_genmask_cur(__net)) == 0)
+
+/* Check if this object is active in the next generation. */
+#define nft_is_active_next(__net, __obj)			\
+	(((__obj)->genmask & nft_genmask_next(__net)) == 0)
+
+/* This object becomes active in the next generation. */
+#define nft_activate_next(__net, __obj)				\
+	(__obj)->genmask = nft_genmask_cur(__net)
+
+/* This object becomes inactive in the next generation. */
+#define nft_deactivate_next(__net, __obj)			\
+        (__obj)->genmask = nft_genmask_next(__net)
+
+/* After committing the ruleset, clear the stale generation bit. */
+#define nft_clear(__net, __obj)					\
+	(__obj)->genmask &= ~nft_genmask_next(__net)
+
 /*
  * Set element transaction helpers
  */
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 4d292b933b5c..d9f0f0797dec 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -234,42 +234,12 @@ static int nft_delchain(struct nft_ctx *ctx)
 	return err;
 }
 
-static inline bool
-nft_rule_is_active(struct net *net, const struct nft_rule *rule)
-{
-	return (rule->genmask & nft_genmask_cur(net)) == 0;
-}
-
-static inline int
-nft_rule_is_active_next(struct net *net, const struct nft_rule *rule)
-{
-	return (rule->genmask & nft_genmask_next(net)) == 0;
-}
-
-static inline void
-nft_rule_activate_next(struct net *net, struct nft_rule *rule)
-{
-	/* Now inactive, will be active in the future */
-	rule->genmask = nft_genmask_cur(net);
-}
-
-static inline void
-nft_rule_deactivate_next(struct net *net, struct nft_rule *rule)
-{
-	rule->genmask = nft_genmask_next(net);
-}
-
-static inline void nft_rule_clear(struct net *net, struct nft_rule *rule)
-{
-	rule->genmask &= ~nft_genmask_next(net);
-}
-
 static int
 nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule)
 {
 	/* You cannot delete the same rule twice */
-	if (nft_rule_is_active_next(ctx->net, rule)) {
-		nft_rule_deactivate_next(ctx->net, rule);
+	if (nft_is_active_next(ctx->net, rule)) {
+		nft_deactivate_next(ctx->net, rule);
 		ctx->chain->use--;
 		return 0;
 	}
@@ -1898,7 +1868,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 		list_for_each_entry_rcu(table, &afi->tables, list) {
 			list_for_each_entry_rcu(chain, &table->chains, list) {
 				list_for_each_entry_rcu(rule, &chain->rules, list) {
-					if (!nft_rule_is_active(net, rule))
+					if (!nft_is_active(net, rule))
 						goto cont;
 					if (idx < s_idx)
 						goto cont;
@@ -2102,7 +2072,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 	if (rule == NULL)
 		goto err1;
 
-	nft_rule_activate_next(net, rule);
+	nft_activate_next(net, rule);
 
 	rule->handle = handle;
 	rule->dlen   = size;
@@ -2124,14 +2094,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 	}
 
 	if (nlh->nlmsg_flags & NLM_F_REPLACE) {
-		if (nft_rule_is_active_next(net, old_rule)) {
+		if (nft_is_active_next(net, old_rule)) {
 			trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE,
 						   old_rule);
 			if (trans == NULL) {
 				err = -ENOMEM;
 				goto err2;
 			}
-			nft_rule_deactivate_next(net, old_rule);
+			nft_deactivate_next(net, old_rule);
 			chain->use--;
 			list_add_tail_rcu(&rule->list, &old_rule->list);
 		} else {
@@ -3980,7 +3950,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 						   trans->ctx.afi->nops);
 			break;
 		case NFT_MSG_NEWRULE:
-			nft_rule_clear(trans->ctx.net, nft_trans_rule(trans));
+			nft_clear(trans->ctx.net, nft_trans_rule(trans));
 			nf_tables_rule_notify(&trans->ctx,
 					      nft_trans_rule(trans),
 					      NFT_MSG_NEWRULE);
@@ -4116,7 +4086,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 			break;
 		case NFT_MSG_DELRULE:
 			trans->ctx.chain->use++;
-			nft_rule_clear(trans->ctx.net, nft_trans_rule(trans));
+			nft_clear(trans->ctx.net, nft_trans_rule(trans));
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_NEWSET:
-- 
cgit v1.2.3


From f2a6d766765d2794e26e25655d4ffcfe29c3ec2f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 14 Jun 2016 17:29:18 +0200
Subject: netfilter: nf_tables: add generation mask to tables

This patch addresses two problems:

1) The netlink dump is inconsistent when interfering with an ongoing
   transaction update for several reasons:

1.a) We don't honor the internal NFT_TABLE_INACTIVE flag, and we should
     be skipping these inactive objects in the dump.

1.b) We perform speculative deletion during the preparation phase, that
     may result in skipping active objects.

1.c) The listing order changes, which generates noise when tracking
     incremental ruleset update via tools like git or our own
     testsuite.

2) We don't allow to add and to update the object in the same batch,
   eg. add table x; add table x { flags dormant\; }.

In order to resolve these problems:

1) If the user requests a deletion, the object becomes inactive in the
   next generation. Then, ignore objects that scheduled to be deleted
   from the lookup path, as they will be effectively removed in the
   next generation.

2) From the get/dump path, if the object is not currently active, we
   skip it.

3) Support 'add X -> update X' sequence from a transaction.

After this update, we obtain a consistent list as long as we stay
in the same generation. The userspace side can detect interferences
through the generation counter so it can restart the dumping.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |   6 ++-
 net/netfilter/nf_tables_api.c     | 101 +++++++++++++++++++++-----------------
 2 files changed, 62 insertions(+), 45 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index d0778cbaf7f1..05c9a64b39aa 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -838,6 +838,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);
  *	@hgenerator: handle generator state
  *	@use: number of chain references to this table
  *	@flags: table flag (see enum nft_table_flags)
+ *	@genmask: generation mask
  *	@name: name of the table
  */
 struct nft_table {
@@ -846,7 +847,8 @@ struct nft_table {
 	struct list_head		sets;
 	u64				hgenerator;
 	u32				use;
-	u16				flags;
+	u16				flags:14,
+					genmask:2;
 	char				name[NFT_TABLE_MAXNAMELEN];
 };
 
@@ -992,6 +994,8 @@ static inline u8 nft_genmask_cur(const struct net *net)
 /* After committing the ruleset, clear the stale generation bit. */
 #define nft_clear(__net, __obj)					\
 	(__obj)->genmask &= ~nft_genmask_next(__net)
+#define nft_active_genmask(__obj, __genmask)			\
+	!((__obj)->genmask & __genmask)
 
 /*
  * Set element transaction helpers
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d9f0f0797dec..a4a77d60e631 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -175,9 +175,6 @@ static void nf_tables_unregister_hooks(const struct nft_table *table,
 	nft_unregister_basechain(nft_base_chain(chain), hook_nops);
 }
 
-/* Internal table flags */
-#define NFT_TABLE_INACTIVE	(1 << 15)
-
 static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
 {
 	struct nft_trans *trans;
@@ -187,7 +184,7 @@ static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
 		return -ENOMEM;
 
 	if (msg_type == NFT_MSG_NEWTABLE)
-		ctx->table->flags |= NFT_TABLE_INACTIVE;
+		nft_activate_next(ctx->net, ctx->table);
 
 	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
 	return 0;
@@ -201,7 +198,7 @@ static int nft_deltable(struct nft_ctx *ctx)
 	if (err < 0)
 		return err;
 
-	list_del_rcu(&ctx->table->list);
+	nft_deactivate_next(ctx->net, ctx->table);
 	return err;
 }
 
@@ -334,26 +331,29 @@ static int nft_delset(struct nft_ctx *ctx, struct nft_set *set)
  */
 
 static struct nft_table *nft_table_lookup(const struct nft_af_info *afi,
-					  const struct nlattr *nla)
+					  const struct nlattr *nla,
+					  u8 genmask)
 {
 	struct nft_table *table;
 
 	list_for_each_entry(table, &afi->tables, list) {
-		if (!nla_strcmp(nla, table->name))
+		if (!nla_strcmp(nla, table->name) &&
+		    nft_active_genmask(table, genmask))
 			return table;
 	}
 	return NULL;
 }
 
 static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi,
-						const struct nlattr *nla)
+						const struct nlattr *nla,
+						u8 genmask)
 {
 	struct nft_table *table;
 
 	if (nla == NULL)
 		return ERR_PTR(-EINVAL);
 
-	table = nft_table_lookup(afi, nla);
+	table = nft_table_lookup(afi, nla, genmask);
 	if (table != NULL)
 		return table;
 
@@ -494,6 +494,8 @@ static int nf_tables_dump_tables(struct sk_buff *skb,
 			if (idx > s_idx)
 				memset(&cb->args[1], 0,
 				       sizeof(cb->args) - sizeof(cb->args[0]));
+			if (!nft_is_active(net, table))
+				continue;
 			if (nf_tables_fill_table_info(skb, net,
 						      NETLINK_CB(cb->skb).portid,
 						      cb->nlh->nlmsg_seq,
@@ -518,6 +520,7 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
 			      const struct nlattr * const nla[])
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u8 genmask = nft_genmask_cur(net);
 	const struct nft_af_info *afi;
 	const struct nft_table *table;
 	struct sk_buff *skb2;
@@ -535,11 +538,9 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]);
+	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
-	if (table->flags & NFT_TABLE_INACTIVE)
-		return -ENOENT;
 
 	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb2)
@@ -648,6 +649,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 			      const struct nlattr * const nla[])
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u8 genmask = nft_genmask_next(net);
 	const struct nlattr *name;
 	struct nft_af_info *afi;
 	struct nft_table *table;
@@ -661,7 +663,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 		return PTR_ERR(afi);
 
 	name = nla[NFTA_TABLE_NAME];
-	table = nf_tables_table_lookup(afi, name);
+	table = nf_tables_table_lookup(afi, name, genmask);
 	if (IS_ERR(table)) {
 		if (PTR_ERR(table) != -ENOENT)
 			return PTR_ERR(table);
@@ -669,8 +671,6 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	}
 
 	if (table != NULL) {
-		if (table->flags & NFT_TABLE_INACTIVE)
-			return -ENOENT;
 		if (nlh->nlmsg_flags & NLM_F_EXCL)
 			return -EEXIST;
 		if (nlh->nlmsg_flags & NLM_F_REPLACE)
@@ -765,6 +765,9 @@ static int nft_flush(struct nft_ctx *ctx, int family)
 
 		ctx->afi = afi;
 		list_for_each_entry_safe(table, nt, &afi->tables, list) {
+			if (!nft_is_active_next(ctx->net, table))
+				continue;
+
 			if (nla[NFTA_TABLE_NAME] &&
 			    nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0)
 				continue;
@@ -785,6 +788,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
 			      const struct nlattr * const nla[])
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u8 genmask = nft_genmask_next(net);
 	struct nft_af_info *afi;
 	struct nft_table *table;
 	int family = nfmsg->nfgen_family;
@@ -798,7 +802,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]);
+	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1074,6 +1078,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
 			      const struct nlattr * const nla[])
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u8 genmask = nft_genmask_cur(net);
 	const struct nft_af_info *afi;
 	const struct nft_table *table;
 	const struct nft_chain *chain;
@@ -1092,11 +1097,9 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
-	if (table->flags & NFT_TABLE_INACTIVE)
-		return -ENOENT;
 
 	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
 	if (IS_ERR(chain))
@@ -1201,6 +1204,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 	struct nft_chain *chain;
 	struct nft_base_chain *basechain = NULL;
 	struct nlattr *ha[NFTA_HOOK_MAX + 1];
+	u8 genmask = nft_genmask_next(net);
 	int family = nfmsg->nfgen_family;
 	struct net_device *dev = NULL;
 	u8 policy = NF_ACCEPT;
@@ -1217,7 +1221,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1449,6 +1453,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
 			      const struct nlattr * const nla[])
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u8 genmask = nft_genmask_next(net);
 	struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_chain *chain;
@@ -1459,7 +1464,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1901,6 +1906,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 			     const struct nlattr * const nla[])
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u8 genmask = nft_genmask_cur(net);
 	const struct nft_af_info *afi;
 	const struct nft_table *table;
 	const struct nft_chain *chain;
@@ -1920,11 +1926,9 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
-	if (table->flags & NFT_TABLE_INACTIVE)
-		return -ENOENT;
 
 	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
 	if (IS_ERR(chain))
@@ -1979,6 +1983,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 			     const struct nlattr * const nla[])
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u8 genmask = nft_genmask_next(net);
 	struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_chain *chain;
@@ -1999,7 +2004,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -2144,6 +2149,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 			     const struct nlattr * const nla[])
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u8 genmask = nft_genmask_next(net);
 	struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_chain *chain = NULL;
@@ -2155,7 +2161,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -2309,7 +2315,8 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
 static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
 				     const struct sk_buff *skb,
 				     const struct nlmsghdr *nlh,
-				     const struct nlattr * const nla[])
+				     const struct nlattr * const nla[],
+				     u8 genmask)
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	struct nft_af_info *afi = NULL;
@@ -2325,7 +2332,8 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
 		if (afi == NULL)
 			return -EAFNOSUPPORT;
 
-		table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
+		table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE],
+					       genmask);
 		if (IS_ERR(table))
 			return PTR_ERR(table);
 	}
@@ -2586,6 +2594,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
 			    struct sk_buff *skb, const struct nlmsghdr *nlh,
 			    const struct nlattr * const nla[])
 {
+	u8 genmask = nft_genmask_cur(net);
 	const struct nft_set *set;
 	struct nft_ctx ctx;
 	struct sk_buff *skb2;
@@ -2593,7 +2602,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
 	int err;
 
 	/* Verify existence before starting dump */
-	err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla);
+	err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask);
 	if (err < 0)
 		return err;
 
@@ -2661,6 +2670,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 			    const struct nlattr * const nla[])
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u8 genmask = nft_genmask_next(net);
 	const struct nft_set_ops *ops;
 	struct nft_af_info *afi;
 	struct nft_table *table;
@@ -2758,7 +2768,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
+	table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -2863,6 +2873,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
 			    const struct nlattr * const nla[])
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u8 genmask = nft_genmask_next(net);
 	struct nft_set *set;
 	struct nft_ctx ctx;
 	int err;
@@ -2872,7 +2883,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
 	if (nla[NFTA_SET_TABLE] == NULL)
 		return -EINVAL;
 
-	err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla);
+	err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask);
 	if (err < 0)
 		return err;
 
@@ -3001,7 +3012,8 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX +
 static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
 				      const struct sk_buff *skb,
 				      const struct nlmsghdr *nlh,
-				      const struct nlattr * const nla[])
+				      const struct nlattr * const nla[],
+				      u8 genmask)
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	struct nft_af_info *afi;
@@ -3011,7 +3023,8 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]);
+	table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE],
+				       genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -3108,6 +3121,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
 static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
+	u8 genmask = nft_genmask_cur(net);
 	const struct nft_set *set;
 	struct nft_set_dump_args args;
 	struct nft_ctx ctx;
@@ -3124,11 +3138,9 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 		return err;
 
 	err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh,
-					 (void *)nla);
+					 (void *)nla, genmask);
 	if (err < 0)
 		return err;
-	if (ctx.table->flags & NFT_TABLE_INACTIVE)
-		return -ENOENT;
 
 	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
 	if (IS_ERR(set))
@@ -3187,15 +3199,14 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
 				struct sk_buff *skb, const struct nlmsghdr *nlh,
 				const struct nlattr * const nla[])
 {
+	u8 genmask = nft_genmask_cur(net);
 	const struct nft_set *set;
 	struct nft_ctx ctx;
 	int err;
 
-	err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla);
+	err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
 	if (err < 0)
 		return err;
-	if (ctx.table->flags & NFT_TABLE_INACTIVE)
-		return -ENOENT;
 
 	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
 	if (IS_ERR(set))
@@ -3519,6 +3530,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
 				struct sk_buff *skb, const struct nlmsghdr *nlh,
 				const struct nlattr * const nla[])
 {
+	u8 genmask = nft_genmask_next(net);
 	const struct nlattr *attr;
 	struct nft_set *set;
 	struct nft_ctx ctx;
@@ -3527,7 +3539,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
 	if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
 		return -EINVAL;
 
-	err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla);
+	err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
 	if (err < 0)
 		return err;
 
@@ -3641,6 +3653,7 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
 				struct sk_buff *skb, const struct nlmsghdr *nlh,
 				const struct nlattr * const nla[])
 {
+	u8 genmask = nft_genmask_next(net);
 	const struct nlattr *attr;
 	struct nft_set *set;
 	struct nft_ctx ctx;
@@ -3649,7 +3662,7 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
 	if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
 		return -EINVAL;
 
-	err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla);
+	err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
 	if (err < 0)
 		return err;
 
@@ -3926,12 +3939,13 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 					trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
 				}
 			} else {
-				trans->ctx.table->flags &= ~NFT_TABLE_INACTIVE;
+				nft_clear(net, trans->ctx.table);
 			}
 			nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE);
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_DELTABLE:
+			list_del_rcu(&trans->ctx.table->list);
 			nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE);
 			break;
 		case NFT_MSG_NEWCHAIN:
@@ -4057,8 +4071,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 			}
 			break;
 		case NFT_MSG_DELTABLE:
-			list_add_tail_rcu(&trans->ctx.table->list,
-					  &trans->ctx.afi->tables);
+			nft_clear(trans->ctx.net, trans->ctx.table);
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_NEWCHAIN:
-- 
cgit v1.2.3


From 664b0f8cd8c66d02d14168ee7ac6a957cc88177f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 12 Jun 2016 19:21:31 +0200
Subject: netfilter: nf_tables: add generation mask to chains

Similar to ("netfilter: nf_tables: add generation mask to tables").

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  4 +-
 net/netfilter/nf_tables_api.c     | 89 +++++++++++++++++++++++++--------------
 2 files changed, 60 insertions(+), 33 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 05c9a64b39aa..b023e287ea92 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -732,7 +732,6 @@ static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule)
 
 enum nft_chain_flags {
 	NFT_BASE_CHAIN			= 0x1,
-	NFT_CHAIN_INACTIVE		= 0x2,
 };
 
 /**
@@ -754,7 +753,8 @@ struct nft_chain {
 	u64				handle;
 	u32				use;
 	u16				level;
-	u8				flags;
+	u8				flags:6,
+					genmask:2;
 	char				name[NFT_CHAIN_MAXNAMELEN];
 };
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index a4a77d60e631..cae88f8a990e 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -211,7 +211,7 @@ static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
 		return -ENOMEM;
 
 	if (msg_type == NFT_MSG_NEWCHAIN)
-		ctx->chain->flags |= NFT_CHAIN_INACTIVE;
+		nft_activate_next(ctx->net, ctx->chain);
 
 	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
 	return 0;
@@ -226,7 +226,7 @@ static int nft_delchain(struct nft_ctx *ctx)
 		return err;
 
 	ctx->table->use--;
-	list_del_rcu(&ctx->chain->list);
+	nft_deactivate_next(ctx->net, ctx->chain);
 
 	return err;
 }
@@ -559,13 +559,16 @@ err:
 	return err;
 }
 
-static int nf_tables_table_enable(const struct nft_af_info *afi,
+static int nf_tables_table_enable(struct net *net,
+				  const struct nft_af_info *afi,
 				  struct nft_table *table)
 {
 	struct nft_chain *chain;
 	int err, i = 0;
 
 	list_for_each_entry(chain, &table->chains, list) {
+		if (!nft_is_active_next(net, chain))
+			continue;
 		if (!(chain->flags & NFT_BASE_CHAIN))
 			continue;
 
@@ -578,6 +581,8 @@ static int nf_tables_table_enable(const struct nft_af_info *afi,
 	return 0;
 err:
 	list_for_each_entry(chain, &table->chains, list) {
+		if (!nft_is_active_next(net, chain))
+			continue;
 		if (!(chain->flags & NFT_BASE_CHAIN))
 			continue;
 
@@ -589,12 +594,15 @@ err:
 	return err;
 }
 
-static void nf_tables_table_disable(const struct nft_af_info *afi,
+static void nf_tables_table_disable(struct net *net,
+				    const struct nft_af_info *afi,
 				    struct nft_table *table)
 {
 	struct nft_chain *chain;
 
 	list_for_each_entry(chain, &table->chains, list) {
+		if (!nft_is_active_next(net, chain))
+			continue;
 		if (chain->flags & NFT_BASE_CHAIN)
 			nft_unregister_basechain(nft_base_chain(chain),
 						 afi->nops);
@@ -627,7 +635,7 @@ static int nf_tables_updtable(struct nft_ctx *ctx)
 		nft_trans_table_enable(trans) = false;
 	} else if (!(flags & NFT_TABLE_F_DORMANT) &&
 		   ctx->table->flags & NFT_TABLE_F_DORMANT) {
-		ret = nf_tables_table_enable(ctx->afi, ctx->table);
+		ret = nf_tables_table_enable(ctx->net, ctx->afi, ctx->table);
 		if (ret >= 0) {
 			ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
 			nft_trans_table_enable(trans) = true;
@@ -722,6 +730,9 @@ static int nft_flush_table(struct nft_ctx *ctx)
 	struct nft_set *set, *ns;
 
 	list_for_each_entry(chain, &ctx->table->chains, list) {
+		if (!nft_is_active_next(ctx->net, chain))
+			continue;
+
 		ctx->chain = chain;
 
 		err = nft_delrule_by_chain(ctx);
@@ -740,6 +751,9 @@ static int nft_flush_table(struct nft_ctx *ctx)
 	}
 
 	list_for_each_entry_safe(chain, nc, &ctx->table->chains, list) {
+		if (!nft_is_active_next(ctx->net, chain))
+			continue;
+
 		ctx->chain = chain;
 
 		err = nft_delchain(ctx);
@@ -849,12 +863,14 @@ EXPORT_SYMBOL_GPL(nft_unregister_chain_type);
  */
 
 static struct nft_chain *
-nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle)
+nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle,
+				u8 genmask)
 {
 	struct nft_chain *chain;
 
 	list_for_each_entry(chain, &table->chains, list) {
-		if (chain->handle == handle)
+		if (chain->handle == handle &&
+		    nft_active_genmask(chain, genmask))
 			return chain;
 	}
 
@@ -862,7 +878,8 @@ nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle)
 }
 
 static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table,
-						const struct nlattr *nla)
+						const struct nlattr *nla,
+						u8 genmask)
 {
 	struct nft_chain *chain;
 
@@ -870,7 +887,8 @@ static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table,
 		return ERR_PTR(-EINVAL);
 
 	list_for_each_entry(chain, &table->chains, list) {
-		if (!nla_strcmp(nla, chain->name))
+		if (!nla_strcmp(nla, chain->name) &&
+		    nft_active_genmask(chain, genmask))
 			return chain;
 	}
 
@@ -1053,6 +1071,8 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
 				if (idx > s_idx)
 					memset(&cb->args[1], 0,
 					       sizeof(cb->args) - sizeof(cb->args[0]));
+				if (!nft_is_active(net, chain))
+					continue;
 				if (nf_tables_fill_chain_info(skb, net,
 							      NETLINK_CB(cb->skb).portid,
 							      cb->nlh->nlmsg_seq,
@@ -1101,11 +1121,9 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
+	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
 	if (IS_ERR(chain))
 		return PTR_ERR(chain);
-	if (chain->flags & NFT_CHAIN_INACTIVE)
-		return -ENOENT;
 
 	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb2)
@@ -1230,11 +1248,11 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 
 	if (nla[NFTA_CHAIN_HANDLE]) {
 		handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
-		chain = nf_tables_chain_lookup_byhandle(table, handle);
+		chain = nf_tables_chain_lookup_byhandle(table, handle, genmask);
 		if (IS_ERR(chain))
 			return PTR_ERR(chain);
 	} else {
-		chain = nf_tables_chain_lookup(table, name);
+		chain = nf_tables_chain_lookup(table, name, genmask);
 		if (IS_ERR(chain)) {
 			if (PTR_ERR(chain) != -ENOENT)
 				return PTR_ERR(chain);
@@ -1265,16 +1283,20 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 		struct nft_stats *stats = NULL;
 		struct nft_trans *trans;
 
-		if (chain->flags & NFT_CHAIN_INACTIVE)
-			return -ENOENT;
 		if (nlh->nlmsg_flags & NLM_F_EXCL)
 			return -EEXIST;
 		if (nlh->nlmsg_flags & NLM_F_REPLACE)
 			return -EOPNOTSUPP;
 
-		if (nla[NFTA_CHAIN_HANDLE] && name &&
-		    !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME])))
-			return -EEXIST;
+		if (nla[NFTA_CHAIN_HANDLE] && name) {
+			struct nft_chain *chain2;
+
+			chain2 = nf_tables_chain_lookup(table,
+							nla[NFTA_CHAIN_NAME],
+							genmask);
+			if (IS_ERR(chain2))
+				return PTR_ERR(chain2);
+		}
 
 		if (nla[NFTA_CHAIN_COUNTERS]) {
 			if (!(chain->flags & NFT_BASE_CHAIN))
@@ -1468,7 +1490,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
+	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
 	if (IS_ERR(chain))
 		return PTR_ERR(chain);
 	if (chain->use > 0)
@@ -1930,11 +1952,9 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
 	if (IS_ERR(chain))
 		return PTR_ERR(chain);
-	if (chain->flags & NFT_CHAIN_INACTIVE)
-		return -ENOENT;
 
 	rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
 	if (IS_ERR(rule))
@@ -2008,7 +2028,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
 	if (IS_ERR(chain))
 		return PTR_ERR(chain);
 
@@ -2166,7 +2186,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 		return PTR_ERR(table);
 
 	if (nla[NFTA_RULE_CHAIN]) {
-		chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+		chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN],
+					       genmask);
 		if (IS_ERR(chain))
 			return PTR_ERR(chain);
 	}
@@ -2186,6 +2207,9 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 		}
 	} else {
 		list_for_each_entry(chain, &table->chains, list) {
+			if (!nft_is_active_next(net, chain))
+				continue;
+
 			ctx.chain = chain;
 			err = nft_delrule_by_chain(&ctx);
 			if (err < 0)
@@ -3934,7 +3958,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 		case NFT_MSG_NEWTABLE:
 			if (nft_trans_table_update(trans)) {
 				if (!nft_trans_table_enable(trans)) {
-					nf_tables_table_disable(trans->ctx.afi,
+					nf_tables_table_disable(net,
+								trans->ctx.afi,
 								trans->ctx.table);
 					trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
 				}
@@ -3952,12 +3977,13 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 			if (nft_trans_chain_update(trans))
 				nft_chain_commit_update(trans);
 			else
-				trans->ctx.chain->flags &= ~NFT_CHAIN_INACTIVE;
+				nft_clear(net, trans->ctx.chain);
 
 			nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_DELCHAIN:
+			list_del_rcu(&trans->ctx.chain->list);
 			nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN);
 			nf_tables_unregister_hooks(trans->ctx.table,
 						   trans->ctx.chain,
@@ -4061,7 +4087,8 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 		case NFT_MSG_NEWTABLE:
 			if (nft_trans_table_update(trans)) {
 				if (nft_trans_table_enable(trans)) {
-					nf_tables_table_disable(trans->ctx.afi,
+					nf_tables_table_disable(net,
+								trans->ctx.afi,
 								trans->ctx.table);
 					trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
 				}
@@ -4089,8 +4116,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 			break;
 		case NFT_MSG_DELCHAIN:
 			trans->ctx.table->use++;
-			list_add_tail_rcu(&trans->ctx.chain->list,
-					  &trans->ctx.table->chains);
+			nft_clear(trans->ctx.net, trans->ctx.chain);
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_NEWRULE:
@@ -4413,6 +4439,7 @@ static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = {
 static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
 			    struct nft_data_desc *desc, const struct nlattr *nla)
 {
+	u8 genmask = nft_genmask_next(ctx->net);
 	struct nlattr *tb[NFTA_VERDICT_MAX + 1];
 	struct nft_chain *chain;
 	int err;
@@ -4445,7 +4472,7 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
 		if (!tb[NFTA_VERDICT_CHAIN])
 			return -EINVAL;
 		chain = nf_tables_chain_lookup(ctx->table,
-					       tb[NFTA_VERDICT_CHAIN]);
+					       tb[NFTA_VERDICT_CHAIN], genmask);
 		if (IS_ERR(chain))
 			return PTR_ERR(chain);
 		if (chain->flags & NFT_BASE_CHAIN)
-- 
cgit v1.2.3


From 37a9cc52552579f22e18cca401cfc4351b6cbc72 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 12 Jun 2016 22:52:45 +0200
Subject: netfilter: nf_tables: add generation mask to sets

Similar to ("netfilter: nf_tables: add generation mask to tables").

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  8 +++--
 net/netfilter/nf_tables_api.c     | 68 +++++++++++++++++++++++----------------
 net/netfilter/nft_dynset.c        |  7 ++--
 net/netfilter/nft_lookup.c        |  6 ++--
 4 files changed, 54 insertions(+), 35 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index b023e287ea92..07a5ba47cbda 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -296,6 +296,7 @@ void nft_unregister_set(struct nft_set_ops *ops);
  * 	@ops: set ops
  * 	@pnet: network namespace
  * 	@flags: set flags
+ *	@genmask: generation mask
  * 	@klen: key length
  * 	@dlen: data length
  * 	@data: private set data
@@ -317,7 +318,8 @@ struct nft_set {
 	/* runtime data below here */
 	const struct nft_set_ops	*ops ____cacheline_aligned;
 	possible_net_t			pnet;
-	u16				flags;
+	u16				flags:14,
+					genmask:2;
 	u8				klen;
 	u8				dlen;
 	unsigned char			data[]
@@ -335,9 +337,9 @@ static inline struct nft_set *nft_set_container_of(const void *priv)
 }
 
 struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
-				     const struct nlattr *nla);
+				     const struct nlattr *nla, u8 genmask);
 struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
-					  const struct nlattr *nla);
+					  const struct nlattr *nla, u8 genmask);
 
 static inline unsigned long nft_set_gc_interval(const struct nft_set *set)
 {
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index cae88f8a990e..3316bce0a878 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -289,9 +289,6 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx)
 	return 0;
 }
 
-/* Internal set flag */
-#define NFT_SET_INACTIVE	(1 << 15)
-
 static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type,
 			     struct nft_set *set)
 {
@@ -304,7 +301,7 @@ static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type,
 	if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) {
 		nft_trans_set_id(trans) =
 			ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID]));
-		set->flags |= NFT_SET_INACTIVE;
+		nft_activate_next(ctx->net, set);
 	}
 	nft_trans_set(trans) = set;
 	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
@@ -320,7 +317,7 @@ static int nft_delset(struct nft_ctx *ctx, struct nft_set *set)
 	if (err < 0)
 		return err;
 
-	list_del_rcu(&set->list);
+	nft_deactivate_next(ctx->net, set);
 	ctx->table->use--;
 
 	return err;
@@ -741,6 +738,9 @@ static int nft_flush_table(struct nft_ctx *ctx)
 	}
 
 	list_for_each_entry_safe(set, ns, &ctx->table->sets, list) {
+		if (!nft_is_active_next(ctx->net, set))
+			continue;
+
 		if (set->flags & NFT_SET_ANONYMOUS &&
 		    !list_empty(&set->bindings))
 			continue;
@@ -2367,7 +2367,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
 }
 
 struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
-				     const struct nlattr *nla)
+				     const struct nlattr *nla, u8 genmask)
 {
 	struct nft_set *set;
 
@@ -2375,22 +2375,27 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
 		return ERR_PTR(-EINVAL);
 
 	list_for_each_entry(set, &table->sets, list) {
-		if (!nla_strcmp(nla, set->name))
+		if (!nla_strcmp(nla, set->name) &&
+		    nft_active_genmask(set, genmask))
 			return set;
 	}
 	return ERR_PTR(-ENOENT);
 }
 
 struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
-					  const struct nlattr *nla)
+					  const struct nlattr *nla,
+					  u8 genmask)
 {
 	struct nft_trans *trans;
 	u32 id = ntohl(nla_get_be32(nla));
 
 	list_for_each_entry(trans, &net->nft.commit_list, list) {
+		struct nft_set *set = nft_trans_set(trans);
+
 		if (trans->msg_type == NFT_MSG_NEWSET &&
-		    id == nft_trans_set_id(trans))
-			return nft_trans_set(trans);
+		    id == nft_trans_set_id(trans) &&
+		    nft_active_genmask(set, genmask))
+			return set;
 	}
 	return ERR_PTR(-ENOENT);
 }
@@ -2415,6 +2420,8 @@ cont:
 		list_for_each_entry(i, &ctx->table->sets, list) {
 			int tmp;
 
+			if (!nft_is_active_next(ctx->net, set))
+				continue;
 			if (!sscanf(i->name, name, &tmp))
 				continue;
 			if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE)
@@ -2434,6 +2441,8 @@ cont:
 
 	snprintf(set->name, sizeof(set->name), name, min + n);
 	list_for_each_entry(i, &ctx->table->sets, list) {
+		if (!nft_is_active_next(ctx->net, i))
+			continue;
 		if (!strcmp(set->name, i->name))
 			return -ENFILE;
 	}
@@ -2582,6 +2591,8 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
 			list_for_each_entry_rcu(set, &table->sets, list) {
 				if (idx < s_idx)
 					goto cont;
+				if (!nft_is_active(net, set))
+					goto cont;
 
 				ctx_set = *ctx;
 				ctx_set.table = table;
@@ -2651,11 +2662,9 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
 	if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
 		return -EAFNOSUPPORT;
 
-	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
 	if (IS_ERR(set))
 		return PTR_ERR(set);
-	if (set->flags & NFT_SET_INACTIVE)
-		return -ENOENT;
 
 	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (skb2 == NULL)
@@ -2798,7 +2807,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 
 	nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
 
-	set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]);
+	set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME], genmask);
 	if (IS_ERR(set)) {
 		if (PTR_ERR(set) != -ENOENT)
 			return PTR_ERR(set);
@@ -2911,7 +2920,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
 	if (err < 0)
 		return err;
 
-	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
 	if (IS_ERR(set))
 		return PTR_ERR(set);
 	if (!list_empty(&set->bindings))
@@ -2980,7 +2989,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
 	list_del_rcu(&binding->list);
 
 	if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS &&
-	    !(set->flags & NFT_SET_INACTIVE))
+	    nft_is_active(ctx->net, set))
 		nf_tables_set_destroy(ctx, set);
 }
 
@@ -3166,11 +3175,10 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 	if (err < 0)
 		return err;
 
-	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
+				   genmask);
 	if (IS_ERR(set))
 		return PTR_ERR(set);
-	if (set->flags & NFT_SET_INACTIVE)
-		return -ENOENT;
 
 	event  = NFT_MSG_NEWSETELEM;
 	event |= NFNL_SUBSYS_NFTABLES << 8;
@@ -3232,11 +3240,10 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
 	if (err < 0)
 		return err;
 
-	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
+				   genmask);
 	if (IS_ERR(set))
 		return PTR_ERR(set);
-	if (set->flags & NFT_SET_INACTIVE)
-		return -ENOENT;
 
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
@@ -3567,11 +3574,13 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
 	if (err < 0)
 		return err;
 
-	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
+				   genmask);
 	if (IS_ERR(set)) {
 		if (nla[NFTA_SET_ELEM_LIST_SET_ID]) {
 			set = nf_tables_set_lookup_byid(net,
-					nla[NFTA_SET_ELEM_LIST_SET_ID]);
+					nla[NFTA_SET_ELEM_LIST_SET_ID],
+					genmask);
 		}
 		if (IS_ERR(set))
 			return PTR_ERR(set);
@@ -3690,7 +3699,8 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
 	if (err < 0)
 		return err;
 
-	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
+				   genmask);
 	if (IS_ERR(set))
 		return PTR_ERR(set);
 	if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
@@ -4003,7 +4013,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 					      NFT_MSG_DELRULE);
 			break;
 		case NFT_MSG_NEWSET:
-			nft_trans_set(trans)->flags &= ~NFT_SET_INACTIVE;
+			nft_clear(net, nft_trans_set(trans));
 			/* This avoids hitting -EBUSY when deleting the table
 			 * from the transaction.
 			 */
@@ -4016,6 +4026,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_DELSET:
+			list_del_rcu(&nft_trans_set(trans)->list);
 			nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
 					     NFT_MSG_DELSET, GFP_KERNEL);
 			break;
@@ -4134,8 +4145,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 			break;
 		case NFT_MSG_DELSET:
 			trans->ctx.table->use++;
-			list_add_tail_rcu(&nft_trans_set(trans)->list,
-					  &trans->ctx.table->sets);
+			nft_clear(trans->ctx.net, nft_trans_set(trans));
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_NEWSETELEM:
@@ -4282,6 +4292,8 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
 	}
 
 	list_for_each_entry(set, &ctx->table->sets, list) {
+		if (!nft_is_active_next(ctx->net, set))
+			continue;
 		if (!(set->flags & NFT_SET_MAP) ||
 		    set->dtype != NFT_DATA_VERDICT)
 			continue;
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 78d4914fb39c..0af26699bf04 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -103,6 +103,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
 			   const struct nlattr * const tb[])
 {
 	struct nft_dynset *priv = nft_expr_priv(expr);
+	u8 genmask = nft_genmask_next(ctx->net);
 	struct nft_set *set;
 	u64 timeout;
 	int err;
@@ -112,11 +113,13 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
 	    tb[NFTA_DYNSET_SREG_KEY] == NULL)
 		return -EINVAL;
 
-	set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME]);
+	set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME],
+				   genmask);
 	if (IS_ERR(set)) {
 		if (tb[NFTA_DYNSET_SET_ID])
 			set = nf_tables_set_lookup_byid(ctx->net,
-							tb[NFTA_DYNSET_SET_ID]);
+							tb[NFTA_DYNSET_SET_ID],
+							genmask);
 		if (IS_ERR(set))
 			return PTR_ERR(set);
 	}
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index b3c31ef8015d..8a102cf855d0 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -54,6 +54,7 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
 			   const struct nlattr * const tb[])
 {
 	struct nft_lookup *priv = nft_expr_priv(expr);
+	u8 genmask = nft_genmask_next(ctx->net);
 	struct nft_set *set;
 	int err;
 
@@ -61,11 +62,12 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
 	    tb[NFTA_LOOKUP_SREG] == NULL)
 		return -EINVAL;
 
-	set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET]);
+	set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET], genmask);
 	if (IS_ERR(set)) {
 		if (tb[NFTA_LOOKUP_SET_ID]) {
 			set = nf_tables_set_lookup_byid(ctx->net,
-							tb[NFTA_LOOKUP_SET_ID]);
+							tb[NFTA_LOOKUP_SET_ID],
+							genmask);
 		}
 		if (IS_ERR(set))
 			return PTR_ERR(set);
-- 
cgit v1.2.3


From 3183ab8997a477c8d9ad175a1cef70dff77c6dbc Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 22 Jun 2016 13:26:10 +0200
Subject: netfilter: conntrack: allow increasing bucket size via sysctl too

No need to restrict this to module parameter.

We export a copy of the real hash size -- when user alters the value we
allocate the new table, copy entries etc before we update the real size
to the requested one.

This is also needed because the real size is used by concurrent readers
and cannot be changed without synchronizing the conntrack generation
seqcnt.

We only allow changing this value from the initial net namespace.

Tested using http-client-benchmark vs. httpterm with concurrent

while true;do
 echo $RANDOM > /proc/sys/net/netfilter/nf_conntrack_buckets
done

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 Documentation/networking/nf_conntrack-sysctl.txt |  3 +-
 include/net/netfilter/nf_conntrack.h             |  1 +
 net/netfilter/nf_conntrack_core.c                | 41 ++++++++++++++++--------
 net/netfilter/nf_conntrack_standalone.c          | 36 ++++++++++++++++++---
 4 files changed, 62 insertions(+), 19 deletions(-)

(limited to 'include/net')

diff --git a/Documentation/networking/nf_conntrack-sysctl.txt b/Documentation/networking/nf_conntrack-sysctl.txt
index f55599c62c9d..4fb51d32fccc 100644
--- a/Documentation/networking/nf_conntrack-sysctl.txt
+++ b/Documentation/networking/nf_conntrack-sysctl.txt
@@ -7,12 +7,13 @@ nf_conntrack_acct - BOOLEAN
 	Enable connection tracking flow accounting. 64-bit byte and packet
 	counters per flow are added.
 
-nf_conntrack_buckets - INTEGER (read-only)
+nf_conntrack_buckets - INTEGER
 	Size of hash table. If not specified as parameter during module
 	loading, the default size is calculated by dividing total memory
 	by 16384 to determine the number of buckets but the hash table will
 	never have fewer than 32 and limited to 16384 buckets. For systems
 	with more than 4GB of memory it will be 65536 buckets.
+	This sysctl is only writeable in the initial net namespace.
 
 nf_conntrack_checksum - BOOLEAN
 	0 - disabled
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 9c0ed3d7af89..5d3397f34583 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -290,6 +290,7 @@ static inline bool nf_is_loopback_packet(const struct sk_buff *skb)
 struct kernel_param;
 
 int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp);
+int nf_conntrack_hash_resize(unsigned int hashsize);
 extern unsigned int nf_conntrack_htable_size;
 extern unsigned int nf_conntrack_max;
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index a459176c3253..e17d5c7faca0 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1595,24 +1595,14 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
 }
 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
 
-int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
+int nf_conntrack_hash_resize(unsigned int hashsize)
 {
-	int i, bucket, rc;
-	unsigned int hashsize, old_size;
+	int i, bucket;
+	unsigned int old_size;
 	struct hlist_nulls_head *hash, *old_hash;
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
 
-	if (current->nsproxy->net_ns != &init_net)
-		return -EOPNOTSUPP;
-
-	/* On boot, we can set this without any fancy locking. */
-	if (!nf_conntrack_htable_size)
-		return param_set_uint(val, kp);
-
-	rc = kstrtouint(val, 0, &hashsize);
-	if (rc)
-		return rc;
 	if (!hashsize)
 		return -EINVAL;
 
@@ -1620,6 +1610,12 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 	if (!hash)
 		return -ENOMEM;
 
+	old_size = nf_conntrack_htable_size;
+	if (old_size == hashsize) {
+		nf_ct_free_hashtable(hash, hashsize);
+		return 0;
+	}
+
 	local_bh_disable();
 	nf_conntrack_all_lock();
 	write_seqcount_begin(&nf_conntrack_generation);
@@ -1655,6 +1651,25 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 	nf_ct_free_hashtable(old_hash, old_size);
 	return 0;
 }
+
+int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
+{
+	unsigned int hashsize;
+	int rc;
+
+	if (current->nsproxy->net_ns != &init_net)
+		return -EOPNOTSUPP;
+
+	/* On boot, we can set this without any fancy locking. */
+	if (!nf_conntrack_htable_size)
+		return param_set_uint(val, kp);
+
+	rc = kstrtouint(val, 0, &hashsize);
+	if (rc)
+		return rc;
+
+	return nf_conntrack_hash_resize(hashsize);
+}
 EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
 
 module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index f87e84ebcec3..a0cc1919f081 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -434,8 +434,29 @@ static void nf_conntrack_standalone_fini_proc(struct net *net)
 
 #ifdef CONFIG_SYSCTL
 /* Log invalid packets of a given protocol */
-static int log_invalid_proto_min = 0;
-static int log_invalid_proto_max = 255;
+static int log_invalid_proto_min __read_mostly;
+static int log_invalid_proto_max __read_mostly = 255;
+
+/* size the user *wants to set */
+static unsigned int nf_conntrack_htable_size_user __read_mostly;
+
+static int
+nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret < 0 || !write)
+		return ret;
+
+	/* update ret, we might not be able to satisfy request */
+	ret = nf_conntrack_hash_resize(nf_conntrack_htable_size_user);
+
+	/* update it to the actual value used by conntrack */
+	nf_conntrack_htable_size_user = nf_conntrack_htable_size;
+	return ret;
+}
 
 static struct ctl_table_header *nf_ct_netfilter_header;
 
@@ -456,10 +477,10 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 	},
 	{
 		.procname       = "nf_conntrack_buckets",
-		.data           = &nf_conntrack_htable_size,
+		.data           = &nf_conntrack_htable_size_user,
 		.maxlen         = sizeof(unsigned int),
-		.mode           = 0444,
-		.proc_handler   = proc_dointvec,
+		.mode           = 0644,
+		.proc_handler   = nf_conntrack_hash_sysctl,
 	},
 	{
 		.procname	= "nf_conntrack_checksum",
@@ -517,6 +538,9 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 	if (net->user_ns != &init_user_ns)
 		table[0].procname = NULL;
 
+	if (!net_eq(&init_net, net))
+		table[2].mode = 0444;
+
 	net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table);
 	if (!net->ct.sysctl_header)
 		goto out_unregister_netfilter;
@@ -606,6 +630,8 @@ static int __init nf_conntrack_standalone_init(void)
 		ret = -ENOMEM;
 		goto out_sysctl;
 	}
+
+	nf_conntrack_htable_size_user = nf_conntrack_htable_size;
 #endif
 
 	ret = register_pernet_subsys(&nf_conntrack_net_ops);
-- 
cgit v1.2.3


From 82bec71d46b83f39860e2838ff8394e4fcd6efab Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 22 Jun 2016 14:26:33 +0200
Subject: netfilter: nf_tables: get rid of NFT_BASECHAIN_DISABLED

This flag was introduced to restore rulesets from the new netdev
family, but since 5ebe0b0eec9d6f7 ("netfilter: nf_tables: destroy
basechain and rules on netdevice removal") the ruleset is released
once the netdev is gone.

This also removes nft_register_basechain() and
nft_unregister_basechain() since they have no clients anymore after
this rework.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  1 -
 net/netfilter/nf_tables_api.c     | 62 ++++++++++++++++-----------------------
 2 files changed, 25 insertions(+), 38 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 07a5ba47cbda..1ea19a6e72e6 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -798,7 +798,6 @@ struct nft_stats {
 };
 
 #define NFT_HOOK_OPS_MAX		2
-#define NFT_BASECHAIN_DISABLED		(1 << 0)
 
 /**
  *	struct nft_base_chain - nf_tables base chain
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3316bce0a878..92c9faeb2bf8 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -131,29 +131,8 @@ static void nft_trans_destroy(struct nft_trans *trans)
 	kfree(trans);
 }
 
-static int nft_register_basechain(struct nft_base_chain *basechain,
-				  unsigned int hook_nops)
-{
-	struct net *net = read_pnet(&basechain->pnet);
-
-	if (basechain->flags & NFT_BASECHAIN_DISABLED)
-		return 0;
-
-	return nf_register_net_hooks(net, basechain->ops, hook_nops);
-}
-
-static void nft_unregister_basechain(struct nft_base_chain *basechain,
-				     unsigned int hook_nops)
-{
-	struct net *net = read_pnet(&basechain->pnet);
-
-	if (basechain->flags & NFT_BASECHAIN_DISABLED)
-		return;
-
-	nf_unregister_net_hooks(net, basechain->ops, hook_nops);
-}
-
-static int nf_tables_register_hooks(const struct nft_table *table,
+static int nf_tables_register_hooks(struct net *net,
+				    const struct nft_table *table,
 				    struct nft_chain *chain,
 				    unsigned int hook_nops)
 {
@@ -161,10 +140,12 @@ static int nf_tables_register_hooks(const struct nft_table *table,
 	    !(chain->flags & NFT_BASE_CHAIN))
 		return 0;
 
-	return nft_register_basechain(nft_base_chain(chain), hook_nops);
+	return nf_register_net_hooks(net, nft_base_chain(chain)->ops,
+				     hook_nops);
 }
 
-static void nf_tables_unregister_hooks(const struct nft_table *table,
+static void nf_tables_unregister_hooks(struct net *net,
+				       const struct nft_table *table,
 				       struct nft_chain *chain,
 				       unsigned int hook_nops)
 {
@@ -172,7 +153,7 @@ static void nf_tables_unregister_hooks(const struct nft_table *table,
 	    !(chain->flags & NFT_BASE_CHAIN))
 		return;
 
-	nft_unregister_basechain(nft_base_chain(chain), hook_nops);
+	nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, hook_nops);
 }
 
 static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
@@ -569,7 +550,8 @@ static int nf_tables_table_enable(struct net *net,
 		if (!(chain->flags & NFT_BASE_CHAIN))
 			continue;
 
-		err = nft_register_basechain(nft_base_chain(chain), afi->nops);
+		err = nf_register_net_hooks(net, nft_base_chain(chain)->ops,
+					    afi->nops);
 		if (err < 0)
 			goto err;
 
@@ -586,7 +568,8 @@ err:
 		if (i-- <= 0)
 			break;
 
-		nft_unregister_basechain(nft_base_chain(chain), afi->nops);
+		nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
+					afi->nops);
 	}
 	return err;
 }
@@ -600,9 +583,11 @@ static void nf_tables_table_disable(struct net *net,
 	list_for_each_entry(chain, &table->chains, list) {
 		if (!nft_is_active_next(net, chain))
 			continue;
-		if (chain->flags & NFT_BASE_CHAIN)
-			nft_unregister_basechain(nft_base_chain(chain),
-						 afi->nops);
+		if (!(chain->flags & NFT_BASE_CHAIN))
+			continue;
+
+		nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
+					afi->nops);
 	}
 }
 
@@ -1451,7 +1436,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 	chain->table = table;
 	nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
 
-	err = nf_tables_register_hooks(table, chain, afi->nops);
+	err = nf_tables_register_hooks(net, table, chain, afi->nops);
 	if (err < 0)
 		goto err1;
 
@@ -1464,7 +1449,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 	list_add_tail_rcu(&chain->list, &table->chains);
 	return 0;
 err2:
-	nf_tables_unregister_hooks(table, chain, afi->nops);
+	nf_tables_unregister_hooks(net, table, chain, afi->nops);
 err1:
 	nf_tables_chain_destroy(chain);
 	return err;
@@ -3995,7 +3980,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 		case NFT_MSG_DELCHAIN:
 			list_del_rcu(&trans->ctx.chain->list);
 			nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN);
-			nf_tables_unregister_hooks(trans->ctx.table,
+			nf_tables_unregister_hooks(trans->ctx.net,
+						   trans->ctx.table,
 						   trans->ctx.chain,
 						   trans->ctx.afi->nops);
 			break;
@@ -4120,7 +4106,8 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 			} else {
 				trans->ctx.table->use--;
 				list_del_rcu(&trans->ctx.chain->list);
-				nf_tables_unregister_hooks(trans->ctx.table,
+				nf_tables_unregister_hooks(trans->ctx.net,
+							   trans->ctx.table,
 							   trans->ctx.chain,
 							   trans->ctx.afi->nops);
 			}
@@ -4662,7 +4649,7 @@ int __nft_release_basechain(struct nft_ctx *ctx)
 
 	BUG_ON(!(ctx->chain->flags & NFT_BASE_CHAIN));
 
-	nf_tables_unregister_hooks(ctx->chain->table, ctx->chain,
+	nf_tables_unregister_hooks(ctx->net, ctx->chain->table, ctx->chain,
 				   ctx->afi->nops);
 	list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) {
 		list_del(&rule->list);
@@ -4691,7 +4678,8 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
 
 	list_for_each_entry_safe(table, nt, &afi->tables, list) {
 		list_for_each_entry(chain, &table->chains, list)
-			nf_tables_unregister_hooks(table, chain, afi->nops);
+			nf_tables_unregister_hooks(net, table, chain,
+						   afi->nops);
 		/* No packets are walking on these chains anymore. */
 		ctx.table = table;
 		list_for_each_entry(chain, &table->chains, list) {
-- 
cgit v1.2.3


From 520ac30f45519b0a82dd92117c181d1d6144677b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 21 Jun 2016 23:16:49 -0700
Subject: net_sched: drop packets after root qdisc lock is released

Qdisc performance suffers when packets are dropped at enqueue()
time because drops (kfree_skb()) are done while qdisc lock is held,
delaying a dequeue() draining the queue.

Nominal throughput can be reduced by 50 % when this happens,
at a time we would like the dequeue() to proceed as fast as possible.

Even FQ is vulnerable to this problem, while one of FQ goals was
to provide some flow isolation.

This patch adds a 'struct sk_buff **to_free' parameter to all
qdisc->enqueue(), and in qdisc_drop() helper.

I measured a performance increase of up to 12 %, but this patch
is a prereq so that future batches in enqueue() can fly.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 41 ++++++++++++++++++++++++++++++-----------
 net/core/dev.c            |  7 +++++--
 net/sched/sch_atm.c       |  9 +++++----
 net/sched/sch_blackhole.c |  5 +++--
 net/sched/sch_cbq.c       |  7 ++++---
 net/sched/sch_choke.c     | 16 +++++++++-------
 net/sched/sch_codel.c     |  8 +++++---
 net/sched/sch_drr.c       |  7 ++++---
 net/sched/sch_dsmark.c    |  9 +++++----
 net/sched/sch_fifo.c      | 15 +++++++++------
 net/sched/sch_fq.c        |  7 ++++---
 net/sched/sch_fq_codel.c  | 15 +++++++++------
 net/sched/sch_generic.c   | 10 ++++++----
 net/sched/sch_gred.c      |  7 ++++---
 net/sched/sch_hfsc.c      |  6 +++---
 net/sched/sch_hhf.c       | 10 +++++-----
 net/sched/sch_htb.c       | 10 ++++++----
 net/sched/sch_multiq.c    |  7 ++++---
 net/sched/sch_netem.c     | 25 +++++++++++++++----------
 net/sched/sch_pie.c       |  5 +++--
 net/sched/sch_plug.c      |  5 +++--
 net/sched/sch_prio.c      |  4 ++--
 net/sched/sch_qfq.c       |  7 ++++---
 net/sched/sch_red.c       |  7 ++++---
 net/sched/sch_sfb.c       |  7 ++++---
 net/sched/sch_sfq.c       |  8 ++++----
 net/sched/sch_tbf.c       | 16 +++++++++-------
 net/sched/sch_teql.c      |  4 ++--
 28 files changed, 170 insertions(+), 114 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 4f7cee8344c4..04e84c07c94f 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -37,8 +37,10 @@ struct qdisc_size_table {
 };
 
 struct Qdisc {
-	int 			(*enqueue)(struct sk_buff *skb, struct Qdisc *dev);
-	struct sk_buff *	(*dequeue)(struct Qdisc *dev);
+	int 			(*enqueue)(struct sk_buff *skb,
+					   struct Qdisc *sch,
+					   struct sk_buff **to_free);
+	struct sk_buff *	(*dequeue)(struct Qdisc *sch);
 	unsigned int		flags;
 #define TCQ_F_BUILTIN		1
 #define TCQ_F_INGRESS		2
@@ -160,7 +162,9 @@ struct Qdisc_ops {
 	char			id[IFNAMSIZ];
 	int			priv_size;
 
-	int 			(*enqueue)(struct sk_buff *, struct Qdisc *);
+	int 			(*enqueue)(struct sk_buff *skb,
+					   struct Qdisc *sch,
+					   struct sk_buff **to_free);
 	struct sk_buff *	(*dequeue)(struct Qdisc *);
 	struct sk_buff *	(*peek)(struct Qdisc *);
 
@@ -498,10 +502,11 @@ static inline void qdisc_calculate_pkt_len(struct sk_buff *skb,
 #endif
 }
 
-static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+				struct sk_buff **to_free)
 {
 	qdisc_calculate_pkt_len(skb, sch);
-	return sch->enqueue(skb, sch);
+	return sch->enqueue(skb, sch, to_free);
 }
 
 static inline bool qdisc_is_percpu_stats(const struct Qdisc *q)
@@ -626,24 +631,36 @@ static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch)
 	return __qdisc_dequeue_head(sch, &sch->q);
 }
 
+/* Instead of calling kfree_skb() while root qdisc lock is held,
+ * queue the skb for future freeing at end of __dev_xmit_skb()
+ */
+static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free)
+{
+	skb->next = *to_free;
+	*to_free = skb;
+}
+
 static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch,
-					      struct sk_buff_head *list)
+						   struct sk_buff_head *list,
+						   struct sk_buff **to_free)
 {
 	struct sk_buff *skb = __skb_dequeue(list);
 
 	if (likely(skb != NULL)) {
 		unsigned int len = qdisc_pkt_len(skb);
+
 		qdisc_qstats_backlog_dec(sch, skb);
-		kfree_skb(skb);
+		__qdisc_drop(skb, to_free);
 		return len;
 	}
 
 	return 0;
 }
 
-static inline unsigned int qdisc_queue_drop_head(struct Qdisc *sch)
+static inline unsigned int qdisc_queue_drop_head(struct Qdisc *sch,
+						 struct sk_buff **to_free)
 {
-	return __qdisc_queue_drop_head(sch, &sch->q);
+	return __qdisc_queue_drop_head(sch, &sch->q, to_free);
 }
 
 static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch)
@@ -724,9 +741,11 @@ static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
 	qdisc_qstats_drop(sch);
 }
 
-static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
+
+static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch,
+			     struct sk_buff **to_free)
 {
-	kfree_skb(skb);
+	__qdisc_drop(skb, to_free);
 	qdisc_qstats_drop(sch);
 
 	return NET_XMIT_DROP;
diff --git a/net/core/dev.c b/net/core/dev.c
index d40593b3b9fb..aba10d2a8bc3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3070,6 +3070,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				 struct netdev_queue *txq)
 {
 	spinlock_t *root_lock = qdisc_lock(q);
+	struct sk_buff *to_free = NULL;
 	bool contended;
 	int rc;
 
@@ -3086,7 +3087,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 
 	spin_lock(root_lock);
 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
-		kfree_skb(skb);
+		__qdisc_drop(skb, &to_free);
 		rc = NET_XMIT_DROP;
 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
 		   qdisc_run_begin(q)) {
@@ -3109,7 +3110,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 
 		rc = NET_XMIT_SUCCESS;
 	} else {
-		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
+		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
 		if (qdisc_run_begin(q)) {
 			if (unlikely(contended)) {
 				spin_unlock(&q->busylock);
@@ -3119,6 +3120,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		}
 	}
 	spin_unlock(root_lock);
+	if (unlikely(to_free))
+		kfree_skb_list(to_free);
 	if (unlikely(contended))
 		spin_unlock(&q->busylock);
 	return rc;
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index e04ea6994d1c..481e4f12aeb4 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -357,7 +357,8 @@ static struct tcf_proto __rcu **atm_tc_find_tcf(struct Qdisc *sch,
 
 /* --------------------------- Qdisc operations ---------------------------- */
 
-static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			  struct sk_buff **to_free)
 {
 	struct atm_qdisc_data *p = qdisc_priv(sch);
 	struct atm_flow_data *flow;
@@ -398,10 +399,10 @@ done:
 		switch (result) {
 		case TC_ACT_QUEUED:
 		case TC_ACT_STOLEN:
-			kfree_skb(skb);
+			__qdisc_drop(skb, to_free);
 			return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 		case TC_ACT_SHOT:
-			kfree_skb(skb);
+			__qdisc_drop(skb, to_free);
 			goto drop;
 		case TC_ACT_RECLASSIFY:
 			if (flow->excess)
@@ -413,7 +414,7 @@ done:
 #endif
 	}
 
-	ret = qdisc_enqueue(skb, flow->q);
+	ret = qdisc_enqueue(skb, flow->q, to_free);
 	if (ret != NET_XMIT_SUCCESS) {
 drop: __maybe_unused
 		if (net_xmit_drop_count(ret)) {
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
index 3fee70d9814f..c98a61e980ba 100644
--- a/net/sched/sch_blackhole.c
+++ b/net/sched/sch_blackhole.c
@@ -17,9 +17,10 @@
 #include <linux/skbuff.h>
 #include <net/pkt_sched.h>
 
-static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			     struct sk_buff **to_free)
 {
-	qdisc_drop(skb, sch);
+	qdisc_drop(skb, sch, to_free);
 	return NET_XMIT_SUCCESS;
 }
 
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index a29fd811d7b9..beb554aa8cfb 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -358,7 +358,8 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 }
 
 static int
-cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+	    struct sk_buff **to_free)
 {
 	struct cbq_sched_data *q = qdisc_priv(sch);
 	int uninitialized_var(ret);
@@ -370,11 +371,11 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (cl == NULL) {
 		if (ret & __NET_XMIT_BYPASS)
 			qdisc_qstats_drop(sch);
-		kfree_skb(skb);
+		__qdisc_drop(skb, to_free);
 		return ret;
 	}
 
-	ret = qdisc_enqueue(skb, cl->q);
+	ret = qdisc_enqueue(skb, cl->q, to_free);
 	if (ret == NET_XMIT_SUCCESS) {
 		sch->q.qlen++;
 		cbq_mark_toplevel(q, cl);
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 789b69ee9e51..3b6d5bd69101 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -115,7 +115,8 @@ static void choke_zap_tail_holes(struct choke_sched_data *q)
 }
 
 /* Drop packet from queue array by creating a "hole" */
-static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
+static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
+			      struct sk_buff **to_free)
 {
 	struct choke_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb = q->tab[idx];
@@ -129,7 +130,7 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
 
 	qdisc_qstats_backlog_dec(sch, skb);
 	qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
-	qdisc_drop(skb, sch);
+	qdisc_drop(skb, sch, to_free);
 	--sch->q.qlen;
 }
 
@@ -261,7 +262,8 @@ static bool choke_match_random(const struct choke_sched_data *q,
 	return choke_match_flow(oskb, nskb);
 }
 
-static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			 struct sk_buff **to_free)
 {
 	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	struct choke_sched_data *q = qdisc_priv(sch);
@@ -288,7 +290,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		/* Draw a packet at random from queue and compare flow */
 		if (choke_match_random(q, skb, &idx)) {
 			q->stats.matched++;
-			choke_drop_by_idx(sch, idx);
+			choke_drop_by_idx(sch, idx, to_free);
 			goto congestion_drop;
 		}
 
@@ -331,16 +333,16 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	}
 
 	q->stats.pdrop++;
-	return qdisc_drop(skb, sch);
+	return qdisc_drop(skb, sch, to_free);
 
 congestion_drop:
-	qdisc_drop(skb, sch);
+	qdisc_drop(skb, sch, to_free);
 	return NET_XMIT_CN;
 
 other_drop:
 	if (ret & __NET_XMIT_BYPASS)
 		qdisc_qstats_drop(sch);
-	kfree_skb(skb);
+	__qdisc_drop(skb, to_free);
 	return ret;
 }
 
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index c5bc424e3b3c..4002df3c7d9f 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -82,7 +82,8 @@ static void drop_func(struct sk_buff *skb, void *ctx)
 {
 	struct Qdisc *sch = ctx;
 
-	qdisc_drop(skb, sch);
+	kfree_skb(skb);
+	qdisc_qstats_drop(sch);
 }
 
 static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
@@ -107,7 +108,8 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
 	return skb;
 }
 
-static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			       struct sk_buff **to_free)
 {
 	struct codel_sched_data *q;
 
@@ -117,7 +119,7 @@ static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	}
 	q = qdisc_priv(sch);
 	q->drop_overlimit++;
-	return qdisc_drop(skb, sch);
+	return qdisc_drop(skb, sch, to_free);
 }
 
 static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 22609e4e845f..8af5c59eef84 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -350,7 +350,8 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
 	return NULL;
 }
 
-static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+		       struct sk_buff **to_free)
 {
 	struct drr_sched *q = qdisc_priv(sch);
 	struct drr_class *cl;
@@ -360,11 +361,11 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (cl == NULL) {
 		if (err & __NET_XMIT_BYPASS)
 			qdisc_qstats_drop(sch);
-		kfree_skb(skb);
+		__qdisc_drop(skb, to_free);
 		return err;
 	}
 
-	err = qdisc_enqueue(skb, cl->qdisc);
+	err = qdisc_enqueue(skb, cl->qdisc, to_free);
 	if (unlikely(err != NET_XMIT_SUCCESS)) {
 		if (net_xmit_drop_count(err)) {
 			cl->qstats.drops++;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index b9ba5f658528..1308bbf460f7 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -191,7 +191,8 @@ static inline struct tcf_proto __rcu **dsmark_find_tcf(struct Qdisc *sch,
 
 /* --------------------------- Qdisc operations ---------------------------- */
 
-static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			  struct sk_buff **to_free)
 {
 	struct dsmark_qdisc_data *p = qdisc_priv(sch);
 	int err;
@@ -234,7 +235,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 #ifdef CONFIG_NET_CLS_ACT
 		case TC_ACT_QUEUED:
 		case TC_ACT_STOLEN:
-			kfree_skb(skb);
+			__qdisc_drop(skb, to_free);
 			return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 
 		case TC_ACT_SHOT:
@@ -251,7 +252,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		}
 	}
 
-	err = qdisc_enqueue(skb, p->q);
+	err = qdisc_enqueue(skb, p->q, to_free);
 	if (err != NET_XMIT_SUCCESS) {
 		if (net_xmit_drop_count(err))
 			qdisc_qstats_drop(sch);
@@ -264,7 +265,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	return NET_XMIT_SUCCESS;
 
 drop:
-	qdisc_drop(skb, sch);
+	qdisc_drop(skb, sch, to_free);
 	return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 }
 
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index dea70e3ef0ba..6ea0db427f91 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -19,29 +19,32 @@
 
 /* 1 band FIFO pseudo-"scheduler" */
 
-static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			 struct sk_buff **to_free)
 {
 	if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit))
 		return qdisc_enqueue_tail(skb, sch);
 
-	return qdisc_drop(skb, sch);
+	return qdisc_drop(skb, sch, to_free);
 }
 
-static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			 struct sk_buff **to_free)
 {
 	if (likely(skb_queue_len(&sch->q) < sch->limit))
 		return qdisc_enqueue_tail(skb, sch);
 
-	return qdisc_drop(skb, sch);
+	return qdisc_drop(skb, sch, to_free);
 }
 
-static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			      struct sk_buff **to_free)
 {
 	if (likely(skb_queue_len(&sch->q) < sch->limit))
 		return qdisc_enqueue_tail(skb, sch);
 
 	/* queue full, remove one skb to fulfill the limit */
-	__qdisc_queue_drop_head(sch, &sch->q);
+	__qdisc_queue_drop_head(sch, &sch->q, to_free);
 	qdisc_qstats_drop(sch);
 	qdisc_enqueue_tail(skb, sch);
 
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 6eb06674f778..e5458b99e09c 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -368,18 +368,19 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
 	}
 }
 
-static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+		      struct sk_buff **to_free)
 {
 	struct fq_sched_data *q = qdisc_priv(sch);
 	struct fq_flow *f;
 
 	if (unlikely(sch->q.qlen >= sch->limit))
-		return qdisc_drop(skb, sch);
+		return qdisc_drop(skb, sch, to_free);
 
 	f = fq_classify(skb, q);
 	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
 		q->stat_flows_plimit++;
-		return qdisc_drop(skb, sch);
+		return qdisc_drop(skb, sch, to_free);
 	}
 
 	f->qlen++;
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 2dc0a849515a..f715195459c9 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -139,7 +139,8 @@ static inline void flow_queue_add(struct fq_codel_flow *flow,
 	skb->next = NULL;
 }
 
-static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets)
+static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets,
+				  struct sk_buff **to_free)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
@@ -172,7 +173,7 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets)
 		skb = dequeue_head(flow);
 		len += qdisc_pkt_len(skb);
 		mem += skb->truesize;
-		kfree_skb(skb);
+		__qdisc_drop(skb, to_free);
 	} while (++i < max_packets && len < threshold);
 
 	flow->dropped += i;
@@ -184,7 +185,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets)
 	return idx;
 }
 
-static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			    struct sk_buff **to_free)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
 	unsigned int idx, prev_backlog, prev_qlen;
@@ -197,7 +199,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (idx == 0) {
 		if (ret & __NET_XMIT_BYPASS)
 			qdisc_qstats_drop(sch);
-		kfree_skb(skb);
+		__qdisc_drop(skb, to_free);
 		return ret;
 	}
 	idx--;
@@ -229,7 +231,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	 * So instead of dropping a single packet, drop half of its backlog
 	 * with a 64 packets limit to not add a too big cpu spike here.
 	 */
-	ret = fq_codel_drop(sch, q->drop_batch_size);
+	ret = fq_codel_drop(sch, q->drop_batch_size, to_free);
 
 	prev_qlen -= sch->q.qlen;
 	prev_backlog -= sch->qstats.backlog;
@@ -276,7 +278,8 @@ static void drop_func(struct sk_buff *skb, void *ctx)
 {
 	struct Qdisc *sch = ctx;
 
-	qdisc_drop(skb, sch);
+	kfree_skb(skb);
+	qdisc_qstats_drop(sch);
 }
 
 static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 773b632e1e33..ff86606954f2 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -348,9 +348,10 @@ EXPORT_SYMBOL(netif_carrier_off);
    cheaper.
  */
 
-static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
+static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
+			struct sk_buff **to_free)
 {
-	kfree_skb(skb);
+	__qdisc_drop(skb, to_free);
 	return NET_XMIT_CN;
 }
 
@@ -439,7 +440,8 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
 	return priv->q + band;
 }
 
-static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
+static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
+			      struct sk_buff **to_free)
 {
 	if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
 		int band = prio2band[skb->priority & TC_PRIO_MAX];
@@ -451,7 +453,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
 		return __qdisc_enqueue_tail(skb, qdisc, list);
 	}
 
-	return qdisc_drop(skb, qdisc);
+	return qdisc_drop(skb, qdisc, to_free);
 }
 
 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index b5fb63c7be02..c78a093c551a 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -149,7 +149,8 @@ static inline int gred_use_harddrop(struct gred_sched *t)
 	return t->red_flags & TC_RED_HARDDROP;
 }
 
-static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			struct sk_buff **to_free)
 {
 	struct gred_sched_data *q = NULL;
 	struct gred_sched *t = qdisc_priv(sch);
@@ -237,10 +238,10 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 	q->stats.pdrop++;
 drop:
-	return qdisc_drop(skb, sch);
+	return qdisc_drop(skb, sch, to_free);
 
 congestion_drop:
-	qdisc_drop(skb, sch);
+	qdisc_drop(skb, sch, to_free);
 	return NET_XMIT_CN;
 }
 
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index bd08c363a26d..8cb5eff7b79c 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1572,7 +1572,7 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
 }
 
 static int
-hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
 {
 	struct hfsc_class *cl;
 	int uninitialized_var(err);
@@ -1581,11 +1581,11 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (cl == NULL) {
 		if (err & __NET_XMIT_BYPASS)
 			qdisc_qstats_drop(sch);
-		kfree_skb(skb);
+		__qdisc_drop(skb, to_free);
 		return err;
 	}
 
-	err = qdisc_enqueue(skb, cl->qdisc);
+	err = qdisc_enqueue(skb, cl->qdisc, to_free);
 	if (unlikely(err != NET_XMIT_SUCCESS)) {
 		if (net_xmit_drop_count(err)) {
 			cl->qstats.drops++;
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index c44593b8e65a..e3d0458af17b 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -345,7 +345,7 @@ static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb)
 	skb->next = NULL;
 }
 
-static unsigned int hhf_drop(struct Qdisc *sch)
+static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free)
 {
 	struct hhf_sched_data *q = qdisc_priv(sch);
 	struct wdrr_bucket *bucket;
@@ -359,16 +359,16 @@ static unsigned int hhf_drop(struct Qdisc *sch)
 		struct sk_buff *skb = dequeue_head(bucket);
 
 		sch->q.qlen--;
-		qdisc_qstats_drop(sch);
 		qdisc_qstats_backlog_dec(sch, skb);
-		kfree_skb(skb);
+		qdisc_drop(skb, sch, to_free);
 	}
 
 	/* Return id of the bucket from which the packet was dropped. */
 	return bucket - q->buckets;
 }
 
-static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+		       struct sk_buff **to_free)
 {
 	struct hhf_sched_data *q = qdisc_priv(sch);
 	enum wdrr_bucket_idx idx;
@@ -406,7 +406,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	/* Return Congestion Notification only if we dropped a packet from this
 	 * bucket.
 	 */
-	if (hhf_drop(sch) == idx)
+	if (hhf_drop(sch, to_free) == idx)
 		return NET_XMIT_CN;
 
 	/* As we dropped a packet, better let upper stack know this. */
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index a454605ab5cb..f3882259c385 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -569,7 +569,8 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
 	list_del_init(&cl->un.leaf.drop_list);
 }
 
-static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+		       struct sk_buff **to_free)
 {
 	int uninitialized_var(ret);
 	struct htb_sched *q = qdisc_priv(sch);
@@ -581,16 +582,17 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 			__skb_queue_tail(&q->direct_queue, skb);
 			q->direct_pkts++;
 		} else {
-			return qdisc_drop(skb, sch);
+			return qdisc_drop(skb, sch, to_free);
 		}
 #ifdef CONFIG_NET_CLS_ACT
 	} else if (!cl) {
 		if (ret & __NET_XMIT_BYPASS)
 			qdisc_qstats_drop(sch);
-		kfree_skb(skb);
+		__qdisc_drop(skb, to_free);
 		return ret;
 #endif
-	} else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) {
+	} else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q,
+					to_free)) != NET_XMIT_SUCCESS) {
 		if (net_xmit_drop_count(ret)) {
 			qdisc_qstats_drop(sch);
 			cl->qstats.drops++;
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 5ea93305d705..9ffbb025b37e 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -65,7 +65,8 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 }
 
 static int
-multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+	       struct sk_buff **to_free)
 {
 	struct Qdisc *qdisc;
 	int ret;
@@ -76,12 +77,12 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 		if (ret & __NET_XMIT_BYPASS)
 			qdisc_qstats_drop(sch);
-		kfree_skb(skb);
+		__qdisc_drop(skb, to_free);
 		return ret;
 	}
 #endif
 
-	ret = qdisc_enqueue(skb, qdisc);
+	ret = qdisc_enqueue(skb, qdisc, to_free);
 	if (ret == NET_XMIT_SUCCESS) {
 		sch->q.qlen++;
 		return NET_XMIT_SUCCESS;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index e271967439bf..ccca8ca4c722 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -397,7 +397,8 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
  * when we statistically choose to corrupt one, we instead segment it, returning
  * the first packet to be corrupted, and re-enqueue the remaining frames
  */
-static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch)
+static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
+				     struct sk_buff **to_free)
 {
 	struct sk_buff *segs;
 	netdev_features_t features = netif_skb_features(skb);
@@ -405,7 +406,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch)
 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 
 	if (IS_ERR_OR_NULL(segs)) {
-		qdisc_drop(skb, sch);
+		qdisc_drop(skb, sch, to_free);
 		return NULL;
 	}
 	consume_skb(skb);
@@ -418,7 +419,8 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch)
  * 	NET_XMIT_DROP: queue length didn't change.
  *      NET_XMIT_SUCCESS: one skb was queued.
  */
-static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			 struct sk_buff **to_free)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
 	/* We don't fill cb now as skb_unshare() may invalidate it */
@@ -443,7 +445,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	}
 	if (count == 0) {
 		qdisc_qstats_drop(sch);
-		kfree_skb(skb);
+		__qdisc_drop(skb, to_free);
 		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	}
 
@@ -463,7 +465,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
 
 		q->duplicate = 0;
-		rootq->enqueue(skb2, rootq);
+		rootq->enqueue(skb2, rootq, to_free);
 		q->duplicate = dupsave;
 	}
 
@@ -475,7 +477,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	 */
 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
 		if (skb_is_gso(skb)) {
-			segs = netem_segment(skb, sch);
+			segs = netem_segment(skb, sch, to_free);
 			if (!segs)
 				return NET_XMIT_DROP;
 		} else {
@@ -488,7 +490,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
 		    (skb->ip_summed == CHECKSUM_PARTIAL &&
 		     skb_checksum_help(skb))) {
-			rc = qdisc_drop(skb, sch);
+			rc = qdisc_drop(skb, sch, to_free);
 			goto finish_segs;
 		}
 
@@ -497,7 +499,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	}
 
 	if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
-		return qdisc_drop(skb, sch);
+		return qdisc_drop(skb, sch, to_free);
 
 	qdisc_qstats_backlog_inc(sch, skb);
 
@@ -557,7 +559,7 @@ finish_segs:
 			segs->next = NULL;
 			qdisc_skb_cb(segs)->pkt_len = segs->len;
 			last_len = segs->len;
-			rc = qdisc_enqueue(segs, sch);
+			rc = qdisc_enqueue(segs, sch, to_free);
 			if (rc != NET_XMIT_SUCCESS) {
 				if (net_xmit_drop_count(rc))
 					qdisc_qstats_drop(sch);
@@ -615,8 +617,11 @@ deliver:
 #endif
 
 			if (q->qdisc) {
-				int err = qdisc_enqueue(skb, q->qdisc);
+				struct sk_buff *to_free = NULL;
+				int err;
 
+				err = qdisc_enqueue(skb, q->qdisc, &to_free);
+				kfree_skb_list(to_free);
 				if (unlikely(err != NET_XMIT_SUCCESS)) {
 					if (net_xmit_drop_count(err)) {
 						qdisc_qstats_drop(sch);
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index 912a46a5d02e..a570b0bb254c 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -134,7 +134,8 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size)
 	return false;
 }
 
-static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			     struct sk_buff **to_free)
 {
 	struct pie_sched_data *q = qdisc_priv(sch);
 	bool enqueue = false;
@@ -166,7 +167,7 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 out:
 	q->stats.dropped++;
-	return qdisc_drop(skb, sch);
+	return qdisc_drop(skb, sch, to_free);
 }
 
 static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
index a12cd37680f8..1c6cbab3e7b9 100644
--- a/net/sched/sch_plug.c
+++ b/net/sched/sch_plug.c
@@ -88,7 +88,8 @@ struct plug_sched_data {
 	u32 pkts_to_release;
 };
 
-static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			struct sk_buff **to_free)
 {
 	struct plug_sched_data *q = qdisc_priv(sch);
 
@@ -98,7 +99,7 @@ static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		return qdisc_enqueue_tail(skb, sch);
 	}
 
-	return qdisc_drop(skb, sch);
+	return qdisc_drop(skb, sch, to_free);
 }
 
 static struct sk_buff *plug_dequeue(struct Qdisc *sch)
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index de492682caee..f4d443aeae54 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -67,7 +67,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 }
 
 static int
-prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
 {
 	struct Qdisc *qdisc;
 	int ret;
@@ -83,7 +83,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	}
 #endif
 
-	ret = qdisc_enqueue(skb, qdisc);
+	ret = qdisc_enqueue(skb, qdisc, to_free);
 	if (ret == NET_XMIT_SUCCESS) {
 		qdisc_qstats_backlog_inc(sch, skb);
 		sch->q.qlen++;
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 0427fa8b23f2..f27ffee106f6 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1217,7 +1217,8 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
 	return agg;
 }
 
-static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+		       struct sk_buff **to_free)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
 	struct qfq_class *cl;
@@ -1240,11 +1241,11 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 				     qdisc_pkt_len(skb));
 		if (err) {
 			cl->qstats.drops++;
-			return qdisc_drop(skb, sch);
+			return qdisc_drop(skb, sch, to_free);
 		}
 	}
 
-	err = qdisc_enqueue(skb, cl->qdisc);
+	err = qdisc_enqueue(skb, cl->qdisc, to_free);
 	if (unlikely(err != NET_XMIT_SUCCESS)) {
 		pr_debug("qfq_enqueue: enqueue failed %d\n", err);
 		if (net_xmit_drop_count(err)) {
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index a0d57530335e..249b2a18acbd 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -56,7 +56,8 @@ static inline int red_use_harddrop(struct red_sched_data *q)
 	return q->flags & TC_RED_HARDDROP;
 }
 
-static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+		       struct sk_buff **to_free)
 {
 	struct red_sched_data *q = qdisc_priv(sch);
 	struct Qdisc *child = q->qdisc;
@@ -95,7 +96,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		break;
 	}
 
-	ret = qdisc_enqueue(skb, child);
+	ret = qdisc_enqueue(skb, child, to_free);
 	if (likely(ret == NET_XMIT_SUCCESS)) {
 		qdisc_qstats_backlog_inc(sch, skb);
 		sch->q.qlen++;
@@ -106,7 +107,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	return ret;
 
 congestion_drop:
-	qdisc_drop(skb, sch);
+	qdisc_drop(skb, sch, to_free);
 	return NET_XMIT_CN;
 }
 
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index c69611640fa5..add3cc7d37ec 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -275,7 +275,8 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl,
 	return false;
 }
 
-static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+		       struct sk_buff **to_free)
 {
 
 	struct sfb_sched_data *q = qdisc_priv(sch);
@@ -397,7 +398,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	}
 
 enqueue:
-	ret = qdisc_enqueue(skb, child);
+	ret = qdisc_enqueue(skb, child, to_free);
 	if (likely(ret == NET_XMIT_SUCCESS)) {
 		sch->q.qlen++;
 		increment_qlen(skb, q);
@@ -408,7 +409,7 @@ enqueue:
 	return ret;
 
 drop:
-	qdisc_drop(skb, sch);
+	qdisc_drop(skb, sch, to_free);
 	return NET_XMIT_CN;
 other_drop:
 	if (ret & __NET_XMIT_BYPASS)
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 57d118b41cad..7f195ed4d568 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -343,7 +343,7 @@ static int sfq_headdrop(const struct sfq_sched_data *q)
 }
 
 static int
-sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
 	unsigned int hash, dropped;
@@ -367,7 +367,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (x == SFQ_EMPTY_SLOT) {
 		x = q->dep[0].next; /* get a free slot */
 		if (x >= SFQ_MAX_FLOWS)
-			return qdisc_drop(skb, sch);
+			return qdisc_drop(skb, sch, to_free);
 		q->ht[hash] = x;
 		slot = &q->slots[x];
 		slot->hash = hash;
@@ -424,14 +424,14 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (slot->qlen >= q->maxdepth) {
 congestion_drop:
 		if (!sfq_headdrop(q))
-			return qdisc_drop(skb, sch);
+			return qdisc_drop(skb, sch, to_free);
 
 		/* We know we have at least one packet in queue */
 		head = slot_dequeue_head(slot);
 		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
 		sch->qstats.backlog -= delta;
 		slot->backlog -= delta;
-		qdisc_drop(head, sch);
+		qdisc_drop(head, sch, to_free);
 
 		slot_queue_add(slot, skb);
 		return NET_XMIT_CN;
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index c12df84d1078..303355c449ab 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -155,7 +155,8 @@ static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
 /* GSO packet is too big, segment it so that tbf can transmit
  * each segment in time
  */
-static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
+static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
+		       struct sk_buff **to_free)
 {
 	struct tbf_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *segs, *nskb;
@@ -166,7 +167,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 
 	if (IS_ERR_OR_NULL(segs))
-		return qdisc_drop(skb, sch);
+		return qdisc_drop(skb, sch, to_free);
 
 	nb = 0;
 	while (segs) {
@@ -174,7 +175,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
 		segs->next = NULL;
 		qdisc_skb_cb(segs)->pkt_len = segs->len;
 		len += segs->len;
-		ret = qdisc_enqueue(segs, q->qdisc);
+		ret = qdisc_enqueue(segs, q->qdisc, to_free);
 		if (ret != NET_XMIT_SUCCESS) {
 			if (net_xmit_drop_count(ret))
 				qdisc_qstats_drop(sch);
@@ -190,17 +191,18 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
 	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
 }
 
-static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+		       struct sk_buff **to_free)
 {
 	struct tbf_sched_data *q = qdisc_priv(sch);
 	int ret;
 
 	if (qdisc_pkt_len(skb) > q->max_size) {
 		if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size)
-			return tbf_segment(skb, sch);
-		return qdisc_drop(skb, sch);
+			return tbf_segment(skb, sch, to_free);
+		return qdisc_drop(skb, sch, to_free);
 	}
-	ret = qdisc_enqueue(skb, q->qdisc);
+	ret = qdisc_enqueue(skb, q->qdisc, to_free);
 	if (ret != NET_XMIT_SUCCESS) {
 		if (net_xmit_drop_count(ret))
 			qdisc_qstats_drop(sch);
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index e02687185a59..2cd9b4478b92 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -77,7 +77,7 @@ struct teql_sched_data {
 /* "teql*" qdisc routines */
 
 static int
-teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+teql_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
 {
 	struct net_device *dev = qdisc_dev(sch);
 	struct teql_sched_data *q = qdisc_priv(sch);
@@ -87,7 +87,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		return NET_XMIT_SUCCESS;
 	}
 
-	return qdisc_drop(skb, sch);
+	return qdisc_drop(skb, sch, to_free);
 }
 
 static struct sk_buff *
-- 
cgit v1.2.3


From 008830bc321c0fc22c0db8d5b0b56f854ed90a5c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 21 Jun 2016 23:16:50 -0700
Subject: net_sched: fq_codel: cache skb->truesize into skb->cb

Now we defer skb drops, it makes sense to keep a copy
of skb->truesize in struct codel_skb_cb to avoid one
cache line miss per dropped skb in fq_codel_drop(),
to reduce latencies a bit further.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/codel_qdisc.h | 1 +
 net/sched/sch_fq_codel.c  | 7 ++++---
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/codel_qdisc.h b/include/net/codel_qdisc.h
index 8144d9cd2908..098630f83a55 100644
--- a/include/net/codel_qdisc.h
+++ b/include/net/codel_qdisc.h
@@ -52,6 +52,7 @@
 /* Qdiscs using codel plugin must use codel_skb_cb in their own cb[] */
 struct codel_skb_cb {
 	codel_time_t enqueue_time;
+	unsigned int mem_usage;
 };
 
 static struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb)
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index f715195459c9..a5ea0e9b6be4 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -172,7 +172,7 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets,
 	do {
 		skb = dequeue_head(flow);
 		len += qdisc_pkt_len(skb);
-		mem += skb->truesize;
+		mem += get_codel_cb(skb)->mem_usage;
 		__qdisc_drop(skb, to_free);
 	} while (++i < max_packets && len < threshold);
 
@@ -216,7 +216,8 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		flow->deficit = q->quantum;
 		flow->dropped = 0;
 	}
-	q->memory_usage += skb->truesize;
+	get_codel_cb(skb)->mem_usage = skb->truesize;
+	q->memory_usage += get_codel_cb(skb)->mem_usage;
 	memory_limited = q->memory_usage > q->memory_limit;
 	if (++sch->q.qlen <= sch->limit && !memory_limited)
 		return NET_XMIT_SUCCESS;
@@ -267,7 +268,7 @@ static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
 	if (flow->head) {
 		skb = dequeue_head(flow);
 		q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb);
-		q->memory_usage -= skb->truesize;
+		q->memory_usage -= get_codel_cb(skb)->mem_usage;
 		sch->q.qlen--;
 		sch->qstats.backlog -= qdisc_pkt_len(skb);
 	}
-- 
cgit v1.2.3


From 4d202a0d31b96ab3324b21e7500d9a2da9ef57dd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 21 Jun 2016 23:16:52 -0700
Subject: net_sched: generalize bulk dequeue

When qdisc bulk dequeue was added in linux-3.18 (commit
5772e9a3463b "qdisc: bulk dequeue support for qdiscs
with TCQ_F_ONETXQUEUE"), it was constrained to some
specific qdiscs.

With some extra care, we can extend this to all qdiscs,
so that typical traffic shaping solutions can benefit from
small batches (8 packets in this patch).

For example, HTB is often used on some multi queue device.
And bonding/team are multi queue devices...

Idea is to bulk-dequeue packets mapping to the same transmit queue.

This brings between 35 and 80 % performance increase in HTB setup
under pressure on a bonding setup :

1) NUMA node contention :   610,000 pps -> 1,110,000 pps
2) No node contention   : 1,380,000 pps -> 1,930,000 pps

Now we should work to add batches on the enqueue() side ;)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: John Fastabend <john.r.fastabend@intel.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Florian Westphal <fw@strlen.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h |  7 ++---
 net/sched/sch_generic.c   | 68 ++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 62 insertions(+), 13 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 04e84c07c94f..909aff2db2b3 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -75,13 +75,14 @@ struct Qdisc {
 	/*
 	 * For performance sake on SMP, we put highly modified fields at the end
 	 */
-	struct Qdisc		*next_sched ____cacheline_aligned_in_smp;
-	struct sk_buff		*gso_skb;
-	unsigned long		state;
+	struct sk_buff		*gso_skb ____cacheline_aligned_in_smp;
 	struct sk_buff_head	q;
 	struct gnet_stats_basic_packed bstats;
 	seqcount_t		running;
 	struct gnet_stats_queue	qstats;
+	unsigned long		state;
+	struct Qdisc            *next_sched;
+	struct sk_buff		*skb_bad_txq;
 	struct rcu_head		rcu_head;
 	int			padded;
 	atomic_t		refcnt;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index ff86606954f2..e95b67cd5718 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -77,6 +77,34 @@ static void try_bulk_dequeue_skb(struct Qdisc *q,
 	skb->next = NULL;
 }
 
+/* This variant of try_bulk_dequeue_skb() makes sure
+ * all skbs in the chain are for the same txq
+ */
+static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
+				      struct sk_buff *skb,
+				      int *packets)
+{
+	int mapping = skb_get_queue_mapping(skb);
+	struct sk_buff *nskb;
+	int cnt = 0;
+
+	do {
+		nskb = q->dequeue(q);
+		if (!nskb)
+			break;
+		if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
+			q->skb_bad_txq = nskb;
+			qdisc_qstats_backlog_inc(q, nskb);
+			q->q.qlen++;
+			break;
+		}
+		skb->next = nskb;
+		skb = nskb;
+	} while (++cnt < 8);
+	(*packets) += cnt;
+	skb->next = NULL;
+}
+
 /* Note that dequeue_skb can possibly return a SKB list (via skb->next).
  * A requeued skb (via q->gso_skb) can also be a SKB list.
  */
@@ -87,8 +115,9 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 	const struct netdev_queue *txq = q->dev_queue;
 
 	*packets = 1;
-	*validate = true;
 	if (unlikely(skb)) {
+		/* skb in gso_skb were already validated */
+		*validate = false;
 		/* check the reason of requeuing without tx lock first */
 		txq = skb_get_tx_queue(txq->dev, skb);
 		if (!netif_xmit_frozen_or_stopped(txq)) {
@@ -97,15 +126,30 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 			q->q.qlen--;
 		} else
 			skb = NULL;
-		/* skb in gso_skb were already validated */
-		*validate = false;
-	} else {
-		if (!(q->flags & TCQ_F_ONETXQUEUE) ||
-		    !netif_xmit_frozen_or_stopped(txq)) {
-			skb = q->dequeue(q);
-			if (skb && qdisc_may_bulk(q))
-				try_bulk_dequeue_skb(q, skb, txq, packets);
+		return skb;
+	}
+	*validate = true;
+	skb = q->skb_bad_txq;
+	if (unlikely(skb)) {
+		/* check the reason of requeuing without tx lock first */
+		txq = skb_get_tx_queue(txq->dev, skb);
+		if (!netif_xmit_frozen_or_stopped(txq)) {
+			q->skb_bad_txq = NULL;
+			qdisc_qstats_backlog_dec(q, skb);
+			q->q.qlen--;
+			goto bulk;
 		}
+		return NULL;
+	}
+	if (!(q->flags & TCQ_F_ONETXQUEUE) ||
+	    !netif_xmit_frozen_or_stopped(txq))
+		skb = q->dequeue(q);
+	if (skb) {
+bulk:
+		if (qdisc_may_bulk(q))
+			try_bulk_dequeue_skb(q, skb, txq, packets);
+		else
+			try_bulk_dequeue_skb_slow(q, skb, packets);
 	}
 	return skb;
 }
@@ -624,11 +668,14 @@ void qdisc_reset(struct Qdisc *qdisc)
 	if (ops->reset)
 		ops->reset(qdisc);
 
+	kfree_skb(qdisc->skb_bad_txq);
+	qdisc->skb_bad_txq = NULL;
+
 	if (qdisc->gso_skb) {
 		kfree_skb_list(qdisc->gso_skb);
 		qdisc->gso_skb = NULL;
-		qdisc->q.qlen = 0;
 	}
+	qdisc->q.qlen = 0;
 }
 EXPORT_SYMBOL(qdisc_reset);
 
@@ -667,6 +714,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
 	dev_put(qdisc_dev(qdisc));
 
 	kfree_skb_list(qdisc->gso_skb);
+	kfree_skb(qdisc->skb_bad_txq);
 	/*
 	 * gen_estimator est_timer() might access qdisc->q.lock,
 	 * wait a RCU grace period before freeing qdisc.
-- 
cgit v1.2.3


From cb72d38211eacda2dd90b09540542b6582da614e Mon Sep 17 00:00:00 2001
From: Huw Davies <huw@codeweavers.com>
Date: Mon, 27 Jun 2016 15:02:46 -0400
Subject: netlabel: Initial support for the CALIPSO netlink protocol.

CALIPSO is a packet labelling protocol for IPv6 which is very similar
to CIPSO.  It is specified in RFC 5570.  Much of the code is based on
the current CIPSO code.

This adds support for adding passthrough-type CALIPSO DOIs through the
NLBL_CALIPSO_C_ADD command.  It requires attributes:

 NLBL_CALIPSO_A_TYPE which must be CALIPSO_MAP_PASS.
 NLBL_CALIPSO_A_DOI.

In passthrough mode the CALIPSO engine will map MLS secattr levels
and categories directly to the packet label.

At this stage, the major difference between this and the CIPSO
code is that IPv6 may be compiled as a module.  To allow for
this the CALIPSO functions are registered at module init time.

Signed-off-by: Huw Davies <huw@codeweavers.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/net/calipso.h           |  79 +++++++++++++++
 include/net/netlabel.h          |  23 +++++
 include/uapi/linux/audit.h      |   2 +
 net/ipv6/Makefile               |   1 +
 net/ipv6/af_inet6.c             |   9 +-
 net/ipv6/calipso.c              | 169 +++++++++++++++++++++++++++++++
 net/netlabel/Makefile           |   2 +-
 net/netlabel/netlabel_calipso.c | 216 ++++++++++++++++++++++++++++++++++++++++
 net/netlabel/netlabel_calipso.h |  90 +++++++++++++++++
 net/netlabel/netlabel_kapi.c    |   1 +
 net/netlabel/netlabel_mgmt.c    |   9 ++
 net/netlabel/netlabel_user.c    |   5 +
 12 files changed, 604 insertions(+), 2 deletions(-)
 create mode 100644 include/net/calipso.h
 create mode 100644 net/ipv6/calipso.c
 create mode 100644 net/netlabel/netlabel_calipso.c
 create mode 100644 net/netlabel/netlabel_calipso.h

(limited to 'include/net')

diff --git a/include/net/calipso.h b/include/net/calipso.h
new file mode 100644
index 000000000000..38dbb4707150
--- /dev/null
+++ b/include/net/calipso.h
@@ -0,0 +1,79 @@
+/*
+ * CALIPSO - Common Architecture Label IPv6 Security Option
+ *
+ * This is an implementation of the CALIPSO protocol as specified in
+ * RFC 5570.
+ *
+ * Authors: Paul Moore <paul@paul-moore.com>
+ *          Huw Davies <huw@codeweavers.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef _CALIPSO_H
+#define _CALIPSO_H
+
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <net/netlabel.h>
+#include <net/request_sock.h>
+#include <linux/atomic.h>
+#include <asm/unaligned.h>
+
+/* known doi values */
+#define CALIPSO_DOI_UNKNOWN          0x00000000
+
+/* doi mapping types */
+#define CALIPSO_MAP_UNKNOWN          0
+#define CALIPSO_MAP_PASS             2
+
+/*
+ * CALIPSO DOI definitions
+ */
+
+/* DOI definition struct */
+struct calipso_doi {
+	u32 doi;
+	u32 type;
+
+	atomic_t refcount;
+	struct list_head list;
+	struct rcu_head rcu;
+};
+
+#ifdef CONFIG_NETLABEL
+int __init calipso_init(void);
+void calipso_exit(void);
+#else
+static inline int __init calipso_init(void)
+{
+	return 0;
+}
+
+static inline void calipso_exit(void)
+{
+}
+#endif /* CONFIG_NETLABEL */
+
+#endif /* _CALIPSO_H */
diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index 7b5a300de7f5..6af1bb6df4ab 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -40,6 +40,7 @@
 #include <linux/atomic.h>
 
 struct cipso_v4_doi;
+struct calipso_doi;
 
 /*
  * NetLabel - A management interface for maintaining network packet label
@@ -94,6 +95,8 @@ struct cipso_v4_doi;
 #define NETLBL_NLTYPE_UNLABELED_NAME    "NLBL_UNLBL"
 #define NETLBL_NLTYPE_ADDRSELECT        6
 #define NETLBL_NLTYPE_ADDRSELECT_NAME   "NLBL_ADRSEL"
+#define NETLBL_NLTYPE_CALIPSO           7
+#define NETLBL_NLTYPE_CALIPSO_NAME      "NLBL_CALIPSO"
 
 /*
  * NetLabel - Kernel API for accessing the network packet label mappings.
@@ -216,6 +219,23 @@ struct netlbl_lsm_secattr {
 	} attr;
 };
 
+/**
+ * struct netlbl_calipso_ops - NetLabel CALIPSO operations
+ * @doi_add: add a CALIPSO DOI
+ * @doi_free: free a CALIPSO DOI
+ *
+ * Description:
+ * This structure is filled out by the CALIPSO engine and passed
+ * to the NetLabel core via a call to netlbl_calipso_ops_register().
+ * It enables the CALIPSO engine (and hence IPv6) to be compiled
+ * as a module.
+ */
+struct netlbl_calipso_ops {
+	int (*doi_add)(struct calipso_doi *doi_def,
+		       struct netlbl_audit *audit_info);
+	void (*doi_free)(struct calipso_doi *doi_def);
+};
+
 /*
  * LSM security attribute operations (inline)
  */
@@ -598,4 +618,7 @@ static inline struct audit_buffer *netlbl_audit_start(int type,
 }
 #endif /* CONFIG_NETLABEL */
 
+const struct netlbl_calipso_ops *
+netlbl_calipso_ops_register(const struct netlbl_calipso_ops *ops);
+
 #endif /* _NETLABEL_H */
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index d820aa979620..82e8aa59446b 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -130,6 +130,8 @@
 #define AUDIT_MAC_IPSEC_EVENT	1415	/* Audit an IPSec event */
 #define AUDIT_MAC_UNLBL_STCADD	1416	/* NetLabel: add a static label */
 #define AUDIT_MAC_UNLBL_STCDEL	1417	/* NetLabel: del a static label */
+#define AUDIT_MAC_CALIPSO_ADD	1418	/* NetLabel: add CALIPSO DOI entry */
+#define AUDIT_MAC_CALIPSO_DEL	1419	/* NetLabel: del CALIPSO DOI entry */
 
 #define AUDIT_FIRST_KERN_ANOM_MSG   1700
 #define AUDIT_LAST_KERN_ANOM_MSG    1799
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 2fbd90bf8d33..88ad4477705c 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -21,6 +21,7 @@ ipv6-$(CONFIG_NETFILTER) += netfilter.o
 ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o
 ipv6-$(CONFIG_PROC_FS) += proc.o
 ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
+ipv6-$(CONFIG_NETLABEL) += calipso.o
 
 ipv6-objs += $(ipv6-y)
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index b11c37cfd67c..c241c1805728 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -60,6 +60,7 @@
 #ifdef CONFIG_IPV6_TUNNEL
 #include <net/ip6_tunnel.h>
 #endif
+#include <net/calipso.h>
 
 #include <asm/uaccess.h>
 #include <linux/mroute6.h>
@@ -970,6 +971,10 @@ static int __init inet6_init(void)
 	if (err)
 		goto pingv6_fail;
 
+	err = calipso_init();
+	if (err)
+		goto calipso_fail;
+
 #ifdef CONFIG_SYSCTL
 	err = ipv6_sysctl_register();
 	if (err)
@@ -980,8 +985,10 @@ out:
 
 #ifdef CONFIG_SYSCTL
 sysctl_fail:
-	pingv6_exit();
+	calipso_exit();
 #endif
+calipso_fail:
+	pingv6_exit();
 pingv6_fail:
 	ipv6_packet_cleanup();
 ipv6_packet_fail:
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
new file mode 100644
index 000000000000..aa44310120be
--- /dev/null
+++ b/net/ipv6/calipso.c
@@ -0,0 +1,169 @@
+/*
+ * CALIPSO - Common Architecture Label IPv6 Security Option
+ *
+ * This is an implementation of the CALIPSO protocol as specified in
+ * RFC 5570.
+ *
+ * Authors: Paul Moore <paul.moore@hp.com>
+ *          Huw Davies <huw@codeweavers.com>
+ *
+ */
+
+/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
+ * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/jhash.h>
+#include <linux/audit.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/netlabel.h>
+#include <net/calipso.h>
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <asm/unaligned.h>
+
+/* List of available DOI definitions */
+static DEFINE_SPINLOCK(calipso_doi_list_lock);
+static LIST_HEAD(calipso_doi_list);
+
+/* DOI List Functions
+ */
+
+/**
+ * calipso_doi_search - Searches for a DOI definition
+ * @doi: the DOI to search for
+ *
+ * Description:
+ * Search the DOI definition list for a DOI definition with a DOI value that
+ * matches @doi.  The caller is responsible for calling rcu_read_[un]lock().
+ * Returns a pointer to the DOI definition on success and NULL on failure.
+ */
+static struct calipso_doi *calipso_doi_search(u32 doi)
+{
+	struct calipso_doi *iter;
+
+	list_for_each_entry_rcu(iter, &calipso_doi_list, list)
+		if (iter->doi == doi && atomic_read(&iter->refcount))
+			return iter;
+	return NULL;
+}
+
+/**
+ * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine
+ * @doi_def: the DOI structure
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * The caller defines a new DOI for use by the CALIPSO engine and calls this
+ * function to add it to the list of acceptable domains.  The caller must
+ * ensure that the mapping table specified in @doi_def->map meets all of the
+ * requirements of the mapping type (see calipso.h for details).  Returns
+ * zero on success and non-zero on failure.
+ *
+ */
+static int calipso_doi_add(struct calipso_doi *doi_def,
+			   struct netlbl_audit *audit_info)
+{
+	int ret_val = -EINVAL;
+	u32 doi;
+	u32 doi_type;
+	struct audit_buffer *audit_buf;
+
+	doi = doi_def->doi;
+	doi_type = doi_def->type;
+
+	if (doi_def->doi == CALIPSO_DOI_UNKNOWN)
+		goto doi_add_return;
+
+	atomic_set(&doi_def->refcount, 1);
+
+	spin_lock(&calipso_doi_list_lock);
+	if (calipso_doi_search(doi_def->doi)) {
+		spin_unlock(&calipso_doi_list_lock);
+		ret_val = -EEXIST;
+		goto doi_add_return;
+	}
+	list_add_tail_rcu(&doi_def->list, &calipso_doi_list);
+	spin_unlock(&calipso_doi_list_lock);
+	ret_val = 0;
+
+doi_add_return:
+	audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_ADD, audit_info);
+	if (audit_buf) {
+		const char *type_str;
+
+		switch (doi_type) {
+		case CALIPSO_MAP_PASS:
+			type_str = "pass";
+			break;
+		default:
+			type_str = "(unknown)";
+		}
+		audit_log_format(audit_buf,
+				 " calipso_doi=%u calipso_type=%s res=%u",
+				 doi, type_str, ret_val == 0 ? 1 : 0);
+		audit_log_end(audit_buf);
+	}
+
+	return ret_val;
+}
+
+/**
+ * calipso_doi_free - Frees a DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * This function frees all of the memory associated with a DOI definition.
+ *
+ */
+static void calipso_doi_free(struct calipso_doi *doi_def)
+{
+	kfree(doi_def);
+}
+
+static const struct netlbl_calipso_ops ops = {
+	.doi_add          = calipso_doi_add,
+	.doi_free         = calipso_doi_free,
+};
+
+/**
+ * calipso_init - Initialize the CALIPSO module
+ *
+ * Description:
+ * Initialize the CALIPSO module and prepare it for use.  Returns zero on
+ * success and negative values on failure.
+ *
+ */
+int __init calipso_init(void)
+{
+	netlbl_calipso_ops_register(&ops);
+	return 0;
+}
+
+void calipso_exit(void)
+{
+	netlbl_calipso_ops_register(NULL);
+}
diff --git a/net/netlabel/Makefile b/net/netlabel/Makefile
index d2732fc952e2..d341ede0dca5 100644
--- a/net/netlabel/Makefile
+++ b/net/netlabel/Makefile
@@ -12,4 +12,4 @@ obj-y	+= netlabel_mgmt.o
 # protocol modules
 obj-y	+= netlabel_unlabeled.o
 obj-y	+= netlabel_cipso_v4.o
-
+obj-$(subst m,y,$(CONFIG_IPV6)) += netlabel_calipso.o
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
new file mode 100644
index 000000000000..8a113f91b7a1
--- /dev/null
+++ b/net/netlabel/netlabel_calipso.c
@@ -0,0 +1,216 @@
+/*
+ * NetLabel CALIPSO/IPv6 Support
+ *
+ * This file defines the CALIPSO/IPv6 functions for the NetLabel system.  The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and CALIPSO.
+ *
+ * Authors: Paul Moore <paul@paul-moore.com>
+ *          Huw Davies <huw@codeweavers.com>
+ *
+ */
+
+/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/audit.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/netlabel.h>
+#include <net/calipso.h>
+#include <linux/atomic.h>
+
+#include "netlabel_user.h"
+#include "netlabel_calipso.h"
+#include "netlabel_mgmt.h"
+#include "netlabel_domainhash.h"
+
+/* NetLabel Generic NETLINK CALIPSO family */
+static struct genl_family netlbl_calipso_gnl_family = {
+	.id = GENL_ID_GENERATE,
+	.hdrsize = 0,
+	.name = NETLBL_NLTYPE_CALIPSO_NAME,
+	.version = NETLBL_PROTO_VERSION,
+	.maxattr = NLBL_CALIPSO_A_MAX,
+};
+
+/* NetLabel Netlink attribute policy */
+static const struct nla_policy calipso_genl_policy[NLBL_CALIPSO_A_MAX + 1] = {
+	[NLBL_CALIPSO_A_DOI] = { .type = NLA_U32 },
+	[NLBL_CALIPSO_A_MTYPE] = { .type = NLA_U32 },
+};
+
+/* NetLabel Command Handlers
+ */
+/**
+ * netlbl_calipso_add_pass - Adds a CALIPSO pass DOI definition
+ * @info: the Generic NETLINK info block
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Create a new CALIPSO_MAP_PASS DOI definition based on the given ADD message
+ * and add it to the CALIPSO engine.  Return zero on success and non-zero on
+ * error.
+ *
+ */
+static int netlbl_calipso_add_pass(struct genl_info *info,
+				   struct netlbl_audit *audit_info)
+{
+	int ret_val;
+	struct calipso_doi *doi_def = NULL;
+
+	doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL);
+	if (!doi_def)
+		return -ENOMEM;
+	doi_def->type = CALIPSO_MAP_PASS;
+	doi_def->doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]);
+	ret_val = calipso_doi_add(doi_def, audit_info);
+	if (ret_val != 0)
+		calipso_doi_free(doi_def);
+
+	return ret_val;
+}
+
+/**
+ * netlbl_calipso_add - Handle an ADD message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Create a new DOI definition based on the given ADD message and add it to the
+ * CALIPSO engine.  Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_calipso_add(struct sk_buff *skb, struct genl_info *info)
+
+{
+	int ret_val = -EINVAL;
+	struct netlbl_audit audit_info;
+
+	if (!info->attrs[NLBL_CALIPSO_A_DOI] ||
+	    !info->attrs[NLBL_CALIPSO_A_MTYPE])
+		return -EINVAL;
+
+	netlbl_netlink_auditinfo(skb, &audit_info);
+	switch (nla_get_u32(info->attrs[NLBL_CALIPSO_A_MTYPE])) {
+	case CALIPSO_MAP_PASS:
+		ret_val = netlbl_calipso_add_pass(info, &audit_info);
+		break;
+	}
+	if (ret_val == 0)
+		atomic_inc(&netlabel_mgmt_protocount);
+
+	return ret_val;
+}
+
+/* NetLabel Generic NETLINK Command Definitions
+ */
+
+static const struct genl_ops netlbl_calipso_ops[] = {
+	{
+	.cmd = NLBL_CALIPSO_C_ADD,
+	.flags = GENL_ADMIN_PERM,
+	.policy = calipso_genl_policy,
+	.doit = netlbl_calipso_add,
+	.dumpit = NULL,
+	},
+};
+
+/* NetLabel Generic NETLINK Protocol Functions
+ */
+
+/**
+ * netlbl_calipso_genl_init - Register the CALIPSO NetLabel component
+ *
+ * Description:
+ * Register the CALIPSO packet NetLabel component with the Generic NETLINK
+ * mechanism.  Returns zero on success, negative values on failure.
+ *
+ */
+int __init netlbl_calipso_genl_init(void)
+{
+	return genl_register_family_with_ops(&netlbl_calipso_gnl_family,
+					     netlbl_calipso_ops);
+}
+
+static const struct netlbl_calipso_ops *calipso_ops;
+
+/**
+ * netlbl_calipso_ops_register - Register the CALIPSO operations
+ *
+ * Description:
+ * Register the CALIPSO packet engine operations.
+ *
+ */
+const struct netlbl_calipso_ops *
+netlbl_calipso_ops_register(const struct netlbl_calipso_ops *ops)
+{
+	return xchg(&calipso_ops, ops);
+}
+EXPORT_SYMBOL(netlbl_calipso_ops_register);
+
+static const struct netlbl_calipso_ops *netlbl_calipso_ops_get(void)
+{
+	return ACCESS_ONCE(calipso_ops);
+}
+
+/**
+ * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine
+ * @doi_def: the DOI structure
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * The caller defines a new DOI for use by the CALIPSO engine and calls this
+ * function to add it to the list of acceptable domains.  The caller must
+ * ensure that the mapping table specified in @doi_def->map meets all of the
+ * requirements of the mapping type (see calipso.h for details).  Returns
+ * zero on success and non-zero on failure.
+ *
+ */
+int calipso_doi_add(struct calipso_doi *doi_def,
+		    struct netlbl_audit *audit_info)
+{
+	int ret_val = -ENOMSG;
+	const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+	if (ops)
+		ret_val = ops->doi_add(doi_def, audit_info);
+	return ret_val;
+}
+
+/**
+ * calipso_doi_free - Frees a DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * This function frees all of the memory associated with a DOI definition.
+ *
+ */
+void calipso_doi_free(struct calipso_doi *doi_def)
+{
+	const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+	if (ops)
+		ops->doi_free(doi_def);
+}
diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h
new file mode 100644
index 000000000000..f78790a6ce4f
--- /dev/null
+++ b/net/netlabel/netlabel_calipso.h
@@ -0,0 +1,90 @@
+/*
+ * NetLabel CALIPSO Support
+ *
+ * This file defines the CALIPSO functions for the NetLabel system.  The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and RIPSO.
+ *
+ * Authors: Paul Moore <paul@paul-moore.com>
+ *          Huw Davies <huw@codeweavers.com>
+ *
+ */
+
+/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef _NETLABEL_CALIPSO
+#define _NETLABEL_CALIPSO
+
+#include <net/netlabel.h>
+#include <net/calipso.h>
+
+/* The following NetLabel payloads are supported by the CALIPSO subsystem.
+ *
+ * o ADD:
+ *   Sent by an application to add a new DOI mapping table.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_CALIPSO_A_DOI
+ *     NLBL_CALIPSO_A_MTYPE
+ *
+ *   If using CALIPSO_MAP_PASS no additional attributes are required.
+ *
+ */
+
+/* NetLabel CALIPSO commands */
+enum {
+	NLBL_CALIPSO_C_UNSPEC,
+	NLBL_CALIPSO_C_ADD,
+	NLBL_CALIPSO_C_REMOVE,
+	NLBL_CALIPSO_C_LIST,
+	NLBL_CALIPSO_C_LISTALL,
+	__NLBL_CALIPSO_C_MAX,
+};
+
+/* NetLabel CALIPSO attributes */
+enum {
+	NLBL_CALIPSO_A_UNSPEC,
+	NLBL_CALIPSO_A_DOI,
+	/* (NLA_U32)
+	 * the DOI value */
+	NLBL_CALIPSO_A_MTYPE,
+	/* (NLA_U32)
+	 * the mapping table type (defined in the calipso.h header as
+	 * CALIPSO_MAP_*) */
+	__NLBL_CALIPSO_A_MAX,
+};
+
+#define NLBL_CALIPSO_A_MAX (__NLBL_CALIPSO_A_MAX - 1)
+
+/* NetLabel protocol functions */
+#if IS_ENABLED(CONFIG_IPV6)
+int netlbl_calipso_genl_init(void);
+#else
+static inline int netlbl_calipso_genl_init(void)
+{
+	return 0;
+}
+#endif
+
+int calipso_doi_add(struct calipso_doi *doi_def,
+		    struct netlbl_audit *audit_info);
+void calipso_doi_free(struct calipso_doi *doi_def);
+
+#endif
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 7e2a68f9d165..8d2e483167a8 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -1170,6 +1170,7 @@ struct audit_buffer *netlbl_audit_start(int type,
 {
 	return netlbl_audit_start_common(type, audit_info);
 }
+EXPORT_SYMBOL(netlbl_audit_start);
 
 /*
  * Setup Functions
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index b2aeb5d44601..975b1e93271e 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -674,6 +674,15 @@ static int netlbl_mgmt_protocols(struct sk_buff *skb,
 			goto protocols_return;
 		protos_sent++;
 	}
+#if IS_ENABLED(CONFIG_IPV6)
+	if (protos_sent == 2) {
+		if (netlbl_mgmt_protocols_cb(skb,
+					     cb,
+					     NETLBL_NLTYPE_CALIPSO) < 0)
+			goto protocols_return;
+		protos_sent++;
+	}
+#endif
 
 protocols_return:
 	cb->args[0] = protos_sent;
diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c
index adf8b7900da2..58495f44c62a 100644
--- a/net/netlabel/netlabel_user.c
+++ b/net/netlabel/netlabel_user.c
@@ -44,6 +44,7 @@
 #include "netlabel_mgmt.h"
 #include "netlabel_unlabeled.h"
 #include "netlabel_cipso_v4.h"
+#include "netlabel_calipso.h"
 #include "netlabel_user.h"
 
 /*
@@ -71,6 +72,10 @@ int __init netlbl_netlink_init(void)
 	if (ret_val != 0)
 		return ret_val;
 
+	ret_val = netlbl_calipso_genl_init();
+	if (ret_val != 0)
+		return ret_val;
+
 	return netlbl_unlabel_genl_init();
 }
 
-- 
cgit v1.2.3


From a5e34490c3160e09814403d040765b0ae0003121 Mon Sep 17 00:00:00 2001
From: Huw Davies <huw@codeweavers.com>
Date: Mon, 27 Jun 2016 15:02:47 -0400
Subject: netlabel: Add support for querying a CALIPSO DOI.

Query a specified DOI through the NLBL_CALIPSO_C_LIST command.
It requires the attribute:
 NLBL_CALIPSO_A_DOI.

The reply will contain:
 NLBL_CALIPSO_A_MTYPE

Signed-off-by: Huw Davies <huw@codeweavers.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/net/netlabel.h          |   4 ++
 net/ipv6/calipso.c              |  68 +++++++++++++++++++++++++++
 net/netlabel/netlabel_calipso.c | 102 ++++++++++++++++++++++++++++++++++++++++
 net/netlabel/netlabel_calipso.h |  19 ++++++++
 4 files changed, 193 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index 6af1bb6df4ab..0f05b83851e2 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -223,6 +223,8 @@ struct netlbl_lsm_secattr {
  * struct netlbl_calipso_ops - NetLabel CALIPSO operations
  * @doi_add: add a CALIPSO DOI
  * @doi_free: free a CALIPSO DOI
+ * @doi_getdef: returns a reference to a DOI
+ * @doi_putdef: releases a reference of a DOI
  *
  * Description:
  * This structure is filled out by the CALIPSO engine and passed
@@ -234,6 +236,8 @@ struct netlbl_calipso_ops {
 	int (*doi_add)(struct calipso_doi *doi_def,
 		       struct netlbl_audit *audit_info);
 	void (*doi_free)(struct calipso_doi *doi_def);
+	struct calipso_doi *(*doi_getdef)(u32 doi);
+	void (*doi_putdef)(struct calipso_doi *doi_def);
 };
 
 /*
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index aa44310120be..128cc6945e34 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -144,9 +144,77 @@ static void calipso_doi_free(struct calipso_doi *doi_def)
 	kfree(doi_def);
 }
 
+/**
+ * calipso_doi_free_rcu - Frees a DOI definition via the RCU pointer
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that the memory allocated to the DOI definition can be released
+ * safely.
+ *
+ */
+static void calipso_doi_free_rcu(struct rcu_head *entry)
+{
+	struct calipso_doi *doi_def;
+
+	doi_def = container_of(entry, struct calipso_doi, rcu);
+	calipso_doi_free(doi_def);
+}
+
+/**
+ * calipso_doi_getdef - Returns a reference to a valid DOI definition
+ * @doi: the DOI value
+ *
+ * Description:
+ * Searches for a valid DOI definition and if one is found it is returned to
+ * the caller.  Otherwise NULL is returned.  The caller must ensure that
+ * calipso_doi_putdef() is called when the caller is done.
+ *
+ */
+static struct calipso_doi *calipso_doi_getdef(u32 doi)
+{
+	struct calipso_doi *doi_def;
+
+	rcu_read_lock();
+	doi_def = calipso_doi_search(doi);
+	if (!doi_def)
+		goto doi_getdef_return;
+	if (!atomic_inc_not_zero(&doi_def->refcount))
+		doi_def = NULL;
+
+doi_getdef_return:
+	rcu_read_unlock();
+	return doi_def;
+}
+
+/**
+ * calipso_doi_putdef - Releases a reference for the given DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * Releases a DOI definition reference obtained from calipso_doi_getdef().
+ *
+ */
+static void calipso_doi_putdef(struct calipso_doi *doi_def)
+{
+	if (!doi_def)
+		return;
+
+	if (!atomic_dec_and_test(&doi_def->refcount))
+		return;
+	spin_lock(&calipso_doi_list_lock);
+	list_del_rcu(&doi_def->list);
+	spin_unlock(&calipso_doi_list_lock);
+
+	call_rcu(&doi_def->rcu, calipso_doi_free_rcu);
+}
+
 static const struct netlbl_calipso_ops ops = {
 	.doi_add          = calipso_doi_add,
 	.doi_free         = calipso_doi_free,
+	.doi_getdef       = calipso_doi_getdef,
+	.doi_putdef       = calipso_doi_putdef,
 };
 
 /**
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
index 8a113f91b7a1..6161170dc5f4 100644
--- a/net/netlabel/netlabel_calipso.c
+++ b/net/netlabel/netlabel_calipso.c
@@ -124,6 +124,65 @@ static int netlbl_calipso_add(struct sk_buff *skb, struct genl_info *info)
 	return ret_val;
 }
 
+/**
+ * netlbl_calipso_list - Handle a LIST message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated LIST message and respond accordingly.
+ * Returns zero on success and negative values on error.
+ *
+ */
+static int netlbl_calipso_list(struct sk_buff *skb, struct genl_info *info)
+{
+	int ret_val;
+	struct sk_buff *ans_skb = NULL;
+	void *data;
+	u32 doi;
+	struct calipso_doi *doi_def;
+
+	if (!info->attrs[NLBL_CALIPSO_A_DOI]) {
+		ret_val = -EINVAL;
+		goto list_failure;
+	}
+
+	doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]);
+
+	doi_def = calipso_doi_getdef(doi);
+	if (!doi_def) {
+		ret_val = -EINVAL;
+		goto list_failure;
+	}
+
+	ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!ans_skb) {
+		ret_val = -ENOMEM;
+		goto list_failure_put;
+	}
+	data = genlmsg_put_reply(ans_skb, info, &netlbl_calipso_gnl_family,
+				 0, NLBL_CALIPSO_C_LIST);
+	if (!data) {
+		ret_val = -ENOMEM;
+		goto list_failure_put;
+	}
+
+	ret_val = nla_put_u32(ans_skb, NLBL_CALIPSO_A_MTYPE, doi_def->type);
+	if (ret_val != 0)
+		goto list_failure_put;
+
+	calipso_doi_putdef(doi_def);
+
+	genlmsg_end(ans_skb, data);
+	return genlmsg_reply(ans_skb, info);
+
+list_failure_put:
+	calipso_doi_putdef(doi_def);
+list_failure:
+	kfree_skb(ans_skb);
+	return ret_val;
+}
+
 /* NetLabel Generic NETLINK Command Definitions
  */
 
@@ -135,6 +194,13 @@ static const struct genl_ops netlbl_calipso_ops[] = {
 	.doit = netlbl_calipso_add,
 	.dumpit = NULL,
 	},
+	{
+	.cmd = NLBL_CALIPSO_C_LIST,
+	.flags = 0,
+	.policy = calipso_genl_policy,
+	.doit = netlbl_calipso_list,
+	.dumpit = NULL,
+	},
 };
 
 /* NetLabel Generic NETLINK Protocol Functions
@@ -214,3 +280,39 @@ void calipso_doi_free(struct calipso_doi *doi_def)
 	if (ops)
 		ops->doi_free(doi_def);
 }
+
+/**
+ * calipso_doi_getdef - Returns a reference to a valid DOI definition
+ * @doi: the DOI value
+ *
+ * Description:
+ * Searches for a valid DOI definition and if one is found it is returned to
+ * the caller.  Otherwise NULL is returned.  The caller must ensure that
+ * calipso_doi_putdef() is called when the caller is done.
+ *
+ */
+struct calipso_doi *calipso_doi_getdef(u32 doi)
+{
+	struct calipso_doi *ret_val = NULL;
+	const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+	if (ops)
+		ret_val = ops->doi_getdef(doi);
+	return ret_val;
+}
+
+/**
+ * calipso_doi_putdef - Releases a reference for the given DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * Releases a DOI definition reference obtained from calipso_doi_getdef().
+ *
+ */
+void calipso_doi_putdef(struct calipso_doi *doi_def)
+{
+	const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+	if (ops)
+		ops->doi_putdef(doi_def);
+}
diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h
index f78790a6ce4f..6da737a9f37a 100644
--- a/net/netlabel/netlabel_calipso.h
+++ b/net/netlabel/netlabel_calipso.h
@@ -46,6 +46,23 @@
  *
  *   If using CALIPSO_MAP_PASS no additional attributes are required.
  *
+ * o LIST:
+ *   Sent by an application to list the details of a DOI definition.  On
+ *   success the kernel should send a response using the following format.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_CALIPSO_A_DOI
+ *
+ *   The valid response message format depends on the type of the DOI mapping,
+ *   the defined formats are shown below.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_CALIPSO_A_MTYPE
+ *
+ *   If using CALIPSO_MAP_PASS no additional attributes are required.
+ *
  */
 
 /* NetLabel CALIPSO commands */
@@ -86,5 +103,7 @@ static inline int netlbl_calipso_genl_init(void)
 int calipso_doi_add(struct calipso_doi *doi_def,
 		    struct netlbl_audit *audit_info);
 void calipso_doi_free(struct calipso_doi *doi_def);
+struct calipso_doi *calipso_doi_getdef(u32 doi);
+void calipso_doi_putdef(struct calipso_doi *doi_def);
 
 #endif
-- 
cgit v1.2.3


From e1ce69df7e6e8cbdca78ae831ecf435b12b4c168 Mon Sep 17 00:00:00 2001
From: Huw Davies <huw@codeweavers.com>
Date: Mon, 27 Jun 2016 15:02:48 -0400
Subject: netlabel: Add support for enumerating the CALIPSO DOI list.

Enumerate the DOI list through the NLBL_CALIPSO_C_LISTALL command.
It takes no attributes.

Signed-off-by: Huw Davies <huw@codeweavers.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/net/netlabel.h          |   4 ++
 net/ipv6/calipso.c              |  41 ++++++++++++++++
 net/netlabel/netlabel_calipso.c | 106 ++++++++++++++++++++++++++++++++++++++++
 net/netlabel/netlabel_calipso.h |  14 ++++++
 4 files changed, 165 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index 0f05b83851e2..2653d3a8cde8 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -225,6 +225,7 @@ struct netlbl_lsm_secattr {
  * @doi_free: free a CALIPSO DOI
  * @doi_getdef: returns a reference to a DOI
  * @doi_putdef: releases a reference of a DOI
+ * @doi_walk: enumerate the DOI list
  *
  * Description:
  * This structure is filled out by the CALIPSO engine and passed
@@ -238,6 +239,9 @@ struct netlbl_calipso_ops {
 	void (*doi_free)(struct calipso_doi *doi_def);
 	struct calipso_doi *(*doi_getdef)(u32 doi);
 	void (*doi_putdef)(struct calipso_doi *doi_def);
+	int (*doi_walk)(u32 *skip_cnt,
+			int (*callback)(struct calipso_doi *doi_def, void *arg),
+			void *cb_arg);
 };
 
 /*
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index 128cc6945e34..fa17c7a7f4be 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -210,11 +210,52 @@ static void calipso_doi_putdef(struct calipso_doi *doi_def)
 	call_rcu(&doi_def->rcu, calipso_doi_free_rcu);
 }
 
+/**
+ * calipso_doi_walk - Iterate through the DOI definitions
+ * @skip_cnt: skip past this number of DOI definitions, updated
+ * @callback: callback for each DOI definition
+ * @cb_arg: argument for the callback function
+ *
+ * Description:
+ * Iterate over the DOI definition list, skipping the first @skip_cnt entries.
+ * For each entry call @callback, if @callback returns a negative value stop
+ * 'walking' through the list and return.  Updates the value in @skip_cnt upon
+ * return.  Returns zero on success, negative values on failure.
+ *
+ */
+static int calipso_doi_walk(u32 *skip_cnt,
+			    int (*callback)(struct calipso_doi *doi_def,
+					    void *arg),
+			    void *cb_arg)
+{
+	int ret_val = -ENOENT;
+	u32 doi_cnt = 0;
+	struct calipso_doi *iter_doi;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(iter_doi, &calipso_doi_list, list)
+		if (atomic_read(&iter_doi->refcount) > 0) {
+			if (doi_cnt++ < *skip_cnt)
+				continue;
+			ret_val = callback(iter_doi, cb_arg);
+			if (ret_val < 0) {
+				doi_cnt--;
+				goto doi_walk_return;
+			}
+		}
+
+doi_walk_return:
+	rcu_read_unlock();
+	*skip_cnt = doi_cnt;
+	return ret_val;
+}
+
 static const struct netlbl_calipso_ops ops = {
 	.doi_add          = calipso_doi_add,
 	.doi_free         = calipso_doi_free,
 	.doi_getdef       = calipso_doi_getdef,
 	.doi_putdef       = calipso_doi_putdef,
+	.doi_walk         = calipso_doi_walk,
 };
 
 /**
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
index 6161170dc5f4..214114575812 100644
--- a/net/netlabel/netlabel_calipso.c
+++ b/net/netlabel/netlabel_calipso.c
@@ -46,6 +46,13 @@
 #include "netlabel_mgmt.h"
 #include "netlabel_domainhash.h"
 
+/* Argument struct for calipso_doi_walk() */
+struct netlbl_calipso_doiwalk_arg {
+	struct netlink_callback *nl_cb;
+	struct sk_buff *skb;
+	u32 seq;
+};
+
 /* NetLabel Generic NETLINK CALIPSO family */
 static struct genl_family netlbl_calipso_gnl_family = {
 	.id = GENL_ID_GENERATE,
@@ -183,6 +190,73 @@ list_failure:
 	return ret_val;
 }
 
+/**
+ * netlbl_calipso_listall_cb - calipso_doi_walk() callback for LISTALL
+ * @doi_def: the CALIPSO DOI definition
+ * @arg: the netlbl_calipso_doiwalk_arg structure
+ *
+ * Description:
+ * This function is designed to be used as a callback to the
+ * calipso_doi_walk() function for use in generating a response for a LISTALL
+ * message.  Returns the size of the message on success, negative values on
+ * failure.
+ *
+ */
+static int netlbl_calipso_listall_cb(struct calipso_doi *doi_def, void *arg)
+{
+	int ret_val = -ENOMEM;
+	struct netlbl_calipso_doiwalk_arg *cb_arg = arg;
+	void *data;
+
+	data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid,
+			   cb_arg->seq, &netlbl_calipso_gnl_family,
+			   NLM_F_MULTI, NLBL_CALIPSO_C_LISTALL);
+	if (!data)
+		goto listall_cb_failure;
+
+	ret_val = nla_put_u32(cb_arg->skb, NLBL_CALIPSO_A_DOI, doi_def->doi);
+	if (ret_val != 0)
+		goto listall_cb_failure;
+	ret_val = nla_put_u32(cb_arg->skb,
+			      NLBL_CALIPSO_A_MTYPE,
+			      doi_def->type);
+	if (ret_val != 0)
+		goto listall_cb_failure;
+
+	genlmsg_end(cb_arg->skb, data);
+	return 0;
+
+listall_cb_failure:
+	genlmsg_cancel(cb_arg->skb, data);
+	return ret_val;
+}
+
+/**
+ * netlbl_calipso_listall - Handle a LISTALL message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Process a user generated LISTALL message and respond accordingly.  Returns
+ * zero on success and negative values on error.
+ *
+ */
+static int netlbl_calipso_listall(struct sk_buff *skb,
+				  struct netlink_callback *cb)
+{
+	struct netlbl_calipso_doiwalk_arg cb_arg;
+	u32 doi_skip = cb->args[0];
+
+	cb_arg.nl_cb = cb;
+	cb_arg.skb = skb;
+	cb_arg.seq = cb->nlh->nlmsg_seq;
+
+	calipso_doi_walk(&doi_skip, netlbl_calipso_listall_cb, &cb_arg);
+
+	cb->args[0] = doi_skip;
+	return skb->len;
+}
+
 /* NetLabel Generic NETLINK Command Definitions
  */
 
@@ -201,6 +275,13 @@ static const struct genl_ops netlbl_calipso_ops[] = {
 	.doit = netlbl_calipso_list,
 	.dumpit = NULL,
 	},
+	{
+	.cmd = NLBL_CALIPSO_C_LISTALL,
+	.flags = 0,
+	.policy = calipso_genl_policy,
+	.doit = NULL,
+	.dumpit = netlbl_calipso_listall,
+	},
 };
 
 /* NetLabel Generic NETLINK Protocol Functions
@@ -316,3 +397,28 @@ void calipso_doi_putdef(struct calipso_doi *doi_def)
 	if (ops)
 		ops->doi_putdef(doi_def);
 }
+
+/**
+ * calipso_doi_walk - Iterate through the DOI definitions
+ * @skip_cnt: skip past this number of DOI definitions, updated
+ * @callback: callback for each DOI definition
+ * @cb_arg: argument for the callback function
+ *
+ * Description:
+ * Iterate over the DOI definition list, skipping the first @skip_cnt entries.
+ * For each entry call @callback, if @callback returns a negative value stop
+ * 'walking' through the list and return.  Updates the value in @skip_cnt upon
+ * return.  Returns zero on success, negative values on failure.
+ *
+ */
+int calipso_doi_walk(u32 *skip_cnt,
+		     int (*callback)(struct calipso_doi *doi_def, void *arg),
+		     void *cb_arg)
+{
+	int ret_val = -ENOMSG;
+	const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+	if (ops)
+		ret_val = ops->doi_walk(skip_cnt, callback, cb_arg);
+	return ret_val;
+}
diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h
index 6da737a9f37a..1f24174466e4 100644
--- a/net/netlabel/netlabel_calipso.h
+++ b/net/netlabel/netlabel_calipso.h
@@ -63,6 +63,17 @@
  *
  *   If using CALIPSO_MAP_PASS no additional attributes are required.
  *
+ * o LISTALL:
+ *   This message is sent by an application to list the valid DOIs on the
+ *   system.  When sent by an application there is no payload and the
+ *   NLM_F_DUMP flag should be set.  The kernel should respond with a series of
+ *   the following messages.
+ *
+ *   Required attributes:
+ *
+ *    NLBL_CALIPSO_A_DOI
+ *    NLBL_CALIPSO_A_MTYPE
+ *
  */
 
 /* NetLabel CALIPSO commands */
@@ -105,5 +116,8 @@ int calipso_doi_add(struct calipso_doi *doi_def,
 void calipso_doi_free(struct calipso_doi *doi_def);
 struct calipso_doi *calipso_doi_getdef(u32 doi);
 void calipso_doi_putdef(struct calipso_doi *doi_def);
+int calipso_doi_walk(u32 *skip_cnt,
+		     int (*callback)(struct calipso_doi *doi_def, void *arg),
+		     void *cb_arg);
 
 #endif
-- 
cgit v1.2.3


From d7cce01504a0ccb95b5007d846560cfccbc1947f Mon Sep 17 00:00:00 2001
From: Huw Davies <huw@codeweavers.com>
Date: Mon, 27 Jun 2016 15:02:49 -0400
Subject: netlabel: Add support for removing a CALIPSO DOI.

Remove a specified DOI through the NLBL_CALIPSO_C_REMOVE command.
It requires the attribute:
 NLBL_CALIPSO_A_DOI.

Signed-off-by: Huw Davies <huw@codeweavers.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/net/netlabel.h          |  1 +
 net/ipv6/calipso.c              | 48 +++++++++++++++++++++
 net/netlabel/netlabel_calipso.c | 92 +++++++++++++++++++++++++++++++++++++++++
 net/netlabel/netlabel_calipso.h |  9 ++++
 4 files changed, 150 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index 2653d3a8cde8..2c0513b0cc88 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -237,6 +237,7 @@ struct netlbl_calipso_ops {
 	int (*doi_add)(struct calipso_doi *doi_def,
 		       struct netlbl_audit *audit_info);
 	void (*doi_free)(struct calipso_doi *doi_def);
+	int (*doi_remove)(u32 doi, struct netlbl_audit *audit_info);
 	struct calipso_doi *(*doi_getdef)(u32 doi);
 	void (*doi_putdef)(struct calipso_doi *doi_def);
 	int (*doi_walk)(u32 *skip_cnt,
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index fa17c7a7f4be..d7df7a4bd32e 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -162,6 +162,53 @@ static void calipso_doi_free_rcu(struct rcu_head *entry)
 	calipso_doi_free(doi_def);
 }
 
+/**
+ * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine
+ * @doi: the DOI value
+ * @audit_secid: the LSM secid to use in the audit message
+ *
+ * Description:
+ * Removes a DOI definition from the CALIPSO engine.  The NetLabel routines will
+ * be called to release their own LSM domain mappings as well as our own
+ * domain list.  Returns zero on success and negative values on failure.
+ *
+ */
+static int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info)
+{
+	int ret_val;
+	struct calipso_doi *doi_def;
+	struct audit_buffer *audit_buf;
+
+	spin_lock(&calipso_doi_list_lock);
+	doi_def = calipso_doi_search(doi);
+	if (!doi_def) {
+		spin_unlock(&calipso_doi_list_lock);
+		ret_val = -ENOENT;
+		goto doi_remove_return;
+	}
+	if (!atomic_dec_and_test(&doi_def->refcount)) {
+		spin_unlock(&calipso_doi_list_lock);
+		ret_val = -EBUSY;
+		goto doi_remove_return;
+	}
+	list_del_rcu(&doi_def->list);
+	spin_unlock(&calipso_doi_list_lock);
+
+	call_rcu(&doi_def->rcu, calipso_doi_free_rcu);
+	ret_val = 0;
+
+doi_remove_return:
+	audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_DEL, audit_info);
+	if (audit_buf) {
+		audit_log_format(audit_buf,
+				 " calipso_doi=%u res=%u",
+				 doi, ret_val == 0 ? 1 : 0);
+		audit_log_end(audit_buf);
+	}
+
+	return ret_val;
+}
+
 /**
  * calipso_doi_getdef - Returns a reference to a valid DOI definition
  * @doi: the DOI value
@@ -253,6 +300,7 @@ doi_walk_return:
 static const struct netlbl_calipso_ops ops = {
 	.doi_add          = calipso_doi_add,
 	.doi_free         = calipso_doi_free,
+	.doi_remove       = calipso_doi_remove,
 	.doi_getdef       = calipso_doi_getdef,
 	.doi_putdef       = calipso_doi_putdef,
 	.doi_walk         = calipso_doi_walk,
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
index 214114575812..2857673e8802 100644
--- a/net/netlabel/netlabel_calipso.c
+++ b/net/netlabel/netlabel_calipso.c
@@ -53,6 +53,12 @@ struct netlbl_calipso_doiwalk_arg {
 	u32 seq;
 };
 
+/* Argument struct for netlbl_domhsh_walk() */
+struct netlbl_domhsh_walk_arg {
+	struct netlbl_audit *audit_info;
+	u32 doi;
+};
+
 /* NetLabel Generic NETLINK CALIPSO family */
 static struct genl_family netlbl_calipso_gnl_family = {
 	.id = GENL_ID_GENERATE,
@@ -257,6 +263,64 @@ static int netlbl_calipso_listall(struct sk_buff *skb,
 	return skb->len;
 }
 
+/**
+ * netlbl_calipso_remove_cb - netlbl_calipso_remove() callback for REMOVE
+ * @entry: LSM domain mapping entry
+ * @arg: the netlbl_domhsh_walk_arg structure
+ *
+ * Description:
+ * This function is intended for use by netlbl_calipso_remove() as the callback
+ * for the netlbl_domhsh_walk() function; it removes LSM domain map entries
+ * which are associated with the CALIPSO DOI specified in @arg.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int netlbl_calipso_remove_cb(struct netlbl_dom_map *entry, void *arg)
+{
+	struct netlbl_domhsh_walk_arg *cb_arg = arg;
+
+	if (entry->def.type == NETLBL_NLTYPE_CALIPSO &&
+	    entry->def.calipso->doi == cb_arg->doi)
+		return netlbl_domhsh_remove_entry(entry, cb_arg->audit_info);
+
+	return 0;
+}
+
+/**
+ * netlbl_calipso_remove - Handle a REMOVE message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated REMOVE message and respond accordingly.  Returns
+ * zero on success, negative values on failure.
+ *
+ */
+static int netlbl_calipso_remove(struct sk_buff *skb, struct genl_info *info)
+{
+	int ret_val = -EINVAL;
+	struct netlbl_domhsh_walk_arg cb_arg;
+	struct netlbl_audit audit_info;
+	u32 skip_bkt = 0;
+	u32 skip_chain = 0;
+
+	if (!info->attrs[NLBL_CALIPSO_A_DOI])
+		return -EINVAL;
+
+	netlbl_netlink_auditinfo(skb, &audit_info);
+	cb_arg.doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]);
+	cb_arg.audit_info = &audit_info;
+	ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain,
+				     netlbl_calipso_remove_cb, &cb_arg);
+	if (ret_val == 0 || ret_val == -ENOENT) {
+		ret_val = calipso_doi_remove(cb_arg.doi, &audit_info);
+		if (ret_val == 0)
+			atomic_dec(&netlabel_mgmt_protocount);
+	}
+
+	return ret_val;
+}
+
 /* NetLabel Generic NETLINK Command Definitions
  */
 
@@ -269,6 +333,13 @@ static const struct genl_ops netlbl_calipso_ops[] = {
 	.dumpit = NULL,
 	},
 	{
+	.cmd = NLBL_CALIPSO_C_REMOVE,
+	.flags = GENL_ADMIN_PERM,
+	.policy = calipso_genl_policy,
+	.doit = netlbl_calipso_remove,
+	.dumpit = NULL,
+	},
+	{
 	.cmd = NLBL_CALIPSO_C_LIST,
 	.flags = 0,
 	.policy = calipso_genl_policy,
@@ -362,6 +433,27 @@ void calipso_doi_free(struct calipso_doi *doi_def)
 		ops->doi_free(doi_def);
 }
 
+/**
+ * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine
+ * @doi: the DOI value
+ * @audit_secid: the LSM secid to use in the audit message
+ *
+ * Description:
+ * Removes a DOI definition from the CALIPSO engine.  The NetLabel routines will
+ * be called to release their own LSM domain mappings as well as our own
+ * domain list.  Returns zero on success and negative values on failure.
+ *
+ */
+int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info)
+{
+	int ret_val = -ENOMSG;
+	const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+	if (ops)
+		ret_val = ops->doi_remove(doi, audit_info);
+	return ret_val;
+}
+
 /**
  * calipso_doi_getdef - Returns a reference to a valid DOI definition
  * @doi: the DOI value
diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h
index 1f24174466e4..ed78554ba472 100644
--- a/net/netlabel/netlabel_calipso.h
+++ b/net/netlabel/netlabel_calipso.h
@@ -46,6 +46,14 @@
  *
  *   If using CALIPSO_MAP_PASS no additional attributes are required.
  *
+ * o REMOVE:
+ *   Sent by an application to remove a specific DOI mapping table from the
+ *   CALIPSO system.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_CALIPSO_A_DOI
+ *
  * o LIST:
  *   Sent by an application to list the details of a DOI definition.  On
  *   success the kernel should send a response using the following format.
@@ -114,6 +122,7 @@ static inline int netlbl_calipso_genl_init(void)
 int calipso_doi_add(struct calipso_doi *doi_def,
 		    struct netlbl_audit *audit_info);
 void calipso_doi_free(struct calipso_doi *doi_def);
+int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info);
 struct calipso_doi *calipso_doi_getdef(u32 doi);
 void calipso_doi_putdef(struct calipso_doi *doi_def);
 int calipso_doi_walk(u32 *skip_cnt,
-- 
cgit v1.2.3


From e67ae213c72f72be50561c060ae17e92426651da Mon Sep 17 00:00:00 2001
From: Huw Davies <huw@codeweavers.com>
Date: Mon, 27 Jun 2016 15:02:50 -0400
Subject: ipv6: Add ipv6_renew_options_kern() that accepts a kernel mem
 pointer.

The functionality is equivalent to ipv6_renew_options() except
that the newopt pointer is in kernel, not user, memory

The kernel memory implementation will be used by the CALIPSO network
labelling engine, which needs to be able to set IPv6 hop-by-hop
options.

Signed-off-by: Huw Davies <huw@codeweavers.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/net/ipv6.h |  6 ++++++
 net/ipv6/exthdrs.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index d0aeb97aec5d..887313d978d0 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -308,6 +308,12 @@ struct ipv6_txoptions *ipv6_renew_options(struct sock *sk,
 					  int newtype,
 					  struct ipv6_opt_hdr __user *newopt,
 					  int newoptlen);
+struct ipv6_txoptions *
+ipv6_renew_options_kern(struct sock *sk,
+			struct ipv6_txoptions *opt,
+			int newtype,
+			struct ipv6_opt_hdr *newopt,
+			int newoptlen);
 struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
 					  struct ipv6_txoptions *opt);
 
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index ea7c4d64a00a..d5fd3e799f86 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -758,6 +758,27 @@ static int ipv6_renew_option(void *ohdr,
 	return 0;
 }
 
+/**
+ * ipv6_renew_options - replace a specific ext hdr with a new one.
+ *
+ * @sk: sock from which to allocate memory
+ * @opt: original options
+ * @newtype: option type to replace in @opt
+ * @newopt: new option of type @newtype to replace (user-mem)
+ * @newoptlen: length of @newopt
+ *
+ * Returns a new set of options which is a copy of @opt with the
+ * option type @newtype replaced with @newopt.
+ *
+ * @opt may be NULL, in which case a new set of options is returned
+ * containing just @newopt.
+ *
+ * @newopt may be NULL, in which case the specified option type is
+ * not copied into the new set of options.
+ *
+ * The new set of options is allocated from the socket option memory
+ * buffer of @sk.
+ */
 struct ipv6_txoptions *
 ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
 		   int newtype,
@@ -830,6 +851,34 @@ out:
 	return ERR_PTR(err);
 }
 
+/**
+ * ipv6_renew_options_kern - replace a specific ext hdr with a new one.
+ *
+ * @sk: sock from which to allocate memory
+ * @opt: original options
+ * @newtype: option type to replace in @opt
+ * @newopt: new option of type @newtype to replace (kernel-mem)
+ * @newoptlen: length of @newopt
+ *
+ * See ipv6_renew_options().  The difference is that @newopt is
+ * kernel memory, rather than user memory.
+ */
+struct ipv6_txoptions *
+ipv6_renew_options_kern(struct sock *sk, struct ipv6_txoptions *opt,
+			int newtype, struct ipv6_opt_hdr *newopt,
+			int newoptlen)
+{
+	struct ipv6_txoptions *ret_val;
+	const mm_segment_t old_fs = get_fs();
+
+	set_fs(KERNEL_DS);
+	ret_val = ipv6_renew_options(sk, opt, newtype,
+				     (struct ipv6_opt_hdr __user *)newopt,
+				     newoptlen);
+	set_fs(old_fs);
+	return ret_val;
+}
+
 struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
 					  struct ipv6_txoptions *opt)
 {
-- 
cgit v1.2.3


From 3faa8f982f958961fda68b8d63e682fe77a032d4 Mon Sep 17 00:00:00 2001
From: Huw Davies <huw@codeweavers.com>
Date: Mon, 27 Jun 2016 15:02:51 -0400
Subject: netlabel: Move bitmap manipulation functions to the NetLabel core.

This is to allow the CALIPSO labelling engine to use these.

Signed-off-by: Huw Davies <huw@codeweavers.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/net/netlabel.h       |  6 +++
 net/ipv4/cipso_ipv4.c        | 88 +++++---------------------------------------
 net/netlabel/netlabel_kapi.c | 70 +++++++++++++++++++++++++++++++++++
 3 files changed, 85 insertions(+), 79 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index 2c0513b0cc88..9fc2cab9be98 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -434,6 +434,12 @@ int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap,
 			  unsigned long bitmap,
 			  gfp_t flags);
 
+/* Bitmap functions
+ */
+int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len,
+		       u32 offset, u8 state);
+void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state);
+
 /*
  * LSM protocol operations (NetLabel LSM/kernel API)
  */
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index bdb2a07ec363..d710d4eb6a29 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -134,76 +134,6 @@ int cipso_v4_rbm_strictvalid = 1;
  * Helper Functions
  */
 
-/**
- * cipso_v4_bitmap_walk - Walk a bitmap looking for a bit
- * @bitmap: the bitmap
- * @bitmap_len: length in bits
- * @offset: starting offset
- * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit
- *
- * Description:
- * Starting at @offset, walk the bitmap from left to right until either the
- * desired bit is found or we reach the end.  Return the bit offset, -1 if
- * not found, or -2 if error.
- */
-static int cipso_v4_bitmap_walk(const unsigned char *bitmap,
-				u32 bitmap_len,
-				u32 offset,
-				u8 state)
-{
-	u32 bit_spot;
-	u32 byte_offset;
-	unsigned char bitmask;
-	unsigned char byte;
-
-	/* gcc always rounds to zero when doing integer division */
-	byte_offset = offset / 8;
-	byte = bitmap[byte_offset];
-	bit_spot = offset;
-	bitmask = 0x80 >> (offset % 8);
-
-	while (bit_spot < bitmap_len) {
-		if ((state && (byte & bitmask) == bitmask) ||
-		    (state == 0 && (byte & bitmask) == 0))
-			return bit_spot;
-
-		bit_spot++;
-		bitmask >>= 1;
-		if (bitmask == 0) {
-			byte = bitmap[++byte_offset];
-			bitmask = 0x80;
-		}
-	}
-
-	return -1;
-}
-
-/**
- * cipso_v4_bitmap_setbit - Sets a single bit in a bitmap
- * @bitmap: the bitmap
- * @bit: the bit
- * @state: if non-zero, set the bit (1) else clear the bit (0)
- *
- * Description:
- * Set a single bit in the bitmask.  Returns zero on success, negative values
- * on error.
- */
-static void cipso_v4_bitmap_setbit(unsigned char *bitmap,
-				   u32 bit,
-				   u8 state)
-{
-	u32 byte_spot;
-	u8 bitmask;
-
-	/* gcc always rounds to zero when doing integer division */
-	byte_spot = bit / 8;
-	bitmask = 0x80 >> (bit % 8);
-	if (state)
-		bitmap[byte_spot] |= bitmask;
-	else
-		bitmap[byte_spot] &= ~bitmask;
-}
-
 /**
  * cipso_v4_cache_entry_free - Frees a cache entry
  * @entry: the entry to free
@@ -840,10 +770,10 @@ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def,
 		cipso_cat_size = doi_def->map.std->cat.cipso_size;
 		cipso_array = doi_def->map.std->cat.cipso;
 		for (;;) {
-			cat = cipso_v4_bitmap_walk(bitmap,
-						   bitmap_len_bits,
-						   cat + 1,
-						   1);
+			cat = netlbl_bitmap_walk(bitmap,
+						 bitmap_len_bits,
+						 cat + 1,
+						 1);
 			if (cat < 0)
 				break;
 			if (cat >= cipso_cat_size ||
@@ -909,7 +839,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
 		}
 		if (net_spot >= net_clen_bits)
 			return -ENOSPC;
-		cipso_v4_bitmap_setbit(net_cat, net_spot, 1);
+		netlbl_bitmap_setbit(net_cat, net_spot, 1);
 
 		if (net_spot > net_spot_max)
 			net_spot_max = net_spot;
@@ -951,10 +881,10 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
 	}
 
 	for (;;) {
-		net_spot = cipso_v4_bitmap_walk(net_cat,
-						net_clen_bits,
-						net_spot + 1,
-						1);
+		net_spot = netlbl_bitmap_walk(net_cat,
+					      net_clen_bits,
+					      net_spot + 1,
+					      1);
 		if (net_spot < 0) {
 			if (net_spot == -2)
 				return -EFAULT;
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 8d2e483167a8..54f13a33b52c 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -728,6 +728,76 @@ int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap,
 	return 0;
 }
 
+/* Bitmap functions
+ */
+
+/**
+ * netlbl_bitmap_walk - Walk a bitmap looking for a bit
+ * @bitmap: the bitmap
+ * @bitmap_len: length in bits
+ * @offset: starting offset
+ * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit
+ *
+ * Description:
+ * Starting at @offset, walk the bitmap from left to right until either the
+ * desired bit is found or we reach the end.  Return the bit offset, -1 if
+ * not found, or -2 if error.
+ */
+int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len,
+		       u32 offset, u8 state)
+{
+	u32 bit_spot;
+	u32 byte_offset;
+	unsigned char bitmask;
+	unsigned char byte;
+
+	byte_offset = offset / 8;
+	byte = bitmap[byte_offset];
+	bit_spot = offset;
+	bitmask = 0x80 >> (offset % 8);
+
+	while (bit_spot < bitmap_len) {
+		if ((state && (byte & bitmask) == bitmask) ||
+		    (state == 0 && (byte & bitmask) == 0))
+			return bit_spot;
+
+		bit_spot++;
+		bitmask >>= 1;
+		if (bitmask == 0) {
+			byte = bitmap[++byte_offset];
+			bitmask = 0x80;
+		}
+	}
+
+	return -1;
+}
+EXPORT_SYMBOL(netlbl_bitmap_walk);
+
+/**
+ * netlbl_bitmap_setbit - Sets a single bit in a bitmap
+ * @bitmap: the bitmap
+ * @bit: the bit
+ * @state: if non-zero, set the bit (1) else clear the bit (0)
+ *
+ * Description:
+ * Set a single bit in the bitmask.  Returns zero on success, negative values
+ * on error.
+ */
+void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state)
+{
+	u32 byte_spot;
+	u8 bitmask;
+
+	/* gcc always rounds to zero when doing integer division */
+	byte_spot = bit / 8;
+	bitmask = 0x80 >> (bit % 8);
+	if (state)
+		bitmap[byte_spot] |= bitmask;
+	else
+		bitmap[byte_spot] &= ~bitmask;
+}
+EXPORT_SYMBOL(netlbl_bitmap_setbit);
+
 /*
  * LSM Functions
  */
-- 
cgit v1.2.3


From ceba1832b1b2da0149c51de62a847c00bca1677a Mon Sep 17 00:00:00 2001
From: Huw Davies <huw@codeweavers.com>
Date: Mon, 27 Jun 2016 15:02:51 -0400
Subject: calipso: Set the calipso socket label to match the secattr.

CALIPSO is a hop-by-hop IPv6 option.  A lot of this patch is based on
the equivalent CISPO code.  The main difference is due to manipulating
the options in the hop-by-hop header.

Signed-off-by: Huw Davies <huw@codeweavers.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/net/ipv6.h              |   2 +
 include/net/netlabel.h          |   9 +
 include/uapi/linux/in6.h        |   1 +
 net/ipv6/calipso.c              | 595 ++++++++++++++++++++++++++++++++++++++++
 net/ipv6/ipv6_sockglue.c        |   1 -
 net/netlabel/Kconfig            |   1 +
 net/netlabel/netlabel_calipso.c |  64 +++++
 net/netlabel/netlabel_calipso.h |   5 +
 net/netlabel/netlabel_kapi.c    |  58 +++-
 security/selinux/netlabel.c     |   2 +-
 10 files changed, 728 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 887313d978d0..4e279a83cdd0 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -319,6 +319,8 @@ struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
 
 bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
 		       const struct inet6_skb_parm *opt);
+struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
+					   struct ipv6_txoptions *opt);
 
 static inline bool ipv6_accept_ra(struct inet6_dev *idev)
 {
diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index 9fc2cab9be98..918a6044c89c 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -226,6 +226,9 @@ struct netlbl_lsm_secattr {
  * @doi_getdef: returns a reference to a DOI
  * @doi_putdef: releases a reference of a DOI
  * @doi_walk: enumerate the DOI list
+ * @sock_getattr: retrieve the socket's attr
+ * @sock_setattr: set the socket's attr
+ * @sock_delattr: remove the socket's attr
  *
  * Description:
  * This structure is filled out by the CALIPSO engine and passed
@@ -243,6 +246,12 @@ struct netlbl_calipso_ops {
 	int (*doi_walk)(u32 *skip_cnt,
 			int (*callback)(struct calipso_doi *doi_def, void *arg),
 			void *cb_arg);
+	int (*sock_getattr)(struct sock *sk,
+			    struct netlbl_lsm_secattr *secattr);
+	int (*sock_setattr)(struct sock *sk,
+			    const struct calipso_doi *doi_def,
+			    const struct netlbl_lsm_secattr *secattr);
+	void (*sock_delattr)(struct sock *sk);
 };
 
 /*
diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h
index 318a4828bf98..b39ea4f2e701 100644
--- a/include/uapi/linux/in6.h
+++ b/include/uapi/linux/in6.h
@@ -143,6 +143,7 @@ struct in6_flowlabel_req {
 #define IPV6_TLV_PAD1		0
 #define IPV6_TLV_PADN		1
 #define IPV6_TLV_ROUTERALERT	5
+#define IPV6_TLV_CALIPSO	7	/* RFC 5570 */
 #define IPV6_TLV_JUMBO		194
 #define IPV6_TLV_HAO		201	/* home address option */
 
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index d7df7a4bd32e..959db5268b38 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -44,6 +44,24 @@
 #include <linux/atomic.h>
 #include <linux/bug.h>
 #include <asm/unaligned.h>
+#include <linux/crc-ccitt.h>
+
+/* Maximium size of the calipso option including
+ * the two-byte TLV header.
+ */
+#define CALIPSO_OPT_LEN_MAX (2 + 252)
+
+/* Size of the minimum calipso option including
+ * the two-byte TLV header.
+ */
+#define CALIPSO_HDR_LEN (2 + 8)
+
+/* Maximium size of the calipso option including
+ * the two-byte TLV header and upto 3 bytes of
+ * leading pad and 7 bytes of trailing pad.
+ */
+#define CALIPSO_OPT_LEN_MAX_WITH_PAD (3 + CALIPSO_OPT_LEN_MAX + 7)
+
 
 /* List of available DOI definitions */
 static DEFINE_SPINLOCK(calipso_doi_list_lock);
@@ -297,6 +315,580 @@ doi_walk_return:
 	return ret_val;
 }
 
+/**
+ * calipso_map_cat_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category bitmap in network/CALIPSO format
+ * @net_cat_len: the length of the CALIPSO bitmap in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CALIPSO bitmap using the given DOI definition.  Returns the minimum
+ * size in bytes of the network bitmap on success, negative values otherwise.
+ *
+ */
+static int calipso_map_cat_hton(const struct calipso_doi *doi_def,
+				const struct netlbl_lsm_secattr *secattr,
+				unsigned char *net_cat,
+				u32 net_cat_len)
+{
+	int spot = -1;
+	u32 net_spot_max = 0;
+	u32 net_clen_bits = net_cat_len * 8;
+
+	for (;;) {
+		spot = netlbl_catmap_walk(secattr->attr.mls.cat,
+					  spot + 1);
+		if (spot < 0)
+			break;
+		if (spot >= net_clen_bits)
+			return -ENOSPC;
+		netlbl_bitmap_setbit(net_cat, spot, 1);
+
+		if (spot > net_spot_max)
+			net_spot_max = spot;
+	}
+
+	return (net_spot_max / 32 + 1) * 4;
+}
+
+/**
+ * calipso_map_cat_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category bitmap in network/CALIPSO format
+ * @net_cat_len: the length of the CALIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CALIPSO bitmap to the correct local
+ * MLS category bitmap using the given DOI definition.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int calipso_map_cat_ntoh(const struct calipso_doi *doi_def,
+				const unsigned char *net_cat,
+				u32 net_cat_len,
+				struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	int spot = -1;
+	u32 net_clen_bits = net_cat_len * 8;
+
+	for (;;) {
+		spot = netlbl_bitmap_walk(net_cat,
+					  net_clen_bits,
+					  spot + 1,
+					  1);
+		if (spot < 0) {
+			if (spot == -2)
+				return -EFAULT;
+			return 0;
+		}
+
+		ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat,
+					       spot,
+					       GFP_ATOMIC);
+		if (ret_val != 0)
+			return ret_val;
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * calipso_pad_write - Writes pad bytes in TLV format
+ * @buf: the buffer
+ * @offset: offset from start of buffer to write padding
+ * @count: number of pad bytes to write
+ *
+ * Description:
+ * Write @count bytes of TLV padding into @buffer starting at offset @offset.
+ * @count should be less than 8 - see RFC 4942.
+ *
+ */
+static int calipso_pad_write(unsigned char *buf, unsigned int offset,
+			     unsigned int count)
+{
+	if (WARN_ON_ONCE(count >= 8))
+		return -EINVAL;
+
+	switch (count) {
+	case 0:
+		break;
+	case 1:
+		buf[offset] = IPV6_TLV_PAD1;
+		break;
+	default:
+		buf[offset] = IPV6_TLV_PADN;
+		buf[offset + 1] = count - 2;
+		if (count > 2)
+			memset(buf + offset + 2, 0, count - 2);
+		break;
+	}
+	return 0;
+}
+
+/**
+ * calipso_genopt - Generate a CALIPSO option
+ * @buf: the option buffer
+ * @start: offset from which to write
+ * @buf_len: the size of opt_buf
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Generate a CALIPSO option using the DOI definition and security attributes
+ * passed to the function. This also generates upto three bytes of leading
+ * padding that ensures that the option is 4n + 2 aligned.  It returns the
+ * number of bytes written (including any initial padding).
+ */
+static int calipso_genopt(unsigned char *buf, u32 start, u32 buf_len,
+			  const struct calipso_doi *doi_def,
+			  const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	u32 len, pad;
+	u16 crc;
+	static const unsigned char padding[4] = {2, 1, 0, 3};
+	unsigned char *calipso;
+
+	/* CALIPSO has 4n + 2 alignment */
+	pad = padding[start & 3];
+	if (buf_len <= start + pad + CALIPSO_HDR_LEN)
+		return -ENOSPC;
+
+	if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0)
+		return -EPERM;
+
+	len = CALIPSO_HDR_LEN;
+
+	if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+		ret_val = calipso_map_cat_hton(doi_def,
+					       secattr,
+					       buf + start + pad + len,
+					       buf_len - start - pad - len);
+		if (ret_val < 0)
+			return ret_val;
+		len += ret_val;
+	}
+
+	calipso_pad_write(buf, start, pad);
+	calipso = buf + start + pad;
+
+	calipso[0] = IPV6_TLV_CALIPSO;
+	calipso[1] = len - 2;
+	*(__be32 *)(calipso + 2) = htonl(doi_def->doi);
+	calipso[6] = (len - CALIPSO_HDR_LEN) / 4;
+	calipso[7] = secattr->attr.mls.lvl,
+	crc = ~crc_ccitt(0xffff, calipso, len);
+	calipso[8] = crc & 0xff;
+	calipso[9] = (crc >> 8) & 0xff;
+	return pad + len;
+}
+
+/* Hop-by-hop hdr helper functions
+ */
+
+/**
+ * calipso_opt_update - Replaces socket's hop options with a new set
+ * @sk: the socket
+ * @hop: new hop options
+ *
+ * Description:
+ * Replaces @sk's hop options with @hop.  @hop may be NULL to leave
+ * the socket with no hop options.
+ *
+ */
+static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop)
+{
+	struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts;
+
+	txopts = ipv6_renew_options_kern(sk, old, IPV6_HOPOPTS,
+					 hop, hop ? ipv6_optlen(hop) : 0);
+	txopt_put(old);
+	if (IS_ERR(txopts))
+		return PTR_ERR(txopts);
+
+	txopts = ipv6_update_options(sk, txopts);
+	if (txopts) {
+		atomic_sub(txopts->tot_len, &sk->sk_omem_alloc);
+		txopt_put(txopts);
+	}
+
+	return 0;
+}
+
+/**
+ * calipso_tlv_len - Returns the length of the TLV
+ * @opt: the option header
+ * @offset: offset of the TLV within the header
+ *
+ * Description:
+ * Returns the length of the TLV option at offset @offset within
+ * the option header @opt.  Checks that the entire TLV fits inside
+ * the option header, returns a negative value if this is not the case.
+ */
+static int calipso_tlv_len(struct ipv6_opt_hdr *opt, unsigned int offset)
+{
+	unsigned char *tlv = (unsigned char *)opt;
+	unsigned int opt_len = ipv6_optlen(opt), tlv_len;
+
+	if (offset < sizeof(*opt) || offset >= opt_len)
+		return -EINVAL;
+	if (tlv[offset] == IPV6_TLV_PAD1)
+		return 1;
+	if (offset + 1 >= opt_len)
+		return -EINVAL;
+	tlv_len = tlv[offset + 1] + 2;
+	if (offset + tlv_len > opt_len)
+		return -EINVAL;
+	return tlv_len;
+}
+
+/**
+ * calipso_opt_find - Finds the CALIPSO option in an IPv6 hop options header
+ * @hop: the hop options header
+ * @start: on return holds the offset of any leading padding
+ * @end: on return holds the offset of the first non-pad TLV after CALIPSO
+ *
+ * Description:
+ * Finds the space occupied by a CALIPSO option (including any leading and
+ * trailing padding).
+ *
+ * If a CALIPSO option exists set @start and @end to the
+ * offsets within @hop of the start of padding before the first
+ * CALIPSO option and the end of padding after the first CALIPSO
+ * option.  In this case the function returns 0.
+ *
+ * In the absence of a CALIPSO option, @start and @end will be
+ * set to the start and end of any trailing padding in the header.
+ * This is useful when appending a new option, as the caller may want
+ * to overwrite some of this padding.  In this case the function will
+ * return -ENOENT.
+ */
+static int calipso_opt_find(struct ipv6_opt_hdr *hop, unsigned int *start,
+			    unsigned int *end)
+{
+	int ret_val = -ENOENT, tlv_len;
+	unsigned int opt_len, offset, offset_s = 0, offset_e = 0;
+	unsigned char *opt = (unsigned char *)hop;
+
+	opt_len = ipv6_optlen(hop);
+	offset = sizeof(*hop);
+
+	while (offset < opt_len) {
+		tlv_len = calipso_tlv_len(hop, offset);
+		if (tlv_len < 0)
+			return tlv_len;
+
+		switch (opt[offset]) {
+		case IPV6_TLV_PAD1:
+		case IPV6_TLV_PADN:
+			if (offset_e)
+				offset_e = offset;
+			break;
+		case IPV6_TLV_CALIPSO:
+			ret_val = 0;
+			offset_e = offset;
+			break;
+		default:
+			if (offset_e == 0)
+				offset_s = offset;
+			else
+				goto out;
+		}
+		offset += tlv_len;
+	}
+
+out:
+	if (offset_s)
+		*start = offset_s + calipso_tlv_len(hop, offset_s);
+	else
+		*start = sizeof(*hop);
+	if (offset_e)
+		*end = offset_e + calipso_tlv_len(hop, offset_e);
+	else
+		*end = opt_len;
+
+	return ret_val;
+}
+
+/**
+ * calipso_opt_insert - Inserts a CALIPSO option into an IPv6 hop opt hdr
+ * @hop: the original hop options header
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Creates a new hop options header based on @hop with a
+ * CALIPSO option added to it.  If @hop already contains a CALIPSO
+ * option this is overwritten, otherwise the new option is appended
+ * after any existing options.  If @hop is NULL then the new header
+ * will contain just the CALIPSO option and any needed padding.
+ *
+ */
+static struct ipv6_opt_hdr *
+calipso_opt_insert(struct ipv6_opt_hdr *hop,
+		   const struct calipso_doi *doi_def,
+		   const struct netlbl_lsm_secattr *secattr)
+{
+	unsigned int start, end, buf_len, pad, hop_len;
+	struct ipv6_opt_hdr *new;
+	int ret_val;
+
+	if (hop) {
+		hop_len = ipv6_optlen(hop);
+		ret_val = calipso_opt_find(hop, &start, &end);
+		if (ret_val && ret_val != -ENOENT)
+			return ERR_PTR(ret_val);
+	} else {
+		hop_len = 0;
+		start = sizeof(*hop);
+		end = 0;
+	}
+
+	buf_len = hop_len + start - end + CALIPSO_OPT_LEN_MAX_WITH_PAD;
+	new = kzalloc(buf_len, GFP_ATOMIC);
+	if (!new)
+		return ERR_PTR(-ENOMEM);
+
+	if (start > sizeof(*hop))
+		memcpy(new, hop, start);
+	ret_val = calipso_genopt((unsigned char *)new, start, buf_len, doi_def,
+				 secattr);
+	if (ret_val < 0)
+		return ERR_PTR(ret_val);
+
+	buf_len = start + ret_val;
+	/* At this point buf_len aligns to 4n, so (buf_len & 4) pads to 8n */
+	pad = ((buf_len & 4) + (end & 7)) & 7;
+	calipso_pad_write((unsigned char *)new, buf_len, pad);
+	buf_len += pad;
+
+	if (end != hop_len) {
+		memcpy((char *)new + buf_len, (char *)hop + end, hop_len - end);
+		buf_len += hop_len - end;
+	}
+	new->nexthdr = 0;
+	new->hdrlen = buf_len / 8 - 1;
+
+	return new;
+}
+
+/**
+ * calipso_opt_del - Removes the CALIPSO option from an option header
+ * @hop: the original header
+ * @new: the new header
+ *
+ * Description:
+ * Creates a new header based on @hop without any CALIPSO option.  If @hop
+ * doesn't contain a CALIPSO option it returns -ENOENT.  If @hop contains
+ * no other non-padding options, it returns zero with @new set to NULL.
+ * Otherwise it returns zero, creates a new header without the CALIPSO
+ * option (and removing as much padding as possible) and returns with
+ * @new set to that header.
+ *
+ */
+static int calipso_opt_del(struct ipv6_opt_hdr *hop,
+			   struct ipv6_opt_hdr **new)
+{
+	int ret_val;
+	unsigned int start, end, delta, pad, hop_len;
+
+	ret_val = calipso_opt_find(hop, &start, &end);
+	if (ret_val)
+		return ret_val;
+
+	hop_len = ipv6_optlen(hop);
+	if (start == sizeof(*hop) && end == hop_len) {
+		/* There's no other option in the header so return NULL */
+		*new = NULL;
+		return 0;
+	}
+
+	delta = (end - start) & ~7;
+	*new = kzalloc(hop_len - delta, GFP_ATOMIC);
+	if (!*new)
+		return -ENOMEM;
+
+	memcpy(*new, hop, start);
+	(*new)->hdrlen -= delta / 8;
+	pad = (end - start) & 7;
+	calipso_pad_write((unsigned char *)*new, start, pad);
+	if (end != hop_len)
+		memcpy((char *)*new + start + pad, (char *)hop + end,
+		       hop_len - end);
+
+	return 0;
+}
+
+/**
+ * calipso_opt_getattr - Get the security attributes from a memory block
+ * @calipso: the CALIPSO option
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Inspect @calipso and return the security attributes in @secattr.
+ * Returns zero on success and negative values on failure.
+ *
+ */
+static int calipso_opt_getattr(const unsigned char *calipso,
+			       struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -ENOMSG;
+	u32 doi, len = calipso[1], cat_len = calipso[6] * 4;
+	struct calipso_doi *doi_def;
+
+	if (cat_len + 8 > len)
+		return -EINVAL;
+
+	doi = get_unaligned_be32(calipso + 2);
+	rcu_read_lock();
+	doi_def = calipso_doi_search(doi);
+	if (!doi_def)
+		goto getattr_return;
+
+	secattr->attr.mls.lvl = calipso[7];
+	secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+	if (cat_len) {
+		ret_val = calipso_map_cat_ntoh(doi_def,
+					       calipso + 10,
+					       cat_len,
+					       secattr);
+		if (ret_val != 0) {
+			netlbl_catmap_free(secattr->attr.mls.cat);
+			goto getattr_return;
+		}
+
+		secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+	}
+
+	secattr->type = NETLBL_NLTYPE_CALIPSO;
+
+getattr_return:
+	rcu_read_unlock();
+	return ret_val;
+}
+
+/* sock functions.
+ */
+
+/**
+ * calipso_sock_getattr - Get the security attributes from a sock
+ * @sk: the sock
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Query @sk to see if there is a CALIPSO option attached to the sock and if
+ * there is return the CALIPSO security attributes in @secattr.  This function
+ * requires that @sk be locked, or privately held, but it does not do any
+ * locking itself.  Returns zero on success and negative values on failure.
+ *
+ */
+static int calipso_sock_getattr(struct sock *sk,
+				struct netlbl_lsm_secattr *secattr)
+{
+	struct ipv6_opt_hdr *hop;
+	int opt_len, len, ret_val = -ENOMSG, offset;
+	unsigned char *opt;
+	struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk));
+
+	if (!txopts || !txopts->hopopt)
+		goto done;
+
+	hop = txopts->hopopt;
+	opt = (unsigned char *)hop;
+	opt_len = ipv6_optlen(hop);
+	offset = sizeof(*hop);
+	while (offset < opt_len) {
+		len = calipso_tlv_len(hop, offset);
+		if (len < 0) {
+			ret_val = len;
+			goto done;
+		}
+		switch (opt[offset]) {
+		case IPV6_TLV_CALIPSO:
+			if (len < CALIPSO_HDR_LEN)
+				ret_val = -EINVAL;
+			else
+				ret_val = calipso_opt_getattr(&opt[offset],
+							      secattr);
+			goto done;
+		default:
+			offset += len;
+			break;
+		}
+	}
+done:
+	txopt_put(txopts);
+	return ret_val;
+}
+
+/**
+ * calipso_sock_setattr - Add a CALIPSO option to a socket
+ * @sk: the socket
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CALIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function.  This function requires
+ * exclusive access to @sk, which means it either needs to be in the
+ * process of being created or locked.  Returns zero on success and negative
+ * values on failure.
+ *
+ */
+static int calipso_sock_setattr(struct sock *sk,
+				const struct calipso_doi *doi_def,
+				const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	struct ipv6_opt_hdr *old, *new;
+	struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk));
+
+	old = NULL;
+	if (txopts)
+		old = txopts->hopopt;
+
+	new = calipso_opt_insert(old, doi_def, secattr);
+	txopt_put(txopts);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+
+	ret_val = calipso_opt_update(sk, new);
+
+	kfree(new);
+	return ret_val;
+}
+
+/**
+ * calipso_sock_delattr - Delete the CALIPSO option from a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Removes the CALIPSO option from a socket, if present.
+ *
+ */
+static void calipso_sock_delattr(struct sock *sk)
+{
+	struct ipv6_opt_hdr *new_hop;
+	struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk));
+
+	if (!txopts || !txopts->hopopt)
+		goto done;
+
+	if (calipso_opt_del(txopts->hopopt, &new_hop))
+		goto done;
+
+	calipso_opt_update(sk, new_hop);
+	kfree(new_hop);
+
+done:
+	txopt_put(txopts);
+}
+
 static const struct netlbl_calipso_ops ops = {
 	.doi_add          = calipso_doi_add,
 	.doi_free         = calipso_doi_free,
@@ -304,6 +896,9 @@ static const struct netlbl_calipso_ops ops = {
 	.doi_getdef       = calipso_doi_getdef,
 	.doi_putdef       = calipso_doi_putdef,
 	.doi_walk         = calipso_doi_walk,
+	.sock_getattr     = calipso_sock_getattr,
+	.sock_setattr     = calipso_sock_setattr,
+	.sock_delattr     = calipso_sock_delattr,
 };
 
 /**
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 4449ad1f8114..8a80d59bed34 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -98,7 +98,6 @@ int ip6_ra_control(struct sock *sk, int sel)
 	return 0;
 }
 
-static
 struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
 					   struct ipv6_txoptions *opt)
 {
diff --git a/net/netlabel/Kconfig b/net/netlabel/Kconfig
index 56958c85f2b4..d9eaa30ffe3f 100644
--- a/net/netlabel/Kconfig
+++ b/net/netlabel/Kconfig
@@ -5,6 +5,7 @@
 config NETLABEL
 	bool "NetLabel subsystem support"
 	depends on SECURITY
+	select CRC_CCITT if IPV6
 	default n
 	---help---
 	  NetLabel provides support for explicit network packet labeling
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
index 2857673e8802..6f9c6589a022 100644
--- a/net/netlabel/netlabel_calipso.c
+++ b/net/netlabel/netlabel_calipso.c
@@ -514,3 +514,67 @@ int calipso_doi_walk(u32 *skip_cnt,
 		ret_val = ops->doi_walk(skip_cnt, callback, cb_arg);
 	return ret_val;
 }
+
+/**
+ * calipso_sock_getattr - Get the security attributes from a sock
+ * @sk: the sock
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Query @sk to see if there is a CALIPSO option attached to the sock and if
+ * there is return the CALIPSO security attributes in @secattr.  This function
+ * requires that @sk be locked, or privately held, but it does not do any
+ * locking itself.  Returns zero on success and negative values on failure.
+ *
+ */
+int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -ENOMSG;
+	const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+	if (ops)
+		ret_val = ops->sock_getattr(sk, secattr);
+	return ret_val;
+}
+
+/**
+ * calipso_sock_setattr - Add a CALIPSO option to a socket
+ * @sk: the socket
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CALIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function.  This function requires
+ * exclusive access to @sk, which means it either needs to be in the
+ * process of being created or locked.  Returns zero on success and negative
+ * values on failure.
+ *
+ */
+int calipso_sock_setattr(struct sock *sk,
+			 const struct calipso_doi *doi_def,
+			 const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -ENOMSG;
+	const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+	if (ops)
+		ret_val = ops->sock_setattr(sk, doi_def, secattr);
+	return ret_val;
+}
+
+/**
+ * calipso_sock_delattr - Delete the CALIPSO option from a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Removes the CALIPSO option from a socket, if present.
+ *
+ */
+void calipso_sock_delattr(struct sock *sk)
+{
+	const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+	if (ops)
+		ops->sock_delattr(sk);
+}
diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h
index ed78554ba472..49bc116705c4 100644
--- a/net/netlabel/netlabel_calipso.h
+++ b/net/netlabel/netlabel_calipso.h
@@ -128,5 +128,10 @@ void calipso_doi_putdef(struct calipso_doi *doi_def);
 int calipso_doi_walk(u32 *skip_cnt,
 		     int (*callback)(struct calipso_doi *doi_def, void *arg),
 		     void *cb_arg);
+int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr);
+int calipso_sock_setattr(struct sock *sk,
+			 const struct calipso_doi *doi_def,
+			 const struct netlbl_lsm_secattr *secattr);
+void calipso_sock_delattr(struct sock *sk);
 
 #endif
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 54f13a33b52c..00bab51c291e 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -37,12 +37,14 @@
 #include <net/ipv6.h>
 #include <net/netlabel.h>
 #include <net/cipso_ipv4.h>
+#include <net/calipso.h>
 #include <asm/bug.h>
 #include <linux/atomic.h>
 
 #include "netlabel_domainhash.h"
 #include "netlabel_unlabeled.h"
 #include "netlabel_cipso_v4.h"
+#include "netlabel_calipso.h"
 #include "netlabel_user.h"
 #include "netlabel_mgmt.h"
 #include "netlabel_addrlist.h"
@@ -521,6 +523,7 @@ int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset)
 
 	return -ENOENT;
 }
+EXPORT_SYMBOL(netlbl_catmap_walk);
 
 /**
  * netlbl_catmap_walkrng - Find the end of a string of set bits
@@ -656,6 +659,7 @@ int netlbl_catmap_setbit(struct netlbl_lsm_catmap **catmap,
 
 	return 0;
 }
+EXPORT_SYMBOL(netlbl_catmap_setbit);
 
 /**
  * netlbl_catmap_setrng - Set a range of bits in a LSM secattr catmap
@@ -870,9 +874,21 @@ int netlbl_sock_setattr(struct sock *sk,
 		break;
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
-		/* since we don't support any IPv6 labeling protocols right
-		 * now we can optimize everything away until we do */
-		ret_val = 0;
+		switch (dom_entry->def.type) {
+		case NETLBL_NLTYPE_ADDRSELECT:
+			ret_val = -EDESTADDRREQ;
+			break;
+		case NETLBL_NLTYPE_CALIPSO:
+			ret_val = calipso_sock_setattr(sk,
+						       dom_entry->def.calipso,
+						       secattr);
+			break;
+		case NETLBL_NLTYPE_UNLABELED:
+			ret_val = 0;
+			break;
+		default:
+			ret_val = -ENOENT;
+		}
 		break;
 #endif /* IPv6 */
 	default:
@@ -899,6 +915,11 @@ void netlbl_sock_delattr(struct sock *sk)
 	case AF_INET:
 		cipso_v4_sock_delattr(sk);
 		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		calipso_sock_delattr(sk);
+		break;
+#endif /* IPv6 */
 	}
 }
 
@@ -925,7 +946,7 @@ int netlbl_sock_getattr(struct sock *sk,
 		break;
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
-		ret_val = -ENOMSG;
+		ret_val = calipso_sock_getattr(sk, secattr);
 		break;
 #endif /* IPv6 */
 	default:
@@ -953,6 +974,9 @@ int netlbl_conn_setattr(struct sock *sk,
 {
 	int ret_val;
 	struct sockaddr_in *addr4;
+#if IS_ENABLED(CONFIG_IPV6)
+	struct sockaddr_in6 *addr6;
+#endif
 	struct netlbl_dommap_def *entry;
 
 	rcu_read_lock();
@@ -973,7 +997,7 @@ int netlbl_conn_setattr(struct sock *sk,
 		case NETLBL_NLTYPE_UNLABELED:
 			/* just delete the protocols we support for right now
 			 * but we could remove other protocols if needed */
-			cipso_v4_sock_delattr(sk);
+			netlbl_sock_delattr(sk);
 			ret_val = 0;
 			break;
 		default:
@@ -982,9 +1006,27 @@ int netlbl_conn_setattr(struct sock *sk,
 		break;
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
-		/* since we don't support any IPv6 labeling protocols right
-		 * now we can optimize everything away until we do */
-		ret_val = 0;
+		addr6 = (struct sockaddr_in6 *)addr;
+		entry = netlbl_domhsh_getentry_af6(secattr->domain,
+						   &addr6->sin6_addr);
+		if (entry == NULL) {
+			ret_val = -ENOENT;
+			goto conn_setattr_return;
+		}
+		switch (entry->type) {
+		case NETLBL_NLTYPE_CALIPSO:
+			ret_val = calipso_sock_setattr(sk,
+						       entry->calipso, secattr);
+			break;
+		case NETLBL_NLTYPE_UNLABELED:
+			/* just delete the protocols we support for right now
+			 * but we could remove other protocols if needed */
+			netlbl_sock_delattr(sk);
+			ret_val = 0;
+			break;
+		default:
+			ret_val = -ENOENT;
+		}
 		break;
 #endif /* IPv6 */
 	default:
diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c
index 1f989a539fd4..5470f32eca54 100644
--- a/security/selinux/netlabel.c
+++ b/security/selinux/netlabel.c
@@ -333,7 +333,7 @@ int selinux_netlbl_socket_post_create(struct sock *sk, u16 family)
 	struct sk_security_struct *sksec = sk->sk_security;
 	struct netlbl_lsm_secattr *secattr;
 
-	if (family != PF_INET)
+	if (family != PF_INET && family != PF_INET6)
 		return 0;
 
 	secattr = selinux_netlbl_sock_genattr(sk);
-- 
cgit v1.2.3


From 56ac42bc94b18d45b6c484edeac33be86bfb3efa Mon Sep 17 00:00:00 2001
From: Huw Davies <huw@codeweavers.com>
Date: Mon, 27 Jun 2016 15:05:28 -0400
Subject: ipv6: Allow request socks to contain IPv6 options.

If set, these will take precedence over the parent's options during
both sending and child creation.  If they're not set, the parent's
options (if any) will be used.

This is to allow the security_inet_conn_request() hook to modify the
IPv6 options in just the same way that it already may do for IPv4.

Signed-off-by: Huw Davies <huw@codeweavers.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/net/inet_sock.h |  7 ++++++-
 net/dccp/ipv6.c         | 12 +++++++++---
 net/ipv4/tcp_input.c    |  3 +++
 net/ipv6/tcp_ipv6.c     | 12 +++++++++---
 4 files changed, 27 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 012b1f91f3ec..236a81034fef 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -97,7 +97,12 @@ struct inet_request_sock {
 	u32                     ir_mark;
 	union {
 		struct ip_options_rcu	*opt;
-		struct sk_buff		*pktopts;
+#if IS_ENABLED(CONFIG_IPV6)
+		struct {
+			struct ipv6_txoptions	*ipv6_opt;
+			struct sk_buff		*pktopts;
+		};
+#endif
 	};
 };
 
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 4663a01d5039..3381748bd0f3 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -216,14 +216,17 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req
 	skb = dccp_make_response(sk, dst, req);
 	if (skb != NULL) {
 		struct dccp_hdr *dh = dccp_hdr(skb);
+		struct ipv6_txoptions *opt;
 
 		dh->dccph_checksum = dccp_v6_csum_finish(skb,
 							 &ireq->ir_v6_loc_addr,
 							 &ireq->ir_v6_rmt_addr);
 		fl6.daddr = ireq->ir_v6_rmt_addr;
 		rcu_read_lock();
-		err = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt),
-			       np->tclass);
+		opt = ireq->ipv6_opt;
+		if (!opt)
+			opt = rcu_dereference(np->opt);
+		err = ip6_xmit(sk, skb, &fl6, opt, np->tclass);
 		rcu_read_unlock();
 		err = net_xmit_eval(err);
 	}
@@ -236,6 +239,7 @@ done:
 static void dccp_v6_reqsk_destructor(struct request_sock *req)
 {
 	dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
+	kfree(inet_rsk(req)->ipv6_opt);
 	kfree_skb(inet_rsk(req)->pktopts);
 }
 
@@ -494,7 +498,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
 	 * Yes, keeping reference count would be much more clever, but we make
 	 * one more one thing there: reattach optmem to newsk.
 	 */
-	opt = rcu_dereference(np->opt);
+	opt = ireq->ipv6_opt;
+	if (!opt)
+		opt = rcu_dereference(np->opt);
 	if (opt) {
 		opt = ipv6_dup_options(newsk, opt);
 		RCU_INIT_POINTER(newnp->opt, opt);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e6e65f79ade8..071174c13bf9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6146,6 +6146,9 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
 
 		kmemcheck_annotate_bitfield(ireq, flags);
 		ireq->opt = NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+		ireq->pktopts = NULL;
+#endif
 		atomic64_set(&ireq->ir_cookie, 0);
 		ireq->ireq_state = TCP_NEW_SYN_RECV;
 		write_pnet(&ireq->ireq_net, sock_net(sk_listener));
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 711d209f9124..18daddc6001c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -443,6 +443,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct ipv6_txoptions *opt;
 	struct flowi6 *fl6 = &fl->u.ip6;
 	struct sk_buff *skb;
 	int err = -ENOMEM;
@@ -463,8 +464,10 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
 			fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts));
 
 		rcu_read_lock();
-		err = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt),
-			       np->tclass);
+		opt = ireq->ipv6_opt;
+		if (!opt)
+			opt = rcu_dereference(np->opt);
+		err = ip6_xmit(sk, skb, fl6, opt, np->tclass);
 		rcu_read_unlock();
 		err = net_xmit_eval(err);
 	}
@@ -476,6 +479,7 @@ done:
 
 static void tcp_v6_reqsk_destructor(struct request_sock *req)
 {
+	kfree(inet_rsk(req)->ipv6_opt);
 	kfree_skb(inet_rsk(req)->pktopts);
 }
 
@@ -1107,7 +1111,9 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 	   but we make one more one thing there: reattach optmem
 	   to newsk.
 	 */
-	opt = rcu_dereference(np->opt);
+	opt = ireq->ipv6_opt;
+	if (!opt)
+		opt = rcu_dereference(np->opt);
 	if (opt) {
 		opt = ipv6_dup_options(newsk, opt);
 		RCU_INIT_POINTER(newnp->opt, opt);
-- 
cgit v1.2.3


From e1adea927080821ebfa7505bff752a4015955660 Mon Sep 17 00:00:00 2001
From: Huw Davies <huw@codeweavers.com>
Date: Mon, 27 Jun 2016 15:05:29 -0400
Subject: calipso: Allow request sockets to be relabelled by the lsm.

Request sockets need to have a label that takes into account the
incoming connection as well as their parent's label.  This is used
for the outgoing SYN-ACK and for their child full-socket.

Signed-off-by: Huw Davies <huw@codeweavers.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/net/netlabel.h          |  6 +++
 net/ipv6/calipso.c              | 86 +++++++++++++++++++++++++++++++++++++++++
 net/netlabel/netlabel_calipso.c | 40 +++++++++++++++++++
 net/netlabel/netlabel_calipso.h |  4 ++
 net/netlabel/netlabel_kapi.c    | 33 ++++++++++++----
 security/selinux/netlabel.c     |  2 +-
 6 files changed, 163 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index 918a6044c89c..a2408c30a7f7 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -229,6 +229,8 @@ struct netlbl_lsm_secattr {
  * @sock_getattr: retrieve the socket's attr
  * @sock_setattr: set the socket's attr
  * @sock_delattr: remove the socket's attr
+ * @req_setattr: set the req socket's attr
+ * @req_delattr: remove the req socket's attr
  *
  * Description:
  * This structure is filled out by the CALIPSO engine and passed
@@ -252,6 +254,10 @@ struct netlbl_calipso_ops {
 			    const struct calipso_doi *doi_def,
 			    const struct netlbl_lsm_secattr *secattr);
 	void (*sock_delattr)(struct sock *sk);
+	int (*req_setattr)(struct request_sock *req,
+			   const struct calipso_doi *doi_def,
+			   const struct netlbl_lsm_secattr *secattr);
+	void (*req_delattr)(struct request_sock *req);
 };
 
 /*
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index 959db5268b38..067a5640ef17 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -889,6 +889,90 @@ done:
 	txopt_put(txopts);
 }
 
+/* request sock functions.
+ */
+
+/**
+ * calipso_req_setattr - Add a CALIPSO option to a connection request socket
+ * @req: the connection request socket
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CALIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function.  Returns zero on success and
+ * negative values on failure.
+ *
+ */
+static int calipso_req_setattr(struct request_sock *req,
+			       const struct calipso_doi *doi_def,
+			       const struct netlbl_lsm_secattr *secattr)
+{
+	struct ipv6_txoptions *txopts;
+	struct inet_request_sock *req_inet = inet_rsk(req);
+	struct ipv6_opt_hdr *old, *new;
+	struct sock *sk = sk_to_full_sk(req_to_sk(req));
+
+	if (req_inet->ipv6_opt && req_inet->ipv6_opt->hopopt)
+		old = req_inet->ipv6_opt->hopopt;
+	else
+		old = NULL;
+
+	new = calipso_opt_insert(old, doi_def, secattr);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+
+	txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS,
+					 new, new ? ipv6_optlen(new) : 0);
+
+	kfree(new);
+
+	if (IS_ERR(txopts))
+		return PTR_ERR(txopts);
+
+	txopts = xchg(&req_inet->ipv6_opt, txopts);
+	if (txopts) {
+		atomic_sub(txopts->tot_len, &sk->sk_omem_alloc);
+		txopt_put(txopts);
+	}
+
+	return 0;
+}
+
+/**
+ * calipso_req_delattr - Delete the CALIPSO option from a request socket
+ * @reg: the request socket
+ *
+ * Description:
+ * Removes the CALIPSO option from a request socket, if present.
+ *
+ */
+static void calipso_req_delattr(struct request_sock *req)
+{
+	struct inet_request_sock *req_inet = inet_rsk(req);
+	struct ipv6_opt_hdr *new;
+	struct ipv6_txoptions *txopts;
+	struct sock *sk = sk_to_full_sk(req_to_sk(req));
+
+	if (!req_inet->ipv6_opt || !req_inet->ipv6_opt->hopopt)
+		return;
+
+	if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new))
+		return; /* Nothing to do */
+
+	txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS,
+					 new, new ? ipv6_optlen(new) : 0);
+
+	if (!IS_ERR(txopts)) {
+		txopts = xchg(&req_inet->ipv6_opt, txopts);
+		if (txopts) {
+			atomic_sub(txopts->tot_len, &sk->sk_omem_alloc);
+			txopt_put(txopts);
+		}
+	}
+	kfree(new);
+}
+
 static const struct netlbl_calipso_ops ops = {
 	.doi_add          = calipso_doi_add,
 	.doi_free         = calipso_doi_free,
@@ -899,6 +983,8 @@ static const struct netlbl_calipso_ops ops = {
 	.sock_getattr     = calipso_sock_getattr,
 	.sock_setattr     = calipso_sock_setattr,
 	.sock_delattr     = calipso_sock_delattr,
+	.req_setattr      = calipso_req_setattr,
+	.req_delattr      = calipso_req_delattr,
 };
 
 /**
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
index 6f9c6589a022..79519e3a6a93 100644
--- a/net/netlabel/netlabel_calipso.c
+++ b/net/netlabel/netlabel_calipso.c
@@ -578,3 +578,43 @@ void calipso_sock_delattr(struct sock *sk)
 	if (ops)
 		ops->sock_delattr(sk);
 }
+
+/**
+ * calipso_req_setattr - Add a CALIPSO option to a connection request socket
+ * @req: the connection request socket
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CALIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function.  Returns zero on success and
+ * negative values on failure.
+ *
+ */
+int calipso_req_setattr(struct request_sock *req,
+			const struct calipso_doi *doi_def,
+			const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -ENOMSG;
+	const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+	if (ops)
+		ret_val = ops->req_setattr(req, doi_def, secattr);
+	return ret_val;
+}
+
+/**
+ * calipso_req_delattr - Delete the CALIPSO option from a request socket
+ * @reg: the request socket
+ *
+ * Description:
+ * Removes the CALIPSO option from a request socket, if present.
+ *
+ */
+void calipso_req_delattr(struct request_sock *req)
+{
+	const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+	if (ops)
+		ops->req_delattr(req);
+}
diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h
index 49bc116705c4..1372fdd86588 100644
--- a/net/netlabel/netlabel_calipso.h
+++ b/net/netlabel/netlabel_calipso.h
@@ -133,5 +133,9 @@ int calipso_sock_setattr(struct sock *sk,
 			 const struct calipso_doi *doi_def,
 			 const struct netlbl_lsm_secattr *secattr);
 void calipso_sock_delattr(struct sock *sk);
+int calipso_req_setattr(struct request_sock *req,
+			const struct calipso_doi *doi_def,
+			const struct netlbl_lsm_secattr *secattr);
+void calipso_req_delattr(struct request_sock *req);
 
 #endif
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 00bab51c291e..9b725f75c750 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -1053,12 +1053,13 @@ int netlbl_req_setattr(struct request_sock *req,
 {
 	int ret_val;
 	struct netlbl_dommap_def *entry;
+	struct inet_request_sock *ireq = inet_rsk(req);
 
 	rcu_read_lock();
 	switch (req->rsk_ops->family) {
 	case AF_INET:
 		entry = netlbl_domhsh_getentry_af4(secattr->domain,
-						   inet_rsk(req)->ir_rmt_addr);
+						   ireq->ir_rmt_addr);
 		if (entry == NULL) {
 			ret_val = -ENOENT;
 			goto req_setattr_return;
@@ -1069,9 +1070,7 @@ int netlbl_req_setattr(struct request_sock *req,
 						       entry->cipso, secattr);
 			break;
 		case NETLBL_NLTYPE_UNLABELED:
-			/* just delete the protocols we support for right now
-			 * but we could remove other protocols if needed */
-			cipso_v4_req_delattr(req);
+			netlbl_req_delattr(req);
 			ret_val = 0;
 			break;
 		default:
@@ -1080,9 +1079,24 @@ int netlbl_req_setattr(struct request_sock *req,
 		break;
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
-		/* since we don't support any IPv6 labeling protocols right
-		 * now we can optimize everything away until we do */
-		ret_val = 0;
+		entry = netlbl_domhsh_getentry_af6(secattr->domain,
+						   &ireq->ir_v6_rmt_addr);
+		if (entry == NULL) {
+			ret_val = -ENOENT;
+			goto req_setattr_return;
+		}
+		switch (entry->type) {
+		case NETLBL_NLTYPE_CALIPSO:
+			ret_val = calipso_req_setattr(req,
+						      entry->calipso, secattr);
+			break;
+		case NETLBL_NLTYPE_UNLABELED:
+			netlbl_req_delattr(req);
+			ret_val = 0;
+			break;
+		default:
+			ret_val = -ENOENT;
+		}
 		break;
 #endif /* IPv6 */
 	default:
@@ -1108,6 +1122,11 @@ void netlbl_req_delattr(struct request_sock *req)
 	case AF_INET:
 		cipso_v4_req_delattr(req);
 		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		calipso_req_delattr(req);
+		break;
+#endif /* IPv6 */
 	}
 }
 
diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c
index 2477a75f16e7..ca220c3fbcf9 100644
--- a/security/selinux/netlabel.c
+++ b/security/selinux/netlabel.c
@@ -284,7 +284,7 @@ int selinux_netlbl_inet_conn_request(struct request_sock *req, u16 family)
 	int rc;
 	struct netlbl_lsm_secattr secattr;
 
-	if (family != PF_INET)
+	if (family != PF_INET && family != PF_INET6)
 		return 0;
 
 	netlbl_secattr_init(&secattr);
-- 
cgit v1.2.3


From 0868383b822e4d8ebde980c7aac973a6aa81a3ec Mon Sep 17 00:00:00 2001
From: Huw Davies <huw@codeweavers.com>
Date: Mon, 27 Jun 2016 15:06:15 -0400
Subject: ipv6: constify the skb pointer of ipv6_find_tlv().

Signed-off-by: Huw Davies <huw@codeweavers.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/net/ipv6.h      | 2 +-
 net/ipv6/exthdrs_core.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 4e279a83cdd0..24a5ebecaf1f 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -945,7 +945,7 @@ enum {
 int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target,
 		  unsigned short *fragoff, int *fragflg);
 
-int ipv6_find_tlv(struct sk_buff *skb, int offset, int type);
+int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type);
 
 struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
 				const struct ipv6_txoptions *opt,
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index 9508a20fbf61..305e2ed730bf 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -112,7 +112,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp,
 }
 EXPORT_SYMBOL(ipv6_skip_exthdr);
 
-int ipv6_find_tlv(struct sk_buff *skb, int offset, int type)
+int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type)
 {
 	const unsigned char *nh = skb_network_header(skb);
 	int packet_len = skb_tail_pointer(skb) - skb_network_header(skb);
-- 
cgit v1.2.3


From 2917f57b6bc15cc6787496ee5f2fdf17f0e9b7d3 Mon Sep 17 00:00:00 2001
From: Huw Davies <huw@codeweavers.com>
Date: Mon, 27 Jun 2016 15:06:15 -0400
Subject: calipso: Allow the lsm to label the skbuff directly.

In some cases, the lsm needs to add the label to the skbuff directly.
A NF_INET_LOCAL_OUT IPv6 hook is added to selinux to match the IPv4
behaviour.  This allows selinux to label the skbuffs that it requires.

Signed-off-by: Huw Davies <huw@codeweavers.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/net/netlabel.h          |  11 +++
 net/ipv6/calipso.c              | 165 ++++++++++++++++++++++++++++++++++++++++
 net/netlabel/netlabel_calipso.c |  82 ++++++++++++++++++++
 net/netlabel/netlabel_calipso.h |   7 ++
 net/netlabel/netlabel_kapi.c    |  32 +++++++-
 security/selinux/hooks.c        |  15 ++++
 6 files changed, 308 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index a2408c30a7f7..e0e4ce8f22af 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -231,6 +231,10 @@ struct netlbl_lsm_secattr {
  * @sock_delattr: remove the socket's attr
  * @req_setattr: set the req socket's attr
  * @req_delattr: remove the req socket's attr
+ * @opt_getattr: retrieve attr from memory block
+ * @skbuff_optptr: find option in packet
+ * @skbuff_setattr: set the skbuff's attr
+ * @skbuff_delattr: remove the skbuff's attr
  *
  * Description:
  * This structure is filled out by the CALIPSO engine and passed
@@ -258,6 +262,13 @@ struct netlbl_calipso_ops {
 			   const struct calipso_doi *doi_def,
 			   const struct netlbl_lsm_secattr *secattr);
 	void (*req_delattr)(struct request_sock *req);
+	int (*opt_getattr)(const unsigned char *calipso,
+			   struct netlbl_lsm_secattr *secattr);
+	unsigned char *(*skbuff_optptr)(const struct sk_buff *skb);
+	int (*skbuff_setattr)(struct sk_buff *skb,
+			      const struct calipso_doi *doi_def,
+			      const struct netlbl_lsm_secattr *secattr);
+	int (*skbuff_delattr)(struct sk_buff *skb);
 };
 
 /*
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index 067a5640ef17..fa371a8827cf 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -62,6 +62,11 @@
  */
 #define CALIPSO_OPT_LEN_MAX_WITH_PAD (3 + CALIPSO_OPT_LEN_MAX + 7)
 
+ /* Maximium size of u32 aligned buffer required to hold calipso
+  * option.  Max of 3 initial pad bytes starting from buffer + 3.
+  * i.e. the worst case is when the previous tlv finishes on 4n + 3.
+  */
+#define CALIPSO_MAX_BUFFER (6 + CALIPSO_OPT_LEN_MAX)
 
 /* List of available DOI definitions */
 static DEFINE_SPINLOCK(calipso_doi_list_lock);
@@ -973,6 +978,162 @@ static void calipso_req_delattr(struct request_sock *req)
 	kfree(new);
 }
 
+/* skbuff functions.
+ */
+
+/**
+ * calipso_skbuff_optptr - Find the CALIPSO option in the packet
+ * @skb: the packet
+ *
+ * Description:
+ * Parse the packet's IP header looking for a CALIPSO option.  Returns a pointer
+ * to the start of the CALIPSO option on success, NULL if one if not found.
+ *
+ */
+static unsigned char *calipso_skbuff_optptr(const struct sk_buff *skb)
+{
+	const struct ipv6hdr *ip6_hdr = ipv6_hdr(skb);
+	int offset;
+
+	if (ip6_hdr->nexthdr != NEXTHDR_HOP)
+		return NULL;
+
+	offset = ipv6_find_tlv(skb, sizeof(*ip6_hdr), IPV6_TLV_CALIPSO);
+	if (offset >= 0)
+		return (unsigned char *)ip6_hdr + offset;
+
+	return NULL;
+}
+
+/**
+ * calipso_skbuff_setattr - Set the CALIPSO option on a packet
+ * @skb: the packet
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Set the CALIPSO option on the given packet based on the security attributes.
+ * Returns a pointer to the IP header on success and NULL on failure.
+ *
+ */
+static int calipso_skbuff_setattr(struct sk_buff *skb,
+				  const struct calipso_doi *doi_def,
+				  const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	struct ipv6hdr *ip6_hdr;
+	struct ipv6_opt_hdr *hop;
+	unsigned char buf[CALIPSO_MAX_BUFFER];
+	int len_delta, new_end, pad;
+	unsigned int start, end;
+
+	ip6_hdr = ipv6_hdr(skb);
+	if (ip6_hdr->nexthdr == NEXTHDR_HOP) {
+		hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1);
+		ret_val = calipso_opt_find(hop, &start, &end);
+		if (ret_val && ret_val != -ENOENT)
+			return ret_val;
+	} else {
+		start = 0;
+		end = 0;
+	}
+
+	memset(buf, 0, sizeof(buf));
+	ret_val = calipso_genopt(buf, start & 3, sizeof(buf), doi_def, secattr);
+	if (ret_val < 0)
+		return ret_val;
+
+	new_end = start + ret_val;
+	/* At this point new_end aligns to 4n, so (new_end & 4) pads to 8n */
+	pad = ((new_end & 4) + (end & 7)) & 7;
+	len_delta = new_end - (int)end + pad;
+	ret_val = skb_cow(skb, skb_headroom(skb) + len_delta);
+	if (ret_val < 0)
+		return ret_val;
+
+	if (len_delta) {
+		if (len_delta > 0)
+			skb_push(skb, len_delta);
+		else
+			skb_pull(skb, -len_delta);
+		memmove((char *)ip6_hdr - len_delta, ip6_hdr,
+			sizeof(*ip6_hdr) + start);
+		skb_reset_network_header(skb);
+		ip6_hdr = ipv6_hdr(skb);
+	}
+
+	hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1);
+	if (start == 0) {
+		struct ipv6_opt_hdr *new_hop = (struct ipv6_opt_hdr *)buf;
+
+		new_hop->nexthdr = ip6_hdr->nexthdr;
+		new_hop->hdrlen = len_delta / 8 - 1;
+		ip6_hdr->nexthdr = NEXTHDR_HOP;
+	} else {
+		hop->hdrlen += len_delta / 8;
+	}
+	memcpy((char *)hop + start, buf + (start & 3), new_end - start);
+	calipso_pad_write((unsigned char *)hop, new_end, pad);
+
+	return 0;
+}
+
+/**
+ * calipso_skbuff_delattr - Delete any CALIPSO options from a packet
+ * @skb: the packet
+ *
+ * Description:
+ * Removes any and all CALIPSO options from the given packet.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int calipso_skbuff_delattr(struct sk_buff *skb)
+{
+	int ret_val;
+	struct ipv6hdr *ip6_hdr;
+	struct ipv6_opt_hdr *old_hop;
+	u32 old_hop_len, start = 0, end = 0, delta, size, pad;
+
+	if (!calipso_skbuff_optptr(skb))
+		return 0;
+
+	/* since we are changing the packet we should make a copy */
+	ret_val = skb_cow(skb, skb_headroom(skb));
+	if (ret_val < 0)
+		return ret_val;
+
+	ip6_hdr = ipv6_hdr(skb);
+	old_hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1);
+	old_hop_len = ipv6_optlen(old_hop);
+
+	ret_val = calipso_opt_find(old_hop, &start, &end);
+	if (ret_val)
+		return ret_val;
+
+	if (start == sizeof(*old_hop) && end == old_hop_len) {
+		/* There's no other option in the header so we delete
+		 * the whole thing. */
+		delta = old_hop_len;
+		size = sizeof(*ip6_hdr);
+		ip6_hdr->nexthdr = old_hop->nexthdr;
+	} else {
+		delta = (end - start) & ~7;
+		if (delta)
+			old_hop->hdrlen -= delta / 8;
+		pad = (end - start) & 7;
+		size = sizeof(*ip6_hdr) + start + pad;
+		calipso_pad_write((unsigned char *)old_hop, start, pad);
+	}
+
+	if (delta) {
+		skb_pull(skb, delta);
+		memmove((char *)ip6_hdr + delta, ip6_hdr, size);
+		skb_reset_network_header(skb);
+	}
+
+	return 0;
+}
+
 static const struct netlbl_calipso_ops ops = {
 	.doi_add          = calipso_doi_add,
 	.doi_free         = calipso_doi_free,
@@ -985,6 +1146,10 @@ static const struct netlbl_calipso_ops ops = {
 	.sock_delattr     = calipso_sock_delattr,
 	.req_setattr      = calipso_req_setattr,
 	.req_delattr      = calipso_req_delattr,
+	.opt_getattr      = calipso_opt_getattr,
+	.skbuff_optptr    = calipso_skbuff_optptr,
+	.skbuff_setattr   = calipso_skbuff_setattr,
+	.skbuff_delattr   = calipso_skbuff_delattr,
 };
 
 /**
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
index 79519e3a6a93..0d02c262dbf6 100644
--- a/net/netlabel/netlabel_calipso.c
+++ b/net/netlabel/netlabel_calipso.c
@@ -618,3 +618,85 @@ void calipso_req_delattr(struct request_sock *req)
 	if (ops)
 		ops->req_delattr(req);
 }
+
+/**
+ * calipso_optptr - Find the CALIPSO option in the packet
+ * @skb: the packet
+ *
+ * Description:
+ * Parse the packet's IP header looking for a CALIPSO option.  Returns a pointer
+ * to the start of the CALIPSO option on success, NULL if one if not found.
+ *
+ */
+unsigned char *calipso_optptr(const struct sk_buff *skb)
+{
+	unsigned char *ret_val = NULL;
+	const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+	if (ops)
+		ret_val = ops->skbuff_optptr(skb);
+	return ret_val;
+}
+
+/**
+ * calipso_getattr - Get the security attributes from a memory block.
+ * @calipso: the CALIPSO option
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Inspect @calipso and return the security attributes in @secattr.
+ * Returns zero on success and negative values on failure.
+ *
+ */
+int calipso_getattr(const unsigned char *calipso,
+		    struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -ENOMSG;
+	const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+	if (ops)
+		ret_val = ops->opt_getattr(calipso, secattr);
+	return ret_val;
+}
+
+/**
+ * calipso_skbuff_setattr - Set the CALIPSO option on a packet
+ * @skb: the packet
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Set the CALIPSO option on the given packet based on the security attributes.
+ * Returns a pointer to the IP header on success and NULL on failure.
+ *
+ */
+int calipso_skbuff_setattr(struct sk_buff *skb,
+			   const struct calipso_doi *doi_def,
+			   const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -ENOMSG;
+	const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+	if (ops)
+		ret_val = ops->skbuff_setattr(skb, doi_def, secattr);
+	return ret_val;
+}
+
+/**
+ * calipso_skbuff_delattr - Delete any CALIPSO options from a packet
+ * @skb: the packet
+ *
+ * Description:
+ * Removes any and all CALIPSO options from the given packet.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+int calipso_skbuff_delattr(struct sk_buff *skb)
+{
+	int ret_val = -ENOMSG;
+	const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+	if (ops)
+		ret_val = ops->skbuff_delattr(skb);
+	return ret_val;
+}
diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h
index 1372fdd86588..66ba92e9289f 100644
--- a/net/netlabel/netlabel_calipso.h
+++ b/net/netlabel/netlabel_calipso.h
@@ -137,5 +137,12 @@ int calipso_req_setattr(struct request_sock *req,
 			const struct calipso_doi *doi_def,
 			const struct netlbl_lsm_secattr *secattr);
 void calipso_req_delattr(struct request_sock *req);
+unsigned char *calipso_optptr(const struct sk_buff *skb);
+int calipso_getattr(const unsigned char *calipso,
+		    struct netlbl_lsm_secattr *secattr);
+int calipso_skbuff_setattr(struct sk_buff *skb,
+			   const struct calipso_doi *doi_def,
+			   const struct netlbl_lsm_secattr *secattr);
+int calipso_skbuff_delattr(struct sk_buff *skb);
 
 #endif
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 9b725f75c750..c50a1c5d1e1a 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -1147,13 +1147,17 @@ int netlbl_skbuff_setattr(struct sk_buff *skb,
 {
 	int ret_val;
 	struct iphdr *hdr4;
+#if IS_ENABLED(CONFIG_IPV6)
+	struct ipv6hdr *hdr6;
+#endif
 	struct netlbl_dommap_def *entry;
 
 	rcu_read_lock();
 	switch (family) {
 	case AF_INET:
 		hdr4 = ip_hdr(skb);
-		entry = netlbl_domhsh_getentry_af4(secattr->domain,hdr4->daddr);
+		entry = netlbl_domhsh_getentry_af4(secattr->domain,
+						   hdr4->daddr);
 		if (entry == NULL) {
 			ret_val = -ENOENT;
 			goto skbuff_setattr_return;
@@ -1174,9 +1178,26 @@ int netlbl_skbuff_setattr(struct sk_buff *skb,
 		break;
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
-		/* since we don't support any IPv6 labeling protocols right
-		 * now we can optimize everything away until we do */
-		ret_val = 0;
+		hdr6 = ipv6_hdr(skb);
+		entry = netlbl_domhsh_getentry_af6(secattr->domain,
+						   &hdr6->daddr);
+		if (entry == NULL) {
+			ret_val = -ENOENT;
+			goto skbuff_setattr_return;
+		}
+		switch (entry->type) {
+		case NETLBL_NLTYPE_CALIPSO:
+			ret_val = calipso_skbuff_setattr(skb, entry->calipso,
+							 secattr);
+			break;
+		case NETLBL_NLTYPE_UNLABELED:
+			/* just delete the protocols we support for right now
+			 * but we could remove other protocols if needed */
+			ret_val = calipso_skbuff_delattr(skb);
+			break;
+		default:
+			ret_val = -ENOENT;
+		}
 		break;
 #endif /* IPv6 */
 	default:
@@ -1215,6 +1236,9 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb,
 		break;
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
+		ptr = calipso_optptr(skb);
+		if (ptr && calipso_getattr(ptr, secattr) == 0)
+			return 0;
 		break;
 #endif /* IPv6 */
 	}
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index a00ab81ab719..cb7c5c8028e7 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -5063,6 +5063,15 @@ static unsigned int selinux_ipv4_output(void *priv,
 	return selinux_ip_output(skb, PF_INET);
 }
 
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static unsigned int selinux_ipv6_output(void *priv,
+					struct sk_buff *skb,
+					const struct nf_hook_state *state)
+{
+	return selinux_ip_output(skb, PF_INET6);
+}
+#endif	/* IPV6 */
+
 static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb,
 						int ifindex,
 						u16 family)
@@ -6297,6 +6306,12 @@ static struct nf_hook_ops selinux_nf_ops[] = {
 		.hooknum =	NF_INET_FORWARD,
 		.priority =	NF_IP6_PRI_SELINUX_FIRST,
 	},
+	{
+		.hook =		selinux_ipv6_output,
+		.pf =		NFPROTO_IPV6,
+		.hooknum =	NF_INET_LOCAL_OUT,
+		.priority =	NF_IP6_PRI_SELINUX_FIRST,
+	},
 #endif	/* IPV6 */
 };
 
-- 
cgit v1.2.3


From a04e71f631fa3d2fd2aa0404c11484739d1e9073 Mon Sep 17 00:00:00 2001
From: Huw Davies <huw@codeweavers.com>
Date: Mon, 27 Jun 2016 15:06:16 -0400
Subject: netlabel: Pass a family parameter to netlbl_skbuff_err().

This makes it possible to route the error to the appropriate
labelling engine.  CALIPSO is far less verbose than CIPSO
when encountering a bogus packet, so there is no need for a
CALIPSO error handler.

Signed-off-by: Huw Davies <huw@codeweavers.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/net/netlabel.h              |  2 +-
 net/netlabel/netlabel_kapi.c        | 11 ++++++++---
 security/selinux/hooks.c            |  6 +++---
 security/selinux/include/netlabel.h |  4 +++-
 security/selinux/netlabel.c         |  6 +++---
 security/smack/smack_lsm.c          |  2 +-
 6 files changed, 19 insertions(+), 12 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index e0e4ce8f22af..d8a46a8ed512 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -488,7 +488,7 @@ int netlbl_skbuff_setattr(struct sk_buff *skb,
 int netlbl_skbuff_getattr(const struct sk_buff *skb,
 			  u16 family,
 			  struct netlbl_lsm_secattr *secattr);
-void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway);
+void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway);
 
 /*
  * LSM label mapping cache operations
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index c50a1c5d1e1a..a42ce3c15d70 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -1249,6 +1249,7 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb,
 /**
  * netlbl_skbuff_err - Handle a LSM error on a sk_buff
  * @skb: the packet
+ * @family: the family
  * @error: the error code
  * @gateway: true if host is acting as a gateway, false otherwise
  *
@@ -1258,10 +1259,14 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb,
  * according to the packet's labeling protocol.
  *
  */
-void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway)
+void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway)
 {
-	if (cipso_v4_optptr(skb))
-		cipso_v4_error(skb, error, gateway);
+	switch (family) {
+	case AF_INET:
+		if (cipso_v4_optptr(skb))
+			cipso_v4_error(skb, error, gateway);
+		break;
+	}
 }
 
 /**
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index cb7c5c8028e7..51eafe5d3bf4 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4603,13 +4603,13 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		err = selinux_inet_sys_rcv_skb(sock_net(sk), skb->skb_iif,
 					       addrp, family, peer_sid, &ad);
 		if (err) {
-			selinux_netlbl_err(skb, err, 0);
+			selinux_netlbl_err(skb, family, err, 0);
 			return err;
 		}
 		err = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER,
 				   PEER__RECV, &ad);
 		if (err) {
-			selinux_netlbl_err(skb, err, 0);
+			selinux_netlbl_err(skb, family, err, 0);
 			return err;
 		}
 	}
@@ -4977,7 +4977,7 @@ static unsigned int selinux_ip_forward(struct sk_buff *skb,
 		err = selinux_inet_sys_rcv_skb(dev_net(indev), indev->ifindex,
 					       addrp, family, peer_sid, &ad);
 		if (err) {
-			selinux_netlbl_err(skb, err, 1);
+			selinux_netlbl_err(skb, family, err, 1);
 			return NF_DROP;
 		}
 	}
diff --git a/security/selinux/include/netlabel.h b/security/selinux/include/netlabel.h
index 8c59b8f150e8..75686d53df07 100644
--- a/security/selinux/include/netlabel.h
+++ b/security/selinux/include/netlabel.h
@@ -40,7 +40,8 @@
 #ifdef CONFIG_NETLABEL
 void selinux_netlbl_cache_invalidate(void);
 
-void selinux_netlbl_err(struct sk_buff *skb, int error, int gateway);
+void selinux_netlbl_err(struct sk_buff *skb, u16 family, int error,
+			int gateway);
 
 void selinux_netlbl_sk_security_free(struct sk_security_struct *sksec);
 void selinux_netlbl_sk_security_reset(struct sk_security_struct *sksec);
@@ -72,6 +73,7 @@ static inline void selinux_netlbl_cache_invalidate(void)
 }
 
 static inline void selinux_netlbl_err(struct sk_buff *skb,
+				      u16 family,
 				      int error,
 				      int gateway)
 {
diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c
index ca220c3fbcf9..dfca50dc292a 100644
--- a/security/selinux/netlabel.c
+++ b/security/selinux/netlabel.c
@@ -151,9 +151,9 @@ void selinux_netlbl_cache_invalidate(void)
  * present on the packet, NetLabel is smart enough to only act when it should.
  *
  */
-void selinux_netlbl_err(struct sk_buff *skb, int error, int gateway)
+void selinux_netlbl_err(struct sk_buff *skb, u16 family, int error, int gateway)
 {
-	netlbl_skbuff_err(skb, error, gateway);
+	netlbl_skbuff_err(skb, family, error, gateway);
 }
 
 /**
@@ -405,7 +405,7 @@ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
 		return 0;
 
 	if (nlbl_sid != SECINITSID_UNLABELED)
-		netlbl_skbuff_err(skb, rc, 0);
+		netlbl_skbuff_err(skb, family, rc, 0);
 	return rc;
 }
 
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 11f79013ae1f..292fdeaaec7e 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -3992,7 +3992,7 @@ access_check:
 		rc = smk_bu_note("IPv4 delivery", skp, ssp->smk_in,
 					MAY_WRITE, rc);
 		if (rc != 0)
-			netlbl_skbuff_err(skb, rc, 0);
+			netlbl_skbuff_err(skb, sk->sk_family, rc, 0);
 		break;
 #if IS_ENABLED(CONFIG_IPV6)
 	case PF_INET6:
-- 
cgit v1.2.3


From 2e532b702834c07f614caf4489feb691e713232a Mon Sep 17 00:00:00 2001
From: Huw Davies <huw@codeweavers.com>
Date: Mon, 27 Jun 2016 15:06:17 -0400
Subject: calipso: Add validation of CALIPSO option.

Lengths, checksum and the DOI are checked.  Checking of the
level and categories are left for the socket layer.

CRC validation is performed in the calipso module to avoid
unconditionally linking crc_ccitt() into ipv6.

Signed-off-by: Huw Davies <huw@codeweavers.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/net/calipso.h |  6 ++++++
 net/ipv6/calipso.c    | 41 +++++++++++++++++++++++++++++++++++++++++
 net/ipv6/exthdrs.c    | 27 +++++++++++++++++++++++++++
 3 files changed, 74 insertions(+)

(limited to 'include/net')

diff --git a/include/net/calipso.h b/include/net/calipso.h
index 38dbb4707150..85404e2375d8 100644
--- a/include/net/calipso.h
+++ b/include/net/calipso.h
@@ -65,6 +65,7 @@ struct calipso_doi {
 #ifdef CONFIG_NETLABEL
 int __init calipso_init(void);
 void calipso_exit(void);
+bool calipso_validate(const struct sk_buff *skb, const unsigned char *option);
 #else
 static inline int __init calipso_init(void)
 {
@@ -74,6 +75,11 @@ static inline int __init calipso_init(void)
 static inline void calipso_exit(void)
 {
 }
+static inline bool calipso_validate(const struct sk_buff *skb,
+				    const unsigned char *option)
+{
+	return true;
+}
 #endif /* CONFIG_NETLABEL */
 
 #endif /* _CALIPSO_H */
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index fa371a8827cf..ea80450efe56 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -320,6 +320,47 @@ doi_walk_return:
 	return ret_val;
 }
 
+/**
+ * calipso_validate - Validate a CALIPSO option
+ * @skb: the packet
+ * @option: the start of the option
+ *
+ * Description:
+ * This routine is called to validate a CALIPSO option.
+ * If the option is valid then %true is returned, otherwise
+ * %false is returned.
+ *
+ * The caller should have already checked that the length of the
+ * option (including the TLV header) is >= 10 and that the catmap
+ * length is consistent with the option length.
+ *
+ * We leave checks on the level and categories to the socket layer.
+ */
+bool calipso_validate(const struct sk_buff *skb, const unsigned char *option)
+{
+	struct calipso_doi *doi_def;
+	bool ret_val;
+	u16 crc, len = option[1] + 2;
+	static const u8 zero[2];
+
+	/* The original CRC runs over the option including the TLV header
+	 * with the CRC-16 field (at offset 8) zeroed out. */
+	crc = crc_ccitt(0xffff, option, 8);
+	crc = crc_ccitt(crc, zero, sizeof(zero));
+	if (len > 10)
+		crc = crc_ccitt(crc, option + 10, len - 10);
+	crc = ~crc;
+	if (option[8] != (crc & 0xff) || option[9] != ((crc >> 8) & 0xff))
+		return false;
+
+	rcu_read_lock();
+	doi_def = calipso_doi_search(get_unaligned_be32(option + 2));
+	ret_val = !!doi_def;
+	rcu_read_unlock();
+
+	return ret_val;
+}
+
 /**
  * calipso_map_cat_hton - Perform a category mapping from host to network
  * @doi_def: the DOI definition
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index d5fd3e799f86..0f69cab39986 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -43,6 +43,7 @@
 #include <net/ndisc.h>
 #include <net/ip6_route.h>
 #include <net/addrconf.h>
+#include <net/calipso.h>
 #if IS_ENABLED(CONFIG_IPV6_MIP6)
 #include <net/xfrm.h>
 #endif
@@ -603,6 +604,28 @@ drop:
 	return false;
 }
 
+/* CALIPSO RFC 5570 */
+
+static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff)
+{
+	const unsigned char *nh = skb_network_header(skb);
+
+	if (nh[optoff + 1] < 8)
+		goto drop;
+
+	if (nh[optoff + 6] * 4 + 8 > nh[optoff + 1])
+		goto drop;
+
+	if (!calipso_validate(skb, nh + optoff))
+		goto drop;
+
+	return true;
+
+drop:
+	kfree_skb(skb);
+	return false;
+}
+
 static const struct tlvtype_proc tlvprochopopt_lst[] = {
 	{
 		.type	= IPV6_TLV_ROUTERALERT,
@@ -612,6 +635,10 @@ static const struct tlvtype_proc tlvprochopopt_lst[] = {
 		.type	= IPV6_TLV_JUMBO,
 		.func	= ipv6_hop_jumbo,
 	},
+	{
+		.type	= IPV6_TLV_CALIPSO,
+		.func	= ipv6_hop_calipso,
+	},
 	{ -1, }
 };
 
-- 
cgit v1.2.3


From 4fee5242bf41d9ad641d4c1b821e36eb7ba37fbf Mon Sep 17 00:00:00 2001
From: Huw Davies <huw@codeweavers.com>
Date: Mon, 27 Jun 2016 15:06:17 -0400
Subject: calipso: Add a label cache.

This works in exactly the same way as the CIPSO label cache.
The idea is to allow the lsm to cache the result of a secattr
lookup so that it doesn't need to perform the lookup for
every skbuff.

It introduces two sysctl controls:
 calipso_cache_enable - enables/disables the cache.
 calipso_cache_bucket_size - sets the size of a cache bucket.

Signed-off-by: Huw Davies <huw@codeweavers.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/net/calipso.h           |   6 +
 include/net/netlabel.h          |   9 +-
 net/ipv6/calipso.c              | 264 +++++++++++++++++++++++++++++++++++++++-
 net/ipv6/sysctl_net_ipv6.c      |  19 +++
 net/netlabel/netlabel_calipso.c |  38 ++++++
 net/netlabel/netlabel_calipso.h |   3 +
 net/netlabel/netlabel_kapi.c    |  24 +++-
 security/selinux/netlabel.c     |   9 +-
 8 files changed, 360 insertions(+), 12 deletions(-)

(limited to 'include/net')

diff --git a/include/net/calipso.h b/include/net/calipso.h
index 85404e2375d8..b1b30cd36601 100644
--- a/include/net/calipso.h
+++ b/include/net/calipso.h
@@ -62,6 +62,12 @@ struct calipso_doi {
 	struct rcu_head rcu;
 };
 
+/*
+ * Sysctl Variables
+ */
+extern int calipso_cache_enabled;
+extern int calipso_cache_bucketsize;
+
 #ifdef CONFIG_NETLABEL
 int __init calipso_init(void);
 void calipso_exit(void);
diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index d8a46a8ed512..a306bc7d2642 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -235,6 +235,8 @@ struct netlbl_lsm_secattr {
  * @skbuff_optptr: find option in packet
  * @skbuff_setattr: set the skbuff's attr
  * @skbuff_delattr: remove the skbuff's attr
+ * @cache_invalidate: invalidate cache
+ * @cache_add: add cache entry
  *
  * Description:
  * This structure is filled out by the CALIPSO engine and passed
@@ -269,6 +271,9 @@ struct netlbl_calipso_ops {
 			      const struct calipso_doi *doi_def,
 			      const struct netlbl_lsm_secattr *secattr);
 	int (*skbuff_delattr)(struct sk_buff *skb);
+	void (*cache_invalidate)(void);
+	int (*cache_add)(const unsigned char *calipso_ptr,
+			 const struct netlbl_lsm_secattr *secattr);
 };
 
 /*
@@ -494,7 +499,7 @@ void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway);
  * LSM label mapping cache operations
  */
 void netlbl_cache_invalidate(void);
-int netlbl_cache_add(const struct sk_buff *skb,
+int netlbl_cache_add(const struct sk_buff *skb, u16 family,
 		     const struct netlbl_lsm_secattr *secattr);
 
 /*
@@ -647,7 +652,7 @@ static inline void netlbl_cache_invalidate(void)
 {
 	return;
 }
-static inline int netlbl_cache_add(const struct sk_buff *skb,
+static inline int netlbl_cache_add(const struct sk_buff *skb, u16 family,
 				   const struct netlbl_lsm_secattr *secattr)
 {
 	return 0;
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index ea80450efe56..c53b92c617c5 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -72,6 +72,255 @@
 static DEFINE_SPINLOCK(calipso_doi_list_lock);
 static LIST_HEAD(calipso_doi_list);
 
+/* Label mapping cache */
+int calipso_cache_enabled = 1;
+int calipso_cache_bucketsize = 10;
+#define CALIPSO_CACHE_BUCKETBITS     7
+#define CALIPSO_CACHE_BUCKETS        BIT(CALIPSO_CACHE_BUCKETBITS)
+#define CALIPSO_CACHE_REORDERLIMIT   10
+struct calipso_map_cache_bkt {
+	spinlock_t lock;
+	u32 size;
+	struct list_head list;
+};
+
+struct calipso_map_cache_entry {
+	u32 hash;
+	unsigned char *key;
+	size_t key_len;
+
+	struct netlbl_lsm_cache *lsm_data;
+
+	u32 activity;
+	struct list_head list;
+};
+
+static struct calipso_map_cache_bkt *calipso_cache;
+
+/* Label Mapping Cache Functions
+ */
+
+/**
+ * calipso_cache_entry_free - Frees a cache entry
+ * @entry: the entry to free
+ *
+ * Description:
+ * This function frees the memory associated with a cache entry including the
+ * LSM cache data if there are no longer any users, i.e. reference count == 0.
+ *
+ */
+static void calipso_cache_entry_free(struct calipso_map_cache_entry *entry)
+{
+	if (entry->lsm_data)
+		netlbl_secattr_cache_free(entry->lsm_data);
+	kfree(entry->key);
+	kfree(entry);
+}
+
+/**
+ * calipso_map_cache_hash - Hashing function for the CALIPSO cache
+ * @key: the hash key
+ * @key_len: the length of the key in bytes
+ *
+ * Description:
+ * The CALIPSO tag hashing function.  Returns a 32-bit hash value.
+ *
+ */
+static u32 calipso_map_cache_hash(const unsigned char *key, u32 key_len)
+{
+	return jhash(key, key_len, 0);
+}
+
+/**
+ * calipso_cache_init - Initialize the CALIPSO cache
+ *
+ * Description:
+ * Initializes the CALIPSO label mapping cache, this function should be called
+ * before any of the other functions defined in this file.  Returns zero on
+ * success, negative values on error.
+ *
+ */
+static int __init calipso_cache_init(void)
+{
+	u32 iter;
+
+	calipso_cache = kcalloc(CALIPSO_CACHE_BUCKETS,
+				sizeof(struct calipso_map_cache_bkt),
+				GFP_KERNEL);
+	if (!calipso_cache)
+		return -ENOMEM;
+
+	for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) {
+		spin_lock_init(&calipso_cache[iter].lock);
+		calipso_cache[iter].size = 0;
+		INIT_LIST_HEAD(&calipso_cache[iter].list);
+	}
+
+	return 0;
+}
+
+/**
+ * calipso_cache_invalidate - Invalidates the current CALIPSO cache
+ *
+ * Description:
+ * Invalidates and frees any entries in the CALIPSO cache.  Returns zero on
+ * success and negative values on failure.
+ *
+ */
+static void calipso_cache_invalidate(void)
+{
+	struct calipso_map_cache_entry *entry, *tmp_entry;
+	u32 iter;
+
+	for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) {
+		spin_lock_bh(&calipso_cache[iter].lock);
+		list_for_each_entry_safe(entry,
+					 tmp_entry,
+					 &calipso_cache[iter].list, list) {
+			list_del(&entry->list);
+			calipso_cache_entry_free(entry);
+		}
+		calipso_cache[iter].size = 0;
+		spin_unlock_bh(&calipso_cache[iter].lock);
+	}
+}
+
+/**
+ * calipso_cache_check - Check the CALIPSO cache for a label mapping
+ * @key: the buffer to check
+ * @key_len: buffer length in bytes
+ * @secattr: the security attribute struct to use
+ *
+ * Description:
+ * This function checks the cache to see if a label mapping already exists for
+ * the given key.  If there is a match then the cache is adjusted and the
+ * @secattr struct is populated with the correct LSM security attributes.  The
+ * cache is adjusted in the following manner if the entry is not already the
+ * first in the cache bucket:
+ *
+ *  1. The cache entry's activity counter is incremented
+ *  2. The previous (higher ranking) entry's activity counter is decremented
+ *  3. If the difference between the two activity counters is geater than
+ *     CALIPSO_CACHE_REORDERLIMIT the two entries are swapped
+ *
+ * Returns zero on success, -ENOENT for a cache miss, and other negative values
+ * on error.
+ *
+ */
+static int calipso_cache_check(const unsigned char *key,
+			       u32 key_len,
+			       struct netlbl_lsm_secattr *secattr)
+{
+	u32 bkt;
+	struct calipso_map_cache_entry *entry;
+	struct calipso_map_cache_entry *prev_entry = NULL;
+	u32 hash;
+
+	if (!calipso_cache_enabled)
+		return -ENOENT;
+
+	hash = calipso_map_cache_hash(key, key_len);
+	bkt = hash & (CALIPSO_CACHE_BUCKETS - 1);
+	spin_lock_bh(&calipso_cache[bkt].lock);
+	list_for_each_entry(entry, &calipso_cache[bkt].list, list) {
+		if (entry->hash == hash &&
+		    entry->key_len == key_len &&
+		    memcmp(entry->key, key, key_len) == 0) {
+			entry->activity += 1;
+			atomic_inc(&entry->lsm_data->refcount);
+			secattr->cache = entry->lsm_data;
+			secattr->flags |= NETLBL_SECATTR_CACHE;
+			secattr->type = NETLBL_NLTYPE_CALIPSO;
+			if (!prev_entry) {
+				spin_unlock_bh(&calipso_cache[bkt].lock);
+				return 0;
+			}
+
+			if (prev_entry->activity > 0)
+				prev_entry->activity -= 1;
+			if (entry->activity > prev_entry->activity &&
+			    entry->activity - prev_entry->activity >
+			    CALIPSO_CACHE_REORDERLIMIT) {
+				__list_del(entry->list.prev, entry->list.next);
+				__list_add(&entry->list,
+					   prev_entry->list.prev,
+					   &prev_entry->list);
+			}
+
+			spin_unlock_bh(&calipso_cache[bkt].lock);
+			return 0;
+		}
+		prev_entry = entry;
+	}
+	spin_unlock_bh(&calipso_cache[bkt].lock);
+
+	return -ENOENT;
+}
+
+/**
+ * calipso_cache_add - Add an entry to the CALIPSO cache
+ * @calipso_ptr: the CALIPSO option
+ * @secattr: the packet's security attributes
+ *
+ * Description:
+ * Add a new entry into the CALIPSO label mapping cache.  Add the new entry to
+ * head of the cache bucket's list, if the cache bucket is out of room remove
+ * the last entry in the list first.  It is important to note that there is
+ * currently no checking for duplicate keys.  Returns zero on success,
+ * negative values on failure.  The key stored starts at calipso_ptr + 2,
+ * i.e. the type and length bytes are not stored, this corresponds to
+ * calipso_ptr[1] bytes of data.
+ *
+ */
+static int calipso_cache_add(const unsigned char *calipso_ptr,
+			     const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -EPERM;
+	u32 bkt;
+	struct calipso_map_cache_entry *entry = NULL;
+	struct calipso_map_cache_entry *old_entry = NULL;
+	u32 calipso_ptr_len;
+
+	if (!calipso_cache_enabled || calipso_cache_bucketsize <= 0)
+		return 0;
+
+	calipso_ptr_len = calipso_ptr[1];
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry)
+		return -ENOMEM;
+	entry->key = kmemdup(calipso_ptr + 2, calipso_ptr_len, GFP_ATOMIC);
+	if (!entry->key) {
+		ret_val = -ENOMEM;
+		goto cache_add_failure;
+	}
+	entry->key_len = calipso_ptr_len;
+	entry->hash = calipso_map_cache_hash(calipso_ptr, calipso_ptr_len);
+	atomic_inc(&secattr->cache->refcount);
+	entry->lsm_data = secattr->cache;
+
+	bkt = entry->hash & (CALIPSO_CACHE_BUCKETS - 1);
+	spin_lock_bh(&calipso_cache[bkt].lock);
+	if (calipso_cache[bkt].size < calipso_cache_bucketsize) {
+		list_add(&entry->list, &calipso_cache[bkt].list);
+		calipso_cache[bkt].size += 1;
+	} else {
+		old_entry = list_entry(calipso_cache[bkt].list.prev,
+				       struct calipso_map_cache_entry, list);
+		list_del(&old_entry->list);
+		list_add(&entry->list, &calipso_cache[bkt].list);
+		calipso_cache_entry_free(old_entry);
+	}
+	spin_unlock_bh(&calipso_cache[bkt].lock);
+
+	return 0;
+
+cache_add_failure:
+	if (entry)
+		calipso_cache_entry_free(entry);
+	return ret_val;
+}
+
 /* DOI List Functions
  */
 
@@ -789,6 +1038,9 @@ static int calipso_opt_getattr(const unsigned char *calipso,
 	if (cat_len + 8 > len)
 		return -EINVAL;
 
+	if (calipso_cache_check(calipso + 2, calipso[1], secattr) == 0)
+		return 0;
+
 	doi = get_unaligned_be32(calipso + 2);
 	rcu_read_lock();
 	doi_def = calipso_doi_search(doi);
@@ -1191,6 +1443,8 @@ static const struct netlbl_calipso_ops ops = {
 	.skbuff_optptr    = calipso_skbuff_optptr,
 	.skbuff_setattr   = calipso_skbuff_setattr,
 	.skbuff_delattr   = calipso_skbuff_delattr,
+	.cache_invalidate = calipso_cache_invalidate,
+	.cache_add        = calipso_cache_add
 };
 
 /**
@@ -1203,11 +1457,17 @@ static const struct netlbl_calipso_ops ops = {
  */
 int __init calipso_init(void)
 {
-	netlbl_calipso_ops_register(&ops);
-	return 0;
+	int ret_val;
+
+	ret_val = calipso_cache_init();
+	if (!ret_val)
+		netlbl_calipso_ops_register(&ops);
+	return ret_val;
 }
 
 void calipso_exit(void)
 {
 	netlbl_calipso_ops_register(NULL);
+	calipso_cache_invalidate();
+	kfree(calipso_cache);
 }
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 45243bbe5253..69c50e737c54 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -15,6 +15,9 @@
 #include <net/ipv6.h>
 #include <net/addrconf.h>
 #include <net/inet_frag.h>
+#ifdef CONFIG_NETLABEL
+#include <net/calipso.h>
+#endif
 
 static int one = 1;
 static int auto_flowlabels_min;
@@ -106,6 +109,22 @@ static struct ctl_table ipv6_rotable[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one
 	},
+#ifdef CONFIG_NETLABEL
+	{
+		.procname	= "calipso_cache_enable",
+		.data		= &calipso_cache_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "calipso_cache_bucket_size",
+		.data		= &calipso_cache_bucketsize,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif /* CONFIG_NETLABEL */
 	{ }
 };
 
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
index 0d02c262dbf6..2ec93c5e77bb 100644
--- a/net/netlabel/netlabel_calipso.c
+++ b/net/netlabel/netlabel_calipso.c
@@ -700,3 +700,41 @@ int calipso_skbuff_delattr(struct sk_buff *skb)
 		ret_val = ops->skbuff_delattr(skb);
 	return ret_val;
 }
+
+/**
+ * calipso_cache_invalidate - Invalidates the current CALIPSO cache
+ *
+ * Description:
+ * Invalidates and frees any entries in the CALIPSO cache.  Returns zero on
+ * success and negative values on failure.
+ *
+ */
+void calipso_cache_invalidate(void)
+{
+	const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+	if (ops)
+		ops->cache_invalidate();
+}
+
+/**
+ * calipso_cache_add - Add an entry to the CALIPSO cache
+ * @calipso_ptr: the CALIPSO option
+ * @secattr: the packet's security attributes
+ *
+ * Description:
+ * Add a new entry into the CALIPSO label mapping cache.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+int calipso_cache_add(const unsigned char *calipso_ptr,
+		      const struct netlbl_lsm_secattr *secattr)
+
+{
+	int ret_val = -ENOMSG;
+	const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+	if (ops)
+		ret_val = ops->cache_add(calipso_ptr, secattr);
+	return ret_val;
+}
diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h
index 66ba92e9289f..9fd291cd0fc5 100644
--- a/net/netlabel/netlabel_calipso.h
+++ b/net/netlabel/netlabel_calipso.h
@@ -144,5 +144,8 @@ int calipso_skbuff_setattr(struct sk_buff *skb,
 			   const struct calipso_doi *doi_def,
 			   const struct netlbl_lsm_secattr *secattr);
 int calipso_skbuff_delattr(struct sk_buff *skb);
+void calipso_cache_invalidate(void);
+int calipso_cache_add(const unsigned char *calipso_ptr,
+		      const struct netlbl_lsm_secattr *secattr);
 
 #endif
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index a42ce3c15d70..fbad7187d4fc 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -1281,11 +1281,15 @@ void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway)
 void netlbl_cache_invalidate(void)
 {
 	cipso_v4_cache_invalidate();
+#if IS_ENABLED(CONFIG_IPV6)
+	calipso_cache_invalidate();
+#endif /* IPv6 */
 }
 
 /**
  * netlbl_cache_add - Add an entry to a NetLabel protocol cache
  * @skb: the packet
+ * @family: the family
  * @secattr: the packet's security attributes
  *
  * Description:
@@ -1294,7 +1298,7 @@ void netlbl_cache_invalidate(void)
  * values on error.
  *
  */
-int netlbl_cache_add(const struct sk_buff *skb,
+int netlbl_cache_add(const struct sk_buff *skb, u16 family,
 		     const struct netlbl_lsm_secattr *secattr)
 {
 	unsigned char *ptr;
@@ -1302,10 +1306,20 @@ int netlbl_cache_add(const struct sk_buff *skb,
 	if ((secattr->flags & NETLBL_SECATTR_CACHE) == 0)
 		return -ENOMSG;
 
-	ptr = cipso_v4_optptr(skb);
-	if (ptr)
-		return cipso_v4_cache_add(ptr, secattr);
-
+	switch (family) {
+	case AF_INET:
+		ptr = cipso_v4_optptr(skb);
+		if (ptr)
+			return cipso_v4_cache_add(ptr, secattr);
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		ptr = calipso_optptr(skb);
+		if (ptr)
+			return calipso_cache_add(ptr, secattr);
+		break;
+#endif /* IPv6 */
+	}
 	return -ENOMSG;
 }
 
diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c
index dfca50dc292a..aaba6677ee2e 100644
--- a/security/selinux/netlabel.c
+++ b/security/selinux/netlabel.c
@@ -54,6 +54,7 @@
  *
  */
 static int selinux_netlbl_sidlookup_cached(struct sk_buff *skb,
+					   u16 family,
 					   struct netlbl_lsm_secattr *secattr,
 					   u32 *sid)
 {
@@ -63,7 +64,7 @@ static int selinux_netlbl_sidlookup_cached(struct sk_buff *skb,
 	if (rc == 0 &&
 	    (secattr->flags & NETLBL_SECATTR_CACHEABLE) &&
 	    (secattr->flags & NETLBL_SECATTR_CACHE))
-		netlbl_cache_add(skb, secattr);
+		netlbl_cache_add(skb, family, secattr);
 
 	return rc;
 }
@@ -214,7 +215,8 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
 	netlbl_secattr_init(&secattr);
 	rc = netlbl_skbuff_getattr(skb, family, &secattr);
 	if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
-		rc = selinux_netlbl_sidlookup_cached(skb, &secattr, sid);
+		rc = selinux_netlbl_sidlookup_cached(skb, family,
+						     &secattr, sid);
 	else
 		*sid = SECSID_NULL;
 	*type = secattr.type;
@@ -382,7 +384,8 @@ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
 	netlbl_secattr_init(&secattr);
 	rc = netlbl_skbuff_getattr(skb, family, &secattr);
 	if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
-		rc = selinux_netlbl_sidlookup_cached(skb, &secattr, &nlbl_sid);
+		rc = selinux_netlbl_sidlookup_cached(skb, family,
+						     &secattr, &nlbl_sid);
 	else
 		nlbl_sid = SECINITSID_UNLABELED;
 	netlbl_secattr_destroy(&secattr);
-- 
cgit v1.2.3


From 3f09354ac84c6904787189d85fb306bf60f714b8 Mon Sep 17 00:00:00 2001
From: Huw Davies <huw@codeweavers.com>
Date: Mon, 27 Jun 2016 15:06:18 -0400
Subject: netlabel: Implement CALIPSO config functions for SMACK.

SMACK uses similar functions to control CIPSO, these are
the equivalent functions for CALIPSO and follow exactly
the same semantics.

int netlbl_cfg_calipso_add(struct calipso_doi *doi_def,
                           struct netlbl_audit *audit_info)
    Adds a CALIPSO doi.

void netlbl_cfg_calipso_del(u32 doi, struct netlbl_audit *audit_info)
    Removes a CALIPSO doi.

int netlbl_cfg_calipso_map_add(u32 doi, const char *domain,
                               const struct in6_addr *addr,
                               const struct in6_addr *mask,
                               struct netlbl_audit *audit_info)
    Creates a mapping between a domain and a CALIPSO doi.  If
    addr and mask are non-NULL this creates an address-selector
    type mapping.

This also extends netlbl_cfg_map_del() to remove IPv6 address-selector
mappings.

Signed-off-by: Huw Davies <huw@codeweavers.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/net/netlabel.h             |  26 +++++++
 net/netlabel/netlabel_domainhash.c |  66 ++++++++++++++++++
 net/netlabel/netlabel_domainhash.h |   8 +++
 net/netlabel/netlabel_kapi.c       | 138 +++++++++++++++++++++++++++++++++++++
 4 files changed, 238 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index a306bc7d2642..efe98068880f 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -445,6 +445,14 @@ int netlbl_cfg_cipsov4_map_add(u32 doi,
 			       const struct in_addr *addr,
 			       const struct in_addr *mask,
 			       struct netlbl_audit *audit_info);
+int netlbl_cfg_calipso_add(struct calipso_doi *doi_def,
+			   struct netlbl_audit *audit_info);
+void netlbl_cfg_calipso_del(u32 doi, struct netlbl_audit *audit_info);
+int netlbl_cfg_calipso_map_add(u32 doi,
+			       const char *domain,
+			       const struct in6_addr *addr,
+			       const struct in6_addr *mask,
+			       struct netlbl_audit *audit_info);
 /*
  * LSM security attribute operations
  */
@@ -561,6 +569,24 @@ static inline int netlbl_cfg_cipsov4_map_add(u32 doi,
 {
 	return -ENOSYS;
 }
+static inline int netlbl_cfg_calipso_add(struct calipso_doi *doi_def,
+					 struct netlbl_audit *audit_info)
+{
+	return -ENOSYS;
+}
+static inline void netlbl_cfg_calipso_del(u32 doi,
+					  struct netlbl_audit *audit_info)
+{
+	return;
+}
+static inline int netlbl_cfg_calipso_map_add(u32 doi,
+					     const char *domain,
+					     const struct in6_addr *addr,
+					     const struct in6_addr *mask,
+					     struct netlbl_audit *audit_info)
+{
+	return -ENOSYS;
+}
 static inline int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap,
 				     u32 offset)
 {
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index f1dee0d5e5e2..41d0e95d171e 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -725,6 +725,72 @@ remove_af4_failure:
 	return -ENOENT;
 }
 
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * netlbl_domhsh_remove_af6 - Removes an address selector entry
+ * @domain: the domain
+ * @addr: IPv6 address
+ * @mask: IPv6 address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes an individual address selector from a domain mapping and potentially
+ * the entire mapping if it is empty.  Returns zero on success, negative values
+ * on failure.
+ *
+ */
+int netlbl_domhsh_remove_af6(const char *domain,
+			     const struct in6_addr *addr,
+			     const struct in6_addr *mask,
+			     struct netlbl_audit *audit_info)
+{
+	struct netlbl_dom_map *entry_map;
+	struct netlbl_af6list *entry_addr;
+	struct netlbl_af4list *iter4;
+	struct netlbl_af6list *iter6;
+	struct netlbl_domaddr6_map *entry;
+
+	rcu_read_lock();
+
+	if (domain)
+		entry_map = netlbl_domhsh_search(domain, AF_INET6);
+	else
+		entry_map = netlbl_domhsh_search_def(domain, AF_INET6);
+	if (entry_map == NULL ||
+	    entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT)
+		goto remove_af6_failure;
+
+	spin_lock(&netlbl_domhsh_lock);
+	entry_addr = netlbl_af6list_remove(addr, mask,
+					   &entry_map->def.addrsel->list6);
+	spin_unlock(&netlbl_domhsh_lock);
+
+	if (entry_addr == NULL)
+		goto remove_af6_failure;
+	netlbl_af4list_foreach_rcu(iter4, &entry_map->def.addrsel->list4)
+		goto remove_af6_single_addr;
+	netlbl_af6list_foreach_rcu(iter6, &entry_map->def.addrsel->list6)
+		goto remove_af6_single_addr;
+	/* the domain mapping is empty so remove it from the mapping table */
+	netlbl_domhsh_remove_entry(entry_map, audit_info);
+
+remove_af6_single_addr:
+	rcu_read_unlock();
+	/* yick, we can't use call_rcu here because we don't have a rcu head
+	 * pointer but hopefully this should be a rare case so the pause
+	 * shouldn't be a problem */
+	synchronize_rcu();
+	entry = netlbl_domhsh_addr6_entry(entry_addr);
+	calipso_doi_putdef(entry->def.calipso);
+	kfree(entry);
+	return 0;
+
+remove_af6_failure:
+	rcu_read_unlock();
+	return -ENOENT;
+}
+#endif /* IPv6 */
+
 /**
  * netlbl_domhsh_remove - Removes an entry from the domain hash table
  * @domain: the domain to remove
diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h
index b7ddb6e5c53f..1f9247781927 100644
--- a/net/netlabel/netlabel_domainhash.h
+++ b/net/netlabel/netlabel_domainhash.h
@@ -93,6 +93,10 @@ int netlbl_domhsh_remove_af4(const char *domain,
 			     const struct in_addr *addr,
 			     const struct in_addr *mask,
 			     struct netlbl_audit *audit_info);
+int netlbl_domhsh_remove_af6(const char *domain,
+			     const struct in6_addr *addr,
+			     const struct in6_addr *mask,
+			     struct netlbl_audit *audit_info);
 int netlbl_domhsh_remove(const char *domain, u16 family,
 			 struct netlbl_audit *audit_info);
 int netlbl_domhsh_remove_default(u16 family, struct netlbl_audit *audit_info);
@@ -102,6 +106,10 @@ struct netlbl_dommap_def *netlbl_domhsh_getentry_af4(const char *domain,
 #if IS_ENABLED(CONFIG_IPV6)
 struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain,
 						   const struct in6_addr *addr);
+int netlbl_domhsh_remove_af6(const char *domain,
+			     const struct in6_addr *addr,
+			     const struct in6_addr *mask,
+			     struct netlbl_audit *audit_info);
 #endif /* IPv6 */
 
 int netlbl_domhsh_walk(u32 *skip_bkt,
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index fbad7187d4fc..28c56b95fb7f 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -80,6 +80,11 @@ int netlbl_cfg_map_del(const char *domain,
 		case AF_INET:
 			return netlbl_domhsh_remove_af4(domain, addr, mask,
 							audit_info);
+#if IS_ENABLED(CONFIG_IPV6)
+		case AF_INET6:
+			return netlbl_domhsh_remove_af6(domain, addr, mask,
+							audit_info);
+#endif /* IPv6 */
 		default:
 			return -EPFNOSUPPORT;
 		}
@@ -403,6 +408,139 @@ out_entry:
 	return ret_val;
 }
 
+/**
+ * netlbl_cfg_calipso_add - Add a new CALIPSO DOI definition
+ * @doi_def: CALIPSO DOI definition
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Add a new CALIPSO DOI definition as defined by @doi_def.  Returns zero on
+ * success and negative values on failure.
+ *
+ */
+int netlbl_cfg_calipso_add(struct calipso_doi *doi_def,
+			   struct netlbl_audit *audit_info)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	return calipso_doi_add(doi_def, audit_info);
+#else /* IPv6 */
+	return -ENOSYS;
+#endif /* IPv6 */
+}
+
+/**
+ * netlbl_cfg_calipso_del - Remove an existing CALIPSO DOI definition
+ * @doi: CALIPSO DOI
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Remove an existing CALIPSO DOI definition matching @doi.  Returns zero on
+ * success and negative values on failure.
+ *
+ */
+void netlbl_cfg_calipso_del(u32 doi, struct netlbl_audit *audit_info)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	calipso_doi_remove(doi, audit_info);
+#endif /* IPv6 */
+}
+
+/**
+ * netlbl_cfg_calipso_map_add - Add a new CALIPSO DOI mapping
+ * @doi: the CALIPSO DOI
+ * @domain: the domain mapping to add
+ * @addr: IP address
+ * @mask: IP address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Add a new NetLabel/LSM domain mapping for the given CALIPSO DOI to the
+ * NetLabel subsystem.  A @domain value of NULL adds a new default domain
+ * mapping.  Returns zero on success, negative values on failure.
+ *
+ */
+int netlbl_cfg_calipso_map_add(u32 doi,
+			       const char *domain,
+			       const struct in6_addr *addr,
+			       const struct in6_addr *mask,
+			       struct netlbl_audit *audit_info)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	int ret_val = -ENOMEM;
+	struct calipso_doi *doi_def;
+	struct netlbl_dom_map *entry;
+	struct netlbl_domaddr_map *addrmap = NULL;
+	struct netlbl_domaddr6_map *addrinfo = NULL;
+
+	doi_def = calipso_doi_getdef(doi);
+	if (doi_def == NULL)
+		return -ENOENT;
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL)
+		goto out_entry;
+	entry->family = AF_INET6;
+	if (domain != NULL) {
+		entry->domain = kstrdup(domain, GFP_ATOMIC);
+		if (entry->domain == NULL)
+			goto out_domain;
+	}
+
+	if (addr == NULL && mask == NULL) {
+		entry->def.calipso = doi_def;
+		entry->def.type = NETLBL_NLTYPE_CALIPSO;
+	} else if (addr != NULL && mask != NULL) {
+		addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC);
+		if (addrmap == NULL)
+			goto out_addrmap;
+		INIT_LIST_HEAD(&addrmap->list4);
+		INIT_LIST_HEAD(&addrmap->list6);
+
+		addrinfo = kzalloc(sizeof(*addrinfo), GFP_ATOMIC);
+		if (addrinfo == NULL)
+			goto out_addrinfo;
+		addrinfo->def.calipso = doi_def;
+		addrinfo->def.type = NETLBL_NLTYPE_CALIPSO;
+		addrinfo->list.addr = *addr;
+		addrinfo->list.addr.s6_addr32[0] &= mask->s6_addr32[0];
+		addrinfo->list.addr.s6_addr32[1] &= mask->s6_addr32[1];
+		addrinfo->list.addr.s6_addr32[2] &= mask->s6_addr32[2];
+		addrinfo->list.addr.s6_addr32[3] &= mask->s6_addr32[3];
+		addrinfo->list.mask = *mask;
+		addrinfo->list.valid = 1;
+		ret_val = netlbl_af6list_add(&addrinfo->list, &addrmap->list6);
+		if (ret_val != 0)
+			goto cfg_calipso_map_add_failure;
+
+		entry->def.addrsel = addrmap;
+		entry->def.type = NETLBL_NLTYPE_ADDRSELECT;
+	} else {
+		ret_val = -EINVAL;
+		goto out_addrmap;
+	}
+
+	ret_val = netlbl_domhsh_add(entry, audit_info);
+	if (ret_val != 0)
+		goto cfg_calipso_map_add_failure;
+
+	return 0;
+
+cfg_calipso_map_add_failure:
+	kfree(addrinfo);
+out_addrinfo:
+	kfree(addrmap);
+out_addrmap:
+	kfree(entry->domain);
+out_domain:
+	kfree(entry);
+out_entry:
+	calipso_doi_putdef(doi_def);
+	return ret_val;
+#else /* IPv6 */
+	return -ENOSYS;
+#endif /* IPv6 */
+}
+
 /*
  * Security Attribute Functions
  */
-- 
cgit v1.2.3


From 80e73cc563c4359be809a03bcb8e7e28141a813a Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Tue, 28 Jun 2016 16:57:05 +0200
Subject: net: rtnetlink: add support for the IFLA_STATS_LINK_XSTATS_SLAVE
 attribute

This patch adds support for the IFLA_STATS_LINK_XSTATS_SLAVE attribute
which allows to export per-slave statistics if the master device supports
the linkxstats callback. The attribute is passed down to the linkxstats
callback and it is up to the callback user to use it (an example has been
added to the only current user - the bridge). This allows us to query only
specific slaves of master devices like bridge ports and export only what
we're interested in instead of having to dump all ports and searching only
for a single one. This will be used to export per-port IGMP/MLD stats and
also per-port vlan stats in the future, possibly other statistics as well.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/rtnetlink.h      |  5 ++--
 include/uapi/linux/if_link.h |  1 +
 net/bridge/br_netlink.c      | 59 +++++++++++++++++++++++++++++++++++++++++---
 net/core/rtnetlink.c         | 50 +++++++++++++++++++++++++++++++++++--
 4 files changed, 108 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 006a7b81d758..4113916cc1bb 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -98,10 +98,11 @@ struct rtnl_link_ops {
 						   const struct net_device *dev,
 						   const struct net_device *slave_dev);
 	struct net		*(*get_link_net)(const struct net_device *dev);
-	size_t			(*get_linkxstats_size)(const struct net_device *dev);
+	size_t			(*get_linkxstats_size)(const struct net_device *dev,
+						       int attr);
 	int			(*fill_linkxstats)(struct sk_buff *skb,
 						   const struct net_device *dev,
-						   int *prividx);
+						   int *prividx, int attr);
 };
 
 int __rtnl_link_register(struct rtnl_link_ops *ops);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index bb36bd5675a7..db2458ade81c 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -822,6 +822,7 @@ enum {
 	IFLA_STATS_UNSPEC, /* also used as 64bit pad attribute */
 	IFLA_STATS_LINK_64,
 	IFLA_STATS_LINK_XSTATS,
+	IFLA_STATS_LINK_XSTATS_SLAVE,
 	__IFLA_STATS_MAX,
 };
 
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 85e89f693589..ed75ff9ff9e6 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1234,7 +1234,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
 	return 0;
 }
 
-static size_t br_get_linkxstats_size(const struct net_device *dev)
+static size_t bridge_get_linkxstats_size(const struct net_device *dev)
 {
 	struct net_bridge *br = netdev_priv(dev);
 	struct net_bridge_vlan_group *vg;
@@ -1254,8 +1254,30 @@ static size_t br_get_linkxstats_size(const struct net_device *dev)
 	       nla_total_size(0);
 }
 
-static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev,
-			      int *prividx)
+static size_t brport_get_linkxstats_size(const struct net_device *dev)
+{
+	return nla_total_size(0);
+}
+
+static size_t br_get_linkxstats_size(const struct net_device *dev, int attr)
+{
+	size_t retsize = 0;
+
+	switch (attr) {
+	case IFLA_STATS_LINK_XSTATS:
+		retsize = bridge_get_linkxstats_size(dev);
+		break;
+	case IFLA_STATS_LINK_XSTATS_SLAVE:
+		retsize = brport_get_linkxstats_size(dev);
+		break;
+	}
+
+	return retsize;
+}
+
+static int bridge_fill_linkxstats(struct sk_buff *skb,
+				  const struct net_device *dev,
+				  int *prividx)
 {
 	struct net_bridge *br = netdev_priv(dev);
 	struct net_bridge_vlan_group *vg;
@@ -1298,6 +1320,37 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static int brport_fill_linkxstats(struct sk_buff *skb,
+				  const struct net_device *dev,
+				  int *prividx)
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE);
+	if (!nest)
+		return -EMSGSIZE;
+	nla_nest_end(skb, nest);
+
+	return 0;
+}
+
+static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev,
+			      int *prividx, int attr)
+{
+	int ret = -EINVAL;
+
+	switch (attr) {
+	case IFLA_STATS_LINK_XSTATS:
+		ret = bridge_fill_linkxstats(skb, dev, prividx);
+		break;
+	case IFLA_STATS_LINK_XSTATS_SLAVE:
+		ret = brport_fill_linkxstats(skb, dev, prividx);
+		break;
+	}
+
+	return ret;
+}
+
 static struct rtnl_af_ops br_af_ops __read_mostly = {
 	.family			= AF_BRIDGE,
 	.get_link_af_size	= br_get_link_af_size_filtered,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index eb49ca24274a..cfed7bc14ee6 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3519,7 +3519,32 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 			if (!attr)
 				goto nla_put_failure;
 
-			err = ops->fill_linkxstats(skb, dev, prividx);
+			err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
+			nla_nest_end(skb, attr);
+			if (err)
+				goto nla_put_failure;
+			*idxattr = 0;
+		}
+	}
+
+	if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE,
+			     *idxattr)) {
+		const struct rtnl_link_ops *ops = NULL;
+		const struct net_device *master;
+
+		master = netdev_master_upper_dev_get(dev);
+		if (master)
+			ops = master->rtnl_link_ops;
+		if (ops && ops->fill_linkxstats) {
+			int err;
+
+			*idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
+			attr = nla_nest_start(skb,
+					      IFLA_STATS_LINK_XSTATS_SLAVE);
+			if (!attr)
+				goto nla_put_failure;
+
+			err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
 			nla_nest_end(skb, attr);
 			if (err)
 				goto nla_put_failure;
@@ -3555,14 +3580,35 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
 
 	if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) {
 		const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+		int attr = IFLA_STATS_LINK_XSTATS;
 
 		if (ops && ops->get_linkxstats_size) {
-			size += nla_total_size(ops->get_linkxstats_size(dev));
+			size += nla_total_size(ops->get_linkxstats_size(dev,
+									attr));
 			/* for IFLA_STATS_LINK_XSTATS */
 			size += nla_total_size(0);
 		}
 	}
 
+	if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) {
+		struct net_device *_dev = (struct net_device *)dev;
+		const struct rtnl_link_ops *ops = NULL;
+		const struct net_device *master;
+
+		/* netdev_master_upper_dev_get can't take const */
+		master = netdev_master_upper_dev_get(_dev);
+		if (master)
+			ops = master->rtnl_link_ops;
+		if (ops && ops->get_linkxstats_size) {
+			int attr = IFLA_STATS_LINK_XSTATS_SLAVE;
+
+			size += nla_total_size(ops->get_linkxstats_size(dev,
+									attr));
+			/* for IFLA_STATS_LINK_XSTATS_SLAVE */
+			size += nla_total_size(0);
+		}
+	}
+
 	return size;
 }
 
-- 
cgit v1.2.3


From 2631b79f6cb8b634fe41b77de9c4add0ec6b3cae Mon Sep 17 00:00:00 2001
From: "Seymour, Shane M" <shane.seymour@hpe.com>
Date: Tue, 28 Jun 2016 23:06:48 +0000
Subject: tcp: increase size at which tcp_bound_to_half_wnd bounds to >
 TCP_MSS_DEFAULT

In previous commit 01f83d69844d307be2aa6fea88b0e8fe5cbdb2f4
the following comments were added:

"When peer uses tiny windows, there is no use in packetizing to sub-MSS
pieces for the sake of SWS or making sure there are enough packets in
the pipe for fast recovery."

The test should be > TCP_MSS_DEFAULT not >= 512. This allows low end
devices that send an MSS of 536 (TCP_MSS_DEFAULT) to see better network
performance by sending it 536 bytes of data at a time instead of bounding
to half window size (268). Other network stacks work this way, e.g. HP-UX.

Signed-off-by: Shane Seymour <shane.seymour@hpe.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index a79894b66726..d825858fe4f1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -589,7 +589,7 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
 	 * On the other hand, for extremely large MSS devices, handling
 	 * smaller than MSS windows in this way does make sense.
 	 */
-	if (tp->max_window >= 512)
+	if (tp->max_window > TCP_MSS_DEFAULT)
 		cutoff = (tp->max_window >> 1);
 	else
 		cutoff = tp->max_window;
-- 
cgit v1.2.3


From 19689e38eca5d7b32755182d4e62efd7a5376c45 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 27 Jun 2016 18:51:53 +0200
Subject: tcp: md5: use kmalloc() backed scratch areas

Some arches have virtually mapped kernel stacks, or will soon have.

tcp_md5_hash_header() uses an automatic variable to copy tcp header
before mangling th->check and calling crypto function, which might
be problematic on such arches.

David says that using percpu storage is also problematic on non SMP
builds.

Just use kmalloc() to allocate scratch areas.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h   |  3 +--
 net/ipv4/tcp.c      | 10 ++++++++++
 net/ipv4/tcp_ipv4.c | 31 ++++++++++++++-----------------
 net/ipv6/tcp_ipv6.c | 29 ++++++++++++++++-------------
 4 files changed, 41 insertions(+), 32 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index d825858fe4f1..c00e7d51bb18 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1384,7 +1384,7 @@ union tcp_md5sum_block {
 /* - pool: digest algorithm, hash description and scratch buffer */
 struct tcp_md5sig_pool {
 	struct ahash_request	*md5_req;
-	union tcp_md5sum_block	md5_blk;
+	void			*scratch;
 };
 
 /* - functions */
@@ -1420,7 +1420,6 @@ static inline void tcp_put_md5sig_pool(void)
 	local_bh_enable();
 }
 
-int tcp_md5_hash_header(struct tcp_md5sig_pool *, const struct tcphdr *);
 int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff *,
 			  unsigned int header_len);
 int tcp_md5_hash_key(struct tcp_md5sig_pool *hp,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 108ef2a6665c..032a96d78c99 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3026,8 +3026,18 @@ static void __tcp_alloc_md5sig_pool(void)
 		return;
 
 	for_each_possible_cpu(cpu) {
+		void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
 		struct ahash_request *req;
 
+		if (!scratch) {
+			scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
+					       sizeof(struct tcphdr),
+					       GFP_KERNEL,
+					       cpu_to_node(cpu));
+			if (!scratch)
+				return;
+			per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
+		}
 		if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
 			continue;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3708de2a6683..32b048e524d6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1018,27 +1018,28 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 			      GFP_KERNEL);
 }
 
-static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
-					__be32 daddr, __be32 saddr, int nbytes)
+static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
+				   __be32 daddr, __be32 saddr,
+				   const struct tcphdr *th, int nbytes)
 {
 	struct tcp4_pseudohdr *bp;
 	struct scatterlist sg;
+	struct tcphdr *_th;
 
-	bp = &hp->md5_blk.ip4;
-
-	/*
-	 * 1. the TCP pseudo-header (in the order: source IP address,
-	 * destination IP address, zero-padded protocol number, and
-	 * segment length)
-	 */
+	bp = hp->scratch;
 	bp->saddr = saddr;
 	bp->daddr = daddr;
 	bp->pad = 0;
 	bp->protocol = IPPROTO_TCP;
 	bp->len = cpu_to_be16(nbytes);
 
-	sg_init_one(&sg, bp, sizeof(*bp));
-	ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
+	_th = (struct tcphdr *)(bp + 1);
+	memcpy(_th, th, sizeof(*th));
+	_th->check = 0;
+
+	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
+	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
+				sizeof(*bp) + sizeof(*th));
 	return crypto_ahash_update(hp->md5_req);
 }
 
@@ -1055,9 +1056,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 
 	if (crypto_ahash_init(req))
 		goto clear_hash;
-	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
-		goto clear_hash;
-	if (tcp_md5_hash_header(hp, th))
+	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
 		goto clear_hash;
 	if (tcp_md5_hash_key(hp, key))
 		goto clear_hash;
@@ -1101,9 +1100,7 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
 	if (crypto_ahash_init(req))
 		goto clear_hash;
 
-	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
-		goto clear_hash;
-	if (tcp_md5_hash_header(hp, th))
+	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
 		goto clear_hash;
 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
 		goto clear_hash;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2255d2bf5f6b..37cf91323319 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -526,26 +526,33 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, char __user *optval,
 			      AF_INET6, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
 }
 
-static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
-					const struct in6_addr *daddr,
-					const struct in6_addr *saddr, int nbytes)
+static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp,
+				   const struct in6_addr *daddr,
+				   const struct in6_addr *saddr,
+				   const struct tcphdr *th, int nbytes)
 {
 	struct tcp6_pseudohdr *bp;
 	struct scatterlist sg;
+	struct tcphdr *_th;
 
-	bp = &hp->md5_blk.ip6;
+	bp = hp->scratch;
 	/* 1. TCP pseudo-header (RFC2460) */
 	bp->saddr = *saddr;
 	bp->daddr = *daddr;
 	bp->protocol = cpu_to_be32(IPPROTO_TCP);
 	bp->len = cpu_to_be32(nbytes);
 
-	sg_init_one(&sg, bp, sizeof(*bp));
-	ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
+	_th = (struct tcphdr *)(bp + 1);
+	memcpy(_th, th, sizeof(*th));
+	_th->check = 0;
+
+	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
+	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
+				sizeof(*bp) + sizeof(*th));
 	return crypto_ahash_update(hp->md5_req);
 }
 
-static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
+static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 			       const struct in6_addr *daddr, struct in6_addr *saddr,
 			       const struct tcphdr *th)
 {
@@ -559,9 +566,7 @@ static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
 
 	if (crypto_ahash_init(req))
 		goto clear_hash;
-	if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
-		goto clear_hash;
-	if (tcp_md5_hash_header(hp, th))
+	if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
 		goto clear_hash;
 	if (tcp_md5_hash_key(hp, key))
 		goto clear_hash;
@@ -606,9 +611,7 @@ static int tcp_v6_md5_hash_skb(char *md5_hash,
 	if (crypto_ahash_init(req))
 		goto clear_hash;
 
-	if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
-		goto clear_hash;
-	if (tcp_md5_hash_header(hp, th))
+	if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, skb->len))
 		goto clear_hash;
 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
 		goto clear_hash;
-- 
cgit v1.2.3


From 08f4b5918b2d6b491f0403cc1886f5cdccef89bb Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Fri, 1 Jul 2016 14:51:01 +0300
Subject: net/devlink: Add E-Switch mode control

Add the commands to set and show the mode of SRIOV E-Switch, two modes
are supported:

* legacy: operating in the "old" L2 based mode (DMAC --> VF vport)

* switchdev: the E-Switch is referred to as whitebox switch configured
using standard tools such as tc, bridge, openvswitch etc. To allow
working with the tools, for each VF, a VF representor netdevice is
created by the E-Switch manager vendor device driver instance (e.g PF).

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |  3 ++
 include/uapi/linux/devlink.h |  8 ++++
 net/core/devlink.c           | 87 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 98 insertions(+)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 1d45b61cb320..c99ffe8cef3c 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -90,6 +90,9 @@ struct devlink_ops {
 				       u16 tc_index,
 				       enum devlink_sb_pool_type pool_type,
 				       u32 *p_cur, u32 *p_max);
+
+	int (*eswitch_mode_get)(struct devlink *devlink, u16 *p_mode);
+	int (*eswitch_mode_set)(struct devlink *devlink, u16 mode);
 };
 
 static inline void *devlink_priv(struct devlink *devlink)
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index ba0073b26fa6..915bfa74458c 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -57,6 +57,8 @@ enum devlink_command {
 	DEVLINK_CMD_SB_OCC_SNAPSHOT,
 	DEVLINK_CMD_SB_OCC_MAX_CLEAR,
 
+	DEVLINK_CMD_ESWITCH_MODE_GET,
+	DEVLINK_CMD_ESWITCH_MODE_SET,
 	/* add new commands above here */
 
 	__DEVLINK_CMD_MAX,
@@ -95,6 +97,11 @@ enum devlink_sb_threshold_type {
 
 #define DEVLINK_SB_THRESHOLD_TO_ALPHA_MAX 20
 
+enum devlink_eswitch_mode {
+	DEVLINK_ESWITCH_MODE_LEGACY,
+	DEVLINK_ESWITCH_MODE_SWITCHDEV,
+};
+
 enum devlink_attr {
 	/* don't change the order or add anything between, this is ABI! */
 	DEVLINK_ATTR_UNSPEC,
@@ -125,6 +132,7 @@ enum devlink_attr {
 	DEVLINK_ATTR_SB_TC_INDEX,		/* u16 */
 	DEVLINK_ATTR_SB_OCC_CUR,		/* u32 */
 	DEVLINK_ATTR_SB_OCC_MAX,		/* u32 */
+	DEVLINK_ATTR_ESWITCH_MODE,		/* u16 */
 
 	/* add new attributes above here, update the policy in devlink.c */
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 933e8d4d3968..b2e592a198c0 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1394,6 +1394,78 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
 	return -EOPNOTSUPP;
 }
 
+static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
+				enum devlink_command cmd, u32 portid,
+				u32 seq, int flags, u16 mode)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto nla_put_failure;
+
+	if (nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
+						struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	const struct devlink_ops *ops = devlink->ops;
+	struct sk_buff *msg;
+	u16 mode;
+	int err;
+
+	if (!ops || !ops->eswitch_mode_get)
+		return -EOPNOTSUPP;
+
+	err = ops->eswitch_mode_get(devlink, &mode);
+	if (err)
+		return err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET,
+				   info->snd_portid, info->snd_seq, 0, mode);
+
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb,
+						struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	const struct devlink_ops *ops = devlink->ops;
+	u16 mode;
+
+	if (!info->attrs[DEVLINK_ATTR_ESWITCH_MODE])
+		return -EINVAL;
+
+	mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+
+	if (ops && ops->eswitch_mode_set)
+		return ops->eswitch_mode_set(devlink, mode);
+	return -EOPNOTSUPP;
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -1407,6 +1479,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 },
 	[DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 },
 	[DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 },
+	[DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 },
 };
 
 static const struct genl_ops devlink_nl_ops[] = {
@@ -1525,6 +1598,20 @@ static const struct genl_ops devlink_nl_ops[] = {
 				  DEVLINK_NL_FLAG_NEED_SB |
 				  DEVLINK_NL_FLAG_LOCK_PORTS,
 	},
+	{
+		.cmd = DEVLINK_CMD_ESWITCH_MODE_GET,
+		.doit = devlink_nl_cmd_eswitch_mode_get_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
+	{
+		.cmd = DEVLINK_CMD_ESWITCH_MODE_SET,
+		.doit = devlink_nl_cmd_eswitch_mode_set_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 /**
-- 
cgit v1.2.3


From f86dec94e3a86c992a637df1c301a4df25a85801 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Fri, 15 Apr 2016 18:14:25 +0200
Subject: NFC: hci: delete unused nfc_llc_get_rx_head_tail_room()

It used to be EXPORTed, but then EXPORT usage was cleaned up
(in 2012), without noticing that the function has no users at all
(and curiously, never had any users).

Delete it.

While at it, remove non-static "inline" hints on nearby functions:
these hints don't work across compilation units anyway,
and these functions are not used in their .c file, thus they are
never inlined. IOW: "inline" here does not help in any way.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Samuel Ortiz <sameo@linux.intel.com>
CC: Christophe Ricard <christophe.ricard@gmail.com>
CC: linux-wireless@vger.kernel.org
CC: linux-kernel@vger.kernel.org
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/net/nfc/llc.h |  4 ----
 net/nfc/hci/llc.c     | 17 +++++------------
 2 files changed, 5 insertions(+), 16 deletions(-)

(limited to 'include/net')

diff --git a/include/net/nfc/llc.h b/include/net/nfc/llc.h
index c25fbdee0d61..7ecb45757897 100644
--- a/include/net/nfc/llc.h
+++ b/include/net/nfc/llc.h
@@ -37,10 +37,6 @@ struct nfc_llc *nfc_llc_allocate(const char *name, struct nfc_hci_dev *hdev,
 				 int tx_tailroom, llc_failure_t llc_failure);
 void nfc_llc_free(struct nfc_llc *llc);
 
-void nfc_llc_get_rx_head_tail_room(struct nfc_llc *llc, int *rx_headroom,
-				   int *rx_tailroom);
-
-
 int nfc_llc_start(struct nfc_llc *llc);
 int nfc_llc_stop(struct nfc_llc *llc);
 void nfc_llc_rcv_from_drv(struct nfc_llc *llc, struct sk_buff *skb);
diff --git a/net/nfc/hci/llc.c b/net/nfc/hci/llc.c
index 1399a03fa6e6..3d699cbc7435 100644
--- a/net/nfc/hci/llc.c
+++ b/net/nfc/hci/llc.c
@@ -133,36 +133,29 @@ void nfc_llc_free(struct nfc_llc *llc)
 	kfree(llc);
 }
 
-inline void nfc_llc_get_rx_head_tail_room(struct nfc_llc *llc, int *rx_headroom,
-					  int *rx_tailroom)
-{
-	*rx_headroom = llc->rx_headroom;
-	*rx_tailroom = llc->rx_tailroom;
-}
-
-inline int nfc_llc_start(struct nfc_llc *llc)
+int nfc_llc_start(struct nfc_llc *llc)
 {
 	return llc->ops->start(llc);
 }
 EXPORT_SYMBOL(nfc_llc_start);
 
-inline int nfc_llc_stop(struct nfc_llc *llc)
+int nfc_llc_stop(struct nfc_llc *llc)
 {
 	return llc->ops->stop(llc);
 }
 EXPORT_SYMBOL(nfc_llc_stop);
 
-inline void nfc_llc_rcv_from_drv(struct nfc_llc *llc, struct sk_buff *skb)
+void nfc_llc_rcv_from_drv(struct nfc_llc *llc, struct sk_buff *skb)
 {
 	llc->ops->rcv_from_drv(llc, skb);
 }
 
-inline int nfc_llc_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb)
+int nfc_llc_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb)
 {
 	return llc->ops->xmit_from_hci(llc, skb);
 }
 
-inline void *nfc_llc_get_data(struct nfc_llc *llc)
+void *nfc_llc_get_data(struct nfc_llc *llc)
 {
 	return llc->data;
 }
-- 
cgit v1.2.3


From 7854a44526de84142e367f08288c9f3a33c4c8ee Mon Sep 17 00:00:00 2001
From: Thierry Escande <thierry.escande@collabora.com>
Date: Tue, 7 Jun 2016 16:21:52 +0200
Subject: NFC: digital: Add a delay between poll cycles

This replaces the polling work struct with a delayed work struct and add
a 10 ms delay between 2 poll cycles. This avoids to flood the device
with 'switch off'/'switch on' commands.

Signed-off-by: Thierry Escande <thierry.escande@collabora.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/net/nfc/digital.h |  2 +-
 net/nfc/digital_core.c    | 16 ++++++++++------
 2 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/include/net/nfc/digital.h b/include/net/nfc/digital.h
index 0ae101eef0f4..506e3f6eabef 100644
--- a/include/net/nfc/digital.h
+++ b/include/net/nfc/digital.h
@@ -220,7 +220,7 @@ struct nfc_digital_dev {
 	struct list_head cmd_queue;
 	struct mutex cmd_lock;
 
-	struct work_struct poll_work;
+	struct delayed_work poll_work;
 
 	u8 curr_protocol;
 	u8 curr_rf_tech;
diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c
index dd9003f38822..27769ac89d27 100644
--- a/net/nfc/digital_core.c
+++ b/net/nfc/digital_core.c
@@ -30,6 +30,9 @@
 
 #define DIGITAL_PROTO_ISO15693_RF_TECH	NFC_PROTO_ISO15693_MASK
 
+/* Delay between each poll frame (ms) */
+#define DIGITAL_POLL_INTERVAL 10
+
 struct digital_cmd {
 	struct list_head queue;
 
@@ -419,7 +422,8 @@ void digital_poll_next_tech(struct nfc_digital_dev *ddev)
 
 	mutex_unlock(&ddev->poll_lock);
 
-	schedule_work(&ddev->poll_work);
+	schedule_delayed_work(&ddev->poll_work,
+			      msecs_to_jiffies(DIGITAL_POLL_INTERVAL));
 }
 
 static void digital_wq_poll(struct work_struct *work)
@@ -428,7 +432,7 @@ static void digital_wq_poll(struct work_struct *work)
 	struct digital_poll_tech *poll_tech;
 	struct nfc_digital_dev *ddev = container_of(work,
 						    struct nfc_digital_dev,
-						    poll_work);
+						    poll_work.work);
 	mutex_lock(&ddev->poll_lock);
 
 	if (!ddev->poll_tech_count) {
@@ -543,7 +547,7 @@ static int digital_start_poll(struct nfc_dev *nfc_dev, __u32 im_protocols,
 		return -EINVAL;
 	}
 
-	schedule_work(&ddev->poll_work);
+	schedule_delayed_work(&ddev->poll_work, 0);
 
 	return 0;
 }
@@ -564,7 +568,7 @@ static void digital_stop_poll(struct nfc_dev *nfc_dev)
 
 	mutex_unlock(&ddev->poll_lock);
 
-	cancel_work_sync(&ddev->poll_work);
+	cancel_delayed_work_sync(&ddev->poll_work);
 
 	digital_abort_cmd(ddev);
 }
@@ -770,7 +774,7 @@ struct nfc_digital_dev *nfc_digital_allocate_device(struct nfc_digital_ops *ops,
 	INIT_WORK(&ddev->cmd_complete_work, digital_wq_cmd_complete);
 
 	mutex_init(&ddev->poll_lock);
-	INIT_WORK(&ddev->poll_work, digital_wq_poll);
+	INIT_DELAYED_WORK(&ddev->poll_work, digital_wq_poll);
 
 	if (supported_protocols & NFC_PROTO_JEWEL_MASK)
 		ddev->protocols |= NFC_PROTO_JEWEL_MASK;
@@ -832,7 +836,7 @@ void nfc_digital_unregister_device(struct nfc_digital_dev *ddev)
 	ddev->poll_tech_count = 0;
 	mutex_unlock(&ddev->poll_lock);
 
-	cancel_work_sync(&ddev->poll_work);
+	cancel_delayed_work_sync(&ddev->poll_work);
 	cancel_work_sync(&ddev->cmd_work);
 	cancel_work_sync(&ddev->cmd_complete_work);
 
-- 
cgit v1.2.3


From ff202ee1ed8f032f05b80b541664cf02e75d7080 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Sat, 2 Jul 2016 06:43:15 -0400
Subject: net sched actions: skbedit add support for mod-ing skb pkt_type

Extremely useful for setting packet type to host so i dont
have to modify the dst mac address using pedit (which requires
that i know the mac address)

Example usage:
tc filter add dev eth0 parent ffff: protocol ip pref 9 u32 \
match ip src 5.5.5.5/32 \
flowid 1:5 action skbedit ptype host

This will tag all packets incoming from 5.5.5.5 with type
PACKET_HOST

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_skbedit.h        | 10 +++++-----
 include/uapi/linux/tc_act/tc_skbedit.h |  2 ++
 net/sched/act_skbedit.c                | 18 +++++++++++++++++-
 3 files changed, 24 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h
index b496d5ad7d42..d01a5d40cfb5 100644
--- a/include/net/tc_act/tc_skbedit.h
+++ b/include/net/tc_act/tc_skbedit.h
@@ -24,11 +24,11 @@
 
 struct tcf_skbedit {
 	struct tcf_common	common;
-	u32			flags;
-	u32     		priority;
-	u32     		mark;
-	u16			queue_mapping;
-	/* XXX: 16-bit pad here? */
+	u32		flags;
+	u32		priority;
+	u32		mark;
+	u16		queue_mapping;
+	u16		ptype;
 };
 #define to_skbedit(a) \
 	container_of(a->priv, struct tcf_skbedit, common)
diff --git a/include/uapi/linux/tc_act/tc_skbedit.h b/include/uapi/linux/tc_act/tc_skbedit.h
index fecb5cc48c40..a4d00c608d8f 100644
--- a/include/uapi/linux/tc_act/tc_skbedit.h
+++ b/include/uapi/linux/tc_act/tc_skbedit.h
@@ -27,6 +27,7 @@
 #define SKBEDIT_F_PRIORITY		0x1
 #define SKBEDIT_F_QUEUE_MAPPING		0x2
 #define SKBEDIT_F_MARK			0x4
+#define SKBEDIT_F_PTYPE			0x8
 
 struct tc_skbedit {
 	tc_gen;
@@ -40,6 +41,7 @@ enum {
 	TCA_SKBEDIT_QUEUE_MAPPING,
 	TCA_SKBEDIT_MARK,
 	TCA_SKBEDIT_PAD,
+	TCA_SKBEDIT_PTYPE,
 	__TCA_SKBEDIT_MAX
 };
 #define TCA_SKBEDIT_MAX (__TCA_SKBEDIT_MAX - 1)
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 53d1486cddf7..1c4c9240a3f8 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -47,6 +47,8 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
 		skb_set_queue_mapping(skb, d->queue_mapping);
 	if (d->flags & SKBEDIT_F_MARK)
 		skb->mark = d->mark;
+	if (d->flags & SKBEDIT_F_PTYPE)
+		skb->pkt_type = d->ptype;
 
 	spin_unlock(&d->tcf_lock);
 	return d->tcf_action;
@@ -57,6 +59,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
 	[TCA_SKBEDIT_PRIORITY]		= { .len = sizeof(u32) },
 	[TCA_SKBEDIT_QUEUE_MAPPING]	= { .len = sizeof(u16) },
 	[TCA_SKBEDIT_MARK]		= { .len = sizeof(u32) },
+	[TCA_SKBEDIT_PTYPE]		= { .len = sizeof(u16) },
 };
 
 static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
@@ -68,7 +71,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 	struct tc_skbedit *parm;
 	struct tcf_skbedit *d;
 	u32 flags = 0, *priority = NULL, *mark = NULL;
-	u16 *queue_mapping = NULL;
+	u16 *queue_mapping = NULL, *ptype = NULL;
 	bool exists = false;
 	int ret = 0, err;
 
@@ -92,6 +95,13 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 		queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]);
 	}
 
+	if (tb[TCA_SKBEDIT_PTYPE] != NULL) {
+		ptype = nla_data(tb[TCA_SKBEDIT_PTYPE]);
+		if (!skb_pkt_type_ok(*ptype))
+			return -EINVAL;
+		flags |= SKBEDIT_F_PTYPE;
+	}
+
 	if (tb[TCA_SKBEDIT_MARK] != NULL) {
 		flags |= SKBEDIT_F_MARK;
 		mark = nla_data(tb[TCA_SKBEDIT_MARK]);
@@ -132,6 +142,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 		d->queue_mapping = *queue_mapping;
 	if (flags & SKBEDIT_F_MARK)
 		d->mark = *mark;
+	if (flags & SKBEDIT_F_PTYPE)
+		d->ptype = *ptype;
 
 	d->tcf_action = parm->action;
 
@@ -169,6 +181,10 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 	    nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark),
 		    &d->mark))
 		goto nla_put_failure;
+	if ((d->flags & SKBEDIT_F_PTYPE) &&
+	    nla_put(skb, TCA_SKBEDIT_PTYPE, sizeof(d->ptype),
+		    &d->ptype))
+		goto nla_put_failure;
 
 	tcf_tm_dump(&t, &d->tcf_tm);
 	if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
-- 
cgit v1.2.3


From 2a4501ae18b52fcdf553404286e6cefabd1d17ec Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Tue, 5 Jul 2016 11:27:42 +0200
Subject: neigh: Send a notification when DELAY_PROBE_TIME changes

When the data plane is offloaded the traffic doesn't go through the
networking stack. Therefore, after first resolving a neighbour the NUD
state machine will transition it from REACHABLE to STALE until it's
finally deleted by the garbage collector.

To prevent such situations the offloading driver should notify the NUD
state machine on any neighbours that were recently used. The driver's
polling interval should be set so that the NUD state machine can
function as if the traffic wasn't offloaded.

Currently, there are no in-tree drivers that can report confirmation for
a neighbour, but only 'used' indication. Therefore, the polling interval
should be set according to DELAY_FIRST_PROBE_TIME, as a neighbour will
transition from REACHABLE state to DELAY (instead of STALE) if "a packet
was sent within the last DELAY_FIRST_PROBE_TIME seconds" (RFC 4861).

Send a netevent whenever the DELAY_FIRST_PROBE_TIME changes - either via
netlink or sysctl - so that offloading drivers can correctly set their
polling interval.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netevent.h | 1 +
 net/core/neighbour.c   | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netevent.h b/include/net/netevent.h
index d8bbb38584b6..f440df172b56 100644
--- a/include/net/netevent.h
+++ b/include/net/netevent.h
@@ -24,6 +24,7 @@ struct netevent_redirect {
 enum netevent_notif_type {
 	NETEVENT_NEIGH_UPDATE = 1, /* arg is struct neighbour ptr */
 	NETEVENT_REDIRECT,	   /* arg is struct netevent_redirect ptr */
+	NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */
 };
 
 int register_netevent_notifier(struct notifier_block *nb);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 952aabb5aa56..5cdc62a8eb84 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2047,6 +2047,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
 			case NDTPA_DELAY_PROBE_TIME:
 				NEIGH_VAR_SET(p, DELAY_PROBE_TIME,
 					      nla_get_msecs(tbp[i]));
+				call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
 				break;
 			case NDTPA_RETRANS_TIME:
 				NEIGH_VAR_SET(p, RETRANS_TIME,
@@ -2930,6 +2931,7 @@ static void neigh_proc_update(struct ctl_table *ctl, int write)
 		return;
 
 	set_bit(index, p->data_state);
+	call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
 	if (!dev) /* NULL dev means this is default value */
 		neigh_copy_dflt_parms(net, p, index);
 }
-- 
cgit v1.2.3


From c6e6a0c8be575c830a97b1942dabeab70f423fe0 Mon Sep 17 00:00:00 2001
From: Aviya Erenfeld <aviya.erenfeld@intel.com>
Date: Tue, 5 Jul 2016 15:23:08 +0300
Subject: nl80211: Add API to support VHT MU-MIMO air sniffer

add API to support VHT MU-MIMO air sniffer.
in MU-MIMO there are parallel frames on the air while the HW
has only one RX.
add the capability to sniff one of the MU-MIMO parallel frames by
giving the sniffer additional information so it'll know which
of the parallel frames it shall follow.

Add attribute - NL80211_ATTR_MU_MIMO_GROUP_DATA - for getting
a MU-MIMO groupID in order to monitor packets from that group
using VHT MU-MIMO.
And add attribute -NL80211_ATTR_MU_MIMO_FOLLOW_ADDR - for passing
MAC address to monitor mode.
that option will be used by VHT MU-MIMO air sniffer to follow a
station according to it's MAC address using VHT MU-MIMO.

Signed-off-by: Aviya Erenfeld <aviya.erenfeld@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 10 ++++++++--
 include/uapi/linux/nl80211.h | 29 +++++++++++++++++++++++++++++
 net/wireless/nl80211.c       | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 7bbb00d8b2cd..fa4f0f793817 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -330,6 +330,9 @@ struct ieee80211_supported_band {
  * in a separate chapter.
  */
 
+#define VHT_MUMIMO_GROUPS_DATA_LEN (WLAN_MEMBERSHIP_LEN +\
+				    WLAN_USER_POSITION_LEN)
+
 /**
  * struct vif_params - describes virtual interface parameters
  * @use_4addr: use 4-address frames
@@ -339,10 +342,13 @@ struct ieee80211_supported_band {
  *	This feature is only fully supported by drivers that enable the
  *	%NL80211_FEATURE_MAC_ON_CREATE flag.  Others may support creating
  **	only p2p devices with specified MAC.
+ * @vht_mumimo_groups: MU-MIMO groupID. used for monitoring only
+ *	 packets belonging to that MU-MIMO groupID.
  */
 struct vif_params {
-       int use_4addr;
-       u8 macaddr[ETH_ALEN];
+	int use_4addr;
+	u8 macaddr[ETH_ALEN];
+	u8 vht_mumimo_groups[VHT_MUMIMO_GROUPS_DATA_LEN];
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 53c8278827a0..1d7da7888dcf 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1829,6 +1829,25 @@ enum nl80211_commands {
  *	%NL80211_ATTR_EXT_CAPA_MASK, to specify the extended capabilities per
  *	interface type.
  *
+ * @NL80211_ATTR_MU_MIMO_GROUP_DATA: array of 24 bytes that defines a MU-MIMO
+ *	groupID for monitor mode.
+ *	The first 8 bytes are a mask that defines the membership in each
+ *	group (there are 64 groups, group 0 and 63 are reserved),
+ *	each bit represents a group and set to 1 for being a member in
+ *	that group and 0 for not being a member.
+ *	The remaining 16 bytes define the position in each group: 2 bits for
+ *	each group.
+ *	(smaller group numbers represented on most significant bits and bigger
+ *	group numbers on least significant bits.)
+ *	This attribute is used only if all interfaces are in monitor mode.
+ *	Set this attribute in order to monitor packets using the given MU-MIMO
+ *	groupID data.
+ *	to turn off that feature set all the bits of the groupID to zero.
+ * @NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR: mac address for the sniffer to follow
+ *	when using MU-MIMO air sniffer.
+ *	to turn that feature off set an invalid mac address
+ *	(e.g. FF:FF:FF:FF:FF:FF)
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2213,6 +2232,9 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_IFTYPE_EXT_CAPA,
 
+	NL80211_ATTR_MU_MIMO_GROUP_DATA,
+	NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -4479,6 +4501,12 @@ enum nl80211_feature_flags {
  *	%NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests, which will set
  *	the ASSOC_REQ_USE_RRM flag in the association request even if
  *	NL80211_FEATURE_QUIET is not advertized.
+ * @NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER: This device supports MU-MIMO air
+ *	sniffer which means that it can be configured to hear packets from
+ *	certain groups which can be configured by the
+ *	%NL80211_ATTR_MU_MIMO_GROUP_DATA attribute,
+ *	or can be configured to follow a station by configuring the
+ *	%NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR attribute.
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
@@ -4486,6 +4514,7 @@ enum nl80211_feature_flags {
 enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_VHT_IBSS,
 	NL80211_EXT_FEATURE_RRM,
+	NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 244d552d5647..447026f8cc76 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -405,6 +405,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_PBSS] = { .type = NLA_FLAG },
 	[NL80211_ATTR_BSS_SELECT] = { .type = NLA_NESTED },
 	[NL80211_ATTR_STA_SUPPORT_P2P_PS] = { .type = NLA_U8 },
+	[NL80211_ATTR_MU_MIMO_GROUP_DATA] = {
+		.len = VHT_MUMIMO_GROUPS_DATA_LEN
+	},
+	[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN },
 };
 
 /* policy for the key attributes */
@@ -2695,6 +2699,38 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 		change = true;
 	}
 
+	if (info->attrs[NL80211_ATTR_MU_MIMO_GROUP_DATA]) {
+		const u8 *mumimo_groups;
+		u32 cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER;
+
+		if (!wiphy_ext_feature_isset(&rdev->wiphy, cap_flag))
+			return -EOPNOTSUPP;
+
+		mumimo_groups =
+			nla_data(info->attrs[NL80211_ATTR_MU_MIMO_GROUP_DATA]);
+
+		/* bits 0 and 63 are reserved and must be zero */
+		if ((mumimo_groups[0] & BIT(7)) ||
+		    (mumimo_groups[VHT_MUMIMO_GROUPS_DATA_LEN - 1] & BIT(0)))
+			return -EINVAL;
+
+		memcpy(params.vht_mumimo_groups, mumimo_groups,
+		       VHT_MUMIMO_GROUPS_DATA_LEN);
+		change = true;
+	}
+
+	if (info->attrs[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR]) {
+		u32 cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER;
+
+		if (!wiphy_ext_feature_isset(&rdev->wiphy, cap_flag))
+			return -EOPNOTSUPP;
+
+		nla_memcpy(params.macaddr,
+			   info->attrs[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR],
+			   ETH_ALEN);
+		change = true;
+	}
+
 	if (flags && (*flags & MONITOR_FLAG_ACTIVE) &&
 	    !(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR))
 		return -EOPNOTSUPP;
-- 
cgit v1.2.3


From 1d76250bd34af86c6498fc51e50cab3bfbbeceaa Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Tue, 5 Jul 2016 17:10:13 +0300
Subject: nl80211: support beacon report scanning

Beacon report radio measurement requires reporting observed BSSs
on the channels specified in the beacon request. If the measurement
mode is set to passive or active, it requires actually performing a
scan (passive or active, accordingly), and reporting the time that
the scan was started and the time each beacon/probe was received
(both in terms of TSF of the BSS of the requesting AP). If the
request mode is table, this information is optional.
In addition, the radio measurement request specifies the channel
dwell time for the measurement.

In order to use scan for beacon report when the mode is active or
passive, add a parameter to scan request that specifies the
channel dwell time, and add scan start time and beacon received time
to scan results information.

Supporting beacon report is required for Multi Band Operation (MBO).

Signed-off-by: Assaf Krauss <assaf.krauss@intel.com>
Signed-off-by: David Spinadel <david.spinadel@intel.com>
Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath6kl/cfg80211.c         | 17 +++++++--
 drivers/net/wireless/ath/wil6210/cfg80211.c        |  6 +++-
 drivers/net/wireless/ath/wil6210/main.c            | 12 +++++--
 drivers/net/wireless/ath/wil6210/p2p.c             |  6 +++-
 drivers/net/wireless/ath/wil6210/wmi.c             |  8 +++--
 .../broadcom/brcm80211/brcmfmac/cfg80211.c         |  6 +++-
 drivers/net/wireless/intersil/orinoco/scan.c       | 12 +++++--
 drivers/net/wireless/marvell/libertas/cfg.c        | 11 ++++--
 drivers/net/wireless/marvell/mwifiex/cmdevt.c      | 12 +++++--
 drivers/net/wireless/marvell/mwifiex/main.c        |  6 +++-
 drivers/net/wireless/marvell/mwifiex/scan.c        | 12 +++++--
 drivers/net/wireless/rndis_wlan.c                  | 10 ++++--
 drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c  | 11 ++++--
 drivers/staging/wilc1000/wilc_wfi_cfgoperations.c  | 12 +++++--
 drivers/staging/wlan-ng/cfg80211.c                 |  5 ++-
 include/net/cfg80211.h                             | 40 ++++++++++++++++++---
 include/uapi/linux/nl80211.h                       | 42 ++++++++++++++++++++++
 net/mac80211/scan.c                                |  9 +++--
 net/wireless/core.c                                |  4 +--
 net/wireless/core.h                                | 12 +++++++
 net/wireless/nl80211.c                             | 27 ++++++++++++++
 net/wireless/scan.c                                | 18 ++++++----
 net/wireless/trace.h                               | 33 ++++++++++++-----
 23 files changed, 279 insertions(+), 52 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index 4e11ba06f089..ef5b40ef6d67 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -859,7 +859,11 @@ void ath6kl_cfg80211_disconnect_event(struct ath6kl_vif *vif, u8 reason,
 	struct ath6kl *ar = vif->ar;
 
 	if (vif->scan_req) {
-		cfg80211_scan_done(vif->scan_req, true);
+		struct cfg80211_scan_info info = {
+			.aborted = true,
+		};
+
+		cfg80211_scan_done(vif->scan_req, &info);
 		vif->scan_req = NULL;
 	}
 
@@ -1069,6 +1073,9 @@ static int ath6kl_cfg80211_scan(struct wiphy *wiphy,
 void ath6kl_cfg80211_scan_complete_event(struct ath6kl_vif *vif, bool aborted)
 {
 	struct ath6kl *ar = vif->ar;
+	struct cfg80211_scan_info info = {
+		.aborted = aborted,
+	};
 	int i;
 
 	ath6kl_dbg(ATH6KL_DBG_WLAN_CFG, "%s: status%s\n", __func__,
@@ -1089,7 +1096,7 @@ void ath6kl_cfg80211_scan_complete_event(struct ath6kl_vif *vif, bool aborted)
 	}
 
 out:
-	cfg80211_scan_done(vif->scan_req, aborted);
+	cfg80211_scan_done(vif->scan_req, &info);
 	vif->scan_req = NULL;
 }
 
@@ -3614,7 +3621,11 @@ void ath6kl_cfg80211_vif_stop(struct ath6kl_vif *vif, bool wmi_ready)
 	}
 
 	if (vif->scan_req) {
-		cfg80211_scan_done(vif->scan_req, true);
+		struct cfg80211_scan_info info = {
+			.aborted = true,
+		};
+
+		cfg80211_scan_done(vif->scan_req, &info);
 		vif->scan_req = NULL;
 	}
 
diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c
index 62bf9331bd7f..f0e1175fb76a 100644
--- a/drivers/net/wireless/ath/wil6210/cfg80211.c
+++ b/drivers/net/wireless/ath/wil6210/cfg80211.c
@@ -1369,7 +1369,11 @@ static void wil_cfg80211_stop_p2p_device(struct wiphy *wiphy,
 	mutex_lock(&wil->mutex);
 	started = wil_p2p_stop_discovery(wil);
 	if (started && wil->scan_request) {
-		cfg80211_scan_done(wil->scan_request, 1);
+		struct cfg80211_scan_info info = {
+			.aborted = true,
+		};
+
+		cfg80211_scan_done(wil->scan_request, &info);
 		wil->scan_request = NULL;
 		wil->radio_wdev = wil->wdev;
 	}
diff --git a/drivers/net/wireless/ath/wil6210/main.c b/drivers/net/wireless/ath/wil6210/main.c
index 8e31d755bbee..4bc92e54984a 100644
--- a/drivers/net/wireless/ath/wil6210/main.c
+++ b/drivers/net/wireless/ath/wil6210/main.c
@@ -850,10 +850,14 @@ int wil_reset(struct wil6210_priv *wil, bool load_fw)
 	mutex_unlock(&wil->wmi_mutex);
 
 	if (wil->scan_request) {
+		struct cfg80211_scan_info info = {
+			.aborted = true,
+		};
+
 		wil_dbg_misc(wil, "Abort scan_request 0x%p\n",
 			     wil->scan_request);
 		del_timer_sync(&wil->scan_timer);
-		cfg80211_scan_done(wil->scan_request, true);
+		cfg80211_scan_done(wil->scan_request, &info);
 		wil->scan_request = NULL;
 	}
 
@@ -1049,10 +1053,14 @@ int __wil_down(struct wil6210_priv *wil)
 	(void)wil_p2p_stop_discovery(wil);
 
 	if (wil->scan_request) {
+		struct cfg80211_scan_info info = {
+			.aborted = true,
+		};
+
 		wil_dbg_misc(wil, "Abort scan_request 0x%p\n",
 			     wil->scan_request);
 		del_timer_sync(&wil->scan_timer);
-		cfg80211_scan_done(wil->scan_request, true);
+		cfg80211_scan_done(wil->scan_request, &info);
 		wil->scan_request = NULL;
 	}
 
diff --git a/drivers/net/wireless/ath/wil6210/p2p.c b/drivers/net/wireless/ath/wil6210/p2p.c
index 213b8259638c..e0f8aa0ebfac 100644
--- a/drivers/net/wireless/ath/wil6210/p2p.c
+++ b/drivers/net/wireless/ath/wil6210/p2p.c
@@ -252,8 +252,12 @@ void wil_p2p_search_expired(struct work_struct *work)
 	mutex_unlock(&wil->mutex);
 
 	if (started) {
+		struct cfg80211_scan_info info = {
+			.aborted = false,
+		};
+
 		mutex_lock(&wil->p2p_wdev_mutex);
-		cfg80211_scan_done(wil->scan_request, 0);
+		cfg80211_scan_done(wil->scan_request, &info);
 		wil->scan_request = NULL;
 		wil->radio_wdev = wil->wdev;
 		mutex_unlock(&wil->p2p_wdev_mutex);
diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c
index b80c5d850e1e..4d92541913c0 100644
--- a/drivers/net/wireless/ath/wil6210/wmi.c
+++ b/drivers/net/wireless/ath/wil6210/wmi.c
@@ -426,15 +426,17 @@ static void wmi_evt_scan_complete(struct wil6210_priv *wil, int id,
 {
 	if (wil->scan_request) {
 		struct wmi_scan_complete_event *data = d;
-		bool aborted = (data->status != WMI_SCAN_SUCCESS);
+		struct cfg80211_scan_info info = {
+			.aborted = (data->status != WMI_SCAN_SUCCESS),
+		};
 
 		wil_dbg_wmi(wil, "SCAN_COMPLETE(0x%08x)\n", data->status);
 		wil_dbg_misc(wil, "Complete scan_request 0x%p aborted %d\n",
-			     wil->scan_request, aborted);
+			     wil->scan_request, info.aborted);
 
 		del_timer_sync(&wil->scan_timer);
 		mutex_lock(&wil->p2p_wdev_mutex);
-		cfg80211_scan_done(wil->scan_request, aborted);
+		cfg80211_scan_done(wil->scan_request, &info);
 		wil->radio_wdev = wil->wdev;
 		mutex_unlock(&wil->p2p_wdev_mutex);
 		wil->scan_request = NULL;
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index 264bd638a3d9..afe2b202040a 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -775,9 +775,13 @@ s32 brcmf_notify_escan_complete(struct brcmf_cfg80211_info *cfg,
 		if (!aborted)
 			cfg80211_sched_scan_results(cfg_to_wiphy(cfg));
 	} else if (scan_request) {
+		struct cfg80211_scan_info info = {
+			.aborted = aborted,
+		};
+
 		brcmf_dbg(SCAN, "ESCAN Completed scan: %s\n",
 			  aborted ? "Aborted" : "Done");
-		cfg80211_scan_done(scan_request, aborted);
+		cfg80211_scan_done(scan_request, &info);
 	}
 	if (!test_and_clear_bit(BRCMF_SCAN_STATUS_BUSY, &cfg->scan_status))
 		brcmf_dbg(SCAN, "Scan complete, probably P2P scan\n");
diff --git a/drivers/net/wireless/intersil/orinoco/scan.c b/drivers/net/wireless/intersil/orinoco/scan.c
index d0ceb06c72d0..6d1d084854fb 100644
--- a/drivers/net/wireless/intersil/orinoco/scan.c
+++ b/drivers/net/wireless/intersil/orinoco/scan.c
@@ -237,7 +237,11 @@ void orinoco_add_hostscan_results(struct orinoco_private *priv,
 
  scan_abort:
 	if (priv->scan_request) {
-		cfg80211_scan_done(priv->scan_request, abort);
+		struct cfg80211_scan_info info = {
+			.aborted = abort,
+		};
+
+		cfg80211_scan_done(priv->scan_request, &info);
 		priv->scan_request = NULL;
 	}
 }
@@ -245,7 +249,11 @@ void orinoco_add_hostscan_results(struct orinoco_private *priv,
 void orinoco_scan_done(struct orinoco_private *priv, bool abort)
 {
 	if (priv->scan_request) {
-		cfg80211_scan_done(priv->scan_request, abort);
+		struct cfg80211_scan_info info = {
+			.aborted = abort,
+		};
+
+		cfg80211_scan_done(priv->scan_request, &info);
 		priv->scan_request = NULL;
 	}
 }
diff --git a/drivers/net/wireless/marvell/libertas/cfg.c b/drivers/net/wireless/marvell/libertas/cfg.c
index 776b44bfd93a..ea4802446618 100644
--- a/drivers/net/wireless/marvell/libertas/cfg.c
+++ b/drivers/net/wireless/marvell/libertas/cfg.c
@@ -796,10 +796,15 @@ void lbs_scan_done(struct lbs_private *priv)
 {
 	WARN_ON(!priv->scan_req);
 
-	if (priv->internal_scan)
+	if (priv->internal_scan) {
 		kfree(priv->scan_req);
-	else
-		cfg80211_scan_done(priv->scan_req, false);
+	} else {
+		struct cfg80211_scan_info info = {
+			.aborted = false,
+		};
+
+		cfg80211_scan_done(priv->scan_req, &info);
+	}
 
 	priv->scan_req = NULL;
 }
diff --git a/drivers/net/wireless/marvell/mwifiex/cmdevt.c b/drivers/net/wireless/marvell/mwifiex/cmdevt.c
index 6bc2011d8609..e7a21443647e 100644
--- a/drivers/net/wireless/marvell/mwifiex/cmdevt.c
+++ b/drivers/net/wireless/marvell/mwifiex/cmdevt.c
@@ -1057,8 +1057,12 @@ mwifiex_cancel_all_pending_cmd(struct mwifiex_adapter *adapter)
 			if (!priv)
 				continue;
 			if (priv->scan_request) {
+				struct cfg80211_scan_info info = {
+					.aborted = true,
+				};
+
 				mwifiex_dbg(adapter, WARN, "info: aborting scan\n");
-				cfg80211_scan_done(priv->scan_request, 1);
+				cfg80211_scan_done(priv->scan_request, &info);
 				priv->scan_request = NULL;
 			}
 		}
@@ -1112,8 +1116,12 @@ mwifiex_cancel_pending_ioctl(struct mwifiex_adapter *adapter)
 			if (!priv)
 				continue;
 			if (priv->scan_request) {
+				struct cfg80211_scan_info info = {
+					.aborted = true,
+				};
+
 				mwifiex_dbg(adapter, WARN, "info: aborting scan\n");
-				cfg80211_scan_done(priv->scan_request, 1);
+				cfg80211_scan_done(priv->scan_request, &info);
 				priv->scan_request = NULL;
 			}
 		}
diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c
index 0e280f879b58..db4925db39aa 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.c
+++ b/drivers/net/wireless/marvell/mwifiex/main.c
@@ -697,9 +697,13 @@ mwifiex_close(struct net_device *dev)
 	struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev);
 
 	if (priv->scan_request) {
+		struct cfg80211_scan_info info = {
+			.aborted = true,
+		};
+
 		mwifiex_dbg(priv->adapter, INFO,
 			    "aborting scan on ndo_stop\n");
-		cfg80211_scan_done(priv->scan_request, 1);
+		cfg80211_scan_done(priv->scan_request, &info);
 		priv->scan_request = NULL;
 		priv->scan_aborting = true;
 	}
diff --git a/drivers/net/wireless/marvell/mwifiex/scan.c b/drivers/net/wireless/marvell/mwifiex/scan.c
index bc5e52cebce1..fdd749110fcb 100644
--- a/drivers/net/wireless/marvell/mwifiex/scan.c
+++ b/drivers/net/wireless/marvell/mwifiex/scan.c
@@ -1956,9 +1956,13 @@ static void mwifiex_check_next_scan_command(struct mwifiex_private *priv)
 			mwifiex_complete_scan(priv);
 
 		if (priv->scan_request) {
+			struct cfg80211_scan_info info = {
+				.aborted = false,
+			};
+
 			mwifiex_dbg(adapter, INFO,
 				    "info: notifying scan done\n");
-			cfg80211_scan_done(priv->scan_request, 0);
+			cfg80211_scan_done(priv->scan_request, &info);
 			priv->scan_request = NULL;
 		} else {
 			priv->scan_aborting = false;
@@ -1977,9 +1981,13 @@ static void mwifiex_check_next_scan_command(struct mwifiex_private *priv)
 
 		if (!adapter->active_scan_triggered) {
 			if (priv->scan_request) {
+				struct cfg80211_scan_info info = {
+					.aborted = true,
+				};
+
 				mwifiex_dbg(adapter, INFO,
 					    "info: aborting scan\n");
-				cfg80211_scan_done(priv->scan_request, 1);
+				cfg80211_scan_done(priv->scan_request, &info);
 				priv->scan_request = NULL;
 			} else {
 				priv->scan_aborting = false;
diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c
index 569918c485b4..603c90470225 100644
--- a/drivers/net/wireless/rndis_wlan.c
+++ b/drivers/net/wireless/rndis_wlan.c
@@ -2134,6 +2134,7 @@ static void rndis_get_scan_results(struct work_struct *work)
 	struct rndis_wlan_private *priv =
 		container_of(work, struct rndis_wlan_private, scan_work.work);
 	struct usbnet *usbdev = priv->usbdev;
+	struct cfg80211_scan_info info = {};
 	int ret;
 
 	netdev_dbg(usbdev->net, "get_scan_results\n");
@@ -2143,7 +2144,8 @@ static void rndis_get_scan_results(struct work_struct *work)
 
 	ret = rndis_check_bssid_list(usbdev, NULL, NULL);
 
-	cfg80211_scan_done(priv->scan_request, ret < 0);
+	info.aborted = ret < 0;
+	cfg80211_scan_done(priv->scan_request, &info);
 
 	priv->scan_request = NULL;
 }
@@ -3574,7 +3576,11 @@ static int rndis_wlan_stop(struct usbnet *usbdev)
 	flush_workqueue(priv->workqueue);
 
 	if (priv->scan_request) {
-		cfg80211_scan_done(priv->scan_request, true);
+		struct cfg80211_scan_info info = {
+			.aborted = true,
+		};
+
+		cfg80211_scan_done(priv->scan_request, &info);
 		priv->scan_request = NULL;
 	}
 
diff --git a/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c
index 0da559d929bc..d0ba3778990e 100644
--- a/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c
+++ b/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c
@@ -1256,10 +1256,15 @@ void rtw_cfg80211_indicate_scan_done(struct rtw_wdev_priv *pwdev_priv,
 		DBG_8723A("%s with scan req\n", __func__);
 
 		if (pwdev_priv->scan_request->wiphy !=
-		    pwdev_priv->rtw_wdev->wiphy)
+		    pwdev_priv->rtw_wdev->wiphy) {
 			DBG_8723A("error wiphy compare\n");
-		else
-			cfg80211_scan_done(pwdev_priv->scan_request, aborted);
+		} else {
+			struct cfg80211_scan_info info = {
+				.aborted = aborted,
+			};
+
+			cfg80211_scan_done(pwdev_priv->scan_request, &info);
+		}
 
 		pwdev_priv->scan_request = NULL;
 	} else {
diff --git a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
index 51aff4ff7d7c..a0d8e22e575b 100644
--- a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
+++ b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
@@ -454,7 +454,11 @@ static void CfgScanResult(enum scan_event scan_event,
 			mutex_lock(&priv->scan_req_lock);
 
 			if (priv->pstrScanReq) {
-				cfg80211_scan_done(priv->pstrScanReq, false);
+				struct cfg80211_scan_info info = {
+					.aborted = false,
+				};
+
+				cfg80211_scan_done(priv->pstrScanReq, &info);
 				priv->u32RcvdChCount = 0;
 				priv->bCfgScanning = false;
 				priv->pstrScanReq = NULL;
@@ -464,10 +468,14 @@ static void CfgScanResult(enum scan_event scan_event,
 			mutex_lock(&priv->scan_req_lock);
 
 			if (priv->pstrScanReq) {
+				struct cfg80211_scan_info info = {
+					.aborted = false,
+				};
+
 				update_scan_time();
 				refresh_scan(priv, 1, false);
 
-				cfg80211_scan_done(priv->pstrScanReq, false);
+				cfg80211_scan_done(priv->pstrScanReq, &info);
 				priv->bCfgScanning = false;
 				priv->pstrScanReq = NULL;
 			}
diff --git a/drivers/staging/wlan-ng/cfg80211.c b/drivers/staging/wlan-ng/cfg80211.c
index a6e6fb9f42e1..f46dfe6b24e8 100644
--- a/drivers/staging/wlan-ng/cfg80211.c
+++ b/drivers/staging/wlan-ng/cfg80211.c
@@ -338,6 +338,8 @@ static int prism2_scan(struct wiphy *wiphy,
 	struct p80211msg_dot11req_scan msg1;
 	struct p80211msg_dot11req_scan_results msg2;
 	struct cfg80211_bss *bss;
+	struct cfg80211_scan_info info = {};
+
 	int result;
 	int err = 0;
 	int numbss = 0;
@@ -440,7 +442,8 @@ static int prism2_scan(struct wiphy *wiphy,
 		err = prism2_result2err(msg2.resultcode.data);
 
 exit:
-	cfg80211_scan_done(request, err ? 1 : 0);
+	info.aborted = !!(err);
+	cfg80211_scan_done(request, &info);
 	priv->scan_request = NULL;
 	return err;
 }
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index fa4f0f793817..e2658e392a1f 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1423,6 +1423,21 @@ struct cfg80211_ssid {
 	u8 ssid_len;
 };
 
+/**
+ * struct cfg80211_scan_info - information about completed scan
+ * @scan_start_tsf: scan start time in terms of the TSF of the BSS that the
+ *	wireless device that requested the scan is connected to. If this
+ *	information is not available, this field is left zero.
+ * @tsf_bssid: the BSSID according to which %scan_start_tsf is set.
+ * @aborted: set to true if the scan was aborted for any reason,
+ *	userspace will be notified of that
+ */
+struct cfg80211_scan_info {
+	u64 scan_start_tsf;
+	u8 tsf_bssid[ETH_ALEN] __aligned(2);
+	bool aborted;
+};
+
 /**
  * struct cfg80211_scan_request - scan request description
  *
@@ -1433,12 +1448,17 @@ struct cfg80211_ssid {
  * @scan_width: channel width for scanning
  * @ie: optional information element(s) to add into Probe Request or %NULL
  * @ie_len: length of ie in octets
+ * @duration: how long to listen on each channel, in TUs. If
+ *	%duration_mandatory is not set, this is the maximum dwell time and
+ *	the actual dwell time may be shorter.
+ * @duration_mandatory: if set, the scan duration must be as specified by the
+ *	%duration field.
  * @flags: bit field of flags controlling operation
  * @rates: bitmap of rates to advertise for each band
  * @wiphy: the wiphy this was for
  * @scan_start: time (in jiffies) when the scan started
  * @wdev: the wireless device to scan for
- * @aborted: (internal) scan request was notified as aborted
+ * @info: (internal) information about completed scan
  * @notified: (internal) scan request was notified as done or aborted
  * @no_cck: used to send probe requests at non CCK rate in 2GHz band
  * @mac_addr: MAC address used with randomisation
@@ -1454,6 +1474,8 @@ struct cfg80211_scan_request {
 	enum nl80211_bss_scan_width scan_width;
 	const u8 *ie;
 	size_t ie_len;
+	u16 duration;
+	bool duration_mandatory;
 	u32 flags;
 
 	u32 rates[NUM_NL80211_BANDS];
@@ -1467,7 +1489,8 @@ struct cfg80211_scan_request {
 	/* internal */
 	struct wiphy *wiphy;
 	unsigned long scan_start;
-	bool aborted, notified;
+	struct cfg80211_scan_info info;
+	bool notified;
 	bool no_cck;
 
 	/* keep last */
@@ -1600,12 +1623,19 @@ enum cfg80211_signal_type {
  *	buffered on the device) and be accurate to about 10ms.
  *	If the frame isn't buffered, just passing the return value of
  *	ktime_get_boot_ns() is likely appropriate.
+ * @parent_tsf: the time at the start of reception of the first octet of the
+ *	timestamp field of the frame. The time is the TSF of the BSS specified
+ *	by %parent_bssid.
+ * @parent_bssid: the BSS according to which %parent_tsf is set. This is set to
+ *	the BSS that requested the scan in which the beacon/probe was received.
  */
 struct cfg80211_inform_bss {
 	struct ieee80211_channel *chan;
 	enum nl80211_bss_scan_width scan_width;
 	s32 signal;
 	u64 boottime_ns;
+	u64 parent_tsf;
+	u8 parent_bssid[ETH_ALEN] __aligned(2);
 };
 
 /**
@@ -4067,10 +4097,10 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator);
  * cfg80211_scan_done - notify that scan finished
  *
  * @request: the corresponding scan request
- * @aborted: set to true if the scan was aborted for any reason,
- *	userspace will be notified of that
+ * @info: information about the completed scan
  */
-void cfg80211_scan_done(struct cfg80211_scan_request *request, bool aborted);
+void cfg80211_scan_done(struct cfg80211_scan_request *request,
+			struct cfg80211_scan_info *info);
 
 /**
  * cfg80211_sched_scan_results - notify that new scan results are available
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 1d7da7888dcf..b39ccab45333 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1848,6 +1848,22 @@ enum nl80211_commands {
  *	to turn that feature off set an invalid mac address
  *	(e.g. FF:FF:FF:FF:FF:FF)
  *
+ * @NL80211_ATTR_SCAN_START_TIME_TSF: The time at which the scan was actually
+ *	started (u64). The time is the TSF of the BSS the interface that
+ *	requested the scan is connected to (if available, otherwise this
+ *	attribute must not be included).
+ * @NL80211_ATTR_SCAN_START_TIME_TSF_BSSID: The BSS according to which
+ *	%NL80211_ATTR_SCAN_START_TIME_TSF is set.
+ * @NL80211_ATTR_MEASUREMENT_DURATION: measurement duration in TUs (u16). If
+ *	%NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY is not set, this is the
+ *	maximum measurement duration allowed. This attribute is used with
+ *	measurement requests. It can also be used with %NL80211_CMD_TRIGGER_SCAN
+ *	if the scan is used for beacon report radio measurement.
+ * @NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY: flag attribute that indicates
+ *	that the duration specified with %NL80211_ATTR_MEASUREMENT_DURATION is
+ *	mandatory. If this flag is not set, the duration is the maximum duration
+ *	and the actual measurement duration may be shorter.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2235,6 +2251,11 @@ enum nl80211_attrs {
 	NL80211_ATTR_MU_MIMO_GROUP_DATA,
 	NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR,
 
+	NL80211_ATTR_SCAN_START_TIME_TSF,
+	NL80211_ATTR_SCAN_START_TIME_TSF_BSSID,
+	NL80211_ATTR_MEASUREMENT_DURATION,
+	NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -3496,6 +3517,12 @@ enum nl80211_bss_scan_width {
  *	was last updated by a received frame. The value is expected to be
  *	accurate to about 10ms. (u64, nanoseconds)
  * @NL80211_BSS_PAD: attribute used for padding for 64-bit alignment
+ * @NL80211_BSS_PARENT_TSF: the time at the start of reception of the first
+ *	octet of the timestamp field of the last beacon/probe received for
+ *	this BSS. The time is the TSF of the BSS specified by
+ *	@NL80211_BSS_PARENT_BSSID. (u64).
+ * @NL80211_BSS_PARENT_BSSID: the BSS according to which @NL80211_BSS_PARENT_TSF
+ *	is set.
  * @__NL80211_BSS_AFTER_LAST: internal
  * @NL80211_BSS_MAX: highest BSS attribute
  */
@@ -3517,6 +3544,8 @@ enum nl80211_bss {
 	NL80211_BSS_PRESP_DATA,
 	NL80211_BSS_LAST_SEEN_BOOTTIME,
 	NL80211_BSS_PAD,
+	NL80211_BSS_PARENT_TSF,
+	NL80211_BSS_PARENT_BSSID,
 
 	/* keep last */
 	__NL80211_BSS_AFTER_LAST,
@@ -4507,6 +4536,16 @@ enum nl80211_feature_flags {
  *	%NL80211_ATTR_MU_MIMO_GROUP_DATA attribute,
  *	or can be configured to follow a station by configuring the
  *	%NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR attribute.
+ * @NL80211_EXT_FEATURE_SCAN_START_TIME: This driver includes the actual
+ *	time the scan started in scan results event. The time is the TSF of
+ *	the BSS that the interface that requested the scan is connected to
+ *	(if available).
+ * @NL80211_EXT_FEATURE_BSS_PARENT_TSF: Per BSS, this driver reports the
+ *	time the last beacon/probe was received. The time is the TSF of the
+ *	BSS that the interface that requested the scan is connected to
+ *	(if available).
+ * @NL80211_EXT_FEATURE_SET_SCAN_DWELL: This driver supports configuration of
+ *	channel dwell time.
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
@@ -4515,6 +4554,9 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_VHT_IBSS,
 	NL80211_EXT_FEATURE_RRM,
 	NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER,
+	NL80211_EXT_FEATURE_SCAN_START_TIME,
+	NL80211_EXT_FEATURE_BSS_PARENT_TSF,
+	NL80211_EXT_FEATURE_SET_SCAN_DWELL,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index f9648ef9e31f..4ec1c52a1549 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -353,8 +353,13 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
 	scan_req = rcu_dereference_protected(local->scan_req,
 					     lockdep_is_held(&local->mtx));
 
-	if (scan_req != local->int_scan_req)
-		cfg80211_scan_done(scan_req, aborted);
+	if (scan_req != local->int_scan_req) {
+		struct cfg80211_scan_info info = {
+			.aborted = aborted,
+		};
+
+		cfg80211_scan_done(scan_req, &info);
+	}
 	RCU_INIT_POINTER(local->scan_req, NULL);
 
 	scan_sdata = rcu_dereference_protected(local->scan_sdata,
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 39d9abd309ea..7645e97362c0 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -220,7 +220,7 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
 
 	if (rdev->scan_req && rdev->scan_req->wdev == wdev) {
 		if (WARN_ON(!rdev->scan_req->notified))
-			rdev->scan_req->aborted = true;
+			rdev->scan_req->info.aborted = true;
 		___cfg80211_scan_done(rdev, false);
 	}
 }
@@ -1087,7 +1087,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
 		cfg80211_update_iface_num(rdev, wdev->iftype, -1);
 		if (rdev->scan_req && rdev->scan_req->wdev == wdev) {
 			if (WARN_ON(!rdev->scan_req->notified))
-				rdev->scan_req->aborted = true;
+				rdev->scan_req->info.aborted = true;
 			___cfg80211_scan_done(rdev, false);
 		}
 
diff --git a/net/wireless/core.h b/net/wireless/core.h
index a4d547f99f8d..eee91443924d 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -141,6 +141,18 @@ struct cfg80211_internal_bss {
 	unsigned long refcount;
 	atomic_t hold;
 
+	/* time at the start of the reception of the first octet of the
+	 * timestamp field of the last beacon/probe received for this BSS.
+	 * The time is the TSF of the BSS specified by %parent_bssid.
+	 */
+	u64 parent_tsf;
+
+	/* the BSS according to which %parent_tsf is set. This is set to
+	 * the BSS that the interface that requested the scan was connected to
+	 * when the beacon/probe was received.
+	 */
+	u8 parent_bssid[ETH_ALEN] __aligned(2);
+
 	/* must be last because of priv member */
 	struct cfg80211_bss pub;
 };
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 447026f8cc76..c53b5462ed00 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6223,6 +6223,19 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 		}
 	}
 
+	if (info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]) {
+		if (!wiphy_ext_feature_isset(wiphy,
+					NL80211_EXT_FEATURE_SET_SCAN_DWELL)) {
+			err = -EOPNOTSUPP;
+			goto out_free;
+		}
+
+		request->duration =
+			nla_get_u16(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]);
+		request->duration_mandatory =
+			nla_get_flag(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY]);
+	}
+
 	if (info->attrs[NL80211_ATTR_SCAN_FLAGS]) {
 		request->flags = nla_get_u32(
 			info->attrs[NL80211_ATTR_SCAN_FLAGS]);
@@ -7056,6 +7069,13 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
 			jiffies_to_msecs(jiffies - intbss->ts)))
 		goto nla_put_failure;
 
+	if (intbss->parent_tsf &&
+	    (nla_put_u64_64bit(msg, NL80211_BSS_PARENT_TSF,
+			       intbss->parent_tsf, NL80211_BSS_PAD) ||
+	     nla_put(msg, NL80211_BSS_PARENT_BSSID, ETH_ALEN,
+		     intbss->parent_bssid)))
+		goto nla_put_failure;
+
 	if (intbss->ts_boottime &&
 	    nla_put_u64_64bit(msg, NL80211_BSS_LAST_SEEN_BOOTTIME,
 			      intbss->ts_boottime, NL80211_BSS_PAD))
@@ -11829,6 +11849,13 @@ static int nl80211_add_scan_req(struct sk_buff *msg,
 	    nla_put_u32(msg, NL80211_ATTR_SCAN_FLAGS, req->flags))
 		goto nla_put_failure;
 
+	if (req->info.scan_start_tsf &&
+	    (nla_put_u64_64bit(msg, NL80211_ATTR_SCAN_START_TIME_TSF,
+			       req->info.scan_start_tsf, NL80211_BSS_PAD) ||
+	     nla_put(msg, NL80211_ATTR_SCAN_START_TIME_TSF_BSSID, ETH_ALEN,
+		     req->info.tsf_bssid)))
+		goto nla_put_failure;
+
 	return 0;
  nla_put_failure:
 	return -ENOBUFS;
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index ef2955c89a00..0358e12be54b 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -3,6 +3,7 @@
  *
  * Copyright 2008 Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
+ * Copyright 2016	Intel Deutschland GmbH
  */
 #include <linux/kernel.h>
 #include <linux/slab.h>
@@ -194,7 +195,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
 	if (wdev->netdev)
 		cfg80211_sme_scan_done(wdev->netdev);
 
-	if (!request->aborted &&
+	if (!request->info.aborted &&
 	    request->flags & NL80211_SCAN_FLAG_FLUSH) {
 		/* flush entries from previous scans */
 		spin_lock_bh(&rdev->bss_lock);
@@ -202,10 +203,10 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
 		spin_unlock_bh(&rdev->bss_lock);
 	}
 
-	msg = nl80211_build_scan_msg(rdev, wdev, request->aborted);
+	msg = nl80211_build_scan_msg(rdev, wdev, request->info.aborted);
 
 #ifdef CONFIG_CFG80211_WEXT
-	if (wdev->netdev && !request->aborted) {
+	if (wdev->netdev && !request->info.aborted) {
 		memset(&wrqu, 0, sizeof(wrqu));
 
 		wireless_send_event(wdev->netdev, SIOCGIWSCAN, &wrqu, NULL);
@@ -236,12 +237,13 @@ void __cfg80211_scan_done(struct work_struct *wk)
 	rtnl_unlock();
 }
 
-void cfg80211_scan_done(struct cfg80211_scan_request *request, bool aborted)
+void cfg80211_scan_done(struct cfg80211_scan_request *request,
+			struct cfg80211_scan_info *info)
 {
-	trace_cfg80211_scan_done(request, aborted);
+	trace_cfg80211_scan_done(request, info);
 	WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req);
 
-	request->aborted = aborted;
+	request->info = *info;
 	request->notified = true;
 	queue_work(cfg80211_wq, &wiphy_to_rdev(request->wiphy)->scan_done_wk);
 }
@@ -843,6 +845,8 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev,
 		found->pub.capability = tmp->pub.capability;
 		found->ts = tmp->ts;
 		found->ts_boottime = tmp->ts_boottime;
+		found->parent_tsf = tmp->parent_tsf;
+		ether_addr_copy(found->parent_bssid, tmp->parent_bssid);
 	} else {
 		struct cfg80211_internal_bss *new;
 		struct cfg80211_internal_bss *hidden;
@@ -1086,6 +1090,8 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
 	tmp.pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int);
 	tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info);
 	tmp.ts_boottime = data->boottime_ns;
+	tmp.parent_tsf = data->parent_tsf;
+	ether_addr_copy(tmp.parent_bssid, data->parent_bssid);
 
 	signal_valid = abs(data->chan->center_freq - channel->center_freq) <=
 		wiphy->max_adj_channel_rssi_comp;
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 3c1091ae6c36..72b5255cefe2 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2642,8 +2642,9 @@ TRACE_EVENT(cfg80211_tdls_oper_request,
 	);
 
 TRACE_EVENT(cfg80211_scan_done,
-	TP_PROTO(struct cfg80211_scan_request *request, bool aborted),
-	TP_ARGS(request, aborted),
+	TP_PROTO(struct cfg80211_scan_request *request,
+		 struct cfg80211_scan_info *info),
+	TP_ARGS(request, info),
 	TP_STRUCT__entry(
 		__field(u32, n_channels)
 		__dynamic_array(u8, ie, request ? request->ie_len : 0)
@@ -2652,6 +2653,8 @@ TRACE_EVENT(cfg80211_scan_done,
 		MAC_ENTRY(wiphy_mac)
 		__field(bool, no_cck)
 		__field(bool, aborted)
+		__field(u64, scan_start_tsf)
+		MAC_ENTRY(tsf_bssid)
 	),
 	TP_fast_assign(
 		if (request) {
@@ -2666,9 +2669,16 @@ TRACE_EVENT(cfg80211_scan_done,
 					   request->wiphy->perm_addr);
 			__entry->no_cck = request->no_cck;
 		}
-		__entry->aborted = aborted;
+		if (info) {
+			__entry->aborted = info->aborted;
+			__entry->scan_start_tsf = info->scan_start_tsf;
+			MAC_ASSIGN(tsf_bssid, info->tsf_bssid);
+		}
 	),
-	TP_printk("aborted: %s", BOOL_TO_STR(__entry->aborted))
+	TP_printk("aborted: %s, scan start (TSF): %llu, tsf_bssid: " MAC_PR_FMT,
+		  BOOL_TO_STR(__entry->aborted),
+		  (unsigned long long)__entry->scan_start_tsf,
+		  MAC_PR_ARG(tsf_bssid))
 );
 
 DEFINE_EVENT(wiphy_only_evt, cfg80211_sched_scan_results,
@@ -2721,6 +2731,8 @@ TRACE_EVENT(cfg80211_inform_bss_frame,
 		__dynamic_array(u8, mgmt, len)
 		__field(s32, signal)
 		__field(u64, ts_boottime)
+		__field(u64, parent_tsf)
+		MAC_ENTRY(parent_bssid)
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
@@ -2730,10 +2742,15 @@ TRACE_EVENT(cfg80211_inform_bss_frame,
 			memcpy(__get_dynamic_array(mgmt), mgmt, len);
 		__entry->signal = data->signal;
 		__entry->ts_boottime = data->boottime_ns;
-	),
-	TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "(scan_width: %d) signal: %d, tsb:%llu",
-		  WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width,
-		  __entry->signal, (unsigned long long)__entry->ts_boottime)
+		__entry->parent_tsf = data->parent_tsf;
+		MAC_ASSIGN(parent_bssid, data->parent_bssid);
+	),
+	TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT
+		  "(scan_width: %d) signal: %d, tsb:%llu, detect_tsf:%llu, tsf_bssid: "
+		  MAC_PR_FMT, WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width,
+		  __entry->signal, (unsigned long long)__entry->ts_boottime,
+		  (unsigned long long)__entry->parent_tsf,
+		  MAC_PR_ARG(parent_bssid))
 );
 
 DECLARE_EVENT_CLASS(cfg80211_bss_evt,
-- 
cgit v1.2.3


From 7947d3e075cde1a18e538f2dafbc850aa356ff79 Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Tue, 5 Jul 2016 15:23:12 +0300
Subject: mac80211: Add support for beacon report radio measurement

Add the following to support beacon report radio measurement
with the measurement mode field set to passive or active:
1. Propagate the required scan duration to the device
2. Report the scan start time (in terms of TSF)
3. Report each BSS's detection time (also in terms of TSF)

TSF times refer to the BSS that the interface that requested the
scan is connected to.

Signed-off-by: Assaf Krauss <assaf.krauss@intel.com>
Signed-off-by: Avraham Stern <avraham.stern@intel.com>
[changed ath9k/10k, at76c59x-usb, iwlegacy, wl1251 and wlcore to match
the new API]
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath10k/mac.c         | 14 +++++----
 drivers/net/wireless/ath/ath9k/channel.c      |  5 +++-
 drivers/net/wireless/atmel/at76c50x-usb.c     |  5 +++-
 drivers/net/wireless/intel/iwlegacy/common.c  |  6 +++-
 drivers/net/wireless/intel/iwlwifi/dvm/scan.c |  6 +++-
 drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 37 ++++++++++++++++++-----
 drivers/net/wireless/mac80211_hwsim.c         | 11 +++++--
 drivers/net/wireless/st/cw1200/scan.c         |  6 +++-
 drivers/net/wireless/ti/wl1251/event.c        |  6 +++-
 drivers/net/wireless/ti/wl1251/main.c         |  6 +++-
 drivers/net/wireless/ti/wlcore/main.c         | 11 +++++--
 drivers/net/wireless/ti/wlcore/scan.c         |  5 +++-
 include/net/mac80211.h                        |  5 ++--
 net/mac80211/ieee80211_i.h                    |  1 +
 net/mac80211/scan.c                           | 42 ++++++++++++++++++++++-----
 15 files changed, 131 insertions(+), 35 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index d4b7a168f7c0..ebc12c521fe0 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -3858,12 +3858,16 @@ void __ath10k_scan_finish(struct ath10k *ar)
 		break;
 	case ATH10K_SCAN_RUNNING:
 	case ATH10K_SCAN_ABORTING:
-		if (!ar->scan.is_roc)
-			ieee80211_scan_completed(ar->hw,
-						 (ar->scan.state ==
-						  ATH10K_SCAN_ABORTING));
-		else if (ar->scan.roc_notify)
+		if (!ar->scan.is_roc) {
+			struct cfg80211_scan_info info = {
+				.aborted = (ar->scan.state ==
+					    ATH10K_SCAN_ABORTING),
+			};
+
+			ieee80211_scan_completed(ar->hw, &info);
+		} else if (ar->scan.roc_notify) {
 			ieee80211_remain_on_channel_expired(ar->hw);
+		}
 		/* fall through */
 	case ATH10K_SCAN_STARTING:
 		ar->scan.state = ATH10K_SCAN_IDLE;
diff --git a/drivers/net/wireless/ath/ath9k/channel.c b/drivers/net/wireless/ath/ath9k/channel.c
index e56bafcf5864..57e26a640477 100644
--- a/drivers/net/wireless/ath/ath9k/channel.c
+++ b/drivers/net/wireless/ath/ath9k/channel.c
@@ -960,6 +960,9 @@ void ath_roc_complete(struct ath_softc *sc, enum ath_roc_complete_reason reason)
 void ath_scan_complete(struct ath_softc *sc, bool abort)
 {
 	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
+	struct cfg80211_scan_info info = {
+		.aborted = abort,
+	};
 
 	if (abort)
 		ath_dbg(common, CHAN_CTX, "HW scan aborted\n");
@@ -969,7 +972,7 @@ void ath_scan_complete(struct ath_softc *sc, bool abort)
 	sc->offchannel.scan_req = NULL;
 	sc->offchannel.scan_vif = NULL;
 	sc->offchannel.state = ATH_OFFCHANNEL_IDLE;
-	ieee80211_scan_completed(sc->hw, abort);
+	ieee80211_scan_completed(sc->hw, &info);
 	clear_bit(ATH_OP_SCANNING, &common->op_flags);
 	spin_lock_bh(&sc->chan_lock);
 	if (test_bit(ATH_OP_MULTI_CHANNEL, &common->op_flags))
diff --git a/drivers/net/wireless/atmel/at76c50x-usb.c b/drivers/net/wireless/atmel/at76c50x-usb.c
index 7c108047fb46..0e180677c7fc 100644
--- a/drivers/net/wireless/atmel/at76c50x-usb.c
+++ b/drivers/net/wireless/atmel/at76c50x-usb.c
@@ -1922,6 +1922,9 @@ static void at76_dwork_hw_scan(struct work_struct *work)
 {
 	struct at76_priv *priv = container_of(work, struct at76_priv,
 					      dwork_hw_scan.work);
+	struct cfg80211_scan_info info = {
+		.aborted = false,
+	};
 	int ret;
 
 	if (priv->device_unplugged)
@@ -1948,7 +1951,7 @@ static void at76_dwork_hw_scan(struct work_struct *work)
 
 	mutex_unlock(&priv->mtx);
 
-	ieee80211_scan_completed(priv->hw, false);
+	ieee80211_scan_completed(priv->hw, &info);
 
 	ieee80211_wake_queues(priv->hw);
 }
diff --git a/drivers/net/wireless/intel/iwlegacy/common.c b/drivers/net/wireless/intel/iwlegacy/common.c
index eb24b9241bb2..140b6ea8f7cc 100644
--- a/drivers/net/wireless/intel/iwlegacy/common.c
+++ b/drivers/net/wireless/intel/iwlegacy/common.c
@@ -1305,10 +1305,14 @@ il_send_scan_abort(struct il_priv *il)
 static void
 il_complete_scan(struct il_priv *il, bool aborted)
 {
+	struct cfg80211_scan_info info = {
+		.aborted = aborted,
+	};
+
 	/* check if scan was requested from mac80211 */
 	if (il->scan_request) {
 		D_SCAN("Complete scan in mac80211\n");
-		ieee80211_scan_completed(il->hw, aborted);
+		ieee80211_scan_completed(il->hw, &info);
 	}
 
 	il->scan_vif = NULL;
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/scan.c b/drivers/net/wireless/intel/iwlwifi/dvm/scan.c
index d01766f16175..17e6a32384d3 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/scan.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/scan.c
@@ -94,10 +94,14 @@ static int iwl_send_scan_abort(struct iwl_priv *priv)
 
 static void iwl_complete_scan(struct iwl_priv *priv, bool aborted)
 {
+	struct cfg80211_scan_info info = {
+		.aborted = aborted,
+	};
+
 	/* check if scan was requested from mac80211 */
 	if (priv->scan_request) {
 		IWL_DEBUG_SCAN(priv, "Complete scan in mac80211\n");
-		ieee80211_scan_completed(priv->hw, aborted);
+		ieee80211_scan_completed(priv->hw, &info);
 	}
 
 	priv->scan_type = IWL_SCAN_NORMAL;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
index e78fc567ff7d..1cac10c5d818 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
@@ -391,13 +391,16 @@ void iwl_mvm_rx_lmac_scan_complete_notif(struct iwl_mvm *mvm,
 		ieee80211_sched_scan_stopped(mvm->hw);
 		mvm->sched_scan_pass_all = SCHED_SCAN_PASS_ALL_DISABLED;
 	} else if (mvm->scan_status & IWL_MVM_SCAN_REGULAR) {
+		struct cfg80211_scan_info info = {
+			.aborted = aborted,
+		};
+
 		IWL_DEBUG_SCAN(mvm, "Regular scan %s, EBS status %s (FW)\n",
 			       aborted ? "aborted" : "completed",
 			       iwl_mvm_ebs_status_str(scan_notif->ebs_status));
 
 		mvm->scan_status &= ~IWL_MVM_SCAN_REGULAR;
-		ieee80211_scan_completed(mvm->hw,
-				scan_notif->status == IWL_SCAN_OFFLOAD_ABORTED);
+		ieee80211_scan_completed(mvm->hw, &info);
 		iwl_mvm_unref(mvm, IWL_MVM_REF_SCAN);
 		del_timer(&mvm->scan_timer);
 	} else {
@@ -1430,7 +1433,11 @@ void iwl_mvm_rx_umac_scan_complete_notif(struct iwl_mvm *mvm,
 
 	/* if the scan is already stopping, we don't need to notify mac80211 */
 	if (mvm->scan_uid_status[uid] == IWL_MVM_SCAN_REGULAR) {
-		ieee80211_scan_completed(mvm->hw, aborted);
+		struct cfg80211_scan_info info = {
+			.aborted = aborted,
+		};
+
+		ieee80211_scan_completed(mvm->hw, &info);
 		iwl_mvm_unref(mvm, IWL_MVM_REF_SCAN);
 		del_timer(&mvm->scan_timer);
 	} else if (mvm->scan_uid_status[uid] == IWL_MVM_SCAN_SCHED) {
@@ -1564,7 +1571,11 @@ void iwl_mvm_report_scan_aborted(struct iwl_mvm *mvm)
 
 		uid = iwl_mvm_scan_uid_by_status(mvm, IWL_MVM_SCAN_REGULAR);
 		if (uid >= 0) {
-			ieee80211_scan_completed(mvm->hw, true);
+			struct cfg80211_scan_info info = {
+				.aborted = true,
+			};
+
+			ieee80211_scan_completed(mvm->hw, &info);
 			mvm->scan_uid_status[uid] = 0;
 		}
 		uid = iwl_mvm_scan_uid_by_status(mvm, IWL_MVM_SCAN_SCHED);
@@ -1585,8 +1596,13 @@ void iwl_mvm_report_scan_aborted(struct iwl_mvm *mvm)
 				mvm->scan_uid_status[i] = 0;
 		}
 	} else {
-		if (mvm->scan_status & IWL_MVM_SCAN_REGULAR)
-			ieee80211_scan_completed(mvm->hw, true);
+		if (mvm->scan_status & IWL_MVM_SCAN_REGULAR) {
+			struct cfg80211_scan_info info = {
+				.aborted = true,
+			};
+
+			ieee80211_scan_completed(mvm->hw, &info);
+		}
 
 		/* Sched scan will be restarted by mac80211 in
 		 * restart_hw, so do not report if FW is about to be
@@ -1629,8 +1645,13 @@ out:
 		 */
 		iwl_mvm_unref(mvm, IWL_MVM_REF_SCAN);
 		del_timer(&mvm->scan_timer);
-		if (notify)
-			ieee80211_scan_completed(mvm->hw, true);
+		if (notify) {
+			struct cfg80211_scan_info info = {
+				.aborted = true,
+			};
+
+			ieee80211_scan_completed(mvm->hw, &info);
+		}
 	} else if (notify) {
 		ieee80211_sched_scan_stopped(mvm->hw);
 		mvm->sched_scan_pass_all = SCHED_SCAN_PASS_ALL_DISABLED;
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 337794f9506e..8c35ac838fce 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -1941,8 +1941,12 @@ static void hw_scan_work(struct work_struct *work)
 
 	mutex_lock(&hwsim->mutex);
 	if (hwsim->scan_chan_idx >= req->n_channels) {
+		struct cfg80211_scan_info info = {
+			.aborted = false,
+		};
+
 		wiphy_debug(hwsim->hw->wiphy, "hw scan complete\n");
-		ieee80211_scan_completed(hwsim->hw, false);
+		ieee80211_scan_completed(hwsim->hw, &info);
 		hwsim->hw_scan_request = NULL;
 		hwsim->hw_scan_vif = NULL;
 		hwsim->tmp_chan = NULL;
@@ -2027,13 +2031,16 @@ static void mac80211_hwsim_cancel_hw_scan(struct ieee80211_hw *hw,
 					  struct ieee80211_vif *vif)
 {
 	struct mac80211_hwsim_data *hwsim = hw->priv;
+	struct cfg80211_scan_info info = {
+		.aborted = true,
+	};
 
 	wiphy_debug(hw->wiphy, "hwsim cancel_hw_scan\n");
 
 	cancel_delayed_work_sync(&hwsim->hw_scan);
 
 	mutex_lock(&hwsim->mutex);
-	ieee80211_scan_completed(hwsim->hw, true);
+	ieee80211_scan_completed(hwsim->hw, &info);
 	hwsim->tmp_chan = NULL;
 	hwsim->hw_scan_request = NULL;
 	hwsim->hw_scan_vif = NULL;
diff --git a/drivers/net/wireless/st/cw1200/scan.c b/drivers/net/wireless/st/cw1200/scan.c
index 983788156bb0..0a0ff7e31f5b 100644
--- a/drivers/net/wireless/st/cw1200/scan.c
+++ b/drivers/net/wireless/st/cw1200/scan.c
@@ -167,6 +167,10 @@ void cw1200_scan_work(struct work_struct *work)
 	}
 
 	if (!priv->scan.req || (priv->scan.curr == priv->scan.end)) {
+		struct cfg80211_scan_info info = {
+			.aborted = priv->scan.status ? 1 : 0,
+		};
+
 		if (priv->scan.output_power != priv->output_power)
 			wsm_set_output_power(priv, priv->output_power * 10);
 		if (priv->join_status == CW1200_JOIN_STATUS_STA &&
@@ -188,7 +192,7 @@ void cw1200_scan_work(struct work_struct *work)
 		cw1200_scan_restart_delayed(priv);
 		wsm_unlock_tx(priv);
 		mutex_unlock(&priv->conf_mutex);
-		ieee80211_scan_completed(priv->hw, priv->scan.status ? 1 : 0);
+		ieee80211_scan_completed(priv->hw, &info);
 		up(&priv->scan.lock);
 		return;
 	} else {
diff --git a/drivers/net/wireless/ti/wl1251/event.c b/drivers/net/wireless/ti/wl1251/event.c
index c98630394a1a..d0593bc1f1a9 100644
--- a/drivers/net/wireless/ti/wl1251/event.c
+++ b/drivers/net/wireless/ti/wl1251/event.c
@@ -36,7 +36,11 @@ static int wl1251_event_scan_complete(struct wl1251 *wl,
 		     mbox->scheduled_scan_channels);
 
 	if (wl->scanning) {
-		ieee80211_scan_completed(wl->hw, false);
+		struct cfg80211_scan_info info = {
+			.aborted = false,
+		};
+
+		ieee80211_scan_completed(wl->hw, &info);
 		wl1251_debug(DEBUG_MAC80211, "mac80211 hw scan completed");
 		wl->scanning = false;
 		if (wl->hw->conf.flags & IEEE80211_CONF_IDLE)
diff --git a/drivers/net/wireless/ti/wl1251/main.c b/drivers/net/wireless/ti/wl1251/main.c
index 56384a4e2a35..bbf7604889b7 100644
--- a/drivers/net/wireless/ti/wl1251/main.c
+++ b/drivers/net/wireless/ti/wl1251/main.c
@@ -448,7 +448,11 @@ static void wl1251_op_stop(struct ieee80211_hw *hw)
 	WARN_ON(wl->state != WL1251_STATE_ON);
 
 	if (wl->scanning) {
-		ieee80211_scan_completed(wl->hw, true);
+		struct cfg80211_scan_info info = {
+			.aborted = true,
+		};
+
+		ieee80211_scan_completed(wl->hw, &info);
 		wl->scanning = false;
 	}
 
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index 10fd24c28ece..69267d592504 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -2615,6 +2615,10 @@ static void __wl1271_op_remove_interface(struct wl1271 *wl,
 
 	if (wl->scan.state != WL1271_SCAN_STATE_IDLE &&
 	    wl->scan_wlvif == wlvif) {
+		struct cfg80211_scan_info info = {
+			.aborted = true,
+		};
+
 		/*
 		 * Rearm the tx watchdog just before idling scan. This
 		 * prevents just-finished scans from triggering the watchdog
@@ -2625,7 +2629,7 @@ static void __wl1271_op_remove_interface(struct wl1271 *wl,
 		memset(wl->scan.scanned_ch, 0, sizeof(wl->scan.scanned_ch));
 		wl->scan_wlvif = NULL;
 		wl->scan.req = NULL;
-		ieee80211_scan_completed(wl->hw, true);
+		ieee80211_scan_completed(wl->hw, &info);
 	}
 
 	if (wl->sched_vif == wlvif)
@@ -3649,6 +3653,9 @@ static void wl1271_op_cancel_hw_scan(struct ieee80211_hw *hw,
 {
 	struct wl1271 *wl = hw->priv;
 	struct wl12xx_vif *wlvif = wl12xx_vif_to_data(vif);
+	struct cfg80211_scan_info info = {
+		.aborted = true,
+	};
 	int ret;
 
 	wl1271_debug(DEBUG_MAC80211, "mac80211 cancel hw scan");
@@ -3681,7 +3688,7 @@ static void wl1271_op_cancel_hw_scan(struct ieee80211_hw *hw,
 	memset(wl->scan.scanned_ch, 0, sizeof(wl->scan.scanned_ch));
 	wl->scan_wlvif = NULL;
 	wl->scan.req = NULL;
-	ieee80211_scan_completed(wl->hw, true);
+	ieee80211_scan_completed(wl->hw, &info);
 
 out_sleep:
 	wl1271_ps_elp_sleep(wl);
diff --git a/drivers/net/wireless/ti/wlcore/scan.c b/drivers/net/wireless/ti/wlcore/scan.c
index 23343643207a..5612f5916b4e 100644
--- a/drivers/net/wireless/ti/wlcore/scan.c
+++ b/drivers/net/wireless/ti/wlcore/scan.c
@@ -36,6 +36,9 @@ void wl1271_scan_complete_work(struct work_struct *work)
 	struct delayed_work *dwork;
 	struct wl1271 *wl;
 	struct wl12xx_vif *wlvif;
+	struct cfg80211_scan_info info = {
+		.aborted = false,
+	};
 	int ret;
 
 	dwork = to_delayed_work(work);
@@ -82,7 +85,7 @@ void wl1271_scan_complete_work(struct work_struct *work)
 
 	wlcore_cmd_regdomain_config_locked(wl);
 
-	ieee80211_scan_completed(wl->hw, false);
+	ieee80211_scan_completed(wl->hw, &info);
 
 out:
 	mutex_unlock(&wl->mutex);
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a52009ffc19f..b4faadbb4e01 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4697,9 +4697,10 @@ void ieee80211_wake_queues(struct ieee80211_hw *hw);
  * any context, including hardirq context.
  *
  * @hw: the hardware that finished the scan
- * @aborted: set to true if scan was aborted
+ * @info: information about the completed scan
  */
-void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted);
+void ieee80211_scan_completed(struct ieee80211_hw *hw,
+			      struct cfg80211_scan_info *info);
 
 /**
  * ieee80211_sched_scan_results - got results from scheduled scan
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 54edfb6fc1d1..f56d342c31b8 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1250,6 +1250,7 @@ struct ieee80211_local {
 	int scan_channel_idx;
 	int scan_ies_len;
 	int hw_scan_ies_bufsize;
+	struct cfg80211_scan_info scan_info;
 
 	struct work_struct sched_scan_stopped_work;
 	struct ieee80211_sub_if_data __rcu *sched_scan_sdata;
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 4ec1c52a1549..8d4a9cd8a39a 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -7,6 +7,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
  * Copyright 2013-2015  Intel Mobile Communications GmbH
+ * Copyright 2016  Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -70,6 +71,7 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
 		.boottime_ns = rx_status->boottime_ns,
 	};
 	bool signal_valid;
+	struct ieee80211_sub_if_data *scan_sdata;
 
 	if (ieee80211_hw_check(&local->hw, SIGNAL_DBM))
 		bss_meta.signal = rx_status->signal * 100;
@@ -83,6 +85,20 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
 		bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_10;
 
 	bss_meta.chan = channel;
+
+	rcu_read_lock();
+	scan_sdata = rcu_dereference(local->scan_sdata);
+	if (scan_sdata && scan_sdata->vif.type == NL80211_IFTYPE_STATION &&
+	    scan_sdata->vif.bss_conf.assoc &&
+	    ieee80211_have_rx_timestamp(rx_status)) {
+		bss_meta.parent_tsf =
+			ieee80211_calculate_rx_timestamp(local, rx_status,
+							 len + FCS_LEN, 24);
+		ether_addr_copy(bss_meta.parent_bssid,
+				scan_sdata->vif.bss_conf.bssid);
+	}
+	rcu_read_unlock();
+
 	cbss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta,
 					      mgmt, len, GFP_ATOMIC);
 	if (!cbss)
@@ -345,6 +361,11 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
 
 		if (rc == 0)
 			return;
+
+		/* HW scan failed and is going to be reported as done, so clear
+		 * old scan info.
+		 */
+		memset(&local->scan_info, 0, sizeof(local->scan_info));
 	}
 
 	kfree(local->hw_scan_req);
@@ -354,11 +375,8 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
 					     lockdep_is_held(&local->mtx));
 
 	if (scan_req != local->int_scan_req) {
-		struct cfg80211_scan_info info = {
-			.aborted = aborted,
-		};
-
-		cfg80211_scan_done(scan_req, &info);
+		local->scan_info.aborted = aborted;
+		cfg80211_scan_done(scan_req, &local->scan_info);
 	}
 	RCU_INIT_POINTER(local->scan_req, NULL);
 
@@ -396,15 +414,19 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
 		ieee80211_start_next_roc(local);
 }
 
-void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
+void ieee80211_scan_completed(struct ieee80211_hw *hw,
+			      struct cfg80211_scan_info *info)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 
-	trace_api_scan_completed(local, aborted);
+	trace_api_scan_completed(local, info);
 
 	set_bit(SCAN_COMPLETED, &local->scanning);
-	if (aborted)
+	if (info->aborted)
 		set_bit(SCAN_ABORTED, &local->scanning);
+
+	memcpy(&local->scan_info, info, sizeof(*info));
+
 	ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0);
 }
 EXPORT_SYMBOL(ieee80211_scan_completed);
@@ -571,6 +593,9 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
 		local->hw_scan_req->req.ie = ies;
 		local->hw_scan_req->req.flags = req->flags;
 		eth_broadcast_addr(local->hw_scan_req->req.bssid);
+		local->hw_scan_req->req.duration = req->duration;
+		local->hw_scan_req->req.duration_mandatory =
+			req->duration_mandatory;
 
 		local->hw_scan_band = 0;
 
@@ -1078,6 +1103,7 @@ void ieee80211_scan_cancel(struct ieee80211_local *local)
 	 */
 	cancel_delayed_work(&local->scan_work);
 	/* and clean up */
+	memset(&local->scan_info, 0, sizeof(local->scan_info));
 	__ieee80211_scan_completed(&local->hw, true);
 out:
 	mutex_unlock(&local->mtx);
-- 
cgit v1.2.3


From 7d27a0ba7adc8ef30c2aae7592fce4c162aee4df Mon Sep 17 00:00:00 2001
From: Masashi Honma <masashi.honma@gmail.com>
Date: Fri, 1 Jul 2016 10:19:34 +0900
Subject: cfg80211: Add mesh peer AID setting API

Previously, mesh power management functionality works only with kernel
MPM. Because user space MPM did not report mesh peer AID to kernel,
the kernel could not identify the bit in TIM element. So this patch
adds mesh peer AID setting API.

Signed-off-by: Masashi Honma <masashi.honma@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 2 ++
 include/uapi/linux/nl80211.h | 5 +++++
 net/mac80211/cfg.c           | 1 +
 net/wireless/nl80211.c       | 6 ++++++
 4 files changed, 14 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index e2658e392a1f..9c23f4d33e06 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -780,6 +780,7 @@ enum station_parameters_apply_mask {
  *	(bitmask of BIT(NL80211_STA_FLAG_...))
  * @listen_interval: listen interval or -1 for no change
  * @aid: AID or zero for no change
+ * @peer_aid: mesh peer AID or zero for no change
  * @plink_action: plink action to take
  * @plink_state: set the peer link state for a station
  * @ht_capa: HT capabilities of station
@@ -811,6 +812,7 @@ struct station_parameters {
 	u32 sta_modify_mask;
 	int listen_interval;
 	u16 aid;
+	u16 peer_aid;
 	u8 supported_rates_len;
 	u8 plink_action;
 	u8 plink_state;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index b39ccab45333..220694151434 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1864,6 +1864,9 @@ enum nl80211_commands {
  *	mandatory. If this flag is not set, the duration is the maximum duration
  *	and the actual measurement duration may be shorter.
  *
+ * @NL80211_ATTR_MESH_PEER_AID: Association ID for the mesh peer (u16). This is
+ *	used to pull the stored data for mesh peer in power save state.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2256,6 +2259,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_MEASUREMENT_DURATION,
 	NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY,
 
+	NL80211_ATTR_MESH_PEER_AID,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 0c12e4001f19..47e99ab8d97a 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -997,6 +997,7 @@ static void sta_apply_mesh_params(struct ieee80211_local *local,
 			if (sta->mesh->plink_state != NL80211_PLINK_ESTAB)
 				changed = mesh_plink_inc_estab_count(sdata);
 			sta->mesh->plink_state = params->plink_state;
+			sta->mesh->aid = params->peer_aid;
 
 			ieee80211_mps_sta_status_update(sta);
 			changed |= ieee80211_mps_set_sta_local_pm(sta,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index c53b5462ed00..5782f718d567 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4446,6 +4446,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 			nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_STATE]);
 		if (params.plink_state >= NUM_NL80211_PLINK_STATES)
 			return -EINVAL;
+		if (info->attrs[NL80211_ATTR_MESH_PEER_AID]) {
+			params.peer_aid = nla_get_u16(
+				info->attrs[NL80211_ATTR_MESH_PEER_AID]);
+			if (params.peer_aid > IEEE80211_MAX_AID)
+				return -EINVAL;
+		}
 		params.sta_modify_mask |= STATION_PARAM_APPLY_PLINK_STATE;
 	}
 
-- 
cgit v1.2.3


From aece0c3fe1f06962a591268b8c236df0ae5d9e4e Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Sat, 18 Jun 2016 10:45:33 +0200
Subject: nl802154: move PAD to right position

The PAD define should be above the experimental support. We don't care
about if we break userspace in experimental stuff but PAD is part of the
existing UAPI.

Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: Stefan Schmidt<stefan@osg.samsung.com>
Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/nl802154.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/nl802154.h b/include/net/nl802154.h
index fcab4de49951..7aad2fdfd16a 100644
--- a/include/net/nl802154.h
+++ b/include/net/nl802154.h
@@ -124,6 +124,8 @@ enum nl802154_attrs {
 
 	NL802154_ATTR_ACKREQ_DEFAULT,
 
+	NL802154_ATTR_PAD,
+
 	/* add attributes here, update the policy in nl802154.c */
 
 #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
@@ -138,8 +140,6 @@ enum nl802154_attrs {
 	NL802154_ATTR_SEC_KEY,
 #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
 
-	NL802154_ATTR_PAD,
-
 	__NL802154_ATTR_AFTER_LAST,
 	NL802154_ATTR_MAX = __NL802154_ATTR_AFTER_LAST - 1
 };
-- 
cgit v1.2.3


From 66e5c2672cd11b9310008099faf6a4ffb9dfb6d0 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Sat, 18 Jun 2016 10:45:34 +0200
Subject: ieee802154: add netns support

This patch adds netns support for 802.15.4 subsystem. Most parts are
copy&pasted from wireless subsystem, it has the identically userspace
API.

Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/cfg802154.h   | 13 +++++++++
 include/net/nl802154.h    |  5 ++++
 net/ieee802154/core.c     | 70 ++++++++++++++++++++++++++++++++++++++++++++++-
 net/ieee802154/core.h     |  2 ++
 net/ieee802154/nl802154.c | 54 ++++++++++++++++++++++++++++++++----
 5 files changed, 138 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 171cd76558fb..795ca4008f72 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -219,9 +219,22 @@ struct wpan_phy {
 
 	struct device dev;
 
+	/* the network namespace this phy lives in currently */
+	possible_net_t _net;
+
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
+static inline struct net *wpan_phy_net(struct wpan_phy *wpan_phy)
+{
+	return read_pnet(&wpan_phy->_net);
+}
+
+static inline void wpan_phy_net_set(struct wpan_phy *wpan_phy, struct net *net)
+{
+	write_pnet(&wpan_phy->_net, net);
+}
+
 struct ieee802154_addr {
 	u8 mode;
 	__le16 pan_id;
diff --git a/include/net/nl802154.h b/include/net/nl802154.h
index 7aad2fdfd16a..ddcee128f5d9 100644
--- a/include/net/nl802154.h
+++ b/include/net/nl802154.h
@@ -54,6 +54,8 @@ enum nl802154_commands {
 
 	NL802154_CMD_SET_ACKREQ_DEFAULT,
 
+	NL802154_CMD_SET_WPAN_PHY_NETNS,
+
 	/* add new commands above here */
 
 #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
@@ -126,6 +128,9 @@ enum nl802154_attrs {
 
 	NL802154_ATTR_PAD,
 
+	NL802154_ATTR_PID,
+	NL802154_ATTR_NETNS_FD,
+
 	/* add attributes here, update the policy in nl802154.c */
 
 #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c
index c35fdfa6d04e..cb7176cd4cd6 100644
--- a/net/ieee802154/core.c
+++ b/net/ieee802154/core.c
@@ -140,6 +140,8 @@ wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size)
 	rdev->wpan_phy.dev.class = &wpan_phy_class;
 	rdev->wpan_phy.dev.platform_data = rdev;
 
+	wpan_phy_net_set(&rdev->wpan_phy, &init_net);
+
 	init_waitqueue_head(&rdev->dev_wait);
 
 	return &rdev->wpan_phy;
@@ -207,6 +209,49 @@ void wpan_phy_free(struct wpan_phy *phy)
 }
 EXPORT_SYMBOL(wpan_phy_free);
 
+int cfg802154_switch_netns(struct cfg802154_registered_device *rdev,
+			   struct net *net)
+{
+	struct wpan_dev *wpan_dev;
+	int err = 0;
+
+	list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) {
+		if (!wpan_dev->netdev)
+			continue;
+		wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
+		err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d");
+		if (err)
+			break;
+		wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL;
+	}
+
+	if (err) {
+		/* failed -- clean up to old netns */
+		net = wpan_phy_net(&rdev->wpan_phy);
+
+		list_for_each_entry_continue_reverse(wpan_dev,
+						     &rdev->wpan_dev_list,
+						     list) {
+			if (!wpan_dev->netdev)
+				continue;
+			wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
+			err = dev_change_net_namespace(wpan_dev->netdev, net,
+						       "wpan%d");
+			WARN_ON(err);
+			wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL;
+		}
+
+		return err;
+	}
+
+	wpan_phy_net_set(&rdev->wpan_phy, net);
+
+	err = device_rename(&rdev->wpan_phy.dev, dev_name(&rdev->wpan_phy.dev));
+	WARN_ON(err);
+
+	return 0;
+}
+
 void cfg802154_dev_free(struct cfg802154_registered_device *rdev)
 {
 	kfree(rdev);
@@ -286,14 +331,34 @@ static struct notifier_block cfg802154_netdev_notifier = {
 	.notifier_call = cfg802154_netdev_notifier_call,
 };
 
+static void __net_exit cfg802154_pernet_exit(struct net *net)
+{
+	struct cfg802154_registered_device *rdev;
+
+	rtnl_lock();
+	list_for_each_entry(rdev, &cfg802154_rdev_list, list) {
+		if (net_eq(wpan_phy_net(&rdev->wpan_phy), net))
+			WARN_ON(cfg802154_switch_netns(rdev, &init_net));
+	}
+	rtnl_unlock();
+}
+
+static struct pernet_operations cfg802154_pernet_ops = {
+	.exit = cfg802154_pernet_exit,
+};
+
 static int __init wpan_phy_class_init(void)
 {
 	int rc;
 
-	rc = wpan_phy_sysfs_init();
+	rc = register_pernet_device(&cfg802154_pernet_ops);
 	if (rc)
 		goto err;
 
+	rc = wpan_phy_sysfs_init();
+	if (rc)
+		goto err_sysfs;
+
 	rc = register_netdevice_notifier(&cfg802154_netdev_notifier);
 	if (rc)
 		goto err_nl;
@@ -315,6 +380,8 @@ err_notifier:
 	unregister_netdevice_notifier(&cfg802154_netdev_notifier);
 err_nl:
 	wpan_phy_sysfs_exit();
+err_sysfs:
+	unregister_pernet_device(&cfg802154_pernet_ops);
 err:
 	return rc;
 }
@@ -326,6 +393,7 @@ static void __exit wpan_phy_class_exit(void)
 	ieee802154_nl_exit();
 	unregister_netdevice_notifier(&cfg802154_netdev_notifier);
 	wpan_phy_sysfs_exit();
+	unregister_pernet_device(&cfg802154_pernet_ops);
 }
 module_exit(wpan_phy_class_exit);
 
diff --git a/net/ieee802154/core.h b/net/ieee802154/core.h
index 231fade959f3..81141f58d079 100644
--- a/net/ieee802154/core.h
+++ b/net/ieee802154/core.h
@@ -38,6 +38,8 @@ wpan_phy_to_rdev(struct wpan_phy *wpan_phy)
 extern struct list_head cfg802154_rdev_list;
 extern int cfg802154_rdev_list_generation;
 
+int cfg802154_switch_netns(struct cfg802154_registered_device *rdev,
+			   struct net *net);
 /* free object */
 void cfg802154_dev_free(struct cfg802154_registered_device *rdev);
 struct cfg802154_registered_device *
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index 116187b5c267..d90a4ed5b8a0 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -80,7 +80,8 @@ __cfg802154_wpan_dev_from_attrs(struct net *netns, struct nlattr **attrs)
 	list_for_each_entry(rdev, &cfg802154_rdev_list, list) {
 		struct wpan_dev *wpan_dev;
 
-		/* TODO netns compare */
+		if (wpan_phy_net(&rdev->wpan_phy) != netns)
+			continue;
 
 		if (have_wpan_dev_id && rdev->wpan_phy_idx != wpan_phy_idx)
 			continue;
@@ -175,7 +176,8 @@ __cfg802154_rdev_from_attrs(struct net *netns, struct nlattr **attrs)
 	if (!rdev)
 		return ERR_PTR(-ENODEV);
 
-	/* TODO netns compare */
+	if (netns != wpan_phy_net(&rdev->wpan_phy))
+		return ERR_PTR(-ENODEV);
 
 	return rdev;
 }
@@ -233,6 +235,8 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = {
 
 	[NL802154_ATTR_ACKREQ_DEFAULT] = { .type = NLA_U8 },
 
+	[NL802154_ATTR_PID] = { .type = NLA_U32 },
+	[NL802154_ATTR_NETNS_FD] = { .type = NLA_U32 },
 #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
 	[NL802154_ATTR_SEC_ENABLED] = { .type = NLA_U8, },
 	[NL802154_ATTR_SEC_OUT_LEVEL] = { .type = NLA_U32, },
@@ -590,7 +594,6 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb,
 		struct cfg802154_registered_device *rdev;
 		int ifidx = nla_get_u32(tb[NL802154_ATTR_IFINDEX]);
 
-		/* TODO netns */
 		netdev = __dev_get_by_index(&init_net, ifidx);
 		if (!netdev)
 			return -ENODEV;
@@ -629,7 +632,8 @@ nl802154_dump_wpan_phy(struct sk_buff *skb, struct netlink_callback *cb)
 	}
 
 	list_for_each_entry(rdev, &cfg802154_rdev_list, list) {
-		/* TODO net ns compare */
+		if (!net_eq(wpan_phy_net(&rdev->wpan_phy), sock_net(skb->sk)))
+			continue;
 		if (++idx <= state->start)
 			continue;
 		if (state->filter_wpan_phy != -1 &&
@@ -871,7 +875,8 @@ nl802154_dump_interface(struct sk_buff *skb, struct netlink_callback *cb)
 
 	rtnl_lock();
 	list_for_each_entry(rdev, &cfg802154_rdev_list, list) {
-		/* TODO netns compare */
+		if (!net_eq(wpan_phy_net(&rdev->wpan_phy), sock_net(skb->sk)))
+			continue;
 		if (wp_idx < wp_start) {
 			wp_idx++;
 			continue;
@@ -1271,6 +1276,37 @@ nl802154_set_ackreq_default(struct sk_buff *skb, struct genl_info *info)
 	return rdev_set_ackreq_default(rdev, wpan_dev, ackreq);
 }
 
+static int nl802154_wpan_phy_netns(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg802154_registered_device *rdev = info->user_ptr[0];
+	struct net *net;
+	int err;
+
+	if (info->attrs[NL802154_ATTR_PID]) {
+		u32 pid = nla_get_u32(info->attrs[NL802154_ATTR_PID]);
+
+		net = get_net_ns_by_pid(pid);
+	} else if (info->attrs[NL802154_ATTR_NETNS_FD]) {
+		u32 fd = nla_get_u32(info->attrs[NL802154_ATTR_NETNS_FD]);
+
+		net = get_net_ns_by_fd(fd);
+	} else {
+		return -EINVAL;
+	}
+
+	if (IS_ERR(net))
+		return PTR_ERR(net);
+
+	err = 0;
+
+	/* check if anything to do */
+	if (!net_eq(wpan_phy_net(&rdev->wpan_phy), net))
+		err = cfg802154_switch_netns(rdev, net);
+
+	put_net(net);
+	return err;
+}
+
 #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
 static const struct nla_policy nl802154_dev_addr_policy[NL802154_DEV_ADDR_ATTR_MAX + 1] = {
 	[NL802154_DEV_ADDR_ATTR_PAN_ID] = { .type = NLA_U16 },
@@ -2261,6 +2297,14 @@ static const struct genl_ops nl802154_ops[] = {
 		.internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
 				  NL802154_FLAG_NEED_RTNL,
 	},
+	{
+		.cmd = NL802154_CMD_SET_WPAN_PHY_NETNS,
+		.doit = nl802154_wpan_phy_netns,
+		.policy = nl802154_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
+				  NL802154_FLAG_NEED_RTNL,
+	},
 	{
 		.cmd = NL802154_CMD_SET_PAN_ID,
 		.doit = nl802154_set_pan_id,
-- 
cgit v1.2.3


From 9cc577dd25b9762df7f353658426bb2e048c480a Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Wed, 6 Jul 2016 23:32:24 +0200
Subject: ieee802154: add ieee802154_skb_dst_pan helper

This patch adds ieee802154_skb_dst_pan function to get the pointer
address of the destination pan id at skb mac pointer.

Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h | 16 ++++++++++++++++
 include/net/mac802154.h    | 29 +++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

(limited to 'include/net')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index acedbb68a5a3..91f4665fea63 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -31,6 +31,8 @@
 #define IEEE802154_MIN_PSDU_LEN		9
 #define IEEE802154_FCS_LEN		2
 #define IEEE802154_MAX_AUTH_TAG_LEN	16
+#define IEEE802154_FC_LEN		2
+#define IEEE802154_SEQ_LEN		1
 
 /*  General MAC frame format:
  *  2 bytes: Frame Control
@@ -221,9 +223,14 @@ enum {
 #define IEEE802154_FCTL_ACKREQ		0x0020
 #define IEEE802154_FCTL_SECEN		0x0004
 #define IEEE802154_FCTL_INTRA_PAN	0x0040
+#define IEEE802154_FCTL_DADDR		0x0c00
 
 #define IEEE802154_FTYPE_DATA		0x0001
 
+#define IEEE802154_FCTL_ADDR_NONE	0x0000
+#define IEEE802154_FCTL_DADDR_SHORT	0x0800
+#define IEEE802154_FCTL_DADDR_EXTENDED	0x0c00
+
 /*
  * ieee802154_is_data - check if type is IEEE802154_FTYPE_DATA
  * @fc: frame control bytes in little-endian byteorder
@@ -261,6 +268,15 @@ static inline bool ieee802154_is_intra_pan(__le16 fc)
 	return fc & cpu_to_le16(IEEE802154_FCTL_INTRA_PAN);
 }
 
+/*
+ * ieee802154_daddr_mode - get daddr mode from fc
+ * @fc: frame control bytes in little-endian byteorder
+ */
+static inline __le16 ieee802154_daddr_mode(__le16 fc)
+{
+	return fc & cpu_to_le16(IEEE802154_FCTL_DADDR);
+}
+
 /**
  * ieee802154_is_valid_psdu_len - check if psdu len is valid
  * available lengths:
diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index e465c8551ac3..b3f7cd868fe9 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -257,6 +257,35 @@ static inline __le16 ieee802154_get_fc_from_skb(const struct sk_buff *skb)
 	return get_unaligned_le16(skb_mac_header(skb));
 }
 
+/**
+ * ieee802154_skb_dst_pan - get the pointer to destination pan field
+ * @fc: mac header frame control field
+ * @skb: skb where the destination pan pointer will be get from
+ */
+static inline unsigned char *ieee802154_skb_dst_pan(__le16 fc,
+						    const struct sk_buff *skb)
+{
+	unsigned char *dst_pan;
+
+	switch (ieee802154_daddr_mode(fc)) {
+	case cpu_to_le16(IEEE802154_FCTL_ADDR_NONE):
+		dst_pan = NULL;
+		break;
+	case cpu_to_le16(IEEE802154_FCTL_DADDR_SHORT):
+	case cpu_to_le16(IEEE802154_FCTL_DADDR_EXTENDED):
+		dst_pan = skb_mac_header(skb) +
+			  IEEE802154_FC_LEN +
+			  IEEE802154_SEQ_LEN;
+		break;
+	default:
+		WARN_ONCE(1, "invalid addr mode detected");
+		dst_pan = NULL;
+		break;
+	}
+
+	return dst_pan;
+}
+
 /**
  * ieee802154_be64_to_le64 - copies and convert be64 to le64
  * @le64_dst: le64 destination pointer
-- 
cgit v1.2.3


From 19580cc1ed299c736b56b45c7576b477f185f8f5 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Wed, 6 Jul 2016 23:32:25 +0200
Subject: ieee802154: add ieee802154_skb_src_pan helper

This patch adds ieee802154_skb_src_pan function to get the pointer
address of the source pan id at skb mac pointer.

Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h | 13 ++++++++++
 include/net/mac802154.h    | 59 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)

(limited to 'include/net')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index 91f4665fea63..ddb890174a0e 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -50,6 +50,7 @@
 
 #define IEEE802154_EXTENDED_ADDR_LEN	8
 #define IEEE802154_SHORT_ADDR_LEN	2
+#define IEEE802154_PAN_ID_LEN		2
 
 #define IEEE802154_LIFS_PERIOD		40
 #define IEEE802154_SIFS_PERIOD		12
@@ -224,12 +225,15 @@ enum {
 #define IEEE802154_FCTL_SECEN		0x0004
 #define IEEE802154_FCTL_INTRA_PAN	0x0040
 #define IEEE802154_FCTL_DADDR		0x0c00
+#define IEEE802154_FCTL_SADDR		0xc000
 
 #define IEEE802154_FTYPE_DATA		0x0001
 
 #define IEEE802154_FCTL_ADDR_NONE	0x0000
 #define IEEE802154_FCTL_DADDR_SHORT	0x0800
 #define IEEE802154_FCTL_DADDR_EXTENDED	0x0c00
+#define IEEE802154_FCTL_SADDR_SHORT	0x8000
+#define IEEE802154_FCTL_SADDR_EXTENDED	0xc000
 
 /*
  * ieee802154_is_data - check if type is IEEE802154_FTYPE_DATA
@@ -277,6 +281,15 @@ static inline __le16 ieee802154_daddr_mode(__le16 fc)
 	return fc & cpu_to_le16(IEEE802154_FCTL_DADDR);
 }
 
+/*
+ * ieee802154_saddr_mode - get saddr mode from fc
+ * @fc: frame control bytes in little-endian byteorder
+ */
+static inline __le16 ieee802154_saddr_mode(__le16 fc)
+{
+	return fc & cpu_to_le16(IEEE802154_FCTL_SADDR);
+}
+
 /**
  * ieee802154_is_valid_psdu_len - check if psdu len is valid
  * available lengths:
diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index b3f7cd868fe9..ec01b35bc969 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -286,6 +286,65 @@ static inline unsigned char *ieee802154_skb_dst_pan(__le16 fc,
 	return dst_pan;
 }
 
+/**
+ * ieee802154_skb_src_pan - get the pointer to source pan field
+ * @fc: mac header frame control field
+ * @skb: skb where the source pan pointer will be get from
+ */
+static inline unsigned char *ieee802154_skb_src_pan(__le16 fc,
+						    const struct sk_buff *skb)
+{
+	unsigned char *src_pan;
+
+	switch (ieee802154_saddr_mode(fc)) {
+	case cpu_to_le16(IEEE802154_FCTL_ADDR_NONE):
+		src_pan = NULL;
+		break;
+	case cpu_to_le16(IEEE802154_FCTL_SADDR_SHORT):
+	case cpu_to_le16(IEEE802154_FCTL_SADDR_EXTENDED):
+		/* if intra-pan and source addr mode is non none,
+		 * then source pan id is equal destination pan id.
+		 */
+		if (ieee802154_is_intra_pan(fc)) {
+			src_pan = ieee802154_skb_dst_pan(fc, skb);
+			break;
+		}
+
+		switch (ieee802154_daddr_mode(fc)) {
+		case cpu_to_le16(IEEE802154_FCTL_ADDR_NONE):
+			src_pan = skb_mac_header(skb) +
+				  IEEE802154_FC_LEN +
+				  IEEE802154_SEQ_LEN;
+			break;
+		case cpu_to_le16(IEEE802154_FCTL_DADDR_SHORT):
+			src_pan = skb_mac_header(skb) +
+				  IEEE802154_FC_LEN +
+				  IEEE802154_SEQ_LEN +
+				  IEEE802154_PAN_ID_LEN +
+				  IEEE802154_SHORT_ADDR_LEN;
+			break;
+		case cpu_to_le16(IEEE802154_FCTL_DADDR_EXTENDED):
+			src_pan = skb_mac_header(skb) +
+				  IEEE802154_FC_LEN +
+				  IEEE802154_SEQ_LEN +
+				  IEEE802154_PAN_ID_LEN +
+				  IEEE802154_EXTENDED_ADDR_LEN;
+			break;
+		default:
+			WARN_ONCE(1, "invalid addr mode detected");
+			src_pan = NULL;
+			break;
+		}
+		break;
+	default:
+		WARN_ONCE(1, "invalid addr mode detected");
+		src_pan = NULL;
+		break;
+	}
+
+	return src_pan;
+}
+
 /**
  * ieee802154_be64_to_le64 - copies and convert be64 to le64
  * @le64_dst: le64 destination pointer
-- 
cgit v1.2.3


From 0ea0b9af9b7599ada307258dc841f4300873e8a1 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Wed, 6 Jul 2016 23:32:26 +0200
Subject: ieee802154: 6lowpan: fix intra pan id check

The RIOT-OS stack does send intra-pan frames but don't set the intra pan
flag inside the mac header. It seems this is valid frame addressing but
inefficient. Anyway this patch adds a new function for intra pan
addressing, doesn't matter if intra pan flag or source and destination
are the same. The newly introduction function will be used to check on
intra pan addressing for 6lowpan.

Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/mac802154.h     | 19 +++++++++++++++++++
 net/ieee802154/6lowpan/rx.c |  2 +-
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index ec01b35bc969..d757edd0b0b7 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -345,6 +345,25 @@ static inline unsigned char *ieee802154_skb_src_pan(__le16 fc,
 	return src_pan;
 }
 
+/**
+ * ieee802154_skb_is_intra_pan_addressing - checks whenever the mac addressing
+ *	is an intra pan communication
+ * @fc: mac header frame control field
+ * @skb: skb where the source and destination pan should be get from
+ */
+static inline bool ieee802154_skb_is_intra_pan_addressing(__le16 fc,
+							  const struct sk_buff *skb)
+{
+	unsigned char *dst_pan = ieee802154_skb_dst_pan(fc, skb),
+		      *src_pan = ieee802154_skb_src_pan(fc, skb);
+
+	/* if one is NULL is no intra pan addressing */
+	if (!dst_pan || !src_pan)
+		return false;
+
+	return !memcmp(dst_pan, src_pan, IEEE802154_PAN_ID_LEN);
+}
+
 /**
  * ieee802154_be64_to_le64 - copies and convert be64 to le64
  * @le64_dst: le64 destination pointer
diff --git a/net/ieee802154/6lowpan/rx.c b/net/ieee802154/6lowpan/rx.c
index ef185dd4110d..649e7d45e88f 100644
--- a/net/ieee802154/6lowpan/rx.c
+++ b/net/ieee802154/6lowpan/rx.c
@@ -262,7 +262,7 @@ static inline bool lowpan_rx_h_check(struct sk_buff *skb)
 
 	/* check on ieee802154 conform 6LoWPAN header */
 	if (!ieee802154_is_data(fc) ||
-	    !ieee802154_is_intra_pan(fc))
+	    !ieee802154_skb_is_intra_pan_addressing(fc, skb))
 		return false;
 
 	/* check if we can dereference the dispatch */
-- 
cgit v1.2.3


From aaa7088eb29a0ffe6cf8d8a443695df41e5a62f3 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Wed, 6 Jul 2016 23:32:27 +0200
Subject: ieee802154: fix skb get fc on big endian

This patch fixes ieee802154_get_fc_from_skb function on big endian
machines. The function get_unaligned_le16 converts the byte order to
host byte order but we want to keep the byte order like in mac header.

Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/mac802154.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index d757edd0b0b7..bb7bfecc5ab3 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -247,6 +247,8 @@ struct ieee802154_ops {
  */
 static inline __le16 ieee802154_get_fc_from_skb(const struct sk_buff *skb)
 {
+	__le16 fc;
+
 	/* check if we can fc at skb_mac_header of sk buffer */
 	if (unlikely(!skb_mac_header_was_set(skb) ||
 		     (skb_tail_pointer(skb) - skb_mac_header(skb)) < 2)) {
@@ -254,7 +256,8 @@ static inline __le16 ieee802154_get_fc_from_skb(const struct sk_buff *skb)
 		return cpu_to_le16(0);
 	}
 
-	return get_unaligned_le16(skb_mac_header(skb));
+	memcpy(&fc, skb_mac_header(skb), IEEE802154_FC_LEN);
+	return fc;
 }
 
 /**
-- 
cgit v1.2.3


From 048e7f7e66a8693fe1707997bffd19d08cde08f5 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Wed, 6 Jul 2016 23:32:30 +0200
Subject: ieee802154: cleanup WARN_ON for fc fetch

This patch cleanups the WARN_ON which occurs when the sk buffer has
insufficient buffer space by moving the WARN_ON into if condition.

Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/mac802154.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index bb7bfecc5ab3..286824acd008 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -250,11 +250,10 @@ static inline __le16 ieee802154_get_fc_from_skb(const struct sk_buff *skb)
 	__le16 fc;
 
 	/* check if we can fc at skb_mac_header of sk buffer */
-	if (unlikely(!skb_mac_header_was_set(skb) ||
-		     (skb_tail_pointer(skb) - skb_mac_header(skb)) < 2)) {
-		WARN_ON(1);
+	if (WARN_ON(!skb_mac_header_was_set(skb) ||
+		    (skb_tail_pointer(skb) -
+		     skb_mac_header(skb)) < IEEE802154_FC_LEN))
 		return cpu_to_le16(0);
-	}
 
 	memcpy(&fc, skb_mac_header(skb), IEEE802154_FC_LEN);
 	return fc;
-- 
cgit v1.2.3


From d390238c4fba7c87a3bcd859ce3373c864eb7b02 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Wed, 6 Jul 2016 20:03:54 -0400
Subject: net: dsa: initialize the routing table

The routing table of every switch in a tree is currently initialized to
all zeros. This is an issue since 0 is a valid port number.

Add a DSA_RTABLE_NONE=-1 constant to initialize the signed values of the
routing table pointing to other switches.

This fixes the device mapping of the mv88e6xxx driver where the port
pointing to the switch itself and to non-existent switches was wrongly
configured to be 0. It is now set to the expected 0xf value.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 2 ++
 net/dsa/dsa.c     | 6 ++++++
 net/dsa/dsa2.c    | 7 ++++++-
 3 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 20b3087ad193..52ab18bc2b0d 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -32,6 +32,8 @@ enum dsa_tag_protocol {
 #define DSA_MAX_SWITCHES	4
 #define DSA_MAX_PORTS		12
 
+#define DSA_RTABLE_NONE		-1
+
 struct dsa_chip_data {
 	/*
 	 * How to access the switch configuration registers.
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 766d2a525ada..7e68bc6bc853 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -774,11 +774,17 @@ static int dsa_of_probe(struct device *dev)
 
 	chip_index = -1;
 	for_each_available_child_of_node(np, child) {
+		int i;
+
 		chip_index++;
 		cd = &pd->chip[chip_index];
 
 		cd->of_node = child;
 
+		/* Initialize the routing table */
+		for (i = 0; i < DSA_MAX_SWITCHES; ++i)
+			cd->rtable[i] = DSA_RTABLE_NONE;
+
 		/* When assigning the host device, increment its refcount */
 		cd->host_dev = get_device(&mdio_bus->dev);
 
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 83b95fc4cede..78e4c0131c30 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -595,7 +595,7 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
 	struct device_node *ports = dsa_get_ports(ds, np);
 	struct dsa_switch_tree *dst;
 	u32 tree, index;
-	int err;
+	int i, err;
 
 	err = dsa_parse_member(np, &tree, &index);
 	if (err)
@@ -622,6 +622,11 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
 
 	ds->dst = dst;
 	ds->index = index;
+
+	/* Initialize the routing table */
+	for (i = 0; i < DSA_MAX_SWITCHES; ++i)
+		ds->rtable[i] = DSA_RTABLE_NONE;
+
 	dsa_dst_add_ds(dst, ds, index);
 
 	err = dsa_dst_complete(dst);
-- 
cgit v1.2.3


From ca8bee5dde1f02c2dbe8c8453dce27f2dfafb21c Mon Sep 17 00:00:00 2001
From: Marcel Holtmann <marcel@holtmann.org>
Date: Tue, 5 Jul 2016 14:30:14 +0200
Subject: Bluetooth: Rename HCI_BREDR into HCI_PRIMARY

The HCI_BREDR naming is confusing since it actually stands for Primary
Bluetooth Controller. Which is a term that has been used in the latest
standard. However from a legacy point of view there only really have
been Basic Rate (BR) and Enhanced Data Rate (EDR). Recent versions of
Bluetooth introduced Low Energy (LE) and made this terminology a little
bit confused since Dual Mode Controllers include BR/EDR and LE. To
simplify this the name HCI_PRIMARY stands for the Primary Controller
which can be a single mode or dual mode controller.

Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
---
 drivers/bluetooth/btmrvl_main.c |  2 +-
 drivers/bluetooth/btsdio.c      |  2 +-
 drivers/bluetooth/btusb.c       |  2 +-
 drivers/bluetooth/hci_ldisc.c   |  2 +-
 drivers/bluetooth/hci_vhci.c    |  6 +++---
 include/net/bluetooth/hci.h     |  2 +-
 net/bluetooth/hci_conn.c        |  2 +-
 net/bluetooth/hci_core.c        | 28 +++++++++++++---------------
 net/bluetooth/hci_event.c       |  2 +-
 net/bluetooth/hci_sock.c        |  2 +-
 net/bluetooth/l2cap_core.c      |  2 +-
 net/bluetooth/mgmt.c            | 16 ++++++++--------
 12 files changed, 33 insertions(+), 35 deletions(-)

(limited to 'include/net')

diff --git a/drivers/bluetooth/btmrvl_main.c b/drivers/bluetooth/btmrvl_main.c
index 7ad8d61c0c61..e6a85f0e6309 100644
--- a/drivers/bluetooth/btmrvl_main.c
+++ b/drivers/bluetooth/btmrvl_main.c
@@ -138,7 +138,7 @@ int btmrvl_process_event(struct btmrvl_private *priv, struct sk_buff *skb)
 			if (event->length > 3 && event->data[3])
 				priv->btmrvl_dev.dev_type = HCI_AMP;
 			else
-				priv->btmrvl_dev.dev_type = HCI_BREDR;
+				priv->btmrvl_dev.dev_type = HCI_PRIMARY;
 
 			BT_DBG("dev_type: %d", priv->btmrvl_dev.dev_type);
 		} else if (priv->btmrvl_dev.sendcmdflag &&
diff --git a/drivers/bluetooth/btsdio.c b/drivers/bluetooth/btsdio.c
index 2b05661e3818..1cb958e199eb 100644
--- a/drivers/bluetooth/btsdio.c
+++ b/drivers/bluetooth/btsdio.c
@@ -311,7 +311,7 @@ static int btsdio_probe(struct sdio_func *func,
 	if (id->class == SDIO_CLASS_BT_AMP)
 		hdev->dev_type = HCI_AMP;
 	else
-		hdev->dev_type = HCI_BREDR;
+		hdev->dev_type = HCI_PRIMARY;
 
 	data->hdev = hdev;
 
diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 880bb5546b25..f2e8fd701bf6 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -2832,7 +2832,7 @@ static int btusb_probe(struct usb_interface *intf,
 	if (id->driver_info & BTUSB_AMP)
 		hdev->dev_type = HCI_AMP;
 	else
-		hdev->dev_type = HCI_BREDR;
+		hdev->dev_type = HCI_PRIMARY;
 
 	data->hdev = hdev;
 
diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index 49b3e1e2d236..dda97398c59a 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -609,7 +609,7 @@ static int hci_uart_register_dev(struct hci_uart *hu)
 	if (test_bit(HCI_UART_CREATE_AMP, &hu->hdev_flags))
 		hdev->dev_type = HCI_AMP;
 	else
-		hdev->dev_type = HCI_BREDR;
+		hdev->dev_type = HCI_PRIMARY;
 
 	if (test_bit(HCI_UART_INIT_PENDING, &hu->hdev_flags))
 		return 0;
diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c
index aba31210c802..3ff229b2e7f3 100644
--- a/drivers/bluetooth/hci_vhci.c
+++ b/drivers/bluetooth/hci_vhci.c
@@ -97,10 +97,10 @@ static int __vhci_create_device(struct vhci_data *data, __u8 opcode)
 	if (data->hdev)
 		return -EBADFD;
 
-	/* bits 0-1 are dev_type (BR/EDR or AMP) */
+	/* bits 0-1 are dev_type (Primary or AMP) */
 	dev_type = opcode & 0x03;
 
-	if (dev_type != HCI_BREDR && dev_type != HCI_AMP)
+	if (dev_type != HCI_PRIMARY && dev_type != HCI_AMP)
 		return -EINVAL;
 
 	/* bits 2-5 are reserved (must be zero) */
@@ -316,7 +316,7 @@ static void vhci_open_timeout(struct work_struct *work)
 	struct vhci_data *data = container_of(work, struct vhci_data,
 					      open_timeout.work);
 
-	vhci_create_device(data, amp ? HCI_AMP : HCI_BREDR);
+	vhci_create_device(data, amp ? HCI_AMP : HCI_PRIMARY);
 }
 
 static int vhci_open(struct inode *inode, struct file *file)
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index eefcf3e96421..a3f86de6f100 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -65,7 +65,7 @@
 #define HCI_I2C		8
 
 /* HCI controller types */
-#define HCI_BREDR	0x00
+#define HCI_PRIMARY	0x00
 #define HCI_AMP		0x01
 
 /* First BR/EDR Controller shall have ID = 0 */
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index bf9f8a801a2e..3809617aa98d 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -625,7 +625,7 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src)
 	list_for_each_entry(d, &hci_dev_list, list) {
 		if (!test_bit(HCI_UP, &d->flags) ||
 		    hci_dev_test_flag(d, HCI_USER_CHANNEL) ||
-		    d->dev_type != HCI_BREDR)
+		    d->dev_type != HCI_PRIMARY)
 			continue;
 
 		/* Simple routing:
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 45a9fc68c677..98f6c3770736 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -260,14 +260,12 @@ static int hci_init1_req(struct hci_request *req, unsigned long opt)
 		hci_reset_req(req, 0);
 
 	switch (hdev->dev_type) {
-	case HCI_BREDR:
+	case HCI_PRIMARY:
 		bredr_init(req);
 		break;
-
 	case HCI_AMP:
 		amp_init1(req);
 		break;
-
 	default:
 		BT_ERR("Unknown device type %d", hdev->dev_type);
 		break;
@@ -791,11 +789,11 @@ static int __hci_init(struct hci_dev *hdev)
 	if (err < 0)
 		return err;
 
-	/* HCI_BREDR covers both single-mode LE, BR/EDR and dual-mode
+	/* HCI_PRIMARY covers both single-mode LE, BR/EDR and dual-mode
 	 * BR/EDR/LE type controllers. AMP controllers only need the
 	 * first two stages of init.
 	 */
-	if (hdev->dev_type != HCI_BREDR)
+	if (hdev->dev_type != HCI_PRIMARY)
 		return 0;
 
 	err = __hci_req_sync(hdev, hci_init3_req, 0, HCI_INIT_TIMEOUT, NULL);
@@ -1202,7 +1200,7 @@ int hci_inquiry(void __user *arg)
 		goto done;
 	}
 
-	if (hdev->dev_type != HCI_BREDR) {
+	if (hdev->dev_type != HCI_PRIMARY) {
 		err = -EOPNOTSUPP;
 		goto done;
 	}
@@ -1307,7 +1305,7 @@ static int hci_dev_do_open(struct hci_dev *hdev)
 		 * since AMP controllers do not have an address.
 		 */
 		if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
-		    hdev->dev_type == HCI_BREDR &&
+		    hdev->dev_type == HCI_PRIMARY &&
 		    !bacmp(&hdev->bdaddr, BDADDR_ANY) &&
 		    !bacmp(&hdev->static_addr, BDADDR_ANY)) {
 			ret = -EADDRNOTAVAIL;
@@ -1402,7 +1400,7 @@ static int hci_dev_do_open(struct hci_dev *hdev)
 		    !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
 		    !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
 		    hci_dev_test_flag(hdev, HCI_MGMT) &&
-		    hdev->dev_type == HCI_BREDR) {
+		    hdev->dev_type == HCI_PRIMARY) {
 			ret = __hci_req_hci_power_on(hdev);
 			mgmt_power_on(hdev, ret);
 		}
@@ -1563,7 +1561,7 @@ int hci_dev_do_close(struct hci_dev *hdev)
 
 	auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF);
 
-	if (!auto_off && hdev->dev_type == HCI_BREDR &&
+	if (!auto_off && hdev->dev_type == HCI_PRIMARY &&
 	    hci_dev_test_flag(hdev, HCI_MGMT))
 		__mgmt_power_off(hdev);
 
@@ -1802,7 +1800,7 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg)
 		goto done;
 	}
 
-	if (hdev->dev_type != HCI_BREDR) {
+	if (hdev->dev_type != HCI_PRIMARY) {
 		err = -EOPNOTSUPP;
 		goto done;
 	}
@@ -2043,7 +2041,7 @@ static void hci_power_on(struct work_struct *work)
 	 */
 	if (hci_dev_test_flag(hdev, HCI_RFKILLED) ||
 	    hci_dev_test_flag(hdev, HCI_UNCONFIGURED) ||
-	    (hdev->dev_type == HCI_BREDR &&
+	    (hdev->dev_type == HCI_PRIMARY &&
 	     !bacmp(&hdev->bdaddr, BDADDR_ANY) &&
 	     !bacmp(&hdev->static_addr, BDADDR_ANY))) {
 		hci_dev_clear_flag(hdev, HCI_AUTO_OFF);
@@ -3030,7 +3028,7 @@ int hci_register_dev(struct hci_dev *hdev)
 	 * so the index can be used as the AMP controller ID.
 	 */
 	switch (hdev->dev_type) {
-	case HCI_BREDR:
+	case HCI_PRIMARY:
 		id = ida_simple_get(&hci_index_ida, 0, 0, GFP_KERNEL);
 		break;
 	case HCI_AMP:
@@ -3090,7 +3088,7 @@ int hci_register_dev(struct hci_dev *hdev)
 	hci_dev_set_flag(hdev, HCI_SETUP);
 	hci_dev_set_flag(hdev, HCI_AUTO_OFF);
 
-	if (hdev->dev_type == HCI_BREDR) {
+	if (hdev->dev_type == HCI_PRIMARY) {
 		/* Assume BR/EDR support until proven otherwise (such as
 		 * through reading supported features during init.
 		 */
@@ -3415,7 +3413,7 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue,
 	hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT;
 
 	switch (hdev->dev_type) {
-	case HCI_BREDR:
+	case HCI_PRIMARY:
 		hci_add_acl_hdr(skb, conn->handle, flags);
 		break;
 	case HCI_AMP:
@@ -3826,7 +3824,7 @@ static void hci_sched_acl(struct hci_dev *hdev)
 	BT_DBG("%s", hdev->name);
 
 	/* No ACL link over BR/EDR controller */
-	if (!hci_conn_num(hdev, ACL_LINK) && hdev->dev_type == HCI_BREDR)
+	if (!hci_conn_num(hdev, ACL_LINK) && hdev->dev_type == HCI_PRIMARY)
 		return;
 
 	/* No AMP link over AMP controller */
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index d4b3dd5413be..3fb95c47243c 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -3249,7 +3249,7 @@ static struct hci_conn *__hci_conn_lookup_handle(struct hci_dev *hdev,
 	struct hci_chan *chan;
 
 	switch (hdev->dev_type) {
-	case HCI_BREDR:
+	case HCI_PRIMARY:
 		return hci_conn_hash_lookup_handle(hdev, handle);
 	case HCI_AMP:
 		chan = hci_chan_lookup_handle(hdev, handle);
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 12e9294b02e0..6ef8a01a9ad4 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -676,7 +676,7 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd,
 	if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
 		return -EOPNOTSUPP;
 
-	if (hdev->dev_type != HCI_BREDR)
+	if (hdev->dev_type != HCI_PRIMARY)
 		return -EOPNOTSUPP;
 
 	switch (cmd) {
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index eb4f5f24cbe3..54ceb1f2cc9a 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -7468,7 +7468,7 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
 	int len;
 
 	/* For AMP controller do not create l2cap conn */
-	if (!conn && hcon->hdev->dev_type != HCI_BREDR)
+	if (!conn && hcon->hdev->dev_type != HCI_PRIMARY)
 		goto drop;
 
 	if (!conn)
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 9e4b931588cf..7983ec8d4c60 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -359,7 +359,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data,
 
 	count = 0;
 	list_for_each_entry(d, &hci_dev_list, list) {
-		if (d->dev_type == HCI_BREDR &&
+		if (d->dev_type == HCI_PRIMARY &&
 		    !hci_dev_test_flag(d, HCI_UNCONFIGURED))
 			count++;
 	}
@@ -384,7 +384,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data,
 		if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
 			continue;
 
-		if (d->dev_type == HCI_BREDR &&
+		if (d->dev_type == HCI_PRIMARY &&
 		    !hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
 			rp->index[count++] = cpu_to_le16(d->id);
 			BT_DBG("Added hci%u", d->id);
@@ -419,7 +419,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev,
 
 	count = 0;
 	list_for_each_entry(d, &hci_dev_list, list) {
-		if (d->dev_type == HCI_BREDR &&
+		if (d->dev_type == HCI_PRIMARY &&
 		    hci_dev_test_flag(d, HCI_UNCONFIGURED))
 			count++;
 	}
@@ -444,7 +444,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev,
 		if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
 			continue;
 
-		if (d->dev_type == HCI_BREDR &&
+		if (d->dev_type == HCI_PRIMARY &&
 		    hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
 			rp->index[count++] = cpu_to_le16(d->id);
 			BT_DBG("Added hci%u", d->id);
@@ -479,7 +479,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev,
 
 	count = 0;
 	list_for_each_entry(d, &hci_dev_list, list) {
-		if (d->dev_type == HCI_BREDR || d->dev_type == HCI_AMP)
+		if (d->dev_type == HCI_PRIMARY || d->dev_type == HCI_AMP)
 			count++;
 	}
 
@@ -503,7 +503,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev,
 		if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
 			continue;
 
-		if (d->dev_type == HCI_BREDR) {
+		if (d->dev_type == HCI_PRIMARY) {
 			if (hci_dev_test_flag(d, HCI_UNCONFIGURED))
 				rp->entry[count].type = 0x01;
 			else
@@ -6366,7 +6366,7 @@ void mgmt_index_added(struct hci_dev *hdev)
 		return;
 
 	switch (hdev->dev_type) {
-	case HCI_BREDR:
+	case HCI_PRIMARY:
 		if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
 			mgmt_index_event(MGMT_EV_UNCONF_INDEX_ADDED, hdev,
 					 NULL, 0, HCI_MGMT_UNCONF_INDEX_EVENTS);
@@ -6399,7 +6399,7 @@ void mgmt_index_removed(struct hci_dev *hdev)
 		return;
 
 	switch (hdev->dev_type) {
-	case HCI_BREDR:
+	case HCI_PRIMARY:
 		mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status);
 
 		if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
-- 
cgit v1.2.3


From 1d984c2e03c1fb21539a9f50627e312788512013 Mon Sep 17 00:00:00 2001
From: Thierry Escande <thierry.escande@collabora.com>
Date: Fri, 8 Jul 2016 15:52:39 +0200
Subject: NFC: digital: Fix handling of saved PDU sk_buff pointers

This patch fixes the way an I-PDU is saved in case it needs to be sent
again. It is now copied using pskb_copy() and not simply referenced
using skb_get() since it could be modified by the driver.

digital_in_send_saved_skb() and digital_tg_send_saved_skb() still get a
reference on the saved skb which is re-sent but release it if the send
operation fails. That way the caller doesn't have to take care about skb
ref in case of error.

RTOX supervisor PDU must not be saved as this can override a previously
saved I-PDU that should be re-sent later on.

Signed-off-by: Thierry Escande <thierry.escande@collabora.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/net/nfc/digital.h |  1 -
 net/nfc/digital_dep.c     | 59 +++++++++++++++++++++++------------------------
 2 files changed, 29 insertions(+), 31 deletions(-)

(limited to 'include/net')

diff --git a/include/net/nfc/digital.h b/include/net/nfc/digital.h
index 506e3f6eabef..f9a4e4771861 100644
--- a/include/net/nfc/digital.h
+++ b/include/net/nfc/digital.h
@@ -237,7 +237,6 @@ struct nfc_digital_dev {
 	int nack_count;
 
 	struct sk_buff *saved_skb;
-	unsigned int saved_skb_len;
 
 	u16 target_fsc;
 
diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c
index b62c85dc12a2..804585cb3f8e 100644
--- a/net/nfc/digital_dep.c
+++ b/net/nfc/digital_dep.c
@@ -524,8 +524,7 @@ static int digital_in_send_ack(struct nfc_digital_dev *ddev,
 
 	ddev->skb_add_crc(skb);
 
-	ddev->saved_skb = skb_get(skb);
-	ddev->saved_skb_len = skb->len;
+	ddev->saved_skb = pskb_copy(skb, GFP_KERNEL);
 
 	rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res,
 				 data_exch);
@@ -627,16 +626,10 @@ static int digital_in_send_rtox(struct nfc_digital_dev *ddev,
 
 	ddev->skb_add_crc(skb);
 
-	ddev->saved_skb = skb_get(skb);
-	ddev->saved_skb_len = skb->len;
-
 	rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res,
 				 data_exch);
-	if (rc) {
+	if (rc)
 		kfree_skb(skb);
-		kfree_skb(ddev->saved_skb);
-		ddev->saved_skb = NULL;
-	}
 
 	return rc;
 }
@@ -644,11 +637,19 @@ static int digital_in_send_rtox(struct nfc_digital_dev *ddev,
 static int digital_in_send_saved_skb(struct nfc_digital_dev *ddev,
 				     struct digital_data_exch *data_exch)
 {
+	int rc;
+
+	if (!ddev->saved_skb)
+		return -EINVAL;
+
 	skb_get(ddev->saved_skb);
-	skb_push(ddev->saved_skb, ddev->saved_skb_len);
 
-	return digital_in_send_cmd(ddev, ddev->saved_skb, 1500,
-				   digital_in_recv_dep_res, data_exch);
+	rc = digital_in_send_cmd(ddev, ddev->saved_skb, 1500,
+				 digital_in_recv_dep_res, data_exch);
+	if (rc)
+		kfree_skb(ddev->saved_skb);
+
+	return rc;
 }
 
 static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg,
@@ -812,17 +813,12 @@ static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg,
 	case DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU:
 		if (!DIGITAL_NFC_DEP_PFB_IS_TIMEOUT(pfb)) { /* ATN */
 			rc = digital_in_send_saved_skb(ddev, data_exch);
-			if (rc) {
-				kfree_skb(ddev->saved_skb);
+			if (rc)
 				goto error;
-			}
 
 			return;
 		}
 
-		kfree_skb(ddev->saved_skb);
-		ddev->saved_skb = NULL;
-
 		rc = digital_in_send_rtox(ddev, data_exch, resp->data[0]);
 		if (rc)
 			goto error;
@@ -876,8 +872,7 @@ int digital_in_send_dep_req(struct nfc_digital_dev *ddev,
 
 	ddev->skb_add_crc(tmp_skb);
 
-	ddev->saved_skb = skb_get(tmp_skb);
-	ddev->saved_skb_len = tmp_skb->len;
+	ddev->saved_skb = pskb_copy(tmp_skb, GFP_KERNEL);
 
 	rc = digital_in_send_cmd(ddev, tmp_skb, 1500, digital_in_recv_dep_res,
 				 data_exch);
@@ -956,8 +951,7 @@ static int digital_tg_send_ack(struct nfc_digital_dev *ddev,
 
 	ddev->skb_add_crc(skb);
 
-	ddev->saved_skb = skb_get(skb);
-	ddev->saved_skb_len = skb->len;
+	ddev->saved_skb = pskb_copy(skb, GFP_KERNEL);
 
 	rc = digital_tg_send_cmd(ddev, skb, 1500, digital_tg_recv_dep_req,
 				 data_exch);
@@ -1009,11 +1003,19 @@ static int digital_tg_send_atn(struct nfc_digital_dev *ddev)
 
 static int digital_tg_send_saved_skb(struct nfc_digital_dev *ddev)
 {
+	int rc;
+
+	if (!ddev->saved_skb)
+		return -EINVAL;
+
 	skb_get(ddev->saved_skb);
-	skb_push(ddev->saved_skb, ddev->saved_skb_len);
 
-	return digital_tg_send_cmd(ddev, ddev->saved_skb, 1500,
-				   digital_tg_recv_dep_req, NULL);
+	rc = digital_tg_send_cmd(ddev, ddev->saved_skb, 1500,
+				 digital_tg_recv_dep_req, NULL);
+	if (rc)
+		kfree_skb(ddev->saved_skb);
+
+	return rc;
 }
 
 static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg,
@@ -1163,10 +1165,8 @@ static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg,
 			ddev->atn_count = 0;
 
 			rc = digital_tg_send_saved_skb(ddev);
-			if (rc) {
-				kfree_skb(ddev->saved_skb);
+			if (rc)
 				goto exit;
-			}
 		}
 
 		return;
@@ -1235,8 +1235,7 @@ int digital_tg_send_dep_res(struct nfc_digital_dev *ddev, struct sk_buff *skb)
 
 	ddev->skb_add_crc(tmp_skb);
 
-	ddev->saved_skb = skb_get(tmp_skb);
-	ddev->saved_skb_len = tmp_skb->len;
+	ddev->saved_skb = pskb_copy(tmp_skb, GFP_KERNEL);
 
 	rc = digital_tg_send_cmd(ddev, tmp_skb, 1500, digital_tg_recv_dep_req,
 				 NULL);
-- 
cgit v1.2.3


From 1a09c56f545c8ff8d338a38c7c40d79f4165a94c Mon Sep 17 00:00:00 2001
From: Thierry Escande <thierry.escande@collabora.com>
Date: Fri, 8 Jul 2016 15:52:45 +0200
Subject: NFC: digital: Add support for NFC DEP Response Waiting Time

When sending an ATR_REQ, the initiator must wait for the ATR_RES at
least 'RWT(nfcdep,activation) + dRWT(nfcdep)' and no more than
'RWT(nfcdep,activation) + dRWT(nfcdep) + dT(nfcdep,initiator)'. This
gives a timeout value between 1237 ms and 1337 ms. This patch defines
DIGITAL_ATR_RES_RWT to 1337 used for the timeout value of ATR_REQ
command.

For other DEP PDUs, the initiator must wait between 'RWT + dRWT(nfcdep)'
and 'RWT + dRWT(nfcdep) + dT(nfcdep,initiator)' where RWT is given by
the following formula: '(256 * 16 / f(c)) * 2^wt' where wt is the value
of the TO field in the ATR_RES response and is in the range between 0
and 14. This patch declares a mapping table for wt values and gives RWT
max values between 100 ms and 5049 ms.

This patch also defines DIGITAL_ATR_RES_TO_WT, the maximum wt value in
target mode, to 8.

Signed-off-by: Thierry Escande <thierry.escande@collabora.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/net/nfc/digital.h |  1 +
 net/nfc/digital_dep.c     | 71 ++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 56 insertions(+), 16 deletions(-)

(limited to 'include/net')

diff --git a/include/net/nfc/digital.h b/include/net/nfc/digital.h
index f9a4e4771861..74fa7eb94e72 100644
--- a/include/net/nfc/digital.h
+++ b/include/net/nfc/digital.h
@@ -226,6 +226,7 @@ struct nfc_digital_dev {
 	u8 curr_rf_tech;
 	u8 curr_nfc_dep_pni;
 	u8 did;
+	u16 dep_rwt;
 
 	u8 local_payload_max;
 	u8 remote_payload_max;
diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c
index ba52a5dbf3cc..6cf2eeb2e865 100644
--- a/net/nfc/digital_dep.c
+++ b/net/nfc/digital_dep.c
@@ -35,6 +35,8 @@
 #define DIGITAL_ATR_REQ_MIN_SIZE 16
 #define DIGITAL_ATR_REQ_MAX_SIZE 64
 
+#define DIGITAL_ATR_RES_TO_WT(s)	((s) & 0xF)
+
 #define DIGITAL_DID_MAX	14
 
 #define DIGITAL_PAYLOAD_SIZE_MAX	254
@@ -122,6 +124,37 @@ static const u8 digital_payload_bits_map[4] = {
 	[3] = 254
 };
 
+/* Response Waiting Time for ATR_RES PDU in ms
+ *
+ * RWT(ATR_RES) = RWT(nfcdep,activation) + dRWT(nfcdep) + dT(nfcdep,initiator)
+ *
+ * with:
+ *  RWT(nfcdep,activation) = 4096 * 2^12 / f(c) s
+ *  dRWT(nfcdep) = 16 / f(c) s
+ *  dT(nfcdep,initiator) = 100 ms
+ *  f(c) = 13560000 Hz
+ */
+#define DIGITAL_ATR_RES_RWT 1337
+
+/* Response Waiting Time for other DEP PDUs in ms
+ *
+ * max_rwt = rwt + dRWT(nfcdep) + dT(nfcdep,initiator)
+ *
+ * with:
+ *  rwt = (256 * 16 / f(c)) * 2^wt s
+ *  dRWT(nfcdep) = 16 / f(c) s
+ *  dT(nfcdep,initiator) = 100 ms
+ *  f(c) = 13560000 Hz
+ *  0 <= wt <= 14 (given by the target by the TO field of ATR_RES response)
+ */
+#define DIGITAL_NFC_DEP_IN_MAX_WT 14
+#define DIGITAL_NFC_DEP_TG_MAX_WT 8
+static const u16 digital_rwt_map[DIGITAL_NFC_DEP_IN_MAX_WT + 1] = {
+	100,  101,  101,  102,  105,
+	110,  119,  139,  177,  255,
+	409,  719, 1337, 2575, 5049,
+};
+
 static u8 digital_payload_bits_to_size(u8 payload_bits)
 {
 	if (payload_bits >= ARRAY_SIZE(digital_payload_bits_map))
@@ -366,8 +399,8 @@ static int digital_in_send_psl_req(struct nfc_digital_dev *ddev,
 
 	ddev->skb_add_crc(skb);
 
-	rc = digital_in_send_cmd(ddev, skb, 500, digital_in_recv_psl_res,
-				 target);
+	rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt,
+				 digital_in_recv_psl_res, target);
 	if (rc)
 		kfree_skb(skb);
 
@@ -380,6 +413,7 @@ static void digital_in_recv_atr_res(struct nfc_digital_dev *ddev, void *arg,
 	struct nfc_target *target = arg;
 	struct digital_atr_res *atr_res;
 	u8 gb_len, payload_bits;
+	u8 wt;
 	int rc;
 
 	if (IS_ERR(resp)) {
@@ -409,6 +443,11 @@ static void digital_in_recv_atr_res(struct nfc_digital_dev *ddev, void *arg,
 
 	atr_res = (struct digital_atr_res *)resp->data;
 
+	wt = DIGITAL_ATR_RES_TO_WT(atr_res->to);
+	if (wt > DIGITAL_NFC_DEP_IN_MAX_WT)
+		wt = DIGITAL_NFC_DEP_IN_MAX_WT;
+	ddev->dep_rwt = digital_rwt_map[wt];
+
 	payload_bits = DIGITAL_PAYLOAD_PP_TO_BITS(atr_res->pp);
 	ddev->remote_payload_max = digital_payload_bits_to_size(payload_bits);
 
@@ -490,8 +529,8 @@ int digital_in_send_atr_req(struct nfc_digital_dev *ddev,
 
 	ddev->skb_add_crc(skb);
 
-	rc = digital_in_send_cmd(ddev, skb, 500, digital_in_recv_atr_res,
-				 target);
+	rc = digital_in_send_cmd(ddev, skb, DIGITAL_ATR_RES_RWT,
+				 digital_in_recv_atr_res, target);
 	if (rc)
 		kfree_skb(skb);
 
@@ -524,8 +563,8 @@ static int digital_in_send_ack(struct nfc_digital_dev *ddev,
 
 	ddev->saved_skb = pskb_copy(skb, GFP_KERNEL);
 
-	rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res,
-				 data_exch);
+	rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt,
+				 digital_in_recv_dep_res, data_exch);
 	if (rc) {
 		kfree_skb(skb);
 		kfree_skb(ddev->saved_skb);
@@ -559,8 +598,8 @@ static int digital_in_send_nack(struct nfc_digital_dev *ddev,
 
 	ddev->skb_add_crc(skb);
 
-	rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res,
-				 data_exch);
+	rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt,
+				 digital_in_recv_dep_res, data_exch);
 	if (rc)
 		kfree_skb(skb);
 
@@ -590,8 +629,8 @@ static int digital_in_send_atn(struct nfc_digital_dev *ddev,
 
 	ddev->skb_add_crc(skb);
 
-	rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res,
-				 data_exch);
+	rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt,
+				 digital_in_recv_dep_res, data_exch);
 	if (rc)
 		kfree_skb(skb);
 
@@ -624,8 +663,8 @@ static int digital_in_send_rtox(struct nfc_digital_dev *ddev,
 
 	ddev->skb_add_crc(skb);
 
-	rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res,
-				 data_exch);
+	rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt,
+				 digital_in_recv_dep_res, data_exch);
 	if (rc)
 		kfree_skb(skb);
 
@@ -642,7 +681,7 @@ static int digital_in_send_saved_skb(struct nfc_digital_dev *ddev,
 
 	skb_get(ddev->saved_skb);
 
-	rc = digital_in_send_cmd(ddev, ddev->saved_skb, 1500,
+	rc = digital_in_send_cmd(ddev, ddev->saved_skb, ddev->dep_rwt,
 				 digital_in_recv_dep_res, data_exch);
 	if (rc)
 		kfree_skb(ddev->saved_skb);
@@ -885,8 +924,8 @@ int digital_in_send_dep_req(struct nfc_digital_dev *ddev,
 
 	ddev->saved_skb = pskb_copy(tmp_skb, GFP_KERNEL);
 
-	rc = digital_in_send_cmd(ddev, tmp_skb, 1500, digital_in_recv_dep_res,
-				 data_exch);
+	rc = digital_in_send_cmd(ddev, tmp_skb, ddev->dep_rwt,
+				 digital_in_recv_dep_res, data_exch);
 	if (rc) {
 		if (tmp_skb != skb)
 			kfree_skb(tmp_skb);
@@ -1465,7 +1504,7 @@ static int digital_tg_send_atr_res(struct nfc_digital_dev *ddev,
 	atr_res->dir = DIGITAL_NFC_DEP_FRAME_DIR_IN;
 	atr_res->cmd = DIGITAL_CMD_ATR_RES;
 	memcpy(atr_res->nfcid3, atr_req->nfcid3, sizeof(atr_req->nfcid3));
-	atr_res->to = 8;
+	atr_res->to = DIGITAL_NFC_DEP_TG_MAX_WT;
 
 	ddev->local_payload_max = DIGITAL_PAYLOAD_SIZE_MAX;
 	payload_bits = digital_payload_size_to_bits(ddev->local_payload_max);
-- 
cgit v1.2.3


From 64b87639c9cbeb03e26bc65528416c961b1dde96 Mon Sep 17 00:00:00 2001
From: Liping Zhang <liping.zhang@spreadtrum.com>
Date: Sun, 3 Jul 2016 13:18:43 +0800
Subject: netfilter: conntrack: fix race between nf_conntrack proc read and
 hash resize
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When we do "cat /proc/net/nf_conntrack", and meanwhile resize the conntrack
hash table via /sys/module/nf_conntrack/parameters/hashsize, race will
happen, because reader can observe a newly allocated hash but the old size
(or vice versa). So oops will happen like follows：

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000017
  IP: [<ffffffffa0418e21>] seq_print_acct+0x11/0x50 [nf_conntrack]
  Call Trace:
  [<ffffffffa0412f4e>] ? ct_seq_show+0x14e/0x340 [nf_conntrack]
  [<ffffffff81261a1c>] seq_read+0x2cc/0x390
  [<ffffffff812a8d62>] proc_reg_read+0x42/0x70
  [<ffffffff8123bee7>] __vfs_read+0x37/0x130
  [<ffffffff81347980>] ? security_file_permission+0xa0/0xc0
  [<ffffffff8123cf75>] vfs_read+0x95/0x140
  [<ffffffff8123e475>] SyS_read+0x55/0xc0
  [<ffffffff817c2572>] entry_SYSCALL_64_fastpath+0x1a/0xa4

It is very easy to reproduce this kernel crash.
1. open one shell and input the following cmds:
  while : ; do
    echo $RANDOM > /sys/module/nf_conntrack/parameters/hashsize
  done
2. open more shells and input the following cmds:
  while : ; do
    cat /proc/net/nf_conntrack
  done
3. just wait a monent, oops will happen soon.

The solution in this patch is based on Florian's Commit 5e3c61f98175
("netfilter: conntrack: fix lookup race during hash resize"). And
add a wrapper function nf_conntrack_get_ht to get hash and hsize
suggested by Florian Westphal.

Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_core.h             |  2 ++
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 14 ++++++++++----
 net/netfilter/nf_conntrack_core.c                     | 17 +++++++++++++++++
 net/netfilter/nf_conntrack_standalone.c               | 14 +++++++++-----
 4 files changed, 38 insertions(+), 9 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 3e2f3328945c..79d7ac5c9740 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -51,6 +51,8 @@ bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
 			const struct nf_conntrack_l3proto *l3proto,
 			const struct nf_conntrack_l4proto *l4proto);
 
+void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize);
+
 /* Find a connection corresponding to a tuple. */
 struct nf_conntrack_tuple_hash *
 nf_conntrack_find_get(struct net *net,
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index c6f3c406f707..63923710f325 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -26,6 +26,8 @@
 
 struct ct_iter_state {
 	struct seq_net_private p;
+	struct hlist_nulls_head *hash;
+	unsigned int htable_size;
 	unsigned int bucket;
 };
 
@@ -35,10 +37,10 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
 	struct hlist_nulls_node *n;
 
 	for (st->bucket = 0;
-	     st->bucket < nf_conntrack_htable_size;
+	     st->bucket < st->htable_size;
 	     st->bucket++) {
 		n = rcu_dereference(
-			hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
+			hlist_nulls_first_rcu(&st->hash[st->bucket]));
 		if (!is_a_nulls(n))
 			return n;
 	}
@@ -53,11 +55,11 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
 	head = rcu_dereference(hlist_nulls_next_rcu(head));
 	while (is_a_nulls(head)) {
 		if (likely(get_nulls_value(head) == st->bucket)) {
-			if (++st->bucket >= nf_conntrack_htable_size)
+			if (++st->bucket >= st->htable_size)
 				return NULL;
 		}
 		head = rcu_dereference(
-			hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
+			hlist_nulls_first_rcu(&st->hash[st->bucket]));
 	}
 	return head;
 }
@@ -75,7 +77,11 @@ static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
 static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(RCU)
 {
+	struct ct_iter_state *st = seq->private;
+
 	rcu_read_lock();
+
+	nf_conntrack_get_ht(&st->hash, &st->htable_size);
 	return ct_get_idx(seq, *pos);
 }
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 153e33ffeeaa..1289e7e5e0de 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -460,6 +460,23 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
 	       net_eq(net, nf_ct_net(ct));
 }
 
+/* must be called with rcu read lock held */
+void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize)
+{
+	struct hlist_nulls_head *hptr;
+	unsigned int sequence, hsz;
+
+	do {
+		sequence = read_seqcount_begin(&nf_conntrack_generation);
+		hsz = nf_conntrack_htable_size;
+		hptr = nf_conntrack_hash;
+	} while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+
+	*hash = hptr;
+	*hsize = hsz;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_get_ht);
+
 /*
  * Warning :
  * - Caller must take a reference on returned object
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2aaa188ee961..958a1455ca7f 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -48,6 +48,8 @@ EXPORT_SYMBOL_GPL(print_tuple);
 
 struct ct_iter_state {
 	struct seq_net_private p;
+	struct hlist_nulls_head *hash;
+	unsigned int htable_size;
 	unsigned int bucket;
 	u_int64_t time_now;
 };
@@ -58,9 +60,10 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
 	struct hlist_nulls_node *n;
 
 	for (st->bucket = 0;
-	     st->bucket < nf_conntrack_htable_size;
+	     st->bucket < st->htable_size;
 	     st->bucket++) {
-		n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
+		n = rcu_dereference(
+			hlist_nulls_first_rcu(&st->hash[st->bucket]));
 		if (!is_a_nulls(n))
 			return n;
 	}
@@ -75,12 +78,11 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
 	head = rcu_dereference(hlist_nulls_next_rcu(head));
 	while (is_a_nulls(head)) {
 		if (likely(get_nulls_value(head) == st->bucket)) {
-			if (++st->bucket >= nf_conntrack_htable_size)
+			if (++st->bucket >= st->htable_size)
 				return NULL;
 		}
 		head = rcu_dereference(
-				hlist_nulls_first_rcu(
-					&nf_conntrack_hash[st->bucket]));
+			hlist_nulls_first_rcu(&st->hash[st->bucket]));
 	}
 	return head;
 }
@@ -102,6 +104,8 @@ static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
 
 	st->time_now = ktime_get_real_ns();
 	rcu_read_lock();
+
+	nf_conntrack_get_ht(&st->hash, &st->htable_size);
 	return ct_get_idx(seq, *pos);
 }
 
-- 
cgit v1.2.3


From 242922a027176cd260c5adce4ba6bbfa3a05190c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 3 Jul 2016 20:44:01 +0200
Subject: netfilter: conntrack: simplify early_drop

We don't need to acquire the bucket lock during early drop, we can
use lockless traveral just like ____nf_conntrack_find.

The timer deletion serves as synchronization point, if another cpu
attempts to evict same entry, only one will succeed with timer deletion.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h |  1 +
 net/netfilter/nf_conntrack_core.c    | 95 ++++++++++++++++++------------------
 2 files changed, 48 insertions(+), 48 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 5d3397f34583..2a5133e214c9 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -301,6 +301,7 @@ void nf_ct_tmpl_free(struct nf_conn *tmpl);
 
 #define NF_CT_STAT_INC(net, count)	  __this_cpu_inc((net)->ct.stat->count)
 #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count)
+#define NF_CT_STAT_ADD_ATOMIC(net, count, v) this_cpu_add((net)->ct.stat->count, (v))
 
 #define MODULE_ALIAS_NFCT_HELPER(helper) \
         MODULE_ALIAS("nfct-helper-" helper)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 1289e7e5e0de..e0e9c9a0f5ba 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -834,67 +834,66 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
 
 /* There's a small race here where we may free a just-assured
    connection.  Too bad: we're in trouble anyway. */
-static noinline int early_drop(struct net *net, unsigned int _hash)
+static unsigned int early_drop_list(struct net *net,
+				    struct hlist_nulls_head *head)
 {
-	/* Use oldest entry, which is roughly LRU */
 	struct nf_conntrack_tuple_hash *h;
-	struct nf_conn *tmp;
 	struct hlist_nulls_node *n;
-	unsigned int i, hash, sequence;
-	struct nf_conn *ct = NULL;
-	spinlock_t *lockp;
-	bool ret = false;
+	unsigned int drops = 0;
+	struct nf_conn *tmp;
 
-	i = 0;
+	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
+		tmp = nf_ct_tuplehash_to_ctrack(h);
 
-	local_bh_disable();
-restart:
-	sequence = read_seqcount_begin(&nf_conntrack_generation);
-	for (; i < NF_CT_EVICTION_RANGE; i++) {
-		hash = scale_hash(_hash++);
-		lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
-		nf_conntrack_lock(lockp);
-		if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
-			spin_unlock(lockp);
-			goto restart;
-		}
-		hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash],
-					       hnnode) {
-			tmp = nf_ct_tuplehash_to_ctrack(h);
-
-			if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
-			    !net_eq(nf_ct_net(tmp), net) ||
-			    nf_ct_is_dying(tmp))
-				continue;
-
-			if (atomic_inc_not_zero(&tmp->ct_general.use)) {
-				ct = tmp;
-				break;
-			}
-		}
+		if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
+		    !net_eq(nf_ct_net(tmp), net) ||
+		    nf_ct_is_dying(tmp))
+			continue;
 
-		spin_unlock(lockp);
-		if (ct)
-			break;
+		if (!atomic_inc_not_zero(&tmp->ct_general.use))
+			continue;
+
+		/* kill only if still in same netns -- might have moved due to
+		 * SLAB_DESTROY_BY_RCU rules.
+		 *
+		 * We steal the timer reference.  If that fails timer has
+		 * already fired or someone else deleted it. Just drop ref
+		 * and move to next entry.
+		 */
+		if (net_eq(nf_ct_net(tmp), net) &&
+		    nf_ct_is_confirmed(tmp) &&
+		    del_timer(&tmp->timeout) &&
+		    nf_ct_delete(tmp, 0, 0))
+			drops++;
+
+		nf_ct_put(tmp);
 	}
 
-	local_bh_enable();
+	return drops;
+}
 
-	if (!ct)
-		return false;
+static noinline int early_drop(struct net *net, unsigned int _hash)
+{
+	unsigned int i;
 
-	/* kill only if in same netns -- might have moved due to
-	 * SLAB_DESTROY_BY_RCU rules
-	 */
-	if (net_eq(nf_ct_net(ct), net) && del_timer(&ct->timeout)) {
-		if (nf_ct_delete(ct, 0, 0)) {
-			NF_CT_STAT_INC_ATOMIC(net, early_drop);
-			ret = true;
+	for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
+		struct hlist_nulls_head *ct_hash;
+		unsigned hash, sequence, drops;
+
+		do {
+			sequence = read_seqcount_begin(&nf_conntrack_generation);
+			hash = scale_hash(_hash++);
+			ct_hash = nf_conntrack_hash;
+		} while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+
+		drops = early_drop_list(net, &ct_hash[hash]);
+		if (drops) {
+			NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
+			return true;
 		}
 	}
 
-	nf_ct_put(ct);
-	return ret;
+	return false;
 }
 
 static struct nf_conn *
-- 
cgit v1.2.3


From 7c9664351980aaa6a4b8837a314360b3a4ad382a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 5 Jul 2016 12:07:23 +0200
Subject: netfilter: move nat hlist_head to nf_conn

The nat extension structure is 32bytes in size on x86_64:

struct nf_conn_nat {
        struct hlist_node          bysource;             /*     0    16 */
        struct nf_conn *           ct;                   /*    16     8 */
        union nf_conntrack_nat_help help;                /*    24     4 */
        int                        masq_index;           /*    28     4 */
        /* size: 32, cachelines: 1, members: 4 */
        /* last cacheline: 32 bytes */
};

The hlist is needed to quickly check for possible tuple collisions
when installing a new nat binding. Storing this in the extension
area has two drawbacks:

1. We need ct backpointer to get the conntrack struct from the extension.
2. When reallocation of extension area occurs we need to fixup the bysource
   hash head via hlist_replace_rcu.

We can avoid both by placing the hlist_head in nf_conn and place nf_conn in
the bysource hash rather than the extenstion.

We can also remove the ->move support; no other extension needs it.

Moving the entire nat extension into nf_conn would be possible as well but
then we have to add yet another callback for deletion from the bysource
hash table rather than just using nat extension ->destroy hook for this.

nf_conn size doesn't increase due to aligment, followup patch replaces
hlist_node with single pointer.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h        |  3 +++
 include/net/netfilter/nf_conntrack_extend.h |  3 ---
 include/net/netfilter/nf_nat.h              |  2 --
 net/netfilter/nf_conntrack_extend.c         | 15 ++-----------
 net/netfilter/nf_nat_core.c                 | 33 ++++++-----------------------
 5 files changed, 12 insertions(+), 44 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 2a5133e214c9..e5135d8728b4 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -117,6 +117,9 @@ struct nf_conn {
 	/* Extensions */
 	struct nf_ct_ext *ext;
 
+#if IS_ENABLED(CONFIG_NF_NAT)
+	struct hlist_node	nat_bysource;
+#endif
 	/* Storage reserved for other modules, must be the last member */
 	union nf_conntrack_proto proto;
 };
diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index b925395fa5ed..1c3035dda31f 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -99,9 +99,6 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id,
 struct nf_ct_ext_type {
 	/* Destroys relationships (can be NULL). */
 	void (*destroy)(struct nf_conn *ct);
-	/* Called when realloacted (can be NULL).
-	   Contents has already been moved. */
-	void (*move)(void *new, void *old);
 
 	enum nf_ct_ext_id id;
 
diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index 344b1ab19220..02515f7ed4cc 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -29,8 +29,6 @@ struct nf_conn;
 
 /* The structure embedded in the conntrack structure. */
 struct nf_conn_nat {
-	struct hlist_node bysource;
-	struct nf_conn *ct;
 	union nf_conntrack_nat_help help;
 #if IS_ENABLED(CONFIG_NF_NAT_MASQUERADE_IPV4) || \
     IS_ENABLED(CONFIG_NF_NAT_MASQUERADE_IPV6)
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 1a9545965c0d..02bcf00c2492 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -73,7 +73,7 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id,
 			     size_t var_alloc_len, gfp_t gfp)
 {
 	struct nf_ct_ext *old, *new;
-	int i, newlen, newoff;
+	int newlen, newoff;
 	struct nf_ct_ext_type *t;
 
 	/* Conntrack must not be confirmed to avoid races on reallocation. */
@@ -99,19 +99,8 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id,
 		return NULL;
 
 	if (new != old) {
-		for (i = 0; i < NF_CT_EXT_NUM; i++) {
-			if (!__nf_ct_ext_exist(old, i))
-				continue;
-
-			rcu_read_lock();
-			t = rcu_dereference(nf_ct_ext_types[i]);
-			if (t && t->move)
-				t->move((void *)new + new->offset[i],
-					(void *)old + old->offset[i]);
-			rcu_read_unlock();
-		}
 		kfree_rcu(old, rcu);
-		ct->ext = new;
+		rcu_assign_pointer(ct->ext, new);
 	}
 
 	new->offset[id] = newoff;
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 6877a396f8fc..692534701426 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -198,11 +198,9 @@ find_appropriate_src(struct net *net,
 		     const struct nf_nat_range *range)
 {
 	unsigned int h = hash_by_src(net, tuple);
-	const struct nf_conn_nat *nat;
 	const struct nf_conn *ct;
 
-	hlist_for_each_entry_rcu(nat, &nf_nat_bysource[h], bysource) {
-		ct = nat->ct;
+	hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
 		if (same_src(ct, tuple) &&
 		    net_eq(net, nf_ct_net(ct)) &&
 		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
@@ -435,8 +433,7 @@ nf_nat_setup_info(struct nf_conn *ct,
 		spin_lock_bh(&nf_nat_lock);
 		/* nf_conntrack_alter_reply might re-allocate extension aera */
 		nat = nfct_nat(ct);
-		nat->ct = ct;
-		hlist_add_head_rcu(&nat->bysource,
+		hlist_add_head_rcu(&ct->nat_bysource,
 				   &nf_nat_bysource[srchash]);
 		spin_unlock_bh(&nf_nat_lock);
 	}
@@ -543,7 +540,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
 	if (nf_nat_proto_remove(ct, data))
 		return 1;
 
-	if (!nat || !nat->ct)
+	if (!nat)
 		return 0;
 
 	/* This netns is being destroyed, and conntrack has nat null binding.
@@ -556,9 +553,8 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
 		return 1;
 
 	spin_lock_bh(&nf_nat_lock);
-	hlist_del_rcu(&nat->bysource);
+	hlist_del_rcu(&ct->nat_bysource);
 	ct->status &= ~IPS_NAT_DONE_MASK;
-	nat->ct = NULL;
 	spin_unlock_bh(&nf_nat_lock);
 
 	add_timer(&ct->timeout);
@@ -688,27 +684,13 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
 {
 	struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
 
-	if (nat == NULL || nat->ct == NULL)
+	if (!nat)
 		return;
 
-	NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
-
-	spin_lock_bh(&nf_nat_lock);
-	hlist_del_rcu(&nat->bysource);
-	spin_unlock_bh(&nf_nat_lock);
-}
-
-static void nf_nat_move_storage(void *new, void *old)
-{
-	struct nf_conn_nat *new_nat = new;
-	struct nf_conn_nat *old_nat = old;
-	struct nf_conn *ct = old_nat->ct;
-
-	if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
-		return;
+	NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
 
 	spin_lock_bh(&nf_nat_lock);
-	hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
+	hlist_del_rcu(&ct->nat_bysource);
 	spin_unlock_bh(&nf_nat_lock);
 }
 
@@ -716,7 +698,6 @@ static struct nf_ct_ext_type nat_extend __read_mostly = {
 	.len		= sizeof(struct nf_conn_nat),
 	.align		= __alignof__(struct nf_conn_nat),
 	.destroy	= nf_nat_cleanup_conntrack,
-	.move		= nf_nat_move_storage,
 	.id		= NF_CT_EXT_NAT,
 	.flags		= NF_CT_EXT_F_PREALLOC,
 };
-- 
cgit v1.2.3


From 870190a9ec9075205c0fa795a09fa931694a3ff1 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 5 Jul 2016 12:07:24 +0200
Subject: netfilter: nat: convert nat bysrc hash to rhashtable

It did use a fixed-size bucket list plus single lock to protect add/del.

Unlike the main conntrack table we only need to add and remove keys.
Convert it to rhashtable to get table autosizing and per-bucket locking.

The maximum number of entries is -- as before -- tied to the number of
conntracks so we do not need another upperlimit.

The change does not handle rhashtable_remove_fast error, only possible
"error" is -ENOENT, and that is something that can happen legitimetely,
e.g. because nat module was inserted at a later time and no src manip
took place yet.

Tested with http-client-benchmark + httpterm with DNAT and SNAT rules
in place.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h |   3 +-
 include/net/netfilter/nf_nat.h       |   1 +
 net/netfilter/nf_nat_core.c          | 126 +++++++++++++++++++----------------
 3 files changed, 71 insertions(+), 59 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index e5135d8728b4..a08825b7e955 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -17,6 +17,7 @@
 #include <linux/bitops.h>
 #include <linux/compiler.h>
 #include <linux/atomic.h>
+#include <linux/rhashtable.h>
 
 #include <linux/netfilter/nf_conntrack_tcp.h>
 #include <linux/netfilter/nf_conntrack_dccp.h>
@@ -118,7 +119,7 @@ struct nf_conn {
 	struct nf_ct_ext *ext;
 
 #if IS_ENABLED(CONFIG_NF_NAT)
-	struct hlist_node	nat_bysource;
+	struct rhash_head	nat_bysource;
 #endif
 	/* Storage reserved for other modules, must be the last member */
 	union nf_conntrack_proto proto;
diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index 02515f7ed4cc..c327a431a6f3 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -1,5 +1,6 @@
 #ifndef _NF_NAT_H
 #define _NF_NAT_H
+#include <linux/rhashtable.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter/nf_nat.h>
 #include <net/netfilter/nf_conntrack_tuple.h>
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 692534701426..de31818417b8 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -30,17 +30,19 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <linux/netfilter/nf_nat.h>
 
-static DEFINE_SPINLOCK(nf_nat_lock);
-
 static DEFINE_MUTEX(nf_nat_proto_mutex);
 static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
 						__read_mostly;
 static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
 						__read_mostly;
 
-static struct hlist_head *nf_nat_bysource __read_mostly;
-static unsigned int nf_nat_htable_size __read_mostly;
-static unsigned int nf_nat_hash_rnd __read_mostly;
+struct nf_nat_conn_key {
+	const struct net *net;
+	const struct nf_conntrack_tuple *tuple;
+	const struct nf_conntrack_zone *zone;
+};
+
+static struct rhashtable nf_nat_bysource_table;
 
 inline const struct nf_nat_l3proto *
 __nf_nat_l3proto_find(u8 family)
@@ -119,19 +121,17 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
 EXPORT_SYMBOL(nf_xfrm_me_harder);
 #endif /* CONFIG_XFRM */
 
-/* We keep an extra hash for each conntrack, for fast searching. */
-static inline unsigned int
-hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
+static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed)
 {
-	unsigned int hash;
-
-	get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
+	const struct nf_conntrack_tuple *t;
+	const struct nf_conn *ct = data;
 
+	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
 	/* Original src, to ensure we map it consistently if poss. */
-	hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
-		      tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
 
-	return reciprocal_scale(hash, nf_nat_htable_size);
+	seed ^= net_hash_mix(nf_ct_net(ct));
+	return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32),
+		      t->dst.protonum ^ seed);
 }
 
 /* Is this tuple already taken? (not by us) */
@@ -187,6 +187,26 @@ same_src(const struct nf_conn *ct,
 		t->src.u.all == tuple->src.u.all);
 }
 
+static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg,
+			       const void *obj)
+{
+	const struct nf_nat_conn_key *key = arg->key;
+	const struct nf_conn *ct = obj;
+
+	return same_src(ct, key->tuple) &&
+	       net_eq(nf_ct_net(ct), key->net) &&
+	       nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL);
+}
+
+static struct rhashtable_params nf_nat_bysource_params = {
+	.head_offset = offsetof(struct nf_conn, nat_bysource),
+	.obj_hashfn = nf_nat_bysource_hash,
+	.obj_cmpfn = nf_nat_bysource_cmp,
+	.nelem_hint = 256,
+	.min_size = 1024,
+	.nulls_base = (1U << RHT_BASE_SHIFT),
+};
+
 /* Only called for SRC manip */
 static int
 find_appropriate_src(struct net *net,
@@ -197,23 +217,23 @@ find_appropriate_src(struct net *net,
 		     struct nf_conntrack_tuple *result,
 		     const struct nf_nat_range *range)
 {
-	unsigned int h = hash_by_src(net, tuple);
 	const struct nf_conn *ct;
+	struct nf_nat_conn_key key = {
+		.net = net,
+		.tuple = tuple,
+		.zone = zone
+	};
 
-	hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
-		if (same_src(ct, tuple) &&
-		    net_eq(net, nf_ct_net(ct)) &&
-		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
-			/* Copy source part from reply tuple. */
-			nf_ct_invert_tuplepr(result,
-				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-			result->dst = tuple->dst;
-
-			if (in_range(l3proto, l4proto, result, range))
-				return 1;
-		}
-	}
-	return 0;
+	ct = rhashtable_lookup_fast(&nf_nat_bysource_table, &key,
+				    nf_nat_bysource_params);
+	if (!ct)
+		return 0;
+
+	nf_ct_invert_tuplepr(result,
+			     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	result->dst = tuple->dst;
+
+	return in_range(l3proto, l4proto, result, range);
 }
 
 /* For [FUTURE] fragmentation handling, we want the least-used
@@ -385,7 +405,6 @@ nf_nat_setup_info(struct nf_conn *ct,
 		  const struct nf_nat_range *range,
 		  enum nf_nat_manip_type maniptype)
 {
-	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_tuple curr_tuple, new_tuple;
 	struct nf_conn_nat *nat;
 
@@ -426,16 +445,13 @@ nf_nat_setup_info(struct nf_conn *ct,
 	}
 
 	if (maniptype == NF_NAT_MANIP_SRC) {
-		unsigned int srchash;
-
-		srchash = hash_by_src(net,
-				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-		spin_lock_bh(&nf_nat_lock);
-		/* nf_conntrack_alter_reply might re-allocate extension aera */
-		nat = nfct_nat(ct);
-		hlist_add_head_rcu(&ct->nat_bysource,
-				   &nf_nat_bysource[srchash]);
-		spin_unlock_bh(&nf_nat_lock);
+		int err;
+
+		err = rhashtable_insert_fast(&nf_nat_bysource_table,
+					     &ct->nat_bysource,
+					     nf_nat_bysource_params);
+		if (err)
+			return NF_DROP;
 	}
 
 	/* It's done. */
@@ -552,10 +568,10 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
 	if (!del_timer(&ct->timeout))
 		return 1;
 
-	spin_lock_bh(&nf_nat_lock);
-	hlist_del_rcu(&ct->nat_bysource);
 	ct->status &= ~IPS_NAT_DONE_MASK;
-	spin_unlock_bh(&nf_nat_lock);
+
+	rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource,
+			       nf_nat_bysource_params);
 
 	add_timer(&ct->timeout);
 
@@ -687,11 +703,8 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
 	if (!nat)
 		return;
 
-	NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
-
-	spin_lock_bh(&nf_nat_lock);
-	hlist_del_rcu(&ct->nat_bysource);
-	spin_unlock_bh(&nf_nat_lock);
+	rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource,
+			       nf_nat_bysource_params);
 }
 
 static struct nf_ct_ext_type nat_extend __read_mostly = {
@@ -826,16 +839,13 @@ static int __init nf_nat_init(void)
 {
 	int ret;
 
-	/* Leave them the same for the moment. */
-	nf_nat_htable_size = nf_conntrack_htable_size;
-
-	nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
-	if (!nf_nat_bysource)
-		return -ENOMEM;
+	ret = rhashtable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
+	if (ret)
+		return ret;
 
 	ret = nf_ct_extend_register(&nat_extend);
 	if (ret < 0) {
-		nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
+		rhashtable_destroy(&nf_nat_bysource_table);
 		printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
 		return ret;
 	}
@@ -859,7 +869,7 @@ static int __init nf_nat_init(void)
 	return 0;
 
  cleanup_extend:
-	nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
+	rhashtable_destroy(&nf_nat_bysource_table);
 	nf_ct_extend_unregister(&nat_extend);
 	return ret;
 }
@@ -877,8 +887,8 @@ static void __exit nf_nat_cleanup(void)
 #endif
 	for (i = 0; i < NFPROTO_NUMPROTO; i++)
 		kfree(nf_nat_l4protos[i]);
-	synchronize_net();
-	nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
+
+	rhashtable_destroy(&nf_nat_bysource_table);
 }
 
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From d51ed8367bcbbb06f4f4986d1ef7dc2480bed1ad Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 8 Jul 2016 13:08:50 +0200
Subject: netfilter: constify arg to is_dying/confirmed

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index a08825b7e955..1e04911b78ea 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -270,12 +270,12 @@ static inline int nf_ct_is_template(const struct nf_conn *ct)
 }
 
 /* It's confirmed if it is, or has been in the hash table. */
-static inline int nf_ct_is_confirmed(struct nf_conn *ct)
+static inline int nf_ct_is_confirmed(const struct nf_conn *ct)
 {
 	return test_bit(IPS_CONFIRMED_BIT, &ct->status);
 }
 
-static inline int nf_ct_is_dying(struct nf_conn *ct)
+static inline int nf_ct_is_dying(const struct nf_conn *ct)
 {
 	return test_bit(IPS_DYING_BIT, &ct->status);
 }
-- 
cgit v1.2.3


From 42a55769132fdf4f44bac1471b371d7f80bcde35 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 8 Jul 2016 14:41:49 +0200
Subject: netfilter: nf_tables: get rid of possible_net_t from set and
 basechain

We can pass the netns pointer as parameter to the functions that need to
gain access to it. From basechains, I didn't find any client for this
field anymore so let's remove this too.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 21 +++++++++++----------
 net/netfilter/nf_tables_api.c     | 10 ++++------
 net/netfilter/nft_hash.c          | 20 ++++++++++----------
 net/netfilter/nft_lookup.c        |  2 +-
 net/netfilter/nft_rbtree.c        | 26 ++++++++++++++------------
 5 files changed, 40 insertions(+), 39 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 30c1d9489ae2..f2f13399ce44 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -236,7 +236,8 @@ struct nft_expr;
  *	@features: features supported by the implementation
  */
 struct nft_set_ops {
-	bool				(*lookup)(const struct nft_set *set,
+	bool				(*lookup)(const struct net *net,
+						  const struct nft_set *set,
 						  const u32 *key,
 						  const struct nft_set_ext **ext);
 	bool				(*update)(struct nft_set *set,
@@ -248,11 +249,14 @@ struct nft_set_ops {
 						  struct nft_regs *regs,
 						  const struct nft_set_ext **ext);
 
-	int				(*insert)(const struct nft_set *set,
+	int				(*insert)(const struct net *net,
+						  const struct nft_set *set,
 						  const struct nft_set_elem *elem);
-	void				(*activate)(const struct nft_set *set,
+	void				(*activate)(const struct net *net,
+						    const struct nft_set *set,
 						    const struct nft_set_elem *elem);
-	void *				(*deactivate)(const struct nft_set *set,
+	void *				(*deactivate)(const struct net *net,
+						      const struct nft_set *set,
 						      const struct nft_set_elem *elem);
 	void				(*remove)(const struct nft_set *set,
 						  const struct nft_set_elem *elem);
@@ -295,7 +299,6 @@ void nft_unregister_set(struct nft_set_ops *ops);
  *	@udlen: user data length
  *	@udata: user data
  * 	@ops: set ops
- * 	@pnet: network namespace
  * 	@flags: set flags
  *	@genmask: generation mask
  * 	@klen: key length
@@ -318,7 +321,6 @@ struct nft_set {
 	unsigned char			*udata;
 	/* runtime data below here */
 	const struct nft_set_ops	*ops ____cacheline_aligned;
-	possible_net_t			pnet;
 	u16				flags:14,
 					genmask:2;
 	u8				klen;
@@ -804,7 +806,6 @@ struct nft_stats {
  *	struct nft_base_chain - nf_tables base chain
  *
  *	@ops: netfilter hook ops
- *	@pnet: net namespace that this chain belongs to
  *	@type: chain type
  *	@policy: default policy
  *	@stats: per-cpu chain stats
@@ -813,7 +814,6 @@ struct nft_stats {
  */
 struct nft_base_chain {
 	struct nf_hook_ops		ops[NFT_HOOK_OPS_MAX];
-	possible_net_t			pnet;
 	const struct nf_chain_type	*type;
 	u8				policy;
 	u8				flags;
@@ -1009,10 +1009,11 @@ static inline bool nft_set_elem_active(const struct nft_set_ext *ext,
 	return !(ext->genmask & genmask);
 }
 
-static inline void nft_set_elem_change_active(const struct nft_set *set,
+static inline void nft_set_elem_change_active(const struct net *net,
+					      const struct nft_set *set,
 					      struct nft_set_ext *ext)
 {
-	ext->genmask ^= nft_genmask_next(read_pnet(&set->pnet));
+	ext->genmask ^= nft_genmask_next(net);
 }
 
 /*
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 18b7f8578ee0..0211eaec9060 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1405,7 +1405,6 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 			rcu_assign_pointer(basechain->stats, stats);
 		}
 
-		write_pnet(&basechain->pnet, net);
 		basechain->type = type;
 		chain = &basechain->chain;
 
@@ -2841,7 +2840,6 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	}
 
 	INIT_LIST_HEAD(&set->bindings);
-	write_pnet(&set->pnet, net);
 	set->ops   = ops;
 	set->ktype = ktype;
 	set->klen  = desc.klen;
@@ -3520,7 +3518,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		goto err4;
 
 	ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
-	err = set->ops->insert(set, &elem);
+	err = set->ops->insert(ctx->net, set, &elem);
 	if (err < 0)
 		goto err5;
 
@@ -3644,7 +3642,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
 		goto err3;
 	}
 
-	priv = set->ops->deactivate(set, &elem);
+	priv = set->ops->deactivate(ctx->net, set, &elem);
 	if (priv == NULL) {
 		err = -ENOENT;
 		goto err4;
@@ -4018,7 +4016,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 		case NFT_MSG_NEWSETELEM:
 			te = (struct nft_trans_elem *)trans->data;
 
-			te->set->ops->activate(te->set, &te->elem);
+			te->set->ops->activate(net, te->set, &te->elem);
 			nf_tables_setelem_notify(&trans->ctx, te->set,
 						 &te->elem,
 						 NFT_MSG_NEWSETELEM, 0);
@@ -4143,7 +4141,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 		case NFT_MSG_DELSETELEM:
 			te = (struct nft_trans_elem *)trans->data;
 
-			te->set->ops->activate(te->set, &te->elem);
+			te->set->ops->activate(net, te->set, &te->elem);
 			te->set->ndeact--;
 
 			nft_trans_destroy(trans);
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index ea924816b7b8..564fa7929ed5 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -71,13 +71,13 @@ static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg,
 	return 0;
 }
 
-static bool nft_hash_lookup(const struct nft_set *set, const u32 *key,
-			    const struct nft_set_ext **ext)
+static bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
+			    const u32 *key, const struct nft_set_ext **ext)
 {
 	struct nft_hash *priv = nft_set_priv(set);
 	const struct nft_hash_elem *he;
 	struct nft_hash_cmp_arg arg = {
-		.genmask = nft_genmask_cur(read_pnet(&set->pnet)),
+		.genmask = nft_genmask_cur(net),
 		.set	 = set,
 		.key	 = key,
 	};
@@ -125,13 +125,13 @@ err1:
 	return false;
 }
 
-static int nft_hash_insert(const struct nft_set *set,
+static int nft_hash_insert(const struct net *net, const struct nft_set *set,
 			   const struct nft_set_elem *elem)
 {
 	struct nft_hash *priv = nft_set_priv(set);
 	struct nft_hash_elem *he = elem->priv;
 	struct nft_hash_cmp_arg arg = {
-		.genmask = nft_genmask_next(read_pnet(&set->pnet)),
+		.genmask = nft_genmask_next(net),
 		.set	 = set,
 		.key	 = elem->key.val.data,
 	};
@@ -140,20 +140,20 @@ static int nft_hash_insert(const struct nft_set *set,
 					    nft_hash_params);
 }
 
-static void nft_hash_activate(const struct nft_set *set,
+static void nft_hash_activate(const struct net *net, const struct nft_set *set,
 			      const struct nft_set_elem *elem)
 {
 	struct nft_hash_elem *he = elem->priv;
 
-	nft_set_elem_change_active(set, &he->ext);
+	nft_set_elem_change_active(net, set, &he->ext);
 	nft_set_elem_clear_busy(&he->ext);
 }
 
-static void *nft_hash_deactivate(const struct nft_set *set,
+static void *nft_hash_deactivate(const struct net *net,
+				 const struct nft_set *set,
 				 const struct nft_set_elem *elem)
 {
 	struct nft_hash *priv = nft_set_priv(set);
-	struct net *net = read_pnet(&set->pnet);
 	struct nft_hash_elem *he;
 	struct nft_hash_cmp_arg arg = {
 		.genmask = nft_genmask_next(net),
@@ -166,7 +166,7 @@ static void *nft_hash_deactivate(const struct nft_set *set,
 	if (he != NULL) {
 		if (!nft_set_elem_mark_busy(&he->ext) ||
 		    !nft_is_active(net, &he->ext))
-			nft_set_elem_change_active(set, &he->ext);
+			nft_set_elem_change_active(net, set, &he->ext);
 		else
 			he = NULL;
 	}
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index b8d18f598569..e164325d1bc0 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -35,7 +35,7 @@ static void nft_lookup_eval(const struct nft_expr *expr,
 	const struct nft_set_ext *ext;
 	bool found;
 
-	found = set->ops->lookup(set, &regs->data[priv->sreg], &ext) ^
+	found = set->ops->lookup(pkt->net, set, &regs->data[priv->sreg], &ext) ^
 		priv->invert;
 
 	if (!found) {
diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c
index c0f638745adc..6473936d05c6 100644
--- a/net/netfilter/nft_rbtree.c
+++ b/net/netfilter/nft_rbtree.c
@@ -41,13 +41,13 @@ static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
 	return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
 }
 
-static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key,
-			      const struct nft_set_ext **ext)
+static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+			      const u32 *key, const struct nft_set_ext **ext)
 {
 	const struct nft_rbtree *priv = nft_set_priv(set);
 	const struct nft_rbtree_elem *rbe, *interval = NULL;
+	u8 genmask = nft_genmask_cur(net);
 	const struct rb_node *parent;
-	u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
 	const void *this;
 	int d;
 
@@ -93,13 +93,13 @@ out:
 	return false;
 }
 
-static int __nft_rbtree_insert(const struct nft_set *set,
+static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 			       struct nft_rbtree_elem *new)
 {
 	struct nft_rbtree *priv = nft_set_priv(set);
+	u8 genmask = nft_genmask_next(net);
 	struct nft_rbtree_elem *rbe;
 	struct rb_node *parent, **p;
-	u8 genmask = nft_genmask_next(read_pnet(&set->pnet));
 	int d;
 
 	parent = NULL;
@@ -132,14 +132,14 @@ static int __nft_rbtree_insert(const struct nft_set *set,
 	return 0;
 }
 
-static int nft_rbtree_insert(const struct nft_set *set,
+static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 			     const struct nft_set_elem *elem)
 {
 	struct nft_rbtree_elem *rbe = elem->priv;
 	int err;
 
 	spin_lock_bh(&nft_rbtree_lock);
-	err = __nft_rbtree_insert(set, rbe);
+	err = __nft_rbtree_insert(net, set, rbe);
 	spin_unlock_bh(&nft_rbtree_lock);
 
 	return err;
@@ -156,21 +156,23 @@ static void nft_rbtree_remove(const struct nft_set *set,
 	spin_unlock_bh(&nft_rbtree_lock);
 }
 
-static void nft_rbtree_activate(const struct nft_set *set,
+static void nft_rbtree_activate(const struct net *net,
+				const struct nft_set *set,
 				const struct nft_set_elem *elem)
 {
 	struct nft_rbtree_elem *rbe = elem->priv;
 
-	nft_set_elem_change_active(set, &rbe->ext);
+	nft_set_elem_change_active(net, set, &rbe->ext);
 }
 
-static void *nft_rbtree_deactivate(const struct nft_set *set,
+static void *nft_rbtree_deactivate(const struct net *net,
+				   const struct nft_set *set,
 				   const struct nft_set_elem *elem)
 {
 	const struct nft_rbtree *priv = nft_set_priv(set);
 	const struct rb_node *parent = priv->root.rb_node;
 	struct nft_rbtree_elem *rbe, *this = elem->priv;
-	u8 genmask = nft_genmask_next(read_pnet(&set->pnet));
+	u8 genmask = nft_genmask_next(net);
 	int d;
 
 	while (parent != NULL) {
@@ -196,7 +198,7 @@ static void *nft_rbtree_deactivate(const struct nft_set *set,
 				parent = parent->rb_right;
 				continue;
 			}
-			nft_set_elem_change_active(set, &rbe->ext);
+			nft_set_elem_change_active(net, set, &rbe->ext);
 			return rbe;
 		}
 	}
-- 
cgit v1.2.3


From 28aa4c26fce2202db8d42ae76b639ca1d9a23d25 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 9 Jul 2016 19:47:40 +0800
Subject: sctp: add SCTP_PR_SUPPORTED on sctp sockopt

According to section 4.5 of rfc7496, prsctp_enable should be per asoc.
We will add prsctp_enable to both asoc and ep, and replace the places
where it used net.sctp->prsctp_enable with asoc->prsctp_enable.

ep->prsctp_enable will be initialized with net.sctp->prsctp_enable, and
asoc->prsctp_enable will be initialized with ep->prsctp_enable. We can
also modify it's value through sockopt SCTP_PR_SUPPORTED.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  6 ++--
 include/uapi/linux/sctp.h  |  1 +
 net/sctp/associola.c       |  1 +
 net/sctp/endpointola.c     |  1 +
 net/sctp/sm_make_chunk.c   | 12 +++----
 net/sctp/socket.c          | 80 ++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 93 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 83c5ec58b93a..07115ca9de4d 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1256,7 +1256,8 @@ struct sctp_endpoint {
 	/* SCTP-AUTH: endpoint shared keys */
 	struct list_head endpoint_shared_keys;
 	__u16 active_key_id;
-	__u8  auth_enable;
+	__u8  auth_enable:1,
+	      prsctp_enable:1;
 };
 
 /* Recover the outter endpoint structure. */
@@ -1848,7 +1849,8 @@ struct sctp_association {
 	__u16 active_key_id;
 
 	__u8 need_ecne:1,	/* Need to send an ECNE Chunk? */
-	     temp:1;		/* Is it a temporary association? */
+	     temp:1,		/* Is it a temporary association? */
+	     prsctp_enable:1;
 
 	struct sctp_priv_assoc_stats stats;
 };
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index ce70fe6b45df..aa08906f292d 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -112,6 +112,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_SOCKOPT_CONNECTX	110		/* CONNECTX requests. */
 #define SCTP_SOCKOPT_CONNECTX3	111	/* CONNECTX requests (updated) */
 #define SCTP_GET_ASSOC_STATS	112	/* Read only */
+#define SCTP_PR_SUPPORTED	113
 
 /* These are bit fields for msghdr->msg_flags.  See section 5.1.  */
 /* On user space Linux, these live in <bits/socket.h> as an enum.  */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index e1849f3714ad..1c23060c41a6 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -268,6 +268,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 		goto fail_init;
 
 	asoc->active_key_id = ep->active_key_id;
+	asoc->prsctp_enable = ep->prsctp_enable;
 
 	/* Save the hmacs and chunks list into this association */
 	if (ep->auth_hmacs_list)
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 9d494e35e7f9..1f03065686fe 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -163,6 +163,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
 	 */
 	ep->auth_hmacs_list = auth_hmacs;
 	ep->auth_chunk_list = auth_chunks;
+	ep->prsctp_enable = net->sctp.prsctp_enable;
 
 	return ep;
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 56f364d8f932..0e3045ef57fa 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -261,7 +261,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
 	chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types));
 	chunksize += sizeof(ecap_param);
 
-	if (net->sctp.prsctp_enable)
+	if (asoc->prsctp_enable)
 		chunksize += sizeof(prsctp_param);
 
 	/* ADDIP: Section 4.2.7:
@@ -355,7 +355,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
 		sctp_addto_param(retval, num_ext, extensions);
 	}
 
-	if (net->sctp.prsctp_enable)
+	if (asoc->prsctp_enable)
 		sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);
 
 	if (sp->adaptation_ind) {
@@ -2024,8 +2024,8 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
 	for (i = 0; i < num_ext; i++) {
 		switch (param.ext->chunks[i]) {
 		case SCTP_CID_FWD_TSN:
-			if (net->sctp.prsctp_enable && !asoc->peer.prsctp_capable)
-				    asoc->peer.prsctp_capable = 1;
+			if (asoc->prsctp_enable && !asoc->peer.prsctp_capable)
+				asoc->peer.prsctp_capable = 1;
 			break;
 		case SCTP_CID_AUTH:
 			/* if the peer reports AUTH, assume that he
@@ -2169,7 +2169,7 @@ static sctp_ierror_t sctp_verify_param(struct net *net,
 		break;
 
 	case SCTP_PARAM_FWD_TSN_SUPPORT:
-		if (net->sctp.prsctp_enable)
+		if (ep->prsctp_enable)
 			break;
 		goto fallthrough;
 
@@ -2653,7 +2653,7 @@ do_addr_param:
 		break;
 
 	case SCTP_PARAM_FWD_TSN_SUPPORT:
-		if (net->sctp.prsctp_enable) {
+		if (asoc->prsctp_enable) {
 			asoc->peer.prsctp_capable = 1;
 			break;
 		}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index cdabbd8219b1..7460ddebd9ce 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3661,6 +3661,39 @@ static int sctp_setsockopt_recvnxtinfo(struct sock *sk,
 	return 0;
 }
 
+static int sctp_setsockopt_pr_supported(struct sock *sk,
+					char __user *optval,
+					unsigned int optlen)
+{
+	struct sctp_assoc_value params;
+	struct sctp_association *asoc;
+	int retval = -EINVAL;
+
+	if (optlen != sizeof(params))
+		goto out;
+
+	if (copy_from_user(&params, optval, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (asoc) {
+		asoc->prsctp_enable = !!params.assoc_value;
+	} else if (!params.assoc_id) {
+		struct sctp_sock *sp = sctp_sk(sk);
+
+		sp->ep->prsctp_enable = !!params.assoc_value;
+	} else {
+		goto out;
+	}
+
+	retval = 0;
+
+out:
+	return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3821,6 +3854,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_RECVNXTINFO:
 		retval = sctp_setsockopt_recvnxtinfo(sk, optval, optlen);
 		break;
+	case SCTP_PR_SUPPORTED:
+		retval = sctp_setsockopt_pr_supported(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -6166,6 +6202,47 @@ static int sctp_getsockopt_recvnxtinfo(struct sock *sk,	int len,
 	return 0;
 }
 
+static int sctp_getsockopt_pr_supported(struct sock *sk, int len,
+					char __user *optval,
+					int __user *optlen)
+{
+	struct sctp_assoc_value params;
+	struct sctp_association *asoc;
+	int retval = -EFAULT;
+
+	if (len < sizeof(params)) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	len = sizeof(params);
+	if (copy_from_user(&params, optval, len))
+		goto out;
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (asoc) {
+		params.assoc_value = asoc->prsctp_enable;
+	} else if (!params.assoc_id) {
+		struct sctp_sock *sp = sctp_sk(sk);
+
+		params.assoc_value = sp->ep->prsctp_enable;
+	} else {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	if (put_user(len, optlen))
+		goto out;
+
+	if (copy_to_user(optval, &params, len))
+		goto out;
+
+	retval = 0;
+
+out:
+	return retval;
+}
+
 static int sctp_getsockopt(struct sock *sk, int level, int optname,
 			   char __user *optval, int __user *optlen)
 {
@@ -6319,6 +6396,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
 	case SCTP_RECVNXTINFO:
 		retval = sctp_getsockopt_recvnxtinfo(sk, len, optval, optlen);
 		break;
+	case SCTP_PR_SUPPORTED:
+		retval = sctp_getsockopt_pr_supported(sk, len, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
-- 
cgit v1.2.3


From 826d253d57b11f69add81c8086d2e7f1dce5ec77 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 9 Jul 2016 19:47:42 +0800
Subject: sctp: add SCTP_PR_ASSOC_STATUS on sctp sockopt

This patch adds SCTP_PR_ASSOC_STATUS to sctp sockopt, which is used
to dump the prsctp statistics info from the asoc. The prsctp statistics
includes abandoned_sent/unsent from the asoc. abandoned_sent is the
count of the packets we drop packets from retransmit/transmited queue,
and abandoned_unsent is the count of the packets we drop from out_queue
according to the policy.

Note: another option for prsctp statistics dump described in rfc is
SCTP_PR_STREAM_STATUS, which is used to dump the prsctp statistics
info from each stream. But by now, linux doesn't yet have per stream
statistics info, it needs rfc6525 to be implemented. As the prsctp
statistics for each stream has to be based on per stream statistics,
we will delay it until rfc6525 is done in linux.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  3 +++
 include/uapi/linux/sctp.h  | 12 +++++++++
 net/sctp/socket.c          | 62 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+)

(limited to 'include/net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 07115ca9de4d..d8e464aacb20 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1853,6 +1853,9 @@ struct sctp_association {
 	     prsctp_enable:1;
 
 	struct sctp_priv_assoc_stats stats;
+
+	__u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1];
+	__u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1];
 };
 
 
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 984cf2e9a61d..d304f4c9792c 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -114,6 +114,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_GET_ASSOC_STATS	112	/* Read only */
 #define SCTP_PR_SUPPORTED	113
 #define SCTP_DEFAULT_PRINFO	114
+#define SCTP_PR_ASSOC_STATUS	115
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE	0x0000
@@ -926,6 +927,17 @@ struct sctp_paddrthlds {
 	__u16 spt_pathpfthld;
 };
 
+/*
+ * Socket Option for Getting the Association/Stream-Specific PR-SCTP Status
+ */
+struct sctp_prstatus {
+	sctp_assoc_t sprstat_assoc_id;
+	__u16 sprstat_sid;
+	__u16 sprstat_policy;
+	__u64 sprstat_abandoned_unsent;
+	__u64 sprstat_abandoned_sent;
+};
+
 struct sctp_default_prinfo {
 	sctp_assoc_t pr_assoc_id;
 	__u32 pr_value;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index c03fe1b76706..c3167c43a9c1 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -6330,6 +6330,64 @@ out:
 	return retval;
 }
 
+static int sctp_getsockopt_pr_assocstatus(struct sock *sk, int len,
+					  char __user *optval,
+					  int __user *optlen)
+{
+	struct sctp_prstatus params;
+	struct sctp_association *asoc;
+	int policy;
+	int retval = -EINVAL;
+
+	if (len < sizeof(params))
+		goto out;
+
+	len = sizeof(params);
+	if (copy_from_user(&params, optval, len)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	policy = params.sprstat_policy;
+	if (policy & ~SCTP_PR_SCTP_MASK)
+		goto out;
+
+	asoc = sctp_id2assoc(sk, params.sprstat_assoc_id);
+	if (!asoc)
+		goto out;
+
+	if (policy == SCTP_PR_SCTP_NONE) {
+		params.sprstat_abandoned_unsent = 0;
+		params.sprstat_abandoned_sent = 0;
+		for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) {
+			params.sprstat_abandoned_unsent +=
+				asoc->abandoned_unsent[policy];
+			params.sprstat_abandoned_sent +=
+				asoc->abandoned_sent[policy];
+		}
+	} else {
+		params.sprstat_abandoned_unsent =
+			asoc->abandoned_unsent[__SCTP_PR_INDEX(policy)];
+		params.sprstat_abandoned_sent =
+			asoc->abandoned_sent[__SCTP_PR_INDEX(policy)];
+	}
+
+	if (put_user(len, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	if (copy_to_user(optval, &params, len)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	retval = 0;
+
+out:
+	return retval;
+}
+
 static int sctp_getsockopt(struct sock *sk, int level, int optname,
 			   char __user *optval, int __user *optlen)
 {
@@ -6490,6 +6548,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
 		retval = sctp_getsockopt_default_prinfo(sk, len, optval,
 							optlen);
 		break;
+	case SCTP_PR_ASSOC_STATUS:
+		retval = sctp_getsockopt_pr_assocstatus(sk, len, optval,
+							optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
-- 
cgit v1.2.3


From a6c2f792873aff332a4689717c3cd6104f46684c Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 9 Jul 2016 19:47:43 +0800
Subject: sctp: implement prsctp TTL policy

prsctp TTL policy is a policy to abandon chunks when they expire
at the specific time in local stack. It's similar with expires_at
in struct sctp_datamsg.

This patch uses sinfo->sinfo_timetolive to set the specific time for
TTL policy. sinfo->sinfo_timetolive is also used for msg->expires_at.
So if prsctp_enable or TTL policy is not enabled, msg->expires_at
still works as before.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h | 10 ++++++++++
 net/sctp/chunk.c           | 20 +++++++++++++++++---
 net/sctp/output.c          |  2 ++
 net/sctp/sm_make_chunk.c   | 12 ++++++++++++
 net/sctp/socket.c          |  4 ++--
 5 files changed, 43 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index d8e464aacb20..6bcda715008e 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -602,6 +602,16 @@ struct sctp_chunk {
 	/* This needs to be recoverable for SCTP_SEND_FAILED events. */
 	struct sctp_sndrcvinfo sinfo;
 
+	/* We use this field to record param for prsctp policies,
+	 * for TTL policy, it is the time_to_drop of this chunk,
+	 * for RTX policy, it is the max_sent_count of this chunk,
+	 * for PRIO policy, it is the priority of this chunk.
+	 */
+	unsigned long prsctp_param;
+
+	/* How many times this chunk have been sent, for prsctp RTX policy */
+	int sent_count;
+
 	/* Which association does this belong to?  */
 	struct sctp_association *asoc;
 
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 1eb94bf18ef4..2698d122e201 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -335,13 +335,27 @@ errout:
 /* Check whether this message has expired. */
 int sctp_chunk_abandoned(struct sctp_chunk *chunk)
 {
-	struct sctp_datamsg *msg = chunk->msg;
+	if (!chunk->asoc->prsctp_enable ||
+	    !SCTP_PR_POLICY(chunk->sinfo.sinfo_flags)) {
+		struct sctp_datamsg *msg = chunk->msg;
+
+		if (!msg->can_abandon)
+			return 0;
+
+		if (time_after(jiffies, msg->expires_at))
+			return 1;
 
-	if (!msg->can_abandon)
 		return 0;
+	}
 
-	if (time_after(jiffies, msg->expires_at))
+	if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) &&
+	    time_after(jiffies, chunk->prsctp_param)) {
+		if (chunk->sent_count)
+			chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++;
+		else
+			chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++;
 		return 1;
+	}
 
 	return 0;
 }
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 2e9223bb1b3a..7425f6c23888 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -316,6 +316,8 @@ static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet,
 		packet->has_data = 1;
 		/* timestamp the chunk for rtx purposes */
 		chunk->sent_at = jiffies;
+		/* Mainly used for prsctp RTX policy */
+		chunk->sent_count++;
 		break;
 	case SCTP_CID_COOKIE_ECHO:
 		packet->has_cookie_echo = 1;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 0e3045ef57fa..2c431eef3a50 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -711,6 +711,17 @@ nodata:
 	return retval;
 }
 
+static void sctp_set_prsctp_policy(struct sctp_chunk *chunk,
+				   const struct sctp_sndrcvinfo *sinfo)
+{
+	if (!chunk->asoc->prsctp_enable)
+		return;
+
+	if (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags))
+		chunk->prsctp_param =
+			jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive);
+}
+
 /* Make a DATA chunk for the given association from the provided
  * parameters.  However, do not populate the data payload.
  */
@@ -744,6 +755,7 @@ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
 
 	retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp);
 	memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo));
+	sctp_set_prsctp_policy(retval, sinfo);
 
 nodata:
 	return retval;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index c3167c43a9c1..08614296628a 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7099,7 +7099,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs)
 
 			if (cmsgs->srinfo->sinfo_flags &
 			    ~(SCTP_UNORDERED | SCTP_ADDR_OVER |
-			      SCTP_SACK_IMMEDIATELY |
+			      SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK |
 			      SCTP_ABORT | SCTP_EOF))
 				return -EINVAL;
 			break;
@@ -7123,7 +7123,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs)
 
 			if (cmsgs->sinfo->snd_flags &
 			    ~(SCTP_UNORDERED | SCTP_ADDR_OVER |
-			      SCTP_SACK_IMMEDIATELY |
+			      SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK |
 			      SCTP_ABORT | SCTP_EOF))
 				return -EINVAL;
 			break;
-- 
cgit v1.2.3


From 8dbdf1f5b09cb22560e7c7173b52fe3c631046bd Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 9 Jul 2016 19:47:45 +0800
Subject: sctp: implement prsctp PRIO policy

prsctp PRIO policy is a policy to abandon lower priority chunks when
asoc doesn't have enough snd buffer, so that the current chunk with
higher priority can be queued successfully.

Similar to TTL/RTX policy, we will set the priority of the chunk to
prsctp_param with sinfo->sinfo_timetolive in sctp_set_prsctp_policy().
So if PRIO policy is enabled, msg->expire_at won't work.

asoc->sent_cnt_removable will record how many chunks can be checked to
remove. If priority policy is enabled, when the chunk is queued into
the out_queue, we will increase sent_cnt_removable. When the chunk is
moved to abandon_queue or dequeue and free, we will decrease
sent_cnt_removable.

In sctp_sendmsg, we will check if there is enough snd buffer for current
msg and if sent_cnt_removable is not 0. Then try to abandon chunks in
sctp_prune_prsctp when sendmsg from the retransmit/transmited queue, and
free chunks from out_queue in right order until the abandon+free size >
msg_len - sctp_wfree. For the abandon size, we have to wait until it
sends FORWARD TSN, receives the sack and the chunks are really freed.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  4 ++
 net/sctp/chunk.c           |  1 +
 net/sctp/outqueue.c        | 99 ++++++++++++++++++++++++++++++++++++++++++++++
 net/sctp/sm_make_chunk.c   |  3 +-
 net/sctp/socket.c          |  3 ++
 5 files changed, 109 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 6bcda715008e..8626bdd3249a 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1084,6 +1084,8 @@ void sctp_retransmit(struct sctp_outq *, struct sctp_transport *,
 		     sctp_retransmit_reason_t);
 void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8);
 int sctp_outq_uncork(struct sctp_outq *, gfp_t gfp);
+void sctp_prsctp_prune(struct sctp_association *asoc,
+		       struct sctp_sndrcvinfo *sinfo, int msg_len);
 /* Uncork and flush an outqueue.  */
 static inline void sctp_outq_cork(struct sctp_outq *q)
 {
@@ -1864,6 +1866,8 @@ struct sctp_association {
 
 	struct sctp_priv_assoc_stats stats;
 
+	int sent_cnt_removable;
+
 	__u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1];
 	__u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1];
 };
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index b3692b55a8d2..a55e54738b81 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -360,6 +360,7 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
 		chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
 		return 1;
 	}
+	/* PRIO policy is processed by sendmsg, not here */
 
 	return 0;
 }
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 084718f9b3da..72e54a416af6 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -326,6 +326,9 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
 
 			sctp_chunk_hold(chunk);
 			sctp_outq_tail_data(q, chunk);
+			if (chunk->asoc->prsctp_enable &&
+			    SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
+				chunk->asoc->sent_cnt_removable++;
 			if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
 				SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS);
 			else
@@ -372,6 +375,96 @@ static void sctp_insert_list(struct list_head *head, struct list_head *new)
 		list_add_tail(new, head);
 }
 
+static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
+				  struct sctp_sndrcvinfo *sinfo,
+				  struct list_head *queue, int msg_len)
+{
+	struct sctp_chunk *chk, *temp;
+
+	list_for_each_entry_safe(chk, temp, queue, transmitted_list) {
+		if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
+		    chk->prsctp_param <= sinfo->sinfo_timetolive)
+			continue;
+
+		list_del_init(&chk->transmitted_list);
+		sctp_insert_list(&asoc->outqueue.abandoned,
+				 &chk->transmitted_list);
+
+		asoc->sent_cnt_removable--;
+		asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++;
+
+		if (!chk->tsn_gap_acked) {
+			if (chk->transport)
+				chk->transport->flight_size -=
+						sctp_data_size(chk);
+			asoc->outqueue.outstanding_bytes -= sctp_data_size(chk);
+		}
+
+		msg_len -= SCTP_DATA_SNDSIZE(chk) +
+			   sizeof(struct sk_buff) +
+			   sizeof(struct sctp_chunk);
+		if (msg_len <= 0)
+			break;
+	}
+
+	return msg_len;
+}
+
+static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
+				    struct sctp_sndrcvinfo *sinfo,
+				    struct list_head *queue, int msg_len)
+{
+	struct sctp_chunk *chk, *temp;
+
+	list_for_each_entry_safe(chk, temp, queue, list) {
+		if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
+		    chk->prsctp_param <= sinfo->sinfo_timetolive)
+			continue;
+
+		list_del_init(&chk->list);
+		asoc->sent_cnt_removable--;
+		asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
+
+		msg_len -= SCTP_DATA_SNDSIZE(chk) +
+			   sizeof(struct sk_buff) +
+			   sizeof(struct sctp_chunk);
+		sctp_chunk_free(chk);
+		if (msg_len <= 0)
+			break;
+	}
+
+	return msg_len;
+}
+
+/* Abandon the chunks according their priorities */
+void sctp_prsctp_prune(struct sctp_association *asoc,
+		       struct sctp_sndrcvinfo *sinfo, int msg_len)
+{
+	struct sctp_transport *transport;
+
+	if (!asoc->prsctp_enable || !asoc->sent_cnt_removable)
+		return;
+
+	msg_len = sctp_prsctp_prune_sent(asoc, sinfo,
+					 &asoc->outqueue.retransmit,
+					 msg_len);
+	if (msg_len <= 0)
+		return;
+
+	list_for_each_entry(transport, &asoc->peer.transport_addr_list,
+			    transports) {
+		msg_len = sctp_prsctp_prune_sent(asoc, sinfo,
+						 &transport->transmitted,
+						 msg_len);
+		if (msg_len <= 0)
+			return;
+	}
+
+	sctp_prsctp_prune_unsent(asoc, sinfo,
+				 &asoc->outqueue.out_chunk_list,
+				 msg_len);
+}
+
 /* Mark all the eligible packets on a transport for retransmission.  */
 void sctp_retransmit_mark(struct sctp_outq *q,
 			  struct sctp_transport *transport,
@@ -962,6 +1055,9 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 
 				/* Mark as failed send. */
 				sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM);
+				if (asoc->prsctp_enable &&
+				    SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
+					asoc->sent_cnt_removable--;
 				sctp_chunk_free(chunk);
 				continue;
 			}
@@ -1251,6 +1347,9 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
 		tsn = ntohl(tchunk->subh.data_hdr->tsn);
 		if (TSN_lte(tsn, ctsn)) {
 			list_del_init(&tchunk->transmitted_list);
+			if (asoc->prsctp_enable &&
+			    SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
+				asoc->sent_cnt_removable--;
 			sctp_chunk_free(tchunk);
 		}
 	}
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index cfde934af5c5..1c96f4740e67 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -720,7 +720,8 @@ static void sctp_set_prsctp_policy(struct sctp_chunk *chunk,
 	if (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags))
 		chunk->prsctp_param =
 			jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive);
-	else if (SCTP_PR_RTX_ENABLED(sinfo->sinfo_flags))
+	else if (SCTP_PR_RTX_ENABLED(sinfo->sinfo_flags) ||
+		 SCTP_PR_PRIO_ENABLED(sinfo->sinfo_flags))
 		chunk->prsctp_param = sinfo->sinfo_timetolive;
 }
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 08614296628a..71c7dc5ea62e 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1914,6 +1914,9 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 		goto out_free;
 	}
 
+	if (sctp_wspace(asoc) < msg_len)
+		sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
+
 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
 	if (!sctp_wspace(asoc)) {
 		err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
-- 
cgit v1.2.3


From 160b925163c0aabc2c2fbb7d58a75e38b7cd6a17 Mon Sep 17 00:00:00 2001
From: Szymon Janc <szymon.janc@codecoup.pl>
Date: Tue, 12 Jul 2016 02:12:16 +0200
Subject: Bluetooth: Add Authentication Failed reason to Disconnected Mgmt
 event

If link is disconnected due to Authentication Failure (PIN or Key
Missing status) userspace will be notified about this with proper error
code. Many LE profiles define "PIN or Key Missing" status as indication
of remote lost bond so this allows userspace to take action on this.

@ Device Connected: 88:63:DF:88:0E:83 (1) flags 0x0000
        02 01 1a 05 03 0a 18 0d 18 0b 09 48 65 61 72 74  ...........Heart
        20 52 61 74 65                                    Rate
> HCI Event: Command Status (0x0f) plen 4
      LE Read Remote Used Features (0x08|0x0016) ncmd 1
        Status: Success (0x00)
> ACL Data RX: Handle 3585 flags 0x02 dlen 11
      ATT: Read By Group Type Request (0x10) len 6
        Handle range: 0x0001-0xffff
        Attribute group type: Primary Service (0x2800)
> HCI Event: LE Meta Event (0x3e) plen 12
      LE Read Remote Used Features (0x04)
        Status: Success (0x00)
        Handle: 3585
        Features: 0x01 0x00 0x00 0x00 0x00 0x00 0x00 0x00
          LE Encryption
< HCI Command: LE Start Encryption (0x08|0x0019) plen 28
        Handle: 3585
        Random number: 0x0000000000000000
        Encrypted diversifier: 0x0000
        Long term key: 26201cd479a0921b6f949f0b1fa8dc82
> HCI Event: Command Status (0x0f) plen 4
      LE Start Encryption (0x08|0x0019) ncmd 1
        Status: Success (0x00)
> HCI Event: Encryption Change (0x08) plen 4
        Status: PIN or Key Missing (0x06)
        Handle: 3585
        Encryption: Disabled (0x00)
< HCI Command: Disconnect (0x01|0x0006) plen 3
        Handle: 3585
        Reason: Authentication Failure (0x05)
> HCI Event: Command Status (0x0f) plen 4
      Disconnect (0x01|0x0006) ncmd 1
        Status: Success (0x00)
> HCI Event: Disconnect Complete (0x05) plen 4
        Status: Success (0x00)
        Handle: 3585
        Reason: Connection Terminated By Local Host (0x16)
@ Device Disconnected: 88:63:DF:88:0E:83 (1) reason 4

@ Device Connected: C4:43:8F:A3:4D:83 (0) flags 0x0000
        08 09 4e 65 78 75 73 20 35                       ..Nexus 5
> HCI Event: Command Status (0x0f) plen 4
      Authentication Requested (0x01|0x0011) ncmd 1
        Status: Success (0x00)
> HCI Event: Link Key Request (0x17) plen 6
        Address: C4:43:8F:A3:4D:83 (LG Electronics)
< HCI Command: Link Key Request Reply (0x01|0x000b) plen 22
        Address: C4:43:8F:A3:4D:83 (LG Electronics)
        Link key: 080812e4aa97a863d11826f71f65a933
> HCI Event: Command Complete (0x0e) plen 10
      Link Key Request Reply (0x01|0x000b) ncmd 1
        Status: Success (0x00)
        Address: C4:43:8F:A3:4D:83 (LG Electronics)
> HCI Event: Auth Complete (0x06) plen 3
        Status: PIN or Key Missing (0x06)
        Handle: 75
@ Authentication Failed: C4:43:8F:A3:4D:83 (0) status 0x05
< HCI Command: Disconnect (0x01|0x0006) plen 3
        Handle: 75
        Reason: Remote User Terminated Connection (0x13)
> HCI Event: Command Status (0x0f) plen 4
      Disconnect (0x01|0x0006) ncmd 1
        Status: Success (0x00)
> HCI Event: Disconnect Complete (0x05) plen 4
        Status: Success (0x00)
        Handle: 75
        Reason: Connection Terminated By Local Host (0x16)
@ Device Disconnected: C4:43:8F:A3:4D:83 (0) reason 4

Signed-off-by: Szymon Janc <szymon.janc@codecoup.pl>
Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
---
 include/net/bluetooth/hci.h      |  1 +
 include/net/bluetooth/hci_core.h |  1 +
 include/net/bluetooth/mgmt.h     |  1 +
 net/bluetooth/hci_event.c        | 16 +++++++++++++++-
 4 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index a3f86de6f100..003b25283407 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -445,6 +445,7 @@ enum {
 /* ---- HCI Error Codes ---- */
 #define HCI_ERROR_UNKNOWN_CONN_ID	0x02
 #define HCI_ERROR_AUTH_FAILURE		0x05
+#define HCI_ERROR_PIN_OR_KEY_MISSING	0x06
 #define HCI_ERROR_MEMORY_EXCEEDED	0x07
 #define HCI_ERROR_CONNECTION_TIMEOUT	0x08
 #define HCI_ERROR_REJ_LIMITED_RESOURCES	0x0d
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index dc71473462ac..77d7fe115a0d 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -654,6 +654,7 @@ enum {
 	HCI_CONN_PARAM_REMOVAL_PEND,
 	HCI_CONN_NEW_LINK_KEY,
 	HCI_CONN_SCANNING,
+	HCI_CONN_AUTH_FAILURE,
 };
 
 static inline bool hci_conn_ssp_enabled(struct hci_conn *conn)
diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h
index ea73e0826aa7..7647964b1efa 100644
--- a/include/net/bluetooth/mgmt.h
+++ b/include/net/bluetooth/mgmt.h
@@ -645,6 +645,7 @@ struct mgmt_ev_device_connected {
 #define MGMT_DEV_DISCONN_TIMEOUT	0x01
 #define MGMT_DEV_DISCONN_LOCAL_HOST	0x02
 #define MGMT_DEV_DISCONN_REMOTE		0x03
+#define MGMT_DEV_DISCONN_AUTH_FAILURE	0x04
 
 #define MGMT_EV_DEVICE_DISCONNECTED	0x000C
 struct mgmt_ev_device_disconnected {
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 3fb95c47243c..e17aacbc5630 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2332,7 +2332,7 @@ static u8 hci_to_mgmt_reason(u8 err)
 static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
 {
 	struct hci_ev_disconn_complete *ev = (void *) skb->data;
-	u8 reason = hci_to_mgmt_reason(ev->reason);
+	u8 reason;
 	struct hci_conn_params *params;
 	struct hci_conn *conn;
 	bool mgmt_connected;
@@ -2355,6 +2355,12 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
 	conn->state = BT_CLOSED;
 
 	mgmt_connected = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags);
+
+	if (test_bit(HCI_CONN_AUTH_FAILURE, &conn->flags))
+		reason = MGMT_DEV_DISCONN_AUTH_FAILURE;
+	else
+		reason = hci_to_mgmt_reason(ev->reason);
+
 	mgmt_device_disconnected(hdev, &conn->dst, conn->type, conn->dst_type,
 				reason, mgmt_connected);
 
@@ -2421,6 +2427,8 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
 		goto unlock;
 
 	if (!ev->status) {
+		clear_bit(HCI_CONN_AUTH_FAILURE, &conn->flags);
+
 		if (!hci_conn_ssp_enabled(conn) &&
 		    test_bit(HCI_CONN_REAUTH_PEND, &conn->flags)) {
 			BT_INFO("re-auth of legacy device is not possible.");
@@ -2429,6 +2437,9 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
 			conn->sec_level = conn->pending_sec_level;
 		}
 	} else {
+		if (ev->status == HCI_ERROR_PIN_OR_KEY_MISSING)
+			set_bit(HCI_CONN_AUTH_FAILURE, &conn->flags);
+
 		mgmt_auth_failed(conn, ev->status);
 	}
 
@@ -2613,6 +2624,9 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
 	clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags);
 
 	if (ev->status && conn->state == BT_CONNECTED) {
+		if (ev->status == HCI_ERROR_PIN_OR_KEY_MISSING)
+			set_bit(HCI_CONN_AUTH_FAILURE, &conn->flags);
+
 		hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE);
 		hci_conn_drop(conn);
 		goto unlock;
-- 
cgit v1.2.3


From 9e238323799fb8c2add2b1de9a22edd4d4e51e30 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Wed, 13 Jul 2016 15:08:55 -0300
Subject: sctp: allow others to use sctp_input_cb

We process input path in other files too and having access to it is
nice, so move it to a header where it's shared.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h | 15 +++++++++++++++
 net/sctp/input.c           | 11 -----------
 2 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 8626bdd3249a..966c3a40039c 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -59,6 +59,7 @@
 #include <linux/workqueue.h>	/* We need tq_struct.	 */
 #include <linux/sctp.h>		/* We need sctp* header structs.  */
 #include <net/sctp/auth.h>	/* We need auth specific structs */
+#include <net/ip.h>		/* For inet_skb_parm */
 
 /* A convenience structure for handling sockaddr structures.
  * We should wean ourselves off this.
@@ -1092,6 +1093,20 @@ static inline void sctp_outq_cork(struct sctp_outq *q)
 	q->cork = 1;
 }
 
+/* SCTP skb control block.
+ * sctp_input_cb is currently used on rx and sock rx queue
+ */
+struct sctp_input_cb {
+	union {
+		struct inet_skb_parm	h4;
+#if IS_ENABLED(CONFIG_IPV6)
+		struct inet6_skb_parm	h6;
+#endif
+	} header;
+	struct sctp_chunk *chunk;
+};
+#define SCTP_INPUT_CB(__skb)	((struct sctp_input_cb *)&((__skb)->cb[0]))
+
 /* These bind address data fields common between endpoints and associations */
 struct sctp_bind_addr {
 
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 6f8e676d285e..7a327ff71f08 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -90,17 +90,6 @@ static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb)
 	return 0;
 }
 
-struct sctp_input_cb {
-	union {
-		struct inet_skb_parm	h4;
-#if IS_ENABLED(CONFIG_IPV6)
-		struct inet6_skb_parm	h6;
-#endif
-	} header;
-	struct sctp_chunk *chunk;
-};
-#define SCTP_INPUT_CB(__skb)	((struct sctp_input_cb *)&((__skb)->cb[0]))
-
 /*
  * This is the routine which IP calls when receiving an SCTP packet.
  */
-- 
cgit v1.2.3


From f5d258e60722142e88cb6f0f337d78bca67cf973 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Wed, 13 Jul 2016 15:08:56 -0300
Subject: sctp: reorder sctp_ulpevent and shrink msg_flags

The next patch needs 8 bytes in there. sctp_ulpevent has a hole due to
bad alignment; msg_flags is using 4 bytes while it actually uses only 2, so
we shrink it, and iif member (4 bytes) which can be easily fetched from
another place once the next patch is there, so we remove it and thus
creating space for 8 bytes.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/ulpevent.h | 10 +++++-----
 net/sctp/ulpevent.c         |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index cccdcfd14973..aa342645dbce 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -48,15 +48,15 @@
  */
 struct sctp_ulpevent {
 	struct sctp_association *asoc;
-	__u16 stream;
-	__u16 ssn;
-	__u16 flags;
+	unsigned int rmem_len;
 	__u32 ppid;
 	__u32 tsn;
 	__u32 cumtsn;
-	int msg_flags;
 	int iif;
-	unsigned int rmem_len;
+	__u16 stream;
+	__u16 ssn;
+	__u16 flags;
+	__u16 msg_flags;
 };
 
 /* Retrieve the skb this event sits inside of. */
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index d1e38308f615..706f5bc9f0c3 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -51,7 +51,7 @@ static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event);
 
 /* Initialize an ULP event from an given skb.  */
 static void sctp_ulpevent_init(struct sctp_ulpevent *event,
-			       int msg_flags,
+			       __u16 msg_flags,
 			       unsigned int len)
 {
 	memset(event, 0, sizeof(struct sctp_ulpevent));
@@ -60,7 +60,7 @@ static void sctp_ulpevent_init(struct sctp_ulpevent *event,
 }
 
 /* Create a new sctp_ulpevent.  */
-static struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags,
+static struct sctp_ulpevent *sctp_ulpevent_new(int size, __u16 msg_flags,
 					       gfp_t gfp)
 {
 	struct sctp_ulpevent *event;
-- 
cgit v1.2.3


From 1f45f78f8e511203f03138f2ccde3d2cf90d2cbf Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Wed, 13 Jul 2016 15:08:57 -0300
Subject: sctp: allow GSO frags to access the chunk too

SCTP will try to access original IP headers on sctp_recvmsg in order to
copy the addresses used. There are also other places that do similar access
to IP or even SCTP headers. But after 90017accff61 ("sctp: Add GSO
support") they aren't always there because they are only present in the
header skb.

SCTP handles the queueing of incoming data by cloning the incoming skb
and limiting to only the relevant payload. This clone has its cb updated
to something different and it's then queued on socket rx queue. Thus we
need to fix this in two moments.

For rx path, not related to socket queue yet, this patch uses a
partially copied sctp_input_cb to such GSO frags. This restores the
ability to access the headers for this part of the code.

Regarding the socket rx queue, it removes iif member from sctp_event and
also add a chunk pointer on it.

With these changes we're always able to reach the headers again.

The biggest change here is that now the sctp_chunk struct and the
original skb are only freed after the application consumed the buffer.
Note however that the original payload was already like this due to the
skb cloning.

For iif, SCTP's IPv4 code doesn't use it, so no change is necessary.
IPv6 now can fetch it directly from original's IPv6 CB as the original
skb is still accessible.

In the future we probably can simplify sctp_v*_skb_iif() stuff, as
sctp_v4_skb_iif() was called but it's return value not used, and now
it's not even called, but such cleanup is out of scope for this change.

Fixes: 90017accff61 ("sctp: Add GSO support")
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h  |  7 +++++++
 include/net/sctp/ulpevent.h |  2 +-
 net/sctp/inqueue.c          |  7 +++++++
 net/sctp/ipv6.c             |  9 ++++-----
 net/sctp/protocol.c         |  1 +
 net/sctp/sm_statefuns.c     |  3 ++-
 net/sctp/socket.c           | 10 +++++++---
 net/sctp/ulpevent.c         | 10 +++++++++-
 8 files changed, 38 insertions(+), 11 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 966c3a40039c..f6f201de6fa4 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1107,6 +1107,13 @@ struct sctp_input_cb {
 };
 #define SCTP_INPUT_CB(__skb)	((struct sctp_input_cb *)&((__skb)->cb[0]))
 
+static inline const struct sk_buff *sctp_gso_headskb(const struct sk_buff *skb)
+{
+	const struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
+
+	return chunk->head_skb ? : skb;
+}
+
 /* These bind address data fields common between endpoints and associations */
 struct sctp_bind_addr {
 
diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index aa342645dbce..2c098cd7e7e2 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -48,11 +48,11 @@
  */
 struct sctp_ulpevent {
 	struct sctp_association *asoc;
+	struct sctp_chunk *chunk;
 	unsigned int rmem_len;
 	__u32 ppid;
 	__u32 tsn;
 	__u32 cumtsn;
-	int iif;
 	__u16 stream;
 	__u16 ssn;
 	__u16 flags;
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index edabbbdfca54..147d975b0455 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -218,6 +218,13 @@ new_skb:
 		chunk->has_asconf = 0;
 		chunk->end_of_packet = 0;
 		chunk->ecn_ce_done = 0;
+		if (chunk->head_skb) {
+			struct sctp_input_cb
+				*cb = SCTP_INPUT_CB(chunk->skb),
+				*head_cb = SCTP_INPUT_CB(chunk->head_skb);
+
+			cb->chunk = head_cb->chunk;
+		}
 	}
 
 	chunk->chunk_hdr = ch;
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 0657d18a85bf..ae6f1a2178ba 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -420,6 +420,7 @@ static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb,
 	addr->v6.sin6_flowinfo = 0; /* FIXME */
 	addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif;
 
+	/* Always called on head skb, so this is safe */
 	sh = sctp_hdr(skb);
 	if (is_saddr) {
 		*port  = sh->source;
@@ -710,8 +711,7 @@ static int sctp_v6_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr)
 /* Where did this skb come from?  */
 static int sctp_v6_skb_iif(const struct sk_buff *skb)
 {
-	struct inet6_skb_parm *opt = (struct inet6_skb_parm *) skb->cb;
-	return opt->iif;
+	return IP6CB(skb)->iif;
 }
 
 /* Was this packet marked by Explicit Congestion Notification? */
@@ -780,15 +780,14 @@ static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname,
 	if (ip_hdr(skb)->version == 4) {
 		addr->v4.sin_family = AF_INET;
 		addr->v4.sin_port = sh->source;
-		addr->v4.sin_addr.s_addr =  ip_hdr(skb)->saddr;
+		addr->v4.sin_addr.s_addr = ip_hdr(skb)->saddr;
 	} else {
 		addr->v6.sin6_family = AF_INET6;
 		addr->v6.sin6_flowinfo = 0;
 		addr->v6.sin6_port = sh->source;
 		addr->v6.sin6_addr = ipv6_hdr(skb)->saddr;
 		if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) {
-			struct sctp_ulpevent *ev = sctp_skb2event(skb);
-			addr->v6.sin6_scope_id = ev->iif;
+			addr->v6.sin6_scope_id = sctp_v6_skb_iif(skb);
 		}
 	}
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 3b56ae55aba3..1adb9270e317 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -240,6 +240,7 @@ static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb,
 	port = &addr->v4.sin_port;
 	addr->v4.sin_family = AF_INET;
 
+	/* Always called on head skb, so this is safe */
 	sh = sctp_hdr(skb);
 	if (is_saddr) {
 		*port  = sh->source;
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index f1f08c8f277b..5aabf42065e2 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -6125,7 +6125,8 @@ static int sctp_eat_data(const struct sctp_association *asoc,
 		af = sctp_get_af_specific(
 			ipver2af(ip_hdr(chunk->skb)->version));
 
-		if (af && af->is_ce(chunk->skb) && asoc->peer.ecn_capable) {
+		if (af && af->is_ce(sctp_gso_headskb(chunk->skb)) &&
+		    asoc->peer.ecn_capable) {
 			/* Do real work as sideffect. */
 			sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE,
 					SCTP_U32(tsn));
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 71c7dc5ea62e..52fdd540a9ef 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2066,7 +2066,7 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 {
 	struct sctp_ulpevent *event = NULL;
 	struct sctp_sock *sp = sctp_sk(sk);
-	struct sk_buff *skb;
+	struct sk_buff *skb, *head_skb;
 	int copied;
 	int err = 0;
 	int skb_len;
@@ -2102,12 +2102,16 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	if (err)
 		goto out_free;
 
-	sock_recv_ts_and_drops(msg, sk, skb);
+	if (event->chunk && event->chunk->head_skb)
+		head_skb = event->chunk->head_skb;
+	else
+		head_skb = skb;
+	sock_recv_ts_and_drops(msg, sk, head_skb);
 	if (sctp_ulpevent_is_notification(event)) {
 		msg->msg_flags |= MSG_NOTIFICATION;
 		sp->pf->event_msgname(event, msg->msg_name, addr_len);
 	} else {
-		sp->pf->skb_msgname(skb, msg->msg_name, addr_len);
+		sp->pf->skb_msgname(head_skb, msg->msg_name, addr_len);
 	}
 
 	/* Check if we allow SCTP_NXTINFO. */
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 706f5bc9f0c3..f6219b164b42 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -701,6 +701,12 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
 
 	sctp_ulpevent_receive_data(event, asoc);
 
+	/* And hold the chunk as we need it for getting the IP headers
+	 * later in recvmsg
+	 */
+	sctp_chunk_hold(chunk);
+	event->chunk = chunk;
+
 	event->stream = ntohs(chunk->subh.data_hdr->stream);
 	event->ssn = ntohs(chunk->subh.data_hdr->ssn);
 	event->ppid = chunk->subh.data_hdr->ppid;
@@ -710,11 +716,11 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
 	}
 	event->tsn = ntohl(chunk->subh.data_hdr->tsn);
 	event->msg_flags |= chunk->chunk_hdr->flags;
-	event->iif = sctp_chunk_iif(chunk);
 
 	return event;
 
 fail_mark:
+	sctp_chunk_put(chunk);
 	kfree_skb(skb);
 fail:
 	return NULL;
@@ -1007,6 +1013,7 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event)
 
 done:
 	sctp_assoc_rwnd_increase(event->asoc, len);
+	sctp_chunk_put(event->chunk);
 	sctp_ulpevent_release_owner(event);
 }
 
@@ -1029,6 +1036,7 @@ static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event)
 	}
 
 done:
+	sctp_chunk_put(event->chunk);
 	sctp_ulpevent_release_owner(event);
 }
 
-- 
cgit v1.2.3


From e7487c86dc5c4a528a7dbd9dc14f453a0de61a84 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Wed, 13 Jul 2016 15:08:58 -0300
Subject: sctp: avoid identifying address family many times for a chunk

Identifying address family operations during rx path is not something
expensive but it's ugly to the eye to have it done multiple times,
specially when we already validated it during initial rx processing.

This patch takes advantage of the now shared sctp_input_cb and make the
pointer to the operations readily available.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  1 +
 net/sctp/input.c           |  1 +
 net/sctp/inqueue.c         |  1 +
 net/sctp/sm_make_chunk.c   | 20 ++++----------------
 net/sctp/sm_statefuns.c    |  7 ++-----
 5 files changed, 9 insertions(+), 21 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index f6f201de6fa4..ce93c4b10d26 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1104,6 +1104,7 @@ struct sctp_input_cb {
 #endif
 	} header;
 	struct sctp_chunk *chunk;
+	struct sctp_af *af;
 };
 #define SCTP_INPUT_CB(__skb)	((struct sctp_input_cb *)&((__skb)->cb[0]))
 
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 7a327ff71f08..30d72f7707b6 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -140,6 +140,7 @@ int sctp_rcv(struct sk_buff *skb)
 	af = sctp_get_af_specific(family);
 	if (unlikely(!af))
 		goto discard_it;
+	SCTP_INPUT_CB(skb)->af = af;
 
 	/* Initialize local addresses for lookups. */
 	af->from_skb(&src, skb, 1);
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index 147d975b0455..8fc773f9b59a 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -224,6 +224,7 @@ new_skb:
 				*head_cb = SCTP_INPUT_CB(chunk->head_skb);
 
 			cb->chunk = head_cb->chunk;
+			cb->af = head_cb->af;
 		}
 	}
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 1c96f4740e67..8c77b87a8565 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -108,14 +108,9 @@ static void sctp_control_set_owner_w(struct sctp_chunk *chunk)
 /* What was the inbound interface for this chunk? */
 int sctp_chunk_iif(const struct sctp_chunk *chunk)
 {
-	struct sctp_af *af;
-	int iif = 0;
-
-	af = sctp_get_af_specific(ipver2af(ip_hdr(chunk->skb)->version));
-	if (af)
-		iif = af->skb_iif(chunk->skb);
+	struct sk_buff *skb = chunk->skb;
 
-	return iif;
+	return SCTP_INPUT_CB(skb)->af->skb_iif(skb);
 }
 
 /* RFC 2960 3.3.2 Initiation (INIT) (1)
@@ -1600,7 +1595,6 @@ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep,
 	struct sctp_association *asoc;
 	struct sk_buff *skb;
 	sctp_scope_t scope;
-	struct sctp_af *af;
 
 	/* Create the bare association.  */
 	scope = sctp_scope(sctp_source(chunk));
@@ -1610,16 +1604,10 @@ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep,
 	asoc->temp = 1;
 	skb = chunk->skb;
 	/* Create an entry for the source address of the packet.  */
-	af = sctp_get_af_specific(ipver2af(ip_hdr(skb)->version));
-	if (unlikely(!af))
-		goto fail;
-	af->from_skb(&asoc->c.peer_addr, skb, 1);
+	SCTP_INPUT_CB(skb)->af->from_skb(&asoc->c.peer_addr, skb, 1);
+
 nodata:
 	return asoc;
-
-fail:
-	sctp_association_free(asoc);
-	return NULL;
 }
 
 /* Build a cookie representing asoc.
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 5aabf42065e2..b7c1f7f3c838 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -6119,13 +6119,10 @@ static int sctp_eat_data(const struct sctp_association *asoc,
 	 */
 
 	if (!chunk->ecn_ce_done) {
-		struct sctp_af *af;
+		struct sctp_af *af = SCTP_INPUT_CB(chunk->skb)->af;
 		chunk->ecn_ce_done = 1;
 
-		af = sctp_get_af_specific(
-			ipver2af(ip_hdr(chunk->skb)->version));
-
-		if (af && af->is_ce(sctp_gso_headskb(chunk->skb)) &&
+		if (af->is_ce(sctp_gso_headskb(chunk->skb)) &&
 		    asoc->peer.ecn_capable) {
 			/* Do real work as sideffect. */
 			sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE,
-- 
cgit v1.2.3


From 8438884d4ab423161b974854ebb90c08219dd678 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Thu, 14 Jul 2016 10:32:43 +0300
Subject: net/switchdev: Export the same parent ID service function

This helper serves to know if two switchdev port netdevices belong to the
same HW ASIC, e.g to figure out if forwarding offload is possible between them.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h   | 8 ++++++++
 net/switchdev/switchdev.c | 5 +++--
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 985619a59323..9023e3e3be0b 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -227,6 +227,8 @@ void switchdev_port_fwd_mark_set(struct net_device *dev,
 				 struct net_device *group_dev,
 				 bool joining);
 
+bool switchdev_port_same_parent_id(struct net_device *a,
+				   struct net_device *b);
 #else
 
 static inline void switchdev_deferred_process(void)
@@ -351,6 +353,12 @@ static inline void switchdev_port_fwd_mark_set(struct net_device *dev,
 {
 }
 
+static inline bool switchdev_port_same_parent_id(struct net_device *a,
+						 struct net_device *b)
+{
+	return false;
+}
+
 #endif
 
 #endif /* _LINUX_SWITCHDEV_H_ */
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 59658b2e9cdf..a5fc9dd24aa9 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -1286,8 +1286,8 @@ void switchdev_fib_ipv4_abort(struct fib_info *fi)
 }
 EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort);
 
-static bool switchdev_port_same_parent_id(struct net_device *a,
-					  struct net_device *b)
+bool switchdev_port_same_parent_id(struct net_device *a,
+				   struct net_device *b)
 {
 	struct switchdev_attr a_attr = {
 		.orig_dev = a,
@@ -1323,6 +1323,7 @@ static u32 switchdev_port_fwd_mark_get(struct net_device *dev,
 
 	return dev->ifindex;
 }
+EXPORT_SYMBOL_GPL(switchdev_port_same_parent_id);
 
 static void switchdev_port_fwd_mark_reset(struct net_device *group_dev,
 					  u32 old_mark, u32 *reset_mark)
-- 
cgit v1.2.3


From f962fe32f2f85769cd835ddcecbff8c1d34cf561 Mon Sep 17 00:00:00 2001
From: Marcel Holtmann <marcel@holtmann.org>
Date: Sun, 17 Jul 2016 19:55:15 +0200
Subject: Bluetooth: Move hci_recv_frame and hci_recv_diag prototypes

The protoypes for hci_recv_frame and hci_recv_diag are in the wrong
location in the header file. Move them close to all the other hci_dev
related exported functions.

Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
---
 include/net/bluetooth/hci_core.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 77d7fe115a0d..84d0273d826a 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1022,6 +1022,8 @@ void hci_unregister_dev(struct hci_dev *hdev);
 int hci_suspend_dev(struct hci_dev *hdev);
 int hci_resume_dev(struct hci_dev *hdev);
 int hci_reset_dev(struct hci_dev *hdev);
+int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb);
+int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb);
 int hci_dev_open(__u16 dev);
 int hci_dev_close(__u16 dev);
 int hci_dev_do_close(struct hci_dev *hdev);
@@ -1098,9 +1100,6 @@ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance);
 
 void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb);
 
-int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb);
-int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb);
-
 void hci_init_sysfs(struct hci_dev *hdev);
 void hci_conn_init_sysfs(struct hci_conn *conn);
 void hci_conn_add_sysfs(struct hci_conn *conn);
-- 
cgit v1.2.3


From 5177a83827cd0b8cf6ce0391b00dd4417352d2f1 Mon Sep 17 00:00:00 2001
From: Marcel Holtmann <marcel@holtmann.org>
Date: Sun, 17 Jul 2016 19:55:16 +0200
Subject: Bluetooth: Add debugfs fields for hardware and firmware info

Some Bluetooth controllers allow for reading hardware and firmware
related vendor specific infos. If they are available, then they can be
exposed via debugfs now.

Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
---
 include/net/bluetooth/hci_core.h |  4 ++++
 net/bluetooth/hci_core.c         | 24 ++++++++++++++++++++++++
 net/bluetooth/hci_debugfs.c      | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 84d0273d826a..ee7fc47680a1 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -372,6 +372,8 @@ struct hci_dev {
 
 	atomic_t		promisc;
 
+	const char		*hw_info;
+	const char		*fw_info;
 	struct dentry		*debugfs;
 
 	struct device		dev;
@@ -1024,6 +1026,8 @@ int hci_resume_dev(struct hci_dev *hdev);
 int hci_reset_dev(struct hci_dev *hdev);
 int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb);
 int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb);
+void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...);
+void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...);
 int hci_dev_open(__u16 dev);
 int hci_dev_close(__u16 dev);
 int hci_dev_do_close(struct hci_dev *hdev);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 98f6c3770736..ddf8432fe8fb 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -3163,6 +3163,8 @@ void hci_unregister_dev(struct hci_dev *hdev)
 	device_del(&hdev->dev);
 
 	debugfs_remove_recursive(hdev->debugfs);
+	kfree_const(hdev->hw_info);
+	kfree_const(hdev->fw_info);
 
 	destroy_workqueue(hdev->workqueue);
 	destroy_workqueue(hdev->req_workqueue);
@@ -3266,6 +3268,28 @@ int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(hci_recv_diag);
 
+void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...)
+{
+	va_list vargs;
+
+	va_start(vargs, fmt);
+	kfree_const(hdev->hw_info);
+	hdev->hw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs);
+	va_end(vargs);
+}
+EXPORT_SYMBOL(hci_set_hw_info);
+
+void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...)
+{
+	va_list vargs;
+
+	va_start(vargs, fmt);
+	kfree_const(hdev->fw_info);
+	hdev->fw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs);
+	va_end(vargs);
+}
+EXPORT_SYMBOL(hci_set_fw_info);
+
 /* ---- Interface to upper protocols ---- */
 
 int hci_register_cb(struct hci_cb *cb)
diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index 7db4220941cc..63df63ebfb24 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -76,6 +76,30 @@ static const struct file_operations __name ## _fops = {			      \
 	.llseek		= default_llseek,				      \
 }									      \
 
+#define DEFINE_INFO_ATTRIBUTE(__name, __field)				      \
+static int __name ## _show(struct seq_file *f, void *ptr)		      \
+{									      \
+	struct hci_dev *hdev = f->private;				      \
+									      \
+	hci_dev_lock(hdev);						      \
+	seq_printf(f, "%s\n", hdev->__field ? : "");			      \
+	hci_dev_unlock(hdev);						      \
+									      \
+	return 0;							      \
+}									      \
+									      \
+static int __name ## _open(struct inode *inode, struct file *file)	      \
+{									      \
+	return single_open(file, __name ## _show, inode->i_private);	      \
+}									      \
+									      \
+static const struct file_operations __name ## _fops = {			      \
+	.open		= __name ## _open,				      \
+	.read		= seq_read,					      \
+	.llseek		= seq_lseek,					      \
+	.release	= single_release,				      \
+}									      \
+
 static int features_show(struct seq_file *f, void *ptr)
 {
 	struct hci_dev *hdev = f->private;
@@ -349,6 +373,9 @@ static const struct file_operations sc_only_mode_fops = {
 	.llseek		= default_llseek,
 };
 
+DEFINE_INFO_ATTRIBUTE(hardware_info, hw_info);
+DEFINE_INFO_ATTRIBUTE(firmware_info, fw_info);
+
 void hci_debugfs_create_common(struct hci_dev *hdev)
 {
 	debugfs_create_file("features", 0444, hdev->debugfs, hdev,
@@ -382,6 +409,14 @@ void hci_debugfs_create_common(struct hci_dev *hdev)
 	if (lmp_sc_capable(hdev) || lmp_le_capable(hdev))
 		debugfs_create_file("sc_only_mode", 0444, hdev->debugfs,
 				    hdev, &sc_only_mode_fops);
+
+	if (hdev->hw_info)
+		debugfs_create_file("hardware_info", 0444, hdev->debugfs,
+				    hdev, &hardware_info_fops);
+
+	if (hdev->fw_info)
+		debugfs_create_file("firmware_info", 0444, hdev->debugfs,
+				    hdev, &firmware_info_fops);
 }
 
 static int inquiry_cache_show(struct seq_file *f, void *p)
-- 
cgit v1.2.3


From 359ebda25aa06fe3a1d028f7e338a849165e661b Mon Sep 17 00:00:00 2001
From: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Date: Mon, 18 Jul 2016 14:49:33 +0300
Subject: net/ipv4: Introduce IPSKB_FRAG_SEGS bit to inet_skb_parm.flags

This flag indicates whether fragmentation of segments is allowed.

Formerly this policy was hardcoded according to IPSKB_FORWARDED (set by
either ip_forward or ipmr_forward).

Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Florian Westphal <fw@strlen.de>
Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h      | 1 +
 net/ipv4/ip_forward.c | 2 +-
 net/ipv4/ip_output.c  | 6 ++++--
 net/ipv4/ipmr.c       | 2 +-
 4 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip.h b/include/net/ip.h
index 08f36cd2b874..9742b92dc933 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -47,6 +47,7 @@ struct inet_skb_parm {
 #define IPSKB_REROUTED		BIT(4)
 #define IPSKB_DOREDIRECT	BIT(5)
 #define IPSKB_FRAG_PMTU		BIT(6)
+#define IPSKB_FRAG_SEGS		BIT(7)
 
 	u16			frag_max_size;
 };
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 9f0a7b96646f..8b4ffd216839 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -117,7 +117,7 @@ int ip_forward(struct sk_buff *skb)
 	if (opt->is_strictroute && rt->rt_uses_gateway)
 		goto sr_failed;
 
-	IPCB(skb)->flags |= IPSKB_FORWARDED;
+	IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS;
 	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
 	if (ip_exceeds_mtu(skb, mtu)) {
 		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e23f141c9ba5..dde37fb340bf 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -223,8 +223,10 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
 	struct sk_buff *segs;
 	int ret = 0;
 
-	/* common case: locally created skb or seglen is <= mtu */
-	if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
+	/* common case: fragmentation of segments is not allowed,
+	 * or seglen is <= mtu
+	 */
+	if (((IPCB(skb)->flags & IPSKB_FRAG_SEGS) == 0) ||
 	      skb_gso_validate_mtu(skb, mtu))
 		return ip_finish_output2(net, sk, skb);
 
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e0d76f5f0113..eec234161b89 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1749,7 +1749,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
 		vif->dev->stats.tx_bytes += skb->len;
 	}
 
-	IPCB(skb)->flags |= IPSKB_FORWARDED;
+	IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS;
 
 	/* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
 	 * not only before forwarding, but after forwarding on all output
-- 
cgit v1.2.3


From 34a79f63bbe49c888f95e75dd759685a238556b6 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Mon, 18 Jul 2016 20:45:38 -0400
Subject: net: dsa: support switchdev ageing time attr

Add a new function for DSA drivers to handle the switchdev
SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME attribute.

The ageing time is passed as milliseconds.

Also because we can have multiple logical bridges on top of a physical
switch and ageing time are switch-wide, call the driver function with
the fastest ageing time in use on the chip instead of the requested one.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h |  2 ++
 net/dsa/slave.c   | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 52ab18bc2b0d..2217a3f817f8 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -141,6 +141,7 @@ struct dsa_switch_tree {
 struct dsa_port {
 	struct net_device	*netdev;
 	struct device_node	*dn;
+	unsigned int		ageing_time;
 };
 
 struct dsa_switch {
@@ -329,6 +330,7 @@ struct dsa_switch_driver {
 	/*
 	 * Bridge integration
 	 */
+	int	(*set_ageing_time)(struct dsa_switch *ds, unsigned int msecs);
 	int	(*port_bridge_join)(struct dsa_switch *ds, int port,
 				    struct net_device *bridge);
 	void	(*port_bridge_leave)(struct dsa_switch *ds, int port);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 7236eb26dc97..fc9196745225 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -333,6 +333,44 @@ static int dsa_slave_vlan_filtering(struct net_device *dev,
 	return 0;
 }
 
+static int dsa_fastest_ageing_time(struct dsa_switch *ds,
+				   unsigned int ageing_time)
+{
+	int i;
+
+	for (i = 0; i < DSA_MAX_PORTS; ++i) {
+		struct dsa_port *dp = &ds->ports[i];
+
+		if (dp && dp->ageing_time && dp->ageing_time < ageing_time)
+			ageing_time = dp->ageing_time;
+	}
+
+	return ageing_time;
+}
+
+static int dsa_slave_ageing_time(struct net_device *dev,
+				 const struct switchdev_attr *attr,
+				 struct switchdev_trans *trans)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+	unsigned long ageing_jiffies = clock_t_to_jiffies(attr->u.ageing_time);
+	unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies);
+
+	/* bridge skips -EOPNOTSUPP, so skip the prepare phase */
+	if (switchdev_trans_ph_prepare(trans))
+		return 0;
+
+	/* Keep the fastest ageing time in case of multiple bridges */
+	ds->ports[p->port].ageing_time = ageing_time;
+	ageing_time = dsa_fastest_ageing_time(ds, ageing_time);
+
+	if (ds->drv->set_ageing_time)
+		return ds->drv->set_ageing_time(ds, ageing_time);
+
+	return 0;
+}
+
 static int dsa_slave_port_attr_set(struct net_device *dev,
 				   const struct switchdev_attr *attr,
 				   struct switchdev_trans *trans)
@@ -346,6 +384,9 @@ static int dsa_slave_port_attr_set(struct net_device *dev,
 	case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING:
 		ret = dsa_slave_vlan_filtering(dev, attr, trans);
 		break;
+	case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME:
+		ret = dsa_slave_ageing_time(dev, attr, trans);
+		break;
 	default:
 		ret = -EOPNOTSUPP;
 		break;
-- 
cgit v1.2.3


From 2d283bdd079c0ad4da020bbc9e9c2a4280823098 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Tue, 19 Jul 2016 11:54:16 +1000
Subject: net/ncsi: Resource management

NCSI spec (DSP0222) defines several objects: package, channel, mode,
filter, version and statistics etc. This introduces the data structs
to represent those objects and implement functions to manage them.
Also, this introduces CONFIG_NET_NCSI for the newly implemented NCSI
stack.

   * The user (e.g. netdev driver) dereference NCSI device by
     "struct ncsi_dev", which is embedded to "struct ncsi_dev_priv".
     The later one is used by NCSI stack internally.
   * Every NCSI device can have multiple packages simultaneously, up
     to 8 packages. It's represented by "struct ncsi_package" and
     identified by 3-bits ID.
   * Every NCSI package can have multiple channels, up to 32. It's
     represented by "struct ncsi_channel" and identified by 5-bits ID.
   * Every NCSI channel has version, statistics, various modes and
     filters. They are represented by "struct ncsi_channel_version",
     "struct ncsi_channel_stats", "struct ncsi_channel_mode" and
     "struct ncsi_channel_filter" separately.
   * Apart from AEN (Asynchronous Event Notification), the NCSI stack
     works in terms of command and response. This introduces "struct
     ncsi_req" to represent a complete NCSI transaction made of NCSI
     request and response.

link: https://www.dmtf.org/sites/default/files/standards/documents/DSP0222_1.1.0.pdf
Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Acked-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ncsi.h     |  46 ++++++
 net/Kconfig            |   1 +
 net/Makefile           |   1 +
 net/ncsi/Kconfig       |  12 ++
 net/ncsi/Makefile      |   4 +
 net/ncsi/internal.h    | 256 +++++++++++++++++++++++++++++
 net/ncsi/ncsi-manage.c | 436 +++++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 756 insertions(+)
 create mode 100644 include/net/ncsi.h
 create mode 100644 net/ncsi/Kconfig
 create mode 100644 net/ncsi/Makefile
 create mode 100644 net/ncsi/internal.h
 create mode 100644 net/ncsi/ncsi-manage.c

(limited to 'include/net')

diff --git a/include/net/ncsi.h b/include/net/ncsi.h
new file mode 100644
index 000000000000..70d14ee1ef84
--- /dev/null
+++ b/include/net/ncsi.h
@@ -0,0 +1,46 @@
+#ifndef __NET_NCSI_H
+#define __NET_NCSI_H
+
+/*
+ * The NCSI device states seen from external. More NCSI device states are
+ * only visible internally (in net/ncsi/internal.h). When the NCSI device
+ * is registered, it's in ncsi_dev_state_registered state. The state
+ * ncsi_dev_state_start is used to drive to choose active package and
+ * channel. After that, its state is changed to ncsi_dev_state_functional.
+ *
+ * The state ncsi_dev_state_stop helps to shut down the currently active
+ * package and channel while ncsi_dev_state_config helps to reconfigure
+ * them.
+ */
+enum {
+	ncsi_dev_state_registered	= 0x0000,
+	ncsi_dev_state_functional	= 0x0100,
+	ncsi_dev_state_probe		= 0x0200,
+	ncsi_dev_state_config		= 0x0300,
+	ncsi_dev_state_suspend		= 0x0400,
+};
+
+struct ncsi_dev {
+	int               state;
+	int		  link_up;
+	struct net_device *dev;
+	void		  (*handler)(struct ncsi_dev *ndev);
+};
+
+#ifdef CONFIG_NET_NCSI
+struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
+				   void (*notifier)(struct ncsi_dev *nd));
+void ncsi_unregister_dev(struct ncsi_dev *nd);
+#else /* !CONFIG_NET_NCSI */
+static inline struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
+					void (*notifier)(struct ncsi_dev *nd))
+{
+	return NULL;
+}
+
+static inline void ncsi_unregister_dev(struct ncsi_dev *nd)
+{
+}
+#endif /* CONFIG_NET_NCSI */
+
+#endif /* __NET_NCSI_H */
diff --git a/net/Kconfig b/net/Kconfig
index ff40562a782c..c2cdbce629bd 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -237,6 +237,7 @@ source "net/hsr/Kconfig"
 source "net/switchdev/Kconfig"
 source "net/l3mdev/Kconfig"
 source "net/qrtr/Kconfig"
+source "net/ncsi/Kconfig"
 
 config RPS
 	bool
diff --git a/net/Makefile b/net/Makefile
index bdd14553a774..9bd20bb86cc6 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -79,3 +79,4 @@ ifneq ($(CONFIG_NET_L3_MASTER_DEV),)
 obj-y				+= l3mdev/
 endif
 obj-$(CONFIG_QRTR)		+= qrtr/
+obj-$(CONFIG_NET_NCSI)		+= ncsi/
diff --git a/net/ncsi/Kconfig b/net/ncsi/Kconfig
new file mode 100644
index 000000000000..08a8a6031fd7
--- /dev/null
+++ b/net/ncsi/Kconfig
@@ -0,0 +1,12 @@
+#
+# Configuration for NCSI support
+#
+
+config NET_NCSI
+	bool "NCSI interface support"
+	depends on INET
+	---help---
+	  This module provides NCSI (Network Controller Sideband Interface)
+	  support. Enable this only if your system connects to a network
+	  device via NCSI and the ethernet driver you're using supports
+	  the protocol explicitly.
diff --git a/net/ncsi/Makefile b/net/ncsi/Makefile
new file mode 100644
index 000000000000..07b5625155d7
--- /dev/null
+++ b/net/ncsi/Makefile
@@ -0,0 +1,4 @@
+#
+# Makefile for NCSI API
+#
+obj-$(CONFIG_NET_NCSI) += ncsi-manage.o
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
new file mode 100644
index 000000000000..89028e1f83cd
--- /dev/null
+++ b/net/ncsi/internal.h
@@ -0,0 +1,256 @@
+/*
+ * Copyright Gavin Shan, IBM Corporation 2016.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __NCSI_INTERNAL_H__
+#define __NCSI_INTERNAL_H__
+
+enum {
+	NCSI_CAP_BASE		= 0,
+	NCSI_CAP_GENERIC	= 0,
+	NCSI_CAP_BC,
+	NCSI_CAP_MC,
+	NCSI_CAP_BUFFER,
+	NCSI_CAP_AEN,
+	NCSI_CAP_VLAN,
+	NCSI_CAP_MAX
+};
+
+enum {
+	NCSI_CAP_GENERIC_HWA             = 0x01, /* HW arbitration           */
+	NCSI_CAP_GENERIC_HDS             = 0x02, /* HNC driver status change */
+	NCSI_CAP_GENERIC_FC              = 0x04, /* HNC to MC flow control   */
+	NCSI_CAP_GENERIC_FC1             = 0x08, /* MC to HNC flow control   */
+	NCSI_CAP_GENERIC_MC              = 0x10, /* Global MC filtering      */
+	NCSI_CAP_GENERIC_HWA_UNKNOWN     = 0x00, /* Unknown HW arbitration   */
+	NCSI_CAP_GENERIC_HWA_SUPPORT     = 0x20, /* Supported HW arbitration */
+	NCSI_CAP_GENERIC_HWA_NOT_SUPPORT = 0x40, /* No HW arbitration        */
+	NCSI_CAP_GENERIC_HWA_RESERVED    = 0x60, /* Reserved HW arbitration  */
+	NCSI_CAP_GENERIC_HWA_MASK        = 0x60, /* Mask for HW arbitration  */
+	NCSI_CAP_GENERIC_MASK            = 0x7f,
+	NCSI_CAP_BC_ARP                  = 0x01, /* ARP packet filtering     */
+	NCSI_CAP_BC_DHCPC                = 0x02, /* DHCP client filtering    */
+	NCSI_CAP_BC_DHCPS                = 0x04, /* DHCP server filtering    */
+	NCSI_CAP_BC_NETBIOS              = 0x08, /* NetBIOS packet filtering */
+	NCSI_CAP_BC_MASK                 = 0x0f,
+	NCSI_CAP_MC_IPV6_NEIGHBOR        = 0x01, /* IPv6 neighbor filtering  */
+	NCSI_CAP_MC_IPV6_ROUTER          = 0x02, /* IPv6 router filering     */
+	NCSI_CAP_MC_DHCPV6_RELAY         = 0x04, /* DHCPv6 relay / server MC */
+	NCSI_CAP_MC_DHCPV6_WELL_KNOWN    = 0x08, /* DHCPv6 well-known MC     */
+	NCSI_CAP_MC_IPV6_MLD             = 0x10, /* IPv6 MLD filtering       */
+	NCSI_CAP_MC_IPV6_NEIGHBOR_S      = 0x20, /* IPv6 neighbour filtering */
+	NCSI_CAP_MC_MASK                 = 0x3f,
+	NCSI_CAP_AEN_LSC                 = 0x01, /* Link status change       */
+	NCSI_CAP_AEN_CR                  = 0x02, /* Configuration required   */
+	NCSI_CAP_AEN_HDS                 = 0x04, /* HNC driver status        */
+	NCSI_CAP_AEN_MASK                = 0x07,
+	NCSI_CAP_VLAN_ONLY               = 0x01, /* Filter VLAN packet only  */
+	NCSI_CAP_VLAN_NO                 = 0x02, /* Filter VLAN and non-VLAN */
+	NCSI_CAP_VLAN_ANY                = 0x04, /* Filter Any-and-non-VLAN  */
+	NCSI_CAP_VLAN_MASK               = 0x07
+};
+
+enum {
+	NCSI_MODE_BASE		= 0,
+	NCSI_MODE_ENABLE	= 0,
+	NCSI_MODE_TX_ENABLE,
+	NCSI_MODE_LINK,
+	NCSI_MODE_VLAN,
+	NCSI_MODE_BC,
+	NCSI_MODE_MC,
+	NCSI_MODE_AEN,
+	NCSI_MODE_FC,
+	NCSI_MODE_MAX
+};
+
+enum {
+	NCSI_FILTER_BASE	= 0,
+	NCSI_FILTER_VLAN	= 0,
+	NCSI_FILTER_UC,
+	NCSI_FILTER_MC,
+	NCSI_FILTER_MIXED,
+	NCSI_FILTER_MAX
+};
+
+struct ncsi_channel_version {
+	u32 version;		/* Supported BCD encoded NCSI version */
+	u32 alpha2;		/* Supported BCD encoded NCSI version */
+	u8  fw_name[12];	/* Firware name string                */
+	u32 fw_version;		/* Firmware version                   */
+	u16 pci_ids[4];		/* PCI identification                 */
+	u32 mf_id;		/* Manufacture ID                     */
+};
+
+struct ncsi_channel_cap {
+	u32 index;	/* Index of channel capabilities */
+	u32 cap;	/* NCSI channel capability       */
+};
+
+struct ncsi_channel_mode {
+	u32 index;	/* Index of channel modes      */
+	u32 enable;	/* Enabled or disabled         */
+	u32 size;	/* Valid entries in ncm_data[] */
+	u32 data[8];	/* Data entries                */
+};
+
+struct ncsi_channel_filter {
+	u32 index;	/* Index of channel filters          */
+	u32 total;	/* Total entries in the filter table */
+	u64 bitmap;	/* Bitmap of valid entries           */
+	u32 data[];	/* Data for the valid entries        */
+};
+
+struct ncsi_channel_stats {
+	u32 hnc_cnt_hi;		/* Counter cleared            */
+	u32 hnc_cnt_lo;		/* Counter cleared            */
+	u32 hnc_rx_bytes;	/* Rx bytes                   */
+	u32 hnc_tx_bytes;	/* Tx bytes                   */
+	u32 hnc_rx_uc_pkts;	/* Rx UC packets              */
+	u32 hnc_rx_mc_pkts;     /* Rx MC packets              */
+	u32 hnc_rx_bc_pkts;	/* Rx BC packets              */
+	u32 hnc_tx_uc_pkts;	/* Tx UC packets              */
+	u32 hnc_tx_mc_pkts;	/* Tx MC packets              */
+	u32 hnc_tx_bc_pkts;	/* Tx BC packets              */
+	u32 hnc_fcs_err;	/* FCS errors                 */
+	u32 hnc_align_err;	/* Alignment errors           */
+	u32 hnc_false_carrier;	/* False carrier detection    */
+	u32 hnc_runt_pkts;	/* Rx runt packets            */
+	u32 hnc_jabber_pkts;	/* Rx jabber packets          */
+	u32 hnc_rx_pause_xon;	/* Rx pause XON frames        */
+	u32 hnc_rx_pause_xoff;	/* Rx XOFF frames             */
+	u32 hnc_tx_pause_xon;	/* Tx XON frames              */
+	u32 hnc_tx_pause_xoff;	/* Tx XOFF frames             */
+	u32 hnc_tx_s_collision;	/* Single collision frames    */
+	u32 hnc_tx_m_collision;	/* Multiple collision frames  */
+	u32 hnc_l_collision;	/* Late collision frames      */
+	u32 hnc_e_collision;	/* Excessive collision frames */
+	u32 hnc_rx_ctl_frames;	/* Rx control frames          */
+	u32 hnc_rx_64_frames;	/* Rx 64-bytes frames         */
+	u32 hnc_rx_127_frames;	/* Rx 65-127 bytes frames     */
+	u32 hnc_rx_255_frames;	/* Rx 128-255 bytes frames    */
+	u32 hnc_rx_511_frames;	/* Rx 256-511 bytes frames    */
+	u32 hnc_rx_1023_frames;	/* Rx 512-1023 bytes frames   */
+	u32 hnc_rx_1522_frames;	/* Rx 1024-1522 bytes frames  */
+	u32 hnc_rx_9022_frames;	/* Rx 1523-9022 bytes frames  */
+	u32 hnc_tx_64_frames;	/* Tx 64-bytes frames         */
+	u32 hnc_tx_127_frames;	/* Tx 65-127 bytes frames     */
+	u32 hnc_tx_255_frames;	/* Tx 128-255 bytes frames    */
+	u32 hnc_tx_511_frames;	/* Tx 256-511 bytes frames    */
+	u32 hnc_tx_1023_frames;	/* Tx 512-1023 bytes frames   */
+	u32 hnc_tx_1522_frames;	/* Tx 1024-1522 bytes frames  */
+	u32 hnc_tx_9022_frames;	/* Tx 1523-9022 bytes frames  */
+	u32 hnc_rx_valid_bytes;	/* Rx valid bytes             */
+	u32 hnc_rx_runt_pkts;	/* Rx error runt packets      */
+	u32 hnc_rx_jabber_pkts;	/* Rx error jabber packets    */
+	u32 ncsi_rx_cmds;	/* Rx NCSI commands           */
+	u32 ncsi_dropped_cmds;	/* Dropped commands           */
+	u32 ncsi_cmd_type_errs;	/* Command type errors        */
+	u32 ncsi_cmd_csum_errs;	/* Command checksum errors    */
+	u32 ncsi_rx_pkts;	/* Rx NCSI packets            */
+	u32 ncsi_tx_pkts;	/* Tx NCSI packets            */
+	u32 ncsi_tx_aen_pkts;	/* Tx AEN packets             */
+	u32 pt_tx_pkts;		/* Tx packets                 */
+	u32 pt_tx_dropped;	/* Tx dropped packets         */
+	u32 pt_tx_channel_err;	/* Tx channel errors          */
+	u32 pt_tx_us_err;	/* Tx undersize errors        */
+	u32 pt_rx_pkts;		/* Rx packets                 */
+	u32 pt_rx_dropped;	/* Rx dropped packets         */
+	u32 pt_rx_channel_err;	/* Rx channel errors          */
+	u32 pt_rx_us_err;	/* Rx undersize errors        */
+	u32 pt_rx_os_err;	/* Rx oversize errors         */
+};
+
+struct ncsi_dev_priv;
+struct ncsi_package;
+
+#define NCSI_PACKAGE_SHIFT	5
+#define NCSI_PACKAGE_INDEX(c)	(((c) >> NCSI_PACKAGE_SHIFT) & 0x7)
+#define NCSI_CHANNEL_INDEX(c)	((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1))
+#define NCSI_TO_CHANNEL(p, c)	(((p) << NCSI_PACKAGE_SHIFT) | (c))
+
+struct ncsi_channel {
+	unsigned char               id;
+	int                         state;
+#define NCSI_CHANNEL_INACTIVE		1
+#define NCSI_CHANNEL_ACTIVE		2
+	spinlock_t                  lock;	/* Protect filters etc */
+	struct ncsi_package         *package;
+	struct ncsi_channel_version version;
+	struct ncsi_channel_cap	    caps[NCSI_CAP_MAX];
+	struct ncsi_channel_mode    modes[NCSI_MODE_MAX];
+	struct ncsi_channel_filter  *filters[NCSI_FILTER_MAX];
+	struct ncsi_channel_stats   stats;
+	struct list_head            node;
+};
+
+struct ncsi_package {
+	unsigned char        id;          /* NCSI 3-bits package ID */
+	unsigned char        uuid[16];    /* UUID                   */
+	struct ncsi_dev_priv *ndp;        /* NCSI device            */
+	spinlock_t           lock;        /* Protect the package    */
+	unsigned int         channel_num; /* Number of channels     */
+	struct list_head     channels;    /* List of chanels        */
+	struct list_head     node;        /* Form list of packages  */
+};
+
+struct ncsi_request {
+	unsigned char        id;      /* Request ID - 0 to 255           */
+	bool                 used;    /* Request that has been assigned  */
+	bool                 driven;  /* Drive state machine             */
+	struct ncsi_dev_priv *ndp;    /* Associated NCSI device          */
+	struct sk_buff       *cmd;    /* Associated NCSI command packet  */
+	struct sk_buff       *rsp;    /* Associated NCSI response packet */
+	struct timer_list    timer;   /* Timer on waiting for response   */
+	bool                 enabled; /* Time has been enabled or not    */
+};
+
+struct ncsi_dev_priv {
+	struct ncsi_dev     ndev;            /* Associated NCSI device     */
+	unsigned int        flags;           /* NCSI device flags          */
+	spinlock_t          lock;            /* Protect the NCSI device    */
+	unsigned int        package_num;     /* Number of packages         */
+	struct list_head    packages;        /* List of packages           */
+	struct ncsi_request requests[256];   /* Request table              */
+	unsigned int        request_id;      /* Last used request ID       */
+	struct list_head    node;            /* Form NCSI device list      */
+};
+
+extern struct list_head ncsi_dev_list;
+extern spinlock_t ncsi_dev_lock;
+
+#define TO_NCSI_DEV_PRIV(nd) \
+	container_of(nd, struct ncsi_dev_priv, ndev)
+#define NCSI_FOR_EACH_DEV(ndp) \
+	list_for_each_entry_rcu(ndp, &ncsi_dev_list, node)
+#define NCSI_FOR_EACH_PACKAGE(ndp, np) \
+	list_for_each_entry_rcu(np, &ndp->packages, node)
+#define NCSI_FOR_EACH_CHANNEL(np, nc) \
+	list_for_each_entry_rcu(nc, &np->channels, node)
+
+/* Resources */
+int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data);
+int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data);
+int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index);
+struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np,
+				       unsigned char id);
+struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np,
+				      unsigned char id);
+struct ncsi_package *ncsi_find_package(struct ncsi_dev_priv *ndp,
+				       unsigned char id);
+struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp,
+				      unsigned char id);
+void ncsi_remove_package(struct ncsi_package *np);
+void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp,
+				   unsigned char id,
+				   struct ncsi_package **np,
+				   struct ncsi_channel **nc);
+struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven);
+void ncsi_free_request(struct ncsi_request *nr);
+struct ncsi_dev *ncsi_find_dev(struct net_device *dev);
+
+#endif /* __NCSI_INTERNAL_H__ */
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
new file mode 100644
index 000000000000..0e28ed8f2703
--- /dev/null
+++ b/net/ncsi/ncsi-manage.c
@@ -0,0 +1,436 @@
+/*
+ * Copyright Gavin Shan, IBM Corporation 2016.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+
+#include <net/ncsi.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#include "internal.h"
+
+LIST_HEAD(ncsi_dev_list);
+DEFINE_SPINLOCK(ncsi_dev_lock);
+
+static inline int ncsi_filter_size(int table)
+{
+	int sizes[] = { 2, 6, 6, 6 };
+
+	BUILD_BUG_ON(ARRAY_SIZE(sizes) != NCSI_FILTER_MAX);
+	if (table < NCSI_FILTER_BASE || table >= NCSI_FILTER_MAX)
+		return -EINVAL;
+
+	return sizes[table];
+}
+
+int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data)
+{
+	struct ncsi_channel_filter *ncf;
+	void *bitmap;
+	int index, size;
+	unsigned long flags;
+
+	ncf = nc->filters[table];
+	if (!ncf)
+		return -ENXIO;
+
+	size = ncsi_filter_size(table);
+	if (size < 0)
+		return size;
+
+	spin_lock_irqsave(&nc->lock, flags);
+	bitmap = (void *)&ncf->bitmap;
+	index = -1;
+	while ((index = find_next_bit(bitmap, ncf->total, index + 1))
+	       < ncf->total) {
+		if (!memcmp(ncf->data + size * index, data, size)) {
+			spin_unlock_irqrestore(&nc->lock, flags);
+			return index;
+		}
+	}
+	spin_unlock_irqrestore(&nc->lock, flags);
+
+	return -ENOENT;
+}
+
+int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data)
+{
+	struct ncsi_channel_filter *ncf;
+	int index, size;
+	void *bitmap;
+	unsigned long flags;
+
+	size = ncsi_filter_size(table);
+	if (size < 0)
+		return size;
+
+	index = ncsi_find_filter(nc, table, data);
+	if (index >= 0)
+		return index;
+
+	ncf = nc->filters[table];
+	if (!ncf)
+		return -ENODEV;
+
+	spin_lock_irqsave(&nc->lock, flags);
+	bitmap = (void *)&ncf->bitmap;
+	do {
+		index = find_next_zero_bit(bitmap, ncf->total, 0);
+		if (index >= ncf->total) {
+			spin_unlock_irqrestore(&nc->lock, flags);
+			return -ENOSPC;
+		}
+	} while (test_and_set_bit(index, bitmap));
+
+	memcpy(ncf->data + size * index, data, size);
+	spin_unlock_irqrestore(&nc->lock, flags);
+
+	return index;
+}
+
+int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index)
+{
+	struct ncsi_channel_filter *ncf;
+	int size;
+	void *bitmap;
+	unsigned long flags;
+
+	size = ncsi_filter_size(table);
+	if (size < 0)
+		return size;
+
+	ncf = nc->filters[table];
+	if (!ncf || index >= ncf->total)
+		return -ENODEV;
+
+	spin_lock_irqsave(&nc->lock, flags);
+	bitmap = (void *)&ncf->bitmap;
+	if (test_and_clear_bit(index, bitmap))
+		memset(ncf->data + size * index, 0, size);
+	spin_unlock_irqrestore(&nc->lock, flags);
+
+	return 0;
+}
+
+struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np,
+				       unsigned char id)
+{
+	struct ncsi_channel *nc;
+
+	NCSI_FOR_EACH_CHANNEL(np, nc) {
+		if (nc->id == id)
+			return nc;
+	}
+
+	return NULL;
+}
+
+struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id)
+{
+	struct ncsi_channel *nc, *tmp;
+	int index;
+	unsigned long flags;
+
+	nc = kzalloc(sizeof(*nc), GFP_ATOMIC);
+	if (!nc)
+		return NULL;
+
+	nc->id = id;
+	nc->package = np;
+	nc->state = NCSI_CHANNEL_INACTIVE;
+	spin_lock_init(&nc->lock);
+	for (index = 0; index < NCSI_CAP_MAX; index++)
+		nc->caps[index].index = index;
+	for (index = 0; index < NCSI_MODE_MAX; index++)
+		nc->modes[index].index = index;
+
+	spin_lock_irqsave(&np->lock, flags);
+	tmp = ncsi_find_channel(np, id);
+	if (tmp) {
+		spin_unlock_irqrestore(&np->lock, flags);
+		kfree(nc);
+		return tmp;
+	}
+
+	list_add_tail_rcu(&nc->node, &np->channels);
+	np->channel_num++;
+	spin_unlock_irqrestore(&np->lock, flags);
+
+	return nc;
+}
+
+static void ncsi_remove_channel(struct ncsi_channel *nc)
+{
+	struct ncsi_package *np = nc->package;
+	struct ncsi_channel_filter *ncf;
+	unsigned long flags;
+	int i;
+
+	/* Release filters */
+	spin_lock_irqsave(&nc->lock, flags);
+	for (i = 0; i < NCSI_FILTER_MAX; i++) {
+		ncf = nc->filters[i];
+		if (!ncf)
+			continue;
+
+		nc->filters[i] = NULL;
+		kfree(ncf);
+	}
+
+	nc->state = NCSI_CHANNEL_INACTIVE;
+	spin_unlock_irqrestore(&nc->lock, flags);
+
+	/* Remove and free channel */
+	spin_lock_irqsave(&np->lock, flags);
+	list_del_rcu(&nc->node);
+	np->channel_num--;
+	spin_unlock_irqrestore(&np->lock, flags);
+
+	kfree(nc);
+}
+
+struct ncsi_package *ncsi_find_package(struct ncsi_dev_priv *ndp,
+				       unsigned char id)
+{
+	struct ncsi_package *np;
+
+	NCSI_FOR_EACH_PACKAGE(ndp, np) {
+		if (np->id == id)
+			return np;
+	}
+
+	return NULL;
+}
+
+struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp,
+				      unsigned char id)
+{
+	struct ncsi_package *np, *tmp;
+	unsigned long flags;
+
+	np = kzalloc(sizeof(*np), GFP_ATOMIC);
+	if (!np)
+		return NULL;
+
+	np->id = id;
+	np->ndp = ndp;
+	spin_lock_init(&np->lock);
+	INIT_LIST_HEAD(&np->channels);
+
+	spin_lock_irqsave(&ndp->lock, flags);
+	tmp = ncsi_find_package(ndp, id);
+	if (tmp) {
+		spin_unlock_irqrestore(&ndp->lock, flags);
+		kfree(np);
+		return tmp;
+	}
+
+	list_add_tail_rcu(&np->node, &ndp->packages);
+	ndp->package_num++;
+	spin_unlock_irqrestore(&ndp->lock, flags);
+
+	return np;
+}
+
+void ncsi_remove_package(struct ncsi_package *np)
+{
+	struct ncsi_dev_priv *ndp = np->ndp;
+	struct ncsi_channel *nc, *tmp;
+	unsigned long flags;
+
+	/* Release all child channels */
+	list_for_each_entry_safe(nc, tmp, &np->channels, node)
+		ncsi_remove_channel(nc);
+
+	/* Remove and free package */
+	spin_lock_irqsave(&ndp->lock, flags);
+	list_del_rcu(&np->node);
+	ndp->package_num--;
+	spin_unlock_irqrestore(&ndp->lock, flags);
+
+	kfree(np);
+}
+
+void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp,
+				   unsigned char id,
+				   struct ncsi_package **np,
+				   struct ncsi_channel **nc)
+{
+	struct ncsi_package *p;
+	struct ncsi_channel *c;
+
+	p = ncsi_find_package(ndp, NCSI_PACKAGE_INDEX(id));
+	c = p ? ncsi_find_channel(p, NCSI_CHANNEL_INDEX(id)) : NULL;
+
+	if (np)
+		*np = p;
+	if (nc)
+		*nc = c;
+}
+
+/* For two consecutive NCSI commands, the packet IDs shouldn't
+ * be same. Otherwise, the bogus response might be replied. So
+ * the available IDs are allocated in round-robin fashion.
+ */
+struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven)
+{
+	struct ncsi_request *nr = NULL;
+	int i, limit = ARRAY_SIZE(ndp->requests);
+	unsigned long flags;
+
+	/* Check if there is one available request until the ceiling */
+	spin_lock_irqsave(&ndp->lock, flags);
+	for (i = ndp->request_id; !nr && i < limit; i++) {
+		if (ndp->requests[i].used)
+			continue;
+
+		nr = &ndp->requests[i];
+		nr->used = true;
+		nr->driven = driven;
+		if (++ndp->request_id >= limit)
+			ndp->request_id = 0;
+	}
+
+	/* Fail back to check from the starting cursor */
+	for (i = 0; !nr && i < ndp->request_id; i++) {
+		if (ndp->requests[i].used)
+			continue;
+
+		nr = &ndp->requests[i];
+		nr->used = true;
+		nr->driven = driven;
+		if (++ndp->request_id >= limit)
+			ndp->request_id = 0;
+	}
+	spin_unlock_irqrestore(&ndp->lock, flags);
+
+	return nr;
+}
+
+void ncsi_free_request(struct ncsi_request *nr)
+{
+	struct ncsi_dev_priv *ndp = nr->ndp;
+	struct sk_buff *cmd, *rsp;
+	unsigned long flags;
+
+	if (nr->enabled) {
+		nr->enabled = false;
+		del_timer_sync(&nr->timer);
+	}
+
+	spin_lock_irqsave(&ndp->lock, flags);
+	cmd = nr->cmd;
+	rsp = nr->rsp;
+	nr->cmd = NULL;
+	nr->rsp = NULL;
+	nr->used = false;
+	spin_unlock_irqrestore(&ndp->lock, flags);
+
+	/* Release command and response */
+	consume_skb(cmd);
+	consume_skb(rsp);
+}
+
+struct ncsi_dev *ncsi_find_dev(struct net_device *dev)
+{
+	struct ncsi_dev_priv *ndp;
+
+	NCSI_FOR_EACH_DEV(ndp) {
+		if (ndp->ndev.dev == dev)
+			return &ndp->ndev;
+	}
+
+	return NULL;
+}
+
+static void ncsi_request_timeout(unsigned long data)
+{
+	struct ncsi_request *nr = (struct ncsi_request *)data;
+	struct ncsi_dev_priv *ndp = nr->ndp;
+	unsigned long flags;
+
+	/* If the request already had associated response,
+	 * let the response handler to release it.
+	 */
+	spin_lock_irqsave(&ndp->lock, flags);
+	nr->enabled = false;
+	if (nr->rsp || !nr->cmd) {
+		spin_unlock_irqrestore(&ndp->lock, flags);
+		return;
+	}
+	spin_unlock_irqrestore(&ndp->lock, flags);
+
+	/* Release the request */
+	ncsi_free_request(nr);
+}
+
+struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
+				   void (*handler)(struct ncsi_dev *ndev))
+{
+	struct ncsi_dev_priv *ndp;
+	struct ncsi_dev *nd;
+	unsigned long flags;
+	int i;
+
+	/* Check if the device has been registered or not */
+	nd = ncsi_find_dev(dev);
+	if (nd)
+		return nd;
+
+	/* Create NCSI device */
+	ndp = kzalloc(sizeof(*ndp), GFP_ATOMIC);
+	if (!ndp)
+		return NULL;
+
+	nd = &ndp->ndev;
+	nd->state = ncsi_dev_state_registered;
+	nd->dev = dev;
+	nd->handler = handler;
+
+	/* Initialize private NCSI device */
+	spin_lock_init(&ndp->lock);
+	INIT_LIST_HEAD(&ndp->packages);
+	ndp->request_id = 0;
+	for (i = 0; i < ARRAY_SIZE(ndp->requests); i++) {
+		ndp->requests[i].id = i;
+		ndp->requests[i].ndp = ndp;
+		setup_timer(&ndp->requests[i].timer,
+			    ncsi_request_timeout,
+			    (unsigned long)&ndp->requests[i]);
+	}
+
+	spin_lock_irqsave(&ncsi_dev_lock, flags);
+	list_add_tail_rcu(&ndp->node, &ncsi_dev_list);
+	spin_unlock_irqrestore(&ncsi_dev_lock, flags);
+
+	return nd;
+}
+EXPORT_SYMBOL_GPL(ncsi_register_dev);
+
+void ncsi_unregister_dev(struct ncsi_dev *nd)
+{
+	struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
+	struct ncsi_package *np, *tmp;
+	unsigned long flags;
+
+	list_for_each_entry_safe(np, tmp, &ndp->packages, node)
+		ncsi_remove_package(np);
+
+	spin_lock_irqsave(&ncsi_dev_lock, flags);
+	list_del_rcu(&ndp->node);
+	spin_unlock_irqrestore(&ncsi_dev_lock, flags);
+
+	kfree(ndp);
+}
+EXPORT_SYMBOL_GPL(ncsi_unregister_dev);
-- 
cgit v1.2.3


From e6f44ed6d04d3185dcd8e8e98af8742d87bdffcc Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Tue, 19 Jul 2016 11:54:19 +1000
Subject: net/ncsi: Package and channel management

This manages NCSI packages and channels:

 * The available packages and channels are enumerated in the first
   time of calling ncsi_start_dev(). The channels' capabilities are
   probed in the meanwhile. The NCSI network topology won't change
   until the NCSI device is destroyed.
 * There in a queue in every NCSI device. The element in the queue,
   channel, is waiting for configuration (bringup) or suspending
   (teardown). The channel's state (inactive/active) indicates the
   futher action (configuration or suspending) will be applied on the
   channel. Another channel's state (invisible) means the requested
   action is being applied.
 * The hardware arbitration will be enabled if all available packages
   and channels support it. All available channels try to provide
   service when hardware arbitration is enabled. Otherwise, one channel
   is selected as the active one at once.
 * When channel is in active state, meaning it's providing service, a
   timer started to retrieve the channe's link status. If the channel's
   link status fails to be updated in the determined period, the channel
   is going to be reconfigured. It's the error handling implementation
   as defined in NCSI spec.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Acked-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ncsi.h     |   6 +
 net/ncsi/internal.h    |  50 ++++
 net/ncsi/ncsi-manage.c | 763 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/ncsi/ncsi-rsp.c    |  15 +
 4 files changed, 834 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ncsi.h b/include/net/ncsi.h
index 70d14ee1ef84..1dbf42f79750 100644
--- a/include/net/ncsi.h
+++ b/include/net/ncsi.h
@@ -30,6 +30,7 @@ struct ncsi_dev {
 #ifdef CONFIG_NET_NCSI
 struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
 				   void (*notifier)(struct ncsi_dev *nd));
+int ncsi_start_dev(struct ncsi_dev *nd);
 void ncsi_unregister_dev(struct ncsi_dev *nd);
 #else /* !CONFIG_NET_NCSI */
 static inline struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
@@ -38,6 +39,11 @@ static inline struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
 	return NULL;
 }
 
+static inline int ncsi_start_dev(struct ncsi_dev *nd)
+{
+	return -ENOTTY;
+}
+
 static inline void ncsi_unregister_dev(struct ncsi_dev *nd)
 {
 }
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index bd000c9c8249..38fc95a26f8f 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -178,6 +178,7 @@ struct ncsi_channel {
 	int                         state;
 #define NCSI_CHANNEL_INACTIVE		1
 #define NCSI_CHANNEL_ACTIVE		2
+#define NCSI_CHANNEL_INVISIBLE		3
 	spinlock_t                  lock;	/* Protect filters etc */
 	struct ncsi_package         *package;
 	struct ncsi_channel_version version;
@@ -185,7 +186,11 @@ struct ncsi_channel {
 	struct ncsi_channel_mode    modes[NCSI_MODE_MAX];
 	struct ncsi_channel_filter  *filters[NCSI_FILTER_MAX];
 	struct ncsi_channel_stats   stats;
+	struct timer_list           timer;	/* Link monitor timer  */
+	bool                        enabled;	/* Timer is enabled    */
+	unsigned int                timeout;	/* Times of timeout    */
 	struct list_head            node;
+	struct list_head            link;
 };
 
 struct ncsi_package {
@@ -209,14 +214,56 @@ struct ncsi_request {
 	bool                 enabled; /* Time has been enabled or not    */
 };
 
+enum {
+	ncsi_dev_state_major		= 0xff00,
+	ncsi_dev_state_minor		= 0x00ff,
+	ncsi_dev_state_probe_deselect	= 0x0201,
+	ncsi_dev_state_probe_package,
+	ncsi_dev_state_probe_channel,
+	ncsi_dev_state_probe_cis,
+	ncsi_dev_state_probe_gvi,
+	ncsi_dev_state_probe_gc,
+	ncsi_dev_state_probe_gls,
+	ncsi_dev_state_probe_dp,
+	ncsi_dev_state_config_sp	= 0x0301,
+	ncsi_dev_state_config_cis,
+	ncsi_dev_state_config_sma,
+	ncsi_dev_state_config_ebf,
+#if IS_ENABLED(CONFIG_IPV6)
+	ncsi_dev_state_config_egmf,
+#endif
+	ncsi_dev_state_config_ecnt,
+	ncsi_dev_state_config_ec,
+	ncsi_dev_state_config_ae,
+	ncsi_dev_state_config_gls,
+	ncsi_dev_state_config_done,
+	ncsi_dev_state_suspend_select	= 0x0401,
+	ncsi_dev_state_suspend_dcnt,
+	ncsi_dev_state_suspend_dc,
+	ncsi_dev_state_suspend_deselect,
+	ncsi_dev_state_suspend_done
+};
+
 struct ncsi_dev_priv {
 	struct ncsi_dev     ndev;            /* Associated NCSI device     */
 	unsigned int        flags;           /* NCSI device flags          */
+#define NCSI_DEV_PROBED		1            /* Finalized NCSI topology    */
+#define NCSI_DEV_HWA		2            /* Enabled HW arbitration     */
+#define NCSI_DEV_RESHUFFLE	4
 	spinlock_t          lock;            /* Protect the NCSI device    */
+#if IS_ENABLED(CONFIG_IPV6)
+	unsigned int        inet6_addr_num;  /* Number of IPv6 addresses   */
+#endif
 	unsigned int        package_num;     /* Number of packages         */
 	struct list_head    packages;        /* List of packages           */
 	struct ncsi_request requests[256];   /* Request table              */
 	unsigned int        request_id;      /* Last used request ID       */
+	unsigned int        pending_req_num; /* Number of pending requests */
+	struct ncsi_package *active_package; /* Currently handled package  */
+	struct ncsi_channel *active_channel; /* Currently handled channel  */
+	struct list_head    channel_queue;   /* Config queue of channels   */
+	struct work_struct  work;            /* For channel management     */
+	struct packet_type  ptype;           /* NCSI packet Rx handler     */
 	struct list_head    node;            /* Form NCSI device list      */
 };
 
@@ -251,6 +298,8 @@ extern spinlock_t ncsi_dev_lock;
 int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data);
 int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data);
 int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index);
+void ncsi_start_channel_monitor(struct ncsi_channel *nc);
+void ncsi_stop_channel_monitor(struct ncsi_channel *nc);
 struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np,
 				       unsigned char id);
 struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np,
@@ -267,6 +316,7 @@ void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp,
 struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven);
 void ncsi_free_request(struct ncsi_request *nr);
 struct ncsi_dev *ncsi_find_dev(struct net_device *dev);
+int ncsi_process_next_channel(struct ncsi_dev_priv *ndp);
 
 /* Packet handlers */
 u32 ncsi_calculate_checksum(unsigned char *data, int len);
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 0e28ed8f2703..d627a39ddcd0 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -17,8 +17,12 @@
 #include <net/ncsi.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
+#include <net/if_inet6.h>
 
 #include "internal.h"
+#include "ncsi-pkt.h"
 
 LIST_HEAD(ncsi_dev_list);
 DEFINE_SPINLOCK(ncsi_dev_lock);
@@ -123,6 +127,120 @@ int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index)
 	return 0;
 }
 
+static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
+{
+	struct ncsi_dev *nd = &ndp->ndev;
+	struct ncsi_package *np;
+	struct ncsi_channel *nc;
+
+	nd->state = ncsi_dev_state_functional;
+	if (force_down) {
+		nd->link_up = 0;
+		goto report;
+	}
+
+	nd->link_up = 0;
+	NCSI_FOR_EACH_PACKAGE(ndp, np) {
+		NCSI_FOR_EACH_CHANNEL(np, nc) {
+			if (!list_empty(&nc->link) ||
+			    nc->state != NCSI_CHANNEL_ACTIVE)
+				continue;
+
+			if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) {
+				nd->link_up = 1;
+				goto report;
+			}
+		}
+	}
+
+report:
+	nd->handler(nd);
+}
+
+static void ncsi_channel_monitor(unsigned long data)
+{
+	struct ncsi_channel *nc = (struct ncsi_channel *)data;
+	struct ncsi_package *np = nc->package;
+	struct ncsi_dev_priv *ndp = np->ndp;
+	struct ncsi_cmd_arg nca;
+	bool enabled;
+	unsigned int timeout;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&nc->lock, flags);
+	timeout = nc->timeout;
+	enabled = nc->enabled;
+	spin_unlock_irqrestore(&nc->lock, flags);
+
+	if (!enabled || !list_empty(&nc->link))
+		return;
+	if (nc->state != NCSI_CHANNEL_INACTIVE &&
+	    nc->state != NCSI_CHANNEL_ACTIVE)
+		return;
+
+	if (!(timeout % 2)) {
+		nca.ndp = ndp;
+		nca.package = np->id;
+		nca.channel = nc->id;
+		nca.type = NCSI_PKT_CMD_GLS;
+		nca.driven = false;
+		ret = ncsi_xmit_cmd(&nca);
+		if (ret) {
+			netdev_err(ndp->ndev.dev, "Error %d sending GLS\n",
+				   ret);
+			return;
+		}
+	}
+
+	if (timeout + 1 >= 3) {
+		if (!(ndp->flags & NCSI_DEV_HWA) &&
+		    nc->state == NCSI_CHANNEL_ACTIVE)
+			ncsi_report_link(ndp, true);
+
+		spin_lock_irqsave(&ndp->lock, flags);
+		xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
+		list_add_tail_rcu(&nc->link, &ndp->channel_queue);
+		spin_unlock_irqrestore(&ndp->lock, flags);
+		ncsi_process_next_channel(ndp);
+		return;
+	}
+
+	spin_lock_irqsave(&nc->lock, flags);
+	nc->timeout = timeout + 1;
+	nc->enabled = true;
+	spin_unlock_irqrestore(&nc->lock, flags);
+	mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2)));
+}
+
+void ncsi_start_channel_monitor(struct ncsi_channel *nc)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&nc->lock, flags);
+	WARN_ON_ONCE(nc->enabled);
+	nc->timeout = 0;
+	nc->enabled = true;
+	spin_unlock_irqrestore(&nc->lock, flags);
+
+	mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2)));
+}
+
+void ncsi_stop_channel_monitor(struct ncsi_channel *nc)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&nc->lock, flags);
+	if (!nc->enabled) {
+		spin_unlock_irqrestore(&nc->lock, flags);
+		return;
+	}
+	nc->enabled = false;
+	spin_unlock_irqrestore(&nc->lock, flags);
+
+	del_timer_sync(&nc->timer);
+}
+
 struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np,
 				       unsigned char id)
 {
@@ -149,7 +267,10 @@ struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id)
 	nc->id = id;
 	nc->package = np;
 	nc->state = NCSI_CHANNEL_INACTIVE;
+	nc->enabled = false;
+	setup_timer(&nc->timer, ncsi_channel_monitor, (unsigned long)nc);
 	spin_lock_init(&nc->lock);
+	INIT_LIST_HEAD(&nc->link);
 	for (index = 0; index < NCSI_CAP_MAX; index++)
 		nc->caps[index].index = index;
 	for (index = 0; index < NCSI_MODE_MAX; index++)
@@ -190,6 +311,7 @@ static void ncsi_remove_channel(struct ncsi_channel *nc)
 
 	nc->state = NCSI_CHANNEL_INACTIVE;
 	spin_unlock_irqrestore(&nc->lock, flags);
+	ncsi_stop_channel_monitor(nc);
 
 	/* Remove and free channel */
 	spin_lock_irqsave(&np->lock, flags);
@@ -323,6 +445,7 @@ void ncsi_free_request(struct ncsi_request *nr)
 	struct ncsi_dev_priv *ndp = nr->ndp;
 	struct sk_buff *cmd, *rsp;
 	unsigned long flags;
+	bool driven;
 
 	if (nr->enabled) {
 		nr->enabled = false;
@@ -335,8 +458,12 @@ void ncsi_free_request(struct ncsi_request *nr)
 	nr->cmd = NULL;
 	nr->rsp = NULL;
 	nr->used = false;
+	driven = nr->driven;
 	spin_unlock_irqrestore(&ndp->lock, flags);
 
+	if (driven && cmd && --ndp->pending_req_num == 0)
+		schedule_work(&ndp->work);
+
 	/* Release command and response */
 	consume_skb(cmd);
 	consume_skb(rsp);
@@ -375,6 +502,587 @@ static void ncsi_request_timeout(unsigned long data)
 	ncsi_free_request(nr);
 }
 
+static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
+{
+	struct ncsi_dev *nd = &ndp->ndev;
+	struct ncsi_package *np = ndp->active_package;
+	struct ncsi_channel *nc = ndp->active_channel;
+	struct ncsi_cmd_arg nca;
+	int ret;
+
+	nca.ndp = ndp;
+	nca.driven = true;
+	switch (nd->state) {
+	case ncsi_dev_state_suspend:
+		nd->state = ncsi_dev_state_suspend_select;
+		/* Fall through */
+	case ncsi_dev_state_suspend_select:
+	case ncsi_dev_state_suspend_dcnt:
+	case ncsi_dev_state_suspend_dc:
+	case ncsi_dev_state_suspend_deselect:
+		ndp->pending_req_num = 1;
+
+		np = ndp->active_package;
+		nc = ndp->active_channel;
+		nca.package = np->id;
+		if (nd->state == ncsi_dev_state_suspend_select) {
+			nca.type = NCSI_PKT_CMD_SP;
+			nca.channel = 0x1f;
+			if (ndp->flags & NCSI_DEV_HWA)
+				nca.bytes[0] = 0;
+			else
+				nca.bytes[0] = 1;
+			nd->state = ncsi_dev_state_suspend_dcnt;
+		} else if (nd->state == ncsi_dev_state_suspend_dcnt) {
+			nca.type = NCSI_PKT_CMD_DCNT;
+			nca.channel = nc->id;
+			nd->state = ncsi_dev_state_suspend_dc;
+		} else if (nd->state == ncsi_dev_state_suspend_dc) {
+			nca.type = NCSI_PKT_CMD_DC;
+			nca.channel = nc->id;
+			nca.bytes[0] = 1;
+			nd->state = ncsi_dev_state_suspend_deselect;
+		} else if (nd->state == ncsi_dev_state_suspend_deselect) {
+			nca.type = NCSI_PKT_CMD_DP;
+			nca.channel = 0x1f;
+			nd->state = ncsi_dev_state_suspend_done;
+		}
+
+		ret = ncsi_xmit_cmd(&nca);
+		if (ret) {
+			nd->state = ncsi_dev_state_functional;
+			return;
+		}
+
+		break;
+	case ncsi_dev_state_suspend_done:
+		xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
+		ncsi_process_next_channel(ndp);
+
+		break;
+	default:
+		netdev_warn(nd->dev, "Wrong NCSI state 0x%x in suspend\n",
+			    nd->state);
+	}
+}
+
+static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
+{
+	struct ncsi_dev *nd = &ndp->ndev;
+	struct net_device *dev = nd->dev;
+	struct ncsi_package *np = ndp->active_package;
+	struct ncsi_channel *nc = ndp->active_channel;
+	struct ncsi_cmd_arg nca;
+	unsigned char index;
+	int ret;
+
+	nca.ndp = ndp;
+	nca.driven = true;
+	switch (nd->state) {
+	case ncsi_dev_state_config:
+	case ncsi_dev_state_config_sp:
+		ndp->pending_req_num = 1;
+
+		/* Select the specific package */
+		nca.type = NCSI_PKT_CMD_SP;
+		if (ndp->flags & NCSI_DEV_HWA)
+			nca.bytes[0] = 0;
+		else
+			nca.bytes[0] = 1;
+		nca.package = np->id;
+		nca.channel = 0x1f;
+		ret = ncsi_xmit_cmd(&nca);
+		if (ret)
+			goto error;
+
+		nd->state = ncsi_dev_state_config_cis;
+		break;
+	case ncsi_dev_state_config_cis:
+		ndp->pending_req_num = 1;
+
+		/* Clear initial state */
+		nca.type = NCSI_PKT_CMD_CIS;
+		nca.package = np->id;
+		nca.channel = nc->id;
+		ret = ncsi_xmit_cmd(&nca);
+		if (ret)
+			goto error;
+
+		nd->state = ncsi_dev_state_config_sma;
+		break;
+	case ncsi_dev_state_config_sma:
+	case ncsi_dev_state_config_ebf:
+#if IS_ENABLED(CONFIG_IPV6)
+	case ncsi_dev_state_config_egmf:
+#endif
+	case ncsi_dev_state_config_ecnt:
+	case ncsi_dev_state_config_ec:
+	case ncsi_dev_state_config_ae:
+	case ncsi_dev_state_config_gls:
+		ndp->pending_req_num = 1;
+
+		nca.package = np->id;
+		nca.channel = nc->id;
+
+		/* Use first entry in unicast filter table. Note that
+		 * the MAC filter table starts from entry 1 instead of
+		 * 0.
+		 */
+		if (nd->state == ncsi_dev_state_config_sma) {
+			nca.type = NCSI_PKT_CMD_SMA;
+			for (index = 0; index < 6; index++)
+				nca.bytes[index] = dev->dev_addr[index];
+			nca.bytes[6] = 0x1;
+			nca.bytes[7] = 0x1;
+			nd->state = ncsi_dev_state_config_ebf;
+		} else if (nd->state == ncsi_dev_state_config_ebf) {
+			nca.type = NCSI_PKT_CMD_EBF;
+			nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap;
+			nd->state = ncsi_dev_state_config_ecnt;
+#if IS_ENABLED(CONFIG_IPV6)
+			if (ndp->inet6_addr_num > 0 &&
+			    (nc->caps[NCSI_CAP_GENERIC].cap &
+			     NCSI_CAP_GENERIC_MC))
+				nd->state = ncsi_dev_state_config_egmf;
+			else
+				nd->state = ncsi_dev_state_config_ecnt;
+		} else if (nd->state == ncsi_dev_state_config_egmf) {
+			nca.type = NCSI_PKT_CMD_EGMF;
+			nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap;
+			nd->state = ncsi_dev_state_config_ecnt;
+#endif /* CONFIG_IPV6 */
+		} else if (nd->state == ncsi_dev_state_config_ecnt) {
+			nca.type = NCSI_PKT_CMD_ECNT;
+			nd->state = ncsi_dev_state_config_ec;
+		} else if (nd->state == ncsi_dev_state_config_ec) {
+			/* Enable AEN if it's supported */
+			nca.type = NCSI_PKT_CMD_EC;
+			nd->state = ncsi_dev_state_config_ae;
+			if (!(nc->caps[NCSI_CAP_AEN].cap & NCSI_CAP_AEN_MASK))
+				nd->state = ncsi_dev_state_config_gls;
+		} else if (nd->state == ncsi_dev_state_config_ae) {
+			nca.type = NCSI_PKT_CMD_AE;
+			nca.bytes[0] = 0;
+			nca.dwords[1] = nc->caps[NCSI_CAP_AEN].cap;
+			nd->state = ncsi_dev_state_config_gls;
+		} else if (nd->state == ncsi_dev_state_config_gls) {
+			nca.type = NCSI_PKT_CMD_GLS;
+			nd->state = ncsi_dev_state_config_done;
+		}
+
+		ret = ncsi_xmit_cmd(&nca);
+		if (ret)
+			goto error;
+		break;
+	case ncsi_dev_state_config_done:
+		if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1)
+			xchg(&nc->state, NCSI_CHANNEL_ACTIVE);
+		else
+			xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
+
+		ncsi_start_channel_monitor(nc);
+		ncsi_process_next_channel(ndp);
+		break;
+	default:
+		netdev_warn(dev, "Wrong NCSI state 0x%x in config\n",
+			    nd->state);
+	}
+
+	return;
+
+error:
+	ncsi_report_link(ndp, true);
+}
+
+static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
+{
+	struct ncsi_package *np;
+	struct ncsi_channel *nc, *found;
+	struct ncsi_channel_mode *ncm;
+	unsigned long flags;
+
+	/* The search is done once an inactive channel with up
+	 * link is found.
+	 */
+	found = NULL;
+	NCSI_FOR_EACH_PACKAGE(ndp, np) {
+		NCSI_FOR_EACH_CHANNEL(np, nc) {
+			if (!list_empty(&nc->link) ||
+			    nc->state != NCSI_CHANNEL_INACTIVE)
+				continue;
+
+			if (!found)
+				found = nc;
+
+			ncm = &nc->modes[NCSI_MODE_LINK];
+			if (ncm->data[2] & 0x1) {
+				found = nc;
+				goto out;
+			}
+		}
+	}
+
+	if (!found) {
+		ncsi_report_link(ndp, true);
+		return -ENODEV;
+	}
+
+out:
+	spin_lock_irqsave(&ndp->lock, flags);
+	list_add_tail_rcu(&found->link, &ndp->channel_queue);
+	spin_unlock_irqrestore(&ndp->lock, flags);
+
+	return ncsi_process_next_channel(ndp);
+}
+
+static bool ncsi_check_hwa(struct ncsi_dev_priv *ndp)
+{
+	struct ncsi_package *np;
+	struct ncsi_channel *nc;
+	unsigned int cap;
+
+	/* The hardware arbitration is disabled if any one channel
+	 * doesn't support explicitly.
+	 */
+	NCSI_FOR_EACH_PACKAGE(ndp, np) {
+		NCSI_FOR_EACH_CHANNEL(np, nc) {
+			cap = nc->caps[NCSI_CAP_GENERIC].cap;
+			if (!(cap & NCSI_CAP_GENERIC_HWA) ||
+			    (cap & NCSI_CAP_GENERIC_HWA_MASK) !=
+			    NCSI_CAP_GENERIC_HWA_SUPPORT) {
+				ndp->flags &= ~NCSI_DEV_HWA;
+				return false;
+			}
+		}
+	}
+
+	ndp->flags |= NCSI_DEV_HWA;
+	return true;
+}
+
+static int ncsi_enable_hwa(struct ncsi_dev_priv *ndp)
+{
+	struct ncsi_package *np;
+	struct ncsi_channel *nc;
+	unsigned long flags;
+
+	/* Move all available channels to processing queue */
+	spin_lock_irqsave(&ndp->lock, flags);
+	NCSI_FOR_EACH_PACKAGE(ndp, np) {
+		NCSI_FOR_EACH_CHANNEL(np, nc) {
+			WARN_ON_ONCE(nc->state != NCSI_CHANNEL_INACTIVE ||
+				     !list_empty(&nc->link));
+			ncsi_stop_channel_monitor(nc);
+			list_add_tail_rcu(&nc->link, &ndp->channel_queue);
+		}
+	}
+	spin_unlock_irqrestore(&ndp->lock, flags);
+
+	/* We can have no channels in extremely case */
+	if (list_empty(&ndp->channel_queue)) {
+		ncsi_report_link(ndp, false);
+		return -ENOENT;
+	}
+
+	return ncsi_process_next_channel(ndp);
+}
+
+static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
+{
+	struct ncsi_dev *nd = &ndp->ndev;
+	struct ncsi_package *np;
+	struct ncsi_channel *nc;
+	struct ncsi_cmd_arg nca;
+	unsigned char index;
+	int ret;
+
+	nca.ndp = ndp;
+	nca.driven = true;
+	switch (nd->state) {
+	case ncsi_dev_state_probe:
+		nd->state = ncsi_dev_state_probe_deselect;
+		/* Fall through */
+	case ncsi_dev_state_probe_deselect:
+		ndp->pending_req_num = 8;
+
+		/* Deselect all possible packages */
+		nca.type = NCSI_PKT_CMD_DP;
+		nca.channel = 0x1f;
+		for (index = 0; index < 8; index++) {
+			nca.package = index;
+			ret = ncsi_xmit_cmd(&nca);
+			if (ret)
+				goto error;
+		}
+
+		nd->state = ncsi_dev_state_probe_package;
+		break;
+	case ncsi_dev_state_probe_package:
+		ndp->pending_req_num = 16;
+
+		/* Select all possible packages */
+		nca.type = NCSI_PKT_CMD_SP;
+		nca.bytes[0] = 1;
+		nca.channel = 0x1f;
+		for (index = 0; index < 8; index++) {
+			nca.package = index;
+			ret = ncsi_xmit_cmd(&nca);
+			if (ret)
+				goto error;
+		}
+
+		/* Disable all possible packages */
+		nca.type = NCSI_PKT_CMD_DP;
+		for (index = 0; index < 8; index++) {
+			nca.package = index;
+			ret = ncsi_xmit_cmd(&nca);
+			if (ret)
+				goto error;
+		}
+
+		nd->state = ncsi_dev_state_probe_channel;
+		break;
+	case ncsi_dev_state_probe_channel:
+		if (!ndp->active_package)
+			ndp->active_package = list_first_or_null_rcu(
+				&ndp->packages, struct ncsi_package, node);
+		else if (list_is_last(&ndp->active_package->node,
+				      &ndp->packages))
+			ndp->active_package = NULL;
+		else
+			ndp->active_package = list_next_entry(
+				ndp->active_package, node);
+
+		/* All available packages and channels are enumerated. The
+		 * enumeration happens for once when the NCSI interface is
+		 * started. So we need continue to start the interface after
+		 * the enumeration.
+		 *
+		 * We have to choose an active channel before configuring it.
+		 * Note that we possibly don't have active channel in extreme
+		 * situation.
+		 */
+		if (!ndp->active_package) {
+			ndp->flags |= NCSI_DEV_PROBED;
+			if (ncsi_check_hwa(ndp))
+				ncsi_enable_hwa(ndp);
+			else
+				ncsi_choose_active_channel(ndp);
+			return;
+		}
+
+		/* Select the active package */
+		ndp->pending_req_num = 1;
+		nca.type = NCSI_PKT_CMD_SP;
+		nca.bytes[0] = 1;
+		nca.package = ndp->active_package->id;
+		nca.channel = 0x1f;
+		ret = ncsi_xmit_cmd(&nca);
+		if (ret)
+			goto error;
+
+		nd->state = ncsi_dev_state_probe_cis;
+		break;
+	case ncsi_dev_state_probe_cis:
+		ndp->pending_req_num = 32;
+
+		/* Clear initial state */
+		nca.type = NCSI_PKT_CMD_CIS;
+		nca.package = ndp->active_package->id;
+		for (index = 0; index < 0x20; index++) {
+			nca.channel = index;
+			ret = ncsi_xmit_cmd(&nca);
+			if (ret)
+				goto error;
+		}
+
+		nd->state = ncsi_dev_state_probe_gvi;
+		break;
+	case ncsi_dev_state_probe_gvi:
+	case ncsi_dev_state_probe_gc:
+	case ncsi_dev_state_probe_gls:
+		np = ndp->active_package;
+		ndp->pending_req_num = np->channel_num;
+
+		/* Retrieve version, capability or link status */
+		if (nd->state == ncsi_dev_state_probe_gvi)
+			nca.type = NCSI_PKT_CMD_GVI;
+		else if (nd->state == ncsi_dev_state_probe_gc)
+			nca.type = NCSI_PKT_CMD_GC;
+		else
+			nca.type = NCSI_PKT_CMD_GLS;
+
+		nca.package = np->id;
+		NCSI_FOR_EACH_CHANNEL(np, nc) {
+			nca.channel = nc->id;
+			ret = ncsi_xmit_cmd(&nca);
+			if (ret)
+				goto error;
+		}
+
+		if (nd->state == ncsi_dev_state_probe_gvi)
+			nd->state = ncsi_dev_state_probe_gc;
+		else if (nd->state == ncsi_dev_state_probe_gc)
+			nd->state = ncsi_dev_state_probe_gls;
+		else
+			nd->state = ncsi_dev_state_probe_dp;
+		break;
+	case ncsi_dev_state_probe_dp:
+		ndp->pending_req_num = 1;
+
+		/* Deselect the active package */
+		nca.type = NCSI_PKT_CMD_DP;
+		nca.package = ndp->active_package->id;
+		nca.channel = 0x1f;
+		ret = ncsi_xmit_cmd(&nca);
+		if (ret)
+			goto error;
+
+		/* Scan channels in next package */
+		nd->state = ncsi_dev_state_probe_channel;
+		break;
+	default:
+		netdev_warn(nd->dev, "Wrong NCSI state 0x%0x in enumeration\n",
+			    nd->state);
+	}
+
+	return;
+error:
+	ncsi_report_link(ndp, true);
+}
+
+static void ncsi_dev_work(struct work_struct *work)
+{
+	struct ncsi_dev_priv *ndp = container_of(work,
+			struct ncsi_dev_priv, work);
+	struct ncsi_dev *nd = &ndp->ndev;
+
+	switch (nd->state & ncsi_dev_state_major) {
+	case ncsi_dev_state_probe:
+		ncsi_probe_channel(ndp);
+		break;
+	case ncsi_dev_state_suspend:
+		ncsi_suspend_channel(ndp);
+		break;
+	case ncsi_dev_state_config:
+		ncsi_configure_channel(ndp);
+		break;
+	default:
+		netdev_warn(nd->dev, "Wrong NCSI state 0x%x in workqueue\n",
+			    nd->state);
+	}
+}
+
+int ncsi_process_next_channel(struct ncsi_dev_priv *ndp)
+{
+	struct ncsi_channel *nc;
+	int old_state;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ndp->lock, flags);
+	nc = list_first_or_null_rcu(&ndp->channel_queue,
+				    struct ncsi_channel, link);
+	if (nc) {
+		old_state = xchg(&nc->state, NCSI_CHANNEL_INVISIBLE);
+		list_del_init(&nc->link);
+	}
+	spin_unlock_irqrestore(&ndp->lock, flags);
+
+	ndp->active_channel = nc;
+	ndp->active_package = nc ? nc->package : NULL;
+	if (!nc) {
+		if (ndp->flags & NCSI_DEV_RESHUFFLE) {
+			ndp->flags &= ~NCSI_DEV_RESHUFFLE;
+			return ncsi_choose_active_channel(ndp);
+		}
+
+		ncsi_report_link(ndp, false);
+		return -ENODEV;
+	}
+
+	switch (old_state) {
+	case NCSI_CHANNEL_INACTIVE:
+		ndp->ndev.state = ncsi_dev_state_config;
+		ncsi_configure_channel(ndp);
+		break;
+	case NCSI_CHANNEL_ACTIVE:
+		ndp->ndev.state = ncsi_dev_state_suspend;
+		ncsi_suspend_channel(ndp);
+		break;
+	default:
+		netdev_err(ndp->ndev.dev, "Invalid state 0x%x on %d:%d\n",
+			   nc->state, nc->package->id, nc->id);
+		ncsi_report_link(ndp, false);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int ncsi_inet6addr_event(struct notifier_block *this,
+				unsigned long event, void *data)
+{
+	struct inet6_ifaddr *ifa = data;
+	struct net_device *dev = ifa->idev->dev;
+	struct ncsi_dev *nd = ncsi_find_dev(dev);
+	struct ncsi_dev_priv *ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL;
+	struct ncsi_package *np;
+	struct ncsi_channel *nc;
+	struct ncsi_cmd_arg nca;
+	bool action;
+	int ret;
+
+	if (!ndp || (ipv6_addr_type(&ifa->addr) &
+	    (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK)))
+		return NOTIFY_OK;
+
+	switch (event) {
+	case NETDEV_UP:
+		action = (++ndp->inet6_addr_num) == 1;
+		nca.type = NCSI_PKT_CMD_EGMF;
+		break;
+	case NETDEV_DOWN:
+		action = (--ndp->inet6_addr_num == 0);
+		nca.type = NCSI_PKT_CMD_DGMF;
+		break;
+	default:
+		return NOTIFY_OK;
+	}
+
+	/* We might not have active channel or packages. The IPv6
+	 * required multicast will be enabled when active channel
+	 * or packages are chosen.
+	 */
+	np = ndp->active_package;
+	nc = ndp->active_channel;
+	if (!action || !np || !nc)
+		return NOTIFY_OK;
+
+	/* We needn't enable or disable it if the function isn't supported */
+	if (!(nc->caps[NCSI_CAP_GENERIC].cap & NCSI_CAP_GENERIC_MC))
+		return NOTIFY_OK;
+
+	nca.ndp = ndp;
+	nca.driven = false;
+	nca.package = np->id;
+	nca.channel = nc->id;
+	nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap;
+	ret = ncsi_xmit_cmd(&nca);
+	if (ret) {
+		netdev_warn(dev, "Fail to %s global multicast filter (%d)\n",
+			    (event == NETDEV_UP) ? "enable" : "disable", ret);
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block ncsi_inet6addr_notifier = {
+	.notifier_call = ncsi_inet6addr_event,
+};
+#endif /* CONFIG_IPV6 */
+
 struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
 				   void (*handler)(struct ncsi_dev *ndev))
 {
@@ -397,6 +1105,9 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
 	nd->state = ncsi_dev_state_registered;
 	nd->dev = dev;
 	nd->handler = handler;
+	ndp->pending_req_num = 0;
+	INIT_LIST_HEAD(&ndp->channel_queue);
+	INIT_WORK(&ndp->work, ncsi_dev_work);
 
 	/* Initialize private NCSI device */
 	spin_lock_init(&ndp->lock);
@@ -411,24 +1122,76 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
 	}
 
 	spin_lock_irqsave(&ncsi_dev_lock, flags);
+#if IS_ENABLED(CONFIG_IPV6)
+	ndp->inet6_addr_num = 0;
+	if (list_empty(&ncsi_dev_list))
+		register_inet6addr_notifier(&ncsi_inet6addr_notifier);
+#endif
 	list_add_tail_rcu(&ndp->node, &ncsi_dev_list);
 	spin_unlock_irqrestore(&ncsi_dev_lock, flags);
 
+	/* Register NCSI packet Rx handler */
+	ndp->ptype.type = cpu_to_be16(ETH_P_NCSI);
+	ndp->ptype.func = ncsi_rcv_rsp;
+	ndp->ptype.dev = dev;
+	dev_add_pack(&ndp->ptype);
+
 	return nd;
 }
 EXPORT_SYMBOL_GPL(ncsi_register_dev);
 
+int ncsi_start_dev(struct ncsi_dev *nd)
+{
+	struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
+	struct ncsi_package *np;
+	struct ncsi_channel *nc;
+	int old_state, ret;
+
+	if (nd->state != ncsi_dev_state_registered &&
+	    nd->state != ncsi_dev_state_functional)
+		return -ENOTTY;
+
+	if (!(ndp->flags & NCSI_DEV_PROBED)) {
+		nd->state = ncsi_dev_state_probe;
+		schedule_work(&ndp->work);
+		return 0;
+	}
+
+	/* Reset channel's state and start over */
+	NCSI_FOR_EACH_PACKAGE(ndp, np) {
+		NCSI_FOR_EACH_CHANNEL(np, nc) {
+			old_state = xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
+			WARN_ON_ONCE(!list_empty(&nc->link) ||
+				     old_state == NCSI_CHANNEL_INVISIBLE);
+		}
+	}
+
+	if (ndp->flags & NCSI_DEV_HWA)
+		ret = ncsi_enable_hwa(ndp);
+	else
+		ret = ncsi_choose_active_channel(ndp);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ncsi_start_dev);
+
 void ncsi_unregister_dev(struct ncsi_dev *nd)
 {
 	struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
 	struct ncsi_package *np, *tmp;
 	unsigned long flags;
 
+	dev_remove_pack(&ndp->ptype);
+
 	list_for_each_entry_safe(np, tmp, &ndp->packages, node)
 		ncsi_remove_package(np);
 
 	spin_lock_irqsave(&ncsi_dev_lock, flags);
 	list_del_rcu(&ndp->node);
+#if IS_ENABLED(CONFIG_IPV6)
+	if (list_empty(&ncsi_dev_list))
+		unregister_inet6addr_notifier(&ncsi_inet6addr_notifier);
+#endif
 	spin_unlock_irqrestore(&ncsi_dev_lock, flags);
 
 	kfree(ndp);
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index 6ec25cb2608c..a21af88330aa 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -69,6 +69,9 @@ static int ncsi_rsp_handler_cis(struct ncsi_request *nr)
 	rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
 	ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, &np, &nc);
 	if (!nc) {
+		if (ndp->flags & NCSI_DEV_PROBED)
+			return -ENXIO;
+
 		id = NCSI_CHANNEL_INDEX(rsp->rsp.common.channel);
 		nc = ncsi_add_channel(np, id);
 	}
@@ -90,6 +93,9 @@ static int ncsi_rsp_handler_sp(struct ncsi_request *nr)
 	ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
 				      &np, NULL);
 	if (!np) {
+		if (ndp->flags & NCSI_DEV_PROBED)
+			return -ENXIO;
+
 		id = NCSI_PACKAGE_INDEX(rsp->rsp.common.channel);
 		np = ncsi_add_package(ndp, id);
 		if (!np)
@@ -297,6 +303,7 @@ static int ncsi_rsp_handler_gls(struct ncsi_request *nr)
 	struct ncsi_dev_priv *ndp = nr->ndp;
 	struct ncsi_channel *nc;
 	struct ncsi_channel_mode *ncm;
+	unsigned long flags;
 
 	/* Find the package and channel */
 	rsp = (struct ncsi_rsp_gls_pkt *)skb_network_header(nr->rsp);
@@ -310,6 +317,14 @@ static int ncsi_rsp_handler_gls(struct ncsi_request *nr)
 	ncm->data[3] = ntohl(rsp->other);
 	ncm->data[4] = ntohl(rsp->oem_status);
 
+	if (nr->driven)
+		return 0;
+
+	/* Reset the channel monitor if it has been enabled */
+	spin_lock_irqsave(&nc->lock, flags);
+	nc->timeout = 0;
+	spin_unlock_irqrestore(&nc->lock, flags);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 82de0be6862cdca2e6802267bda57cfc8844d3a7 Mon Sep 17 00:00:00 2001
From: Gao Feng <fgao@ikuai8.com>
Date: Mon, 18 Jul 2016 11:39:23 +0800
Subject: netfilter: Add helper array register/unregister functions

Add nf_ct_helper_init(), nf_conntrack_helpers_register() and
nf_conntrack_helpers_unregister() functions to avoid repetitive
opencoded initialization in helpers.

This patch keeps an id parameter for nf_ct_helper_init() not to break
helper matching by name that has been inconsistently exposed to
userspace through ports, eg. ftp-2121, and through an incremental id,
eg. tftp-1.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_helper.h | 15 ++++++
 net/netfilter/nf_conntrack_ftp.c            | 58 +++++++---------------
 net/netfilter/nf_conntrack_helper.c         | 57 ++++++++++++++++++++++
 net/netfilter/nf_conntrack_irc.c            | 36 +++++---------
 net/netfilter/nf_conntrack_sane.c           | 57 ++++++++--------------
 net/netfilter/nf_conntrack_sip.c            | 75 +++++++++++------------------
 net/netfilter/nf_conntrack_tftp.c           | 48 ++++++------------
 7 files changed, 165 insertions(+), 181 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index 6cf614bc0029..1eaac1f4cd6a 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -58,10 +58,25 @@ struct nf_conntrack_helper *__nf_conntrack_helper_find(const char *name,
 struct nf_conntrack_helper *nf_conntrack_helper_try_module_get(const char *name,
 							       u16 l3num,
 							       u8 protonum);
+void nf_ct_helper_init(struct nf_conntrack_helper *helper,
+		       u16 l3num, u16 protonum, const char *name,
+		       u16 default_port, u16 spec_port, u32 id,
+		       const struct nf_conntrack_expect_policy *exp_pol,
+		       u32 expect_class_max, u32 data_len,
+		       int (*help)(struct sk_buff *skb, unsigned int protoff,
+				   struct nf_conn *ct,
+				   enum ip_conntrack_info ctinfo),
+		       int (*from_nlattr)(struct nlattr *attr,
+					  struct nf_conn *ct),
+		       struct module *module);
 
 int nf_conntrack_helper_register(struct nf_conntrack_helper *);
 void nf_conntrack_helper_unregister(struct nf_conntrack_helper *);
 
+int nf_conntrack_helpers_register(struct nf_conntrack_helper *, unsigned int);
+void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *,
+				     unsigned int);
+
 struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct,
 					  struct nf_conntrack_helper *helper,
 					  gfp_t gfp);
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 19efeba02abb..43147005bea3 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -572,7 +572,7 @@ static int nf_ct_ftp_from_nlattr(struct nlattr *attr, struct nf_conn *ct)
 	return 0;
 }
 
-static struct nf_conntrack_helper ftp[MAX_PORTS][2] __read_mostly;
+static struct nf_conntrack_helper ftp[MAX_PORTS * 2] __read_mostly;
 
 static const struct nf_conntrack_expect_policy ftp_exp_policy = {
 	.max_expected	= 1,
@@ -582,24 +582,13 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = {
 /* don't make this __exit, since it's called from __init ! */
 static void nf_conntrack_ftp_fini(void)
 {
-	int i, j;
-	for (i = 0; i < ports_c; i++) {
-		for (j = 0; j < 2; j++) {
-			if (ftp[i][j].me == NULL)
-				continue;
-
-			pr_debug("unregistering helper for pf: %d port: %d\n",
-				 ftp[i][j].tuple.src.l3num, ports[i]);
-			nf_conntrack_helper_unregister(&ftp[i][j]);
-		}
-	}
-
+	nf_conntrack_helpers_unregister(ftp, ports_c * 2);
 	kfree(ftp_buffer);
 }
 
 static int __init nf_conntrack_ftp_init(void)
 {
-	int i, j = -1, ret = 0;
+	int i, ret = 0;
 
 	ftp_buffer = kmalloc(65536, GFP_KERNEL);
 	if (!ftp_buffer)
@@ -611,32 +600,21 @@ static int __init nf_conntrack_ftp_init(void)
 	/* FIXME should be configurable whether IPv4 and IPv6 FTP connections
 		 are tracked or not - YK */
 	for (i = 0; i < ports_c; i++) {
-		ftp[i][0].tuple.src.l3num = PF_INET;
-		ftp[i][1].tuple.src.l3num = PF_INET6;
-		for (j = 0; j < 2; j++) {
-			ftp[i][j].data_len = sizeof(struct nf_ct_ftp_master);
-			ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]);
-			ftp[i][j].tuple.dst.protonum = IPPROTO_TCP;
-			ftp[i][j].expect_policy = &ftp_exp_policy;
-			ftp[i][j].me = THIS_MODULE;
-			ftp[i][j].help = help;
-			ftp[i][j].from_nlattr = nf_ct_ftp_from_nlattr;
-			if (ports[i] == FTP_PORT)
-				sprintf(ftp[i][j].name, "ftp");
-			else
-				sprintf(ftp[i][j].name, "ftp-%d", ports[i]);
-
-			pr_debug("registering helper for pf: %d port: %d\n",
-				 ftp[i][j].tuple.src.l3num, ports[i]);
-			ret = nf_conntrack_helper_register(&ftp[i][j]);
-			if (ret) {
-				pr_err("failed to register helper for pf: %d port: %d\n",
-				       ftp[i][j].tuple.src.l3num, ports[i]);
-				ports_c = i;
-				nf_conntrack_ftp_fini();
-				return ret;
-			}
-		}
+		nf_ct_helper_init(&ftp[2 * i], AF_INET, IPPROTO_TCP, "ftp",
+				  FTP_PORT, ports[i], ports[i], &ftp_exp_policy,
+				  0, sizeof(struct nf_ct_ftp_master), help,
+				  nf_ct_ftp_from_nlattr, THIS_MODULE);
+		nf_ct_helper_init(&ftp[2 * i + 1], AF_INET6, IPPROTO_TCP, "ftp",
+				  FTP_PORT, ports[i], ports[i], &ftp_exp_policy,
+				  0, sizeof(struct nf_ct_ftp_master), help,
+				  nf_ct_ftp_from_nlattr, THIS_MODULE);
+	}
+
+	ret = nf_conntrack_helpers_register(ftp, ports_c * 2);
+	if (ret < 0) {
+		pr_err("failed to register helpers\n");
+		kfree(ftp_buffer);
+		return ret;
 	}
 
 	return 0;
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index a4294e949cdc..b989b81ac156 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -465,6 +465,63 @@ restart:
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
 
+void nf_ct_helper_init(struct nf_conntrack_helper *helper,
+		       u16 l3num, u16 protonum, const char *name,
+		       u16 default_port, u16 spec_port, u32 id,
+		       const struct nf_conntrack_expect_policy *exp_pol,
+		       u32 expect_class_max, u32 data_len,
+		       int (*help)(struct sk_buff *skb, unsigned int protoff,
+				   struct nf_conn *ct,
+				   enum ip_conntrack_info ctinfo),
+		       int (*from_nlattr)(struct nlattr *attr,
+					  struct nf_conn *ct),
+		       struct module *module)
+{
+	helper->tuple.src.l3num = l3num;
+	helper->tuple.dst.protonum = protonum;
+	helper->tuple.src.u.all = htons(spec_port);
+	helper->expect_policy = exp_pol;
+	helper->expect_class_max = expect_class_max;
+	helper->data_len = data_len;
+	helper->help = help;
+	helper->from_nlattr = from_nlattr;
+	helper->me = module;
+
+	if (spec_port == default_port)
+		snprintf(helper->name, sizeof(helper->name), "%s", name);
+	else
+		snprintf(helper->name, sizeof(helper->name), "%s-%u", name, id);
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_init);
+
+int nf_conntrack_helpers_register(struct nf_conntrack_helper *helper,
+				  unsigned int n)
+{
+	unsigned int i;
+	int err = 0;
+
+	for (i = 0; i < n; i++) {
+		err = nf_conntrack_helper_register(&helper[i]);
+		if (err < 0)
+			goto err;
+	}
+
+	return err;
+err:
+	if (i > 0)
+		nf_conntrack_helpers_unregister(helper, i);
+	return err;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_helpers_register);
+
+void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *helper,
+				unsigned int n)
+{
+	while (n-- > 0)
+		nf_conntrack_helper_unregister(&helper[n]);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_helpers_unregister);
+
 static struct nf_ct_ext_type helper_extend __read_mostly = {
 	.len	= sizeof(struct nf_conn_help),
 	.align	= __alignof__(struct nf_conn_help),
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index f97ac61d2536..1972a149f958 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -255,27 +255,18 @@ static int __init nf_conntrack_irc_init(void)
 		ports[ports_c++] = IRC_PORT;
 
 	for (i = 0; i < ports_c; i++) {
-		irc[i].tuple.src.l3num = AF_INET;
-		irc[i].tuple.src.u.tcp.port = htons(ports[i]);
-		irc[i].tuple.dst.protonum = IPPROTO_TCP;
-		irc[i].expect_policy = &irc_exp_policy;
-		irc[i].me = THIS_MODULE;
-		irc[i].help = help;
-
-		if (ports[i] == IRC_PORT)
-			sprintf(irc[i].name, "irc");
-		else
-			sprintf(irc[i].name, "irc-%u", i);
-
-		ret = nf_conntrack_helper_register(&irc[i]);
-		if (ret) {
-			pr_err("failed to register helper for pf: %u port: %u\n",
-			       irc[i].tuple.src.l3num, ports[i]);
-			ports_c = i;
-			nf_conntrack_irc_fini();
-			return ret;
-		}
+		nf_ct_helper_init(&irc[i], AF_INET, IPPROTO_TCP, "irc",
+				  IRC_PORT, ports[i], i, &irc_exp_policy,
+				  0, 0, help, NULL, THIS_MODULE);
+	}
+
+	ret = nf_conntrack_helpers_register(&irc[0], ports_c);
+	if (ret) {
+		pr_err("failed to register helpers\n");
+		kfree(irc_buffer);
+		return ret;
 	}
+
 	return 0;
 }
 
@@ -283,10 +274,7 @@ static int __init nf_conntrack_irc_init(void)
  * it is needed by the init function */
 static void nf_conntrack_irc_fini(void)
 {
-	int i;
-
-	for (i = 0; i < ports_c; i++)
-		nf_conntrack_helper_unregister(&irc[i]);
+	nf_conntrack_helpers_unregister(irc, ports_c);
 	kfree(irc_buffer);
 }
 
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index 3fcbaab83b3d..9dcb9ee9b97d 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -166,7 +166,7 @@ out:
 	return ret;
 }
 
-static struct nf_conntrack_helper sane[MAX_PORTS][2] __read_mostly;
+static struct nf_conntrack_helper sane[MAX_PORTS * 2] __read_mostly;
 
 static const struct nf_conntrack_expect_policy sane_exp_policy = {
 	.max_expected	= 1,
@@ -176,22 +176,13 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = {
 /* don't make this __exit, since it's called from __init ! */
 static void nf_conntrack_sane_fini(void)
 {
-	int i, j;
-
-	for (i = 0; i < ports_c; i++) {
-		for (j = 0; j < 2; j++) {
-			pr_debug("unregistering helper for pf: %d port: %d\n",
-				 sane[i][j].tuple.src.l3num, ports[i]);
-			nf_conntrack_helper_unregister(&sane[i][j]);
-		}
-	}
-
+	nf_conntrack_helpers_unregister(sane, ports_c * 2);
 	kfree(sane_buffer);
 }
 
 static int __init nf_conntrack_sane_init(void)
 {
-	int i, j = -1, ret = 0;
+	int i, ret = 0;
 
 	sane_buffer = kmalloc(65536, GFP_KERNEL);
 	if (!sane_buffer)
@@ -203,31 +194,23 @@ static int __init nf_conntrack_sane_init(void)
 	/* FIXME should be configurable whether IPv4 and IPv6 connections
 		 are tracked or not - YK */
 	for (i = 0; i < ports_c; i++) {
-		sane[i][0].tuple.src.l3num = PF_INET;
-		sane[i][1].tuple.src.l3num = PF_INET6;
-		for (j = 0; j < 2; j++) {
-			sane[i][j].data_len = sizeof(struct nf_ct_sane_master);
-			sane[i][j].tuple.src.u.tcp.port = htons(ports[i]);
-			sane[i][j].tuple.dst.protonum = IPPROTO_TCP;
-			sane[i][j].expect_policy = &sane_exp_policy;
-			sane[i][j].me = THIS_MODULE;
-			sane[i][j].help = help;
-			if (ports[i] == SANE_PORT)
-				sprintf(sane[i][j].name, "sane");
-			else
-				sprintf(sane[i][j].name, "sane-%d", ports[i]);
-
-			pr_debug("registering helper for pf: %d port: %d\n",
-				 sane[i][j].tuple.src.l3num, ports[i]);
-			ret = nf_conntrack_helper_register(&sane[i][j]);
-			if (ret) {
-				pr_err("failed to register helper for pf: %d port: %d\n",
-				       sane[i][j].tuple.src.l3num, ports[i]);
-				ports_c = i;
-				nf_conntrack_sane_fini();
-				return ret;
-			}
-		}
+		nf_ct_helper_init(&sane[2 * i], AF_INET, IPPROTO_TCP, "sane",
+				  SANE_PORT, ports[i], ports[i],
+				  &sane_exp_policy, 0,
+				  sizeof(struct nf_ct_sane_master), help, NULL,
+				  THIS_MODULE);
+		nf_ct_helper_init(&sane[2 * i + 1], AF_INET6, IPPROTO_TCP, "sane",
+				  SANE_PORT, ports[i], ports[i],
+				  &sane_exp_policy, 0,
+				  sizeof(struct nf_ct_sane_master), help, NULL,
+				  THIS_MODULE);
+	}
+
+	ret = nf_conntrack_helpers_register(sane, ports_c * 2);
+	if (ret < 0) {
+		pr_err("failed to register helpers\n");
+		kfree(sane_buffer);
+		return ret;
 	}
 
 	return 0;
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index f72ba5587588..8d9db9d4702b 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1589,7 +1589,7 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff,
 	return process_sip_msg(skb, ct, protoff, dataoff, &dptr, &datalen);
 }
 
-static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly;
+static struct nf_conntrack_helper sip[MAX_PORTS * 4] __read_mostly;
 
 static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = {
 	[SIP_EXPECT_SIGNALLING] = {
@@ -1616,20 +1616,12 @@ static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1
 
 static void nf_conntrack_sip_fini(void)
 {
-	int i, j;
-
-	for (i = 0; i < ports_c; i++) {
-		for (j = 0; j < ARRAY_SIZE(sip[i]); j++) {
-			if (sip[i][j].me == NULL)
-				continue;
-			nf_conntrack_helper_unregister(&sip[i][j]);
-		}
-	}
+	nf_conntrack_helpers_unregister(sip, ports_c * 4);
 }
 
 static int __init nf_conntrack_sip_init(void)
 {
-	int i, j, ret;
+	int i, ret;
 
 	if (ports_c == 0)
 		ports[ports_c++] = SIP_PORT;
@@ -1637,43 +1629,32 @@ static int __init nf_conntrack_sip_init(void)
 	for (i = 0; i < ports_c; i++) {
 		memset(&sip[i], 0, sizeof(sip[i]));
 
-		sip[i][0].tuple.src.l3num = AF_INET;
-		sip[i][0].tuple.dst.protonum = IPPROTO_UDP;
-		sip[i][0].help = sip_help_udp;
-		sip[i][1].tuple.src.l3num = AF_INET;
-		sip[i][1].tuple.dst.protonum = IPPROTO_TCP;
-		sip[i][1].help = sip_help_tcp;
-
-		sip[i][2].tuple.src.l3num = AF_INET6;
-		sip[i][2].tuple.dst.protonum = IPPROTO_UDP;
-		sip[i][2].help = sip_help_udp;
-		sip[i][3].tuple.src.l3num = AF_INET6;
-		sip[i][3].tuple.dst.protonum = IPPROTO_TCP;
-		sip[i][3].help = sip_help_tcp;
-
-		for (j = 0; j < ARRAY_SIZE(sip[i]); j++) {
-			sip[i][j].data_len = sizeof(struct nf_ct_sip_master);
-			sip[i][j].tuple.src.u.udp.port = htons(ports[i]);
-			sip[i][j].expect_policy = sip_exp_policy;
-			sip[i][j].expect_class_max = SIP_EXPECT_MAX;
-			sip[i][j].me = THIS_MODULE;
-
-			if (ports[i] == SIP_PORT)
-				sprintf(sip[i][j].name, "sip");
-			else
-				sprintf(sip[i][j].name, "sip-%u", i);
-
-			pr_debug("port #%u: %u\n", i, ports[i]);
+		nf_ct_helper_init(&sip[4 * i], AF_INET, IPPROTO_UDP, "sip",
+				  SIP_PORT, ports[i], i, sip_exp_policy,
+				  SIP_EXPECT_MAX,
+				  sizeof(struct nf_ct_sip_master), sip_help_udp,
+				  NULL, THIS_MODULE);
+		nf_ct_helper_init(&sip[4 * i + 1], AF_INET, IPPROTO_TCP, "sip",
+				  SIP_PORT, ports[i], i, sip_exp_policy,
+				  SIP_EXPECT_MAX,
+				  sizeof(struct nf_ct_sip_master), sip_help_tcp,
+				  NULL, THIS_MODULE);
+		nf_ct_helper_init(&sip[4 * i + 2], AF_INET6, IPPROTO_UDP, "sip",
+				  SIP_PORT, ports[i], i, sip_exp_policy,
+				  SIP_EXPECT_MAX,
+				  sizeof(struct nf_ct_sip_master), sip_help_udp,
+				  NULL, THIS_MODULE);
+		nf_ct_helper_init(&sip[4 * i + 3], AF_INET6, IPPROTO_TCP, "sip",
+				  SIP_PORT, ports[i], i, sip_exp_policy,
+				  SIP_EXPECT_MAX,
+				  sizeof(struct nf_ct_sip_master), sip_help_tcp,
+				  NULL, THIS_MODULE);
+	}
 
-			ret = nf_conntrack_helper_register(&sip[i][j]);
-			if (ret) {
-				pr_err("failed to register helper for pf: %u port: %u\n",
-				       sip[i][j].tuple.src.l3num, ports[i]);
-				ports_c = i;
-				nf_conntrack_sip_fini();
-				return ret;
-			}
-		}
+	ret = nf_conntrack_helpers_register(sip, ports_c * 4);
+	if (ret < 0) {
+		pr_err("failed to register helpers\n");
+		return ret;
 	}
 	return 0;
 }
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
index 2e65b5430fba..b1227dc6f75e 100644
--- a/net/netfilter/nf_conntrack_tftp.c
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -97,7 +97,7 @@ static int tftp_help(struct sk_buff *skb,
 	return ret;
 }
 
-static struct nf_conntrack_helper tftp[MAX_PORTS][2] __read_mostly;
+static struct nf_conntrack_helper tftp[MAX_PORTS * 2] __read_mostly;
 
 static const struct nf_conntrack_expect_policy tftp_exp_policy = {
 	.max_expected	= 1,
@@ -106,47 +106,29 @@ static const struct nf_conntrack_expect_policy tftp_exp_policy = {
 
 static void nf_conntrack_tftp_fini(void)
 {
-	int i, j;
-
-	for (i = 0; i < ports_c; i++) {
-		for (j = 0; j < 2; j++)
-			nf_conntrack_helper_unregister(&tftp[i][j]);
-	}
+	nf_conntrack_helpers_unregister(tftp, ports_c * 2);
 }
 
 static int __init nf_conntrack_tftp_init(void)
 {
-	int i, j, ret;
+	int i, ret;
 
 	if (ports_c == 0)
 		ports[ports_c++] = TFTP_PORT;
 
 	for (i = 0; i < ports_c; i++) {
-		memset(&tftp[i], 0, sizeof(tftp[i]));
-
-		tftp[i][0].tuple.src.l3num = AF_INET;
-		tftp[i][1].tuple.src.l3num = AF_INET6;
-		for (j = 0; j < 2; j++) {
-			tftp[i][j].tuple.dst.protonum = IPPROTO_UDP;
-			tftp[i][j].tuple.src.u.udp.port = htons(ports[i]);
-			tftp[i][j].expect_policy = &tftp_exp_policy;
-			tftp[i][j].me = THIS_MODULE;
-			tftp[i][j].help = tftp_help;
-
-			if (ports[i] == TFTP_PORT)
-				sprintf(tftp[i][j].name, "tftp");
-			else
-				sprintf(tftp[i][j].name, "tftp-%u", i);
-
-			ret = nf_conntrack_helper_register(&tftp[i][j]);
-			if (ret) {
-				pr_err("failed to register helper for pf: %u port: %u\n",
-				       tftp[i][j].tuple.src.l3num, ports[i]);
-				ports_c = i;
-				nf_conntrack_tftp_fini();
-				return ret;
-			}
-		}
+		nf_ct_helper_init(&tftp[2 * i], AF_INET, IPPROTO_UDP, "tftp",
+				  TFTP_PORT, ports[i], i, &tftp_exp_policy,
+				  0, 0, tftp_help, NULL, THIS_MODULE);
+		nf_ct_helper_init(&tftp[2 * i + 1], AF_INET6, IPPROTO_UDP, "tftp",
+				  TFTP_PORT, ports[i], i, &tftp_exp_policy,
+				  0, 0, tftp_help, NULL, THIS_MODULE);
+	}
+
+	ret = nf_conntrack_helpers_register(tftp, ports_c * 2);
+	if (ret < 0) {
+		pr_err("failed to register helpers\n");
+		return ret;
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 5f652bb2eb3eb38f97194ce5c41da1fa12e914b8 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 20 Jul 2016 18:11:31 +0200
Subject: gro_cells: gro_cells_receive now return error code

so that the caller can update stats accordingly, if needed

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/gro_cells.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/gro_cells.h b/include/net/gro_cells.h
index cf6c74550baa..d15214d673b2 100644
--- a/include/net/gro_cells.h
+++ b/include/net/gro_cells.h
@@ -14,27 +14,26 @@ struct gro_cells {
 	struct gro_cell __percpu	*cells;
 };
 
-static inline void gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
+static inline int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
 {
 	struct gro_cell *cell;
 	struct net_device *dev = skb->dev;
 
-	if (!gcells->cells || skb_cloned(skb) || !(dev->features & NETIF_F_GRO)) {
-		netif_rx(skb);
-		return;
-	}
+	if (!gcells->cells || skb_cloned(skb) || !(dev->features & NETIF_F_GRO))
+		return netif_rx(skb);
 
 	cell = this_cpu_ptr(gcells->cells);
 
 	if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) {
 		atomic_long_inc(&dev->rx_dropped);
 		kfree_skb(skb);
-		return;
+		return NET_RX_DROP;
 	}
 
 	__skb_queue_tail(&cell->napi_skbs, skb);
 	if (skb_queue_len(&cell->napi_skbs) == 1)
 		napi_schedule(&cell->napi);
+	return NET_RX_SUCCESS;
 }
 
 /* called under BH context */
-- 
cgit v1.2.3


From 23014011ba4209a086931ff402eac1c41abbe456 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 21 Jul 2016 12:51:16 +0200
Subject: netfilter: conntrack: support a fixed size of 128 distinct labels

The conntrack label extension is currently variable-sized, e.g. if
only 2 labels are used by iptables rules then the labels->bits[] array
will only contain one element.

We track size of each label storage area in the 'words' member.

But in nftables and openvswitch we always have to ask for worst-case
since we don't know what bit will be used at configuration time.

As most arches are 64bit we need to allocate 24 bytes in this case:

struct nf_conn_labels {
    u8            words;   /*     0     1 */
    /* XXX 7 bytes hole, try to pack */
    long unsigned bits[2]; /*     8     24 */

Make bits a fixed size and drop the words member, it simplifies
the code and only increases memory requirements on x86 when
less than 64bit labels are required.

We still only allocate the extension if its needed.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_labels.h | 16 ++++------------
 net/netfilter/nf_conntrack_labels.c         | 13 +++----------
 net/netfilter/nf_conntrack_netlink.c        | 10 +++++-----
 net/netfilter/nft_ct.c                      | 13 +++----------
 net/netfilter/xt_connlabel.c                |  2 +-
 net/openvswitch/conntrack.c                 |  4 ++--
 6 files changed, 18 insertions(+), 40 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h
index c5f8fc736b3d..0fd4989de836 100644
--- a/include/net/netfilter/nf_conntrack_labels.h
+++ b/include/net/netfilter/nf_conntrack_labels.h
@@ -10,8 +10,7 @@
 #define NF_CT_LABELS_MAX_SIZE ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE)
 
 struct nf_conn_labels {
-	u8 words;
-	unsigned long bits[];
+	unsigned long bits[NF_CT_LABELS_MAX_SIZE / sizeof(long)];
 };
 
 static inline struct nf_conn_labels *nf_ct_labels_find(const struct nf_conn *ct)
@@ -26,20 +25,13 @@ static inline struct nf_conn_labels *nf_ct_labels_find(const struct nf_conn *ct)
 static inline struct nf_conn_labels *nf_ct_labels_ext_add(struct nf_conn *ct)
 {
 #ifdef CONFIG_NF_CONNTRACK_LABELS
-	struct nf_conn_labels *cl_ext;
 	struct net *net = nf_ct_net(ct);
-	u8 words;
 
-	words = ACCESS_ONCE(net->ct.label_words);
-	if (words == 0)
+	if (net->ct.labels_used == 0)
 		return NULL;
 
-	cl_ext = nf_ct_ext_add_length(ct, NF_CT_EXT_LABELS,
-				      words * sizeof(long), GFP_ATOMIC);
-	if (cl_ext != NULL)
-		cl_ext->words = words;
-
-	return cl_ext;
+	return nf_ct_ext_add_length(ct, NF_CT_EXT_LABELS,
+				    sizeof(struct nf_conn_labels), GFP_ATOMIC);
 #else
 	return NULL;
 #endif
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index 252e6a7cd2f1..7686200f9ace 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -20,7 +20,7 @@ int nf_connlabel_set(struct nf_conn *ct, u16 bit)
 {
 	struct nf_conn_labels *labels = nf_ct_labels_find(ct);
 
-	if (!labels || BIT_WORD(bit) >= labels->words)
+	if (!labels)
 		return -ENOSPC;
 
 	if (test_bit(bit, labels->bits))
@@ -60,7 +60,7 @@ int nf_connlabels_replace(struct nf_conn *ct,
 	if (!labels)
 		return -ENOSPC;
 
-	size = labels->words * sizeof(long);
+	size = sizeof(labels->bits);
 	if (size < (words32 * sizeof(u32)))
 		words32 = size / sizeof(u32);
 
@@ -80,16 +80,11 @@ EXPORT_SYMBOL_GPL(nf_connlabels_replace);
 
 int nf_connlabels_get(struct net *net, unsigned int bits)
 {
-	size_t words;
-
-	words = BIT_WORD(bits) + 1;
-	if (words > NF_CT_LABELS_MAX_SIZE / sizeof(long))
+	if (BIT_WORD(bits) >= NF_CT_LABELS_MAX_SIZE / sizeof(long))
 		return -ERANGE;
 
 	spin_lock(&nf_connlabels_lock);
 	net->ct.labels_used++;
-	if (words > net->ct.label_words)
-		net->ct.label_words = words;
 	spin_unlock(&nf_connlabels_lock);
 
 	return 0;
@@ -100,8 +95,6 @@ void nf_connlabels_put(struct net *net)
 {
 	spin_lock(&nf_connlabels_lock);
 	net->ct.labels_used--;
-	if (net->ct.labels_used == 0)
-		net->ct.label_words = 0;
 	spin_unlock(&nf_connlabels_lock);
 }
 EXPORT_SYMBOL_GPL(nf_connlabels_put);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index a18d1ceabad5..050bb3420a6b 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -346,25 +346,25 @@ static inline int ctnetlink_label_size(const struct nf_conn *ct)
 
 	if (!labels)
 		return 0;
-	return nla_total_size(labels->words * sizeof(long));
+	return nla_total_size(sizeof(labels->bits));
 }
 
 static int
 ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
 {
 	struct nf_conn_labels *labels = nf_ct_labels_find(ct);
-	unsigned int len, i;
+	unsigned int i;
 
 	if (!labels)
 		return 0;
 
-	len = labels->words * sizeof(long);
 	i = 0;
 	do {
 		if (labels->bits[i] != 0)
-			return nla_put(skb, CTA_LABELS, len, labels->bits);
+			return nla_put(skb, CTA_LABELS, sizeof(labels->bits),
+				       labels->bits);
 		i++;
-	} while (i < labels->words);
+	} while (i < ARRAY_SIZE(labels->bits));
 
 	return 0;
 }
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index d9e44ca34055..2f47d5d3ae3b 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -113,18 +113,11 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
 #ifdef CONFIG_NF_CONNTRACK_LABELS
 	case NFT_CT_LABELS: {
 		struct nf_conn_labels *labels = nf_ct_labels_find(ct);
-		unsigned int size;
 
-		if (!labels) {
+		if (labels)
+			memcpy(dest, labels->bits, NF_CT_LABELS_MAX_SIZE);
+		else
 			memset(dest, 0, NF_CT_LABELS_MAX_SIZE);
-			return;
-		}
-
-		size = labels->words * sizeof(long);
-		memcpy(dest, labels->bits, size);
-		if (size < NF_CT_LABELS_MAX_SIZE)
-			memset(((char *) dest) + size, 0,
-			       NF_CT_LABELS_MAX_SIZE - size);
 		return;
 	}
 #endif
diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c
index a79af255561a..c9fba8ade3d5 100644
--- a/net/netfilter/xt_connlabel.c
+++ b/net/netfilter/xt_connlabel.c
@@ -25,7 +25,7 @@ static bool connlabel_match(const struct nf_conn *ct, u16 bit)
 	if (!labels)
 		return false;
 
-	return BIT_WORD(bit) < labels->words && test_bit(bit, labels->bits);
+	return test_bit(bit, labels->bits);
 }
 
 static bool
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index b4069a90e375..c644c78ed485 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -135,7 +135,7 @@ static void ovs_ct_get_labels(const struct nf_conn *ct,
 	struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL;
 
 	if (cl) {
-		size_t len = cl->words * sizeof(long);
+		size_t len = sizeof(cl->bits);
 
 		if (len > OVS_CT_LABELS_LEN)
 			len = OVS_CT_LABELS_LEN;
@@ -274,7 +274,7 @@ static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key,
 		nf_ct_labels_ext_add(ct);
 		cl = nf_ct_labels_find(ct);
 	}
-	if (!cl || cl->words * sizeof(long) < OVS_CT_LABELS_LEN)
+	if (!cl || sizeof(cl->bits) < OVS_CT_LABELS_LEN)
 		return -ENOSPC;
 
 	err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask,
-- 
cgit v1.2.3


From 857ed310c013fe0d0059f955048dab589fa7a57a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 21 Jul 2016 12:51:17 +0200
Subject: netfilter: connlabels: move set helper to xt_connlabel

xt_connlabel is the only user so move it.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_labels.h |  2 --
 net/netfilter/nf_conntrack_labels.c         | 17 -----------------
 net/netfilter/xt_connlabel.c                | 29 ++++++++++++++++-------------
 3 files changed, 16 insertions(+), 32 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h
index 0fd4989de836..498814626e28 100644
--- a/include/net/netfilter/nf_conntrack_labels.h
+++ b/include/net/netfilter/nf_conntrack_labels.h
@@ -37,8 +37,6 @@ static inline struct nf_conn_labels *nf_ct_labels_ext_add(struct nf_conn *ct)
 #endif
 }
 
-int nf_connlabel_set(struct nf_conn *ct, u16 bit);
-
 int nf_connlabels_replace(struct nf_conn *ct,
 			  const u32 *data, const u32 *mask, unsigned int words);
 
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index 7686200f9ace..bcab8bde7312 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -16,23 +16,6 @@
 
 static spinlock_t nf_connlabels_lock;
 
-int nf_connlabel_set(struct nf_conn *ct, u16 bit)
-{
-	struct nf_conn_labels *labels = nf_ct_labels_find(ct);
-
-	if (!labels)
-		return -ENOSPC;
-
-	if (test_bit(bit, labels->bits))
-		return 0;
-
-	if (!test_and_set_bit(bit, labels->bits))
-		nf_conntrack_event_cache(IPCT_LABEL, ct);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(nf_connlabel_set);
-
 static int replace_u32(u32 *address, u32 mask, u32 new)
 {
 	u32 old, tmp;
diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c
index c9fba8ade3d5..03d66f1c5e69 100644
--- a/net/netfilter/xt_connlabel.c
+++ b/net/netfilter/xt_connlabel.c
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/skbuff.h>
 #include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
 #include <net/netfilter/nf_conntrack_labels.h>
 #include <linux/netfilter/x_tables.h>
 
@@ -18,21 +19,12 @@ MODULE_DESCRIPTION("Xtables: add/match connection trackling labels");
 MODULE_ALIAS("ipt_connlabel");
 MODULE_ALIAS("ip6t_connlabel");
 
-static bool connlabel_match(const struct nf_conn *ct, u16 bit)
-{
-	struct nf_conn_labels *labels = nf_ct_labels_find(ct);
-
-	if (!labels)
-		return false;
-
-	return test_bit(bit, labels->bits);
-}
-
 static bool
 connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_connlabel_mtinfo *info = par->matchinfo;
 	enum ip_conntrack_info ctinfo;
+	struct nf_conn_labels *labels;
 	struct nf_conn *ct;
 	bool invert = info->options & XT_CONNLABEL_OP_INVERT;
 
@@ -40,10 +32,21 @@ connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	if (ct == NULL || nf_ct_is_untracked(ct))
 		return invert;
 
-	if (info->options & XT_CONNLABEL_OP_SET)
-		return (nf_connlabel_set(ct, info->bit) == 0) ^ invert;
+	labels = nf_ct_labels_find(ct);
+	if (!labels)
+		return invert;
+
+	if (test_bit(info->bit, labels->bits))
+		return !invert;
+
+	if (info->options & XT_CONNLABEL_OP_SET) {
+		if (!test_and_set_bit(info->bit, labels->bits))
+			nf_conntrack_event_cache(IPCT_LABEL, ct);
+
+		return !invert;
+	}
 
-	return connlabel_match(ct, info->bit) ^ invert;
+	return invert;
 }
 
 static int connlabel_mt_check(const struct xt_mtchk_param *par)
-- 
cgit v1.2.3


From b87f7936a93246804cf70e7e2e0568799c948bb1 Mon Sep 17 00:00:00 2001
From: Yotam Gigi <yotamg@mellanox.com>
Date: Thu, 21 Jul 2016 12:03:12 +0200
Subject: net/sched: Add match-all classifier hw offloading.

Following the work that have been done on offloading classifiers like u32
and flower, now the match-all classifier hw offloading is possible. if
the interface supports tc offloading.

To control the offloading, two tc flags have been introduced: skip_sw and
skip_hw. Typical usage:

tc filter add dev eth25 parent ffff: 	\
	matchall skip_sw		\
	action mirred egress mirror	\
	dev eth27

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h    |  2 ++
 include/net/pkt_cls.h        | 11 +++++++
 include/uapi/linux/pkt_cls.h |  1 +
 net/sched/cls_matchall.c     | 76 ++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 87 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 43c749b1b619..076df5360ba5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -787,6 +787,7 @@ enum {
 	TC_SETUP_MQPRIO,
 	TC_SETUP_CLSU32,
 	TC_SETUP_CLSFLOWER,
+	TC_SETUP_MATCHALL,
 };
 
 struct tc_cls_u32_offload;
@@ -797,6 +798,7 @@ struct tc_to_netdev {
 		u8 tc;
 		struct tc_cls_u32_offload *cls_u32;
 		struct tc_cls_flower_offload *cls_flower;
+		struct tc_cls_matchall_offload *cls_mall;
 	};
 };
 
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 3722dda0199d..6f8d65342d3a 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -442,4 +442,15 @@ struct tc_cls_flower_offload {
 	struct tcf_exts *exts;
 };
 
+enum tc_matchall_command {
+	TC_CLSMATCHALL_REPLACE,
+	TC_CLSMATCHALL_DESTROY,
+};
+
+struct tc_cls_matchall_offload {
+	enum tc_matchall_command command;
+	struct tcf_exts *exts;
+	unsigned long cookie;
+};
+
 #endif
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index a32494887e01..d1c1ccaba787 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -439,6 +439,7 @@ enum {
 	TCA_MATCHALL_UNSPEC,
 	TCA_MATCHALL_CLASSID,
 	TCA_MATCHALL_ACT,
+	TCA_MATCHALL_FLAGS,
 	__TCA_MATCHALL_MAX,
 };
 
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 8a6b4de7a99a..25927b6c4436 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -21,6 +21,7 @@ struct cls_mall_filter {
 	struct tcf_result res;
 	u32 handle;
 	struct rcu_head	rcu;
+	u32 flags;
 };
 
 struct cls_mall_head {
@@ -34,6 +35,9 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	struct cls_mall_head *head = rcu_dereference_bh(tp->root);
 	struct cls_mall_filter *f = head->filter;
 
+	if (tc_skip_sw(f->flags))
+		return -1;
+
 	return tcf_exts_exec(skb, &f->exts, res);
 }
 
@@ -55,18 +59,61 @@ static void mall_destroy_filter(struct rcu_head *head)
 	struct cls_mall_filter *f = container_of(head, struct cls_mall_filter, rcu);
 
 	tcf_exts_destroy(&f->exts);
+
 	kfree(f);
 }
 
+static int mall_replace_hw_filter(struct tcf_proto *tp,
+				  struct cls_mall_filter *f,
+				  unsigned long cookie)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_to_netdev offload;
+	struct tc_cls_matchall_offload mall_offload = {0};
+
+	offload.type = TC_SETUP_MATCHALL;
+	offload.cls_mall = &mall_offload;
+	offload.cls_mall->command = TC_CLSMATCHALL_REPLACE;
+	offload.cls_mall->exts = &f->exts;
+	offload.cls_mall->cookie = cookie;
+
+	return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
+					     &offload);
+}
+
+static void mall_destroy_hw_filter(struct tcf_proto *tp,
+				   struct cls_mall_filter *f,
+				   unsigned long cookie)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_to_netdev offload;
+	struct tc_cls_matchall_offload mall_offload = {0};
+
+	offload.type = TC_SETUP_MATCHALL;
+	offload.cls_mall = &mall_offload;
+	offload.cls_mall->command = TC_CLSMATCHALL_DESTROY;
+	offload.cls_mall->exts = NULL;
+	offload.cls_mall->cookie = cookie;
+
+	dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
+					     &offload);
+}
+
 static bool mall_destroy(struct tcf_proto *tp, bool force)
 {
 	struct cls_mall_head *head = rtnl_dereference(tp->root);
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct cls_mall_filter *f = head->filter;
 
-	if (!force && head->filter)
+	if (!force && f)
 		return false;
 
-	if (head->filter)
-		call_rcu(&head->filter->rcu, mall_destroy_filter);
+	if (f) {
+		if (tc_should_offload(dev, tp, f->flags))
+			mall_destroy_hw_filter(tp, f, (unsigned long) f);
+
+		call_rcu(&f->rcu, mall_destroy_filter);
+	}
 	RCU_INIT_POINTER(tp->root, NULL);
 	kfree_rcu(head, rcu);
 	return true;
@@ -117,8 +164,10 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 {
 	struct cls_mall_head *head = rtnl_dereference(tp->root);
 	struct cls_mall_filter *fold = (struct cls_mall_filter *) *arg;
+	struct net_device *dev = tp->q->dev_queue->dev;
 	struct cls_mall_filter *f;
 	struct nlattr *tb[TCA_MATCHALL_MAX + 1];
+	u32 flags = 0;
 	int err;
 
 	if (!tca[TCA_OPTIONS])
@@ -135,6 +184,12 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		return err;
 
+	if (tb[TCA_MATCHALL_FLAGS]) {
+		flags = nla_get_u32(tb[TCA_MATCHALL_FLAGS]);
+		if (!tc_flags_valid(flags))
+			return -EINVAL;
+	}
+
 	f = kzalloc(sizeof(*f), GFP_KERNEL);
 	if (!f)
 		return -ENOBUFS;
@@ -144,11 +199,22 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 	if (!handle)
 		handle = 1;
 	f->handle = handle;
+	f->flags = flags;
 
 	err = mall_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr);
 	if (err)
 		goto errout;
 
+	if (tc_should_offload(dev, tp, flags)) {
+		err = mall_replace_hw_filter(tp, f, (unsigned long) f);
+		if (err) {
+			if (tc_skip_sw(flags))
+				goto errout;
+			else
+				err = 0;
+		}
+	}
+
 	*arg = (unsigned long) f;
 	rcu_assign_pointer(head->filter, f);
 
@@ -163,6 +229,10 @@ static int mall_delete(struct tcf_proto *tp, unsigned long arg)
 {
 	struct cls_mall_head *head = rtnl_dereference(tp->root);
 	struct cls_mall_filter *f = (struct cls_mall_filter *) arg;
+	struct net_device *dev = tp->q->dev_queue->dev;
+
+	if (tc_should_offload(dev, tp, f->flags))
+		mall_destroy_hw_filter(tp, f, (unsigned long) f);
 
 	RCU_INIT_POINTER(head->filter, NULL);
 	tcf_unbind_filter(tp, &f->res);
-- 
cgit v1.2.3


From 56a20680f70393d199fc5e8ecc7859549ca5a0c0 Mon Sep 17 00:00:00 2001
From: Yotam Gigi <yotamg@mellanox.com>
Date: Thu, 21 Jul 2016 12:03:16 +0200
Subject: net/sched: act_mirred: Add helper inlines to access tcf_mirred info.

The helper function is_tcf_mirred_mirror helps finding whether an action
struct is of type mirred and is configured to be of type mirror.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_mirred.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/net')

diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h
index e891835eb74e..6a13a7c74e0c 100644
--- a/include/net/tc_act/tc_mirred.h
+++ b/include/net/tc_act/tc_mirred.h
@@ -24,6 +24,15 @@ static inline bool is_tcf_mirred_redirect(const struct tc_action *a)
 	return false;
 }
 
+static inline bool is_tcf_mirred_mirror(const struct tc_action *a)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	if (a->ops && a->ops->type == TCA_ACT_MIRRED)
+		return to_mirred(a)->tcfm_eaction == TCA_EGRESS_MIRROR;
+#endif
+	return false;
+}
+
 static inline int tcf_mirred_ifindex(const struct tc_action *a)
 {
 	return to_mirred(a)->tcfm_ifindex;
-- 
cgit v1.2.3


From 9b8ac4f9dd60ac7375fb0e221dbf596db4c4e622 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Sun, 24 Jul 2016 19:24:09 +0100
Subject: gtp: #define #define _GTP_H_ and not #define _GTP_H

Fix clang build warning:

./include/net/gtp.h:1:9: warning: '_GTP_H_' is used as a header
guard here, followed by #define of a different macro [-Wheader-guard]

fix by defining _GTP_H_ and not _GTP_H

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/gtp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/gtp.h b/include/net/gtp.h
index 894a37b87d63..6398891b99ba 100644
--- a/include/net/gtp.h
+++ b/include/net/gtp.h
@@ -1,5 +1,5 @@
 #ifndef _GTP_H_
-#define _GTP_H
+#define _GTP_H_
 
 /* General GTP protocol related definitions. */
 
-- 
cgit v1.2.3


From 86cb13e4ec5060d94069a8418fd4f3ccb38edee2 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Mon, 25 Jul 2016 13:12:33 +0300
Subject: mlxsw: spectrum: Fix compilation error when CLS_ACT isn't set

When CONFIG_NET_CLS_ACT isn't set 'struct tcf_exts' has no member named
'actions' and we therefore must not access it. Otherwise compilation
fails.

Fix this by introducing a new macro similar to tc_no_actions(), which
always returns 'false' if CONFIG_NET_CLS_ACT isn't set.

Fixes: 763b4b70afcd ("mlxsw: spectrum: Add support in matchall mirror TC offloading")
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 11 +++++------
 include/net/act_api.h                          |  4 ++++
 2 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 552636b73416..c3e61500819d 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -1148,23 +1148,22 @@ static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port,
 					  struct tc_cls_matchall_offload *cls,
 					  bool ingress)
 {
-	struct tcf_exts *exts = cls->exts;
 	const struct tc_action *a;
 	int err;
 
-	if (!list_is_singular(&exts->actions)) {
+	if (!tc_single_action(cls->exts)) {
 		netdev_err(mlxsw_sp_port->dev, "only singular actions are supported\n");
 		return -ENOTSUPP;
 	}
 
-	a = list_first_entry(&exts->actions, struct tc_action, list);
-	if (is_tcf_mirred_mirror(a) && protocol == htons(ETH_P_ALL)) {
+	tc_for_each_action(a, cls->exts) {
+		if (!is_tcf_mirred_mirror(a) || protocol != htons(ETH_P_ALL))
+			return -ENOTSUPP;
+
 		err = mlxsw_sp_port_add_cls_matchall_mirror(mlxsw_sp_port, cls,
 							    a, ingress);
 		if (err)
 			return err;
-	} else {
-		return -ENOTSUPP;
 	}
 
 	return 0;
diff --git a/include/net/act_api.h b/include/net/act_api.h
index fb82b5b5d9e7..0bb210635e5f 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -192,6 +192,9 @@ int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int);
 #define tc_for_each_action(_a, _exts) \
 	list_for_each_entry(a, &(_exts)->actions, list)
 
+#define tc_single_action(_exts) \
+	(list_is_singular(&(_exts)->actions))
+
 static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes,
 					   u64 packets, u64 lastuse)
 {
@@ -205,6 +208,7 @@ static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes,
 
 #define tc_no_actions(_exts) true
 #define tc_for_each_action(_a, _exts) while ((void)(_a), 0)
+#define tc_single_action(_exts) false
 #define tcf_action_stats_update(a, bytes, packets, lastuse)
 
 #endif /* CONFIG_NET_CLS_ACT */
-- 
cgit v1.2.3


From a85a970af265f156740977168b542234511b28a8 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Mon, 25 Jul 2016 16:09:41 -0700
Subject: net_sched: move tc_action into tcf_common

struct tc_action is confusing, currently we use it for two purposes:
1) Pass in arguments and carry out results from helper functions
2) A generic representation for tc actions

The first one is error-prone, since we need to make sure we don't
miss anything. This patch aims to get rid of this use, by moving
tc_action into tcf_common, so that they are allocated together
in hashtable and can be cast'ed easily.

And together with the following patch, we could really make
tc_action a generic representation for all tc actions and each
type of action can inherit from it.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h            |  52 ++++++++------
 include/net/tc_act/tc_bpf.h      |   3 +-
 include/net/tc_act/tc_connmark.h |   3 +-
 include/net/tc_act/tc_csum.h     |   3 +-
 include/net/tc_act/tc_defact.h   |   3 +-
 include/net/tc_act/tc_gact.h     |   5 +-
 include/net/tc_act/tc_ife.h      |   3 +-
 include/net/tc_act/tc_ipt.h      |   3 +-
 include/net/tc_act/tc_mirred.h   |   3 +-
 include/net/tc_act/tc_nat.h      |   5 +-
 include/net/tc_act/tc_pedit.h    |   3 +-
 include/net/tc_act/tc_skbedit.h  |   3 +-
 include/net/tc_act/tc_vlan.h     |   3 +-
 net/sched/act_api.c              | 149 ++++++++++++++-------------------------
 net/sched/act_bpf.c              |  26 +++----
 net/sched/act_connmark.c         |  24 ++++---
 net/sched/act_csum.c             |  22 +++---
 net/sched/act_gact.c             |  24 ++++---
 net/sched/act_ife.c              |  38 +++++-----
 net/sched/act_ipt.c              |  48 +++++++------
 net/sched/act_mirred.c           |  26 +++----
 net/sched/act_nat.c              |  22 +++---
 net/sched/act_pedit.c            |  28 ++++----
 net/sched/act_police.c           |  45 ++++++------
 net/sched/act_simple.c           |  29 ++++----
 net/sched/act_skbedit.c          |  26 +++----
 net/sched/act_vlan.c             |  28 ++++----
 27 files changed, 298 insertions(+), 329 deletions(-)

(limited to 'include/net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 0bb210635e5f..8b199095ea51 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -10,7 +10,26 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
+
+struct tcf_hashinfo {
+	struct hlist_head	*htab;
+	unsigned int		hmask;
+	spinlock_t		lock;
+	u32			index;
+};
+
+struct tc_action_ops;
+
+struct tc_action {
+	const struct tc_action_ops	*ops;
+	__u32			type; /* for backward compat(TCA_OLD_COMPAT) */
+	__u32			order;
+	struct list_head	list;
+	struct tcf_hashinfo	*hinfo;
+};
+
 struct tcf_common {
+	struct tc_action		tcfc_act;
 	struct hlist_node		tcfc_head;
 	u32				tcfc_index;
 	int				tcfc_refcnt;
@@ -26,6 +45,7 @@ struct tcf_common {
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
 	struct gnet_stats_queue __percpu *cpu_qstats;
 };
+#define tcf_act		common.tcfc_act
 #define tcf_head	common.tcfc_head
 #define tcf_index	common.tcfc_index
 #define tcf_refcnt	common.tcfc_refcnt
@@ -39,13 +59,6 @@ struct tcf_common {
 #define tcf_lock	common.tcfc_lock
 #define tcf_rcu		common.tcfc_rcu
 
-struct tcf_hashinfo {
-	struct hlist_head	*htab;
-	unsigned int		hmask;
-	spinlock_t		lock;
-	u32			index;
-};
-
 static inline unsigned int tcf_hash(u32 index, unsigned int hmask)
 {
 	return index & hmask;
@@ -88,15 +101,6 @@ static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm)
 	dtm->expires = jiffies_to_clock_t(stm->expires);
 }
 
-struct tc_action {
-	void			*priv;
-	const struct tc_action_ops	*ops;
-	__u32			type; /* for backward compat(TCA_OLD_COMPAT) */
-	__u32			order;
-	struct list_head	list;
-	struct tcf_hashinfo	*hinfo;
-};
-
 #ifdef CONFIG_NET_CLS_ACT
 
 #define ACT_P_CREATED 1
@@ -106,17 +110,18 @@ struct tc_action_ops {
 	struct list_head head;
 	char    kind[IFNAMSIZ];
 	__u32   type; /* TBD to match kind */
+	size_t	size;
 	struct module		*owner;
 	int     (*act)(struct sk_buff *, const struct tc_action *,
 		       struct tcf_result *);
 	int     (*dump)(struct sk_buff *, struct tc_action *, int, int);
 	void	(*cleanup)(struct tc_action *, int bind);
-	int     (*lookup)(struct net *, struct tc_action *, u32);
+	int     (*lookup)(struct net *, struct tc_action **, u32);
 	int     (*init)(struct net *net, struct nlattr *nla,
-			struct nlattr *est, struct tc_action *act, int ovr,
+			struct nlattr *est, struct tc_action **act, int ovr,
 			int bind);
 	int     (*walk)(struct net *, struct sk_buff *,
-			struct netlink_callback *, int, struct tc_action *);
+			struct netlink_callback *, int, const struct tc_action_ops *);
 	void	(*stats_update)(struct tc_action *, u64, u32, u64);
 };
 
@@ -152,13 +157,14 @@ static inline void tc_action_net_exit(struct tc_action_net *tn)
 
 int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
 		       struct netlink_callback *cb, int type,
-		       struct tc_action *a);
-int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index);
+		       const struct tc_action_ops *ops);
+int tcf_hash_search(struct tc_action_net *tn, struct tc_action **a, u32 index);
 u32 tcf_hash_new_index(struct tc_action_net *tn);
-bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a,
+bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action **a,
 		    int bind);
 int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
-		    struct tc_action *a, int size, int bind, bool cpustats);
+		    struct tc_action **a, const struct tc_action_ops *ops, int bind,
+		    bool cpustats);
 void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est);
 void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a);
 
diff --git a/include/net/tc_act/tc_bpf.h b/include/net/tc_act/tc_bpf.h
index 958d69cfb19c..80a4d6f49773 100644
--- a/include/net/tc_act/tc_bpf.h
+++ b/include/net/tc_act/tc_bpf.h
@@ -23,7 +23,6 @@ struct tcf_bpf {
 	struct sock_filter	*bpf_ops;
 	const char		*bpf_name;
 };
-#define to_bpf(a) \
-	container_of(a->priv, struct tcf_bpf, common)
+#define to_bpf(a) ((struct tcf_bpf *)a)
 
 #endif /* __NET_TC_BPF_H */
diff --git a/include/net/tc_act/tc_connmark.h b/include/net/tc_act/tc_connmark.h
index 02caa406611b..8a661135f4ac 100644
--- a/include/net/tc_act/tc_connmark.h
+++ b/include/net/tc_act/tc_connmark.h
@@ -9,7 +9,6 @@ struct tcf_connmark_info {
 	u16 zone;
 };
 
-#define to_connmark(a) \
-	container_of(a->priv, struct tcf_connmark_info, common)
+#define to_connmark(a) ((struct tcf_connmark_info *)a)
 
 #endif /* __NET_TC_CONNMARK_H */
diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h
index fa8f5fac65e9..1a9ef15d573b 100644
--- a/include/net/tc_act/tc_csum.h
+++ b/include/net/tc_act/tc_csum.h
@@ -9,7 +9,6 @@ struct tcf_csum {
 
 	u32 update_flags;
 };
-#define to_tcf_csum(a) \
-	container_of(a->priv,struct tcf_csum,common)
+#define to_tcf_csum(a) ((struct tcf_csum *)a)
 
 #endif /* __NET_TC_CSUM_H */
diff --git a/include/net/tc_act/tc_defact.h b/include/net/tc_act/tc_defact.h
index ab9b5d6be67b..e25b4eb4fc66 100644
--- a/include/net/tc_act/tc_defact.h
+++ b/include/net/tc_act/tc_defact.h
@@ -8,7 +8,6 @@ struct tcf_defact {
 	u32		tcfd_datalen;
 	void		*tcfd_defdata;
 };
-#define to_defact(a) \
-	container_of(a->priv, struct tcf_defact, common)
+#define to_defact(a) ((struct tcf_defact *)a)
 
 #endif /* __NET_TC_DEF_H */
diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h
index 93c520b83d10..119cdb418c23 100644
--- a/include/net/tc_act/tc_gact.h
+++ b/include/net/tc_act/tc_gact.h
@@ -13,8 +13,7 @@ struct tcf_gact {
 	atomic_t		packets;
 #endif
 };
-#define to_gact(a) \
-	container_of(a->priv, struct tcf_gact, common)
+#define to_gact(a) ((struct tcf_gact *)a)
 
 static inline bool is_tcf_gact_shot(const struct tc_action *a)
 {
@@ -24,7 +23,7 @@ static inline bool is_tcf_gact_shot(const struct tc_action *a)
 	if (a->ops && a->ops->type != TCA_ACT_GACT)
 		return false;
 
-	gact = a->priv;
+	gact = to_gact(a);
 	if (gact->tcf_action == TC_ACT_SHOT)
 		return true;
 
diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h
index c55facd17b7e..7921abe42adc 100644
--- a/include/net/tc_act/tc_ife.h
+++ b/include/net/tc_act/tc_ife.h
@@ -16,8 +16,7 @@ struct tcf_ife_info {
 	/* list of metaids allowed */
 	struct list_head metalist;
 };
-#define to_ife(a) \
-	container_of(a->priv, struct tcf_ife_info, common)
+#define to_ife(a) ((struct tcf_ife_info *)a)
 
 struct tcf_meta_info {
 	const struct tcf_meta_ops *ops;
diff --git a/include/net/tc_act/tc_ipt.h b/include/net/tc_act/tc_ipt.h
index c0f4193f432c..c22ae7ab66ed 100644
--- a/include/net/tc_act/tc_ipt.h
+++ b/include/net/tc_act/tc_ipt.h
@@ -11,7 +11,6 @@ struct tcf_ipt {
 	char			*tcfi_tname;
 	struct xt_entry_target	*tcfi_t;
 };
-#define to_ipt(a) \
-	container_of(a->priv, struct tcf_ipt, common)
+#define to_ipt(a) ((struct tcf_ipt *)a)
 
 #endif /* __NET_TC_IPT_H */
diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h
index 6a13a7c74e0c..89aebd22cd79 100644
--- a/include/net/tc_act/tc_mirred.h
+++ b/include/net/tc_act/tc_mirred.h
@@ -12,8 +12,7 @@ struct tcf_mirred {
 	struct net_device __rcu	*tcfm_dev;
 	struct list_head	tcfm_list;
 };
-#define to_mirred(a) \
-	container_of(a->priv, struct tcf_mirred, common)
+#define to_mirred(a) ((struct tcf_mirred *)a)
 
 static inline bool is_tcf_mirred_redirect(const struct tc_action *a)
 {
diff --git a/include/net/tc_act/tc_nat.h b/include/net/tc_act/tc_nat.h
index 63d8e9ca9d99..a91ad3ad565e 100644
--- a/include/net/tc_act/tc_nat.h
+++ b/include/net/tc_act/tc_nat.h
@@ -13,9 +13,6 @@ struct tcf_nat {
 	u32 flags;
 };
 
-static inline struct tcf_nat *to_tcf_nat(struct tc_action *a)
-{
-	return container_of(a->priv, struct tcf_nat, common);
-}
+#define to_tcf_nat(a) ((struct tcf_nat *)a)
 
 #endif /* __NET_TC_NAT_H */
diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index 5b80998879c7..2cccfbaae800 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -9,7 +9,6 @@ struct tcf_pedit {
 	unsigned char		tcfp_flags;
 	struct tc_pedit_key	*tcfp_keys;
 };
-#define to_pedit(a) \
-	container_of(a->priv, struct tcf_pedit, common)
+#define to_pedit(a) ((struct tcf_pedit *)a)
 
 #endif /* __NET_TC_PED_H */
diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h
index d01a5d40cfb5..9e0548998327 100644
--- a/include/net/tc_act/tc_skbedit.h
+++ b/include/net/tc_act/tc_skbedit.h
@@ -30,8 +30,7 @@ struct tcf_skbedit {
 	u16		queue_mapping;
 	u16		ptype;
 };
-#define to_skbedit(a) \
-	container_of(a->priv, struct tcf_skbedit, common)
+#define to_skbedit(a) ((struct tcf_skbedit *)a)
 
 /* Return true iff action is mark */
 static inline bool is_tcf_skbedit_mark(const struct tc_action *a)
diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h
index 93b70ade1ff3..584b80788d52 100644
--- a/include/net/tc_act/tc_vlan.h
+++ b/include/net/tc_act/tc_vlan.h
@@ -21,7 +21,6 @@ struct tcf_vlan {
 	u16			tcfv_push_vid;
 	__be16			tcfv_push_proto;
 };
-#define to_vlan(a) \
-	container_of(a->priv, struct tcf_vlan, common)
+#define to_vlan(a) ((struct tcf_vlan *)a)
 
 #endif /* __NET_TC_VLAN_H */
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 47ec2305f920..d97419f35e7e 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -38,7 +38,7 @@ static void free_tcf(struct rcu_head *head)
 
 static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *a)
 {
-	struct tcf_common *p = a->priv;
+	struct tcf_common *p = (struct tcf_common *)a;
 
 	spin_lock_bh(&hinfo->lock);
 	hlist_del(&p->tcfc_head);
@@ -54,7 +54,7 @@ static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *a)
 
 int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
 {
-	struct tcf_common *p = a->priv;
+	struct tcf_common *p = (struct tcf_common *)a;
 	int ret = 0;
 
 	if (p) {
@@ -67,6 +67,7 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
 		if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) {
 			if (a->ops->cleanup)
 				a->ops->cleanup(a, bind);
+			list_del(&a->list);
 			tcf_hash_destroy(a->hinfo, a);
 			ret = ACT_P_DELETED;
 		}
@@ -77,10 +78,8 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
 EXPORT_SYMBOL(__tcf_hash_release);
 
 static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
-			   struct netlink_callback *cb, struct tc_action *a)
+			   struct netlink_callback *cb)
 {
-	struct hlist_head *head;
-	struct tcf_common *p;
 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
 	struct nlattr *nest;
 
@@ -89,19 +88,20 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 	s_i = cb->args[0];
 
 	for (i = 0; i < (hinfo->hmask + 1); i++) {
+		struct hlist_head *head;
+		struct tcf_common *p;
+
 		head = &hinfo->htab[tcf_hash(i, hinfo->hmask)];
 
 		hlist_for_each_entry_rcu(p, head, tcfc_head) {
 			index++;
 			if (index < s_i)
 				continue;
-			a->priv = p;
-			a->order = n_i;
 
-			nest = nla_nest_start(skb, a->order);
+			nest = nla_nest_start(skb, n_i);
 			if (nest == NULL)
 				goto nla_put_failure;
-			err = tcf_action_dump_1(skb, a, 0, 0);
+			err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 0);
 			if (err < 0) {
 				index--;
 				nlmsg_trim(skb, nest);
@@ -125,27 +125,27 @@ nla_put_failure:
 }
 
 static int tcf_del_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
-			  struct tc_action *a)
+			  const struct tc_action_ops *ops)
 {
-	struct hlist_head *head;
-	struct hlist_node *n;
-	struct tcf_common *p;
 	struct nlattr *nest;
 	int i = 0, n_i = 0;
 	int ret = -EINVAL;
 
-	nest = nla_nest_start(skb, a->order);
+	nest = nla_nest_start(skb, 0);
 	if (nest == NULL)
 		goto nla_put_failure;
-	if (nla_put_string(skb, TCA_KIND, a->ops->kind))
+	if (nla_put_string(skb, TCA_KIND, ops->kind))
 		goto nla_put_failure;
 	for (i = 0; i < (hinfo->hmask + 1); i++) {
+		struct hlist_head *head;
+		struct hlist_node *n;
+		struct tcf_common *p;
+
 		head = &hinfo->htab[tcf_hash(i, hinfo->hmask)];
 		hlist_for_each_entry_safe(p, n, head, tcfc_head) {
-			a->priv = p;
-			ret = __tcf_hash_release(a, false, true);
+			ret = __tcf_hash_release((struct tc_action *)p, false, true);
 			if (ret == ACT_P_DELETED) {
-				module_put(a->ops->owner);
+				module_put(p->tcfc_act.ops->owner);
 				n_i++;
 			} else if (ret < 0)
 				goto nla_put_failure;
@@ -163,16 +163,14 @@ nla_put_failure:
 
 int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
 		       struct netlink_callback *cb, int type,
-		       struct tc_action *a)
+		       const struct tc_action_ops *ops)
 {
 	struct tcf_hashinfo *hinfo = tn->hinfo;
 
-	a->hinfo = hinfo;
-
 	if (type == RTM_DELACTION) {
-		return tcf_del_walker(hinfo, skb, a);
+		return tcf_del_walker(hinfo, skb, ops);
 	} else if (type == RTM_GETACTION) {
-		return tcf_dump_walker(hinfo, skb, cb, a);
+		return tcf_dump_walker(hinfo, skb, cb);
 	} else {
 		WARN(1, "tcf_generic_walker: unknown action %d\n", type);
 		return -EINVAL;
@@ -210,21 +208,20 @@ u32 tcf_hash_new_index(struct tc_action_net *tn)
 }
 EXPORT_SYMBOL(tcf_hash_new_index);
 
-int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index)
+int tcf_hash_search(struct tc_action_net *tn, struct tc_action **a, u32 index)
 {
 	struct tcf_hashinfo *hinfo = tn->hinfo;
 	struct tcf_common *p = tcf_hash_lookup(index, hinfo);
 
 	if (p) {
-		a->priv = p;
-		a->hinfo = hinfo;
+		*a = &p->tcfc_act;
 		return 1;
 	}
 	return 0;
 }
 EXPORT_SYMBOL(tcf_hash_search);
 
-bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a,
+bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action **a,
 		    int bind)
 {
 	struct tcf_hashinfo *hinfo = tn->hinfo;
@@ -233,8 +230,7 @@ bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a,
 		if (bind)
 			p->tcfc_bindcnt++;
 		p->tcfc_refcnt++;
-		a->priv = p;
-		a->hinfo = hinfo;
+		*a = &p->tcfc_act;
 		return true;
 	}
 	return false;
@@ -243,7 +239,7 @@ EXPORT_SYMBOL(tcf_hash_check);
 
 void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
 {
-	struct tcf_common *pc = a->priv;
+	struct tcf_common *pc = (struct tcf_common *)a;
 	if (est)
 		gen_kill_estimator(&pc->tcfc_bstats,
 				   &pc->tcfc_rate_est);
@@ -252,9 +248,10 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
 EXPORT_SYMBOL(tcf_hash_cleanup);
 
 int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
-		    struct tc_action *a, int size, int bind, bool cpustats)
+		    struct tc_action **a, const struct tc_action_ops *ops,
+		    int bind, bool cpustats)
 {
-	struct tcf_common *p = kzalloc(size, GFP_KERNEL);
+	struct tcf_common *p = kzalloc(ops->size, GFP_KERNEL);
 	struct tcf_hashinfo *hinfo = tn->hinfo;
 	int err = -ENOMEM;
 
@@ -294,15 +291,17 @@ err2:
 		}
 	}
 
-	a->priv = (void *) p;
-	a->hinfo = hinfo;
+	p->tcfc_act.hinfo = hinfo;
+	p->tcfc_act.ops = ops;
+	INIT_LIST_HEAD(&p->tcfc_act.list);
+	*a = &p->tcfc_act;
 	return 0;
 }
 EXPORT_SYMBOL(tcf_hash_create);
 
 void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a)
 {
-	struct tcf_common *p = a->priv;
+	struct tcf_common *p = (struct tcf_common *)a;
 	struct tcf_hashinfo *hinfo = tn->hinfo;
 	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
 
@@ -315,10 +314,6 @@ EXPORT_SYMBOL(tcf_hash_insert);
 void tcf_hashinfo_destroy(const struct tc_action_ops *ops,
 			  struct tcf_hashinfo *hinfo)
 {
-	struct tc_action a = {
-		.ops = ops,
-		.hinfo = hinfo,
-	};
 	int i;
 
 	for (i = 0; i < hinfo->hmask + 1; i++) {
@@ -328,8 +323,7 @@ void tcf_hashinfo_destroy(const struct tc_action_ops *ops,
 		hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfc_head) {
 			int ret;
 
-			a.priv = p;
-			ret = __tcf_hash_release(&a, false, true);
+			ret = __tcf_hash_release((struct tc_action *)p, false, true);
 			if (ret == ACT_P_DELETED)
 				module_put(ops->owner);
 			else if (ret < 0)
@@ -466,8 +460,6 @@ int tcf_action_destroy(struct list_head *actions, int bind)
 			module_put(a->ops->owner);
 		else if (ret < 0)
 			return ret;
-		list_del(&a->list);
-		kfree(a);
 	}
 	return ret;
 }
@@ -581,20 +573,13 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
 		goto err_out;
 	}
 
-	err = -ENOMEM;
-	a = kzalloc(sizeof(*a), GFP_KERNEL);
-	if (a == NULL)
-		goto err_mod;
-
-	a->ops = a_o;
-	INIT_LIST_HEAD(&a->list);
 	/* backward compatibility for policer */
 	if (name == NULL)
-		err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, a, ovr, bind);
+		err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind);
 	else
-		err = a_o->init(net, nla, est, a, ovr, bind);
+		err = a_o->init(net, nla, est, &a, ovr, bind);
 	if (err < 0)
-		goto err_free;
+		goto err_mod;
 
 	/* module count goes up only when brand new policy is created
 	 * if it exists and is only bound to in a_o->init() then
@@ -605,8 +590,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
 
 	return a;
 
-err_free:
-	kfree(a);
 err_mod:
 	module_put(a_o->owner);
 err_out:
@@ -647,7 +630,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
 {
 	int err = 0;
 	struct gnet_dump d;
-	struct tcf_common *p = a->priv;
+	struct tcf_common *p = (struct tcf_common *)a;
 
 	if (p == NULL)
 		goto errout;
@@ -740,24 +723,11 @@ act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
 	return rtnl_unicast(skb, net, portid);
 }
 
-static struct tc_action *create_a(int i)
-{
-	struct tc_action *act;
-
-	act = kzalloc(sizeof(*act), GFP_KERNEL);
-	if (act == NULL) {
-		pr_debug("create_a: failed to alloc!\n");
-		return NULL;
-	}
-	act->order = i;
-	INIT_LIST_HEAD(&act->list);
-	return act;
-}
-
 static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
 					  struct nlmsghdr *n, u32 portid)
 {
 	struct nlattr *tb[TCA_ACT_MAX + 1];
+	const struct tc_action_ops *ops;
 	struct tc_action *a;
 	int index;
 	int err;
@@ -772,26 +742,19 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
 		goto err_out;
 	index = nla_get_u32(tb[TCA_ACT_INDEX]);
 
-	err = -ENOMEM;
-	a = create_a(0);
-	if (a == NULL)
-		goto err_out;
-
 	err = -EINVAL;
-	a->ops = tc_lookup_action(tb[TCA_ACT_KIND]);
-	if (a->ops == NULL) /* could happen in batch of actions */
-		goto err_free;
+	ops = tc_lookup_action(tb[TCA_ACT_KIND]);
+	if (!ops) /* could happen in batch of actions */
+		goto err_out;
 	err = -ENOENT;
-	if (a->ops->lookup(net, a, index) == 0)
+	if (ops->lookup(net, &a, index) == 0)
 		goto err_mod;
 
-	module_put(a->ops->owner);
+	module_put(ops->owner);
 	return a;
 
 err_mod:
-	module_put(a->ops->owner);
-err_free:
-	kfree(a);
+	module_put(ops->owner);
 err_out:
 	return ERR_PTR(err);
 }
@@ -816,8 +779,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 	struct netlink_callback dcb;
 	struct nlattr *nest;
 	struct nlattr *tb[TCA_ACT_MAX + 1];
+	const struct tc_action_ops *ops;
 	struct nlattr *kind;
-	struct tc_action a;
 	int err = -ENOMEM;
 
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
@@ -834,10 +797,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 
 	err = -EINVAL;
 	kind = tb[TCA_ACT_KIND];
-	memset(&a, 0, sizeof(struct tc_action));
-	INIT_LIST_HEAD(&a.list);
-	a.ops = tc_lookup_action(kind);
-	if (a.ops == NULL) /*some idjot trying to flush unknown action */
+	ops = tc_lookup_action(kind);
+	if (!ops) /*some idjot trying to flush unknown action */
 		goto err_out;
 
 	nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION,
@@ -853,7 +814,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 	if (nest == NULL)
 		goto out_module_put;
 
-	err = a.ops->walk(net, skb, &dcb, RTM_DELACTION, &a);
+	err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops);
 	if (err < 0)
 		goto out_module_put;
 	if (err == 0)
@@ -863,7 +824,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 	nlh->nlmsg_flags |= NLM_F_ROOT;
-	module_put(a.ops->owner);
+	module_put(ops->owner);
 	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 			     n->nlmsg_flags & NLM_F_ECHO);
 	if (err > 0)
@@ -872,7 +833,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 	return err;
 
 out_module_put:
-	module_put(a.ops->owner);
+	module_put(ops->owner);
 err_out:
 noflush_out:
 	kfree_skb(skb);
@@ -1084,7 +1045,6 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	unsigned char *b = skb_tail_pointer(skb);
 	struct nlattr *nest;
 	struct tc_action_ops *a_o;
-	struct tc_action a;
 	int ret = 0;
 	struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
 	struct nlattr *kind = find_dump_kind(cb->nlh);
@@ -1098,9 +1058,6 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	if (a_o == NULL)
 		return 0;
 
-	memset(&a, 0, sizeof(struct tc_action));
-	a.ops = a_o;
-
 	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
 			cb->nlh->nlmsg_type, sizeof(*t), 0);
 	if (!nlh)
@@ -1114,7 +1071,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	if (nest == NULL)
 		goto out_module_put;
 
-	ret = a_o->walk(net, skb, cb, RTM_GETACTION, &a);
+	ret = a_o->walk(net, skb, cb, RTM_GETACTION, a_o);
 	if (ret < 0)
 		goto out_module_put;
 
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index ef74bffa6101..bfa870731e74 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -34,11 +34,12 @@ struct tcf_bpf_cfg {
 };
 
 static int bpf_net_id;
+static struct tc_action_ops act_bpf_ops;
 
 static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 		   struct tcf_result *res)
 {
-	struct tcf_bpf *prog = act->priv;
+	struct tcf_bpf *prog = to_bpf(act);
 	struct bpf_prog *filter;
 	int action, filter_res;
 	bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS;
@@ -134,7 +135,7 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act,
 			int bind, int ref)
 {
 	unsigned char *tp = skb_tail_pointer(skb);
-	struct tcf_bpf *prog = act->priv;
+	struct tcf_bpf *prog = to_bpf(act);
 	struct tc_act_bpf opt = {
 		.index   = prog->tcf_index,
 		.refcnt  = prog->tcf_refcnt - ref,
@@ -270,7 +271,7 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
 }
 
 static int tcf_bpf_init(struct net *net, struct nlattr *nla,
-			struct nlattr *est, struct tc_action *act,
+			struct nlattr *est, struct tc_action **act,
 			int replace, int bind)
 {
 	struct tc_action_net *tn = net_generic(net, bpf_net_id);
@@ -295,7 +296,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 
 	if (!tcf_hash_check(tn, parm->index, act, bind)) {
 		ret = tcf_hash_create(tn, parm->index, est, act,
-				      sizeof(*prog), bind, true);
+				      &act_bpf_ops, bind, true);
 		if (ret < 0)
 			return ret;
 
@@ -305,7 +306,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 		if (bind)
 			return 0;
 
-		tcf_hash_release(act, bind);
+		tcf_hash_release(*act, bind);
 		if (!replace)
 			return -EEXIST;
 	}
@@ -325,7 +326,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 	if (ret < 0)
 		goto out;
 
-	prog = to_bpf(act);
+	prog = to_bpf(*act);
 	ASSERT_RTNL();
 
 	if (res != ACT_P_CREATED)
@@ -343,7 +344,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 	rcu_assign_pointer(prog->filter, cfg.filter);
 
 	if (res == ACT_P_CREATED) {
-		tcf_hash_insert(tn, act);
+		tcf_hash_insert(tn, *act);
 	} else {
 		/* make sure the program being replaced is no longer executing */
 		synchronize_rcu();
@@ -353,7 +354,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 	return res;
 out:
 	if (res == ACT_P_CREATED)
-		tcf_hash_cleanup(act, est);
+		tcf_hash_cleanup(*act, est);
 
 	return ret;
 }
@@ -362,20 +363,20 @@ static void tcf_bpf_cleanup(struct tc_action *act, int bind)
 {
 	struct tcf_bpf_cfg tmp;
 
-	tcf_bpf_prog_fill_cfg(act->priv, &tmp);
+	tcf_bpf_prog_fill_cfg(to_bpf(act), &tmp);
 	tcf_bpf_cfg_cleanup(&tmp);
 }
 
 static int tcf_bpf_walker(struct net *net, struct sk_buff *skb,
 			  struct netlink_callback *cb, int type,
-			  struct tc_action *a)
+			  const struct tc_action_ops *ops)
 {
 	struct tc_action_net *tn = net_generic(net, bpf_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, a);
+	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_bpf_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, bpf_net_id);
 
@@ -392,6 +393,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = {
 	.init		=	tcf_bpf_init,
 	.walk		=	tcf_bpf_walker,
 	.lookup		=	tcf_bpf_search,
+	.size		=	sizeof(struct tcf_bpf),
 };
 
 static __net_init int bpf_init_net(struct net *net)
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 35a5270f289d..eae07a2e774d 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -31,6 +31,7 @@
 #define CONNMARK_TAB_MASK     3
 
 static int connmark_net_id;
+static struct tc_action_ops act_connmark_ops;
 
 static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
 			struct tcf_result *res)
@@ -38,7 +39,7 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
 	const struct nf_conntrack_tuple_hash *thash;
 	struct nf_conntrack_tuple tuple;
 	enum ip_conntrack_info ctinfo;
-	struct tcf_connmark_info *ca = a->priv;
+	struct tcf_connmark_info *ca = to_connmark(a);
 	struct nf_conntrack_zone zone;
 	struct nf_conn *c;
 	int proto;
@@ -96,7 +97,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
 };
 
 static int tcf_connmark_init(struct net *net, struct nlattr *nla,
-			     struct nlattr *est, struct tc_action *a,
+			     struct nlattr *est, struct tc_action **a,
 			     int ovr, int bind)
 {
 	struct tc_action_net *tn = net_generic(net, connmark_net_id);
@@ -116,22 +117,22 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 
 	if (!tcf_hash_check(tn, parm->index, a, bind)) {
 		ret = tcf_hash_create(tn, parm->index, est, a,
-				      sizeof(*ci), bind, false);
+				      &act_connmark_ops, bind, false);
 		if (ret)
 			return ret;
 
-		ci = to_connmark(a);
+		ci = to_connmark(*a);
 		ci->tcf_action = parm->action;
 		ci->net = net;
 		ci->zone = parm->zone;
 
-		tcf_hash_insert(tn, a);
+		tcf_hash_insert(tn, *a);
 		ret = ACT_P_CREATED;
 	} else {
-		ci = to_connmark(a);
+		ci = to_connmark(*a);
 		if (bind)
 			return 0;
-		tcf_hash_release(a, bind);
+		tcf_hash_release(*a, bind);
 		if (!ovr)
 			return -EEXIST;
 		/* replacing action and zone */
@@ -146,7 +147,7 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
 				    int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_connmark_info *ci = a->priv;
+	struct tcf_connmark_info *ci = to_connmark(a);
 
 	struct tc_connmark opt = {
 		.index   = ci->tcf_index,
@@ -173,14 +174,14 @@ nla_put_failure:
 
 static int tcf_connmark_walker(struct net *net, struct sk_buff *skb,
 			       struct netlink_callback *cb, int type,
-			       struct tc_action *a)
+			       const struct tc_action_ops *ops)
 {
 	struct tc_action_net *tn = net_generic(net, connmark_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, a);
+	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_connmark_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, connmark_net_id);
 
@@ -196,6 +197,7 @@ static struct tc_action_ops act_connmark_ops = {
 	.init		=	tcf_connmark_init,
 	.walk		=	tcf_connmark_walker,
 	.lookup		=	tcf_connmark_search,
+	.size		=	sizeof(struct tcf_connmark_info),
 };
 
 static __net_init int connmark_init_net(struct net *net)
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index dcd9ababd351..b5dbf633a863 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -43,9 +43,10 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
 };
 
 static int csum_net_id;
+static struct tc_action_ops act_csum_ops;
 
 static int tcf_csum_init(struct net *net, struct nlattr *nla,
-			 struct nlattr *est, struct tc_action *a, int ovr,
+			 struct nlattr *est, struct tc_action **a, int ovr,
 			 int bind)
 {
 	struct tc_action_net *tn = net_generic(net, csum_net_id);
@@ -67,26 +68,26 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
 
 	if (!tcf_hash_check(tn, parm->index, a, bind)) {
 		ret = tcf_hash_create(tn, parm->index, est, a,
-				      sizeof(*p), bind, false);
+				      &act_csum_ops, bind, false);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
 	} else {
 		if (bind)/* dont override defaults */
 			return 0;
-		tcf_hash_release(a, bind);
+		tcf_hash_release(*a, bind);
 		if (!ovr)
 			return -EEXIST;
 	}
 
-	p = to_tcf_csum(a);
+	p = to_tcf_csum(*a);
 	spin_lock_bh(&p->tcf_lock);
 	p->tcf_action = parm->action;
 	p->update_flags = parm->update_flags;
 	spin_unlock_bh(&p->tcf_lock);
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(tn, a);
+		tcf_hash_insert(tn, *a);
 
 	return ret;
 }
@@ -496,7 +497,7 @@ fail:
 static int tcf_csum(struct sk_buff *skb,
 		    const struct tc_action *a, struct tcf_result *res)
 {
-	struct tcf_csum *p = a->priv;
+	struct tcf_csum *p = to_tcf_csum(a);
 	int action;
 	u32 update_flags;
 
@@ -534,7 +535,7 @@ static int tcf_csum_dump(struct sk_buff *skb,
 			 struct tc_action *a, int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_csum *p = a->priv;
+	struct tcf_csum *p = to_tcf_csum(a);
 	struct tc_csum opt = {
 		.update_flags = p->update_flags,
 		.index   = p->tcf_index,
@@ -560,14 +561,14 @@ nla_put_failure:
 
 static int tcf_csum_walker(struct net *net, struct sk_buff *skb,
 			   struct netlink_callback *cb, int type,
-			   struct tc_action *a)
+			   const struct tc_action_ops *ops)
 {
 	struct tc_action_net *tn = net_generic(net, csum_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, a);
+	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_csum_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, csum_net_id);
 
@@ -583,6 +584,7 @@ static struct tc_action_ops act_csum_ops = {
 	.init		= tcf_csum_init,
 	.walk		= tcf_csum_walker,
 	.lookup		= tcf_csum_search,
+	.size		= sizeof(struct tcf_csum),
 };
 
 static __net_init int csum_init_net(struct net *net)
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 19058a7f3e5c..e24a4093d6f6 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -26,6 +26,7 @@
 #define GACT_TAB_MASK	15
 
 static int gact_net_id;
+static struct tc_action_ops act_gact_ops;
 
 #ifdef CONFIG_GACT_PROB
 static int gact_net_rand(struct tcf_gact *gact)
@@ -56,7 +57,7 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
 };
 
 static int tcf_gact_init(struct net *net, struct nlattr *nla,
-			 struct nlattr *est, struct tc_action *a,
+			 struct nlattr *est, struct tc_action **a,
 			 int ovr, int bind)
 {
 	struct tc_action_net *tn = net_generic(net, gact_net_id);
@@ -93,19 +94,19 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
 
 	if (!tcf_hash_check(tn, parm->index, a, bind)) {
 		ret = tcf_hash_create(tn, parm->index, est, a,
-				      sizeof(*gact), bind, true);
+				      &act_gact_ops, bind, true);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
 	} else {
 		if (bind)/* dont override defaults */
 			return 0;
-		tcf_hash_release(a, bind);
+		tcf_hash_release(*a, bind);
 		if (!ovr)
 			return -EEXIST;
 	}
 
-	gact = to_gact(a);
+	gact = to_gact(*a);
 
 	ASSERT_RTNL();
 	gact->tcf_action = parm->action;
@@ -121,14 +122,14 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
 	}
 #endif
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(tn, a);
+		tcf_hash_insert(tn, *a);
 	return ret;
 }
 
 static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
 		    struct tcf_result *res)
 {
-	struct tcf_gact *gact = a->priv;
+	struct tcf_gact *gact = to_gact(a);
 	int action = READ_ONCE(gact->tcf_action);
 
 #ifdef CONFIG_GACT_PROB
@@ -151,7 +152,7 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
 static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
 				  u64 lastuse)
 {
-	struct tcf_gact *gact = a->priv;
+	struct tcf_gact *gact = to_gact(a);
 	int action = READ_ONCE(gact->tcf_action);
 	struct tcf_t *tm = &gact->tcf_tm;
 
@@ -166,7 +167,7 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a,
 			 int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_gact *gact = a->priv;
+	struct tcf_gact *gact = to_gact(a);
 	struct tc_gact opt = {
 		.index   = gact->tcf_index,
 		.refcnt  = gact->tcf_refcnt - ref,
@@ -201,14 +202,14 @@ nla_put_failure:
 
 static int tcf_gact_walker(struct net *net, struct sk_buff *skb,
 			   struct netlink_callback *cb, int type,
-			   struct tc_action *a)
+			   const struct tc_action_ops *ops)
 {
 	struct tc_action_net *tn = net_generic(net, gact_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, a);
+	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_gact_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, gact_net_id);
 
@@ -225,6 +226,7 @@ static struct tc_action_ops act_gact_ops = {
 	.init		=	tcf_gact_init,
 	.walk		=	tcf_gact_walker,
 	.lookup		=	tcf_gact_search,
+	.size		=	sizeof(struct tcf_gact),
 };
 
 static __net_init int gact_init_net(struct net *net)
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 845ab5119c05..141a06eeb1e5 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -37,6 +37,7 @@
 
 static int ife_net_id;
 static int max_metacnt = IFE_META_MAX + 1;
+static struct tc_action_ops act_ife_ops;
 
 static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = {
 	[TCA_IFE_PARMS] = { .len = sizeof(struct tc_ife)},
@@ -364,7 +365,7 @@ out_nlmsg_trim:
 /* under ife->tcf_lock */
 static void _tcf_ife_cleanup(struct tc_action *a, int bind)
 {
-	struct tcf_ife_info *ife = a->priv;
+	struct tcf_ife_info *ife = to_ife(a);
 	struct tcf_meta_info *e, *n;
 
 	list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
@@ -382,7 +383,7 @@ static void _tcf_ife_cleanup(struct tc_action *a, int bind)
 
 static void tcf_ife_cleanup(struct tc_action *a, int bind)
 {
-	struct tcf_ife_info *ife = a->priv;
+	struct tcf_ife_info *ife = to_ife(a);
 
 	spin_lock_bh(&ife->tcf_lock);
 	_tcf_ife_cleanup(a, bind);
@@ -417,7 +418,7 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
 }
 
 static int tcf_ife_init(struct net *net, struct nlattr *nla,
-			struct nlattr *est, struct tc_action *a,
+			struct nlattr *est, struct tc_action **a,
 			int ovr, int bind)
 {
 	struct tc_action_net *tn = net_generic(net, ife_net_id);
@@ -451,25 +452,25 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
 		**/
 		if (!tb[TCA_IFE_TYPE]) {
 			if (exists)
-				tcf_hash_release(a, bind);
+				tcf_hash_release(*a, bind);
 			pr_info("You MUST pass etherype for encoding\n");
 			return -EINVAL;
 		}
 	}
 
 	if (!exists) {
-		ret = tcf_hash_create(tn, parm->index, est, a, sizeof(*ife),
+		ret = tcf_hash_create(tn, parm->index, est, a, &act_ife_ops,
 				      bind, false);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
 	} else {
-		tcf_hash_release(a, bind);
+		tcf_hash_release(*a, bind);
 		if (!ovr)
 			return -EEXIST;
 	}
 
-	ife = to_ife(a);
+	ife = to_ife(*a);
 	ife->flags = parm->flags;
 
 	if (parm->flags & IFE_ENCODE) {
@@ -507,9 +508,9 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
 		if (err) {
 metadata_parse_err:
 			if (exists)
-				tcf_hash_release(a, bind);
+				tcf_hash_release(*a, bind);
 			if (ret == ACT_P_CREATED)
-				_tcf_ife_cleanup(a, bind);
+				_tcf_ife_cleanup(*a, bind);
 
 			if (exists)
 				spin_unlock_bh(&ife->tcf_lock);
@@ -529,7 +530,7 @@ metadata_parse_err:
 		err = use_all_metadata(ife);
 		if (err) {
 			if (ret == ACT_P_CREATED)
-				_tcf_ife_cleanup(a, bind);
+				_tcf_ife_cleanup(*a, bind);
 
 			if (exists)
 				spin_unlock_bh(&ife->tcf_lock);
@@ -541,7 +542,7 @@ metadata_parse_err:
 		spin_unlock_bh(&ife->tcf_lock);
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(tn, a);
+		tcf_hash_insert(tn, *a);
 
 	return ret;
 }
@@ -550,7 +551,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
 			int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_ife_info *ife = a->priv;
+	struct tcf_ife_info *ife = to_ife(a);
 	struct tc_ife opt = {
 		.index = ife->tcf_index,
 		.refcnt = ife->tcf_refcnt - ref,
@@ -623,7 +624,7 @@ struct meta_tlvhdr {
 static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
 			  struct tcf_result *res)
 {
-	struct tcf_ife_info *ife = a->priv;
+	struct tcf_ife_info *ife = to_ife(a);
 	int action = ife->tcf_action;
 	struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data;
 	u16 ifehdrln = ifehdr->metalen;
@@ -695,7 +696,7 @@ static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife)
 static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 			  struct tcf_result *res)
 {
-	struct tcf_ife_info *ife = a->priv;
+	struct tcf_ife_info *ife = to_ife(a);
 	int action = ife->tcf_action;
 	struct ethhdr *oethh;	/* outer ether header */
 	struct ethhdr *iethh;	/* inner eth header */
@@ -799,7 +800,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
 		       struct tcf_result *res)
 {
-	struct tcf_ife_info *ife = a->priv;
+	struct tcf_ife_info *ife = to_ife(a);
 
 	if (ife->flags & IFE_ENCODE)
 		return tcf_ife_encode(skb, a, res);
@@ -819,14 +820,14 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
 
 static int tcf_ife_walker(struct net *net, struct sk_buff *skb,
 			  struct netlink_callback *cb, int type,
-			  struct tc_action *a)
+			  const struct tc_action_ops *ops)
 {
 	struct tc_action_net *tn = net_generic(net, ife_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, a);
+	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_ife_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, ife_net_id);
 
@@ -843,6 +844,7 @@ static struct tc_action_ops act_ife_ops = {
 	.init = tcf_ife_init,
 	.walk = tcf_ife_walker,
 	.lookup = tcf_ife_search,
+	.size =	sizeof(struct tcf_ife_info),
 };
 
 static __net_init int ife_init_net(struct net *net)
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index b8c50600697a..378c1c976058 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -31,8 +31,10 @@
 #define IPT_TAB_MASK     15
 
 static int ipt_net_id;
+static struct tc_action_ops act_ipt_ops;
 
 static int xt_net_id;
+static struct tc_action_ops act_xt_ops;
 
 static int ipt_init_target(struct xt_entry_target *t, char *table,
 			   unsigned int hook)
@@ -90,8 +92,8 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
 };
 
 static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
-			  struct nlattr *est, struct tc_action *a, int ovr,
-			  int bind)
+			  struct nlattr *est, struct tc_action **a,
+			  const struct tc_action_ops *ops, int ovr, int bind)
 {
 	struct nlattr *tb[TCA_IPT_MAX + 1];
 	struct tcf_ipt *ipt;
@@ -118,19 +120,19 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
 
 	if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) {
 		if (exists)
-			tcf_hash_release(a, bind);
+			tcf_hash_release(*a, bind);
 		return -EINVAL;
 	}
 
 	td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
 	if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) {
 		if (exists)
-			tcf_hash_release(a, bind);
+			tcf_hash_release(*a, bind);
 		return -EINVAL;
 	}
 
 	if (!exists) {
-		ret = tcf_hash_create(tn, index, est, a, sizeof(*ipt), bind,
+		ret = tcf_hash_create(tn, index, est, a, ops, bind,
 				      false);
 		if (ret)
 			return ret;
@@ -138,13 +140,11 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
 	} else {
 		if (bind)/* dont override defaults */
 			return 0;
-		tcf_hash_release(a, bind);
+		tcf_hash_release(*a, bind);
 
 		if (!ovr)
 			return -EEXIST;
 	}
-	ipt = to_ipt(a);
-
 	hook = nla_get_u32(tb[TCA_IPT_HOOK]);
 
 	err = -ENOMEM;
@@ -163,6 +163,8 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
 	if (err < 0)
 		goto err3;
 
+	ipt = to_ipt(*a);
+
 	spin_lock_bh(&ipt->tcf_lock);
 	if (ret != ACT_P_CREATED) {
 		ipt_destroy_target(ipt->tcfi_t);
@@ -174,7 +176,7 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
 	ipt->tcfi_hook  = hook;
 	spin_unlock_bh(&ipt->tcf_lock);
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(tn, a);
+		tcf_hash_insert(tn, *a);
 	return ret;
 
 err3:
@@ -183,33 +185,33 @@ err2:
 	kfree(tname);
 err1:
 	if (ret == ACT_P_CREATED)
-		tcf_hash_cleanup(a, est);
+		tcf_hash_cleanup(*a, est);
 	return err;
 }
 
 static int tcf_ipt_init(struct net *net, struct nlattr *nla,
-			struct nlattr *est, struct tc_action *a, int ovr,
+			struct nlattr *est, struct tc_action **a, int ovr,
 			int bind)
 {
 	struct tc_action_net *tn = net_generic(net, ipt_net_id);
 
-	return __tcf_ipt_init(tn, nla, est, a, ovr, bind);
+	return __tcf_ipt_init(tn, nla, est, a, &act_ipt_ops, ovr, bind);
 }
 
 static int tcf_xt_init(struct net *net, struct nlattr *nla,
-		       struct nlattr *est, struct tc_action *a, int ovr,
+		       struct nlattr *est, struct tc_action **a, int ovr,
 		       int bind)
 {
 	struct tc_action_net *tn = net_generic(net, xt_net_id);
 
-	return __tcf_ipt_init(tn, nla, est, a, ovr, bind);
+	return __tcf_ipt_init(tn, nla, est, a, &act_xt_ops, ovr, bind);
 }
 
 static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
 		   struct tcf_result *res)
 {
 	int ret = 0, result = 0;
-	struct tcf_ipt *ipt = a->priv;
+	struct tcf_ipt *ipt = to_ipt(a);
 	struct xt_action_param par;
 
 	if (skb_unclone(skb, GFP_ATOMIC))
@@ -259,7 +261,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind,
 			int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_ipt *ipt = a->priv;
+	struct tcf_ipt *ipt = to_ipt(a);
 	struct xt_entry_target *t;
 	struct tcf_t tm;
 	struct tc_cnt c;
@@ -299,14 +301,14 @@ nla_put_failure:
 
 static int tcf_ipt_walker(struct net *net, struct sk_buff *skb,
 			  struct netlink_callback *cb, int type,
-			  struct tc_action *a)
+			  const struct tc_action_ops *ops)
 {
 	struct tc_action_net *tn = net_generic(net, ipt_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, a);
+	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_ipt_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, ipt_net_id);
 
@@ -323,6 +325,7 @@ static struct tc_action_ops act_ipt_ops = {
 	.init		=	tcf_ipt_init,
 	.walk		=	tcf_ipt_walker,
 	.lookup		=	tcf_ipt_search,
+	.size		=	sizeof(struct tcf_ipt),
 };
 
 static __net_init int ipt_init_net(struct net *net)
@@ -348,14 +351,14 @@ static struct pernet_operations ipt_net_ops = {
 
 static int tcf_xt_walker(struct net *net, struct sk_buff *skb,
 			 struct netlink_callback *cb, int type,
-			 struct tc_action *a)
+			 const struct tc_action_ops *ops)
 {
 	struct tc_action_net *tn = net_generic(net, xt_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, a);
+	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_xt_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, xt_net_id);
 
@@ -372,6 +375,7 @@ static struct tc_action_ops act_xt_ops = {
 	.init		=	tcf_xt_init,
 	.walk		=	tcf_xt_walker,
 	.lookup		=	tcf_xt_search,
+	.size		=	sizeof(struct tcf_ipt),
 };
 
 static __net_init int xt_init_net(struct net *net)
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 70cfbbf96af2..6038c85d92f5 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -52,9 +52,10 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
 };
 
 static int mirred_net_id;
+static struct tc_action_ops act_mirred_ops;
 
 static int tcf_mirred_init(struct net *net, struct nlattr *nla,
-			   struct nlattr *est, struct tc_action *a, int ovr,
+			   struct nlattr *est, struct tc_action **a, int ovr,
 			   int bind)
 {
 	struct tc_action_net *tn = net_generic(net, mirred_net_id);
@@ -84,14 +85,14 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 		break;
 	default:
 		if (exists)
-			tcf_hash_release(a, bind);
+			tcf_hash_release(*a, bind);
 		return -EINVAL;
 	}
 	if (parm->ifindex) {
 		dev = __dev_get_by_index(net, parm->ifindex);
 		if (dev == NULL) {
 			if (exists)
-				tcf_hash_release(a, bind);
+				tcf_hash_release(*a, bind);
 			return -ENODEV;
 		}
 		switch (dev->type) {
@@ -115,16 +116,16 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 		if (dev == NULL)
 			return -EINVAL;
 		ret = tcf_hash_create(tn, parm->index, est, a,
-				      sizeof(*m), bind, true);
+				      &act_mirred_ops, bind, true);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
 	} else {
-		tcf_hash_release(a, bind);
+		tcf_hash_release(*a, bind);
 		if (!ovr)
 			return -EEXIST;
 	}
-	m = to_mirred(a);
+	m = to_mirred(*a);
 
 	ASSERT_RTNL();
 	m->tcf_action = parm->action;
@@ -142,7 +143,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 		spin_lock_bh(&mirred_list_lock);
 		list_add(&m->tcfm_list, &mirred_list);
 		spin_unlock_bh(&mirred_list_lock);
-		tcf_hash_insert(tn, a);
+		tcf_hash_insert(tn, *a);
 	}
 
 	return ret;
@@ -151,7 +152,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 		      struct tcf_result *res)
 {
-	struct tcf_mirred *m = a->priv;
+	struct tcf_mirred *m = to_mirred(a);
 	struct net_device *dev;
 	struct sk_buff *skb2;
 	int retval, err;
@@ -206,7 +207,7 @@ out:
 static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_mirred *m = a->priv;
+	struct tcf_mirred *m = to_mirred(a);
 	struct tc_mirred opt = {
 		.index   = m->tcf_index,
 		.action  = m->tcf_action,
@@ -232,14 +233,14 @@ nla_put_failure:
 
 static int tcf_mirred_walker(struct net *net, struct sk_buff *skb,
 			     struct netlink_callback *cb, int type,
-			     struct tc_action *a)
+			     const struct tc_action_ops *ops)
 {
 	struct tc_action_net *tn = net_generic(net, mirred_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, a);
+	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_mirred_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, mirred_net_id);
 
@@ -284,6 +285,7 @@ static struct tc_action_ops act_mirred_ops = {
 	.init		=	tcf_mirred_init,
 	.walk		=	tcf_mirred_walker,
 	.lookup		=	tcf_mirred_search,
+	.size		=	sizeof(struct tcf_mirred),
 };
 
 static __net_init int mirred_init_net(struct net *net)
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 06ccb03f25da..8e8b0cc30704 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -32,13 +32,14 @@
 #define NAT_TAB_MASK	15
 
 static int nat_net_id;
+static struct tc_action_ops act_nat_ops;
 
 static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
 	[TCA_NAT_PARMS]	= { .len = sizeof(struct tc_nat) },
 };
 
 static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
-			struct tc_action *a, int ovr, int bind)
+			struct tc_action **a, int ovr, int bind)
 {
 	struct tc_action_net *tn = net_generic(net, nat_net_id);
 	struct nlattr *tb[TCA_NAT_MAX + 1];
@@ -59,18 +60,18 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 
 	if (!tcf_hash_check(tn, parm->index, a, bind)) {
 		ret = tcf_hash_create(tn, parm->index, est, a,
-				      sizeof(*p), bind, false);
+				      &act_nat_ops, bind, false);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
 	} else {
 		if (bind)
 			return 0;
-		tcf_hash_release(a, bind);
+		tcf_hash_release(*a, bind);
 		if (!ovr)
 			return -EEXIST;
 	}
-	p = to_tcf_nat(a);
+	p = to_tcf_nat(*a);
 
 	spin_lock_bh(&p->tcf_lock);
 	p->old_addr = parm->old_addr;
@@ -82,7 +83,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 	spin_unlock_bh(&p->tcf_lock);
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(tn, a);
+		tcf_hash_insert(tn, *a);
 
 	return ret;
 }
@@ -90,7 +91,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
 		   struct tcf_result *res)
 {
-	struct tcf_nat *p = a->priv;
+	struct tcf_nat *p = to_tcf_nat(a);
 	struct iphdr *iph;
 	__be32 old_addr;
 	__be32 new_addr;
@@ -248,7 +249,7 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
 			int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_nat *p = a->priv;
+	struct tcf_nat *p = to_tcf_nat(a);
 	struct tc_nat opt = {
 		.old_addr = p->old_addr,
 		.new_addr = p->new_addr,
@@ -278,14 +279,14 @@ nla_put_failure:
 
 static int tcf_nat_walker(struct net *net, struct sk_buff *skb,
 			  struct netlink_callback *cb, int type,
-			  struct tc_action *a)
+			  const struct tc_action_ops *ops)
 {
 	struct tc_action_net *tn = net_generic(net, nat_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, a);
+	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_nat_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, nat_net_id);
 
@@ -301,6 +302,7 @@ static struct tc_action_ops act_nat_ops = {
 	.init		=	tcf_nat_init,
 	.walk		=	tcf_nat_walker,
 	.lookup		=	tcf_nat_search,
+	.size		=	sizeof(struct tcf_nat),
 };
 
 static __net_init int nat_init_net(struct net *net)
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 82d3c1479029..b54d56d4959b 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -26,13 +26,14 @@
 #define PEDIT_TAB_MASK	15
 
 static int pedit_net_id;
+static struct tc_action_ops act_pedit_ops;
 
 static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
 	[TCA_PEDIT_PARMS]	= { .len = sizeof(struct tc_pedit) },
 };
 
 static int tcf_pedit_init(struct net *net, struct nlattr *nla,
-			  struct nlattr *est, struct tc_action *a,
+			  struct nlattr *est, struct tc_action **a,
 			  int ovr, int bind)
 {
 	struct tc_action_net *tn = net_generic(net, pedit_net_id);
@@ -61,23 +62,23 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 		if (!parm->nkeys)
 			return -EINVAL;
 		ret = tcf_hash_create(tn, parm->index, est, a,
-				      sizeof(*p), bind, false);
+				      &act_pedit_ops, bind, false);
 		if (ret)
 			return ret;
-		p = to_pedit(a);
+		p = to_pedit(*a);
 		keys = kmalloc(ksize, GFP_KERNEL);
 		if (keys == NULL) {
-			tcf_hash_cleanup(a, est);
+			tcf_hash_cleanup(*a, est);
 			return -ENOMEM;
 		}
 		ret = ACT_P_CREATED;
 	} else {
 		if (bind)
 			return 0;
-		tcf_hash_release(a, bind);
+		tcf_hash_release(*a, bind);
 		if (!ovr)
 			return -EEXIST;
-		p = to_pedit(a);
+		p = to_pedit(*a);
 		if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
 			keys = kmalloc(ksize, GFP_KERNEL);
 			if (keys == NULL)
@@ -96,13 +97,13 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 	memcpy(p->tcfp_keys, parm->keys, ksize);
 	spin_unlock_bh(&p->tcf_lock);
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(tn, a);
+		tcf_hash_insert(tn, *a);
 	return ret;
 }
 
 static void tcf_pedit_cleanup(struct tc_action *a, int bind)
 {
-	struct tcf_pedit *p = a->priv;
+	struct tcf_pedit *p = to_pedit(a);
 	struct tc_pedit_key *keys = p->tcfp_keys;
 	kfree(keys);
 }
@@ -110,7 +111,7 @@ static void tcf_pedit_cleanup(struct tc_action *a, int bind)
 static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 		     struct tcf_result *res)
 {
-	struct tcf_pedit *p = a->priv;
+	struct tcf_pedit *p = to_pedit(a);
 	int i;
 	unsigned int off;
 
@@ -177,7 +178,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
 			  int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_pedit *p = a->priv;
+	struct tcf_pedit *p = to_pedit(a);
 	struct tc_pedit *opt;
 	struct tcf_t t;
 	int s;
@@ -216,14 +217,14 @@ nla_put_failure:
 
 static int tcf_pedit_walker(struct net *net, struct sk_buff *skb,
 			    struct netlink_callback *cb, int type,
-			    struct tc_action *a)
+			    const struct tc_action_ops *ops)
 {
 	struct tc_action_net *tn = net_generic(net, pedit_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, a);
+	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_pedit_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, pedit_net_id);
 
@@ -240,6 +241,7 @@ static struct tc_action_ops act_pedit_ops = {
 	.init		=	tcf_pedit_init,
 	.walk		=	tcf_pedit_walker,
 	.lookup		=	tcf_pedit_search,
+	.size		=	sizeof(struct tcf_pedit),
 };
 
 static __net_init int pedit_init_net(struct net *net)
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 1e8ede3955f4..123794af55c3 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -37,8 +37,8 @@ struct tcf_police {
 	struct psched_ratecfg	peak;
 	bool			peak_present;
 };
-#define to_police(pc)	\
-	container_of(pc->priv, struct tcf_police, common)
+
+#define to_police(pc) ((struct tcf_police *)pc)
 
 #define POL_TAB_MASK     15
 
@@ -56,15 +56,14 @@ struct tc_police_compat {
 /* Each policer is serialized by its individual spinlock */
 
 static int police_net_id;
+static struct tc_action_ops act_police_ops;
 
 static int tcf_act_police_walker(struct net *net, struct sk_buff *skb,
 				 struct netlink_callback *cb, int type,
-				 struct tc_action *a)
+				 const struct tc_action_ops *ops)
 {
 	struct tc_action_net *tn = net_generic(net, police_net_id);
 	struct tcf_hashinfo *hinfo = tn->hinfo;
-	struct hlist_head *head;
-	struct tcf_common *p;
 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
 	struct nlattr *nest;
 
@@ -73,21 +72,22 @@ static int tcf_act_police_walker(struct net *net, struct sk_buff *skb,
 	s_i = cb->args[0];
 
 	for (i = 0; i < (POL_TAB_MASK + 1); i++) {
+		struct hlist_head *head;
+		struct tcf_common *p;
+
 		head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)];
 
 		hlist_for_each_entry_rcu(p, head, tcfc_head) {
 			index++;
 			if (index < s_i)
 				continue;
-			a->priv = p;
-			a->order = index;
-			nest = nla_nest_start(skb, a->order);
+			nest = nla_nest_start(skb, index);
 			if (nest == NULL)
 				goto nla_put_failure;
 			if (type == RTM_DELACTION)
-				err = tcf_action_dump_1(skb, a, 0, 1);
+				err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 1);
 			else
-				err = tcf_action_dump_1(skb, a, 0, 0);
+				err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 0);
 			if (err < 0) {
 				index--;
 				nla_nest_cancel(skb, nest);
@@ -116,7 +116,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
 };
 
 static int tcf_act_police_init(struct net *net, struct nlattr *nla,
-			       struct nlattr *est, struct tc_action *a,
+			       struct nlattr *est, struct tc_action **a,
 			       int ovr, int bind)
 {
 	int ret = 0, err;
@@ -142,13 +142,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
 	parm = nla_data(tb[TCA_POLICE_TBF]);
 
 	if (parm->index) {
-		if (tcf_hash_search(tn, a, parm->index)) {
-			police = to_police(a);
-			if (bind) {
-				police->tcf_bindcnt += 1;
-				police->tcf_refcnt += 1;
-				return 0;
-			}
+		if (tcf_hash_check(tn, parm->index, a, bind)) {
 			if (ovr)
 				goto override;
 			/* not replacing */
@@ -156,14 +150,14 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
 		}
 	} else {
 		ret = tcf_hash_create(tn, parm->index, NULL, a,
-				      sizeof(*police), bind, false);
+				      &act_police_ops, bind, false);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
 	}
 
-	police = to_police(a);
 override:
+	police = to_police(*a);
 	if (parm->rate.rate) {
 		err = -ENOMEM;
 		R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE]);
@@ -235,7 +229,7 @@ override:
 		return ret;
 
 	police->tcfp_t_c = ktime_get_ns();
-	tcf_hash_insert(tn, a);
+	tcf_hash_insert(tn, *a);
 
 	return ret;
 
@@ -245,14 +239,14 @@ failure:
 	qdisc_put_rtab(P_tab);
 	qdisc_put_rtab(R_tab);
 	if (ret == ACT_P_CREATED)
-		tcf_hash_cleanup(a, est);
+		tcf_hash_cleanup(*a, est);
 	return err;
 }
 
 static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
 			  struct tcf_result *res)
 {
-	struct tcf_police *police = a->priv;
+	struct tcf_police *police = to_police(a);
 	s64 now;
 	s64 toks;
 	s64 ptoks = 0;
@@ -311,7 +305,7 @@ static int
 tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_police *police = a->priv;
+	struct tcf_police *police = to_police(a);
 	struct tc_police opt = {
 		.index = police->tcf_index,
 		.action = police->tcf_action,
@@ -349,7 +343,7 @@ nla_put_failure:
 	return -1;
 }
 
-static int tcf_police_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_police_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, police_net_id);
 
@@ -369,6 +363,7 @@ static struct tc_action_ops act_police_ops = {
 	.init		=	tcf_act_police_init,
 	.walk		=	tcf_act_police_walker,
 	.lookup		=	tcf_police_search,
+	.size		=	sizeof(struct tcf_police),
 };
 
 static __net_init int police_init_net(struct net *net)
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 318328d34d12..289af6f9bb3b 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -27,12 +27,13 @@
 #define SIMP_TAB_MASK     7
 
 static int simp_net_id;
+static struct tc_action_ops act_simp_ops;
 
 #define SIMP_MAX_DATA	32
 static int tcf_simp(struct sk_buff *skb, const struct tc_action *a,
 		    struct tcf_result *res)
 {
-	struct tcf_defact *d = a->priv;
+	struct tcf_defact *d = to_defact(a);
 
 	spin_lock(&d->tcf_lock);
 	tcf_lastuse_update(&d->tcf_tm);
@@ -79,7 +80,7 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
 };
 
 static int tcf_simp_init(struct net *net, struct nlattr *nla,
-			 struct nlattr *est, struct tc_action *a,
+			 struct nlattr *est, struct tc_action **a,
 			 int ovr, int bind)
 {
 	struct tc_action_net *tn = net_generic(net, simp_net_id);
@@ -100,7 +101,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 	if (tb[TCA_DEF_PARMS] == NULL)
 		return -EINVAL;
 
-
 	parm = nla_data(tb[TCA_DEF_PARMS]);
 	exists = tcf_hash_check(tn, parm->index, a, bind);
 	if (exists && bind)
@@ -108,7 +108,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 
 	if (tb[TCA_DEF_DATA] == NULL) {
 		if (exists)
-			tcf_hash_release(a, bind);
+			tcf_hash_release(*a, bind);
 		return -EINVAL;
 	}
 
@@ -116,22 +116,22 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 
 	if (!exists) {
 		ret = tcf_hash_create(tn, parm->index, est, a,
-				      sizeof(*d), bind, false);
+				      &act_simp_ops, bind, false);
 		if (ret)
 			return ret;
 
-		d = to_defact(a);
+		d = to_defact(*a);
 		ret = alloc_defdata(d, defdata);
 		if (ret < 0) {
-			tcf_hash_cleanup(a, est);
+			tcf_hash_cleanup(*a, est);
 			return ret;
 		}
 		d->tcf_action = parm->action;
 		ret = ACT_P_CREATED;
 	} else {
-		d = to_defact(a);
+		d = to_defact(*a);
 
-		tcf_hash_release(a, bind);
+		tcf_hash_release(*a, bind);
 		if (!ovr)
 			return -EEXIST;
 
@@ -139,7 +139,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 	}
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(tn, a);
+		tcf_hash_insert(tn, *a);
 	return ret;
 }
 
@@ -147,7 +147,7 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
 			 int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_defact *d = a->priv;
+	struct tcf_defact *d = to_defact(a);
 	struct tc_defact opt = {
 		.index   = d->tcf_index,
 		.refcnt  = d->tcf_refcnt - ref,
@@ -172,14 +172,14 @@ nla_put_failure:
 
 static int tcf_simp_walker(struct net *net, struct sk_buff *skb,
 			   struct netlink_callback *cb, int type,
-			   struct tc_action *a)
+			   const struct tc_action_ops *ops)
 {
 	struct tc_action_net *tn = net_generic(net, simp_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, a);
+	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_simp_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, simp_net_id);
 
@@ -196,6 +196,7 @@ static struct tc_action_ops act_simp_ops = {
 	.init		=	tcf_simp_init,
 	.walk		=	tcf_simp_walker,
 	.lookup		=	tcf_simp_search,
+	.size		=	sizeof(struct tcf_defact),
 };
 
 static __net_init int simp_init_net(struct net *net)
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 8e573c0f8742..a133dcb82132 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -30,11 +30,12 @@
 #define SKBEDIT_TAB_MASK     15
 
 static int skbedit_net_id;
+static struct tc_action_ops act_skbedit_ops;
 
 static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
 		       struct tcf_result *res)
 {
-	struct tcf_skbedit *d = a->priv;
+	struct tcf_skbedit *d = to_skbedit(a);
 
 	spin_lock(&d->tcf_lock);
 	tcf_lastuse_update(&d->tcf_tm);
@@ -63,7 +64,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
 };
 
 static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
-			    struct nlattr *est, struct tc_action *a,
+			    struct nlattr *est, struct tc_action **a,
 			    int ovr, int bind)
 {
 	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
@@ -114,21 +115,21 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 		return 0;
 
 	if (!flags) {
-		tcf_hash_release(a, bind);
+		tcf_hash_release(*a, bind);
 		return -EINVAL;
 	}
 
 	if (!exists) {
 		ret = tcf_hash_create(tn, parm->index, est, a,
-				      sizeof(*d), bind, false);
+				      &act_skbedit_ops, bind, false);
 		if (ret)
 			return ret;
 
-		d = to_skbedit(a);
+		d = to_skbedit(*a);
 		ret = ACT_P_CREATED;
 	} else {
-		d = to_skbedit(a);
-		tcf_hash_release(a, bind);
+		d = to_skbedit(*a);
+		tcf_hash_release(*a, bind);
 		if (!ovr)
 			return -EEXIST;
 	}
@@ -150,7 +151,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 	spin_unlock_bh(&d->tcf_lock);
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(tn, a);
+		tcf_hash_insert(tn, *a);
 	return ret;
 }
 
@@ -158,7 +159,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 			    int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_skbedit *d = a->priv;
+	struct tcf_skbedit *d = to_skbedit(a);
 	struct tc_skbedit opt = {
 		.index   = d->tcf_index,
 		.refcnt  = d->tcf_refcnt - ref,
@@ -194,14 +195,14 @@ nla_put_failure:
 
 static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb,
 			      struct netlink_callback *cb, int type,
-			      struct tc_action *a)
+			      const struct tc_action_ops *ops)
 {
 	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, a);
+	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_skbedit_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
 
@@ -217,6 +218,7 @@ static struct tc_action_ops act_skbedit_ops = {
 	.init		=	tcf_skbedit_init,
 	.walk		=	tcf_skbedit_walker,
 	.lookup		=	tcf_skbedit_search,
+	.size		=	sizeof(struct tcf_skbedit),
 };
 
 static __net_init int skbedit_init_net(struct net *net)
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index db9b7ed570ba..691409de3e1a 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -22,11 +22,12 @@
 #define VLAN_TAB_MASK     15
 
 static int vlan_net_id;
+static struct tc_action_ops act_vlan_ops;
 
 static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
 		    struct tcf_result *res)
 {
-	struct tcf_vlan *v = a->priv;
+	struct tcf_vlan *v = to_vlan(a);
 	int action;
 	int err;
 
@@ -67,7 +68,7 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
 };
 
 static int tcf_vlan_init(struct net *net, struct nlattr *nla,
-			 struct nlattr *est, struct tc_action *a,
+			 struct nlattr *est, struct tc_action **a,
 			 int ovr, int bind)
 {
 	struct tc_action_net *tn = net_generic(net, vlan_net_id);
@@ -100,13 +101,13 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 	case TCA_VLAN_ACT_PUSH:
 		if (!tb[TCA_VLAN_PUSH_VLAN_ID]) {
 			if (exists)
-				tcf_hash_release(a, bind);
+				tcf_hash_release(*a, bind);
 			return -EINVAL;
 		}
 		push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]);
 		if (push_vid >= VLAN_VID_MASK) {
 			if (exists)
-				tcf_hash_release(a, bind);
+				tcf_hash_release(*a, bind);
 			return -ERANGE;
 		}
 
@@ -125,25 +126,25 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 		break;
 	default:
 		if (exists)
-			tcf_hash_release(a, bind);
+			tcf_hash_release(*a, bind);
 		return -EINVAL;
 	}
 	action = parm->v_action;
 
 	if (!exists) {
 		ret = tcf_hash_create(tn, parm->index, est, a,
-				      sizeof(*v), bind, false);
+				      &act_vlan_ops, bind, false);
 		if (ret)
 			return ret;
 
 		ret = ACT_P_CREATED;
 	} else {
-		tcf_hash_release(a, bind);
+		tcf_hash_release(*a, bind);
 		if (!ovr)
 			return -EEXIST;
 	}
 
-	v = to_vlan(a);
+	v = to_vlan(*a);
 
 	spin_lock_bh(&v->tcf_lock);
 
@@ -156,7 +157,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 	spin_unlock_bh(&v->tcf_lock);
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(tn, a);
+		tcf_hash_insert(tn, *a);
 	return ret;
 }
 
@@ -164,7 +165,7 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
 			 int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_vlan *v = a->priv;
+	struct tcf_vlan *v = to_vlan(a);
 	struct tc_vlan opt = {
 		.index    = v->tcf_index,
 		.refcnt   = v->tcf_refcnt - ref,
@@ -195,14 +196,14 @@ nla_put_failure:
 
 static int tcf_vlan_walker(struct net *net, struct sk_buff *skb,
 			   struct netlink_callback *cb, int type,
-			   struct tc_action *a)
+			   const struct tc_action_ops *ops)
 {
 	struct tc_action_net *tn = net_generic(net, vlan_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, a);
+	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_vlan_search(struct net *net, struct tc_action *a, u32 index)
+static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, vlan_net_id);
 
@@ -218,6 +219,7 @@ static struct tc_action_ops act_vlan_ops = {
 	.init		=	tcf_vlan_init,
 	.walk		=	tcf_vlan_walker,
 	.lookup		=	tcf_vlan_search,
+	.size		=	sizeof(struct tcf_vlan),
 };
 
 static __net_init int vlan_init_net(struct net *net)
-- 
cgit v1.2.3


From ec0595cc4495be579309b4bfd5e997af0f2ae6f9 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Mon, 25 Jul 2016 16:09:42 -0700
Subject: net_sched: get rid of struct tcf_common

After the previous patch, struct tc_action should be enough
to represent the generic tc action, tcf_common is not necessary
any more. This patch gets rid of it to make tc action code
more readable.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h            |  63 +++++++++---------
 include/net/tc_act/tc_bpf.h      |   2 +-
 include/net/tc_act/tc_connmark.h |   2 +-
 include/net/tc_act/tc_csum.h     |   2 +-
 include/net/tc_act/tc_defact.h   |   2 +-
 include/net/tc_act/tc_gact.h     |   2 +-
 include/net/tc_act/tc_ife.h      |   2 +-
 include/net/tc_act/tc_ipt.h      |   2 +-
 include/net/tc_act/tc_mirred.h   |   2 +-
 include/net/tc_act/tc_nat.h      |   2 +-
 include/net/tc_act/tc_pedit.h    |   2 +-
 include/net/tc_act/tc_skbedit.h  |   2 +-
 include/net/tc_act/tc_vlan.h     |   2 +-
 net/sched/act_api.c              | 139 +++++++++++++++++++--------------------
 net/sched/act_police.c           |  10 +--
 15 files changed, 114 insertions(+), 122 deletions(-)

(limited to 'include/net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 8b199095ea51..41e6a24a44b9 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -22,42 +22,39 @@ struct tc_action_ops;
 
 struct tc_action {
 	const struct tc_action_ops	*ops;
-	__u32			type; /* for backward compat(TCA_OLD_COMPAT) */
-	__u32			order;
-	struct list_head	list;
-	struct tcf_hashinfo	*hinfo;
-};
-
-struct tcf_common {
-	struct tc_action		tcfc_act;
-	struct hlist_node		tcfc_head;
-	u32				tcfc_index;
-	int				tcfc_refcnt;
-	int				tcfc_bindcnt;
-	u32				tcfc_capab;
-	int				tcfc_action;
-	struct tcf_t			tcfc_tm;
-	struct gnet_stats_basic_packed	tcfc_bstats;
-	struct gnet_stats_queue		tcfc_qstats;
-	struct gnet_stats_rate_est64	tcfc_rate_est;
-	spinlock_t			tcfc_lock;
-	struct rcu_head			tcfc_rcu;
+	__u32				type; /* for backward compat(TCA_OLD_COMPAT) */
+	__u32				order;
+	struct list_head		list;
+	struct tcf_hashinfo		*hinfo;
+
+	struct hlist_node		tcfa_head;
+	u32				tcfa_index;
+	int				tcfa_refcnt;
+	int				tcfa_bindcnt;
+	u32				tcfa_capab;
+	int				tcfa_action;
+	struct tcf_t			tcfa_tm;
+	struct gnet_stats_basic_packed	tcfa_bstats;
+	struct gnet_stats_queue		tcfa_qstats;
+	struct gnet_stats_rate_est64	tcfa_rate_est;
+	spinlock_t			tcfa_lock;
+	struct rcu_head			tcfa_rcu;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
 	struct gnet_stats_queue __percpu *cpu_qstats;
 };
-#define tcf_act		common.tcfc_act
-#define tcf_head	common.tcfc_head
-#define tcf_index	common.tcfc_index
-#define tcf_refcnt	common.tcfc_refcnt
-#define tcf_bindcnt	common.tcfc_bindcnt
-#define tcf_capab	common.tcfc_capab
-#define tcf_action	common.tcfc_action
-#define tcf_tm		common.tcfc_tm
-#define tcf_bstats	common.tcfc_bstats
-#define tcf_qstats	common.tcfc_qstats
-#define tcf_rate_est	common.tcfc_rate_est
-#define tcf_lock	common.tcfc_lock
-#define tcf_rcu		common.tcfc_rcu
+#define tcf_act		common.tcfa_act
+#define tcf_head	common.tcfa_head
+#define tcf_index	common.tcfa_index
+#define tcf_refcnt	common.tcfa_refcnt
+#define tcf_bindcnt	common.tcfa_bindcnt
+#define tcf_capab	common.tcfa_capab
+#define tcf_action	common.tcfa_action
+#define tcf_tm		common.tcfa_tm
+#define tcf_bstats	common.tcfa_bstats
+#define tcf_qstats	common.tcfa_qstats
+#define tcf_rate_est	common.tcfa_rate_est
+#define tcf_lock	common.tcfa_lock
+#define tcf_rcu		common.tcfa_rcu
 
 static inline unsigned int tcf_hash(u32 index, unsigned int hmask)
 {
diff --git a/include/net/tc_act/tc_bpf.h b/include/net/tc_act/tc_bpf.h
index 80a4d6f49773..2b94673a3dbc 100644
--- a/include/net/tc_act/tc_bpf.h
+++ b/include/net/tc_act/tc_bpf.h
@@ -14,7 +14,7 @@
 #include <net/act_api.h>
 
 struct tcf_bpf {
-	struct tcf_common	common;
+	struct tc_action	common;
 	struct bpf_prog __rcu	*filter;
 	union {
 		u32		bpf_fd;
diff --git a/include/net/tc_act/tc_connmark.h b/include/net/tc_act/tc_connmark.h
index 8a661135f4ac..59b515d32bb4 100644
--- a/include/net/tc_act/tc_connmark.h
+++ b/include/net/tc_act/tc_connmark.h
@@ -4,7 +4,7 @@
 #include <net/act_api.h>
 
 struct tcf_connmark_info {
-	struct tcf_common common;
+	struct tc_action common;
 	struct net *net;
 	u16 zone;
 };
diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h
index 1a9ef15d573b..f31fb6331a53 100644
--- a/include/net/tc_act/tc_csum.h
+++ b/include/net/tc_act/tc_csum.h
@@ -5,7 +5,7 @@
 #include <net/act_api.h>
 
 struct tcf_csum {
-	struct tcf_common common;
+	struct tc_action common;
 
 	u32 update_flags;
 };
diff --git a/include/net/tc_act/tc_defact.h b/include/net/tc_act/tc_defact.h
index e25b4eb4fc66..d47f040a3bdf 100644
--- a/include/net/tc_act/tc_defact.h
+++ b/include/net/tc_act/tc_defact.h
@@ -4,7 +4,7 @@
 #include <net/act_api.h>
 
 struct tcf_defact {
-	struct tcf_common	common;
+	struct tc_action	common;
 	u32		tcfd_datalen;
 	void		*tcfd_defdata;
 };
diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h
index 119cdb418c23..b6f173910226 100644
--- a/include/net/tc_act/tc_gact.h
+++ b/include/net/tc_act/tc_gact.h
@@ -5,7 +5,7 @@
 #include <linux/tc_act/tc_gact.h>
 
 struct tcf_gact {
-	struct tcf_common	common;
+	struct tc_action	common;
 #ifdef CONFIG_GACT_PROB
 	u16			tcfg_ptype;
 	u16			tcfg_pval;
diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h
index 7921abe42adc..5164bd7a38fb 100644
--- a/include/net/tc_act/tc_ife.h
+++ b/include/net/tc_act/tc_ife.h
@@ -8,7 +8,7 @@
 
 #define IFE_METAHDRLEN 2
 struct tcf_ife_info {
-	struct tcf_common common;
+	struct tc_action common;
 	u8 eth_dst[ETH_ALEN];
 	u8 eth_src[ETH_ALEN];
 	u16 eth_type;
diff --git a/include/net/tc_act/tc_ipt.h b/include/net/tc_act/tc_ipt.h
index c22ae7ab66ed..31309766e379 100644
--- a/include/net/tc_act/tc_ipt.h
+++ b/include/net/tc_act/tc_ipt.h
@@ -6,7 +6,7 @@
 struct xt_entry_target;
 
 struct tcf_ipt {
-	struct tcf_common	common;
+	struct tc_action	common;
 	u32			tcfi_hook;
 	char			*tcfi_tname;
 	struct xt_entry_target	*tcfi_t;
diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h
index 89aebd22cd79..62770add15bd 100644
--- a/include/net/tc_act/tc_mirred.h
+++ b/include/net/tc_act/tc_mirred.h
@@ -5,7 +5,7 @@
 #include <linux/tc_act/tc_mirred.h>
 
 struct tcf_mirred {
-	struct tcf_common	common;
+	struct tc_action	common;
 	int			tcfm_eaction;
 	int			tcfm_ifindex;
 	int			tcfm_ok_push;
diff --git a/include/net/tc_act/tc_nat.h b/include/net/tc_act/tc_nat.h
index a91ad3ad565e..56681a320612 100644
--- a/include/net/tc_act/tc_nat.h
+++ b/include/net/tc_act/tc_nat.h
@@ -5,7 +5,7 @@
 #include <net/act_api.h>
 
 struct tcf_nat {
-	struct tcf_common common;
+	struct tc_action common;
 
 	__be32 old_addr;
 	__be32 new_addr;
diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index 2cccfbaae800..29e38d6823df 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -4,7 +4,7 @@
 #include <net/act_api.h>
 
 struct tcf_pedit {
-	struct tcf_common	common;
+	struct tc_action	common;
 	unsigned char		tcfp_nkeys;
 	unsigned char		tcfp_flags;
 	struct tc_pedit_key	*tcfp_keys;
diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h
index 9e0548998327..5767e9dbcf92 100644
--- a/include/net/tc_act/tc_skbedit.h
+++ b/include/net/tc_act/tc_skbedit.h
@@ -23,7 +23,7 @@
 #include <linux/tc_act/tc_skbedit.h>
 
 struct tcf_skbedit {
-	struct tcf_common	common;
+	struct tc_action	common;
 	u32		flags;
 	u32		priority;
 	u32		mark;
diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h
index 584b80788d52..e29f52e8bdf1 100644
--- a/include/net/tc_act/tc_vlan.h
+++ b/include/net/tc_act/tc_vlan.h
@@ -16,7 +16,7 @@
 #define VLAN_F_PUSH		0x2
 
 struct tcf_vlan {
-	struct tcf_common	common;
+	struct tc_action	common;
 	int			tcfv_action;
 	u16			tcfv_push_vid;
 	__be16			tcfv_push_proto;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index d97419f35e7e..e4a5f2607ffa 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -29,46 +29,43 @@
 
 static void free_tcf(struct rcu_head *head)
 {
-	struct tcf_common *p = container_of(head, struct tcf_common, tcfc_rcu);
+	struct tc_action *p = container_of(head, struct tc_action, tcfa_rcu);
 
 	free_percpu(p->cpu_bstats);
 	free_percpu(p->cpu_qstats);
 	kfree(p);
 }
 
-static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *a)
+static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *p)
 {
-	struct tcf_common *p = (struct tcf_common *)a;
-
 	spin_lock_bh(&hinfo->lock);
-	hlist_del(&p->tcfc_head);
+	hlist_del(&p->tcfa_head);
 	spin_unlock_bh(&hinfo->lock);
-	gen_kill_estimator(&p->tcfc_bstats,
-			   &p->tcfc_rate_est);
+	gen_kill_estimator(&p->tcfa_bstats,
+			   &p->tcfa_rate_est);
 	/*
-	 * gen_estimator est_timer() might access p->tcfc_lock
+	 * gen_estimator est_timer() might access p->tcfa_lock
 	 * or bstats, wait a RCU grace period before freeing p
 	 */
-	call_rcu(&p->tcfc_rcu, free_tcf);
+	call_rcu(&p->tcfa_rcu, free_tcf);
 }
 
-int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
+int __tcf_hash_release(struct tc_action *p, bool bind, bool strict)
 {
-	struct tcf_common *p = (struct tcf_common *)a;
 	int ret = 0;
 
 	if (p) {
 		if (bind)
-			p->tcfc_bindcnt--;
-		else if (strict && p->tcfc_bindcnt > 0)
+			p->tcfa_bindcnt--;
+		else if (strict && p->tcfa_bindcnt > 0)
 			return -EPERM;
 
-		p->tcfc_refcnt--;
-		if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) {
-			if (a->ops->cleanup)
-				a->ops->cleanup(a, bind);
-			list_del(&a->list);
-			tcf_hash_destroy(a->hinfo, a);
+		p->tcfa_refcnt--;
+		if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) {
+			if (p->ops->cleanup)
+				p->ops->cleanup(p, bind);
+			list_del(&p->list);
+			tcf_hash_destroy(p->hinfo, p);
 			ret = ACT_P_DELETED;
 		}
 	}
@@ -89,11 +86,11 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 
 	for (i = 0; i < (hinfo->hmask + 1); i++) {
 		struct hlist_head *head;
-		struct tcf_common *p;
+		struct tc_action *p;
 
 		head = &hinfo->htab[tcf_hash(i, hinfo->hmask)];
 
-		hlist_for_each_entry_rcu(p, head, tcfc_head) {
+		hlist_for_each_entry_rcu(p, head, tcfa_head) {
 			index++;
 			if (index < s_i)
 				continue;
@@ -101,7 +98,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 			nest = nla_nest_start(skb, n_i);
 			if (nest == NULL)
 				goto nla_put_failure;
-			err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 0);
+			err = tcf_action_dump_1(skb, p, 0, 0);
 			if (err < 0) {
 				index--;
 				nlmsg_trim(skb, nest);
@@ -139,13 +136,13 @@ static int tcf_del_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 	for (i = 0; i < (hinfo->hmask + 1); i++) {
 		struct hlist_head *head;
 		struct hlist_node *n;
-		struct tcf_common *p;
+		struct tc_action *p;
 
 		head = &hinfo->htab[tcf_hash(i, hinfo->hmask)];
-		hlist_for_each_entry_safe(p, n, head, tcfc_head) {
-			ret = __tcf_hash_release((struct tc_action *)p, false, true);
+		hlist_for_each_entry_safe(p, n, head, tcfa_head) {
+			ret = __tcf_hash_release(p, false, true);
 			if (ret == ACT_P_DELETED) {
-				module_put(p->tcfc_act.ops->owner);
+				module_put(p->ops->owner);
 				n_i++;
 			} else if (ret < 0)
 				goto nla_put_failure;
@@ -178,15 +175,15 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(tcf_generic_walker);
 
-static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
+static struct tc_action *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
 {
-	struct tcf_common *p = NULL;
+	struct tc_action *p = NULL;
 	struct hlist_head *head;
 
 	spin_lock_bh(&hinfo->lock);
 	head = &hinfo->htab[tcf_hash(index, hinfo->hmask)];
-	hlist_for_each_entry_rcu(p, head, tcfc_head)
-		if (p->tcfc_index == index)
+	hlist_for_each_entry_rcu(p, head, tcfa_head)
+		if (p->tcfa_index == index)
 			break;
 	spin_unlock_bh(&hinfo->lock);
 
@@ -211,10 +208,10 @@ EXPORT_SYMBOL(tcf_hash_new_index);
 int tcf_hash_search(struct tc_action_net *tn, struct tc_action **a, u32 index)
 {
 	struct tcf_hashinfo *hinfo = tn->hinfo;
-	struct tcf_common *p = tcf_hash_lookup(index, hinfo);
+	struct tc_action *p = tcf_hash_lookup(index, hinfo);
 
 	if (p) {
-		*a = &p->tcfc_act;
+		*a = p;
 		return 1;
 	}
 	return 0;
@@ -225,12 +222,13 @@ bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action **a,
 		    int bind)
 {
 	struct tcf_hashinfo *hinfo = tn->hinfo;
-	struct tcf_common *p = NULL;
+	struct tc_action *p = NULL;
+
 	if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) {
 		if (bind)
-			p->tcfc_bindcnt++;
-		p->tcfc_refcnt++;
-		*a = &p->tcfc_act;
+			p->tcfa_bindcnt++;
+		p->tcfa_refcnt++;
+		*a = p;
 		return true;
 	}
 	return false;
@@ -239,11 +237,10 @@ EXPORT_SYMBOL(tcf_hash_check);
 
 void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
 {
-	struct tcf_common *pc = (struct tcf_common *)a;
 	if (est)
-		gen_kill_estimator(&pc->tcfc_bstats,
-				   &pc->tcfc_rate_est);
-	call_rcu(&pc->tcfc_rcu, free_tcf);
+		gen_kill_estimator(&a->tcfa_bstats,
+				   &a->tcfa_rate_est);
+	call_rcu(&a->tcfa_rcu, free_tcf);
 }
 EXPORT_SYMBOL(tcf_hash_cleanup);
 
@@ -251,15 +248,15 @@ int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 		    struct tc_action **a, const struct tc_action_ops *ops,
 		    int bind, bool cpustats)
 {
-	struct tcf_common *p = kzalloc(ops->size, GFP_KERNEL);
+	struct tc_action *p = kzalloc(ops->size, GFP_KERNEL);
 	struct tcf_hashinfo *hinfo = tn->hinfo;
 	int err = -ENOMEM;
 
 	if (unlikely(!p))
 		return -ENOMEM;
-	p->tcfc_refcnt = 1;
+	p->tcfa_refcnt = 1;
 	if (bind)
-		p->tcfc_bindcnt = 1;
+		p->tcfa_bindcnt = 1;
 
 	if (cpustats) {
 		p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
@@ -275,38 +272,37 @@ err2:
 			goto err1;
 		}
 	}
-	spin_lock_init(&p->tcfc_lock);
-	INIT_HLIST_NODE(&p->tcfc_head);
-	p->tcfc_index = index ? index : tcf_hash_new_index(tn);
-	p->tcfc_tm.install = jiffies;
-	p->tcfc_tm.lastuse = jiffies;
-	p->tcfc_tm.firstuse = 0;
+	spin_lock_init(&p->tcfa_lock);
+	INIT_HLIST_NODE(&p->tcfa_head);
+	p->tcfa_index = index ? index : tcf_hash_new_index(tn);
+	p->tcfa_tm.install = jiffies;
+	p->tcfa_tm.lastuse = jiffies;
+	p->tcfa_tm.firstuse = 0;
 	if (est) {
-		err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats,
-					&p->tcfc_rate_est,
-					&p->tcfc_lock, NULL, est);
+		err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats,
+					&p->tcfa_rate_est,
+					&p->tcfa_lock, NULL, est);
 		if (err) {
 			free_percpu(p->cpu_qstats);
 			goto err2;
 		}
 	}
 
-	p->tcfc_act.hinfo = hinfo;
-	p->tcfc_act.ops = ops;
-	INIT_LIST_HEAD(&p->tcfc_act.list);
-	*a = &p->tcfc_act;
+	p->hinfo = hinfo;
+	p->ops = ops;
+	INIT_LIST_HEAD(&p->list);
+	*a = p;
 	return 0;
 }
 EXPORT_SYMBOL(tcf_hash_create);
 
 void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a)
 {
-	struct tcf_common *p = (struct tcf_common *)a;
 	struct tcf_hashinfo *hinfo = tn->hinfo;
-	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
+	unsigned int h = tcf_hash(a->tcfa_index, hinfo->hmask);
 
 	spin_lock_bh(&hinfo->lock);
-	hlist_add_head(&p->tcfc_head, &hinfo->htab[h]);
+	hlist_add_head(&a->tcfa_head, &hinfo->htab[h]);
 	spin_unlock_bh(&hinfo->lock);
 }
 EXPORT_SYMBOL(tcf_hash_insert);
@@ -317,13 +313,13 @@ void tcf_hashinfo_destroy(const struct tc_action_ops *ops,
 	int i;
 
 	for (i = 0; i < hinfo->hmask + 1; i++) {
-		struct tcf_common *p;
+		struct tc_action *p;
 		struct hlist_node *n;
 
-		hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfc_head) {
+		hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfa_head) {
 			int ret;
 
-			ret = __tcf_hash_release((struct tc_action *)p, false, true);
+			ret = __tcf_hash_release(p, false, true);
 			if (ret == ACT_P_DELETED)
 				module_put(ops->owner);
 			else if (ret < 0)
@@ -625,12 +621,11 @@ err:
 	return err;
 }
 
-int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
+int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
 			  int compat_mode)
 {
 	int err = 0;
 	struct gnet_dump d;
-	struct tcf_common *p = (struct tcf_common *)a;
 
 	if (p == NULL)
 		goto errout;
@@ -639,27 +634,27 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
 	 * to add additional backward compatibility statistic TLVs.
 	 */
 	if (compat_mode) {
-		if (a->type == TCA_OLD_COMPAT)
+		if (p->type == TCA_OLD_COMPAT)
 			err = gnet_stats_start_copy_compat(skb, 0,
 							   TCA_STATS,
 							   TCA_XSTATS,
-							   &p->tcfc_lock, &d,
+							   &p->tcfa_lock, &d,
 							   TCA_PAD);
 		else
 			return 0;
 	} else
 		err = gnet_stats_start_copy(skb, TCA_ACT_STATS,
-					    &p->tcfc_lock, &d, TCA_ACT_PAD);
+					    &p->tcfa_lock, &d, TCA_ACT_PAD);
 
 	if (err < 0)
 		goto errout;
 
-	if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfc_bstats) < 0 ||
-	    gnet_stats_copy_rate_est(&d, &p->tcfc_bstats,
-				     &p->tcfc_rate_est) < 0 ||
+	if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
+	    gnet_stats_copy_rate_est(&d, &p->tcfa_bstats,
+				     &p->tcfa_rate_est) < 0 ||
 	    gnet_stats_copy_queue(&d, p->cpu_qstats,
-				  &p->tcfc_qstats,
-				  p->tcfc_qstats.qlen) < 0)
+				  &p->tcfa_qstats,
+				  p->tcfa_qstats.qlen) < 0)
 		goto errout;
 
 	if (gnet_stats_finish_copy(&d) < 0)
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 123794af55c3..b3c7e975fc9e 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -23,7 +23,7 @@
 #include <net/netlink.h>
 
 struct tcf_police {
-	struct tcf_common	common;
+	struct tc_action	common;
 	int			tcfp_result;
 	u32			tcfp_ewma_rate;
 	s64			tcfp_burst;
@@ -73,11 +73,11 @@ static int tcf_act_police_walker(struct net *net, struct sk_buff *skb,
 
 	for (i = 0; i < (POL_TAB_MASK + 1); i++) {
 		struct hlist_head *head;
-		struct tcf_common *p;
+		struct tc_action *p;
 
 		head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)];
 
-		hlist_for_each_entry_rcu(p, head, tcfc_head) {
+		hlist_for_each_entry_rcu(p, head, tcfa_head) {
 			index++;
 			if (index < s_i)
 				continue;
@@ -85,9 +85,9 @@ static int tcf_act_police_walker(struct net *net, struct sk_buff *skb,
 			if (nest == NULL)
 				goto nla_put_failure;
 			if (type == RTM_DELACTION)
-				err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 1);
+				err = tcf_action_dump_1(skb, p, 0, 1);
 			else
-				err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 0);
+				err = tcf_action_dump_1(skb, p, 0, 0);
 			if (err < 0) {
 				index--;
 				nla_nest_cancel(skb, nest);
-- 
cgit v1.2.3


From df7e88f6cad8cc4e02c9275018a572fab59562c0 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 30 Jul 2016 20:00:45 +0800
Subject: sctp: change to use TCP_CLOSE_WAIT as SCTP_SS_CLOSING

Prior to this patch, sctp defined TCP_CLOSING as SCTP_SS_CLOSING.
TCP_CLOSING is such a special sk state in TCP that inet common codes
even exclude it.

For instance, inet_accept thinks the accept sk's state never be
TCP_CLOSING, or it will give a WARN_ON. TCP works well with that
while SCTP may trigger the call trace, as CLOSING state in SCTP
has different meaning from TCP.

This fix is to change to use TCP_CLOSE_WAIT as SCTP_SS_CLOSING,
instead of TCP_CLOSING. Some side-effects could be expected,
regardless of not being used before. inet_accept will accept it
now.

I did all the func_tests in lksctp-tools and ran sctp codnomicon
fuzzer tests against this patch, no regression or failure found.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/constants.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 8c337cd0e1e4..5b847e49f7e9 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -214,7 +214,7 @@ typedef enum {
 	SCTP_SS_LISTENING      = TCP_LISTEN,
 	SCTP_SS_ESTABLISHING   = TCP_SYN_SENT,
 	SCTP_SS_ESTABLISHED    = TCP_ESTABLISHED,
-	SCTP_SS_CLOSING        = TCP_CLOSING,
+	SCTP_SS_CLOSING        = TCP_CLOSE_WAIT,
 } sctp_sock_state_t;
 
 /* These functions map various type to printable names.  */
-- 
cgit v1.2.3


From 0b01aeb3d2fbf16787f0c9629f4ca52ae792f732 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 28 Jul 2016 15:36:30 +0100
Subject: VSOCK: transport-specific vsock_transport functions

struct vsock_transport contains function pointers called by AF_VSOCK
core code.  The transport may want its own transport-specific function
pointers and they can be added after struct vsock_transport.

Allow the transport to fetch vsock_transport.  It can downcast it to
access transport-specific function pointers.

The virtio transport will use this.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/net/af_vsock.h   | 3 +++
 net/vmw_vsock/af_vsock.c | 9 +++++++++
 2 files changed, 12 insertions(+)

(limited to 'include/net')

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index e9eb2d6791b3..23f55259b60d 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -165,6 +165,9 @@ static inline int vsock_core_init(const struct vsock_transport *t)
 }
 void vsock_core_exit(void);
 
+/* The transport may downcast this to access transport-specific functions */
+const struct vsock_transport *vsock_core_get_transport(void);
+
 /**** UTILS ****/
 
 void vsock_release_pending(struct sock *pending);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index b96ac918e0ba..e34d96f8bde2 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1995,6 +1995,15 @@ void vsock_core_exit(void)
 }
 EXPORT_SYMBOL_GPL(vsock_core_exit);
 
+const struct vsock_transport *vsock_core_get_transport(void)
+{
+	/* vsock_register_mutex not taken since only the transport uses this
+	 * function and only while registered.
+	 */
+	return transport;
+}
+EXPORT_SYMBOL_GPL(vsock_core_get_transport);
+
 MODULE_AUTHOR("VMware, Inc.");
 MODULE_DESCRIPTION("VMware Virtual Socket Family");
 MODULE_VERSION("1.0.1.0-k");
-- 
cgit v1.2.3


From 6773b7dc39f165bd9d824b50ac52cbb3f87d53c8 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 28 Jul 2016 15:36:31 +0100
Subject: VSOCK: defer sock removal to transports

The virtio transport will implement graceful shutdown and the related
SO_LINGER socket option.  This requires orphaning the sock but keeping
it in the table of connections after .release().

This patch adds the vsock_remove_sock() function and leaves it up to the
transport when to remove the sock.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/net/af_vsock.h         |  1 +
 net/vmw_vsock/af_vsock.c       | 16 ++++++++++------
 net/vmw_vsock/vmci_transport.c |  2 ++
 3 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 23f55259b60d..3af0b224f754 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -180,6 +180,7 @@ void vsock_remove_connected(struct vsock_sock *vsk);
 struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr);
 struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
 					 struct sockaddr_vm *dst);
+void vsock_remove_sock(struct vsock_sock *vsk);
 void vsock_for_each_connected_socket(void (*fn)(struct sock *sk));
 
 #endif /* __AF_VSOCK_H__ */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index e34d96f8bde2..17dbbe64cd73 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -344,6 +344,16 @@ static bool vsock_in_connected_table(struct vsock_sock *vsk)
 	return ret;
 }
 
+void vsock_remove_sock(struct vsock_sock *vsk)
+{
+	if (vsock_in_bound_table(vsk))
+		vsock_remove_bound(vsk);
+
+	if (vsock_in_connected_table(vsk))
+		vsock_remove_connected(vsk);
+}
+EXPORT_SYMBOL_GPL(vsock_remove_sock);
+
 void vsock_for_each_connected_socket(void (*fn)(struct sock *sk))
 {
 	int i;
@@ -660,12 +670,6 @@ static void __vsock_release(struct sock *sk)
 		vsk = vsock_sk(sk);
 		pending = NULL;	/* Compiler warning. */
 
-		if (vsock_in_bound_table(vsk))
-			vsock_remove_bound(vsk);
-
-		if (vsock_in_connected_table(vsk))
-			vsock_remove_connected(vsk);
-
 		transport->release(vsk);
 
 		lock_sock(sk);
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 4120b7a538be..4be4fbbc0b50 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1644,6 +1644,8 @@ static void vmci_transport_destruct(struct vsock_sock *vsk)
 
 static void vmci_transport_release(struct vsock_sock *vsk)
 {
+	vsock_remove_sock(vsk);
+
 	if (!vmci_handle_is_invalid(vmci_trans(vsk)->dg_handle)) {
 		vmci_datagram_destroy_handle(vmci_trans(vsk)->dg_handle);
 		vmci_trans(vsk)->dg_handle = VMCI_INVALID_HANDLE;
-- 
cgit v1.2.3


From 06a8fc78367d070720af960dcecec917d3ae5f3b Mon Sep 17 00:00:00 2001
From: Asias He <asias@redhat.com>
Date: Thu, 28 Jul 2016 15:36:32 +0100
Subject: VSOCK: Introduce virtio_vsock_common.ko

This module contains the common code and header files for the following
virtio_transporto and vhost_vsock kernel modules.

Signed-off-by: Asias He <asias@redhat.com>
Signed-off-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 MAINTAINERS                                        |  10 +
 include/linux/virtio_vsock.h                       | 154 ++++
 include/net/af_vsock.h                             |   2 +
 .../trace/events/vsock_virtio_transport_common.h   | 144 +++
 include/uapi/linux/Kbuild                          |   1 +
 include/uapi/linux/virtio_ids.h                    |   1 +
 include/uapi/linux/virtio_vsock.h                  |  94 ++
 net/vmw_vsock/virtio_transport_common.c            | 992 +++++++++++++++++++++
 8 files changed, 1398 insertions(+)
 create mode 100644 include/linux/virtio_vsock.h
 create mode 100644 include/trace/events/vsock_virtio_transport_common.h
 create mode 100644 include/uapi/linux/virtio_vsock.h
 create mode 100644 net/vmw_vsock/virtio_transport_common.c

(limited to 'include/net')

diff --git a/MAINTAINERS b/MAINTAINERS
index 8c20323d1277..b49ffb86ebf0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12138,6 +12138,16 @@ S:	Maintained
 F:	drivers/media/v4l2-core/videobuf2-*
 F:	include/media/videobuf2-*
 
+VIRTIO AND VHOST VSOCK DRIVER
+M:	Stefan Hajnoczi <stefanha@redhat.com>
+L:	kvm@vger.kernel.org
+L:	virtualization@lists.linux-foundation.org
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	include/linux/virtio_vsock.h
+F:	include/uapi/linux/virtio_vsock.h
+F:	net/vmw_vsock/virtio_transport_common.c
+
 VIRTUAL SERIO DEVICE DRIVER
 M:	Stephen Chandler Paul <thatslyude@gmail.com>
 S:	Maintained
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
new file mode 100644
index 000000000000..9638bfeb0d1f
--- /dev/null
+++ b/include/linux/virtio_vsock.h
@@ -0,0 +1,154 @@
+#ifndef _LINUX_VIRTIO_VSOCK_H
+#define _LINUX_VIRTIO_VSOCK_H
+
+#include <uapi/linux/virtio_vsock.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+#include <net/af_vsock.h>
+
+#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE	128
+#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE		(1024 * 256)
+#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE	(1024 * 256)
+#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE	(1024 * 4)
+#define VIRTIO_VSOCK_MAX_BUF_SIZE		0xFFFFFFFFUL
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE		(1024 * 64)
+
+enum {
+	VSOCK_VQ_RX     = 0, /* for host to guest data */
+	VSOCK_VQ_TX     = 1, /* for guest to host data */
+	VSOCK_VQ_EVENT  = 2,
+	VSOCK_VQ_MAX    = 3,
+};
+
+/* Per-socket state (accessed via vsk->trans) */
+struct virtio_vsock_sock {
+	struct vsock_sock *vsk;
+
+	/* Protected by lock_sock(sk_vsock(trans->vsk)) */
+	u32 buf_size;
+	u32 buf_size_min;
+	u32 buf_size_max;
+
+	spinlock_t tx_lock;
+	spinlock_t rx_lock;
+
+	/* Protected by tx_lock */
+	u32 tx_cnt;
+	u32 buf_alloc;
+	u32 peer_fwd_cnt;
+	u32 peer_buf_alloc;
+
+	/* Protected by rx_lock */
+	u32 fwd_cnt;
+	u32 rx_bytes;
+	struct list_head rx_queue;
+};
+
+struct virtio_vsock_pkt {
+	struct virtio_vsock_hdr	hdr;
+	struct work_struct work;
+	struct list_head list;
+	void *buf;
+	u32 len;
+	u32 off;
+	bool reply;
+};
+
+struct virtio_vsock_pkt_info {
+	u32 remote_cid, remote_port;
+	struct msghdr *msg;
+	u32 pkt_len;
+	u16 type;
+	u16 op;
+	u32 flags;
+	bool reply;
+};
+
+struct virtio_transport {
+	/* This must be the first field */
+	struct vsock_transport transport;
+
+	/* Takes ownership of the packet */
+	int (*send_pkt)(struct virtio_vsock_pkt *pkt);
+};
+
+ssize_t
+virtio_transport_stream_dequeue(struct vsock_sock *vsk,
+				struct msghdr *msg,
+				size_t len,
+				int type);
+int
+virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
+			       struct msghdr *msg,
+			       size_t len, int flags);
+
+s64 virtio_transport_stream_has_data(struct vsock_sock *vsk);
+s64 virtio_transport_stream_has_space(struct vsock_sock *vsk);
+
+int virtio_transport_do_socket_init(struct vsock_sock *vsk,
+				 struct vsock_sock *psk);
+u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk);
+u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk);
+u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk);
+void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val);
+void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val);
+void virtio_transport_set_max_buffer_size(struct vsock_sock *vs, u64 val);
+int
+virtio_transport_notify_poll_in(struct vsock_sock *vsk,
+				size_t target,
+				bool *data_ready_now);
+int
+virtio_transport_notify_poll_out(struct vsock_sock *vsk,
+				 size_t target,
+				 bool *space_available_now);
+
+int virtio_transport_notify_recv_init(struct vsock_sock *vsk,
+	size_t target, struct vsock_transport_recv_notify_data *data);
+int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk,
+	size_t target, struct vsock_transport_recv_notify_data *data);
+int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk,
+	size_t target, struct vsock_transport_recv_notify_data *data);
+int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk,
+	size_t target, ssize_t copied, bool data_read,
+	struct vsock_transport_recv_notify_data *data);
+int virtio_transport_notify_send_init(struct vsock_sock *vsk,
+	struct vsock_transport_send_notify_data *data);
+int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk,
+	struct vsock_transport_send_notify_data *data);
+int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk,
+	struct vsock_transport_send_notify_data *data);
+int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk,
+	ssize_t written, struct vsock_transport_send_notify_data *data);
+
+u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk);
+bool virtio_transport_stream_is_active(struct vsock_sock *vsk);
+bool virtio_transport_stream_allow(u32 cid, u32 port);
+int virtio_transport_dgram_bind(struct vsock_sock *vsk,
+				struct sockaddr_vm *addr);
+bool virtio_transport_dgram_allow(u32 cid, u32 port);
+
+int virtio_transport_connect(struct vsock_sock *vsk);
+
+int virtio_transport_shutdown(struct vsock_sock *vsk, int mode);
+
+void virtio_transport_release(struct vsock_sock *vsk);
+
+ssize_t
+virtio_transport_stream_enqueue(struct vsock_sock *vsk,
+				struct msghdr *msg,
+				size_t len);
+int
+virtio_transport_dgram_enqueue(struct vsock_sock *vsk,
+			       struct sockaddr_vm *remote_addr,
+			       struct msghdr *msg,
+			       size_t len);
+
+void virtio_transport_destruct(struct vsock_sock *vsk);
+
+void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt);
+void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt);
+void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt);
+u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted);
+void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit);
+
+#endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 3af0b224f754..f2758964ce6f 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -63,6 +63,8 @@ struct vsock_sock {
 	struct list_head accept_queue;
 	bool rejected;
 	struct delayed_work dwork;
+	struct delayed_work close_work;
+	bool close_work_scheduled;
 	u32 peer_shutdown;
 	bool sent_request;
 	bool ignore_connecting_rst;
diff --git a/include/trace/events/vsock_virtio_transport_common.h b/include/trace/events/vsock_virtio_transport_common.h
new file mode 100644
index 000000000000..b7f1d6278280
--- /dev/null
+++ b/include/trace/events/vsock_virtio_transport_common.h
@@ -0,0 +1,144 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vsock
+
+#if !defined(_TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H) || \
+    defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H
+
+#include <linux/tracepoint.h>
+
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_TYPE_STREAM);
+
+#define show_type(val) \
+	__print_symbolic(val, { VIRTIO_VSOCK_TYPE_STREAM, "STREAM" })
+
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_INVALID);
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_REQUEST);
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RESPONSE);
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RST);
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_SHUTDOWN);
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RW);
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_CREDIT_UPDATE);
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_CREDIT_REQUEST);
+
+#define show_op(val) \
+	__print_symbolic(val, \
+			 { VIRTIO_VSOCK_OP_INVALID, "INVALID" }, \
+			 { VIRTIO_VSOCK_OP_REQUEST, "REQUEST" }, \
+			 { VIRTIO_VSOCK_OP_RESPONSE, "RESPONSE" }, \
+			 { VIRTIO_VSOCK_OP_RST, "RST" }, \
+			 { VIRTIO_VSOCK_OP_SHUTDOWN, "SHUTDOWN" }, \
+			 { VIRTIO_VSOCK_OP_RW, "RW" }, \
+			 { VIRTIO_VSOCK_OP_CREDIT_UPDATE, "CREDIT_UPDATE" }, \
+			 { VIRTIO_VSOCK_OP_CREDIT_REQUEST, "CREDIT_REQUEST" })
+
+TRACE_EVENT(virtio_transport_alloc_pkt,
+	TP_PROTO(
+		 __u32 src_cid, __u32 src_port,
+		 __u32 dst_cid, __u32 dst_port,
+		 __u32 len,
+		 __u16 type,
+		 __u16 op,
+		 __u32 flags
+	),
+	TP_ARGS(
+		src_cid, src_port,
+		dst_cid, dst_port,
+		len,
+		type,
+		op,
+		flags
+	),
+	TP_STRUCT__entry(
+		__field(__u32, src_cid)
+		__field(__u32, src_port)
+		__field(__u32, dst_cid)
+		__field(__u32, dst_port)
+		__field(__u32, len)
+		__field(__u16, type)
+		__field(__u16, op)
+		__field(__u32, flags)
+	),
+	TP_fast_assign(
+		__entry->src_cid = src_cid;
+		__entry->src_port = src_port;
+		__entry->dst_cid = dst_cid;
+		__entry->dst_port = dst_port;
+		__entry->len = len;
+		__entry->type = type;
+		__entry->op = op;
+		__entry->flags = flags;
+	),
+	TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x",
+		  __entry->src_cid, __entry->src_port,
+		  __entry->dst_cid, __entry->dst_port,
+		  __entry->len,
+		  show_type(__entry->type),
+		  show_op(__entry->op),
+		  __entry->flags)
+);
+
+TRACE_EVENT(virtio_transport_recv_pkt,
+	TP_PROTO(
+		 __u32 src_cid, __u32 src_port,
+		 __u32 dst_cid, __u32 dst_port,
+		 __u32 len,
+		 __u16 type,
+		 __u16 op,
+		 __u32 flags,
+		 __u32 buf_alloc,
+		 __u32 fwd_cnt
+	),
+	TP_ARGS(
+		src_cid, src_port,
+		dst_cid, dst_port,
+		len,
+		type,
+		op,
+		flags,
+		buf_alloc,
+		fwd_cnt
+	),
+	TP_STRUCT__entry(
+		__field(__u32, src_cid)
+		__field(__u32, src_port)
+		__field(__u32, dst_cid)
+		__field(__u32, dst_port)
+		__field(__u32, len)
+		__field(__u16, type)
+		__field(__u16, op)
+		__field(__u32, flags)
+		__field(__u32, buf_alloc)
+		__field(__u32, fwd_cnt)
+	),
+	TP_fast_assign(
+		__entry->src_cid = src_cid;
+		__entry->src_port = src_port;
+		__entry->dst_cid = dst_cid;
+		__entry->dst_port = dst_port;
+		__entry->len = len;
+		__entry->type = type;
+		__entry->op = op;
+		__entry->flags = flags;
+		__entry->buf_alloc = buf_alloc;
+		__entry->fwd_cnt = fwd_cnt;
+	),
+	TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x "
+		  "buf_alloc=%u fwd_cnt=%u",
+		  __entry->src_cid, __entry->src_port,
+		  __entry->dst_cid, __entry->dst_port,
+		  __entry->len,
+		  show_type(__entry->type),
+		  show_op(__entry->op),
+		  __entry->flags,
+		  __entry->buf_alloc,
+		  __entry->fwd_cnt)
+);
+
+#endif /* _TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H */
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE vsock_virtio_transport_common
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index ec10cfef166a..3cf0116d9c2b 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -453,6 +453,7 @@ header-y += virtio_ring.h
 header-y += virtio_rng.h
 header-y += virtio_scsi.h
 header-y += virtio_types.h
+header-y += virtio_vsock.h
 header-y += vm_sockets.h
 header-y += vt.h
 header-y += wait.h
diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
index 77925f587b15..3228d582234a 100644
--- a/include/uapi/linux/virtio_ids.h
+++ b/include/uapi/linux/virtio_ids.h
@@ -41,5 +41,6 @@
 #define VIRTIO_ID_CAIF	       12 /* Virtio caif */
 #define VIRTIO_ID_GPU          16 /* virtio GPU */
 #define VIRTIO_ID_INPUT        18 /* virtio input */
+#define VIRTIO_ID_VSOCK        19 /* virtio vsock transport */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h
new file mode 100644
index 000000000000..6b011c19b50f
--- /dev/null
+++ b/include/uapi/linux/virtio_vsock.h
@@ -0,0 +1,94 @@
+/*
+ * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so
+ * anyone can use the definitions to implement compatible drivers/servers:
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Copyright (C) Red Hat, Inc., 2013-2015
+ * Copyright (C) Asias He <asias@redhat.com>, 2013
+ * Copyright (C) Stefan Hajnoczi <stefanha@redhat.com>, 2015
+ */
+
+#ifndef _UAPI_LINUX_VIRTIO_VSOCK_H
+#define _UAPI_LINUX_VIRTIO_VOSCK_H
+
+#include <linux/types.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+
+struct virtio_vsock_config {
+	__le64 guest_cid;
+} __attribute__((packed));
+
+enum virtio_vsock_event_id {
+	VIRTIO_VSOCK_EVENT_TRANSPORT_RESET = 0,
+};
+
+struct virtio_vsock_event {
+	__le32 id;
+} __attribute__((packed));
+
+struct virtio_vsock_hdr {
+	__le64	src_cid;
+	__le64	dst_cid;
+	__le32	src_port;
+	__le32	dst_port;
+	__le32	len;
+	__le16	type;		/* enum virtio_vsock_type */
+	__le16	op;		/* enum virtio_vsock_op */
+	__le32	flags;
+	__le32	buf_alloc;
+	__le32	fwd_cnt;
+} __attribute__((packed));
+
+enum virtio_vsock_type {
+	VIRTIO_VSOCK_TYPE_STREAM = 1,
+};
+
+enum virtio_vsock_op {
+	VIRTIO_VSOCK_OP_INVALID = 0,
+
+	/* Connect operations */
+	VIRTIO_VSOCK_OP_REQUEST = 1,
+	VIRTIO_VSOCK_OP_RESPONSE = 2,
+	VIRTIO_VSOCK_OP_RST = 3,
+	VIRTIO_VSOCK_OP_SHUTDOWN = 4,
+
+	/* To send payload */
+	VIRTIO_VSOCK_OP_RW = 5,
+
+	/* Tell the peer our credit info */
+	VIRTIO_VSOCK_OP_CREDIT_UPDATE = 6,
+	/* Request the peer to send the credit info to us */
+	VIRTIO_VSOCK_OP_CREDIT_REQUEST = 7,
+};
+
+/* VIRTIO_VSOCK_OP_SHUTDOWN flags values */
+enum virtio_vsock_shutdown {
+	VIRTIO_VSOCK_SHUTDOWN_RCV = 1,
+	VIRTIO_VSOCK_SHUTDOWN_SEND = 2,
+};
+
+#endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
new file mode 100644
index 000000000000..a53b3a16b4f1
--- /dev/null
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -0,0 +1,992 @@
+/*
+ * common code for virtio vsock
+ *
+ * Copyright (C) 2013-2015 Red Hat, Inc.
+ * Author: Asias He <asias@redhat.com>
+ *         Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+#include <linux/virtio.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_vsock.h>
+
+#include <net/sock.h>
+#include <net/af_vsock.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/vsock_virtio_transport_common.h>
+
+/* How long to wait for graceful shutdown of a connection */
+#define VSOCK_CLOSE_TIMEOUT (8 * HZ)
+
+static const struct virtio_transport *virtio_transport_get_ops(void)
+{
+	const struct vsock_transport *t = vsock_core_get_transport();
+
+	return container_of(t, struct virtio_transport, transport);
+}
+
+struct virtio_vsock_pkt *
+virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
+			   size_t len,
+			   u32 src_cid,
+			   u32 src_port,
+			   u32 dst_cid,
+			   u32 dst_port)
+{
+	struct virtio_vsock_pkt *pkt;
+	int err;
+
+	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
+	if (!pkt)
+		return NULL;
+
+	pkt->hdr.type		= cpu_to_le16(info->type);
+	pkt->hdr.op		= cpu_to_le16(info->op);
+	pkt->hdr.src_cid	= cpu_to_le64(src_cid);
+	pkt->hdr.dst_cid	= cpu_to_le64(dst_cid);
+	pkt->hdr.src_port	= cpu_to_le32(src_port);
+	pkt->hdr.dst_port	= cpu_to_le32(dst_port);
+	pkt->hdr.flags		= cpu_to_le32(info->flags);
+	pkt->len		= len;
+	pkt->hdr.len		= cpu_to_le32(len);
+	pkt->reply		= info->reply;
+
+	if (info->msg && len > 0) {
+		pkt->buf = kmalloc(len, GFP_KERNEL);
+		if (!pkt->buf)
+			goto out_pkt;
+		err = memcpy_from_msg(pkt->buf, info->msg, len);
+		if (err)
+			goto out;
+	}
+
+	trace_virtio_transport_alloc_pkt(src_cid, src_port,
+					 dst_cid, dst_port,
+					 len,
+					 info->type,
+					 info->op,
+					 info->flags);
+
+	return pkt;
+
+out:
+	kfree(pkt->buf);
+out_pkt:
+	kfree(pkt);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_alloc_pkt);
+
+static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
+					  struct virtio_vsock_pkt_info *info)
+{
+	u32 src_cid, src_port, dst_cid, dst_port;
+	struct virtio_vsock_sock *vvs;
+	struct virtio_vsock_pkt *pkt;
+	u32 pkt_len = info->pkt_len;
+
+	src_cid = vm_sockets_get_local_cid();
+	src_port = vsk->local_addr.svm_port;
+	if (!info->remote_cid) {
+		dst_cid	= vsk->remote_addr.svm_cid;
+		dst_port = vsk->remote_addr.svm_port;
+	} else {
+		dst_cid = info->remote_cid;
+		dst_port = info->remote_port;
+	}
+
+	vvs = vsk->trans;
+
+	/* we can send less than pkt_len bytes */
+	if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
+		pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
+
+	/* virtio_transport_get_credit might return less than pkt_len credit */
+	pkt_len = virtio_transport_get_credit(vvs, pkt_len);
+
+	/* Do not send zero length OP_RW pkt */
+	if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
+		return pkt_len;
+
+	pkt = virtio_transport_alloc_pkt(info, pkt_len,
+					 src_cid, src_port,
+					 dst_cid, dst_port);
+	if (!pkt) {
+		virtio_transport_put_credit(vvs, pkt_len);
+		return -ENOMEM;
+	}
+
+	virtio_transport_inc_tx_pkt(vvs, pkt);
+
+	return virtio_transport_get_ops()->send_pkt(pkt);
+}
+
+static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
+					struct virtio_vsock_pkt *pkt)
+{
+	vvs->rx_bytes += pkt->len;
+}
+
+static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
+					struct virtio_vsock_pkt *pkt)
+{
+	vvs->rx_bytes -= pkt->len;
+	vvs->fwd_cnt += pkt->len;
+}
+
+void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt)
+{
+	spin_lock_bh(&vvs->tx_lock);
+	pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt);
+	pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc);
+	spin_unlock_bh(&vvs->tx_lock);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt);
+
+u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
+{
+	u32 ret;
+
+	spin_lock_bh(&vvs->tx_lock);
+	ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
+	if (ret > credit)
+		ret = credit;
+	vvs->tx_cnt += ret;
+	spin_unlock_bh(&vvs->tx_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_credit);
+
+void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit)
+{
+	spin_lock_bh(&vvs->tx_lock);
+	vvs->tx_cnt -= credit;
+	spin_unlock_bh(&vvs->tx_lock);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_put_credit);
+
+static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
+					       int type,
+					       struct virtio_vsock_hdr *hdr)
+{
+	struct virtio_vsock_pkt_info info = {
+		.op = VIRTIO_VSOCK_OP_CREDIT_UPDATE,
+		.type = type,
+	};
+
+	return virtio_transport_send_pkt_info(vsk, &info);
+}
+
+static ssize_t
+virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
+				   struct msghdr *msg,
+				   size_t len)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	struct virtio_vsock_pkt *pkt;
+	size_t bytes, total = 0;
+	int err = -EFAULT;
+
+	spin_lock_bh(&vvs->rx_lock);
+	while (total < len && !list_empty(&vvs->rx_queue)) {
+		pkt = list_first_entry(&vvs->rx_queue,
+				       struct virtio_vsock_pkt, list);
+
+		bytes = len - total;
+		if (bytes > pkt->len - pkt->off)
+			bytes = pkt->len - pkt->off;
+
+		/* sk_lock is held by caller so no one else can dequeue.
+		 * Unlock rx_lock since memcpy_to_msg() may sleep.
+		 */
+		spin_unlock_bh(&vvs->rx_lock);
+
+		err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
+		if (err)
+			goto out;
+
+		spin_lock_bh(&vvs->rx_lock);
+
+		total += bytes;
+		pkt->off += bytes;
+		if (pkt->off == pkt->len) {
+			virtio_transport_dec_rx_pkt(vvs, pkt);
+			list_del(&pkt->list);
+			virtio_transport_free_pkt(pkt);
+		}
+	}
+	spin_unlock_bh(&vvs->rx_lock);
+
+	/* Send a credit pkt to peer */
+	virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM,
+					    NULL);
+
+	return total;
+
+out:
+	if (total)
+		err = total;
+	return err;
+}
+
+ssize_t
+virtio_transport_stream_dequeue(struct vsock_sock *vsk,
+				struct msghdr *msg,
+				size_t len, int flags)
+{
+	if (flags & MSG_PEEK)
+		return -EOPNOTSUPP;
+
+	return virtio_transport_stream_do_dequeue(vsk, msg, len);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue);
+
+int
+virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
+			       struct msghdr *msg,
+			       size_t len, int flags)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_dequeue);
+
+s64 virtio_transport_stream_has_data(struct vsock_sock *vsk)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	s64 bytes;
+
+	spin_lock_bh(&vvs->rx_lock);
+	bytes = vvs->rx_bytes;
+	spin_unlock_bh(&vvs->rx_lock);
+
+	return bytes;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data);
+
+static s64 virtio_transport_has_space(struct vsock_sock *vsk)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	s64 bytes;
+
+	bytes = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
+	if (bytes < 0)
+		bytes = 0;
+
+	return bytes;
+}
+
+s64 virtio_transport_stream_has_space(struct vsock_sock *vsk)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	s64 bytes;
+
+	spin_lock_bh(&vvs->tx_lock);
+	bytes = virtio_transport_has_space(vsk);
+	spin_unlock_bh(&vvs->tx_lock);
+
+	return bytes;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_has_space);
+
+int virtio_transport_do_socket_init(struct vsock_sock *vsk,
+				    struct vsock_sock *psk)
+{
+	struct virtio_vsock_sock *vvs;
+
+	vvs = kzalloc(sizeof(*vvs), GFP_KERNEL);
+	if (!vvs)
+		return -ENOMEM;
+
+	vsk->trans = vvs;
+	vvs->vsk = vsk;
+	if (psk) {
+		struct virtio_vsock_sock *ptrans = psk->trans;
+
+		vvs->buf_size	= ptrans->buf_size;
+		vvs->buf_size_min = ptrans->buf_size_min;
+		vvs->buf_size_max = ptrans->buf_size_max;
+		vvs->peer_buf_alloc = ptrans->peer_buf_alloc;
+	} else {
+		vvs->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE;
+		vvs->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE;
+		vvs->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE;
+	}
+
+	vvs->buf_alloc = vvs->buf_size;
+
+	spin_lock_init(&vvs->rx_lock);
+	spin_lock_init(&vvs->tx_lock);
+	INIT_LIST_HEAD(&vvs->rx_queue);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init);
+
+u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+
+	return vvs->buf_size;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size);
+
+u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+
+	return vvs->buf_size_min;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size);
+
+u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+
+	return vvs->buf_size_max;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size);
+
+void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+
+	if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
+		val = VIRTIO_VSOCK_MAX_BUF_SIZE;
+	if (val < vvs->buf_size_min)
+		vvs->buf_size_min = val;
+	if (val > vvs->buf_size_max)
+		vvs->buf_size_max = val;
+	vvs->buf_size = val;
+	vvs->buf_alloc = val;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size);
+
+void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+
+	if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
+		val = VIRTIO_VSOCK_MAX_BUF_SIZE;
+	if (val > vvs->buf_size)
+		vvs->buf_size = val;
+	vvs->buf_size_min = val;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size);
+
+void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+
+	if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
+		val = VIRTIO_VSOCK_MAX_BUF_SIZE;
+	if (val < vvs->buf_size)
+		vvs->buf_size = val;
+	vvs->buf_size_max = val;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size);
+
+int
+virtio_transport_notify_poll_in(struct vsock_sock *vsk,
+				size_t target,
+				bool *data_ready_now)
+{
+	if (vsock_stream_has_data(vsk))
+		*data_ready_now = true;
+	else
+		*data_ready_now = false;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_in);
+
+int
+virtio_transport_notify_poll_out(struct vsock_sock *vsk,
+				 size_t target,
+				 bool *space_avail_now)
+{
+	s64 free_space;
+
+	free_space = vsock_stream_has_space(vsk);
+	if (free_space > 0)
+		*space_avail_now = true;
+	else if (free_space == 0)
+		*space_avail_now = false;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_out);
+
+int virtio_transport_notify_recv_init(struct vsock_sock *vsk,
+	size_t target, struct vsock_transport_recv_notify_data *data)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_init);
+
+int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk,
+	size_t target, struct vsock_transport_recv_notify_data *data)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_block);
+
+int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk,
+	size_t target, struct vsock_transport_recv_notify_data *data)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_dequeue);
+
+int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk,
+	size_t target, ssize_t copied, bool data_read,
+	struct vsock_transport_recv_notify_data *data)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_post_dequeue);
+
+int virtio_transport_notify_send_init(struct vsock_sock *vsk,
+	struct vsock_transport_send_notify_data *data)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_init);
+
+int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk,
+	struct vsock_transport_send_notify_data *data)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_block);
+
+int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk,
+	struct vsock_transport_send_notify_data *data)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_enqueue);
+
+int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk,
+	ssize_t written, struct vsock_transport_send_notify_data *data)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue);
+
+u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+
+	return vvs->buf_size;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat);
+
+bool virtio_transport_stream_is_active(struct vsock_sock *vsk)
+{
+	return true;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active);
+
+bool virtio_transport_stream_allow(u32 cid, u32 port)
+{
+	return true;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_allow);
+
+int virtio_transport_dgram_bind(struct vsock_sock *vsk,
+				struct sockaddr_vm *addr)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind);
+
+bool virtio_transport_dgram_allow(u32 cid, u32 port)
+{
+	return false;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow);
+
+int virtio_transport_connect(struct vsock_sock *vsk)
+{
+	struct virtio_vsock_pkt_info info = {
+		.op = VIRTIO_VSOCK_OP_REQUEST,
+		.type = VIRTIO_VSOCK_TYPE_STREAM,
+	};
+
+	return virtio_transport_send_pkt_info(vsk, &info);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_connect);
+
+int virtio_transport_shutdown(struct vsock_sock *vsk, int mode)
+{
+	struct virtio_vsock_pkt_info info = {
+		.op = VIRTIO_VSOCK_OP_SHUTDOWN,
+		.type = VIRTIO_VSOCK_TYPE_STREAM,
+		.flags = (mode & RCV_SHUTDOWN ?
+			  VIRTIO_VSOCK_SHUTDOWN_RCV : 0) |
+			 (mode & SEND_SHUTDOWN ?
+			  VIRTIO_VSOCK_SHUTDOWN_SEND : 0),
+	};
+
+	return virtio_transport_send_pkt_info(vsk, &info);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_shutdown);
+
+int
+virtio_transport_dgram_enqueue(struct vsock_sock *vsk,
+			       struct sockaddr_vm *remote_addr,
+			       struct msghdr *msg,
+			       size_t dgram_len)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_enqueue);
+
+ssize_t
+virtio_transport_stream_enqueue(struct vsock_sock *vsk,
+				struct msghdr *msg,
+				size_t len)
+{
+	struct virtio_vsock_pkt_info info = {
+		.op = VIRTIO_VSOCK_OP_RW,
+		.type = VIRTIO_VSOCK_TYPE_STREAM,
+		.msg = msg,
+		.pkt_len = len,
+	};
+
+	return virtio_transport_send_pkt_info(vsk, &info);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_enqueue);
+
+void virtio_transport_destruct(struct vsock_sock *vsk)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+
+	kfree(vvs);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_destruct);
+
+static int virtio_transport_reset(struct vsock_sock *vsk,
+				  struct virtio_vsock_pkt *pkt)
+{
+	struct virtio_vsock_pkt_info info = {
+		.op = VIRTIO_VSOCK_OP_RST,
+		.type = VIRTIO_VSOCK_TYPE_STREAM,
+		.reply = !!pkt,
+	};
+
+	/* Send RST only if the original pkt is not a RST pkt */
+	if (pkt && le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST)
+		return 0;
+
+	return virtio_transport_send_pkt_info(vsk, &info);
+}
+
+/* Normally packets are associated with a socket.  There may be no socket if an
+ * attempt was made to connect to a socket that does not exist.
+ */
+static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
+{
+	struct virtio_vsock_pkt_info info = {
+		.op = VIRTIO_VSOCK_OP_RST,
+		.type = le16_to_cpu(pkt->hdr.type),
+		.reply = true,
+	};
+
+	/* Send RST only if the original pkt is not a RST pkt */
+	if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST)
+		return 0;
+
+	pkt = virtio_transport_alloc_pkt(&info, 0,
+					 le32_to_cpu(pkt->hdr.dst_cid),
+					 le32_to_cpu(pkt->hdr.dst_port),
+					 le32_to_cpu(pkt->hdr.src_cid),
+					 le32_to_cpu(pkt->hdr.src_port));
+	if (!pkt)
+		return -ENOMEM;
+
+	return virtio_transport_get_ops()->send_pkt(pkt);
+}
+
+static void virtio_transport_wait_close(struct sock *sk, long timeout)
+{
+	if (timeout) {
+		DEFINE_WAIT(wait);
+
+		do {
+			prepare_to_wait(sk_sleep(sk), &wait,
+					TASK_INTERRUPTIBLE);
+			if (sk_wait_event(sk, &timeout,
+					  sock_flag(sk, SOCK_DONE)))
+				break;
+		} while (!signal_pending(current) && timeout);
+
+		finish_wait(sk_sleep(sk), &wait);
+	}
+}
+
+static void virtio_transport_do_close(struct vsock_sock *vsk,
+				      bool cancel_timeout)
+{
+	struct sock *sk = sk_vsock(vsk);
+
+	sock_set_flag(sk, SOCK_DONE);
+	vsk->peer_shutdown = SHUTDOWN_MASK;
+	if (vsock_stream_has_data(vsk) <= 0)
+		sk->sk_state = SS_DISCONNECTING;
+	sk->sk_state_change(sk);
+
+	if (vsk->close_work_scheduled &&
+	    (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) {
+		vsk->close_work_scheduled = false;
+
+		vsock_remove_sock(vsk);
+
+		/* Release refcnt obtained when we scheduled the timeout */
+		sock_put(sk);
+	}
+}
+
+static void virtio_transport_close_timeout(struct work_struct *work)
+{
+	struct vsock_sock *vsk =
+		container_of(work, struct vsock_sock, close_work.work);
+	struct sock *sk = sk_vsock(vsk);
+
+	sock_hold(sk);
+	lock_sock(sk);
+
+	if (!sock_flag(sk, SOCK_DONE)) {
+		(void)virtio_transport_reset(vsk, NULL);
+
+		virtio_transport_do_close(vsk, false);
+	}
+
+	vsk->close_work_scheduled = false;
+
+	release_sock(sk);
+	sock_put(sk);
+}
+
+/* User context, vsk->sk is locked */
+static bool virtio_transport_close(struct vsock_sock *vsk)
+{
+	struct sock *sk = &vsk->sk;
+
+	if (!(sk->sk_state == SS_CONNECTED ||
+	      sk->sk_state == SS_DISCONNECTING))
+		return true;
+
+	/* Already received SHUTDOWN from peer, reply with RST */
+	if ((vsk->peer_shutdown & SHUTDOWN_MASK) == SHUTDOWN_MASK) {
+		(void)virtio_transport_reset(vsk, NULL);
+		return true;
+	}
+
+	if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK)
+		(void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK);
+
+	if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING))
+		virtio_transport_wait_close(sk, sk->sk_lingertime);
+
+	if (sock_flag(sk, SOCK_DONE)) {
+		return true;
+	}
+
+	sock_hold(sk);
+	INIT_DELAYED_WORK(&vsk->close_work,
+			  virtio_transport_close_timeout);
+	vsk->close_work_scheduled = true;
+	schedule_delayed_work(&vsk->close_work, VSOCK_CLOSE_TIMEOUT);
+	return false;
+}
+
+void virtio_transport_release(struct vsock_sock *vsk)
+{
+	struct sock *sk = &vsk->sk;
+	bool remove_sock = true;
+
+	lock_sock(sk);
+	if (sk->sk_type == SOCK_STREAM)
+		remove_sock = virtio_transport_close(vsk);
+	release_sock(sk);
+
+	if (remove_sock)
+		vsock_remove_sock(vsk);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_release);
+
+static int
+virtio_transport_recv_connecting(struct sock *sk,
+				 struct virtio_vsock_pkt *pkt)
+{
+	struct vsock_sock *vsk = vsock_sk(sk);
+	int err;
+	int skerr;
+
+	switch (le16_to_cpu(pkt->hdr.op)) {
+	case VIRTIO_VSOCK_OP_RESPONSE:
+		sk->sk_state = SS_CONNECTED;
+		sk->sk_socket->state = SS_CONNECTED;
+		vsock_insert_connected(vsk);
+		sk->sk_state_change(sk);
+		break;
+	case VIRTIO_VSOCK_OP_INVALID:
+		break;
+	case VIRTIO_VSOCK_OP_RST:
+		skerr = ECONNRESET;
+		err = 0;
+		goto destroy;
+	default:
+		skerr = EPROTO;
+		err = -EINVAL;
+		goto destroy;
+	}
+	return 0;
+
+destroy:
+	virtio_transport_reset(vsk, pkt);
+	sk->sk_state = SS_UNCONNECTED;
+	sk->sk_err = skerr;
+	sk->sk_error_report(sk);
+	return err;
+}
+
+static int
+virtio_transport_recv_connected(struct sock *sk,
+				struct virtio_vsock_pkt *pkt)
+{
+	struct vsock_sock *vsk = vsock_sk(sk);
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	int err = 0;
+
+	switch (le16_to_cpu(pkt->hdr.op)) {
+	case VIRTIO_VSOCK_OP_RW:
+		pkt->len = le32_to_cpu(pkt->hdr.len);
+		pkt->off = 0;
+
+		spin_lock_bh(&vvs->rx_lock);
+		virtio_transport_inc_rx_pkt(vvs, pkt);
+		list_add_tail(&pkt->list, &vvs->rx_queue);
+		spin_unlock_bh(&vvs->rx_lock);
+
+		sk->sk_data_ready(sk);
+		return err;
+	case VIRTIO_VSOCK_OP_CREDIT_UPDATE:
+		sk->sk_write_space(sk);
+		break;
+	case VIRTIO_VSOCK_OP_SHUTDOWN:
+		if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV)
+			vsk->peer_shutdown |= RCV_SHUTDOWN;
+		if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND)
+			vsk->peer_shutdown |= SEND_SHUTDOWN;
+		if (vsk->peer_shutdown == SHUTDOWN_MASK &&
+		    vsock_stream_has_data(vsk) <= 0)
+			sk->sk_state = SS_DISCONNECTING;
+		if (le32_to_cpu(pkt->hdr.flags))
+			sk->sk_state_change(sk);
+		break;
+	case VIRTIO_VSOCK_OP_RST:
+		virtio_transport_do_close(vsk, true);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	virtio_transport_free_pkt(pkt);
+	return err;
+}
+
+static void
+virtio_transport_recv_disconnecting(struct sock *sk,
+				    struct virtio_vsock_pkt *pkt)
+{
+	struct vsock_sock *vsk = vsock_sk(sk);
+
+	if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST)
+		virtio_transport_do_close(vsk, true);
+}
+
+static int
+virtio_transport_send_response(struct vsock_sock *vsk,
+			       struct virtio_vsock_pkt *pkt)
+{
+	struct virtio_vsock_pkt_info info = {
+		.op = VIRTIO_VSOCK_OP_RESPONSE,
+		.type = VIRTIO_VSOCK_TYPE_STREAM,
+		.remote_cid = le32_to_cpu(pkt->hdr.src_cid),
+		.remote_port = le32_to_cpu(pkt->hdr.src_port),
+		.reply = true,
+	};
+
+	return virtio_transport_send_pkt_info(vsk, &info);
+}
+
+/* Handle server socket */
+static int
+virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
+{
+	struct vsock_sock *vsk = vsock_sk(sk);
+	struct vsock_sock *vchild;
+	struct sock *child;
+
+	if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) {
+		virtio_transport_reset(vsk, pkt);
+		return -EINVAL;
+	}
+
+	if (sk_acceptq_is_full(sk)) {
+		virtio_transport_reset(vsk, pkt);
+		return -ENOMEM;
+	}
+
+	child = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
+			       sk->sk_type, 0);
+	if (!child) {
+		virtio_transport_reset(vsk, pkt);
+		return -ENOMEM;
+	}
+
+	sk->sk_ack_backlog++;
+
+	lock_sock_nested(child, SINGLE_DEPTH_NESTING);
+
+	child->sk_state = SS_CONNECTED;
+
+	vchild = vsock_sk(child);
+	vsock_addr_init(&vchild->local_addr, le32_to_cpu(pkt->hdr.dst_cid),
+			le32_to_cpu(pkt->hdr.dst_port));
+	vsock_addr_init(&vchild->remote_addr, le32_to_cpu(pkt->hdr.src_cid),
+			le32_to_cpu(pkt->hdr.src_port));
+
+	vsock_insert_connected(vchild);
+	vsock_enqueue_accept(sk, child);
+	virtio_transport_send_response(vchild, pkt);
+
+	release_sock(child);
+
+	sk->sk_data_ready(sk);
+	return 0;
+}
+
+static bool virtio_transport_space_update(struct sock *sk,
+					  struct virtio_vsock_pkt *pkt)
+{
+	struct vsock_sock *vsk = vsock_sk(sk);
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	bool space_available;
+
+	/* buf_alloc and fwd_cnt is always included in the hdr */
+	spin_lock_bh(&vvs->tx_lock);
+	vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc);
+	vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt);
+	space_available = virtio_transport_has_space(vsk);
+	spin_unlock_bh(&vvs->tx_lock);
+	return space_available;
+}
+
+/* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex
+ * lock.
+ */
+void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
+{
+	struct sockaddr_vm src, dst;
+	struct vsock_sock *vsk;
+	struct sock *sk;
+	bool space_available;
+
+	vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid),
+			le32_to_cpu(pkt->hdr.src_port));
+	vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid),
+			le32_to_cpu(pkt->hdr.dst_port));
+
+	trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port,
+					dst.svm_cid, dst.svm_port,
+					le32_to_cpu(pkt->hdr.len),
+					le16_to_cpu(pkt->hdr.type),
+					le16_to_cpu(pkt->hdr.op),
+					le32_to_cpu(pkt->hdr.flags),
+					le32_to_cpu(pkt->hdr.buf_alloc),
+					le32_to_cpu(pkt->hdr.fwd_cnt));
+
+	if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) {
+		(void)virtio_transport_reset_no_sock(pkt);
+		goto free_pkt;
+	}
+
+	/* The socket must be in connected or bound table
+	 * otherwise send reset back
+	 */
+	sk = vsock_find_connected_socket(&src, &dst);
+	if (!sk) {
+		sk = vsock_find_bound_socket(&dst);
+		if (!sk) {
+			(void)virtio_transport_reset_no_sock(pkt);
+			goto free_pkt;
+		}
+	}
+
+	vsk = vsock_sk(sk);
+
+	space_available = virtio_transport_space_update(sk, pkt);
+
+	lock_sock(sk);
+
+	/* Update CID in case it has changed after a transport reset event */
+	vsk->local_addr.svm_cid = dst.svm_cid;
+
+	if (space_available)
+		sk->sk_write_space(sk);
+
+	switch (sk->sk_state) {
+	case VSOCK_SS_LISTEN:
+		virtio_transport_recv_listen(sk, pkt);
+		virtio_transport_free_pkt(pkt);
+		break;
+	case SS_CONNECTING:
+		virtio_transport_recv_connecting(sk, pkt);
+		virtio_transport_free_pkt(pkt);
+		break;
+	case SS_CONNECTED:
+		virtio_transport_recv_connected(sk, pkt);
+		break;
+	case SS_DISCONNECTING:
+		virtio_transport_recv_disconnecting(sk, pkt);
+		virtio_transport_free_pkt(pkt);
+		break;
+	default:
+		virtio_transport_free_pkt(pkt);
+		break;
+	}
+	release_sock(sk);
+
+	/* Release refcnt obtained when we fetched this socket out of the
+	 * bound or connected list.
+	 */
+	sock_put(sk);
+	return;
+
+free_pkt:
+	virtio_transport_free_pkt(pkt);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt);
+
+void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
+{
+	kfree(pkt->buf);
+	kfree(pkt);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Asias He");
+MODULE_DESCRIPTION("common code for virtio vsock");
-- 
cgit v1.2.3


From bd721ea73e1f965569b40620538c942001f76294 Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Tue, 2 Aug 2016 14:03:33 -0700
Subject: treewide: replace obsolete _refok by __ref

There was only one use of __initdata_refok and __exit_refok

__init_refok was used 46 times against 82 for __ref.

Those definitions are obsolete since commit 312b1485fb50 ("Introduce new
section reference annotations tags: __ref, __refdata, __refconst")

This patch removes the following compatibility definitions and replaces
them treewide.

/* compatibility defines */
#define __init_refok     __ref
#define __initdata_refok __refdata
#define __exit_refok     __ref

I can also provide separate patches if necessary.
(One patch per tree and check in 1 month or 2 to remove old definitions)

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/1466796271-3043-1-git-send-email-fabf@skynet.be
Signed-off-by: Fabian Frederick <fabf@skynet.be>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/machvec_impl.h         | 2 +-
 arch/arc/mm/init.c                       | 2 +-
 arch/arm/mach-integrator/impd1.c         | 4 ++--
 arch/arm/mach-mv78xx0/common.c           | 2 +-
 arch/blackfin/mm/init.c                  | 2 +-
 arch/hexagon/mm/init.c                   | 2 +-
 arch/ia64/kernel/mca.c                   | 2 +-
 arch/microblaze/mm/init.c                | 4 ++--
 arch/microblaze/mm/pgtable.c             | 2 +-
 arch/mips/mm/init.c                      | 2 +-
 arch/mips/txx9/generic/pci.c             | 2 +-
 arch/nios2/mm/init.c                     | 2 +-
 arch/openrisc/mm/ioremap.c               | 4 ++--
 arch/powerpc/lib/alloc.c                 | 2 +-
 arch/powerpc/mm/pgtable_32.c             | 2 +-
 arch/powerpc/platforms/powermac/setup.c  | 4 ++--
 arch/powerpc/platforms/ps3/device-init.c | 2 +-
 arch/powerpc/sysdev/msi_bitmap.c         | 2 +-
 arch/score/mm/init.c                     | 2 +-
 arch/sh/drivers/pci/pci.c                | 4 ++--
 arch/sh/mm/ioremap.c                     | 2 +-
 arch/x86/mm/init.c                       | 4 ++--
 arch/x86/platform/efi/early_printk.c     | 4 ++--
 drivers/acpi/osl.c                       | 5 ++---
 drivers/base/node.c                      | 2 +-
 drivers/clk/clkdev.c                     | 4 ++--
 drivers/pci/xen-pcifront.c               | 2 +-
 drivers/video/logo/logo.c                | 4 ++--
 include/acpi/acpi_io.h                   | 2 +-
 include/linux/init.h                     | 6 ------
 include/net/net_namespace.h              | 2 +-
 init/main.c                              | 2 +-
 mm/page_alloc.c                          | 4 ++--
 mm/slab.c                                | 2 +-
 mm/sparse-vmemmap.c                      | 2 +-
 mm/sparse.c                              | 2 +-
 36 files changed, 46 insertions(+), 53 deletions(-)

(limited to 'include/net')

diff --git a/arch/alpha/kernel/machvec_impl.h b/arch/alpha/kernel/machvec_impl.h
index f54bdf658cd0..d3398f6ab74c 100644
--- a/arch/alpha/kernel/machvec_impl.h
+++ b/arch/alpha/kernel/machvec_impl.h
@@ -137,7 +137,7 @@
 #define __initmv __initdata
 #define ALIAS_MV(x)
 #else
-#define __initmv __initdata_refok
+#define __initmv __refdata
 
 /* GCC actually has a syntax for defining aliases, but is under some
    delusion that you shouldn't be able to declare it extern somewhere
diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index 8be930394750..399e2f223d25 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -220,7 +220,7 @@ void __init mem_init(void)
 /*
  * free_initmem: Free all the __init memory.
  */
-void __init_refok free_initmem(void)
+void __ref free_initmem(void)
 {
 	free_initmem_default(-1);
 }
diff --git a/arch/arm/mach-integrator/impd1.c b/arch/arm/mach-integrator/impd1.c
index 38b0da300dd5..ed9a01484030 100644
--- a/arch/arm/mach-integrator/impd1.c
+++ b/arch/arm/mach-integrator/impd1.c
@@ -320,11 +320,11 @@ static struct impd1_device impd1_devs[] = {
 #define IMPD1_VALID_IRQS 0x00000bffU
 
 /*
- * As this module is bool, it is OK to have this as __init_refok() - no
+ * As this module is bool, it is OK to have this as __ref() - no
  * probe calls will be done after the initial system bootup, as devices
  * are discovered as part of the machine startup.
  */
-static int __init_refok impd1_probe(struct lm_device *dev)
+static int __ref impd1_probe(struct lm_device *dev)
 {
 	struct impd1_module *impd1;
 	int irq_base;
diff --git a/arch/arm/mach-mv78xx0/common.c b/arch/arm/mach-mv78xx0/common.c
index 45a05207b418..6af5430d0d97 100644
--- a/arch/arm/mach-mv78xx0/common.c
+++ b/arch/arm/mach-mv78xx0/common.c
@@ -343,7 +343,7 @@ void __init mv78xx0_init_early(void)
 				DDR_WINDOW_CPU1_BASE, DDR_WINDOW_CPU_SZ);
 }
 
-void __init_refok mv78xx0_timer_init(void)
+void __ref mv78xx0_timer_init(void)
 {
 	orion_time_init(BRIDGE_VIRT_BASE, BRIDGE_INT_TIMER1_CLR,
 			IRQ_MV78XX0_TIMER_1, get_tclk());
diff --git a/arch/blackfin/mm/init.c b/arch/blackfin/mm/init.c
index 166842de3dc7..b59cd7c3261a 100644
--- a/arch/blackfin/mm/init.c
+++ b/arch/blackfin/mm/init.c
@@ -112,7 +112,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
 }
 #endif
 
-void __init_refok free_initmem(void)
+void __ref free_initmem(void)
 {
 #if defined CONFIG_RAMKERNEL && !defined CONFIG_MPU
 	free_initmem_default(-1);
diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c
index 88977e42af0a..192584d5ac2f 100644
--- a/arch/hexagon/mm/init.c
+++ b/arch/hexagon/mm/init.c
@@ -93,7 +93,7 @@ void __init mem_init(void)
  * Todo:  free pages between __init_begin and __init_end; possibly
  * some devtree related stuff as well.
  */
-void __init_refok free_initmem(void)
+void __ref free_initmem(void)
 {
 }
 
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 07a4e32ae96a..eb9220cde76c 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -1831,7 +1831,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset,
 }
 
 /* Caller prevents this from being called after init */
-static void * __init_refok mca_bootmem(void)
+static void * __ref mca_bootmem(void)
 {
 	return __alloc_bootmem(sizeof(struct ia64_mca_cpu),
 	                    KERNEL_STACK_SIZE, 0);
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 77bc7c7e6522..434639f9a3a6 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -414,7 +414,7 @@ void __init *early_get_page(void)
 
 #endif /* CONFIG_MMU */
 
-void * __init_refok alloc_maybe_bootmem(size_t size, gfp_t mask)
+void * __ref alloc_maybe_bootmem(size_t size, gfp_t mask)
 {
 	if (mem_init_done)
 		return kmalloc(size, mask);
@@ -422,7 +422,7 @@ void * __init_refok alloc_maybe_bootmem(size_t size, gfp_t mask)
 		return alloc_bootmem(size);
 }
 
-void * __init_refok zalloc_maybe_bootmem(size_t size, gfp_t mask)
+void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask)
 {
 	void *p;
 
diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c
index eb99fcc76088..cc732fe357ad 100644
--- a/arch/microblaze/mm/pgtable.c
+++ b/arch/microblaze/mm/pgtable.c
@@ -234,7 +234,7 @@ unsigned long iopa(unsigned long addr)
 	return pa;
 }
 
-__init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
+__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
 		unsigned long address)
 {
 	pte_t *pte;
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 9b58eb5fd0d5..a5509e7dcad2 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -504,7 +504,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 
 void (*free_init_pages_eva)(void *begin, void *end) = NULL;
 
-void __init_refok free_initmem(void)
+void __ref free_initmem(void)
 {
 	prom_free_prom_memory();
 	/*
diff --git a/arch/mips/txx9/generic/pci.c b/arch/mips/txx9/generic/pci.c
index a77698ff2b6f..1f6bc9a3036c 100644
--- a/arch/mips/txx9/generic/pci.c
+++ b/arch/mips/txx9/generic/pci.c
@@ -268,7 +268,7 @@ static int txx9_i8259_irq_setup(int irq)
 	return err;
 }
 
-static void __init_refok quirk_slc90e66_bridge(struct pci_dev *dev)
+static void __ref quirk_slc90e66_bridge(struct pci_dev *dev)
 {
 	int irq;	/* PCI/ISA Bridge interrupt */
 	u8 reg_64;
diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c
index e75c75d249d6..c92fe4234009 100644
--- a/arch/nios2/mm/init.c
+++ b/arch/nios2/mm/init.c
@@ -89,7 +89,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
 }
 #endif
 
-void __init_refok free_initmem(void)
+void __ref free_initmem(void)
 {
 	free_initmem_default(-1);
 }
diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c
index 5b2a95116e8f..fa60b81aee3e 100644
--- a/arch/openrisc/mm/ioremap.c
+++ b/arch/openrisc/mm/ioremap.c
@@ -38,7 +38,7 @@ static unsigned int fixmaps_used __initdata;
  * have to convert them into an offset in a page-aligned mapping, but the
  * caller shouldn't need to know that small detail.
  */
-void __iomem *__init_refok
+void __iomem *__ref
 __ioremap(phys_addr_t addr, unsigned long size, pgprot_t prot)
 {
 	phys_addr_t p;
@@ -116,7 +116,7 @@ void iounmap(void *addr)
  * the memblock infrastructure.
  */
 
-pte_t __init_refok *pte_alloc_one_kernel(struct mm_struct *mm,
+pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm,
 					 unsigned long address)
 {
 	pte_t *pte;
diff --git a/arch/powerpc/lib/alloc.c b/arch/powerpc/lib/alloc.c
index 60b0b3fc8fc1..a58abe4afbd1 100644
--- a/arch/powerpc/lib/alloc.c
+++ b/arch/powerpc/lib/alloc.c
@@ -6,7 +6,7 @@
 #include <asm/setup.h>
 
 
-void * __init_refok zalloc_maybe_bootmem(size_t size, gfp_t mask)
+void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask)
 {
 	void *p;
 
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 7f922f557936..0ae0572bc239 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -79,7 +79,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 #endif
 }
 
-__init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
 	pte_t *pte;
 
diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c
index 3de4a7c85140..6b4e9d181126 100644
--- a/arch/powerpc/platforms/powermac/setup.c
+++ b/arch/powerpc/platforms/powermac/setup.c
@@ -353,12 +353,12 @@ static int pmac_late_init(void)
 machine_late_initcall(powermac, pmac_late_init);
 
 /*
- * This is __init_refok because we check for "initializing" before
+ * This is __ref because we check for "initializing" before
  * touching any of the __init sensitive things and "initializing"
  * will be false after __init time. This can't be __init because it
  * can be called whenever a disk is first accessed.
  */
-void __init_refok note_bootable_part(dev_t dev, int part, int goodness)
+void __ref note_bootable_part(dev_t dev, int part, int goodness)
 {
 	char *p;
 
diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
index 3f175e8aedb4..57caaf11a83f 100644
--- a/arch/powerpc/platforms/ps3/device-init.c
+++ b/arch/powerpc/platforms/ps3/device-init.c
@@ -189,7 +189,7 @@ fail_malloc:
 	return result;
 }
 
-static int __init_refok ps3_setup_uhc_device(
+static int __ref ps3_setup_uhc_device(
 	const struct ps3_repository_device *repo, enum ps3_match_id match_id,
 	enum ps3_interrupt_type interrupt_type, enum ps3_reg_type reg_type)
 {
diff --git a/arch/powerpc/sysdev/msi_bitmap.c b/arch/powerpc/sysdev/msi_bitmap.c
index ed5234ed8d3f..5ebd3f018295 100644
--- a/arch/powerpc/sysdev/msi_bitmap.c
+++ b/arch/powerpc/sysdev/msi_bitmap.c
@@ -112,7 +112,7 @@ int msi_bitmap_reserve_dt_hwirqs(struct msi_bitmap *bmp)
 	return 0;
 }
 
-int __init_refok msi_bitmap_alloc(struct msi_bitmap *bmp, unsigned int irq_count,
+int __ref msi_bitmap_alloc(struct msi_bitmap *bmp, unsigned int irq_count,
 		     struct device_node *of_node)
 {
 	int size;
diff --git a/arch/score/mm/init.c b/arch/score/mm/init.c
index 9fbce49ad3bd..444c26c0f750 100644
--- a/arch/score/mm/init.c
+++ b/arch/score/mm/init.c
@@ -91,7 +91,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 }
 #endif
 
-void __init_refok free_initmem(void)
+void __ref free_initmem(void)
 {
 	free_initmem_default(POISON_FREE_INITMEM);
 }
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index d5462b7bc514..84563e39a5b8 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -221,7 +221,7 @@ pcibios_bus_report_status_early(struct pci_channel *hose,
  * We can't use pci_find_device() here since we are
  * called from interrupt context.
  */
-static void __init_refok
+static void __ref
 pcibios_bus_report_status(struct pci_bus *bus, unsigned int status_mask,
 			  int warn)
 {
@@ -256,7 +256,7 @@ pcibios_bus_report_status(struct pci_bus *bus, unsigned int status_mask,
 			pcibios_bus_report_status(dev->subordinate, status_mask, warn);
 }
 
-void __init_refok pcibios_report_status(unsigned int status_mask, int warn)
+void __ref pcibios_report_status(unsigned int status_mask, int warn)
 {
 	struct pci_channel *hose;
 
diff --git a/arch/sh/mm/ioremap.c b/arch/sh/mm/ioremap.c
index 0c99ec2e7ed8..d09ddfe58fd8 100644
--- a/arch/sh/mm/ioremap.c
+++ b/arch/sh/mm/ioremap.c
@@ -34,7 +34,7 @@
  * have to convert them into an offset in a page-aligned mapping, but the
  * caller shouldn't need to know that small detail.
  */
-void __iomem * __init_refok
+void __iomem * __ref
 __ioremap_caller(phys_addr_t phys_addr, unsigned long size,
 		 pgprot_t pgprot, void *caller)
 {
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index fb4c1b42fc7e..620928903be3 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -208,7 +208,7 @@ static int __meminit save_mr(struct map_range *mr, int nr_range,
  * adjust the page_size_mask for small range to go with
  *	big page size instead small one if nearby are ram too.
  */
-static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
+static void __ref adjust_range_page_size_mask(struct map_range *mr,
 							 int nr_range)
 {
 	int i;
@@ -396,7 +396,7 @@ bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
  * This runs before bootmem is initialized and gets pages directly from
  * the physical memory. To access them they are temporarily mapped.
  */
-unsigned long __init_refok init_memory_mapping(unsigned long start,
+unsigned long __ref init_memory_mapping(unsigned long start,
 					       unsigned long end)
 {
 	struct map_range mr[NR_RANGE_MR];
diff --git a/arch/x86/platform/efi/early_printk.c b/arch/x86/platform/efi/early_printk.c
index 524142117296..5fdacb322ceb 100644
--- a/arch/x86/platform/efi/early_printk.c
+++ b/arch/x86/platform/efi/early_printk.c
@@ -44,7 +44,7 @@ early_initcall(early_efi_map_fb);
  * In case earlyprintk=efi,keep we have the whole framebuffer mapped already
  * so just return the offset efi_fb + start.
  */
-static __init_refok void *early_efi_map(unsigned long start, unsigned long len)
+static __ref void *early_efi_map(unsigned long start, unsigned long len)
 {
 	unsigned long base;
 
@@ -56,7 +56,7 @@ static __init_refok void *early_efi_map(unsigned long start, unsigned long len)
 		return early_ioremap(base + start, len);
 }
 
-static __init_refok void early_efi_unmap(void *addr, unsigned long len)
+static __ref void early_efi_unmap(void *addr, unsigned long len)
 {
 	if (!efi_fb)
 		early_iounmap(addr, len);
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index b108f1358a32..4305ee9db4b2 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -309,7 +309,7 @@ static void acpi_unmap(acpi_physical_address pg_off, void __iomem *vaddr)
  * During early init (when acpi_gbl_permanent_mmap has not been set yet) this
  * routine simply calls __acpi_map_table() to get the job done.
  */
-void __iomem *__init_refok
+void __iomem *__ref
 acpi_os_map_iomem(acpi_physical_address phys, acpi_size size)
 {
 	struct acpi_ioremap *map;
@@ -362,8 +362,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(acpi_os_map_iomem);
 
-void *__init_refok
-acpi_os_map_memory(acpi_physical_address phys, acpi_size size)
+void *__ref acpi_os_map_memory(acpi_physical_address phys, acpi_size size)
 {
 	return (void *)acpi_os_map_iomem(phys, size);
 }
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 29cd96661b30..5548f9686016 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -370,7 +370,7 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 #define page_initialized(page)  (page->lru.next)
 
-static int __init_refok get_nid_for_pfn(unsigned long pfn)
+static int __ref get_nid_for_pfn(unsigned long pfn)
 {
 	struct page *page;
 
diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c
index 89cc700fbc37..97ae60fa1584 100644
--- a/drivers/clk/clkdev.c
+++ b/drivers/clk/clkdev.c
@@ -250,7 +250,7 @@ struct clk_lookup_alloc {
 	char	con_id[MAX_CON_ID];
 };
 
-static struct clk_lookup * __init_refok
+static struct clk_lookup * __ref
 vclkdev_alloc(struct clk_hw *hw, const char *con_id, const char *dev_fmt,
 	va_list ap)
 {
@@ -287,7 +287,7 @@ vclkdev_create(struct clk_hw *hw, const char *con_id, const char *dev_fmt,
 	return cl;
 }
 
-struct clk_lookup * __init_refok
+struct clk_lookup * __ref
 clkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, ...)
 {
 	struct clk_lookup *cl;
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 5f70fee59a94..d6ff5e82377d 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -1086,7 +1086,7 @@ out:
 	return err;
 }
 
-static void __init_refok pcifront_backend_changed(struct xenbus_device *xdev,
+static void __ref pcifront_backend_changed(struct xenbus_device *xdev,
 						  enum xenbus_state be_state)
 {
 	struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev);
diff --git a/drivers/video/logo/logo.c b/drivers/video/logo/logo.c
index 10fbfd8ab963..b6bc4a0bda2a 100644
--- a/drivers/video/logo/logo.c
+++ b/drivers/video/logo/logo.c
@@ -36,11 +36,11 @@ static int __init fb_logo_late_init(void)
 
 late_initcall(fb_logo_late_init);
 
-/* logo's are marked __initdata. Use __init_refok to tell
+/* logo's are marked __initdata. Use __ref to tell
  * modpost that it is intended that this function uses data
  * marked __initdata.
  */
-const struct linux_logo * __init_refok fb_find_logo(int depth)
+const struct linux_logo * __ref fb_find_logo(int depth)
 {
 	const struct linux_logo *logo = NULL;
 
diff --git a/include/acpi/acpi_io.h b/include/acpi/acpi_io.h
index dd86c5fc102d..d7d0f495a34e 100644
--- a/include/acpi/acpi_io.h
+++ b/include/acpi/acpi_io.h
@@ -13,7 +13,7 @@ static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys,
 }
 #endif
 
-void __iomem *__init_refok
+void __iomem *__ref
 acpi_os_map_iomem(acpi_physical_address phys, acpi_size size);
 void __ref acpi_os_unmap_iomem(void __iomem *virt, acpi_size size);
 void __iomem *acpi_os_get_iomem(acpi_physical_address phys, unsigned int size);
diff --git a/include/linux/init.h b/include/linux/init.h
index aedb254abc37..6935d02474aa 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -77,12 +77,6 @@
 #define __refdata        __section(.ref.data)
 #define __refconst       __constsection(.ref.rodata)
 
-/* compatibility defines */
-#define __init_refok     __ref
-#define __initdata_refok __refdata
-#define __exit_refok     __ref
-
-
 #ifdef MODULE
 #define __exitused
 #else
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 4089abc6e9c0..0933c7455a30 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -275,7 +275,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
 #define __net_initconst
 #else
 #define __net_init	__init
-#define __net_exit	__exit_refok
+#define __net_exit	__ref
 #define __net_initdata	__initdata
 #define __net_initconst	__initconst
 #endif
diff --git a/init/main.c b/init/main.c
index eae02aa03c9e..e7345dcaaf05 100644
--- a/init/main.c
+++ b/init/main.c
@@ -380,7 +380,7 @@ static void __init setup_command_line(char *command_line)
 
 static __initdata DECLARE_COMPLETION(kthreadd_done);
 
-static noinline void __init_refok rest_init(void)
+static noinline void __ref rest_init(void)
 {
 	int pid;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ea759b935360..39a372a2a1d6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5276,7 +5276,7 @@ void __init setup_per_cpu_pageset(void)
 		setup_zone_pageset(zone);
 }
 
-static noinline __init_refok
+static noinline __ref
 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 {
 	int i;
@@ -5903,7 +5903,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 	}
 }
 
-static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
+static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
 {
 	unsigned long __maybe_unused start = 0;
 	unsigned long __maybe_unused offset = 0;
diff --git a/mm/slab.c b/mm/slab.c
index ca135bd47c35..261147ba156f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1877,7 +1877,7 @@ static struct array_cache __percpu *alloc_kmem_cache_cpus(
 	return cpu_cache;
 }
 
-static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
+static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
 {
 	if (slab_state >= FULL)
 		return enable_cpucache(cachep, gfp);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 68885dcbaf40..574c67b663fe 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -36,7 +36,7 @@
  * Uses the main allocators if they are available, else bootmem.
  */
 
-static void * __init_refok __earlyonly_bootmem_alloc(int node,
+static void * __ref __earlyonly_bootmem_alloc(int node,
 				unsigned long size,
 				unsigned long align,
 				unsigned long goal)
diff --git a/mm/sparse.c b/mm/sparse.c
index 36d7bbb80e49..1e168bf2779a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -59,7 +59,7 @@ static inline void set_section_nid(unsigned long section_nr, int nid)
 #endif
 
 #ifdef CONFIG_SPARSEMEM_EXTREME
-static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
+static noinline struct mem_section __ref *sparse_index_alloc(int nid)
 {
 	struct mem_section *section = NULL;
 	unsigned long array_size = SECTIONS_PER_ROOT *
-- 
cgit v1.2.3


From 2439ca0402091badb24415e1b073ba12b34ba423 Mon Sep 17 00:00:00 2001
From: Maxim Altshul <maxim.altshul@ti.com>
Date: Thu, 4 Aug 2016 15:43:04 +0300
Subject: mac80211: Add ieee80211_hw pointer to get_expected_throughput

The variable is added to allow the driver an easy access to
it's own hw->priv when the op is invoked.

This fixes a crash in wlcore because it was relying on a
station pointer that wasn't initialized yet. It's the wrong
way to fix the crash, but it solves the problem for now and
it does make sense to have the hw pointer here.

Signed-off-by: Maxim Altshul <maxim.altshul@ti.com>
[rewrite commit message, fix indentation]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ti/wlcore/main.c | 5 +++--
 include/net/mac80211.h                | 3 ++-
 net/mac80211/driver-ops.h             | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index 1d689169da76..9e1f2d9c9865 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -5700,10 +5700,11 @@ out:
 	mutex_unlock(&wl->mutex);
 }
 
-static u32 wlcore_op_get_expected_throughput(struct ieee80211_sta *sta)
+static u32 wlcore_op_get_expected_throughput(struct ieee80211_hw *hw,
+					     struct ieee80211_sta *sta)
 {
 	struct wl1271_station *wl_sta = (struct wl1271_station *)sta->drv_priv;
-	struct wl1271 *wl = wl_sta->wl;
+	struct wl1271 *wl = hw->priv;
 	u8 hlid = wl_sta->hlid;
 
 	/* return in units of Kbps */
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index b4faadbb4e01..cca510a585c3 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -3620,7 +3620,8 @@ struct ieee80211_ops {
 
 	int (*join_ibss)(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 	void (*leave_ibss)(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
-	u32 (*get_expected_throughput)(struct ieee80211_sta *sta);
+	u32 (*get_expected_throughput)(struct ieee80211_hw *hw,
+				       struct ieee80211_sta *sta);
 	int (*get_txpower)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			   int *dbm);
 
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 184473c257eb..ba5fc1f01e53 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1094,7 +1094,7 @@ static inline u32 drv_get_expected_throughput(struct ieee80211_local *local,
 
 	trace_drv_get_expected_throughput(sta);
 	if (local->ops->get_expected_throughput)
-		ret = local->ops->get_expected_throughput(sta);
+		ret = local->ops->get_expected_throughput(&local->hw, sta);
 	trace_drv_return_u32(local, ret);
 
 	return ret;
-- 
cgit v1.2.3


From 372ee16386bbf6dc5eeb0387e1ede963debba82a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 3 Aug 2016 14:11:40 +0100
Subject: rxrpc: Fix races between skb free, ACK generation and replying

Inside the kafs filesystem it is possible to occasionally have a call
processed and terminated before we've had a chance to check whether we need
to clean up the rx queue for that call because afs_send_simple_reply() ends
the call when it is done, but this is done in a workqueue item that might
happen to run to completion before afs_deliver_to_call() completes.

Further, it is possible for rxrpc_kernel_send_data() to be called to send a
reply before the last request-phase data skb is released.  The rxrpc skb
destructor is where the ACK processing is done and the call state is
advanced upon release of the last skb.  ACK generation is also deferred to
a work item because it's possible that the skb destructor is not called in
a context where kernel_sendmsg() can be invoked.

To this end, the following changes are made:

 (1) kernel_rxrpc_data_consumed() is added.  This should be called whenever
     an skb is emptied so as to crank the ACK and call states.  This does
     not release the skb, however.  kernel_rxrpc_free_skb() must now be
     called to achieve that.  These together replace
     rxrpc_kernel_data_delivered().

 (2) kernel_rxrpc_data_consumed() is wrapped by afs_data_consumed().

     This makes afs_deliver_to_call() easier to work as the skb can simply
     be discarded unconditionally here without trying to work out what the
     return value of the ->deliver() function means.

     The ->deliver() functions can, via afs_data_complete(),
     afs_transfer_reply() and afs_extract_data() mark that an skb has been
     consumed (thereby cranking the state) without the need to
     conditionally free the skb to make sure the state is correct on an
     incoming call for when the call processor tries to send the reply.

 (3) rxrpc_recvmsg() now has to call kernel_rxrpc_data_consumed() when it
     has finished with a packet and MSG_PEEK isn't set.

 (4) rxrpc_packet_destructor() no longer calls rxrpc_hard_ACK_data().

     Because of this, we no longer need to clear the destructor and put the
     call before we free the skb in cases where we don't want the ACK/call
     state to be cranked.

 (5) The ->deliver() call-type callbacks are made to return -EAGAIN rather
     than 0 if they expect more data (afs_extract_data() returns -EAGAIN to
     the delivery function already), and the caller is now responsible for
     producing an abort if that was the last packet.

 (6) There are many bits of unmarshalling code where:

 		ret = afs_extract_data(call, skb, last, ...);
		switch (ret) {
		case 0:		break;
		case -EAGAIN:	return 0;
		default:	return ret;
		}

     is to be found.  As -EAGAIN can now be passed back to the caller, we
     now just return if ret < 0:

 		ret = afs_extract_data(call, skb, last, ...);
		if (ret < 0)
			return ret;

 (7) Checks for trailing data and empty final data packets has been
     consolidated as afs_data_complete().  So:

		if (skb->len > 0)
			return -EBADMSG;
		if (!last)
			return 0;

     becomes:

		ret = afs_data_complete(call, skb, last);
		if (ret < 0)
			return ret;

 (8) afs_transfer_reply() now checks the amount of data it has against the
     amount of data desired and the amount of data in the skb and returns
     an error to induce an abort if we don't get exactly what we want.

Without these changes, the following oops can occasionally be observed,
particularly if some printks are inserted into the delivery path:

general protection fault: 0000 [#1] SMP
Modules linked in: kafs(E) af_rxrpc(E) [last unloaded: af_rxrpc]
CPU: 0 PID: 1305 Comm: kworker/u8:3 Tainted: G            E   4.7.0-fsdevel+ #1303
Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014
Workqueue: kafsd afs_async_workfn [kafs]
task: ffff88040be041c0 ti: ffff88040c070000 task.ti: ffff88040c070000
RIP: 0010:[<ffffffff8108fd3c>]  [<ffffffff8108fd3c>] __lock_acquire+0xcf/0x15a1
RSP: 0018:ffff88040c073bc0  EFLAGS: 00010002
RAX: 6b6b6b6b6b6b6b6b RBX: 0000000000000000 RCX: ffff88040d29a710
RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88040d29a710
RBP: ffff88040c073c70 R08: 0000000000000001 R09: 0000000000000001
R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000
R13: 0000000000000000 R14: ffff88040be041c0 R15: ffffffff814c928f
FS:  0000000000000000(0000) GS:ffff88041fa00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fa4595f4750 CR3: 0000000001c14000 CR4: 00000000001406f0
Stack:
 0000000000000006 000000000be04930 0000000000000000 ffff880400000000
 ffff880400000000 ffffffff8108f847 ffff88040be041c0 ffffffff81050446
 ffff8803fc08a920 ffff8803fc08a958 ffff88040be041c0 ffff88040c073c38
Call Trace:
 [<ffffffff8108f847>] ? mark_held_locks+0x5e/0x74
 [<ffffffff81050446>] ? __local_bh_enable_ip+0x9b/0xa1
 [<ffffffff8108f9ca>] ? trace_hardirqs_on_caller+0x16d/0x189
 [<ffffffff810915f4>] lock_acquire+0x122/0x1b6
 [<ffffffff810915f4>] ? lock_acquire+0x122/0x1b6
 [<ffffffff814c928f>] ? skb_dequeue+0x18/0x61
 [<ffffffff81609dbf>] _raw_spin_lock_irqsave+0x35/0x49
 [<ffffffff814c928f>] ? skb_dequeue+0x18/0x61
 [<ffffffff814c928f>] skb_dequeue+0x18/0x61
 [<ffffffffa009aa92>] afs_deliver_to_call+0x344/0x39d [kafs]
 [<ffffffffa009ab37>] afs_process_async_call+0x4c/0xd5 [kafs]
 [<ffffffffa0099e9c>] afs_async_workfn+0xe/0x10 [kafs]
 [<ffffffff81063a3a>] process_one_work+0x29d/0x57c
 [<ffffffff81064ac2>] worker_thread+0x24a/0x385
 [<ffffffff81064878>] ? rescuer_thread+0x2d0/0x2d0
 [<ffffffff810696f5>] kthread+0xf3/0xfb
 [<ffffffff8160a6ff>] ret_from_fork+0x1f/0x40
 [<ffffffff81069602>] ? kthread_create_on_node+0x1cf/0x1cf

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/rxrpc.txt |  21 ++--
 fs/afs/cmservice.c                 |  78 ++++++-------
 fs/afs/fsclient.c                  | 221 ++++++++++++-------------------------
 fs/afs/internal.h                  |  14 ++-
 fs/afs/rxrpc.c                     |  73 +++++++-----
 fs/afs/vlclient.c                  |  11 +-
 include/net/af_rxrpc.h             |   2 +-
 net/rxrpc/ar-internal.h            |   1 +
 net/rxrpc/call_accept.c            |   1 +
 net/rxrpc/call_event.c             |   3 +
 net/rxrpc/call_object.c            |   8 +-
 net/rxrpc/input.c                  |  12 +-
 net/rxrpc/recvmsg.c                |  25 +----
 net/rxrpc/skbuff.c                 |  41 +++++--
 14 files changed, 225 insertions(+), 286 deletions(-)

(limited to 'include/net')

diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt
index 16a924c486bf..70c926ae212d 100644
--- a/Documentation/networking/rxrpc.txt
+++ b/Documentation/networking/rxrpc.txt
@@ -790,13 +790,12 @@ The kernel interface functions are as follows:
      Data messages can have their contents extracted with the usual bunch of
      socket buffer manipulation functions.  A data message can be determined to
      be the last one in a sequence with rxrpc_kernel_is_data_last().  When a
-     data message has been used up, rxrpc_kernel_data_delivered() should be
-     called on it..
+     data message has been used up, rxrpc_kernel_data_consumed() should be
+     called on it.
 
-     Non-data messages should be handled to rxrpc_kernel_free_skb() to dispose
-     of.  It is possible to get extra refs on all types of message for later
-     freeing, but this may pin the state of a call until the message is finally
-     freed.
+     Messages should be handled to rxrpc_kernel_free_skb() to dispose of.  It
+     is possible to get extra refs on all types of message for later freeing,
+     but this may pin the state of a call until the message is finally freed.
 
  (*) Accept an incoming call.
 
@@ -821,12 +820,14 @@ The kernel interface functions are as follows:
      Other errors may be returned if the call had been aborted (-ECONNABORTED)
      or had timed out (-ETIME).
 
- (*) Record the delivery of a data message and free it.
+ (*) Record the delivery of a data message.
 
-	void rxrpc_kernel_data_delivered(struct sk_buff *skb);
+	void rxrpc_kernel_data_consumed(struct rxrpc_call *call,
+					struct sk_buff *skb);
 
-     This is used to record a data message as having been delivered and to
-     update the ACK state for the call.  The socket buffer will be freed.
+     This is used to record a data message as having been consumed and to
+     update the ACK state for the call.  The message must still be passed to
+     rxrpc_kernel_free_skb() for disposal by the caller.
 
  (*) Free a message.
 
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 4b0eff6da674..85737e96ab8b 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -189,11 +189,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
 	case 1:
 		_debug("extract FID count");
 		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-		switch (ret) {
-		case 0:		break;
-		case -EAGAIN:	return 0;
-		default:	return ret;
-		}
+		if (ret < 0)
+			return ret;
 
 		call->count = ntohl(call->tmp);
 		_debug("FID count: %u", call->count);
@@ -210,11 +207,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
 		_debug("extract FID array");
 		ret = afs_extract_data(call, skb, last, call->buffer,
 				       call->count * 3 * 4);
-		switch (ret) {
-		case 0:		break;
-		case -EAGAIN:	return 0;
-		default:	return ret;
-		}
+		if (ret < 0)
+			return ret;
 
 		_debug("unmarshall FID array");
 		call->request = kcalloc(call->count,
@@ -239,11 +233,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
 	case 3:
 		_debug("extract CB count");
 		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-		switch (ret) {
-		case 0:		break;
-		case -EAGAIN:	return 0;
-		default:	return ret;
-		}
+		if (ret < 0)
+			return ret;
 
 		tmp = ntohl(call->tmp);
 		_debug("CB count: %u", tmp);
@@ -258,11 +249,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
 		_debug("extract CB array");
 		ret = afs_extract_data(call, skb, last, call->request,
 				       call->count * 3 * 4);
-		switch (ret) {
-		case 0:		break;
-		case -EAGAIN:	return 0;
-		default:	return ret;
-		}
+		if (ret < 0)
+			return ret;
 
 		_debug("unmarshall CB array");
 		cb = call->request;
@@ -278,9 +266,9 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
 		call->unmarshall++;
 
 	case 5:
-		_debug("trailer");
-		if (skb->len != 0)
-			return -EBADMSG;
+		ret = afs_data_complete(call, skb, last);
+		if (ret < 0)
+			return ret;
 
 		/* Record that the message was unmarshalled successfully so
 		 * that the call destructor can know do the callback breaking
@@ -294,8 +282,6 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
 		break;
 	}
 
-	if (!last)
-		return 0;
 
 	call->state = AFS_CALL_REPLYING;
 
@@ -335,13 +321,13 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
 {
 	struct afs_server *server;
 	struct in_addr addr;
+	int ret;
 
 	_enter(",{%u},%d", skb->len, last);
 
-	if (skb->len > 0)
-		return -EBADMSG;
-	if (!last)
-		return 0;
+	ret = afs_data_complete(call, skb, last);
+	if (ret < 0)
+		return ret;
 
 	/* no unmarshalling required */
 	call->state = AFS_CALL_REPLYING;
@@ -371,8 +357,10 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
 
 	_enter(",{%u},%d", skb->len, last);
 
+	/* There are some arguments that we ignore */
+	afs_data_consumed(call, skb);
 	if (!last)
-		return 0;
+		return -EAGAIN;
 
 	/* no unmarshalling required */
 	call->state = AFS_CALL_REPLYING;
@@ -408,12 +396,13 @@ static void SRXAFSCB_Probe(struct work_struct *work)
 static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
 				bool last)
 {
+	int ret;
+
 	_enter(",{%u},%d", skb->len, last);
 
-	if (skb->len > 0)
-		return -EBADMSG;
-	if (!last)
-		return 0;
+	ret = afs_data_complete(call, skb, last);
+	if (ret < 0)
+		return ret;
 
 	/* no unmarshalling required */
 	call->state = AFS_CALL_REPLYING;
@@ -460,10 +449,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
 
 	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
 
-	if (skb->len > 0)
-		return -EBADMSG;
-	if (!last)
-		return 0;
+	ret = afs_data_complete(call, skb, last);
+	if (ret < 0)
+		return ret;
 
 	switch (call->unmarshall) {
 	case 0:
@@ -509,8 +497,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
 		break;
 	}
 
-	if (!last)
-		return 0;
+	ret = afs_data_complete(call, skb, last);
+	if (ret < 0)
+		return ret;
 
 	call->state = AFS_CALL_REPLYING;
 
@@ -588,12 +577,13 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work)
 static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call,
 						 struct sk_buff *skb, bool last)
 {
+	int ret;
+
 	_enter(",{%u},%d", skb->len, last);
 
-	if (skb->len > 0)
-		return -EBADMSG;
-	if (!last)
-		return 0;
+	ret = afs_data_complete(call, skb, last);
+	if (ret < 0)
+		return ret;
 
 	/* no unmarshalling required */
 	call->state = AFS_CALL_REPLYING;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index c2e930ec2888..9312b92e54be 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -240,15 +240,13 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call,
 {
 	struct afs_vnode *vnode = call->reply;
 	const __be32 *bp;
+	int ret;
 
 	_enter(",,%u", last);
 
-	afs_transfer_reply(call, skb);
-	if (!last)
-		return 0;
-
-	if (call->reply_size != call->reply_max)
-		return -EBADMSG;
+	ret = afs_transfer_reply(call, skb, last);
+	if (ret < 0)
+		return ret;
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
@@ -335,11 +333,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
 	case 1:
 		_debug("extract data length (MSW)");
 		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-		switch (ret) {
-		case 0:		break;
-		case -EAGAIN:	return 0;
-		default:	return ret;
-		}
+		if (ret < 0)
+			return ret;
 
 		call->count = ntohl(call->tmp);
 		_debug("DATA length MSW: %u", call->count);
@@ -353,11 +348,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
 	case 2:
 		_debug("extract data length");
 		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-		switch (ret) {
-		case 0:		break;
-		case -EAGAIN:	return 0;
-		default:	return ret;
-		}
+		if (ret < 0)
+			return ret;
 
 		call->count = ntohl(call->tmp);
 		_debug("DATA length: %u", call->count);
@@ -375,11 +367,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
 			ret = afs_extract_data(call, skb, last, buffer,
 					       call->count);
 			kunmap_atomic(buffer);
-			switch (ret) {
-			case 0:		break;
-			case -EAGAIN:	return 0;
-			default:	return ret;
-			}
+			if (ret < 0)
+				return ret;
 		}
 
 		call->offset = 0;
@@ -389,11 +378,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
 	case 4:
 		ret = afs_extract_data(call, skb, last, call->buffer,
 				       (21 + 3 + 6) * 4);
-		switch (ret) {
-		case 0:		break;
-		case -EAGAIN:	return 0;
-		default:	return ret;
-		}
+		if (ret < 0)
+			return ret;
 
 		bp = call->buffer;
 		xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
@@ -405,15 +391,12 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
 		call->unmarshall++;
 
 	case 5:
-		_debug("trailer");
-		if (skb->len != 0)
-			return -EBADMSG;
+		ret = afs_data_complete(call, skb, last);
+		if (ret < 0)
+			return ret;
 		break;
 	}
 
-	if (!last)
-		return 0;
-
 	if (call->count < PAGE_SIZE) {
 		_debug("clear");
 		page = call->reply3;
@@ -537,9 +520,8 @@ static int afs_deliver_fs_give_up_callbacks(struct afs_call *call,
 {
 	_enter(",{%u},%d", skb->len, last);
 
-	if (skb->len > 0)
-		return -EBADMSG; /* shouldn't be any reply data */
-	return 0;
+	/* shouldn't be any reply data */
+	return afs_data_complete(call, skb, last);
 }
 
 /*
@@ -622,15 +604,13 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call,
 {
 	struct afs_vnode *vnode = call->reply;
 	const __be32 *bp;
+	int ret;
 
 	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
 
-	afs_transfer_reply(call, skb);
-	if (!last)
-		return 0;
-
-	if (call->reply_size != call->reply_max)
-		return -EBADMSG;
+	ret = afs_transfer_reply(call, skb, last);
+	if (ret < 0)
+		return ret;
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
@@ -721,15 +701,13 @@ static int afs_deliver_fs_remove(struct afs_call *call,
 {
 	struct afs_vnode *vnode = call->reply;
 	const __be32 *bp;
+	int ret;
 
 	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
 
-	afs_transfer_reply(call, skb);
-	if (!last)
-		return 0;
-
-	if (call->reply_size != call->reply_max)
-		return -EBADMSG;
+	ret = afs_transfer_reply(call, skb, last);
+	if (ret < 0)
+		return ret;
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
@@ -804,15 +782,13 @@ static int afs_deliver_fs_link(struct afs_call *call,
 {
 	struct afs_vnode *dvnode = call->reply, *vnode = call->reply2;
 	const __be32 *bp;
+	int ret;
 
 	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
 
-	afs_transfer_reply(call, skb);
-	if (!last)
-		return 0;
-
-	if (call->reply_size != call->reply_max)
-		return -EBADMSG;
+	ret = afs_transfer_reply(call, skb, last);
+	if (ret < 0)
+		return ret;
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
@@ -892,15 +868,13 @@ static int afs_deliver_fs_symlink(struct afs_call *call,
 {
 	struct afs_vnode *vnode = call->reply;
 	const __be32 *bp;
+	int ret;
 
 	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
 
-	afs_transfer_reply(call, skb);
-	if (!last)
-		return 0;
-
-	if (call->reply_size != call->reply_max)
-		return -EBADMSG;
+	ret = afs_transfer_reply(call, skb, last);
+	if (ret < 0)
+		return ret;
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
@@ -999,15 +973,13 @@ static int afs_deliver_fs_rename(struct afs_call *call,
 {
 	struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2;
 	const __be32 *bp;
+	int ret;
 
 	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
 
-	afs_transfer_reply(call, skb);
-	if (!last)
-		return 0;
-
-	if (call->reply_size != call->reply_max)
-		return -EBADMSG;
+	ret = afs_transfer_reply(call, skb, last);
+	if (ret < 0)
+		return ret;
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
@@ -1105,20 +1077,13 @@ static int afs_deliver_fs_store_data(struct afs_call *call,
 {
 	struct afs_vnode *vnode = call->reply;
 	const __be32 *bp;
+	int ret;
 
 	_enter(",,%u", last);
 
-	afs_transfer_reply(call, skb);
-	if (!last) {
-		_leave(" = 0 [more]");
-		return 0;
-	}
-
-	if (call->reply_size != call->reply_max) {
-		_leave(" = -EBADMSG [%u != %u]",
-		       call->reply_size, call->reply_max);
-		return -EBADMSG;
-	}
+	ret = afs_transfer_reply(call, skb, last);
+	if (ret < 0)
+		return ret;
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
@@ -1292,20 +1257,13 @@ static int afs_deliver_fs_store_status(struct afs_call *call,
 	afs_dataversion_t *store_version;
 	struct afs_vnode *vnode = call->reply;
 	const __be32 *bp;
+	int ret;
 
 	_enter(",,%u", last);
 
-	afs_transfer_reply(call, skb);
-	if (!last) {
-		_leave(" = 0 [more]");
-		return 0;
-	}
-
-	if (call->reply_size != call->reply_max) {
-		_leave(" = -EBADMSG [%u != %u]",
-		       call->reply_size, call->reply_max);
-		return -EBADMSG;
-	}
+	ret = afs_transfer_reply(call, skb, last);
+	if (ret < 0)
+		return ret;
 
 	/* unmarshall the reply once we've received all of it */
 	store_version = NULL;
@@ -1504,11 +1462,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 		_debug("extract status");
 		ret = afs_extract_data(call, skb, last, call->buffer,
 				       12 * 4);
-		switch (ret) {
-		case 0:		break;
-		case -EAGAIN:	return 0;
-		default:	return ret;
-		}
+		if (ret < 0)
+			return ret;
 
 		bp = call->buffer;
 		xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2);
@@ -1518,11 +1473,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 		/* extract the volume name length */
 	case 2:
 		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-		switch (ret) {
-		case 0:		break;
-		case -EAGAIN:	return 0;
-		default:	return ret;
-		}
+		if (ret < 0)
+			return ret;
 
 		call->count = ntohl(call->tmp);
 		_debug("volname length: %u", call->count);
@@ -1537,11 +1489,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 		if (call->count > 0) {
 			ret = afs_extract_data(call, skb, last, call->reply3,
 					       call->count);
-			switch (ret) {
-			case 0:		break;
-			case -EAGAIN:	return 0;
-			default:	return ret;
-			}
+			if (ret < 0)
+				return ret;
 		}
 
 		p = call->reply3;
@@ -1561,11 +1510,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 	case 4:
 		ret = afs_extract_data(call, skb, last, call->buffer,
 				       call->count);
-		switch (ret) {
-		case 0:		break;
-		case -EAGAIN:	return 0;
-		default:	return ret;
-		}
+		if (ret < 0)
+			return ret;
 
 		call->offset = 0;
 		call->unmarshall++;
@@ -1574,11 +1520,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 		/* extract the offline message length */
 	case 5:
 		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-		switch (ret) {
-		case 0:		break;
-		case -EAGAIN:	return 0;
-		default:	return ret;
-		}
+		if (ret < 0)
+			return ret;
 
 		call->count = ntohl(call->tmp);
 		_debug("offline msg length: %u", call->count);
@@ -1593,11 +1536,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 		if (call->count > 0) {
 			ret = afs_extract_data(call, skb, last, call->reply3,
 					       call->count);
-			switch (ret) {
-			case 0:		break;
-			case -EAGAIN:	return 0;
-			default:	return ret;
-			}
+			if (ret < 0)
+				return ret;
 		}
 
 		p = call->reply3;
@@ -1617,11 +1557,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 	case 7:
 		ret = afs_extract_data(call, skb, last, call->buffer,
 				       call->count);
-		switch (ret) {
-		case 0:		break;
-		case -EAGAIN:	return 0;
-		default:	return ret;
-		}
+		if (ret < 0)
+			return ret;
 
 		call->offset = 0;
 		call->unmarshall++;
@@ -1630,11 +1567,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 		/* extract the message of the day length */
 	case 8:
 		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-		switch (ret) {
-		case 0:		break;
-		case -EAGAIN:	return 0;
-		default:	return ret;
-		}
+		if (ret < 0)
+			return ret;
 
 		call->count = ntohl(call->tmp);
 		_debug("motd length: %u", call->count);
@@ -1649,11 +1583,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 		if (call->count > 0) {
 			ret = afs_extract_data(call, skb, last, call->reply3,
 					       call->count);
-			switch (ret) {
-			case 0:		break;
-			case -EAGAIN:	return 0;
-			default:	return ret;
-			}
+			if (ret < 0)
+				return ret;
 		}
 
 		p = call->reply3;
@@ -1673,26 +1604,20 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 	case 10:
 		ret = afs_extract_data(call, skb, last, call->buffer,
 				       call->count);
-		switch (ret) {
-		case 0:		break;
-		case -EAGAIN:	return 0;
-		default:	return ret;
-		}
+		if (ret < 0)
+			return ret;
 
 		call->offset = 0;
 		call->unmarshall++;
 	no_motd_padding:
 
 	case 11:
-		_debug("trailer %d", skb->len);
-		if (skb->len != 0)
-			return -EBADMSG;
+		ret = afs_data_complete(call, skb, last);
+		if (ret < 0)
+			return ret;
 		break;
 	}
 
-	if (!last)
-		return 0;
-
 	_leave(" = 0 [done]");
 	return 0;
 }
@@ -1764,15 +1689,13 @@ static int afs_deliver_fs_xxxx_lock(struct afs_call *call,
 				    struct sk_buff *skb, bool last)
 {
 	const __be32 *bp;
+	int ret;
 
 	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
 
-	afs_transfer_reply(call, skb);
-	if (!last)
-		return 0;
-
-	if (call->reply_size != call->reply_max)
-		return -EBADMSG;
+	ret = afs_transfer_reply(call, skb, last);
+	if (ret < 0)
+		return ret;
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 71d5982312f3..df976b2a7f40 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -609,17 +609,29 @@ extern void afs_proc_cell_remove(struct afs_cell *);
  */
 extern int afs_open_socket(void);
 extern void afs_close_socket(void);
+extern void afs_data_consumed(struct afs_call *, struct sk_buff *);
 extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t,
 			 const struct afs_wait_mode *);
 extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *,
 					    size_t, size_t);
 extern void afs_flat_call_destructor(struct afs_call *);
-extern void afs_transfer_reply(struct afs_call *, struct sk_buff *);
+extern int afs_transfer_reply(struct afs_call *, struct sk_buff *, bool);
 extern void afs_send_empty_reply(struct afs_call *);
 extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
 extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *,
 			    size_t);
 
+static inline int afs_data_complete(struct afs_call *call, struct sk_buff *skb,
+				    bool last)
+{
+	if (skb->len > 0)
+		return -EBADMSG;
+	afs_data_consumed(call, skb);
+	if (!last)
+		return -EAGAIN;
+	return 0;
+}
+
 /*
  * security.c
  */
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 4832de84d52c..14d04c848465 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -150,10 +150,9 @@ void afs_close_socket(void)
 }
 
 /*
- * note that the data in a socket buffer is now delivered and that the buffer
- * should be freed
+ * Note that the data in a socket buffer is now consumed.
  */
-static void afs_data_delivered(struct sk_buff *skb)
+void afs_data_consumed(struct afs_call *call, struct sk_buff *skb)
 {
 	if (!skb) {
 		_debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs));
@@ -161,9 +160,7 @@ static void afs_data_delivered(struct sk_buff *skb)
 	} else {
 		_debug("DLVR %p{%u} [%d]",
 		       skb, skb->mark, atomic_read(&afs_outstanding_skbs));
-		if (atomic_dec_return(&afs_outstanding_skbs) == -1)
-			BUG();
-		rxrpc_kernel_data_delivered(skb);
+		rxrpc_kernel_data_consumed(call->rxcall, skb);
 	}
 }
 
@@ -489,9 +486,15 @@ static void afs_deliver_to_call(struct afs_call *call)
 			last = rxrpc_kernel_is_data_last(skb);
 			ret = call->type->deliver(call, skb, last);
 			switch (ret) {
+			case -EAGAIN:
+				if (last) {
+					_debug("short data");
+					goto unmarshal_error;
+				}
+				break;
 			case 0:
-				if (last &&
-				    call->state == AFS_CALL_AWAIT_REPLY)
+				ASSERT(last);
+				if (call->state == AFS_CALL_AWAIT_REPLY)
 					call->state = AFS_CALL_COMPLETE;
 				break;
 			case -ENOTCONN:
@@ -501,6 +504,7 @@ static void afs_deliver_to_call(struct afs_call *call)
 				abort_code = RX_INVALID_OPERATION;
 				goto do_abort;
 			default:
+			unmarshal_error:
 				abort_code = RXGEN_CC_UNMARSHAL;
 				if (call->state != AFS_CALL_AWAIT_REPLY)
 					abort_code = RXGEN_SS_UNMARSHAL;
@@ -511,9 +515,7 @@ static void afs_deliver_to_call(struct afs_call *call)
 				call->state = AFS_CALL_ERROR;
 				break;
 			}
-			afs_data_delivered(skb);
-			skb = NULL;
-			continue;
+			break;
 		case RXRPC_SKB_MARK_FINAL_ACK:
 			_debug("Rcv ACK");
 			call->state = AFS_CALL_COMPLETE;
@@ -685,15 +687,35 @@ static void afs_process_async_call(struct afs_call *call)
 }
 
 /*
- * empty a socket buffer into a flat reply buffer
+ * Empty a socket buffer into a flat reply buffer.
  */
-void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb)
+int afs_transfer_reply(struct afs_call *call, struct sk_buff *skb, bool last)
 {
 	size_t len = skb->len;
 
-	if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, len) < 0)
-		BUG();
-	call->reply_size += len;
+	if (len > call->reply_max - call->reply_size) {
+		_leave(" = -EBADMSG [%zu > %u]",
+		       len, call->reply_max - call->reply_size);
+		return -EBADMSG;
+	}
+
+	if (len > 0) {
+		if (skb_copy_bits(skb, 0, call->buffer + call->reply_size,
+				  len) < 0)
+			BUG();
+		call->reply_size += len;
+	}
+
+	afs_data_consumed(call, skb);
+	if (!last)
+		return -EAGAIN;
+
+	if (call->reply_size != call->reply_max) {
+		_leave(" = -EBADMSG [%u != %u]",
+		       call->reply_size, call->reply_max);
+		return -EBADMSG;
+	}
+	return 0;
 }
 
 /*
@@ -745,7 +767,8 @@ static void afs_collect_incoming_call(struct work_struct *work)
 }
 
 /*
- * grab the operation ID from an incoming cache manager call
+ * Grab the operation ID from an incoming cache manager call.  The socket
+ * buffer is discarded on error or if we don't yet have sufficient data.
  */
 static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
 				bool last)
@@ -766,12 +789,9 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
 	call->offset += len;
 
 	if (call->offset < 4) {
-		if (last) {
-			_leave(" = -EBADMSG [op ID short]");
-			return -EBADMSG;
-		}
-		_leave(" = 0 [incomplete]");
-		return 0;
+		afs_data_consumed(call, skb);
+		_leave(" = -EAGAIN");
+		return -EAGAIN;
 	}
 
 	call->state = AFS_CALL_AWAIT_REQUEST;
@@ -855,7 +875,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 }
 
 /*
- * extract a piece of data from the received data socket buffers
+ * Extract a piece of data from the received data socket buffers.
  */
 int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
 		     bool last, void *buf, size_t count)
@@ -873,10 +893,7 @@ int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
 	call->offset += len;
 
 	if (call->offset < count) {
-		if (last) {
-			_leave(" = -EBADMSG [%d < %zu]", call->offset, count);
-			return -EBADMSG;
-		}
+		afs_data_consumed(call, skb);
 		_leave(" = -EAGAIN");
 		return -EAGAIN;
 	}
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 340afd0cd182..f94d1abdc3eb 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -64,16 +64,13 @@ static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call,
 	struct afs_cache_vlocation *entry;
 	__be32 *bp;
 	u32 tmp;
-	int loop;
+	int loop, ret;
 
 	_enter(",,%u", last);
 
-	afs_transfer_reply(call, skb);
-	if (!last)
-		return 0;
-
-	if (call->reply_size != call->reply_max)
-		return -EBADMSG;
+	ret = afs_transfer_reply(call, skb, last);
+	if (ret < 0)
+		return ret;
 
 	/* unmarshall the reply once we've received all of it */
 	entry = call->reply;
diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h
index ac1bc3c49fbd..7b0f88699b25 100644
--- a/include/net/af_rxrpc.h
+++ b/include/net/af_rxrpc.h
@@ -40,12 +40,12 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *,
 					   unsigned long,
 					   gfp_t);
 int rxrpc_kernel_send_data(struct rxrpc_call *, struct msghdr *, size_t);
+void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *);
 void rxrpc_kernel_abort_call(struct rxrpc_call *, u32);
 void rxrpc_kernel_end_call(struct rxrpc_call *);
 bool rxrpc_kernel_is_data_last(struct sk_buff *);
 u32 rxrpc_kernel_get_abort_code(struct sk_buff *);
 int rxrpc_kernel_get_error_number(struct sk_buff *);
-void rxrpc_kernel_data_delivered(struct sk_buff *);
 void rxrpc_kernel_free_skb(struct sk_buff *);
 struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *, unsigned long);
 int rxrpc_kernel_reject_call(struct socket *);
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 1bb9e7ac9e14..ff83fb1ddd47 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -425,6 +425,7 @@ struct rxrpc_call {
 	spinlock_t		lock;
 	rwlock_t		state_lock;	/* lock for state transition */
 	atomic_t		usage;
+	atomic_t		skb_count;	/* Outstanding packets on this call */
 	atomic_t		sequence;	/* Tx data packet sequence counter */
 	u32			local_abort;	/* local abort code */
 	u32			remote_abort;	/* remote abort code */
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 0b2832141bd0..9bae21e66d65 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -130,6 +130,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local,
 			call->state = RXRPC_CALL_SERVER_ACCEPTING;
 			list_add_tail(&call->accept_link, &rx->acceptq);
 			rxrpc_get_call(call);
+			atomic_inc(&call->skb_count);
 			nsp = rxrpc_skb(notification);
 			nsp->call = call;
 
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index fc32aa5764a2..f5e99163a09e 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -460,6 +460,7 @@ static void rxrpc_insert_oos_packet(struct rxrpc_call *call,
 	ASSERTCMP(sp->call, ==, NULL);
 	sp->call = call;
 	rxrpc_get_call(call);
+	atomic_inc(&call->skb_count);
 
 	/* insert into the buffer in sequence order */
 	spin_lock_bh(&call->lock);
@@ -734,6 +735,7 @@ all_acked:
 		skb->mark = RXRPC_SKB_MARK_FINAL_ACK;
 		sp->call = call;
 		rxrpc_get_call(call);
+		atomic_inc(&call->skb_count);
 		spin_lock_bh(&call->lock);
 		if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0)
 			BUG();
@@ -793,6 +795,7 @@ static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error,
 		sp->error = error;
 		sp->call = call;
 		rxrpc_get_call(call);
+		atomic_inc(&call->skb_count);
 
 		spin_lock_bh(&call->lock);
 		ret = rxrpc_queue_rcv_skb(call, skb, true, fatal);
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 91287c9d01bb..c47f14fc5e88 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -491,13 +491,6 @@ void rxrpc_release_call(struct rxrpc_call *call)
 		spin_lock_bh(&call->lock);
 		while ((skb = skb_dequeue(&call->rx_queue)) ||
 		       (skb = skb_dequeue(&call->rx_oos_queue))) {
-			sp = rxrpc_skb(skb);
-			if (sp->call) {
-				ASSERTCMP(sp->call, ==, call);
-				rxrpc_put_call(call);
-				sp->call = NULL;
-			}
-			skb->destructor = NULL;
 			spin_unlock_bh(&call->lock);
 
 			_debug("- zap %s %%%u #%u",
@@ -605,6 +598,7 @@ void __rxrpc_put_call(struct rxrpc_call *call)
 
 	if (atomic_dec_and_test(&call->usage)) {
 		_debug("call %d dead", call->debug_id);
+		WARN_ON(atomic_read(&call->skb_count) != 0);
 		ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
 		rxrpc_queue_work(&call->destroyer);
 	}
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 991a20d25093..9e0f58edcd01 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -55,9 +55,6 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb,
 	if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) {
 		_debug("already terminated");
 		ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE);
-		skb->destructor = NULL;
-		sp->call = NULL;
-		rxrpc_put_call(call);
 		rxrpc_free_skb(skb);
 		return 0;
 	}
@@ -111,13 +108,7 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb,
 	ret = 0;
 
 out:
-	/* release the socket buffer */
-	if (skb) {
-		skb->destructor = NULL;
-		sp->call = NULL;
-		rxrpc_put_call(call);
-		rxrpc_free_skb(skb);
-	}
+	rxrpc_free_skb(skb);
 
 	_leave(" = %d", ret);
 	return ret;
@@ -200,6 +191,7 @@ static int rxrpc_fast_process_data(struct rxrpc_call *call,
 
 	sp->call = call;
 	rxrpc_get_call(call);
+	atomic_inc(&call->skb_count);
 	terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) &&
 		    !(sp->hdr.flags & RXRPC_CLIENT_INITIATED));
 	ret = rxrpc_queue_rcv_skb(call, skb, false, terminal);
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index a3fa2ed85d63..9ed66d533002 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -203,6 +203,9 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		}
 
 		/* we transferred the whole data packet */
+		if (!(flags & MSG_PEEK))
+			rxrpc_kernel_data_consumed(call, skb);
+
 		if (sp->hdr.flags & RXRPC_LAST_PACKET) {
 			_debug("last");
 			if (rxrpc_conn_is_client(call->conn)) {
@@ -359,28 +362,6 @@ wait_error:
 
 }
 
-/**
- * rxrpc_kernel_data_delivered - Record delivery of data message
- * @skb: Message holding data
- *
- * Record the delivery of a data message.  This permits RxRPC to keep its
- * tracking correct.  The socket buffer will be deleted.
- */
-void rxrpc_kernel_data_delivered(struct sk_buff *skb)
-{
-	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-	struct rxrpc_call *call = sp->call;
-
-	ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv);
-	ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1);
-	call->rx_data_recv = sp->hdr.seq;
-
-	ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten);
-	rxrpc_free_skb(skb);
-}
-
-EXPORT_SYMBOL(rxrpc_kernel_data_delivered);
-
 /**
  * rxrpc_kernel_is_data_last - Determine if data message is last one
  * @skb: Message holding data
diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c
index eee0cfd9ac8c..06c51d4b622d 100644
--- a/net/rxrpc/skbuff.c
+++ b/net/rxrpc/skbuff.c
@@ -98,11 +98,39 @@ static void rxrpc_hard_ACK_data(struct rxrpc_call *call,
 	spin_unlock_bh(&call->lock);
 }
 
+/**
+ * rxrpc_kernel_data_consumed - Record consumption of data message
+ * @call: The call to which the message pertains.
+ * @skb: Message holding data
+ *
+ * Record the consumption of a data message and generate an ACK if appropriate.
+ * The call state is shifted if this was the final packet.  The caller must be
+ * in process context with no spinlocks held.
+ *
+ * TODO: Actually generate the ACK here rather than punting this to the
+ * workqueue.
+ */
+void rxrpc_kernel_data_consumed(struct rxrpc_call *call, struct sk_buff *skb)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+	_enter("%d,%p{%u}", call->debug_id, skb, sp->hdr.seq);
+
+	ASSERTCMP(sp->call, ==, call);
+	ASSERTCMP(sp->hdr.type, ==, RXRPC_PACKET_TYPE_DATA);
+
+	/* TODO: Fix the sequence number tracking */
+	ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv);
+	ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1);
+	ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten);
+
+	call->rx_data_recv = sp->hdr.seq;
+	rxrpc_hard_ACK_data(call, sp);
+}
+EXPORT_SYMBOL(rxrpc_kernel_data_consumed);
+
 /*
- * destroy a packet that has an RxRPC control buffer
- * - advance the hard-ACK state of the parent call (done here in case something
- *   in the kernel bypasses recvmsg() and steals the packet directly off of the
- *   socket receive queue)
+ * Destroy a packet that has an RxRPC control buffer
  */
 void rxrpc_packet_destructor(struct sk_buff *skb)
 {
@@ -112,9 +140,8 @@ void rxrpc_packet_destructor(struct sk_buff *skb)
 	_enter("%p{%p}", skb, call);
 
 	if (call) {
-		/* send the final ACK on a client call */
-		if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA)
-			rxrpc_hard_ACK_data(call, sp);
+		if (atomic_dec_return(&call->skb_count) < 0)
+			BUG();
 		rxrpc_put_call(call);
 		sp->call = NULL;
 	}
-- 
cgit v1.2.3


From c15c0ab12fd62f2b19181d05c62d24bc9fa55a42 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 12 Aug 2016 07:48:21 +0200
Subject: ipv6: suppress sparse warnings in IP6_ECN_set_ce()

Pass the correct type __wsum to csum_sub() and csum_add(). This doesn't
really change anything since __wsum really *is* __be32, but removes the
address space warnings from sparse.

Cc: Eric Dumazet <edumazet@google.com>
Fixes: 34ae6a1aa054 ("ipv6: update skb->csum when CE mark is propagated")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_ecn.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h
index 0dc0a51da38f..dce2d586d9ce 100644
--- a/include/net/inet_ecn.h
+++ b/include/net/inet_ecn.h
@@ -128,7 +128,8 @@ static inline int IP6_ECN_set_ce(struct sk_buff *skb, struct ipv6hdr *iph)
 	to = from | htonl(INET_ECN_CE << 20);
 	*(__be32 *)iph = to;
 	if (skb->ip_summed == CHECKSUM_COMPLETE)
-		skb->csum = csum_add(csum_sub(skb->csum, from), to);
+		skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from),
+				     (__force __wsum)to);
 	return 1;
 }
 
-- 
cgit v1.2.3


From 3d7b33209201cbfa090d614db993571ca3c6b090 Mon Sep 17 00:00:00 2001
From: Simon Horman <simon.horman@netronome.com>
Date: Mon, 15 Aug 2016 13:06:24 +0200
Subject: gre: set inner_protocol on xmit

Ensure that the inner_protocol is set on transmit so that GSO segmentation,
which relies on that field, works correctly.

This is achieved by setting the inner_protocol in gre_build_header rather
than each caller of that function. It ensures that the inner_protocol is
set when gre_fb_xmit() is used to transmit GRE which was not previously the
case.

I have observed this is not the case when OvS transmits GRE using
lwtunnel metadata (which it always does).

Fixes: 38720352412a ("gre: Use inner_proto to obtain inner header protocol")
Cc: Pravin Shelar <pshelar@ovn.org>
Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/gre.h  | 1 +
 net/ipv4/ip_gre.c  | 1 -
 net/ipv6/ip6_gre.c | 2 --
 3 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/gre.h b/include/net/gre.h
index 7a54a31d1d4c..73ea256eb7d7 100644
--- a/include/net/gre.h
+++ b/include/net/gre.h
@@ -104,6 +104,7 @@ static inline void gre_build_header(struct sk_buff *skb, int hdr_len,
 
 	skb_push(skb, hdr_len);
 
+	skb_set_inner_protocol(skb, proto);
 	skb_reset_transport_header(skb);
 	greh = (struct gre_base_hdr *)skb->data;
 	greh->flags = gre_tnl_flags_to_gre_flags(flags);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 5b1481be0282..113cc43df789 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -370,7 +370,6 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
 			 tunnel->parms.o_flags, proto, tunnel->parms.o_key,
 			 htonl(tunnel->o_seqno));
 
-	skb_set_inner_protocol(skb, proto);
 	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
 }
 
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 776d145113e1..704274cbd495 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -519,8 +519,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 	gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
 			 protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno));
 
-	skb_set_inner_protocol(skb, protocol);
-
 	return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu,
 			    NEXTHDR_GRE);
 }
-- 
cgit v1.2.3


From 0c23c3e705691cfb99c94f2760df2b456fe45194 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Sat, 13 Aug 2016 22:34:58 -0700
Subject: net_sched: fix a typo in tc_for_each_action()

It is harmless because all users pass 'a' to this macro.

Fixes: 00175aec941e ("net/sched: Macro instead of CONFIG_NET_CLS_ACT ifdef")
Cc: Amir Vadai <amir@vadai.me>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 41e6a24a44b9..f53ee9d315c1 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -193,7 +193,7 @@ int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int);
 	(list_empty(&(_exts)->actions))
 
 #define tc_for_each_action(_a, _exts) \
-	list_for_each_entry(a, &(_exts)->actions, list)
+	list_for_each_entry(_a, &(_exts)->actions, list)
 
 #define tc_single_action(_exts) \
 	(list_is_singular(&(_exts)->actions))
-- 
cgit v1.2.3


From 2734437ef3c2943090d0914bf91caa6b30451615 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Sat, 13 Aug 2016 22:34:59 -0700
Subject: net_sched: move tc offload macros to pkt_cls.h

struct tcf_exts belongs to filters, should not be visible
to plain tc actions.

Cc: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h | 19 +++----------------
 include/net/pkt_cls.h | 19 +++++++++++++++++++
 2 files changed, 22 insertions(+), 16 deletions(-)

(limited to 'include/net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index f53ee9d315c1..870332ff61eb 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -189,30 +189,17 @@ int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int);
 
-#define tc_no_actions(_exts) \
-	(list_empty(&(_exts)->actions))
-
-#define tc_for_each_action(_a, _exts) \
-	list_for_each_entry(_a, &(_exts)->actions, list)
-
-#define tc_single_action(_exts) \
-	(list_is_singular(&(_exts)->actions))
+#endif /* CONFIG_NET_CLS_ACT */
 
 static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes,
 					   u64 packets, u64 lastuse)
 {
+#ifdef CONFIG_NET_CLS_ACT
 	if (!a->ops->stats_update)
 		return;
 
 	a->ops->stats_update(a, bytes, packets, lastuse);
+#endif
 }
 
-#else /* CONFIG_NET_CLS_ACT */
-
-#define tc_no_actions(_exts) true
-#define tc_for_each_action(_a, _exts) while ((void)(_a), 0)
-#define tc_single_action(_exts) false
-#define tcf_action_stats_update(a, bytes, packets, lastuse)
-
-#endif /* CONFIG_NET_CLS_ACT */
 #endif
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 6f8d65342d3a..00dd5c4c1d0a 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -130,6 +130,25 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts,
 	return 0;
 }
 
+#ifdef CONFIG_NET_CLS_ACT
+
+#define tc_no_actions(_exts) \
+	(list_empty(&(_exts)->actions))
+
+#define tc_for_each_action(_a, _exts) \
+	list_for_each_entry(_a, &(_exts)->actions, list)
+
+#define tc_single_action(_exts) \
+	(list_is_singular(&(_exts)->actions))
+
+#else /* CONFIG_NET_CLS_ACT */
+
+#define tc_no_actions(_exts) true
+#define tc_for_each_action(_a, _exts) while ((void)(_a), 0)
+#define tc_single_action(_exts) false
+
+#endif /* CONFIG_NET_CLS_ACT */
+
 int tcf_exts_validate(struct net *net, struct tcf_proto *tp,
 		      struct nlattr **tb, struct nlattr *rate_tlv,
 		      struct tcf_exts *exts, bool ovr);
-- 
cgit v1.2.3


From 22dc13c837c33207548c8ee5116b64e2930a6e23 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Sat, 13 Aug 2016 22:35:00 -0700
Subject: net_sched: convert tcf_exts from list to pointer array

As pointed out by Jamal, an action could be shared by
multiple filters, so we can't use list to chain them
any more after we get rid of the original tc_action.
Instead, we could just save pointers to these actions
in tcf_exts, since they are refcount'ed, so convert
the list to an array of pointers.

The "ugly" part is the action API still accepts list
as a parameter, I just introduce a helper function to
convert the array of pointers to a list, instead of
relying on the C99 feature to iterate the array.

Fixes: a85a970af265 ("net_sched: move tc_action into tcf_common")
Reported-by: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   |  4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 12 ++++--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c  |  4 +-
 include/net/act_api.h                           |  4 +-
 include/net/pkt_cls.h                           | 40 ++++++++++++-------
 net/sched/act_api.c                             | 11 +++---
 net/sched/cls_api.c                             | 51 +++++++++++++++++--------
 7 files changed, 85 insertions(+), 41 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index ee57a89252bb..b4f03748adc0 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8396,12 +8396,14 @@ static int parse_tc_actions(struct ixgbe_adapter *adapter,
 			    struct tcf_exts *exts, u64 *action, u8 *queue)
 {
 	const struct tc_action *a;
+	LIST_HEAD(actions);
 	int err;
 
 	if (tc_no_actions(exts))
 		return -EINVAL;
 
-	tc_for_each_action(a, exts) {
+	tcf_exts_to_list(exts, &actions);
+	list_for_each_entry(a, &actions, list) {
 
 		/* Drop action */
 		if (is_tcf_gact_shot(a)) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 0f19b01e3fff..dc8b1cb0fdc8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -318,6 +318,7 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 				u32 *action, u32 *flow_tag)
 {
 	const struct tc_action *a;
+	LIST_HEAD(actions);
 
 	if (tc_no_actions(exts))
 		return -EINVAL;
@@ -325,7 +326,8 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 	*flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
 	*action = 0;
 
-	tc_for_each_action(a, exts) {
+	tcf_exts_to_list(exts, &actions);
+	list_for_each_entry(a, &actions, list) {
 		/* Only support a single action per rule */
 		if (*action)
 			return -EINVAL;
@@ -362,13 +364,15 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 				u32 *action, u32 *dest_vport)
 {
 	const struct tc_action *a;
+	LIST_HEAD(actions);
 
 	if (tc_no_actions(exts))
 		return -EINVAL;
 
 	*action = 0;
 
-	tc_for_each_action(a, exts) {
+	tcf_exts_to_list(exts, &actions);
+	list_for_each_entry(a, &actions, list) {
 		/* Only support a single action per rule */
 		if (*action)
 			return -EINVAL;
@@ -503,6 +507,7 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv,
 	struct mlx5e_tc_flow *flow;
 	struct tc_action *a;
 	struct mlx5_fc *counter;
+	LIST_HEAD(actions);
 	u64 bytes;
 	u64 packets;
 	u64 lastuse;
@@ -518,7 +523,8 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv,
 
 	mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse);
 
-	tc_for_each_action(a, f->exts)
+	tcf_exts_to_list(f->exts, &actions);
+	list_for_each_entry(a, &actions, list)
 		tcf_action_stats_update(a, bytes, packets, lastuse);
 
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 1fe9fbdc9102..1f8168906811 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -1121,6 +1121,7 @@ static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port,
 					  bool ingress)
 {
 	const struct tc_action *a;
+	LIST_HEAD(actions);
 	int err;
 
 	if (!tc_single_action(cls->exts)) {
@@ -1128,7 +1129,8 @@ static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port,
 		return -ENOTSUPP;
 	}
 
-	tc_for_each_action(a, cls->exts) {
+	tcf_exts_to_list(cls->exts, &actions);
+	list_for_each_entry(a, &actions, list) {
 		if (!is_tcf_mirred_mirror(a) || protocol != htons(ETH_P_ALL))
 			return -ENOTSUPP;
 
diff --git a/include/net/act_api.h b/include/net/act_api.h
index 870332ff61eb..82f3c912a5b1 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -176,8 +176,8 @@ int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops);
 int tcf_unregister_action(struct tc_action_ops *a,
 			  struct pernet_operations *ops);
 int tcf_action_destroy(struct list_head *actions, int bind);
-int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions,
-		    struct tcf_result *res);
+int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
+		    int nr_actions, struct tcf_result *res);
 int tcf_action_init(struct net *net, struct nlattr *nla,
 				  struct nlattr *est, char *n, int ovr,
 				  int bind, struct list_head *);
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 00dd5c4c1d0a..c99508d426cc 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -59,7 +59,8 @@ tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r)
 struct tcf_exts {
 #ifdef CONFIG_NET_CLS_ACT
 	__u32	type; /* for backward compat(TCA_OLD_COMPAT) */
-	struct list_head actions;
+	int nr_actions;
+	struct tc_action **actions;
 #endif
 	/* Map to export classifier specific extension TLV types to the
 	 * generic extensions API. Unsupported extensions must be set to 0.
@@ -72,7 +73,10 @@ static inline void tcf_exts_init(struct tcf_exts *exts, int action, int police)
 {
 #ifdef CONFIG_NET_CLS_ACT
 	exts->type = 0;
-	INIT_LIST_HEAD(&exts->actions);
+	exts->nr_actions = 0;
+	exts->actions = kcalloc(TCA_ACT_MAX_PRIO, sizeof(struct tc_action *),
+				GFP_KERNEL);
+	WARN_ON(!exts->actions); /* TODO: propagate the error to callers */
 #endif
 	exts->action = action;
 	exts->police = police;
@@ -89,7 +93,7 @@ static inline int
 tcf_exts_is_predicative(struct tcf_exts *exts)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	return !list_empty(&exts->actions);
+	return exts->nr_actions;
 #else
 	return 0;
 #endif
@@ -108,6 +112,20 @@ tcf_exts_is_available(struct tcf_exts *exts)
 	return tcf_exts_is_predicative(exts);
 }
 
+static inline void tcf_exts_to_list(const struct tcf_exts *exts,
+				    struct list_head *actions)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	int i;
+
+	for (i = 0; i < exts->nr_actions; i++) {
+		struct tc_action *a = exts->actions[i];
+
+		list_add(&a->list, actions);
+	}
+#endif
+}
+
 /**
  * tcf_exts_exec - execute tc filter extensions
  * @skb: socket buffer
@@ -124,27 +142,21 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts,
 	       struct tcf_result *res)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	if (!list_empty(&exts->actions))
-		return tcf_action_exec(skb, &exts->actions, res);
+	if (exts->nr_actions)
+		return tcf_action_exec(skb, exts->actions, exts->nr_actions,
+				       res);
 #endif
 	return 0;
 }
 
 #ifdef CONFIG_NET_CLS_ACT
 
-#define tc_no_actions(_exts) \
-	(list_empty(&(_exts)->actions))
-
-#define tc_for_each_action(_a, _exts) \
-	list_for_each_entry(_a, &(_exts)->actions, list)
-
-#define tc_single_action(_exts) \
-	(list_is_singular(&(_exts)->actions))
+#define tc_no_actions(_exts)  ((_exts)->nr_actions == 0)
+#define tc_single_action(_exts) ((_exts)->nr_actions == 1)
 
 #else /* CONFIG_NET_CLS_ACT */
 
 #define tc_no_actions(_exts) true
-#define tc_for_each_action(_a, _exts) while ((void)(_a), 0)
 #define tc_single_action(_exts) false
 
 #endif /* CONFIG_NET_CLS_ACT */
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index b4c7be38b632..d09d0687594b 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -420,18 +420,19 @@ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind)
 	return res;
 }
 
-int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions,
-		    struct tcf_result *res)
+int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
+		    int nr_actions, struct tcf_result *res)
 {
-	const struct tc_action *a;
-	int ret = -1;
+	int ret = -1, i;
 
 	if (skb->tc_verd & TC_NCLS) {
 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
 		ret = TC_ACT_OK;
 		goto exec_done;
 	}
-	list_for_each_entry(a, actions, list) {
+	for (i = 0; i < nr_actions; i++) {
+		const struct tc_action *a = actions[i];
+
 repeat:
 		ret = a->ops->act(skb, a, res);
 		if (ret == TC_ACT_REPEAT)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 843a716a4303..a7c5645373af 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -541,8 +541,12 @@ out:
 void tcf_exts_destroy(struct tcf_exts *exts)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND);
-	INIT_LIST_HEAD(&exts->actions);
+	LIST_HEAD(actions);
+
+	tcf_exts_to_list(exts, &actions);
+	tcf_action_destroy(&actions, TCA_ACT_UNBIND);
+	kfree(exts->actions);
+	exts->nr_actions = 0;
 #endif
 }
 EXPORT_SYMBOL(tcf_exts_destroy);
@@ -554,7 +558,6 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
 	{
 		struct tc_action *act;
 
-		INIT_LIST_HEAD(&exts->actions);
 		if (exts->police && tb[exts->police]) {
 			act = tcf_action_init_1(net, tb[exts->police], rate_tlv,
 						"police", ovr,
@@ -563,14 +566,20 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
 				return PTR_ERR(act);
 
 			act->type = exts->type = TCA_OLD_COMPAT;
-			list_add(&act->list, &exts->actions);
+			exts->actions[0] = act;
+			exts->nr_actions = 1;
 		} else if (exts->action && tb[exts->action]) {
-			int err;
+			LIST_HEAD(actions);
+			int err, i = 0;
+
 			err = tcf_action_init(net, tb[exts->action], rate_tlv,
 					      NULL, ovr,
-					      TCA_ACT_BIND, &exts->actions);
+					      TCA_ACT_BIND, &actions);
 			if (err)
 				return err;
+			list_for_each_entry(act, &actions, list)
+				exts->actions[i++] = act;
+			exts->nr_actions = i;
 		}
 	}
 #else
@@ -587,37 +596,49 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
 		     struct tcf_exts *src)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	LIST_HEAD(tmp);
+	struct tcf_exts old = *dst;
+
 	tcf_tree_lock(tp);
-	list_splice_init(&dst->actions, &tmp);
-	list_splice(&src->actions, &dst->actions);
+	dst->nr_actions = src->nr_actions;
+	dst->actions = src->actions;
 	dst->type = src->type;
 	tcf_tree_unlock(tp);
-	tcf_action_destroy(&tmp, TCA_ACT_UNBIND);
+
+	tcf_exts_destroy(&old);
 #endif
 }
 EXPORT_SYMBOL(tcf_exts_change);
 
-#define tcf_exts_first_act(ext)					\
-	list_first_entry_or_null(&(exts)->actions,		\
-				 struct tc_action, list)
+#ifdef CONFIG_NET_CLS_ACT
+static struct tc_action *tcf_exts_first_act(struct tcf_exts *exts)
+{
+	if (exts->nr_actions == 0)
+		return NULL;
+	else
+		return exts->actions[0];
+}
+#endif
 
 int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
 {
 #ifdef CONFIG_NET_CLS_ACT
 	struct nlattr *nest;
 
-	if (exts->action && !list_empty(&exts->actions)) {
+	if (exts->action && exts->nr_actions) {
 		/*
 		 * again for backward compatible mode - we want
 		 * to work with both old and new modes of entering
 		 * tc data even if iproute2  was newer - jhs
 		 */
 		if (exts->type != TCA_OLD_COMPAT) {
+			LIST_HEAD(actions);
+
 			nest = nla_nest_start(skb, exts->action);
 			if (nest == NULL)
 				goto nla_put_failure;
-			if (tcf_action_dump(skb, &exts->actions, 0, 0) < 0)
+
+			tcf_exts_to_list(exts, &actions);
+			if (tcf_action_dump(skb, &actions, 0, 0) < 0)
 				goto nla_put_failure;
 			nla_nest_end(skb, nest);
 		} else if (exts->police) {
-- 
cgit v1.2.3


From bb1fceca22492109be12640d49f5ea5a544c6bb4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 17 Aug 2016 05:56:26 -0700
Subject: tcp: fix use after free in tcp_xmit_retransmit_queue()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When tcp_sendmsg() allocates a fresh and empty skb, it puts it at the
tail of the write queue using tcp_add_write_queue_tail()

Then it attempts to copy user data into this fresh skb.

If the copy fails, we undo the work and remove the fresh skb.

Unfortunately, this undo lacks the change done to tp->highest_sack and
we can leave a dangling pointer (to a freed skb)

Later, tcp_xmit_retransmit_queue() can dereference this pointer and
access freed memory. For regular kernels where memory is not unmapped,
this might cause SACK bugs because tcp_highest_sack_seq() is buggy,
returning garbage instead of tp->snd_nxt, but with various debug
features like CONFIG_DEBUG_PAGEALLOC, this can crash the kernel.

This bug was found by Marco Grassi thanks to syzkaller.

Fixes: 6859d49475d4 ("[TCP]: Abstract tp->highest_sack accessing & point to next skb")
Reported-by: Marco Grassi <marco.gra@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index c00e7d51bb18..7717302cab91 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1523,6 +1523,8 @@ static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unli
 {
 	if (sk->sk_send_head == skb_unlinked)
 		sk->sk_send_head = NULL;
+	if (tcp_sk(sk)->highest_sack == skb_unlinked)
+		tcp_sk(sk)->highest_sack = NULL;
 }
 
 static inline void tcp_init_send_head(struct sock *sk)
-- 
cgit v1.2.3


From 89e1f6d2b956649fbe0704d543a90b8e0cf872b0 Mon Sep 17 00:00:00 2001
From: Liping Zhang <liping.zhang@spreadtrum.com>
Date: Mon, 22 Aug 2016 01:02:18 +0800
Subject: netfilter: nft_reject: restrict to INPUT/FORWARD/OUTPUT

After I add the nft rule "nft add rule filter prerouting reject
with tcp reset", kernel panic happened on my system:
  NULL pointer dereference at ...
  IP: [<ffffffff81b9db2f>] nf_send_reset+0xaf/0x400
  Call Trace:
  [<ffffffff81b9da80>] ? nf_reject_ip_tcphdr_get+0x160/0x160
  [<ffffffffa0928061>] nft_reject_ipv4_eval+0x61/0xb0 [nft_reject_ipv4]
  [<ffffffffa08e836a>] nft_do_chain+0x1fa/0x890 [nf_tables]
  [<ffffffffa08e8170>] ? __nft_trace_packet+0x170/0x170 [nf_tables]
  [<ffffffffa06e0900>] ? nf_ct_invert_tuple+0xb0/0xc0 [nf_conntrack]
  [<ffffffffa07224d4>] ? nf_nat_setup_info+0x5d4/0x650 [nf_nat]
  [...]

Because in the PREROUTING chain, routing information is not exist,
then we will dereference the NULL pointer and oops happen.

So we restrict reject expression to INPUT, FORWARD and OUTPUT chain.
This is consistent with iptables REJECT target.

Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nft_reject.h   |  4 ++++
 net/ipv4/netfilter/nft_reject_ipv4.c |  1 +
 net/ipv6/netfilter/nft_reject_ipv6.c |  1 +
 net/netfilter/nft_reject.c           | 16 ++++++++++++++++
 net/netfilter/nft_reject_inet.c      |  7 ++++++-
 5 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h
index 60fa1530006b..02e28c529b29 100644
--- a/include/net/netfilter/nft_reject.h
+++ b/include/net/netfilter/nft_reject.h
@@ -8,6 +8,10 @@ struct nft_reject {
 
 extern const struct nla_policy nft_reject_policy[];
 
+int nft_reject_validate(const struct nft_ctx *ctx,
+			const struct nft_expr *expr,
+			const struct nft_data **data);
+
 int nft_reject_init(const struct nft_ctx *ctx,
 		    const struct nft_expr *expr,
 		    const struct nlattr * const tb[]);
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index c24f41c816b3..2c2553b9026c 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -46,6 +46,7 @@ static const struct nft_expr_ops nft_reject_ipv4_ops = {
 	.eval		= nft_reject_ipv4_eval,
 	.init		= nft_reject_init,
 	.dump		= nft_reject_dump,
+	.validate	= nft_reject_validate,
 };
 
 static struct nft_expr_type nft_reject_ipv4_type __read_mostly = {
diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c
index 533cd5719c59..92bda9908bb9 100644
--- a/net/ipv6/netfilter/nft_reject_ipv6.c
+++ b/net/ipv6/netfilter/nft_reject_ipv6.c
@@ -47,6 +47,7 @@ static const struct nft_expr_ops nft_reject_ipv6_ops = {
 	.eval		= nft_reject_ipv6_eval,
 	.init		= nft_reject_init,
 	.dump		= nft_reject_dump,
+	.validate	= nft_reject_validate,
 };
 
 static struct nft_expr_type nft_reject_ipv6_type __read_mostly = {
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index 0522fc9bfb0a..c64de3f7379d 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -26,11 +26,27 @@ const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = {
 };
 EXPORT_SYMBOL_GPL(nft_reject_policy);
 
+int nft_reject_validate(const struct nft_ctx *ctx,
+			const struct nft_expr *expr,
+			const struct nft_data **data)
+{
+	return nft_chain_validate_hooks(ctx->chain,
+					(1 << NF_INET_LOCAL_IN) |
+					(1 << NF_INET_FORWARD) |
+					(1 << NF_INET_LOCAL_OUT));
+}
+EXPORT_SYMBOL_GPL(nft_reject_validate);
+
 int nft_reject_init(const struct nft_ctx *ctx,
 		    const struct nft_expr *expr,
 		    const struct nlattr * const tb[])
 {
 	struct nft_reject *priv = nft_expr_priv(expr);
+	int err;
+
+	err = nft_reject_validate(ctx, expr, NULL);
+	if (err < 0)
+		return err;
 
 	if (tb[NFTA_REJECT_TYPE] == NULL)
 		return -EINVAL;
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
index 759ca5248a3d..e79d9ca2ffee 100644
--- a/net/netfilter/nft_reject_inet.c
+++ b/net/netfilter/nft_reject_inet.c
@@ -66,7 +66,11 @@ static int nft_reject_inet_init(const struct nft_ctx *ctx,
 				const struct nlattr * const tb[])
 {
 	struct nft_reject *priv = nft_expr_priv(expr);
-	int icmp_code;
+	int icmp_code, err;
+
+	err = nft_reject_validate(ctx, expr, NULL);
+	if (err < 0)
+		return err;
 
 	if (tb[NFTA_REJECT_TYPE] == NULL)
 		return -EINVAL;
@@ -124,6 +128,7 @@ static const struct nft_expr_ops nft_reject_inet_ops = {
 	.eval		= nft_reject_inet_eval,
 	.init		= nft_reject_inet_init,
 	.dump		= nft_reject_inet_dump,
+	.validate	= nft_reject_validate,
 };
 
 static struct nft_expr_type nft_reject_inet_type __read_mostly = {
-- 
cgit v1.2.3


From 960fa72f67f1be6891d63a5518860d1ae4e14b88 Mon Sep 17 00:00:00 2001
From: Liping Zhang <liping.zhang@spreadtrum.com>
Date: Mon, 22 Aug 2016 22:57:56 +0800
Subject: netfilter: nft_meta: improve the validity check of pkttype set expr

"meta pkttype set" is only supported on prerouting chain with bridge
family and ingress chain with netdev family.

But the validate check is incomplete, and the user can add the nft
rules on input chain with bridge family, for example:
  # nft add table bridge filter
  # nft add chain bridge filter input {type filter hook input \
    priority 0 \;}
  # nft add chain bridge filter test
  # nft add rule bridge filter test meta pkttype set unicast
  # nft add rule bridge filter input jump test

This patch fixes the problem.

Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nft_meta.h       |  4 ++++
 net/bridge/netfilter/nft_meta_bridge.c |  1 +
 net/netfilter/nft_meta.c               | 17 +++++++++++++----
 3 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h
index d27588c8dbd9..1139cde0fdc5 100644
--- a/include/net/netfilter/nft_meta.h
+++ b/include/net/netfilter/nft_meta.h
@@ -36,4 +36,8 @@ void nft_meta_set_eval(const struct nft_expr *expr,
 void nft_meta_set_destroy(const struct nft_ctx *ctx,
 			  const struct nft_expr *expr);
 
+int nft_meta_set_validate(const struct nft_ctx *ctx,
+			  const struct nft_expr *expr,
+			  const struct nft_data **data);
+
 #endif
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
index 4b901d9f2e7c..ad47a921b701 100644
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -86,6 +86,7 @@ static const struct nft_expr_ops nft_meta_bridge_set_ops = {
 	.init		= nft_meta_set_init,
 	.destroy	= nft_meta_set_destroy,
 	.dump		= nft_meta_set_dump,
+	.validate	= nft_meta_set_validate,
 };
 
 static const struct nft_expr_ops *
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 2863f3493038..8a6bc7630912 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -291,10 +291,16 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
 }
 EXPORT_SYMBOL_GPL(nft_meta_get_init);
 
-static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx)
+int nft_meta_set_validate(const struct nft_ctx *ctx,
+			  const struct nft_expr *expr,
+			  const struct nft_data **data)
 {
+	struct nft_meta *priv = nft_expr_priv(expr);
 	unsigned int hooks;
 
+	if (priv->key != NFT_META_PKTTYPE)
+		return 0;
+
 	switch (ctx->afi->family) {
 	case NFPROTO_BRIDGE:
 		hooks = 1 << NF_BR_PRE_ROUTING;
@@ -308,6 +314,7 @@ static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx)
 
 	return nft_chain_validate_hooks(ctx->chain, hooks);
 }
+EXPORT_SYMBOL_GPL(nft_meta_set_validate);
 
 int nft_meta_set_init(const struct nft_ctx *ctx,
 		      const struct nft_expr *expr,
@@ -327,15 +334,16 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
 		len = sizeof(u8);
 		break;
 	case NFT_META_PKTTYPE:
-		err = nft_meta_set_init_pkttype(ctx);
-		if (err)
-			return err;
 		len = sizeof(u8);
 		break;
 	default:
 		return -EOPNOTSUPP;
 	}
 
+	err = nft_meta_set_validate(ctx, expr, NULL);
+	if (err < 0)
+		return err;
+
 	priv->sreg = nft_parse_register(tb[NFTA_META_SREG]);
 	err = nft_validate_register_load(priv->sreg, len);
 	if (err < 0)
@@ -407,6 +415,7 @@ static const struct nft_expr_ops nft_meta_set_ops = {
 	.init		= nft_meta_set_init,
 	.destroy	= nft_meta_set_destroy,
 	.dump		= nft_meta_set_dump,
+	.validate	= nft_meta_set_validate,
 };
 
 static const struct nft_expr_ops *
-- 
cgit v1.2.3


From 61aaa0e8c1c15d9e045f0577f046be50f2f571ab Mon Sep 17 00:00:00 2001
From: Linus Lüssing <linus.luessing@c0d3.blue>
Date: Fri, 19 Aug 2016 22:02:48 +0200
Subject: cfg80211: Add stub for cfg80211_get_station()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows modules using this function (currently: batman-adv) to
compile even if cfg80211 is not built at all, thus relaxing
dependencies.

Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 9c23f4d33e06..beb7610d64e9 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1102,6 +1102,7 @@ struct station_info {
 	struct cfg80211_tid_stats pertid[IEEE80211_NUM_TIDS + 1];
 };
 
+#if IS_ENABLED(CONFIG_CFG80211)
 /**
  * cfg80211_get_station - retrieve information about a given station
  * @dev: the device where the station is supposed to be connected to
@@ -1114,6 +1115,14 @@ struct station_info {
  */
 int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr,
 			 struct station_info *sinfo);
+#else
+static inline int cfg80211_get_station(struct net_device *dev,
+				       const u8 *mac_addr,
+				       struct station_info *sinfo)
+{
+	return -ENOENT;
+}
+#endif
 
 /**
  * enum monitor_flags - monitor flags
-- 
cgit v1.2.3


From 6e1ce3c3451291142a57c4f3f6f999a29fb5b3bc Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 1 Sep 2016 14:43:53 -0700
Subject: af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock'

Right now we use the 'readlock' both for protecting some of the af_unix
IO path and for making the bind be single-threaded.

The two are independent, but using the same lock makes for a nasty
deadlock due to ordering with regards to filesystem locking.  The bind
locking would want to nest outside the VSF pathname locking, but the IO
locking wants to nest inside some of those same locks.

We tried to fix this earlier with commit c845acb324aa ("af_unix: Fix
splice-bind deadlock") which moved the readlock inside the vfs locks,
but that caused problems with overlayfs that will then call back into
filesystem routines that take the lock in the wrong order anyway.

Splitting the locks means that we can go back to having the bind lock be
the outermost lock, and we don't have any deadlocks with lock ordering.

Acked-by: Rainer Weikusat <rweikusat@cyberadapt.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/af_unix.h |  2 +-
 net/unix/af_unix.c    | 45 +++++++++++++++++++++++----------------------
 2 files changed, 24 insertions(+), 23 deletions(-)

(limited to 'include/net')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 9b4c418bebd8..fd60eccb59a6 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -52,7 +52,7 @@ struct unix_sock {
 	struct sock		sk;
 	struct unix_address     *addr;
 	struct path		path;
-	struct mutex		readlock;
+	struct mutex		iolock, bindlock;
 	struct sock		*peer;
 	struct list_head	link;
 	atomic_long_t		inflight;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 433ae1bbef97..8309687a56b0 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -661,11 +661,11 @@ static int unix_set_peek_off(struct sock *sk, int val)
 {
 	struct unix_sock *u = unix_sk(sk);
 
-	if (mutex_lock_interruptible(&u->readlock))
+	if (mutex_lock_interruptible(&u->iolock))
 		return -EINTR;
 
 	sk->sk_peek_off = val;
-	mutex_unlock(&u->readlock);
+	mutex_unlock(&u->iolock);
 
 	return 0;
 }
@@ -779,7 +779,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 	spin_lock_init(&u->lock);
 	atomic_long_set(&u->inflight, 0);
 	INIT_LIST_HEAD(&u->link);
-	mutex_init(&u->readlock); /* single task reading lock */
+	mutex_init(&u->iolock); /* single task reading lock */
+	mutex_init(&u->bindlock); /* single task binding lock */
 	init_waitqueue_head(&u->peer_wait);
 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
 	unix_insert_socket(unix_sockets_unbound(sk), sk);
@@ -848,7 +849,7 @@ static int unix_autobind(struct socket *sock)
 	int err;
 	unsigned int retries = 0;
 
-	err = mutex_lock_interruptible(&u->readlock);
+	err = mutex_lock_interruptible(&u->bindlock);
 	if (err)
 		return err;
 
@@ -895,7 +896,7 @@ retry:
 	spin_unlock(&unix_table_lock);
 	err = 0;
 
-out:	mutex_unlock(&u->readlock);
+out:	mutex_unlock(&u->bindlock);
 	return err;
 }
 
@@ -1009,7 +1010,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		goto out;
 	addr_len = err;
 
-	err = mutex_lock_interruptible(&u->readlock);
+	err = mutex_lock_interruptible(&u->bindlock);
 	if (err)
 		goto out;
 
@@ -1063,7 +1064,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 out_unlock:
 	spin_unlock(&unix_table_lock);
 out_up:
-	mutex_unlock(&u->readlock);
+	mutex_unlock(&u->bindlock);
 out:
 	return err;
 }
@@ -1955,17 +1956,17 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
 	if (false) {
 alloc_skb:
 		unix_state_unlock(other);
-		mutex_unlock(&unix_sk(other)->readlock);
+		mutex_unlock(&unix_sk(other)->iolock);
 		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
 					      &err, 0);
 		if (!newskb)
 			goto err;
 	}
 
-	/* we must acquire readlock as we modify already present
+	/* we must acquire iolock as we modify already present
 	 * skbs in the sk_receive_queue and mess with skb->len
 	 */
-	err = mutex_lock_interruptible(&unix_sk(other)->readlock);
+	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
 	if (err) {
 		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
 		goto err;
@@ -2032,7 +2033,7 @@ alloc_skb:
 	}
 
 	unix_state_unlock(other);
-	mutex_unlock(&unix_sk(other)->readlock);
+	mutex_unlock(&unix_sk(other)->iolock);
 
 	other->sk_data_ready(other);
 	scm_destroy(&scm);
@@ -2041,7 +2042,7 @@ alloc_skb:
 err_state_unlock:
 	unix_state_unlock(other);
 err_unlock:
-	mutex_unlock(&unix_sk(other)->readlock);
+	mutex_unlock(&unix_sk(other)->iolock);
 err:
 	kfree_skb(newskb);
 	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
@@ -2109,7 +2110,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 
 	do {
-		mutex_lock(&u->readlock);
+		mutex_lock(&u->iolock);
 
 		skip = sk_peek_offset(sk, flags);
 		skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err,
@@ -2117,14 +2118,14 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 		if (skb)
 			break;
 
-		mutex_unlock(&u->readlock);
+		mutex_unlock(&u->iolock);
 
 		if (err != -EAGAIN)
 			break;
 	} while (timeo &&
 		 !__skb_wait_for_more_packets(sk, &err, &timeo, last));
 
-	if (!skb) { /* implies readlock unlocked */
+	if (!skb) { /* implies iolock unlocked */
 		unix_state_lock(sk);
 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
@@ -2189,7 +2190,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 
 out_free:
 	skb_free_datagram(sk, skb);
-	mutex_unlock(&u->readlock);
+	mutex_unlock(&u->iolock);
 out:
 	return err;
 }
@@ -2284,7 +2285,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state)
 	/* Lock the socket to prevent queue disordering
 	 * while sleeps in memcpy_tomsg
 	 */
-	mutex_lock(&u->readlock);
+	mutex_lock(&u->iolock);
 
 	if (flags & MSG_PEEK)
 		skip = sk_peek_offset(sk, flags);
@@ -2326,7 +2327,7 @@ again:
 				break;
 			}
 
-			mutex_unlock(&u->readlock);
+			mutex_unlock(&u->iolock);
 
 			timeo = unix_stream_data_wait(sk, timeo, last,
 						      last_len);
@@ -2337,7 +2338,7 @@ again:
 				goto out;
 			}
 
-			mutex_lock(&u->readlock);
+			mutex_lock(&u->iolock);
 			goto redo;
 unlock:
 			unix_state_unlock(sk);
@@ -2440,7 +2441,7 @@ unlock:
 		}
 	} while (size);
 
-	mutex_unlock(&u->readlock);
+	mutex_unlock(&u->iolock);
 	if (state->msg)
 		scm_recv(sock, state->msg, &scm, flags);
 	else
@@ -2481,9 +2482,9 @@ static ssize_t skb_unix_socket_splice(struct sock *sk,
 	int ret;
 	struct unix_sock *u = unix_sk(sk);
 
-	mutex_unlock(&u->readlock);
+	mutex_unlock(&u->iolock);
 	ret = splice_to_pipe(pipe, spd);
-	mutex_lock(&u->readlock);
+	mutex_lock(&u->iolock);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 5a56a0b3a45dd0cc5b2f7bec6afd053a474ed9f5 Mon Sep 17 00:00:00 2001
From: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz>
Date: Mon, 5 Sep 2016 10:20:20 +1200
Subject: net: Don't delete routes in different VRFs

When deleting an IP address from an interface, there is a clean-up of
routes which refer to this local address. However, there was no check to
see that the VRF matched. This meant that deletion wasn't confined to
the VRF it should have been.

To solve this, a new field has been added to fib_info to hold a table
id. When removing fib entries corresponding to a local ip address, this
table id is also used in the comparison.

The table id is populated when the fib_info is created. This was already
done in some places, but not in ip_rt_ioctl(). This has now been fixed.

Fixes: 021dd3b8a142 ("net: Add routes to the table associated with the device")
Acked-by: David Ahern <dsa@cumulusnetworks.com>
Tested-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     | 3 ++-
 net/ipv4/fib_frontend.c  | 3 ++-
 net/ipv4/fib_semantics.c | 8 ++++++--
 3 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 4079fc18ffe4..7d4a72e75f33 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -111,6 +111,7 @@ struct fib_info {
 	unsigned char		fib_scope;
 	unsigned char		fib_type;
 	__be32			fib_prefsrc;
+	u32			fib_tb_id;
 	u32			fib_priority;
 	u32			*fib_metrics;
 #define fib_mtu fib_metrics[RTAX_MTU-1]
@@ -319,7 +320,7 @@ void fib_flush_external(struct net *net);
 /* Exported by fib_semantics.c */
 int ip_fib_check_default(__be32 gw, struct net_device *dev);
 int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
-int fib_sync_down_addr(struct net *net, __be32 local);
+int fib_sync_down_addr(struct net_device *dev, __be32 local);
 int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
 
 extern u32 fib_multipath_secret __read_mostly;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index ef2ebeb89d0f..1b25daf8c7f1 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -509,6 +509,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
 		if (!dev)
 			return -ENODEV;
 		cfg->fc_oif = dev->ifindex;
+		cfg->fc_table = l3mdev_fib_table(dev);
 		if (colon) {
 			struct in_ifaddr *ifa;
 			struct in_device *in_dev = __in_dev_get_rtnl(dev);
@@ -1027,7 +1028,7 @@ no_promotions:
 			 * First of all, we scan fib_info list searching
 			 * for stray nexthop entries, then ignite fib_flush.
 			 */
-			if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
+			if (fib_sync_down_addr(dev, ifa->ifa_local))
 				fib_flush(dev_net(dev));
 		}
 	}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 539fa264e67d..e9f56225e53f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1057,6 +1057,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	fi->fib_priority = cfg->fc_priority;
 	fi->fib_prefsrc = cfg->fc_prefsrc;
 	fi->fib_type = cfg->fc_type;
+	fi->fib_tb_id = cfg->fc_table;
 
 	fi->fib_nhs = nhs;
 	change_nexthops(fi) {
@@ -1337,18 +1338,21 @@ nla_put_failure:
  *   referring to it.
  * - device went down -> we must shutdown all nexthops going via it.
  */
-int fib_sync_down_addr(struct net *net, __be32 local)
+int fib_sync_down_addr(struct net_device *dev, __be32 local)
 {
 	int ret = 0;
 	unsigned int hash = fib_laddr_hashfn(local);
 	struct hlist_head *head = &fib_info_laddrhash[hash];
+	struct net *net = dev_net(dev);
+	int tb_id = l3mdev_fib_table(dev);
 	struct fib_info *fi;
 
 	if (!fib_info_laddrhash || local == 0)
 		return 0;
 
 	hlist_for_each_entry(fi, head, fib_lhash) {
-		if (!net_eq(fi->fib_net, net))
+		if (!net_eq(fi->fib_net, net) ||
+		    fi->fib_tb_id != tb_id)
 			continue;
 		if (fi->fib_prefsrc == local) {
 			fi->fib_flags |= RTNH_F_DEAD;
-- 
cgit v1.2.3


From 4440a2ab3b9f40dddbe006331ef0659c76859296 Mon Sep 17 00:00:00 2001
From: Gao Feng <fgao@ikuai8.com>
Date: Tue, 13 Sep 2016 08:49:18 +0800
Subject: netfilter: synproxy: Check oom when adding synproxy and seqadj ct
 extensions

When memory is exhausted, nfct_seqadj_ext_add may fail to add the
synproxy and seqadj extensions. The function nf_ct_seqadj_init doesn't
check if get valid seqadj pointer by the nfct_seqadj.

Now drop the packet directly when fail to add seqadj extension to
avoid dereference NULL pointer in nf_ct_seqadj_init from
init_conntrack().

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_synproxy.h | 14 ++++++++++++++
 net/netfilter/nf_conntrack_core.c             |  6 +++---
 net/netfilter/nf_nat_core.c                   |  3 ++-
 3 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h
index 6793614e6502..e6937318546c 100644
--- a/include/net/netfilter/nf_conntrack_synproxy.h
+++ b/include/net/netfilter/nf_conntrack_synproxy.h
@@ -27,6 +27,20 @@ static inline struct nf_conn_synproxy *nfct_synproxy_ext_add(struct nf_conn *ct)
 #endif
 }
 
+static inline bool nf_ct_add_synproxy(struct nf_conn *ct,
+				      const struct nf_conn *tmpl)
+{
+	if (tmpl && nfct_synproxy(tmpl)) {
+		if (!nfct_seqadj_ext_add(ct))
+			return false;
+
+		if (!nfct_synproxy_ext_add(ct))
+			return false;
+	}
+
+	return true;
+}
+
 struct synproxy_stats {
 	unsigned int			syn_received;
 	unsigned int			cookie_invalid;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index dd2c43abf9e2..9934b0c93c1e 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1035,9 +1035,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 	if (IS_ERR(ct))
 		return (struct nf_conntrack_tuple_hash *)ct;
 
-	if (tmpl && nfct_synproxy(tmpl)) {
-		nfct_seqadj_ext_add(ct);
-		nfct_synproxy_ext_add(ct);
+	if (!nf_ct_add_synproxy(ct, tmpl)) {
+		nf_conntrack_free(ct);
+		return ERR_PTR(-ENOMEM);
 	}
 
 	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 19c081e1b328..ecee105bbada 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -441,7 +441,8 @@ nf_nat_setup_info(struct nf_conn *ct,
 			ct->status |= IPS_DST_NAT;
 
 		if (nfct_help(ct))
-			nfct_seqadj_ext_add(ct);
+			if (!nfct_seqadj_ext_add(ct))
+				return NF_DROP;
 	}
 
 	if (maniptype == NF_NAT_MANIP_SRC) {
-- 
cgit v1.2.3


From 20c64d5cd5a2bdcdc8982a06cb05e5e1bd851a3d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 15 Sep 2016 08:48:46 -0700
Subject: net: avoid sk_forward_alloc overflows

A malicious TCP receiver, sending SACK, can force the sender to split
skbs in write queue and increase its memory usage.

Then, when socket is closed and its write queue purged, we might
overflow sk_forward_alloc (It becomes negative)

sk_mem_reclaim() does nothing in this case, and more than 2GB
are leaked from TCP perspective (tcp_memory_allocated is not changed)

Then warnings trigger from inet_sock_destruct() and
sk_stream_kill_queues() seeing a not zero sk_forward_alloc

All TCP stack can be stuck because TCP is under memory pressure.

A simple fix is to preemptively reclaim from sk_mem_uncharge().

This makes sure a socket wont have more than 2 MB forward allocated,
after burst and idle period.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index ff5be7e8ddea..8741988e6880 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1332,6 +1332,16 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
 	if (!sk_has_account(sk))
 		return;
 	sk->sk_forward_alloc += size;
+
+	/* Avoid a possible overflow.
+	 * TCP send queues can make this happen, if sk_mem_reclaim()
+	 * is not called and more than 2 GBytes are released at once.
+	 *
+	 * If we reach 2 MBytes, reclaim 1 MBytes right now, there is
+	 * no need to hold that much forward allocation anyway.
+	 */
+	if (unlikely(sk->sk_forward_alloc >= 1 << 21))
+		__sk_mem_reclaim(sk, 1 << 20);
 }
 
 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
-- 
cgit v1.2.3


From 4496195ddd75c4ad57b783739414e69b7d79843e Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 15 Sep 2016 15:02:38 -0300
Subject: sctp: fix SSN comparision

This function actually operates on u32 yet its paramteres were declared
as u16, causing integer truncation upon calling.

Note in patch context that ADDIP_SERIAL_SIGN_BIT is already 32 bits.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index efc01743b9d6..bafe2a0ab908 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -382,7 +382,7 @@ enum {
 	ADDIP_SERIAL_SIGN_BIT = (1<<31)
 };
 
-static inline int ADDIP_SERIAL_gte(__u16 s, __u16 t)
+static inline int ADDIP_SERIAL_gte(__u32 s, __u32 t)
 {
 	return ((s) == (t)) || (((t) - (s)) & ADDIP_SERIAL_SIGN_BIT);
 }
-- 
cgit v1.2.3


From 63c43787d35e45562a6b5927e2edc8f4783d95b8 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 19 Sep 2016 16:17:57 +0200
Subject: vti6: fix input path

Since commit 1625f4529957, vti6 is broken, all input packets are dropped
(LINUX_MIB_XFRMINNOSTATES is incremented).

XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 is set by vti6_rcv() before calling
xfrm6_rcv()/xfrm6_rcv_spi(), thus we cannot set to NULL that value in
xfrm6_rcv_spi().

A new function xfrm6_rcv_tnl() that enables to pass a value to
xfrm6_rcv_spi() is added, so that xfrm6_rcv() is not touched (this function
is used in several handlers).

CC: Alexey Kodanev <alexey.kodanev@oracle.com>
Fixes: 1625f4529957 ("net/xfrm_input: fix possible NULL deref of tunnel.ip6->parms.i_key")
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h      |  4 +++-
 net/ipv6/ip6_vti.c      |  4 +---
 net/ipv6/xfrm6_input.c  | 16 +++++++++++-----
 net/ipv6/xfrm6_tunnel.c |  2 +-
 4 files changed, 16 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index adfebd6f243c..17934312eecb 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1540,8 +1540,10 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family);
 void xfrm4_local_error(struct sk_buff *skb, u32 mtu);
 int xfrm6_extract_header(struct sk_buff *skb);
 int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb);
-int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi);
+int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi,
+		  struct ip6_tnl *t);
 int xfrm6_transport_finish(struct sk_buff *skb, int async);
+int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t);
 int xfrm6_rcv(struct sk_buff *skb);
 int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
 		     xfrm_address_t *saddr, u8 proto);
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 52a2f735881f..5bd3afdcc771 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -321,11 +321,9 @@ static int vti6_rcv(struct sk_buff *skb)
 			goto discard;
 		}
 
-		XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t;
-
 		rcu_read_unlock();
 
-		return xfrm6_rcv(skb);
+		return xfrm6_rcv_tnl(skb, t);
 	}
 	rcu_read_unlock();
 	return -EINVAL;
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 00a2d40677d6..b5789562aded 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -21,9 +21,10 @@ int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb)
 	return xfrm6_extract_header(skb);
 }
 
-int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi)
+int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi,
+		  struct ip6_tnl *t)
 {
-	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t;
 	XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
 	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
 	return xfrm_input(skb, nexthdr, spi, 0);
@@ -49,13 +50,18 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
 	return -1;
 }
 
-int xfrm6_rcv(struct sk_buff *skb)
+int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t)
 {
 	return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff],
-			     0);
+			     0, t);
 }
-EXPORT_SYMBOL(xfrm6_rcv);
+EXPORT_SYMBOL(xfrm6_rcv_tnl);
 
+int xfrm6_rcv(struct sk_buff *skb)
+{
+	return xfrm6_rcv_tnl(skb, NULL);
+}
+EXPORT_SYMBOL(xfrm6_rcv);
 int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
 		     xfrm_address_t *saddr, u8 proto)
 {
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 5743044cd660..e1c0bbe7996c 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -236,7 +236,7 @@ static int xfrm6_tunnel_rcv(struct sk_buff *skb)
 	__be32 spi;
 
 	spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr);
-	return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi);
+	return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL);
 }
 
 static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-- 
cgit v1.2.3


From 73dca124cdbad2d67d47d6196c08325f18447d07 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 29 Sep 2016 02:37:26 +0800
Subject: sctp: move sent_count to the memory hole in sctp_chunk

Now pahole sctp_chunk, it has 2 memory holes:
   struct sctp_chunk {
	struct list_head           list;
	atomic_t                   refcnt;
	/* XXX 4 bytes hole, try to pack */
	...
	long unsigned int          prsctp_param;
	int                        sent_count;
	/* XXX 4 bytes hole, try to pack */

This patch is to move up sent_count to fill the 1st one and eliminate
the 2nd one.

It's not just another struct compaction, it also fixes the "netperf-
Throughput_Mbps -37.2% regression" issue when overloading the CPU.

Fixes: a6c2f792873a ("sctp: implement prsctp TTL policy")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index ce93c4b10d26..4f097f538fff 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -554,6 +554,9 @@ struct sctp_chunk {
 
 	atomic_t refcnt;
 
+	/* How many times this chunk have been sent, for prsctp RTX policy */
+	int sent_count;
+
 	/* This is our link to the per-transport transmitted list.  */
 	struct list_head transmitted_list;
 
@@ -610,9 +613,6 @@ struct sctp_chunk {
 	 */
 	unsigned long prsctp_param;
 
-	/* How many times this chunk have been sent, for prsctp RTX policy */
-	int sent_count;
-
 	/* Which association does this belong to?  */
 	struct sctp_association *asoc;
 
-- 
cgit v1.2.3


From 0605483f6ace1f6b63e397c819a115ddcd13af0d Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 29 Sep 2016 02:37:27 +0800
Subject: sctp: remove prsctp_param from sctp_chunk

Now sctp uses chunk->prsctp_param to save the prsctp param for all the
prsctp polices, we didn't need to introduce prsctp_param to sctp_chunk.
We can just use chunk->sinfo.sinfo_timetolive for RTX and BUF polices,
and reuse msg->expires_at for TTL policy, as the prsctp polices and old
expires policy are mutual exclusive.

This patch is to remove prsctp_param from sctp_chunk, and reuse msg's
expires_at for TTL and chunk's sinfo.sinfo_timetolive for RTX and BUF
polices.

Note that sctp can't use chunk's sinfo.sinfo_timetolive for TTL policy,
as it needs a u64 variables to save the expires_at time.

This one also fixes the "netperf-Throughput_Mbps -37.2% regression"
issue.

Fixes: a6c2f792873a ("sctp: implement prsctp TTL policy")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  7 -------
 net/sctp/chunk.c           |  9 +++++++--
 net/sctp/outqueue.c        |  4 ++--
 net/sctp/sm_make_chunk.c   | 15 ---------------
 4 files changed, 9 insertions(+), 26 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 4f097f538fff..ced0df374e60 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -606,13 +606,6 @@ struct sctp_chunk {
 	/* This needs to be recoverable for SCTP_SEND_FAILED events. */
 	struct sctp_sndrcvinfo sinfo;
 
-	/* We use this field to record param for prsctp policies,
-	 * for TTL policy, it is the time_to_drop of this chunk,
-	 * for RTX policy, it is the max_sent_count of this chunk,
-	 * for PRIO policy, it is the priority of this chunk.
-	 */
-	unsigned long prsctp_param;
-
 	/* Which association does this belong to?  */
 	struct sctp_association *asoc;
 
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index a55e54738b81..14990e21cf55 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -179,6 +179,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 			 msg, msg->expires_at, jiffies);
 	}
 
+	if (asoc->prsctp_enable &&
+	    SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags))
+		msg->expires_at =
+			jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive);
+
 	/* This is the biggest possible DATA chunk that can fit into
 	 * the packet
 	 */
@@ -349,14 +354,14 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
 	}
 
 	if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) &&
-	    time_after(jiffies, chunk->prsctp_param)) {
+	    time_after(jiffies, chunk->msg->expires_at)) {
 		if (chunk->sent_count)
 			chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++;
 		else
 			chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++;
 		return 1;
 	} else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) &&
-		   chunk->sent_count > chunk->prsctp_param) {
+		   chunk->sent_count > chunk->sinfo.sinfo_timetolive) {
 		chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
 		return 1;
 	}
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 72e54a416af6..e084f35d40d2 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -383,7 +383,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
 
 	list_for_each_entry_safe(chk, temp, queue, transmitted_list) {
 		if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
-		    chk->prsctp_param <= sinfo->sinfo_timetolive)
+		    chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
 			continue;
 
 		list_del_init(&chk->transmitted_list);
@@ -418,7 +418,7 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
 
 	list_for_each_entry_safe(chk, temp, queue, list) {
 		if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
-		    chk->prsctp_param <= sinfo->sinfo_timetolive)
+		    chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
 			continue;
 
 		list_del_init(&chk->list);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 8c77b87a8565..46ffecc57214 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -706,20 +706,6 @@ nodata:
 	return retval;
 }
 
-static void sctp_set_prsctp_policy(struct sctp_chunk *chunk,
-				   const struct sctp_sndrcvinfo *sinfo)
-{
-	if (!chunk->asoc->prsctp_enable)
-		return;
-
-	if (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags))
-		chunk->prsctp_param =
-			jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive);
-	else if (SCTP_PR_RTX_ENABLED(sinfo->sinfo_flags) ||
-		 SCTP_PR_PRIO_ENABLED(sinfo->sinfo_flags))
-		chunk->prsctp_param = sinfo->sinfo_timetolive;
-}
-
 /* Make a DATA chunk for the given association from the provided
  * parameters.  However, do not populate the data payload.
  */
@@ -753,7 +739,6 @@ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
 
 	retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp);
 	memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo));
-	sctp_set_prsctp_policy(retval, sinfo);
 
 nodata:
 	return retval;
-- 
cgit v1.2.3