From 119363c7dc2bcc0c33c255a7b4979c8c0fdc1896 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Mon, 22 Apr 2013 16:29:30 +0200
Subject: cfg80211: add support for per-chain signal strength reporting

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index d1e48b5e348f..cbf1e228650f 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1991,6 +1991,10 @@ enum nl80211_sta_bss_param {
  * @NL80211_STA_INFO_PEER_PM: peer mesh STA link-specific power mode
  * @NL80211_STA_INFO_NONPEER_PM: neighbor mesh STA power save mode towards
  *	non-peer STA
+ * @NL80211_STA_INFO_CHAIN_SIGNAL: per-chain signal strength of last PPDU
+ *	Contains a nested array of signal strength attributes (u8, dBm)
+ * @NL80211_STA_INFO_CHAIN_SIGNAL_AVG: per-chain signal strength average
+ *	Same format as NL80211_STA_INFO_CHAIN_SIGNAL.
  * @__NL80211_STA_INFO_AFTER_LAST: internal
  * @NL80211_STA_INFO_MAX: highest possible station info attribute
  */
@@ -2020,6 +2024,8 @@ enum nl80211_sta_info {
 	NL80211_STA_INFO_NONPEER_PM,
 	NL80211_STA_INFO_RX_BYTES64,
 	NL80211_STA_INFO_TX_BYTES64,
+	NL80211_STA_INFO_CHAIN_SIGNAL,
+	NL80211_STA_INFO_CHAIN_SIGNAL_AVG,
 
 	/* keep last */
 	__NL80211_STA_INFO_AFTER_LAST,
-- 
cgit v1.2.3


From fb4e156886ce6e8309e912d8b370d192330d19d3 Mon Sep 17 00:00:00 2001
From: Marcel Holtmann <marcel@holtmann.org>
Date: Sun, 28 Apr 2013 16:22:06 -0700
Subject: nl80211: Add generic netlink module alias for cfg80211/nl80211

To support auto-loading of wireless modules from netlink users, add module
alias for nl80211 family.

This also adds NL80211_GENL_NAME constant to define the "nl80211" netlink
family name as part of uapi.

Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 2 ++
 net/wireless/core.c          | 1 +
 net/wireless/nl80211.c       | 8 ++++----
 3 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index cbf1e228650f..b48430769eda 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -27,6 +27,8 @@
 
 #include <linux/types.h>
 
+#define NL80211_GENL_NAME "nl80211"
+
 /**
  * DOC: Station handling
  *
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 84c9ad7e1dca..68f0c96c0565 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -34,6 +34,7 @@
 MODULE_AUTHOR("Johannes Berg");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("wireless configuration support");
+MODULE_ALIAS_GENL_FAMILY(NL80211_GENL_NAME);
 
 /* RCU-protected (and cfg80211_mutex for writers) */
 LIST_HEAD(cfg80211_rdev_list);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f687a8d0d026..9cdcd9ec3317 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -37,10 +37,10 @@ static void nl80211_post_doit(struct genl_ops *ops, struct sk_buff *skb,
 
 /* the netlink family */
 static struct genl_family nl80211_fam = {
-	.id = GENL_ID_GENERATE,	/* don't bother with a hardcoded ID */
-	.name = "nl80211",	/* have users key off the name instead */
-	.hdrsize = 0,		/* no private header */
-	.version = 1,		/* no particular meaning now */
+	.id = GENL_ID_GENERATE,		/* don't bother with a hardcoded ID */
+	.name = NL80211_GENL_NAME,	/* have users key off the name instead */
+	.hdrsize = 0,			/* no private header */
+	.version = 1,			/* no particular meaning now */
 	.maxattr = NL80211_ATTR_MAX,
 	.netnsok = true,
 	.pre_doit = nl80211_pre_doit,
-- 
cgit v1.2.3


From 6e16d90b5218307db805e6b3e0b06d3946eb8c4c Mon Sep 17 00:00:00 2001
From: Colleen Twitty <colleen@cozybit.com>
Date: Wed, 8 May 2013 11:45:59 -0700
Subject: cfg80211: Userspace may inform kernel of mesh auth method.

Authentication takes place in userspace, but the beacon is
generated in the kernel.  Allow userspace to inform the
kernel of the authentication method so the appropriate
mesh config IE can be set prior to beacon generation when
joining the MBSS.

Signed-off-by: Colleen Twitty <colleen@cozybit.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 2 ++
 include/uapi/linux/nl80211.h | 5 +++++
 net/wireless/mesh.c          | 1 +
 net/wireless/nl80211.c       | 8 ++++++++
 4 files changed, 16 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 26e91138a2c6..32a2f1b20861 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1161,6 +1161,7 @@ struct mesh_config {
  * @sync_method: which synchronization method to use
  * @path_sel_proto: which path selection protocol to use
  * @path_metric: which metric to use
+ * @auth_id: which authentication method this mesh is using
  * @ie: vendor information elements (optional)
  * @ie_len: length of vendor information elements
  * @is_authenticated: this mesh requires authentication
@@ -1179,6 +1180,7 @@ struct mesh_setup {
 	u8 sync_method;
 	u8 path_sel_proto;
 	u8 path_metric;
+	u8 auth_id;
 	const u8 *ie;
 	u8 ie_len;
 	bool is_authenticated;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index b48430769eda..06320713e9c9 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2645,6 +2645,10 @@ enum nl80211_meshconf_params {
  * @NL80211_MESH_SETUP_USERSPACE_MPM: Enable this option if userspace will
  *	implement an MPM which handles peer allocation and state.
  *
+ * @NL80211_MESH_SETUP_AUTH_PROTOCOL: Inform the kernel of the authentication
+ *	method (u8, as defined in IEEE 8.4.2.100.6, e.g. 0x1 for SAE).
+ *	Default is no authentication method required.
+ *
  * @NL80211_MESH_SETUP_ATTR_MAX: highest possible mesh setup attribute number
  *
  * @__NL80211_MESH_SETUP_ATTR_AFTER_LAST: Internal use
@@ -2658,6 +2662,7 @@ enum nl80211_mesh_setup_params {
 	NL80211_MESH_SETUP_USERSPACE_AMPE,
 	NL80211_MESH_SETUP_ENABLE_VENDOR_SYNC,
 	NL80211_MESH_SETUP_USERSPACE_MPM,
+	NL80211_MESH_SETUP_AUTH_PROTOCOL,
 
 	/* keep last */
 	__NL80211_MESH_SETUP_ATTR_AFTER_LAST,
diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c
index 0bb93f3061a4..9546ad210550 100644
--- a/net/wireless/mesh.c
+++ b/net/wireless/mesh.c
@@ -82,6 +82,7 @@ const struct mesh_setup default_mesh_setup = {
 	.sync_method = IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET,
 	.path_sel_proto = IEEE80211_PATH_PROTOCOL_HWMP,
 	.path_metric = IEEE80211_PATH_METRIC_AIRTIME,
+	.auth_id = 0, /* open */
 	.ie = NULL,
 	.ie_len = 0,
 	.is_secure = false,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 9cdcd9ec3317..5f10f7acfa06 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4672,6 +4672,7 @@ static const struct nla_policy
 	[NL80211_MESH_SETUP_ENABLE_VENDOR_PATH_SEL] = { .type = NLA_U8 },
 	[NL80211_MESH_SETUP_ENABLE_VENDOR_METRIC] = { .type = NLA_U8 },
 	[NL80211_MESH_SETUP_USERSPACE_AUTH] = { .type = NLA_FLAG },
+	[NL80211_MESH_SETUP_AUTH_PROTOCOL] = { .type = NLA_U8 },
 	[NL80211_MESH_SETUP_USERSPACE_MPM] = { .type = NLA_FLAG },
 	[NL80211_MESH_SETUP_IE] = { .type = NLA_BINARY,
 				    .len = IEEE80211_MAX_DATA_LEN },
@@ -4857,6 +4858,13 @@ static int nl80211_parse_mesh_setup(struct genl_info *info,
 	if (setup->is_secure)
 		setup->user_mpm = true;
 
+	if (tb[NL80211_MESH_SETUP_AUTH_PROTOCOL]) {
+		if (!setup->user_mpm)
+			return -EINVAL;
+		setup->auth_id =
+			nla_get_u8(tb[NL80211_MESH_SETUP_AUTH_PROTOCOL]);
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 8b21ab140ec581caf3340a51cdbe91a76c36d9d8 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 20 May 2013 10:27:00 +0930
Subject: virtio_pci: better macro exported in uapi

Macro VIRTIO_PCI_CONFIG assumes that userspace actually has a structure
with a field named msix_enabled.  Add VIRTIO_PCI_CONFIG_OFF that gets
the msix_enabled by value instead, to make it useful for userspace.  We
still keep VIRTIO_PCI_CONFIG around for now, in case some userspace uses
it.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/uapi/linux/virtio_pci.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index ea66f3f60d63..e5ec1caab82a 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -80,7 +80,9 @@
 
 /* The remaining space is defined by each driver as the per-driver
  * configuration space */
-#define VIRTIO_PCI_CONFIG(dev)		((dev)->msix_enabled ? 24 : 20)
+#define VIRTIO_PCI_CONFIG_OFF(msix_enabled)	((msix_enabled) ? 24 : 20)
+/* Deprecated: please use VIRTIO_PCI_CONFIG_OFF instead */
+#define VIRTIO_PCI_CONFIG(dev)	VIRTIO_PCI_CONFIG_OFF((dev)->msix_enabled)
 
 /* Virtio ABI version, this must match exactly */
 #define VIRTIO_PCI_ABI_VERSION		0
-- 
cgit v1.2.3


From 1c6264cc67ac8712d84392fbc9293289011407c0 Mon Sep 17 00:00:00 2001
From: Pranavkumar Sawargaonkar <pranavkumar@linaro.org>
Date: Mon, 20 May 2013 10:25:38 +0930
Subject: virtio: console: Add emergency writeonly register to config space

This patch adds an emerg_wr register (writeonly) in config space
of virtio console device which can be used for debugging.

Signed-off-by: Pranavkumar Sawargaonkar <pranavkumar@linaro.org>
Signed-off-by: Anup Patel <anup.patel@linaro.org>
Acked-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/uapi/linux/virtio_console.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/virtio_console.h b/include/uapi/linux/virtio_console.h
index ee13ab6c3614..3fd0dce3cb25 100644
--- a/include/uapi/linux/virtio_console.h
+++ b/include/uapi/linux/virtio_console.h
@@ -38,6 +38,7 @@
 /* Feature bits */
 #define VIRTIO_CONSOLE_F_SIZE	0	/* Does host provide console size? */
 #define VIRTIO_CONSOLE_F_MULTIPORT 1	/* Does host provide multiple ports? */
+#define VIRTIO_CONSOLE_F_EMERG_WRITE 2 /* Does host support emergency write? */
 
 #define VIRTIO_CONSOLE_BAD_ID		(~(u32)0)
 
@@ -48,6 +49,8 @@ struct virtio_console_config {
 	__u16 rows;
 	/* max. number of ports this device can hold */
 	__u32 max_nr_ports;
+	/* emergency write register */
+	__u32 emerg_wr;
 } __attribute__((packed));
 
 /*
-- 
cgit v1.2.3


From 5e4b6f5698421d94226cc2f80eae6d613c9acef8 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <jouni@qca.qualcomm.com>
Date: Thu, 16 May 2013 20:11:08 +0300
Subject: cfg80211: Allow TDLS peer AID to be configured for VHT

VHT uses peer AID in the PARTIAL_AID field in TDLS frames. The current
design for TDLS is to first add a dummy STA entry before completing TDLS
Setup and then update information on this STA entry based on what was
received from the peer during the setup exchange.

In theory, this could use NL80211_ATTR_STA_AID to set the peer AID just
like this is used in AP mode to set the AID of an association station.
However, existing cfg80211 validation rules prevent this attribute from
being used with set_station operation. To avoid interoperability issues
between different kernel and user space version combinations, introduce
a new nl80211 attribute for the purpose of setting TDLS peer AID. This
attribute can be used in both the new_station and set_station
operations. It is not supposed to be allowed to change the AID value
during the lifetime of the STA entry, but that validation is left for
drivers to do in the change_station callback.

Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h |  7 +++++++
 net/wireless/nl80211.c       | 11 +++++++++--
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 06320713e9c9..32b060ea5266 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1431,6 +1431,11 @@ enum nl80211_commands {
  * @NL80211_ATTR_MAX_CRIT_PROT_DURATION: duration in milliseconds in which
  *      the connection should have increased reliability (u16).
  *
+ * @NL80211_ATTR_PEER_AID: Association ID for the peer TDLS station (u16).
+ *	This is similar to @NL80211_ATTR_STA_AID but with a difference of being
+ *	allowed to be used with the first @NL80211_CMD_SET_STATION command to
+ *	update a TDLS peer STA entry.
+ *
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
  */
@@ -1729,6 +1734,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_CRIT_PROT_ID,
 	NL80211_ATTR_MAX_CRIT_PROT_DURATION,
 
+	NL80211_ATTR_PEER_AID,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 5f10f7acfa06..14276af7964b 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -378,6 +378,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
 	[NL80211_ATTR_MDID] = { .type = NLA_U16 },
 	[NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY,
 				  .len = IEEE80211_MAX_DATA_LEN },
+	[NL80211_ATTR_PEER_AID] = { .type = NLA_U16 },
 };
 
 /* policy for the key attributes */
@@ -3872,6 +3873,8 @@ static int nl80211_set_station_tdls(struct genl_info *info,
 				    struct station_parameters *params)
 {
 	/* Dummy STA entry gets updated once the peer capabilities are known */
+	if (info->attrs[NL80211_ATTR_PEER_AID])
+		params->aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]);
 	if (info->attrs[NL80211_ATTR_HT_CAPABILITY])
 		params->ht_capa =
 			nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]);
@@ -4012,7 +4015,8 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 	if (!info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES])
 		return -EINVAL;
 
-	if (!info->attrs[NL80211_ATTR_STA_AID])
+	if (!info->attrs[NL80211_ATTR_STA_AID] &&
+	    !info->attrs[NL80211_ATTR_PEER_AID])
 		return -EINVAL;
 
 	mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
@@ -4023,7 +4027,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 	params.listen_interval =
 		nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);
 
-	params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]);
+	if (info->attrs[NL80211_ATTR_STA_AID])
+		params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]);
+	else
+		params.aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]);
 	if (!params.aid || params.aid > IEEE80211_MAX_AID)
 		return -EINVAL;
 
-- 
cgit v1.2.3


From 7ec872114251b618adf5a2688de0ca00de63635d Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 23 May 2013 01:11:11 +0000
Subject: net: ethtool: disambiguate XCVR_* meaning

Add a comment which explains the real meaning of XCVR_INTERNAL (PHY and
Ethernet MAC in the same package/die) and XCVR_EXTERNAL (PHY and
Ethernet MAC in a different package/die). Most if not all of the drivers
setting their transceiver type already do it the way the comment
describes it.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 0c9b44871df0..38dbafaa5341 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -993,8 +993,8 @@ enum ethtool_sfeatures_retval_bits {
 #define PORT_OTHER		0xff
 
 /* Which transceiver to use. */
-#define XCVR_INTERNAL		0x00
-#define XCVR_EXTERNAL		0x01
+#define XCVR_INTERNAL		0x00 /* PHY and MAC are in the same package */
+#define XCVR_EXTERNAL		0x01 /* PHY and MAC are in different packages */
 #define XCVR_DUMMY1		0x02
 #define XCVR_DUMMY2		0x03
 #define XCVR_DUMMY3		0x04
-- 
cgit v1.2.3


From e057d3c31bdf87616b415c4b2cbf7310f54b9219 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Tue, 28 May 2013 13:01:52 +0200
Subject: cfg80211: support an active monitor interface flag

An active monitor interface is one that is used for communication (via
injection). It is expected to ACK incoming unicast packets. This is
useful for running various 802.11 testing utilities that associate to an
AP via injection and manage the state in user space.

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  2 ++
 include/uapi/linux/nl80211.h |  4 ++++
 net/wireless/nl80211.c       | 10 ++++++++++
 3 files changed, 16 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index b3b076a46d50..13b247d26544 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -961,6 +961,7 @@ struct station_info {
  * @MONITOR_FLAG_CONTROL: pass control frames
  * @MONITOR_FLAG_OTHER_BSS: disable BSSID filtering
  * @MONITOR_FLAG_COOK_FRAMES: report frames after processing
+ * @MONITOR_FLAG_ACTIVE: active monitor, ACKs frames on its MAC address
  */
 enum monitor_flags {
 	MONITOR_FLAG_FCSFAIL		= 1<<NL80211_MNTR_FLAG_FCSFAIL,
@@ -968,6 +969,7 @@ enum monitor_flags {
 	MONITOR_FLAG_CONTROL		= 1<<NL80211_MNTR_FLAG_CONTROL,
 	MONITOR_FLAG_OTHER_BSS		= 1<<NL80211_MNTR_FLAG_OTHER_BSS,
 	MONITOR_FLAG_COOK_FRAMES	= 1<<NL80211_MNTR_FLAG_COOK_FRAMES,
+	MONITOR_FLAG_ACTIVE		= 1<<NL80211_MNTR_FLAG_ACTIVE,
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 32b060ea5266..5920715278c2 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2428,6 +2428,8 @@ enum nl80211_survey_info {
  * @NL80211_MNTR_FLAG_OTHER_BSS: disable BSSID filtering
  * @NL80211_MNTR_FLAG_COOK_FRAMES: report frames after processing.
  *	overrides all other flags.
+ * @NL80211_MNTR_FLAG_ACTIVE: use the configured MAC address
+ *	and ACK incoming unicast packets.
  *
  * @__NL80211_MNTR_FLAG_AFTER_LAST: internal use
  * @NL80211_MNTR_FLAG_MAX: highest possible monitor flag
@@ -2439,6 +2441,7 @@ enum nl80211_mntr_flags {
 	NL80211_MNTR_FLAG_CONTROL,
 	NL80211_MNTR_FLAG_OTHER_BSS,
 	NL80211_MNTR_FLAG_COOK_FRAMES,
+	NL80211_MNTR_FLAG_ACTIVE,
 
 	/* keep last */
 	__NL80211_MNTR_FLAG_AFTER_LAST,
@@ -3595,6 +3598,7 @@ enum nl80211_feature_flags {
 	NL80211_FEATURE_ADVERTISE_CHAN_LIMITS		= 1 << 14,
 	NL80211_FEATURE_FULL_AP_CLIENT_STATE		= 1 << 15,
 	NL80211_FEATURE_USERSPACE_MPM			= 1 << 16,
+	NL80211_FEATURE_ACTIVE_MONITOR			= 1 << 17,
 };
 
 /**
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index fb6abcb359a1..31d265f36d2c 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -2227,6 +2227,7 @@ static const struct nla_policy mntr_flags_policy[NL80211_MNTR_FLAG_MAX + 1] = {
 	[NL80211_MNTR_FLAG_CONTROL] = { .type = NLA_FLAG },
 	[NL80211_MNTR_FLAG_OTHER_BSS] = { .type = NLA_FLAG },
 	[NL80211_MNTR_FLAG_COOK_FRAMES] = { .type = NLA_FLAG },
+	[NL80211_MNTR_FLAG_ACTIVE] = { .type = NLA_FLAG },
 };
 
 static int parse_monitor_flags(struct nlattr *nla, u32 *mntrflags)
@@ -2338,6 +2339,10 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 		change = true;
 	}
 
+	if (flags && (*flags & NL80211_MNTR_FLAG_ACTIVE) &&
+	    !(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR))
+		return -EOPNOTSUPP;
+
 	if (change)
 		err = cfg80211_change_iface(rdev, dev, ntype, flags, &params);
 	else
@@ -2395,6 +2400,11 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 	err = parse_monitor_flags(type == NL80211_IFTYPE_MONITOR ?
 				  info->attrs[NL80211_ATTR_MNTR_FLAGS] : NULL,
 				  &flags);
+
+	if (!err && (flags & NL80211_MNTR_FLAG_ACTIVE) &&
+	    !(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR))
+		return -EOPNOTSUPP;
+
 	wdev = rdev_add_virtual_intf(rdev,
 				nla_data(info->attrs[NL80211_ATTR_IFNAME]),
 				type, err ? NULL : &flags, &params);
-- 
cgit v1.2.3


From cb93b1864088eb833ea9cef2c20f07d1961241b0 Mon Sep 17 00:00:00 2001
From: Yijing Wang <wangyijing@huawei.com>
Date: Wed, 29 May 2013 17:01:52 +0800
Subject: PCI: Fix comment typo for PCI_EXP_LNKCAP_CLKPM

Fix trivial typo for PCI_EXP_LNKCAP_CLKPM comment.

Signed-off-by: Yijing Wang <wangyijing@huawei.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/uapi/linux/pci_regs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 864e324da80d..c3cc01d474b0 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -468,7 +468,7 @@
 #define  PCI_EXP_LNKCAP_ASPMS	0x00000c00 /* ASPM Support */
 #define  PCI_EXP_LNKCAP_L0SEL	0x00007000 /* L0s Exit Latency */
 #define  PCI_EXP_LNKCAP_L1EL	0x00038000 /* L1 Exit Latency */
-#define  PCI_EXP_LNKCAP_CLKPM	0x00040000 /* L1 Clock Power Management */
+#define  PCI_EXP_LNKCAP_CLKPM	0x00040000 /* Clock Power Management */
 #define  PCI_EXP_LNKCAP_SDERC	0x00080000 /* Surprise Down Error Reporting Capable */
 #define  PCI_EXP_LNKCAP_DLLLARC	0x00100000 /* Data Link Layer Link Active Reporting Capable */
 #define  PCI_EXP_LNKCAP_LBNC	0x00200000 /* Link Bandwidth Notification Capability */
-- 
cgit v1.2.3


From 4c4d41f200db375b2d2cc6d0a1de0606c8266398 Mon Sep 17 00:00:00 2001
From: Fan Du <fan.du@windriver.com>
Date: Thu, 6 Jun 2013 10:15:54 +0800
Subject: xfrm: add LINUX_MIB_XFRMACQUIREERROR statistic counter

When host ping its peer, ICMP echo request packet triggers IPsec
policy, then host negotiates SA secret with its peer. After IKE
installed SA for OUT direction, but before SA for IN direction
installed, host get ICMP echo reply from its peer. At the time
being, the SA state for IN direction could be XFRM_STATE_ACQ,
then the received packet will be dropped after adding
LINUX_MIB_XFRMINSTATEINVALID statistic.

Adding a LINUX_MIB_XFRMACQUIREERROR statistic counter for such
scenario when SA in larval state is much clearer for user than
LINUX_MIB_XFRMINSTATEINVALID which indicates the SA is totally
bad.

Signed-off-by: Fan Du <fan.du@windriver.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/uapi/linux/snmp.h | 1 +
 net/xfrm/xfrm_input.c     | 5 +++++
 net/xfrm/xfrm_proc.c      | 1 +
 3 files changed, 7 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index df2e8b4f9c03..ea17542ff321 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -287,6 +287,7 @@ enum
 	LINUX_MIB_XFRMOUTPOLERROR,		/* XfrmOutPolError */
 	LINUX_MIB_XFRMFWDHDRERROR,		/* XfrmFwdHdrError*/
 	LINUX_MIB_XFRMOUTSTATEINVALID,		/* XfrmOutStateInvalid */
+	LINUX_MIB_XFRMACQUIREERROR,		/* XfrmAcquireError */
 	__LINUX_MIB_XFRMMAX
 };
 
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index ab2bb42fe094..88843996f935 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -163,6 +163,11 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 		skb->sp->xvec[skb->sp->len++] = x;
 
 		spin_lock(&x->lock);
+		if (unlikely(x->km.state == XFRM_STATE_ACQ)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR);
+			goto drop_unlock;
+		}
+
 		if (unlikely(x->km.state != XFRM_STATE_VALID)) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID);
 			goto drop_unlock;
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
index c721b0d9ab8b..80cd1e55b834 100644
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -44,6 +44,7 @@ static const struct snmp_mib xfrm_mib_list[] = {
 	SNMP_MIB_ITEM("XfrmOutPolError", LINUX_MIB_XFRMOUTPOLERROR),
 	SNMP_MIB_ITEM("XfrmFwdHdrError", LINUX_MIB_XFRMFWDHDRERROR),
 	SNMP_MIB_ITEM("XfrmOutStateInvalid", LINUX_MIB_XFRMOUTSTATEINVALID),
+	SNMP_MIB_ITEM("XfrmAcquireError", LINUX_MIB_XFRMACQUIREERROR),
 	SNMP_MIB_SENTINEL
 };
 
-- 
cgit v1.2.3


From 7c8c5e6a9101ea57a1c2c9faff0917e79251a21e Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 10 Dec 2012 16:15:34 +0000
Subject: arm64: KVM: system register handling

Provide 64bit system register handling, modeled after the cp15
handling for ARM.

Reviewed-by: Christopher Covington <cov@codeaurora.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 arch/arm64/include/asm/kvm_coproc.h |  51 +++
 arch/arm64/include/uapi/asm/kvm.h   |  29 ++
 arch/arm64/kvm/sys_regs.c           | 883 ++++++++++++++++++++++++++++++++++++
 arch/arm64/kvm/sys_regs.h           | 138 ++++++
 include/uapi/linux/kvm.h            |   1 +
 5 files changed, 1102 insertions(+)
 create mode 100644 arch/arm64/include/asm/kvm_coproc.h
 create mode 100644 arch/arm64/kvm/sys_regs.c
 create mode 100644 arch/arm64/kvm/sys_regs.h

(limited to 'include/uapi/linux')

diff --git a/arch/arm64/include/asm/kvm_coproc.h b/arch/arm64/include/asm/kvm_coproc.h
new file mode 100644
index 000000000000..9b4477acb554
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_coproc.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2012,2013 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * Derived from arch/arm/include/asm/kvm_coproc.h
+ * Copyright (C) 2012 Rusty Russell IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ARM64_KVM_COPROC_H__
+#define __ARM64_KVM_COPROC_H__
+
+#include <linux/kvm_host.h>
+
+void kvm_reset_sys_regs(struct kvm_vcpu *vcpu);
+
+struct kvm_sys_reg_table {
+	const struct sys_reg_desc *table;
+	size_t num;
+};
+
+struct kvm_sys_reg_target_table {
+	struct kvm_sys_reg_table table64;
+};
+
+void kvm_register_target_sys_reg_table(unsigned int target,
+				       struct kvm_sys_reg_target_table *table);
+
+int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run);
+
+#define kvm_coproc_table_init kvm_sys_reg_table_init
+void kvm_sys_reg_table_init(void);
+
+struct kvm_one_reg;
+int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices);
+int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
+int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
+unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu);
+
+#endif /* __ARM64_KVM_COPROC_H__ */
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 4e64570a20c9..ebac919dc0ca 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -92,6 +92,35 @@ struct kvm_sync_regs {
 struct kvm_arch_memory_slot {
 };
 
+/* If you need to interpret the index values, here is the key: */
+#define KVM_REG_ARM_COPROC_MASK		0x000000000FFF0000
+#define KVM_REG_ARM_COPROC_SHIFT	16
+
+/* Normal registers are mapped as coprocessor 16. */
+#define KVM_REG_ARM_CORE		(0x0010 << KVM_REG_ARM_COPROC_SHIFT)
+#define KVM_REG_ARM_CORE_REG(name)	(offsetof(struct kvm_regs, name) / sizeof(__u32))
+
+/* Some registers need more space to represent values. */
+#define KVM_REG_ARM_DEMUX		(0x0011 << KVM_REG_ARM_COPROC_SHIFT)
+#define KVM_REG_ARM_DEMUX_ID_MASK	0x000000000000FF00
+#define KVM_REG_ARM_DEMUX_ID_SHIFT	8
+#define KVM_REG_ARM_DEMUX_ID_CCSIDR	(0x00 << KVM_REG_ARM_DEMUX_ID_SHIFT)
+#define KVM_REG_ARM_DEMUX_VAL_MASK	0x00000000000000FF
+#define KVM_REG_ARM_DEMUX_VAL_SHIFT	0
+
+/* AArch64 system registers */
+#define KVM_REG_ARM64_SYSREG		(0x0013 << KVM_REG_ARM_COPROC_SHIFT)
+#define KVM_REG_ARM64_SYSREG_OP0_MASK	0x000000000000c000
+#define KVM_REG_ARM64_SYSREG_OP0_SHIFT	14
+#define KVM_REG_ARM64_SYSREG_OP1_MASK	0x0000000000003800
+#define KVM_REG_ARM64_SYSREG_OP1_SHIFT	11
+#define KVM_REG_ARM64_SYSREG_CRN_MASK	0x0000000000000780
+#define KVM_REG_ARM64_SYSREG_CRN_SHIFT	7
+#define KVM_REG_ARM64_SYSREG_CRM_MASK	0x0000000000000078
+#define KVM_REG_ARM64_SYSREG_CRM_SHIFT	3
+#define KVM_REG_ARM64_SYSREG_OP2_MASK	0x0000000000000007
+#define KVM_REG_ARM64_SYSREG_OP2_SHIFT	0
+
 /* KVM_IRQ_LINE irq field index values */
 #define KVM_ARM_IRQ_TYPE_SHIFT		24
 #define KVM_ARM_IRQ_TYPE_MASK		0xff
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
new file mode 100644
index 000000000000..52fff0ae3442
--- /dev/null
+++ b/arch/arm64/kvm/sys_regs.c
@@ -0,0 +1,883 @@
+/*
+ * Copyright (C) 2012,2013 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * Derived from arch/arm/kvm/coproc.c:
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Authors: Rusty Russell <rusty@rustcorp.com.au>
+ *          Christoffer Dall <c.dall@virtualopensystems.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/mm.h>
+#include <linux/kvm_host.h>
+#include <linux/uaccess.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_coproc.h>
+#include <asm/cacheflush.h>
+#include <asm/cputype.h>
+#include <trace/events/kvm.h>
+
+#include "sys_regs.h"
+
+/*
+ * All of this file is extremly similar to the ARM coproc.c, but the
+ * types are different. My gut feeling is that it should be pretty
+ * easy to merge, but that would be an ABI breakage -- again. VFP
+ * would also need to be abstracted.
+ */
+
+/* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */
+static u32 cache_levels;
+
+/* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */
+#define CSSELR_MAX 12
+
+/* Which cache CCSIDR represents depends on CSSELR value. */
+static u32 get_ccsidr(u32 csselr)
+{
+	u32 ccsidr;
+
+	/* Make sure noone else changes CSSELR during this! */
+	local_irq_disable();
+	/* Put value into CSSELR */
+	asm volatile("msr csselr_el1, %x0" : : "r" (csselr));
+	isb();
+	/* Read result out of CCSIDR */
+	asm volatile("mrs %0, ccsidr_el1" : "=r" (ccsidr));
+	local_irq_enable();
+
+	return ccsidr;
+}
+
+static void do_dc_cisw(u32 val)
+{
+	asm volatile("dc cisw, %x0" : : "r" (val));
+	dsb();
+}
+
+static void do_dc_csw(u32 val)
+{
+	asm volatile("dc csw, %x0" : : "r" (val));
+	dsb();
+}
+
+/* See note at ARM ARM B1.14.4 */
+static bool access_dcsw(struct kvm_vcpu *vcpu,
+			const struct sys_reg_params *p,
+			const struct sys_reg_desc *r)
+{
+	unsigned long val;
+	int cpu;
+
+	if (!p->is_write)
+		return read_from_write_only(vcpu, p);
+
+	cpu = get_cpu();
+
+	cpumask_setall(&vcpu->arch.require_dcache_flush);
+	cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);
+
+	/* If we were already preempted, take the long way around */
+	if (cpu != vcpu->arch.last_pcpu) {
+		flush_cache_all();
+		goto done;
+	}
+
+	val = *vcpu_reg(vcpu, p->Rt);
+
+	switch (p->CRm) {
+	case 6:			/* Upgrade DCISW to DCCISW, as per HCR.SWIO */
+	case 14:		/* DCCISW */
+		do_dc_cisw(val);
+		break;
+
+	case 10:		/* DCCSW */
+		do_dc_csw(val);
+		break;
+	}
+
+done:
+	put_cpu();
+
+	return true;
+}
+
+/*
+ * We could trap ID_DFR0 and tell the guest we don't support performance
+ * monitoring.  Unfortunately the patch to make the kernel check ID_DFR0 was
+ * NAKed, so it will read the PMCR anyway.
+ *
+ * Therefore we tell the guest we have 0 counters.  Unfortunately, we
+ * must always support PMCCNTR (the cycle counter): we just RAZ/WI for
+ * all PM registers, which doesn't crash the guest kernel at least.
+ */
+static bool pm_fake(struct kvm_vcpu *vcpu,
+		    const struct sys_reg_params *p,
+		    const struct sys_reg_desc *r)
+{
+	if (p->is_write)
+		return ignore_write(vcpu, p);
+	else
+		return read_zero(vcpu, p);
+}
+
+static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+	u64 amair;
+
+	asm volatile("mrs %0, amair_el1\n" : "=r" (amair));
+	vcpu_sys_reg(vcpu, AMAIR_EL1) = amair;
+}
+
+static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+	/*
+	 * Simply map the vcpu_id into the Aff0 field of the MPIDR.
+	 */
+	vcpu_sys_reg(vcpu, MPIDR_EL1) = (1UL << 31) | (vcpu->vcpu_id & 0xff);
+}
+
+/*
+ * Architected system registers.
+ * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2
+ */
+static const struct sys_reg_desc sys_reg_descs[] = {
+	/* DC ISW */
+	{ Op0(0b01), Op1(0b000), CRn(0b0111), CRm(0b0110), Op2(0b010),
+	  access_dcsw },
+	/* DC CSW */
+	{ Op0(0b01), Op1(0b000), CRn(0b0111), CRm(0b1010), Op2(0b010),
+	  access_dcsw },
+	/* DC CISW */
+	{ Op0(0b01), Op1(0b000), CRn(0b0111), CRm(0b1110), Op2(0b010),
+	  access_dcsw },
+
+	/* MPIDR_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0000), Op2(0b101),
+	  NULL, reset_mpidr, MPIDR_EL1 },
+	/* SCTLR_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b000),
+	  NULL, reset_val, SCTLR_EL1, 0x00C50078 },
+	/* CPACR_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b010),
+	  NULL, reset_val, CPACR_EL1, 0 },
+	/* TTBR0_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b000),
+	  NULL, reset_unknown, TTBR0_EL1 },
+	/* TTBR1_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b001),
+	  NULL, reset_unknown, TTBR1_EL1 },
+	/* TCR_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b010),
+	  NULL, reset_val, TCR_EL1, 0 },
+
+	/* AFSR0_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b000),
+	  NULL, reset_unknown, AFSR0_EL1 },
+	/* AFSR1_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b001),
+	  NULL, reset_unknown, AFSR1_EL1 },
+	/* ESR_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0010), Op2(0b000),
+	  NULL, reset_unknown, ESR_EL1 },
+	/* FAR_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b0110), CRm(0b0000), Op2(0b000),
+	  NULL, reset_unknown, FAR_EL1 },
+
+	/* PMINTENSET_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b1001), CRm(0b1110), Op2(0b001),
+	  pm_fake },
+	/* PMINTENCLR_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b1001), CRm(0b1110), Op2(0b010),
+	  pm_fake },
+
+	/* MAIR_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0010), Op2(0b000),
+	  NULL, reset_unknown, MAIR_EL1 },
+	/* AMAIR_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0011), Op2(0b000),
+	  NULL, reset_amair_el1, AMAIR_EL1 },
+
+	/* VBAR_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b0000), Op2(0b000),
+	  NULL, reset_val, VBAR_EL1, 0 },
+	/* CONTEXTIDR_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b001),
+	  NULL, reset_val, CONTEXTIDR_EL1, 0 },
+	/* TPIDR_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b100),
+	  NULL, reset_unknown, TPIDR_EL1 },
+
+	/* CNTKCTL_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b1110), CRm(0b0001), Op2(0b000),
+	  NULL, reset_val, CNTKCTL_EL1, 0},
+
+	/* CSSELR_EL1 */
+	{ Op0(0b11), Op1(0b010), CRn(0b0000), CRm(0b0000), Op2(0b000),
+	  NULL, reset_unknown, CSSELR_EL1 },
+
+	/* PMCR_EL0 */
+	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b000),
+	  pm_fake },
+	/* PMCNTENSET_EL0 */
+	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b001),
+	  pm_fake },
+	/* PMCNTENCLR_EL0 */
+	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b010),
+	  pm_fake },
+	/* PMOVSCLR_EL0 */
+	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b011),
+	  pm_fake },
+	/* PMSWINC_EL0 */
+	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b100),
+	  pm_fake },
+	/* PMSELR_EL0 */
+	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b101),
+	  pm_fake },
+	/* PMCEID0_EL0 */
+	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b110),
+	  pm_fake },
+	/* PMCEID1_EL0 */
+	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b111),
+	  pm_fake },
+	/* PMCCNTR_EL0 */
+	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b000),
+	  pm_fake },
+	/* PMXEVTYPER_EL0 */
+	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b001),
+	  pm_fake },
+	/* PMXEVCNTR_EL0 */
+	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b010),
+	  pm_fake },
+	/* PMUSERENR_EL0 */
+	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1110), Op2(0b000),
+	  pm_fake },
+	/* PMOVSSET_EL0 */
+	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1110), Op2(0b011),
+	  pm_fake },
+
+	/* TPIDR_EL0 */
+	{ Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b010),
+	  NULL, reset_unknown, TPIDR_EL0 },
+	/* TPIDRRO_EL0 */
+	{ Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b011),
+	  NULL, reset_unknown, TPIDRRO_EL0 },
+};
+
+/* Target specific emulation tables */
+static struct kvm_sys_reg_target_table *target_tables[KVM_ARM_NUM_TARGETS];
+
+void kvm_register_target_sys_reg_table(unsigned int target,
+				       struct kvm_sys_reg_target_table *table)
+{
+	target_tables[target] = table;
+}
+
+/* Get specific register table for this target. */
+static const struct sys_reg_desc *get_target_table(unsigned target, size_t *num)
+{
+	struct kvm_sys_reg_target_table *table;
+
+	table = target_tables[target];
+	*num = table->table64.num;
+	return table->table64.table;
+}
+
+static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params,
+					 const struct sys_reg_desc table[],
+					 unsigned int num)
+{
+	unsigned int i;
+
+	for (i = 0; i < num; i++) {
+		const struct sys_reg_desc *r = &table[i];
+
+		if (params->Op0 != r->Op0)
+			continue;
+		if (params->Op1 != r->Op1)
+			continue;
+		if (params->CRn != r->CRn)
+			continue;
+		if (params->CRm != r->CRm)
+			continue;
+		if (params->Op2 != r->Op2)
+			continue;
+
+		return r;
+	}
+	return NULL;
+}
+
+static int emulate_sys_reg(struct kvm_vcpu *vcpu,
+			   const struct sys_reg_params *params)
+{
+	size_t num;
+	const struct sys_reg_desc *table, *r;
+
+	table = get_target_table(vcpu->arch.target, &num);
+
+	/* Search target-specific then generic table. */
+	r = find_reg(params, table, num);
+	if (!r)
+		r = find_reg(params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
+
+	if (likely(r)) {
+		/*
+		 * Not having an accessor means that we have
+		 * configured a trap that we don't know how to
+		 * handle. This certainly qualifies as a gross bug
+		 * that should be fixed right away.
+		 */
+		BUG_ON(!r->access);
+
+		if (likely(r->access(vcpu, params, r))) {
+			/* Skip instruction, since it was emulated */
+			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+			return 1;
+		}
+		/* If access function fails, it should complain. */
+	} else {
+		kvm_err("Unsupported guest sys_reg access at: %lx\n",
+			*vcpu_pc(vcpu));
+		print_sys_reg_instr(params);
+	}
+	kvm_inject_undefined(vcpu);
+	return 1;
+}
+
+static void reset_sys_reg_descs(struct kvm_vcpu *vcpu,
+			      const struct sys_reg_desc *table, size_t num)
+{
+	unsigned long i;
+
+	for (i = 0; i < num; i++)
+		if (table[i].reset)
+			table[i].reset(vcpu, &table[i]);
+}
+
+/**
+ * kvm_handle_sys_reg -- handles a mrs/msr trap on a guest sys_reg access
+ * @vcpu: The VCPU pointer
+ * @run:  The kvm_run struct
+ */
+int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+	struct sys_reg_params params;
+	unsigned long esr = kvm_vcpu_get_hsr(vcpu);
+
+	params.Op0 = (esr >> 20) & 3;
+	params.Op1 = (esr >> 14) & 0x7;
+	params.CRn = (esr >> 10) & 0xf;
+	params.CRm = (esr >> 1) & 0xf;
+	params.Op2 = (esr >> 17) & 0x7;
+	params.Rt = (esr >> 5) & 0x1f;
+	params.is_write = !(esr & 1);
+
+	return emulate_sys_reg(vcpu, &params);
+}
+
+/******************************************************************************
+ * Userspace API
+ *****************************************************************************/
+
+static bool index_to_params(u64 id, struct sys_reg_params *params)
+{
+	switch (id & KVM_REG_SIZE_MASK) {
+	case KVM_REG_SIZE_U64:
+		/* Any unused index bits means it's not valid. */
+		if (id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK
+			      | KVM_REG_ARM_COPROC_MASK
+			      | KVM_REG_ARM64_SYSREG_OP0_MASK
+			      | KVM_REG_ARM64_SYSREG_OP1_MASK
+			      | KVM_REG_ARM64_SYSREG_CRN_MASK
+			      | KVM_REG_ARM64_SYSREG_CRM_MASK
+			      | KVM_REG_ARM64_SYSREG_OP2_MASK))
+			return false;
+		params->Op0 = ((id & KVM_REG_ARM64_SYSREG_OP0_MASK)
+			       >> KVM_REG_ARM64_SYSREG_OP0_SHIFT);
+		params->Op1 = ((id & KVM_REG_ARM64_SYSREG_OP1_MASK)
+			       >> KVM_REG_ARM64_SYSREG_OP1_SHIFT);
+		params->CRn = ((id & KVM_REG_ARM64_SYSREG_CRN_MASK)
+			       >> KVM_REG_ARM64_SYSREG_CRN_SHIFT);
+		params->CRm = ((id & KVM_REG_ARM64_SYSREG_CRM_MASK)
+			       >> KVM_REG_ARM64_SYSREG_CRM_SHIFT);
+		params->Op2 = ((id & KVM_REG_ARM64_SYSREG_OP2_MASK)
+			       >> KVM_REG_ARM64_SYSREG_OP2_SHIFT);
+		return true;
+	default:
+		return false;
+	}
+}
+
+/* Decode an index value, and find the sys_reg_desc entry. */
+static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu,
+						    u64 id)
+{
+	size_t num;
+	const struct sys_reg_desc *table, *r;
+	struct sys_reg_params params;
+
+	/* We only do sys_reg for now. */
+	if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM64_SYSREG)
+		return NULL;
+
+	if (!index_to_params(id, &params))
+		return NULL;
+
+	table = get_target_table(vcpu->arch.target, &num);
+	r = find_reg(&params, table, num);
+	if (!r)
+		r = find_reg(&params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
+
+	/* Not saved in the sys_reg array? */
+	if (r && !r->reg)
+		r = NULL;
+
+	return r;
+}
+
+/*
+ * These are the invariant sys_reg registers: we let the guest see the
+ * host versions of these, so they're part of the guest state.
+ *
+ * A future CPU may provide a mechanism to present different values to
+ * the guest, or a future kvm may trap them.
+ */
+
+#define FUNCTION_INVARIANT(reg)						\
+	static void get_##reg(struct kvm_vcpu *v,			\
+			      const struct sys_reg_desc *r)		\
+	{								\
+		u64 val;						\
+									\
+		asm volatile("mrs %0, " __stringify(reg) "\n"		\
+			     : "=r" (val));				\
+		((struct sys_reg_desc *)r)->val = val;			\
+	}
+
+FUNCTION_INVARIANT(midr_el1)
+FUNCTION_INVARIANT(ctr_el0)
+FUNCTION_INVARIANT(revidr_el1)
+FUNCTION_INVARIANT(id_pfr0_el1)
+FUNCTION_INVARIANT(id_pfr1_el1)
+FUNCTION_INVARIANT(id_dfr0_el1)
+FUNCTION_INVARIANT(id_afr0_el1)
+FUNCTION_INVARIANT(id_mmfr0_el1)
+FUNCTION_INVARIANT(id_mmfr1_el1)
+FUNCTION_INVARIANT(id_mmfr2_el1)
+FUNCTION_INVARIANT(id_mmfr3_el1)
+FUNCTION_INVARIANT(id_isar0_el1)
+FUNCTION_INVARIANT(id_isar1_el1)
+FUNCTION_INVARIANT(id_isar2_el1)
+FUNCTION_INVARIANT(id_isar3_el1)
+FUNCTION_INVARIANT(id_isar4_el1)
+FUNCTION_INVARIANT(id_isar5_el1)
+FUNCTION_INVARIANT(clidr_el1)
+FUNCTION_INVARIANT(aidr_el1)
+
+/* ->val is filled in by kvm_sys_reg_table_init() */
+static struct sys_reg_desc invariant_sys_regs[] = {
+	{ Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0000), Op2(0b000),
+	  NULL, get_midr_el1 },
+	{ Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0000), Op2(0b110),
+	  NULL, get_revidr_el1 },
+	{ Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b000),
+	  NULL, get_id_pfr0_el1 },
+	{ Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b001),
+	  NULL, get_id_pfr1_el1 },
+	{ Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b010),
+	  NULL, get_id_dfr0_el1 },
+	{ Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b011),
+	  NULL, get_id_afr0_el1 },
+	{ Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b100),
+	  NULL, get_id_mmfr0_el1 },
+	{ Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b101),
+	  NULL, get_id_mmfr1_el1 },
+	{ Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b110),
+	  NULL, get_id_mmfr2_el1 },
+	{ Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b111),
+	  NULL, get_id_mmfr3_el1 },
+	{ Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b000),
+	  NULL, get_id_isar0_el1 },
+	{ Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b001),
+	  NULL, get_id_isar1_el1 },
+	{ Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b010),
+	  NULL, get_id_isar2_el1 },
+	{ Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b011),
+	  NULL, get_id_isar3_el1 },
+	{ Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b100),
+	  NULL, get_id_isar4_el1 },
+	{ Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b101),
+	  NULL, get_id_isar5_el1 },
+	{ Op0(0b11), Op1(0b001), CRn(0b0000), CRm(0b0000), Op2(0b001),
+	  NULL, get_clidr_el1 },
+	{ Op0(0b11), Op1(0b001), CRn(0b0000), CRm(0b0000), Op2(0b111),
+	  NULL, get_aidr_el1 },
+	{ Op0(0b11), Op1(0b011), CRn(0b0000), CRm(0b0000), Op2(0b001),
+	  NULL, get_ctr_el0 },
+};
+
+static int reg_from_user(void *val, const void __user *uaddr, u64 id)
+{
+	/* This Just Works because we are little endian. */
+	if (copy_from_user(val, uaddr, KVM_REG_SIZE(id)) != 0)
+		return -EFAULT;
+	return 0;
+}
+
+static int reg_to_user(void __user *uaddr, const void *val, u64 id)
+{
+	/* This Just Works because we are little endian. */
+	if (copy_to_user(uaddr, val, KVM_REG_SIZE(id)) != 0)
+		return -EFAULT;
+	return 0;
+}
+
+static int get_invariant_sys_reg(u64 id, void __user *uaddr)
+{
+	struct sys_reg_params params;
+	const struct sys_reg_desc *r;
+
+	if (!index_to_params(id, &params))
+		return -ENOENT;
+
+	r = find_reg(&params, invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs));
+	if (!r)
+		return -ENOENT;
+
+	return reg_to_user(uaddr, &r->val, id);
+}
+
+static int set_invariant_sys_reg(u64 id, void __user *uaddr)
+{
+	struct sys_reg_params params;
+	const struct sys_reg_desc *r;
+	int err;
+	u64 val = 0; /* Make sure high bits are 0 for 32-bit regs */
+
+	if (!index_to_params(id, &params))
+		return -ENOENT;
+	r = find_reg(&params, invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs));
+	if (!r)
+		return -ENOENT;
+
+	err = reg_from_user(&val, uaddr, id);
+	if (err)
+		return err;
+
+	/* This is what we mean by invariant: you can't change it. */
+	if (r->val != val)
+		return -EINVAL;
+
+	return 0;
+}
+
+static bool is_valid_cache(u32 val)
+{
+	u32 level, ctype;
+
+	if (val >= CSSELR_MAX)
+		return -ENOENT;
+
+	/* Bottom bit is Instruction or Data bit.  Next 3 bits are level. */
+	level = (val >> 1);
+	ctype = (cache_levels >> (level * 3)) & 7;
+
+	switch (ctype) {
+	case 0: /* No cache */
+		return false;
+	case 1: /* Instruction cache only */
+		return (val & 1);
+	case 2: /* Data cache only */
+	case 4: /* Unified cache */
+		return !(val & 1);
+	case 3: /* Separate instruction and data caches */
+		return true;
+	default: /* Reserved: we can't know instruction or data. */
+		return false;
+	}
+}
+
+static int demux_c15_get(u64 id, void __user *uaddr)
+{
+	u32 val;
+	u32 __user *uval = uaddr;
+
+	/* Fail if we have unknown bits set. */
+	if (id & ~(KVM_REG_ARCH_MASK|KVM_REG_SIZE_MASK|KVM_REG_ARM_COPROC_MASK
+		   | ((1 << KVM_REG_ARM_COPROC_SHIFT)-1)))
+		return -ENOENT;
+
+	switch (id & KVM_REG_ARM_DEMUX_ID_MASK) {
+	case KVM_REG_ARM_DEMUX_ID_CCSIDR:
+		if (KVM_REG_SIZE(id) != 4)
+			return -ENOENT;
+		val = (id & KVM_REG_ARM_DEMUX_VAL_MASK)
+			>> KVM_REG_ARM_DEMUX_VAL_SHIFT;
+		if (!is_valid_cache(val))
+			return -ENOENT;
+
+		return put_user(get_ccsidr(val), uval);
+	default:
+		return -ENOENT;
+	}
+}
+
+static int demux_c15_set(u64 id, void __user *uaddr)
+{
+	u32 val, newval;
+	u32 __user *uval = uaddr;
+
+	/* Fail if we have unknown bits set. */
+	if (id & ~(KVM_REG_ARCH_MASK|KVM_REG_SIZE_MASK|KVM_REG_ARM_COPROC_MASK
+		   | ((1 << KVM_REG_ARM_COPROC_SHIFT)-1)))
+		return -ENOENT;
+
+	switch (id & KVM_REG_ARM_DEMUX_ID_MASK) {
+	case KVM_REG_ARM_DEMUX_ID_CCSIDR:
+		if (KVM_REG_SIZE(id) != 4)
+			return -ENOENT;
+		val = (id & KVM_REG_ARM_DEMUX_VAL_MASK)
+			>> KVM_REG_ARM_DEMUX_VAL_SHIFT;
+		if (!is_valid_cache(val))
+			return -ENOENT;
+
+		if (get_user(newval, uval))
+			return -EFAULT;
+
+		/* This is also invariant: you can't change it. */
+		if (newval != get_ccsidr(val))
+			return -EINVAL;
+		return 0;
+	default:
+		return -ENOENT;
+	}
+}
+
+int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
+{
+	const struct sys_reg_desc *r;
+	void __user *uaddr = (void __user *)(unsigned long)reg->addr;
+
+	if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX)
+		return demux_c15_get(reg->id, uaddr);
+
+	if (KVM_REG_SIZE(reg->id) != sizeof(__u64))
+		return -ENOENT;
+
+	r = index_to_sys_reg_desc(vcpu, reg->id);
+	if (!r)
+		return get_invariant_sys_reg(reg->id, uaddr);
+
+	return reg_to_user(uaddr, &vcpu_sys_reg(vcpu, r->reg), reg->id);
+}
+
+int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
+{
+	const struct sys_reg_desc *r;
+	void __user *uaddr = (void __user *)(unsigned long)reg->addr;
+
+	if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX)
+		return demux_c15_set(reg->id, uaddr);
+
+	if (KVM_REG_SIZE(reg->id) != sizeof(__u64))
+		return -ENOENT;
+
+	r = index_to_sys_reg_desc(vcpu, reg->id);
+	if (!r)
+		return set_invariant_sys_reg(reg->id, uaddr);
+
+	return reg_from_user(&vcpu_sys_reg(vcpu, r->reg), uaddr, reg->id);
+}
+
+static unsigned int num_demux_regs(void)
+{
+	unsigned int i, count = 0;
+
+	for (i = 0; i < CSSELR_MAX; i++)
+		if (is_valid_cache(i))
+			count++;
+
+	return count;
+}
+
+static int write_demux_regids(u64 __user *uindices)
+{
+	u64 val = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX;
+	unsigned int i;
+
+	val |= KVM_REG_ARM_DEMUX_ID_CCSIDR;
+	for (i = 0; i < CSSELR_MAX; i++) {
+		if (!is_valid_cache(i))
+			continue;
+		if (put_user(val | i, uindices))
+			return -EFAULT;
+		uindices++;
+	}
+	return 0;
+}
+
+static u64 sys_reg_to_index(const struct sys_reg_desc *reg)
+{
+	return (KVM_REG_ARM64 | KVM_REG_SIZE_U64 |
+		KVM_REG_ARM64_SYSREG |
+		(reg->Op0 << KVM_REG_ARM64_SYSREG_OP0_SHIFT) |
+		(reg->Op1 << KVM_REG_ARM64_SYSREG_OP1_SHIFT) |
+		(reg->CRn << KVM_REG_ARM64_SYSREG_CRN_SHIFT) |
+		(reg->CRm << KVM_REG_ARM64_SYSREG_CRM_SHIFT) |
+		(reg->Op2 << KVM_REG_ARM64_SYSREG_OP2_SHIFT));
+}
+
+static bool copy_reg_to_user(const struct sys_reg_desc *reg, u64 __user **uind)
+{
+	if (!*uind)
+		return true;
+
+	if (put_user(sys_reg_to_index(reg), *uind))
+		return false;
+
+	(*uind)++;
+	return true;
+}
+
+/* Assumed ordered tables, see kvm_sys_reg_table_init. */
+static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind)
+{
+	const struct sys_reg_desc *i1, *i2, *end1, *end2;
+	unsigned int total = 0;
+	size_t num;
+
+	/* We check for duplicates here, to allow arch-specific overrides. */
+	i1 = get_target_table(vcpu->arch.target, &num);
+	end1 = i1 + num;
+	i2 = sys_reg_descs;
+	end2 = sys_reg_descs + ARRAY_SIZE(sys_reg_descs);
+
+	BUG_ON(i1 == end1 || i2 == end2);
+
+	/* Walk carefully, as both tables may refer to the same register. */
+	while (i1 || i2) {
+		int cmp = cmp_sys_reg(i1, i2);
+		/* target-specific overrides generic entry. */
+		if (cmp <= 0) {
+			/* Ignore registers we trap but don't save. */
+			if (i1->reg) {
+				if (!copy_reg_to_user(i1, &uind))
+					return -EFAULT;
+				total++;
+			}
+		} else {
+			/* Ignore registers we trap but don't save. */
+			if (i2->reg) {
+				if (!copy_reg_to_user(i2, &uind))
+					return -EFAULT;
+				total++;
+			}
+		}
+
+		if (cmp <= 0 && ++i1 == end1)
+			i1 = NULL;
+		if (cmp >= 0 && ++i2 == end2)
+			i2 = NULL;
+	}
+	return total;
+}
+
+unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu)
+{
+	return ARRAY_SIZE(invariant_sys_regs)
+		+ num_demux_regs()
+		+ walk_sys_regs(vcpu, (u64 __user *)NULL);
+}
+
+int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
+{
+	unsigned int i;
+	int err;
+
+	/* Then give them all the invariant registers' indices. */
+	for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) {
+		if (put_user(sys_reg_to_index(&invariant_sys_regs[i]), uindices))
+			return -EFAULT;
+		uindices++;
+	}
+
+	err = walk_sys_regs(vcpu, uindices);
+	if (err < 0)
+		return err;
+	uindices += err;
+
+	return write_demux_regids(uindices);
+}
+
+void kvm_sys_reg_table_init(void)
+{
+	unsigned int i;
+	struct sys_reg_desc clidr;
+
+	/* Make sure tables are unique and in order. */
+	for (i = 1; i < ARRAY_SIZE(sys_reg_descs); i++)
+		BUG_ON(cmp_sys_reg(&sys_reg_descs[i-1], &sys_reg_descs[i]) >= 0);
+
+	/* We abuse the reset function to overwrite the table itself. */
+	for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++)
+		invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]);
+
+	/*
+	 * CLIDR format is awkward, so clean it up.  See ARM B4.1.20:
+	 *
+	 *   If software reads the Cache Type fields from Ctype1
+	 *   upwards, once it has seen a value of 0b000, no caches
+	 *   exist at further-out levels of the hierarchy. So, for
+	 *   example, if Ctype3 is the first Cache Type field with a
+	 *   value of 0b000, the values of Ctype4 to Ctype7 must be
+	 *   ignored.
+	 */
+	get_clidr_el1(NULL, &clidr); /* Ugly... */
+	cache_levels = clidr.val;
+	for (i = 0; i < 7; i++)
+		if (((cache_levels >> (i*3)) & 7) == 0)
+			break;
+	/* Clear all higher bits. */
+	cache_levels &= (1 << (i*3))-1;
+}
+
+/**
+ * kvm_reset_sys_regs - sets system registers to reset value
+ * @vcpu: The VCPU pointer
+ *
+ * This function finds the right table above and sets the registers on the
+ * virtual CPU struct to their architecturally defined reset values.
+ */
+void kvm_reset_sys_regs(struct kvm_vcpu *vcpu)
+{
+	size_t num;
+	const struct sys_reg_desc *table;
+
+	/* Catch someone adding a register without putting in reset entry. */
+	memset(&vcpu->arch.ctxt.sys_regs, 0x42, sizeof(vcpu->arch.ctxt.sys_regs));
+
+	/* Generic chip reset first (so target could override). */
+	reset_sys_reg_descs(vcpu, sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
+
+	table = get_target_table(vcpu->arch.target, &num);
+	reset_sys_reg_descs(vcpu, table, num);
+
+	for (num = 1; num < NR_SYS_REGS; num++)
+		if (vcpu_sys_reg(vcpu, num) == 0x4242424242424242)
+			panic("Didn't reset vcpu_sys_reg(%zi)", num);
+}
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
new file mode 100644
index 000000000000..d50d3722998e
--- /dev/null
+++ b/arch/arm64/kvm/sys_regs.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2012,2013 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * Derived from arch/arm/kvm/coproc.h
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Authors: Christoffer Dall <c.dall@virtualopensystems.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ARM64_KVM_SYS_REGS_LOCAL_H__
+#define __ARM64_KVM_SYS_REGS_LOCAL_H__
+
+struct sys_reg_params {
+	u8	Op0;
+	u8	Op1;
+	u8	CRn;
+	u8	CRm;
+	u8	Op2;
+	u8	Rt;
+	bool	is_write;
+};
+
+struct sys_reg_desc {
+	/* MRS/MSR instruction which accesses it. */
+	u8	Op0;
+	u8	Op1;
+	u8	CRn;
+	u8	CRm;
+	u8	Op2;
+
+	/* Trapped access from guest, if non-NULL. */
+	bool (*access)(struct kvm_vcpu *,
+		       const struct sys_reg_params *,
+		       const struct sys_reg_desc *);
+
+	/* Initialization for vcpu. */
+	void (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *);
+
+	/* Index into sys_reg[], or 0 if we don't need to save it. */
+	int reg;
+
+	/* Value (usually reset value) */
+	u64 val;
+};
+
+static inline void print_sys_reg_instr(const struct sys_reg_params *p)
+{
+	/* Look, we even formatted it for you to paste into the table! */
+	kvm_pr_unimpl(" { Op0(%2u), Op1(%2u), CRn(%2u), CRm(%2u), Op2(%2u), func_%s },\n",
+		      p->Op0, p->Op1, p->CRn, p->CRm, p->Op2, p->is_write ? "write" : "read");
+}
+
+static inline bool ignore_write(struct kvm_vcpu *vcpu,
+				const struct sys_reg_params *p)
+{
+	return true;
+}
+
+static inline bool read_zero(struct kvm_vcpu *vcpu,
+			     const struct sys_reg_params *p)
+{
+	*vcpu_reg(vcpu, p->Rt) = 0;
+	return true;
+}
+
+static inline bool write_to_read_only(struct kvm_vcpu *vcpu,
+				      const struct sys_reg_params *params)
+{
+	kvm_debug("sys_reg write to read-only register at: %lx\n",
+		  *vcpu_pc(vcpu));
+	print_sys_reg_instr(params);
+	return false;
+}
+
+static inline bool read_from_write_only(struct kvm_vcpu *vcpu,
+					const struct sys_reg_params *params)
+{
+	kvm_debug("sys_reg read to write-only register at: %lx\n",
+		  *vcpu_pc(vcpu));
+	print_sys_reg_instr(params);
+	return false;
+}
+
+/* Reset functions */
+static inline void reset_unknown(struct kvm_vcpu *vcpu,
+				 const struct sys_reg_desc *r)
+{
+	BUG_ON(!r->reg);
+	BUG_ON(r->reg >= NR_SYS_REGS);
+	vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL;
+}
+
+static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+	BUG_ON(!r->reg);
+	BUG_ON(r->reg >= NR_SYS_REGS);
+	vcpu_sys_reg(vcpu, r->reg) = r->val;
+}
+
+static inline int cmp_sys_reg(const struct sys_reg_desc *i1,
+			      const struct sys_reg_desc *i2)
+{
+	BUG_ON(i1 == i2);
+	if (!i1)
+		return 1;
+	else if (!i2)
+		return -1;
+	if (i1->Op0 != i2->Op0)
+		return i1->Op0 - i2->Op0;
+	if (i1->Op1 != i2->Op1)
+		return i1->Op1 - i2->Op1;
+	if (i1->CRn != i2->CRn)
+		return i1->CRn - i2->CRn;
+	if (i1->CRm != i2->CRm)
+		return i1->CRm - i2->CRm;
+	return i1->Op2 - i2->Op2;
+}
+
+
+#define Op0(_x) 	.Op0 = _x
+#define Op1(_x) 	.Op1 = _x
+#define CRn(_x)		.CRn = _x
+#define CRm(_x) 	.CRm = _x
+#define Op2(_x) 	.Op2 = _x
+
+#endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a5c86fc34a37..2d1bcb891468 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -783,6 +783,7 @@ struct kvm_dirty_tlb {
 #define KVM_REG_IA64		0x3000000000000000ULL
 #define KVM_REG_ARM		0x4000000000000000ULL
 #define KVM_REG_S390		0x5000000000000000ULL
+#define KVM_REG_ARM64		0x6000000000000000ULL
 
 #define KVM_REG_SIZE_SHIFT	52
 #define KVM_REG_SIZE_MASK	0x00f0000000000000ULL
-- 
cgit v1.2.3


From 060212928670593fb89243640bf05cf89560b023 Mon Sep 17 00:00:00 2001
From: Eliezer Tamir <eliezer.tamir@linux.intel.com>
Date: Mon, 10 Jun 2013 11:39:50 +0300
Subject: net: add low latency socket poll

Adds an ndo_ll_poll method and the code that supports it.
This method can be used by low latency applications to busy-poll
Ethernet device queues directly from the socket code.
sysctl_net_ll_poll controls how many microseconds to poll.
Default is zero (disabled).
Individual protocol support will be added by subsequent patches.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Tested-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/sysctl/net.txt |   7 ++
 include/linux/netdevice.h    |   3 +
 include/linux/skbuff.h       |   8 ++-
 include/net/ll_poll.h        | 148 +++++++++++++++++++++++++++++++++++++++++++
 include/net/sock.h           |   4 ++
 include/uapi/linux/snmp.h    |   1 +
 net/Kconfig                  |  12 ++++
 net/core/skbuff.c            |   4 ++
 net/core/sock.c              |   6 ++
 net/core/sysctl_net_core.c   |  10 +++
 net/ipv4/proc.c              |   1 +
 net/socket.c                 |   6 ++
 12 files changed, 208 insertions(+), 2 deletions(-)
 create mode 100644 include/net/ll_poll.h

(limited to 'include/uapi/linux')

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index c1f8640c2fc8..85ab72dcdc3c 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -50,6 +50,13 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
 it's a Per-CPU variable.
 Default: 64
 
+low_latency_poll
+----------------
+Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL)
+Approximate time in us to spin waiting for packets on the device queue.
+Recommended value is 50. May increase power usage.
+Default: 0 (off)
+
 rmem_default
 ------------
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 39bbd462d68e..2ecb96d9a1e5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -971,6 +971,9 @@ struct net_device_ops {
 						     struct netpoll_info *info,
 						     gfp_t gfp);
 	void			(*ndo_netpoll_cleanup)(struct net_device *dev);
+#endif
+#ifdef CONFIG_NET_LL_RX_POLL
+	int			(*ndo_ll_poll)(struct napi_struct *dev);
 #endif
 	int			(*ndo_set_vf_mac)(struct net_device *dev,
 						  int queue, u8 *mac);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9995834d2cb6..400d82ae2b03 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -386,6 +386,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
  *	@dma_cookie: a cookie to one of several possible DMA operations
  *		done by skb DMA functions
+  *	@napi_id: id of the NAPI struct this skb came from
  *	@secmark: security marking
  *	@mark: Generic packet mark
  *	@dropcount: total number of sk_receive_queue overflows
@@ -500,8 +501,11 @@ struct sk_buff {
 	/* 7/9 bit hole (depending on ndisc_nodetype presence) */
 	kmemcheck_bitfield_end(flags2);
 
-#ifdef CONFIG_NET_DMA
-	dma_cookie_t		dma_cookie;
+#if defined CONFIG_NET_DMA || defined CONFIG_NET_LL_RX_POLL
+	union {
+		unsigned int	napi_id;
+		dma_cookie_t	dma_cookie;
+	};
 #endif
 #ifdef CONFIG_NETWORK_SECMARK
 	__u32			secmark;
diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h
new file mode 100644
index 000000000000..bc262f88173f
--- /dev/null
+++ b/include/net/ll_poll.h
@@ -0,0 +1,148 @@
+/*
+ * Low Latency Sockets
+ * Copyright(c) 2013 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Author: Eliezer Tamir
+ *
+ * Contact Information:
+ * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
+ */
+
+/*
+ * For now this depends on CONFIG_X86_TSC
+ */
+
+#ifndef _LINUX_NET_LL_POLL_H
+#define _LINUX_NET_LL_POLL_H
+
+#include <linux/netdevice.h>
+#include <net/ip.h>
+
+#ifdef CONFIG_NET_LL_RX_POLL
+
+struct napi_struct;
+extern unsigned long sysctl_net_ll_poll __read_mostly;
+
+/* return values from ndo_ll_poll */
+#define LL_FLUSH_FAILED		-1
+#define LL_FLUSH_BUSY		-2
+
+/* we don't mind a ~2.5% imprecision */
+#define TSC_MHZ (tsc_khz >> 10)
+
+static inline cycles_t ll_end_time(void)
+{
+	return TSC_MHZ * ACCESS_ONCE(sysctl_net_ll_poll) + get_cycles();
+}
+
+static inline bool sk_valid_ll(struct sock *sk)
+{
+	return sysctl_net_ll_poll && sk->sk_napi_id &&
+	       !need_resched() && !signal_pending(current);
+}
+
+static inline bool can_poll_ll(cycles_t end_time)
+{
+	return !time_after((unsigned long)get_cycles(),
+			    (unsigned long)end_time);
+}
+
+static inline bool sk_poll_ll(struct sock *sk, int nonblock)
+{
+	cycles_t end_time = ll_end_time();
+	const struct net_device_ops *ops;
+	struct napi_struct *napi;
+	int rc = false;
+
+	/*
+	 * rcu read lock for napi hash
+	 * bh so we don't race with net_rx_action
+	 */
+	rcu_read_lock_bh();
+
+	napi = napi_by_id(sk->sk_napi_id);
+	if (!napi)
+		goto out;
+
+	ops = napi->dev->netdev_ops;
+	if (!ops->ndo_ll_poll)
+		goto out;
+
+	do {
+
+		rc = ops->ndo_ll_poll(napi);
+
+		if (rc == LL_FLUSH_FAILED)
+			break; /* permanent failure */
+
+		if (rc > 0)
+			/* local bh are disabled so it is ok to use _BH */
+			NET_ADD_STATS_BH(sock_net(sk),
+					 LINUX_MIB_LOWLATENCYRXPACKETS, rc);
+
+	} while (skb_queue_empty(&sk->sk_receive_queue)
+			&& can_poll_ll(end_time) && !nonblock);
+
+	rc = !skb_queue_empty(&sk->sk_receive_queue);
+out:
+	rcu_read_unlock_bh();
+	return rc;
+}
+
+/* used in the NIC receive handler to mark the skb */
+static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
+{
+	skb->napi_id = napi->napi_id;
+}
+
+/* used in the protocol hanlder to propagate the napi_id to the socket */
+static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
+{
+	sk->sk_napi_id = skb->napi_id;
+}
+
+#else /* CONFIG_NET_LL_RX_POLL */
+
+static inline cycles_t ll_end_time(void)
+{
+	return 0;
+}
+
+static inline bool sk_valid_ll(struct sock *sk)
+{
+	return false;
+}
+
+static inline bool sk_poll_ll(struct sock *sk, int nonblock)
+{
+	return false;
+}
+
+static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
+{
+}
+
+static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
+{
+}
+
+static inline bool can_poll_ll(cycles_t end_time)
+{
+	return false;
+}
+
+#endif /* CONFIG_NET_LL_RX_POLL */
+#endif /* _LINUX_NET_LL_POLL_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 66772cf8c3c5..ac8e1818380c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -229,6 +229,7 @@ struct cg_proto;
   *	@sk_omem_alloc: "o" is "option" or "other"
   *	@sk_wmem_queued: persistent queue size
   *	@sk_forward_alloc: space allocated forward
+  *	@sk_napi_id: id of the last napi context to receive data for sk
   *	@sk_allocation: allocation mode
   *	@sk_sndbuf: size of send buffer in bytes
   *	@sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
@@ -324,6 +325,9 @@ struct sock {
 	int			sk_forward_alloc;
 #ifdef CONFIG_RPS
 	__u32			sk_rxhash;
+#endif
+#ifdef CONFIG_NET_LL_RX_POLL
+	unsigned int		sk_napi_id;
 #endif
 	atomic_t		sk_drops;
 	int			sk_rcvbuf;
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index df2e8b4f9c03..26cbf76f8058 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -253,6 +253,7 @@ enum
 	LINUX_MIB_TCPFASTOPENLISTENOVERFLOW,	/* TCPFastOpenListenOverflow */
 	LINUX_MIB_TCPFASTOPENCOOKIEREQD,	/* TCPFastOpenCookieReqd */
 	LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */
+	LINUX_MIB_LOWLATENCYRXPACKETS,		/* LowLatencyRxPackets */
 	__LINUX_MIB_MAX
 };
 
diff --git a/net/Kconfig b/net/Kconfig
index 523e43e6da1b..d6a9ce6e1800 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -243,6 +243,18 @@ config NETPRIO_CGROUP
 	  Cgroup subsystem for use in assigning processes to network priorities on
 	  a per-interface basis
 
+config NET_LL_RX_POLL
+	bool "Low Latency Receive Poll"
+	depends on X86_TSC
+	default n
+	---help---
+	  Support Low Latency Receive Queue Poll.
+	  (For network card drivers which support this option.)
+	  When waiting for data in read or poll call directly into the the device driver
+	  to flush packets which may be pending on the device queues into the stack.
+
+	  If unsure, say N.
+
 config BQL
 	boolean
 	depends on SYSFS
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 73f57a0e1523..4a4181e16c1a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -733,6 +733,10 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->vlan_tci		= old->vlan_tci;
 
 	skb_copy_secmark(new, old);
+
+#ifdef CONFIG_NET_LL_RX_POLL
+	new->napi_id	= old->napi_id;
+#endif
 }
 
 /*
diff --git a/net/core/sock.c b/net/core/sock.c
index 88868a9d21da..788c0da5eed1 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -139,6 +139,8 @@
 #include <net/tcp.h>
 #endif
 
+#include <net/ll_poll.h>
+
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
 
@@ -2284,6 +2286,10 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
 	sk->sk_stamp = ktime_set(-1L, 0);
 
+#ifdef CONFIG_NET_LL_RX_POLL
+	sk->sk_napi_id		=	0;
+#endif
+
 	/*
 	 * Before updating sk_refcnt, we must commit prior changes to memory
 	 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 741db5fc7806..4b48f39582b0 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -19,6 +19,7 @@
 #include <net/ip.h>
 #include <net/sock.h>
 #include <net/net_ratelimit.h>
+#include <net/ll_poll.h>
 
 static int one = 1;
 
@@ -284,6 +285,15 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= flow_limit_table_len_sysctl
 	},
 #endif /* CONFIG_NET_FLOW_LIMIT */
+#ifdef CONFIG_NET_LL_RX_POLL
+	{
+		.procname	= "low_latency_poll",
+		.data		= &sysctl_net_ll_poll,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax
+	},
+#endif
 #endif /* CONFIG_NET */
 	{
 		.procname	= "netdev_budget",
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 2a5bf86d2415..6577a1149a47 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
 	SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
 	SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
+	SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/socket.c b/net/socket.c
index 3ebdcb805c51..21fd29f63ed2 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -104,6 +104,12 @@
 #include <linux/route.h>
 #include <linux/sockios.h>
 #include <linux/atalk.h>
+#include <net/ll_poll.h>
+
+#ifdef CONFIG_NET_LL_RX_POLL
+unsigned long sysctl_net_ll_poll __read_mostly;
+EXPORT_SYMBOL_GPL(sysctl_net_ll_poll);
+#endif
 
 static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
-- 
cgit v1.2.3


From 9ba18891f75535eca3ef53138b48970eb60f5255 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vyasevic@redhat.com>
Date: Wed, 5 Jun 2013 10:08:00 -0400
Subject: bridge: Add flag to control mac learning.

Allow user to control whether mac learning is enabled on the port.
By default, mac learning is enabled.  Disabling mac learning will
cause new dynamic FDB entries to not be created for a particular port.

Signed-off-by: Vlad Yasevich <vyasevic@redhat.com>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 net/bridge/br_if.c           | 2 +-
 net/bridge/br_input.c        | 6 ++++--
 net/bridge/br_netlink.c      | 6 +++++-
 net/bridge/br_private.h      | 1 +
 net/bridge/br_sysfs_if.c     | 2 ++
 6 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index b05823cae784..8643809d8417 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -221,6 +221,7 @@ enum {
 	IFLA_BRPORT_GUARD,	/* bpdu guard              */
 	IFLA_BRPORT_PROTECT,	/* root port protection    */
 	IFLA_BRPORT_FAST_LEAVE,	/* multicast fast leave    */
+	IFLA_BRPORT_LEARNING,	/* mac learning */
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 4cdba60926ff..2c08911df578 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -221,7 +221,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
 	p->path_cost = port_cost(dev);
 	p->priority = 0x8000 >> BR_PORT_BITS;
 	p->port_no = index;
-	p->flags = 0;
+	p->flags = BR_LEARNING;
 	br_init_port(p);
 	p->state = BR_STATE_DISABLED;
 	br_stp_port_timer_init(p);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 828e2bcc1f52..7e993667d4bf 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -75,7 +75,8 @@ int br_handle_frame_finish(struct sk_buff *skb)
 
 	/* insert into forwarding database after filtering to avoid spoofing */
 	br = p->br;
-	br_fdb_update(br, p, eth_hdr(skb)->h_source, vid);
+	if (p->flags & BR_LEARNING)
+		br_fdb_update(br, p, eth_hdr(skb)->h_source, vid);
 
 	if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) &&
 	    br_multicast_rcv(br, p, skb))
@@ -142,7 +143,8 @@ static int br_handle_local_finish(struct sk_buff *skb)
 	u16 vid = 0;
 
 	br_vlan_get_tag(skb, &vid);
-	br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid);
+	if (p->flags & BR_LEARNING)
+		br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid);
 	return 0;	 /* process further */
 }
 
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 8e3abf564798..ce902bf8a618 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -30,6 +30,7 @@ static inline size_t br_port_info_size(void)
 		+ nla_total_size(1)	/* IFLA_BRPORT_GUARD */
 		+ nla_total_size(1)	/* IFLA_BRPORT_PROTECT */
 		+ nla_total_size(1)	/* IFLA_BRPORT_FAST_LEAVE */
+		+ nla_total_size(1)	/* IFLA_BRPORT_LEARNING */
 		+ 0;
 }
 
@@ -56,7 +57,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 	    nla_put_u8(skb, IFLA_BRPORT_MODE, mode) ||
 	    nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) ||
-	    nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)))
+	    nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) ||
+	    nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)))
 		return -EMSGSIZE;
 
 	return 0;
@@ -281,6 +283,7 @@ static const struct nla_policy ifla_brport_policy[IFLA_BRPORT_MAX + 1] = {
 	[IFLA_BRPORT_MODE]	= { .type = NLA_U8 },
 	[IFLA_BRPORT_GUARD]	= { .type = NLA_U8 },
 	[IFLA_BRPORT_PROTECT]	= { .type = NLA_U8 },
+	[IFLA_BRPORT_LEARNING]	= { .type = NLA_U8 },
 };
 
 /* Change the state of the port and notify spanning tree */
@@ -328,6 +331,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
 	br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD);
 	br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE);
 	br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK);
+	br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING);
 
 	if (tb[IFLA_BRPORT_COST]) {
 		err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST]));
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1b0ac95a5c37..04d7f43508f7 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -158,6 +158,7 @@ struct net_bridge_port
 #define BR_ROOT_BLOCK		0x00000004
 #define BR_MULTICAST_FAST_LEAVE	0x00000008
 #define BR_ADMIN_COST		0x00000010
+#define BR_LEARNING		0x00000020
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	u32				multicast_startup_queries_sent;
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index a1ef1b6e14dc..707f3628e9cd 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -158,6 +158,7 @@ static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush);
 BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE);
 BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD);
 BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK);
+BRPORT_ATTR_FLAG(learning, BR_LEARNING);
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
@@ -195,6 +196,7 @@ static const struct brport_attribute *brport_attrs[] = {
 	&brport_attr_hairpin_mode,
 	&brport_attr_bpdu_guard,
 	&brport_attr_root_block,
+	&brport_attr_learning,
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	&brport_attr_multicast_router,
 	&brport_attr_multicast_fast_leave,
-- 
cgit v1.2.3


From 867a59436fc35593ae0e0efcd56cc6d2f8506586 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vyasevic@redhat.com>
Date: Wed, 5 Jun 2013 10:08:01 -0400
Subject: bridge: Add a flag to control unicast packet flood.

Add a flag to control flood of unicast traffic.  By default, flood is
on and the bridge will flood unicast traffic if it doesn't know
the destination.  When the flag is turned off, unicast traffic
without an FDB will not be forwarded to the specified port.

Signed-off-by: Vlad Yasevich <vyasevic@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h |  1 +
 net/bridge/br_device.c       |  8 ++++----
 net/bridge/br_forward.c      | 14 +++++++++-----
 net/bridge/br_if.c           |  2 +-
 net/bridge/br_input.c        |  9 ++++++---
 net/bridge/br_netlink.c      |  6 +++++-
 net/bridge/br_private.h      |  6 ++++--
 net/bridge/br_sysfs_if.c     |  2 ++
 8 files changed, 32 insertions(+), 16 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8643809d8417..da05a2698cb5 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -222,6 +222,7 @@ enum {
 	IFLA_BRPORT_PROTECT,	/* root port protection    */
 	IFLA_BRPORT_FAST_LEAVE,	/* multicast fast leave    */
 	IFLA_BRPORT_LEARNING,	/* mac learning */
+	IFLA_BRPORT_UNICAST_FLOOD, /* flood unicast traffic */
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 75f3239130f8..2ef66781fedb 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -58,10 +58,10 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	skb_pull(skb, ETH_HLEN);
 
 	if (is_broadcast_ether_addr(dest))
-		br_flood_deliver(br, skb);
+		br_flood_deliver(br, skb, false);
 	else if (is_multicast_ether_addr(dest)) {
 		if (unlikely(netpoll_tx_running(dev))) {
-			br_flood_deliver(br, skb);
+			br_flood_deliver(br, skb, false);
 			goto out;
 		}
 		if (br_multicast_rcv(br, NULL, skb)) {
@@ -73,11 +73,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 		if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb))
 			br_multicast_deliver(mdst, skb);
 		else
-			br_flood_deliver(br, skb);
+			br_flood_deliver(br, skb, false);
 	} else if ((dst = __br_fdb_get(br, dest, vid)) != NULL)
 		br_deliver(dst->dst, skb);
 	else
-		br_flood_deliver(br, skb);
+		br_flood_deliver(br, skb, true);
 
 out:
 	rcu_read_unlock();
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 092b20e4ee4c..4b81b1471789 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -174,7 +174,8 @@ out:
 static void br_flood(struct net_bridge *br, struct sk_buff *skb,
 		     struct sk_buff *skb0,
 		     void (*__packet_hook)(const struct net_bridge_port *p,
-					   struct sk_buff *skb))
+					   struct sk_buff *skb),
+		     bool unicast)
 {
 	struct net_bridge_port *p;
 	struct net_bridge_port *prev;
@@ -182,6 +183,9 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb,
 	prev = NULL;
 
 	list_for_each_entry_rcu(p, &br->port_list, list) {
+		/* Do not flood unicast traffic to ports that turn it off */
+		if (unicast && !(p->flags & BR_FLOOD))
+			continue;
 		prev = maybe_deliver(prev, p, skb, __packet_hook);
 		if (IS_ERR(prev))
 			goto out;
@@ -203,16 +207,16 @@ out:
 
 
 /* called with rcu_read_lock */
-void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb)
+void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast)
 {
-	br_flood(br, skb, NULL, __br_deliver);
+	br_flood(br, skb, NULL, __br_deliver, unicast);
 }
 
 /* called under bridge lock */
 void br_flood_forward(struct net_bridge *br, struct sk_buff *skb,
-		      struct sk_buff *skb2)
+		      struct sk_buff *skb2, bool unicast)
 {
-	br_flood(br, skb, skb2, __br_forward);
+	br_flood(br, skb, skb2, __br_forward, unicast);
 }
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 2c08911df578..5623be6b9ecd 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -221,7 +221,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
 	p->path_cost = port_cost(dev);
 	p->priority = 0x8000 >> BR_PORT_BITS;
 	p->port_no = index;
-	p->flags = BR_LEARNING;
+	p->flags = BR_LEARNING | BR_FLOOD;
 	br_init_port(p);
 	p->state = BR_STATE_DISABLED;
 	br_stp_port_timer_init(p);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 7e993667d4bf..1b8b8b824cd7 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -65,6 +65,7 @@ int br_handle_frame_finish(struct sk_buff *skb)
 	struct net_bridge_fdb_entry *dst;
 	struct net_bridge_mdb_entry *mdst;
 	struct sk_buff *skb2;
+	bool unicast = true;
 	u16 vid = 0;
 
 	if (!p || p->state == BR_STATE_DISABLED)
@@ -95,9 +96,10 @@ int br_handle_frame_finish(struct sk_buff *skb)
 
 	dst = NULL;
 
-	if (is_broadcast_ether_addr(dest))
+	if (is_broadcast_ether_addr(dest)) {
 		skb2 = skb;
-	else if (is_multicast_ether_addr(dest)) {
+		unicast = false;
+	} else if (is_multicast_ether_addr(dest)) {
 		mdst = br_mdb_get(br, skb, vid);
 		if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) {
 			if ((mdst && mdst->mglist) ||
@@ -110,6 +112,7 @@ int br_handle_frame_finish(struct sk_buff *skb)
 		} else
 			skb2 = skb;
 
+		unicast = false;
 		br->dev->stats.multicast++;
 	} else if ((dst = __br_fdb_get(br, dest, vid)) &&
 			dst->is_local) {
@@ -123,7 +126,7 @@ int br_handle_frame_finish(struct sk_buff *skb)
 			dst->used = jiffies;
 			br_forward(dst->dst, skb, skb2);
 		} else
-			br_flood_forward(br, skb, skb2);
+			br_flood_forward(br, skb, skb2, unicast);
 	}
 
 	if (skb2)
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index ce902bf8a618..1fc30abd3a52 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -31,6 +31,7 @@ static inline size_t br_port_info_size(void)
 		+ nla_total_size(1)	/* IFLA_BRPORT_PROTECT */
 		+ nla_total_size(1)	/* IFLA_BRPORT_FAST_LEAVE */
 		+ nla_total_size(1)	/* IFLA_BRPORT_LEARNING */
+		+ nla_total_size(1)	/* IFLA_BRPORT_UNICAST_FLOOD */
 		+ 0;
 }
 
@@ -58,7 +59,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 	    nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) ||
-	    nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)))
+	    nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) ||
+	    nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)))
 		return -EMSGSIZE;
 
 	return 0;
@@ -284,6 +286,7 @@ static const struct nla_policy ifla_brport_policy[IFLA_BRPORT_MAX + 1] = {
 	[IFLA_BRPORT_GUARD]	= { .type = NLA_U8 },
 	[IFLA_BRPORT_PROTECT]	= { .type = NLA_U8 },
 	[IFLA_BRPORT_LEARNING]	= { .type = NLA_U8 },
+	[IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 },
 };
 
 /* Change the state of the port and notify spanning tree */
@@ -332,6 +335,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
 	br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE);
 	br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK);
 	br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING);
+	br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD);
 
 	if (tb[IFLA_BRPORT_COST]) {
 		err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST]));
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 04d7f43508f7..3be89b3ce17b 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -159,6 +159,7 @@ struct net_bridge_port
 #define BR_MULTICAST_FAST_LEAVE	0x00000008
 #define BR_ADMIN_COST		0x00000010
 #define BR_LEARNING		0x00000020
+#define BR_FLOOD		0x00000040
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	u32				multicast_startup_queries_sent;
@@ -414,9 +415,10 @@ extern int br_dev_queue_push_xmit(struct sk_buff *skb);
 extern void br_forward(const struct net_bridge_port *to,
 		struct sk_buff *skb, struct sk_buff *skb0);
 extern int br_forward_finish(struct sk_buff *skb);
-extern void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb);
+extern void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb,
+			     bool unicast);
 extern void br_flood_forward(struct net_bridge *br, struct sk_buff *skb,
-			     struct sk_buff *skb2);
+			     struct sk_buff *skb2, bool unicast);
 
 /* br_if.c */
 extern void br_port_carrier_check(struct net_bridge_port *p);
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 707f3628e9cd..2a2cdb756d51 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -159,6 +159,7 @@ BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE);
 BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD);
 BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK);
 BRPORT_ATTR_FLAG(learning, BR_LEARNING);
+BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD);
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
@@ -197,6 +198,7 @@ static const struct brport_attribute *brport_attrs[] = {
 	&brport_attr_bpdu_guard,
 	&brport_attr_root_block,
 	&brport_attr_learning,
+	&brport_attr_unicast_flood,
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	&brport_attr_multicast_router,
 	&brport_attr_multicast_fast_leave,
-- 
cgit v1.2.3


From 45203a3b380cee28f570475c0d28c169f908c209 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 6 Jun 2013 08:43:22 -0700
Subject: net_sched: add 64bit rate estimators

struct gnet_stats_rate_est contains u32 fields, so the bytes per second
field can wrap at 34360Mbit.

Add a new gnet_stats_rate_est64 structure to get 64bit bps/pps fields,
and switch the kernel to use this structure natively.

This structure is dumped to user space as a new attribute :

TCA_STATS_RATE_EST64

Old tc command will now display the capped bps (to 34360Mbit), instead
of wrapped values, and updated tc command will display correct
information.

Old tc command output, after patch :

eric:~# tc -s -d qd sh dev lo
qdisc pfifo 8001: root refcnt 2 limit 1000p
 Sent 80868245400 bytes 1978837 pkt (dropped 0, overlimits 0 requeues 0)
 rate 34360Mbit 189696pps backlog 0b 0p requeues 0

This patch carefully reorganizes "struct Qdisc" layout to get optimal
performance on SMP.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h              |  2 +-
 include/net/gen_stats.h            | 10 +++++-----
 include/net/netfilter/xt_rateest.h |  2 +-
 include/net/sch_generic.h          | 13 +++++++------
 include/uapi/linux/gen_stats.h     | 11 +++++++++++
 net/core/gen_estimator.c           | 12 ++++++------
 net/core/gen_stats.c               | 22 +++++++++++++++++-----
 net/netfilter/xt_rateest.c         |  2 +-
 net/sched/sch_cbq.c                |  2 +-
 net/sched/sch_drr.c                |  2 +-
 net/sched/sch_hfsc.c               |  2 +-
 net/sched/sch_htb.c                |  2 +-
 net/sched/sch_qfq.c                |  2 +-
 13 files changed, 54 insertions(+), 30 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 06ef7e926a66..b8ffac7b6bab 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -18,7 +18,7 @@ struct tcf_common {
 	struct tcf_t			tcfc_tm;
 	struct gnet_stats_basic_packed	tcfc_bstats;
 	struct gnet_stats_queue		tcfc_qstats;
-	struct gnet_stats_rate_est	tcfc_rate_est;
+	struct gnet_stats_rate_est64	tcfc_rate_est;
 	spinlock_t			tcfc_lock;
 	struct rcu_head			tcfc_rcu;
 };
diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index a79b6cfb02a8..cf8439ba4d11 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -30,7 +30,7 @@ extern int gnet_stats_copy_basic(struct gnet_dump *d,
 				 struct gnet_stats_basic_packed *b);
 extern int gnet_stats_copy_rate_est(struct gnet_dump *d,
 				    const struct gnet_stats_basic_packed *b,
-				    struct gnet_stats_rate_est *r);
+				    struct gnet_stats_rate_est64 *r);
 extern int gnet_stats_copy_queue(struct gnet_dump *d,
 				 struct gnet_stats_queue *q);
 extern int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len);
@@ -38,13 +38,13 @@ extern int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len);
 extern int gnet_stats_finish_copy(struct gnet_dump *d);
 
 extern int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
-			     struct gnet_stats_rate_est *rate_est,
+			     struct gnet_stats_rate_est64 *rate_est,
 			     spinlock_t *stats_lock, struct nlattr *opt);
 extern void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
-			       struct gnet_stats_rate_est *rate_est);
+			       struct gnet_stats_rate_est64 *rate_est);
 extern int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
-				 struct gnet_stats_rate_est *rate_est,
+				 struct gnet_stats_rate_est64 *rate_est,
 				 spinlock_t *stats_lock, struct nlattr *opt);
 extern bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
-				 const struct gnet_stats_rate_est *rate_est);
+				 const struct gnet_stats_rate_est64 *rate_est);
 #endif
diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h
index 5a2978d1cb22..495c71f66e7e 100644
--- a/include/net/netfilter/xt_rateest.h
+++ b/include/net/netfilter/xt_rateest.h
@@ -6,7 +6,7 @@ struct xt_rateest {
 	struct gnet_stats_basic_packed	bstats;
 	spinlock_t			lock;
 	/* keep rstats and lock on same cache line to speedup xt_rateest_mt() */
-	struct gnet_stats_rate_est	rstats;
+	struct gnet_stats_rate_est64	rstats;
 
 	/* following fields not accessed in hot path */
 	struct hlist_node		list;
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e7f4e21cc3e1..df5676029827 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -58,14 +58,12 @@ struct Qdisc {
 				      * multiqueue device.
 				      */
 #define TCQ_F_WARN_NONWC	(1 << 16)
-	int			padded;
+	u32			limit;
 	const struct Qdisc_ops	*ops;
 	struct qdisc_size_table	__rcu *stab;
 	struct list_head	list;
 	u32			handle;
 	u32			parent;
-	atomic_t		refcnt;
-	struct gnet_stats_rate_est	rate_est;
 	int			(*reshape_fail)(struct sk_buff *skb,
 					struct Qdisc *q);
 
@@ -76,8 +74,9 @@ struct Qdisc {
 	 */
 	struct Qdisc		*__parent;
 	struct netdev_queue	*dev_queue;
-	struct Qdisc		*next_sched;
 
+	struct gnet_stats_rate_est64	rate_est;
+	struct Qdisc		*next_sched;
 	struct sk_buff		*gso_skb;
 	/*
 	 * For performance sake on SMP, we put highly modified fields at the end
@@ -88,8 +87,10 @@ struct Qdisc {
 	unsigned int		__state;
 	struct gnet_stats_queue	qstats;
 	struct rcu_head		rcu_head;
-	spinlock_t		busylock;
-	u32			limit;
+	int			padded;
+	atomic_t		refcnt;
+
+	spinlock_t		busylock ____cacheline_aligned_in_smp;
 };
 
 static inline bool qdisc_is_running(const struct Qdisc *qdisc)
diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h
index 552c8a0a12d1..6487317ea619 100644
--- a/include/uapi/linux/gen_stats.h
+++ b/include/uapi/linux/gen_stats.h
@@ -9,6 +9,7 @@ enum {
 	TCA_STATS_RATE_EST,
 	TCA_STATS_QUEUE,
 	TCA_STATS_APP,
+	TCA_STATS_RATE_EST64,
 	__TCA_STATS_MAX,
 };
 #define TCA_STATS_MAX (__TCA_STATS_MAX - 1)
@@ -37,6 +38,16 @@ struct gnet_stats_rate_est {
 	__u32	pps;
 };
 
+/**
+ * struct gnet_stats_rate_est64 - rate estimator
+ * @bps: current byte rate
+ * @pps: current packet rate
+ */
+struct gnet_stats_rate_est64 {
+	__u64	bps;
+	__u64	pps;
+};
+
 /**
  * struct gnet_stats_queue - queuing statistics
  * @qlen: queue length
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index d9d198aa9fed..6b5b6e7013ca 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -82,7 +82,7 @@ struct gen_estimator
 {
 	struct list_head	list;
 	struct gnet_stats_basic_packed	*bstats;
-	struct gnet_stats_rate_est	*rate_est;
+	struct gnet_stats_rate_est64	*rate_est;
 	spinlock_t		*stats_lock;
 	int			ewma_log;
 	u64			last_bytes;
@@ -167,7 +167,7 @@ static void gen_add_node(struct gen_estimator *est)
 
 static
 struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
-				    const struct gnet_stats_rate_est *rate_est)
+				    const struct gnet_stats_rate_est64 *rate_est)
 {
 	struct rb_node *p = est_root.rb_node;
 
@@ -203,7 +203,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
  *
  */
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
-		      struct gnet_stats_rate_est *rate_est,
+		      struct gnet_stats_rate_est64 *rate_est,
 		      spinlock_t *stats_lock,
 		      struct nlattr *opt)
 {
@@ -258,7 +258,7 @@ EXPORT_SYMBOL(gen_new_estimator);
  * Note : Caller should respect an RCU grace period before freeing stats_lock
  */
 void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
-			struct gnet_stats_rate_est *rate_est)
+			struct gnet_stats_rate_est64 *rate_est)
 {
 	struct gen_estimator *e;
 
@@ -290,7 +290,7 @@ EXPORT_SYMBOL(gen_kill_estimator);
  * Returns 0 on success or a negative error code.
  */
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
-			  struct gnet_stats_rate_est *rate_est,
+			  struct gnet_stats_rate_est64 *rate_est,
 			  spinlock_t *stats_lock, struct nlattr *opt)
 {
 	gen_kill_estimator(bstats, rate_est);
@@ -306,7 +306,7 @@ EXPORT_SYMBOL(gen_replace_estimator);
  * Returns true if estimator is active, and false if not.
  */
 bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
-			  const struct gnet_stats_rate_est *rate_est)
+			  const struct gnet_stats_rate_est64 *rate_est)
 {
 	bool res;
 
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index ddedf211e588..9d3d9e78397b 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -143,18 +143,30 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
 int
 gnet_stats_copy_rate_est(struct gnet_dump *d,
 			 const struct gnet_stats_basic_packed *b,
-			 struct gnet_stats_rate_est *r)
+			 struct gnet_stats_rate_est64 *r)
 {
+	struct gnet_stats_rate_est est;
+	int res;
+
 	if (b && !gen_estimator_active(b, r))
 		return 0;
 
+	est.bps = min_t(u64, UINT_MAX, r->bps);
+	/* we have some time before reaching 2^32 packets per second */
+	est.pps = r->pps;
+
 	if (d->compat_tc_stats) {
-		d->tc_stats.bps = r->bps;
-		d->tc_stats.pps = r->pps;
+		d->tc_stats.bps = est.bps;
+		d->tc_stats.pps = est.pps;
 	}
 
-	if (d->tail)
-		return gnet_stats_copy(d, TCA_STATS_RATE_EST, r, sizeof(*r));
+	if (d->tail) {
+		res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est));
+		if (res < 0 || est.bps == r->bps)
+			return res;
+		/* emit 64bit stats only if needed */
+		return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r));
+	}
 
 	return 0;
 }
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index ed0db15ab00e..7720b036d76a 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -18,7 +18,7 @@ static bool
 xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_rateest_match_info *info = par->matchinfo;
-	struct gnet_stats_rate_est *r;
+	struct gnet_stats_rate_est64 *r;
 	u_int32_t bps1, bps2, pps1, pps2;
 	bool ret = true;
 
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 1bc210ffcba2..71a568862557 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -130,7 +130,7 @@ struct cbq_class {
 	psched_time_t		penalized;
 	struct gnet_stats_basic_packed bstats;
 	struct gnet_stats_queue qstats;
-	struct gnet_stats_rate_est rate_est;
+	struct gnet_stats_rate_est64 rate_est;
 	struct tc_cbq_xstats	xstats;
 
 	struct tcf_proto	*filter_list;
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 759b308d1a8d..8302717ea303 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -25,7 +25,7 @@ struct drr_class {
 
 	struct gnet_stats_basic_packed		bstats;
 	struct gnet_stats_queue		qstats;
-	struct gnet_stats_rate_est	rate_est;
+	struct gnet_stats_rate_est64	rate_est;
 	struct list_head		alist;
 	struct Qdisc			*qdisc;
 
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 9facea03faeb..c4075610502c 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -114,7 +114,7 @@ struct hfsc_class {
 
 	struct gnet_stats_basic_packed bstats;
 	struct gnet_stats_queue qstats;
-	struct gnet_stats_rate_est rate_est;
+	struct gnet_stats_rate_est64 rate_est;
 	unsigned int	level;		/* class level in hierarchy */
 	struct tcf_proto *filter_list;	/* filter list */
 	unsigned int	filter_cnt;	/* filter count */
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index adaedd79389c..162fb800754c 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -78,7 +78,7 @@ struct htb_class {
 	/* general class parameters */
 	struct gnet_stats_basic_packed bstats;
 	struct gnet_stats_queue qstats;
-	struct gnet_stats_rate_est rate_est;
+	struct gnet_stats_rate_est64 rate_est;
 	struct tc_htb_xstats xstats;	/* our special stats */
 	int refcnt;		/* usage count of this class */
 
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index d51852bba01c..7c195d972bf0 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -138,7 +138,7 @@ struct qfq_class {
 
 	struct gnet_stats_basic_packed bstats;
 	struct gnet_stats_queue qstats;
-	struct gnet_stats_rate_est rate_est;
+	struct gnet_stats_rate_est64 rate_est;
 	struct Qdisc *qdisc;
 	struct list_head alist;		/* Link for active-classes list. */
 	struct qfq_aggregate *agg;	/* Parent aggregate. */
-- 
cgit v1.2.3


From a0a2af765543a96bdf188ec57dd201e708195302 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Mon, 3 Jun 2013 18:10:45 +0200
Subject: nl80211: add kernel-doc for NL80211_FEATURE_ACTIVE_MONITOR

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 5920715278c2..1f0019d4cce6 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -3579,6 +3579,10 @@ enum nl80211_ap_sme_features {
  *	Peering Management entity which may be implemented by registering for
  *	beacons or NL80211_CMD_NEW_PEER_CANDIDATE events. The mesh beacon is
  *	still generated by the driver.
+ * @NL80211_FEATURE_ACTIVE_MONITOR: This driver supports an active monitor
+ *	interface. An active monitor interface behaves like a normal monitor
+ *	interface, but gets added to the driver. It ensures that incoming
+ *	unicast packets directed at the configured interface address get ACKed.
  */
 enum nl80211_feature_flags {
 	NL80211_FEATURE_SK_TX_STATUS			= 1 << 0,
-- 
cgit v1.2.3


From 8e7c053853b7d299e8a2b8733659b0df8eee51f7 Mon Sep 17 00:00:00 2001
From: Colleen Twitty <colleen@cozybit.com>
Date: Mon, 3 Jun 2013 09:53:39 -0700
Subject: {nl,cfg}80211: make peer link expiration time configurable

If a STA has a peer that it hasn't seen any tx activity
from for a certain length of time, the peer link is
expired. This means the inactive STA is removed from the
list of peers and that STA is not considered a peer again
unless it re-peers.  Previously, this inactivity time was
always 30 minutes.  Now, add it to the mesh configuration
and allow it to be configured.  Retain 30 minutes as a
default value.

Signed-off-by: Colleen Twitty <colleen@cozybit.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 4 ++++
 include/uapi/linux/nl80211.h | 5 +++++
 net/wireless/mesh.c          | 2 ++
 net/wireless/nl80211.c       | 8 +++++++-
 4 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 855c76ccff38..31ca11672ca8 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1124,6 +1124,9 @@ struct bss_parameters {
  *	setting for new peer links.
  * @dot11MeshAwakeWindowDuration: The duration in TUs the STA will remain awake
  *	after transmitting its beacon.
+ * @plink_timeout: If no tx activity is seen from a STA we've established
+ *	peering with for longer than this time (in seconds), then remove it
+ *	from the STA's list of peers.  Default is 30 minutes.
  */
 struct mesh_config {
 	u16 dot11MeshRetryTimeout;
@@ -1153,6 +1156,7 @@ struct mesh_config {
 	u16 dot11MeshHWMPconfirmationInterval;
 	enum nl80211_mesh_power_mode power_mode;
 	u16 dot11MeshAwakeWindowDuration;
+	u32 plink_timeout;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 1f0019d4cce6..ca6facf4df0c 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2577,6 +2577,10 @@ enum nl80211_mesh_power_mode {
  *
  * @NL80211_MESHCONF_AWAKE_WINDOW: awake window duration (in TUs)
  *
+ * @NL80211_MESHCONF_PLINK_TIMEOUT: If no tx activity is seen from a STA we've
+ *	established peering with for longer than this time (in seconds), then
+ *	remove it from the STA's list of peers.  Default is 30 minutes.
+ *
  * @__NL80211_MESHCONF_ATTR_AFTER_LAST: internal use
  */
 enum nl80211_meshconf_params {
@@ -2608,6 +2612,7 @@ enum nl80211_meshconf_params {
 	NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL,
 	NL80211_MESHCONF_POWER_MODE,
 	NL80211_MESHCONF_AWAKE_WINDOW,
+	NL80211_MESHCONF_PLINK_TIMEOUT,
 
 	/* keep last */
 	__NL80211_MESHCONF_ATTR_AFTER_LAST,
diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c
index 5dfb289ab761..0daaf72e1b81 100644
--- a/net/wireless/mesh.c
+++ b/net/wireless/mesh.c
@@ -18,6 +18,7 @@
 #define MESH_PATH_TO_ROOT_TIMEOUT      6000
 #define MESH_ROOT_INTERVAL     5000
 #define MESH_ROOT_CONFIRMATION_INTERVAL 2000
+#define MESH_DEFAULT_PLINK_TIMEOUT	1800 /* timeout in seconds */
 
 /*
  * Minimum interval between two consecutive PREQs originated by the same
@@ -75,6 +76,7 @@ const struct mesh_config default_mesh_config = {
 	.dot11MeshHWMPconfirmationInterval = MESH_ROOT_CONFIRMATION_INTERVAL,
 	.power_mode = NL80211_MESH_POWER_ACTIVE,
 	.dot11MeshAwakeWindowDuration = MESH_DEFAULT_AWAKE_WINDOW,
+	.plink_timeout = MESH_DEFAULT_PLINK_TIMEOUT,
 };
 
 const struct mesh_setup default_mesh_setup = {
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 88e820b73674..8aa83c04d4eb 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4575,7 +4575,9 @@ static int nl80211_get_mesh_config(struct sk_buff *skb,
 	    nla_put_u32(msg, NL80211_MESHCONF_POWER_MODE,
 			cur_params.power_mode) ||
 	    nla_put_u16(msg, NL80211_MESHCONF_AWAKE_WINDOW,
-			cur_params.dot11MeshAwakeWindowDuration))
+			cur_params.dot11MeshAwakeWindowDuration) ||
+	    nla_put_u32(msg, NL80211_MESHCONF_PLINK_TIMEOUT,
+			cur_params.plink_timeout))
 		goto nla_put_failure;
 	nla_nest_end(msg, pinfoattr);
 	genlmsg_end(msg, hdr);
@@ -4616,6 +4618,7 @@ static const struct nla_policy nl80211_meshconf_params_policy[NL80211_MESHCONF_A
 	[NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL] = { .type = NLA_U16 },
 	[NL80211_MESHCONF_POWER_MODE] = { .type = NLA_U32 },
 	[NL80211_MESHCONF_AWAKE_WINDOW] = { .type = NLA_U16 },
+	[NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 },
 };
 
 static const struct nla_policy
@@ -4753,6 +4756,9 @@ do {									    \
 	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration,
 				  0, 65535, mask,
 				  NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, 1, 0xffffffff,
+				  mask, NL80211_MESHCONF_PLINK_TIMEOUT,
+				  nla_get_u32);
 	if (mask_out)
 		*mask_out = mask;
 
-- 
cgit v1.2.3


From 0d854a60b1d7d39a37b25dd28f63cfa0df637b91 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 7 Feb 2013 10:46:46 +0000
Subject: arm64: KVM: enable initialization of a 32bit vcpu

Wire the init of a 32bit vcpu by allowing 32bit modes in pstate,
and providing sensible defaults out of reset state.

This feature is of course conditioned by the presence of 32bit
capability on the physical CPU, and is checked by the KVM_CAP_ARM_EL1_32BIT
capability.

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 arch/arm64/include/asm/kvm_host.h |  2 +-
 arch/arm64/include/uapi/asm/kvm.h |  1 +
 arch/arm64/kvm/guest.c            |  6 ++++++
 arch/arm64/kvm/reset.c            | 26 +++++++++++++++++++++++++-
 include/uapi/linux/kvm.h          |  1 +
 5 files changed, 34 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 3f5830b3ca3f..644d73956864 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -34,7 +34,7 @@
 #include <kvm/arm_vgic.h>
 #include <kvm/arm_arch_timer.h>
 
-#define KVM_VCPU_MAX_FEATURES 1
+#define KVM_VCPU_MAX_FEATURES 2
 
 /* We don't currently support large pages. */
 #define KVM_HPAGE_GFN_SHIFT(x)	0
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 5b1110c49df5..5031f4263937 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -75,6 +75,7 @@ struct kvm_regs {
 #define KVM_VGIC_V2_CPU_SIZE		0x2000
 
 #define KVM_ARM_VCPU_POWER_OFF		0 /* CPU is started in OFF state */
+#define KVM_ARM_VCPU_EL1_32BIT		1 /* CPU running a 32bit VM */
 
 struct kvm_vcpu_init {
 	__u32 target;
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 3d7518a7ebaa..2c3ff67a8ecb 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -99,6 +99,12 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 	if (off == KVM_REG_ARM_CORE_REG(regs.pstate)) {
 		u32 mode = (*(u32 *)valp) & COMPAT_PSR_MODE_MASK;
 		switch (mode) {
+		case COMPAT_PSR_MODE_USR:
+		case COMPAT_PSR_MODE_FIQ:
+		case COMPAT_PSR_MODE_IRQ:
+		case COMPAT_PSR_MODE_SVC:
+		case COMPAT_PSR_MODE_ABT:
+		case COMPAT_PSR_MODE_UND:
 		case PSR_MODE_EL0t:
 		case PSR_MODE_EL1t:
 		case PSR_MODE_EL1h:
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 766150ac76ed..70a7816535cd 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -38,16 +38,32 @@ static const struct kvm_regs default_regs_reset = {
 			PSR_F_BIT | PSR_D_BIT),
 };
 
+static const struct kvm_regs default_regs_reset32 = {
+	.regs.pstate = (COMPAT_PSR_MODE_SVC | COMPAT_PSR_A_BIT |
+			COMPAT_PSR_I_BIT | COMPAT_PSR_F_BIT),
+};
+
 static const struct kvm_irq_level default_vtimer_irq = {
 	.irq	= 27,
 	.level	= 1,
 };
 
+static bool cpu_has_32bit_el1(void)
+{
+	u64 pfr0;
+
+	pfr0 = read_cpuid(ID_AA64PFR0_EL1);
+	return !!(pfr0 & 0x20);
+}
+
 int kvm_arch_dev_ioctl_check_extension(long ext)
 {
 	int r;
 
 	switch (ext) {
+	case KVM_CAP_ARM_EL1_32BIT:
+		r = cpu_has_32bit_el1();
+		break;
 	default:
 		r = 0;
 	}
@@ -70,7 +86,15 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 
 	switch (vcpu->arch.target) {
 	default:
-		cpu_reset = &default_regs_reset;
+		if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) {
+			if (!cpu_has_32bit_el1())
+				return -EINVAL;
+			cpu_reset = &default_regs_reset32;
+			vcpu->arch.hcr_el2 &= ~HCR_RW;
+		} else {
+			cpu_reset = &default_regs_reset;
+		}
+
 		cpu_vtimer_irq = &default_vtimer_irq;
 		break;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2d1bcb891468..aac27640bec2 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -666,6 +666,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_IRQ_MPIC 90
 #define KVM_CAP_PPC_RTAS 91
 #define KVM_CAP_IRQ_XICS 92
+#define KVM_CAP_ARM_EL1_32BIT 93
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3


From 274038f8c94c493e2977983e2aecb5f5f0778479 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Tue, 11 Jun 2013 14:41:24 +0400
Subject: tun: Report "persist" flag to userspace

The TUN_PERSIST flag is not reported at all -- both TUNGETIFF, and sysfs
"flags" attribute skip one. Knowing whether a device is persistent or not
is critical for checkpoint-restore, thus I propose to add the read-only
IFF_PERSIST one for this.

Setting this new IFF_PERSIST is hardly possible, as TUNSETIFF doesn't check
for unknown flags being zero and thus there can be trash.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c           | 3 +++
 include/uapi/linux/if_tun.h | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 89776c592151..b90a73165d30 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1530,6 +1530,9 @@ static int tun_flags(struct tun_struct *tun)
 	if (tun->flags & TUN_TAP_MQ)
 		flags |= IFF_MULTI_QUEUE;
 
+	if (tun->flags & TUN_PERSIST)
+		flags |= IFF_PERSIST;
+
 	return flags;
 }
 
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 2835b85fd46d..82334f88967e 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -68,6 +68,8 @@
 #define IFF_MULTI_QUEUE 0x0100
 #define IFF_ATTACH_QUEUE 0x0200
 #define IFF_DETACH_QUEUE 0x0400
+/* read-only flag */
+#define IFF_PERSIST	0x0800
 
 /* Features for GSO (TUNSETOFFLOAD). */
 #define TUN_F_CSUM	0x01	/* You can hand me unchecksummed packets. */
-- 
cgit v1.2.3


From 9674da8759df0d6c0d24e1ede6e2a1acdef91e3c Mon Sep 17 00:00:00 2001
From: Eric Lapuyade <eric.lapuyade@linux.intel.com>
Date: Mon, 29 Apr 2013 17:13:27 +0200
Subject: NFC: Add firmware upload netlink command

As several NFC chipsets can have their firmwares upgraded and
reflashed, this patchset adds a new netlink command to trigger
that the driver loads or flashes a new firmware. This will allows
userspace triggered firmware upgrade through netlink.
The firmware name or hint is passed as a parameter, and the driver
will eventually fetch the firmware binary through the request_firmware
API.
The cmd can only be executed when the nfc dev is not in use. Actual
firmware loading/flashing is an asynchronous operation. Result of the
operation shall send a new event up to user space through the nfc dev
multicast socket. During operation, the nfc dev is not openable and
thus not usable.

Signed-off-by: Eric Lapuyade <eric.lapuyade@intel.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/net/nfc/nfc.h    |  2 ++
 include/uapi/linux/nfc.h |  6 +++++
 net/nfc/core.c           | 46 +++++++++++++++++++++++++++++++++++
 net/nfc/netlink.c        | 63 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/nfc/nfc.h            |  5 ++++
 5 files changed, 122 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h
index 5eb80bb3cbb2..3563dbdcaaf2 100644
--- a/include/net/nfc/nfc.h
+++ b/include/net/nfc/nfc.h
@@ -70,6 +70,7 @@ struct nfc_ops {
 	int (*check_presence)(struct nfc_dev *dev, struct nfc_target *target);
 	int (*enable_se)(struct nfc_dev *dev, u32 secure_element);
 	int (*disable_se)(struct nfc_dev *dev, u32 secure_element);
+	int (*fw_upload)(struct nfc_dev *dev, const char *firmware_name);
 };
 
 #define NFC_TARGET_IDX_ANY -1
@@ -104,6 +105,7 @@ struct nfc_dev {
 	int targets_generation;
 	struct device dev;
 	bool dev_up;
+	bool fw_upload_in_progress;
 	u8 rf_mode;
 	bool polling;
 	struct nfc_target *active_target;
diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h
index 7c6f627a717d..b6cbd164f146 100644
--- a/include/uapi/linux/nfc.h
+++ b/include/uapi/linux/nfc.h
@@ -69,6 +69,8 @@
  *	starting a poll from a device which has a secure element enabled means
  *	we want to do SE based card emulation.
  * @NFC_CMD_DISABLE_SE: Disable the physical link to a specific secure element.
+ * @NFC_CMD_FW_UPLOAD: Request to Load/flash firmware, or event to inform that
+ *	some firmware was loaded
  */
 enum nfc_commands {
 	NFC_CMD_UNSPEC,
@@ -92,6 +94,7 @@ enum nfc_commands {
 	NFC_CMD_DISABLE_SE,
 	NFC_CMD_LLC_SDREQ,
 	NFC_EVENT_LLC_SDRES,
+	NFC_CMD_FW_UPLOAD,
 /* private: internal use only */
 	__NFC_CMD_AFTER_LAST
 };
@@ -121,6 +124,7 @@ enum nfc_commands {
  * @NFC_ATTR_LLC_PARAM_RW: Receive Window size parameter
  * @NFC_ATTR_LLC_PARAM_MIUX: MIU eXtension parameter
  * @NFC_ATTR_SE: Available Secure Elements
+ * @NFC_ATTR_FIRMWARE_NAME: Free format firmware version
  */
 enum nfc_attrs {
 	NFC_ATTR_UNSPEC,
@@ -143,6 +147,7 @@ enum nfc_attrs {
 	NFC_ATTR_LLC_PARAM_MIUX,
 	NFC_ATTR_SE,
 	NFC_ATTR_LLC_SDP,
+	NFC_ATTR_FIRMWARE_NAME,
 /* private: internal use only */
 	__NFC_ATTR_AFTER_LAST
 };
@@ -162,6 +167,7 @@ enum nfc_sdp_attr {
 #define NFC_SENSB_RES_MAXSIZE 12
 #define NFC_SENSF_RES_MAXSIZE 18
 #define NFC_GB_MAXSIZE        48
+#define NFC_FIRMWARE_NAME_MAXSIZE 32
 
 /* NFC protocols */
 #define NFC_PROTO_JEWEL		1
diff --git a/net/nfc/core.c b/net/nfc/core.c
index 40d2527693da..eb3cecf1764e 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -44,6 +44,47 @@ DEFINE_MUTEX(nfc_devlist_mutex);
 /* NFC device ID bitmap */
 static DEFINE_IDA(nfc_index_ida);
 
+int nfc_fw_upload(struct nfc_dev *dev, const char *firmware_name)
+{
+	int rc = 0;
+
+	pr_debug("%s do firmware %s\n", dev_name(&dev->dev), firmware_name);
+
+	device_lock(&dev->dev);
+
+	if (!device_is_registered(&dev->dev)) {
+		rc = -ENODEV;
+		goto error;
+	}
+
+	if (dev->dev_up) {
+		rc = -EBUSY;
+		goto error;
+	}
+
+	if (!dev->ops->fw_upload) {
+		rc = -EOPNOTSUPP;
+		goto error;
+	}
+
+	dev->fw_upload_in_progress = true;
+	rc = dev->ops->fw_upload(dev, firmware_name);
+	if (rc)
+		dev->fw_upload_in_progress = false;
+
+error:
+	device_unlock(&dev->dev);
+	return rc;
+}
+
+int nfc_fw_upload_done(struct nfc_dev *dev, const char *firmware_name)
+{
+	dev->fw_upload_in_progress = false;
+
+	return nfc_genl_fw_upload_done(dev, firmware_name);
+}
+EXPORT_SYMBOL(nfc_fw_upload_done);
+
 /**
  * nfc_dev_up - turn on the NFC device
  *
@@ -69,6 +110,11 @@ int nfc_dev_up(struct nfc_dev *dev)
 		goto error;
 	}
 
+	if (dev->fw_upload_in_progress) {
+		rc = -EBUSY;
+		goto error;
+	}
+
 	if (dev->dev_up) {
 		rc = -EALREADY;
 		goto error;
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index f0c4d61f37c0..1deadad9a285 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -56,6 +56,8 @@ static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = {
 	[NFC_ATTR_LLC_PARAM_RW] = { .type = NLA_U8 },
 	[NFC_ATTR_LLC_PARAM_MIUX] = { .type = NLA_U16 },
 	[NFC_ATTR_LLC_SDP] = { .type = NLA_NESTED },
+	[NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING,
+				     .len = NFC_FIRMWARE_NAME_MAXSIZE },
 };
 
 static const struct nla_policy nfc_sdp_genl_policy[NFC_SDP_ATTR_MAX + 1] = {
@@ -1025,6 +1027,62 @@ exit:
 	return rc;
 }
 
+static int nfc_genl_fw_upload(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nfc_dev *dev;
+	int rc;
+	u32 idx;
+	char firmware_name[NFC_FIRMWARE_NAME_MAXSIZE + 1];
+
+	if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
+		return -EINVAL;
+
+	idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+
+	dev = nfc_get_device(idx);
+	if (!dev)
+		return -ENODEV;
+
+	nla_strlcpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME],
+		    sizeof(firmware_name));
+
+	rc = nfc_fw_upload(dev, firmware_name);
+
+	nfc_put_device(dev);
+	return rc;
+}
+
+int nfc_genl_fw_upload_done(struct nfc_dev *dev, const char *firmware_name)
+{
+	struct sk_buff *msg;
+	void *hdr;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+			  NFC_CMD_FW_UPLOAD);
+	if (!hdr)
+		goto free_msg;
+
+	if (nla_put_string(msg, NFC_ATTR_FIRMWARE_NAME, firmware_name) ||
+	    nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+
+	genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_KERNEL);
+
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+free_msg:
+	nlmsg_free(msg);
+	return -EMSGSIZE;
+}
+
 static struct genl_ops nfc_genl_ops[] = {
 	{
 		.cmd = NFC_CMD_GET_DEVICE,
@@ -1084,6 +1142,11 @@ static struct genl_ops nfc_genl_ops[] = {
 		.doit = nfc_genl_llc_sdreq,
 		.policy = nfc_genl_policy,
 	},
+	{
+		.cmd = NFC_CMD_FW_UPLOAD,
+		.doit = nfc_genl_fw_upload,
+		.policy = nfc_genl_policy,
+	},
 };
 
 
diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h
index afa1f84ba040..cf0c48165996 100644
--- a/net/nfc/nfc.h
+++ b/net/nfc/nfc.h
@@ -120,6 +120,11 @@ static inline void nfc_device_iter_exit(struct class_dev_iter *iter)
 	class_dev_iter_exit(iter);
 }
 
+int nfc_fw_upload(struct nfc_dev *dev, const char *firmware_name);
+int nfc_genl_fw_upload_done(struct nfc_dev *dev, const char *firmware_name);
+
+int nfc_fw_upload_done(struct nfc_dev *dev, const char *firmware_name);
+
 int nfc_dev_up(struct nfc_dev *dev);
 
 int nfc_dev_down(struct nfc_dev *dev);
-- 
cgit v1.2.3


From 1d8faf48c74b8329a0322dc4b2a2030ae5003c86 Mon Sep 17 00:00:00 2001
From: Rony Efraim <ronye@mellanox.com>
Date: Thu, 13 Jun 2013 13:19:10 +0300
Subject: net/core: Add VF link state control

Add netlink directives and ndo entry to allow for controling
VF link, which can be in one of three states:

Auto - VF link state reflects the PF link state (default)

Up - VF link state is up, traffic from VF to VF works even if
the actual PF link is down

Down - VF link state is down, no traffic from/to this VF, can be of
use while configuring the VF

Signed-off-by: Rony Efraim <ronye@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_link.h      |  1 +
 include/linux/netdevice.h    |  3 +++
 include/uapi/linux/if_link.h | 13 +++++++++++++
 net/core/rtnetlink.c         | 22 ++++++++++++++++++++--
 4 files changed, 37 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index c3f817c3eb45..a86784dec3d3 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -12,5 +12,6 @@ struct ifla_vf_info {
 	__u32 qos;
 	__u32 tx_rate;
 	__u32 spoofchk;
+	__u32 linkstate;
 };
 #endif /* _LINUX_IF_LINK_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8c9fcc42502a..09b4188c1ea7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -829,6 +829,7 @@ struct netdev_fcoe_hbainfo {
  * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting);
  * int (*ndo_get_vf_config)(struct net_device *dev,
  *			    int vf, struct ifla_vf_info *ivf);
+ * int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state);
  * int (*ndo_set_vf_port)(struct net_device *dev, int vf,
  *			  struct nlattr *port[]);
  * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
@@ -986,6 +987,8 @@ struct net_device_ops {
 	int			(*ndo_get_vf_config)(struct net_device *dev,
 						     int vf,
 						     struct ifla_vf_info *ivf);
+	int			(*ndo_set_vf_link_state)(struct net_device *dev,
+							 int vf, int link_state);
 	int			(*ndo_set_vf_port)(struct net_device *dev,
 						   int vf,
 						   struct nlattr *port[]);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index da05a2698cb5..03f6170ab337 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -338,6 +338,7 @@ enum {
 	IFLA_VF_VLAN,
 	IFLA_VF_TX_RATE,	/* TX Bandwidth Allocation */
 	IFLA_VF_SPOOFCHK,	/* Spoof Checking on/off switch */
+	IFLA_VF_LINK_STATE,	/* link state enable/disable/auto switch */
 	__IFLA_VF_MAX,
 };
 
@@ -364,6 +365,18 @@ struct ifla_vf_spoofchk {
 	__u32 setting;
 };
 
+enum {
+	IFLA_VF_LINK_STATE_AUTO,	/* link state of the uplink */
+	IFLA_VF_LINK_STATE_ENABLE,	/* link always up */
+	IFLA_VF_LINK_STATE_DISABLE,	/* link always down */
+	__IFLA_VF_LINK_STATE_MAX,
+};
+
+struct ifla_vf_link_state {
+	__u32 vf;
+	__u32 link_state;
+};
+
 /* VF ports management section
  *
  *	Nested layout of set/get msg is:
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 49c14451d8ab..9007533867f0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -947,6 +947,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			struct ifla_vf_vlan vf_vlan;
 			struct ifla_vf_tx_rate vf_tx_rate;
 			struct ifla_vf_spoofchk vf_spoofchk;
+			struct ifla_vf_link_state vf_linkstate;
 
 			/*
 			 * Not all SR-IOV capable drivers support the
@@ -956,18 +957,24 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			 */
 			ivi.spoofchk = -1;
 			memset(ivi.mac, 0, sizeof(ivi.mac));
+			/* The default value for VF link state is "auto"
+			 * IFLA_VF_LINK_STATE_AUTO which equals zero
+			 */
+			ivi.linkstate = 0;
 			if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi))
 				break;
 			vf_mac.vf =
 				vf_vlan.vf =
 				vf_tx_rate.vf =
-				vf_spoofchk.vf = ivi.vf;
+				vf_spoofchk.vf =
+				vf_linkstate.vf = ivi.vf;
 
 			memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
 			vf_vlan.vlan = ivi.vlan;
 			vf_vlan.qos = ivi.qos;
 			vf_tx_rate.rate = ivi.tx_rate;
 			vf_spoofchk.setting = ivi.spoofchk;
+			vf_linkstate.link_state = ivi.linkstate;
 			vf = nla_nest_start(skb, IFLA_VF_INFO);
 			if (!vf) {
 				nla_nest_cancel(skb, vfinfo);
@@ -978,7 +985,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			    nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate),
 				    &vf_tx_rate) ||
 			    nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk),
-				    &vf_spoofchk))
+				    &vf_spoofchk) ||
+			    nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate),
+				    &vf_linkstate))
 				goto nla_put_failure;
 			nla_nest_end(skb, vf);
 		}
@@ -1238,6 +1247,15 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
 							       ivs->setting);
 			break;
 		}
+		case IFLA_VF_LINK_STATE: {
+			struct ifla_vf_link_state *ivl;
+			ivl = nla_data(vf);
+			err = -EOPNOTSUPP;
+			if (ops->ndo_set_vf_link_state)
+				err = ops->ndo_set_vf_link_state(dev, ivl->vf,
+								 ivl->link_state);
+			break;
+		}
 		default:
 			err = -EINVAL;
 			break;
-- 
cgit v1.2.3


From 322bce957e9b0e30ef7147dae0414ad8f3f558c8 Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <sameo@linux.intel.com>
Date: Mon, 27 May 2013 15:29:11 +0200
Subject: NFC: pn533: Copy NFCID2 through ATR_REQ

When using NFC-F we should copy the NFCID2 buffer that we got from
SENSF_RES through the ATR_REQ NFCID3 buffer. Not doing so violates
NFC Forum digital requirement #189.

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/nfc/pn533.c      | 14 +++++++++++++-
 include/net/nfc/nfc.h    |  2 ++
 include/uapi/linux/nfc.h |  2 ++
 3 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/nfc/pn533.c b/drivers/nfc/pn533.c
index 6bd4f598b3e1..e196bdfcfc30 100644
--- a/drivers/nfc/pn533.c
+++ b/drivers/nfc/pn533.c
@@ -1235,7 +1235,7 @@ static int pn533_target_found_type_a(struct nfc_target *nfc_tgt, u8 *tgt_data,
 struct pn533_target_felica {
 	u8 pol_res;
 	u8 opcode;
-	u8 nfcid2[8];
+	u8 nfcid2[NFC_NFCID2_MAXSIZE];
 	u8 pad[8];
 	/* optional */
 	u8 syst_code[];
@@ -1275,6 +1275,9 @@ static int pn533_target_found_felica(struct nfc_target *nfc_tgt, u8 *tgt_data,
 	memcpy(nfc_tgt->sensf_res, &tgt_felica->opcode, 9);
 	nfc_tgt->sensf_res_len = 9;
 
+	memcpy(nfc_tgt->nfcid2, tgt_felica->nfcid2, NFC_NFCID2_MAXSIZE);
+	nfc_tgt->nfcid2_len = NFC_NFCID2_MAXSIZE;
+
 	return 0;
 }
 
@@ -2084,6 +2087,9 @@ static int pn533_dep_link_up(struct nfc_dev *nfc_dev, struct nfc_target *target,
 	if (comm_mode == NFC_COMM_PASSIVE)
 		skb_len += PASSIVE_DATA_LEN;
 
+	if (target && target->nfcid2_len)
+		skb_len += NFC_NFCID3_MAXSIZE;
+
 	skb = pn533_alloc_skb(dev, skb_len);
 	if (!skb)
 		return -ENOMEM;
@@ -2100,6 +2106,12 @@ static int pn533_dep_link_up(struct nfc_dev *nfc_dev, struct nfc_target *target,
 		*next |= 1;
 	}
 
+	if (target && target->nfcid2_len) {
+		memcpy(skb_put(skb, NFC_NFCID3_MAXSIZE), target->nfcid2,
+		       target->nfcid2_len);
+		*next |= 2;
+	}
+
 	if (gb != NULL && gb_len > 0) {
 		memcpy(skb_put(skb, gb_len), gb, gb_len);
 		*next |= 4; /* We have some Gi */
diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h
index 3563dbdcaaf2..8fc1784a264d 100644
--- a/include/net/nfc/nfc.h
+++ b/include/net/nfc/nfc.h
@@ -84,6 +84,8 @@ struct nfc_target {
 	u8 sel_res;
 	u8 nfcid1_len;
 	u8 nfcid1[NFC_NFCID1_MAXSIZE];
+	u8 nfcid2_len;
+	u8 nfcid2[NFC_NFCID2_MAXSIZE];
 	u8 sensb_res_len;
 	u8 sensb_res[NFC_SENSB_RES_MAXSIZE];
 	u8 sensf_res_len;
diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h
index b6cbd164f146..fb304fb774cc 100644
--- a/include/uapi/linux/nfc.h
+++ b/include/uapi/linux/nfc.h
@@ -164,6 +164,8 @@ enum nfc_sdp_attr {
 
 #define NFC_DEVICE_NAME_MAXSIZE 8
 #define NFC_NFCID1_MAXSIZE 10
+#define NFC_NFCID2_MAXSIZE 8
+#define NFC_NFCID3_MAXSIZE 10
 #define NFC_SENSB_RES_MAXSIZE 12
 #define NFC_SENSF_RES_MAXSIZE 18
 #define NFC_GB_MAXSIZE        48
-- 
cgit v1.2.3


From fed7c25ec0d4894edfc36bbe5c5231e52f45483a Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <sameo@linux.intel.com>
Date: Fri, 10 May 2013 15:28:38 +0200
Subject: NFC: Add secure elements addition and removal API

This API will allow NFC drivers to add and remove the secure elements
they know about or detect. Typically this should be called (asynchronously
or not) from the driver or the host interface stack detect_se hook.

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/net/nfc/nfc.h    | 22 +++++++++++++++++++++-
 include/uapi/linux/nfc.h |  4 +++-
 net/nfc/core.c           | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 68 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h
index 5187ec70b66a..0e353f1658bb 100644
--- a/include/net/nfc/nfc.h
+++ b/include/net/nfc/nfc.h
@@ -97,6 +97,23 @@ struct nfc_target {
 	u8 logical_idx;
 };
 
+/**
+ * nfc_se - A structure for NFC accessible secure elements.
+ *
+ * @idx: The secure element index. User space will enable or
+ *       disable a secure element by its index.
+ * @type: The secure element type. It can be SE_UICC or
+ *        SE_EMBEDDED.
+ * @state: The secure element state, either enabled or disabled.
+ *
+ */
+struct nfc_se {
+	struct list_head list;
+	u32 idx;
+	u16 type;
+	u16 state;
+};
+
 struct nfc_genl_data {
 	u32 poll_req_portid;
 	struct mutex genl_data_mutex;
@@ -118,7 +135,7 @@ struct nfc_dev {
 	struct nfc_genl_data genl_data;
 	u32 supported_protocols;
 
-	u32 active_se;
+	struct list_head secure_elements;
 
 	int tx_headroom;
 	int tx_tailroom;
@@ -221,4 +238,7 @@ int nfc_tm_data_received(struct nfc_dev *dev, struct sk_buff *skb);
 
 void nfc_driver_failure(struct nfc_dev *dev, int err);
 
+int nfc_add_se(struct nfc_dev *dev, u32 se_idx, u16 type);
+int nfc_remove_se(struct nfc_dev *dev, u32 se_idx);
+
 #endif /* __NET_NFC_H */
diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h
index fb304fb774cc..3a57cef0b986 100644
--- a/include/uapi/linux/nfc.h
+++ b/include/uapi/linux/nfc.h
@@ -199,10 +199,12 @@ enum nfc_sdp_attr {
 #define NFC_PROTO_ISO14443_B_MASK (1 << NFC_PROTO_ISO14443_B)
 
 /* NFC Secure Elements */
-#define NFC_SE_NONE     0x0
 #define NFC_SE_UICC     0x1
 #define NFC_SE_EMBEDDED 0x2
 
+#define NFC_SE_DISABLED 0x0
+#define NFC_SE_ENABLED  0x1
+
 struct sockaddr_nfc {
 	sa_family_t sa_family;
 	__u32 dev_idx;
diff --git a/net/nfc/core.c b/net/nfc/core.c
index a43a56d7f4be..dacadfbcacea 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -760,6 +760,49 @@ inline void nfc_driver_failure(struct nfc_dev *dev, int err)
 }
 EXPORT_SYMBOL(nfc_driver_failure);
 
+int nfc_add_se(struct nfc_dev *dev, u32 se_idx, u16 type)
+{
+	struct nfc_se *se, *n;
+
+	pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx);
+
+	list_for_each_entry_safe(se, n, &dev->secure_elements, list)
+		if (se->idx == se_idx)
+			return -EALREADY;
+
+	se = kzalloc(sizeof(struct nfc_se), GFP_KERNEL);
+	if (!se)
+		return -ENOMEM;
+
+	se->idx = se_idx;
+	se->type = type;
+	se->state = NFC_SE_DISABLED;
+	INIT_LIST_HEAD(&se->list);
+
+	list_add(&se->list, &dev->secure_elements);
+
+	return 0;
+}
+EXPORT_SYMBOL(nfc_add_se);
+
+int nfc_remove_se(struct nfc_dev *dev, u32 se_idx)
+{
+	struct nfc_se *se, *n;
+
+	pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx);
+
+	list_for_each_entry_safe(se, n, &dev->secure_elements, list)
+		if (se->idx == se_idx) {
+			list_del(&se->list);
+			kfree(se);
+
+			return 0;
+		}
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(nfc_remove_se);
+
 static void nfc_release(struct device *d)
 {
 	struct nfc_dev *dev = to_nfc_dev(d);
@@ -856,9 +899,9 @@ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops,
 
 	dev->ops = ops;
 	dev->supported_protocols = supported_protocols;
-	dev->active_se = NFC_SE_NONE;
 	dev->tx_headroom = tx_headroom;
 	dev->tx_tailroom = tx_tailroom;
+	INIT_LIST_HEAD(&dev->secure_elements);
 
 	nfc_genl_data_init(&dev->genl_data);
 
-- 
cgit v1.2.3


From 2757c3723c3d2b13e3a8bfaa034826f64e9cca43 Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <sameo@linux.intel.com>
Date: Fri, 10 May 2013 15:47:37 +0200
Subject: NFC: Send netlink events for secure elements additions and removals

When an NFC driver or host controller stack discovers a secure element,
it will call nfc_add_se(). In order for userspace applications to use
these secure elements, a netlink event will then be sent with the SE
index and its type. With that information userspace applications can
decide wether or not to enable SEs, through their indexes.

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/uapi/linux/nfc.h |  6 +++++
 net/nfc/core.c           | 14 +++++++++++
 net/nfc/netlink.c        | 63 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/nfc/nfc.h            |  3 +++
 4 files changed, 86 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h
index 3a57cef0b986..caed0f324d5f 100644
--- a/include/uapi/linux/nfc.h
+++ b/include/uapi/linux/nfc.h
@@ -95,6 +95,8 @@ enum nfc_commands {
 	NFC_CMD_LLC_SDREQ,
 	NFC_EVENT_LLC_SDRES,
 	NFC_CMD_FW_UPLOAD,
+	NFC_EVENT_SE_ADDED,
+	NFC_EVENT_SE_REMOVED,
 /* private: internal use only */
 	__NFC_CMD_AFTER_LAST
 };
@@ -125,6 +127,8 @@ enum nfc_commands {
  * @NFC_ATTR_LLC_PARAM_MIUX: MIU eXtension parameter
  * @NFC_ATTR_SE: Available Secure Elements
  * @NFC_ATTR_FIRMWARE_NAME: Free format firmware version
+ * @NFC_ATTR_SE_INDEX: Secure element index
+ * @NFC_ATTR_SE_TYPE: Secure element type (UICC or EMBEDDED)
  */
 enum nfc_attrs {
 	NFC_ATTR_UNSPEC,
@@ -148,6 +152,8 @@ enum nfc_attrs {
 	NFC_ATTR_SE,
 	NFC_ATTR_LLC_SDP,
 	NFC_ATTR_FIRMWARE_NAME,
+	NFC_ATTR_SE_INDEX,
+	NFC_ATTR_SE_TYPE,
 /* private: internal use only */
 	__NFC_ATTR_AFTER_LAST
 };
diff --git a/net/nfc/core.c b/net/nfc/core.c
index dacadfbcacea..bb5f16cfc201 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -763,6 +763,7 @@ EXPORT_SYMBOL(nfc_driver_failure);
 int nfc_add_se(struct nfc_dev *dev, u32 se_idx, u16 type)
 {
 	struct nfc_se *se, *n;
+	int rc;
 
 	pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx);
 
@@ -781,6 +782,14 @@ int nfc_add_se(struct nfc_dev *dev, u32 se_idx, u16 type)
 
 	list_add(&se->list, &dev->secure_elements);
 
+	rc = nfc_genl_se_added(dev, se_idx, type);
+	if (rc < 0) {
+		list_del(&se->list);
+		kfree(se);
+
+		return rc;
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL(nfc_add_se);
@@ -788,11 +797,16 @@ EXPORT_SYMBOL(nfc_add_se);
 int nfc_remove_se(struct nfc_dev *dev, u32 se_idx)
 {
 	struct nfc_se *se, *n;
+	int rc;
 
 	pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx);
 
 	list_for_each_entry_safe(se, n, &dev->secure_elements, list)
 		if (se->idx == se_idx) {
+			rc = nfc_genl_se_removed(dev, se_idx);
+			if (rc < 0)
+				return rc;
+
 			list_del(&se->list);
 			kfree(se);
 
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index fdbc662c564a..8a11a3a27e69 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -426,6 +426,69 @@ free_msg:
 	return rc;
 }
 
+int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type)
+{
+	struct sk_buff *msg;
+	void *hdr;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+			  NFC_EVENT_SE_ADDED);
+	if (!hdr)
+		goto free_msg;
+
+	if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) ||
+	    nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) ||
+	    nla_put_u8(msg, NFC_ATTR_SE_TYPE, type))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+
+	genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_KERNEL);
+
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+free_msg:
+	nlmsg_free(msg);
+	return -EMSGSIZE;
+}
+
+int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx)
+{
+	struct sk_buff *msg;
+	void *hdr;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+			  NFC_EVENT_SE_REMOVED);
+	if (!hdr)
+		goto free_msg;
+
+	if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) ||
+	    nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+
+	genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_KERNEL);
+
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+free_msg:
+	nlmsg_free(msg);
+	return -EMSGSIZE;
+}
+
 static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev,
 				u32 portid, u32 seq,
 				struct netlink_callback *cb,
diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h
index cf0c48165996..a6aeee094aa4 100644
--- a/net/nfc/nfc.h
+++ b/net/nfc/nfc.h
@@ -94,6 +94,9 @@ int nfc_genl_tm_deactivated(struct nfc_dev *dev);
 
 int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list);
 
+int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type);
+int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx);
+
 struct nfc_dev *nfc_get_device(unsigned int idx);
 
 static inline void nfc_put_device(struct nfc_dev *dev)
-- 
cgit v1.2.3


From 57254b6ebce4ceca02d9c8b615f6059c56c19238 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Mon, 6 May 2013 19:14:17 +0000
Subject: Btrfs: add ioctl to wait for qgroup rescan completion

btrfs_qgroup_wait_for_completion waits until the currently running qgroup
operation completes. It returns immediately when no rescan process is in
progress. This is useful to automate things around the rescan process (e.g.
testing).

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/ctree.h           |  2 ++
 fs/btrfs/ioctl.c           | 12 ++++++++++++
 fs/btrfs/qgroup.c          | 21 +++++++++++++++++++++
 include/uapi/linux/btrfs.h |  1 +
 4 files changed, 36 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a365400e38da..e36e97b473a8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1613,6 +1613,7 @@ struct btrfs_fs_info {
 	struct mutex qgroup_rescan_lock; /* protects the progress item */
 	struct btrfs_key qgroup_rescan_progress;
 	struct btrfs_workers qgroup_rescan_workers;
+	struct completion qgroup_rescan_completion;
 
 	/* filesystem state */
 	unsigned long fs_state;
@@ -3820,6 +3821,7 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
 int btrfs_quota_disable(struct btrfs_trans_handle *trans,
 			struct btrfs_fs_info *fs_info);
 int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
 			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0f81d67cdc8d..1e0dda1feefe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3937,6 +3937,16 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
 	return ret;
 }
 
+static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return btrfs_qgroup_wait_for_completion(root->fs_info);
+}
+
 static long btrfs_ioctl_set_received_subvol(struct file *file,
 					    void __user *arg)
 {
@@ -4179,6 +4189,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_quota_rescan(file, argp);
 	case BTRFS_IOC_QUOTA_RESCAN_STATUS:
 		return btrfs_ioctl_quota_rescan_status(file, argp);
+	case BTRFS_IOC_QUOTA_RESCAN_WAIT:
+		return btrfs_ioctl_quota_rescan_wait(file, argp);
 	case BTRFS_IOC_DEV_REPLACE:
 		return btrfs_ioctl_dev_replace(root, argp);
 	case BTRFS_IOC_GET_FSLABEL:
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 7f38cce2528d..d059d86c3131 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2070,6 +2070,8 @@ out:
 	} else {
 		pr_err("btrfs: qgroup scan failed with %d\n", err);
 	}
+
+	complete_all(&fs_info->qgroup_rescan_completion);
 }
 
 static void
@@ -2110,6 +2112,7 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
 	fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
 	memset(&fs_info->qgroup_rescan_progress, 0,
 		sizeof(fs_info->qgroup_rescan_progress));
+	init_completion(&fs_info->qgroup_rescan_completion);
 
 	/* clear all current qgroup tracking information */
 	for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
@@ -2126,3 +2129,21 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
 
 	return 0;
 }
+
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
+{
+	int running;
+	int ret = 0;
+
+	mutex_lock(&fs_info->qgroup_rescan_lock);
+	spin_lock(&fs_info->qgroup_lock);
+	running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+	spin_unlock(&fs_info->qgroup_lock);
+	mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+	if (running)
+		ret = wait_for_completion_interruptible(
+					&fs_info->qgroup_rescan_completion);
+
+	return ret;
+}
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 5ef0df545a2a..5b683b5f63cd 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -530,6 +530,7 @@ struct btrfs_ioctl_send_args {
 			       struct btrfs_ioctl_quota_rescan_args)
 #define BTRFS_IOC_QUOTA_RESCAN_STATUS _IOR(BTRFS_IOCTL_MAGIC, 45, \
 			       struct btrfs_ioctl_quota_rescan_args)
+#define BTRFS_IOC_QUOTA_RESCAN_WAIT _IO(BTRFS_IOCTL_MAGIC, 46)
 #define BTRFS_IOC_GET_FSLABEL _IOR(BTRFS_IOCTL_MAGIC, 49, \
 				   char[BTRFS_LABEL_SIZE])
 #define BTRFS_IOC_SET_FSLABEL _IOW(BTRFS_IOCTL_MAGIC, 50, \
-- 
cgit v1.2.3


From 183860f6a0646b876645ecce0553a7ef2dd71254 Mon Sep 17 00:00:00 2001
From: Anand Jain <Anand.Jain@oracle.com>
Date: Fri, 17 May 2013 10:52:45 +0000
Subject: btrfs: device delete to get errors from the kernel

when user runs command btrfs dev del the raid requisite error if any
goes to the /var/log/messages, its not good idea to clutter messages
with these user (knowledge) errors, further user don't have to review
the system messages to know problem with the cli it should be dropped
to the user as part of the cli return.

to bring this feature created a set of the ERROR defined
BTRFS_ERROR_DEV* error codes and created their error string.

I expect this enum to be added with other error which we might
want to communicate to the user land

v3:
moved the code with in the file no logical change

v1->v2:
introduce error codes for the device mgmt usage

v1:
adds a parameter in the ioctl arg struct to carry the error string

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/ioctl.c           | 22 +++++++++++-----------
 fs/btrfs/volumes.c         | 26 +++++++-------------------
 include/uapi/linux/btrfs.h | 41 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 58 insertions(+), 31 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1f50fe2e62c7..015689a158a4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2360,14 +2360,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 	if (ret)
 		return ret;
 
-	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
-			1)) {
-		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
-		mnt_drop_write_file(file);
-		return -EINVAL;
-	}
-
-	mutex_lock(&root->fs_info->volume_mutex);
 	vol_args = memdup_user(arg, sizeof(*vol_args));
 	if (IS_ERR(vol_args)) {
 		ret = PTR_ERR(vol_args);
@@ -2375,12 +2367,20 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 	}
 
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-	ret = btrfs_rm_device(root, vol_args->name);
 
-	kfree(vol_args);
-out:
+	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+			1)) {
+		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+		goto out;
+	}
+
+	mutex_lock(&root->fs_info->volume_mutex);
+	ret = btrfs_rm_device(root, vol_args->name);
 	mutex_unlock(&root->fs_info->volume_mutex);
 	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+
+out:
+	kfree(vol_args);
 	mnt_drop_write_file(file);
 	return ret;
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 440de708f9eb..b1446c0da154 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1462,31 +1462,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
-		printk(KERN_ERR "btrfs: unable to go below four devices "
-		       "on raid10\n");
-		ret = -EINVAL;
+		ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
 		goto out;
 	}
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
-		printk(KERN_ERR "btrfs: unable to go below two "
-		       "devices on raid1\n");
-		ret = -EINVAL;
+		ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
 		goto out;
 	}
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
 	    root->fs_info->fs_devices->rw_devices <= 2) {
-		printk(KERN_ERR "btrfs: unable to go below two "
-		       "devices on raid5\n");
-		ret = -EINVAL;
+		ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
 		goto out;
 	}
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
 	    root->fs_info->fs_devices->rw_devices <= 3) {
-		printk(KERN_ERR "btrfs: unable to go below three "
-		       "devices on raid6\n");
-		ret = -EINVAL;
+		ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
 		goto out;
 	}
 
@@ -1512,8 +1504,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		bh = NULL;
 		disk_super = NULL;
 		if (!device) {
-			printk(KERN_ERR "btrfs: no missing devices found to "
-			       "remove\n");
+			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
 			goto out;
 		}
 	} else {
@@ -1535,15 +1526,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	}
 
 	if (device->is_tgtdev_for_dev_replace) {
-		pr_err("btrfs: unable to remove the dev_replace target dev\n");
-		ret = -EINVAL;
+		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
 		goto error_brelse;
 	}
 
 	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
-		printk(KERN_ERR "btrfs: unable to remove the only writeable "
-		       "device\n");
-		ret = -EINVAL;
+		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
 		goto error_brelse;
 	}
 
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 5b683b5f63cd..05aed70627e2 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -447,6 +447,46 @@ struct btrfs_ioctl_send_args {
 	__u64 reserved[4];		/* in */
 };
 
+/* Error codes as returned by the kernel */
+enum btrfs_err_code {
+	notused,
+	BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
+	BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
+	BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
+	BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
+	BTRFS_ERROR_DEV_TGT_REPLACE,
+	BTRFS_ERROR_DEV_MISSING_NOT_FOUND,
+	BTRFS_ERROR_DEV_ONLY_WRITABLE,
+	BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS
+};
+/* An error code to error string mapping for the kernel
+*  error codes
+*/
+static inline char *btrfs_err_str(enum btrfs_err_code err_code)
+{
+	switch (err_code) {
+		case BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET:
+			return "unable to go below two devices on raid1";
+		case BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET:
+			return "unable to go below four devices on raid10";
+		case BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET:
+			return "unable to go below two devices on raid5";
+		case BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET:
+			return "unable to go below three devices on raid6";
+		case BTRFS_ERROR_DEV_TGT_REPLACE:
+			return "unable to remove the dev_replace target dev";
+		case BTRFS_ERROR_DEV_MISSING_NOT_FOUND:
+			return "no missing devices found to remove";
+		case BTRFS_ERROR_DEV_ONLY_WRITABLE:
+			return "unable to remove the only writeable device";
+		case BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS:
+			return "add/delete/balance/replace/resize operation "\
+				"in progress";
+		default:
+			return NULL;
+	}
+}
+
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -539,5 +579,4 @@ struct btrfs_ioctl_send_args {
 				      struct btrfs_ioctl_get_dev_stats)
 #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
 				    struct btrfs_ioctl_dev_replace_args)
-
 #endif /* _UAPI_LINUX_BTRFS_H */
-- 
cgit v1.2.3


From 45bfa52e36ef016b789eca73c7847db3f7b4742d Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Wed, 12 Jun 2013 15:57:10 -0700
Subject: openvswitch: Fix struct comment.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 include/uapi/linux/openvswitch.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 405918dd7b3f..424672db7f12 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -192,7 +192,6 @@ enum ovs_vport_type {
  * optional; if not specified a free port number is automatically selected.
  * Whether %OVS_VPORT_ATTR_OPTIONS is required or optional depends on the type
  * of vport.
- * and other attributes are ignored.
  *
  * For other requests, if %OVS_VPORT_ATTR_NAME is specified then it is used to
  * look up the vport to operate on; otherwise dp_idx from the &struct
-- 
cgit v1.2.3


From 8941bbcd572a8860ad03c76e2f3d1dafa820b842 Mon Sep 17 00:00:00 2001
From: Ying Xue <ying.xue@windriver.com>
Date: Mon, 17 Jun 2013 10:54:36 -0400
Subject: tipc: update code comments to reflect new uapi header path

Files tipc.h and tipc_config.h were moved to uapi directory, but
the corresponding comments were not updated at the same time.

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc.h        | 2 +-
 include/uapi/linux/tipc_config.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h
index f2d90091cc20..852373d27dbb 100644
--- a/include/uapi/linux/tipc.h
+++ b/include/uapi/linux/tipc.h
@@ -1,5 +1,5 @@
 /*
- * include/linux/tipc.h: Include file for TIPC socket interface
+ * include/uapi/linux/tipc.h: Header for TIPC socket interface
  *
  * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, 2010-2011, Wind River Systems
diff --git a/include/uapi/linux/tipc_config.h b/include/uapi/linux/tipc_config.h
index 0b1e3f218a36..6b0bff09b3a7 100644
--- a/include/uapi/linux/tipc_config.h
+++ b/include/uapi/linux/tipc_config.h
@@ -1,5 +1,5 @@
 /*
- * include/linux/tipc_config.h: Include file for TIPC configuration interface
+ * include/uapi/linux/tipc_config.h: Header for TIPC configuration interface
  *
  * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005-2007, 2010-2011, Wind River Systems
-- 
cgit v1.2.3


From 2f301ab29e4656af824592363039d8f6bd5a9f68 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Thu, 16 May 2013 13:00:28 +0200
Subject: nl80211/cfg80211: add 5 and 10 MHz defines and wiphy flag

Add defines for 5 and 10 MHz channel width and fix channel
handling functions accordingly.

Also check for and report the WIPHY_FLAG_SUPPORTS_5_10_MHZ
capability.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
[fix spelling in comment]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  2 ++
 include/uapi/linux/nl80211.h |  4 ++++
 net/wireless/chan.c          | 57 +++++++++++++++++++++++++++++++++++++-------
 net/wireless/nl80211.c       | 21 ++++++++++++----
 4 files changed, 72 insertions(+), 12 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 6a43c34ce96f..316f34bb8e6e 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2342,6 +2342,7 @@ struct cfg80211_ops {
  *	responds to probe-requests in hardware.
  * @WIPHY_FLAG_OFFCHAN_TX: Device supports direct off-channel TX.
  * @WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL: Device supports remain-on-channel call.
+ * @WIPHY_FLAG_SUPPORTS_5_10_MHZ: Device supports 5 MHz and 10 MHz channels.
  */
 enum wiphy_flags {
 	WIPHY_FLAG_CUSTOM_REGULATORY		= BIT(0),
@@ -2365,6 +2366,7 @@ enum wiphy_flags {
 	WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD	= BIT(19),
 	WIPHY_FLAG_OFFCHAN_TX			= BIT(20),
 	WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL	= BIT(21),
+	WIPHY_FLAG_SUPPORTS_5_10_MHZ		= BIT(22),
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index ca6facf4df0c..861e5eba3953 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2758,6 +2758,8 @@ enum nl80211_channel_type {
  *	and %NL80211_ATTR_CENTER_FREQ2 attributes must be provided as well
  * @NL80211_CHAN_WIDTH_160: 160 MHz channel, the %NL80211_ATTR_CENTER_FREQ1
  *	attribute must be provided as well
+ * @NL80211_CHAN_WIDTH_5: 5 MHz OFDM channel
+ * @NL80211_CHAN_WIDTH_10: 10 MHz OFDM channel
  */
 enum nl80211_chan_width {
 	NL80211_CHAN_WIDTH_20_NOHT,
@@ -2766,6 +2768,8 @@ enum nl80211_chan_width {
 	NL80211_CHAN_WIDTH_80,
 	NL80211_CHAN_WIDTH_80P80,
 	NL80211_CHAN_WIDTH_160,
+	NL80211_CHAN_WIDTH_5,
+	NL80211_CHAN_WIDTH_10,
 };
 
 /**
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index fd556ac05fdb..50f6195c8b70 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -54,6 +54,8 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef)
 	control_freq = chandef->chan->center_freq;
 
 	switch (chandef->width) {
+	case NL80211_CHAN_WIDTH_5:
+	case NL80211_CHAN_WIDTH_10:
 	case NL80211_CHAN_WIDTH_20:
 	case NL80211_CHAN_WIDTH_20_NOHT:
 		if (chandef->center_freq1 != control_freq)
@@ -152,6 +154,12 @@ static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c)
 	int width;
 
 	switch (c->width) {
+	case NL80211_CHAN_WIDTH_5:
+		width = 5;
+		break;
+	case NL80211_CHAN_WIDTH_10:
+		width = 10;
+		break;
 	case NL80211_CHAN_WIDTH_20:
 	case NL80211_CHAN_WIDTH_20_NOHT:
 		width = 20;
@@ -194,6 +202,16 @@ cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1,
 	if (c1->width == c2->width)
 		return NULL;
 
+	/*
+	 * can't be compatible if one of them is 5 or 10 MHz,
+	 * but they don't have the same width.
+	 */
+	if (c1->width == NL80211_CHAN_WIDTH_5 ||
+	    c1->width == NL80211_CHAN_WIDTH_10 ||
+	    c2->width == NL80211_CHAN_WIDTH_5 ||
+	    c2->width == NL80211_CHAN_WIDTH_10)
+		return NULL;
+
 	if (c1->width == NL80211_CHAN_WIDTH_20_NOHT ||
 	    c1->width == NL80211_CHAN_WIDTH_20)
 		return c2;
@@ -264,11 +282,17 @@ static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy,
 					    u32 bandwidth)
 {
 	struct ieee80211_channel *c;
-	u32 freq;
+	u32 freq, start_freq, end_freq;
+
+	if (bandwidth <= 20) {
+		start_freq = center_freq;
+		end_freq = center_freq;
+	} else {
+		start_freq = center_freq - bandwidth/2 + 10;
+		end_freq = center_freq + bandwidth/2 - 10;
+	}
 
-	for (freq = center_freq - bandwidth/2 + 10;
-	     freq <= center_freq + bandwidth/2 - 10;
-	     freq += 20) {
+	for (freq = start_freq; freq <= end_freq; freq += 20) {
 		c = ieee80211_get_channel(wiphy, freq);
 		if (!c)
 			return -EINVAL;
@@ -310,11 +334,17 @@ static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy,
 					u32 prohibited_flags)
 {
 	struct ieee80211_channel *c;
-	u32 freq;
+	u32 freq, start_freq, end_freq;
+
+	if (bandwidth <= 20) {
+		start_freq = center_freq;
+		end_freq = center_freq;
+	} else {
+		start_freq = center_freq - bandwidth/2 + 10;
+		end_freq = center_freq + bandwidth/2 - 10;
+	}
 
-	for (freq = center_freq - bandwidth/2 + 10;
-	     freq <= center_freq + bandwidth/2 - 10;
-	     freq += 20) {
+	for (freq = start_freq; freq <= end_freq; freq += 20) {
 		c = ieee80211_get_channel(wiphy, freq);
 		if (!c)
 			return false;
@@ -349,6 +379,12 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy,
 	control_freq = chandef->chan->center_freq;
 
 	switch (chandef->width) {
+	case NL80211_CHAN_WIDTH_5:
+		width = 5;
+		break;
+	case NL80211_CHAN_WIDTH_10:
+		width = 10;
+		break;
 	case NL80211_CHAN_WIDTH_20:
 		if (!ht_cap->ht_supported)
 			return false;
@@ -405,6 +441,11 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy,
 	if (width > 20)
 		prohibited_flags |= IEEE80211_CHAN_NO_OFDM;
 
+	/* 5 and 10 MHz are only defined for the OFDM PHY */
+	if (width < 20)
+		prohibited_flags |= IEEE80211_CHAN_NO_OFDM;
+
+
 	if (!cfg80211_secondary_chans_ok(wiphy, chandef->center_freq1,
 					 width, prohibited_flags))
 		return false;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 1c4f7daea6c7..4ab1ffa9df11 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1188,6 +1188,9 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev,
 		if ((dev->wiphy.flags & WIPHY_FLAG_TDLS_EXTERNAL_SETUP) &&
 		    nla_put_flag(msg, NL80211_ATTR_TDLS_EXTERNAL_SETUP))
 			goto nla_put_failure;
+		if ((dev->wiphy.flags & WIPHY_FLAG_SUPPORTS_5_10_MHZ) &&
+		    nla_put_flag(msg, WIPHY_FLAG_SUPPORTS_5_10_MHZ))
+			goto nla_put_failure;
 
 		(*split_start)++;
 		if (split)
@@ -1731,6 +1734,11 @@ static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
 				     IEEE80211_CHAN_DISABLED))
 		return -EINVAL;
 
+	if ((chandef->width == NL80211_CHAN_WIDTH_5 ||
+	     chandef->width == NL80211_CHAN_WIDTH_10) &&
+	    !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_5_10_MHZ))
+		return -EINVAL;
+
 	return 0;
 }
 
@@ -6280,11 +6288,16 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
 	if (!cfg80211_reg_can_beacon(&rdev->wiphy, &ibss.chandef))
 		return -EINVAL;
 
-	if (ibss.chandef.width > NL80211_CHAN_WIDTH_40)
-		return -EINVAL;
-	if (ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT &&
-	    !(rdev->wiphy.features & NL80211_FEATURE_HT_IBSS))
+	switch (ibss.chandef.width) {
+	case NL80211_CHAN_WIDTH_20_NOHT:
+		break;
+	case NL80211_CHAN_WIDTH_20:
+	case NL80211_CHAN_WIDTH_40:
+		if (rdev->wiphy.features & NL80211_FEATURE_HT_IBSS)
+			break;
+	default:
 		return -EINVAL;
+	}
 
 	ibss.channel_fixed = !!info->attrs[NL80211_ATTR_FREQ_FIXED];
 	ibss.privacy = !!info->attrs[NL80211_ATTR_PRIVACY];
-- 
cgit v1.2.3


From 7d5437c709ded4f152cb8b305d17972d6707f20c Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Mon, 17 Jun 2013 17:50:18 -0700
Subject: openvswitch: Add tunneling interface.

Add ovs tunnel interface for set tunnel action for userspace.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h     |  18 +++++
 net/openvswitch/actions.c            |   4 ++
 net/openvswitch/datapath.c           |  78 +++++++++++++++++++++-
 net/openvswitch/datapath.h           |   3 +
 net/openvswitch/flow.c               | 125 +++++++++++++++++++++++++++++++++++
 net/openvswitch/flow.h               |  19 ++++++
 net/openvswitch/vport-internal_dev.c |   2 +-
 net/openvswitch/vport-netdev.c       |   2 +-
 net/openvswitch/vport.c              |   4 +-
 net/openvswitch/vport.h              |   3 +-
 10 files changed, 251 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 424672db7f12..b15a445927d6 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -246,11 +246,29 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_ARP,       /* struct ovs_key_arp */
 	OVS_KEY_ATTR_ND,        /* struct ovs_key_nd */
 	OVS_KEY_ATTR_SKB_MARK,  /* u32 skb mark */
+	OVS_KEY_ATTR_TUNNEL,    /* Nested set of ovs_tunnel attributes */
+
+#ifdef __KERNEL__
+	OVS_KEY_ATTR_IPV4_TUNNEL,  /* struct ovs_key_ipv4_tunnel */
+#endif
 	__OVS_KEY_ATTR_MAX
 };
 
 #define OVS_KEY_ATTR_MAX (__OVS_KEY_ATTR_MAX - 1)
 
+enum ovs_tunnel_key_attr {
+	OVS_TUNNEL_KEY_ATTR_ID,                 /* be64 Tunnel ID */
+	OVS_TUNNEL_KEY_ATTR_IPV4_SRC,           /* be32 src IP address. */
+	OVS_TUNNEL_KEY_ATTR_IPV4_DST,           /* be32 dst IP address. */
+	OVS_TUNNEL_KEY_ATTR_TOS,                /* u8 Tunnel IP ToS. */
+	OVS_TUNNEL_KEY_ATTR_TTL,                /* u8 Tunnel IP TTL. */
+	OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT,      /* No argument, set DF. */
+	OVS_TUNNEL_KEY_ATTR_CSUM,               /* No argument. CSUM packet. */
+	__OVS_TUNNEL_KEY_ATTR_MAX
+};
+
+#define OVS_TUNNEL_KEY_ATTR_MAX (__OVS_TUNNEL_KEY_ATTR_MAX - 1)
+
 /**
  * enum ovs_frag_type - IPv4 and IPv6 fragment type
  * @OVS_FRAG_TYPE_NONE: Packet is not a fragment.
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 596d6373399d..22c5f399f1cf 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -436,6 +436,10 @@ static int execute_set_action(struct sk_buff *skb,
 		skb->mark = nla_get_u32(nested_attr);
 		break;
 
+	case OVS_KEY_ATTR_IPV4_TUNNEL:
+		OVS_CB(skb)->tun_key = nla_data(nested_attr);
+		break;
+
 	case OVS_KEY_ATTR_ETHERNET:
 		err = set_eth_addr(skb, nla_data(nested_attr));
 		break;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index f14816b80b80..bbd310646bc8 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -362,6 +362,14 @@ static int queue_gso_packets(struct net *net, int dp_ifindex,
 static size_t key_attr_size(void)
 {
 	return    nla_total_size(4)   /* OVS_KEY_ATTR_PRIORITY */
+		+ nla_total_size(0)   /* OVS_KEY_ATTR_TUNNEL */
+		  + nla_total_size(8)   /* OVS_TUNNEL_KEY_ATTR_ID */
+		  + nla_total_size(4)   /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */
+		  + nla_total_size(4)   /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */
+		  + nla_total_size(1)   /* OVS_TUNNEL_KEY_ATTR_TOS */
+		  + nla_total_size(1)   /* OVS_TUNNEL_KEY_ATTR_TTL */
+		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */
+		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_CSUM */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_IN_PORT */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_SKB_MARK */
 		+ nla_total_size(12)  /* OVS_KEY_ATTR_ETHERNET */
@@ -600,8 +608,30 @@ static int validate_tp_port(const struct sw_flow_key *flow_key)
 	return -EINVAL;
 }
 
+static int validate_and_copy_set_tun(const struct nlattr *attr,
+				     struct sw_flow_actions **sfa)
+{
+	struct ovs_key_ipv4_tunnel tun_key;
+	int err, start;
+
+	err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &tun_key);
+	if (err)
+		return err;
+
+	start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET);
+	if (start < 0)
+		return start;
+
+	err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &tun_key, sizeof(tun_key));
+	add_nested_action_end(*sfa, start);
+
+	return err;
+}
+
 static int validate_set(const struct nlattr *a,
-			const struct sw_flow_key *flow_key)
+			const struct sw_flow_key *flow_key,
+			struct sw_flow_actions **sfa,
+			bool *set_tun)
 {
 	const struct nlattr *ovs_key = nla_data(a);
 	int key_type = nla_type(ovs_key);
@@ -611,18 +641,27 @@ static int validate_set(const struct nlattr *a,
 		return -EINVAL;
 
 	if (key_type > OVS_KEY_ATTR_MAX ||
-	    nla_len(ovs_key) != ovs_key_lens[key_type])
+	   (ovs_key_lens[key_type] != nla_len(ovs_key) &&
+	    ovs_key_lens[key_type] != -1))
 		return -EINVAL;
 
 	switch (key_type) {
 	const struct ovs_key_ipv4 *ipv4_key;
 	const struct ovs_key_ipv6 *ipv6_key;
+	int err;
 
 	case OVS_KEY_ATTR_PRIORITY:
 	case OVS_KEY_ATTR_SKB_MARK:
 	case OVS_KEY_ATTR_ETHERNET:
 		break;
 
+	case OVS_KEY_ATTR_TUNNEL:
+		*set_tun = true;
+		err = validate_and_copy_set_tun(a, sfa);
+		if (err)
+			return err;
+		break;
+
 	case OVS_KEY_ATTR_IPV4:
 		if (flow_key->eth.type != htons(ETH_P_IP))
 			return -EINVAL;
@@ -771,7 +810,7 @@ static int validate_and_copy_actions(const struct nlattr *attr,
 			break;
 
 		case OVS_ACTION_ATTR_SET:
-			err = validate_set(a, key);
+			err = validate_set(a, key, sfa, &skip_copy);
 			if (err)
 				return err;
 			break;
@@ -993,6 +1032,33 @@ static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb)
 	return err;
 }
 
+static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
+{
+	const struct nlattr *ovs_key = nla_data(a);
+	int key_type = nla_type(ovs_key);
+	struct nlattr *start;
+	int err;
+
+	switch (key_type) {
+	case OVS_KEY_ATTR_IPV4_TUNNEL:
+		start = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
+		if (!start)
+			return -EMSGSIZE;
+
+		err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key));
+		if (err)
+			return err;
+		nla_nest_end(skb, start);
+		break;
+	default:
+		if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key))
+			return -EMSGSIZE;
+		break;
+	}
+
+	return 0;
+}
+
 static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb)
 {
 	const struct nlattr *a;
@@ -1002,6 +1068,12 @@ static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *s
 		int type = nla_type(a);
 
 		switch (type) {
+		case OVS_ACTION_ATTR_SET:
+			err = set_action_to_attr(a, skb);
+			if (err)
+				return err;
+			break;
+
 		case OVS_ACTION_ATTR_SAMPLE:
 			err = sample_action_to_attr(a, skb);
 			if (err)
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 16b840695216..e88ebc2f1c54 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -88,9 +88,12 @@ struct datapath {
 /**
  * struct ovs_skb_cb - OVS data in skb CB
  * @flow: The flow associated with this packet.  May be %NULL if no flow.
+ * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the
+ * packet is not being tunneled.
  */
 struct ovs_skb_cb {
 	struct sw_flow		*flow;
+	struct ovs_key_ipv4_tunnel  *tun_key;
 };
 #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
 
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 940d4b803ff5..976a8b766a6a 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -40,6 +40,7 @@
 #include <linux/icmpv6.h>
 #include <linux/rculist.h>
 #include <net/ip.h>
+#include <net/ip_tunnels.h>
 #include <net/ipv6.h>
 #include <net/ndisc.h>
 
@@ -603,6 +604,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 	memset(key, 0, sizeof(*key));
 
 	key->phy.priority = skb->priority;
+	if (OVS_CB(skb)->tun_key)
+		memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key));
 	key->phy.in_port = in_port;
 	key->phy.skb_mark = skb->mark;
 
@@ -818,6 +821,7 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 	[OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6),
 	[OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
 	[OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd),
+	[OVS_KEY_ATTR_TUNNEL] = -1,
 };
 
 static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
@@ -955,6 +959,105 @@ static int parse_flow_nlattrs(const struct nlattr *attr,
 	return 0;
 }
 
+int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr,
+			     struct ovs_key_ipv4_tunnel *tun_key)
+{
+	struct nlattr *a;
+	int rem;
+	bool ttl = false;
+
+	memset(tun_key, 0, sizeof(*tun_key));
+
+	nla_for_each_nested(a, attr, rem) {
+		int type = nla_type(a);
+		static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = {
+			[OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64),
+			[OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32),
+			[OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32),
+			[OVS_TUNNEL_KEY_ATTR_TOS] = 1,
+			[OVS_TUNNEL_KEY_ATTR_TTL] = 1,
+			[OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0,
+			[OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
+		};
+
+		if (type > OVS_TUNNEL_KEY_ATTR_MAX ||
+			ovs_tunnel_key_lens[type] != nla_len(a))
+			return -EINVAL;
+
+		switch (type) {
+		case OVS_TUNNEL_KEY_ATTR_ID:
+			tun_key->tun_id = nla_get_be64(a);
+			tun_key->tun_flags |= TUNNEL_KEY;
+			break;
+		case OVS_TUNNEL_KEY_ATTR_IPV4_SRC:
+			tun_key->ipv4_src = nla_get_be32(a);
+			break;
+		case OVS_TUNNEL_KEY_ATTR_IPV4_DST:
+			tun_key->ipv4_dst = nla_get_be32(a);
+			break;
+		case OVS_TUNNEL_KEY_ATTR_TOS:
+			tun_key->ipv4_tos = nla_get_u8(a);
+			break;
+		case OVS_TUNNEL_KEY_ATTR_TTL:
+			tun_key->ipv4_ttl = nla_get_u8(a);
+			ttl = true;
+			break;
+		case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT:
+			tun_key->tun_flags |= TUNNEL_DONT_FRAGMENT;
+			break;
+		case OVS_TUNNEL_KEY_ATTR_CSUM:
+			tun_key->tun_flags |= TUNNEL_CSUM;
+			break;
+		default:
+			return -EINVAL;
+
+		}
+	}
+	if (rem > 0)
+		return -EINVAL;
+
+	if (!tun_key->ipv4_dst)
+		return -EINVAL;
+
+	if (!ttl)
+		return -EINVAL;
+
+	return 0;
+}
+
+int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
+			   const struct ovs_key_ipv4_tunnel *tun_key)
+{
+	struct nlattr *nla;
+
+	nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL);
+	if (!nla)
+		return -EMSGSIZE;
+
+	if (tun_key->tun_flags & TUNNEL_KEY &&
+	    nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, tun_key->tun_id))
+		return -EMSGSIZE;
+	if (tun_key->ipv4_src &&
+	    nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, tun_key->ipv4_src))
+		return -EMSGSIZE;
+	if (nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, tun_key->ipv4_dst))
+		return -EMSGSIZE;
+	if (tun_key->ipv4_tos &&
+	    nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, tun_key->ipv4_tos))
+		return -EMSGSIZE;
+	if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, tun_key->ipv4_ttl))
+		return -EMSGSIZE;
+	if ((tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) &&
+		nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
+		return -EMSGSIZE;
+	if ((tun_key->tun_flags & TUNNEL_CSUM) &&
+		nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
+		return -EMSGSIZE;
+
+	nla_nest_end(skb, nla);
+	return 0;
+}
+
 /**
  * ovs_flow_from_nlattrs - parses Netlink attributes into a flow key.
  * @swkey: receives the extracted flow key.
@@ -997,6 +1100,14 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
 		attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK);
 	}
 
+	if (attrs & (1 << OVS_KEY_ATTR_TUNNEL)) {
+		err = ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], &swkey->tun_key);
+		if (err)
+			return err;
+
+		attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL);
+	}
+
 	/* Data attributes. */
 	if (!(attrs & (1 << OVS_KEY_ATTR_ETHERNET)))
 		return -EINVAL;
@@ -1135,17 +1246,21 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
 int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow,
 				   const struct nlattr *attr)
 {
+	struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key;
 	const struct nlattr *nla;
 	int rem;
 
 	flow->key.phy.in_port = DP_MAX_PORTS;
 	flow->key.phy.priority = 0;
 	flow->key.phy.skb_mark = 0;
+	memset(tun_key, 0, sizeof(flow->key.tun_key));
 
 	nla_for_each_nested(nla, attr, rem) {
 		int type = nla_type(nla);
 
 		if (type <= OVS_KEY_ATTR_MAX && ovs_key_lens[type] > 0) {
+			int err;
+
 			if (nla_len(nla) != ovs_key_lens[type])
 				return -EINVAL;
 
@@ -1154,6 +1269,12 @@ int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow,
 				flow->key.phy.priority = nla_get_u32(nla);
 				break;
 
+			case OVS_KEY_ATTR_TUNNEL:
+				err = ovs_ipv4_tun_from_nlattr(nla, tun_key);
+				if (err)
+					return err;
+				break;
+
 			case OVS_KEY_ATTR_IN_PORT:
 				if (nla_get_u32(nla) >= DP_MAX_PORTS)
 					return -EINVAL;
@@ -1180,6 +1301,10 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
 	    nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority))
 		goto nla_put_failure;
 
+	if (swkey->tun_key.ipv4_dst &&
+	    ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key))
+		goto nla_put_failure;
+
 	if (swkey->phy.in_port != DP_MAX_PORTS &&
 	    nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port))
 		goto nla_put_failure;
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index e370f6246ee9..aec5e43f690b 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -40,7 +40,22 @@ struct sw_flow_actions {
 	struct nlattr actions[];
 };
 
+/* Used to memset ovs_key_ipv4_tunnel padding. */
+#define OVS_TUNNEL_KEY_SIZE					\
+	(offsetof(struct ovs_key_ipv4_tunnel, ipv4_ttl) +	\
+	FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, ipv4_ttl))
+
+struct ovs_key_ipv4_tunnel {
+	__be64 tun_id;
+	__be32 ipv4_src;
+	__be32 ipv4_dst;
+	u16  tun_flags;
+	u8   ipv4_tos;
+	u8   ipv4_ttl;
+};
+
 struct sw_flow_key {
+	struct ovs_key_ipv4_tunnel tun_key;  /* Encapsulating tunnel key. */
 	struct {
 		u32	priority;	/* Packet QoS priority. */
 		u32	skb_mark;	/* SKB mark. */
@@ -179,5 +194,9 @@ u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len);
 
 struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx);
 extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1];
+int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr,
+			 struct ovs_key_ipv4_tunnel *tun_key);
+int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
+			const struct ovs_key_ipv4_tunnel *tun_key);
 
 #endif /* flow.h */
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index e284c7e1fec4..98d3edbbc235 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -67,7 +67,7 @@ static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netde
 static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
 {
 	rcu_read_lock();
-	ovs_vport_receive(internal_dev_priv(netdev)->vport, skb);
+	ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL);
 	rcu_read_unlock();
 	return 0;
 }
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 40de815b4213..5982f3f62835 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -51,7 +51,7 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
 	skb_push(skb, ETH_HLEN);
 	ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
 
-	ovs_vport_receive(vport, skb);
+	ovs_vport_receive(vport, skb, NULL);
 	return;
 
 error:
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 176d449351eb..413287a1877f 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -325,7 +325,8 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb)
  * Must be called with rcu_read_lock.  The packet cannot be shared and
  * skb->data should point to the Ethernet header.
  */
-void ovs_vport_receive(struct vport *vport, struct sk_buff *skb)
+void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
+		       struct ovs_key_ipv4_tunnel *tun_key)
 {
 	struct pcpu_tstats *stats;
 
@@ -335,6 +336,7 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb)
 	stats->rx_bytes += skb->len;
 	u64_stats_update_end(&stats->syncp);
 
+	OVS_CB(skb)->tun_key = tun_key;
 	ovs_dp_process_received_packet(vport, skb);
 }
 
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index 293278c4c2df..2d961aedd71d 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -184,7 +184,8 @@ static inline struct vport *vport_from_priv(const void *priv)
 	return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN));
 }
 
-void ovs_vport_receive(struct vport *, struct sk_buff *);
+void ovs_vport_receive(struct vport *, struct sk_buff *,
+		       struct ovs_key_ipv4_tunnel *);
 void ovs_vport_record_error(struct vport *, enum vport_err_type err_type);
 
 /* List of statically compiled vport implementations.  Don't forget to also
-- 
cgit v1.2.3


From aa310701e787087dbfbccf1409982a96e16c57a6 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Mon, 17 Jun 2013 17:50:33 -0700
Subject: openvswitch: Add gre tunnel support.

Add gre vport implementation.  Most of gre protocol processing
is pushed to gre module. It make use of gre demultiplexer
therefore it can co-exist with linux device based gre tunnels.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h |   1 +
 net/openvswitch/Kconfig          |   2 +
 net/openvswitch/Makefile         |   3 +-
 net/openvswitch/datapath.h       |   1 +
 net/openvswitch/flow.h           |  18 ++-
 net/openvswitch/vport-gre.c      | 274 +++++++++++++++++++++++++++++++++++++++
 net/openvswitch/vport.c          |  19 +++
 net/openvswitch/vport.h          |   7 +
 8 files changed, 323 insertions(+), 2 deletions(-)
 create mode 100644 net/openvswitch/vport-gre.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index b15a445927d6..c55efaaa9bb4 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -164,6 +164,7 @@ enum ovs_vport_type {
 	OVS_VPORT_TYPE_UNSPEC,
 	OVS_VPORT_TYPE_NETDEV,   /* network device */
 	OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */
+	OVS_VPORT_TYPE_GRE,      /* GRE tunnel. */
 	__OVS_VPORT_TYPE_MAX
 };
 
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index d9ea33c361be..9fbc04a31ed6 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -19,6 +19,8 @@ config OPENVSWITCH
 	  which is able to accept configuration from a variety of sources and
 	  translate it into packet processing rules.
 
+	  Open vSwitch GRE support depends on CONFIG_NET_IPGRE_DEMUX.
+
 	  See http://openvswitch.org for more information and userspace
 	  utilities.
 
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index 15e7384745c1..01bddb2991e3 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -10,5 +10,6 @@ openvswitch-y := \
 	dp_notify.o \
 	flow.o \
 	vport.o \
+	vport-gre.o \
 	vport-internal_dev.o \
-	vport-netdev.o \
+	vport-netdev.o
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index e88ebc2f1c54..a91486484916 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -122,6 +122,7 @@ struct dp_upcall_info {
 struct ovs_net {
 	struct list_head dps;
 	struct work_struct dp_notify_work;
+	struct vport_net vport_net;
 };
 
 extern int ovs_net_id;
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 999842f247a0..66ef7220293e 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -49,11 +49,27 @@ struct ovs_key_ipv4_tunnel {
 	__be64 tun_id;
 	__be32 ipv4_src;
 	__be32 ipv4_dst;
-	u16  tun_flags;
+	__be16 tun_flags;
 	u8   ipv4_tos;
 	u8   ipv4_ttl;
 };
 
+static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key,
+					 const struct iphdr *iph, __be64 tun_id,
+					 __be16 tun_flags)
+{
+	tun_key->tun_id = tun_id;
+	tun_key->ipv4_src = iph->saddr;
+	tun_key->ipv4_dst = iph->daddr;
+	tun_key->ipv4_tos = iph->tos;
+	tun_key->ipv4_ttl = iph->ttl;
+	tun_key->tun_flags = tun_flags;
+
+	/* clear struct padding. */
+	memset((unsigned char *) tun_key + OVS_TUNNEL_KEY_SIZE, 0,
+	       sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE);
+}
+
 struct sw_flow_key {
 	struct ovs_key_ipv4_tunnel tun_key;  /* Encapsulating tunnel key. */
 	struct {
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
new file mode 100644
index 000000000000..3a8d1900aa78
--- /dev/null
+++ b/net/openvswitch/vport-gre.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2007-2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifdef CONFIG_NET_IPGRE_DEMUX
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/if.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/if_tunnel.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/in_route.h>
+#include <linux/inetdevice.h>
+#include <linux/jhash.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/rculist.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/ip_tunnels.h>
+#include <net/gre.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/protocol.h>
+
+#include "datapath.h"
+#include "vport.h"
+
+/* Returns the least-significant 32 bits of a __be64. */
+static __be32 be64_get_low32(__be64 x)
+{
+#ifdef __BIG_ENDIAN
+	return (__force __be32)x;
+#else
+	return (__force __be32)((__force u64)x >> 32);
+#endif
+}
+
+static __be16 filter_tnl_flags(__be16 flags)
+{
+	return flags & (TUNNEL_CSUM | TUNNEL_KEY);
+}
+
+static struct sk_buff *__build_header(struct sk_buff *skb,
+				      int tunnel_hlen)
+{
+	const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->tun_key;
+	struct tnl_ptk_info tpi;
+
+	skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM));
+	if (IS_ERR(skb))
+		return NULL;
+
+	tpi.flags = filter_tnl_flags(tun_key->tun_flags);
+	tpi.proto = htons(ETH_P_TEB);
+	tpi.key = be64_get_low32(tun_key->tun_id);
+	tpi.seq = 0;
+	gre_build_header(skb, &tpi, tunnel_hlen);
+
+	return skb;
+}
+
+static __be64 key_to_tunnel_id(__be32 key, __be32 seq)
+{
+#ifdef __BIG_ENDIAN
+	return (__force __be64)((__force u64)seq << 32 | (__force u32)key);
+#else
+	return (__force __be64)((__force u64)key << 32 | (__force u32)seq);
+#endif
+}
+
+/* Called with rcu_read_lock and BH disabled. */
+static int gre_rcv(struct sk_buff *skb,
+		   const struct tnl_ptk_info *tpi)
+{
+	struct ovs_key_ipv4_tunnel tun_key;
+	struct ovs_net *ovs_net;
+	struct vport *vport;
+	__be64 key;
+
+	ovs_net = net_generic(dev_net(skb->dev), ovs_net_id);
+	vport = rcu_dereference(ovs_net->vport_net.gre_vport);
+	if (unlikely(!vport))
+		return PACKET_REJECT;
+
+	key = key_to_tunnel_id(tpi->key, tpi->seq);
+	ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key,
+			      filter_tnl_flags(tpi->flags));
+
+	ovs_vport_receive(vport, skb, &tun_key);
+	return PACKET_RCVD;
+}
+
+static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
+{
+	struct net *net = ovs_dp_get_net(vport->dp);
+	struct flowi4 fl;
+	struct rtable *rt;
+	int min_headroom;
+	int tunnel_hlen;
+	__be16 df;
+	int err;
+
+	if (unlikely(!OVS_CB(skb)->tun_key)) {
+		err = -EINVAL;
+		goto error;
+	}
+
+	/* Route lookup */
+	memset(&fl, 0, sizeof(fl));
+	fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst;
+	fl.saddr = OVS_CB(skb)->tun_key->ipv4_src;
+	fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos);
+	fl.flowi4_mark = skb->mark;
+	fl.flowi4_proto = IPPROTO_GRE;
+
+	rt = ip_route_output_key(net, &fl);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
+
+	tunnel_hlen = ip_gre_calc_hlen(OVS_CB(skb)->tun_key->tun_flags);
+
+	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+			+ tunnel_hlen + sizeof(struct iphdr)
+			+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
+	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
+		int head_delta = SKB_DATA_ALIGN(min_headroom -
+						skb_headroom(skb) +
+						16);
+		err = pskb_expand_head(skb, max_t(int, head_delta, 0),
+					0, GFP_ATOMIC);
+		if (unlikely(err))
+			goto err_free_rt;
+	}
+
+	if (vlan_tx_tag_present(skb)) {
+		if (unlikely(!__vlan_put_tag(skb,
+					     skb->vlan_proto,
+					     vlan_tx_tag_get(skb)))) {
+			err = -ENOMEM;
+			goto err_free_rt;
+		}
+		skb->vlan_tci = 0;
+	}
+
+	/* Push Tunnel header. */
+	skb = __build_header(skb, tunnel_hlen);
+	if (unlikely(!skb)) {
+		err = 0;
+		goto err_free_rt;
+	}
+
+	df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ?
+		htons(IP_DF) : 0;
+
+	skb->local_df = 1;
+
+	return iptunnel_xmit(net, rt, skb, fl.saddr,
+			     OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE,
+			     OVS_CB(skb)->tun_key->ipv4_tos,
+			     OVS_CB(skb)->tun_key->ipv4_ttl, df);
+err_free_rt:
+	ip_rt_put(rt);
+error:
+	return err;
+}
+
+static struct gre_cisco_protocol gre_protocol = {
+	.handler        = gre_rcv,
+	.priority       = 1,
+};
+
+static int gre_ports;
+static int gre_init(void)
+{
+	int err;
+
+	gre_ports++;
+	if (gre_ports > 1)
+		return 0;
+
+	err = gre_cisco_register(&gre_protocol);
+	if (err)
+		pr_warn("cannot register gre protocol handler\n");
+
+	return err;
+}
+
+static void gre_exit(void)
+{
+	gre_ports--;
+	if (gre_ports > 0)
+		return;
+
+	gre_cisco_unregister(&gre_protocol);
+}
+
+static const char *gre_get_name(const struct vport *vport)
+{
+	return vport_priv(vport);
+}
+
+static struct vport *gre_create(const struct vport_parms *parms)
+{
+	struct net *net = ovs_dp_get_net(parms->dp);
+	struct ovs_net *ovs_net;
+	struct vport *vport;
+	int err;
+
+	err = gre_init();
+	if (err)
+		return ERR_PTR(err);
+
+	ovs_net = net_generic(net, ovs_net_id);
+	if (ovsl_dereference(ovs_net->vport_net.gre_vport)) {
+		vport = ERR_PTR(-EEXIST);
+		goto error;
+	}
+
+	vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms);
+	if (IS_ERR(vport))
+		goto error;
+
+	strncpy(vport_priv(vport), parms->name, IFNAMSIZ);
+	rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport);
+	return vport;
+
+error:
+	gre_exit();
+	return vport;
+}
+
+static void gre_tnl_destroy(struct vport *vport)
+{
+	struct net *net = ovs_dp_get_net(vport->dp);
+	struct ovs_net *ovs_net;
+
+	ovs_net = net_generic(net, ovs_net_id);
+
+	rcu_assign_pointer(ovs_net->vport_net.gre_vport, NULL);
+	ovs_vport_deferred_free(vport);
+	gre_exit();
+}
+
+const struct vport_ops ovs_gre_vport_ops = {
+	.type		= OVS_VPORT_TYPE_GRE,
+	.create		= gre_create,
+	.destroy	= gre_tnl_destroy,
+	.get_name	= gre_get_name,
+	.send		= gre_tnl_send,
+};
+#endif
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 413287a1877f..f52dfb9cb5a7 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -38,6 +38,10 @@
 static const struct vport_ops *vport_ops_list[] = {
 	&ovs_netdev_vport_ops,
 	&ovs_internal_vport_ops,
+
+#ifdef CONFIG_NET_IPGRE_DEMUX
+	&ovs_gre_vport_ops,
+#endif
 };
 
 /* Protected by RCU read lock for reading, ovs_mutex for writing. */
@@ -404,3 +408,18 @@ void ovs_vport_record_error(struct vport *vport, enum vport_err_type err_type)
 
 	spin_unlock(&vport->stats_lock);
 }
+
+static void free_vport_rcu(struct rcu_head *rcu)
+{
+	struct vport *vport = container_of(rcu, struct vport, rcu);
+
+	ovs_vport_free(vport);
+}
+
+void ovs_vport_deferred_free(struct vport *vport)
+{
+	if (!vport)
+		return;
+
+	call_rcu(&vport->rcu, free_vport_rcu);
+}
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index 2d961aedd71d..376045c42f8b 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -34,6 +34,11 @@ struct vport_parms;
 
 /* The following definitions are for users of the vport subsytem: */
 
+/* The following definitions are for users of the vport subsytem: */
+struct vport_net {
+	struct vport __rcu *gre_vport;
+};
+
 int ovs_vport_init(void);
 void ovs_vport_exit(void);
 
@@ -152,6 +157,7 @@ enum vport_err_type {
 struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *,
 			      const struct vport_parms *);
 void ovs_vport_free(struct vport *);
+void ovs_vport_deferred_free(struct vport *vport);
 
 #define VPORT_ALIGN 8
 
@@ -192,6 +198,7 @@ void ovs_vport_record_error(struct vport *, enum vport_err_type err_type);
  * add yours to the list at the top of vport.c. */
 extern const struct vport_ops ovs_netdev_vport_ops;
 extern const struct vport_ops ovs_internal_vport_ops;
+extern const struct vport_ops ovs_gre_vport_ops;
 
 static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
 				      const void *start, unsigned int len)
-- 
cgit v1.2.3


From bcefe17cffd06efdda3e7ad679ea743236e6271a Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Sat, 15 Jun 2013 09:39:18 +0800
Subject: tcp: introduce a per-route knob for quick ack

In previous discussions, I tried to find some reasonable heuristics
for delayed ACK, however this seems not possible, according to Eric:

	"ACKS might also be delayed because of bidirectional
	traffic, and is more controlled by the application
	response time. TCP stack can not easily estimate it."

	"ACK can be incredibly useful to recover from losses in
	a short time.

	The vast majority of TCP sessions are small lived, and we
	send one ACK per received segment anyway at beginning or
	retransmits to let the sender smoothly increase its cwnd,
	so an auto-tuning facility wont help them that much."

and according to David:

	"ACKs are the only information we have to detect loss.

	And, for the same reasons that TCP VEGAS is fundamentally
	broken, we cannot measure the pipe or some other
	receiver-side-visible piece of information to determine
	when it's "safe" to stretch ACK.

	And even if it's "safe", we should not do it so that losses are
	accurately detected and we don't spuriously retransmit.

	The only way to know when the bandwidth increases is to
	"test" it, by sending more and more packets until drops happen.
	That's why all successful congestion control algorithms must
	operate on explicited tested pieces of information.

	Similarly, it's not really possible to universally know if
	it's safe to stretch ACK or not."

It still makes sense to enable or disable quick ack mode like
what TCP_QUICK_ACK does.

Similar to TCP_QUICK_ACK option, but for people who can't
modify the source code and still wants to control
TCP delayed ACK behavior. As David suggested, this should belong
to per-path scope, since different pathes may want different
behaviors.

Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Rick Jones <rick.jones2@hp.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Graf <tgraf@suug.ch>
CC: David Laight <David.Laight@ACULAB.COM>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rtnetlink.h | 2 ++
 net/ipv4/tcp_input.c           | 5 ++++-
 net/ipv4/tcp_output.c          | 6 ++++--
 3 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 7a2144e1afae..eb0f1a554d7b 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -386,6 +386,8 @@ enum {
 #define RTAX_RTO_MIN RTAX_RTO_MIN
 	RTAX_INITRWND,
 #define RTAX_INITRWND RTAX_INITRWND
+	RTAX_QUICKACK,
+#define RTAX_QUICKACK RTAX_QUICKACK
 	__RTAX_MAX
 };
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 46271cdcf088..28af45abe062 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3717,6 +3717,7 @@ void tcp_reset(struct sock *sk)
 static void tcp_fin(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	const struct dst_entry *dst;
 
 	inet_csk_schedule_ack(sk);
 
@@ -3728,7 +3729,9 @@ static void tcp_fin(struct sock *sk)
 	case TCP_ESTABLISHED:
 		/* Move to CLOSE_WAIT */
 		tcp_set_state(sk, TCP_CLOSE_WAIT);
-		inet_csk(sk)->icsk_ack.pingpong = 1;
+		dst = __sk_dst_get(sk);
+		if (!dst || !dst_metric(dst, RTAX_QUICKACK))
+			inet_csk(sk)->icsk_ack.pingpong = 1;
 		break;
 
 	case TCP_CLOSE_WAIT:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e2c1333ee318..3d609490f118 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -160,6 +160,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	const u32 now = tcp_time_stamp;
+	const struct dst_entry *dst = __sk_dst_get(sk);
 
 	if (sysctl_tcp_slow_start_after_idle &&
 	    (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
@@ -170,8 +171,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
 	/* If it is a reply for ato after last received
 	 * packet, enter pingpong mode.
 	 */
-	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
-		icsk->icsk_ack.pingpong = 1;
+	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato &&
+	    (!dst || !dst_metric(dst, RTAX_QUICKACK)))
+			icsk->icsk_ack.pingpong = 1;
 }
 
 /* Account for an ACK we sent. */
-- 
cgit v1.2.3


From 5ffd229c02731a91d08ca21e76b503c5bbb5c095 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Tue, 21 May 2013 13:33:10 +1000
Subject: powerpc/vfio: Implement IOMMU driver for VFIO

VFIO implements platform independent stuff such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform dependent part includes IOMMU initialization
and handling.  This implements an IOMMU driver for VFIO
which does mapping/unmapping pages for the guest IO and
provides information about DMA window (required by a POWER
guest).

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 Documentation/vfio.txt              |  63 ++++++
 drivers/vfio/Kconfig                |   6 +
 drivers/vfio/Makefile               |   1 +
 drivers/vfio/vfio.c                 |   1 +
 drivers/vfio/vfio_iommu_spapr_tce.c | 377 ++++++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h           |  34 ++++
 6 files changed, 482 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

(limited to 'include/uapi/linux')

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 8eda3635a17d..c55533c0adb3 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -283,6 +283,69 @@ a direct pass through for VFIO_DEVICE_* ioctls.  The read/write/mmap
 interfaces implement the device region access defined by the device's
 own VFIO_DEVICE_GET_REGION_INFO ioctl.
 
+
+PPC64 sPAPR implementation note
+-------------------------------------------------------------------------------
+
+This implementation has some specifics:
+
+1) Only one IOMMU group per container is supported as an IOMMU group
+represents the minimal entity which isolation can be guaranteed for and
+groups are allocated statically, one per a Partitionable Endpoint (PE)
+(PE is often a PCI domain but not always).
+
+2) The hardware supports so called DMA windows - the PCI address range
+within which DMA transfer is allowed, any attempt to access address space
+out of the window leads to the whole PE isolation.
+
+3) PPC64 guests are paravirtualized but not fully emulated. There is an API
+to map/unmap pages for DMA, and it normally maps 1..32 pages per call and
+currently there is no way to reduce the number of calls. In order to make things
+faster, the map/unmap handling has been implemented in real mode which provides
+an excellent performance which has limitations such as inability to do
+locked pages accounting in real time.
+
+So 3 additional ioctls have been added:
+
+	VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
+		of the DMA window on the PCI bus.
+
+	VFIO_IOMMU_ENABLE - enables the container. The locked pages accounting
+		is done at this point. This lets user first to know what
+		the DMA window is and adjust rlimit before doing any real job.
+
+	VFIO_IOMMU_DISABLE - disables the container.
+
+
+The code flow from the example above should be slightly changed:
+
+	.....
+	/* Add the group to the container */
+	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
+
+	/* Enable the IOMMU model we want */
+	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU)
+
+	/* Get addition sPAPR IOMMU info */
+	vfio_iommu_spapr_tce_info spapr_iommu_info;
+	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &spapr_iommu_info);
+
+	if (ioctl(container, VFIO_IOMMU_ENABLE))
+		/* Cannot enable container, may be low rlimit */
+
+	/* Allocate some space and setup a DMA mapping */
+	dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE,
+			     MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+
+	dma_map.size = 1024 * 1024;
+	dma_map.iova = 0; /* 1MB starting at 0x0 from device view */
+	dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+	/* Check here is .iova/.size are within DMA window from spapr_iommu_info */
+
+	ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
+	.....
+
 -------------------------------------------------------------------------------
 
 [1] VFIO was originally an acronym for "Virtual Function I/O" in its
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec0abd1..b464687f6e14 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
 	depends on VFIO
 	default n
 
+config VFIO_IOMMU_SPAPR_TCE
+	tristate
+	depends on VFIO && SPAPR_TCE_IOMMU
+	default n
+
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
+	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a0e38b..72bfabc8629e 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 6d78736563de..259ad282ae5d 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1415,6 +1415,7 @@ static int __init vfio_init(void)
 	 * drivers.
 	 */
 	request_module_nowait("vfio_iommu_type1");
+	request_module_nowait("vfio_iommu_spapr_tce");
 
 	return 0;
 
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 000000000000..bdae7a04af75
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,377 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2013 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ *
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+	struct mutex lock;
+	struct iommu_table *tbl;
+	bool enabled;
+};
+
+static int tce_iommu_enable(struct tce_container *container)
+{
+	int ret = 0;
+	unsigned long locked, lock_limit, npages;
+	struct iommu_table *tbl = container->tbl;
+
+	if (!container->tbl)
+		return -ENXIO;
+
+	if (!current->mm)
+		return -ESRCH; /* process exited */
+
+	if (container->enabled)
+		return -EBUSY;
+
+	/*
+	 * When userspace pages are mapped into the IOMMU, they are effectively
+	 * locked memory, so, theoretically, we need to update the accounting
+	 * of locked pages on each map and unmap.  For powerpc, the map unmap
+	 * paths can be very hot, though, and the accounting would kill
+	 * performance, especially since it would be difficult to impossible
+	 * to handle the accounting in real mode only.
+	 *
+	 * To address that, rather than precisely accounting every page, we
+	 * instead account for a worst case on locked memory when the iommu is
+	 * enabled and disabled.  The worst case upper bound on locked memory
+	 * is the size of the whole iommu window, which is usually relatively
+	 * small (compared to total memory sizes) on POWER hardware.
+	 *
+	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
+	 * that would effectively kill the guest at random points, much better
+	 * enforcing the limit based on the max that the guest can map.
+	 */
+	down_write(&current->mm->mmap_sem);
+	npages = (tbl->it_size << IOMMU_PAGE_SHIFT) >> PAGE_SHIFT;
+	locked = current->mm->locked_vm + npages;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+				rlimit(RLIMIT_MEMLOCK));
+		ret = -ENOMEM;
+	} else {
+
+		current->mm->locked_vm += npages;
+		container->enabled = true;
+	}
+	up_write(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+static void tce_iommu_disable(struct tce_container *container)
+{
+	if (!container->enabled)
+		return;
+
+	container->enabled = false;
+
+	if (!container->tbl || !current->mm)
+		return;
+
+	down_write(&current->mm->mmap_sem);
+	current->mm->locked_vm -= (container->tbl->it_size <<
+			IOMMU_PAGE_SHIFT) >> PAGE_SHIFT;
+	up_write(&current->mm->mmap_sem);
+}
+
+static void *tce_iommu_open(unsigned long arg)
+{
+	struct tce_container *container;
+
+	if (arg != VFIO_SPAPR_TCE_IOMMU) {
+		pr_err("tce_vfio: Wrong IOMMU type\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	container = kzalloc(sizeof(*container), GFP_KERNEL);
+	if (!container)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&container->lock);
+
+	return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+	struct tce_container *container = iommu_data;
+
+	WARN_ON(container->tbl && !container->tbl->it_group);
+	tce_iommu_disable(container);
+
+	if (container->tbl && container->tbl->it_group)
+		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+
+	mutex_destroy(&container->lock);
+
+	kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct tce_container *container = iommu_data;
+	unsigned long minsz;
+	long ret;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION:
+		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+
+	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+		struct vfio_iommu_spapr_tce_info info;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+				dma32_window_size);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+		info.flags = 0;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+	case VFIO_IOMMU_MAP_DMA: {
+		struct vfio_iommu_type1_dma_map param;
+		struct iommu_table *tbl = container->tbl;
+		unsigned long tce, i;
+
+		if (!tbl)
+			return -ENXIO;
+
+		BUG_ON(!tbl->it_group);
+
+		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
+				VFIO_DMA_MAP_FLAG_WRITE))
+			return -EINVAL;
+
+		if ((param.size & ~IOMMU_PAGE_MASK) ||
+				(param.vaddr & ~IOMMU_PAGE_MASK))
+			return -EINVAL;
+
+		/* iova is checked by the IOMMU API */
+		tce = param.vaddr;
+		if (param.flags & VFIO_DMA_MAP_FLAG_READ)
+			tce |= TCE_PCI_READ;
+		if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+			tce |= TCE_PCI_WRITE;
+
+		ret = iommu_tce_put_param_check(tbl, param.iova, tce);
+		if (ret)
+			return ret;
+
+		for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT); ++i) {
+			ret = iommu_put_tce_user_mode(tbl,
+					(param.iova >> IOMMU_PAGE_SHIFT) + i,
+					tce);
+			if (ret)
+				break;
+			tce += IOMMU_PAGE_SIZE;
+		}
+		if (ret)
+			iommu_clear_tces_and_put_pages(tbl,
+					param.iova >> IOMMU_PAGE_SHIFT,	i);
+
+		iommu_flush_tce(tbl);
+
+		return ret;
+	}
+	case VFIO_IOMMU_UNMAP_DMA: {
+		struct vfio_iommu_type1_dma_unmap param;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
+				size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		/* No flag is supported now */
+		if (param.flags)
+			return -EINVAL;
+
+		if (param.size & ~IOMMU_PAGE_MASK)
+			return -EINVAL;
+
+		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
+				param.size >> IOMMU_PAGE_SHIFT);
+		if (ret)
+			return ret;
+
+		ret = iommu_clear_tces_and_put_pages(tbl,
+				param.iova >> IOMMU_PAGE_SHIFT,
+				param.size >> IOMMU_PAGE_SHIFT);
+		iommu_flush_tce(tbl);
+
+		return ret;
+	}
+	case VFIO_IOMMU_ENABLE:
+		mutex_lock(&container->lock);
+		ret = tce_iommu_enable(container);
+		mutex_unlock(&container->lock);
+		return ret;
+
+
+	case VFIO_IOMMU_DISABLE:
+		mutex_lock(&container->lock);
+		tce_iommu_disable(container);
+		mutex_unlock(&container->lock);
+		return 0;
+	}
+
+	return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	int ret;
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+
+	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group); */
+	if (container->tbl) {
+		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(iommu_group));
+		ret = -EBUSY;
+	} else if (container->enabled) {
+		pr_err("tce_vfio: attaching group #%u to enabled container\n",
+				iommu_group_id(iommu_group));
+		ret = -EBUSY;
+	} else {
+		ret = iommu_take_ownership(tbl);
+		if (!ret)
+			container->tbl = tbl;
+	}
+
+	mutex_unlock(&container->lock);
+
+	return ret;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	if (tbl != container->tbl) {
+		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
+				iommu_group_id(iommu_group),
+				iommu_group_id(tbl->it_group));
+	} else {
+		if (container->enabled) {
+			pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
+					iommu_group_id(tbl->it_group));
+			tce_iommu_disable(container);
+		}
+
+		/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+				iommu_group_id(iommu_group), iommu_group); */
+		container->tbl = NULL;
+		iommu_release_ownership(tbl);
+	}
+	mutex_unlock(&container->lock);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+	.name		= "iommu-vfio-powerpc",
+	.owner		= THIS_MODULE,
+	.open		= tce_iommu_open,
+	.release	= tce_iommu_release,
+	.ioctl		= tce_iommu_ioctl,
+	.attach_group	= tce_iommu_attach_group,
+	.detach_group	= tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 284ff2436829..87ee4f4cff25 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -22,6 +22,7 @@
 /* Extensions */
 
 #define VFIO_TYPE1_IOMMU		1
+#define VFIO_SPAPR_TCE_IOMMU		2
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -375,4 +376,37 @@ struct vfio_iommu_type1_dma_unmap {
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/*
+ * IOCTLs to enable/disable IOMMU container usage.
+ * No parameters are supported.
+ */
+#define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
+#define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
+
+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+/*
+ * The SPAPR TCE info struct provides the information about the PCI bus
+ * address ranges available for DMA, these values are programmed into
+ * the hardware so the guest has to know that information.
+ *
+ * The DMA 32 bit window start is an absolute PCI bus address.
+ * The IOVA address passed via map/unmap ioctls are absolute PCI bus
+ * addresses too so the window works as a filter rather than an offset
+ * for IOVA addresses.
+ *
+ * A flag will need to be added if other page sizes are supported,
+ * so as defined here, it is always 4k.
+ */
+struct vfio_iommu_spapr_tce_info {
+	__u32 argsz;
+	__u32 flags;			/* reserved for future use */
+	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
+	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/* ***************************************************************** */
+
 #endif /* _UAPIVFIO_H */
-- 
cgit v1.2.3


From 681f130f39e10087475383e6771b9366e26bab0c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 20 Jun 2013 05:52:22 -0700
Subject: netfilter: xt_socket: add XT_SOCKET_NOWILDCARD flag

xt_socket module can be a nice replacement to conntrack module
in some cases (SYN filtering for example)

But it lacks the ability to match the 3rd packet of TCP
handshake (ACK coming from the client).

Add a XT_SOCKET_NOWILDCARD flag to disable the wildcard mechanism.

The wildcard is the legacy socket match behavior, that ignores
LISTEN sockets bound to INADDR_ANY (or ipv6 equivalent)

iptables -I INPUT -p tcp --syn -j SYN_CHAIN
iptables -I INPUT -m socket --nowildcard -j ACCEPT

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/xt_socket.h |  7 ++++
 net/netfilter/xt_socket.c                | 70 ++++++++++++++++++++++++++++----
 2 files changed, 69 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/xt_socket.h b/include/uapi/linux/netfilter/xt_socket.h
index 26d7217bd4f1..6315e2ac3474 100644
--- a/include/uapi/linux/netfilter/xt_socket.h
+++ b/include/uapi/linux/netfilter/xt_socket.h
@@ -5,10 +5,17 @@
 
 enum {
 	XT_SOCKET_TRANSPARENT = 1 << 0,
+	XT_SOCKET_NOWILDCARD = 1 << 1,
 };
 
 struct xt_socket_mtinfo1 {
 	__u8 flags;
 };
+#define XT_SOCKET_FLAGS_V1 XT_SOCKET_TRANSPARENT
+
+struct xt_socket_mtinfo2 {
+	__u8 flags;
+};
+#define XT_SOCKET_FLAGS_V2 (XT_SOCKET_TRANSPARENT | XT_SOCKET_NOWILDCARD)
 
 #endif /* _XT_SOCKET_H */
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 02704245710e..f8b71911037a 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -163,8 +163,11 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 		bool wildcard;
 		bool transparent = true;
 
-		/* Ignore sockets listening on INADDR_ANY */
-		wildcard = (sk->sk_state != TCP_TIME_WAIT &&
+		/* Ignore sockets listening on INADDR_ANY,
+		 * unless XT_SOCKET_NOWILDCARD is set
+		 */
+		wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) &&
+			    sk->sk_state != TCP_TIME_WAIT &&
 			    inet_sk(sk)->inet_rcv_saddr == 0);
 
 		/* Ignore non-transparent sockets,
@@ -197,7 +200,7 @@ socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par)
 }
 
 static bool
-socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	return socket_match(skb, par, par->matchinfo);
 }
@@ -259,7 +262,7 @@ extract_icmp6_fields(const struct sk_buff *skb,
 }
 
 static bool
-socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	struct ipv6hdr *iph = ipv6_hdr(skb);
 	struct udphdr _hdr, *hp = NULL;
@@ -302,8 +305,11 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
 		bool wildcard;
 		bool transparent = true;
 
-		/* Ignore sockets listening on INADDR_ANY */
-		wildcard = (sk->sk_state != TCP_TIME_WAIT &&
+		/* Ignore sockets listening on INADDR_ANY
+		 * unless XT_SOCKET_NOWILDCARD is set
+		 */
+		wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) &&
+			    sk->sk_state != TCP_TIME_WAIT &&
 			    ipv6_addr_any(&inet6_sk(sk)->rcv_saddr));
 
 		/* Ignore non-transparent sockets,
@@ -331,6 +337,28 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
 }
 #endif
 
+static int socket_mt_v1_check(const struct xt_mtchk_param *par)
+{
+	const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
+
+	if (info->flags & ~XT_SOCKET_FLAGS_V1) {
+		pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int socket_mt_v2_check(const struct xt_mtchk_param *par)
+{
+	const struct xt_socket_mtinfo2 *info = (struct xt_socket_mtinfo2 *) par->matchinfo;
+
+	if (info->flags & ~XT_SOCKET_FLAGS_V2) {
+		pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2);
+		return -EINVAL;
+	}
+	return 0;
+}
+
 static struct xt_match socket_mt_reg[] __read_mostly = {
 	{
 		.name		= "socket",
@@ -345,7 +373,8 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
 		.name		= "socket",
 		.revision	= 1,
 		.family		= NFPROTO_IPV4,
-		.match		= socket_mt4_v1,
+		.match		= socket_mt4_v1_v2,
+		.checkentry	= socket_mt_v1_check,
 		.matchsize	= sizeof(struct xt_socket_mtinfo1),
 		.hooks		= (1 << NF_INET_PRE_ROUTING) |
 				  (1 << NF_INET_LOCAL_IN),
@@ -356,7 +385,32 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
 		.name		= "socket",
 		.revision	= 1,
 		.family		= NFPROTO_IPV6,
-		.match		= socket_mt6_v1,
+		.match		= socket_mt6_v1_v2,
+		.checkentry	= socket_mt_v1_check,
+		.matchsize	= sizeof(struct xt_socket_mtinfo1),
+		.hooks		= (1 << NF_INET_PRE_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN),
+		.me		= THIS_MODULE,
+	},
+#endif
+	{
+		.name		= "socket",
+		.revision	= 2,
+		.family		= NFPROTO_IPV4,
+		.match		= socket_mt4_v1_v2,
+		.checkentry	= socket_mt_v2_check,
+		.matchsize	= sizeof(struct xt_socket_mtinfo1),
+		.hooks		= (1 << NF_INET_PRE_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN),
+		.me		= THIS_MODULE,
+	},
+#ifdef XT_SOCKET_HAVE_IPV6
+	{
+		.name		= "socket",
+		.revision	= 2,
+		.family		= NFPROTO_IPV6,
+		.match		= socket_mt6_v1_v2,
+		.checkentry	= socket_mt_v2_check,
 		.matchsize	= sizeof(struct xt_socket_mtinfo1),
 		.hooks		= (1 << NF_INET_PRE_ROUTING) |
 				  (1 << NF_INET_LOCAL_IN),
-- 
cgit v1.2.3


From 166fd7d94afdac040b28c473e45241820ca522a2 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Fri, 21 Jun 2013 09:38:02 -0600
Subject: vfio: hugepage support for vfio_iommu_type1

We currently send all mappings to the iommu in PAGE_SIZE chunks,
which prevents the iommu from enabling support for larger page sizes.
We still need to pin pages, which means we step through them in
PAGE_SIZE chunks, but we can batch up contiguous physical memory
chunks to allow the iommu the opportunity to use larger pages.  The
approach here is a bit different that the one currently used for
legacy KVM device assignment.  Rather than looking at the vma page
size and using that as the maximum size to pass to the iommu, we
instead simply look at whether the next page is physically
contiguous.  This means we might ask the iommu to map a 4MB region,
while legacy KVM might limit itself to a maximum of 2MB.

Splitting our mapping path also allows us to be smarter about locked
memory because we can more easily unwind if the user attempts to
exceed the limit.  Therefore, rather than assuming that a mapping
will result in locked memory, we test each page as it is pinned to
determine whether it locks RAM vs an mmap'd MMIO region.  This should
result in better locking granularity and less locked page fudge
factors in userspace.

The unmap path uses the same algorithm as legacy KVM.  We don't want
to track the pfn for each mapping ourselves, but we need the pfn in
order to unpin pages.  We therefore ask the iommu for the iova to
physical address translation, ask it to unpin a page, and see how many
pages were actually unpinned.  iommus supporting large pages will
often return something bigger than a page here, which we know will be
physically contiguous and we can unpin a batch of pfns.  iommus not
supporting large mappings won't see an improvement in batching here as
they only unmap a page at a time.

With this change, we also make a clarification to the API for mapping
and unmapping DMA.  We can only guarantee unmaps at the same
granularity as used for the original mapping.  In other words,
unmapping a subregion of a previous mapping is not guaranteed and may
result in a larger or smaller unmapping than requested.  The size
field in the unmapping structure is updated to reflect this.
Previously this was unmodified on mapping, always returning the the
requested unmap size.  This is now updated to return the actual unmap
size on success, allowing userspace to appropriately track mappings.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/vfio_iommu_type1.c | 523 ++++++++++++++++++++++++++--------------
 include/uapi/linux/vfio.h       |   8 +-
 2 files changed, 344 insertions(+), 187 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 0e863b3ddcab..6654a7eb42d3 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -60,7 +60,7 @@ struct vfio_dma {
 	struct rb_node		node;
 	dma_addr_t		iova;		/* Device address */
 	unsigned long		vaddr;		/* Process virtual addr */
-	long			npage;		/* Number of pages */
+	size_t			size;		/* Map size (bytes) */
 	int			prot;		/* IOMMU_READ/WRITE */
 };
 
@@ -74,8 +74,6 @@ struct vfio_group {
  * into DMA'ble space using the IOMMU
  */
 
-#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
-
 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
 				      dma_addr_t start, size_t size)
 {
@@ -86,7 +84,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
 
 		if (start + size <= dma->iova)
 			node = node->rb_left;
-		else if (start >= dma->iova + NPAGE_TO_SIZE(dma->npage))
+		else if (start >= dma->iova + dma->size)
 			node = node->rb_right;
 		else
 			return dma;
@@ -104,7 +102,7 @@ static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
 		parent = *link;
 		dma = rb_entry(parent, struct vfio_dma, node);
 
-		if (new->iova + NPAGE_TO_SIZE(new->npage) <= dma->iova)
+		if (new->iova + new->size <= dma->iova)
 			link = &(*link)->rb_left;
 		else
 			link = &(*link)->rb_right;
@@ -144,8 +142,8 @@ static void vfio_lock_acct(long npage)
 	struct vwork *vwork;
 	struct mm_struct *mm;
 
-	if (!current->mm)
-		return; /* process exited */
+	if (!current->mm || !npage)
+		return; /* process exited or nothing to do */
 
 	if (down_write_trylock(&current->mm->mmap_sem)) {
 		current->mm->locked_vm += npage;
@@ -217,33 +215,6 @@ static int put_pfn(unsigned long pfn, int prot)
 	return 0;
 }
 
-/* Unmap DMA region */
-static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
-			     long npage, int prot)
-{
-	long i, unlocked = 0;
-
-	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
-		unsigned long pfn;
-
-		pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
-		if (pfn) {
-			iommu_unmap(iommu->domain, iova, PAGE_SIZE);
-			unlocked += put_pfn(pfn, prot);
-		}
-	}
-	return unlocked;
-}
-
-static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
-			   long npage, int prot)
-{
-	long unlocked;
-
-	unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot);
-	vfio_lock_acct(-unlocked);
-}
-
 static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 {
 	struct page *page[1];
@@ -270,79 +241,142 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 	return ret;
 }
 
-/* Map DMA region */
-static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova,
-			  unsigned long vaddr, long npage, int prot)
+/*
+ * Attempt to pin pages.  We really don't want to track all the pfns and
+ * the iommu can only map chunks of consecutive pfns anyway, so get the
+ * first page and all consecutive pages with the same locking.
+ */
+static long vfio_pin_pages(unsigned long vaddr, long npage,
+			   int prot, unsigned long *pfn_base)
 {
-	dma_addr_t start = iova;
-	long i, locked = 0;
-	int ret;
+	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	bool lock_cap = capable(CAP_IPC_LOCK);
+	long ret, i;
 
-	/* Verify that pages are not already mapped */
-	for (i = 0; i < npage; i++, iova += PAGE_SIZE)
-		if (iommu_iova_to_phys(iommu->domain, iova))
-			return -EBUSY;
+	if (!current->mm)
+		return -ENODEV;
 
-	iova = start;
+	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
+	if (ret)
+		return ret;
 
-	if (iommu->cache)
-		prot |= IOMMU_CACHE;
+	if (is_invalid_reserved_pfn(*pfn_base))
+		return 1;
 
-	/*
-	 * XXX We break mappings into pages and use get_user_pages_fast to
-	 * pin the pages in memory.  It's been suggested that mlock might
-	 * provide a more efficient mechanism, but nothing prevents the
-	 * user from munlocking the pages, which could then allow the user
-	 * access to random host memory.  We also have no guarantee from the
-	 * IOMMU API that the iommu driver can unmap sub-pages of previous
-	 * mappings.  This means we might lose an entire range if a single
-	 * page within it is unmapped.  Single page mappings are inefficient,
-	 * but provide the most flexibility for now.
-	 */
-	for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
+	if (!lock_cap && current->mm->locked_vm + 1 > limit) {
+		put_pfn(*pfn_base, prot);
+		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
+			limit << PAGE_SHIFT);
+		return -ENOMEM;
+	}
+
+	/* Lock all the consecutive pages from pfn_base */
+	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 		unsigned long pfn = 0;
 
 		ret = vaddr_get_pfn(vaddr, prot, &pfn);
-		if (ret) {
-			__vfio_dma_do_unmap(iommu, start, i, prot);
-			return ret;
+		if (ret)
+			break;
+
+		if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
+			put_pfn(pfn, prot);
+			break;
 		}
 
+		if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
+			put_pfn(pfn, prot);
+			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
+				__func__, limit << PAGE_SHIFT);
+			break;
+		}
+	}
+
+	vfio_lock_acct(i);
+
+	return i;
+}
+
+static long vfio_unpin_pages(unsigned long pfn, long npage,
+			     int prot, bool do_accounting)
+{
+	unsigned long unlocked = 0;
+	long i;
+
+	for (i = 0; i < npage; i++)
+		unlocked += put_pfn(pfn++, prot);
+
+	if (do_accounting)
+		vfio_lock_acct(-unlocked);
+
+	return unlocked;
+}
+
+static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
+			    dma_addr_t iova, size_t *size)
+{
+	dma_addr_t start = iova, end = iova + *size;
+	long unlocked = 0;
+
+	while (iova < end) {
+		size_t unmapped;
+		phys_addr_t phys;
+
 		/*
-		 * Only add actual locked pages to accounting
-		 * XXX We're effectively marking a page locked for every
-		 * IOVA page even though it's possible the user could be
-		 * backing multiple IOVAs with the same vaddr.  This over-
-		 * penalizes the user process, but we currently have no
-		 * easy way to do this properly.
+		 * We use the IOMMU to track the physical address.  This
+		 * saves us from having a lot more entries in our mapping
+		 * tree.  The downside is that we don't track the size
+		 * used to do the mapping.  We request unmap of a single
+		 * page, but expect IOMMUs that support large pages to
+		 * unmap a larger chunk.
 		 */
-		if (!is_invalid_reserved_pfn(pfn))
-			locked++;
-
-		ret = iommu_map(iommu->domain, iova,
-				(phys_addr_t)pfn << PAGE_SHIFT,
-				PAGE_SIZE, prot);
-		if (ret) {
-			/* Back out mappings on error */
-			put_pfn(pfn, prot);
-			__vfio_dma_do_unmap(iommu, start, i, prot);
-			return ret;
+		phys = iommu_iova_to_phys(iommu->domain, iova);
+		if (WARN_ON(!phys)) {
+			iova += PAGE_SIZE;
+			continue;
 		}
+
+		unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
+		if (!unmapped)
+			break;
+
+		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
+					     unmapped >> PAGE_SHIFT,
+					     dma->prot, false);
+		iova += unmapped;
 	}
-	vfio_lock_acct(locked);
+
+	vfio_lock_acct(-unlocked);
+
+	*size = iova - start;
+
 	return 0;
 }
 
 static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
-				   size_t size, struct vfio_dma *dma)
+				   size_t *size, struct vfio_dma *dma)
 {
+	size_t offset, overlap, tmp;
 	struct vfio_dma *split;
-	long npage_lo, npage_hi;
+	int ret;
+
+	/*
+	 * Existing dma region is completely covered, unmap all.  This is
+	 * the likely case since userspace tends to map and unmap buffers
+	 * in one shot rather than multiple mappings within a buffer.
+	 */
+	if (likely(start <= dma->iova &&
+		   start + *size >= dma->iova + dma->size)) {
+		*size = dma->size;
+		ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
+		if (ret)
+			return ret;
+
+		/*
+		 * Did we remove more than we have?  Should never happen
+		 * since a vfio_dma is contiguous in iova and vaddr.
+		 */
+		WARN_ON(*size != dma->size);
 
-	/* Existing dma region is completely covered, unmap all */
-	if (start <= dma->iova &&
-	    start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
-		vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
 		vfio_remove_dma(iommu, dma);
 		kfree(dma);
 		return 0;
@@ -350,47 +384,79 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
 
 	/* Overlap low address of existing range */
 	if (start <= dma->iova) {
-		size_t overlap;
+		overlap = start + *size - dma->iova;
+		ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
+		if (ret)
+			return ret;
 
-		overlap = start + size - dma->iova;
-		npage_lo = overlap >> PAGE_SHIFT;
+		vfio_remove_dma(iommu, dma);
 
-		vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot);
-		dma->iova += overlap;
-		dma->vaddr += overlap;
-		dma->npage -= npage_lo;
+		/*
+		 * Check, we may have removed to whole vfio_dma.  If not
+		 * fixup and re-insert.
+		 */
+		if (overlap < dma->size) {
+			dma->iova += overlap;
+			dma->vaddr += overlap;
+			dma->size -= overlap;
+			vfio_insert_dma(iommu, dma);
+		}
+		*size = overlap;
 		return 0;
 	}
 
 	/* Overlap high address of existing range */
-	if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
-		size_t overlap;
+	if (start + *size >= dma->iova + dma->size) {
+		offset = start - dma->iova;
+		overlap = dma->size - offset;
 
-		overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start;
-		npage_hi = overlap >> PAGE_SHIFT;
+		ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
+		if (ret)
+			return ret;
+
+		/*
+		 * We may have unmapped the entire vfio_dma if the user is
+		 * trying to unmap a sub-region of what was originally
+		 * mapped.  If anything left, we can resize in place since
+		 * iova is unchanged.
+		 */
+		if (overlap < dma->size)
+			dma->size -= overlap;
+		else
+			vfio_remove_dma(iommu, dma);
 
-		vfio_dma_unmap(iommu, start, npage_hi, dma->prot);
-		dma->npage -= npage_hi;
+		*size = overlap;
 		return 0;
 	}
 
 	/* Split existing */
-	npage_lo = (start - dma->iova) >> PAGE_SHIFT;
-	npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo;
+	offset = start - dma->iova;
 
-	split = kzalloc(sizeof *split, GFP_KERNEL);
-	if (!split)
-		return -ENOMEM;
+	ret = vfio_unmap_unpin(iommu, dma, start, size);
+	if (ret)
+		return ret;
 
-	vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot);
+	WARN_ON(!*size);
+	tmp = dma->size;
 
-	dma->npage = npage_lo;
+	/*
+	 * Resize the lower vfio_dma in place, insert new for remaining
+	 * upper segment.
+	 */
+	dma->size = offset;
+
+	if (offset + *size < tmp) {
+		split = kzalloc(sizeof(*split), GFP_KERNEL);
+		if (!split)
+			return -ENOMEM;
+
+		split->size = tmp - offset - *size;
+		split->iova = dma->iova + offset + *size;
+		split->vaddr = dma->vaddr + offset + *size;
+		split->prot = dma->prot;
+		vfio_insert_dma(iommu, split);
+	}
 
-	split->npage = npage_hi;
-	split->iova = start + size;
-	split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
-	split->prot = dma->prot;
-	vfio_insert_dma(iommu, split);
 	return 0;
 }
 
@@ -399,6 +465,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 {
 	uint64_t mask;
 	struct vfio_dma *dma;
+	size_t unmapped = 0, size;
 	int ret = 0;
 
 	mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
@@ -408,30 +475,66 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 	if (unmap->size & mask)
 		return -EINVAL;
 
-	/* XXX We still break these down into PAGE_SIZE */
 	WARN_ON(mask & PAGE_MASK);
 
 	mutex_lock(&iommu->lock);
 
-	while (!ret && (dma = vfio_find_dma(iommu,
-					    unmap->iova, unmap->size)))
-		ret = vfio_remove_dma_overlap(iommu, unmap->iova,
-					      unmap->size, dma);
+	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
+		size = unmap->size;
+		ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
+		if (ret)
+			break;
+		unmapped += size;
+	}
 
 	mutex_unlock(&iommu->lock);
+
+	/*
+	 * We may unmap more than requested, update the unmap struct so
+	 * userspace can know.
+	 */
+	unmap->size = unmapped;
+
+	return ret;
+}
+
+/*
+ * Turns out AMD IOMMU has a page table bug where it won't map large pages
+ * to a region that previously mapped smaller pages.  This should be fixed
+ * soon, so this is just a temporary workaround to break mappings down into
+ * PAGE_SIZE.  Better to map smaller pages than nothing.
+ */
+static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova,
+			  unsigned long pfn, long npage, int prot)
+{
+	long i;
+	int ret;
+
+	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
+		ret = iommu_map(iommu->domain, iova,
+				(phys_addr_t)pfn << PAGE_SHIFT,
+				PAGE_SIZE, prot);
+		if (ret)
+			break;
+	}
+
+	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
+		iommu_unmap(iommu->domain, iova, PAGE_SIZE);
+
 	return ret;
 }
 
 static int vfio_dma_do_map(struct vfio_iommu *iommu,
 			   struct vfio_iommu_type1_dma_map *map)
 {
-	struct vfio_dma *dma;
-	dma_addr_t iova = map->iova;
-	unsigned long locked, lock_limit, vaddr = map->vaddr;
+	dma_addr_t end, iova;
+	unsigned long vaddr = map->vaddr;
 	size_t size = map->size;
+	long npage;
 	int ret = 0, prot = 0;
 	uint64_t mask;
-	long npage;
+
+	end = map->iova + map->size;
 
 	mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
 
@@ -444,92 +547,138 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	if (!prot)
 		return -EINVAL; /* No READ/WRITE? */
 
+	if (iommu->cache)
+		prot |= IOMMU_CACHE;
+
 	if (vaddr & mask)
 		return -EINVAL;
-	if (iova & mask)
+	if (map->iova & mask)
 		return -EINVAL;
-	if (size & mask)
+	if (!map->size || map->size & mask)
 		return -EINVAL;
 
-	/* XXX We still break these down into PAGE_SIZE */
 	WARN_ON(mask & PAGE_MASK);
 
 	/* Don't allow IOVA wrap */
-	if (iova + size && iova + size < iova)
+	if (end && end < map->iova)
 		return -EINVAL;
 
 	/* Don't allow virtual address wrap */
-	if (vaddr + size && vaddr + size < vaddr)
-		return -EINVAL;
-
-	npage = size >> PAGE_SHIFT;
-	if (!npage)
+	if (vaddr + map->size && vaddr + map->size < vaddr)
 		return -EINVAL;
 
-	dma = kzalloc(sizeof *dma, GFP_KERNEL);
-	if (!dma)
-		return -ENOMEM;
-
 	mutex_lock(&iommu->lock);
 
-	if (vfio_find_dma(iommu, iova, size)) {
-		ret = -EBUSY;
-		goto out_lock;
+	if (vfio_find_dma(iommu, map->iova, map->size)) {
+		mutex_unlock(&iommu->lock);
+		return -EEXIST;
 	}
 
-	/* account for locked pages */
-	locked = current->mm->locked_vm + npage;
-	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
-		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
-			__func__, rlimit(RLIMIT_MEMLOCK));
-		ret = -ENOMEM;
-		goto out_lock;
-	}
+	for (iova = map->iova; iova < end; iova += size, vaddr += size) {
+		struct vfio_dma *dma = NULL;
+		unsigned long pfn;
+		long i;
+
+		/* Pin a contiguous chunk of memory */
+		npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT,
+				       prot, &pfn);
+		if (npage <= 0) {
+			WARN_ON(!npage);
+			ret = (int)npage;
+			break;
+		}
 
-	ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot);
-	if (ret)
-		goto out_lock;
-
-	dma->npage = npage;
-	dma->iova = iova;
-	dma->vaddr = vaddr;
-	dma->prot = prot;
-
-	/* Check if we abut a region below - nothing below 0 */
-	if (iova) {
-		struct vfio_dma *tmp = vfio_find_dma(iommu, iova - 1, 1);
-		if (tmp && tmp->prot == prot &&
-		    tmp->vaddr + NPAGE_TO_SIZE(tmp->npage) == vaddr) {
-			vfio_remove_dma(iommu, tmp);
-			dma->npage += tmp->npage;
-			dma->iova = iova = tmp->iova;
-			dma->vaddr = vaddr = tmp->vaddr;
-			kfree(tmp);
-			npage = dma->npage;
-			size = NPAGE_TO_SIZE(npage);
+		/* Verify pages are not already mapped */
+		for (i = 0; i < npage; i++) {
+			if (iommu_iova_to_phys(iommu->domain,
+					       iova + (i << PAGE_SHIFT))) {
+				vfio_unpin_pages(pfn, npage, prot, true);
+				ret = -EBUSY;
+				break;
+			}
+		}
+
+		ret = iommu_map(iommu->domain, iova,
+				(phys_addr_t)pfn << PAGE_SHIFT,
+				npage << PAGE_SHIFT, prot);
+		if (ret) {
+			if (ret != -EBUSY ||
+			    map_try_harder(iommu, iova, pfn, npage, prot)) {
+				vfio_unpin_pages(pfn, npage, prot, true);
+				break;
+			}
+		}
+
+		size = npage << PAGE_SHIFT;
+
+		/*
+		 * Check if we abut a region below - nothing below 0.
+		 * This is the most likely case when mapping chunks of
+		 * physically contiguous regions within a virtual address
+		 * range.  Update the abutting entry in place since iova
+		 * doesn't change.
+		 */
+		if (likely(iova)) {
+			struct vfio_dma *tmp;
+			tmp = vfio_find_dma(iommu, iova - 1, 1);
+			if (tmp && tmp->prot == prot &&
+			    tmp->vaddr + tmp->size == vaddr) {
+				tmp->size += size;
+
+				iova = tmp->iova;
+				size = tmp->size;
+				vaddr = tmp->vaddr;
+				dma = tmp;
+			}
+		}
+
+		/* Check if we abut a region above - nothing above ~0 + 1 */
+		if (likely(iova + size)) {
+			struct vfio_dma *tmp;
+
+			tmp = vfio_find_dma(iommu, iova + size, 1);
+			if (tmp && tmp->prot == prot &&
+			    tmp->vaddr == vaddr + size) {
+				vfio_remove_dma(iommu, tmp);
+				if (dma)
+					dma->size += tmp->size;
+				else
+					size += tmp->size;
+				kfree(tmp);
+			}
 		}
-	}
 
-	/* Check if we abut a region above - nothing above ~0 + 1 */
-	if (iova + size) {
-		struct vfio_dma *tmp = vfio_find_dma(iommu, iova + size, 1);
-		if (tmp && tmp->prot == prot &&
-		    tmp->vaddr == vaddr + size) {
-			vfio_remove_dma(iommu, tmp);
-			dma->npage += tmp->npage;
-			kfree(tmp);
-			npage = dma->npage;
-			size = NPAGE_TO_SIZE(npage);
+		if (!dma) {
+			dma = kzalloc(sizeof(*dma), GFP_KERNEL);
+			if (!dma) {
+				iommu_unmap(iommu->domain, iova, size);
+				vfio_unpin_pages(pfn, npage, prot, true);
+				ret = -ENOMEM;
+				break;
+			}
+
+			dma->size = size;
+			dma->iova = iova;
+			dma->vaddr = vaddr;
+			dma->prot = prot;
+			vfio_insert_dma(iommu, dma);
 		}
 	}
 
-	vfio_insert_dma(iommu, dma);
+	if (ret) {
+		struct vfio_dma *tmp;
+		iova = map->iova;
+		size = map->size;
+		while ((tmp = vfio_find_dma(iommu, iova, size))) {
+			if (vfio_remove_dma_overlap(iommu, iova, &size, tmp)) {
+				pr_warn("%s: Error rolling back failed map\n",
+					__func__);
+				break;
+			}
+		}
+	}
 
-out_lock:
 	mutex_unlock(&iommu->lock);
-	if (ret)
-		kfree(dma);
 	return ret;
 }
 
@@ -651,9 +800,8 @@ static void vfio_iommu_type1_release(void *iommu_data)
 
 	while ((node = rb_first(&iommu->dma_list))) {
 		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
-		vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
-		vfio_remove_dma(iommu, dma);
-		kfree(dma);
+		size_t size = dma->size;
+		vfio_remove_dma_overlap(iommu, dma->iova, &size, dma);
 	}
 
 	iommu_domain_free(iommu->domain);
@@ -708,6 +856,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
 	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
 		struct vfio_iommu_type1_dma_unmap unmap;
+		long ret;
 
 		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
 
@@ -717,7 +866,11 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 		if (unmap.argsz < minsz || unmap.flags)
 			return -EINVAL;
 
-		return vfio_dma_do_unmap(iommu, &unmap);
+		ret = vfio_dma_do_unmap(iommu, &unmap);
+		if (ret)
+			return ret;
+
+		return copy_to_user((void __user *)arg, &unmap, minsz);
 	}
 
 	return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 284ff2436829..513600612995 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -361,10 +361,14 @@ struct vfio_iommu_type1_dma_map {
 #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
 
 /**
- * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_dma_unmap)
+ * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14,
+ *							struct vfio_dma_unmap)
  *
  * Unmap IO virtual addresses using the provided struct vfio_dma_unmap.
- * Caller sets argsz.
+ * Caller sets argsz.  The actual unmapped size is returned in the size
+ * field.  No guarantee is made to the user that arbitrary unmaps of iova
+ * or size different from those used in the original mapping call will
+ * succeed.
  */
 struct vfio_iommu_type1_dma_unmap {
 	__u32	argsz;
-- 
cgit v1.2.3


From 77e2af0312b12dccd5043a7cc9cd49ab6a212996 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Fri, 21 Jun 2013 19:38:06 +0200
Subject: net: if_arp: add ARPHRD_NETLINK type

This small patch adds the definition of ARPHRD_NETLINK which can for
example be used by netlink monitoring devices as device type. So that
sockaddr_ll can pick it up and based on that choose the correct packet
dissector.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_arp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h
index 82c7d1bdadeb..d7fea3496f32 100644
--- a/include/uapi/linux/if_arp.h
+++ b/include/uapi/linux/if_arp.h
@@ -93,6 +93,7 @@
 #define ARPHRD_PHONET_PIPE 821		/* PhoNet pipe header		*/
 #define ARPHRD_CAIF	822		/* CAIF media type		*/
 #define ARPHRD_IP6GRE	823		/* GRE over IPv6		*/
+#define ARPHRD_NETLINK	824		/* Netlink header		*/
 
 #define ARPHRD_VOID	  0xFFFF	/* Void type, nothing is known */
 #define ARPHRD_NONE	  0xFFFE	/* zero header length */
-- 
cgit v1.2.3


From eba3b5a78799d21dea05118b294524958f0ab592 Mon Sep 17 00:00:00 2001
From: Alexander Frolkin <avf@eldamar.org.uk>
Date: Wed, 19 Jun 2013 10:54:25 +0100
Subject: ipvs: SH fallback and L4 hashing

By default the SH scheduler rejects connections that are hashed onto a
realserver of weight 0.  This patch adds a flag to make SH choose a
different realserver in this case, instead of rejecting the connection.

The patch also adds a flag to make SH include the source port (TCP, UDP,
SCTP) in the hash as well as the source address.  This basically allows
for deterministic round-robin load balancing (i.e., where any director
in a cluster of directors with identical config will send the same
packet the same way).

The flags are service flags (IP_VS_SVC_F_SCHED*) so that these options
can be set per service.  They are set using a new option to ipvsadm.

Signed-off-by: Alexander Frolkin <avf@eldamar.org.uk>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/uapi/linux/ip_vs.h    |   6 +++
 net/netfilter/ipvs/ip_vs_sh.c | 100 +++++++++++++++++++++++++++++++++++-------
 2 files changed, 91 insertions(+), 15 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index a24537725e80..29458223d044 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -20,6 +20,12 @@
 #define IP_VS_SVC_F_PERSISTENT	0x0001		/* persistent port */
 #define IP_VS_SVC_F_HASHED	0x0002		/* hashed entry */
 #define IP_VS_SVC_F_ONEPACKET	0x0004		/* one-packet scheduling */
+#define IP_VS_SVC_F_SCHED1	0x0008		/* scheduler flag 1 */
+#define IP_VS_SVC_F_SCHED2	0x0010		/* scheduler flag 2 */
+#define IP_VS_SVC_F_SCHED3	0x0020		/* scheduler flag 3 */
+
+#define IP_VS_SVC_F_SCHED_SH_FALLBACK	IP_VS_SVC_F_SCHED1 /* SH fallback */
+#define IP_VS_SVC_F_SCHED_SH_PORT	IP_VS_SVC_F_SCHED2 /* SH use port */
 
 /*
  *      Destination Server Flags
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index e0d5d1653566..f16c027df15b 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -48,6 +48,10 @@
 
 #include <net/ip_vs.h>
 
+#include <net/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+
 
 /*
  *      IPVS SH bucket
@@ -71,10 +75,19 @@ struct ip_vs_sh_state {
 	struct ip_vs_sh_bucket		buckets[IP_VS_SH_TAB_SIZE];
 };
 
+/* Helper function to determine if server is unavailable */
+static inline bool is_unavailable(struct ip_vs_dest *dest)
+{
+	return atomic_read(&dest->weight) <= 0 ||
+	       dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
 /*
  *	Returns hash value for IPVS SH entry
  */
-static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
+static inline unsigned int
+ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
+		 __be16 port, unsigned int offset)
 {
 	__be32 addr_fold = addr->ip;
 
@@ -83,7 +96,8 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK;
+	return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+		IP_VS_SH_TAB_MASK;
 }
 
 
@@ -91,12 +105,42 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
  *      Get ip_vs_dest associated with supplied parameters.
  */
 static inline struct ip_vs_dest *
-ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr)
+ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+	     const union nf_inet_addr *addr, __be16 port)
 {
-	return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest);
+	unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
+	struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);
+
+	return (!dest || is_unavailable(dest)) ? NULL : dest;
 }
 
 
+/* As ip_vs_sh_get, but with fallback if selected server is unavailable */
+static inline struct ip_vs_dest *
+ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+		      const union nf_inet_addr *addr, __be16 port)
+{
+	unsigned int offset;
+	unsigned int hash;
+	struct ip_vs_dest *dest;
+
+	for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
+		hash = ip_vs_sh_hashkey(svc->af, addr, port, offset);
+		dest = rcu_dereference(s->buckets[hash].dest);
+		if (!dest)
+			break;
+		if (is_unavailable(dest))
+			IP_VS_DBG_BUF(6, "SH: selected unavailable server "
+				      "%s:%d (offset %d)",
+				      IP_VS_DBG_ADDR(svc->af, &dest->addr),
+				      ntohs(dest->port), offset);
+		else
+			return dest;
+	}
+
+	return NULL;
+}
+
 /*
  *      Assign all the hash buckets of the specified table with the service.
  */
@@ -213,13 +257,33 @@ static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
 }
 
 
-/*
- *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
- *      consider that the server is overloaded here.
- */
-static inline int is_overloaded(struct ip_vs_dest *dest)
+/* Helper function to get port number */
+static inline __be16
+ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
 {
-	return dest->flags & IP_VS_DEST_F_OVERLOAD;
+	__be16 port;
+	struct tcphdr _tcph, *th;
+	struct udphdr _udph, *uh;
+	sctp_sctphdr_t _sctph, *sh;
+
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+		port = th->source;
+		break;
+	case IPPROTO_UDP:
+		uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
+		port = uh->source;
+		break;
+	case IPPROTO_SCTP:
+		sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+		port = sh->source;
+		break;
+	default:
+		port = 0;
+	}
+
+	return port;
 }
 
 
@@ -232,15 +296,21 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
 {
 	struct ip_vs_dest *dest;
 	struct ip_vs_sh_state *s;
+	__be16 port = 0;
 
 	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
 
+	if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
+		port = ip_vs_sh_get_port(skb, iph);
+
 	s = (struct ip_vs_sh_state *) svc->sched_data;
-	dest = ip_vs_sh_get(svc->af, s, &iph->saddr);
-	if (!dest
-	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
-	    || atomic_read(&dest->weight) <= 0
-	    || is_overloaded(dest)) {
+
+	if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
+		dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port);
+	else
+		dest = ip_vs_sh_get(svc, s, &iph->saddr, port);
+
+	if (!dest) {
 		ip_vs_scheduler_err(svc, "no destination available");
 		return NULL;
 	}
-- 
cgit v1.2.3


From d09bbfd2a8408a995419dff0d2ba906013cf4cc9 Mon Sep 17 00:00:00 2001
From: David Herrmann <dh.herrmann@gmail.com>
Date: Sat, 15 Jun 2013 15:32:44 +0200
Subject: input: document gamepad API and add extra keycodes

Until today all gamepad input drivers report their data differently. It is
nearly impossible to write applications for more than one device in a
generic way. Therefore, this patch introduces a uniform gamepad API which
will be used for all new drivers.

Instead of mapping buttons by their labels, we now map them by position.
This allows applications to work with any gamepad regardless of the labels
on the buttons. Furthermore, we standardize the ABS_* codes for analog
triggers and sticks.

For D-Pads the long overdue BTN_DPAD_* codes are introduced. They should
be fairly obvious how to use. To avoid confusion, the action buttons now
have BTN_EAST/SOUTH/WEST/NORTH aliases.

Reported-by: Todd Showalter <todd@electronjump.com>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/uapi/linux/input.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h
index 4649ee35b605..8f82ad29fc30 100644
--- a/include/uapi/linux/input.h
+++ b/include/uapi/linux/input.h
@@ -507,10 +507,14 @@ struct input_keymap_entry {
 
 #define BTN_GAMEPAD		0x130
 #define BTN_A			0x130
+#define BTN_SOUTH		0x130
 #define BTN_B			0x131
+#define BTN_EAST		0x131
 #define BTN_C			0x132
 #define BTN_X			0x133
+#define BTN_NORTH		0x133
 #define BTN_Y			0x134
+#define BTN_WEST		0x134
 #define BTN_Z			0x135
 #define BTN_TL			0x136
 #define BTN_TR			0x137
@@ -707,6 +711,11 @@ struct input_keymap_entry {
 #define KEY_ATTENDANT_TOGGLE	0x21d	/* Attendant call on or off */
 #define KEY_LIGHTS_TOGGLE	0x21e	/* Reading light on or off */
 
+#define BTN_DPAD_UP		0x220
+#define BTN_DPAD_DOWN		0x221
+#define BTN_DPAD_LEFT		0x222
+#define BTN_DPAD_RIGHT		0x223
+
 #define BTN_TRIGGER_HAPPY		0x2c0
 #define BTN_TRIGGER_HAPPY1		0x2c0
 #define BTN_TRIGGER_HAPPY2		0x2c1
-- 
cgit v1.2.3


From 701ba533f6587aaf402a9bb732548503263d2b23 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Thu, 27 Jun 2013 11:54:51 +0200
Subject: Input: make gamepad API keycodes more clear

Shuffle the defines around so that it is clear that BTN_A, BTN_B, etc are
legacy definitions and not an accidental typos that need their own key codes.

Suggested-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/uapi/linux/input.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h
index 8f82ad29fc30..d584047b072b 100644
--- a/include/uapi/linux/input.h
+++ b/include/uapi/linux/input.h
@@ -506,15 +506,15 @@ struct input_keymap_entry {
 #define BTN_DEAD		0x12f
 
 #define BTN_GAMEPAD		0x130
-#define BTN_A			0x130
 #define BTN_SOUTH		0x130
-#define BTN_B			0x131
+#define BTN_A			BTN_SOUTH
 #define BTN_EAST		0x131
+#define BTN_B			BTN_EAST
 #define BTN_C			0x132
-#define BTN_X			0x133
 #define BTN_NORTH		0x133
-#define BTN_Y			0x134
+#define BTN_X			BTN_NORTH
 #define BTN_WEST		0x134
+#define BTN_Y			BTN_WEST
 #define BTN_Z			0x135
 #define BTN_TL			0x136
 #define BTN_TR			0x137
-- 
cgit v1.2.3


From 496e4ae7dc944faa1721bfda7e9d834d5611a874 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 29 Jun 2013 14:15:47 +0200
Subject: netfilter: nf_queue: add NFQA_SKB_CSUM_NOTVERIFIED info flag

The common case is that TCP/IP checksums have already been
verified, e.g. by hardware (rx checksum offload), or conntrack.

Userspace can use this flag to determine when the checksum
has not been validated yet.

If the flag is set, this doesn't necessarily mean that the packet has
an invalid checksum, e.g. if NIC doesn't support rx checksum.

Userspace that sucessfully enabled NFQA_CFG_F_GSO queue feature flag can
infer that IP/TCP checksum has already been validated if either the
SKB_INFO attribute is not present or the NFQA_SKB_CSUM_NOTVERIFIED
flag is unset.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nfnetlink_queue.h |  2 ++
 net/netfilter/nfnetlink_queue_core.c           | 16 ++++++++++++++--
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
index a2308ae5a73d..3a9b92147339 100644
--- a/include/uapi/linux/netfilter/nfnetlink_queue.h
+++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
@@ -105,5 +105,7 @@ enum nfqnl_attr_config {
 #define NFQA_SKB_CSUMNOTREADY (1 << 0)
 /* packet is GSO (i.e., exceeds device mtu) */
 #define NFQA_SKB_GSO (1 << 1)
+/* csum not validated (incoming device doesn't support hw checksum, etc.) */
+#define NFQA_SKB_CSUM_NOTVERIFIED (1 << 2)
 
 #endif /* _NFNETLINK_QUEUE_H */
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 299a48ae5dc9..971ea145ab3e 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -280,12 +280,17 @@ nfqnl_zcopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
 	skb_shinfo(to)->nr_frags = j;
 }
 
-static int nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet)
+static int
+nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet,
+		      bool csum_verify)
 {
 	__u32 flags = 0;
 
 	if (packet->ip_summed == CHECKSUM_PARTIAL)
 		flags = NFQA_SKB_CSUMNOTREADY;
+	else if (csum_verify)
+		flags = NFQA_SKB_CSUM_NOTVERIFIED;
+
 	if (skb_is_gso(packet))
 		flags |= NFQA_SKB_GSO;
 
@@ -310,6 +315,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 	struct net_device *outdev;
 	struct nf_conn *ct = NULL;
 	enum ip_conntrack_info uninitialized_var(ctinfo);
+	bool csum_verify;
 
 	size =    nlmsg_total_size(sizeof(struct nfgenmsg))
 		+ nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
@@ -327,6 +333,12 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 	if (entskb->tstamp.tv64)
 		size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
 
+	if (entry->hook <= NF_INET_FORWARD ||
+	   (entry->hook == NF_INET_POST_ROUTING && entskb->sk == NULL))
+		csum_verify = !skb_csum_unnecessary(entskb);
+	else
+		csum_verify = false;
+
 	outdev = entry->outdev;
 
 	switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) {
@@ -476,7 +488,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 	    nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len)))
 		goto nla_put_failure;
 
-	if (nfqnl_put_packet_info(skb, entskb))
+	if (nfqnl_put_packet_info(skb, entskb, csum_verify))
 		goto nla_put_failure;
 
 	if (data_len) {
-- 
cgit v1.2.3


From b1a5a34bd0b8767ea689e68f8ea513e9710b671e Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Sat, 29 Jun 2013 00:15:51 +0800
Subject: net: Swap ver and type in pppoe_hdr

Ver and type in pppoe_hdr should be swapped as defined by RFC2516
section-4.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_pppox.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_pppox.h b/include/uapi/linux/if_pppox.h
index 0b46fd57c8f6..e36a4aecd311 100644
--- a/include/uapi/linux/if_pppox.h
+++ b/include/uapi/linux/if_pppox.h
@@ -135,11 +135,11 @@ struct pppoe_tag {
 
 struct pppoe_hdr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
-	__u8 ver : 4;
 	__u8 type : 4;
+	__u8 ver : 4;
 #elif defined(__BIG_ENDIAN_BITFIELD)
-	__u8 type : 4;
 	__u8 ver : 4;
+	__u8 type : 4;
 #else
 #error	"Please fix <asm/byteorder.h>"
 #endif
-- 
cgit v1.2.3


From 29000caecbe87b6b66f144f72111f0d02fbbf0c1 Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Wed, 3 Jul 2013 15:08:12 -0700
Subject: ptrace: add ability to get/set signal-blocked mask

crtools uses a parasite code for dumping processes.  The parasite code is
injected into a process with help PTRACE_SEIZE.

Currently crtools blocks signals from a parasite code.  If a process has
pending signals, crtools wait while a process handles these signals.

This method is not suitable for stopped tasks.  A stopped task can have a
few pending signals, when we will try to execute a parasite code, we will
need to drop SIGSTOP, but all other signals must remain pending, because a
state of processes must not be changed during checkpointing.

This patch adds two ptrace commands to set/get signal-blocked mask.

I think gdb can use this commands too.

[akpm@linux-foundation.org: be consistent with brace layout]
Signed-off-by: Andrey Vagin <avagin@openvz.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/ptrace.h |  3 +++
 kernel/ptrace.c             | 44 ++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 45 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index 52ebcc89f306..cf1019e15f5b 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -61,6 +61,9 @@ struct ptrace_peeksiginfo_args {
 	__s32 nr;	/* how may siginfos to take */
 };
 
+#define PTRACE_GETSIGMASK	0x420a
+#define PTRACE_SETSIGMASK	0x420b
+
 /* Read signals from a shared (process wide) queue */
 #define PTRACE_PEEKSIGINFO_SHARED	(1 << 0)
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 335a7ae697f5..ba5e6cea181a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -844,6 +844,47 @@ int ptrace_request(struct task_struct *child, long request,
 			ret = ptrace_setsiginfo(child, &siginfo);
 		break;
 
+	case PTRACE_GETSIGMASK:
+		if (addr != sizeof(sigset_t)) {
+			ret = -EINVAL;
+			break;
+		}
+
+		if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t)))
+			ret = -EFAULT;
+		else
+			ret = 0;
+
+		break;
+
+	case PTRACE_SETSIGMASK: {
+		sigset_t new_set;
+
+		if (addr != sizeof(sigset_t)) {
+			ret = -EINVAL;
+			break;
+		}
+
+		if (copy_from_user(&new_set, datavp, sizeof(sigset_t))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+		/*
+		 * Every thread does recalc_sigpending() after resume, so
+		 * retarget_shared_pending() and recalc_sigpending() are not
+		 * called here.
+		 */
+		spin_lock_irq(&child->sighand->siglock);
+		child->blocked = new_set;
+		spin_unlock_irq(&child->sighand->siglock);
+
+		ret = 0;
+		break;
+	}
+
 	case PTRACE_INTERRUPT:
 		/*
 		 * Stop tracee without any side-effect on signal or job
@@ -948,8 +989,7 @@ int ptrace_request(struct task_struct *child, long request,
 
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	case PTRACE_GETREGSET:
-	case PTRACE_SETREGSET:
-	{
+	case PTRACE_SETREGSET: {
 		struct iovec kiov;
 		struct iovec __user *uiov = datavp;
 
-- 
cgit v1.2.3


From 62525a00b87cc967bce9779d63fcc84fb9199130 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 8 Jul 2013 10:33:37 +0930
Subject: virtio: VIRTIO_F_ANY_LAYOUT feature

Also known as the "no really, I read the spec" bit.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/uapi/linux/virtio_config.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index b7cda390fd00..3ce768c6910d 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -51,4 +51,7 @@
  * suppressed them? */
 #define VIRTIO_F_NOTIFY_ON_EMPTY	24
 
+/* Can the device handle any descriptor layout? */
+#define VIRTIO_F_ANY_LAYOUT		27
+
 #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
-- 
cgit v1.2.3


From 6e5b93ee55d401f1619092fb675b57c28c9ed7ec Mon Sep 17 00:00:00 2001
From: Mike Lockwood <lockwood@android.com>
Date: Mon, 8 Jul 2013 16:00:46 -0700
Subject: fatfs: add FAT_IOCTL_GET_VOLUME_ID

This patch, originally from Android kernel, adds vfat ioctl command
FAT_IOCTL_GET_VOLUME_ID, with this command we can get the vfat volume ID
using following code:

	ioctl(fd, FAT_IOCTL_GET_VOLUME_ID, &volume_ID)

This patch is a modified version of the patch by Mike Lockwood, with
changes from Dmitry Pervushin, who noticed the original patch makes some
volume IDs abiguous with error returns: for example, if volume id is
0xFFFFFDAD, that matches -ENOIOCTLCMD, we get "FFFFFFFF" from the user
space.

So add a parameter to ioctl to get the correct volume ID.

Android uses vfat volume ID to identify different sd card, when a new sd
card is inserted to device, android can scan the media on it and pop up
new contents.

Signed-off-by: Bintian Wang <bintian.wang@linaro.org>
Cc: dmitry pervushin <dpervushin@gmail.com>
Cc: Mike Lockwood <lockwood@android.com>
Cc: Colin Cross <ccross@android.com>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Sean McNeil <sean@mcneil.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h                  |  1 +
 fs/fat/file.c                 |  8 ++++++++
 fs/fat/inode.c                | 12 ++++++++++++
 include/uapi/linux/msdos_fs.h | 10 ++++++++++
 4 files changed, 31 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 21664fcf3616..4241e6f39e86 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -86,6 +86,7 @@ struct msdos_sb_info {
 	const void *dir_ops;	      /* Opaque; default directory operations */
 	int dir_per_block;	      /* dir entries per block */
 	int dir_per_block_bits;	      /* log2(dir_per_block) */
+	unsigned int vol_id;		/*volume ID*/
 
 	int fatent_shift;
 	struct fatent_operations *fatent_ops;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index b0b632e50ddb..9b104f543056 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -114,6 +114,12 @@ out:
 	return err;
 }
 
+static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	return put_user(sbi->vol_id, user_attr);
+}
+
 long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -124,6 +130,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return fat_ioctl_get_attributes(inode, user_attr);
 	case FAT_IOCTL_SET_ATTRIBUTES:
 		return fat_ioctl_set_attributes(filp, user_attr);
+	case FAT_IOCTL_GET_VOLUME_ID:
+		return fat_ioctl_get_volume_id(inode, user_attr);
 	default:
 		return -ENOTTY;	/* Inappropriate ioctl for device */
 	}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 5d4513cb1b3c..11b51bb55b42 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1415,6 +1415,18 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
 		brelse(fsinfo_bh);
 	}
 
+	/* interpret volume ID as a little endian 32 bit integer */
+	if (sbi->fat_bits == 32)
+		sbi->vol_id = (((u32)b->fat32.vol_id[0]) |
+					((u32)b->fat32.vol_id[1] << 8) |
+					((u32)b->fat32.vol_id[2] << 16) |
+					((u32)b->fat32.vol_id[3] << 24));
+	else /* fat 16 or 12 */
+		sbi->vol_id = (((u32)b->fat16.vol_id[0]) |
+					((u32)b->fat16.vol_id[1] << 8) |
+					((u32)b->fat16.vol_id[2] << 16) |
+					((u32)b->fat16.vol_id[3] << 24));
+
 	sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
 	sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
 
diff --git a/include/uapi/linux/msdos_fs.h b/include/uapi/linux/msdos_fs.h
index f055e58b3147..e284ff919d6e 100644
--- a/include/uapi/linux/msdos_fs.h
+++ b/include/uapi/linux/msdos_fs.h
@@ -104,6 +104,8 @@ struct __fat_dirent {
 /* <linux/videotext.h> has used 0x72 ('r') in collision, so skip a few */
 #define FAT_IOCTL_GET_ATTRIBUTES	_IOR('r', 0x10, __u32)
 #define FAT_IOCTL_SET_ATTRIBUTES	_IOW('r', 0x11, __u32)
+/*Android kernel has used 0x12, so we use 0x13*/
+#define FAT_IOCTL_GET_VOLUME_ID		_IOR('r', 0x13, __u32)
 
 struct fat_boot_sector {
 	__u8	ignored[3];	/* Boot strap short or near jump */
@@ -128,6 +130,10 @@ struct fat_boot_sector {
 			__u8	drive_number;	/* Physical drive number */
 			__u8	state;		/* undocumented, but used
 						   for mount state. */
+			__u8	signature;  /* extended boot signature */
+			__u8	vol_id[4];	/* volume ID */
+			__u8	vol_label[11];	/* volume label */
+			__u8	fs_type[8];		/* file system type */
 			/* other fiealds are not added here */
 		} fat16;
 
@@ -147,6 +153,10 @@ struct fat_boot_sector {
 			__u8	drive_number;   /* Physical drive number */
 			__u8    state;       	/* undocumented, but used
 						   for mount state. */
+			__u8	signature;  /* extended boot signature */
+			__u8	vol_id[4];	/* volume ID */
+			__u8	vol_label[11];	/* volume label */
+			__u8	fs_type[8];		/* file system type */
 			/* other fiealds are not added here */
 		} fat32;
 	};
-- 
cgit v1.2.3


From 83d5e5b0af907d46d241a86d9e44003b3f0accbd Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Wed, 10 Jul 2013 23:41:18 +0100
Subject: dm: optimize use SRCU and RCU

This patch removes "io_lock" and "map_lock" in struct mapped_device and
"holders" in struct dm_table and replaces these mechanisms with
sleepable-rcu.

Previously, the code would call "dm_get_live_table" and "dm_table_put" to
get and release table. Now, the code is changed to call "dm_get_live_table"
and "dm_put_live_table". dm_get_live_table locks sleepable-rcu and
dm_put_live_table unlocks it.

dm_get_live_table_fast/dm_put_live_table_fast can be used instead of
dm_get_live_table/dm_put_live_table. These *_fast functions use
non-sleepable RCU, so the caller must not block between them.

If the code changes active or inactive dm table, it must call
dm_sync_table before destroying the old table.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c         | 122 +++++++++++++++++++++-----------
 drivers/md/dm-table.c         |  35 ---------
 drivers/md/dm.c               | 160 ++++++++++++++++++++++++------------------
 include/linux/device-mapper.h |   6 +-
 include/uapi/linux/dm-ioctl.h |   4 +-
 5 files changed, 180 insertions(+), 147 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 12f04868e899..f1b758675ec7 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -36,6 +36,14 @@ struct hash_cell {
 	struct dm_table *new_map;
 };
 
+/*
+ * A dummy definition to make RCU happy.
+ * struct dm_table should never be dereferenced in this file.
+ */
+struct dm_table {
+	int undefined__;
+};
+
 struct vers_iter {
     size_t param_size;
     struct dm_target_versions *vers, *old_vers;
@@ -242,9 +250,10 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
 	return -EBUSY;
 }
 
-static void __hash_remove(struct hash_cell *hc)
+static struct dm_table *__hash_remove(struct hash_cell *hc)
 {
 	struct dm_table *table;
+	int srcu_idx;
 
 	/* remove from the dev hash */
 	list_del(&hc->uuid_list);
@@ -253,16 +262,18 @@ static void __hash_remove(struct hash_cell *hc)
 	dm_set_mdptr(hc->md, NULL);
 	mutex_unlock(&dm_hash_cells_mutex);
 
-	table = dm_get_live_table(hc->md);
-	if (table) {
+	table = dm_get_live_table(hc->md, &srcu_idx);
+	if (table)
 		dm_table_event(table);
-		dm_table_put(table);
-	}
+	dm_put_live_table(hc->md, srcu_idx);
 
+	table = NULL;
 	if (hc->new_map)
-		dm_table_destroy(hc->new_map);
+		table = hc->new_map;
 	dm_put(hc->md);
 	free_cell(hc);
+
+	return table;
 }
 
 static void dm_hash_remove_all(int keep_open_devices)
@@ -270,6 +281,7 @@ static void dm_hash_remove_all(int keep_open_devices)
 	int i, dev_skipped;
 	struct hash_cell *hc;
 	struct mapped_device *md;
+	struct dm_table *t;
 
 retry:
 	dev_skipped = 0;
@@ -287,10 +299,14 @@ retry:
 				continue;
 			}
 
-			__hash_remove(hc);
+			t = __hash_remove(hc);
 
 			up_write(&_hash_lock);
 
+			if (t) {
+				dm_sync_table(md);
+				dm_table_destroy(t);
+			}
 			dm_put(md);
 			if (likely(keep_open_devices))
 				dm_destroy(md);
@@ -356,6 +372,7 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
 	struct dm_table *table;
 	struct mapped_device *md;
 	unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
+	int srcu_idx;
 
 	/*
 	 * duplicate new.
@@ -418,11 +435,10 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
 	/*
 	 * Wake up any dm event waiters.
 	 */
-	table = dm_get_live_table(hc->md);
-	if (table) {
+	table = dm_get_live_table(hc->md, &srcu_idx);
+	if (table)
 		dm_table_event(table);
-		dm_table_put(table);
-	}
+	dm_put_live_table(hc->md, srcu_idx);
 
 	if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr))
 		param->flags |= DM_UEVENT_GENERATED_FLAG;
@@ -620,11 +636,14 @@ static int check_name(const char *name)
  * _hash_lock without first calling dm_table_put, because dm_table_destroy
  * waits for this dm_table_put and could be called under this lock.
  */
-static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
+static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx)
 {
 	struct hash_cell *hc;
 	struct dm_table *table = NULL;
 
+	/* increment rcu count, we don't care about the table pointer */
+	dm_get_live_table(md, srcu_idx);
+
 	down_read(&_hash_lock);
 	hc = dm_get_mdptr(md);
 	if (!hc || hc->md != md) {
@@ -633,8 +652,6 @@ static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
 	}
 
 	table = hc->new_map;
-	if (table)
-		dm_table_get(table);
 
 out:
 	up_read(&_hash_lock);
@@ -643,10 +660,11 @@ out:
 }
 
 static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md,
-						      struct dm_ioctl *param)
+						      struct dm_ioctl *param,
+						      int *srcu_idx)
 {
 	return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ?
-		dm_get_inactive_table(md) : dm_get_live_table(md);
+		dm_get_inactive_table(md, srcu_idx) : dm_get_live_table(md, srcu_idx);
 }
 
 /*
@@ -657,6 +675,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 {
 	struct gendisk *disk = dm_disk(md);
 	struct dm_table *table;
+	int srcu_idx;
 
 	param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
 			  DM_ACTIVE_PRESENT_FLAG);
@@ -676,26 +695,27 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 	param->event_nr = dm_get_event_nr(md);
 	param->target_count = 0;
 
-	table = dm_get_live_table(md);
+	table = dm_get_live_table(md, &srcu_idx);
 	if (table) {
 		if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) {
 			if (get_disk_ro(disk))
 				param->flags |= DM_READONLY_FLAG;
 			param->target_count = dm_table_get_num_targets(table);
 		}
-		dm_table_put(table);
 
 		param->flags |= DM_ACTIVE_PRESENT_FLAG;
 	}
+	dm_put_live_table(md, srcu_idx);
 
 	if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) {
-		table = dm_get_inactive_table(md);
+		int srcu_idx;
+		table = dm_get_inactive_table(md, &srcu_idx);
 		if (table) {
 			if (!(dm_table_get_mode(table) & FMODE_WRITE))
 				param->flags |= DM_READONLY_FLAG;
 			param->target_count = dm_table_get_num_targets(table);
-			dm_table_put(table);
 		}
+		dm_put_live_table(md, srcu_idx);
 	}
 }
 
@@ -796,6 +816,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
 	struct hash_cell *hc;
 	struct mapped_device *md;
 	int r;
+	struct dm_table *t;
 
 	down_write(&_hash_lock);
 	hc = __find_device_hash_cell(param);
@@ -819,9 +840,14 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
 		return r;
 	}
 
-	__hash_remove(hc);
+	t = __hash_remove(hc);
 	up_write(&_hash_lock);
 
+	if (t) {
+		dm_sync_table(md);
+		dm_table_destroy(t);
+	}
+
 	if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr))
 		param->flags |= DM_UEVENT_GENERATED_FLAG;
 
@@ -986,6 +1012,7 @@ static int do_resume(struct dm_ioctl *param)
 
 		old_map = dm_swap_table(md, new_map);
 		if (IS_ERR(old_map)) {
+			dm_sync_table(md);
 			dm_table_destroy(new_map);
 			dm_put(md);
 			return PTR_ERR(old_map);
@@ -1003,6 +1030,10 @@ static int do_resume(struct dm_ioctl *param)
 			param->flags |= DM_UEVENT_GENERATED_FLAG;
 	}
 
+	/*
+	 * Since dm_swap_table synchronizes RCU, nobody should be in
+	 * read-side critical section already.
+	 */
 	if (old_map)
 		dm_table_destroy(old_map);
 
@@ -1125,6 +1156,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
 	int r = 0;
 	struct mapped_device *md;
 	struct dm_table *table;
+	int srcu_idx;
 
 	md = find_device(param);
 	if (!md)
@@ -1145,11 +1177,10 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
 	 */
 	__dev_status(md, param);
 
-	table = dm_get_live_or_inactive_table(md, param);
-	if (table) {
+	table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+	if (table)
 		retrieve_status(table, param, param_size);
-		dm_table_put(table);
-	}
+	dm_put_live_table(md, srcu_idx);
 
 out:
 	dm_put(md);
@@ -1221,7 +1252,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 {
 	int r;
 	struct hash_cell *hc;
-	struct dm_table *t;
+	struct dm_table *t, *old_map = NULL;
 	struct mapped_device *md;
 	struct target_type *immutable_target_type;
 
@@ -1277,14 +1308,14 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 	hc = dm_get_mdptr(md);
 	if (!hc || hc->md != md) {
 		DMWARN("device has been removed from the dev hash table.");
-		dm_table_destroy(t);
 		up_write(&_hash_lock);
+		dm_table_destroy(t);
 		r = -ENXIO;
 		goto out;
 	}
 
 	if (hc->new_map)
-		dm_table_destroy(hc->new_map);
+		old_map = hc->new_map;
 	hc->new_map = t;
 	up_write(&_hash_lock);
 
@@ -1292,6 +1323,11 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 	__dev_status(md, param);
 
 out:
+	if (old_map) {
+		dm_sync_table(md);
+		dm_table_destroy(old_map);
+	}
+
 	dm_put(md);
 
 	return r;
@@ -1301,6 +1337,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
 {
 	struct hash_cell *hc;
 	struct mapped_device *md;
+	struct dm_table *old_map = NULL;
 
 	down_write(&_hash_lock);
 
@@ -1312,7 +1349,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
 	}
 
 	if (hc->new_map) {
-		dm_table_destroy(hc->new_map);
+		old_map = hc->new_map;
 		hc->new_map = NULL;
 	}
 
@@ -1321,6 +1358,10 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
 	__dev_status(hc->md, param);
 	md = hc->md;
 	up_write(&_hash_lock);
+	if (old_map) {
+		dm_sync_table(md);
+		dm_table_destroy(old_map);
+	}
 	dm_put(md);
 
 	return 0;
@@ -1370,6 +1411,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
 {
 	struct mapped_device *md;
 	struct dm_table *table;
+	int srcu_idx;
 
 	md = find_device(param);
 	if (!md)
@@ -1377,11 +1419,10 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
 
 	__dev_status(md, param);
 
-	table = dm_get_live_or_inactive_table(md, param);
-	if (table) {
+	table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+	if (table)
 		retrieve_deps(table, param, param_size);
-		dm_table_put(table);
-	}
+	dm_put_live_table(md, srcu_idx);
 
 	dm_put(md);
 
@@ -1396,6 +1437,7 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
 {
 	struct mapped_device *md;
 	struct dm_table *table;
+	int srcu_idx;
 
 	md = find_device(param);
 	if (!md)
@@ -1403,11 +1445,10 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
 
 	__dev_status(md, param);
 
-	table = dm_get_live_or_inactive_table(md, param);
-	if (table) {
+	table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+	if (table)
 		retrieve_status(table, param, param_size);
-		dm_table_put(table);
-	}
+	dm_put_live_table(md, srcu_idx);
 
 	dm_put(md);
 
@@ -1443,6 +1484,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 	struct dm_target_msg *tmsg = (void *) param + param->data_start;
 	size_t maxlen;
 	char *result = get_result_buffer(param, param_size, &maxlen);
+	int srcu_idx;
 
 	md = find_device(param);
 	if (!md)
@@ -1470,9 +1512,9 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 	if (r <= 1)
 		goto out_argv;
 
-	table = dm_get_live_table(md);
+	table = dm_get_live_table(md, &srcu_idx);
 	if (!table)
-		goto out_argv;
+		goto out_table;
 
 	if (dm_deleting_md(md)) {
 		r = -ENXIO;
@@ -1491,7 +1533,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 	}
 
  out_table:
-	dm_table_put(table);
+	dm_put_live_table(md, srcu_idx);
  out_argv:
 	kfree(argv);
  out:
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 1ff252ab7d46..f221812b7dbc 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -26,22 +26,8 @@
 #define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
 #define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
 
-/*
- * The table has always exactly one reference from either mapped_device->map
- * or hash_cell->new_map. This reference is not counted in table->holders.
- * A pair of dm_create_table/dm_destroy_table functions is used for table
- * creation/destruction.
- *
- * Temporary references from the other code increase table->holders. A pair
- * of dm_table_get/dm_table_put functions is used to manipulate it.
- *
- * When the table is about to be destroyed, we wait for table->holders to
- * drop to zero.
- */
-
 struct dm_table {
 	struct mapped_device *md;
-	atomic_t holders;
 	unsigned type;
 
 	/* btree table */
@@ -208,7 +194,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 
 	INIT_LIST_HEAD(&t->devices);
 	INIT_LIST_HEAD(&t->target_callbacks);
-	atomic_set(&t->holders, 0);
 
 	if (!num_targets)
 		num_targets = KEYS_PER_NODE;
@@ -246,10 +231,6 @@ void dm_table_destroy(struct dm_table *t)
 	if (!t)
 		return;
 
-	while (atomic_read(&t->holders))
-		msleep(1);
-	smp_mb();
-
 	/* free the indexes */
 	if (t->depth >= 2)
 		vfree(t->index[t->depth - 2]);
@@ -274,22 +255,6 @@ void dm_table_destroy(struct dm_table *t)
 	kfree(t);
 }
 
-void dm_table_get(struct dm_table *t)
-{
-	atomic_inc(&t->holders);
-}
-EXPORT_SYMBOL(dm_table_get);
-
-void dm_table_put(struct dm_table *t)
-{
-	if (!t)
-		return;
-
-	smp_mb__before_atomic_dec();
-	atomic_dec(&t->holders);
-}
-EXPORT_SYMBOL(dm_table_put);
-
 /*
  * Checks to see if we need to extend highs or targets.
  */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 33f20103d8d5..ecff83f5b53a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -116,13 +116,20 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 #define DMF_NOFLUSH_SUSPENDING 5
 #define DMF_MERGE_IS_OPTIONAL 6
 
+/*
+ * A dummy definition to make RCU happy.
+ * struct dm_table should never be dereferenced in this file.
+ */
+struct dm_table {
+	int undefined__;
+};
+
 /*
  * Work processed by per-device workqueue.
  */
 struct mapped_device {
-	struct rw_semaphore io_lock;
+	struct srcu_struct io_barrier;
 	struct mutex suspend_lock;
-	rwlock_t map_lock;
 	atomic_t holders;
 	atomic_t open_count;
 
@@ -156,6 +163,8 @@ struct mapped_device {
 
 	/*
 	 * The current mapping.
+	 * Use dm_get_live_table{_fast} or take suspend_lock for
+	 * dereference.
 	 */
 	struct dm_table *map;
 
@@ -386,12 +395,14 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 			unsigned int cmd, unsigned long arg)
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
+	int srcu_idx;
 	struct dm_table *map;
 	struct dm_target *tgt;
 	int r = -ENOTTY;
 
 retry:
-	map = dm_get_live_table(md);
+	map = dm_get_live_table(md, &srcu_idx);
+
 	if (!map || !dm_table_get_size(map))
 		goto out;
 
@@ -410,7 +421,7 @@ retry:
 		r = tgt->type->ioctl(tgt, cmd, arg);
 
 out:
-	dm_table_put(map);
+	dm_put_live_table(md, srcu_idx);
 
 	if (r == -ENOTCONN) {
 		msleep(10);
@@ -509,20 +520,39 @@ static void queue_io(struct mapped_device *md, struct bio *bio)
 /*
  * Everyone (including functions in this file), should use this
  * function to access the md->map field, and make sure they call
- * dm_table_put() when finished.
+ * dm_put_live_table() when finished.
  */
-struct dm_table *dm_get_live_table(struct mapped_device *md)
+struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
 {
-	struct dm_table *t;
-	unsigned long flags;
+	*srcu_idx = srcu_read_lock(&md->io_barrier);
+
+	return srcu_dereference(md->map, &md->io_barrier);
+}
 
-	read_lock_irqsave(&md->map_lock, flags);
-	t = md->map;
-	if (t)
-		dm_table_get(t);
-	read_unlock_irqrestore(&md->map_lock, flags);
+void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
+{
+	srcu_read_unlock(&md->io_barrier, srcu_idx);
+}
+
+void dm_sync_table(struct mapped_device *md)
+{
+	synchronize_srcu(&md->io_barrier);
+	synchronize_rcu_expedited();
+}
+
+/*
+ * A fast alternative to dm_get_live_table/dm_put_live_table.
+ * The caller must not block between these two functions.
+ */
+static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
+{
+	rcu_read_lock();
+	return rcu_dereference(md->map);
+}
 
-	return t;
+static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
+{
+	rcu_read_unlock();
 }
 
 /*
@@ -1356,17 +1386,18 @@ static int __split_and_process_non_flush(struct clone_info *ci)
 /*
  * Entry point to split a bio into clones and submit them to the targets.
  */
-static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
+static void __split_and_process_bio(struct mapped_device *md,
+				    struct dm_table *map, struct bio *bio)
 {
 	struct clone_info ci;
 	int error = 0;
 
-	ci.map = dm_get_live_table(md);
-	if (unlikely(!ci.map)) {
+	if (unlikely(!map)) {
 		bio_io_error(bio);
 		return;
 	}
 
+	ci.map = map;
 	ci.md = md;
 	ci.io = alloc_io(md);
 	ci.io->error = 0;
@@ -1393,7 +1424,6 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 
 	/* drop the extra reference count */
 	dec_pending(ci.io, error);
-	dm_table_put(ci.map);
 }
 /*-----------------------------------------------------------------
  * CRUD END
@@ -1404,7 +1434,7 @@ static int dm_merge_bvec(struct request_queue *q,
 			 struct bio_vec *biovec)
 {
 	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_live_table(md);
+	struct dm_table *map = dm_get_live_table_fast(md);
 	struct dm_target *ti;
 	sector_t max_sectors;
 	int max_size = 0;
@@ -1414,7 +1444,7 @@ static int dm_merge_bvec(struct request_queue *q,
 
 	ti = dm_table_find_target(map, bvm->bi_sector);
 	if (!dm_target_is_valid(ti))
-		goto out_table;
+		goto out;
 
 	/*
 	 * Find maximum amount of I/O that won't need splitting
@@ -1443,10 +1473,8 @@ static int dm_merge_bvec(struct request_queue *q,
 
 		max_size = 0;
 
-out_table:
-	dm_table_put(map);
-
 out:
+	dm_put_live_table_fast(md);
 	/*
 	 * Always allow an entire first page
 	 */
@@ -1465,8 +1493,10 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
 	int rw = bio_data_dir(bio);
 	struct mapped_device *md = q->queuedata;
 	int cpu;
+	int srcu_idx;
+	struct dm_table *map;
 
-	down_read(&md->io_lock);
+	map = dm_get_live_table(md, &srcu_idx);
 
 	cpu = part_stat_lock();
 	part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
@@ -1475,7 +1505,7 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
 
 	/* if we're suspended, we have to queue this io for later */
 	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
-		up_read(&md->io_lock);
+		dm_put_live_table(md, srcu_idx);
 
 		if (bio_rw(bio) != READA)
 			queue_io(md, bio);
@@ -1484,8 +1514,8 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 
-	__split_and_process_bio(md, bio);
-	up_read(&md->io_lock);
+	__split_and_process_bio(md, map, bio);
+	dm_put_live_table(md, srcu_idx);
 	return;
 }
 
@@ -1671,7 +1701,8 @@ static struct request *dm_start_request(struct mapped_device *md, struct request
 static void dm_request_fn(struct request_queue *q)
 {
 	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_live_table(md);
+	int srcu_idx;
+	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 	struct dm_target *ti;
 	struct request *rq, *clone;
 	sector_t pos;
@@ -1726,7 +1757,7 @@ requeued:
 delay_and_out:
 	blk_delay_queue(q, HZ / 10);
 out:
-	dm_table_put(map);
+	dm_put_live_table(md, srcu_idx);
 }
 
 int dm_underlying_device_busy(struct request_queue *q)
@@ -1739,14 +1770,14 @@ static int dm_lld_busy(struct request_queue *q)
 {
 	int r;
 	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_live_table(md);
+	struct dm_table *map = dm_get_live_table_fast(md);
 
 	if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
 		r = 1;
 	else
 		r = dm_table_any_busy_target(map);
 
-	dm_table_put(map);
+	dm_put_live_table_fast(md);
 
 	return r;
 }
@@ -1758,7 +1789,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 	struct dm_table *map;
 
 	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
-		map = dm_get_live_table(md);
+		map = dm_get_live_table_fast(md);
 		if (map) {
 			/*
 			 * Request-based dm cares about only own queue for
@@ -1769,9 +1800,8 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 				    bdi_bits;
 			else
 				r = dm_table_any_congested(map, bdi_bits);
-
-			dm_table_put(map);
 		}
+		dm_put_live_table_fast(md);
 	}
 
 	return r;
@@ -1876,12 +1906,14 @@ static struct mapped_device *alloc_dev(int minor)
 	if (r < 0)
 		goto bad_minor;
 
+	r = init_srcu_struct(&md->io_barrier);
+	if (r < 0)
+		goto bad_io_barrier;
+
 	md->type = DM_TYPE_NONE;
-	init_rwsem(&md->io_lock);
 	mutex_init(&md->suspend_lock);
 	mutex_init(&md->type_lock);
 	spin_lock_init(&md->deferred_lock);
-	rwlock_init(&md->map_lock);
 	atomic_set(&md->holders, 1);
 	atomic_set(&md->open_count, 0);
 	atomic_set(&md->event_nr, 0);
@@ -1944,6 +1976,8 @@ bad_thread:
 bad_disk:
 	blk_cleanup_queue(md->queue);
 bad_queue:
+	cleanup_srcu_struct(&md->io_barrier);
+bad_io_barrier:
 	free_minor(minor);
 bad_minor:
 	module_put(THIS_MODULE);
@@ -1967,6 +2001,7 @@ static void free_dev(struct mapped_device *md)
 		bioset_free(md->bs);
 	blk_integrity_unregister(md->disk);
 	del_gendisk(md->disk);
+	cleanup_srcu_struct(&md->io_barrier);
 	free_minor(minor);
 
 	spin_lock(&_minor_lock);
@@ -2109,7 +2144,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 	struct dm_table *old_map;
 	struct request_queue *q = md->queue;
 	sector_t size;
-	unsigned long flags;
 	int merge_is_optional;
 
 	size = dm_table_get_size(t);
@@ -2138,9 +2172,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 
 	merge_is_optional = dm_table_merge_is_optional(t);
 
-	write_lock_irqsave(&md->map_lock, flags);
 	old_map = md->map;
-	md->map = t;
+	rcu_assign_pointer(md->map, t);
 	md->immutable_target_type = dm_table_get_immutable_target_type(t);
 
 	dm_table_set_restrictions(t, q, limits);
@@ -2148,7 +2181,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 		set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
 	else
 		clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
-	write_unlock_irqrestore(&md->map_lock, flags);
+	dm_sync_table(md);
 
 	return old_map;
 }
@@ -2159,15 +2192,13 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 static struct dm_table *__unbind(struct mapped_device *md)
 {
 	struct dm_table *map = md->map;
-	unsigned long flags;
 
 	if (!map)
 		return NULL;
 
 	dm_table_event_callback(map, NULL, NULL);
-	write_lock_irqsave(&md->map_lock, flags);
-	md->map = NULL;
-	write_unlock_irqrestore(&md->map_lock, flags);
+	rcu_assign_pointer(md->map, NULL);
+	dm_sync_table(md);
 
 	return map;
 }
@@ -2319,11 +2350,12 @@ EXPORT_SYMBOL_GPL(dm_device_name);
 static void __dm_destroy(struct mapped_device *md, bool wait)
 {
 	struct dm_table *map;
+	int srcu_idx;
 
 	might_sleep();
 
 	spin_lock(&_minor_lock);
-	map = dm_get_live_table(md);
+	map = dm_get_live_table(md, &srcu_idx);
 	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
 	set_bit(DMF_FREEING, &md->flags);
 	spin_unlock(&_minor_lock);
@@ -2333,6 +2365,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
 		dm_table_postsuspend_targets(map);
 	}
 
+	/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
+	dm_put_live_table(md, srcu_idx);
+
 	/*
 	 * Rare, but there may be I/O requests still going to complete,
 	 * for example.  Wait for all references to disappear.
@@ -2347,7 +2382,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
 		       dm_device_name(md), atomic_read(&md->holders));
 
 	dm_sysfs_exit(md);
-	dm_table_put(map);
 	dm_table_destroy(__unbind(md));
 	free_dev(md);
 }
@@ -2404,8 +2438,10 @@ static void dm_wq_work(struct work_struct *work)
 	struct mapped_device *md = container_of(work, struct mapped_device,
 						work);
 	struct bio *c;
+	int srcu_idx;
+	struct dm_table *map;
 
-	down_read(&md->io_lock);
+	map = dm_get_live_table(md, &srcu_idx);
 
 	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
 		spin_lock_irq(&md->deferred_lock);
@@ -2415,17 +2451,13 @@ static void dm_wq_work(struct work_struct *work)
 		if (!c)
 			break;
 
-		up_read(&md->io_lock);
-
 		if (dm_request_based(md))
 			generic_make_request(c);
 		else
-			__split_and_process_bio(md, c);
-
-		down_read(&md->io_lock);
+			__split_and_process_bio(md, map, c);
 	}
 
-	up_read(&md->io_lock);
+	dm_put_live_table(md, srcu_idx);
 }
 
 static void dm_queue_flush(struct mapped_device *md)
@@ -2457,10 +2489,10 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
 	 * reappear.
 	 */
 	if (dm_table_has_no_data_devices(table)) {
-		live_map = dm_get_live_table(md);
+		live_map = dm_get_live_table_fast(md);
 		if (live_map)
 			limits = md->queue->limits;
-		dm_table_put(live_map);
+		dm_put_live_table_fast(md);
 	}
 
 	if (!live_map) {
@@ -2540,7 +2572,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 		goto out_unlock;
 	}
 
-	map = dm_get_live_table(md);
+	map = md->map;
 
 	/*
 	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2561,7 +2593,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	if (!noflush && do_lockfs) {
 		r = lock_fs(md);
 		if (r)
-			goto out;
+			goto out_unlock;
 	}
 
 	/*
@@ -2576,9 +2608,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	 * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
 	 * flush_workqueue(md->wq).
 	 */
-	down_write(&md->io_lock);
 	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
-	up_write(&md->io_lock);
+	synchronize_srcu(&md->io_barrier);
 
 	/*
 	 * Stop md->queue before flushing md->wq in case request-based
@@ -2596,10 +2627,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	 */
 	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
 
-	down_write(&md->io_lock);
 	if (noflush)
 		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
-	up_write(&md->io_lock);
+	synchronize_srcu(&md->io_barrier);
 
 	/* were we interrupted ? */
 	if (r < 0) {
@@ -2609,7 +2639,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 			start_queue(md->queue);
 
 		unlock_fs(md);
-		goto out; /* pushback list is already flushed, so skip flush */
+		goto out_unlock; /* pushback list is already flushed, so skip flush */
 	}
 
 	/*
@@ -2622,9 +2652,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
 	dm_table_postsuspend_targets(map);
 
-out:
-	dm_table_put(map);
-
 out_unlock:
 	mutex_unlock(&md->suspend_lock);
 	return r;
@@ -2639,7 +2666,7 @@ int dm_resume(struct mapped_device *md)
 	if (!dm_suspended_md(md))
 		goto out;
 
-	map = dm_get_live_table(md);
+	map = md->map;
 	if (!map || !dm_table_get_size(map))
 		goto out;
 
@@ -2663,7 +2690,6 @@ int dm_resume(struct mapped_device *md)
 
 	r = 0;
 out:
-	dm_table_put(map);
 	mutex_unlock(&md->suspend_lock);
 
 	return r;
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 3cd32478f2fd..e151d4c9298d 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -446,9 +446,9 @@ int __must_check dm_set_target_max_io_len(struct dm_target *ti, sector_t len);
 /*
  * Table reference counting.
  */
-struct dm_table *dm_get_live_table(struct mapped_device *md);
-void dm_table_get(struct dm_table *t);
-void dm_table_put(struct dm_table *t);
+struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx);
+void dm_put_live_table(struct mapped_device *md, int srcu_idx);
+void dm_sync_table(struct mapped_device *md);
 
 /*
  * Queries
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index 7e75b6fd8d45..afd0cbd52edb 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -267,9 +267,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	24
+#define DM_VERSION_MINOR	25
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2013-01-15)"
+#define DM_VERSION_EXTRA	"-ioctl (2013-06-26)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
-- 
cgit v1.2.3